26

Size: px

Start display at page:

Download "26"

ちえこたかぎ
4 years ago
Views:

1 26

2 FIPP FAPP

5 I/O

7 LAMMPS LJ atomic fluid 32,000 atoms for 100 timesteps FX /

8 (FIPP)

9 FIPP fipp -C -d dir/ -Ihwm,call -i10 mpiexec ./a.out GUI, fipppx -A -d dir/ -Ihwm,cpu,balance,call,src > fipp.txt procedure loop line

10 177 Procedures profile ************************************************************************************* 180 Application - procedures 181 ************************************************************************************* Cost % Operation (S) Barrier % Start End Application LAMMPS_NS::PairLJCutOMP::eval LAMMPS_NS::Neighbor._OMP_ GI printf_fp 198 MPI % Communication (S) Start End Application LAMMPS_NS::PairLJCutOMP::eval LAMMPS_NS::Neighbor._OMP_ GI printf_fp Barrier, MPI

11 GUI

12 Call graph Process 0 - Thread % <0> main [0 / 216] 0% <1> LAMMPS_NS::CiteMe::CiteMe(LAMMPS_NS::LAMMPS *) [1] 0% <1> LAMMPS_NS::LAMMPS::LAMMPS(int, char **, ompi_communicator_t *) [0 / 1] 0% <2> LAMMPS_NS::LAMMPS::post_create() [0 / 1] 0% <3> LAMMPS_NS::Input::one(const char *) [0 / 1] 0% <4> LAMMPS_NS::Input::execute_command() [0 / 1] 0% <5> LAMMPS_NS::Input::package() [0 / 1] 0% <6> LAMMPS_NS::Modify::add_fix(int, char **, int) [0 / 1] 0% <8> GI_strcmp [1] 0% <6> LAMMPS_NS::PairLJCutOMP::compute(int, int) [0 / 26] 0% <7> jwe_opar_fullset [0 / 26] ## 11% <8> LAMMPS_NS::PairLJCutOMP::eval<(int)0, (int)0, (int)1>(int, int) [24] 0% <8> LAMMPS_NS::PairLJCutOMP::compute(int, int)._omp_1 [0 / 2] 0% <9> LAMMPS_NS::ThrOMP::reduce_thr(void *, int, int) [0 / 2] 1% <10> LAMMPS_NS::data_reduce_thr(double *, int, int, int, int) [2] 3% <6> LAMMPS_NS::AtomVecAtomic::unpack_reverse(int, int *, double *) [6] 0% <6> LAMMPS_NS::Output::write(long) [0 / 158] 0% <7> LAMMPS_NS::Dump::write() [0 / 158] 0% <8> LAMMPS_NS::DumpAtom::convert_string(int, double *) [0 / 153] 0% <9> LAMMPS_NS::DumpAtom::convert_noimage(int, double *) [0 / 152] 0% <10> sprintf [0 / 152] /

13 365 LAMMPS_NS::PairLJCutOMP::eval<(int)0, (int)0, (int)1>(int, int, LAMMPS_NS::ThrData *) % 140 Process ** + 11 % 159 Process ** - 8 % 131 Process % 142 Process LAMMPS_NS::Neighbor::half_bin_newton_omp(LAMMPS_NS::NeighList *)._OMP_ % 35 Process ******* + 28 % 45 Process ******* - 29 % 25 Process % 36 Process half_

362 LAMMPS_NS::PairLJCutOMP::eval<(int)0, (int)0, (int)1>(int, int, LAMMPS_NS::ThrData *) 363 +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +

14 362 LAMMPS_NS::PairLJCutOMP::eval<(int)0, (int)0, (int)1>(int, int, LAMMPS_NS::ThrData *) % 7629 Process % 7736 Process * - 5 % 7150 Process % 7588 Process LAMMPS_NS::Neighbor::half_bin_newton_omp(LAMMPS_NS::NeighList *)._OMP_ * + 5 % 1699 Process % 1583 Process * + 5 % 1691 Process ** - 8 % 1477 Process

15 478 LAMMPS_NS::PairLJCutOMP::eval<(int)0, (int)0, (int)1>(int, int, LAMMPS_NS::ThrData *) * - 7 % 445 Thread **** + 17 % 556 Thread ***** + 20 % 572 Thread ***** + 21 % 579 Thread % 464 Thread * + 7 % 511 Thread ** + 10 % 524 Thread * + 6 % 507 Thread * + 6 % 507 Thread ** + 11 % 529 Thread * - 5 % 455 Thread ** - 9 % 433 Thread **** - 19 % 386 Thread ** - 11 % 422 Thread ******* - 29 % 338 Thread **** - 16 % 401 Thread

16 GUI fipp:application View:Profile:Cost Stacked Chart:Procedure 0 0 0MPI_barrier

17 MPI_barrier

18 p 2p 4p N 2N 4N

20 flops SIMD MPI (FAPP)

21 FAPP fapp -C -d dir/ -Ihwm -Hevent=Statistics mpiexec ./a.out GUI, fapppx -A -d dir/ -Ihwm,mpi > fapp.txt

22 #include <fj_tool/fapp.h> void PairLJCutOMP::eval(int iifrom, int iito, ThrData * const thr) { fapp_start("eval", 0, 0); ... fapp_stop("eval", 0, 0); }

23 Flops fapp -C -d dir -Ihwm -Hevent=Statistics mpiexec ./a.out 339 Performance monitor : Statistics 351 Kind Elapsed(s) MFLOPS MFLOPS/PEAK(%) MIPS MIPS/PEAK(%) AVG eval MAX MIN 372 Kind Elapsed(s) Mem throughput_chip(mb/s) Mem throughput/PEAK(%) SIMD(%) AVG eval MAX MIN GFlops = 8x 2 GHz x 4 SIMD x 2 SIMD

24 -Koptmsg=2 -Nlst=t src.f90 src.lst

25 113 2 for (jj = 0; jj < jnum; jj++) { j = jlist[jj]; 115 i 2 factor_lj = special_lj[sbmask(j)]; j &= NEIGHMASK; delx = xtmp - x[j].x; dely = ytmp - x[j].y; delz = ztmp - x[j].z; rsq = delx*delx + dely*dely + delz*delz; jtype = type[j]; if (rsq < cutsqi[jtype]) { r2inv = 1.0/rsq; r6inv = r2inv*r2inv*r2inv; forcelj = r6inv * (lj1i[jtype]*r6inv - lj2i[jtype]); fpair = factor_lj*forcelj*r2inv; fxtmp += delx*fpair; fytmp += dely*fpair; fztmp += delz*fpair; if (NEWTON_PAIR || j < nlocal) { f[j].x -= delx*fpair; f[j].y -= dely*fpair; 136 2 f[j].z -= delz*fpair; } 138 jwd6229s-i "../pair_lj_cut_omp.cpp", line 113: ループ内でif文が存在するためこのループはSIMD化できません jwd8664o-i "../pair_lj_cut_omp.cpp", line 113: ループ内に関数呼出しなどの最適化対象外の命令があるためソフトウェアパイプライニングを適用できません jwd8202o-i "../pair_lj_cut_omp.cpp", line 113: このループを展開数 2 回でループアンローリングしました jwd8670o-i "../pair_lj_cut_omp.cpp", line 113: ループ内に分岐命令があるためソフトウェアパイプライニングを適用できません jwd8101o-i "../pair_lj_cut_omp.cpp", line 115: 利用者定義の関数 '_ZN9LAMMPS_NS4Pair6sbmaskEi' をインライン展開しました jwd8209o-i "../pair_lj_cut_omp.cpp", line 121: 多項式の演算順序を変更しました jwd8209o-i "../pair_lj_cut_omp.cpp", line 128: 多項式の演算順序を変更しました jwd5202p-i "../pair_lj_cut_omp.cpp", line 134: ループ中で変数 'f' を定義する順序が逐次実行と異なるためこのループは並列化できません jwd6202s-i "../pair_lj_cut_omp.cpp", line 134: ループ中で変数 'f' を定義する順序が逐次実行と異なるためこのループはSIMD化できません

fapp -C -d dir -Ihwm -Hevent=Cache mpiexec ./a.

352 ------------------------------------------------------------------------------ 353 AVG 1.

26 fapp -C -d dir -Ihwm -Hevent=Cache mpiexec ./a.out 339 Performance monitor : Cache Kind Elapsed(s) Inst LS_SIMD-or(%) LS-or(%) Prefetch AVG eval MAX MIN Kind Elapsed(s) L1-op miss(%) L2 miss(%) L2 dm miss(%) L2 pf miss(%) mtlb-op(%) AVG eval MAX MIN core L1 CPU L2 core L1 L1D % 6.25 % L2 0 %

28 113 2 for (jj = 0; jj < jnum; jj++) { j = jlist[jj]; 115 i 2 factor_lj = special_lj[sbmask(j)]; j &= NEIGHMASK; jj 2 配列に j delx = xtmp - x[j].x; dely = ytmp - x[j].y; delz = ztmp - x[j].z; rsq = delx*delx + dely*dely + delz*delz; jtype = type[j]; if (rsq < cutsqi[jtype]) { r2inv = 1.0/rsq; r6inv = r2inv*r2inv*r2inv; forcelj = r6inv * (lj1i[jtype]*r6inv - lj2i[jtype]); fpair = factor_lj*forcelj*r2inv; fxtmp += delx*fpair; fytmp += dely*fpair; fztmp += delz*fpair; if (NEWTON_PAIR || j < nlocal) { f[j].x -= delx*fpair; f[j].y -= dely*fpair; f[j].z -= delz*fpair;

29 MPI MPIFAPP 53 MPI profile ************************************************************************************* 56 Application 57 ************************************************************************************* Kind Elapsed(s) Wait(s) Byte Call ( 0-4K 4K- 64K 64K- 1024K 1024KByte- ) all AVG MPI::Comm::Barrier 64 MAX MIN AVG MPI::Comm::Bcast 68 MAX MIN AVG MPI::Comm::Allreduce 72 MAX MIN wait

30 MPI

31 , RIST office.jp/pages/seminar_text

Microsoft PowerPoint - 高速化WS富山.pptx

Microsoft PowerPoint - 高速化WS富山.pptx 京における高速化ワークショップ 性能分析・チューニングの手順について 登録施設利用促進機関 一般財団法人高度情報科学技術研究機構 富山栄治 一般財団法人高度情報科学技術研究機構 2 性能分析・チューニング手順:どの程度の並列数が実現可能か把握する。インバランスの懸念があるか把握する。タイムステップループ、I/O 処理など注目すべき箇所を把握する。並列数、並列化率などの目標を設定し、チューニング時の指針とする。