1 /83
2 /83
3 /83,
4 /83 ( ) myrank ), FX10, Blue Gene, Anton : MPI. : ( ).
MPICH, OpenMPI ( ) MPI 1 MPI NIC memory CPU 0 (myrank=0) 5 nprocs=2 /83 NIC memory CPU 1 (myrank=1)
6 /83 hello_mpi.f: include 'mpif.h' MPI, CALL MPI_INIT(ierr) MPI CALL MPI_COMM_SIZE(MPI_COMM_WORLD,nprocs,ierr) CALL MPI_COMM_RANK(MPI_COMM_WORLD,myrank,ierr) WRITE(*,*) 'Hello', myrank (nprocs) (myrank) 0 CALL MPI_FINALIZE(ierr) MPI STOP END > mpirun -np 4 ./a.out > mpif90 hello_mpi.f Hello 0 Hello 1 Hello 2 Hello 3
7 /83 call MPI_SENDRECV ( sendbuf,scount,stype,dest,stag, recvbuf,rcount,rtype,source,rtag,comm,status,ierr) mpi_send + mpi_recv sendbuf : scount : stype : dest : (rank) stag : recvbuf : rcount : rtype : source : rtag : comm : status : ierr : dest=1 source=3 rank0 rank1 rank2 rank3 s 3 d 1 dest=2 source=0 s 0 d 2 dest=3 source=1 s 1 d 3 myrank dest=0 source=2 s 2 d 0 comm.. destination : source : myrank, call mpi_sendrecv() rank.
8 /83 2 +x rank12 rank13 rank14 rank15 s d 15 13 rank8 rank9 rank10 rank11 s 11 d 9 s 12 s 8 d 14 d 10 s 13 s 9 d 15 d 11 s 14 s 10 d 12 d 8 y rank4 rank5 rank6 rank7 s d s d s d s d 7 5 4 6 5 7 6 4 myrank x rank0 rank1 rank2 rank3 s 3 d 1 s 0 d 2 s 1 d 3 s 2 d 0
9 /83 2 +y rank12 rank13 rank14 rank15 d0 s8 d1 s9 d2 s10 d3 s11 rank8 rank9 rank10 rank11 d12 d13 d14 d15 s4 s5 s6 s7 rank4 rank5 rank6 rank7 y d8 s0 d9 s1 d10 s2 d11 s3 x rank0 rank1 rank2 rank3 d4 d5 d6 d7 s12 s13 s14 s15
10 /83 (1 / ) M2L +4-5 myrank +4 M2M y -5 x M2M, M2L ( )
MPI 並列化技術①: 通信衝突の回避 CMSI配信講義B 第15回 11/83 例) 多極子の通信 (1スーパーセル/プロセス) 古い通信コード コーディングイメージ iy=+4, ix=-5~+4 do iy=-5,+4 do ix=-5,+4 ip_dest =送信先プロセス番号 ip_src =受信元プロセス番号 IF( (ix, iy) が白抜き部分 ) cycle call mpi_sendrecv(..,ip_dest,..,ip_src,..) iy=-2, ix=-5~+4 iy=-5, ix=-5~+4 y x M2M, M2L演算に必要な データ範囲 (白抜き部分は除く) このままだと $(10^3-5^3)+(2^3-1)=882$ 回の プロセス間通信 [3次元]
12 /83 (1 / ) myrank 8 6 do iy=-5,+4 do ix=-5,+4 ip_dest = ip_src = IF( (ix, iy) ) cycle call mpi_sendrecv(..,ip_dest,..,ip_src,..) 2 500 9 : $(10^3-5^3)+(2^3-1)=882$ (1) 882
13 /83 (1 / ) other rank myrank 8 6 do iy=-5,+4 do ix=-5,+4 ip_dest = ip_src = IF( (ix, iy) ) cycle call mpi_sendrecv(..,ip_dest,..,ip_src,..) 2 500 9 : $(10^3-5^3)+(2^3-1)=882$ (1) 882 (2) =
14 /83 (1 / ) other rank myrank 8 1) ± x 9 6 2 500 9 (1) 882 (2)
15 /83 (1 / ) other rank myrank 8 2) ± y 9 6 2 500 9 (1) 882 (2)
16 /83 (1 / ) other rank myrank 8 9+9=18 2 500 9 6 27 (1) 882 (2) (1) (2)
17 /83 x ipx_dest = (-x) ipx_src = (-x) do ix= -1, -4, -1 call mpi_sendrecv(..,ipx_dest,..,ipx_src,..) +x ix=-1 myrank ) call mpi_sendrecv(), myrank -x +x ix=-2 : (-x) ix=-3 ix=-4
18 /83 x ipx_dest = (-x) ipx_src = (-x) do ix= -1, -4, -1 call mpi_sendrecv(..,ipx_dest,..,ipx_src,..) +x ix=-1 myrank ipx_dest = (+x) ipx_src = (+x) do ix= +1, +5, +1 call mpi_sendrecv(..,ipx_dest,..,ipx_src,..) ix=+1 ix=+2 +x : (+x) ix=-2 : (-x) ix=+3 ix=-3 ix=+4 ix=-4 ix=+5
19 /83 y ipy_dest = (-y) ipy_src = (-y) do iy= -1, -4, -1 call mpi_sendrecv(..,ipy_dest,..,ipy_src,..) +y iy=-1 iy=-2 myrank +x : (-y)
20 /83 y ipy_dest = (-y) ipy_src = (-y) do iy= -1, -4, -1 call mpi_sendrecv(..,ipy_dest,..,ipy_src,..) +y iy=-3 iy=-4 +x
+x : (+y) 21 /83 y +y ipy_dest = (-y) ipy_src = (-y) do iy= -1, -4, -1 call mpi_sendrecv(..,ipy_dest,..,ipy_src,..) ipy_dest = (+y) ipy_src = (+y) do iy= +1, +5, +1 call mpi_sendrecv(..,ipy_dest,..,ipy_src,..) iy=+1
+x : (+y) 22 /83 +y y iy=+2 ipy_dest = (+y) ipy_src = (+y) do iy= +1, +5, +1 call mpi_sendrecv(..,ipy_dest,..,ipy_src,..) iy=+3
+y y iy=+4 23 /83 iy=+5 +x : (+y)
24 /83 ipx_dest = (-x) ipx_src = (-x) do ix= -1, -4, -1 call mpi_sendrecv(..,ipx_dest,..,ipx_src,..) ipx_dest = (+x) ipx_src = (+x) do ix= +1, +5, +1 call mpi_sendrecv(..,ipx_dest,..,ipx_src,..) ipy_dest = (-y) ipy_src = (-y) do iy=-1, -4, -1 call mpi_sendrecv(..,ipy_dest,..,ipy_src,..) -x +x -y myrank 9+9=18 27 ipy_dest = (+y) ipy_src = (+y) do iy=+1, +5, +1 call mpi_sendrecv(..,ipy_dest,..,ipy_src,..) z -z +z +y
25 /83 wk_x meta_x trans_x meta_x wk_x meta_x
26 /83 -x [ ] ipx_mdest = (-x) ipx_msrc = (-x) do ix=-1,-2,-1 c icbufm( ) = na_per_cell( 3, 3 ) call mpi_sendrecv(icbufm,...,ipx_mdest,..., ircbufm,..., ipx_msrc,...) na_per_cell( 4, 3 )=ircbufm( ) tag(4,3) c buffm( ) = meta_x( ) call mpi_sendrecv(buffm,..,ipx_mdest,.., rbuffm,...,ipx_msrc,..) meta_x( ) = rbuffm( ) Y ix=-1 myrank (3,3) (4,3) meta_x(n) ) myrank -x +x na1cell : (-x) meta_x(1) X
27 /83 -x [ ] ipx_mdest = (-x) ipx_msrc = (-x) do ix=-1,-2,-1 c icbufm( ) = na_per_cell( 4, 3 ) call mpi_sendrecv(icbufm,...,ipx_mdest,..., ircbufm,..., ipx_msrc,...) na_per_cell( 5, 3 )=ircbufm( ) tag(5,3) c buffm( ) = meta_x( ) call mpi_sendrecv(buffm,..,ipx_mdest,.., rbuffm,...,ipx_msrc,..) meta_x( ) = rbuffm( ) Y ix=-2 myrank (4,3) (5,3) meta_x(n) ) myrank -x +x na1cell : (-x) meta_x(1) X
28 /83 +x [ ] ipx_pdest = (+x) ipx_psrc = (+x) do ix=+1,+2,+1 c icbufp( ) = na_per_cell( 3, 3 ) call mpi_sendrecv(icbufp,...,ipx_pdest,..., ircbufp,..., ipx_psrc,...) na_per_cell( 2, 3 )=ircbufp( ) tag(2,3) c buffp( ) = meta_x( ) call mpi_sendrecv(buffp,..,ipx_pdest,.., rbuffp,...,ipx_psrc,..) meta_x( ) = rbuffp( ) Y ix=+1 myrank (2,3) (3,3) meta_x(n) ) myrank +x x na1cell : (+x) meta_x(1) X
29 /83 +x [ ] ipx_pdest = (+x) ipx_psrc = (+x) do ix=+1,+2,+1 c icbufp( ) = na_per_cell( 2, 3 ) call mpi_sendrecv(icbufp,...,ipx_pdest,..., ircbufp,..., ipx_psrc,...) na_per_cell( 1, 3 )=ircbufp( ) tag(1,3) c buffp( ) = meta_x( ) call mpi_sendrecv(buffp,..,ipx_pdest,.., rbuffp,...,ipx_psrc,..) meta_x( ) = rbuffp( ) Y ix=+2 (1,3) (2,3) myrank meta_x(n) ) myrank +x x na1cell : (+x) meta_x(1) X
30 /83 -y [ ] ipy_mdest = (-y) ipy_msrc = (-y) do iy=-1,-2,-1 c Y iy=-1 meta_x(n) call mpi_sendrecv(na_per_cell(1,3),5,...,ipy_mdest,..., na_per_cell(1,4),5,..., ipy_msrc,...) tag(1:5,4) c call mpi_sendrecv(meta_x( ),naline,..,ipy_mdest,.., meta_x( ),naline,...,ipy_msrc,..) (1,4) (1,3) (3,3) ) myrank -y +y na1cell naline=5*na1cell meta_x(1) : (-y) X
31 /83 -y [ ] ipy_mdest = (-y) ipy_msrc = (-y) do iy=-1,-2,-1 c Y iy=-2 (1,5) meta_x(n) call mpi_sendrecv(na_per_cell(1,4),5,...,ipy_mdest,..., na_per_cell(1,5),5,..., ipy_msrc,...) tag(1:5,5) c call mpi_sendrecv(meta_x( ),naline,..,ipy_mdest,.., meta_x( ),naline,...,ipy_msrc,..) (1,4) (3,3) ) myrank -y +y naline=5*na1cell meta_x(1) : (-y) X
32 /83 +y [ ] ipy_pdest = (+y) ipy_psrc = (+y) do iy=+1,+2,+1 c Y iy=+1 meta_x(n) call mpi_sendrecv(na_per_cell(1,3),5,...,ipy_pdest,..., na_per_cell(1,2),5,..., ipy_psrc,...) tag(1:5,2) c call mpi_sendrecv(meta_x( ),naline,..,ipy_pdest,.., meta_x( ),naline,...,ipy_psrc,..) (1,3) (3,3) (1,2) ) myrank +y -y naline=5*na1cell meta_x(1) : (+y) X
33 /83 +y [ ] ipy_pdest = (+y) ipy_psrc = (+y) do iy=+1,+2,+1 c Y iy=+2 meta_x(n) call mpi_sendrecv(na_per_cell(1,2),5,...,ipy_pdest,..., na_per_cell(1,1),5,..., ipy_psrc,...) tag(1:5,1) c call mpi_sendrecv(meta_x( ),naline,..,ipy_pdest,.., meta_x( ),naline,...,ipy_psrc,..) (3,3) (1,2) ) myrank +y -y (1,1) meta_x(1) naline=5*na1cell X : (+y)
34 /83 ipz_mdest = (-z) ipz_msrc = (-z) do iz=-1,-2,-1 c -z call mpi_sendrecv(na_per_cell( ),25,...,ipz_mdest,..., na_per_cell( ),25,..., ipz_msrc,...) tag c call mpi_sendrecv(meta_x( ),narea,..,ipz_mdest,.., meta_x( ),narea,...,ipz_msrc,..) z [ ] (1,5,5) (5,5,5) (1,1,5) (5,1,5) (1,5,4) (5,5,4) (1,1,4) (5,1,4) (1,5,3) (5,5,3) myrank (1,1,3) (1,5,2) (5,1,3) (5,5,2) ) myrank -z +z (1,1,2) narea=25*na1cell (1,5,1) (5,1,2) (5,5,1) (1,1,1) (5,1,1) x
35 /83 ipz_mdest = (-z) ipz_msrc = (-z) do iz=-1,-2,-1 c call mpi_sendrecv(na_per_cell(1,1,3),25,...,ipz_mdest,..., na_per_cell(1,1,4),25,..., ipz_msrc,...) tag(1:5,1:5,4) c -z call mpi_sendrecv(meta_x( ),narea,..,ipz_mdest,.., meta_x( ),narea,...,ipz_msrc,..) z [ ] (1,5,5) (5,5,5) (1,1,5) (5,1,5) (1,5,4) (5,5,4) (1,1,4) (5,1,4) (1,5,3) (5,5,3) iz=-1 (1,1,3) (1,5,2) (5,1,3) (5,5,2) : (-z) ) myrank -z +z (1,1,2) narea=25*na1cell (1,5,1) (5,1,2) (5,5,1) (1,1,1) (5,1,1) x
36 /83 ipz_mdest = (-z) ipz_msrc = (-z) do iz=-1,-2,-1 c call mpi_sendrecv(na_per_cell(1,1,4),25,...,ipz_mdest,..., na_per_cell(1,1,5),25,..., ipz_msrc,...) tag(1:5,1:5,5) c -z call mpi_sendrecv(meta_x( ),narea,..,ipz_mdest,.., meta_x( ),narea,...,ipz_msrc,..) z [ ] (1,5,5) (1,1,5) (1,5,4) (1,1,4) (5,5,5) (5,1,5) (5,5,4) (5,1,4) : (-z) (1,5,3) (5,5,3) iz=-2 (1,1,3) (1,5,2) (5,1,3) (5,5,2) ) myrank -z +z (1,1,2) narea=25*na1cell (1,5,1) (5,1,2) (5,5,1) (1,1,1) (5,1,1) x
37 /83 ipz_pdest = (+z) ipz_psrc = (+z) do iz=+1,+2,+1 c call mpi_sendrecv(na_per_cell(1,1,3),25,...,ipz_pdest,..., na_per_cell(1,1,2),25,..., ipz_psrc,...) tag(1:5,1:5,2) c +z call mpi_sendrecv(meta_x( ),narea,..,ipz_pdest,.., meta_x( ),narea,...,ipz_psrc,..) z [ ] (1,5,5) (5,5,5) (1,1,5) (5,1,5) (1,5,4) (5,5,4) (1,1,4) (5,1,4) (1,5,3) (5,5,3) iz=+1 (1,1,3) (1,5,2) (5,1,3) (5,5,2) : (+z) ) myrank +z -z (1,1,2) narea=25*na1cell (1,5,1) (5,1,2) (5,5,1) (1,1,1) (5,1,1) x
38 /83 ipz_pdest = (+z) ipz_psrc = (+z) do iz=+1,+2,+1 c call mpi_sendrecv(na_per_cell(1,1,2),25,...,ipz_pdest,..., na_per_cell(1,1,1),25,..., ipz_psrc,...) tag(1:5,1:5,1) c +z call mpi_sendrecv(meta_x( ),narea,..,ipz_pdest,.., meta_x( ),narea,...,ipz_psrc,..) z [ ] (1,5,5) (5,5,5) (1,1,5) (5,1,5) (1,5,4) (5,5,4) (1,1,4) (5,1,4) (1,5,3) (5,5,3) iz=+2 (1,1,3) (1,5,2) (5,1,3) (5,5,2) ) myrank +z -z (1,1,2) narea=25*na1cell (1,5,1) (5,1,2) (5,5,1) : (+z) (1,1,1) (5,1,1) x
39 /83. myrank 2 n (n 0) myrank narea=36*na1cell 2 1 2 3 narea=96*na1cell myrank 2 1 2 2 myrank naline=6*na1cell naline=12*na1cell allocate( tag(1:6,1:6) ) allocate( na_per_cell(1:6,1:6) ) allocate( tag(1:12,1:8) ) allocate( na_per_cell(1:12,1:8) ) ), z y x. comm_3.f, comm_fmm.f
40 /83 (1):,. (2):,.. ( )
41 /83 [ (2)] [ (1),(2)] FMM M2M/L2L [ (1)]
42 /83 i dihedral j $K_\phi\left[1+\cos\left(n\,\phi(r_i, r_j, r_k, r_l)-\delta\right)\right]$ k l n=1 i n=2 j n=3 k l j i n=5 k do n=1,ndihedrals phi=phi(ri, rj, rk, rl) Fi, Fj, Fk, Fl f(i)=f(i)+fi f(j)=f(j)+fj f(k)=f(k)+fk f(l)=f(l)+fl n=4 l n=6
43 /83 i dihedral j $K_\phi\left[1+\cos\left(n\,\phi(r_i, r_j, r_k, r_l)-\delta\right)\right]$ k l myrank (rank=0) n=1 i n=2 j n=3 k rank=1 rank=1 Fk, Fl l do n=1,ndihedrals(myrank) phi=phi(ri, rj, rk, rl) Fi, Fj, Fk, Fl MPI IF(ri in myrank) f(i)=f(i)+fi, ELSE Fi IF(rj in myrank) f(j)=f(j)+fj, ELSE Fj IF(rk in myrank) f(k)=f(k)+fk, ELSE Fk IF(rl in myrank) f(l)=f(l)+fl, ELSE Fl ELSE, rank=3 Fj rank=3 rank=1 Fl rank=4 Fi rank=4
44 /83 i dihedral j $K_\phi\left[1+\cos\left(n\,\phi(r_i, r_j, r_k, r_l)-\delta\right)\right]$ k l i0=2 i0=1 myrank i i0=3 i0=5 i0 : myrank j i0=6 k rank=1 l MPI i0=4 i0=7 do i0=1,natom(myrank) dihedral i0 phi=phi(ri, rj, rk, rl) Fi f(i)=f(i)+fi 4. i0 OpenMP, SIMD md_charmm_f90.f rank=3 rank=4
45 /83 $F_{ij} = -F_{ji}$ ) Lennard-Jones, Coulomb do icell(myrank) do jcell_list(myrank or otherranks) do i=na_per_cell(icell) do j=na_per_cell(jcell) rij=rij(ri, rj) Fij f(i)=f(i)+fij IF(rj in myrank) f(j)=f(j)-fij ELSE storef(j)=storef(j)-fij! j! i! jcell! icell storef(j) IF, DO Fij myrank icell jcell other ranks
46 /83 $F_{ij} = -F_{ji}$ ) Lennard-Jones, Coulomb do icell(myrank) do jcell_list(myrank or otherranks) do i=na_per_cell(icell) do j=na_per_cell(jcell) rij=rij(ri, rj) Fij f(i)=f(i)+fij i ; ; ; 2. OpenMP, SIMD = myrank icell jcell other ranks ( )
47 /83 FMM M2M myrank 2 M2M (L1 L2) M2M 1 M2M (L0 L1) myrank M2M
48 /83 FMM M2M other ranks myrank 2 M2M (L1 L2) M2M 1 M2M (L0 L1) myrank M2M
49 /83 FMM M2M : 6+ 1 mpi_bcast 2 M2M (L1 L2) M2M 1 M2M (L0 L1) mpi_gather myrank M2M
50 /83 FMM M2M : 3+ 4 2 M2M (L1 L2) 1 mpi_sendrecv M2M (L0 L1) myrank M2M
51 /83 FMM L2L FMM-ewald myrank 2 L2L (L2 L1) L2L 1 L2L (L1 L0) L2L
52 /83 FMM L2L mpi_bcast : 3+ 9 FMM-ewald other ranks 2 L2L (L2 L1) 1 L2L (L1 L0)
53 /83 FMM L2L ) wm M2M : 0+ 12 FMM-ewald FMM-ewald FMM-ewald FMM-ewald 2 L2L (L2 L1) 1 L2L (L1 L0)
54 /83 IF
55 /83 ) 16GB ) 6MB Main memory 100 L2 cache 10 ) 32KB ) 128GFLOPS L1 cache CPU 1 (1) (2)
56 /83 wk_x(n) r cut other ranks i j wk_x(1)
57 /83 meta_x: X-Y (= ) meta_x(n) Y (1,5) (2,5) (3,5) (4,5) (5,5) (1,4) (2,4) (3,4) (4,4) (5,4) (1,3) (2,3) (3,3) (4,3) (5,3) (1,2) (2,2) (3,2) (4,2) (5,2) (1,1) (2,1) (3,1) (4,1) (5,1) meta_x(1) X 1) 2)
データの連続化 [1] 座標 CMSI配信講義B 第15回 58 /83 メタデータ配列 前回説明 meta_x: X-Y 平面上での原子の相対位置関係 (= メタデータ ) を保持 Y meta_x(n) (1,5) (2,5) (3,5) (4,5) (5,5) (1,4) (2,4) (3,4) (4,4) (5,4) (1,3) (2,3) (3,3) (4,3) (5,3) 帯単位での j 原子座標への連続アクセス 原子 j 原子 i (1,2) (2,2) (3,2) (4,2) (5,2) (1,1) (2,1) (3,1) (4,1) (5,1) meta_x(1) X 1) 自プロセスを中心にデータを局所化 2) 袖部に直にデータ装填
59 /83 M2L myrank M2M 1) 2)
60 /83 M2L myrank M2M 1) 2)
61 /83 do icell(myrank) do jcell do iatm=tag(icell), tag(icell)+na_per_cell(icell)-1 do jatm=tag(jcell), tag(jcell)+na_per_cell(jcell)-1 icell jcell myrank
62 /83 do icell(myrank) do jcell do iatm=tag(icell), tag(icell)+na_per_cell(icell)-1 do jatm=tag(jcell), tag(jcell)+na_per_cell(jcell)-1 Main memory 100 L2 cache 10 L1 cache 1 CPU jcell icell icell jcell myrank (1) icell icell (2) jcell., SIMD
63 /83 do jcell_line do icell(myrank) [along icell_line] do iatom=tag(icell), tag(icell)+na_per_cell(icell)-1 do jatom=tag(jcell), tag(jcell+4)+na_per_cell(jcell+4)-1 Main memory 100 L2 cache 10 L1 cache 1 CPU tag(jcell) icell_line tag(jcell+4)+ na_per_cell(jcell+4) myrank jcell_line (1) jcell_line 1.. (2) jcell_line,. SIMD
DO jcell_line 64 /83 icell icell=jcell IF 3 md_direct_f90.f
65 /83 do icell(myrank) do icy=1,10 do icx=1,10 if(icx,icy 2 ) do m1=1,(nmax+1)**2 do m2=1,(nmax+1)**2 wl_local=wl_local+m2l*wm_local endif wm_local((nmax+1)**2,10,10) M2L myrank icell
66 /83 do icell(myrank) do icy=1,10 do icx=1,10 if(icx,icy 2 ) do m1=1,(nmax+1)**2 do m2=1,(nmax+1)**2 wl_local=wl_local+m2l*wm_local endif wm_local((nmax+1)**2,10,10) M2L myrank icell
67 /83 do icell(myrank) do icy=1,10 do icx=1,10 if(icx,icy 2 ) do m1=1,(nmax+1)**2 do m2=1,(nmax+1)**2 wl_local=wl_local+m2l*wm_local endif wm_local((nmax+1)**2,10,10) icell myrank icell wm_local, m2l..
68 /83 do iblk=1,nblock do icy=icyblkst(iblk),icyblkend(iblk) do icx=icxblkst(iblk),icxblkend(iblk) do icell(myrank) if(icx,icy 2 ) do m1=1,(nmax+1)**2 do m2=1,(nmax+1)**2 wl_local=wl_local+m2l*wm_local endif iblk=1 icxblkst(1) M2L iblk=1 iblk=3 icxblkend(1) myrank iblk=2 iblk=4 icyblkend(1) icyblkst(1) iblk wm_local, m2l icell 1..
69 /83 do iblk=1,nblock do icy=icyblkst(iblk),icyblkend(iblk) do icx=icxblkst(iblk),icxblkend(iblk) do icell(myrank) if(icx,icy 2 ) do m1=1,(nmax+1)**2 do m2=1,(nmax+1)**2 wl_local=wl_local+m2l*wm_local endif iblk=2 icxblkst(2) iblk=1 iblk=3 myrank icxblkend(2) M2L iblk=2 iblk=4 icyblkend(2) icyblkst(2) iblk wm_local, m2l icell 1..
70 /83 NIC 2 (ifort, pgf90, frtpx) 2 0!$omp (Fortran) #pragma omp (C ) ( ) [do/for ] OpenMP memory core core CPU core core 3 1 2 OpenMP, SIMD
71 /83 hello_omp.f: include 'omp_lib.h' nomp = omp_get_max_threads() OpenMP, (nomp) OMP_NUM_THREADS !$omp parallel iam = omp_get_thread_num() !$omp do Do i=1,nomp WRITE(*,*) 'Hello', iam Enddo !$omp end do !$omp end parallel STOP END > ifort -openmp hello_omp.f (iam) 0 > export OMP_NUM_THREADS=4 > ./a.out Hello 0 Hello 1 Hello 2 Hello 3
72 /83 !$omp parallel !$omp do do iblk=1,nblock do icy=icyblkst(iblk),icyblkend(iblk) do icx=icxblkst(iblk),icxblkend(iblk) do icell(myrank) if(icx,icy 2 ) do m1=1,(nmax+1)**2 do m2=1,(nmax+1)**2 wl_local=wl_local+m2l*wm_local endif !$omp end do !$omp end parallel M2L.
73 /83 !$omp parallel !$omp do schedule(static,nchunk) do load=1,nload icx=lddir(1,load) icy=lddir(2,load) do icell(myrank) if(icx,icy 2 ) do m1=1,(nmax+1)**2 do m2=1,(nmax+1)**2 wl_local=wl_local+m2l*wm_local endif !$omp end do !$omp end parallel M2L lddir. nchunk. md_fmm_f90.f
74 /83 do jcell_line do icell [along icell_line]!$omp parallel!$omp do do iatom=tag(icell), tag(icell)+na_per_cell(icell)-1 do jatom=tag(jcell), tag(jcell+4)+na_per_cell(jcell+4)-1!$omp!$omp end parallel
75 /83 do jcell_line do icell [along icell_line] !$omp parallel !$omp do do iatom=tag(icell), tag(icell)+na_per_cell(icell)-1 do jatom=tag(jcell), tag(jcell+4)+na_per_cell(jcell+4)-1 !$omp !$omp end parallel !$omp parallel !$omp parallel do jcell_line do icell jcell_line *icell parallel open/close [along icell_line] !$omp do do iatom=tag(icell), tag(icell)+na_per_cell(icell)-1 do jatom=tag(jcell), tag(jcell+4)+na_per_cell(jcell+4)-1 !$omp !$omp end parallel
SIMD (single instruction multiple data) core (/proc/cpuinfo ) ) intel SSE pentium III~ 128bit (32bit X 4, 64 bit X 2) AVX Sandy Bridge~ 256bit (32bit X 8, 64 bit X 4) SIMD : SIMD,, Fortran/C intrinsic 76 /83 data x 4 +, SIMD 2 OpenMP, SIMD
77 /83 SIMD simd.f: real(8)::a(10000,10000),b(10000) real(8)::c(10000) a=1d0 b=1d0 do i=1,10000 c(i)=0d0 do j=1,10000 c(i)=c(i)+a(i,j)*b(j) stop end C(i)=ΣA(i,j)*B(j) SIMD >ifort -O0 simd.f >time./a.out real 0m1.782s user 0m1.274s sys 0m0.508s SIMD >ifort -xhost -vec-report simd.f simd.f(3): (col. 2) remark: LOOP WAS VECTORIZED. simd.f(4): (col. 2) remark: LOOP WAS VECTORIZED. simd.f(5): (col. 2) remark: PERMUTED LOOP WAS VECTORIZED. >time./a.out real 0m0.718s user 0m0.397s sys 0m0.321s. VECTORIZED
78 /83 OpenMP, 1-4 scale 1-2 void nonbond : do imol=1,nmol-1 do jmol=imol+1,nmol do i=1,natom(imol) do j=1,natom(jmol) rij=rij(ri, rj) LJ φ nonbond =φ nonbond +φ ij f(i)=f(i)+fi f(j)=f(j)+fj do imol=1,nmol do i=1,natom(imol)-1 do j=i+1,natom(imol) rij=rij(ri, rj) LJ x = φ nonbond =φ nonbond +x*φ ij f(i)=f(i)+x*fi f(j)=f(j)+x*fj 0 if 1-2,-3 void s if 1-4 scale 1 else 1 2 (1-2, 1-3 void) 3 s (1-4 scale) [s LJ, Coulomb ] 4
1-4 scale 1-2 void nonbond : 79 /83 do jcell_line do icell [along icell_line] do iatom=tag(icell), tag(icell)+na_per_cell(icell)-1 do jatom=tag(jcell), tag(jcell+4)+na_per_cell(jcell+4)-1 rij=rij(ri,rj) LJ x= φ nonbond =φ nonbond +x*φ ij f(i)=f(i)+x*fi f(j)=f(j)+x*fj 0 if 1-2,-3 void s if 1-4 scale 1 else OpenMP jcell_line if 1 2 (1-2, 1-3 void) 3 s (1-4 scale), s LJ, Coulomb 4
80 /83 do jcell_line do icell [along icell_line] do iatom=tag(icell), tag(icell)+na_per_cell(icell)-1 do jatom=tag(jcell), tag(jcell+4)+na_per_cell(jcell+4)-1 rij=rij(ri,rj) if(rij > rcut) LJ_epsilon=0d0 φ_nonbond = φ_nonbond + φ_ij f(i)=f(i)+fi f(j)=f(j)+fj jcell_line void, scale ( ). do iatom=tag(icell), tag(icell)+na_per_cell(icell)-1 do jatom=1,voidpair123(iatom) rij=rij(ri,rj) φ_nonbond = φ_nonbond - φ_ij void f(i)=f(i)-Fi f(j)=f(j)-Fj do jatom=1,scalepair14(iatom) rij=rij(ri,rj) x=1-s φ_nonbond = φ_nonbond - x*φ_ij f(i)=f(i)-x*fi f(j)=f(j)-x*fj (MDGRAPE) scale
81 /83 frtpx -Qt md_direct_f90.f <<< Loop-information Start >>> <<< [OPTIMIZATION] <<< SIMD <<< SOFTWARE PIPELINING <<< Loop-information End >>> 140 9 p v do j0=tag(jzb-2,jyb,jxb), 141 9 & tag(jzb+2,jyb,jxb) 142 9 & + na_per_cell(jzb+2,jyb,jxb)-1 143 9 p v rx=xi-wkxyz(1,j0) 144 9 p v ry=yi-wkxyz(2,j0) 145 9 p v rz=zi-wkxyz(3,j0) 146 9 p v r2=rx*rx+ry*ry+rz*rz 149 9! ^^^ spherical cut-off ^^^ 150 10 p v if(r2<=cutrad2) then 151 10 p v eps=epsilon_sqrt_i0 152 10 & *epsilon_sqrt_table(ic,iam) 153 10 p v else 154 10 p v eps=0d0 155 10 p v endif!cut-off 167 9 p v sulj12=sulj12+ulj12 168 9 p v sulj6 =sulj6 +Ulj6 184 9 p v sucoulomb=sucoulomb+ucoulomb 170 9 p v stlcx=stlcx+tlx 171 9 p v stlcy=stlcy+tly 172 9 p v stlcz=stlcz+tlz 185 9 p v stlcx=stlcx+tcx 186 9 p v stlcy=stlcy+tcy 187 9 p v stlcz=stlcz+tcz 188 9 p v ic=ic+1 189 9 p v!j0
82 /83 frtpx -Qt md_fmm_f90.f 1050 1 p DO load = 1, nload 1051 1 p ic = lddir(1,load) 1052 1 p jc = lddir(2,load) 1053 1 p kc = lddir(3,load) 1091 4 **** multipole to local translation <<< Loop-information Start >>> <<< [OPTIMIZATION] <<< PREFETCH : 6 <<< wwl_localx: 6 <<< Loop-information End >>> 1092 5 p do m1=1,(nmax+1)*(nmax+1) <<< Loop-information Start >>> <<< [OPTIMIZATION] <<< SIMD <<< SOFTWARE PIPELINING <<< Loop-information End >>> 1093 6 p 6v do m2=1,(nmax+1)*(nmax+1) 1094 6 p 6v wwl_localx(m1,icz0,icy0,icx0,iam) 1095 6 $ = wwl_localx(m1,icz0,icy0,icx0,iam) 1096 6 $ + wm_localx(m2,icz1,icy1,icx1)*shml(m2,m1,kc,jc,ic,nl) 1097 6 p 6v 1098 5 p 1105 1 1106 1 p ENDDO! load
83 /83 3 MPI,,,.,. OpenMP (, ) SIMD (IF, ).
/83 www.modylas.org (PDF) (PDF). 21 CMSI : MODYLAS