XcalableMP
1st HPC-Phys Meetup, August 22, 2018
Outline
1. What is XcalableMP (XMP)?
2. Programming in XMP
3. Lattice QCD in XMP
Background: XMP and MPI
MPI is the de facto standard for distributed-memory parallel programming, but hand-writing MPI communication is laborious. XMP raises the abstraction level while remaining interoperable with MPI.
What is XMP? (1/2)
A directive-based parallel language extension of Fortran and C for PC clusters, specified by the XcalableMP Specification Working Group of the PC Cluster Consortium. The C extension also provides coarrays. XMP's runtime is implemented on top of MPI, and XMP code can be mixed with MPI and OpenMP. http://xcalablemp.org
What is XMP? (2/2)
Execution follows the SPMD (Single Program Multiple Data) model, as in MPI. XMP offers two programming models: the global-view model, whose directive-based data and work distribution is inherited from HPF, and the local-view model, based on coarrays.
Part 2: Programming in XMP
Running example
Compute the sum of 1 to 100 using 4 nodes. Each node handles a block of 25 iterations (node 1: 1-25, node 2: 26-50, ...). With MPI the decomposition must be hand-coded; with XMP's global-view model, a loop directive expresses it.
[Example code in C]
[Example code in Fortran]
XMP vs. MPI
The same computation, side by side. XMP version:

    int array[MAX], res = 0;
    #pragma xmp nodes p[*]
    #pragma xmp template t[MAX]
    #pragma xmp distribute t[block] onto p
    #pragma xmp align array[i] with t[i]

    int main(){
    #pragma xmp loop on t[i] reduction(+:res)
      for (int i = 0; i < MAX; i++){
        array[i] = func(i);   /* func() is user-defined elsewhere */
        res += array[i];
      }
      return 0;
    }

MPI version (note the hand-coded index calculation and reduction):

    int array[MAX], res = 0;

    int main(int argc, char **argv){
      int rank, size;
      MPI_Init(&argc, &argv);
      MPI_Comm_rank(MPI_COMM_WORLD, &rank);
      MPI_Comm_size(MPI_COMM_WORLD, &size);
      int dx = MAX/size;
      int llimit = rank * dx;
      int ulimit = (rank != (size - 1)) ? llimit + dx : MAX;
      int temp_res = 0;
      for (int i = llimit; i < ulimit; i++){
        array[i] = func(i);
        temp_res += array[i];
      }
      MPI_Allreduce(&temp_res, &res, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
      MPI_Finalize();
      return 0;
    }
The loop directive (1/2)
The loop directive parallelizes a for loop according to the template distribution. Here a[16] is block-distributed over 4 nodes, so p[0] executes i = 0-3, p[1] i = 4-7, p[2] i = 8-11, and p[3] i = 12-15.

    #pragma xmp nodes p[4]
    #pragma xmp template t[16]
    #pragma xmp distribute t[block] onto p
    int a[16];
    #pragma xmp align a[i] with t[i]

    #pragma xmp loop on t[i]
    for (int i = 0; i < 16; i++){ ... }
The loop directive (2/2)
The loop bounds need not cover the whole template: each node executes only the iterations it owns that fall inside the bounds. With i = 2-10, p[0] executes i = 2-3, p[1] i = 4-7, p[2] i = 8-10, and p[3] executes nothing.

    #pragma xmp nodes p[4]
    #pragma xmp template t[16]
    #pragma xmp distribute t[block] onto p
    int a[16];
    #pragma xmp align a[i] with t[i]

    #pragma xmp loop on t[i]
    for (int i = 2; i < 11; i++){ ... }
[loop directive: additional illustration]
XMP + OpenMP
XMP directives handle inter-node distribution, while OpenMP handles intra-node threading; the two can be combined on the same loop.
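A minimal sketch of the combination, reusing the declarations from the earlier loop example and placing the OpenMP pragma under the XMP one, as the Lattice QCD kernel later in this deck does:

    #pragma xmp loop on t[i]
    #pragma omp parallel for
    for (int i = 0; i < 16; i++){
      a[i] = func(i);   /* each node threads over its own block */
    }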
bcast / reduction directives
Collective communication is also available as directives: bcast broadcasts a variable from one node to the others, and reduction combines a variable across nodes.
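A short sketch of both directives on the node set p from the earlier examples (the variable names sum and param are illustrative):

    int sum = 0;                          /* each node holds a partial value */
    /* ... accumulate into sum locally ... */
    #pragma xmp reduction (+:sum) on p    /* global sum over the node set    */

    int param;
    #pragma xmp bcast (param) from p[0]   /* broadcast node p[0]'s value     */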
gmove (XMP/C)
The gmove directive turns an assignment between sections of distributed arrays into communication. Array sections in C are written array-name[base:length]. Here a[8] and b[8] are block-distributed over 4 nodes, and elements b[3]..b[6] are copied into a[2]..a[5]:

    #pragma xmp gmove
    a[2:4] = b[3:4];
gmove (XMP/Fortran)
The Fortran version uses the usual section notation array-name(lower:upper); the same copy over a(8) and b(8) distributed onto p(1)..p(4) reads:

    !$xmp gmove
    a(3:6) = b(4:7)
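Collected into a self-contained XMP/C sketch (a minimal program; the initialization loop is illustrative):

    #pragma xmp nodes p[4]
    #pragma xmp template t[8]
    #pragma xmp distribute t[block] onto p
    int a[8], b[8];
    #pragma xmp align a[i] with t[i]
    #pragma xmp align b[i] with t[i]

    int main(){
    #pragma xmp loop on t[i]
      for (int i = 0; i < 8; i++) b[i] = i;  /* each owner fills its block */

    #pragma xmp gmove
      a[2:4] = b[3:4];  /* runtime moves b[3..6] to the owners of a[2..5] */
      return 0;
    }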
shadow / reflect: declaring halos
The shadow directive adds halo (ghost) regions to a distributed array; a[1:1] reserves one shadow element on the lower and one on the upper side of each node's block.

XMP/C:

    #pragma xmp nodes p[3]
    #pragma xmp template t[9]
    #pragma xmp distribute t[block] onto p
    int a[9];
    #pragma xmp align a[i] with t[i]
    #pragma xmp shadow a[1:1]

XMP/Fortran:

    !$xmp nodes p(3)
    !$xmp template t(9)
    !$xmp distribute t(block) onto p
    integer :: a(9)
    !$xmp align a(i) with t(i)
    !$xmp shadow a(1:1)
shadow / reflect: updating halos
The reflect directive copies each node's boundary elements into its neighbors' shadow regions (declarations as on the previous slide).

XMP/C:

    #pragma xmp reflect (a)

XMP/Fortran:

    !$xmp reflect (a)
shadow / reflect: stencil loop
Once the shadow regions are up to date, a stencil loop can read neighboring elements locally:

    #pragma xmp loop on t[i]
    for (int i = 1; i < 8; i++){
      b[i] = a[i-1] + a[i] + a[i+1];
    }

    !$xmp loop on t(i)
    do i = 2, 8
      b(i) = a(i-1) + a(i) + a(i+1)
    end do
shadow / reflect: putting it together

    #pragma xmp shadow a[1:1]
    ...
    #pragma xmp reflect (a)
    #pragma xmp loop on t[i]
    for (int i = 1; i < 8; i++){
      b[i] = a[i-1] + a[i] + a[i+1];
    }

    !$xmp shadow a(1:1)
    ...
    !$xmp reflect (a)
    !$xmp loop on t(i)
    do i = 2, 8
      b(i) = a(i-1) + a(i) + a(i+1)
    end do
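As a complete XMP/C sketch under the same declarations as the slides (b is given the same alignment so the assignment stays local; the initialization is illustrative):

    #pragma xmp nodes p[3]
    #pragma xmp template t[9]
    #pragma xmp distribute t[block] onto p
    int a[9], b[9];
    #pragma xmp align a[i] with t[i]
    #pragma xmp align b[i] with t[i]
    #pragma xmp shadow a[1:1]

    int main(){
    #pragma xmp loop on t[i]
      for (int i = 0; i < 9; i++) a[i] = i;  /* each owner fills its block */

    #pragma xmp reflect (a)                  /* exchange halo elements     */

    #pragma xmp loop on t[i]
      for (int i = 1; i < 8; i++)
        b[i] = a[i-1] + a[i] + a[i+1];       /* 3-point stencil            */
      return 0;
    }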
Local-view programming
The running example (sum of 1 to 100 on 4 nodes) can also be written in the local-view model. XMP adopts the coarray feature of Fortran 2008 and extends the same notation to C (XMP/C).
Coarrays in XMP
A coarray is declared with a codimension; remote elements are read or written by attaching an image index. Fortran 2008:

    real :: a(8)
    real :: b(8)[*]

    if (this_image() == 1) then
      b(6)[2] = b(4)      ! put: local b(4) into b(6) on image 2
      a(1) = b(3)[2]      ! get: b(3) on image 2 into local a(1)
    end if
    sync all

XMP/C:

    double a[8];
    double b[8]:[*];

    if (xmpc_this_image() == 1){
      b[6]:[2] = b[4];    /* put */
      a[0] = b[3]:[2];    /* get */
    }
    xmpc_sync_all(NULL);
Array sections in XMP/C coarrays
XMP/C coarrays also accept array sections (base:length). With c[10][10] declared on both images:

    if (xmpc_this_image() == 1){
      b[10:5]:[2] = b[0:5];   /* put local b[0..4] into b[10..14] on image 2 */
      a[:]:[2]    = b[:];     /* put the whole of b into a on image 2        */
      c[:][9]:[2] = c[:][0];  /* put column 0 into column 9 on image 2       */
    }
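A compilable-style sketch of the put pattern above (assuming the Omni compiler's XMP/C coarray extension and its xmp.h header; array sizes are illustrative):

    #include <xmp.h>
    #include <stdio.h>

    double b[16]:[*];             /* coarray: one instance per image */

    int main(){
      for (int i = 0; i < 16; i++)
        b[i] = xmpc_this_image(); /* mark each image's copy          */

      xmpc_sync_all(NULL);        /* make initialization visible     */

      if (xmpc_this_image() == 1){
        b[10:5]:[2] = b[0:5];     /* one-sided put to image 2        */
      }
      xmpc_sync_all(NULL);

      if (xmpc_this_image() == 2)
        printf("b[10] = %f\n", b[10]);  /* now holds image 1's value */
      return 0;
    }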
Compilation: the Omni compiler
User code (a.c) is compiled by the xmpcc driver, which translates XMP directives into runtime calls and produces an ordinary executable (a.out):

    $ xmpcc a.c -o a.out

https://omni-compiler.org
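Execution goes through the underlying MPI launcher; a typical session might look like the following (the launcher name and flags depend on the local MPI installation):

    $ xmpcc a.c -o a.out
    $ mpirun -np 4 ./a.out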
Part 3: Lattice QCD in XMP
CG solver for the Wilson-Dirac equation
The mini-application is a CG solver whose main kernel is the Wilson-Dirac operator WD; the rest is built from COPY/NORM/DOT/AXPY/SCAL vector operations:

    S = B                       // COPY
    R = B                       // COPY
    X = B                       // COPY
    sr = norm(S)                // NORM
    T = WD(U,X)                 // Main kernel
    S = WD(U,T)                 // Main kernel
    R = R - S                   // AXPY
    P = R                       // COPY
    rrp = rr = norm(R)          // NORM
    do {
      T = WD(U,P)               // Main kernel
      V = WD(U,T)               // Main kernel
      pap = dot(V,P)            // DOT
      cr = rr/pap
      X = cr * P + X            // AXPY
      R = -cr * V + R           // AXPY
      rr = norm(R)              // NORM
      bk = rr/rrp
      P = bk * P                // SCAL
      P = P + R                 // AXPY
      rrp = rr
    } while (rr/sr > 1.E-16)

Before each call to WD, a reflect updates the halo; inside WD, the site loops are distributed with XMP and threaded with OpenMP:

    #pragma xmp reflect (X) width(/periodic/...) orthogonal
    WD(X, ...);

    void WD(Quark_t X[NT][NZ][NY][NX], ...){
      ...
    #pragma xmp loop on t[t][z]
    #pragma omp parallel for collapse(4)
      for (int t = 0; t < NT; t++)
        for (int z = 0; z < NZ; z++)
          for (int y = 0; y < NY; y++)
            for (int x = 0; x < NX; x++){
              ...
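The vector operations follow the same directive pattern. As a hedged sketch, the AXPY X = cr*P + X might look like this, assuming P and X are fields aligned with the same template t as in WD (double stands in for the actual Quark_t site type):

    #pragma xmp loop on t[t][z]
    #pragma omp parallel for collapse(4)
    for (int t = 0; t < NT; t++)
      for (int z = 0; z < NZ; z++)
        for (int y = 0; y < NY; y++)
          for (int x = 0; x < NX; x++)
            X[t][z][y][x] = cr * P[t][z][y][x] + X[t][z][y][x];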
reflect for periodic boundaries
A plain reflect exchanges halos with adjacent blocks only. For QCD's periodic boundary conditions, the width clause takes a /periodic/ modifier so that the first and last blocks also exchange halos, and the orthogonal clause suppresses the diagonal (corner) transfers that the stencil does not need:

    #pragma xmp reflect (a)                    /* default halo exchange   */
    #pragma xmp reflect (a) width(/periodic/1,..) orthogonal
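A sketch for a 2-D distributed array (sizes and names are illustrative):

    #pragma xmp nodes p[2][2]
    #pragma xmp template t[16][16]
    #pragma xmp distribute t[block][block] onto p
    double a[16][16];
    #pragma xmp align a[i][j] with t[i][j]
    #pragma xmp shadow a[1:1][1:1]

    /* periodic halo exchange in both dimensions, edges only */
    #pragma xmp reflect (a) width(/periodic/1, /periodic/1) orthogonal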
Performance evaluation
[Figure: two panels, Performance (GFlops) vs. number of processes (1-256), Intel compiler vs. Omni compiler; higher is better]
The performance of the Omni compiler achieves 94-105% of that of the Intel compiler.
Productivity evaluation
Total lines of code are almost the same: OpenMP (base code) 854, XMP+OpenMP 968, MPI+OpenMP 979.

Lines changed from the base code to each parallel version:

                 Modification   Addition   Total
    XMP+OpenMP        4            114      118
    MPI+OpenMP       53            125      178

Quantitatively, XMP+OpenMP needs 34% fewer changed lines (118 vs. 178). Qualitatively, most of the 114 added lines in XMP+OpenMP are insertions of XMP directives, whereas the 125 added lines in MPI+OpenMP create new functions for communication. It is therefore easier to develop a parallel application with XMP+OpenMP than with MPI+OpenMP.