Parallel programming models and OpenMP

- Message passing (distributed memory): MPI, PVM; software DSM layered on top of it gives a shared-memory view.
- Shared-memory threads: pthreads, Solaris threads, NT threads.
- OpenMP: annotation of a threaded shared-memory program with directives.
- HPF: annotation with data-distribution hints.
- Various fancy parallel programming languages.

The running example: summing the elements of an array.

    for(i = 0; i < 1000; i++) S += A[i];

[Figure: sequentially, elements 1..1000 are added into S one at a time; in parallel, the range is split into four chunks (1-250, 251-500, 501-750, 751-1000), each chunk is summed independently, and the four partial sums are added into S.]
The example with POSIX threads (pthreads, Solaris threads); a complete, compilable version follows below:

    for(t = 1; t < n_thd; t++)
        r = pthread_create(&thd[t], NULL, thd_main, (void *)t);
    thd_main(0);
    for(t = 1; t < n_thd; t++)
        pthread_join(thd[t], NULL);

With PARMACS macros:

    for(t = 1; t < n_thd; t++)
        CREATE(thd_main);
    thd_main(0);
    WAIT_FOR_END(n_thd - 1);

The thread body divides the iteration space by hand and accumulates a partial sum, which must be added to the global sum under a lock:

    int s;        /* global sum */
    int n_thd;    /* number of threads */
    int thd_main(int id)
    {
        int c, b, e, i, ss;
        c = 1000 / n_thd;    /* chunk size */
        b = c * id;          /* begin index */
        e = b + c;           /* end index */
        ss = 0;
        for(i = b; i < e; i++) ss += a[i];
        pthread_mutex_lock(&m);
        s += ss;
        pthread_mutex_unlock(&m);
        return s;
    }

In OpenMP, a single directive is enough. OK!

    #pragma omp parallel for reduction(+:s)
    for(i = 0; i < 1000; i++) s += a[i];

What OpenMP provides: the parallel region, work-sharing constructs (for, sections, single), data scope clauses, orphan directives, and the notions of static extent and dynamic extent. The rest of these slides walk through each.
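A minimal, self-contained sketch of the pthreads version above. The mutex m, thread array thd, and array a are illustrative names, not part of the original fragment; compile with cc -pthread.

    #include <stdio.h>
    #include <pthread.h>

    #define N 1000
    #define N_THD 4

    int a[N];
    int s = 0;                      /* global sum */
    int n_thd = N_THD;              /* number of threads */
    pthread_mutex_t m = PTHREAD_MUTEX_INITIALIZER;
    pthread_t thd[N_THD];

    void *thd_main(void *arg)
    {
        int id = (int)(long)arg;
        int c = N / n_thd;          /* chunk size */
        int b = c * id;             /* begin index */
        int e = b + c;              /* end index */
        int ss = 0, i;
        for (i = b; i < e; i++) ss += a[i];
        pthread_mutex_lock(&m);     /* serialize the update of the global sum */
        s += ss;
        pthread_mutex_unlock(&m);
        return NULL;
    }

    int main(void)
    {
        int t, i;
        for (i = 0; i < N; i++) a[i] = 1;
        for (t = 1; t < n_thd; t++)
            pthread_create(&thd[t], NULL, thd_main, (void *)(long)t);
        thd_main((void *)0);        /* the creating thread works too */
        for (t = 1; t < n_thd; t++)
            pthread_join(thd[t], NULL);
        printf("s = %d\n", s);      /* expect 1000 */
        return 0;
    }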
What is OpenMP?

A specification of compiler directives and library routines for shared-memory parallel programming in Fortran, C, and C++, defined jointly by a group of ISVs. Fortran version 1.0 API: October 1997. C/C++ version 1.0 API: October 1998 (a Fortran 90 API was expected in 1999?). URL: http://www.openmp.org/

Target platforms: SGI Cray Origin (e.g. the ASCI Blue Mountain system), Sun Enterprise servers, PC-based SMPs. Predecessors include the vendors' own directive sets: SGI Power Fortran/C, Sun Impact, KAI/KAP.

The rationale: often about 5% of the code accounts for about 95% (?) of the execution time, so it pays to parallelize just that 5%, incrementally. The target is small-scale (up to about 16 processors) to medium-scale (up to about 64 processors) multiprocessors. pthreads are OS-oriented and general-purpose, and too low-level for this job.

The OpenMP API is a set of directives/pragmas for Fortran 77, Fortran 90, C, and C++. In Fortran the directives are comments starting with !$OMP; in C they are #pragma omp pragmas. An existing sequential program can thus be parallelized incrementally.
Why shared-memory machines now? Clock frequency is climbing from 3GHz toward 10GHz, process technology from 90nm toward 65nm and 45nm; VLIW, large L3 caches, and Intel Hyperthreading put multiple hardware threads on a single Pentium-class CPU.

The OpenMP execution model is fork-join. Consider:

    A;
    #pragma omp parallel
    {
        foo();   /* B */
    }
    C;
    #pragma omp parallel
    {
        D;
    }
    E;

[Figure: the master thread executes A, then forks a team; every thread calls foo() (B); the team joins, the master executes C, forks again for D, joins, and executes E.]
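A minimal runnable sketch of the fork-join model; foo() and the printed strings stand in for the A..E of the figure. Compile with an OpenMP-capable compiler (e.g. gcc -fopenmp).

    #include <stdio.h>
    #include <omp.h>

    void foo(void)
    {
        printf("B: hello from thread %d\n", omp_get_thread_num());
    }

    int main(void)
    {
        printf("A: master only\n");
        #pragma omp parallel   /* fork: every team member calls foo() */
        foo();
        /* implicit join here */
        printf("C: master only again\n");
        #pragma omp parallel
        printf("D: thread %d\n", omp_get_thread_num());
        printf("E: done\n");
        return 0;
    }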
OpenMP API: directive syntax

In Fortran, a directive is a comment line beginning with the sentinel !$OMP (also C$OMP or *$OMP in fixed form), giving a directive name followed by optional clauses:

    !$OMP directive_name [clause[, clause] ...]

In C/C++ it is a pragma:

    #pragma omp directive_name [clause[, clause] ...]

The parallel region: #pragma omp parallel creates a team of threads, and the statements of the region are executed by every thread of the team.

    Fortran:                       C:
    !$OMP PARALLEL                 #pragma omp parallel
       ... parallel region ...     {
    !$OMP END PARALLEL                 ... parallel region ...
                                   }

Parallel region (contd.): each thread of the team has an ID, returned by omp_get_thread_num(); the master thread has ID 0. The number of threads is set with omp_set_num_threads(nthreads) or the OMP_NUM_THREADS environment variable. At the end of a parallel region the threads join (an implicit barrier). Synchronization inside a region uses critical, atomic, and barrier.
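A sketch combining these pieces; the printed strings are illustrative:

    #include <stdio.h>
    #include <omp.h>

    int main(void)
    {
        omp_set_num_threads(4);          /* request a team of 4 */
        #pragma omp parallel
        {
            int id = omp_get_thread_num();   /* 0..3 */
            int n  = omp_get_num_threads();  /* team size */
            printf("thread %d of %d\n", id, n);
            if (id == 0)
                printf("I am the master\n");
        }   /* implicit barrier and join */
        return 0;
    }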
Back to the example:

    for(i = 0; i < 1000; i++) S += A[i];

[Figure: sequential sum vs. four parallel partial sums, as before.]

Written with just the OpenMP parallel region, dividing the work by hand:

    #pragma omp parallel
    {
        int c, b, e, i, ss;
        c = 1000 / omp_get_num_threads();
        b = c * omp_get_thread_num();
        e = b + c;
        ss = 0;
        for(i = b; i < e; i++) ss += a[i];
        #pragma omp atomic
        s += ss;
    }

Written idiomatically with a work-sharing directive:

    #pragma omp parallel for reduction(+:s)
    for(i = 0; i < 1000; i++) s += a[i];

Where OpenMP fits: it is aimed at data-parallel programs; task-parallel programs are harder to express, and tuning still matters. SPMD-style programs can be written by dispatching on the id returned by omp_get_thread_num(). The PARMACS macros used by SPLASH-2 can take OpenMP as a backend, and parallelizing compilers (e.g. the Polaris Compiler) can emit OpenMP. Converting the pthreads/PARMACS thread-creation and join loops shown earlier is mechanical; they become simply

    omp_set_num_threads(n_thd);
    #pragma omp parallel
        thd_main(omp_get_thread_num());
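A self-contained sketch of the SPMD, manual-decomposition style above; the array a and its initialization are illustrative, and as on the slide, N is assumed divisible by the team size.

    #include <stdio.h>
    #include <omp.h>

    #define N 1000
    int a[N];
    int s = 0;

    int main(void)
    {
        int i;
        for (i = 0; i < N; i++) a[i] = 1;
        #pragma omp parallel
        {
            int c  = N / omp_get_num_threads();   /* chunk size */
            int b  = c * omp_get_thread_num();    /* my begin index */
            int e  = b + c;                       /* my end index */
            int ss = 0, j;
            for (j = b; j < e; j++) ss += a[j];
            #pragma omp atomic                    /* atomic update of the shared sum */
            s += ss;
        }
        printf("s = %d\n", s);                    /* expect 1000 */
        return 0;
    }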
Work-sharing constructs

A work-sharing construct divides work among the members of the team inside a parallel region:
- for: distribute the iterations of a loop
- sections: distribute independent blocks of code
- single: execute a block on one thread only
The combined forms parallel for and parallel sections fuse a parallel region with a single work-sharing construct.

The for directive (DO in Fortran) requires the loop to be in canonical shape:

    #pragma omp for [clause ...]
    for(var = lb; var logical-op ub; incr-expr)
        body

The loop variable var is automatically private. incr-expr must be one of ++var, var++, --var, var--, var += incr, var -= incr. Jumping out of the loop (e.g. break) is not allowed.

Clauses of for: the schedule(kind[, chunk_size]) clause controls how iterations are mapped to threads (see the sketch below):
- schedule(static, chunk_size): chunks of chunk_size iterations are dealt to the threads round-robin; chunk_size = 1 gives a cyclic distribution.
- schedule(dynamic, chunk_size): each idle thread grabs the next chunk of chunk_size iterations; chunk_size defaults to 1.
- schedule(guided, chunk_size): like dynamic, but the chunk size starts large and shrinks exponentially down to chunk_size.
- schedule(runtime): the schedule is taken from the OMP_SCHEDULE environment variable at run time.
If no schedule clause is given, the choice is implementation-defined.

[Figure: how the iteration space is partitioned under schedule(static, n), schedule(static), schedule(dynamic, n), and schedule(guided, n).]
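A sketch showing the schedule clause in use; the loop body is a placeholder:

    #include <stdio.h>
    #include <omp.h>

    #define N 16

    int main(void)
    {
        int i;
        /* chunks of 4 iterations are dealt to the threads round-robin;
           edit the clause to try dynamic or guided */
        #pragma omp parallel for schedule(static, 4)
        for (i = 0; i < N; i++)
            printf("iteration %2d on thread %d\n", i, omp_get_thread_num());
        return 0;
    }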
An example of parallel for: sparse matrix-vector product in compressed-row storage. The rows are independent, so the outer loop is parallelized; j, t, start, and end must be private:

    Matvec(double a[], int row_start[], int col_idx[],
           double x[], double y[], int n)
    {
        int i, j, start, end;
        double t;
        #pragma omp parallel for private(j, t, start, end)
        for(i = 0; i < n; i++){
            start = row_start[i];
            end   = row_start[i+1];
            t = 0.0;
            for(j = start; j < end; j++)
                t += a[j] * x[col_idx[j]];
            y[i] = t;
        }
    }

The sections directive distributes blocks among the threads:

    #pragma omp sections
    {
        #pragma omp section
            section1;
        #pragma omp section
            section2;
    }

The single directive executes a block on exactly one thread:

    #pragma omp single
        statements;

Barriers: every work-sharing construct ends with an implicit barrier, which the nowait clause removes. Other synchronization: critical sections (critical), atomic updates (atomic), and memory synchronization (flush). An explicit barrier is written

    #pragma omp barrier
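A sketch of sections, single, nowait, and barrier together; the printed strings are placeholders for real work:

    #include <stdio.h>
    #include <omp.h>

    int main(void)
    {
        #pragma omp parallel
        {
            #pragma omp sections nowait      /* no barrier after the sections */
            {
                #pragma omp section
                printf("section1 on thread %d\n", omp_get_thread_num());
                #pragma omp section
                printf("section2 on thread %d\n", omp_get_thread_num());
            }
            #pragma omp single               /* one thread only; implicit barrier */
            printf("single on thread %d\n", omp_get_thread_num());
            #pragma omp barrier              /* explicit barrier */
        }
        return 0;
    }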
Atomic and critical

The atomic directive performs a small update atomically:

    #pragma omp atomic
    statement

where statement has the form x binop= expr, x++, ++x, x--, or --x. Only the update of x is atomic; the evaluation of expr is not.

The critical directive guards a critical section:

    #pragma omp critical [(name)]
        statements

Critical sections with the same name exclude one another (an unnamed critical uses one global name). There is no conditional wait.

master and ordered

    #pragma omp master
        block statements

executes the block only on the master thread, with no implied barrier.

    #pragma omp ordered
        block statements

executes the block in the sequential iteration order; the for directive in whose dynamic extent it appears must carry the ordered clause.

Data scope clauses (on parallel and the work-sharing directives):
- shared(var_list): the variables are shared by the team.
- private(var_list): each thread gets its own uninitialized copy.
- firstprivate(var_list): private, but initialized from the value before the region.
- lastprivate(var_list): private, and the value from the sequentially last iteration is copied back.
- reduction(op: var_list): the variables are private within the region and combined with op at the end.
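A sketch exercising the scope clauses; the variable names are illustrative:

    #include <stdio.h>
    #include <omp.h>

    int main(void)
    {
        int i, last = -1, sum = 0, base = 100;
        #pragma omp parallel for firstprivate(base) lastprivate(last) \
                                 reduction(+:sum)
        for (i = 0; i < 8; i++) {
            sum += base + i;   /* base starts at 100 in every thread */
            last = i;          /* the value from the last iteration (i==7) survives */
        }
        printf("sum=%d last=%d\n", sum, last);  /* sum=828, last=7 */
        return 0;
    }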
Threadprivate

A file-scope (global) variable can be made per-thread with

    #pragma omp threadprivate(var_list)

Each thread then has its own copy, persistent across parallel regions. The copyin(var_list) clause on parallel initializes the copies from the master's value.

Which data scope clauses each directive accepts:
- parallel: private, firstprivate, shared, reduction, copyin, default(shared|none); with default(none), every variable's scope must be declared explicitly.
- for: private, firstprivate, lastprivate, reduction
- sections: private, firstprivate, lastprivate, reduction
- single: private, firstprivate

Orphan directives and extents

The static extent of a parallel region is its lexical extent; the dynamic extent also covers the code executed from it (e.g. called functions). A directive that appears in the dynamic extent but outside the static extent is an orphan directive. For variables in the dynamic extent the default data scope is: auto variables of called functions are private, everything else is shared.

Example: the main loop of a CG solver. Version 1 opens a parallel region at every loop inside cgsol():

    main(){
      for(it = 0; it < niter; it++){
        resid = cgsol();
        printf(..., resid);
      }
    }
    cgsol(){
      #pragma omp parallel for
      for(i = 0; i < cols; i++)
        p[i] = r[i] = x[i];
      for(it = 0; it < nitcg; it++){
        matvec();
        #pragma omp parallel for
        for(i = 0; i < cols; i++)
          z[i] += alpha * p[i];
      }
    }

Version 2 opens one parallel region in main() and leaves orphan directives in cgsol(), avoiding a fork/join at every loop:

    main(){
      #pragma omp parallel
      for(it = 0; it < niter; it++){
        resid = cgsol();
        #pragma omp master
        printf(..., resid);
      }
    }
    cgsol(){
      #pragma omp for
      for(i = 0; i < cols; i++)
        p[i] = r[i] = x[i];
      for(it = 0; it < nitcg; it++){
        matvec();
        #pragma omp for
        for(i = 0; i < cols; i++)
          z[i] += alpha * p[i];
      }
    }
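A sketch of threadprivate with copyin; counter is an illustrative name:

    #include <stdio.h>
    #include <omp.h>

    int counter = 0;                         /* file-scope variable */
    #pragma omp threadprivate(counter)       /* one persistent copy per thread */

    int main(void)
    {
        counter = 10;                        /* the master's copy */
        #pragma omp parallel copyin(counter) /* every thread's copy starts at 10 */
        {
            counter += omp_get_thread_num();
        }
        #pragma omp parallel                 /* copies persist across regions
                                                (given the same team size) */
        {
            printf("thread %d: counter = %d\n",
                   omp_get_thread_num(), counter);
        }
        return 0;
    }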
Directive binding and nesting

The for, sections, single, master, and barrier directives bind to the dynamically enclosing parallel directive; this is what makes orphan directives work. Work-sharing constructs may not be nested inside one another, and further binding rules apply to master, critical, and the rest.

Nested parallelism: parallel directives may nest. When nested parallelism is enabled, an inner parallel creates a new team; when it is disabled, the inner region is executed by a team of one thread. From the OpenMP FAQ:

  "What about nested parallelism? Nested parallelism is permitted by the OpenMP specification. Supporting nested parallelism effectively can be difficult, and we expect most vendors will start out by executing nested parallel constructs on a single thread."

And from "OpenMP Fortran Interpretations Version 1.0":

  "Note that an OpenMP-compliant implementation is permitted to serialize a nested parallel region."

So a nested parallel region may simply be serialized, and an implementation that serializes nested sections is still compliant.

Memory consistency

OpenMP's memory model is weak consistency: within a parallel region, a write by one thread is not guaranteed to be visible to the others until a synchronization point. Shared variables accessed concurrently may need volatile or explicit synchronization; note that nowait on a work-sharing construct removes the barrier and the consistency point that comes with it. The flush directive

    #pragma omp flush [(var_list)]

makes the listed variables (or, with no list, all shared variables) consistent with memory.

Runtime library functions:
- omp_get_num_threads, omp_set_num_threads: query/set the team size
- omp_get_thread_num: the calling thread's id
- omp_get_max_threads, omp_get_num_procs
- omp_set_dynamic, omp_get_dynamic: dynamic adjustment of the number of threads
- omp_set_nested, omp_get_nested: enable/disable nesting of parallel regions
- lock functions, operating on omp_lock_t and the nestable omp_nest_lock_t
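A sketch of the lock API, protecting an illustrative shared counter:

    #include <stdio.h>
    #include <omp.h>

    int main(void)
    {
        omp_lock_t lock;
        int count = 0;
        omp_init_lock(&lock);
        #pragma omp parallel
        {
            omp_set_lock(&lock);       /* acquire */
            count++;                   /* exclusive access to the shared counter */
            omp_unset_lock(&lock);     /* release */
        }
        omp_destroy_lock(&lock);
        printf("count = %d\n", count); /* equals the team size */
        return 0;
    }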
Environment variables:
- OMP_NUM_THREADS: the default number of threads for a parallel region
- OMP_SCHEDULE: the schedule used by schedule(runtime)
- OMP_DYNAMIC: allow dynamic adjustment of the number of threads (e.g. on the SGI Origin)
- OMP_NESTED: enable nested parallelism (an inner parallel region gets its own team)

What OpenMP gives you: incremental parallelization, work-sharing constructs, orphan directives. What it lacks: data mapping and iteration mapping for locality; reduction is limited to the forms the pragma supports.

Summary: OpenMP is the standard shared-memory parallel programming model (Fortran, C/C++), with a fork-join execution model and support for incremental parallelization. Later versions extended it (OpenMP 2.0 broadened reduction; OpenMP 3.0 followed). Implementations include gcc and the Omni OpenMP compiler. Locality remains the open issue; compare MPI and HPF, where data distribution is explicit.
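A sketch of schedule(runtime) driven by the environment; the invocation in the comment is illustrative:

    /* Run e.g. as:
         OMP_NUM_THREADS=4 OMP_SCHEDULE="dynamic,2" ./a.out
       to pick the schedule without recompiling. */
    #include <stdio.h>
    #include <omp.h>

    int main(void)
    {
        int i;
        #pragma omp parallel for schedule(runtime)
        for (i = 0; i < 16; i++)
            printf("iteration %2d on thread %d\n", i, omp_get_thread_num());
        return 0;
    }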