untitled

Size: px

Start display at page:

Download "untitled"

ゆいとにかどり
5 years ago
Views:

1 CUDA Vol II: CUDA : NVIDIA Q2 2008

2 CUDA...1 CUDA...3 CUDA

3 CUDA CUDA ( ) CUDA 2 NVIDIA Corporation

4 CUDA CUDA GPU GPU CPU NVIDIA Corporation

5 $V_{call} = S \cdot CND(d_1) - X e^{-rT} \cdot CND(d_2)$, $V_{put} = X e^{-rT} \cdot CND(-d_2) - S \cdot CND(-d_1)$, $d_1 = \frac{\log(S/X) + (r + v^2/2)\,T}{v\sqrt{T}}$, $d_2 = \frac{\log(S/X) + (r - v^2/2)\,T}{v\sqrt{T}}$, $CND(-d) = 1 - CND(d)$ S X CND r v NVIDIA Corporation $N(x) = \frac{1}{\sqrt{2\pi}} \int_{-\infty}^{x} e^{-u^2/2}\,du$ (Hull ) 5 6 host device float CND(float d) { float K = 1.0f / (1.0f f * fabsf(d)); float CND = RSQRT2PI * expf(- 0.5f * d * d) * (K * (A1 + K * (A2 + K * (A3 + K * (A4 + K * A5))))); if(d > 0) CND = 1.0f - CND; return CND; float 2 2 NVIDIA Corporation

6 1. : hoptprice(n) hoptstrike (N).. 2. : doptprice(n) doptstrike (N) GPU 6. GPU 7. GPU NVIDIA Corporation /* */ float *hoptprice, *hoptstrike, *hoptyear; hoptprice = (float *) malloc(sizeof(float*n); hoptstrike = (float *) malloc(sizeof(float*n); hoptyear = (float *) malloc(sizeof(float*n); /* cudamalloc GPU */ float *doptprice, *doptstrike, *doptyear; cudamalloc( (void **) &doptprice, sizeof(float)*n); cudamalloc( (void **) &doptstrike, sizeof(float)*n); cudamalloc( (void **) &doptyear, sizeof(float)*n); /* hoptprice hoptstrike hoptyear */ NVIDIA Corporation

7 4 5 /* cudamemcpy (target, source, size, direction)*/ cudamemcpy (doptprice, hoptprice, sizeof(float)*n, cudamemcpyhosttodevice); cudamemcpy (doptstrike, hoptstrike, sizeof(float)*n, cudamemcpyhosttodevice); cudamemcpy (doptyears, hoptyears, sizeof(float)*n, cudamemcpyhosttodevice); /* GPU <<<, >>>*/ / BlackScholesGPU<<<128, 256>>>( dcallresult, dputresult, doptionstrike, doptionprice, doptionyears, RISKFREE, VOLATILITY, OPT_N); NVIDIA Corporation /* cudamemcpy(target, source, size, direction)*/ cudamemcpy (hcallresult, dcallresult, sizeof(float)*n, cudamemcpydevicetohost); cudamemcpy (hputresult, dputresult, sizeof(float)*n, cudamemcpydevicetohost); /* CPU */ BlackScholesCPU( ); /* */ /* */ / free( hoptprice); cudafree(doptprice); NVIDIA Corporation

8 BlackScholesGPU OptN? = 65,536 1 = 512 OptN 33M : { global void BlackScholes (float *., int OptN) const int const int tid = blockdim.x * blockidx.x + threadidx.x; THREAD_N = blockdim.x * griddim.x; for(int opt = tid; opt < OptN; opt += THREAD_N) BlackScholesBody( d_callresult[opt], d_putresult[opt], d_optionprice[opt], d_optionstrike[opt],d_optionyears[opt], Riskfree,Volatility ); NVIDIA Corporation BlackScholes.cu nvcc -O3 -o BlackScholes BlackScholes.cu \\ -I../../common/inc/ -L../../lib/ -lcutil -lGL -lglut : BlackScholes (1,000,000 ) GPU GPU GPU CPU : GPU : : CPU : L1 : E-08 : E-05 NVIDIA Corporation

9 PCIe PCIe GB/s CPU 3 GB/s NVIDIA LinkBoost 4 GB CUDA NVIDIA Corporation pinned cudamalloc cudamallochost cudafree cudafreehost /* malloc */ / cudamallochost ((void **) &h_callresultgpu, OPT_SZ); /* free cudafreehost(h_callresultgpu); NVIDIA Corporation

10 BlackScholesPinned.cu nvcc -O3 -o BlackScholesPinned BlackScholesPinned.cu \\ -I../../common/inc/ -L../../lib/ -lcutil -lGL -lglut ( ) ) : BlackScholesPinned 1,000,000 GPU GPU GPU : GPU : : CPU CPU : L1 : E-08 : E-05 NVIDIA Corporation CUDA

11 CUDA NVIDIA Corporation i. ii. iii. % MATLAB % An Introduction to Financial Option Valuation: Mathematics, Stochastics % and Computation D. Higham S = 2; E = 1; r = 0.05; sigma = 0.25; T = 3; M = 1e6; Svals = S*exp((r-0.5*sigma^2)*T + sigma*sqrt(T)*randn(M,1)); Pvals = exp(-r*T)*max(Svals-E,0); Pmean = mean(Pvals) width = 1.96*std(Pvals)/sqrt(M); conf = [Pmean - width, Pmean + width] NVIDIA Corporation

12 i. M :M=200,000,000 i. MT ii. ii. N : N=128 iii. iv. : MT GPU CUDA SDK v1.1 MonteCarloMultiGPU NVIDIA Corporation RNG DCMT RandomGPU<<<32,128>>>( d_random, N_PER_RNG, seed); : 4096 Tesla C870 GPU !!!! NVIDIA Corporation

13 BoxMullerGPU<<<32,128>>>( d_random, N_PER_RNG, RNG seed); Tesla C870 GPU #define PI f device void BoxMuller(float& u1, float& u2){ float r = sqrtf(-2.0f * logf(u1)); float phi = 2 * PI * u2; u1 = r * cosf(phi); u2 = r * sinf(phi); Beasley-Springer-Moro NVIDIA Corporation void MonteCarloGPU(d_Random,.) { // ,384 MonteCarloKernelGPU<<<64, 256, 0>>>(d_Random); // cudamemcpy(h_sum, d_sum, ACCUM_SZ, cudamemcpydevicetohost) ; cudamemcpy(h_sum2, d_sum2, ACCUM_SZ, cudamemcpydevicetohost) ; // 2 double dblsum = 0, dblsum2 = 0; for(int i = 0; i < ACCUM_N; i++){ dblsum += h_sum[i]; dblsum2 += h_sum2[i]; NVIDIA Corporation

14 global void MonteCarloKernelGPU( ) { const int tid = blockdim.x * blockidx.x + threadidx.x; const int threadn = blockdim.x * griddim.x; //... for(int iaccum = tid; iaccum < accumn; iaccum += threadn) { float sum = 0, sum2 = 0; for(int ipath = iaccum; ipath < pathn; ipath += accumn) { float r = d_random[ipath]; //... sum += endoptionprice; sum2 += endoptionprice * endoptionprice; d_sum[iaccum] = sum; d_sum2[iaccum] = sum2; NVIDIA Corporation $N$ $a_i$: $S_0 = 0$, $S_i = S_{i-1} + a_i$, $S = S_N$ Wilkinson 1963 N 2 NVIDIA Corporation

15 Montecarlo.cu : nvcc -O3 -o Montecarlo Montecarlo.cu \\ -I../../common/inc/ -L../../lib/ -lcutil -lGL -lglut ( ) : Montecarlo 200,000,000 GPU GPU : ; : : e-05; : e-05; : ; : : e-05; : e-06; NVIDIA Corporation :256 16M : K NVIDIA Corporation

16 1 NVIDIA Corporation NVIDIA Corporation

17 1 NVIDIA Corporation NVIDIA Corporation

18 GPU CPU GPU CPU 1 : 64 NVIDIA Corporation GPU GPU

19 NVIDIA Corporation

20 1 1 CPU cudamemcpy : 1 1 NVIDIA Corporation ? NVIDIA bool multiblock = ((numpaths / numoptions >= 8192); multiblock false NVIDIA Corporation

21 1 1 NVIDIA Corporation CUDA SDK 1.1 MonteCarlo GPU MonteCarloMultiGPU NVIDIA Corporation

22 CUDA NVIDIA Corporation

FFT GPU GPU NVIDIA Corporation 2008 41 $\nabla^2 \phi = r$ FFT $-(k_x^2 + k_y^2)\,\hat{\phi} = \hat{r}$ 1.

23 FFT GPU GPU NVIDIA Corporation $\nabla^2 \phi = r$ FFT $-(k_x^2 + k_y^2)\,\hat{\phi} = \hat{r}$ 1. 2 FFT r(k) k 2 FFT k 2. (k) u(k) $\hat{\phi} = -\frac{\hat{r}}{k_x^2 + k_y^2}$ 3. 2 FFT u(k) u NVIDIA Corporation

24 MATLAB % N = 64; % L = 1; % f << 1 sig = 0.1; % k = (2*pi/L)*[0:(N/2-1) (-N/2):(-1)]; % (m,n) % (x,y) [KX KY] = meshgrid(k,k); % delsq = -(KX.^2 + KY.^2); % (0,0) % % 0 delsq(1,1) = 1; % h = L/N; x = (0:(N-1))*h ; y = (0:(N-1))*h; [X Y] = meshgrid(x,y); % RHS f(x,y) rsq = (X-0.5*L).^2 + (Y-0.5*L).^2; sigsq = sig^2; f = exp(-rsq/(2*sigsq)).* (rsq - 2*sigsq)/(sigsq^2); % fhat = fft2(f); u = real(ifft2(fhat./delsq)); % u = 0 % u = u - u(1,1); % L2 Linf uex = exp(-rsq/(2*sigsq)); errmax = norm(u(:)-uex(:),inf); errmax2 = norm(u(:)-uex(:),2)/(n*n); % L2 Linf fprintf('n=%d n',n); fprintf('solution f(' i at (%d,%d):%d) ',N/2,N/2); fprintf('computed=%10.6f reference = %10.6f n',u(n/2,n/2), uex(n/2,n/2)); fprintf('linf err=%10.6e L2 norm err = %10.6e n',errmax, errmax2); NVIDIA Corporation : r (NxN) u (NxN) kx (N) ky (N) 2. : r_d u_d kx_d ky_d d d ky d 3. kx ky 4. FFT FFT FFT GPU C2C NVIDIA Corporation

25 1 3 /* */ float *kx, *ky, *r; kx = (float *) malloc(sizeof(float*n); ky = (float *) malloc(sizeof(float*n); r = (float *) malloc(sizeof(float*n*n); /* cudamalloc GPU */ float *kx_d, *ky_d, *r_d; cudamalloc( (void **) &kx_d, sizeof(cufftcomplex)*n); cudamalloc( (void **) &ky_d, sizeof(cufftcomplex)*n); cudamalloc( (void **) &r_d, sizeof(cufftcomplex)*n*n); cufftcomplex *r_complex_d; cudamalloc( (void **) &r_complex_d, sizeof(cufftcomplex)*n*n); NVIDIA Corporation /* r kx ky */ /* cudamemcpy cpy (target, source, size, direction)*/ cudamemcpy (kx_d, kx, sizeof(float)*n, cudamemcpyhosttodevice); cudamemcpy (ky_d, ky, sizeof(float)*n, cudamemcpyhosttodevice); cudamemcpy (r_d, r, sizeof(float)*n*n, cudamemcpyhosttodevice); /* CUDA FFT (FFTW ) */ cuffthandle plan; cufftplan2d( &plan, N, N, CUFFT_C2C); NVIDIA Corporation

5 /* : block_size_x*block_size_y = G80 512 */ dim3 dimblock(block_size_x, block_size_y); dim3 dimgrid (N/dimBlock.x, N/dimBlock.y); /* N block_size_x block_size_y */ if (N % block_size_x!=0 ) dimgrid.

26 5 /* : block_size_x*block_size_y = G */ dim3 dimblock(block_size_x, block_size_y); dim3 dimgrid (N/dimBlock.x, N/dimBlock.y); /* N block_size_x block_size_y */ if (N % block_size_x!=0 ) dimgrid.x+=1; if (N % block_size_y!=0 ) dimgrid.y+=1 NVIDIA Corporation /* */ real2complex<<<dimgrid, dimblock>>> (r_d, r_complex_d, N); /* FFT */ cufftexecc2c (plan, r_complex_d, r_complex_d, CUFFT_FORWARD); FORWARD) /* */ solve_poisson<<<dimgrid, dimblock>>> (r_complex_d, kx_d, ky_d,n); /* FFT */ cufftexecc2c (plan, r_complex_d, r_complex_d, CUFFT_INVERSE); /* FFT ifft */ scale = 1.f / ( (float N * (float) N ); complex2real_scaled<<<dimgrid, dimblock>>> (r_d, r_complex_d, N, scale); NVIDIA Corporation

27 11 /* cudamemcpy(target, source, size, direction)*/ cudamemcpy (r, r_d, sizeof(float)*n*n, cudamemcpydevicetohost); /* */ cufftdestroy( plan); cudafree(r_complex_d); cudafree(kx_d); NVIDIA Corporation real2complex /* */ global void real2complex (float *a, cufftcomplex *c, int N) { /* NxN idx idy */ int idx = blockid.x*blockdim.x+threadidx.x; int idy = blockid.y*blockdim.y+threadidx.y; if ( idx < N && idy <N) { int index = idx + idy*n; c[index].x = a[index]; c[index].y = 0.f; NVIDIA Corporation

28 solve_poisson global void solve_poisson (cufftcomplex *c, float *kx, float *ky, int N) { /* N N idx idy */ int idx = blockid.x*blockdim.x+threadidx.x; int idy = blockid.y*blockdim.y+threadidx.y; if ( idx < N && idy <N) { int index = idx + idy*n; float scale = - ( kx[idx]*kx[idx] + ky[idy]*ky[idy] ); if ( idx ==0 && idy == 0 ) scale =1.f; scale = 1.f / scale; c[index].x *= scale; c[index].y *= scale; $\hat{\phi} = -\frac{\hat{r}}{k_x^2 + k_y^2}$ NVIDIA Corporation complex2real_scaled /* */ global void complex2real_scaled (cufftcomplex *c, float *a, int N, float scale) { /* N N idx idy */ int idx = blockid.x*blockdim.x+threadidx.x; int idy = blockid.y*blockdim.y+threadidx.y; if ( idx < N && idy <N) { int index = idx + idy*n; a[index] = scale*c[index].x ; NVIDIA Corporation

29 poisson_1 poisson_1.cu nvcc -O3 -o poisson_1 poisson_1.cu \\ -I/usr/local/cuda/include -L/usr/local/cuda/lib -lcufft -lcudart ./poisson_1 -N dimblock dimgrid 2 4 L e-08: : I/O ( ): (32,32) 32) = = MATLAB N=64 (32,32) : = = Linf = e-05 L2 = e-08 NVIDIA Corporation CUDA CUDA_PROFILE: 1 0 CUDA_PROFILE_LOG: filename cuda_profile.log CUDA_PROFILE_CSV: 1 0 NVIDIA Corporation

30 Poisson_1./poisson_1 N1024 method=[ memcopy ] gputime=[ ] method=[ memcopy ] gputime=[ ] method=[ memcopy ] gputime=[ ] method=[ real2complex ] gputime=[ ] cputime=[ ] occupancy=[ ] method=[ c2c_radix4 ] gputime=[ ] cputime=[ ] occupancy=[ ] method=[ transpose ] gputime=[ ] cputime=[ ] occupancy=[ ] method=[ c2c_radix4 ] gputime=[ ] cputime=[ ] occupancy=[ ] method=[ c2c_transpose ] gputime=[ ] cputime=[ ] occupancy=[ ] method=[ solve_poisson] gputime=[ ] cputime=[ ] occupancy=[ ] method=[ c2c_radix4 ] gputime=[ ] cputime=[ ] occupancy=[ ] method=[ c2c_transpose] gputime=[ ] cputime=[ ] occupancy=[ ] method=[ c2c_radix4 ] gputime=[ ] cputime=[ ] occupancy=[ ] method=[ c2c_transpose] gputime=[ ] cputime=[ ] occupancy=[ ] method=[ complex2real_scaled ] gputime=[ ] cputime=[ ] occupancy=[ ] method=[ memcopy ] gputime=[ ] NVIDIA Corporation pinned CPU GPU #ifdef PINNED cudamallochost((void **) &r,sizeof(float)*n*n); // rhs 2 #else r = (float *) malloc(sizeof(float)*n*n); // rhs 2 #endif $./poisson_1 1,024 1,024 : (ms) : (ms) I/O : ( ) (ms) $./poisson_1_pinned pinned 1,024 1,024 : (ms) : (ms) I/O : ( ) (ms) NVIDIA Corporation

31 solve_poisson kx ky ( umul24) NVIDIA Corporation solve_poisson global void solve_poisson (cufftcomplex *c, float *kx, float *ky, int N) { unsigned int idx = umul24(blockidx.x,blockdim.x)+threadidx.x; unsigned int idy = umul24(blockidx.y,blockdim.y)+threadidx.y; // k shared float kx_s[block_width], ky_s[block_height] if (threadix.x < 1) kx_s[threadidx.x] = kx[idx]; if (threadix.y < 1) ky_s[threadidx.y] = ky[idy]; syncthreads(); if ( idx < N && idy <N) { unsigned int index = idx + umul24(idy,n); float scale = - ( kx_s[threadidx.x]*kx_s[threadidx.x] + ky_s[threadidy.y]*ky_s[threadidy.y] ); if ( idx ==0 && idy == 0 ) scale =1.f; scale = 1.f / scale; c[index].x *= scale; c[index].y*= scale; NVIDIA Corporation

32 Poisson_2./poisson_2 N1024 x16 y16 method=[ memcopy ] gputime=[ ] method=[ memcopy ] gputime=[ ] method=[ memcopy ] gputime=[ ] method=[ real2complex ] gputime=[ ] cputime=[ ] occupancy=[ ] (was 1654) method=[ c2c_radix4 ] gputime=[ ] cputime=[ ] occupancy=[ ] method=[ c2c_transpose ] gputime=[ ] cputime=[ ] occupancy=[ ] method=[ c2c_radix4 ] gputime=[ ] cputime=[ ] occupancy=[ ] method=[ c2c_transpose] gputime=[ ] cputime=[ ] occupancy=[ ] method=[ solve_poisson] gputime=[ ] cputime=[ ] occupancy=[ ] (was 6389) method=[ c2c_radix4 ] gputime=[ ] cputime=[ ] occupancy=[ ] method=[ c2c_transpose] gputime=[ ] cputime=[ ] occupancy=[ ] method=[ c2c_radix4 ] gputime=[ ] cputime=[ ] occupancy=[ ] method=[ c2c_transpose] gputime=[ ] cputime=[ ] occupancy=[ ] method=[ complex2real_scaled ] gputime=[ ] cputime=[ ] occupancy=[ ]??????? method=[ memcopy ] gputime=[ ] NVIDIA Corporation complex2real_scaled ( ) global void complex2real_scaled (cufftcomplex *c, float *a, int N, floatscale) { /* NxN idx idy */ int idx = blockid.x*blockdim.x+threadidx.x; kdi did int idy = blockid.y*blockdim.y+threadidx.y; volatile float2 c2; if ( idx < N && idy <N) { int index = idx + idy*n; c2.x= c[index].x; c2.y= c[index].y; a[index] = scale*c2.x c2.x ; ptx NVIDIA Corporation

33 Poisson_3 method=[ memcopy ] gputime=[ ] method=[ memcopy ] gputime=[ ] method=[ memcopy ] gputime=[ ] method=[ real2complex] gputime=[ ] cputime=[ ] occupancy=[ ] method=[ c2c_radix4] gputime=[ ] cputime=[ ] occupancy=[ ] method=[ c2c_transpose] gputime=[ ] cputime=[ ] occupancy=[ ] method=[ c2c_radix4 ] gputime=[ ] cputime=[ ] occupancy=[ ] method=[ c2c_transpose] gputime=[ ] cputime=[ ] occupancy=[ ] method=[ solve_poisson ] gputime=[ ] cputime=[ ] occupancy=[ ] method=[ c2c_radix4 ] gputime=[ ] cputime=[ ] occupancy=[ ] method=[ c2c_transpose] gputime=[ ] cputime=[ ] occupancy=[ ] method=[ c2c_radix4 ] gputime=[ ] cputime=[ ] occupancy=[ ] method=[ c2c_transpose] gputime=[ ] cputime=[ ] occupancy=[ ] method=[ complex2real_scaled ] gputime=[ ] cputime=[ ] occupancy=[ ] method=[ memcopy ] gputime=[ ] NVIDIA Corporation pinned pinned (r2c c2r) 67ms (10.8ms) 63ms ms (7.1ms) 59.4ms +c2r 62.1ms 58.2ms (5.8ms) Tesla C870 pinned : 10.4ms NVIDIA Corporation

34 CUDA 7 NVIDIA Corporation

35 GPU? NVIDIA Corporation ? : GPU GPU M b B >M* b GPU NVIDIA Corporation

36 : NVIDIA Corporation ? GPU GFLOP/s: : G80 GPU MHz DDR 384 * 1800 / 8 = 86.4 GB/s NVIDIA Corporation

$1: global void reduce0(int *g_idata, int *g_odata) { extern shared int sdata[]; // 1 1 unsigned int tid = threadidx.x; unsigned int i = blockidx.x*blockdim.x + threadidx.$

37 1: global void reduce0(int *g_idata, int *g_odata) { extern shared int sdata[]; // 1 1 unsigned int tid = threadidx.x; unsigned int i = blockidx.x*blockdim.x + threadidx.x; sdata[tid] = g_idata[i]; syncthreads(); // for(unsigned int s=1; s < blockdim.x; s *= 2) { if (tid % (2*s) == 0) { sdata[tid] += sdata[tid + s]; syncthreads(); // if (tid == 0) g_odata[blockidx.x] = sdata[0]; NVIDIA Corporation : 1 == 1 2 == 2 3 == 4 4 == 8 ID ID ID ID NVIDIA Corporation

38 1: global void reduce1(int *g_idata, int *g_odata) { extern shared int sdata[]; // 1 unsigned int tid = threadidx.x; unsigned int i = blockidx.x*blockdim.x + threadidx.x; sdata[tid] = g_idata[i]; syncthreads(); // for (unsigned int s=1; s < blockdim.x; s *= 2) { if (tid % (2*s) == 0) { sdata[tid] += sdata[tid + s]; : syncthreads(); // if (tid == 0) g_odata[blockidx.x] = sdata[0]; NVIDIA Corporation M 1: ms GB/s : 128 NVIDIA Corporation

39 2: for (unsigned int s=1; s < blockdim.x; s *= 2) { if (tid %(2*s) == 0) { sdata[tid] += sdata[tid + s]; syncthreads(); for (unsigned int s=1; s < blockdim.x; s *= 2) { int index = 2 * s * tid; if (index < blockdim.x) { sdata[index] += sdata[index + s]; syncthreads(); : NVIDIA Corporation M 1: 2: 2 22 int ms GB/s ms GB/s 2.33 NVIDIA Corporation

: 1 == 8 2 == 4 3 == 2 4 == 1 ID ID ID ID NVIDIA Corporation 2008 75 3: for (unsigned int s=1; s < blockdim.x; s *= 2) { int index = 2 * s * tid; if (index < blockdim.

40 : 1 == 8 2 == 4 3 == 2 4 == 1 ID ID ID ID NVIDIA Corporation : for (unsigned int s=1; s < blockdim.x; s *= 2) { int index = 2 * s * tid; if (index < blockdim.x) { sdata[index] += sdata[index + s]; syncthreads(); ID for (unsigned int s=blockdim.x/2; s>0; s>>=1) { if (tid < s) { sdata[tid] += sdata[tid + s]; syncthreads(); NVIDIA Corporation

41 4M 1: 2: 3: 2 22 int ms GB/s ms GB/s ms GB/s 2.01 NVIDIA Corporation : for (unsigned int s=blockdim.x/2; s>0; s>>=1) { if (tid <s){ sdata[tid] += sdata[tid + s]; syncthreads();! NVIDIA Corporation

42 4: 1 // 1 unsigned int tid = threadidx.x; x; unsigned int i = blockidx.x*blockdim.x + threadidx.x; sdata[tid] = g_idata[i]; syncthreads(); 2 // // unsigned int tid = threadidx.x; unsigned int i = blockidx.x*(blockdim.x*2) + threadidx.x; sdata[tid] = g_idata[i] + g_idata[i+blockdim.x]; syncthreads(); NVIDIA Corporation M 1: 2: 3: 4: 2 22 int ms GB/s ms GB/s ms GB/s ms GB/s 1.78 NVIDIA Corporation

43 17 GB/s : NVIDIA Corporation s <= 32 1 SIMD s <= 32 syncthreads() if (tid < s) 6 6 NVIDIA Corporation

44 5: for (unsigned int s=blockdim.x/2; s>32; s>>=1) { if (tid < s) sdata[tid] += sdata[tid + s]; syncthreads(); if (tid < 32) { sdata[tid] += sdata[tid + 32]; sdata[tid] += sdata[tid + 16]; sdata[tid] += sdata[tid + 8]; sdata[tid] += sdata[tid + 4]; sdata[tid] += sdata[tid + 2]; sdata[tid] += sdata[tid + 1]; for if NVIDIA Corporation M 1: 2: 3: 4: 2 22 int ms GB/s ms GB/s ms GB/s ms GB/s : ms GB/s 1.8 NVIDIA Corporation

45 GPU :? CUDA C++ NVIDIA Corporation template <unsigned int blocksize> global void reduce5(int *g_idata, int *g_odata) NVIDIA Corporation

46 6: if (blocksize >= 512) { if (tid < 256) { sdata[tid] += sdata[tid + 256]; syncthreads(); if (blocksize >= 256) { if (tid < 128) { sdata[tid] += sdata[tid + 128]; syncthreads(); if (blocksize >= 128) { if (tid < 64) { sdata[tid] += sdata[tid + 64]; syncthreads(); if (tid < 32) { if (blocksize >= 64) sdata[tid] += sdata[tid + 32]; if (blocksize >= 32) sdata[tid] += sdata[tid + 16]; if (blocksize >= 16) sdata[tid] += sdata[tid + 8]; if (blocksize >= 8) sdata[tid] += sdata[tid + 4]; if (blocksize >= 4) sdata[tid] += sdata[tid + 2]; if (blocksize >= 2) sdata[tid] += sdata[tid + 1]; : NVIDIA Corporation ? 10 switch switch (threads) { case 512: reduce5<512><<< dimgrid, dimblock, smemsize >>>(d_idata, d_odata); break; case 256: reduce5<256><<< dimgrid, dimblock, smemsize >>>(d_idata, d_odata); break; case 128: reduce5<128><<< dimgrid, dimblock, smemsize >>>(d_idata, d_odata); break; case 64: reduce5< 64><<< dimgrid, dimblock, smemsize >>>(d_idata, d_odata); break; case 32: reduce5< 32><<< dimgrid, dimblock, smemsize >>>(d_idata, d_odata); break; case 16: reduce5< 16><<< dimgrid, dimblock, smemsize >>>(d_idata, d_odata); break; case 8: reduce5< 8><<< dimgrid, dimblock, smemsize >>>(d_idata, d_odata); break; case 4: reduce5< 4><<< dimgrid, dimblock, smemsize >>>(d_idata, d_odata); break; case 2: reduce5< 2><<< dimgrid, dimblock, smemsize >>>(d_idata, d_odata); break; case 1: reduce5< 1><<< dimgrid, dimblock, smemsize >>>(d_idata, d_odata); break; NVIDIA Corporation

47 4M 1: 2: 3: 4: $2^{22}$ int ms GB/s ms GB/s ms GB/s ms GB/s : ms GB/s 1.8 6: ms GB/s 1.41 NVIDIA Corporation Log(N) S $N/2^S$ O(log N) $N = 2^D$ $\sum_{S \in [1..D]} 2^{D-S} = N-1$ O(N): P P O(N/P + log N) O(N) N=P O(log N) NVIDIA Corporation

48 x : O(N) O(log N) O(N log N): O(N/log N) O(log N) O(N/log N) O(log N) = O((N/log N) * log N) = O(N) NVIDIA Corporation O(log n) 1 1,024 2, G ,024 4,096 NVIDIA Corporation

49 7: 2 unsigned int tid = threadidx.x; unsigned int i = blockidx.x*(blockdim.x*2) + threadidx.x; sdata[tid] = g_idata[i] + g_idata[i+blockdim.x]; syncthreads(); while unsigned int tid = threadidx.x; unsigned int i = blockidx.x*(blocksize*2) + threadidx.x; unsigned int gridsize = blocksize*2*griddim.x; sdata[tid] = 0; do { sdata[tid] += g_idata[i] + g_idata[i+blocksize]; i += gridsize; while (i < n); syncthreads(); NVIDIA Corporation M 1: 2: 3: 4: 2 22 int ms GB/s ms GB/s ms GB/s ms GB/s : ms GB/s 1.8 6: 7: ms GB/s ms GB/s M 7: 72 GB/s! : 30! NVIDIA Corporation

$template <unsigned int blocksize> global void reduce6(int *g_idata, int *g_odata, unsigned int n) { extern shared int sdata[]; unsigned int tid = threadidx.x; unsigned int i = blockidx.$

50 template <unsigned int blocksize> global void reduce6(int *g_idata, int *g_odata, unsigned int n) { extern shared int sdata[]; unsigned int tid = threadidx.x; unsigned int i = blockidx.x*(blocksize*2) + tid; unsigned int gridsize = blocksize*2*griddim.x; sdata[tid] = 0; do { sdata[tid] += g_idata[i] + g_idata[i+blocksize]; i += gridsize; while (i < n); syncthreads(); if (blocksize >= 512) { if (tid < 256) { sdata[tid] += sdata[tid + 256]; syncthreads(); if (blocksize >= 256) { if (tid < 128) { sdata[tid] += sdata[tid + 128]; syncthreads(); if (blocksize >= 128) { if (tid < 64) { sdata[tid] += sdata[tid + 64]; syncthreads(); if (tid < 32) { if (blocksize >= 64) sdata[tid] += sdata[tid + 32]; if (blocksize >= 32) sdata[tid] += sdata[tid + 16]; if (blocksize >= 16) sdata[tid] += sdata[tid + 8]; if (blocksize >= 8) sdata[tid] += sdata[tid + 4]; if (blocksize >= 4) sdata[tid] += sdata[tid + 2]; if (blocksize >= 2) sdata[tid] += sdata[tid + 1]; if (tid == 0) g_odata[blockidx.x] = sdata[0]; NVIDIA Corporation NVIDIA Corporation

Corporation Copyright 2008 NVIDIA Corporation.All rights reserved.

52 NVIDIA NVIDIA NVIDIA Corporation NVIDIA Corporation NVIDIA Corporation NVIDIA Corporation NVIDIA NVIDIA CUDA Tesla NVIDIA Corporation Copyright 2008 NVIDIA Corporation.All rights reserved. NVIDIA Corporation 2701 San Tomas Expressway Santa Clara, CA

GPU CUDA CUDA 2010/06/28 1

GPU CUDA CUDA 2010/06/28 1 GPU NVIDIA Mark Harris, Optimizing Parallel Reduction in CUDA http://developer.download.nvidia.com/ compute/cuda/1_1/website/data- Parallel_Algorithms.html#reduction CUDA SDK