untitled

Size: px

Start display at page:

Download "untitled"

けいしょうひでやま
4 years ago
Views:

1 CUDA Vol I: CUDA : NVIDIA Q2 2008

2 GPU...1 CUDA...10 CUDA G8x CUDA CUBLAS CUFFT CUDA CUDA CUDA Fortran CUDA API CUDA...176

3 GPU : Connection Machine MasPar Cray PC P-RAM V-RAM NVIDIA Corporation

4 7 CM-1 MasPar 200 Beowulf Legion NVIDIA Corporation GPU GPU PC NVIDIA Corporation

5 CUDA CUDA Compute Unified Device Architecture GPU NVIDIA NVIDIA GPU C - NVIDIA Corporation CUDA: GPU CPU 1 GPU GPU NVIDIA Corporation

6 GPU NVIDIA Corporation GPU : PDE N : : NVIDIA Corporation

7 Matlab N LIBOR Cmatch NVIDIA Corporation CUDA

8 CPU+GPU CPU GPU DRAM NVIDIA Corporation CUDA 11 CUDA CPU CUDA CUDA CPU : GPU CPU NVIDIA Corporation

9 CUDA ID NVIDIA Corporation : CUDA NVIDIA Corporation

10 GPU NVIDIA Corporation NVIDIA Corporation

11 threadidx.x ID blockidx.x ID blockdim.x blockidx.x blockdim.x = 5 threadidx.x blockidx.x*blockdim.x + threadidx.x NVIDIA Corporation ID ID: 1 2 ID: PDE NVIDIA Corporation

12 CUDA NVIDIA Corporation G SM NVIDIA Corporation

13 NVIDIA Corporation NVIDIA Corporation

14 GPGPUCUDA C NVIDIA Corporation CUDA?? N/A NVIDIA Corporation

15 CUDA CUBA GPU GPU GPU GPU CUDA : API NVIDIA Corporation

16 CPU GPU CPUGPU GPU DRAM NVIDIA Corporation CPU cudamalloc(void ** pointer, size_t nbytes) cudamemset(void * pointer, int value, size_t count) cudafree(void* pointer) int n = 1024; int nbytes = 1024*sizeof(int); int *d_a = 0; cudamalloc( (void**)&d_a, nbytes ); cudamemset( d_a, 0, nbytes); cudafree(d_a); NVIDIA Corporation

17 cudamemcpy(void *dst, void *src, size_t nbytes, enum cudamemcpykind direction); direction src dst CPU : CUDA enum cudamemcpykind cudamemcpyhosttodevice cudamemcpydevicetohost cudamemcpydevicetodevice NVIDIA Corporation CUDA CUDA Correct! solutionsolution NVIDIA Corporation

18 : Windows Microsoft Visual Studio <>.sln 4 Release Debug EmuRelease EmuDebug EmuDebug global device printf 1GPU1CPU GPU NVIDIA Corporation : Linux nvcc <filename>.cu [-o <executable>] nvcc -g <filename>.cu GPU nvcc -deviceemu <filename>.cu CPU nvcc -deviceemu -g <filename>.cu CPU gdb linux NVIDIA Corporation

19 1: gcudamallocandmemcpy Part1: d_a d_b Part2: h_a d_a Part3: d_ad_b Part4: d_bh_a Part5: d_a d_b NVIDIA Corporation GPU C GPU void varargs static CPUGPU NVIDIA Corporation

20 global : CPU GPU void device : GPU CPU host : CPU host device : CPUGPU NVIDIA Corporation C kernel<<<dim3 grid, dim3 block>>>( ) <<< >>> : x y : x y z dim3 grid(16, 16); dim3 block(16,16); kernel<<<grid, block>>>(...); kernel<<<32, 512>>>(...); NVIDIA Corporation

21 CUDA global device dim3 griddim; 2 dim3 blockdim; dim3 blockidx; dim3 threadidx; NVIDIA Corporation global void minimal( int* d_a) { *d_a = 13; } global void assign( int* d_a, int value) { int idx = blockdim.x * blockidx.x + threadidx.x; d_a[idx] = value; } NVIDIA Corporation

22 : N b N=16 blockdim=4 4 blockidx.x=0 blockdim.x=4 threadidx.x=0,1,2,3 idx=0,1,2,3 blockidx.x=1 blockdim.x=4 threadidx.x=0,1,2,3 idx=4,5,6,7 blockidx.x=2 blockdim.x=4 threadidx.x=0,1,2,3 idx=8,9,10,11 blockidx.x=3 blockdim.x=4 threadidx.x=0,1,2,3 idx=12,13,14,15 idx=0,1,2,3 idx=4,5,6,7 idx=8,9,10,11 idx=12,13,14,15 int idx = blockdim.x * blockidx.x + threadidx.x; threadidx : blockdim 32 NVIDIA Corporation : CPU void increment_cpu(float *a, float b, int N) { for (int idx = 0; idx<N; idx++) a[idx] = a[idx] + b; } CUDA global void increment_gpu(float *a, float b, int N) { int idx = blockidx.x * blockdim.x + threadidx.x; if (idx < N) a[idx] = a[idx] + b; } void main() {... increment_cpu(a, b, N); } void main() { dim3 dimblock (blocksize); dim3 dimgrid( ceil( N / (float)blocksize) ); increment_gpu<<<dimgrid, dimblock>>>(a, b, N); } NVIDIA Corporation

23 2D global void assign2d(int* d_a, int w, int h, int value) { int iy = blockdim.y * blockidx.y + threadidx.y; int ix = blockdim.x * blockidx.x + threadidx.x; int idx = iy * w + ix; d_a[idx] = value; }... assign2d<<<dim3(64, 64), dim3(16, 16)>>>(...); NVIDIA Corporation CPU CUDA cudamemcpy() CPU CUDA cudathreadsynchronize() CUDA NVIDIA Corporation

24 : // int numbytes = N * sizeof(float); float* h_a = (float*) malloc(numbytes); // float* d_a = 0; cudamalloc((void**)&d_a, numbytes); // cudamemcpy(d_a, h_a, numbytes, cudamemcpyhosttodevice); // increment_gpu<<< N/blocksize, blocksize>>>(d_a, b, N); // cudamemcpy(h_a, d_a, numbytes, cudamemcpydevicetohost); // cudafree(d_a); NVIDIA Corporation : myfirstkernel Part1: d_a Part2: 1-D 1-D Part3: d_a idx = blockidx.x*blockdim.x + threadidx.x d_a[idx] = 1000*blockidx.x + threadidx.x Part4: d_a -> h_a Part5: NVIDIA Corporation

25 GPU device cudamalloc device : shared : 5 NVIDIA Corporation global void kernel( ) global void kernel( ) { { shared float sdata[256]; extern shared float sdata[]; } } int main(void) int main(void) { { kernel<<<nblocks,blocksize>>>( ); smbytes = blocksize*sizeof(float); } kernel<<<nblocks, blocksize, smbytes>>>( ); } NVIDIA Corporation

26 CPUGPU [u]char[1..4], [u]short[1..4], [u]int[1..4], [u]long[1..4], float[1..4] x y z w: uint4 param; int y = param.y; dim3 uint3 (1,1,1) NVIDIA Corporation GPU void syncthreads(); RAW WAR WAW NVIDIA Corporation

27 GPU Compute capability 1.1 G80 = Compute capability 1.0 G84/G86/G92 = Compute capability 1.1 AND XOR NVIDIA Corporation CPUCUDA CUDA cudaerror_t cudaerror_t cudagetlasterror(void) char* cudageterrorstring(cudaerror_t code) printf("%s\n", cudageterrorstring( cudagetlasterror() ) ); NVIDIA Corporation

28 3: d_a {a 0, a 1,, a n-1 } d_b {a n-1, a n-2,, a 0 } greversearray_singleblock 1 N = numthreads = 256 Part1 : greversearrayblock() greversearrayblock() d_a d_b NVIDIA Corporation : d_a {a 0, a 1,, a n-1 } d_b {a n-1, a n-2,, a 0 } greversearray_multiblock 256 N N/256 Part1: Part2: reversearrayblock() NVIDIA Corporation

CUDA NVIDIA Corporation 2008 53 nvcc PTX

29 CUDA NVIDIA Corporation nvcc PTX float4 me = gx[gtid]; me.x += me.y * me.z; EDG GPUCPU Open64 GPU PTX Parallel Thread execution (PTX) ISA ld.global.v4.f32 {$f1,$f3,$f5,$f7}, [$r9+0]; mad.f32 $f1, $f5, $f3, $f1; NVIDIA Corporation

30 CUDA nvcc nvcc cudacc g++ cl nvcc CCPU PTX CUDA CUDAcuda CUDA cudart API CUDA NVIDIA Corporation

31 G8x NVIDIA Corporation GPU GPU GPU NVIDIA Corporation

32 vs. = NVIDIA Corporation SDKMatrix Transpose NVIDIA Corporation

33 GPU NVIDIA Corporation G8x

34 : CUDA CUDA CPU : GPU : SIMD : 2 : 1 : GPU1CUDA NVIDIA Corporation G NVIDIA Corporation

35 TPC NVIDIA Corporation NVIDIA Corporation

36 NVIDIA Corporation

37 4GB/s PCIe x vs.76 GB/s Tesla C870 NVIDIA Corporation cudamallochost() cudamemcpy 3.2 GB/s PCIe x GB/s PCIe x CUDA SDKbandwidthTest NVIDIA Corporation

38 C cudamallochost a os CUDA CPU = API: 0 = cudamemcpyasync(dst, src, size, direction, 0); NVIDIA Corporation Compute capability 1.1G84 CUDAv1.1 API cudastreamcreate(&stream1); cudastreamcreate(&stream2); t t cudamemcpyasync(dst, src, size, dir, stream1); kernel<<<grid, block, 0, stream2>>>( ); cudastreamquery(stream2); NVIDIA Corporation

39 G8x GPU NVIDIA Corporation NVIDIA Corporation

40 G8x : NVIDIA Corporation nvcc ptx : ld.global.f32 $f1, [$rd4+0]; // id:74 st.global.f32 [$rd4+0], $f2; // id:75 ld.global.v2.f32 {$f3,$f5}, [$rd7+0]; // st.global.v2.f32 [$rd7+0], {$f4,$f6}; // ld.global.v4.f32 {$f7,$f9,$f11,$f13}, [$rd10+0]; // st.global.v4.f32 [$rd10+0], {$f8,$f10,$f12,$f14};,, // NVIDIA Corporation

41 16 : : 64 - int float int2 float int4 float4 k k k k : NVIDIA Corporation : float NVIDIA Corporation

42 : float 64 NVIDIA Corporation : : : float 3M12MB 10,000 12, µs 357µs 3,494µs NVIDIA Corporation

43 : NVIDIA Corporation float3 global void accessfloat3(float3 *d_in, float3 *d_out) { int index = blockidx.x * blockdim.x + threadidx.x; float3 a = d_in[index]; a.x += 2; a.y += 2; a.z += 2; d_out[index] = a; } NVIDIA Corporation

: float3 float3 12 3 sizeof(float3) 4 8 16 364B NVIDIA

44 : float3 float sizeof(float3) B NVIDIA Corporation Float NVIDIA Corporation

45 : float3 sizeof(float3)*( /) ( / ) 3 : 0 ( /) 2*( /) float3 (float3*) ID NVIDIA Corporation float3 global void accessint3shared(float *g_in, float *g_out) { int index = 3 * blockidx.x * blockdim.x + threadidx.x; shared float s_data[256*3]; s_data[threadidx.x] = g_in[index]; s_data[threadidx.x+256] = g_in[index+256]; s_data[threadidx.x+512] = g_in[index+512]; syncthreads(); float3 a = ((float3*)s_data)[threadidx.x]; a.x += 2; a.y += 2; a.z += 2; ((float3*)s_data)[threadidx.x] = a; syncthreads(); g_out[index] = s_data[threadidx.x]; g_out[index+256] = s_data[threadidx.x+256]; g_out[index+512] = s_data[threadidx.x+512]; } NVIDIA Corporation

46 : : : float 3M (12MB) 10,000 12, float 356µs 357µs 3,494µs 4, float3 3,302µs float3 359µs float3 NVIDIA Corporation : AoS Array of Structure: SoA Structure of Array: SoA : align(x)x = AoS SoA NVIDIA Corporation

47 : AoS SoA SoA : SDKAligned Types NVIDIA Corporation timestamp gld_incoherent gld_coherent gst_incoherent gst_coherent local_load local_store branch divergent_branch instructions warp_serialize cta_launched NVIDIA Corporation

48 CUDA_PROFILE : 1 0 CUDA_PROFILE_LOG :./cuda_profile.log CUDA_PROFILE_CSV : 1 0 CUDA_PROFILE_CONFIG : 4 config NVIDIA Corporation : 00 NVIDIA Corporation

49 Visual Profiler NVIDIA Corporation : NVIDIA Corporation

50 SDKMatrix Transpose NVIDIA Corporation NVIDIA Corporation

51 stride == 1 1:1 NVIDIA Corporation way stride == 2 8way stride == 8 NVIDIA Corporation

52 SDK warp_serialize : = NVIDIA Corporation : NVIDIA Corporation

53 CUDA CPU fetch NVIDIA Corporation NVIDIA Corporation

54 2 CUDA 1 CUDA CUDA float float NVIDIA Corporation CUDA CPU CUDA tex1dfetch() tex1d() tex2d() tex3d() NVIDIA Corporation

55 = NVIDIA Corporation

56 / 1 / > 2 1 syncthreads() ,000 NVIDIA Corporation RAW Read-After-Write 11 CUDA: PTX: x = y + 5; z = x + 3; s_data[0] += 3; add.f32 $f3, $f1, $f2 add.f32 $f5, $f3, $f4 ld.shared.f32 $f3, [$r31+0] add.f32 $f3, $f3, $f % NVIDIA Corporation

57 SM SM 8,192 SM 16KB --ptxas-options=-v nvcc -maxrregcount=N N = LMEM - LMEM NVIDIA Corporation cubin.cubin code architecture {sm_10} abiversion {0} modname {cubin} code { name = BlackScholesGPU lmem = 0 smem = 68 reg = 20 bar = 0 bincode { 0xa x x40024c09 0x NVIDIA Corporation

58 CUDA Occupancy Calculator NVIDIA Corporation == 1==1 : NVIDIA Corporation

59 != NVIDIA Corporation GPU GPU 1 : FFTW ATLAS Experiment NVIDIA Corporation

60 CUDA 1 : Tesla C GHz NVIDIA Corporation

61 NVIDIA Corporation int float add shift min max float mul mad: 14 int multiply (*) int multiply mul24() / umul24() 2 2 : n 2 foo % n == foo & (n-1) NVIDIA Corporation

62 sin cos116 : rcp() sin() exp() y / x == rcp(x) * y 120 sqrt(x) == x * rsqrt(x) 120 NVIDIA Corporation func(): ISA ISA : sin(x), exp(x), pow(x,y) func() : 5ulp : sin(x), exp(x), pow(x,y) -use_fast_math func() func() NVIDIA Corporation

63 GPUCPU : CPU 0.5ulp 80 NVIDIA Corporation (x+y)+z == x+(y+z) x = y = z = 1 GPU CUDA NVIDIA Corporation

64 G8x SSE IBMAltivec Cell SPE Format IEEE 754 IEEE 754 IEEE 754 IEEE 754 FADD FMUL inf -inf 1,000 1,000 Na log2(x) 2^x NVIDIA Corporation G8x IEEE 754 IEEE 0.5 ulp FMAD 2 ulp NVIDIA Corporation

65 float G8x float float f foo = bar * 0.123; // foo = bar * 0.123f; // float float foo = sin(bar); // foo = sinf(bar); // float NVIDIA Corporation ID if (threadidx.x > 2) { } if (threadidx.x / WARP_SIZE > 2) { } NVIDIA Corporation

66 GPU NVIDIA Corporation CUDA

67 CUDA 2 CUBLAS: BLAS CUFFT: FFT NVIDIA Corporation CUBLAS CUDA BLAS Basic Linear Algebra Subprograms: API CUDA GPU CUBLAS GPU CUBLAS GPU NVIDIA Corporation

68 BLAS 1 - O(N) 2 - O(N 2 ) 3 - O(N 3 ) 1 CGEMM BLAS CUBLAS NVIDIA Corporation CUBLAS CUBLAS cublas.h cublas + BLAS cublassgemm CUBLAS CUBLAS CUBLAS CCUDA CUDA C C++ NVIDIA Corporation

CUBLAS SGEMM NVIDIA Corporation 2008 133 cublasinit() cublasshutdown() cublasstatus

69 CUBLAS SGEMM NVIDIA Corporation cublasinit() cublasshutdown() cublasstatus cublasinit() CUBLAS GPU CUBLAS API cublasstatus cublasshutdown() CUBLAS CPU GPU NVIDIA Corporation

70 cublasgeterror() cublasalloc() cublasfree() cublasstatus cublasgeterror() CUBLAS CUBLAS_STATUS_SUCCESS cublasstatus cublasalloc(int n, int elemsize, void **devptr) ngpu elemsize cudamalloc()devptr devptr cublasstatus cublasfree(const void *devptr) GPUdevPtr NVIDIA Corporation cublassetvector() cublasgetvector() cublasstatus cublassetvector(int n, int elemsize, const void *x, int incx, void *y, int incy) CPUxn GPU y elemsize x y incx incy cublasstatus cublasgetvector(int n, int elemsize, const void *x, int incx, void *y, int incy) GPUxn CPU y NVIDIA Corporation

71 cublassetmatrix() cublasgetmatrix() cublasstatus cublassetmatrix(int rows, int cols, int elemsize, const void *A, int lda, void *B, int ldb) CPUArows*cols GPU B elemsize A lda B ldb cublasstatus cublasgetmatrix(int rows, int cols, int elemsize, const void *A, int lda, void *B, int ldb) GPUArows*cols CPU B NVIDIA Corporation FORTRAN CUBLAS FortranC CUBLAS fortran.c f t NVIDIA Corporation

72 FORTRAN CUBLAS 2 fortran.c CUBLAS_USE_THUNKING GPU CPU GPU GPU CPU GPU CUBLAS CPU GPGPU BLAS GPGPU CUBLAS_ALLOC CUBLAS_FREE GPUCPUALLOC CUBLAS CUBLAS_SET_VECTOR CUBLAS_GET_VECTOR CUBLAS_SET_MATRIX CUBLAS_GET_MATRIX NVIDIA Corporation FORTRAN 77 program matrixmod implicit none integer M, N parameter (M=6, N=5) real*4 a(m,n) integer i, j do j = 1, N do i = 1, M a(i,j) = (i-1) * M + j enddo enddo call modify (a, M, N, 2, 3, 16.0, 12.0) subroutine modify (m, ldm, n, p, q, alpha, beta) implicit none integer ldm, n, p, q real*4 m(ldm,*), alpha, beta external sscal call sscal (n-p+1, alpha, m(p,q), ldm) call sscal (ldm-p+1, beta, m(p,q), 1) return end do j = 1, N do i = 1, M write(*,"(f7.0$)") a(i,j) enddo write (*,*) "" enddo stop end NVIDIA Corporation

73 FORTRAN 77 : program matrixmod implicit none integer M, N, sizeof_real, devptra parameter (M=6, N=5, sizeof_real=4) real*4 a(m,n) integer i, j, stat external cublas_init, cublas_set_matrix,cublas_get_matrix external cublas_shutdown, cublas_alloc integer cublas_alloc do j = 1, N do i = 1, M a(i,j) = (i-1) * M + j enddo enddo call cublas_init stat = cublas_alloc(m*n, sizeof_real, devptra) if (stat .ne. 0) then write(*,*) "device memory allocation failed" stop endif call cublas_set_matrix (M, N, sizeof_real, a, M, devptra, M) call modify (devptra, M, N, 2, 3, 16.0, 12.0) call cublas_get_matrix (M, N, sizeof_real, devptra, M, a, M) call cublas_free(devptra) call cublas_shutdown do j = 1, N do i = 1, M write(*,"(f7.0$)") a(i,j) enddo write (*,*) "" enddo stop end #define IDX2F(i,j,ld) ((((j)-1)*(ld))+((i)-1)) subroutine modify (devptrm, ldm, n, p, q, alpha, beta) implicit none integer ldm, n, p, q integer sizeof_real, devptrm parameter (sizeof_real=4) real*4 alpha, beta call cublas_sscal (n-p+1, alpha, devptrm+idx2f(p,q,ldm)*sizeof_real,ldm) call cublas_sscal (ldm-p+1, beta, devptrm+idx2f(p,q,ldm)*sizeof_real,1) return end 72 NVIDIA Corporation CUFFT FFT Fast Fourier Transform: CUFFT CUDA FFT NVIDIA GPUFFT GPUFFT GPU NVIDIA Corporation

74 M 2D 3D[2,16384] NVIDIA Corporation CUFFT cuffthandle CUFFT cufftresults API CUFFT_SUCCESS CUFFT_INVALID_PLAN NVIDIA Corporation

75 CUFFT_C2C CUFFT_C2R CUFFT_R2C CUFFT_FORWARD (-1) CUFFT_BACKWARD (1) cufftcomplex FFT N -> N/2+1 N0 N1 Nn -> N0 N1 (Nn/2+1) NVIDIA Corporation D 3DCUFFT C FORTRAN MATLAB CUFFT IFFT(FFT(A))= length(a)*a CUFFT API FFTWFFT FFTGPU CUFFT NVIDIA Corporation

76 cufftplan1d() cufftresult cufftplan1d(cuffthandle *plan, int nx, cuffttype type, int batch) 1FFT batch1 CUFFT 1 CUFFT plan nx type batch plan cuffthandle 256 FFT256 CUFFT_C2C nx CUFFT 1 NVIDIA Corporation cufftplan2d() cufftresult cufftplan2d(cuffthandle *plan, int nx, int ny, cuffttype type) 2FFT plan nx ny type cuffthandle X Y CUFFT_C2C plan CUFFT 2 NVIDIA Corporation

77 cufftplan3d() cufftresult cufftplan3d(cuffthandle *plan, int nx, int ny, int nz, cuffttype type) 3FFT 3FFT plan nx ny nz type plan cuffthandle X Y Z CUFFT_C2C C2C CUFFT 3 NVIDIA Corporation cufftdestroy() cufftresult cufftdestroy(cuffthandle plan) CUFFT GPU GPU plan cuffthandle NVIDIA Corporation

78 cufftexecc2c() cufftresult cufftexecc2c(cuffthandle plan, cufftcomplex *idata, cufftcomplex *odata, int direction) CUFFT idata GPU odata idata odata plan cuffthandle idata GPUGPU odata GPU direction CUFFT_FORWARD CUFFT_BACKWARD odata NVIDIA Corporation cufftexecr2c() cufftresult cufftexecr2c(cuffthandle plan, cufftreal *idata, cufftcomplex *odata) CUFFT idata GPU odata idata odata plan cuffthandle idata GPUGPU odata GPU odata NVIDIA Corporation

79 cufftexecc2r() cufftresult cufftexecc2r(cuffthandle plan, cufftcomplex *idata, cufftreal *odata) CUFFT idata GPU GPU idata odata idata odata plan cuffthandle idata GPUGPU odata GPU odata NVIDIA Corporation CUFFT FFT 1. CUDA CUFFT CUFFTGPU 1 1 CUFFT 2FFT 1FFT CUFFT API NVIDIA Corporation

80 : 1 #define NX 256 #define BATCH 10 cuffthandle plan; cufftcomplex *data; cudamalloc((void**)&data, sizeof(cufftcomplex)*NX*BATCH); /* 1FFT */ cufftplan1d(&plan, NX, CUFFT_C2C, BATCH); /* CUFFT */ cufftexecc2c(plan, data, data, CUFFT_FORWARD); /* */ cufftexecc2c(plan, data, data, CUFFT_INVERSE); /* : (1) (2) */ /* CUFFT */ cufftdestroy(plan); cudafree(data); NVIDIA Corporation : 2 #define NX 256 #define NY 128 cuffthandle plan; cufftcomplex *idata, *odata; cudamalloc((void**)&idata, sizeof(cufftcomplex)*NX*NY); cudamalloc((void**)&odata, sizeof(cufftcomplex)*NX*NY); /* 1FFT */ cufftplan2d(&plan, NX,NY, CUFFT_C2C); /* CUFFT */ cufftexecc2c(plan, idata, odata, CUFFT_FORWARD); /* */ cufftexecc2c(plan, odata, odata, CUFFT_INVERSE); /* : */ /* CUFFT */ cufftdestroy(plan); cudafree(idata), cudafree(odata); NVIDIA Corporation

81 CUDA Fortran API NVIDIA Corporation

82 CUDA CUDA CUDA 2 2 CUDA NVIDIA Corporation

83 2 CUDA 1 CUDA CUDA 1 2 float float NVIDIA Corporation CUDA CPU CUDA tex1dfetch() tex1d() tex2d() NVIDIA Corporation

84 : int float CUDA cudareadmodeelementtype cudareadmodenormalizedfloat 816int [-1,1][0,1] 0=[0, 1] cudafiltermodepoint cudafiltermodelinear cudaaddressmodeclamp cudaaddressmodewrap NVIDIA Corporation : // texture<unsigned short, 1, cudareadmodenormalizedfloat> texref;... // unsigned short *da = 0; cudamalloc((void**)&da, numbytes); cudamemcpy(da, ha, numbytes, cudamemcpyhosttodevice); // cudabindtexture(null, texref, da); NVIDIA Corporation

85 cudaarray cudachannelformatdesc int x y z w: enum cudachannelformatkind cudachannelformatkindsigned cudachannelformatkindunsigned cudachannelformatkindfloat cudacreatechanneldesc<float>(void); cudacreatechanneldesc<float4>(void); cudamallocarray cudafreearray cudamemcpytoarray cudamemcpyfromarray NVIDIA Corporation : 2 // texture<float, 2, cudareadmodeelementtype> texref;... // CUDA cudachannelformatdesc cf = cudacreatechanneldesc<float>(); cudaarray *texarray = 0; cudamallocarray(&texarray, &cf, dimx, dimy); cudamemcpytoarray(texarray, 0,0, ha, numbytes, cudamemcpyhosttodevice); // texref.normalized = 0; texref.filtermode = cudafiltermodelinear; texref.addressmode = cudaaddressmodeclamp; // cudabindtexturetoarray(texref, texarray); NVIDIA Corporation

86 CUDA CUDA float 816 cudareadmodenormalizedfloat API API API half float16 32 API CUDA NVIDIA Corporation CUDA Fortran

87 Fortran Fortran CUBLAS Fortranpinned Fortran CUDA NVIDIA Corporation SGEMM! 3A B C real, dimension(m1,m1):: A, B, C! #ifdef CUBLAS! CUBLASSGEMM CU! call cublas_sgemm ('n','n',m1,m1,m1,alpha,a,m1,b,m1,beta,c,m1) #else! BLASSGEMM call SGEMM ('n','n',m1,m1,m1,alpha,a,m1,b,m1,beta,c,m1) #endif BLAS g95 -O3 code.f90 -L/usr/local/lib -lblas CUBLASfortran.c NVIDIA: gcc -O3 -DCUBLAS_USE_THUNKING -I/usr/local/cuda/include -c fortran.c g95 -O3 -DCUBLAS code.f90 fortran.o -L/usr/local/cuda/lib -lcublas NVIDIA Corporation

88 pinned pinned PCIe cudamallochost CFortran 2003 iso_c_binding! C C type (C_PTR) type(c_ptr) :: cptr_a, cptr_b, cptr_c! Fortran real, dimension(:,:), pointer :: A, B, C! cudamallochost! Fortraniso_c_binding!C(A(m1,m1)) res = cudamallochost ( cptr_a, m1*m1*sizeof(fp_kind) ) call c_f_pointer ( cptr_a, A, (/ m1, m1 /) )! A! cudamallochost NVIDIA Corporation CUDA FortranCUDA C! Fortran -> C -> CUDA ->C ->Fortran call cudafunction(c,c2,n) /* : Fortran */ extern "C" void cudafunction_(cucomplex *a, cucomplex *b, int *Np) {... int N=*Np; cudamalloc ((void **) &a_d, sizeof(cucomplex)*N); cudamemcpy( a_d, a, sizeof(cucomplex)*N,cudamemcpyhosttodevice); dim3 dimblock(block_size); dim3 dimgrid (N/dimblock.x); if( N % block_size != 0 ) dimgrid.x+=1; square_complex<<<dimgrid,dimblock>>>(a_d,a_d,N); cudamemcpy( b, a_d, sizeof(cucomplex)*N,cudamemcpydevicetohost); cudafree(a_d); } complex_mul: main.f90 Cuda_function.o $(FC) -o complex_mul main.f90 Cuda_function.o -L/usr/local/cuda/lib -lcudart Cuda_function.o: Cuda_function.cu nvcc -c -O3 Cuda_function.cu NVIDIA Corporation

89 CUDA API CUDA CUDA CUDA CUDACPU CUDA SDK asyncapi cudaevent_t start, stop; cudaeventcreate(&start); cudaeventcreate(&stop); cudaeventrecord(start, 0); kernel<<<grid, block>>>(...); cudaeventrecord(stop, 0); cudaeventsynchronize(stop); float et; cudaeventelapsedtime(&et, start, stop); cudaeventdestroy(start); cudaeventdestroy(stop); NVIDIA Corporation CPU GPU cudagetdevicecount( int* count ) cudasetdevice( int device ) cudagetdevice( int *current_device ) cudagetdeviceproperties( cudadeviceprop* prop, int device ) cudachoosedevice( int *device, cudadeviceprop* prop ) GPU 0 1CPU1GPU CPU GPU NVIDIA Corporation

90 CPU CUDA CPUCUDACPU CUDA CPU2 GPUp 3 CUDAp NVIDIA Corporation CUDA

91 OpenGL OpenGL CUDA Direct3D9 gldrawpixels / glteximage2d () NVIDIA Corporation OpenGL CUDA cudaglregisterbufferobject(gluint buffobj); OpenGL OpenGL CUDA cudaglmapbufferobject(void **devptr, GLuint buffobj); CUDA OpenGL cudaglunmapbufferobject(gluint buffobj); cudaglunregisterbufferobject(gluint buffobj); : OpenGL NVIDIA Corporation

92 : CUDA CUDAPBO CUDA unsigned char *p_d=0; cudaglmapbufferobject((void**)&p_d, pbo); preptexture<<<height,width>>>(p_d, time); cudaglunmapbufferobject(pbo); b glbindbuffer(gl_pixel_unpack_buffer_arb, pbo); glbindtexture(gl_texture_2d, texid); gltexsubimage2d(gl_texture_2d, 0, 0,0, 256,256, GL_BGRA, GL_UNSIGNED_BYTE, 0); NVIDIA Corporation : CUDA OpenGL PBO CUDA PBO CUDA CUDAPBO unsigned char *p_d=0; cudaglregisterbufferobject(pbo); cudaglmapbufferobject((void**)&p_d, pbo); postprocess<<<blocks,threads>>>(p_d); cudaglunmapbufferobject(pbo); cudaglunregisterbufferobject(pbo);... NVIDIA Corporation

Corporation Copyright 2008 NVIDIA Corporation.All rights reserved.

94 NVIDIA NVIDIA NVIDIA Corporation NVIDIA Corporation NVIDIA Corporation NVIDIA Corporation NVIDIA NVIDIA CUDA Tesla NVIDIA Corporation Copyright 2008 NVIDIA Corporation.All rights reserved. NVIDIA Corporation 2701 San Tomas Expressway Santa Clara, CA

untitled

untitled GPGPU NVIDIA CUDA Learn More about CUDA - NVIDIA http://www.nvidia.co.jp/object/cuda_education_jp.html NVIDIA CUDA Programming Guide CUDA http://www.sintef.no/upload/ikt/9011/simoslo/evita/2008/seland.pdf