GPU Graphics Processing Units HPC High Performance Computing GPU GPGPU General-Purpose computation on GPU CPU GPU GPU *1 Intel Quad-Core Xeon E5472 3.0 GHz 2 6 MB L2 cache 1600 MHz FSB 80 GFlops 1 nvidia Geforce 8800GTX 300 GFlops CPU GPU GPU 2008 2 GPU 1 TFlops *2 TOP 500 2 2005 6 500 1.2 TFLOPS GPU 3 4 *3 GPU GPU GPGPU GPGPU 1978 Ikonas System 3 1990 GPU 2000 GPGPU GPGPU 4,5 GPGPU HLSL GLSL Cg GPU CUDA Compute unified device architecturenvidia GPU C/CHLSL GLSL *1 nvidia Geforce 8800GTX 7 8 2007 *2 2008 6 16 nvidia GeForce GTX 280 AMD AMD FireStream 9250 1 TFlops *3 BlueGene/L 478 TFLOPS 2007
DirectX OpenGL API CUDA GPU C/C CUDA GPGPU GPGPU II. CUDA III. CUDA IV. CUDA V. VI. CUDA CUDA GPU CUDA 6 GeForce 8 CUDA 1 GPU CUDA nvidia GPU 3 GeForce Quadro GeForce Tesla HPC CUDA Windows XP Windows Vista Linux CUDA Windows Visual Studio Series GeForce Quadro Tesla Products 9800 GX2, 9800 GTX, 9800 GT, 8800 Ultra, 8800 GTX, 8800 GTS, 8800 GT, 8800 GS, 8600 GTS, 8600 GT, 8500 GT, 8400 GS, 8800M GTX, 8800M GTS, 8700M GT, 8600M GT, 8600M GS, 8400M GT, 8400M GS, 8400MG FX5600, FX4600, FX3700, FX1700, FX570, FX370, NVS290, FX3600M, FX1600M, FX570M, FX360M, Quadro Plex 1000Model IV, Quadro Plex 1000Model S4, NVS320M, NVS140M, NVS135M, NVS130M C870, D870, S870
2003 2005 Linux gcc g CUDA Windows CUDA HPC CUDA 2.0 Beta CUDA SDK 3 OS OS NVIDIA Driver for Microsoft Windows XP with CUDA Support (174.55) CUDA Toolkit version 2.0 for Windows XP CUDA SDK version 2.0 for Windows XP CUDA CUDA Toolkit C:\CUDA CUDA SDK CUDA SDK C:\Program Files\NVIDIA Corporation\NVIDIA CUDA SDK CUDA CUDA SDK CUDA 1 main.cu.cu CUDA nvcc Visual Studio 2005 CUDA main.cu C:\Your\Source\Path> nvcc main.cu a.exe CUDA 1 kernel <<< nblocks, nthreads >>> ( ddata ); GPU CPU nvidia C/C CUDA GPU CPU <<<... >>><<<... >>> GPU global void kernel int *data
1 #include <stdio.h>
2
3 __global__ void kernel( int *data )
4 {
5     data[ threadIdx.x ] = threadIdx.x;
6 }
7
8 int main( int argc, char *argv[ ] )
9 {
10     int *ddata, hdata[ 5 ];
11     cudaMalloc( ( void ** )&ddata, sizeof( int ) * 5 );
12
13     dim3 nthreads( 5, 1 );
14     dim3 nblocks( 1, 1 );
15     kernel<<< nblocks, nthreads >>>( ddata );
16
17     cudaMemcpy( hdata, ddata, sizeof( int ) * 5, cudaMemcpyDeviceToHost );
18
19     for( int i = 0 ; i < 5 ; i++ )
20     {
21         printf( "%d ", hdata[ i ] );
22     }
23     printf( "\n" );
24
25     return( 0 );
26 }
III. GPGPU CUDA CUDA CUDA GPU GPU GPU CUDA GPU 1 CUDA CUDA CUDA CPU ( ) CUDA 2
CUDA CUDA C/C 2 2 CPU GPU
__device__ __global__ __host__ __device__ __constant__ __shared__ GPU GPU CPU GPU CPU CPU GPU GPU GPU 2 __global__ void kernel( int *parameter ) 2 __global__ CPU GPU __global__ void kernel( int *parameter ) kernel<<< nblocks, nthreads, nbytes >>>( parameter ); nblocks nthreads nbytes nbytes __syncthreads 3 4 CUDA CUDA Occupancy Calculator GPU 1 __global__ void kernel( int *parameter ) 2 { 3 // ... 4 } 5 6 int main( int argc, char *argv[ ] ) 7 { 8 // ... 9 10 // 11 kernel<<< nblocks, nthreads, nbytes >>>( parameter ); 12 13 // ... 14 }
gridDim blockIdx blockDim threadIdx CUDA GPGPU C A B CUDA 16 A r c $a_{rc}$ CUDA CPU GPU CPU GPU __global__ GPU GPU __device__ CUDA CPU GPU *4 CUDA GPU CUDA GPU __shared__ GPU 3 3 C $c_{rc}$ $c_{rc} = \sum_{k=1}^{c_A} a_{rk} \, b_{kc}$ 1 GPU $r_A$ A $c_A$ A 3 GPU A *4 GPU CPU CPU GPU
1 __global__ void multiply( float *A, float *B, float *C, int ra, int ca )
2 {
3     int c = threadIdx.x + blockIdx.x * blockDim.x;
4     int r = threadIdx.y + blockIdx.y * blockDim.y;
5
6     float sum = 0.0f;
7     for( int k = 0 ; k < ca ; k++ )
8     {
9         sum += A[ r + k * ra ] * B[ k + c * ca ];
10     }
11
12     C[ c * ra + r ] = sum;
13 }
B C threadIdx blockDim blockDim GPU multiply threadIdx GPU threadIdx blockDim 8 2.2 3 3 CPU 4 CUDA CPU GPU CPU GPU 11 13 CPU 16 18 GPU 23 24 CPU GPU cudaMemcpy CPU GPU GPU 27 28 31 3 CUDA III. CUDA Occupancy Calculator GPU CPU 34 39 44 3 4 CPU 3 3 CUDA GPU 4 CUDA
1 int main( int argc, char argv [ ] ) 2 { 3 int ra = 512; // A 4 int ca = 5 1 2 ; // A 5 int rb = ca ; // B 6 int cb = 512; // B 7 float ha, hb, hc ; // C P U 8 float da, db, dc ; // G P U 9 10 // C P U 11 ha = ( float )malloc( ra ca sizeof( float ) ); 12 hb = ( float )malloc( rb cb sizeof( float ) ); 13 hc = ( float )malloc( ra cb sizeof( float ) ); 14 15 // G P U 16 cudamalloc( ( void )&da, ra ca sizeof( float ) ); 17 cudamalloc( ( void )&db, rb cb sizeof( float ) ); 18 cudamalloc( ( void )&dc, ra cb sizeof( float ) ); 19 20 / / 21 22 // CPUGPU 23 cudamemcpy( da, ha, ra ca sizeof( float ), cudamemcpyhosttodevice ) ; 24 cudamemcpy( db, hb, rb cb sizeof( float ), cudamemcpyhosttodevice ) ; 25 26 // GPU 27 dim3 nthreads( 16, 16 ); 28 dim3 nblocks ( ra / nthreads.x, cb / nthreads. y ); 29 30 // G P U C = A B dc 31 multiply<<< nblocks, nthreads >>>( da, db, dc, ra, ca ); 32 33 // GPUCPU 34 cudamemcpy( hc, dc, ra cb sizeof( float ), cudamemcpydevicetohost ) ; 35 36 / hc / 37 38 // CPU GPU 39 cudafree( da ); 40 cudafree( db ); 41 cudafree( dc ); 42 f r e e ( ha ) ; 43 f r e e ( hb ) ; 44 f r e e ( hc ) ; 45 46 return( 0 ); 47 }
5 C A B 5 A B 16 16 A B 12 13 9 10 __shared__ ta tb 15 __syncthreads 17 20 16 16 GPU Bank Conflict Bank Conflict CUDA Programming Guide [8] Bank Conflict
1 __global__ void multiply( float *A, float *B, float *C, int ra, int ca )
2 {
3     int c = threadIdx.x + blockIdx.x * blockDim.x;
4     int r = threadIdx.y + blockIdx.y * blockDim.y;
5
6     float sum = 0.0f;
7     for( int k = 0 ; k < ca ; k += 16 )
8     {
9         __shared__ float ta[16][16];
10         __shared__ float tb[16][16];
11
12         ta[ threadIdx.y ][ threadIdx.x ] = A[ r + ( k + threadIdx.x ) * ra ];
13         tb[ threadIdx.y ][ threadIdx.x ] = B[ ( k + threadIdx.y ) + c * ca ];
14
15         __syncthreads( );
16
17         for( int t = 0 ; t < 16 ; t++ )
18         {
19             sum += ta[ threadIdx.y ][ t ] * tb[ t ][ threadIdx.x ];
20         }
21
22         __syncthreads( );
23     }
24
25     C[ c * ra + r ] = sum;
26 }
C A B A B C 3 5 3 3 Dell Precision Workstation T7400 CPU: Intel Quad Core Xeon 3.20 GHz 2 nvidia Quadro FX5600 4.0 GB RAM, Windows XP SP2 3 3 5 CPU CPU 3 5 A B C 512 512 CPU 404.5 ms. 3 191.6 ms. 5 12.0 ms. 5 CPU 33 3 16 CT MRI CUDA OS: WindowsXP CPU: Intel Quad-Core Xeon 3.20 GHz Memory: 3.0 GB GPU: NVIDIA Quadro FX5600 2 4
CPU CUDA CPU 10 CUDA CUDA 6 CUDA CUDA GPGPU 5,7 GPGPU CUDA GPGPU GPU CPU GPGPU PC GPGPU 1 GPU 32 GPU 2008 6 16 GPU 2008 GPU
CUDA Bank Conflict GPU CUDA CUDA Programming Guide [8] CUDA GPGPU [1] http://www.intel.co.jp/jp/performance/server/xeon/hpcapp.htm [2] TOP 500, http://www.top500.org [3] J. N. England, A system for interactive modeling of physical curved surface objects, Proceedings of SIGGRAPH 78, pp.336 340. 1978 [4] M. J. Harris, G. Coombe, T. Scheuermann, and A. Lastra, Physically-Based Visual Simulation on Graphics Hardware, Proceedings of SIGGRAPH 2002 / Eurographics Workshop on Graphics Hardware 2002, pp.1 10, 2002 [5] J. D. Owens, D. Luebke, N. Govindaraju, M. Harris, J. Krüger, A. E. Lefohn, and T. J. Purcell, A Survey of General-Purpose Computation on Graphics Hardware, Computer Graphics Forum, Vol.26, No.1, pp.80 113, 2007 [6] CUDA ZONE, http://www.nvidia.com/object/cuda_home.html [7] GPGPU, http://www.gpgpu.org/ [8] CUDA Programming Guide, http://www.nvidia.com/object/cuda_develop. html