07-二村幸孝・出口大輔.indd

Save this PDF as:
 WORD  PNG  TXT  JPG

Size: px
Start display at page:

Download "07-二村幸孝・出口大輔.indd"

Transcription

1 GPU Graphics Processing Units HPC High Performance Computing GPU GPGPU General-Purpose computation on GPU CPU GPU GPU *1 Intel Quad-Core Xeon E GHz 2 6 MB L2 cache 1600 MHz FSB 80 GFlops 1 nvidia Geforce 8800GTX 300 GFlops CPU GPU GPU GPU 1 TFlops *2 TOP TFLOPS GPU 3 4 *3 GPU GPU GPGPU GPGPU 1978 Ikonas System GPU 2000 GPGPU GPGPU 4,5 GPGPU HLSL GLSL Cg GPU CUDA Compute unified device architecturenvidia GPU C/CHLSL GLSL *1 nvidia Geforce 8800GTX * nvidia GeForce GTX 280 AMD AMD FireStream TFlops *3 BlueGene/L 478 TFLOPS 2007

2 DirectX OpenGL API CUDA GPU C/C CUDA GPGPU GPGPU II. CUDA III. CUDA IV. CUDA V. VI. CUDA CUDA GPU CUDA 6 GeForce 8 CUDA 1 GPU CUDA nvidia GPU 3 GeForce Quadro GeForce Tesla HPC CUDA Windows XP Windows Vista Linux CUDA Windows Visual Studio Series GeForce Quadro Tesla Products 9800 GX2, 9800 GTX, 9800 GT, 8800 Ultra, 8800 GTX, 8800 GTS, 8800 GT, 8800 GS, 8600 GTS, 8600 GT, 8500 GT, 8400 GS, 8800M GTX, 8800M GTS, 8700M GT, 8600M GT, 8600M GS, 8400M GT, 8400M GS, 8400MG FX5600, FX4600, FX3700, FX1700, FX570, FX370, NVS290, FX3600M, FX1600M, FX570M, FX360M, Quadro Plex 1000Model IV, Quadro Plex 1000Model S4, NVS320M, NVS140M, NVS135M, NVS130M C870, D870, S870

3 Linux gcc g CUDA Windows CUDA HPC CUDA 2.0 Beta CUDA SDK 3 OS OS NVIDIA Driver for Microsoft Windows XP with CUDA Support (174.55) CUDA Toolkit version 2.0 for Windows XP CUDA SDK version 2.0 for Windows XP CUDA CUDA Toolkit C:\CUDA CUDA SDK CUDA SDK C:\Program Files\NVIDIA Corporation\NVIDIA CUDA SDK CUDA CUDA SDK CUDA 1 main.cu.cu CUDA nvcc Visual Studio 2005 CUDA main.cu C:\Your\Source\Path> nvcc main.cu a.exe CUDA 1 kernel <<< nblocks, nthreads >>> ( ddata ); GPU CPU nvidia C/C CUDA GPU CPU <<<... >>><<<... >>> GPU global void kernel int *data

4 1 #i n c l u d e <stdio.h> 2 3 global void kernel( int data ) 4 { 5 data [ threadidx.x ] = threadidx.x; 6 } 7 8 int main( int argc, char argv [ ] ) 9 { 10 int ddata, hdata [ 5 ] ; 11 cudamalloc( ( void )&ddata, sizeof( int ) 5 ); dim3 nthreads( 5, 1 ); 14 dim3 nblocks( 1, 1 ); 15 k e r n e l <<< nblocks, nthreads >>>( ddata ) ; cudamemcpy( hdata, ddata, sizeof( int ) 5, cudamemcpydevicetohost ) ; for( int i = 0 ; i < 5 ; i++ ) 20 { 21 p r i n t f ( %d, hdata [ i ] ) ; 22 } 23 p r i n t f ( \n ); return( 0 ); 26 } III. GPGPU CUDA CUDA CUDA GPU GPU GPU CUDA GPU 1 CUDA CUDA CUDA CPU ( ) CUDA 2

5 CUDA CUDA C/C 2 2 CPU GPU

6 device global host device constant shared GPU GPU CPU GPU CPU CPU GPU GPU GPU 2 global void kernel ( int *parameter ) 2 global CPU GPU global void kernel ( int *parameter ) kernel<<< nblocks, nthreads, nbytes >>>( parameter ); nblocks nthreads nbytes nbytes syncthreads 3 4 CUDA CUDA Occupancy Calculator GPU 1 global void kernel( int parameter ) 2 { 3 // 4 } 5 6 int main( int argc, char argv [ ] ) 7 { 8 // // 11 k e r n e l <<< nblocks, nthreads, nbytes >>>( parameter ); // }

7 griddim blockidx blockdim threadidx CUDA GPGPU C A B CUDA 16 A r c a rc CUDA CPU GPU CPU GPU global GPU GPU device CUDA CPU GPU *4 CUDA GPU CUDA GPU shared GPU 3 3 C c rc c rc = ca k=1 a rk b kc 1 GPU ra A ca A 3 GPU A *4 GPU CPU CPU GPU

8 1 global void multiply( float A, float B, float C, int ra, int ca ) 2 { 3 int c = threadidx.x + blockidx.x blockdim. x ; 4 int r = threadidx.y + blockidx.y blockdim. y ; 5 6 float sum = 0.0 f ; 7 for( int k = 0 ; k < ca ; k++ ) 8 { 9 sum += A[ r + k ra ] B[k + c ca ] ; 10 } C[ c ra + r ] = sum ; 13 } B C threadidxblockdim blockdimgpu multiply threadidx GPU threadidxblockdim CPU 4 CUDA CPU GPU CPU GPU CPU GPU CPU GPU cudamemcpy CPU GPU GPU CUDA III. CUDA Occupancy Calculator GPU CPU CPU 3 3 CUDA GPU 4 CUDA

9 1 int main( int argc, char argv [ ] ) 2 { 3 int ra = 512; // A 4 int ca = ; // A 5 int rb = ca ; // B 6 int cb = 512; // B 7 float ha, hb, hc ; // C P U 8 float da, db, dc ; // G P U 9 10 // C P U 11 ha = ( float )malloc( ra ca sizeof( float ) ); 12 hb = ( float )malloc( rb cb sizeof( float ) ); 13 hc = ( float )malloc( ra cb sizeof( float ) ); // G P U 16 cudamalloc( ( void )&da, ra ca sizeof( float ) ); 17 cudamalloc( ( void )&db, rb cb sizeof( float ) ); 18 cudamalloc( ( void )&dc, ra cb sizeof( float ) ); / / // CPUGPU 23 cudamemcpy( da, ha, ra ca sizeof( float ), cudamemcpyhosttodevice ) ; 24 cudamemcpy( db, hb, rb cb sizeof( float ), cudamemcpyhosttodevice ) ; // GPU 27 dim3 nthreads( 16, 16 ); 28 dim3 nblocks ( ra / nthreads.x, cb / nthreads. y ); // G P U C = A B dc 31 multiply<<< nblocks, nthreads >>>( da, db, dc, ra, ca ); // GPUCPU 34 cudamemcpy( hc, dc, ra cb sizeof( float ), cudamemcpydevicetohost ) ; / hc / // CPU GPU 39 cudafree( da ); 40 cudafree( db ); 41 cudafree( dc ); 42 f r e e ( ha ) ; 43 f r e e ( hb ) ; 44 f r e e ( hc ) ; return( 0 ); 47 }

10 5 C A B 5 A B A B shared ta tb 15 syncthreads GPU Bank Conflict Bank Conflict CUDA Programming Guide [8] Bank Conflict 1 global void multiply( float A, float B, float C, int ra, int ca ) 2 { 3 int c = threadidx.x + blockidx.x blockdim. x ; 4 int r = threadidx.y + blockidx.y blockdim. y ; 5 6 float sum = 0.0 f ; 7 for( int k = 0 ; k < ca ; k += 16 ) 8 { 9 shared float ta[16][16]; 10 shared float tb[16][16]; ta[ threadidx.y ][ threadidx.x] = A[ r + ( k + threadidx.x ) ra ] ; 13 tb[ threadidx.y ][ threadidx.x] = B[( k + threadidx.y ) + c ca ] ; syncthreads( ); for( int t = 0 ; t < 16 ; t++ ) 18 { 19 sum += ta [ threadidx. y ] [ t ] tb[ t ][ threadidx.x ]; 20 } syncthreads( ); 23 } C[ c ra + r ] = sum ; 26 }

11 C A B A B C Dell Precision Workstation T7400 CPU: Intel Quad Core Xeon 3.20 GHz 2 nvidia Quadro FX GB RAM, Windows XP SP CPU CPU 3 5 A B C CPU ms ms ms. 5 CPU CT MRI CUDA OS: WindowsXP CPU: Intel Quad-Core Xeon 3.20 GHz Memory: 3.0 GB GPU: NVIDIA Quadro FX

12 CPU CUDA CPU 10 CUDA CUDA 6 CUDA CUDA GPGPU 5,7 GPGPU CUDA GPGPU GPU CPU GPGPU PC GPGPU 1 GPU 32 GPU GPU 2008 GPU

13 CUDA Bank Conflict GPU CUDA CUDA Programming Guide [8] CUDA GPGPU [1] [2] TOP 500, [3] J. N. England, A system for interactive modeling of physical curved surface objects, Proceedings of SIGGRAPH 78, pp [4] M. J. Harris, G. Coombe, T. Scheuermann, and A. Lastra, Physically-Based Visual Simulation on Graphics Hardware, Proceedings of SIGGRAPH 2002 / Eurographics Workshop on Graphics Hardware 2002, pp.1 10, 2002 [5] J. D. Owens, D. Luebke, N. Govindaraju, M. Harris, J. Krüger, A. E. Lefohn, and T. J. Purcell, A Survey of General-Purpose Computation on Graphics Hardware, Computer Graphics Forum, Vol.26, No.1, pp , 2007 [6] CUDA ZONE, [7] GPGPU, [8] CUDA Programming Guide, html