CPU GPU N Q07-065 2011 2 17 1
1 4 1.1........................................... 4 1.2.................................. 4 1.3................................... 4 2 5 2.1 GPU........................................... 5 2.1.1 GPU................................. 5 2.1.2 CPU................................. 5 2.1.3................................... 5 2.2 CUDA.......................................... 6 2.2.1 CUDA................................ 6 2.2.2 CUDA............................... 6 2.2.3 CUDA....................... 6 2.3........................................ 6 3 9 3.1 CUDA........................... 9 3.2 CUDA................................ 10 3.3.................................... 11 4 12 4.1................................... 12 4.2.............................. 12 4.3............................... 15 4.3.1.......................... 15 4.3.2.......................... 17 4.4................................... 18 5 N 19 5.1 N................................... 19 5.2............................... 19 5.3 Runge-Kutta..................................... 21 5.4 N............................... 24 5.4.1................... 24 5.4.2................... 24 5.5........................................... 25 5.6.................................... 26 2
6 27 6.1 GPU..................................... 27 6.2.......................................... 27 3
1 1.1 GPU Graphics Processing Unit 3D GPU GPU GPU CPU 1.2 CPU CPU GPU N n m n+m-1 N*N N N N N 4 Runge-Kutta CPU GPU C CUDA C CPU CUDA GPU 1.3 1: OS CPU GPU GPU einstein Vine Linux5.1 64bit Core2 Quad Q9650 3.00GHz 8GB GTX285 1GB 240 einstein 4
2 GPU CUDA 2.1 GPU 2.1.1 GPU GPU Graphics Processing Unit 3D GPU GPU GeForce GTX285 2009 [1] 1062.72GFLOPS CPU Core2 Quad CPU Q9650 [2] 48GFLOPS 22 DirectX 9.0 2002 12/20 NVIDIA CUDA AMD ATI Stream GPGPU 2.1.2 CPU CPU 3000 CPU CPU GPU CPU GPU GPU 2010 8 CPU 31231 GPU 29800 22 GPU CPU 2.1.3 GPU 2011 2 A NVIDIA Tesla M2050 7168 3D GPU 5
2.2 CUDA 2.2.1 CUDA NVIDIA GPU C CUDA GPU NVIDIA OS Windows XP,Vista,7/Fedora 7 /OpenSUSE 10.1 /Ubuntu 7.04 /Mac OS X 10.5.2 Windows CUDA Microsoft Visual Studio(Visual C++) Visual Studio Express Edition Microsoft Web http://developer.nvidia.com/object/cuda_3_2_toolkit_rc.html Linux 64bit 3.2 2.2.2 CUDA GPU CUDA GPU 2.2.3 CUDA CUDA Adobe Photoshop CS4 (Adobe) PowerDirector (CyberLink) VideoStudio Pro X3 (COREL) LoiLo- Touch (LoiLo) NVIDIA Badaboom Media Converter 2010 12 MediaCoder CUDA Mathematica Ver8 2.3 100 6
A1 Z100 A Z 1 100 A1 B1 C1 Y100 Z100 1: A Z,1 100 26 100 26 100 2 A Z,1 100 26 100 A Z 100 26 A1 A2 A3 A100 B1 B2 B3 B100 C1 C2 C3 C100 Z1 Z2 Z3 Z100 2: 7
A1 B1 C1 Z1 A2 B2 C2 Z2 A3 B3 C3 Z3 A100 B100 C100 Z100 3: 2 3 2 A Z 1 100 A2 A1 2 8
3 3.1 CUDA GPU CUDA 3 2 4: CUDA ( NVIDIA CUDA )[3] 4 Host(CPU) Kernel(GPU ) Device(GPU) 4 Block(0,0) Thread(0,0) 2 4.2 9
3.2 CUDA GPU CUDA CPU GPU GPU 5: CUDA ( NVIDIA CUDA )[3] 10
3.3 11
4 4.1 CPU GPU n m n+m-1 N*N 4.1
$$
\begin{pmatrix} 1 & 2 \\ 3 & 4 \end{pmatrix}
\begin{pmatrix} 5 & 6 \\ 7 & 8 \end{pmatrix}
=
\begin{pmatrix}
1 \cdot 5 + 2 \cdot 7 & 1 \cdot 6 + 2 \cdot 8 \\
3 \cdot 5 + 4 \cdot 7 & 3 \cdot 6 + 4 \cdot 8
\end{pmatrix}
=
\begin{pmatrix} 19 & 22 \\ 43 & 50 \end{pmatrix}
\tag{4.1}
$$
4.1 2 1 22 1 1 19 19 22 4.2 CUDA Interface [4] (CQ_CUDA_matrix) http://www.cqpub.co.jp/interface/download/contents.htm 2008 8 ( ) (CPU)
 1  #include <stdio.h>
 2  #include <cutil.h>
 3
 4  //
 5  #define BLOCK 16
 6  #define WIDTH 512
 7
1:
 8  //
 9  void Host(float *a, float *b, float *c);
10  __global__ void Kernel1(float *A, float *B, float *C);
11  __global__ void Kernel2(float *A, float *B, float *C);
12
13  //
14  float h_a[WIDTH*WIDTH];
15  float h_b[WIDTH*WIDTH];
16  float h_c[WIDTH*WIDTH];
17
12
18  //
19  int main()
20  {
21      int i;
22      unsigned int timer;
23
24      // GPU
25      CUT_DEVICE_INIT();
26
27      // GPU (1)
28      float *d_a, *d_b, *d_c;
29      cudaMalloc((void**) &d_a, sizeof(float)*WIDTH*WIDTH);
30      cudaMalloc((void**) &d_b, sizeof(float)*WIDTH*WIDTH);
31      cudaMalloc((void**) &d_c, sizeof(float)*WIDTH*WIDTH);
32      cudaMemset(d_c, 0, sizeof(float)*WIDTH*WIDTH);
33
34      //
35      for(i=0; i<WIDTH*WIDTH; i++){
36          h_a[i]=(float)i;
37          h_b[i]=(float)i;
38      }
39
40
41      // GPU (2)
42      cudaMemcpy(d_a, h_a, sizeof(float)*WIDTH*WIDTH, cudaMemcpyHostToDevice);
43      cudaMemcpy(d_b, h_b, sizeof(float)*WIDTH*WIDTH, cudaMemcpyHostToDevice);
44
45      // (3)
46      dim3 grid(WIDTH/BLOCK, WIDTH/BLOCK, 1);
47      dim3 threads(BLOCK, BLOCK, 1);
48
49      // (4)
50      Kernel1<<<grid, threads>>>(d_a, d_b, d_c);
51      // Kernel2<<<grid, threads>>>(d_a, d_b, d_c);
52
53      // (5)
54      cudaMemcpy(h_c, d_c, sizeof(float)*WIDTH*WIDTH, cudaMemcpyDeviceToHost);
55
56      printf(" GPU = %f\n",h_c[WIDTH*WIDTH-1]);
57
58      // GPU (6)
59      cudaFree(d_a);
60      cudaFree(d_b);
61      cudaFree(d_c);
62
63      //
64      Host(h_a, h_b, h_c);
65      printf(" = %f\n",h_c[WIDTH*WIDTH-1]);
66
67  }
CUDA (GPU) (GPU) (CPU) (CPU) (GPU) (CPU) (GPU) 13
(GPU) (1)GPU GPU (2) (GPU) (CPU) (1) (GPU) (3) WIDTH*WIDTH WIDTH*WIDTH (4) (2) (5) (4) (6)GPU (1) (GPU) 2: ( )
 1  __global__ void Kernel1(float *A, float *B, float *C)
 2  {
 3      // GPU
 4      int x=blockIdx.x*blockDim.x + threadIdx.x; (1)
 5      int y=blockIdx.y*blockDim.y + threadIdx.y; (2)
 6      float tmp=0.0;
 7
 8      for(int k=0; k<WIDTH; k++){
 9          int row=k+y*WIDTH;
10          int col=x+k*WIDTH;
11          tmp+=A[row]*B[col];
12      }
13
14      C[x+y*WIDTH]=tmp;
15  }
(1),(2) x,y Id blockIdx 2 (2,5) blockIdx.x=2,blockIdx.y=5 blockDim x,y threadIdx blockIdx 3: ( ) 14
 1  __global__ void Kernel2(float *A, float *B, float *C)
 2  {
 3      // GPU
 4      int bx = blockIdx.x;
 5      int by = blockIdx.y;
 6      int tx = threadIdx.x;
 7      int ty = threadIdx.y;
 8      float tmp = 0;
 9
10      __shared__ float As[BLOCK][BLOCK]; (1)
11      __shared__ float Bs[BLOCK][BLOCK]; (2)
12
13      for (int a = 0, b = 0 ; a < WIDTH; a += BLOCK, b += BLOCK) {
14
15          int a_adr = WIDTH * BLOCK * by + a;
16          int b_adr = BLOCK * bx + WIDTH * b;
17
18          As[ty][tx] = A[a_adr + WIDTH*ty + tx];
19          Bs[ty][tx] = B[b_adr + WIDTH*ty + tx];
20          __syncthreads(); (3)
21
22          for (int k = 0; k < BLOCK; k++) {
23              tmp += As[ty][k] * Bs[k][tx];
24          }
25          __syncthreads();
26      }
27
28      int adr = WIDTH * BLOCK * by + BLOCK * bx;
29      C[adr + WIDTH * ty + tx] = tmp;
30
31  }
(1),(2) Id (3) CUDA 4.3 4.3.1 CUDA G_ global_ 1 2 1 1 1 1 2 2 2 4 G_ shared_ 1 G_ global G_ shared 15
6,7 2: 16 7,936 32 64,512 64 520,192 128 4,177,920 256 33,488,896 512 268,173,312 1024 2,146,435,072 6: 6 CPU GPU 1 N=16 0.67 ( ) 16 N=1024 479.1 ( ) 16
7: 7 CPU GPU 1 N=16 0.55 ( ) 16 N=1024 3167 ( ) 6,7 1 16 37 1 16 266 4.3.2 8 GPU_ global GPU_ shared N=16 CPU GPU CPU GPU N=256 17
1000000 FLOPS MFlops 100000 10000 1000 100 CPU GPU_global GPU_shared 10 1 1.00E+03 1.00E+04 1.00E+05 1.00E+06 1.00E+07 1.00E+08 1.00E+09 1.00E+10 1.00E+11 1.00E+12 8: 4.4 GPU CPU 32 32 GPU 18
5 N 5.1 N N N 3 N 9: 1024 3 N N 5.2 19
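F m a F F M m r M m
$$ m \frac{d^2 r}{dt^2} = G \frac{Mm}{r^2} \tag{5.1} $$
G (5.1) N n i $m_i$
$$ m_i \frac{d^2 r_i}{dt^2} = \sum_{j=1}^{n} G \frac{m_i m_j}{r_{ij}^2} \tag{5.2} $$
$r_{ij}$ i j i j $i, j = 1, 2, 3, \dots, n$ x, y, z x, y, z
$$ m_i \frac{d^2 x_i}{dt^2} = \sum_{j=1}^{n} G \frac{m_i m_j}{r_{ij}^2} \frac{x_{ij}}{r_{ij}} \tag{5.3} $$
$$ m_i \frac{d^2 y_i}{dt^2} = \sum_{j=1}^{n} G \frac{m_i m_j}{r_{ij}^2} \frac{y_{ij}}{r_{ij}} \tag{5.4} $$
$$ m_i \frac{d^2 z_i}{dt^2} = \sum_{j=1}^{n} G \frac{m_i m_j}{r_{ij}^2} \frac{z_{ij}}{r_{ij}} \tag{5.5} $$
x, y, z, r
$$ x_{ij} = x_j - x_i \tag{5.6} $$
$$ y_{ij} = y_j - y_i \tag{5.7} $$
$$ z_{ij} = z_j - z_i \tag{5.8} $$
$$ r_{ij} = \sqrt{(x_j - x_i)^2 + (y_j - y_i)^2 + (z_j - z_i)^2} \tag{5.9} $$
x, y, z 20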
$$ m_i \frac{d^2 x_i}{dt^2} = \sum_{j=1}^{n} G \frac{m_i m_j (x_j - x_i)}{\left( \sqrt{(x_j - x_i)^2 + (y_j - y_i)^2 + (z_j - z_i)^2} \right)^3} \tag{5.10} $$
$$ m_i \frac{d^2 y_i}{dt^2} = \sum_{j=1}^{n} G \frac{m_i m_j (y_j - y_i)}{\left( \sqrt{(x_j - x_i)^2 + (y_j - y_i)^2 + (z_j - z_i)^2} \right)^3} \tag{5.11} $$
$$ m_i \frac{d^2 z_i}{dt^2} = \sum_{j=1}^{n} G \frac{m_i m_j (z_j - z_i)}{\left( \sqrt{(x_j - x_i)^2 + (y_j - y_i)^2 + (z_j - z_i)^2} \right)^3} \tag{5.12} $$
5.3 Runge-Kutta Runge-Kutta Runge-Kutta Euler 1 x 2 Runge-Kutta
$$ \frac{dy}{dx} = f(x, y) \tag{5.13} $$
$(x_n, y_n)$ $x_{n+1} = x_n + \Delta x$ $y_{n+1}$
$$ y_{n+1} = y_n + \frac{1}{2}(k_1 + k_2) \tag{5.14} $$
$$ k_1 = \Delta x \, f(x_n, y_n) $$
$$ k_2 = \Delta x \, f(x_n + \Delta x, \; y_n + k_1) $$
21
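y $k_2$ $k_1$ $x_n$ $x_n + \Delta x/2$ $x_n + \Delta x$ x 10: 2 Runge-Kutta $k_1$ $x_n$ $y_{n+1}$ $k_2$ $k_1$ 2 $O((\Delta x)^3)$ 2 y n Runge-Kutta
$$ y_{n+1} = y_n + \sum_{i=1}^{s} b_i k_i \tag{5.15} $$
$$ k_i = \Delta x \, f\!\left( x_n + c_i \Delta x, \; y_n + \sum_{j=1}^{s} a_{ij} k_j \right) $$
$a_{ij}, b_i, c_i$ s 4 Runge-Kutta
$$ k_1 = \Delta x \, f(x_n, y_n) $$
$$ k_2 = \Delta x \, f\!\left( x_n + \frac{\Delta x}{2}, \; y_n + \frac{1}{2} k_1 \right) $$
$$ k_3 = \Delta x \, f\!\left( x_n + \frac{\Delta x}{2}, \; y_n + \frac{1}{2} k_2 \right) $$
$$ k_4 = \Delta x \, f(x_n + \Delta x, \; y_n + k_3) $$
$$ y_{n+1} = y_n + \frac{1}{6}(k_1 + 2k_2 + 2k_3 + k_4) \tag{5.16} $$
22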
y $k_4$ $k_3$ $k_2$ $k_1$ $x_n$ $x_n + \Delta x/2$ $x_n + \Delta x$ x 11: 4 Runge-Kutta $x_n$ x $k_1, k_2, k_3, k_4$ $O((\Delta x)^5)$ 2 Runge-Kutta Runge-Kutta 1 2 (5.10)(5.11)(5.12) 2 1 (5.10)(5.11)(5.12)
$$ \frac{dx}{dt} = V_x \tag{5.17} $$
$$ m_i \frac{dV_x}{dt} = \sum_{j=1}^{n} G \frac{m_i m_j (x_j - x_i)}{\left( \sqrt{(x_j - x_i)^2 + (y_j - y_i)^2 + (z_j - z_i)^2} \right)^3} \tag{5.18} $$
$$ \frac{dy}{dt} = V_y \tag{5.19} $$
$$ m_i \frac{dV_y}{dt} = \sum_{j=1}^{n} G \frac{m_i m_j (y_j - y_i)}{\left( \sqrt{(x_j - x_i)^2 + (y_j - y_i)^2 + (z_j - z_i)^2} \right)^3} \tag{5.20} $$
$$ \frac{dz}{dt} = V_z \tag{5.21} $$
$$ m_i \frac{dV_z}{dt} = \sum_{j=1}^{n} G \frac{m_i m_j (z_j - z_i)}{\left( \sqrt{(x_j - x_i)^2 + (y_j - y_i)^2 + (z_j - z_i)^2} \right)^3} \tag{5.22} $$
1 Runge-Kutta 23
5.4 N 5.4.1 12 global_ GPU shared_ GPU N=256,1024,4096 1,10,100,1000,10000 dt 0.01 t 0.0 100 t 1.0 12 N 12: N CPU N=256 1 6.73 7.37 N=4096 10 103 121 5.4.2 13 12 13 N 24
256 体 1024 体 4096 体 13: N CPU N=256 1000 1.85 1.9 N=4096 1000 14.56 N=4096 10 13.7 5.5 NVIDIA GPU IEEE754 IEEE754 CPU IEEE754 CPU GPU GPU [6] [7] NVIDIA GPU GPU IEEE 0.5ulp CPU ulp Units in the Last Place NVIDIA GPU Add Multiple CPU CUDA __fadd_rn(x,y) __fmul_rn(x,y) __dadd_rn(x,y) 25
__dmul_rn(x,y) CPU rn Round Nearest IEEE754 GPU 3 4 5.6 N 121 14.56 CPU GPU 3 512 2 256 256 2 169 26
6 6.1 GPU 3D GPU GPU GPU 6.2 N CPU 3000 N CPU CUDA CPU GPU 27
[1] Mike Thomas,Steve McBarnes GPUReview http://www.gpureview.com/geforce-gtx-285-card-605.html (2010/12/19 )
[2] Intel Intel Support Home http://www.intel.com/support/processors/sb/cs-023143.htm#3 (2010/12/19 )
[3] NVIDIA CUDA Ver1.1 http://www.nvidia.co.jp/docs/IO/51174/NVIDIA_CUDA_Programming_Guide_1.1_JPN.pdf (2010/12/19 )
[4] CQ Interface http://interface.cqpub.co.jp/ (2010/12/17 )
[5] CUDA GPU http://www.kumikomi.net/archives/2008/10/22gpu2.php?page=1 (2010/12/17 )
[6] NVIDIA CUDA Information Site http://gpu.fixstars.com/index.php/gpu%e3%81%ae%e8%a8%88%e7%ae%97%e7%b2%BE%E5%BA%A6%E3%81%AB%E3%81%A4%E3%81%84%E3%81%A6 (2010/1/21 )
[7] NVIDIA CUDA Ver2.3-Appendix C http://developer.download.nvidia.com/compute/cuda/2_3/toolkit/docs/NVIDIA_CUDA_Programming_Guide_2.3.pdf (2010/1/21 )
[8]
[9] 21 11 20 CUDA( )
28