GPGPU (II) GPGPU CUDA 1 GPGPU CUDA(CUDA Unified Device Architecture) CUDA NVIDIA GPU *1 C/C++ (nvcc) CUDA NVIDIA GPU GPU CUDA CUDA 1 CUDA CUDA 2 CUDA NVIDIA GPU PC Windows Linux MaxOSX CUDA GPU CUDA NVIDIA web CUDA Zone(http://www.nvidia.com/object/cuda home new.html) CUDA web CUDA Toolkit API ( ) web NVIDIA Forums (NVIDIA ) NVIDIA *1 1
(http://forums.nvidia.com/, http://forum.nvidia.co.jp/) CUDA CUDA CUDA CUDA (http://www.kohgakusha.co.jp/books/detail/978-4-7775-1477-9) NVIDIA CUDA Information Site wiki (http://gpu.fixstars.com/) CUDA NVIDIA GPU GPU GPU (Compute Capability 1.3) GPU CUDA Toolkit version 2.3 CUDA API web (http://www.cspp.cc.utokyo.ac.jp/ohshima/) 2 CUDA CUDA CUDA 1. CUDA (CUDA GPU ) 2. CUDA Toolkit 3. CUDA SDK NVIDIA CUDA (http://developer.nvidia.com/object/cuda download.html) Windows Linux MaxOSX OS (VisualStudio gcc ) Linux CUDA SDK Linux $HOME/NVIDIA GPU Computing SDK/C/bin/linux/release ( $HOME/NVIDIA GPU Computing SDK/C make ) CUDA GPU 2
GUI CUDA Linux (OS) PC GPU PC PC NVIDIA GPU ( CUDA CUDA ) web PC CUDA PC web ( ) 3 CUDA 1 CUDA CUDA CUDA CUDA GPU RuntimeAPI GPU DriverAPI RuntimeAPI 3.1 CUDA 1 2 1 C CUDA 2 CUDA cu 1 CUDA CPU ( CUDA 3 ) CUDA global <<< >>> cuda CPU C 3
1 C ( ) CUDA( ) 4
C > ls cpu.c > gcc -O3 cpu.c > ls a.out cpu.c >./a.out CPU: InA: 0.30 0.60 0.70 0.50 0.30 0.50 0.60 0.20 0.90 0.10 0.20 0.70 ( ) InB: 0.00 0.60 0.40 0.60 0.20 0.50 0.80 0.60 0.20 0.80 0.40 0.70 ( ) Out: 0.30 1.20 1.10 1.10 0.50 1.00 1.40 0.80 1.10 0.90 0.60 1.40 ( ) > CUDA > ls gpu.cu > nvcc -O3 gpu.cu -I${HOME}/NVIDIA_GPU_Computing_SDK/C/common/inc > ls a.out gpu.cu >./a.out GPU: InA: 0.30 0.60 0.70 0.50 0.30 0.50 0.60 0.20 0.90 0.10 0.20 0.70 ( ) InB: 0.00 0.60 0.40 0.60 0.20 0.50 0.80 0.60 0.20 0.80 0.40 0.70 ( ) Out: 0.30 1.20 1.10 1.10 0.50 1.00 1.40 0.80 1.10 0.90 0.60 1.40 ( ) > 2 2 CUDA CPU (nvcc) GPU (a.out ) CUDA 1 GPU CPU __global__ <<< >>> GPU CUDA nvcc CPU CPU GPU GPU CUDA GPU GPU GPU GPU CUDA ( CUDA ClearSpeed Advanced Accelerator Cell BE SPE ) CUDA 3 5
nvcc __global__ CPU GPU __device__ GPU GPU __host__ CPU CPU (__global__/__device__ CPU ) CPU GPU CUDA CPU GPU GPU CPU CPU GPU CPU-GPU API cudaMemcpy API MPI OpenMP cuda API API cudaSetDevice GPU ID cudaMalloc GPU ( ) cudaMemcpy CPU-GPU (CPU GPU GPU CPU 4 ) cudaFree GPU CUDA 6
nvcc nvcc gcc nvcc gcc gcc -Wall ( ) nvcc -cubin -ptx GPU PTX ( RuntimeAPI ) GPU -arch CUDA (Compute Capability) GPU GPU GPU arch nvcc -h 3.2 CUDA CPU GPU CPU-GPU GPU 1 CPU GPU CUDA GPU GPU ID ID ( 3) pthread rank MPI 3 CUDA ID CUDA threadid BlockId ID ID CPU GPU 1 <<< >>> 7
1 16*16=256 GPU 256 CUDA CPU GPU 4 CUDA CUDA CUDA CUDA 4.1 CUDA NVIDIA GPU 2010 2 GT200 ( GT200) 4 GPU ScalarProcessor(SP) 8 MultiProcessor(MP) MP GPU MP ( MP SP ) MP 4.3 GPU ( NVIDIA GPU AMD GPU ) 4 NVIDIA GPU GPU 1GPU 30 MP SP 240 (TeslaS1070 1 4GPU 1GPU SP 240 ) GPU CPU 1GPU 1000 SP CPU CPU CPU SP SIMD CPU 8
MP 30 CPU CPU 240 CPU SP SIMD MP SP SP CUDA SP SP CPU GPU SP MP MP MP MP SP ( syncthreads ) MP SP GPU CPU GPU atomic 4.2 GPU SP MP SP 8 MP MP GPU GT200 CPU CPU GPU SP MP CPU CPU CPU GPU GPU SP SP MP CUDA SP (Thread) MP (Block) (Thread Block) (Grid) ( 5) MP MP MP 128 ( ) 9
5 SP MP 4.3 GPU 6 GPU GPU Registers MP GT200 MP 16384 CPU SharedMemory MP MP GT200 MP 16KB shared SharedMemory CPU Grid ( ) GlobalMemory GPU (VRAM) TextureMemory device global GlobalMemory CPU API Grid ConstantMemory GPU GPU 64KB GPU MP ConstantMemory 8KB constant ConstantMemory CPU API Grid TextureMemory GPU (VRAM) 10
GlobalMemory GPU MP TextureMemory 6KB 8KB CPU API Grid LocalMemory GlobalMemory 6 GPU 5 CUDA 2 CUDA CPU(C ) GlobalMemory SharedMemory 5.1 GlobalMemory GlobalMemory 3.1 1 __device__ GlobalMemory __global__ 4.3 __global__ GlobalMemory GlobalMemory GPU SP 7 1 2 1 ( __global__ ) GlobalMemory 2 ( __global__ ) 11
1 ( ) GlobalMemory (MP *2 ) 2 ( __global__ ) GlobalMemory GlobalMemory GPU 8 GlobalMemory atomic 8 data[0] += 1; atomicAdd(&data[0], 1); ( 16384 ) atomic (reduction) GlobalMemory ( ) 5.2 SharedMemory SharedMemory SharedMemory GlobalMemory GlobalMemory ID CPU ( ) ( ) (SharedMemory SharedMemory ) 9 fdata kernel1 GlobalMemory *2 MP MP MP 12
(test1.cu)
#include <stdlib.h>
#include <stdio.h>

/* Array resident in GPU GlobalMemory; visible to every kernel launch
 * (declared with __device__, so it persists across kernel calls). */
__device__ float globalarray[2];

/* Writes one element of the __device__ array per block.
 * Launched as <<< 2, 1 >>>: block 0 writes index 0, block 1 writes index 1. */
__global__ void kernel1(){
    if(blockIdx.x==0){
        globalarray[0] = 111.11f;
    }else{
        globalarray[1] = 222.22f;
    }
}

/* Reads the values written by kernel1 (they persist in GlobalMemory between
 * launches) and stores them crosswise into the output buffer. */
__global__ void kernel2(float *array){
    if(blockIdx.x==0){
        array[0] = globalarray[1];
    }else{
        array[1] = globalarray[0];
    }
}

int main(int argc, char** argv){
    int i;
    printf("GPU:\n");
    srand(0);
    cudaSetDevice(0);                      /* select GPU device 0 */
    float h_out[2];                        /* host-side result buffer */
    float *d_out;                          /* device-side result buffer */
    cudaMalloc((void**)&d_out, sizeof(float)*2);
    kernel1<<< 2, 1 >>>();                 /* 2 blocks x 1 thread */
    kernel2<<< 2, 1 >>>(d_out);
    cudaMemcpy(h_out, d_out, sizeof(float)*2, cudaMemcpyDeviceToHost);
    printf("Out: ");
    for(i=0; i<2; i++) printf(" %.2f", h_out[i]);
    printf("\n");
    cudaFree(d_out);
    return 0;
}

>nvcc -O3 -o test1 test1.cu -I/home/ohshima/NVIDIA_GPU_Computing_SDK/C/common/inc
>./test1
GPU:
Out: 222.22 111.11
>
7 GlobalMemory 1(GlobalMemory ) 13
(test2.cu)
#include <stdlib.h>
#include <stdio.h>

/* Every thread increments the same GlobalMemory word WITHOUT atomics.
 * With 128*128 = 16384 threads racing on data[0], most updates are lost —
 * the demo below prints 2, not 16384 (use atomicAdd for a correct count). */
__global__ void kernel1(int *data){
    data[0] += 1;
}

int main(int argc, char** argv){
    printf("GPU:\n");
    srand(0);
    cudaSetDevice(0);                      /* select GPU device 0 */
    int h_out = 0;                         /* host-side counter, initialized to 0 */
    int *d_out;                            /* device-side counter */
    cudaMalloc((void**)&d_out, sizeof(int));
    cudaMemcpy(d_out, &h_out, sizeof(int), cudaMemcpyHostToDevice);
    kernel1<<< 128, 128 >>>(d_out);        /* 128*128=16384 parallel */
    cudaMemcpy(&h_out, d_out, sizeof(int), cudaMemcpyDeviceToHost);
    printf("Out: ");
    printf("%d\n", h_out);
    cudaFree(d_out);
    return 0;
}

>nvcc -O3 -o test2 test2.cu -I/home/ohshima/NVIDIA_GPU_Computing_SDK/C/common/inc
>./test2
GPU:
Out: 2
>
8 GlobalMemory 2( ) 14
// Sum 256 elements of fdata and scale the sum by the thread ID.
// Version WITHOUT SharedMemory: every thread re-reads all 256 elements
// from (slow) GlobalMemory.
__global__ void kernel1(float *fout, float *fdata){
    int i;
    float tmp = 0.0f;
    int id = blockIdx.x*blockDim.x + threadIdx.x;  /* global thread ID */
    // accumulate directly from GlobalMemory
    for(i=0; i<256; i++){
        tmp += fdata[i];
    }
    // scale by thread ID
    tmp *= (float)threadIdx.x;
    fout[id] = tmp;
}

// Version WITH SharedMemory: the block stages fdata into fast on-chip
// SharedMemory once, sums it there, then every thread reuses the sum.
__global__ void kernel2(float *fout, float *fdata){
    __shared__ float sdata[256];           /* per-block on-chip buffer */
    int i;
    float tmp = 0.0f;
    int id = blockIdx.x*blockDim.x + threadIdx.x;  /* global thread ID */
    // copy GlobalMemory -> SharedMemory, one element per thread
    sdata[threadIdx.x] = fdata[threadIdx.x];
    // wait until the whole block has finished staging
    __syncthreads();
    // thread 0 accumulates the sum into sdata[0]
    if(threadIdx.x==0){
        for(i=1; i<256; i++){
            sdata[0] += sdata[i];
        }
    }
    // wait until the sum is complete before anyone reads it
    __syncthreads();
    // scale the shared sum by the thread ID
    tmp = sdata[0] * (float)threadIdx.x;
    fout[id] = tmp;
}
9 SharedMemory CUDA GPU CUDA GPU ( ) 15