RaVioli SIMD 17 17115074
i RaVioli SIMD PC PC PC PC CPU RaVioli RaVioli CPU RaVioli CPU SIMD RaVioli RaVioli SIMD RaVioli SIMD
RaVioli SIMD 1 1 2 RaVioli 2 2.1 RaVioli....................................... 2 2.1.1.......................... 4 2.1.2.......................... 4 2.2........................................ 4 2.3 RaVioli..................................... 7 3 SIMD RaVioli 7 3.1 SIMD.......................................... 7 3.2 RaVioli SIMD.................... 8 3.3 SIMD...................... 9 4 SIMD 13 4.1...................................... 13 4.2........................................ 14 4.3................................. 16 4.4 SIMD................................... 19 5 21 6 23 24 A.1 A.2
1 1 PC PC PC Linux OS PC 1/30 1/60 CPU OS PC CPU CPU VIGRA[1] OpenCV[2] VIGRA OpenCV
2 RaVioli[3] RaVioli CPU CPU 7 3 RaVioli VIGRA OpenCV RaVioli RaVioli RaVioli SIMD SIMD 2 RaVioli 3 SIMD RaVioli 4 SIMD 5 6 2 RaVioli RaVioli 2.1 RaVioli RaVioli
3 1: RaVioli 2 RaVioli procpix 1
4 2.1.1 PC CPU RaVioli CPU RaVioli 2.1.2 RaVioli RV_Pixel RV_Image RV_Pixel RV_Pixel RGB HSV 0 100% RV_Image 2.2 1 0 2
5 for(int j=0;j<height;j++){ for(int i=0;i<width;i++){ new_image[i][j]=binalization(image[i][j]); 2: Binalization() 1 0 x y i j Binalization() width height 2 RaVioli RV Image 2 RaVioli 3 3 4 main image RV Image RV Image RaVioli grain RaVioli
6 void Binalization(Pixel* p){ /*p */ void main(argc,argv[]){ RV_Image image; /* */ new_image=image.proc(binalization); /* */ 3: RaVioli ( ) RaVioli RV_Image* RV_Image::proc(RV_Pixel (* UserProgram)(RV_Pixel)){ RV_Image* tmpimage; for(int ny=0;ny<height;ny+=grain){ for(int nx=0;nx<width;nx+=grain){ tmpimage->pixel[ny*width+nx] = UserProgram(*_getPixel(nx,ny)); return(tmpimage); 4: RaVioli (RaVioli ) RaVioli 3 (0,0) 100.0
7 RaVioli (s) 2.520 0.040 RaVioli (s) 11.032 0.156 / ( ) 4.38 3.9 1: RaVioli 2.3 RaVioli RaVioli 2.2 RaVioli 1 4.4 3.9 RaVioli SIMD SIMD 3 SIMD RaVioli 3.1 SIMD SIMD Single Instruction Multiple Data SIMD 5 DSP( ) Intel Pentium III CPU PC CPU
8 5: SIMD SIMD PowerPC Pentium Cell SPE CPU Geforce RADEON GPU PC PC CPU SIMD PC CPU Intel Pentium AMD Athlon CPU SIMD SIMD Pentium SSE Athlon 3DNow! SIMD CPU Intel Pentium CPU SIMD SSE Intel CPU [4] SIMD [5] 3.2 RaVioli SIMD RaVioli SIMD RaVioli SIMD SIMD
9 SIMD SIMD RaVioli 2 RaVioli SIMD RaVioli SIMD SIMD RaVioli SIMD SIMD SIMD 2.1 RaVioli RaVioli RaVioli SIMD 3.3 SIMD RaVioli SIMD RaVioli SIMD SIMD 6 6 RGB input_image input_tp RGB allsum
10 int min=2147483646; int allsum=0; for(j=0;j<input_image.height-input_tp.height;j++){ for(i=0;i<input_image.width-input_tp.width;i++){ for(jj=0;jj<input_tp.height;jj++){ for(ii=0;ii<input_tp.width;ii+=16){ //SIMD asm volatile ( "movdqu (%1),%%xmm0\n\t" "movdqu (%2),%%xmm1\n\t" "movdqu (%3),%%xmm2\n\t" "movdqu (%4),%%xmm3\n\t" "movdqu (%5),%%xmm4\n\t" "movdqu (%6),%%xmm5\n\t" "psadbw %%xmm1,%%xmm0\n\t" "psadbw %%xmm3,%%xmm2\n\t" "psadbw %%xmm5,%%xmm4\n\t" "paddd %%xmm4,%%xmm2\n\t" "paddd %%xmm2,%%xmm0\n\t" "movdqu %%xmm0,%0\n\t" "emms" : "=g" (sum) : "r" (&input_image.r[(j+jj)*input_image.width+i]), "r" (&input_tp.r[(jj)*input_tp.width]), "r" (&input_image.g[(j+jj)*input_image.width+i]), "r" (&input_tp.g[(jj)*input_tp.width]), "r" (&input_image.b[(j+jj)*input_image.width+i]), "r" (&input_tp.b[(jj)*input_tp.width])); //SIMD allsum+=sum[0]+sum[2]; if(min > allsum) { min=allsum; mini=i; minj=j; allsum=0; 6: SIMD
11 RGB SIMD RaVioli 6 SIMD RaVioli 6 SIMD A.1,A.2 4 SIMD C++ SIMD SIMD SIMD SIMD SIMD SIMD mmintrin.h SIMD
12 gcc (GNU ) add %xmm0,%xmm1 Intel (Microsoft Macro Assembler) add xmm1,xmm0 7: SIMD SIMD C++ C++ asm asm asm CPU C++ C++ SIMD 7 7
13 2: 7 xmm0 xmm1 gcc x86 CPU gcc [6] 4 SIMD SIMD 4.1 SIMD 1. rv_image.cpp 2. 3. 4. SIMD 5. 2 4 2
14 8: ( ) 4.2 RaVioli rv_image.cpp 8 9 8
15 9: ( ) 8 image->procimgcomp(sad,input_tp); = = = procimgcomp void SAD input_tp input_tp RV_image* rv_image.cpp
16 10: ( ) 9 RaVioli 4.3 4.2 10
17 11: ( ) UserProgram User- Program UserProgram 11 11 12
18 int sum=0; /* */ void counttp(rv_doppelimage* image,rv_coord Cstart,RV_Coord Cend){ image->procimgcomp(sad,input_tp); if(min > sum) { min=sum; tmps=cstart; tmpe=cend; sum=0; void SAD(RV_Pixel* p1,rv_pixel* p2){ int r1,g1,b1,r2,g2,b2; p1->getrgb(r1,g1,b1); p2->getrgb(r2,g2,b2); sum+=abs(r1-r2)+abs(g1-g2)+abs(b1-b2); 12: 11 sum sum int void int return( );
19 13: ( ) int RV_Image::procImgComp(void (* UserProgram) (RV_Pixel*, RV_Pixel*),RV_Image* cmpimg){ /* */ return(sum); 4.4 SIMD 4.3 SIMD RaVioli SIMD RaVioli 13 SIMD SIMD SIMD RGB $$\mathit{out} = 0.299 \times r + 0.587 \times g + 0.114 \times b \tag{1}$$ SIMD SIMD 1
20 for(ny=0;ny<bheight;ny+=grain){ for(nx=0;nx<bwidth;nx+=grain){ byte r1,g1,b1,r2,g2,b2; p1 = _getpixel(nx,ny); p2 = cmpimg->_getpixel(nx,ny); p1->getrgb(r1,g1,b1); p2->getrgb(r2,g2,b2); asm volatile ( /* SIMD */ : "=g" (sum) : "r" (&r1), "r" (&r2), "r" (&g1), "r" (&g2), "r" (&b1), "r" (&b2)); 14: (1) r,g,b 8bit byte 0.299 r 0.299 float (32bit) r byte (8bit) float (32bit) float (1) 0.587 g 0.114 b float 32bit SIMD SIMD 14 SIMD 14 SIMD 8bit byte 128bit SIMD 16 SIMD 16 16 SIMD
21 for(ny=0;ny<bheight;ny+=4*grain){ for(nx=0;nx<bwidth;nx+=4*grain){ byte r1[16],g1[16],b1[16],r2[16],g2[16],b2[16]; for(int i=0;i<16;i++){ p1 = _getpixel(nx+(i%4),ny+(i/4)); p2 = cmpimg->_getpixel((nx+(i%4)),((ny+i/4))); p1->getrgb(r1[i],g1[i],b1[i]); p2->getrgb(r2[i],g2[i],b2[i]); asm volatile ( /* SIMD */ : "=g" (sum) : "r" (&r1), "r" (&r2), "r" (&g1), "r" (&g2), "r" (&b1), "r" (&b2)); 15: 14 SIMD 16 16 14 16 4 14 15 5 SIMD RaVioli SIMD 3 16 1
22 CPU Opteron 2.0GHz 2GB GNU C++ version 4.1.2 3: 16: 16 22% 9% SIMD 16 8 16 8 2 34% SIMD 8 8 getrgb getr RV Pixel
23 (s) 0.028 7.378 0.024 (s) 0.083 3.737 0.128 (s) 0.111 11.115 0.152 4: SIMD SIMD C++ SIMD 4 SIMD 6 RaVioli SIMD RaVioli SIMD RaVioli SIMD RaVioli if SIMD
24 SIMD 2 [1] Köthe, U.: VIGRA - Vision with Generic Algorithms, 1.6.0 edition (2008). [2] Bradski, G. and Kaehler, A.: Learning OpenCV: Computer Vision with the OpenCV Library, O'Reilly & Associates Inc (2008). [3],,, : RaVioli, CVIM, Vol. 1, No. 4 (2009). [4] Corp., I.: IA-32, http://www.intel.co.jp/jp/download/index.htm. [5] : IA-32 SIMD, http://www.icnet.ne.jp/~nsystem/simd_tobira/index.html. [6] SAITOH, A.: GCC for x86, http://www.mars.sannet.ne.jp/sci10/on_gcc_asm.html.
A.1 void SAD(RV_Pixel* p1,rv_pixel* p2){ byte r1,g1,b1,r2,g2,b2; p1->getrgb(r1,g1,b1); p2->getrgb(r2,g2,b2); sum+=abs(r1-r2)+abs(g1-g2)+abs(b1-b2); RaVioli void RV_Image::procImgComp(void (* UserProgram) (RV_Pixel*, RV_Pixel*),RV_Image* cmpimg){ int nx,ny; _InputCheck(); cmpgrain=cmpimg->getgrain(); for(ny=0;ny<bheight;ny+=grain){ for(nx=0;nx<bwidth;nx+=grain){ UserProgram(_getPixel(nx,ny), cmpimg->_getpixel(nx,ny)); A.2 int RV_Image::SIMD_procImgComp(RV_Image* cmpimg){ int nx,ny; int sum[4]; int allsum; int i;
byte r1[16],g1[16],b1[16],r2[16],g2[16],b2[16]; RV_Pixel* p1; RV_Pixel* p2; _InputCheck(); asm volatile ("pslldq \$255,%xmm3");//0 for(ny=0;ny<bheight;ny+=4*grain){ for(nx=0;nx<bwidth;nx+=4*grain){ for(i=0;i<16;i++){ p1 = _getpixel(nx+(i%4),ny+(i/4)); p2 = cmpimg->_getpixel((nx+(i%4)),((ny+i/4))); p1->getrgb(r1[i],g1[i],b1[i]); p2->getrgb(r2[i],g2[i],b2[i]); asm volatile ( "movdqu (%1),%%xmm0\n\t" "movdqu (%2),%%xmm1\n\t" "psadbw %%xmm1,%%xmm0\n\t" "movdqu (%3),%%xmm1\n\t" "movdqu (%4),%%xmm2\n\t" "psadbw %%xmm2,%%xmm1\n\t" "paddw %%xmm1,%%xmm0\n\t" "movdqu (%5),%%xmm1\n\t" "movdqu (%6),%%xmm2\n\t" "psadbw %%xmm2,%%xmm1\n\t" "paddw %%xmm1,%%xmm0\n\t" "paddd %%xmm0,%%xmm3" : "=g" (sum) : "r" (&r1), "r" (&r2),
"r" (&g1), "r" (&g2), "r" (&b1), "r" (&b2)); asm volatile ( "movdqu %%xmm3,%0\n\t" "emms" : "=g" (sum)); allsum=sum[0]+sum[2]; return(allsum);