(Version: 2013/5/16) Intel CPU (kashi@waseda.jp) 1 Intel CPU (AMD CPU) 64bit SIMD Inline Assembler Windows Visual C++ Linux gcc 2 FPU SSE2 Intel CPU double 8087 FPU (floating point number processing unit) SIMD (single instruction multiple data) SSE2 (Streaming SIMD Extensions 2) 2 SSE2 128bit double 2 2 C ( ) 3 32bit 64bit 64bit AMD Intel AMD64 x86 CPU 64bit Intel Itanium (IA-64) CPU 64bit AMD Intel64 AMD64 64bit CPU 32bit 64bit OS 64bit OS 32bit 64bit CPU 64bit OS 32bit OS 64bit 32bit OS 64bit OS binary 64bit CPU SSE2 32bit SSE2 CPU FPU SSE2 1
64bit SSE2 SSE2 FPU Visual C++ 64bit Inline Assembler 4 FPU SSE2 4.1 FPU Control Word FPU 16bit R R R IC RC(2) PC(2) R R PM UM OM ZM DM IM R: reserved IC: infinity control RC(2): rounding control (00: near, 01: down, 10: up, 11: chop) PC(2): precision control (00: 24bit, 01: not used, 10: 53bit, 11: 64bit) PM: inexact precision mask UM: underflow mask OM: overflow mask ZM: divide by 0 mask DM: denormals mask IM: invalid numbers mask RC(2) 2bit 00: nearest mode (round to nearest) 01: down mode (toward -∞) 10: up mode (toward +∞) 11: chop mode (toward 0) 2
4.2 MXCSR Control/Status Register SSE2 32bit 0(31-16) FZ RC(2) PM UM OM ZM DM IM 0 PE UE OE ZE DE IE 0: 0 FZ: Flush to Zero mode denormal 0 bit5-0: SIMD RC(2) 2bit FPU 5 Inline Assembler 5.1 FPU Control Word 16bit linux OS gcc mode1 mode1 windows Visual C++ _asm fnstcw mode1 mode1 _asm fldcw mode1 mode1 5.2 MXCSR Control/Status Register 32bit linux OS gcc mode2 mode2 windows Visual C++ 3
_asm stmxcsr mode2 mode2 _asm ldmxcsr mode2 mode2 SSE2 Intrinsic Inline Assembler SSE2 gcc, Visual C++ mode2 mode2 #include <emmintrin.h> include 6 2bit Linux 32bit FPU down asm volatile ("fnstcw %0" : "=m"(mode1)); mode1 &= ~0x0c00; mode1 |= 0x0400; 2bit 0 (~ bit ) and 2bit 0 down or down # include <stdio.h> /* * Only for Little Endian */ void bit_view ( void *p, int size ) int i, j; unsigned char * p2; for (i=size -1; i >=0; i--) p2 = ( unsigned char *) p + i; for (j =7; j >=0; j--) if ((* p2 & (1 << j))!= 0) printf ("1"); else printf ("0"); printf ("\n"); int main () 4
double x = 1.; double y = 10.; double z; // default // nearest mode1 = 0x0000 ; // down mode1 = 0x0400 ; // up mode1 = 0x0800 ; // chop mode1 = 0x0c00 ; 0.10000000000000001 0011111110111001100110011001100110011001100110011001100110011010 0.10000000000000001 0011111110111001100110011001100110011001100110011001100110011010 0.099999999999999992 0011111110111001100110011001100110011001100110011001100110011001 0.10000000000000001 0011111110111001100110011001100110011001100110011001100110011010 0.099999999999999992 0011111110111001100110011001100110011001100110011001100110011001 Linux 64bit 5
# include <stdio.h> /* * Only for Little Endian */ void bit_view ( void *p, int size ) int i, j; unsigned char * p2; for (i=size -1; i >=0; i--) p2 = ( unsigned char *) p + i; for (j =7; j >=0; j--) if ((* p2 & (1 << j))!= 0) printf ("1"); else printf ("0"); printf ("\n"); int main () double x = 1.; double y = 10.; double z; // default // nearest mode2 = 0x00000000 ; // down mode2 = 0x00002000 ; // up mode2 = 0x00004000 ; // chop 6
mode2 = 0x00006000 ; 32bit 2bit bit Windows 32bit Inline Assembler # include <stdio.h> /* * Only for Little Endian */ void bit_view ( void *p, int size ) int i, j; unsigned char * p2; for (i=size -1; i >=0; i--) p2 = ( unsigned char *) p + i; for (j =7; j >=0; j--) if ((* p2 & (1 << j))!= 0) printf ("1"); else printf ("0"); printf ("\n"); int main () double x = 1.; double y = 10.; double z; // default // nearest _asm fnstcw mode1 mode1 = 0x0000 ; _asm fldcw mode1 // down _asm fnstcw mode1 mode1 = 0x0400 ; _asm fldcw mode1 7
// up _asm fnstcw mode1 mode1 = 0x0800 ; _asm fldcw mode1 // chop _asm fnstcw mode1 mode1 = 0x0c00 ; _asm fldcw mode1 Windows 64bit Inline Assembler Intrinsic # include <stdio.h> #include <emmintrin.h> // for _mm_getcsr, _mm_setcsr /* * Only for Little Endian */ void bit_view ( void *p, int size ) int i, j; unsigned char * p2; for (i=size -1; i >=0; i--) p2 = ( unsigned char *) p + i; for (j =7; j >=0; j--) if ((* p2 & (1 << j))!= 0) printf ("1"); else printf ("0"); printf ("\n"); int main () double x = 1.; double y = 10.; double z; // default // nearest mode2 = 0x00000000 ; 8
// down mode2 = 0x00002000 ; // up mode2 = 0x00004000 ; // chop mode2 = 0x00006000 ; 7 OS 32/64bit 32bit FPU SSE2 _WIN32 _WIN64 Visual C++ Linux (Visual C++ ) _WIN64 64bit 32bit (Linux ) __x86_64__ 64bit (Visual C++ ) _M_IX86_FP 2 SSE2 (Linux ) __SSE2_MATH__ SSE2 # include <stdio.h> # if defined ( _WIN64 ) || _M_IX86_FP == 2 #include <emmintrin.h> // for _mm_getcsr, _mm_setcsr void roundnear () 9
# if defined ( _WIN32 ) || defined ( _WIN64 ) // Windows #if !defined ( _WIN64 ) _asm fnstcw mode1 mode1 = 0x0000 ; _asm fldcw mode1 # if defined ( _WIN64 ) || _M_IX86_FP == 2 mode2 = 0x00000000 ; # else // Linux, etc #if !defined ( __x86_64__ ) mode1 = 0x0000 ; #if defined ( __x86_64__ ) || defined ( __SSE2_MATH__ ) mode2 = 0x00000000 ; void rounddown () # if defined ( _WIN32 ) || defined ( _WIN64 ) // Windows #if !defined ( _WIN64 ) _asm fnstcw mode1 mode1 = 0x0400 ; _asm fldcw mode1 # if defined ( _WIN64 ) || _M_IX86_FP == 2 mode2 = 0x00002000 ; # else // Linux, etc #if !defined ( __x86_64__ ) mode1 = 0x0400 ; #if defined ( __x86_64__ ) || defined ( __SSE2_MATH__ ) mode2 = 0x00002000 ; 10
void roundup () # if defined ( _WIN32 ) || defined ( _WIN64 ) // Windows #if !defined ( _WIN64 ) _asm fnstcw mode1 mode1 = 0x0800 ; _asm fldcw mode1 # if defined ( _WIN64 ) || _M_IX86_FP == 2 mode2 = 0x00004000 ; # else // Linux, etc #if !defined ( __x86_64__ ) mode1 = 0x0800 ; #if defined ( __x86_64__ ) || defined ( __SSE2_MATH__ ) mode2 = 0x00004000 ; void roundchop () # if defined ( _WIN32 ) || defined ( _WIN64 ) // Windows #if !defined ( _WIN64 ) _asm fnstcw mode1 mode1 = 0x0c00 ; _asm fldcw mode1 # if defined ( _WIN64 ) || _M_IX86_FP == 2 mode2 = 0x00006000 ; # else // Linux, etc #if !defined ( __x86_64__ ) mode1 = 0x0c00 ; #if defined ( __x86_64__ ) || defined ( __SSE2_MATH__ ) mode2 = 0x00006000 ; 11
8 : 8.1 gcc -m32 32bit -m64 64bit 32bit -msse2 SSE2 -mfpmath=sse SSE2 -mfpmath=387 FPU -mfpmath=sse,387 8.2 Visual C++ 64bit OS Visual Studio 2005(2008) x64 Win64 cl 64bit 32bit Visual Studio 2005(2008) 32bit /arch:SSE2 SSE2 FPU SSE2 12