(Version: 2013/7/10) Intel CPU (kashi@waseda.jp) 1 Intel CPU( AMD CPU) 64bit SIMD Inline Assembler Windows Visual C++ Linux gcc 2 FPU SSE2 Intel CPU double 8087 FPU (floating point number processing unit) SIMD (single instruction multiple data) SSE2 (Streaming SIMD Extensions 2) 2 SSE2 128bit double 2 2 C ( ) 3 32bit 64bit 64bit AMD Intel AMD64 x86 CPU 64bit Intel Itanium (IA-64) CPU 64bit AMD Intel64 AMD64 64bit CPU 32bit 64bit OS 64bit OS 32bit 64bit CPU 64bit OS 32bit OS 64bit 32bit OS 64bit OS binary 64bit CPU SSE2 32bit SSE2 CPU FPU SSE2 1
64bit SSE2 SSE2 FPU Visual C++ 64bit Inline Assembler 4 FPU SSE2 4.1 FPU Control Word FPU 16bit R R R IC RC(2) PC(2) R R PM UM OM ZM DM IM R: reserved IC: infinity control RC(2): rounding control (00: near, 01: down, 10: up, 11:chop) PC(2): precision control (00: 24bit, 01: not used, 10: 53bit, 11:64bit) PM: inexact precision mask UM: underflow mask OM: overflow mask ZM: divide by 0 mask DM: denormals mask IM: invalid numbers mask RC(2) 2bit 00: nearest mode 01: down mode 10: up mode + 11: chop mode 0 2
4.2 MXCSR Control/Status Register SSE2 32bit 0(31-16) FZ RC(2) PM UM OM ZM DM IM 0 PE UE OE ZE DE IE 0: 0 FZ: Flush to Zero mode denormal 0 bit5-0: SIMD RC(2) 2bit FPU 5 Inline Assembler 5.1 FPU Control Word 16bit linux OS gcc asm volatile (" fnstcw %0" : "=m"( mode1 )); mode1 mode1 windows Visual C++ _asm fnstcw mode1 mode1 _asm fldcw mode1 mode1 5.2 MXCSR Control/Status Register 32bit linux OS gcc mode2 mode2 windows Visual C++ 3
_asm stmxcsr mode2 mode2 _asm ldmxcsr mode2 mode2 SSE2 Intrinsic Inline Assembler SSE2 gcc, Visual C++ mode2 mode2 #include <emmintrin.h> include 6 2bit Linux 32bit FPU down asm volatile (" fnstcw %0" : "=m"( mode1 )); mode1 = 0x0400 ; 2bit 0 (~ bit ) and 2bit 0 down or down # include <stdio.h> /* * Only for Little Endian */ void bit_view ( void *p, int size ) int i, j; unsigned char * p2; for (i=size -1; i >=0; i--) p2 = ( unsigned char *) p + i; for (j =7; j >=0; j--) if ((* p2 & (1 << j))!= 0) printf ("1"); else printf ("0"); printf ("\n"); int main () 4
double x = 1.; double y = 10.; double z; // default // nearest asm volatile (" fnstcw %0" : "=m"( mode1 )); mode1 = 0 x0000 ; // down asm volatile (" fnstcw %0" : "=m"( mode1 )); mode1 = 0 x0400 ; // up asm volatile (" fnstcw %0" : "=m"( mode1 )); mode1 = 0 x0800 ; // chop asm volatile (" fnstcw %0" : "=m"( mode1 )); mode1 = 0 x0c00 ; 0.10000000000000001 0011111110111001100110011001100110011001100110011001100110011010 0.10000000000000001 0011111110111001100110011001100110011001100110011001100110011010 0.099999999999999992 0011111110111001100110011001100110011001100110011001100110011001 0.10000000000000001 0011111110111001100110011001100110011001100110011001100110011010 0.099999999999999992 0011111110111001100110011001100110011001100110011001100110011001 Linux 64bit 5
# include <stdio.h> /* * Only for Little Endian */ void bit_view ( void *p, int size ) int i, j; unsigned char * p2; for (i=size -1; i >=0; i - -) p2 = ( unsigned char *) p + i; for (j =7; j >=0; j - -) if ((* p2 & (1 << j))!= 0) printf ("1"); else printf ("0"); printf ("\n"); int main () double x = 1.; double y = 10.; double z; // default // nearest mode2 = 0 x00000000 ; // down mode2 = 0 x00002000 ; // up mode2 = 0 x00004000 ; // chop 6
mode2 = 0x00006000 ; 32bit 2bit bit Windows 32bit Inline Assembler # include <stdio.h> /* * Only for Little Endian */ void bit_view ( void *p, int size ) int i, j; unsigned char * p2; for (i=size -1; i >=0; i--) p2 = ( unsigned char *) p + i; for (j =7; j >=0; j--) if ((* p2 & (1 << j))!= 0) printf ("1"); else printf ("0"); printf ("\n"); int main () double x = 1.; double y = 10.; double z; // default // nearest _asm fnstcw mode1 mode1 = 0x0000 ; _asm fldcw mode1 // down _asm fnstcw mode1 mode1 = 0x0400 ; _asm fldcw mode1 7
// up _asm fnstcw mode1 mode1 = 0x0800 ; _asm fldcw mode1 // chop _asm fnstcw mode1 mode1 = 0x0c00 ; _asm fldcw mode1 Windows 64bit Inline Assembler Intrinsic # include <stdio.h> #include <emmintrin.h> // for _mm_getcsr, _mm_setcsr /* * Only for Little Endian */ void bit_view ( void *p, int size ) int i, j; unsigned char * p2; for (i=size -1; i >=0; i--) p2 = ( unsigned char *) p + i; for (j =7; j >=0; j--) if ((* p2 & (1 << j))!= 0) printf ("1"); else printf ("0"); printf ("\n"); int main () double x = 1.; double y = 10.; double z; // default // nearest mode2 = 0x00000000 ; 8
// down mode2 = 0 x00002000 ; // up mode2 = 0 x00004000 ; // chop mode2 = 0 x00006000 ; 7 C C99 fenv.h include fesetround # include <stdio.h> # include <fenv.h> /* * Only for Little Endian */ void bit_view ( void *p, int size ) int i, j; unsigned char * p2; for (i=size -1; i >=0; i - -) p2 = ( unsigned char *) p + i; for (j =7; j >=0; j - -) if ((* p2 & (1 << j))!= 0) printf ("1"); else printf ("0"); printf ("\n"); int main () 9
double x = 1.; double y = 10.; double z; // default // nearest fesetround ( FE_TONEAREST ); // down fesetround ( FE_DOWNWARD ); // up fesetround ( FE_UPWARD ); // chop fesetround ( FE_TOWARDZERO ); -lm Intel CPU Visual C++ C99 fenv.h Visual C++ float.h include controlfp # include <stdio.h> # include <float.h> /* * Only for Little Endian */ void bit_view ( void *p, int size ) int i, j; unsigned char * p2; for (i=size -1; i >=0; i - -) p2 = ( unsigned char *) p + i; for (j =7; j >=0; j - -) if ((* p2 & (1 << j))!= 0) printf ("1"); else printf ("0"); printf ("\n"); int main () double x = 1.; double y = 10.; double z; 10
// default // nearest _controlfp ( _RC_NEAR, _MCW_RC ); // down _controlfp ( _RC_DOWN, _MCW_RC ); // up _controlfp ( _RC_UP, _MCW_RC ); // chop _controlfp ( _RC_CHOP, _MCW_RC ); CPU 32/64bit inline 8 OS 32/64bit 32bit FPU SSE2 _MSC_VER Visual C++ Linux (gcc ) (Visual C++ ) _WIN64 64bit _WIN32 32bit Intel CPU (Linux ) __i386__ 32bit __x86_64__ 64bit Intel CPU (Visual C++ ) _M_IX86_FP 2 SSE2 (Linux ) __SSE2_MATH__ SSE2 11
# if defined ( _WIN32 ) || defined ( _WIN64 ) // Windows # if defined ( _WIN64 ) // Windows 64 bit #include <emmintrin.h> // for _mm_getcsr, _mm_setcsr void roundnear () void rounddown () mode2 = 0x00002000 ; void roundup () mode2 = 0x00004000 ; void roundchop () mode2 = 0x00006000 ; # elif defined ( _WIN32 ) // Windows 32 bit # if _M_IX86_FP == 2 #include <emmintrin.h> // for _mm_getcsr, _mm_setcsr void roundnear () _asm fnstcw mode1 _asm fldcw mode1 # if _M_IX86_FP == 2 void rounddown () 12
_asm fnstcw mode1 mode1 = 0 x0400 ; _asm fldcw mode1 # if _M_IX86_FP == 2 mode2 = 0 x00002000 ; void roundup () _asm fnstcw mode1 mode1 = 0 x0800 ; _asm fldcw mode1 # if _M_IX86_FP == 2 mode2 = 0 x00004000 ; void roundchop () _asm fnstcw mode1 mode1 = 0 x0c00 ; _asm fldcw mode1 # if _M_IX86_FP == 2 mode2 = 0 x00006000 ; # else // Windows other CPU # include <float.h> void roundnear () _controlfp ( _RC_NEAR, _MCW_RC ); void rounddown () _controlfp ( _RC_DOWN, _MCW_RC ); void roundup () _controlfp ( _RC_UP, _MCW_RC ); void roundchop () 13
_controlfp ( _RC_CHOP, _MCW_RC ); # else // Linux, etc #if defined ( __x86_64__ ) // Linux 64 bit void roundnear () void rounddown () mode2 = 0x00002000 ; void roundup () mode2 = 0x00004000 ; void roundchop () mode2 = 0x00006000 ; # elif defined ( __i386__ ) // Linux 32 bit void roundnear () asm volatile (" fnstcw %0" : "=m"( mode1 )); #if defined ( __SSE2_MATH__ ) void rounddown () 14
asm volatile (" fnstcw %0" : "=m"( mode1 )); mode1 = 0x0400 ; #if defined ( __SSE2_MATH__ ) mode2 = 0x00002000 ; void roundup () asm volatile (" fnstcw %0" : "=m"( mode1 )); mode1 = 0x0800 ; #if defined ( __SSE2_MATH__ ) mode2 = 0x00004000 ; void roundchop () asm volatile (" fnstcw %0" : "=m"( mode1 )); mode1 = 0x0c00 ; #if defined ( __SSE2_MATH__ ) mode2 = 0x00006000 ; # else // Linux other CPU # include <fenv.h> void roundnear () fesetround ( FE_TONEAREST ); void rounddown () fesetround ( FE_DOWNWARD ); void roundup () fesetround ( FE_UPWARD ); void roundchop () 15
fesetround ( FE_TOWARDZERO ); 9 # include <stdio.h> # include "roundingmode-universal.c" double div_down ( double x, double y) double r; rounddown (); r = x / y; roundnear (); return r; double div_up ( double x, double y) double r; roundup (); r = x / y; roundnear (); return r; int main () double x = 1.; double y = 10.; double z; z = div_down (x, y); z = div_up (x, y); gcc 4.4 -O0 0.099999999999999992 0.10000000000000001 -O1, -O2, -O3 0.10000000000000001 0.10000000000000001 16
( ) # include <stdio.h> # include "roundingmode-universal.c" double div_down ( double x, double y) # pragma STDC FENV_ACCESS ON double r; rounddown (); r = x / y; roundnear (); return r; double div_up ( double x, double y) # pragma STDC FENV_ACCESS ON double r; roundup (); r = x / y; roundnear (); return r; int main () double x = 1.; double y = 10.; double z; z = div_down (x, y); z = div_up (x, y); C99 FENV_ACCESS gcc # include <stdio.h> # include "roundingmode-universal.c" double div_down ( double x, double y) volatile double r, x1 = x, y1 = y; rounddown (); r = x1 / y1; roundnear (); return r; double div_up ( double x, double y) 17
volatile double r, x1 = x, y1 = y; roundup (); r = x1 / y1; roundnear (); return r; int main () double x = 1.; double y = 10.; double z; z = div_down (x, y); z = div_up (x, y); volatile 10 : 10.1 gcc -m32 32bit -m64 64bit 32bit -msse2 SSE2 -mfpmath=sse SSE2 -mfpmath=387 FPU -mfpmath=sse,387 10.2 Visual C++ 64bit OS Visual Studio 2005(2008) x64 Win64 cl 64bit 32bit Visual Studio 2005(2008) 32bit /arch:sse2 SSE2 FPU SSE2 11 fesetround Visual C++ controlfp 18