http://gcc.gnu.org/bugzilla/show_bug.cgi?id=55266
--- Comment #4 from vincenzo Innocente <vincenzo.innocente at cern dot ch> 2013-03-03 11:58:24 UTC ---
I still see problems when calling inline functions. It seems that the code to satisfy the "calling ABI" is generated anyhow.
Take the example below and compare the code generated for "dotd1" with that for "dotd2": dotd2 has a "storm" of moves before the reduction.

Compiled with:
c++ -std=c++11 -Ofast -march=corei7 -S conversions.cc -fabi-version=0

The AVX version is better, except for dotd4 (actually dotd1 is lelf see like).

typedef float __attribute__( ( vector_size( 16 ) ) ) float32x4_t;
typedef double __attribute__( ( vector_size( 32 ) ) ) float64x4_t;

inline float64x4_t convert(float32x4_t f) {
  return float64x4_t{f[0],f[1],f[2],f[3]};
}

float dotf(float32x4_t x, float32x4_t y) {
  float ret=0;
  for (int i=0;i!=4;++i) ret+=x[i]*y[i];
  return ret;
}

inline double dotd(float64x4_t x, float64x4_t y) {
  double ret=0;
  for (int i=0;i!=4;++i) ret+=x[i]*y[i];
  return ret;
}

float dotd1(float32x4_t x, float32x4_t y) {
  float64x4_t dx,dy;
  for (int i=0;i!=4;++i) {
    dx[i]=x[i]; dy[i]=y[i];
  }
  double ret=0;
  for (int i=0;i!=4;++i) ret+=dx[i]*dy[i];
  return ret;
}

float dotd2(float32x4_t x, float32x4_t y) {
  float64x4_t dx=convert(x);
  float64x4_t dy=convert(y);
  return dotd(dx,dy);
}

float dotd3(float32x4_t x, float32x4_t y) {
  float64x4_t dx{x[0],x[1],x[2],x[3]};
  float64x4_t dy{y[0],y[1],y[2],y[3]};
  double ret=0;
  for (int i=0;i!=4;++i) ret+=dx[i]*dy[i];
  return ret;
}

float dotd4(float32x4_t x, float32x4_t y) {
  float64x4_t dx,dy;
  for (int i=0;i!=4;++i) {
    dx[i]=x[i]; dy[i]=y[i];
  }
  return dotd(dx,dy);
}