http://gcc.gnu.org/bugzilla/show_bug.cgi?id=51062
vincenzo Innocente <vincenzo.innocente at cern dot ch> changed: What |Removed |Added ---------------------------------------------------------------------------- Version|4.7.0 |4.8.0 --- Comment #3 from vincenzo Innocente <vincenzo.innocente at cern dot ch> 2012-11-30 13:53:41 UTC --- in 4.8 using typedef float __attribute__( ( vector_size( 16 ) ) ) float32x4_t; typedef double __attribute__( ( vector_size( 32 ) ) ) float64x4_t; the scalar product works well IF WRITTEN as a loop! in the following dot_product2 produces exactly the code I would have expected to be emitted by "dot_product".. would be nice to have also the reduction of a single vector to emit horizontal-sum… float dot_product(float32x4_t x, float32x4_t y) { float32x4_t res = x*y; float ret=0; for (int i=0;i!=4;++i) ret+=res[i]; return ret; } float dot_product2(float32x4_t x, float32x4_t y) { float ret=0; for (int i=0;i!=4;++i) ret+=x[i]*y[i]; return ret; } double dot_product(float64x4_t x, float64x4_t y) { float64x4_t res = x*y; double ret=0; for (int i=0;i!=4;++i) ret+=res[i]; return ret; } double dot_product2(float64x4_t x, float64x4_t y) { double ret=0; for (int i=0;i!=4;++i) ret+=x[i]*y[i]; return ret; } c++ -Ofast -ftree-vectorizer-verbose=2 -S cross.cc -march=corei7-avx; cat cross.s | c++filt dot_product(float __vector, float __vector): LFB2: vmulps %xmm1, %xmm0, %xmm1 vmovaps %xmm1, %xmm0 vshufps $85, %xmm1, %xmm1, %xmm2 vaddss %xmm0, %xmm2, %xmm0 vunpckhps %xmm1, %xmm1, %xmm2 vshufps $255, %xmm1, %xmm1, %xmm1 vaddss %xmm2, %xmm0, %xmm0 vaddss %xmm1, %xmm0, %xmm0 ret LFE2: .align 4,0x90 .globl dot_product2(float __vector, float __vector) dot_product2(float __vector, float __vector): LFB3: vmulps %xmm0, %xmm1, %xmm1 vhaddps %xmm1, %xmm1, %xmm0 vhaddps %xmm0, %xmm0, %xmm0 ret LFE3: .align 4,0x90 .globl dot_product(double __vector, double __vector) dot_product(double __vector, double __vector): LFB4: vmulpd %ymm1, %ymm0, %ymm1 vmovapd %xmm1, %xmm0 vextractf128 $0x1, %ymm1, %xmm1 vhaddpd %xmm0, %xmm0, 
%xmm0 vmovapd %xmm1, %xmm2 vunpckhpd %xmm1, %xmm1, %xmm1 vaddsd %xmm2, %xmm0, %xmm0 vaddsd %xmm1, %xmm0, %xmm0 vzeroupper ret LFE4: .align 4,0x90 .globl dot_product2(double __vector, double __vector) dot_product2(double __vector, double __vector): LFB5: vmulpd %ymm0, %ymm1, %ymm1 vhaddpd %ymm1, %ymm1, %ymm1 pushq %rbp LCFI0: movq %rsp, %rbp LCFI1: andq $-32, %rsp addq $16, %rsp vperm2f128 $1, %ymm1, %ymm1, %ymm0 vaddpd %ymm0, %ymm1, %ymm1 vmovapd %xmm1, %xmm0 vzeroupper leave LCFI2: ret