http://gcc.gnu.org/bugzilla/show_bug.cgi?id=50713
vincenzo Innocente <vincenzo.innocente at cern dot ch> changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
            Summary|SLP vs loop: code generated |SLP vs loop: code generated
                   |differs                     |differs (SLP less
                   |                            |efficient)

--- Comment #5 from vincenzo Innocente <vincenzo.innocente at cern dot ch> 2012-10-25 13:19:23 UTC ---
I think that the problem here is similar. Using 4.8 trunk, in the following code the explicit loop is way more efficient than the vector notation for all of the corei7, corei7-avx and bdver1 architectures.
(I will spare you the assembler dumps. Inferring the number of instructions from the nm output should be enough:
0000000000000170 T dfmav(double __vector, double __vector, double __vector, double, double)
0000000000000370 T dfmal(double __vector, double __vector, double __vector, double, double)
00000000000003c0 T dfma8v(double __vector, double __vector, double __vector, double, double)
0000000000000810 T dfma8l(double __vector, double __vector, double __vector, double, double)
00000000000008b0 short EH_frame1
)

typedef double __attribute__( ( vector_size( 32 ) ) ) float64x4_t;
typedef double __attribute__( ( vector_size( 64 ) ) ) float64x8_t;

float64x4_t dfmav(float64x4_t x, float64x4_t y, float64x4_t z, double a, double b) { return a*x*(y+b*z); }

float64x4_t dfmal(float64x4_t x, float64x4_t y, float64x4_t z, double a, double b) { float64x4_t r; for (int i=0; i!=4;++i) r[i] = a*x[i]*(y[i]+b*z[i]); return r; }

float64x8_t dfma8v(float64x8_t x, float64x8_t y, float64x8_t z, double a, double b) { return a*x*(y+b*z); }

float64x8_t dfma8l(float64x8_t x, float64x8_t y, float64x8_t z, double a, double b) { float64x8_t r; for (int i=0; i!=8;++i) r[i] = a*x[i]*(y[i]+b*z[i]); return r; }