http://gcc.gnu.org/bugzilla/show_bug.cgi?id=57952
--- Comment #1 from vincenzo Innocente <vincenzo.innocente at cern dot ch> ---
I modified the benchmark a bit, adding timing, and the new version now vectorizes with YMM under avx2, but still not with the old avx.
If I remove the call to rdtsc(), it does not use YMM anymore.
-fno-tree-pre does not help.

cat polyAVX.cpp
//template<typename T>
typedef float T;
inline T polyHorner(T y) {
  return T(0x2.p0) + y * (T(0x2.p0) + y * (T(0x1.p0) + y * (T(0x5.55523p-4) + y * (T(0x1.5554dcp-4) + y * (T(0x4.48f41p-8) + y * T(0xb.6ad4p-12)))))) ;
}

#include <x86intrin.h>
#include<iostream>
volatile unsigned long long rdtsc() {
 unsigned int taux=0;
 return __rdtscp(&taux);
}

int main() {
  long long t=0;
  bool ret=true;
  float s =0;
  for (int k=0; k!=100; ++k) {
    float c = 1.f/10000000.f;
    t -=rdtsc();
    for (int i=1; i<10000001; ++i) s+= polyHorner((float(i)+float(k))*c);
    t +=rdtsc();
  }
  ret &= s!=0;
  std::cout << t <<std::endl;
  return ret ? 0 : -1;
}

[innocent@vinavx2 vectorize]$ c++ -Ofast -S polyAVX.cpp -march=core-avx2 ; grep -c "ymm" polyAVX.s
28
[innocent@vinavx2 vectorize]$ c++ -Ofast -S polyAVX.cpp -march=corei7-avx ; grep -c "ymm" polyAVX.s
0