https://gcc.gnu.org/bugzilla/show_bug.cgi?id=84106
--- Comment #4 from Daniel Fruzynski <bugzi...@poradnik-webmastera.com> ---
Here are the results of a small benchmark executed on a Xeon E5-2683 v3. The code was compiled using gcc 4.8.5. This gcc version also splits loops. The manually vectorized code is 3.5 times faster:

[out]
--------------------------------------------------
Benchmark           Time           CPU Iterations
--------------------------------------------------
BM_test1           25 ns         25 ns   26989634
BM_test2            7 ns          7 ns   94495591
[/out]

Benchmark code:

[code]
#include <benchmark/benchmark.h>
#include "immintrin.h"

#define N 81

int a1[N] __attribute__((aligned(32)));
int a2[N] __attribute__((aligned(32)));
int a3[N] __attribute__((aligned(32)));

class Init {
public:
    Init() {
        for (int n = 0; n < N; n++) {
            a1[n] = n % 32;
        }
    }
} init;

static void BM_test1(benchmark::State& state) {
    for (auto _ : state) {
        for (int n = 0; n < N; n++) {
            a2[n] = a1[n];
            a3[n] = 1 << a1[n];
        }
        benchmark::ClobberMemory();
    }
}
BENCHMARK(BM_test1);

static void BM_test2(benchmark::State& state) {
    for (auto _ : state) {
        int n = 0;
        for (; n < N - 7; n += 8) {
            __m256i v = _mm256_load_si256((__m256i*)(&a1[0] + n));
            _mm256_store_si256((__m256i*)(&a2[0] + n), v);
            v = _mm256_sllv_epi32(_mm256_set1_epi32(1), v);
            _mm256_store_si256((__m256i*)(&a3[0] + n), v);
        }
        for (; n < N; n++) {
            a2[n] = a1[n];
            a3[n] = 1 << a1[n];
        }
        benchmark::ClobberMemory();
    }
}
BENCHMARK(BM_test2);

BENCHMARK_MAIN();
[/code]