https://gcc.gnu.org/bugzilla/show_bug.cgi?id=84106

--- Comment #4 from Daniel Fruzynski <bugzi...@poradnik-webmastera.com> ---
Here are the results of a small benchmark executed on a Xeon E5-2683 v3. The
code was compiled using gcc 4.8.5; this gcc version also splits the loop. The
manually vectorized code is about 3.5 times faster:

[out]
--------------------------------------------------
Benchmark           Time           CPU Iterations
--------------------------------------------------
BM_test1           25 ns         25 ns   26989634
BM_test2            7 ns          7 ns   94495591
[/out]

Benchmark code:

[code]
#include <benchmark/benchmark.h>
#include "immintrin.h"

// Number of elements in each array. 81 is not a multiple of 8, so the 8-wide
// loop in BM_test2 covers elements 0..79 and its scalar tail loop handles the
// remaining one.
#define N 81

// a1 is the input (filled by Init); a2 and a3 are written by the benchmarks.
// The 32-byte alignment matches the aligned 256-bit loads/stores
// (_mm256_load_si256 / _mm256_store_si256) that BM_test2 issues at offsets
// that are multiples of 8 ints.
int a1[N] __attribute__((aligned(32)));
int a2[N] __attribute__((aligned(32)));
int a3[N] __attribute__((aligned(32)));

// Fills the input array a1 when the global `init` object is constructed.
// Every element gets its own index reduced modulo 32, so the stored values
// lie in 0..31 and can be used as shift counts for a 32-bit int.
class Init
{
public:
    Init()
    {
        int idx = 0;
        while (idx < N)
        {
            a1[idx] = idx % 32;
            ++idx;
        }
    }
} init;


// Scalar reference version: the loop is left entirely to the compiler's
// optimizer. Each timed iteration copies a1 into a2 and stores 1 << a1[n]
// into a3 for all N elements.
static void BM_test1(benchmark::State& state)
{
    // Each pass of this range-for is one measured benchmark iteration.
    for (auto _ : state)
    {
        for (int n = 0; n < N; n++)
        {
            a2[n] = a1[n];
            a3[n] = 1 << a1[n];
        }
        // Compiler-level memory barrier from Google Benchmark; presumably here
        // so the stores to a2/a3 cannot be optimized away or hoisted out of
        // the timed loop.
        benchmark::ClobberMemory();
    }
}
BENCHMARK(BM_test1);

// Manually vectorized version of BM_test1 using 256-bit integer intrinsics.
// Produces the same contents in a2 and a3: a2[n] = a1[n], a3[n] = 1 << a1[n].
static void BM_test2(benchmark::State& state)
{
    // Each pass of this range-for is one measured benchmark iteration.
    for (auto _ : state)
    {
        int n = 0;
        // Main loop: 8 ints (one 256-bit vector) per step, for as long as a
        // full group of 8 elements remains.
        for (; n < N - 7; n += 8)
        {
            // Aligned load/store: relies on the arrays being 32-byte aligned
            // and on n being a multiple of 8.
            __m256i v = _mm256_load_si256((__m256i*)(&a1[0] + n));
            _mm256_store_si256((__m256i*)(&a2[0] + n), v);

            // Per-lane variable shift: each 32-bit lane holding 1 is shifted
            // left by the count in the matching lane of v, i.e. 1 << a1[n + i].
            v = _mm256_sllv_epi32(_mm256_set1_epi32(1), v);
            _mm256_store_si256((__m256i*)(&a3[0] + n), v);
        }
        // Scalar tail for the elements left over after the 8-wide loop.
        for (; n < N; n++)
        {
            a2[n] = a1[n];
            a3[n] = 1 << a1[n];
        }
        // Compiler-level memory barrier from Google Benchmark; presumably here
        // so the stores to a2/a3 cannot be optimized away or hoisted out of
        // the timed loop.
        benchmark::ClobberMemory();
    }
}
BENCHMARK(BM_test2);

// Google Benchmark macro that provides main() and runs the benchmarks
// registered above with BENCHMARK().
BENCHMARK_MAIN();
[/code]

Reply via email to