https://bugs.llvm.org/show_bug.cgi?id=47968

            Bug ID: 47968
           Summary: Possible missed AVX optimization opportunity
           Product: new-bugs
           Version: 11.0
          Hardware: PC
                OS: All
            Status: NEW
          Severity: enhancement
          Priority: P
         Component: new bugs
          Assignee: unassignedb...@nondot.org
          Reporter: jo...@lophus.org
                CC: htmldevelo...@gmail.com, llvm-bugs@lists.llvm.org

I *think* there is a missed optimization opportunity in the following code, but
I might be wrong because I've never used AVX before yesterday so know very
little about it. Also, the optimization is probably pretty crazy, so not sure
this is something compilers would optimize in the first place...

The code computes the squared error sum of a float array.

Notes:

- main.c is just there to trick se.c into not optimization everything away.
- N is a compile-time flag to control the array size, select it from [1,
10000].
- SLOW is a compile-time flag to control if the AVX or naive version of se
shall be used.
- SUM_OUTER is a compile-time flag to choose between two variants of summation
if the fast version of se.

Compile with: cc ...flags... -O3 -mavx se.c -c -o se.o && cc se.o main.c

Benchmark results (flags | mean +- std):

-DSUM_OUTER=1 -DN=10    |    3.1 ms +-   4.0 ms
-DSUM_OUTER=1 -DN=100   |  690.5 ms +- 245.2 ms
-DSUM_OUTER=1 -DN=1000  |    3.6 ms +-   4.9
-DSUM_OUTER=1 -DN=10000 |  237.9 ms +-   5.8 ms
              -DN=10    |    3.4 ms +-   4.5 ms
              -DN=100   |    3.8 ms +-   5.6 ms
              -DN=1000  |  998.1 ms +-  25.7 ms
              -DN=10000 |  253.1 ms +-  37.0 ms

-DSLOW=1 -DN=10         |    2.8 ms +-  4.3 ms
-DSLOW=1 -DN=20         |  138.9 ms +-  5.9 ms
-DSLOW=1 -DN=100        |  834.2 ms +- 27.7 ms
-DSLOW=1 -DN=200        | 1871.0 ms +- 45.0 ms

Note that there are funny outliers in the AVX version. Not sure where they're
coming from.


--- main.c ---

#include <unistd.h>

void run (void *);

int main () {
    run(NULL);
}


--- se.c ---

#include <immintrin.h>
#include <string.h>


static float slowse(float *a, float *b, size_t n) {
    float out = 0;
    for (size_t i = 0; i < n; i += 1) {
        out += (a[i] + b[i]) * (a[i] + b[i]);
    }
    return out;
}


static float fastse(float *a, float *b, size_t n) {
    float a_aligned[8] __attribute__ ((aligned (32)));
    float b_aligned[8] __attribute__ ((aligned (32)));
    float out = 0;
    __m256 out1 = _mm256_set1_ps(0);
    size_t i = 0;
    for (; i + 8 <= n; i += 8) {
        memcpy(a_aligned, a, sizeof(float) * 8);
        memcpy(b_aligned, b, sizeof(float) * 8);
        __m256 res = _mm256_sub_ps(_mm256_load_ps(a_aligned),
_mm256_load_ps(b_aligned));
        res = _mm256_mul_ps(res, res);
#ifdef SUM_OUTER
        for (size_t i = 0; i < 8; ++i) {
            out += res[i];
        }
    }
#else
        out1 = _mm256_add_ps(out1, res);
    }
    for (size_t i = 0; i < 8; ++i) {
        out += out1[i];
    }
#endif
    return out + slowse(&a[i], &b[i], n - i);
}


typedef void deoptimize(float *, float *, float);

static float a[8 * N];
static float b[8 * N];

void run (deoptimize f) {
    float out;
    for (size_t i = 0; i < 10000000; ++i) {
#ifdef SLOW
        out = slowse(a, b, N);
#else
        out = fastse(a, b, N);
#endif
        if (f != NULL) {
            f(a, b, out);
        }
    }
    if (f != NULL) {
        f(a, b, out);
    }
}

-- 
You are receiving this mail because:
You are on the CC list for the bug.
_______________________________________________
llvm-bugs mailing list
llvm-bugs@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-bugs

Reply via email to