https://gcc.gnu.org/bugzilla/show_bug.cgi?id=121290

--- Comment #13 from Soumya AR <soumyaa at gcc dot gnu.org> ---
Hi Tamar,

Thanks for the fix.

This has brought back performance for the mentioned kernels with -Ofast, but
they are now regressing with -O3 ...

Is this something you're still looking at? Just wanted to put it up here
anyway.

For example, for the inner loop in s314:

/* Base repetition count; the outer loop in main runs iterations*5 times. */
#define iterations 100000
/* Number of elements in the global array a. */
#define LEN_1D 32000

/* Input data; main fills it with a[i] = i before the timed loops. */
float a[LEN_1D];

/*
 * Reproducer: repeatedly scans a[] for its largest element.
 * Returns the final maximum, converted from float to int.
 */
int main()
{
    /* Initialise the array with ascending values 0 .. LEN_1D-1. */
    for (int i = 0; i < LEN_1D; i++) {
        a[i] = i;
    }

    float x;
    /* Repeat the whole scan iterations*5 times. */
    for (int nl = 0; nl < iterations*5; nl++) {
        /* Restart the running maximum from the first element. */
        x = a[0];
        /* Inner loop of interest: conditional max reduction into x. */
        for (int i = 0; i < LEN_1D; i++) {
            if (a[i] > x) {
                x = a[i];
            }
        }
    }

    /* Implicit float-to-int conversion of the result. */
    return x;
} 

Now:

.L3:
        ldr     s25, [x0], 4
        fcmpe   s25, s26
        fcsel   s26, s25, s26, gt
        cmp     x0, x1
        bne     .L3
        subs    w2, w2, #1
        bne     .L4
        fcvtzs  w0, s26
        ret


Before:

.L3:
        ld1r    {v25.4s}, [x0], 4
        fcmgt   v24.4s, v25.4s, v26.4s
        bsl     v24.16b, v25.16b, v26.16b
        mov     v26.16b, v24.16b
        cmp     x1, x0
        bne     .L3
        add     w2, w2, 1
        cmp     w2, w4
        bne     .L4
        dup     s24, v24.s[3]
        fcvtzs  w0, s24
        ret


Looks like we don't vectorize the inner loop at all now.

Reply via email to