https://gcc.gnu.org/bugzilla/show_bug.cgi?id=99161

            Bug ID: 99161
           Summary: Suboptimal SVE code for ld4/st4 MLA code
           Product: gcc
           Version: unknown
            Status: UNCONFIRMED
          Keywords: missed-optimization
          Severity: normal
          Priority: P3
         Component: target
          Assignee: unassigned at gcc dot gnu.org
          Reporter: ktkachov at gcc dot gnu.org
  Target Milestone: ---
            Target: aarch64

void ld_st_4 (char *x)
{
    /* Process the buffer as groups of four interleaved byte channels.  */
    for (int i = 0; i < 4096; i += 4)
    {
        char r = x[i];
        char g = x[i + 1];
        char b = x[i + 2];
        char a = x[i + 3];
        char smoosh = (r + g + b) * a;
        x[i] = r - smoosh;
        x[i + 1] = g + smoosh;
        x[i + 2] = b - smoosh;
        x[i + 3] = a + smoosh;
    }
}

With -O3 (no SVE) this gives a nice loop on aarch64:

ld_st_4(char*):
        add     x1, x0, 4096
.L2:
        ld4     {v0.16b - v3.16b}, [x0]
        add     v4.16b, v0.16b, v1.16b
        add     v4.16b, v4.16b, v2.16b
        mul     v4.16b, v4.16b, v3.16b
        sub     v16.16b, v0.16b, v4.16b
        add     v17.16b, v4.16b, v1.16b
        sub     v18.16b, v2.16b, v4.16b
        add     v19.16b, v4.16b, v3.16b
        st4     {v16.16b - v19.16b}, [x0], 64
        cmp     x1, x0
        bne     .L2
        ret

With -O3 -march=armv8.2-a+sve we get:

ld_st_4(char*):
        mov     x1, 0
        mov     w2, 1024
        ptrue   p0.b, all
        whilelo p1.b, wzr, w2
.L2:
        ld4b    {z0.b - z3.b}, p1/z, [x0]
        add     z4.b, z1.b, z0.b
        add     z4.b, z4.b, z2.b
        movprfx z16, z0
        mls     z16.b, p0/m, z4.b, z3.b
        movprfx z17, z1
        mla     z17.b, p0/m, z4.b, z3.b
        movprfx z18, z2
        mls     z18.b, p0/m, z4.b, z3.b
        movprfx z19, z3
        mla     z19.b, p0/m, z4.b, z3.b
        st4b    {z16.b - z19.b}, p1, [x0]
        incb    x1
        incb    x0, all, mul #4
        whilelo p1.b, w1, w2
        b.any   .L2
        ret

There are a few things that could be improved here:
* Use x0 as the loop limit, as in the Neon version
* Use a single predicate (and avoid the extra incb on the induction variable
that exists only to feed whilelo)
* Factor in the cost of movprfx somehow (i.e. the destructive semantics of
MLA/MLS), and prefer to use mul and add/sub

A better SVE loop would look a lot like the Neon one:
ld_st_4(char*):
        add     x1, x0, 4096
        ptrue   p0.b, all
.L2:
        ld4b    {z0.b - z3.b}, p0/z, [x0]
        add     z4.b, z1.b, z0.b
        add     z4.b, z4.b, z2.b
        mul     z4.b, p0/m, z4.b, z3.b
        sub     z16.b, z0.b, z4.b
        add     z17.b, z4.b, z1.b
        sub     z18.b, z2.b, z4.b
        add     z19.b, z4.b, z3.b
        st4b    {z16.b - z19.b}, p0, [x0]
        incb    x0, all, mul #4
        cmp     x1, x0
        bne     .L2
        ret
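
For reference, here is a rough ACLE intrinsics sketch of the same computation
(my own illustration, not part of the original testcase), spelling out the
mul + add/sub form directly. It uses unsigned bytes, which wrap the same way
as the char arithmetic above, and, like the hand-written loop, it assumes
that 4 * the vector length in bytes evenly divides 4096. Compile with e.g.
-O2 -march=armv8.2-a+sve:

#include <arm_sve.h>
#include <stdint.h>

/* Illustrative intrinsics version of ld_st_4; assumes
   4096 % (4 * svcntb ()) == 0, as the asm above does.  */
void ld_st_4_intrin (uint8_t *x)
{
    svbool_t pg = svptrue_b8 ();
    for (uint64_t i = 0; i < 4096; i += 4 * svcntb ())
    {
        /* De-interleaving structure load of four byte vectors.  */
        svuint8x4_t v = svld4_u8 (pg, x + i);
        svuint8_t r = svget4_u8 (v, 0);
        svuint8_t g = svget4_u8 (v, 1);
        svuint8_t b = svget4_u8 (v, 2);
        svuint8_t a = svget4_u8 (v, 3);
        /* smoosh = (r + g + b) * a, computed once and reused.  */
        svuint8_t smoosh =
            svmul_u8_x (pg, svadd_u8_x (pg, svadd_u8_x (pg, r, g), b), a);
        /* Plain add/sub against the shared product: no mla/mls,
           hence no movprfx is needed.  */
        svst4_u8 (pg, x + i,
                  svcreate4_u8 (svsub_u8_x (pg, r, smoosh),
                                svadd_u8_x (pg, g, smoosh),
                                svsub_u8_x (pg, b, smoosh),
                                svadd_u8_x (pg, a, smoosh)));
    }
}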
