[Bug target/116084] New: Use dot-product instructions for byte->word PLUS reductions in vectorisation

ktkachov at gcc dot gnu.org via Gcc-bugs Wed, 24 Jul 2024 23:33:33 -0700

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=116084


            Bug ID: 116084
           Summary: Use dot-product instructions for byte->word PLUS
                    reductions in vectorisation
           Product: gcc
           Version: 15.0
            Status: UNCONFIRMED
          Keywords: missed-optimization
          Severity: normal
          Priority: P3
         Component: target
          Assignee: unassigned at gcc dot gnu.org
          Reporter: ktkachov at gcc dot gnu.org
  Target Milestone: ---
            Target: aarch64

/* Reduced test case: sum every element of a global unsigned char buffer
   into an unsigned accumulator (the byte->word PLUS reduction this
   report is about).  */
#define N 32000
unsigned char in[N];
#define u32 unsigned

/* Return the sum of all N elements of in[], accumulated as u32.  */
u32
foo (void)
{
  u32 res = 0;
  for (int i = 0; i < N; i++)
    res += in[i];  /* each unsigned char is widened before the add */
  return res;
}

-Ofast -march=armv9-a
Same as we do for the SAD expansions, we should use the TARGET_DOTPROD
instructions to do the byte-to-word plus reduction, as that instruction allows
us to do a two-step plus reduction as long as it gets a vector of 1s for its
second operand. Currently this generates:
.L3:
        add     w1, w1, w4
        ld1b    z2.s, p7/z, [x0]
        ld1b    z1.s, p7/z, [x0, #1, mul vl]
        ld1b    z0.s, p7/z, [x0, #2, mul vl]
        ld1b    z30.s, p7/z, [x0, #3, mul vl]
        add     z26.s, z26.s, z2.s
        add     x0, x0, x5
        add     z27.s, z27.s, z1.s
        add     z28.s, z28.s, z0.s
        add     z29.s, z29.s, z30.s
        cmp     w1, w2
        bls     .L3
        add     z26.s, z26.s, z27.s
        add     z28.s, z28.s, z29.s
        add     z31.s, z26.s, z28.s
        cmp     w1, w6
        beq     .L4
.L2:
        mov     x0, 0
        cntw    x4
        add     x3, x3, w1, uxtw
        mov     w2, 32000
        sub     w1, w2, w1
        whilelo p7.s, wzr, w1
.L5:
        ld1b    z3.s, p7/z, [x3, x0]
        add     x0, x0, x4
        add     z31.s, p7/m, z31.s, z3.s
        whilelo p7.s, w0, w1
        b.any   .L5
.L4:
        ptrue   p7.b, all
        uaddv   d31, p7, z31.s
        fmov    x0, d31
        ret

which is not bad, but we can avoid the extending SVE loads and process the full
packed SVE vector of values with each iteration if we use UDOT.

[Bug target/116084] New: Use dot-product instructions for byte->word PLUS reductions in vectorisation

Reply via email to