https://gcc.gnu.org/bugzilla/show_bug.cgi?id=116541

            Bug ID: 116541
           Summary: [14/15 Regression] Inefficient missing use of reg+reg
                    addressing modes
           Product: gcc
           Version: 15.0
            Status: UNCONFIRMED
          Keywords: missed-optimization
          Severity: normal
          Priority: P3
         Component: rtl-optimization
          Assignee: unassigned at gcc dot gnu.org
          Reporter: ktkachov at gcc dot gnu.org
                CC: tnfchris at gcc dot gnu.org
  Target Milestone: ---
            Target: aarch64

The loop from TSVC:
// Global float arrays used by the reproducer, declared with aligned(64):
// five 1-D arrays of 32000 elements and four 256x256 2-D arrays.
// Only a, b, c and d are referenced by s242 below.
__attribute__((aligned(64))) float
a[32000],b[32000],c[32000],d[32000],e[32000],
                                  
aa[256][256],bb[256][256],cc[256][256],tt[256][256];

// Prototype only; it is not called anywhere in the code shown here.
int dummy(float[32000], float[32000], float[32000], float[32000], float[32000],
float[256][256], float[256][256], float[256][256], float);

// Element type, repeat count and 1-D array length used by s242.
// LEN_1D matches the 32000-element size of the 1-D arrays above.
#define real_t float
#define iterations 100000
#define LEN_1D 32000

// Reproducer loop s242: repeatedly recomputes a[1] .. a[LEN_1D - 1] as a
// running sum. Each a[i] is built from a[i - 1], which was stored on the
// previous inner iteration, so the inner loop carries a dependency from one
// iteration to the next. s1 and s2 are loop-invariant addends.
// Returns a[0], which these loops never write (the inner loop starts at i = 1).
real_t s242(real_t s1, real_t s2)
{
    // Outer repeat loop: iterations/5 full passes over the arrays.
    for (int nl = 0; nl < iterations/5; nl++) {
        //#pragma unroll(1)
        for (int i = 1; i < LEN_1D; ++i) {
            // One store to a[] plus loads from b[], c[] and d[], all at index i.
            a[i] = a[i - 1] + s1 + s2 + b[i] + c[i] + d[i];
        }
    }
    return a[0];
}

Compiling with -O3 -mcpu=neoverse-v2 generates the following loop:
.L3:
        fadd    s31, s0, s31
        add     x4, x8, x0
        add     x3, x7, x0
        add     x2, x6, x0
        add     x1, x5, x0
        ldr     s3, [x4, 4]
        add     x0, x0, 4
        ldr     s2, [x3, 4]
        fadd    s31, s31, s1
        ldr     s30, [x2, 4]
        fadd    s31, s31, s3
        fadd    s31, s31, s2
        fadd    s31, s31, s30
        str     s31, [x1, 4]
        cmp     x0, x9
        bne     .L3

With GCC 13 it used better addressing modes:
.L3:
        fadd    s0, s5, s0
        ldr     s3, [x4, x0]
        ldr     s2, [x3, x0]
        ldr     s1, [x2, x0]
        fadd    s0, s0, s4
        fadd    s0, s0, s3
        fadd    s0, s0, s2
        fadd    s0, s0, s1
        str     s0, [x1, x0]
        add     x0, x0, 4
        cmp     x0, x5
        bne     .L3

I don't know whether Tamar's pending IVOPTs work fixes this, but I'm filing it here just in case.

Reply via email to