https://gcc.gnu.org/bugzilla/show_bug.cgi?id=116337

            Bug ID: 116337
           Summary: Reverse-iterated loops have redundant code compared to
                    clang
           Product: gcc
           Version: 15.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: middle-end
          Assignee: unassigned at gcc dot gnu.org
          Reporter: kugan at gcc dot gnu.org
  Target Milestone: ---

For:

extern __attribute__((aligned(64))) int a[32000],b[32000];

void s1112(void)
{
    for (int i = 32000 - 1; i >= 0; i--) {
        a[i] = b[i] + 1;
    }
}

For the loop, with -O3 -mcpu=neoverse-v2 --param=aarch64-autovec-preference=2,
gcc generates:

.L3:
        ld1w    z31.s, p7/z, [x6, x0, lsl 2]
        add     w1, w1, w3
        rev     z31.s, z31.s
        add     z31.s, z31.s, #1
        rev     z31.s, z31.s
        st1w    z31.s, p7, [x2, x0, lsl 2]
        sub     x0, x0, x5
        cmp     w1, w4
        bls     .L3

clang generates with -O3 -mcpu=neoverse-v2 -fno-unroll-loops:
.LBB0_1:
        ld1w    { z0.s }, p0/z, [x14, x11, lsl #2]
        add     z0.s, z0.s, #1
        st1w    { z0.s }, p0, [x13, x11, lsl #2]
        decw    x11
        cmn     x12, x11
        b.ne    .LBB0_1


This seems to be due to the memory_access_type of VMAT_CONTIGUOUS_REVERSE and
the VEC_PERM_EXPR.

  <bb 3> [local count: 1063004408]:
  # i_10 = PHI <i_7(5), 31999(2)>
  # ivtmp_9 = PHI <ivtmp_4(5), 32000(2)>
  # vectp_b.4_8 = PHI <vectp_b.4_13(5), &MEM <int[32000]> [(void *)&b + 127984B](2)>
  # vectp_a.9_19 = PHI <vectp_a.9_20(5), &MEM <int[32000]> [(void *)&a + 127984B](2)>
  # ivtmp_23 = PHI <ivtmp_24(5), 0(2)>
  vect__1.6_14 = MEM <vector(4) int> [(int *)vectp_b.4_8];
  vect__1.7_15 = VEC_PERM_EXPR <vect__1.6_14, vect__1.6_14, { 3, 2, 1, 0 }>;
  _1 = b[i_10];
  vect__2.8_17 = vect__1.7_15 + { 1, 1, 1, 1 };
  _2 = _1 + 1;
  vect__2.11_21 = VEC_PERM_EXPR <vect__2.8_17, vect__2.8_17, { 3, 2, 1, 0 }>;
  MEM <vector(4) int> [(int *)vectp_a.9_19] = vect__2.11_21;
  i_7 = i_10 + -1;
  ivtmp_4 = ivtmp_9 - 1;
  vectp_b.4_13 = vectp_b.4_8 + 18446744073709551600;
  vectp_a.9_20 = vectp_a.9_19 + 18446744073709551600;
  ivtmp_24 = ivtmp_23 + 1;
  if (ivtmp_24 < 8000)
    goto <bb 5>; [98.99%]
  else
    goto <bb 4>; [1.01%]

  <bb 5> [local count: 1052266995]:
  goto <bb 3>; [100.00%]


gcc -v
Using built-in specs.
COLLECT_GCC=/home/kvivekananda/install/bin/gcc
COLLECT_LTO_WRAPPER=/home/kvivekananda/install/libexec/gcc/aarch64-unknown-linux-gnu/15.0.0/lto-wrapper
Target: aarch64-unknown-linux-gnu
Configured with: ../gcc_base/configure --prefix=/home/kvivekananda/install/
--enable-languages=c,c++,fortran,lto,objc
Thread model: posix
Supported LTO compression algorithms: zlib zstd
gcc version 15.0.0 20240618 (experimental) (GCC)

Reply via email to