https://gcc.gnu.org/bugzilla/show_bug.cgi?id=113134

            Bug ID: 113134
           Summary: Middle end early break vectorization: Fail to
                    vectorize a simple early break code
           Product: gcc
           Version: 14.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: c
          Assignee: unassigned at gcc dot gnu.org
          Reporter: juzhe.zhong at rivai dot ai
  Target Milestone: ---

Hi, as the following Compiler Explorer link shows:

https://compiler-explorer.com/z/zMzba7WT1

void add(int N, int *__restrict a, int *__restrict b, int *__restrict c) {
  for (int i = 0; i < N; i++) {
    c[i] = a[i] + b[i];
    if (i > 1000) {
        break;
    }
  }
}

GCC fails to vectorize it and emits a scalar loop:

add:
        cmp     w0, 0
        ble     .L1
        sbfiz   x6, x0, 2, 32
        mov     x4, 0
.L3:
        ldr     w0, [x1, x4]
        ldr     w5, [x2, x4]
        add     w0, w0, w5
        str     w0, [x3, x4]
        cmp     x4, 4004
        beq     .L1
        add     x4, x4, 4
        cmp     x6, x4
        bne     .L3
.L1:
        ret

But clang is able to vectorize it:

add:                                    // @add
        cmp     w0, #1
        b.lt    .LBB0_8
        mov     w8, w0
        mov     w9, #1001                       // =0x3e9
        sub     x8, x8, #1
        cmp     x8, #1001
        csel    x9, x8, x9, lo
        add     x10, x9, #1
        cnth    x9
        cmp     x10, x9
        b.hs    .LBB0_3
        mov     x9, xzr
        b       .LBB0_6
.LBB0_3:
        ptrue   p0.s
        neg     x9, x9
        mov     x11, xzr
        and     x9, x10, x9
        addvl   x12, x1, #1
        addvl   x13, x2, #1
        addvl   x14, x3, #1
.LBB0_4:                                // =>This Inner Loop Header: Depth=1
        ld1w    { z0.s }, p0/z, [x1, x11, lsl #2]
        ld1w    { z1.s }, p0/z, [x2, x11, lsl #2]
        ld1w    { z2.s }, p0/z, [x12, x11, lsl #2]
        ld1w    { z3.s }, p0/z, [x13, x11, lsl #2]
        add     z0.s, z1.s, z0.s
        add     z1.s, z3.s, z2.s
        st1w    { z0.s }, p0, [x3, x11, lsl #2]
        st1w    { z1.s }, p0, [x14, x11, lsl #2]
        inch    x11
        cmp     x9, x11
        b.ne    .LBB0_4
        cmp     x10, x9
        b.eq    .LBB0_8
.LBB0_6:                                // =>This Inner Loop Header: Depth=1
        lsl     x10, x9, #2
        cmp     x9, #1001
        ldr     w11, [x1, x10]
        ldr     w12, [x2, x10]
        add     w11, w12, w11
        str     w11, [x3, x10]
        b.eq    .LBB0_8
        cmp     x8, x9
        add     x9, x9, #1
        b.ne    .LBB0_6
.LBB0_8:
        ret

Reply via email to