https://gcc.gnu.org/bugzilla/show_bug.cgi?id=70946

            Bug ID: 70946
           Summary: Bad interaction between IVOpt and loop unrolling
           Product: gcc
           Version: 7.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: rtl-optimization
          Assignee: unassigned at gcc dot gnu.org
          Reporter: wdijkstr at arm dot com
  Target Milestone: ---

IVOpt chooses between using indexing for induction variables or incrementing
pointers. Due to the way loop unrolling works, a decision that is optimal if
unrolling is disabled may become very non-optimal with unrolling.

Below are simple examples that show how the choice to use indexing can become a
very bad idea when unrolled, while using offset addressing leads to very decent
code. To improve this we either need to teach IVOpt about unrolling (e.g.
prioritise base+offset addressing) or add a tree unroller that unrolls small
inner loops before IVOpt.


// Reproducer 1: two distinct arrays. Writes q[i] + 1 into p[i] for the
// first 1000 elements. The incoming value of i is never read; the loop
// initialiser overwrites it with 0.
void loop1 (int *p, int *q, int i)
{
   for (i = 0; i < 1000; i++) p[i] = q[i] + 1;
}

// Reproducer 2: a single array updated in place. Increments each of the
// first 1000 elements of p by 1. The incoming value of i is never read;
// the loop initialiser overwrites it with 0.
void loop2 (int *p, int i)
{
   for (i = 0; i < 1000; i++) p[i] = p[i] + 1;
}

On AArch64 with -O2 -funroll-loops this gives:

// Compiler output for the C function loop1 (AArch64, -O2 -funroll-loops).
// x0 = p and x1 = q (first two integer arguments), x2 = byte offset that
// indexes both arrays. The body is unrolled 8x, but every access uses
// register-offset addressing [base, index], so each unrolled copy needs
// its own index register: seven extra adds per iteration.
loop1:
        mov     x2, 0
        .p2align 2
.L41:
        // Element 0 is loaded at the current offset x2.
        ldr     w4, [x1, x2]
        // Index registers for elements 1..3 (offset + 4, + 8, + 12 bytes).
        add     x3, x2, 4
        add     x10, x2, 8
        add     x9, x2, 12
        add     w5, w4, 1
        str     w5, [x0, x2]
        // Index registers for elements 4..7 (offset + 16 .. + 28 bytes).
        add     x8, x2, 16
        add     x7, x2, 20
        add     x6, x2, 24
        add     x11, x2, 28
        ldr     w12, [x1, x3]
        // Advance the offset by 8 ints (32 bytes) and compare with
        // 1000 * 4 = 4000. Nothing below writes the flags, so the bne at
        // the end of the loop consumes this comparison.
        add     x2, x2, 32
        cmp     x2, 4000
        add     w13, w12, 1
        str     w13, [x0, x3]
        // Elements 2..7: one load / add 1 / store triple each, addressed
        // through the index registers computed above.
        ldr     w14, [x1, x10]
        add     w15, w14, 1
        str     w15, [x0, x10]
        ldr     w16, [x1, x9]
        add     w17, w16, 1
        str     w17, [x0, x9]
        ldr     w18, [x1, x8]
        add     w4, w18, 1
        str     w4, [x0, x8]
        ldr     w3, [x1, x7]
        add     w10, w3, 1
        str     w10, [x0, x7]
        ldr     w9, [x1, x6]
        add     w5, w9, 1
        str     w5, [x0, x6]
        ldr     w8, [x1, x11]
        add     w7, w8, 1
        str     w7, [x0, x11]
        bne     .L41
        ret
// Compiler output for the C function loop2 (AArch64, -O2 -funroll-loops).
// x0 = p, used as a walking pointer; x6 = p + 4000 bytes, i.e. one past
// the 1000th int. The body is unrolled 8x using base + immediate-offset
// addressing, so no per-element index adds are needed and four of the
// accesses are combined into ldp/stp pairs.
loop2:
        add     x6, x0, 4000
        .p2align 2
.L51:
        // x1 keeps the pointer value from before the increment below.
        mov     x1, x0
        ldr     w2, [x0]
        // Advance p by 8 ints (32 bytes); the remaining accesses through
        // x0 use negative offsets from this new value.
        add     x0, x0, 32
        add     w3, w2, 1
        // Loop-exit test. Nothing below writes the flags, so the bne at
        // the end of the loop consumes this comparison.
        cmp     x0, x6
        // Element 0: post-indexed store, which leaves x1 = old p + 4.
        str     w3, [x1], 4
        // Element 1: new p - 28 = old p + 4.
        ldr     w4, [x0, -28]
        add     w5, w4, 1
        str     w5, [x0, -28]
        // Element 2: x1 + 4 = old p + 8.
        ldr     w7, [x1, 4]
        add     w8, w7, 1
        str     w8, [x1, 4]
        // Elements 3..6 are loaded as two pairs, element 7 singly.
        ldp     w9, w10, [x0, -20]
        ldp     w11, w12, [x0, -12]
        add     w14, w9, 1
        ldr     w13, [x0, -4]
        add     w15, w10, 1
        add     w16, w11, 1
        add     w17, w12, 1
        add     w18, w13, 1
        // Store elements 3..7 back to the same offsets they were read from.
        stp     w14, w15, [x0, -20]
        stp     w16, w17, [x0, -12]
        str     w18, [x0, -4]
        bne     .L51
        ret

Reply via email to