https://gcc.gnu.org/bugzilla/show_bug.cgi?id=83253
            Bug ID: 83253
           Summary: -ftree-slsr causes performance regression
           Product: gcc
           Version: 7.2.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: tree-optimization
          Assignee: unassigned at gcc dot gnu.org
          Reporter: mikulas at artax dot karlin.mff.cuni.cz
  Target Milestone: ---

The straight-line strength reduction optimization causes a code size increase
and a performance degradation in some cases. I reduced it to this simple
function:

unsigned test_2(unsigned char *ptr, unsigned long scale)
{
	if (*(ptr + scale)) {
		return *(unsigned *)(ptr + scale * 4);
	}
	return 3;
}

The slsr optimization translates the function into:

unsigned test_2(unsigned char *ptr, unsigned long scale)
{
	unsigned char *tmp = ptr + scale;
	if (*tmp) {
		return *(unsigned *)(tmp + scale * 3);
	}
	return 3;
}

We can see that it replaces a multiplication by 4 with a multiplication by 3.
Consequently, the backend won't generate the instruction with scaled
addressing.

A similar problem happens without pointers, with plain integer arithmetic:
if the function contains the expressions "val + scale" and "val + scale * 4",
the slsr optimization will try to reuse the first expression when calculating
the second one, replacing the multiplication by 4 with a multiplication by 3,
which is less efficient.

The slsr pass should be fixed so that it recognizes that replacing a
multiplication by a power of two with a multiplication by a non-power of two
is never beneficial.

The misoptimization happens only with some architectures and some target
processors:
* on X86, it happens when optimizing for 32-bit processors (i386, i486, i586,
  i686, pentium4)
* on X86-64, it happens only when optimizing for nocona
* on ARM, it happens when optimizing for cortex-a7, cortex-a5 and the older
  cores (arm11...)
* on aarch64, it happens when optimizing for cortex-a53 or cortex-a35
* it also happens on alpha, hppa, sh4

Example code:

unsigned test_1(unsigned char *ptr, unsigned long scale)
{
	return *(unsigned *)(ptr + scale * 4);
}

unsigned test_2(unsigned char *ptr, unsigned long scale)
{
	if (*(ptr + scale)) {
		return *(unsigned *)(ptr + scale * 4);
	}
	return 3;
}

unsigned long test_3(unsigned long val, unsigned long scale)
{
	if (val + scale) {
		return val + scale * 4;
	}
	return 3;
}

The results:

$ arm-linux-gnueabihf-gcc-7 -O2 -mcpu=cortex-a7 -c shladd.c
$ arm-linux-gnueabihf-objdump -d shladd.o

00000000 <test_1>:
--- this is the optimal code
   0:	f850 0021 	ldr.w	r0, [r0, r1, lsl #2]
   4:	4770      	bx	lr
   6:	bf00      	nop

00000008 <test_2>:
   8:	5c43      	ldrb	r3, [r0, r1]
   a:	1842      	adds	r2, r0, r1
   c:	b123      	cbz	r3, 18 <test_2+0x10>
--- here we can see multiplication by 3, although there was no multiplication
--- by 3 in the source code
   e:	2303      	movs	r3, #3
  10:	fb03 f101 	mul.w	r1, r3, r1
  14:	5850      	ldr	r0, [r2, r1]
  16:	4770      	bx	lr
  18:	2003      	movs	r0, #3
  1a:	4770      	bx	lr

0000001c <test_3>:
  1c:	1840      	adds	r0, r0, r1
  1e:	bf1a      	itte	ne
  20:	2303      	movne	r3, #3
--- again - multiply-add by 3
  22:	fb03 0001 	mlane	r0, r3, r1, r0
  26:	2003      	moveq	r0, #3
  28:	4770      	bx	lr
  2a:	bf00      	nop

$ aarch64-linux-gnu-gcc-7 -O2 -mcpu=cortex-a53 -c shladd.c
$ aarch64-linux-gnu-objdump -d shladd.o

0000000000000000 <test_1>:
--- scaled addressing
   0:	b8617800 	ldr	w0, [x0, x1, lsl #2]
   4:	d65f03c0 	ret
   8:	d503201f 	nop
   c:	d503201f 	nop

0000000000000010 <test_2>:
  10:	8b010002 	add	x2, x0, x1
  14:	38616800 	ldrb	w0, [x0, x1]
  18:	34000080 	cbz	w0, 28 <test_2+0x18>
--- multiplication by 3
  1c:	8b010421 	add	x1, x1, x1, lsl #1
--- no scaled addressing
  20:	b8616840 	ldr	w0, [x2, x1]
  24:	d65f03c0 	ret
  28:	52800060 	mov	w0, #0x3	// #3
  2c:	d65f03c0 	ret
0000000000000030 <test_3>:
  30:	8b010000 	add	x0, x0, x1
--- multiplication by 3
  34:	8b010421 	add	x1, x1, x1, lsl #1
  38:	8b000021 	add	x1, x1, x0
  3c:	f100001f 	cmp	x0, #0x0
  40:	d2800062 	mov	x2, #0x3	// #3
  44:	9a821020 	csel	x0, x1, x2, ne	// ne = any
  48:	d65f03c0 	ret

$ x86_64-linux-gnu-gcc-7 -O2 -m32 -march=i686 -c shladd.c
$ x86_64-linux-gnu-objdump -d shladd.o

00000000 <test_1>:
   0:	8b 44 24 04          	mov    0x4(%esp),%eax
   4:	8b 54 24 08          	mov    0x8(%esp),%edx
   8:	8b 04 90             	mov    (%eax,%edx,4),%eax
   b:	c3                   	ret
   c:	8d 74 26 00          	lea    0x0(%esi,%eiz,1),%esi

00000010 <test_2>:
  10:	8b 44 24 08          	mov    0x8(%esp),%eax
  14:	8b 54 24 04          	mov    0x4(%esp),%edx
  18:	01 c2                	add    %eax,%edx
  1a:	80 3a 00             	cmpb   $0x0,(%edx)
  1d:	74 11                	je     30 <test_2+0x20>
--- here it uses lea to multiply by 3, instead of using addressing scaled by 4
  1f:	8d 04 40             	lea    (%eax,%eax,2),%eax
  22:	8b 04 02             	mov    (%edx,%eax,1),%eax
  25:	c3                   	ret
  26:	8d 76 00             	lea    0x0(%esi),%esi
  29:	8d bc 27 00 00 00 00 	lea    0x0(%edi,%eiz,1),%edi
  30:	b8 03 00 00 00       	mov    $0x3,%eax
  35:	c3                   	ret
  36:	8d 76 00             	lea    0x0(%esi),%esi
  39:	8d bc 27 00 00 00 00 	lea    0x0(%edi,%eiz,1),%edi

00000040 <test_3>:
  40:	8b 54 24 08          	mov    0x8(%esp),%edx
  44:	89 d1                	mov    %edx,%ecx
  46:	03 4c 24 04          	add    0x4(%esp),%ecx
  4a:	74 0c                	je     58 <test_3+0x18>
--- again, multiplication by 3 and addition
  4c:	8d 04 52             	lea    (%edx,%edx,2),%eax
  4f:	01 c8                	add    %ecx,%eax
  51:	c3                   	ret
  52:	8d b6 00 00 00 00    	lea    0x0(%esi),%esi
  58:	b8 03 00 00 00       	mov    $0x3,%eax
  5d:	c3                   	ret

$ x86_64-linux-gnu-gcc-7 -O2 -m64 -march=nocona -c shladd.c
$ x86_64-linux-gnu-objdump -d shladd.o

0000000000000000 <test_1>:
   0:	8b 04 b7             	mov    (%rdi,%rsi,4),%eax
   3:	c3                   	retq

0000000000000004 <test_2>:
   4:	48 01 f7             	add    %rsi,%rdi
   7:	80 3f 00             	cmpb   $0x0,(%rdi)
   a:	74 08                	je     14 <test_2+0x10>
--- multiplication by 3
   c:	48 8d 04 76          	lea    (%rsi,%rsi,2),%rax
  10:	8b 04 07             	mov    (%rdi,%rax,1),%eax
  13:	c3                   	retq
  14:	b8 03 00 00 00       	mov    $0x3,%eax
  19:	c3                   	retq

000000000000001a <test_3>:
  1a:	48 01 f7             	add    %rsi,%rdi
  1d:	74 08                	je     27 <test_3+0xd>
--- multiplication by 3 and addition
  1f:	48 8d 04 76          	lea    (%rsi,%rsi,2),%rax
  23:	48 01 f8             	add    %rdi,%rax
  26:	c3                   	retq
  27:	b8 03 00 00 00       	mov    $0x3,%eax
  2c:	c3                   	retq

$ hppa-linux-gnu-gcc-7 -O2 -c shladd.c
$ hppa-linux-gnu-objdump -d shladd.o

00000000 <test_1>:
   0:	e8 40 c0 00 	bv r0(rp)
--- here we use scaled addressing
   4:	0f 59 20 9c 	ldw,s r25(r26),ret0

00000008 <test_2>:
   8:	0f 3a 00 13 	ldb r26(r25),r19
   c:	34 1c 00 06 	ldi 3,ret0
  10:	86 60 20 10 	cmpib,= 0,r19,20 <test_2+0x18>
  14:	0b 3a 0a 1a 	add,l r26,r25,r26
--- the next instruction multiplies by 3
  18:	0b 39 0a 59 	shladd,l r25,1,r25,r25
--- and here we don't use scaled addressing
  1c:	0f 3a 00 9c 	ldw r26(r25),ret0
  20:	e8 40 c0 02 	bv,n r0(rp)

00000024 <test_3>:
  24:	0b 3a 0a 1a 	add,l r26,r25,r26
  28:	87 40 20 10 	cmpib,= 0,r26,38 <test_3+0x14>
  2c:	34 1c 00 06 	ldi 3,ret0
--- the next instruction multiplies by 3
  30:	0b 39 0a 59 	shladd,l r25,1,r25,r25
  34:	0b 59 0a 1c 	add,l r25,r26,ret0
  38:	e8 40 c0 02 	bv,n r0(rp)

$ sh4-linux-gnu-gcc-7 -O2 -c shladd.c
$ sh4-linux-gnu-objdump -d shladd.o

00000000 <test_1>:
--- left shift by 2
   0:	08 45       	shll2	r5
   2:	53 60       	mov	r5,r0
   4:	0b 00       	rts
   6:	4e 00       	mov.l	@(r0,r4),r0

00000008 <test_2>:
   8:	5c 34       	add	r5,r4
   a:	40 61       	mov.b	@r4,r1
   c:	18 21       	tst	r1,r1
   e:	04 89       	bt	1a <test_2+0x12>
--- reusing r4 and not using a shift
  10:	53 60       	mov	r5,r0
  12:	0c 30       	add	r0,r0
  14:	5c 30       	add	r5,r0
  16:	0b 00       	rts
  18:	4e 00       	mov.l	@(r0,r4),r0
  1a:	0b 00       	rts
  1c:	03 e0       	mov	#3,r0
  1e:	09 00       	nop

00000020 <test_3>:
  20:	5c 34       	add	r5,r4
  22:	48 24       	tst	r4,r4
  24:	04 89       	bt	30 <test_3+0x10>
--- not using a shift
  26:	53 60       	mov	r5,r0
  28:	0c 30       	add	r0,r0
  2a:	5c 30       	add	r5,r0
  2c:	0b 00       	rts
  2e:	4c 30       	add	r4,r0
  30:	0b 00       	rts
  32:	03 e0       	mov	#3,r0

$ alpha-linux-gnu-gcc-7 -O2 -c shladd.c
$ alpha-linux-gnu-objdump -d shladd.o

0000000000000000 <test_1>:
   0:	51 14 20 42 	s4addq	a1,0,a1
   4:	11 04 11 42 	addq	a0,a1,a1
   8:	00 00 11 a0 	ldl	v0,0(a1)
   c:	01 80 fa 6b 	ret

0000000000000010 <test_2>:
  10:	10 04 11 42 	addq	a0,a1,a0
  14:	03 00 1f 20 	lda	v0,3
  18:	00 00 30 2c 	ldq_u	t0,0(a0)
  1c:	c1 00 30 48 	extbl	t0,a0,t0
  20:	04 00 20 e4 	beq	t0,34 <test_2+0x24>
--- multiplication by 4
  24:	41 14 20 42 	s4addq	a1,0,t0
--- subtraction, so that a1 is multiplied by 3
  28:	31 05 31 40 	subq	t0,a1,a1
  2c:	10 04 11 42 	addq	a0,a1,a0
  30:	00 00 10 a0 	ldl	v0,0(a0)
  34:	01 80 fa 6b 	ret
  38:	1f 04 ff 47 	nop
  3c:	00 00 fe 2f 	unop

0000000000000040 <test_3>:
  40:	41 14 20 42 	s4addq	a1,0,t0
  44:	00 04 11 42 	addq	a0,a1,v0
  48:	10 04 01 42 	addq	a0,t0,a0
  4c:	90 74 00 44 	cmoveq	v0,0x3,a0
  50:	00 04 f0 47 	mov	a0,v0
  54:	01 80 fa 6b 	ret
  58:	1f 04 ff 47 	nop
  5c:	00 00 fe 2f 	unop
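
To make the integer case explicit, this is roughly what the slsr rewrite of
test_3 looks like when written back as C (an illustration only - the pass
works on GIMPLE, and the helper below is just the usual power-of-two test
such a check could use, not an existing GCC function):

/* Source-level sketch of what slsr effectively does to test_3:
   val + scale * 4 is re-expressed as (val + scale) + scale * 3,
   trading a shift-by-2 (or a scaled addressing mode) for a
   multiplication by 3.  */
unsigned long test_3_after_slsr(unsigned long val, unsigned long scale)
{
	unsigned long base = val + scale;	/* the reused basis */
	if (base) {
		return base + scale * 3;	/* was: val + scale * 4 */
	}
	return 3;
}

/* The kind of check the pass could apply before the rewrite: keep the
   original expression when its multiplier is a power of two and the
   replacement multiplier is not.  (Illustrative helper, not a GCC API.)  */
static int is_power_of_2(unsigned long m)
{
	return m != 0 && (m & (m - 1)) == 0;
}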