https://gcc.gnu.org/bugzilla/show_bug.cgi?id=83253
            Bug ID: 83253
           Summary: -ftree-slsr causes performance regression
           Product: gcc
           Version: 7.2.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: tree-optimization
          Assignee: unassigned at gcc dot gnu.org
          Reporter: mikulas at artax dot karlin.mff.cuni.cz
  Target Milestone: ---

The straight-line strength reduction optimization causes a code size increase
and a performance degradation in some cases. I reduced it to this simple
function:

unsigned test_2(unsigned char *ptr, unsigned long scale)
{
	if (*(ptr + scale)) {
		return *(unsigned *)(ptr + scale * 4);
	}
	return 3;
}

The slsr optimization translates the function into:

unsigned test_2(unsigned char *ptr, unsigned long scale)
{
	unsigned char *tmp = ptr + scale;
	if (*tmp) {
		return *(unsigned *)(tmp + scale * 3);
	}
	return 3;
}

We can see that it replaces a multiplication by 4 with a multiplication by 3.
Consequently, the backend won't generate the instruction with scaled
addressing.

A similar problem happens without pointers, with plain integer arithmetic:
if the function contains the expressions "val + scale" and "val + scale * 4",
the slsr optimization will try to reuse the first expression when calculating
the second one, replacing the multiplication by 4 with a multiplication by 3,
which is less efficient.

The slsr pass should be fixed so that it recognizes that replacing a
multiplication by a power of two with a multiplication by a non-power of two
is never beneficial.

The misoptimization happens only with some architectures and some target
processors:
* on X86, it happens when optimizing for 32-bit processors (i386, i486, i586,
  i686, pentium4)
* on X86-64, it happens only when optimizing for nocona
* on ARM, it happens when optimizing for cortex-a7, cortex-a5 and the older
  cores (arm11...)
* on aarch64, it happens when optimizing for cortex-a53 or cortex-a35
* it also happens on alpha, hppa, sh4

Example code:

unsigned test_1(unsigned char *ptr, unsigned long scale)
{
	return *(unsigned *)(ptr + scale * 4);
}

unsigned test_2(unsigned char *ptr, unsigned long scale)
{
	if (*(ptr + scale)) {
		return *(unsigned *)(ptr + scale * 4);
	}
	return 3;
}

unsigned long test_3(unsigned long val, unsigned long scale)
{
	if (val + scale) {
		return val + scale * 4;
	}
	return 3;
}

The results:

$ arm-linux-gnueabihf-gcc-7 -O2 -mcpu=cortex-a7 -c shladd.c
$ arm-linux-gnueabihf-objdump -d shladd.o

00000000 <test_1>:
--- this is the optimal code
   0:	f850 0021 	ldr.w	r0, [r0, r1, lsl #2]
   4:	4770      	bx	lr
   6:	bf00      	nop

00000008 <test_2>:
   8:	5c43      	ldrb	r3, [r0, r1]
   a:	1842      	adds	r2, r0, r1
   c:	b123      	cbz	r3, 18 <test_2+0x10>
--- here we can see multiplication by 3, although there was no multiplication
--- by 3 in the source code
   e:	2303      	movs	r3, #3
  10:	fb03 f101 	mul.w	r1, r3, r1
  14:	5850      	ldr	r0, [r2, r1]
  16:	4770      	bx	lr
  18:	2003      	movs	r0, #3
  1a:	4770      	bx	lr

0000001c <test_3>:
  1c:	1840      	adds	r0, r0, r1
  1e:	bf1a      	itte	ne
  20:	2303      	movne	r3, #3
--- again - multiply-add by 3
  22:	fb03 0001 	mlane	r0, r3, r1, r0
  26:	2003      	moveq	r0, #3
  28:	4770      	bx	lr
  2a:	bf00      	nop

$ aarch64-linux-gnu-gcc-7 -O2 -mcpu=cortex-a53 -c shladd.c
$ aarch64-linux-gnu-objdump -d shladd.o

0000000000000000 <test_1>:
--- scaled addressing
   0:	b8617800 	ldr	w0, [x0, x1, lsl #2]
   4:	d65f03c0 	ret
   8:	d503201f 	nop
   c:	d503201f 	nop

0000000000000010 <test_2>:
  10:	8b010002 	add	x2, x0, x1
  14:	38616800 	ldrb	w0, [x0, x1]
  18:	34000080 	cbz	w0, 28 <test_2+0x18>
--- multiplication by 3
  1c:	8b010421 	add	x1, x1, x1, lsl #1
--- no scaled addressing
  20:	b8616840 	ldr	w0, [x2, x1]
  24:	d65f03c0 	ret
  28:	52800060 	mov	w0, #0x3	// #3
  2c:	d65f03c0 	ret
0000000000000030 <test_3>:
  30:	8b010000 	add	x0, x0, x1
--- multiplication by 3
  34:	8b010421 	add	x1, x1, x1, lsl #1
  38:	8b000021 	add	x1, x1, x0
  3c:	f100001f 	cmp	x0, #0x0
  40:	d2800062 	mov	x2, #0x3	// #3
  44:	9a821020 	csel	x0, x1, x2, ne	// ne = any
  48:	d65f03c0 	ret

$ x86_64-linux-gnu-gcc-7 -O2 -m32 -march=i686 -c shladd.c
$ x86_64-linux-gnu-objdump -d shladd.o

00000000 <test_1>:
   0:	8b 44 24 04          	mov    0x4(%esp),%eax
   4:	8b 54 24 08          	mov    0x8(%esp),%edx
   8:	8b 04 90             	mov    (%eax,%edx,4),%eax
   b:	c3                   	ret
   c:	8d 74 26 00          	lea    0x0(%esi,%eiz,1),%esi

00000010 <test_2>:
  10:	8b 44 24 08          	mov    0x8(%esp),%eax
  14:	8b 54 24 04          	mov    0x4(%esp),%edx
  18:	01 c2                	add    %eax,%edx
  1a:	80 3a 00             	cmpb   $0x0,(%edx)
  1d:	74 11                	je     30 <test_2+0x20>
--- here it uses lea to multiply by 3, instead of using addressing scaled by 4
  1f:	8d 04 40             	lea    (%eax,%eax,2),%eax
  22:	8b 04 02             	mov    (%edx,%eax,1),%eax
  25:	c3                   	ret
  26:	8d 76 00             	lea    0x0(%esi),%esi
  29:	8d bc 27 00 00 00 00 	lea    0x0(%edi,%eiz,1),%edi
  30:	b8 03 00 00 00       	mov    $0x3,%eax
  35:	c3                   	ret
  36:	8d 76 00             	lea    0x0(%esi),%esi
  39:	8d bc 27 00 00 00 00 	lea    0x0(%edi,%eiz,1),%edi

00000040 <test_3>:
  40:	8b 54 24 08          	mov    0x8(%esp),%edx
  44:	89 d1                	mov    %edx,%ecx
  46:	03 4c 24 04          	add    0x4(%esp),%ecx
  4a:	74 0c                	je     58 <test_3+0x18>
--- again, multiplication by 3 and addition
  4c:	8d 04 52             	lea    (%edx,%edx,2),%eax
  4f:	01 c8                	add    %ecx,%eax
  51:	c3                   	ret
  52:	8d b6 00 00 00 00    	lea    0x0(%esi),%esi
  58:	b8 03 00 00 00       	mov    $0x3,%eax
  5d:	c3                   	ret

$ x86_64-linux-gnu-gcc-7 -O2 -m64 -march=nocona -c shladd.c
$ x86_64-linux-gnu-objdump -d shladd.o

0000000000000000 <test_1>:
   0:	8b 04 b7             	mov    (%rdi,%rsi,4),%eax
   3:	c3                   	retq

0000000000000004 <test_2>:
   4:	48 01 f7             	add    %rsi,%rdi
   7:	80 3f 00             	cmpb   $0x0,(%rdi)
   a:	74 08                	je     14 <test_2+0x10>
--- multiplication by 3
   c:	48 8d 04 76          	lea    (%rsi,%rsi,2),%rax
  10:	8b 04 07             	mov    (%rdi,%rax,1),%eax
  13:	c3                   	retq
  14:	b8 03 00 00 00       	mov    $0x3,%eax
  19:	c3                   	retq

000000000000001a <test_3>:
  1a:	48 01 f7             	add    %rsi,%rdi
  1d:	74 08                	je     27 <test_3+0xd>
--- multiplication by 3 and addition
  1f:	48 8d 04 76          	lea    (%rsi,%rsi,2),%rax
  23:	48 01 f8             	add    %rdi,%rax
  26:	c3                   	retq
  27:	b8 03 00 00 00       	mov    $0x3,%eax
  2c:	c3                   	retq

$ hppa-linux-gnu-gcc-7 -O2 -c shladd.c
$ hppa-linux-gnu-objdump -d shladd.o

00000000 <test_1>:
   0:	e8 40 c0 00 	bv r0(rp)
--- here we use scaled addressing
   4:	0f 59 20 9c 	ldw,s r25(r26),ret0

00000008 <test_2>:
   8:	0f 3a 00 13 	ldb r26(r25),r19
   c:	34 1c 00 06 	ldi 3,ret0
  10:	86 60 20 10 	cmpib,= 0,r19,20 <test_2+0x18>
  14:	0b 3a 0a 1a 	add,l r26,r25,r26
--- the next instruction multiplies by 3
  18:	0b 39 0a 59 	shladd,l r25,1,r25,r25
--- and here we don't use scaled addressing
  1c:	0f 3a 00 9c 	ldw r26(r25),ret0
  20:	e8 40 c0 02 	bv,n r0(rp)

00000024 <test_3>:
  24:	0b 3a 0a 1a 	add,l r26,r25,r26
  28:	87 40 20 10 	cmpib,= 0,r26,38 <test_3+0x14>
  2c:	34 1c 00 06 	ldi 3,ret0
--- the next instruction multiplies by 3
  30:	0b 39 0a 59 	shladd,l r25,1,r25,r25
  34:	0b 59 0a 1c 	add,l r25,r26,ret0
  38:	e8 40 c0 02 	bv,n r0(rp)

$ sh4-linux-gnu-gcc-7 -O2 -c shladd.c
$ sh4-linux-gnu-objdump -d shladd.o

00000000 <test_1>:
--- left shift by 2
   0:	08 45       	shll2	r5
   2:	53 60       	mov	r5,r0
   4:	0b 00       	rts
   6:	4e 00       	mov.l	@(r0,r4),r0

00000008 <test_2>:
   8:	5c 34       	add	r5,r4
   a:	40 61       	mov.b	@r4,r1
   c:	18 21       	tst	r1,r1
   e:	04 89       	bt	1a <test_2+0x12>
--- reusing r4 and not using a shift
  10:	53 60       	mov	r5,r0
  12:	0c 30       	add	r0,r0
  14:	5c 30       	add	r5,r0
  16:	0b 00       	rts
  18:	4e 00       	mov.l	@(r0,r4),r0
  1a:	0b 00       	rts
  1c:	03 e0       	mov	#3,r0
  1e:	09 00       	nop

00000020 <test_3>:
  20:	5c 34       	add	r5,r4
  22:	48 24       	tst	r4,r4
  24:	04 89       	bt	30 <test_3+0x10>
--- not using a shift
  26:	53 60       	mov	r5,r0
  28:	0c 30       	add	r0,r0
  2a:	5c 30       	add	r5,r0
  2c:	0b 00       	rts
  2e:	4c 30       	add	r4,r0
  30:	0b 00       	rts
  32:	03 e0       	mov	#3,r0

$ alpha-linux-gnu-gcc-7 -O2 -c shladd.c
$ alpha-linux-gnu-objdump -d shladd.o

0000000000000000 <test_1>:
   0:	51 14 20 42 	s4addq	a1,0,a1
   4:	11 04 11 42 	addq	a0,a1,a1
   8:	00 00 11 a0 	ldl	v0,0(a1)
   c:	01 80 fa 6b 	ret

0000000000000010 <test_2>:
  10:	10 04 11 42 	addq	a0,a1,a0
  14:	03 00 1f 20 	lda	v0,3
  18:	00 00 30 2c 	ldq_u	t0,0(a0)
  1c:	c1 00 30 48 	extbl	t0,a0,t0
  20:	04 00 20 e4 	beq	t0,34 <test_2+0x24>
--- multiplication by 4
  24:	41 14 20 42 	s4addq	a1,0,t0
--- subtraction, so that a1 is multiplied by 3
  28:	31 05 31 40 	subq	t0,a1,a1
  2c:	10 04 11 42 	addq	a0,a1,a0
  30:	00 00 10 a0 	ldl	v0,0(a0)
  34:	01 80 fa 6b 	ret
  38:	1f 04 ff 47 	nop
  3c:	00 00 fe 2f 	unop

0000000000000040 <test_3>:
  40:	41 14 20 42 	s4addq	a1,0,t0
  44:	00 04 11 42 	addq	a0,a1,v0
  48:	10 04 01 42 	addq	a0,t0,a0
  4c:	90 74 00 44 	cmoveq	v0,0x3,a0
  50:	00 04 f0 47 	mov	a0,v0
  54:	01 80 fa 6b 	ret
  58:	1f 04 ff 47 	nop
  5c:	00 00 fe 2f 	unop
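
To make the integer case explicit, this is roughly what the slsr rewrite of
test_3 looks like when written back as C (an illustration only - the pass
works on GIMPLE, and the helper below is just the usual power-of-two test
such a check could use, not an existing GCC function):

/* Source-level sketch of what slsr effectively does to test_3:
   val + scale * 4 is re-expressed as (val + scale) + scale * 3,
   trading a shift-by-2 (or a scaled addressing mode) for a
   multiplication by 3.  */
unsigned long test_3_after_slsr(unsigned long val, unsigned long scale)
{
	unsigned long base = val + scale;	/* the reused basis */
	if (base) {
		return base + scale * 3;	/* was: val + scale * 4 */
	}
	return 3;
}

/* The kind of check the pass could apply before the rewrite: keep the
   original expression when its multiplier is a power of two and the
   replacement multiplier is not.  (Illustrative helper, not a GCC API.)  */
static int is_power_of_2(unsigned long m)
{
	return m != 0 && (m & (m - 1)) == 0;
}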