https://gcc.gnu.org/bugzilla/show_bug.cgi?id=66003
Bug ID: 66003 Summary: missed cse opportunity in addr expressions because of tree pre/lim Product: gcc Version: 6.0 Status: UNCONFIRMED Severity: normal Priority: P3 Component: tree-optimization Assignee: unassigned at gcc dot gnu.org Reporter: amker at gcc dot gnu.org Target Milestone: --- The simple case below is reduced from SPEC: typedef struct { int x; int y; } coord; extern unsigned short **org; extern coord *c; void bar (unsigned short *ptr); void foo (int s, int n) { unsigned short arr[256], *ptr = arr; int x, y; for (y = c->y; y < c->y + 16; y++) for (x = c->x; x < c->x + 16; x++) *ptr++ = org [y][x]; bar (ptr); } When compiling with the two command lines below, A: $gcc -Ofast -S test.c -o x.S B: $gcc -Ofast -S test.c -o y.S -fno-tree-pre -fno-tree-loop-im the assembly difference is as follows: $ diff x.S y.S 12,14c12,34 < subq $520, %rsp < .cfi_def_cfa_offset 528 < movq c(%rip), %rdx --- > pushq %r15 > .cfi_def_cfa_offset 16 > .cfi_offset 15, -16 > pushq %r14 > .cfi_def_cfa_offset 24 > .cfi_offset 14, -24 > pushq %r13 > .cfi_def_cfa_offset 32 > .cfi_offset 13, -32 > pushq %r12 > .cfi_def_cfa_offset 40 > .cfi_offset 12, -40 > pushq %rbp > .cfi_def_cfa_offset 48 > .cfi_offset 6, -48 > pushq %rbx > .cfi_def_cfa_offset 56 > .cfi_offset 3, -56 > subq $568, %rsp > .cfi_def_cfa_offset 624 > movq c(%rip), %rax > movslq (%rax), %rsi > movslq 4(%rax), %rdx 16,20c36,58 < movslq 4(%rdx), %rcx < leaq (%rax,%rcx,8), %rsi < movslq (%rdx), %rcx < movq %rsp, %rax < addq %rcx, %rcx --- > addq %rsi, %rsi > leaq 24(%rsi), %rcx > leaq 22(%rsi), %rdi > leaq 2(%rsi), %r15 > leaq 4(%rsi), %r14 > leaq 6(%rsi), %r13 > leaq 8(%rsi), %r12 > movq %rcx, 8(%rsp) > leaq 26(%rsi), %rcx > leaq 10(%rsi), %rbp > leaq 12(%rsi), %rbx > leaq 14(%rsi), %r11 > leaq 16(%rsi), %r10 > movq %rcx, 16(%rsp) > leaq 28(%rsi), %rcx > leaq 18(%rsi), %r9 > leaq 20(%rsi), %r8 > movq %rdi, 40(%rsp) > movq %rcx, 24(%rsp) > leaq 30(%rsi), %rcx > movq %rcx, 32(%rsp) > leaq (%rax,%rdx,8), %rcx > leaq 48(%rsp), %rax 24c62 < 
movq (%rsi), %rdx --- > movq (%rcx), %rdx 26,27c64,65 < addq $8, %rsi < movzwl (%rdx,%rcx), %edi --- > addq $8, %rcx > movzwl (%rdx,%rsi), %edi 29c67 < movzwl 2(%rdx,%rcx), %edi --- > movzwl (%rdx,%r15), %edi 31c69 < movzwl 4(%rdx,%rcx), %edi --- > movzwl (%rdx,%r14), %edi 33c71 < movzwl 6(%rdx,%rcx), %edi --- > movzwl (%rdx,%r13), %edi 35c73 < movzwl 8(%rdx,%rcx), %edi --- > movzwl (%rdx,%r12), %edi 37c75 < movzwl 10(%rdx,%rcx), %edi --- > movzwl (%rdx,%rbp), %edi 39c77 < movzwl 12(%rdx,%rcx), %edi --- > movzwl (%rdx,%rbx), %edi 41c79 < movzwl 14(%rdx,%rcx), %edi --- > movzwl (%rdx,%r11), %edi 43c81 < movzwl 16(%rdx,%rcx), %edi --- > movzwl (%rdx,%r10), %edi 45c83 < movzwl 18(%rdx,%rcx), %edi --- > movzwl (%rdx,%r9), %edi 47c85 < movzwl 20(%rdx,%rcx), %edi --- > movzwl (%rdx,%r8), %edi 49c87,88 < movzwl 22(%rdx,%rcx), %edi --- > movq 40(%rsp), %rdi > movzwl (%rdx,%rdi), %edi 51c90,91 < movzwl 24(%rdx,%rcx), %edi --- > movq 8(%rsp), %rdi > movzwl (%rdx,%rdi), %edi 53c93,94 < movzwl 26(%rdx,%rcx), %edi --- > movq 16(%rsp), %rdi > movzwl (%rdx,%rdi), %edi 55c96,97 < movzwl 28(%rdx,%rcx), %edi --- > movq 24(%rsp), %rdi > movzwl (%rdx,%rdi), %edi 57c99,100 < movzwl 30(%rdx,%rcx), %edx --- > movq 32(%rsp), %rdi > movzwl (%rdx,%rdi), %edx 59c102 < leaq 512(%rsp), %rdx --- > leaq 560(%rsp), %rdx 64c107,119 < addq $520, %rsp --- > addq $568, %rsp > .cfi_def_cfa_offset 56 > popq %rbx > .cfi_def_cfa_offset 48 > popq %rbp > .cfi_def_cfa_offset 40 > popq %r12 > .cfi_def_cfa_offset 32 > popq %r13 > .cfi_def_cfa_offset 24 > popq %r14 > .cfi_def_cfa_offset 16 > popq %r15 The tree-pre dump is as below: <bb 2>: c.0_8 = c; y_9 = c.0_8->y; _47 = y_9 + 15; pretmp_112 = c.0_8->x; pretmp_128 = org; pretmp_144 = (long unsigned int) pretmp_112; pretmp_159 = pretmp_144 * 2; pretmp_160 = pretmp_112 + 1; pretmp_175 = (long unsigned int) pretmp_160; pretmp_176 = pretmp_175 * 2; pretmp_191 = pretmp_112 + 2; pretmp_192 = (long unsigned int) pretmp_191; pretmp_207 = pretmp_192 * 2; pretmp_208 = 
pretmp_112 + 3; pretmp_223 = (long unsigned int) pretmp_208; pretmp_224 = pretmp_223 * 2; pretmp_239 = pretmp_112 + 4; pretmp_240 = (long unsigned int) pretmp_239; pretmp_255 = pretmp_240 * 2; pretmp_256 = pretmp_112 + 5; pretmp_271 = (long unsigned int) pretmp_256; pretmp_283 = pretmp_271 * 2; pretmp_12 = pretmp_112 + 6; pretmp_50 = (long unsigned int) pretmp_12; pretmp_51 = pretmp_50 * 2; pretmp_52 = pretmp_112 + 7; pretmp_53 = (long unsigned int) pretmp_52; pretmp_65 = pretmp_53 * 2; pretmp_66 = pretmp_112 + 8; pretmp_67 = (long unsigned int) pretmp_66; pretmp_68 = pretmp_67 * 2; pretmp_69 = pretmp_112 + 9; pretmp_81 = (long unsigned int) pretmp_69; pretmp_82 = pretmp_81 * 2; pretmp_83 = pretmp_112 + 10; pretmp_84 = (long unsigned int) pretmp_83; pretmp_85 = pretmp_84 * 2; pretmp_97 = pretmp_112 + 11; pretmp_98 = (long unsigned int) pretmp_97; pretmp_99 = pretmp_98 * 2; pretmp_100 = pretmp_112 + 12; pretmp_101 = (long unsigned int) pretmp_100; pretmp_113 = pretmp_101 * 2; pretmp_114 = pretmp_112 + 13; pretmp_115 = (long unsigned int) pretmp_114; pretmp_116 = pretmp_115 * 2; pretmp_117 = pretmp_112 + 14; pretmp_129 = (long unsigned int) pretmp_117; pretmp_130 = pretmp_129 * 2; pretmp_131 = pretmp_112 + 15; pretmp_132 = (long unsigned int) pretmp_131; pretmp_133 = pretmp_132 * 2; <bb 3>: # ptr_48 = PHI <&arr(2), ptr_272(3)> # y_64 = PHI <y_9(2), y_25(3)> _34 = (long unsigned int) y_64; _35 = _34 * 8; _36 = pretmp_128 + _35; _37 = *_36; _40 = _37 + pretmp_159; _41 = *_40; *ptr_48 = _41; _56 = _37 + pretmp_176; _57 = *_56; MEM[(short unsigned int *)ptr_48 + 2B] = _57; _72 = _37 + pretmp_207; _73 = *_72; MEM[(short unsigned int *)ptr_48 + 4B] = _73; _88 = _37 + pretmp_224; _89 = *_88; MEM[(short unsigned int *)ptr_48 + 6B] = _89; _104 = _37 + pretmp_255; _105 = *_104; MEM[(short unsigned int *)ptr_48 + 8B] = _105; _120 = _37 + pretmp_283; _121 = *_120; MEM[(short unsigned int *)ptr_48 + 10B] = _121; _136 = _37 + pretmp_51; _137 = *_136; MEM[(short unsigned int 
*)ptr_48 + 12B] = _137; _152 = _37 + pretmp_65; _153 = *_152; MEM[(short unsigned int *)ptr_48 + 14B] = _153; _168 = _37 + pretmp_68; _169 = *_168; MEM[(short unsigned int *)ptr_48 + 16B] = _169; _184 = _37 + pretmp_82; _185 = *_184; MEM[(short unsigned int *)ptr_48 + 18B] = _185; _200 = _37 + pretmp_85; _201 = *_200; MEM[(short unsigned int *)ptr_48 + 20B] = _201; _216 = _37 + pretmp_99; _217 = *_216; MEM[(short unsigned int *)ptr_48 + 22B] = _217; _232 = _37 + pretmp_113; _233 = *_232; MEM[(short unsigned int *)ptr_48 + 24B] = _233; _248 = _37 + pretmp_116; _249 = *_248; MEM[(short unsigned int *)ptr_48 + 26B] = _249; _264 = _37 + pretmp_130; _265 = *_264; MEM[(short unsigned int *)ptr_48 + 28B] = _265; ptr_272 = &MEM[(void *)ptr_48 + 32B]; _280 = _37 + pretmp_133; _281 = *_280; MEM[(short unsigned int *)ptr_48 + 30B] = _281; y_25 = y_64 + 1; if (y_25 > _47) goto <bb 4>; else goto <bb 3>; PRE hoists the index part of the address expression "base + (reg + i) * 2" out of the first loop. This increases register pressure and prevents GCC from using the powerful addressing modes available on x86. On other targets such as ARM, only the register pressure issue may apply. Both PRE and LIM perform the same transformation.