https://gcc.gnu.org/bugzilla/show_bug.cgi?id=66003

            Bug ID: 66003
           Summary: missed cse opportunity in addr expressions because of
                    tree pre/lim
           Product: gcc
           Version: 6.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: tree-optimization
          Assignee: unassigned at gcc dot gnu.org
          Reporter: amker at gcc dot gnu.org
  Target Milestone: ---

The simple test case below is reduced from SPEC:
/* A pair of int coordinates.  foo reads the x and y fields through the
   global pointer c to obtain the starting values of its two loops.  */
typedef struct
{
  int x;
  int y;
} coord;

/* Declared here, defined outside this test case.  */
/* Indexed as org[y][x] inside foo.  */
extern unsigned short **org;
/* Supplies c->x and c->y, the loop start values used by foo.  */
extern coord *c;
/* Receives the final value of ptr from foo; its body is not shown.  */
void bar (unsigned short *ptr);
/* Copy elements of org into the local buffer arr, then hand the advanced
   pointer to bar.  y runs from c->y up to (but not including) c->y + 16
   and x from c->x up to c->x + 16; both bounds are re-read through c in
   each loop condition.  The parameters s and n are not used.  */
void foo (int s, int n)
{
  unsigned short arr[256], *ptr = arr;
  int x, y;

  for (y = c->y; y < c->y + 16; y++)
    for (x = c->x; x < c->x + 16; x++)
      /* Store one element and step ptr to the next slot of arr.  */
      *ptr++ = org [y][x];

  /* ptr now points just past the last element stored.  */
  bar (ptr);
}

When compiling with the two command lines below:
A: $gcc -Ofast -S test.c -o x.S
B: $gcc -Ofast -S test.c -o y.S -fno-tree-pre -fno-tree-loop-im

The assembly difference is as below:

$ diff  y.S x.S
12,14c12,34
<       subq    $520, %rsp
<       .cfi_def_cfa_offset 528
<       movq    c(%rip), %rdx
---
>       pushq   %r15
>       .cfi_def_cfa_offset 16
>       .cfi_offset 15, -16
>       pushq   %r14
>       .cfi_def_cfa_offset 24
>       .cfi_offset 14, -24
>       pushq   %r13
>       .cfi_def_cfa_offset 32
>       .cfi_offset 13, -32
>       pushq   %r12
>       .cfi_def_cfa_offset 40
>       .cfi_offset 12, -40
>       pushq   %rbp
>       .cfi_def_cfa_offset 48
>       .cfi_offset 6, -48
>       pushq   %rbx
>       .cfi_def_cfa_offset 56
>       .cfi_offset 3, -56
>       subq    $568, %rsp
>       .cfi_def_cfa_offset 624
>       movq    c(%rip), %rax
>       movslq  (%rax), %rsi
>       movslq  4(%rax), %rdx
16,20c36,58
<       movslq  4(%rdx), %rcx
<       leaq    (%rax,%rcx,8), %rsi
<       movslq  (%rdx), %rcx
<       movq    %rsp, %rax
<       addq    %rcx, %rcx
---
>       addq    %rsi, %rsi
>       leaq    24(%rsi), %rcx
>       leaq    22(%rsi), %rdi
>       leaq    2(%rsi), %r15
>       leaq    4(%rsi), %r14
>       leaq    6(%rsi), %r13
>       leaq    8(%rsi), %r12
>       movq    %rcx, 8(%rsp)
>       leaq    26(%rsi), %rcx
>       leaq    10(%rsi), %rbp
>       leaq    12(%rsi), %rbx
>       leaq    14(%rsi), %r11
>       leaq    16(%rsi), %r10
>       movq    %rcx, 16(%rsp)
>       leaq    28(%rsi), %rcx
>       leaq    18(%rsi), %r9
>       leaq    20(%rsi), %r8
>       movq    %rdi, 40(%rsp)
>       movq    %rcx, 24(%rsp)
>       leaq    30(%rsi), %rcx
>       movq    %rcx, 32(%rsp)
>       leaq    (%rax,%rdx,8), %rcx
>       leaq    48(%rsp), %rax
24c62
<       movq    (%rsi), %rdx
---
>       movq    (%rcx), %rdx
26,27c64,65
<       addq    $8, %rsi
<       movzwl  (%rdx,%rcx), %edi
---
>       addq    $8, %rcx
>       movzwl  (%rdx,%rsi), %edi
29c67
<       movzwl  2(%rdx,%rcx), %edi
---
>       movzwl  (%rdx,%r15), %edi
31c69
<       movzwl  4(%rdx,%rcx), %edi
---
>       movzwl  (%rdx,%r14), %edi
33c71
<       movzwl  6(%rdx,%rcx), %edi
---
>       movzwl  (%rdx,%r13), %edi
35c73
<       movzwl  8(%rdx,%rcx), %edi
---
>       movzwl  (%rdx,%r12), %edi
37c75
<       movzwl  10(%rdx,%rcx), %edi
---
>       movzwl  (%rdx,%rbp), %edi
39c77
<       movzwl  12(%rdx,%rcx), %edi
---
>       movzwl  (%rdx,%rbx), %edi
41c79
<       movzwl  14(%rdx,%rcx), %edi
---
>       movzwl  (%rdx,%r11), %edi
43c81
<       movzwl  16(%rdx,%rcx), %edi
---
>       movzwl  (%rdx,%r10), %edi
45c83
<       movzwl  18(%rdx,%rcx), %edi
---
>       movzwl  (%rdx,%r9), %edi
47c85
<       movzwl  20(%rdx,%rcx), %edi
---
>       movzwl  (%rdx,%r8), %edi
49c87,88
<       movzwl  22(%rdx,%rcx), %edi
---
>       movq    40(%rsp), %rdi
>       movzwl  (%rdx,%rdi), %edi
51c90,91
<       movzwl  24(%rdx,%rcx), %edi
---
>       movq    8(%rsp), %rdi
>       movzwl  (%rdx,%rdi), %edi
53c93,94
<       movzwl  26(%rdx,%rcx), %edi
---
>       movq    16(%rsp), %rdi
>       movzwl  (%rdx,%rdi), %edi
55c96,97
<       movzwl  28(%rdx,%rcx), %edi
---
>       movq    24(%rsp), %rdi
>       movzwl  (%rdx,%rdi), %edi
57c99,100
<       movzwl  30(%rdx,%rcx), %edx
---
>       movq    32(%rsp), %rdi
>       movzwl  (%rdx,%rdi), %edx
59c102
<       leaq    512(%rsp), %rdx
---
>       leaq    560(%rsp), %rdx
64c107,119
<       addq    $520, %rsp
---
>       addq    $568, %rsp
>       .cfi_def_cfa_offset 56
>       popq    %rbx
>       .cfi_def_cfa_offset 48
>       popq    %rbp
>       .cfi_def_cfa_offset 40
>       popq    %r12
>       .cfi_def_cfa_offset 32
>       popq    %r13
>       .cfi_def_cfa_offset 24
>       popq    %r14
>       .cfi_def_cfa_offset 16
>       popq    %r15

The tree-pre dump is as below:

  <bb 2>:
  c.0_8 = c;
  y_9 = c.0_8->y;
  _47 = y_9 + 15;
  pretmp_112 = c.0_8->x;
  pretmp_128 = org;
  pretmp_144 = (long unsigned int) pretmp_112;
  pretmp_159 = pretmp_144 * 2;
  pretmp_160 = pretmp_112 + 1;
  pretmp_175 = (long unsigned int) pretmp_160;
  pretmp_176 = pretmp_175 * 2;
  pretmp_191 = pretmp_112 + 2;
  pretmp_192 = (long unsigned int) pretmp_191;
  pretmp_207 = pretmp_192 * 2;
  pretmp_208 = pretmp_112 + 3;
  pretmp_223 = (long unsigned int) pretmp_208;
  pretmp_224 = pretmp_223 * 2;
  pretmp_239 = pretmp_112 + 4;
  pretmp_240 = (long unsigned int) pretmp_239;
  pretmp_255 = pretmp_240 * 2;
  pretmp_256 = pretmp_112 + 5;
  pretmp_271 = (long unsigned int) pretmp_256;
  pretmp_283 = pretmp_271 * 2;
  pretmp_12 = pretmp_112 + 6;
  pretmp_50 = (long unsigned int) pretmp_12;
  pretmp_51 = pretmp_50 * 2;
  pretmp_52 = pretmp_112 + 7;
  pretmp_53 = (long unsigned int) pretmp_52;
  pretmp_65 = pretmp_53 * 2;
  pretmp_66 = pretmp_112 + 8;
  pretmp_67 = (long unsigned int) pretmp_66;
  pretmp_68 = pretmp_67 * 2;
  pretmp_69 = pretmp_112 + 9;
  pretmp_81 = (long unsigned int) pretmp_69;
  pretmp_82 = pretmp_81 * 2;
  pretmp_83 = pretmp_112 + 10;
  pretmp_84 = (long unsigned int) pretmp_83;
  pretmp_85 = pretmp_84 * 2;
  pretmp_97 = pretmp_112 + 11;
  pretmp_98 = (long unsigned int) pretmp_97;
  pretmp_99 = pretmp_98 * 2;
  pretmp_100 = pretmp_112 + 12;
  pretmp_101 = (long unsigned int) pretmp_100;
  pretmp_113 = pretmp_101 * 2;
  pretmp_114 = pretmp_112 + 13;
  pretmp_115 = (long unsigned int) pretmp_114;
  pretmp_116 = pretmp_115 * 2;
  pretmp_117 = pretmp_112 + 14;
  pretmp_129 = (long unsigned int) pretmp_117;
  pretmp_130 = pretmp_129 * 2;
  pretmp_131 = pretmp_112 + 15;
  pretmp_132 = (long unsigned int) pretmp_131;
  pretmp_133 = pretmp_132 * 2;

  <bb 3>:
  # ptr_48 = PHI <&arr(2), ptr_272(3)>
  # y_64 = PHI <y_9(2), y_25(3)>
  _34 = (long unsigned int) y_64;
  _35 = _34 * 8;
  _36 = pretmp_128 + _35;
  _37 = *_36;
  _40 = _37 + pretmp_159;
  _41 = *_40;
  *ptr_48 = _41;
  _56 = _37 + pretmp_176;
  _57 = *_56;
  MEM[(short unsigned int *)ptr_48 + 2B] = _57;
  _72 = _37 + pretmp_207;
  _73 = *_72;
  MEM[(short unsigned int *)ptr_48 + 4B] = _73;
  _88 = _37 + pretmp_224;
  _89 = *_88;
  MEM[(short unsigned int *)ptr_48 + 6B] = _89;
  _104 = _37 + pretmp_255;
  _105 = *_104;
  MEM[(short unsigned int *)ptr_48 + 8B] = _105;
  _120 = _37 + pretmp_283;
  _121 = *_120;
  MEM[(short unsigned int *)ptr_48 + 10B] = _121;
  _136 = _37 + pretmp_51;
  _137 = *_136;
  MEM[(short unsigned int *)ptr_48 + 12B] = _137;
  _152 = _37 + pretmp_65;
  _153 = *_152;
  MEM[(short unsigned int *)ptr_48 + 14B] = _153;
  _168 = _37 + pretmp_68;
  _169 = *_168;
  MEM[(short unsigned int *)ptr_48 + 16B] = _169;
  _184 = _37 + pretmp_82;
  _185 = *_184;
  MEM[(short unsigned int *)ptr_48 + 18B] = _185;
  _200 = _37 + pretmp_85;
  _201 = *_200;
  MEM[(short unsigned int *)ptr_48 + 20B] = _201;
  _216 = _37 + pretmp_99;
  _217 = *_216;
  MEM[(short unsigned int *)ptr_48 + 22B] = _217;
  _232 = _37 + pretmp_113;
  _233 = *_232;
  MEM[(short unsigned int *)ptr_48 + 24B] = _233;
  _248 = _37 + pretmp_116;
  _249 = *_248;
  MEM[(short unsigned int *)ptr_48 + 26B] = _249;
  _264 = _37 + pretmp_130;
  _265 = *_264;
  MEM[(short unsigned int *)ptr_48 + 28B] = _265;
  ptr_272 = &MEM[(void *)ptr_48 + 32B];
  _280 = _37 + pretmp_133;
  _281 = *_280;
  MEM[(short unsigned int *)ptr_48 + 30B] = _281;
  y_25 = y_64 + 1;
  if (y_25 > _47)
    goto <bb 4>;
  else
    goto <bb 3>;

PRE hoists the index part of the address expression "base + (reg + i) * 2" out
of the first loop.  This increases register pressure and prevents GCC from
using the powerful addressing modes available on x86.

On other targets such as ARM, only the register-pressure issue may apply.

Both PRE and LIM perform the same transformation.

Reply via email to