/* Testcase for the code-generation report that follows the function body.
 *
 * bar and foo are external functions that are only declared here; s is a
 * file-local array of 10 ints.
 *
 * foobar (i1 .. i6):
 *   - calls bar () twice: the first result is stored in i7, the second
 *     result is discarded;
 *   - stores s[i1] + s[i2] + s[i3] + s[i4] + s[i5] + s[i6] + s[i7] into
 *     each of the 100 elements of the local array a.  The summed expression
 *     does not depend on the loop counter i, so every element receives the
 *     same value;
 *   - passes the address of a[0] to foo () and returns nothing.
 *
 * None of i1 .. i7 is range-checked against the 10 elements of s.
 *
 * NOTE(review): the text after the closing brace compares the assembly
 * generated for exactly this statement sequence, so the code is kept
 * unchanged on purpose.
 */
int bar (void); void foo (int *); static int s[10]; void foobar (int i1, int i2, int i3, int i4, int i5, int i6) { int a[100]; int i, i7;
i7 = bar (); bar (); for (i = 0; i < 100; i++) a[i] = s[i1] + s[i2] + s[i3] + s[i4] + s[i5] + s[i6] + s[i7]; foo (&a[0]); return; } If you compare mainline to dataflow branch at -O2 you can see --- t.i.trunk 2007-02-21 11:31:09.663252586 +0100 +++ t.i.df 2007-02-21 11:31:10.548064364 +0100 @@ -37,7 +37,6 @@ movl s(,%rbx,4), %edx addl s(,%rcx,4), %edx movslq %r12d,%r12 - leaq 16(%rsp), %rdi addl s(,%r13,4), %edx addl s(,%r14,4), %edx addl s(,%r15,4), %edx @@ -46,10 +45,11 @@ addl s(,%r12,4), %edx .p2align 4,,7 .L2: - movl %edx, (%rdi,%rax,4) + movl %edx, 16(%rsp,%rax,4) addq $1, %rax cmpq $100, %rax jne .L2 + leaq 16(%rsp), %rdi call foo addq $424, %rsp popq %rbx that is, we are choosing a more expensive addressing mode in the loop not noticing that 16(%rsp) can be (G)CSEd. This makes the above loop run 33% slower on x86_64. -- Summary: [dataflow] Bad interaction with addressing mode selection and regalloc Product: gcc Version: 4.3.0 Status: UNCONFIRMED Keywords: missed-optimization Severity: normal Priority: P3 Component: middle-end AssignedTo: unassigned at gcc dot gnu dot org ReportedBy: rguenth at gcc dot gnu dot org GCC target triplet: x86_64-*-* http://gcc.gnu.org/bugzilla/show_bug.cgi?id=30907