------- Additional Comments From steven at gcc dot gnu dot org 2005-02-09 23:35 -------
The entire diff of the .optimized dumps and .s output for twolf on AMD64 is
really small. In fact, the asm output differs for only one file:

 config1.c.t65.optimized   | 120 ++++++++++++++++++++++++++++++----------------
 configure.c.t65.optimized | 78 +++++++++++++++++++----------
 outpins.c.t65.optimized   | 6 +-
 outpins.s                 | 36 ++++++-------
 qsorte.c.t65.optimized    | 3 -
 qsortg.c.t65.optimized    | 3 -
 qsortgdx.c.t65.optimized  | 3 -
 qsortx.c.t65.optimized    | 3 -
 readcell.c.t65.optimized  | 3 -
 readseg.c.t65.optimized   | 6 +-
 ucgxp.c.t65.optimized     | 3 -
 uloop.c.t65.optimized     | 6 +-
 12 files changed, 174 insertions(+), 96 deletions(-)

The file with the assembler difference is outpins.c. The relevant diff is
below. There is nothing in the diff that explains the ~4% slowdown I see in my
SPEC benchmarks (3 runs, so the slowdown is consistent). The same instructions
are there, just ordered differently and using different registers. So I'm not
sure how to proceed...

diff -u base/outpins.c.t65.optimized hacked/outpins.c.t65.optimized
--- base/outpins.c.t65.optimized 2005-02-10 00:19:20.950581229 +0100
+++ patched/outpins.c.t65.optimized 2005-02-10 00:16:19.436444879 +0100
@@ -99,8 +99,9 @@
   pairArray.39 = pairArray;
   carray.40 = carray;
   D.3698 = *((struct cellbox * *) ((long unsigned int) *(*((int * *) D.3712 + pairArray.39 - 8B) + 4B) * 8) + carray.40);
+  end.81 = D.3698->cxcenter + (int) D.3698->tileptr->left;
   temp.59 = *(carray.40 + (struct cellbox * *) ((long unsigned int) *(*(pairArray.39 + (int * *) D.3712) + 4B) * 8));
-  end = MAX_EXPR <D.3698->cxcenter + (int) D.3698->tileptr->left, temp.59->cxcenter + (int) temp.59->tileptr->left>;
+  end = MAX_EXPR <end.81, temp.59->cxcenter + (int) temp.59->tileptr->left>;
 
 <L4>:;
   return end;
@@ -228,9 +229,10 @@
   D.3668 = *((int * *) D.3664 + pairArray.36 - 8B);
   carray.37 = carray;
   D.3646 = *((struct cellbox * *) ((long unsigned int) *(D.3668 + (int *) ((long unsigned int) *D.3668 * 4)) * 8) + carray.37);
+  end.121 = D.3646->cxcenter + (int) D.3646->tileptr->right;
   D.3676 = *(pairArray.36 + (int * *) D.3664);
   temp.99 = *(carray.37 + (struct cellbox * *) ((long unsigned int) *(D.3676 + (int *) ((long unsigned int) *D.3676 * 4)) * 8));
-  end = MIN_EXPR <D.3646->cxcenter + (int) D.3646->tileptr->right, temp.99->cxcenter + (int) temp.99->tileptr->right>;
+  end = MIN_EXPR <end.121, temp.99->cxcenter + (int) temp.99->tileptr->right>;
 
 <L4>:;
   return end;
diff -u base/outpins.s hacked/outpins.s
--- base/outpins.s 2005-02-10 00:19:21.064543028 +0100
+++ patched/outpins.s 2005-02-10 00:16:19.551406289 +0100
@@ -18,18 +18,18 @@
 	movq	-8(%rdx,%rcx), %rax
 	movslq	4(%rax),%rax
 	movq	(%rsi,%rax,8), %rdi
+	movq	40(%rdi), %rax
+	movswl	(%rax),%r8d
 	movq	(%rcx,%rdx), %rax
+	addl	12(%rdi), %r8d
 	movslq	4(%rax),%rax
 	movq	(%rsi,%rax,8), %rdx
-	movq	40(%rdi), %rax
-	movswl	(%rax),%ecx
 	movq	40(%rdx), %rax
-	addl	12(%rdi), %ecx
 	movswl	(%rax),%eax
 	addl	12(%rdx), %eax
-	cmpl	%eax, %ecx
-	cmovl	%eax, %ecx
-	movl	%ecx, %eax
+	cmpl	%eax, %r8d
+	cmovl	%eax, %r8d
+	movl	%r8d, %eax
 	ret
 	.p2align 4,,7
 .L11:
@@ -40,9 +40,9 @@
 	movq	carray(%rip), %rax
 	movq	(%rax,%rdx,8), %rdx
 	movq	40(%rdx), %rax
-	movswl	(%rax),%ecx
-	addl	12(%rdx), %ecx
-	movl	%ecx, %eax
+	movswl	(%rax),%r8d
+	addl	12(%rdx), %r8d
+	movl	%r8d, %eax
 	ret
 	.p2align 4,,7
 .L12:
@@ -72,18 +72,18 @@
 	movslq	(%rcx),%rax
 	movslq	(%rcx,%rax,4),%rax
 	movq	(%rdi,%rax,8), %rcx
+	movq	40(%rcx), %rax
+	movswl	2(%rax),%r8d
 	movslq	(%rdx),%rax
+	addl	12(%rcx), %r8d
 	movslq	(%rdx,%rax,4),%rax
 	movq	(%rdi,%rax,8), %rdx
-	movq	40(%rcx), %rax
-	movswl	2(%rax),%esi
 	movq	40(%rdx), %rax
-	addl	12(%rcx), %esi
 	movswl	2(%rax),%eax
 	addl	12(%rdx), %eax
-	cmpl	%eax, %esi
-	cmovg	%eax, %esi
-	movl	%esi, %eax
+	cmpl	%eax, %r8d
+	cmovg	%eax, %r8d
+	movl	%r8d, %eax
 	ret
 	.p2align 4,,7
 .L22:
@@ -95,9 +95,9 @@
 	movq	carray(%rip), %rax
 	movq	(%rax,%rdx,8), %rdx
 	movq	40(%rdx), %rax
-	movswl	2(%rax),%esi
-	addl	12(%rdx), %esi
-	movl	%esi, %eax
+	movswl	2(%rax),%r8d
+	addl	12(%rdx), %r8d
+	movl	%r8d, %eax
 	ret
 	.p2align 4,,7
 .L23:

-- http://gcc.gnu.org/bugzilla/show_bug.cgi?id=17549