------- Comment #9 from rguenth at gcc dot gnu dot org 2007-10-28 16:38 ------- The main difference I see is that 4.2 avoids re-use of %rax as index register:
.L34:
        movq    %r11, %rdi
        addq    8(%r10), %rdi
        movq    8(%r10), %rsi
        movq    8(%r10), %rdx
        movq    40(%r10), %rax
        leaq    4(%r11), %rbx
        addq    %rdi, %rsi
        leaq    4(%rdi), %r9
        movq    %rdi, -8(%r10)
        addq    %rsi, %rdx
        leaq    4(%rsi), %r8
        movq    %rsi, -24(%r10)
        leaq    4(%rdx), %rcx
        movq    %r9, -16(%r10)
        movq    %rdx, -40(%r10)
        movq    %r8, -32(%r10)
        addq    $7, %rax
        movq    %rcx, -48(%r10)
        movsd   (%rax,%rcx,2), %xmm12
        leaq    (%rbx,%rbx), %rcx
        movsd   (%rax,%rdx,2), %xmm3
        leaq    (%rax,%r11,2), %rdx
        addq    $8, %r11
        movsd   (%rax,%r8,2), %xmm14
        cmpq    %r11, %r13
        movsd   (%rax,%rsi,2), %xmm13
        movsd   (%rax,%r9,2), %xmm11
        movsd   (%rax,%rdi,2), %xmm10
        movsd   (%rax,%rcx), %xmm8
        ...

while 4.3 always re-loads %rax as index:

.L26:
        leaq    4(%rdi), %rdx
        movq    %rdi, %rax
        movq    %rdx, -8(%rsp)
        addq    (%r8), %rax
        movq    %rax, (%r9)
        addq    $4, %rax
        movq    %rax, (%rbp)
        movq    (%r9), %rax
        addq    (%r8), %rax
        movq    %rax, (%r10)
        addq    $4, %rax
        movq    %rax, (%rbx)
        movq    (%r10), %rax
        addq    (%r8), %rax
        movq    %rax, (%r11)
        movq    -64(%rsp), %rcx
        addq    $4, %rax
        movq    %rax, (%rcx)
        movq    (%rsi), %rdx
        movq    -8(%rsp), %rcx
        addq    $7, %rdx
        movsd   (%rdx,%rax,2), %xmm13
        movq    (%r11), %rax
        addq    %rcx, %rcx
        movsd   (%rdx,%rcx), %xmm8
        movsd   (%rdx,%rax,2), %xmm3
        movq    (%rbx), %rax
        movsd   (%rdx,%rax,2), %xmm14
        movq    (%r10), %rax
        movsd   (%rdx,%rax,2), %xmm12
        movq    (%rbp), %rax
        movsd   (%rdx,%rax,2), %xmm11
        movq    (%r9), %rax
        movsd   (%rdx,%rax,2), %xmm10
        movq    (%r12), %rax
        leaq    (%rdx,%rdi,2), %rdx
        ...

The root cause still needs to be investigated.

--
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=33928