The following code shows a performance regression from gcc-4.2 to gcc-4.3 and gcc-4.4 (20090111) on an Intel Core 2 using the x86_64 architecture:

void bench_1(float * out, float * in, float f, unsigned int n)
{
    n /= 4;
    __m128 scalar = _mm_set_ps1(f);
    do
    {
        __m128 arg = _mm_load_ps(in);
        __m128 result = _mm_add_ps(arg, scalar);
        _mm_store_ps(out, result);
        in += 4;
        out += 4;
    }
    while (--n);
}

Results, running the function 100000000 times, measured with performance counters (requires a patched kernel), compiled with -O3 -mfpmath=sse -msse:

gcc-4.2: 1946256122 cycles, 8394301290 instructions, 5005 branch misses
gcc-4.3: 2191990305 cycles, 7658465214 instructions, 3442 branch misses
gcc-4.4: 2532778908 cycles, 7462359830 instructions, 8593402 branch misses

Although the instruction count decreases, the number of cycles spent in the function increases. Also, gcc-4.4 shows a huge number of branch misses.

The generated code is:

gcc-4.2:

.globl _Z7bench_1PfS_fj
    .type _Z7bench_1PfS_fj, @function
_Z7bench_1PfS_fj:
.LFB2695:
    movaps %xmm0, %xmm2
    shrl $2, %edx
    shufps $0, %xmm2, %xmm2
    movaps %xmm2, %xmm1
    .p2align 4,,7
.L15:
    movaps (%rsi), %xmm0
    addq $16, %rsi
    addps %xmm1, %xmm0
    movaps %xmm0, (%rdi)
    addq $16, %rdi
    subl $1, %edx
    jne .L15
    rep ; ret
.LFE2695:
    .size _Z7bench_1PfS_fj, .-_Z7bench_1PfS_fj
    .align 2
    .p2align 4,,15

gcc-4.3:

.globl _Z7bench_1PfS_fj
    .type _Z7bench_1PfS_fj, @function
_Z7bench_1PfS_fj:
.LFB2563:
    movaps %xmm0, %xmm2
    shrl $2, %edx
    subl $1, %edx
    xorl %eax, %eax
    shufps $0, %xmm2, %xmm2
    mov %edx, %edx
    addq $1, %rdx
    salq $4, %rdx
    movaps %xmm2, %xmm1
    .p2align 4,,10
    .p2align 3
.L17:
    movaps (%rsi,%rax), %xmm0
    addps %xmm1, %xmm0
    movaps %xmm0, (%rdi,%rax)
    addq $16, %rax
    cmpq %rdx, %rax
    jne .L17
    rep ret
.LFE2563:
    .size _Z7bench_1PfS_fj, .-_Z7bench_1PfS_fj
    .p2align 4,,15

gcc-4.4:

.globl _Z7bench_1PfS_fj
    .type _Z7bench_1PfS_fj, @function
_Z7bench_1PfS_fj:
.LFB2489:
    .cfi_startproc
    .cfi_personality 0x3,__gxx_personality_v0
    shrl $2, %edx
    shufps $0, %xmm0, %xmm0
    subl $1, %edx
    xorl %eax, %eax
    addq $1, %rdx
    salq $4, %rdx
    .p2align 4,,10
    .p2align 3
.L17:
    movaps %xmm0, %xmm1
    addps (%rsi,%rax), %xmm1
    movaps %xmm1, (%rdi,%rax)
    addq $16, %rax
    cmpq %rdx, %rax
    jne .L17
    rep ret
    .cfi_endproc
.LFE2489:
    .size _Z7bench_1PfS_fj, .-_Z7bench_1PfS_fj
    .p2align 4,,15

--
Summary: [4.4 regression] performance regression of sse code from 4.2/4.3
Product: gcc
Version: 4.4.0
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: target
AssignedTo: unassigned at gcc dot gnu dot org
ReportedBy: tim at klingt dot org

http://gcc.gnu.org/bugzilla/show_bug.cgi?id=38824