[4.3/4.4/4.5 Regression] 30% performance slowdown in floating-point code caused by r118475

lucier at math dot purdue dot edu Wed, 06 May 2009 22:28:00 -0700


------- Comment #66 from lucier at math dot purdue dot edu  2009-05-07 05:27 
-------
Adding -frename-registers gives a significant speedup (sometimes as fast as
4.1.2 on this shared machine, i.e., it sometimes hits 108 ms instead of
132-140 ms). The command line with -fforward-propagate -fno-move-loop-invariants
-frename-registers is


/pkgs/gcc-mainline/bin/gcc -save-temps -I../include -I. -Wall -W -Wno-unused
-O1 -fno-math-errno -fschedule-insns2 -fno-trapping-math -fno-strict-aliasing
-fwrapv -fomit-frame-pointer -fPIC -fno-common -mieee-fp -fforward-propagate
-fno-move-loop-invariants -frename-registers -DHAVE_CONFIG_H -D___PRIMAL
-D___LIBRARY -D___GAMBCDIR="\"/usr/local/Gambit-C/v4.1.2\""
-D___SYS_TYPE_CPU="\"x86_64\"" -D___SYS_TYPE_VENDOR="\"unknown\""
-D___SYS_TYPE_OS="\"linux-gnu\"" -c _num.c

and the loop is

.L2752:
        movq    %rcx, %r12
        addq    8(%rax), %r12
        leaq    4(%rcx), %rdi
        movq    %r12, -8(%rax)
        leaq    4(%r12), %r8
        addq    8(%rax), %r12
        movq    %r8, -16(%rax)
        movq    -8(%rax), %r8
        movq    -16(%rax), %rdx
        movq    %r12, -24(%rax)
        leaq    4(%r12), %rbx
        addq    8(%rax), %r12
        movq    -24(%rax), %r9
        movq    %rbx, -32(%rax)
        movq    24(%rax), %rbx
        movq    -32(%rax), %r10
        leaq    4(%r12), %r11
        movq    %r12, -40(%rax)
        movq    40(%rax), %r12
        movq    -40(%rax), %r14
        movq    %r11, -48(%rax)
        movsd   15(%rbx), %xmm1
        movsd   7(%rbx), %xmm2
        movsd   7(%r12,%r11,2), %xmm9
        movapd  %xmm1, %xmm3
        movsd   7(%r12,%r14,2), %xmm11
        leaq    7(%r12,%rcx,2), %r11
        movapd  %xmm2, %xmm10
        leaq    (%rdi,%rdi), %r14
        mulsd   %xmm11, %xmm3
        movapd  %xmm2, %xmm12
        mulsd   %xmm9, %xmm10
        addq    $8, %rcx
        mulsd   %xmm1, %xmm9
        cmpq    %rcx, %r13
        mulsd   %xmm2, %xmm11
        movsd   7(%r12,%r10,2), %xmm5
        movsd   7(%r12,%r9,2), %xmm7
        addsd   %xmm10, %xmm3
        movsd   7(%r12,%r8,2), %xmm6
        subsd   %xmm9, %xmm11
        mulsd   %xmm7, %xmm2
        movapd  %xmm1, %xmm9
        mulsd   %xmm5, %xmm1
        movapd  %xmm6, %xmm13
        movsd   7(%r12,%rdx,2), %xmm14
        mulsd   %xmm5, %xmm12
        mulsd   %xmm7, %xmm9
        subsd   %xmm11, %xmm13
        movsd   31(%rbx), %xmm0
        addsd   %xmm6, %xmm11
        movsd   .LC5(%rip), %xmm6
        subsd   %xmm1, %xmm2
        movsd   (%r11), %xmm4
        movapd  %xmm14, %xmm10
        xorpd   %xmm0, %xmm6
        addsd   %xmm12, %xmm9
        movsd   7(%r14,%r12), %xmm8
        subsd   %xmm3, %xmm10
        movapd  %xmm4, %xmm7
        addsd   %xmm14, %xmm3
        movsd   23(%rbx), %xmm15
        subsd   %xmm2, %xmm7
        movapd  %xmm8, %xmm5
        addsd   %xmm4, %xmm2
        movapd  %xmm6, %xmm4
        subsd   %xmm9, %xmm5
        movapd  %xmm15, %xmm14
        addsd   %xmm8, %xmm9
        mulsd   %xmm10, %xmm4
        movapd  %xmm15, %xmm8
        mulsd   %xmm15, %xmm10
        movapd  %xmm0, %xmm12
        mulsd   %xmm11, %xmm15
        mulsd   %xmm3, %xmm0
        movapd  %xmm7, %xmm1
        mulsd   %xmm13, %xmm6
        mulsd   %xmm3, %xmm8
        movapd  %xmm9, %xmm3
        mulsd   %xmm11, %xmm12
        subsd   %xmm0, %xmm15
        mulsd   %xmm13, %xmm14
        subsd   %xmm10, %xmm6
        movapd  %xmm2, %xmm10
        movapd  %xmm5, %xmm0
        addsd   %xmm12, %xmm8
        addsd   %xmm15, %xmm10
        subsd   %xmm15, %xmm2
        addsd   %xmm14, %xmm4
        addsd   %xmm8, %xmm3
        movsd   %xmm10, (%r11)
        movq    40(%rax), %r10
        subsd   %xmm8, %xmm9
        addsd   %xmm6, %xmm1
        addsd   %xmm4, %xmm0
        movsd   %xmm3, 7(%r14,%r10)
        movq    -8(%rax), %r9
        movq    40(%rax), %rdx
        subsd   %xmm6, %xmm7
        subsd   %xmm4, %xmm5
        movsd   %xmm2, 7(%rdx,%r9,2)
        movq    -16(%rax), %r8
        movq    40(%rax), %r12
        movsd   %xmm9, 7(%r12,%r8,2)
        movq    -24(%rax), %rbx
        movq    40(%rax), %r11
        movsd   %xmm1, 7(%r11,%rbx,2)
        movq    -32(%rax), %r14
        movq    40(%rax), %r10
        movsd   %xmm0, 7(%r10,%r14,2)
        movq    -40(%rax), %r9
        movq    40(%rax), %rdx
        movsd   %xmm7, 7(%rdx,%r9,2)
        movq    -48(%rax), %r8
        movq    40(%rax), %r12
        movsd   %xmm5, 7(%r12,%r8,2)
        jg      .L2752

When I instead add -fforward-propagate -fno-move-loop-invariants -fweb (rather
than -fforward-propagate -fno-move-loop-invariants -frename-registers), so that
the compile line is

/pkgs/gcc-mainline/bin/gcc -save-temps -I../include -I. -Wall -W -Wno-unused
-O1 -fno-math-errno -fschedule-insns2 -fno-trapping-math -fno-strict-aliasing
-fwrapv -fomit-frame-pointer -fPIC -fno-common -mieee-fp -fforward-propagate
-fno-move-loop-invariants -fweb -DHAVE_CONFIG_H -D___PRIMAL -D___LIBRARY
-D___GAMBCDIR="\"/usr/local/Gambit-C/v4.1.2\"" -D___SYS_TYPE_CPU="\"x86_64\""
-D___SYS_TYPE_VENDOR="\"unknown\"" -D___SYS_TYPE_OS="\"linux-gnu\"" -c _num.c

the time is not so good (consistently 128ms) and the loop is

.L2752:
        movq    %rcx, %rdx
        addq    8(%rax), %rdx
        leaq    4(%rcx), %rdi
        movq    %rdx, -8(%rax)
        leaq    4(%rdx), %rbx
        addq    8(%rax), %rdx
        movq    %rbx, -16(%rax)
        movq    %rdx, -24(%rax)
        leaq    4(%rdx), %rbx
        addq    8(%rax), %rdx
        movq    %rbx, -32(%rax)
        movq    %rdx, -40(%rax)
        leaq    4(%rdx), %rbx
        movq    40(%rax), %rdx
        movq    %rbx, -48(%rax)
        movsd   7(%rdx,%rbx,2), %xmm9
        movq    -40(%rax), %rbx
        leaq    7(%rdx,%rcx,2), %r8
        addq    $8, %rcx
        movsd   (%r8), %xmm4
        cmpq    %rcx, %r13
        movsd   7(%rdx,%rbx,2), %xmm11
        movq    -32(%rax), %rbx
        movsd   7(%rdx,%rbx,2), %xmm5
        movq    -24(%rax), %rbx
        movsd   7(%rdx,%rbx,2), %xmm7
        movq    -16(%rax), %rbx
        movsd   7(%rdx,%rbx,2), %xmm14
        movq    -8(%rax), %rbx
        movsd   7(%rdx,%rbx,2), %xmm6
        leaq    (%rdi,%rdi), %rbx
        movsd   7(%rbx,%rdx), %xmm8
        movq    24(%rax), %rdx
        movapd  %xmm6, %xmm13
        movsd   15(%rdx), %xmm1
        movsd   7(%rdx), %xmm2
        movapd  %xmm1, %xmm10
        movsd   31(%rdx), %xmm3
        movapd  %xmm2, %xmm12
        mulsd   %xmm11, %xmm10
        mulsd   %xmm9, %xmm12
        mulsd   %xmm2, %xmm11
        mulsd   %xmm1, %xmm9
        movsd   23(%rdx), %xmm0
        addsd   %xmm12, %xmm10
        movapd  %xmm2, %xmm12
        mulsd   %xmm7, %xmm2
        subsd   %xmm9, %xmm11
        movapd  %xmm1, %xmm9
        mulsd   %xmm5, %xmm12
        mulsd   %xmm5, %xmm1
        movapd  %xmm8, %xmm5
        mulsd   %xmm7, %xmm9
        movapd  %xmm4, %xmm7
        subsd   %xmm11, %xmm13
        addsd   %xmm6, %xmm11
        movsd   .LC5(%rip), %xmm6
        subsd   %xmm1, %xmm2
        movapd  %xmm0, %xmm1
        addsd   %xmm12, %xmm9
        movapd  %xmm14, %xmm12
        xorpd   %xmm3, %xmm6
        subsd   %xmm10, %xmm12
        mulsd   %xmm13, %xmm1
        subsd   %xmm2, %xmm7
        addsd   %xmm4, %xmm2
        movapd  %xmm6, %xmm4
        addsd   %xmm14, %xmm10
        mulsd   %xmm13, %xmm6
        mulsd   %xmm12, %xmm4
        subsd   %xmm9, %xmm5
        mulsd   %xmm0, %xmm12
        addsd   %xmm8, %xmm9
        movapd  %xmm0, %xmm8
        mulsd   %xmm11, %xmm0
        addsd   %xmm1, %xmm4
        movapd  %xmm3, %xmm1
        mulsd   %xmm10, %xmm3
        subsd   %xmm12, %xmm6
        mulsd   %xmm11, %xmm1
        mulsd   %xmm10, %xmm8
        subsd   %xmm3, %xmm0
        addsd   %xmm1, %xmm8
        movapd  %xmm2, %xmm1
        addsd   %xmm0, %xmm1
        subsd   %xmm0, %xmm2
        movapd  %xmm7, %xmm0
        subsd   %xmm6, %xmm7
        addsd   %xmm6, %xmm0
        movsd   %xmm1, (%r8)
        movapd  %xmm9, %xmm1
        movq    40(%rax), %rdx
        subsd   %xmm8, %xmm9
        addsd   %xmm8, %xmm1
        movsd   %xmm1, 7(%rbx,%rdx)
        movq    -8(%rax), %rbx
        movq    40(%rax), %rdx
        movsd   %xmm2, 7(%rdx,%rbx,2)
        movq    -16(%rax), %rbx
        movq    40(%rax), %rdx
        movsd   %xmm9, 7(%rdx,%rbx,2)
        movq    -24(%rax), %rbx
        movq    40(%rax), %rdx
        movsd   %xmm0, 7(%rdx,%rbx,2)
        movapd  %xmm5, %xmm0
        movq    -32(%rax), %rbx
        movq    40(%rax), %rdx
        subsd   %xmm4, %xmm5
        addsd   %xmm4, %xmm0
        movsd   %xmm0, 7(%rdx,%rbx,2)
        movq    -40(%rax), %rbx
        movq    40(%rax), %rdx
        movsd   %xmm7, 7(%rdx,%rbx,2)
        movq    -48(%rax), %rbx
        movq    40(%rax), %rdx
        movsd   %xmm5, 7(%rdx,%rbx,2)
        jg      .L2752

And I still count 117 instructions in the loop in comment 64 (whether that
matters, I don't know).


-- 


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=33928

[Bug rtl-optimization/33928] [4.3/4.4/4.5 Regression] 30% performance slowdown in floating-point code caused by r118475

Reply via email to