------- Comment #5 from rguenth at gcc dot gnu dot org 2006-10-24 13:28 -------
With more registers (x86_64) the stack moves are gone, but: (!)
[EMAIL PROTECTED]:/abuild/rguenther/trunk-g/gcc> ./xgcc -B. -O2 -o t t.c -mfpmath=387
[EMAIL PROTECTED]:/abuild/rguenther/trunk-g/gcc> /usr/bin/time ./t
Start? Stop! Result = 0.000000, 0.000000, 1.000000
5.31user 0.00system 0:05.32elapsed 99%CPU (0avgtext+0avgdata 0maxresident)k
0inputs+0outputs (0major+135minor)pagefaults 0swaps
[EMAIL PROTECTED]:/abuild/rguenther/trunk-g/gcc> ./xgcc -B. -O2 -o t t.c
[EMAIL PROTECTED]:/abuild/rguenther/trunk-g/gcc> /usr/bin/time ./t
Start? Stop! Result = 0.000000, 0.000000, 1.000000
9.96user 0.05system 0:10.06elapsed 99%CPU (0avgtext+0avgdata 0maxresident)k
0inputs+0outputs (0major+135minor)pagefaults 0swaps

That is, 387 math is almost twice as fast as SSE math on x86_64!  The inner loop is

.L7:
        movaps  %xmm3, %xmm6
        movaps  %xmm1, %xmm5
        movaps  %xmm0, %xmm4
.L2:
        movaps  %xmm2, %xmm3
        mulss   %xmm6, %xmm2
        movaps  %xmm7, %xmm0
        addl    $1, %eax
        mulss   %xmm4, %xmm3
        movaps  %xmm7, %xmm1
        mulss   %xmm5, %xmm0
        cmpl    $1000000000, %eax
        mulss   %xmm6, %xmm1
        movaps  %xmm4, %xmm7
        subss   %xmm0, %xmm3
        movaps  %xmm8, %xmm0
        mulss   %xmm4, %xmm0
        subss   %xmm0, %xmm1
        movaps  %xmm8, %xmm0
        movaps  %xmm6, %xmm8
        mulss   %xmm5, %xmm0
        subss   %xmm2, %xmm0
        movaps  %xmm5, %xmm2
        jne     .L7

vs.

.L7:
        fxch    %st(3)
        fxch    %st(2)
.L2:
        fld     %st(2)
        addl    $1, %eax
        cmpl    $1000000000, %eax
        fmul    %st(1), %st
        flds    76(%rsp)
        fmul    %st(5), %st
        fsubrp  %st, %st(1)
        flds    76(%rsp)
        fmul    %st(3), %st
        flds    72(%rsp)
        fmul    %st(3), %st
        fsubrp  %st, %st(1)
        flds    72(%rsp)
        fmul    %st(6), %st
        fxch    %st(5)
        fmul    %st(4), %st
        fsubrp  %st, %st(5)
        fxch    %st(2)
        fstps   76(%rsp)
        fxch    %st(2)
        fstps   72(%rsp)
        jne     .L7

(testing done on an AMD Athlon, family 15, model 35, stepping 2)

--
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=19780
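
The testcase t.c itself is not quoted in this comment, so the following is only a guessed sketch of the kind of kernel that would produce the loops above: each iteration does three single-precision product differences (mulss/subss in the SSE version, fmul/fsubrp in the x87 version), i.e. a 3D cross product whose result feeds the next iteration. The vec3 type, the cross function, and the operand rotation are assumptions for illustration, not the PR's actual code; the iteration count here is reduced from the 1000000000 visible in the cmpl so the sketch runs quickly.

#include <stdio.h>

/* Hypothetical stand-in for t.c: iterate a float cross product,
   rotating the operands each time (mirroring the movaps/fxch
   register shuffles seen in both generated loops). */
typedef struct { float x, y, z; } vec3;

static vec3 cross(vec3 a, vec3 b)
{
    vec3 c;
    c.x = a.y * b.z - a.z * b.y;   /* one mulss/mulss/subss group */
    c.y = a.z * b.x - a.x * b.z;
    c.z = a.x * b.y - a.y * b.x;
    return c;
}

int main(void)
{
    vec3 a = { 1.0f, 0.0f, 0.0f };
    vec3 b = { 0.0f, 1.0f, 0.0f };

    puts("Start?");
    for (long i = 0; i < 1000000; i++) {   /* PR loop runs 1000000000 times */
        vec3 c = cross(a, b);
        a = b;                             /* rotate operands into place */
        b = c;
    }
    puts("Stop!");
    printf("Result = %f, %f, %f\n", b.x, b.y, b.z);
    return 0;
}

Built once with -mfpmath=387 and once with the x86_64 default (SSE), a loop of this shape exercises exactly the register-pressure difference the two listings show: the SSE version spends many movaps on copies, while the x87 version keeps values on the register stack plus two spill slots.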