------- Comment #11 from michaelni at gmx dot at  2008-03-24 00:08 -------
Subject: Re:  Performance degradation when
        building code that uses MMX intrinsics with gcc-4.0.0

On Sun, Mar 23, 2008 at 10:46:41AM -0000, ubizjak at gmail dot com wrote:
> 
> 
> ------- Comment #10 from ubizjak at gmail dot com  2008-03-23 10:46 -------
> (In reply to comment #9)
> 
> > So on my duron 4.3 seems to beat 4.4 as i expected from the generated asm.
> 
> Can you tell from code dumps of 4.4 vs 4.3, where you think that 4.4 code is
> worse than 4.3 for Duron? For Core2, 4.4 avoids store forwarding stall, but
> I'm not sure why Duron prefers moves via memory instead of keeping values in
> %mm registers.

--- freaky_mmx_code-4.3.s       2008-03-24 00:48:11.000000000 +0100
+++ freaky_mmx_code-4.4.s       2008-03-24 00:48:03.000000000 +0100
...
 .L24:
        movl    -36(%ebp), %eax
        testl   %ebx, %ebx
        movl    (%edi,%esi,4), %edx
        movl    (%eax,%esi,4), %ecx
@@ -182,113 +183,102 @@
        xorl    %eax, %eax
        movq    -24(%ebp), %mm2
        .p2align 4,,7
        .p2align 3

 .L23:
        movq    (%ecx,%eax,2), %mm0
        psubw   (%edx,%eax,2), %mm0
        addl    $4, %eax
        cmpl    %eax, %ebx
        movq    %mm0, %mm1
-       psraw   $15, %mm0
-       pxor    %mm0, %mm1
-       psubw   %mm0, %mm1
-       movq    %mm1, %mm0
-       punpcklwd       %mm1, %mm1
-       punpckhwd       %mm3, %mm0
-       psrad   $16, %mm1
-       paddd   %mm0, %mm1
-       paddd   %mm1, %mm2
+       psraw   $15, %mm1
+       pxor    %mm1, %mm0
+       psubw   %mm1, %mm0
+       movq    %mm0, %mm1
+       punpcklwd       %mm0, %mm0
+       punpckhwd       %mm3, %mm1
+       psrad   $16, %mm0
+       paddd   %mm1, %mm0
+       paddd   %mm0, %mm2
        movq    %mm2, -24(%ebp)
        jg      .L23
 .L22:
        addl    $1, %esi
-       cmpl    %esi, -40(%ebp)
-       jg      .L24
+       cmpl    -40(%ebp), %esi
+       jl      .L24

...
-       .ident  "GCC: (Debian 4.3.0-1) 4.3.1 20080309 (prerelease)"
+       .ident  "GCC: (GNU) 4.4.0 20080321 (experimental)"
------------------
What I _think_ makes 4.4 slower on the Duron is that
psraw   $15, %mm1
reads a register that was written by the immediately preceding instruction
(movq %mm0, %mm1), while 4.3 chose the other register (%mm0), which contains
the same value. 4.4 simply has a longer dependency chain than 4.3.

PS: both compiled with -mmmx -O2 -S
PS2: 4.3 is from debian, 4.4 is from gcc svn

[...]


-- 


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=21395

Reply via email to