Bug#512050: gcc-4.3: pessimizes function without SSE intrinsics

brian m. carlson Sat, 17 Jan 2009 09:15:24 -0800

On Fri, Jan 16, 2009 at 08:10:15PM +0100, Martin Michlmayr wrote:

* brian m. carlson <[email protected]> [2009-01-16 18:38]:

Obviously, since the two functions do the exact same thing, they should
be optimized to be identical.  Instead, mul is pessimized.


Can you check if this happens with gcc-4.3 and trunk from SVN, and if
so, file the bug upstream?


It does happen with gcc-4.3 (hence, I filed the bug there) as well as
gcc-snapshot 20090107-1.  I really would prefer not to build from SVN if
I don't have to.

I believe that it's the maintainer's job to file the bug upstream, and
upstream has not won any points with me for neglecting a bug that I
reported with a trivial patch.  Thus, I am hesitant to forward the bug
myself.

I have been informed that Apple's GCC does better on this[0]; I will see
if I can get my friend to provide a .i and .s file from that version.  I
also just noted that gcc-4.1 and gcc-4.2 produce considerably better code.
I've attached the intrinsics.s output from both of those versions; each
uses 8 movss and 4 mulss for mul.  Nevertheless, neither converts the code
into three SSE instructions.

[0] I believe Apple's GCC is based on an older version of FSF gcc, which
is probably why it does better; the results are likely identical between
the two.

--
brian m. carlson / brian with sandals: Houston, Texas, US
+1 713 440 7475 | http://crustytoothpaste.ath.cx/~bmc | My opinion only
troff on top of XML: http://crustytoothpaste.ath.cx/~bmc/code/thwack
OpenPGP: RSA v4 4096b 88AC E9B2 9196 305B A994 7552 F1BA 225C 0223 B187

        # Compiler-generated listing, GAS AT&T syntax (op src, dst), x86-64 ELF.
        .file   "intrinsics.c"
        .text
        .p2align 4,,15                  # align to 16 bytes, skipping at most 15 bytes of padding
#-----------------------------------------------------------------------
# mul -- multiply two 4-element float arrays element by element.
# In:    rdi, rsi = addresses of the two source arrays
#        rdx      = address of the destination array
#        (presumably the first three SysV AMD64 pointer arguments;
#         the C prototype is not part of this listing -- confirm)
# Out:   16 bytes written at (%rdx); no value is placed in a return register
# Clobb: xmm0
# Note:  each element is loaded, multiplied and stored before the next
#        one is loaded, so the store order is visible if the destination
#        overlaps a source.  Only scalar (movss/mulss) instructions are
#        used: 8 movss and 4 mulss in total.
#-----------------------------------------------------------------------
.globl mul
        .type   mul, @function
mul:
.LFB491:
        movss   (%rdi), %xmm0           # element 0: load, multiply, store
        mulss   (%rsi), %xmm0
        movss   %xmm0, (%rdx)
        movss   4(%rdi), %xmm0          # element 1 (byte offset 4)
        mulss   4(%rsi), %xmm0
        movss   %xmm0, 4(%rdx)
        movss   8(%rdi), %xmm0          # element 2 (byte offset 8)
        mulss   8(%rsi), %xmm0
        movss   %xmm0, 8(%rdx)
        movss   12(%rdi), %xmm0         # element 3 (byte offset 12)
        mulss   12(%rsi), %xmm0
        movss   %xmm0, 12(%rdx)
        ret
.LFE491:
        .size   mul, .-mul
        # mul2 -- packed variant: multiplies four floats with one mulps.
        # In:    rdi, rsi = addresses of the two 16-byte sources
        #        rdx      = address of the 16-byte destination
        #        (presumably the first three SysV AMD64 pointer arguments -- confirm)
        # Clobb: xmm0
        # Note:  movaps and the memory operand of mulps require 16-byte
        #        aligned addresses; an unaligned pointer faults.
        .p2align 4,,15
.globl mul2
        .type   mul2, @function
mul2:
.LFB492:
        movaps  (%rdi), %xmm0           # load all four floats from the first source
        mulps   (%rsi), %xmm0           # four single-precision multiplies at once
        movaps  %xmm0, (%rdx)           # store all four products
        ret
.LFE492:
        .size   mul2, .-mul2
        # Read-only data used by main.
        # .rodata.str1.1: mergeable ("M") string ("S") section, 1-byte entries.
        .section        .rodata.str1.1,"aMS",@progbits,1
.LC8:
        .string "%f %f %f %f\n"
        # .rodata.cst8: mergeable 8-byte constants.  Each pair of .long values
        # below is the low word followed by the high word of one 8-byte
        # constant; main loads them with movsd, i.e. as IEEE-754 doubles.
        # The decimal values in the comments are approximate decodings.
        .section        .rodata.cst8,"aM",@progbits,8
        .align 8
.LC13:
        .long   1610612736              # low word  0x60000000
        .long   -1071225242             # high word 0xC0266666  -> about -11.2
        .align 8
.LC14:
        .long   3758096384              # low word  0xE0000000
        .long   1075212451              # high word 0x401670A3  -> about 5.61
        .align 8
.LC15:
        .long   536870912               # low word  0x20000000
        .long   1075983155              # high word 0x40223333  -> about 9.1
        .align 8
.LC16:
        .long   3221225472              # low word  0xC0000000
        .long   -1075125945             # high word 0xBFEAE147  -> about -0.84
        # main -- prints four precomputed doubles with printf and returns 0.
        # No call to mul or mul2 appears here: the values passed to printf
        # come straight from the constant pool.
        # Stack: 56 bytes are reserved; together with the 8-byte return
        #        address that is 64 bytes, so rsp is 16-byte aligned at the
        #        call (given the SysV rule that rsp % 16 == 8 on entry).
        # Stack slots written: 0..12(%rsp), 16..28(%rsp), 32..44(%rsp).
        #        None of them is read back inside main.
        .text
        .p2align 4,,15
.globl main
        .type   main, @function
main:
.LFB493:
        subq    $56, %rsp
.LCFI0:
        movl    $.LC8, %edi             # printf format string (32-bit absolute address, non-PIC)
        movl    $4, %eax                # al = number of vector registers used by the variadic call
        movsd   .LC13(%rip), %xmm3      # 4th double argument
        movl    $0x3f99999a, 32(%rsp)   # float bit pattern, about 1.2
        movsd   .LC14(%rip), %xmm2      # 3rd double argument
        movl    $0x40600000, 36(%rsp)   # 3.5
        movsd   .LC15(%rip), %xmm1      # 2nd double argument
        movl    $0x3fd9999a, 40(%rsp)   # about 1.7
        movsd   .LC16(%rip), %xmm0      # 1st double argument
        movl    $0x40333333, 44(%rsp)   # about 2.8
        # NOTE(review): 16..28(%rsp) looks like the second operand array and
        # 0..12(%rsp) like the element-wise products of the two -- confirm
        # against intrinsics.c.
        movl    $0xbf333333, 16(%rsp)   # about -0.7
        movl    $0x40266666, 20(%rsp)   # about 2.6
        movl    $0x40533333, 24(%rsp)   # about 3.3
        movl    $0xc0800000, 28(%rsp)   # -4.0
        movl    $0xbf570a3e, (%rsp)     # about -0.84
        movl    $0x41119999, 4(%rsp)    # about 9.1
        movl    $0x40b3851f, 8(%rsp)    # about 5.61
        movl    $0xc1333333, 12(%rsp)   # about -11.2
        call    printf
        xorl    %eax, %eax              # return value 0
        addq    $56, %rsp
        ret
.LFE493:
        .size   main, .-main
        # Call-frame (unwind) information: one CIE shared by three FDEs,
        # one FDE per function (mul, mul2, main).
        .section        .eh_frame,"a",@progbits
.Lframe1:
        .long   .LECIE1-.LSCIE1         # CIE length
.LSCIE1:
        .long   0x0                     # CIE id (0 identifies a CIE in .eh_frame)
        .byte   0x1                     # CIE version
        .string "zR"                    # augmentation: z = augmentation data present, R = FDE pointer encoding
        .uleb128 0x1                    # code alignment factor
        .sleb128 -8                     # data alignment factor
        .byte   0x10                    # return-address column: DWARF register 16
        .uleb128 0x1                    # augmentation data length
        .byte   0x3                     # FDE pointer encoding: DW_EH_PE_udata4
        .byte   0xc                     # DW_CFA_def_cfa ...
        .uleb128 0x7                    #   ... register 7 (rsp on x86-64)
        .uleb128 0x8                    #   ... offset 8
        .byte   0x90                    # DW_CFA_offset for register 16 ...
        .uleb128 0x1                    #   ... saved at CFA + 1 * (-8)
        .align 8
.LECIE1:
.LSFDE1:
        # FDE covering .LFB491 .. .LFE491 (mul); no extra CFA instructions.
        .long   .LEFDE1-.LASFDE1        # FDE length
.LASFDE1:
        .long   .LASFDE1-.Lframe1       # offset back to the CIE
        .long   .LFB491                 # start address of the covered range
        .long   .LFE491-.LFB491         # size of the covered range
        .uleb128 0x0                    # augmentation data length
        .align 8
.LEFDE1:
.LSFDE3:
        # FDE covering .LFB492 .. .LFE492 (mul2); no extra CFA instructions.
        .long   .LEFDE3-.LASFDE3        # FDE length
.LASFDE3:
        .long   .LASFDE3-.Lframe1       # offset back to the CIE
        .long   .LFB492                 # start address of the covered range
        .long   .LFE492-.LFB492         # size of the covered range
        .uleb128 0x0                    # augmentation data length
        .align 8
.LEFDE3:
.LSFDE5:
        # FDE covering .LFB493 .. .LFE493 (main); records the stack adjustment.
        .long   .LEFDE5-.LASFDE5        # FDE length
.LASFDE5:
        .long   .LASFDE5-.Lframe1       # offset back to the CIE
        .long   .LFB493                 # start address of the covered range
        .long   .LFE493-.LFB493         # size of the covered range
        .uleb128 0x0                    # augmentation data length
        .byte   0x4                     # DW_CFA_advance_loc4 ...
        .long   .LCFI0-.LFB493          #   ... to just after the subq in main
        .byte   0xe                     # DW_CFA_def_cfa_offset ...
        .uleb128 0x40                   #   ... 64 = 56 reserved bytes + 8-byte return address
        .align 8
.LEFDE5:
        .ident  "GCC: (GNU) 4.1.3 20080704 (prerelease) (Debian 4.1.2-24)"
        # Empty .note.GNU-stack section: tells the linker this object does
        # not need an executable stack.
        .section        .note.GNU-stack,"",@progbits

        # Compiler-generated listing, GAS AT&T syntax (op src, dst), x86-64 ELF.
        .file   "intrinsics.c"
        .text
        .p2align 4,,15                  # align to 16 bytes, skipping at most 15 bytes of padding
#-----------------------------------------------------------------------
# mul -- multiply two 4-element float arrays element by element.
# In:    rdi, rsi = addresses of the two source arrays
#        rdx      = address of the destination array
#        (presumably the first three SysV AMD64 pointer arguments;
#         the C prototype is not part of this listing -- confirm)
# Out:   16 bytes written at (%rdx); no value is placed in a return register
# Clobb: xmm0
# Note:  each element is loaded, multiplied and stored before the next
#        one is loaded, so the store order is visible if the destination
#        overlaps a source.  Only scalar (movss/mulss) instructions are
#        used: 8 movss and 4 mulss in total.
#-----------------------------------------------------------------------
.globl mul
        .type   mul, @function
mul:
.LFB513:
        movss   (%rdi), %xmm0           # element 0: load, multiply, store
        mulss   (%rsi), %xmm0
        movss   %xmm0, (%rdx)
        movss   4(%rdi), %xmm0          # element 1 (byte offset 4)
        mulss   4(%rsi), %xmm0
        movss   %xmm0, 4(%rdx)
        movss   8(%rdi), %xmm0          # element 2 (byte offset 8)
        mulss   8(%rsi), %xmm0
        movss   %xmm0, 8(%rdx)
        movss   12(%rdi), %xmm0         # element 3 (byte offset 12)
        mulss   12(%rsi), %xmm0
        movss   %xmm0, 12(%rdx)
        ret
.LFE513:
        .size   mul, .-mul
        # mul2 -- packed variant: multiplies four floats with one mulps.
        # In:    rdi, rsi = addresses of the two 16-byte sources
        #        rdx      = address of the 16-byte destination
        #        (presumably the first three SysV AMD64 pointer arguments -- confirm)
        # Clobb: xmm0
        # Note:  movaps and the memory operand of mulps require 16-byte
        #        aligned addresses; an unaligned pointer faults.
        .p2align 4,,15
.globl mul2
        .type   mul2, @function
mul2:
.LFB514:
        movaps  (%rdi), %xmm0           # load all four floats from the first source
        mulps   (%rsi), %xmm0           # four single-precision multiplies at once
        movaps  %xmm0, (%rdx)           # store all four products
        ret
.LFE514:
        .size   mul2, .-mul2
        # Format string for main's printf call, kept in a mergeable ("M")
        # string ("S") section with 1-byte entries.
        .section        .rodata.str1.1,"aMS",@progbits,1
.LC8:
        .string "%f %f %f %f\n"
        # main -- prints four precomputed doubles with printf and returns 0.
        # No call to mul or mul2 appears here: the values passed to printf
        # come straight from the constant pool (.LC10, .LC14-.LC16, which
        # this listing emits after main).
        # Stack: 56 bytes are reserved; together with the 8-byte return
        #        address that is 64 bytes, so rsp is 16-byte aligned at the
        #        call (given the SysV rule that rsp % 16 == 8 on entry).
        # Stack slots written: 0..12(%rsp), 16..28(%rsp), 32..44(%rsp).
        #        None of them is read back inside main.
        .text
        .p2align 4,,15
.globl main
        .type   main, @function
main:
.LFB515:
        subq    $56, %rsp
.LCFI0:
        movl    $.LC8, %edi             # printf format string (32-bit absolute address, non-PIC)
        movl    $4, %eax                # al = number of vector registers used by the variadic call
        movsd   .LC14(%rip), %xmm3      # 4th double argument
        movl    $0x3f99999a, 32(%rsp)   # float bit pattern, about 1.2
        movsd   .LC15(%rip), %xmm2      # 3rd double argument
        movl    $0x40600000, 36(%rsp)   # 3.5
        movsd   .LC16(%rip), %xmm1      # 2nd double argument
        movl    $0x3fd9999a, 40(%rsp)   # about 1.7
        movsd   .LC10(%rip), %xmm0      # 1st double argument
        movl    $0x40333333, 44(%rsp)   # about 2.8
        # NOTE(review): 16..28(%rsp) looks like the second operand array and
        # 0..12(%rsp) like the element-wise products of the two -- confirm
        # against intrinsics.c.
        movl    $0xbf333333, 16(%rsp)   # about -0.7
        movl    $0x40266666, 20(%rsp)   # about 2.6
        movl    $0x40533333, 24(%rsp)   # about 3.3
        movl    $0xc0800000, 28(%rsp)   # -4.0
        movl    $0xbf570a3e, (%rsp)     # about -0.84
        movl    $0x41119999, 4(%rsp)    # about 9.1
        movl    $0x40b3851f, 8(%rsp)    # about 5.61
        movl    $0xc1333333, 12(%rsp)   # about -11.2
        call    printf
        xorl    %eax, %eax              # return value 0
        addq    $56, %rsp
        ret
.LFE515:
        .size   main, .-main
        # Mergeable 8-byte constants loaded by main with movsd, i.e. as
        # IEEE-754 doubles.  Each pair of .long values is the low word
        # followed by the high word of one constant.  The decimal values in
        # the comments are approximate decodings.
        .section        .rodata.cst8,"aM",@progbits,8
        .align 8
.LC10:
        .long   3221225472              # low word  0xC0000000
        .long   -1075125945             # high word 0xBFEAE147  -> about -0.84
        .align 8
.LC14:
        .long   1610612736              # low word  0x60000000
        .long   -1071225242             # high word 0xC0266666  -> about -11.2
        .align 8
.LC15:
        .long   3758096384              # low word  0xE0000000
        .long   1075212451              # high word 0x401670A3  -> about 5.61
        .align 8
.LC16:
        .long   536870912               # low word  0x20000000
        .long   1075983155              # high word 0x40223333  -> about 9.1
        # Call-frame (unwind) information: one CIE shared by three FDEs,
        # one FDE per function (mul, mul2, main).
        .section        .eh_frame,"a",@progbits
.Lframe1:
        .long   .LECIE1-.LSCIE1         # CIE length
.LSCIE1:
        .long   0x0                     # CIE id (0 identifies a CIE in .eh_frame)
        .byte   0x1                     # CIE version
        .string "zR"                    # augmentation: z = augmentation data present, R = FDE pointer encoding
        .uleb128 0x1                    # code alignment factor
        .sleb128 -8                     # data alignment factor
        .byte   0x10                    # return-address column: DWARF register 16
        .uleb128 0x1                    # augmentation data length
        .byte   0x3                     # FDE pointer encoding: DW_EH_PE_udata4
        .byte   0xc                     # DW_CFA_def_cfa ...
        .uleb128 0x7                    #   ... register 7 (rsp on x86-64)
        .uleb128 0x8                    #   ... offset 8
        .byte   0x90                    # DW_CFA_offset for register 16 ...
        .uleb128 0x1                    #   ... saved at CFA + 1 * (-8)
        .align 8
.LECIE1:
.LSFDE1:
        # FDE covering .LFB513 .. .LFE513 (mul); no extra CFA instructions.
        .long   .LEFDE1-.LASFDE1        # FDE length
.LASFDE1:
        .long   .LASFDE1-.Lframe1       # offset back to the CIE
        .long   .LFB513                 # start address of the covered range
        .long   .LFE513-.LFB513         # size of the covered range
        .uleb128 0x0                    # augmentation data length
        .align 8
.LEFDE1:
.LSFDE3:
        # FDE covering .LFB514 .. .LFE514 (mul2); no extra CFA instructions.
        .long   .LEFDE3-.LASFDE3        # FDE length
.LASFDE3:
        .long   .LASFDE3-.Lframe1       # offset back to the CIE
        .long   .LFB514                 # start address of the covered range
        .long   .LFE514-.LFB514         # size of the covered range
        .uleb128 0x0                    # augmentation data length
        .align 8
.LEFDE3:
.LSFDE5:
        # FDE covering .LFB515 .. .LFE515 (main); records the stack adjustment.
        .long   .LEFDE5-.LASFDE5        # FDE length
.LASFDE5:
        .long   .LASFDE5-.Lframe1       # offset back to the CIE
        .long   .LFB515                 # start address of the covered range
        .long   .LFE515-.LFB515         # size of the covered range
        .uleb128 0x0                    # augmentation data length
        .byte   0x4                     # DW_CFA_advance_loc4 ...
        .long   .LCFI0-.LFB515          #   ... to just after the subq in main
        .byte   0xe                     # DW_CFA_def_cfa_offset ...
        .uleb128 0x40                   #   ... 64 = 56 reserved bytes + 8-byte return address
        .align 8
.LEFDE5:
        .ident  "GCC: (GNU) 4.2.4 (Debian 4.2.4-5)"
        # Empty .note.GNU-stack section: tells the linker this object does
        # not need an executable stack.
        .section        .note.GNU-stack,"",@progbits

signature.asc
Description: Digital signature

Bug#512050: gcc-4.3: pessimizes function without SSE intrinsics

Reply via email to