On Fri, Jan 16, 2009 at 08:10:15PM +0100, Martin Michlmayr wrote:
* brian m. carlson <[email protected]> [2009-01-16 18:38]: Obviously, since the two functions do the exact same thing, they should be optimized to be identical. Instead, mul is pessimized. Can you check if this happens with gcc-4.3 and trunk from SVN, and if so, file the bug upstream?
It does happen with gcc-4.3 (hence, I filed the bug there) as well as gcc-snapshot 20090107-1. I really would prefer not to build from SVN if I don't have to. I believe that it's the maintainer's job to file the bug upstream, and upstream has not won any points with me for neglecting a bug that I reported with a trivial patch. Thus, I am hesitant to forward the bug myself. I have been informed that Apple's GCC does better on this [0]; I will see if I can get my friend to provide a .i and .s file from that version. I also just noted that gcc-4.1 and gcc-4.2 produce much less bad code. I've attached intrinsics.s from both of those versions; they each use 8 movss and 4 mulss in mul. Nevertheless, they still do not convert the code into the three SSE instructions that mul2 uses. [0] I believe Apple's GCC is based on an older version of FSF gcc, which is probably why it does better; the results are likely identical between the two. -- brian m. carlson / brian with sandals: Houston, Texas, US +1 713 440 7475 | http://crustytoothpaste.ath.cx/~bmc | My opinion only troff on top of XML: http://crustytoothpaste.ath.cx/~bmc/code/thwack OpenPGP: RSA v4 4096b 88AC E9B2 9196 305B A994 7552 F1BA 225C 0223 B187
# Compiler-generated x86-64 assembly (GAS, AT&T syntax) for intrinsics.c.
.file "intrinsics.c"
.text
# Align the function entry to 16 bytes (skip at most 15 padding bytes).
.p2align 4,,15
.globl mul
.type mul, @function
# mul: element-by-element product of four single-precision floats.
#   In:       %rdi, %rsi = addresses of the two 4-float inputs
#             %rdx       = address of the 4-float output
#   Out:      products written to 0/4/8/12(%rdx); no register result
#   Clobbers: %xmm0
#   NOTE(review): the C prototype is not visible in this listing; presumably
#   void mul(const float *, const float *, float *) -- confirm against
#   intrinsics.c.
# Every element gets its own scalar load / multiply / store triple, for a
# total of 8 movss and 4 mulss. Each store through %rdx is issued before the
# next element is loaded, so the instruction order must not be changed if the
# output may overlap an input.
mul:
# .LFB491/.LFE491 bracket the function for the unwind entry in .eh_frame.
.LFB491:
# Element 0: (%rdx) = (%rdi) * (%rsi)
movss (%rdi), %xmm0
mulss (%rsi), %xmm0
movss %xmm0, (%rdx)
# Element 1 (byte offset 4)
movss 4(%rdi), %xmm0
mulss 4(%rsi), %xmm0
movss %xmm0, 4(%rdx)
# Element 2 (byte offset 8)
movss 8(%rdi), %xmm0
mulss 8(%rsi), %xmm0
movss %xmm0, 8(%rdx)
# Element 3 (byte offset 12)
movss 12(%rdi), %xmm0
mulss 12(%rsi), %xmm0
movss %xmm0, 12(%rdx)
ret
.LFE491:
.size mul, .-mul
# Align the function entry to 16 bytes (skip at most 15 padding bytes).
.p2align 4,,15
.globl mul2
.type mul2, @function
# mul2: packed product of four single-precision floats in one mulps.
#   In:       %rdi, %rsi = addresses of the two 4-float inputs
#             %rdx       = address of the 4-float output
#   Out:      16 bytes of products written to (%rdx); no register result
#   Clobbers: %xmm0
#   Alignment: movaps and the memory operand of mulps fault on addresses
#   that are not 16-byte aligned, so all three pointers must be aligned.
#   NOTE(review): the C prototype is not visible in this listing; presumably
#   it takes pointers to a 16-byte vector type such as __m128 -- confirm.
mul2:
# .LFB492/.LFE492 bracket the function for the unwind entry in .eh_frame.
.LFB492:
# Load all four floats of the first input.
movaps (%rdi), %xmm0
# Multiply lane-wise by the four floats of the second input.
mulps (%rsi), %xmm0
# Store all four products.
movaps %xmm0, (%rdx)
ret
.LFE492:
.size mul2, .-mul2
# Read-only data referenced by main.
# Mergeable string section holding the printf format string.
.section .rodata.str1.1,"aMS",@progbits,1
.LC8:
.string "%f %f %f %f\n"
# Mergeable section of 8-byte constants. Each .long pair below is the low
# word followed by the high word of an IEEE-754 double that main loads with
# movsd. The low 29 mantissa bits of every value are zero, i.e. each one is
# exactly representable as a single-precision float.
.section .rodata.cst8,"aM",@progbits,8
.align 8
# Bit pattern 0xC026666660000000, approximately -11.2
.LC13:
.long 1610612736
.long -1071225242
.align 8
# Bit pattern 0x401670A3E0000000, approximately 5.61
.LC14:
.long 3758096384
.long 1075212451
.align 8
# Bit pattern 0x4022333320000000, approximately 9.1
.LC15:
.long 536870912
.long 1075983155
.align 8
# Bit pattern 0xBFEAE147C0000000, approximately -0.84
.LC16:
.long 3221225472
.long -1075125945
.text
# Align the function entry to 16 bytes (skip at most 15 padding bytes).
.p2align 4,,15
.globl main
.type main, @function
# main: prints four precomputed products with printf, then returns 0.
#   Out:      %eax = 0
#   Calls:    printf (variadic)
#   Stack:    56 bytes of locals
# No call to mul or mul2 appears in this function; the input values and
# their products are present as immediates and .rodata constants instead.
# The twelve floats written to the stack are not read back anywhere in this
# function.
main:
.LFB493:
# Reserve 56 bytes. Together with the 8-byte return address this makes
# 64 bytes, which keeps %rsp 16-byte aligned at the call below.
subq $56, %rsp
# .LCFI0 marks the point where the unwind info switches to the new CFA offset.
.LCFI0:
# First printf argument: address of the format string, loaded as an
# absolute 32-bit immediate (not position-independent).
movl $.LC8, %edi
# Variadic call convention: %al = number of vector registers that carry
# arguments, here %xmm0..%xmm3.
movl $4, %eax
# Loads of the four double arguments (in reverse order, %xmm3 first) are
# interleaved with stores of the first input vector to offsets 32..44.
# %xmm3 = about -11.2 (fourth value printed)
movsd .LC13(%rip), %xmm3
# Float 0x3f99999a is about 1.2
movl $0x3f99999a, 32(%rsp)
# %xmm2 = about 5.61 (third value printed)
movsd .LC14(%rip), %xmm2
# Float 0x40600000 is 3.5
movl $0x40600000, 36(%rsp)
# %xmm1 = about 9.1 (second value printed)
movsd .LC15(%rip), %xmm1
# Float 0x3fd9999a is about 1.7
movl $0x3fd9999a, 40(%rsp)
# %xmm0 = about -0.84 (first value printed)
movsd .LC16(%rip), %xmm0
# Float 0x40333333 is about 2.8
movl $0x40333333, 44(%rsp)
# Second input vector at offsets 16..28: about -0.7, 2.6, 3.3 and exactly -4.0
movl $0xbf333333, 16(%rsp)
movl $0x40266666, 20(%rsp)
movl $0x40533333, 24(%rsp)
movl $0xc0800000, 28(%rsp)
# Single-precision products at offsets 0..12: about -0.84, 9.1, 5.61, -11.2
# (the same values that are passed to printf as doubles).
movl $0xbf570a3e, (%rsp)
movl $0x41119999, 4(%rsp)
movl $0x40b3851f, 8(%rsp)
movl $0xc1333333, 12(%rsp)
call printf
# Return value 0.
xorl %eax, %eax
# Release the 56-byte frame.
addq $56, %rsp
ret
.LFE493:
.size main, .-main
# Call-frame (unwind) information: one CIE shared by three FDEs, one FDE per
# function (mul, mul2, main).
.section .eh_frame,"a",@progbits
.Lframe1:
# ---- CIE (Common Information Entry) ----
# Length of the CIE, not counting this length field.
.long .LECIE1-.LSCIE1
.LSCIE1:
# CIE id: zero identifies this record as a CIE in .eh_frame.
.long 0x0
# CIE version.
.byte 0x1
# Augmentation "zR": augmentation data is present and carries the pointer
# encoding used by the FDEs.
.string "zR"
# Code alignment factor.
.uleb128 0x1
# Data alignment factor.
.sleb128 -8
# Return-address column: DWARF register 16.
.byte 0x10
# Augmentation data length (one byte follows).
.uleb128 0x1
# FDE pointer encoding 0x3: unsigned 4-byte absolute values (DW_EH_PE_udata4).
.byte 0x3
# Initial instructions. Opcode 0xc is DW_CFA_def_cfa: CFA = register 7 + 8.
.byte 0xc
.uleb128 0x7
.uleb128 0x8
# Opcode 0x90 is DW_CFA_offset for register 16 with factored offset 1,
# i.e. the return address is saved at CFA-8.
.byte 0x90
.uleb128 0x1
.align 8
.LECIE1:
# ---- FDE for mul (.LFB491 .. .LFE491); no frame changes recorded ----
.LSFDE1:
# Length of this FDE, not counting this length field.
.long .LEFDE1-.LASFDE1
.LASFDE1:
# Distance from this field back to the start of the CIE.
.long .LASFDE1-.Lframe1
# Start address and byte length of the covered code range.
.long .LFB491
.long .LFE491-.LFB491
# Augmentation data length: none.
.uleb128 0x0
.align 8
.LEFDE1:
# ---- FDE for mul2 (.LFB492 .. .LFE492); no frame changes recorded ----
.LSFDE3:
# Length of this FDE, not counting this length field.
.long .LEFDE3-.LASFDE3
.LASFDE3:
# Distance from this field back to the start of the CIE.
.long .LASFDE3-.Lframe1
# Start address and byte length of the covered code range.
.long .LFB492
.long .LFE492-.LFB492
# Augmentation data length: none.
.uleb128 0x0
.align 8
.LEFDE3:
# ---- FDE for main (.LFB493 .. .LFE493) ----
.LSFDE5:
# Length of this FDE, not counting this length field.
.long .LEFDE5-.LASFDE5
.LASFDE5:
# Distance from this field back to the start of the CIE.
.long .LASFDE5-.Lframe1
# Start address and byte length of the covered code range.
.long .LFB493
.long .LFE493-.LFB493
# Augmentation data length: none.
.uleb128 0x0
# Opcode 0x4 is DW_CFA_advance_loc4: move to .LCFI0, just after the
# subq $56, %rsp in main.
.byte 0x4
.long .LCFI0-.LFB493
# Opcode 0xe is DW_CFA_def_cfa_offset: CFA offset becomes 0x40 = 64 bytes
# (56 bytes of locals plus the 8-byte return address).
.byte 0xe
.uleb128 0x40
.align 8
.LEFDE5:
# Compiler identification string recorded in the object file; this listing
# came from the Debian gcc-4.1 package.
.ident "GCC: (GNU) 4.1.3 20080704 (prerelease) (Debian 4.1.2-24)"
# Empty .note.GNU-stack section with no flags: tells the linker that this
# object does not need an executable stack.
.section .note.GNU-stack,"",@progbits
# Second listing: compiler-generated x86-64 assembly (GAS, AT&T syntax) for
# intrinsics.c, as emitted by the compiler named in the .ident line below.
.file "intrinsics.c"
.text
# Align the function entry to 16 bytes (skip at most 15 padding bytes).
.p2align 4,,15
.globl mul
.type mul, @function
# mul: multiplies four single-precision floats pairwise, one at a time.
#   In:       %rdi, %rsi = addresses of the two 4-float inputs
#             %rdx       = address of the 4-float output
#   Out:      products written to 0/4/8/12(%rdx); no register result
#   Clobbers: %xmm0
#   NOTE(review): the C prototype is not visible in this listing; presumably
#   void mul(const float *, const float *, float *) -- confirm against
#   intrinsics.c.
# The body is four scalar load / multiply / store triples (8 movss and
# 4 mulss). Stores and loads are strictly interleaved, so the order matters
# if the output may overlap an input.
mul:
# .LFB513/.LFE513 bracket the function for the unwind entry in .eh_frame.
.LFB513:
# Element 0: (%rdx) = (%rdi) * (%rsi)
movss (%rdi), %xmm0
mulss (%rsi), %xmm0
movss %xmm0, (%rdx)
# Element 1 (byte offset 4)
movss 4(%rdi), %xmm0
mulss 4(%rsi), %xmm0
movss %xmm0, 4(%rdx)
# Element 2 (byte offset 8)
movss 8(%rdi), %xmm0
mulss 8(%rsi), %xmm0
movss %xmm0, 8(%rdx)
# Element 3 (byte offset 12)
movss 12(%rdi), %xmm0
mulss 12(%rsi), %xmm0
movss %xmm0, 12(%rdx)
ret
.LFE513:
.size mul, .-mul
# Align the function entry to 16 bytes (skip at most 15 padding bytes).
.p2align 4,,15
.globl mul2
.type mul2, @function
# mul2: multiplies four single-precision floats pairwise with one packed
# SSE multiply.
#   In:       %rdi, %rsi = addresses of the two 4-float inputs
#             %rdx       = address of the 4-float output
#   Out:      16 bytes of products written to (%rdx); no register result
#   Clobbers: %xmm0
#   Alignment: movaps and the memory operand of mulps fault on addresses
#   that are not 16-byte aligned, so all three pointers must be aligned.
#   NOTE(review): the C prototype is not visible in this listing; presumably
#   it takes pointers to a 16-byte vector type such as __m128 -- confirm.
mul2:
# .LFB514/.LFE514 bracket the function for the unwind entry in .eh_frame.
.LFB514:
# %xmm0 = the four floats of the first input
movaps (%rdi), %xmm0
# %xmm0 = lane-wise product with the four floats of the second input
mulps (%rsi), %xmm0
# Write the four products to the output.
movaps %xmm0, (%rdx)
ret
.LFE514:
.size mul2, .-mul2
# Mergeable read-only string section holding the format string that main
# passes to printf.
.section .rodata.str1.1,"aMS",@progbits,1
.LC8:
.string "%f %f %f %f\n"
.text
# Align the function entry to 16 bytes (skip at most 15 padding bytes).
.p2align 4,,15
.globl main
.type main, @function
# main: passes four precomputed products to printf and returns 0.
#   Out:      %eax = 0
#   Calls:    printf (variadic)
#   Stack:    56 bytes of locals
# mul and mul2 are not called from this function; the inputs and their
# products appear as immediates and .rodata.cst8 constants instead. The
# twelve floats stored to the stack are not read back anywhere in this
# function.
main:
.LFB515:
# Reserve 56 bytes; with the 8-byte return address that is 64 bytes, so
# %rsp stays 16-byte aligned at the call below.
subq $56, %rsp
# .LCFI0 marks the point where the unwind info switches to the new CFA offset.
.LCFI0:
# First printf argument: address of the format string, as an absolute
# 32-bit immediate (not position-independent).
movl $.LC8, %edi
# Variadic call convention: %al = number of vector registers that carry
# arguments, here %xmm0..%xmm3.
movl $4, %eax
# The four double arguments are loaded in reverse order (%xmm3 first),
# interleaved with stores of the first input vector to offsets 32..44.
# %xmm3 = about -11.2 (fourth value printed)
movsd .LC14(%rip), %xmm3
# Float 0x3f99999a is about 1.2
movl $0x3f99999a, 32(%rsp)
# %xmm2 = about 5.61 (third value printed)
movsd .LC15(%rip), %xmm2
# Float 0x40600000 is 3.5
movl $0x40600000, 36(%rsp)
# %xmm1 = about 9.1 (second value printed)
movsd .LC16(%rip), %xmm1
# Float 0x3fd9999a is about 1.7
movl $0x3fd9999a, 40(%rsp)
# %xmm0 = about -0.84 (first value printed)
movsd .LC10(%rip), %xmm0
# Float 0x40333333 is about 2.8
movl $0x40333333, 44(%rsp)
# Second input vector at offsets 16..28: about -0.7, 2.6, 3.3 and exactly -4.0
movl $0xbf333333, 16(%rsp)
movl $0x40266666, 20(%rsp)
movl $0x40533333, 24(%rsp)
movl $0xc0800000, 28(%rsp)
# Single-precision products at offsets 0..12: about -0.84, 9.1, 5.61, -11.2
# (the same values that are passed to printf as doubles).
movl $0xbf570a3e, (%rsp)
movl $0x41119999, 4(%rsp)
movl $0x40b3851f, 8(%rsp)
movl $0xc1333333, 12(%rsp)
call printf
# Return value 0.
xorl %eax, %eax
# Release the 56-byte frame.
addq $56, %rsp
ret
.LFE515:
.size main, .-main
# Mergeable section of 8-byte constants used by main. Each .long pair is the
# low word followed by the high word of an IEEE-754 double loaded with movsd.
# The low 29 mantissa bits of every value are zero, i.e. each one is exactly
# representable as a single-precision float.
.section .rodata.cst8,"aM",@progbits,8
.align 8
# Bit pattern 0xBFEAE147C0000000, approximately -0.84 (loaded into %xmm0)
.LC10:
.long 3221225472
.long -1075125945
.align 8
# Bit pattern 0xC026666660000000, approximately -11.2 (loaded into %xmm3)
.LC14:
.long 1610612736
.long -1071225242
.align 8
# Bit pattern 0x401670A3E0000000, approximately 5.61 (loaded into %xmm2)
.LC15:
.long 3758096384
.long 1075212451
.align 8
# Bit pattern 0x4022333320000000, approximately 9.1 (loaded into %xmm1)
.LC16:
.long 536870912
.long 1075983155
# Unwind (call-frame) information: a single CIE followed by one FDE each for
# mul, mul2 and main.
.section .eh_frame,"a",@progbits
.Lframe1:
# ---- CIE (Common Information Entry) ----
# Length of the CIE, not counting this length field.
.long .LECIE1-.LSCIE1
.LSCIE1:
# CIE id: zero identifies this record as a CIE in .eh_frame.
.long 0x0
# CIE version.
.byte 0x1
# Augmentation "zR": augmentation data is present and carries the pointer
# encoding used by the FDEs.
.string "zR"
# Code alignment factor.
.uleb128 0x1
# Data alignment factor.
.sleb128 -8
# Return-address column: DWARF register 16.
.byte 0x10
# Augmentation data length (one byte follows).
.uleb128 0x1
# FDE pointer encoding 0x3: unsigned 4-byte absolute values (DW_EH_PE_udata4).
.byte 0x3
# Initial instructions. Opcode 0xc is DW_CFA_def_cfa: CFA = register 7 + 8.
.byte 0xc
.uleb128 0x7
.uleb128 0x8
# Opcode 0x90 is DW_CFA_offset for register 16 with factored offset 1,
# i.e. the return address is saved at CFA-8.
.byte 0x90
.uleb128 0x1
.align 8
.LECIE1:
# ---- FDE for mul (.LFB513 .. .LFE513); no frame changes recorded ----
.LSFDE1:
# Length of this FDE, not counting this length field.
.long .LEFDE1-.LASFDE1
.LASFDE1:
# Distance from this field back to the start of the CIE.
.long .LASFDE1-.Lframe1
# Start address and byte length of the covered code range.
.long .LFB513
.long .LFE513-.LFB513
# Augmentation data length: none.
.uleb128 0x0
.align 8
.LEFDE1:
# ---- FDE for mul2 (.LFB514 .. .LFE514); no frame changes recorded ----
.LSFDE3:
# Length of this FDE, not counting this length field.
.long .LEFDE3-.LASFDE3
.LASFDE3:
# Distance from this field back to the start of the CIE.
.long .LASFDE3-.Lframe1
# Start address and byte length of the covered code range.
.long .LFB514
.long .LFE514-.LFB514
# Augmentation data length: none.
.uleb128 0x0
.align 8
.LEFDE3:
# ---- FDE for main (.LFB515 .. .LFE515) ----
.LSFDE5:
# Length of this FDE, not counting this length field.
.long .LEFDE5-.LASFDE5
.LASFDE5:
# Distance from this field back to the start of the CIE.
.long .LASFDE5-.Lframe1
# Start address and byte length of the covered code range.
.long .LFB515
.long .LFE515-.LFB515
# Augmentation data length: none.
.uleb128 0x0
# Opcode 0x4 is DW_CFA_advance_loc4: move to .LCFI0, just after the
# subq $56, %rsp in main.
.byte 0x4
.long .LCFI0-.LFB515
# Opcode 0xe is DW_CFA_def_cfa_offset: CFA offset becomes 0x40 = 64 bytes
# (56 bytes of locals plus the 8-byte return address).
.byte 0xe
.uleb128 0x40
.align 8
.LEFDE5:
# Compiler identification string recorded in the object file; this listing
# came from the Debian gcc-4.2 package.
.ident "GCC: (GNU) 4.2.4 (Debian 4.2.4-5)"
# Empty .note.GNU-stack section with no flags: tells the linker that this
# object does not need an executable stack.
.section .note.GNU-stack,"",@progbits
signature.asc
Description: Digital signature

