------- Comment #9 from dominiq at lps dot ens dot fr 2009-02-01 10:58 -------
> Did you try enabling SSE3 btw?
No. How do I find out which SSE* extensions are enabled by default?
> Can you post the ifort assembly of the loop?
# ----------------------------------------------------------------------
# Loop body L_B1.14 (ifort output, x86-64, AT&T operand order: src, dst).
#
# Per iteration: reads 32 bytes from mymatmul_$A.0.1 and updates 32 bytes
# of mymatmul_$C.0.1 in place, as two 16-byte vectors (offsets 0 and 16).
#
# Registers read but never written inside the loop:
#   %xmm0, %xmm1  - packed-single multiplicands
#   %rsi, %rdx    - offsets added to the A / C index respectively
# Loop counter: %r9, stepped by 4 and scaled by 8 bytes in the addressing
# (4 * 8 = 32 bytes per trip); the loop repeats while %r9 < 128 (signed).
#
# NOTE(review): the mulps / lane-swap / mulps / addsubps sequence looks
# like a single-precision complex multiply-accumulate (C += s * A) with
# %xmm0 and %xmm1 holding the replicated real and imaginary parts of s -
# presumably; confirm against the Fortran source (line 27 per the
# trailing #line.col annotations).
# ----------------------------------------------------------------------
L_B1.14: # Preds L_B1.14 L_B1.13
# r11 = rsi + r9*8 : offset into A, formed from the pre-increment r9.
lea (%rsi,%r9,8), %r11 #
# r10 = address of A; does not depend on anything modified in the loop,
# yet it is recomputed on every trip.
lea mymatmul_$A.0.1(%rip), %r10 #27.33
# Load two consecutive 16-byte vectors of A (movaps requires 16-byte
# aligned addresses).
movaps (%r10,%r11), %xmm2 #27.33
movaps 16(%r10,%r11), %xmm4 #27.33
# First vector: xmm3 = xmm0 * A[0..3], lane by lane.
movaps %xmm0, %xmm3 #27.40
mulps %xmm2, %xmm3 #27.40
# Immediate 177 = 0xB1 swaps adjacent lanes: [a0,a1,a2,a3] -> [a1,a0,a3,a2].
shufps $177, %xmm2, %xmm2 #27.40
# r15 = rdx + r9*8 : offset into C, also from the pre-increment r9.
lea (%rdx,%r9,8), %r15 #
# r14 = address of C (loop-invariant, recomputed each trip like r10).
lea mymatmul_$C.0.1(%rip), %r14 #27.24
# Start of the second vector's product: copy xmm0 for the mulps below.
movaps %xmm0, %xmm5 #27.40
# Advance the counter; r11 and r15 above already captured the old value.
addq $4, %r9 #26.12
# xmm2 = xmm1 * lane-swapped A[0..3].
mulps %xmm1, %xmm2 #27.40
# Set flags for the closing jl. The SSE instructions between here and the
# branch do not write EFLAGS, so the comparison result survives.
cmpq $128, %r9 #26.12
# SSE3 addsubps: even lanes xmm3 - xmm2, odd lanes xmm3 + xmm2.
addsubps %xmm2, %xmm3 #27.40
# Accumulate into C: C[0..3] += xmm3, then store back (aligned).
addps (%r14,%r15), %xmm3 #27.15
movaps %xmm3, (%r14,%r15) #27.15
# Second vector: same multiply / swap / multiply / addsub / accumulate
# sequence, applied to the data at offset 16.
mulps %xmm4, %xmm5 #27.40
shufps $177, %xmm4, %xmm4 #27.40
mulps %xmm1, %xmm4 #27.40
addsubps %xmm4, %xmm5 #27.40
addps 16(%r14,%r15), %xmm5 #27.15
movaps %xmm5, 16(%r14,%r15) #27.15
# Repeat while r9 < 128 (signed compare, flags from the cmpq above).
jl L_B1.14 # Prob 99% #26.12
--
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=38968