------- Comment #4 from changpeng dot fang at amd dot com 2010-08-24 00:46
-------
Oops, the Open64-generated code posted in the last comment is for the
non-vectorized loop; the vectorized one is similar:
.LBB23_f:
.loc 1 7 0
movups 0(%r10),%xmm3 # [0] id:65
movups 0(%rax),%xmm1 # [1] id:64
subps %xmm3,%xmm1 # [3]
.loc 1 8 0
mulps %xmm1,%xmm1 # [7]
movups 0(%r9),%xmm2 # [9] id:66
mulps %xmm2,%xmm1 # [11]
addq $16,%rax # [13]
addq $16,%r9 # [14]
addq $16,%r10 # [14]
.loc 1 7 0
prefetchnta 112(%rax) # [14] L1
prefetchnta 112(%r10) # [15] L1
.loc 1 8 0
prefetchnta 112(%r9) # [15] L1
subps %xmm1,%xmm0 # [15]
--
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=45391