------- Comment #1 from jv244 at cam dot ac dot uk 2007-03-08 11:11 ------- The following is (for me) an even more interesting example, as it times only the loop that does the actual multiply / add but also tricks my version of ifort into generating the expected asm. Ifort is about twice as fast as gfortran on it.
! Reproducer kernel: for Nit repetitions, accumulates pxy(i)%a scaled by
! dpy(i)%a(1) into s(01)%a and pxy(i)%a scaled by dpy(i)%a(2) into s(02)%a,
! over the first N elements of pxy/dpy.
!
! Arguments:
!   jg, cmax  - INTENT(IN) integers; declared but never referenced in the body.
SUBROUTINE collocate_core_2_2_0_0(jg,cmax)
  IMPLICIT NONE
  integer, INTENT(IN) :: jg,cmax
  ! Real kind with at least 14 decimal digits and a decimal exponent range of 200.
  INTEGER, PARAMETER :: wp = SELECTED_REAL_KIND ( 14, 200 )
  ! N: number of array elements actually used; Nit: outer repetition count.
  INTEGER, PARAMETER :: N=10,Nit=100000000
  ! Two-component real vector.
  TYPE vec
    real(wp) :: a(2)
  END TYPE vec
  ! Declared with 1000 elements, but only elements 1..N are initialised and read here.
  TYPE(vec) :: dpy(1000)
  TYPE(vec) :: pxy(1000)
  ! Two accumulators, one per component of dpy(i)%a.
  TYPE(vec) :: s(02)
  integer :: i,j
  ! Zero the first N elements of both input arrays.
  DO i=1,N
    pxy(i)%a=0.0_wp
  ENDDO
  DO i=1,N
    dpy(i)%a=0.0_wp
  ENDDO
  ! Zero both accumulators component by component.
  s(01)%a(1)=0.0_wp
  s(01)%a(2)=0.0_wp
  s(02)%a(1)=0.0_wp
  s(02)%a(2)=0.0_wp
  ! USE is an external procedure that is not defined in the visible text.
  ! NOTE(review): presumably an opaque call that stops the compiler from
  ! constant-folding or discarding the arrays - confirm against the full test.f90.
  CALL USE(dpy,pxy,s)
  ! Loop nest under discussion: the same N-element multiply/add sweep is
  ! repeated Nit times.
  DO j=1,Nit
    DO i=1,N
      ! Both components of pxy(i)%a are scaled by the scalar dpy(i)%a(1).
      s(01)%a(:)=s(01)%a(:)+pxy(i)%a(:)*dpy(i)%a(1)
      ! Same two-component update, scaled by dpy(i)%a(2).
      s(02)%a(:)=s(02)%a(:)+pxy(i)%a(:)*dpy(i)%a(2)
    ENDDO
  ENDDO
  ! Second call to the external procedure, after the accumulation.
  ! NOTE(review): presumably keeps the results live - confirm.
  CALL USE(dpy,pxy,s)
END SUBROUTINE
[EMAIL PROTECTED]:/data/vondele/extracted_collocate/test> gfortran -O2 -march=native -ftree-vectorize -ffast-math test.f90
[EMAIL PROTECTED]:/data/vondele/extracted_collocate/test> ./a.out
4.288268
[EMAIL PROTECTED]:/data/vondele/extracted_collocate/test> ifort -O2 -xT test.f90
test.f90(16) : (col. 8) remark: LOOP WAS VECTORIZED.
test.f90(19) : (col. 8) remark: LOOP WAS VECTORIZED.
test.f90(31) : (col. 6) remark: LOOP WAS VECTORIZED.
test.f90(31) : (col. 6) remark: LOOP WAS VECTORIZED.
test.f90(32) : (col. 6) remark: LOOP WAS VECTORIZED.
test.f90(32) : (col. 6) remark: LOOP WAS VECTORIZED.
[EMAIL PROTECTED]:/data/vondele/extracted_collocate/test> ./a.out
1.944121
The inner loop asm looks, with ifort, also the way I was hoping it to look like:
.B2.7: # Preds ..B2.7 ..B2.6
movddup -16+collocate_core_2_2_0_0_$DPY.0.0(%rcx), %xmm2 #31.41
movddup -8+collocate_core_2_2_0_0_$DPY.0.0(%rcx), %xmm3 #32.41
addq $16, %rdx #33.4
movapd collocate_core_2_2_0_0_$PXY.0.0(%rdx), %xmm4 #31.6
mulpd %xmm4, %xmm2 #31.39
mulpd %xmm3, %xmm4 #32.39
addpd %xmm2, %xmm1 #31.7
addpd %xmm4, %xmm0 #32.7
addq $16, %rcx #33.5
cmpq $160, %rcx #33.4
jle ..B2.7 # Prob 90% #33.4
# LOE rdx rcx rbx rbp r12 r13 r14 r15 eax xmm0 xmm1
-- http://gcc.gnu.org/bugzilla/show_bug.cgi?id=31079