------- Comment #4 from jv244 at cam dot ac dot uk 2008-01-07 22:00 -------
Timings have improved a lot with a recent gfortran, at least on an Opteron: I now get 3.7s with ifort and 4.5s with gfortran (only 20% slower) for the following code:
! Benchmark kernel: repeatedly accumulates products of pxy and dpy components
! into the two 2-element accumulators s(1)%a and s(2)%a.
!   jg, cmax : INTENT(IN) integers; neither is referenced in the body.
SUBROUTINE collocate_core_2_2_0_0(jg,cmax)
  IMPLICIT NONE
  integer, INTENT(IN) :: jg,cmax
  INTEGER, PARAMETER :: wp = SELECTED_REAL_KIND ( 14, 200 )
  ! N is the inner trip count, Nit the number of repetitions of the inner loop.
  INTEGER, PARAMETER :: N=10,Nit=100000000
  ! vec wraps a pair of reals of kind wp.
  TYPE vec
    real(wp) :: a(2)
  END TYPE vec
  TYPE(vec) :: dpy(1000)
  TYPE(vec) :: pxy(1000)
  TYPE(vec) :: s(02)
  integer :: i,j

  ! Only the first N of the 1000 elements of pxy and dpy are initialised;
  ! the loops below also touch only elements 1..N.
  DO i=1,N
    pxy(i)%a=0.0_wp
  ENDDO
  DO i=1,N
    dpy(i)%a=0.0_wp
  ENDDO
  s(01)%a(1)=0.0_wp
  s(01)%a(2)=0.0_wp
  s(02)%a(1)=0.0_wp
  s(02)%a(2)=0.0_wp

  ! USE has an empty body.
  ! NOTE(review): presumably these calls exist so the compiler cannot discard
  ! the arrays or the loop between them -- confirm.
  ! NOTE(review): the actual arguments are TYPE(vec) arrays while USE declares
  ! REAL(wp) assumed-size dummies, and USE is called through an implicit
  ! interface -- confirm the compiler in use accepts this mismatch.
  CALL USE(dpy,pxy,s)

  ! this is the hot loop
  ! For each i in 1..N: s(1)%a gains pxy(i)%a scaled by dpy(i)%a(1), and
  ! s(2)%a gains pxy(i)%a scaled by dpy(i)%a(2). The whole sweep runs Nit times.
  DO j=1,Nit
    DO i=1,N
      s(01)%a(:)=s(01)%a(:)+pxy(i)%a(:)*dpy(i)%a(1)
      s(02)%a(:)=s(02)%a(:)+pxy(i)%a(:)*dpy(i)%a(2)
    ENDDO
  ENDDO

  CALL USE(dpy,pxy,s)
END SUBROUTINE

! Empty external subroutine taking three assumed-size REAL(wp) arrays.
! It declares its dummies and does nothing else (no INTENT, no IMPLICIT NONE).
SUBROUTINE USE(a,b,c)
  INTEGER, PARAMETER :: wp = SELECTED_REAL_KIND ( 14, 200 )
  REAL(kind=wp) :: a(*),b(*),c(*)
END SUBROUTINE USE

! Driver: calls the kernel once and prints the CPU time spent around the call.
! There is no IMPLICIT NONE here, so jg and i are implicitly typed.
PROGRAM TEST
  integer, parameter :: cmax=5
  ! t1 and t2 are referenced only by the commented-out nanotime_ia32 timing
  ! code below; tbest is set but otherwise used only in commented-out code.
  integer*8 :: t1,t2,tbest
  real :: time1,time2
  ! jg is assigned here but not used afterwards (the call passes a literal 0).
  jg=0
  CALL cpu_time(time1)
  tbest=huge(tbest)
  ! Single-iteration loop around the kernel call.
  DO i=1,1
    ! t1=nanotime_ia32()
    CALL collocate_core_2_2_0_0(0,cmax)
    ! t2=nanotime_ia32()
    ! if(t2-t1>0 .AND. t2-t1<tbest) tbest=t2-t1
  ENDDO
  CALL cpu_time(time2)
  ! write(6,*) tbest,time2-time1
  ! Prints the elapsed CPU time (time2-time1) to unit 6.
  write(6,*) time2-time1
END PROGRAM TEST

using

ifort -xW -O3 test.f90
gfortran -march=native -O3 -ffast-math test.f90

gfortran's inner loop asm looks like:

.L8:
        movlpd (%rbp,%rax), %xmm0
        movsd %xmm0, %xmm1
        mulsd (%rbx,%rax), %xmm1
        addsd %xmm1, %xmm2
        movsd %xmm2, 32000(%rsp)
        mulsd 8(%rbx,%rax), %xmm0
        addsd %xmm0, %xmm5
        movsd %xmm5, 32008(%rsp)
        movlpd 8(%rbp,%rax), %xmm0
        movsd %xmm0, %xmm1
        mulsd (%rbx,%rax), %xmm1
        addsd %xmm1, %xmm4
        movsd %xmm4, 32016(%rsp)
        mulsd 8(%rbx,%rax), %xmm0
        addq $16, %rax
        cmpq $160, %rax
        addsd %xmm0, %xmm3
        movsd %xmm3, 32024(%rsp)
        jne .L8

while ifort's loop looks like:

..B3.7: # Preds ..B3.7 ..B3.6
        movsd collocate_core_2_2_0_0_$DPY.0.0(%rdx), %xmm2 #31.41
        movsd 8+collocate_core_2_2_0_0_$DPY.0.0(%rdx), %xmm3 #32.41
        movaps collocate_core_2_2_0_0_$PXY.0.0(%rdx), %xmm4 #31.7
        unpcklpd %xmm2, %xmm2 #31.41
        mulpd %xmm4, %xmm2 #31.40
        addpd %xmm2, %xmm1 #31.7
        unpcklpd %xmm3, %xmm3 #32.41
        mulpd %xmm3, %xmm4 #32.40
        addpd %xmm4, %xmm0 #32.7
        addq $16, %rdx #30.5
        cmpq $160, %rdx #30.5
        jl ..B3.7 # Prob 90% #30.5

so I guess ifort vectorizes where gfortran does not.

-- http://gcc.gnu.org/bugzilla/show_bug.cgi?id=31079