gfortran

jv244 at cam dot ac dot uk Mon, 07 Jan 2008 14:01:15 -0800


------- Comment #4 from jv244 at cam dot ac dot uk  2008-01-07 22:00 -------
Timings have improved a lot with a recent gfortran, at least on an Opteron:
I now get 3.7s for ifort and 4.5s for gfortran (only about 20% slower) for the
following code:


! Benchmark kernel for the ifort/gfortran timing comparison.
!
! Zero-initializes the first N elements of two arrays of 2-component vectors
! (pxy, dpy) and a 2-element accumulator s, then repeats Nit times a loop over
! those N elements that accumulates pxy(i)%a scaled by dpy(i)%a(1) into s(1)%a
! and pxy(i)%a scaled by dpy(i)%a(2) into s(2)%a.
!
! Arguments:
!   jg, cmax : INTENT(IN) integers; neither is referenced in the body.
!
! The routine has no outputs of its own; s is only handed to the external
! routine USE before and after the hot loop.
SUBROUTINE collocate_core_2_2_0_0(jg,cmax)
    IMPLICIT NONE
    integer, INTENT(IN)  :: jg,cmax
    ! Working real kind: at least 14 decimal digits, exponent range 200.
    INTEGER, PARAMETER :: wp = SELECTED_REAL_KIND ( 14, 200 )
    ! N   = number of array elements touched per sweep,
    ! Nit = number of times the sweep is repeated.
    INTEGER, PARAMETER :: N=10,Nit=100000000
    ! Two-component vector; the hot loop operates on whole %a sections.
    TYPE vec
      real(wp) :: a(2)
    END TYPE vec
    ! Declared with 1000 elements each, but only elements 1..N are
    ! initialized and used in this routine.
    TYPE(vec) :: dpy(1000)
    TYPE(vec) ::  pxy(1000)
    ! Accumulators: s(1) collects the dpy(i)%a(1) terms, s(2) the dpy(i)%a(2) terms.
    TYPE(vec) :: s(02)
    integer :: i,j


    ! Zero both components of the first N elements of each input array.
    DO i=1,N
        pxy(i)%a=0.0_wp
    ENDDO
    DO i=1,N
        dpy(i)%a=0.0_wp
    ENDDO

    ! Zero the four accumulator components.
    s(01)%a(1)=0.0_wp
    s(01)%a(2)=0.0_wp
    s(02)%a(1)=0.0_wp
    s(02)%a(2)=0.0_wp

    ! USE is an external routine reached through an implicit interface.
    ! NOTE(review): presumably this call makes the arrays escape so the
    ! compiler cannot fold the loop below away -- confirm.
    ! NOTE(review): the USE defined later in this file declares REAL(wp)
    ! assumed-size dummies, while TYPE(vec) arrays are passed here; confirm
    ! this type mismatch is acceptable to the compilers being compared.
    CALL USE(dpy,pxy,s)

    ! this is the hot loop
    ! Each inner iteration updates both components of s(1)%a and s(2)%a via
    ! the a(:) array sections. All operands were set to zero above and the
    ! USE visible in this file has an empty body, so the accumulated values
    ! stay zero; the loop is of interest for its run time, not its result.
    DO j=1,Nit
    DO i=1,N
      s(01)%a(:)=s(01)%a(:)+pxy(i)%a(:)*dpy(i)%a(1)
      s(02)%a(:)=s(02)%a(:)+pxy(i)%a(:)*dpy(i)%a(2)
    ENDDO
    ENDDO

    ! Hand the arrays (including the accumulated s) to USE again after the loop.
    CALL USE(dpy,pxy,s)

END SUBROUTINE

! Do-nothing sink routine taking three assumed-size real arrays.
!
! The body contains no executable statements, so the arguments are neither
! read nor modified.
!
! Arguments:
!   a, b, c : assumed-size arrays of kind wp (>= 14 digits, exponent range 200).
!
! NOTE(review): presumably the dummies are left without INTENT so that callers
! using the implicit interface must assume the arrays may be read or
! modified -- confirm before adding INTENT attributes.
subroutine use(a, b, c)
    implicit none
    integer, parameter :: wp = selected_real_kind(14, 200)
    real(kind=wp) :: a(*), b(*), c(*)
end subroutine use

! Timing driver for collocate_core_2_2_0_0.
!
! Calls the kernel nrep times between two cpu_time samples and prints the
! elapsed CPU time in seconds to unit 6 using list-directed output.
program test
    implicit none
    ! 64-bit integer kind for the (currently disabled) nanosecond timer values.
    integer, parameter :: i8 = selected_int_kind(18)
    integer, parameter :: cmax = 5
    ! Number of times the kernel is invoked inside the timed region.
    integer, parameter :: nrep = 1
    ! t1, t2 and tbest are only needed by the commented-out nanotime_ia32
    ! measurement below; they are kept so that code can be re-enabled as is.
    integer(i8) :: t1, t2, tbest
    real :: time1, time2
    integer :: i

    call cpu_time(time1)
    tbest = huge(tbest)
    do i = 1, nrep
     ! t1=nanotime_ia32()
       call collocate_core_2_2_0_0(0, cmax)
     ! t2=nanotime_ia32()
     ! if(t2-t1>0 .AND. t2-t1<tbest) tbest=t2-t1
    end do
    call cpu_time(time2)
    ! write(6,*) tbest,time2-time1
    write(6,*) time2-time1
end program test

using 

ifort -xW -O3 test.f90
gfortran -march=native -O3 -ffast-math test.f90

gfortran's inner loop asm looks like:

.L8:
        movlpd  (%rbp,%rax), %xmm0
        movsd   %xmm0, %xmm1
        mulsd   (%rbx,%rax), %xmm1
        addsd   %xmm1, %xmm2
        movsd   %xmm2, 32000(%rsp)
        mulsd   8(%rbx,%rax), %xmm0
        addsd   %xmm0, %xmm5
        movsd   %xmm5, 32008(%rsp)
        movlpd  8(%rbp,%rax), %xmm0
        movsd   %xmm0, %xmm1
        mulsd   (%rbx,%rax), %xmm1
        addsd   %xmm1, %xmm4
        movsd   %xmm4, 32016(%rsp)
        mulsd   8(%rbx,%rax), %xmm0
        addq    $16, %rax
        cmpq    $160, %rax
        addsd   %xmm0, %xmm3
        movsd   %xmm3, 32024(%rsp)
        jne     .L8

while ifort's loop looks like:

..B3.7:                         # Preds ..B3.7 ..B3.6
        movsd     collocate_core_2_2_0_0_$DPY.0.0(%rdx), %xmm2  #31.41
        movsd     8+collocate_core_2_2_0_0_$DPY.0.0(%rdx), %xmm3 #32.41
        movaps    collocate_core_2_2_0_0_$PXY.0.0(%rdx), %xmm4  #31.7
        unpcklpd  %xmm2, %xmm2                                  #31.41
        mulpd     %xmm4, %xmm2                                  #31.40
        addpd     %xmm2, %xmm1                                  #31.7
        unpcklpd  %xmm3, %xmm3                                  #32.41
        mulpd     %xmm3, %xmm4                                  #32.40
        addpd     %xmm4, %xmm0                                  #32.7
        addq      $16, %rdx                                     #30.5
        cmpq      $160, %rdx                                    #30.5
        jl        ..B3.7        # Prob 90%                      #30.5

So I guess ifort vectorizes the loop where gfortran does not.


-- 


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=31079

[Bug tree-optimization/31079] 300% difference between ifort/gfortran

Reply via email to