------- Comment #1 from jv244 at cam dot ac dot uk  2007-03-08 11:11 -------
The following is (for me) an even more interesting example, as it times only
the loop that does the actual multiply / add, but also tricks my version of
ifort into generating the expected asm. Ifort is about twice as fast as
gfortran on it.

SUBROUTINE collocate_core_2_2_0_0(jg,cmax) ! benchmark kernel; jg and cmax are never referenced in the body
    IMPLICIT NONE
    integer, INTENT(IN)  :: jg,cmax
    INTEGER, PARAMETER :: wp = SELECTED_REAL_KIND ( 14, 200 ) ! real kind: >= 14 decimal digits, exponent range >= 200
    INTEGER, PARAMETER :: N=10,Nit=100000000 ! N: inner-loop trip count; Nit: times the inner loop is repeated
    TYPE vec
      real(wp) :: a(2) ! two-component vector
    END TYPE vec
    TYPE(vec) :: dpy(1000) ! the loops below touch only elements 1..N
    TYPE(vec) ::  pxy(1000) ! the loops below touch only elements 1..N
    TYPE(vec) :: s(02) ! the two accumulators updated by the kernel loop
    integer :: i,j
    ! Zero the first N entries of both operand arrays, then the two accumulators,
    ! before handing everything to USE.
    DO i=1,N
        pxy(i)%a=0.0_wp
    ENDDO
    DO i=1,N
        dpy(i)%a=0.0_wp
    ENDDO

    s(01)%a(1)=0.0_wp
    s(01)%a(2)=0.0_wp
    s(02)%a(1)=0.0_wp
    s(02)%a(2)=0.0_wp
    ! USE is external (not shown); presumably it stops the compiler from optimizing the data away -- TODO confirm
    CALL USE(dpy,pxy,s)
    ! Kernel: repeat Nit times  s(k)%a = s(k)%a + pxy(i)%a * dpy(i)%a(k), k=1,2, for i=1..N
    DO j=1,Nit
    DO i=1,N
      s(01)%a(:)=s(01)%a(:)+pxy(i)%a(:)*dpy(i)%a(1) ! both components of pxy(i) scaled by the scalar dpy(i)%a(1)
      s(02)%a(:)=s(02)%a(:)+pxy(i)%a(:)*dpy(i)%a(2) ! both components of pxy(i) scaled by the scalar dpy(i)%a(2)
    ENDDO
    ENDDO
    ! Second call passes the accumulated s (and the operand arrays) to the external USE
    CALL USE(dpy,pxy,s)

END SUBROUTINE

[EMAIL PROTECTED]:/data/vondele/extracted_collocate/test> gfortran -O2
-march=native -ftree-vectorize  -ffast-math  test.f90
[EMAIL PROTECTED]:/data/vondele/extracted_collocate/test> ./a.out
   4.288268
[EMAIL PROTECTED]:/data/vondele/extracted_collocate/test> ifort -O2 -xT test.f90
test.f90(16) : (col. 8) remark: LOOP WAS VECTORIZED.
test.f90(19) : (col. 8) remark: LOOP WAS VECTORIZED.
test.f90(31) : (col. 6) remark: LOOP WAS VECTORIZED.
test.f90(31) : (col. 6) remark: LOOP WAS VECTORIZED.
test.f90(32) : (col. 6) remark: LOOP WAS VECTORIZED.
test.f90(32) : (col. 6) remark: LOOP WAS VECTORIZED.
[EMAIL PROTECTED]:/data/vondele/extracted_collocate/test> ./a.out
   1.944121

With ifort, the inner loop asm also looks the way I was hoping it would:

.B2.7:                         # Preds ..B2.7 ..B2.6
        movddup   -16+collocate_core_2_2_0_0_$DPY.0.0(%rcx), %xmm2 #31.41
        movddup   -8+collocate_core_2_2_0_0_$DPY.0.0(%rcx), %xmm3 #32.41
        addq      $16, %rdx                                     #33.4
        movapd    collocate_core_2_2_0_0_$PXY.0.0(%rdx), %xmm4  #31.6
        mulpd     %xmm4, %xmm2                                  #31.39
        mulpd     %xmm3, %xmm4                                  #32.39
        addpd     %xmm2, %xmm1                                  #31.7
        addpd     %xmm4, %xmm0                                  #32.7
        addq      $16, %rcx                                     #33.5
        cmpq      $160, %rcx                                    #33.4
        jle       ..B2.7        # Prob 90%                      #33.4
                                # LOE rdx rcx rbx rbp r12 r13 r14 r15 eax xmm0 xmm1


-- 


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=31079

Reply via email to