On Wed, Apr 22, 2015 at 6:59 PM, Toon Moene wrote:
> Why is loop fusion important, especially in Fortran 90 and later programs ?
>
> Because without it, every array assignment is a single loop nest, isolated
> from related, same-shape assignments.

Why is this a bad thing? When you're talking about single-node
machines, separate loops is probably faster if your arrays are large
enough: better cache locality and easier to vectorize.


----- 8< -----
$ cat test.f90; gfortran.exe -O2 test.f90 ; ./a.exe
        PROGRAM TEST_FUSION
        IMPLICIT NONE
        REAL, PARAMETER :: TSTEP = 0.01

        CALL ONE_TEST(100)
        CALL ONE_TEST(200)
        CALL ONE_TEST(400)
        STOP


        CONTAINS

        SUBROUTINE ONE_TEST(N)
        IMPLICIT NONE
        INTEGER :: N, I
        REAL, DIMENSION(:,:,:), ALLOCATABLE :: T, U, V, Q
        REAL, DIMENSION(:,:,:), ALLOCATABLE :: DTDT, DUDT, DVDT, DQDT
        REAL :: START, FINISH

        PRINT '("Test with N=",I3)', N
        ALLOCATE (T(N,N,N), U(N,N,N), V(N,N,N), Q(N,N,N))
        ALLOCATE (DTDT(N,N,N), DUDT(N,N,N), DVDT(N,N,N), DQDT(N,N,N))
        !
        CALL CPU_TIME(START)
        DO I=1,100
          CALL UPDATE_DT_1(T, U, V, Q, DTDT, DUDT, DVDT, DQDT, N, N, N, TSTEP)
        END DO
        CALL CPU_TIME(FINISH)
        PRINT '("F90-style array assignments -
time=",f6.3,"seconds.")', (FINISH - START)
        !
        CALL CPU_TIME(START)
        DO I=1,100
          CALL UPDATE_DT_2(T, U, V, Q, DTDT, DUDT, DVDT, DQDT, N, N, N, TSTEP)
        END DO
        CALL CPU_TIME(FINISH)
        PRINT '("F77-style loopy assignments -
time=",f6.3,"seconds.")', (FINISH - START)
        !
        !
        DEALLOCATE (T, U, V, Q)
        DEALLOCATE (DTDT, DUDT, DVDT, DQDT)
        PRINT *
        END SUBROUTINE ONE_TEST


        SUBROUTINE UPDATE_DT_1(T, U, V, Q, DTDT, DUDT, DVDT, DQDT, &
                               & NLON, NLAT, NLEV, TSTEP)
        IMPLICIT NONE
        INTEGER :: NLON, NLAT, NLEV
        REAL :: TSTEP
        REAL, DIMENSION(NLON, NLAT, NLEV) :: T, U, V, Q, DTDT, DUDT, DVDT, DQDT
        T = T + TSTEP*DTDT ! Update temperature
        U = U + TSTEP*DUDT ! Update east-west wind component
        V = V + TSTEP*DVDT ! Update north-south wind component
        Q = Q + TSTEP*DQDT ! Update specific humidity
        END SUBROUTINE UPDATE_DT_1

        SUBROUTINE UPDATE_DT_2(T, U, V, Q, DTDT, DUDT, DVDT, DQDT, &
                               & NLON, NLAT, NLEV, TSTEP)
        IMPLICIT NONE
        INTEGER :: NLON, NLAT, NLEV
        REAL :: TSTEP
        REAL, DIMENSION(NLON, NLAT, NLEV) :: T, U, V, Q, DTDT, DUDT, DVDT, DQDT
        INTEGER :: JLON, JLAT, JLEV
        DO JLEV = 1, NLEV
          DO JLAT = 1, NLAT
            DO JLON = 1, NLON
              T(JLON, JLAT, JLEV) = T(JLON, JLAT, JLEV) +
TSTEP*DTDT(JLON, JLAT, JLEV)
              U(JLON, JLAT, JLEV) = U(JLON, JLAT, JLEV) +
TSTEP*DUDT(JLON, JLAT, JLEV)
              V(JLON, JLAT, JLEV) = V(JLON, JLAT, JLEV) +
TSTEP*DVDT(JLON, JLAT, JLEV)
              Q(JLON, JLAT, JLEV) = Q(JLON, JLAT, JLEV) +
TSTEP*DQDT(JLON, JLAT, JLEV)
            ENDDO
          ENDDO
        ENDDO
        END SUBROUTINE UPDATE_DT_2

        END PROGRAM
Test with N=100
F90-style array assignments - time= 0.390seconds.
F77-style loopy assignments - time= 0.578seconds.

Test with N=200
F90-style array assignments - time= 2.969seconds.
F77-style loopy assignments - time= 4.765seconds.

Test with N=400
F90-style array assignments - time=24.344seconds.
F77-style loopy assignments - time=38.672seconds.


$
----- 8< -----

Loop fusion is only a win if you iterate through the same array
variables. Writing such a pass is not so hard for the simple, most
common cases. The front end could do some of the rewriting from
F90-style array assignments to fused loops if it notices consecutive
array assignments/operations on the same variables.

Ciao!
Steven

Reply via email to