On 10/26/2011 11:56 PM, Jakub Jelinek wrote:

Hi!

This patch implements gather vectorization with -mavx2, if
dr_may_alias (which apparently doesn't use tbaa :(( ) can figure out
there is no overlap with stores in the loop (if any).
The testcases show what is possible to get vectorized.

Hmmm,

I wonder whether it will work with the attached Fortran routine - it sure would mean a boost to the 18%+ heaviest CPU user in our code.

What follows is the single-CPU breakdown of the most demanding routines in our weather forecasting code (from my 2006 GCC Summit "contribution", which wasn't approved):

Flat profile:
% time  calls name
 18.34  85684 verint_ <-- That's the one attached
  9.34   1380 invlo4_
  7.84  85684 bixint_
  6.76    133 sl2tim_
  5.30  14950 condcv_
  4.74  14950 radia_
  4.65  14950 vcbr_
  3.25    133 sldyn_
  2.98  14950 phtask_
  2.42    133 sldynm_
  2.29  14950 phys_
  2.19  14950 prevap_

--
Toon Moene - e-mail: t...@moene.org - phone: +31 346 214290  | 4 more
Saturnushof 14, 3738 XG  Maartensdijk, The Netherlands       | 4 44
At home: http://moene.org/~toon/; weather: http://moene.org/~hirlam/
Progress of GNU Fortran: http://gcc.gnu.org/wiki/GFortran#news
# 1 "/scratch/hirlam/hl_home/MPI/lib/src/grdy/verint.F"
# 1 "<built-in>"
# 1 "<command-line>"
# 1 "/scratch/hirlam/hl_home/MPI/lib/src/grdy/verint.F"
c Library:grdy $RCSfile$, $Revision: 7536 $
c checked in by $Author: ovignes $ at $Date: 2009-12-18 14:23:36 +0100 (Fri, 18 Dec 2009) $
c $State$, $Locker$
c $Log$
c Revision 1.3  1999/04/22 09:30:45  DagBjoerge
c MPP code
c
c Revision 1.2  1999/03/09 10:23:13  GerardCats
c Add SGI parallelisation directives DOACROSS
c
c Revision 1.1  1996/09/06 13:12:18  GCats
c Created from grdy.apl, 1 version 2.6.1, by Gerard Cats
c
      SUBROUTINE VERINT (
     I   KLON   , KLAT   , KLEV   , KINT  , KHALO
     I , KLON1  , KLON2  , KLAT1  , KLAT2
     I , KP     , KQ     , KR
     R , PARG   , PRES
     R , PALFH  , PBETH
     R , PALFA  , PBETA  , PGAMA   )
C
C*******************************************************************
C
C  VERINT - THREE DIMENSIONAL INTERPOLATION
C
C  PURPOSE:
C
C  THREE DIMENSIONAL INTERPOLATION
C
C  FOR EVERY POINT (JX,JY) IN KLON1..KLON2 , KLAT1..KLAT2 A WEIGHTED
C  SUM OF PARG VALUES AROUND (KP(JX,JY),KQ(JX,JY),KR(JX,JY)) IS
C  STORED IN PRES(JX,JY).  THE SIZE OF THE NEIGHBOURHOOD AND THE
C  WEIGHTS USED ARE SELECTED BY KINT.
C
C  INPUT PARAMETERS:
C
C  KLON      NUMBER OF GRIDPOINTS IN X-DIRECTION
C  KLAT      NUMBER OF GRIDPOINTS IN Y-DIRECTION
C  KLEV      NUMBER OF VERTICAL LEVELS
C  KINT      TYPE OF INTERPOLATION
C            = 1 - LINEAR
C            = 2 - QUADRATIC
C            = 3 - CUBIC
C            = 4 - MIXED CUBIC/LINEAR
C  KHALO     ENTERS ONLY THE DECLARED X/Y BOUNDS OF PARG
C            (PRESUMABLY THE HALO WIDTH - CONFIRM AGAINST CALLER)
C  KLON1     FIRST GRIDPOINT IN X-DIRECTION
C  KLON2     LAST  GRIDPOINT IN X-DIRECTION
C  KLAT1     FIRST GRIDPOINT IN Y-DIRECTION
C  KLAT2     LAST  GRIDPOINT IN Y-DIRECTION
C  KP        ARRAY OF INDEXES FOR HORIZONTAL DISPLACEMENTS
C            (FIRST  SUBSCRIPT INTO PARG)
C  KQ        ARRAY OF INDEXES FOR HORIZONTAL DISPLACEMENTS
C            (SECOND SUBSCRIPT INTO PARG)
C  KR        ARRAY OF INDEXES FOR VERTICAL   DISPLACEMENTS
C            (THIRD  SUBSCRIPT INTO PARG)
C  PARG      ARRAY OF ARGUMENTS
C  PALFH     ALFA HAT (REFERENCED ONLY WHEN KINT = 4)
C  PBETH     BETA HAT (REFERENCED ONLY WHEN KINT = 4)
C  PALFA     ARRAY OF WEIGHTS IN X-DIRECTION
C  PBETA     ARRAY OF WEIGHTS IN Y-DIRECTION
C  PGAMA     ARRAY OF WEIGHTS IN VERTICAL DIRECTION
C
C  OUTPUT PARAMETERS:
C
C  PRES      INTERPOLATED FIELD
C
C  NOTES:
C
C  ONLY PRES(KLON1:KLON2,KLAT1:KLAT2) IS ASSIGNED.  IF KINT IS NOT
C  1, 2, 3 OR 4 NO BRANCH IS TAKEN AND PRES IS LEFT UNTOUCHED.
C  NO CHECK IS MADE HERE THAT THE INDEXES TAKEN FROM KP, KQ, KR,
C  PLUS THE STENCIL OFFSETS, STAY INSIDE THE DECLARED BOUNDS OF
C  PARG.
C
C  HISTORY:
C
C  J.E. HAUGEN       1      1992
C
C*******************************************************************
C
      IMPLICIT NONE
C
      INTEGER KLON   , KLAT   , KLEV   , KINT   , KHALO,
     I        KLON1  , KLON2  , KLAT1  , KLAT2
C
      INTEGER   KP(KLON,KLAT), KQ(KLON,KLAT), KR(KLON,KLAT)
      REAL    PARG(2-KHALO:KLON+KHALO-1,2-KHALO:KLAT+KHALO-1,KLEV)  ,   
     R        PRES(KLON,KLAT)     ,
     R       PALFH(KLON,KLAT)     ,  PBETH(KLON,KLAT)  ,
     R       PALFA(KLON,KLAT,4)   ,  PBETA(KLON,KLAT,4),
     R       PGAMA(KLON,KLAT,4)
C
C  JX, JY          LOOP INDEXES OVER THE OUTPUT POINTS
C  IDX, IDY, ILEV  KP, KQ, KR AT (JX,JY); BASE SUBSCRIPTS INTO PARG
C  Z1MAH, Z1MBH    1.0 - PALFH , 1.0 - PBETH  (KINT = 4 ONLY)
C
      INTEGER JX, JY, IDX, IDY, ILEV
      REAL Z1MAH, Z1MBH
C
      IF (KINT.EQ.1) THEN
C  LINEAR INTERPOLATION
C
C  2 X 2 X 2 POINTS: X = IDX-1..IDX , Y = IDY-1..IDY ,
C  LEVELS ILEV-1..ILEV ; WEIGHTS PALFA/PBETA/PGAMA(JX,JY,1:2)
C
      DO JY = KLAT1,KLAT2
      DO JX = KLON1,KLON2
         IDX  = KP(JX,JY)
         IDY  = KQ(JX,JY)
         ILEV = KR(JX,JY)
C
         PRES(JX,JY) = PGAMA(JX,JY,1)*(
C
     +   PBETA(JX,JY,1)*( PALFA(JX,JY,1)*PARG(IDX-1,IDY-1,ILEV-1)
     +                  + PALFA(JX,JY,2)*PARG(IDX  ,IDY-1,ILEV-1) )
     + + PBETA(JX,JY,2)*( PALFA(JX,JY,1)*PARG(IDX-1,IDY  ,ILEV-1)
     +                  + PALFA(JX,JY,2)*PARG(IDX  ,IDY  ,ILEV-1) ) )
C    +
     +               + PGAMA(JX,JY,2)*(
C    +
     +   PBETA(JX,JY,1)*( PALFA(JX,JY,1)*PARG(IDX-1,IDY-1,ILEV  )
     +                  + PALFA(JX,JY,2)*PARG(IDX  ,IDY-1,ILEV  ) )
     + + PBETA(JX,JY,2)*( PALFA(JX,JY,1)*PARG(IDX-1,IDY  ,ILEV  )
     +                  + PALFA(JX,JY,2)*PARG(IDX  ,IDY  ,ILEV  ) ) )
      ENDDO
      ENDDO
C
C  EACH 'ELSE' BELOW IS CONTINUED BY 'IF (...) THEN' (CONTINUATION
C  MARK IN COLUMN 6), I.E. ONE ELSE IF STATEMENT; A SINGLE ENDIF
C  THEREFORE CLOSES THE WHOLE CHAIN.
C
      ELSE
     +IF (KINT.EQ.2) THEN
C  QUADRATIC INTERPOLATION
C
C  3 X 3 X 3 POINTS: X = IDX-1..IDX+1 , Y = IDY-1..IDY+1 ,
C  LEVELS ILEV-1..ILEV+1 ; WEIGHTS PALFA/PBETA/PGAMA(JX,JY,1:3)
C
      DO JY = KLAT1,KLAT2
      DO JX = KLON1,KLON2
         IDX  = KP(JX,JY)
         IDY  = KQ(JX,JY)
         ILEV = KR(JX,JY)
C
         PRES(JX,JY) = PGAMA(JX,JY,1)*(
C
     +   PBETA(JX,JY,1)*( PALFA(JX,JY,1)*PARG(IDX-1,IDY-1,ILEV-1)
     +                  + PALFA(JX,JY,2)*PARG(IDX  ,IDY-1,ILEV-1)
     +                  + PALFA(JX,JY,3)*PARG(IDX+1,IDY-1,ILEV-1) )
     + + PBETA(JX,JY,2)*( PALFA(JX,JY,1)*PARG(IDX-1,IDY  ,ILEV-1)
     +                  + PALFA(JX,JY,2)*PARG(IDX  ,IDY  ,ILEV-1)
     +                  + PALFA(JX,JY,3)*PARG(IDX+1,IDY  ,ILEV-1) )
     + + PBETA(JX,JY,3)*( PALFA(JX,JY,1)*PARG(IDX-1,IDY+1,ILEV-1)
     +                  + PALFA(JX,JY,2)*PARG(IDX  ,IDY+1,ILEV-1)
     +                  + PALFA(JX,JY,3)*PARG(IDX+1,IDY+1,ILEV-1) ) )
C    +
     +               + PGAMA(JX,JY,2)*(
C    +
     +   PBETA(JX,JY,1)*( PALFA(JX,JY,1)*PARG(IDX-1,IDY-1,ILEV  )
     +                  + PALFA(JX,JY,2)*PARG(IDX  ,IDY-1,ILEV  )
     +                  + PALFA(JX,JY,3)*PARG(IDX+1,IDY-1,ILEV  ) )
     + + PBETA(JX,JY,2)*( PALFA(JX,JY,1)*PARG(IDX-1,IDY  ,ILEV  )
     +                  + PALFA(JX,JY,2)*PARG(IDX  ,IDY  ,ILEV  )
     +                  + PALFA(JX,JY,3)*PARG(IDX+1,IDY  ,ILEV  ) )
     + + PBETA(JX,JY,3)*( PALFA(JX,JY,1)*PARG(IDX-1,IDY+1,ILEV  )
     +                  + PALFA(JX,JY,2)*PARG(IDX  ,IDY+1,ILEV  )
     +                  + PALFA(JX,JY,3)*PARG(IDX+1,IDY+1,ILEV  ) ) )
C    +
     +               + PGAMA(JX,JY,3)*(
C    +
     +   PBETA(JX,JY,1)*( PALFA(JX,JY,1)*PARG(IDX-1,IDY-1,ILEV+1)
     +                  + PALFA(JX,JY,2)*PARG(IDX  ,IDY-1,ILEV+1)
     +                  + PALFA(JX,JY,3)*PARG(IDX+1,IDY-1,ILEV+1) )
     + + PBETA(JX,JY,2)*( PALFA(JX,JY,1)*PARG(IDX-1,IDY  ,ILEV+1)
     +                  + PALFA(JX,JY,2)*PARG(IDX  ,IDY  ,ILEV+1)
     +                  + PALFA(JX,JY,3)*PARG(IDX+1,IDY  ,ILEV+1) )
     + + PBETA(JX,JY,3)*( PALFA(JX,JY,1)*PARG(IDX-1,IDY+1,ILEV+1)
     +                  + PALFA(JX,JY,2)*PARG(IDX  ,IDY+1,ILEV+1)
     +                  + PALFA(JX,JY,3)*PARG(IDX+1,IDY+1,ILEV+1) ) )
      ENDDO
      ENDDO
C
      ELSE
     +IF (KINT.EQ.3) THEN
C  CUBIC INTERPOLATION
C
C  4 X 4 X 4 POINTS: X = IDX-2..IDX+1 , Y = IDY-2..IDY+1 ,
C  LEVELS ILEV-2..ILEV+1 ; WEIGHTS PALFA/PBETA/PGAMA(JX,JY,1:4)
C
      DO JY = KLAT1,KLAT2
      DO JX = KLON1,KLON2
         IDX  = KP(JX,JY)
         IDY  = KQ(JX,JY)
         ILEV = KR(JX,JY)
C
         PRES(JX,JY) = PGAMA(JX,JY,1)*(
C
     +   PBETA(JX,JY,1)*( PALFA(JX,JY,1)*PARG(IDX-2,IDY-2,ILEV-2)
     +                  + PALFA(JX,JY,2)*PARG(IDX-1,IDY-2,ILEV-2)
     +                  + PALFA(JX,JY,3)*PARG(IDX  ,IDY-2,ILEV-2)
     +                  + PALFA(JX,JY,4)*PARG(IDX+1,IDY-2,ILEV-2) )
     + + PBETA(JX,JY,2)*( PALFA(JX,JY,1)*PARG(IDX-2,IDY-1,ILEV-2)
     +                  + PALFA(JX,JY,2)*PARG(IDX-1,IDY-1,ILEV-2)
     +                  + PALFA(JX,JY,3)*PARG(IDX  ,IDY-1,ILEV-2)
     +                  + PALFA(JX,JY,4)*PARG(IDX+1,IDY-1,ILEV-2) ) 
     + + PBETA(JX,JY,3)*( PALFA(JX,JY,1)*PARG(IDX-2,IDY  ,ILEV-2)
     +                  + PALFA(JX,JY,2)*PARG(IDX-1,IDY  ,ILEV-2)
     +                  + PALFA(JX,JY,3)*PARG(IDX  ,IDY  ,ILEV-2)
     +                  + PALFA(JX,JY,4)*PARG(IDX+1,IDY  ,ILEV-2) )
     + + PBETA(JX,JY,4)*( PALFA(JX,JY,1)*PARG(IDX-2,IDY+1,ILEV-2)
     +                  + PALFA(JX,JY,2)*PARG(IDX-1,IDY+1,ILEV-2)
     +                  + PALFA(JX,JY,3)*PARG(IDX  ,IDY+1,ILEV-2)
     +                  + PALFA(JX,JY,4)*PARG(IDX+1,IDY+1,ILEV-2) ) )
C    +
     +               + PGAMA(JX,JY,2)*(
C    +
     +   PBETA(JX,JY,1)*( PALFA(JX,JY,1)*PARG(IDX-2,IDY-2,ILEV-1)
     +                  + PALFA(JX,JY,2)*PARG(IDX-1,IDY-2,ILEV-1)
     +                  + PALFA(JX,JY,3)*PARG(IDX  ,IDY-2,ILEV-1)
     +                  + PALFA(JX,JY,4)*PARG(IDX+1,IDY-2,ILEV-1) )
     + + PBETA(JX,JY,2)*( PALFA(JX,JY,1)*PARG(IDX-2,IDY-1,ILEV-1)
     +                  + PALFA(JX,JY,2)*PARG(IDX-1,IDY-1,ILEV-1)
     +                  + PALFA(JX,JY,3)*PARG(IDX  ,IDY-1,ILEV-1)
     +                  + PALFA(JX,JY,4)*PARG(IDX+1,IDY-1,ILEV-1) ) 
     + + PBETA(JX,JY,3)*( PALFA(JX,JY,1)*PARG(IDX-2,IDY  ,ILEV-1)
     +                  + PALFA(JX,JY,2)*PARG(IDX-1,IDY  ,ILEV-1)
     +                  + PALFA(JX,JY,3)*PARG(IDX  ,IDY  ,ILEV-1)
     +                  + PALFA(JX,JY,4)*PARG(IDX+1,IDY  ,ILEV-1) )
     + + PBETA(JX,JY,4)*( PALFA(JX,JY,1)*PARG(IDX-2,IDY+1,ILEV-1)
     +                  + PALFA(JX,JY,2)*PARG(IDX-1,IDY+1,ILEV-1)
     +                  + PALFA(JX,JY,3)*PARG(IDX  ,IDY+1,ILEV-1)
     +                  + PALFA(JX,JY,4)*PARG(IDX+1,IDY+1,ILEV-1) ) )
C    +
     +               + PGAMA(JX,JY,3)*(
C    +
     +   PBETA(JX,JY,1)*( PALFA(JX,JY,1)*PARG(IDX-2,IDY-2,ILEV  )
     +                  + PALFA(JX,JY,2)*PARG(IDX-1,IDY-2,ILEV  )
     +                  + PALFA(JX,JY,3)*PARG(IDX  ,IDY-2,ILEV  )
     +                  + PALFA(JX,JY,4)*PARG(IDX+1,IDY-2,ILEV  ) )
     + + PBETA(JX,JY,2)*( PALFA(JX,JY,1)*PARG(IDX-2,IDY-1,ILEV  )
     +                  + PALFA(JX,JY,2)*PARG(IDX-1,IDY-1,ILEV  )
     +                  + PALFA(JX,JY,3)*PARG(IDX  ,IDY-1,ILEV  )
     +                  + PALFA(JX,JY,4)*PARG(IDX+1,IDY-1,ILEV  ) ) 
     + + PBETA(JX,JY,3)*( PALFA(JX,JY,1)*PARG(IDX-2,IDY  ,ILEV  )
     +                  + PALFA(JX,JY,2)*PARG(IDX-1,IDY  ,ILEV  )
     +                  + PALFA(JX,JY,3)*PARG(IDX  ,IDY  ,ILEV  )
     +                  + PALFA(JX,JY,4)*PARG(IDX+1,IDY  ,ILEV  ) )
     + + PBETA(JX,JY,4)*( PALFA(JX,JY,1)*PARG(IDX-2,IDY+1,ILEV  )
     +                  + PALFA(JX,JY,2)*PARG(IDX-1,IDY+1,ILEV  )
     +                  + PALFA(JX,JY,3)*PARG(IDX  ,IDY+1,ILEV  )
     +                  + PALFA(JX,JY,4)*PARG(IDX+1,IDY+1,ILEV  ) ) )
C    +
     +               + PGAMA(JX,JY,4)*(
C    +
     +   PBETA(JX,JY,1)*( PALFA(JX,JY,1)*PARG(IDX-2,IDY-2,ILEV+1)
     +                  + PALFA(JX,JY,2)*PARG(IDX-1,IDY-2,ILEV+1)
     +                  + PALFA(JX,JY,3)*PARG(IDX  ,IDY-2,ILEV+1)
     +                  + PALFA(JX,JY,4)*PARG(IDX+1,IDY-2,ILEV+1) )
     + + PBETA(JX,JY,2)*( PALFA(JX,JY,1)*PARG(IDX-2,IDY-1,ILEV+1)
     +                  + PALFA(JX,JY,2)*PARG(IDX-1,IDY-1,ILEV+1)
     +                  + PALFA(JX,JY,3)*PARG(IDX  ,IDY-1,ILEV+1)
     +                  + PALFA(JX,JY,4)*PARG(IDX+1,IDY-1,ILEV+1) )
     + + PBETA(JX,JY,3)*( PALFA(JX,JY,1)*PARG(IDX-2,IDY  ,ILEV+1)
     +                  + PALFA(JX,JY,2)*PARG(IDX-1,IDY  ,ILEV+1)
     +                  + PALFA(JX,JY,3)*PARG(IDX  ,IDY  ,ILEV+1)
     +                  + PALFA(JX,JY,4)*PARG(IDX+1,IDY  ,ILEV+1) )
     + + PBETA(JX,JY,4)*( PALFA(JX,JY,1)*PARG(IDX-2,IDY+1,ILEV+1)
     +                  + PALFA(JX,JY,2)*PARG(IDX-1,IDY+1,ILEV+1)
     +                  + PALFA(JX,JY,3)*PARG(IDX  ,IDY+1,ILEV+1)
     +                  + PALFA(JX,JY,4)*PARG(IDX+1,IDY+1,ILEV+1) ) )
      ENDDO
      ENDDO
C
      ELSE
     +IF (KINT.EQ.4) THEN
C  MIXED CUBIC/LINEAR INTERPOLATION
C
C  LEVELS ILEV-2 AND ILEV+1 (PGAMA 1 AND 4, SUMMED FIRST):
C     2 X 2 POINTS, X = IDX-1..IDX , Y = IDY-1..IDY ,
C     WEIGHTS PALFH/Z1MAH IN X AND PBETH/Z1MBH IN Y
C  LEVELS ILEV-1 AND ILEV   (PGAMA 2 AND 3):
C     ROWS IDY-2 AND IDY+1 (PBETA 1 AND 4): 2 POINTS IN X,
C        X = IDX-1..IDX , WEIGHTS PALFH/Z1MAH
C     ROWS IDY-1 AND IDY   (PBETA 2 AND 3): 4 POINTS IN X,
C        X = IDX-2..IDX+1 , WEIGHTS PALFA(JX,JY,1:4)
C
      DO JY = KLAT1,KLAT2
      DO JX = KLON1,KLON2
         IDX  = KP(JX,JY)
         IDY  = KQ(JX,JY)
         ILEV = KR(JX,JY)
C
C  COMPLEMENTS OF THE "HAT" WEIGHTS
         Z1MAH = 1.0 - PALFH(JX,JY)
         Z1MBH = 1.0 - PBETH(JX,JY)
C
         PRES(JX,JY) = PGAMA(JX,JY,1)*(
C
     +   PBETH(JX,JY)  *( PALFH(JX,JY)  *PARG(IDX-1,IDY-1,ILEV-2)
     +                  + Z1MAH         *PARG(IDX  ,IDY-1,ILEV-2) )
     + + Z1MBH         *( PALFH(JX,JY)  *PARG(IDX-1,IDY  ,ILEV-2)
     +                  + Z1MAH         *PARG(IDX  ,IDY  ,ILEV-2) ) )
C    +
     +               + PGAMA(JX,JY,4)*(
C    +
     +   PBETH(JX,JY)  *( PALFH(JX,JY)  *PARG(IDX-1,IDY-1,ILEV+1)
     +                  + Z1MAH         *PARG(IDX  ,IDY-1,ILEV+1) )
     + + Z1MBH         *( PALFH(JX,JY)  *PARG(IDX-1,IDY  ,ILEV+1)
     +                  + Z1MAH         *PARG(IDX  ,IDY  ,ILEV+1) ) )
C    +
     +               + PGAMA(JX,JY,2)*(
C    +
     +   PBETA(JX,JY,1)*( PALFH(JX,JY)  *PARG(IDX-1,IDY-2,ILEV-1)
     +                  + Z1MAH         *PARG(IDX  ,IDY-2,ILEV-1) )
     + + PBETA(JX,JY,2)*( PALFA(JX,JY,1)*PARG(IDX-2,IDY-1,ILEV-1)
     +                  + PALFA(JX,JY,2)*PARG(IDX-1,IDY-1,ILEV-1)
     +                  + PALFA(JX,JY,3)*PARG(IDX  ,IDY-1,ILEV-1)
     +                  + PALFA(JX,JY,4)*PARG(IDX+1,IDY-1,ILEV-1) )
     + + PBETA(JX,JY,3)*( PALFA(JX,JY,1)*PARG(IDX-2,IDY  ,ILEV-1)
     +                  + PALFA(JX,JY,2)*PARG(IDX-1,IDY  ,ILEV-1)
     +                  + PALFA(JX,JY,3)*PARG(IDX  ,IDY  ,ILEV-1)
     +                  + PALFA(JX,JY,4)*PARG(IDX+1,IDY  ,ILEV-1) )
     + + PBETA(JX,JY,4)*( PALFH(JX,JY)  *PARG(IDX-1,IDY+1,ILEV-1)
     +                  + Z1MAH         *PARG(IDX  ,IDY+1,ILEV-1) ) )
C    +
     +               + PGAMA(JX,JY,3)*(
C    +
     +   PBETA(JX,JY,1)*( PALFH(JX,JY)  *PARG(IDX-1,IDY-2,ILEV  )
     +                  + Z1MAH         *PARG(IDX  ,IDY-2,ILEV  ) )
     + + PBETA(JX,JY,2)*( PALFA(JX,JY,1)*PARG(IDX-2,IDY-1,ILEV  )
     +                  + PALFA(JX,JY,2)*PARG(IDX-1,IDY-1,ILEV  )
     +                  + PALFA(JX,JY,3)*PARG(IDX  ,IDY-1,ILEV  )
     +                  + PALFA(JX,JY,4)*PARG(IDX+1,IDY-1,ILEV  ) ) 
     + + PBETA(JX,JY,3)*( PALFA(JX,JY,1)*PARG(IDX-2,IDY  ,ILEV  )
     +                  + PALFA(JX,JY,2)*PARG(IDX-1,IDY  ,ILEV  )
     +                  + PALFA(JX,JY,3)*PARG(IDX  ,IDY  ,ILEV  )
     +                  + PALFA(JX,JY,4)*PARG(IDX+1,IDY  ,ILEV  ) )
     + + PBETA(JX,JY,4)*( PALFH(JX,JY)  *PARG(IDX-1,IDY+1,ILEV  )
     +                  + Z1MAH         *PARG(IDX  ,IDY+1,ILEV  ) ) )
      ENDDO
      ENDDO
C
C  ANY OTHER KINT VALUE FALLS THROUGH HERE WITHOUT TOUCHING PRES
      ENDIF
C
      RETURN
      END

Reply via email to