On 10/26/2011 11:56 PM, Jakub Jelinek wrote:
Hi!
This patch implements gather vectorization with -mavx2, if
dr_may_alias (which apparently doesn't use tbaa :(( ) can figure out
there is no overlap with stores in the loop (if any).
The testcases show what is possible to get vectorized.
Hmmm,
I wonder whether it will work with the attached Fortran routine - it
sure would mean a boost to the 18%+ heaviest CPU user in our code.
What follows is the single CPU breakdown of the most demanding codes in
our weather forecasting code (from my 2006 GCC Summit "contribution",
which wasn't approved):
Flat profile:
% time calls name
18.34 85684 verint_ <-- That's the one attached
9.34 1380 invlo4_
7.84 85684 bixint_
6.76 133 sl2tim_
5.30 14950 condcv_
4.74 14950 radia_
4.65 14950 vcbr_
3.25 133 sldyn_
2.98 14950 phtask_
2.42 133 sldynm_
2.29 14950 phys_
2.19 14950 prevap_
--
Toon Moene - e-mail: t...@moene.org - phone: +31 346 214290 | 4 more
Saturnushof 14, 3738 XG Maartensdijk, The Netherlands | 4 44
At home: http://moene.org/~toon/; weather: http://moene.org/~hirlam/
Progress of GNU Fortran: http://gcc.gnu.org/wiki/GFortran#news
# 1 "/scratch/hirlam/hl_home/MPI/lib/src/grdy/verint.F"
# 1 "<built-in>"
# 1 "<command-line>"
# 1 "/scratch/hirlam/hl_home/MPI/lib/src/grdy/verint.F"
c Library:grdy $RCSfile$, $Revision: 7536 $
c checked in by $Author: ovignes $ at $Date: 2009-12-18 14:23:36 +0100 (Fri, 18 Dec 2009) $
c $State$, $Locker$
c $Log$
c Revision 1.3 1999/04/22 09:30:45 DagBjoerge
c MPP code
c
c Revision 1.2 1999/03/09 10:23:13 GerardCats
c Add SGI paralllellisation directives DOACROSS
c
c Revision 1.1 1996/09/06 13:12:18 GCats
c Created from grdy.apl, 1 version 2.6.1, by Gerard Cats
c
SUBROUTINE VERINT (
I KLON , KLAT , KLEV , KINT , KHALO
I , KLON1 , KLON2 , KLAT1 , KLAT2
I , KP , KQ , KR
R , PARG , PRES
R , PALFH , PBETH
R , PALFA , PBETA , PGAMA )
C
C*******************************************************************
C
C VERINT - THREE DIMENSIONAL INTERPOLATION
C
C PURPOSE:
C
C THREE DIMENSIONAL INTERPOLATION
C
C INPUT PARAMETERS:
C
C KLON NUMBER OF GRIDPOINTS IN X-DIRECTION
C KLAT NUMBER OF GRIDPOINTS IN Y-DIRECTION
C KLEV NUMBER OF VERTICAL LEVELS
C KINT TYPE OF INTERPOLATION
C = 1 - LINEAR
C = 2 - QUADRATIC
C = 3 - CUBIC
C = 4 - MIXED CUBIC/LINEAR
C KLON1 FIRST GRIDPOINT IN X-DIRECTION
C KLON2 LAST GRIDPOINT IN X-DIRECTION
C KLAT1 FIRST GRIDPOINT IN Y-DIRECTION
C KLAT2 LAST GRIDPOINT IN Y-DIRECTION
C KP ARRAY OF INDEXES FOR HORIZONTAL DISPLACEMENTS
C KQ ARRAY OF INDEXES FOR HORIZONTAL DISPLACEMENTS
C KR ARRAY OF INDEXES FOR VERTICAL DISPLACEMENTS
C PARG ARRAY OF ARGUMENTS
C PALFH ALFA HAT
C PBETH BETA HAT
C PALFA ARRAY OF WEIGHTS IN X-DIRECTION
C PBETA ARRAY OF WEIGHTS IN Y-DIRECTION
C PGAMA ARRAY OF WEIGHTS IN VERTICAL DIRECTION
C
C OUTPUT PARAMETERS:
C
C PRES INTERPOLATED FIELD
C
C HISTORY:
C
C J.E. HAUGEN 1 1992
C
C*******************************************************************
C
IMPLICIT NONE
C
INTEGER KLON , KLAT , KLEV , KINT , KHALO,
I KLON1 , KLON2 , KLAT1 , KLAT2
C
INTEGER KP(KLON,KLAT), KQ(KLON,KLAT), KR(KLON,KLAT)
REAL PARG(2-KHALO:KLON+KHALO-1,2-KHALO:KLAT+KHALO-1,KLEV) ,
R PRES(KLON,KLAT) ,
R PALFH(KLON,KLAT) , PBETH(KLON,KLAT) ,
R PALFA(KLON,KLAT,4) , PBETA(KLON,KLAT,4),
R PGAMA(KLON,KLAT,4)
C
INTEGER JX, JY, IDX, IDY, ILEV
REAL Z1MAH, Z1MBH
C
IF (KINT.EQ.1) THEN
C LINEAR INTERPOLATION
C
DO JY = KLAT1,KLAT2
DO JX = KLON1,KLON2
IDX = KP(JX,JY)
IDY = KQ(JX,JY)
ILEV = KR(JX,JY)
C
PRES(JX,JY) = PGAMA(JX,JY,1)*(
C
+ PBETA(JX,JY,1)*( PALFA(JX,JY,1)*PARG(IDX-1,IDY-1,ILEV-1)
+ + PALFA(JX,JY,2)*PARG(IDX ,IDY-1,ILEV-1) )
+ + PBETA(JX,JY,2)*( PALFA(JX,JY,1)*PARG(IDX-1,IDY ,ILEV-1)
+ + PALFA(JX,JY,2)*PARG(IDX ,IDY ,ILEV-1) ) )
C +
+ + PGAMA(JX,JY,2)*(
C +
+ PBETA(JX,JY,1)*( PALFA(JX,JY,1)*PARG(IDX-1,IDY-1,ILEV )
+ + PALFA(JX,JY,2)*PARG(IDX ,IDY-1,ILEV ) )
+ + PBETA(JX,JY,2)*( PALFA(JX,JY,1)*PARG(IDX-1,IDY ,ILEV )
+ + PALFA(JX,JY,2)*PARG(IDX ,IDY ,ILEV ) ) )
ENDDO
ENDDO
C
ELSE
+IF (KINT.EQ.2) THEN
C QUADRATIC INTERPOLATION
C
DO JY = KLAT1,KLAT2
DO JX = KLON1,KLON2
IDX = KP(JX,JY)
IDY = KQ(JX,JY)
ILEV = KR(JX,JY)
C
PRES(JX,JY) = PGAMA(JX,JY,1)*(
C
+ PBETA(JX,JY,1)*( PALFA(JX,JY,1)*PARG(IDX-1,IDY-1,ILEV-1)
+ + PALFA(JX,JY,2)*PARG(IDX ,IDY-1,ILEV-1)
+ + PALFA(JX,JY,3)*PARG(IDX+1,IDY-1,ILEV-1) )
+ + PBETA(JX,JY,2)*( PALFA(JX,JY,1)*PARG(IDX-1,IDY ,ILEV-1)
+ + PALFA(JX,JY,2)*PARG(IDX ,IDY ,ILEV-1)
+ + PALFA(JX,JY,3)*PARG(IDX+1,IDY ,ILEV-1) )
+ + PBETA(JX,JY,3)*( PALFA(JX,JY,1)*PARG(IDX-1,IDY+1,ILEV-1)
+ + PALFA(JX,JY,2)*PARG(IDX ,IDY+1,ILEV-1)
+ + PALFA(JX,JY,3)*PARG(IDX+1,IDY+1,ILEV-1) ) )
C +
+ + PGAMA(JX,JY,2)*(
C +
+ PBETA(JX,JY,1)*( PALFA(JX,JY,1)*PARG(IDX-1,IDY-1,ILEV )
+ + PALFA(JX,JY,2)*PARG(IDX ,IDY-1,ILEV )
+ + PALFA(JX,JY,3)*PARG(IDX+1,IDY-1,ILEV ) )
+ + PBETA(JX,JY,2)*( PALFA(JX,JY,1)*PARG(IDX-1,IDY ,ILEV )
+ + PALFA(JX,JY,2)*PARG(IDX ,IDY ,ILEV )
+ + PALFA(JX,JY,3)*PARG(IDX+1,IDY ,ILEV ) )
+ + PBETA(JX,JY,3)*( PALFA(JX,JY,1)*PARG(IDX-1,IDY+1,ILEV )
+ + PALFA(JX,JY,2)*PARG(IDX ,IDY+1,ILEV )
+ + PALFA(JX,JY,3)*PARG(IDX+1,IDY+1,ILEV ) ) )
C +
+ + PGAMA(JX,JY,3)*(
C +
+ PBETA(JX,JY,1)*( PALFA(JX,JY,1)*PARG(IDX-1,IDY-1,ILEV+1)
+ + PALFA(JX,JY,2)*PARG(IDX ,IDY-1,ILEV+1)
+ + PALFA(JX,JY,3)*PARG(IDX+1,IDY-1,ILEV+1) )
+ + PBETA(JX,JY,2)*( PALFA(JX,JY,1)*PARG(IDX-1,IDY ,ILEV+1)
+ + PALFA(JX,JY,2)*PARG(IDX ,IDY ,ILEV+1)
+ + PALFA(JX,JY,3)*PARG(IDX+1,IDY ,ILEV+1) )
+ + PBETA(JX,JY,3)*( PALFA(JX,JY,1)*PARG(IDX-1,IDY+1,ILEV+1)
+ + PALFA(JX,JY,2)*PARG(IDX ,IDY+1,ILEV+1)
+ + PALFA(JX,JY,3)*PARG(IDX+1,IDY+1,ILEV+1) ) )
ENDDO
ENDDO
C
ELSE
+IF (KINT.EQ.3) THEN
C CUBIC INTERPOLATION
C
DO JY = KLAT1,KLAT2
DO JX = KLON1,KLON2
IDX = KP(JX,JY)
IDY = KQ(JX,JY)
ILEV = KR(JX,JY)
C
PRES(JX,JY) = PGAMA(JX,JY,1)*(
C
+ PBETA(JX,JY,1)*( PALFA(JX,JY,1)*PARG(IDX-2,IDY-2,ILEV-2)
+ + PALFA(JX,JY,2)*PARG(IDX-1,IDY-2,ILEV-2)
+ + PALFA(JX,JY,3)*PARG(IDX ,IDY-2,ILEV-2)
+ + PALFA(JX,JY,4)*PARG(IDX+1,IDY-2,ILEV-2) )
+ + PBETA(JX,JY,2)*( PALFA(JX,JY,1)*PARG(IDX-2,IDY-1,ILEV-2)
+ + PALFA(JX,JY,2)*PARG(IDX-1,IDY-1,ILEV-2)
+ + PALFA(JX,JY,3)*PARG(IDX ,IDY-1,ILEV-2)
+ + PALFA(JX,JY,4)*PARG(IDX+1,IDY-1,ILEV-2) )
+ + PBETA(JX,JY,3)*( PALFA(JX,JY,1)*PARG(IDX-2,IDY ,ILEV-2)
+ + PALFA(JX,JY,2)*PARG(IDX-1,IDY ,ILEV-2)
+ + PALFA(JX,JY,3)*PARG(IDX ,IDY ,ILEV-2)
+ + PALFA(JX,JY,4)*PARG(IDX+1,IDY ,ILEV-2) )
+ + PBETA(JX,JY,4)*( PALFA(JX,JY,1)*PARG(IDX-2,IDY+1,ILEV-2)
+ + PALFA(JX,JY,2)*PARG(IDX-1,IDY+1,ILEV-2)
+ + PALFA(JX,JY,3)*PARG(IDX ,IDY+1,ILEV-2)
+ + PALFA(JX,JY,4)*PARG(IDX+1,IDY+1,ILEV-2) ) )
C +
+ + PGAMA(JX,JY,2)*(
C +
+ PBETA(JX,JY,1)*( PALFA(JX,JY,1)*PARG(IDX-2,IDY-2,ILEV-1)
+ + PALFA(JX,JY,2)*PARG(IDX-1,IDY-2,ILEV-1)
+ + PALFA(JX,JY,3)*PARG(IDX ,IDY-2,ILEV-1)
+ + PALFA(JX,JY,4)*PARG(IDX+1,IDY-2,ILEV-1) )
+ + PBETA(JX,JY,2)*( PALFA(JX,JY,1)*PARG(IDX-2,IDY-1,ILEV-1)
+ + PALFA(JX,JY,2)*PARG(IDX-1,IDY-1,ILEV-1)
+ + PALFA(JX,JY,3)*PARG(IDX ,IDY-1,ILEV-1)
+ + PALFA(JX,JY,4)*PARG(IDX+1,IDY-1,ILEV-1) )
+ + PBETA(JX,JY,3)*( PALFA(JX,JY,1)*PARG(IDX-2,IDY ,ILEV-1)
+ + PALFA(JX,JY,2)*PARG(IDX-1,IDY ,ILEV-1)
+ + PALFA(JX,JY,3)*PARG(IDX ,IDY ,ILEV-1)
+ + PALFA(JX,JY,4)*PARG(IDX+1,IDY ,ILEV-1) )
+ + PBETA(JX,JY,4)*( PALFA(JX,JY,1)*PARG(IDX-2,IDY+1,ILEV-1)
+ + PALFA(JX,JY,2)*PARG(IDX-1,IDY+1,ILEV-1)
+ + PALFA(JX,JY,3)*PARG(IDX ,IDY+1,ILEV-1)
+ + PALFA(JX,JY,4)*PARG(IDX+1,IDY+1,ILEV-1) ) )
C +
+ + PGAMA(JX,JY,3)*(
C +
+ PBETA(JX,JY,1)*( PALFA(JX,JY,1)*PARG(IDX-2,IDY-2,ILEV )
+ + PALFA(JX,JY,2)*PARG(IDX-1,IDY-2,ILEV )
+ + PALFA(JX,JY,3)*PARG(IDX ,IDY-2,ILEV )
+ + PALFA(JX,JY,4)*PARG(IDX+1,IDY-2,ILEV ) )
+ + PBETA(JX,JY,2)*( PALFA(JX,JY,1)*PARG(IDX-2,IDY-1,ILEV )
+ + PALFA(JX,JY,2)*PARG(IDX-1,IDY-1,ILEV )
+ + PALFA(JX,JY,3)*PARG(IDX ,IDY-1,ILEV )
+ + PALFA(JX,JY,4)*PARG(IDX+1,IDY-1,ILEV ) )
+ + PBETA(JX,JY,3)*( PALFA(JX,JY,1)*PARG(IDX-2,IDY ,ILEV )
+ + PALFA(JX,JY,2)*PARG(IDX-1,IDY ,ILEV )
+ + PALFA(JX,JY,3)*PARG(IDX ,IDY ,ILEV )
+ + PALFA(JX,JY,4)*PARG(IDX+1,IDY ,ILEV ) )
+ + PBETA(JX,JY,4)*( PALFA(JX,JY,1)*PARG(IDX-2,IDY+1,ILEV )
+ + PALFA(JX,JY,2)*PARG(IDX-1,IDY+1,ILEV )
+ + PALFA(JX,JY,3)*PARG(IDX ,IDY+1,ILEV )
+ + PALFA(JX,JY,4)*PARG(IDX+1,IDY+1,ILEV ) ) )
C +
+ + PGAMA(JX,JY,4)*(
C +
+ PBETA(JX,JY,1)*( PALFA(JX,JY,1)*PARG(IDX-2,IDY-2,ILEV+1)
+ + PALFA(JX,JY,2)*PARG(IDX-1,IDY-2,ILEV+1)
+ + PALFA(JX,JY,3)*PARG(IDX ,IDY-2,ILEV+1)
+ + PALFA(JX,JY,4)*PARG(IDX+1,IDY-2,ILEV+1) )
+ + PBETA(JX,JY,2)*( PALFA(JX,JY,1)*PARG(IDX-2,IDY-1,ILEV+1)
+ + PALFA(JX,JY,2)*PARG(IDX-1,IDY-1,ILEV+1)
+ + PALFA(JX,JY,3)*PARG(IDX ,IDY-1,ILEV+1)
+ + PALFA(JX,JY,4)*PARG(IDX+1,IDY-1,ILEV+1) )
+ + PBETA(JX,JY,3)*( PALFA(JX,JY,1)*PARG(IDX-2,IDY ,ILEV+1)
+ + PALFA(JX,JY,2)*PARG(IDX-1,IDY ,ILEV+1)
+ + PALFA(JX,JY,3)*PARG(IDX ,IDY ,ILEV+1)
+ + PALFA(JX,JY,4)*PARG(IDX+1,IDY ,ILEV+1) )
+ + PBETA(JX,JY,4)*( PALFA(JX,JY,1)*PARG(IDX-2,IDY+1,ILEV+1)
+ + PALFA(JX,JY,2)*PARG(IDX-1,IDY+1,ILEV+1)
+ + PALFA(JX,JY,3)*PARG(IDX ,IDY+1,ILEV+1)
+ + PALFA(JX,JY,4)*PARG(IDX+1,IDY+1,ILEV+1) ) )
ENDDO
ENDDO
C
ELSE
+IF (KINT.EQ.4) THEN
C MIXED CUBIC/LINEAR INTERPOLATION
C
DO JY = KLAT1,KLAT2
DO JX = KLON1,KLON2
IDX = KP(JX,JY)
IDY = KQ(JX,JY)
ILEV = KR(JX,JY)
C
Z1MAH = 1.0 - PALFH(JX,JY)
Z1MBH = 1.0 - PBETH(JX,JY)
C
PRES(JX,JY) = PGAMA(JX,JY,1)*(
C
+ PBETH(JX,JY) *( PALFH(JX,JY) *PARG(IDX-1,IDY-1,ILEV-2)
+ + Z1MAH *PARG(IDX ,IDY-1,ILEV-2) )
+ + Z1MBH *( PALFH(JX,JY) *PARG(IDX-1,IDY ,ILEV-2)
+ + Z1MAH *PARG(IDX ,IDY ,ILEV-2) ) )
C +
+ + PGAMA(JX,JY,4)*(
C +
+ PBETH(JX,JY) *( PALFH(JX,JY) *PARG(IDX-1,IDY-1,ILEV+1)
+ + Z1MAH *PARG(IDX ,IDY-1,ILEV+1) )
+ + Z1MBH *( PALFH(JX,JY) *PARG(IDX-1,IDY ,ILEV+1)
+ + Z1MAH *PARG(IDX ,IDY ,ILEV+1) ) )
C +
+ + PGAMA(JX,JY,2)*(
C +
+ PBETA(JX,JY,1)*( PALFH(JX,JY) *PARG(IDX-1,IDY-2,ILEV-1)
+ + Z1MAH *PARG(IDX ,IDY-2,ILEV-1) )
+ + PBETA(JX,JY,2)*( PALFA(JX,JY,1)*PARG(IDX-2,IDY-1,ILEV-1)
+ + PALFA(JX,JY,2)*PARG(IDX-1,IDY-1,ILEV-1)
+ + PALFA(JX,JY,3)*PARG(IDX ,IDY-1,ILEV-1)
+ + PALFA(JX,JY,4)*PARG(IDX+1,IDY-1,ILEV-1) )
+ + PBETA(JX,JY,3)*( PALFA(JX,JY,1)*PARG(IDX-2,IDY ,ILEV-1)
+ + PALFA(JX,JY,2)*PARG(IDX-1,IDY ,ILEV-1)
+ + PALFA(JX,JY,3)*PARG(IDX ,IDY ,ILEV-1)
+ + PALFA(JX,JY,4)*PARG(IDX+1,IDY ,ILEV-1) )
+ + PBETA(JX,JY,4)*( PALFH(JX,JY) *PARG(IDX-1,IDY+1,ILEV-1)
+ + Z1MAH *PARG(IDX ,IDY+1,ILEV-1) ) )
C +
+ + PGAMA(JX,JY,3)*(
C +
+ PBETA(JX,JY,1)*( PALFH(JX,JY) *PARG(IDX-1,IDY-2,ILEV )
+ + Z1MAH *PARG(IDX ,IDY-2,ILEV ) )
+ + PBETA(JX,JY,2)*( PALFA(JX,JY,1)*PARG(IDX-2,IDY-1,ILEV )
+ + PALFA(JX,JY,2)*PARG(IDX-1,IDY-1,ILEV )
+ + PALFA(JX,JY,3)*PARG(IDX ,IDY-1,ILEV )
+ + PALFA(JX,JY,4)*PARG(IDX+1,IDY-1,ILEV ) )
+ + PBETA(JX,JY,3)*( PALFA(JX,JY,1)*PARG(IDX-2,IDY ,ILEV )
+ + PALFA(JX,JY,2)*PARG(IDX-1,IDY ,ILEV )
+ + PALFA(JX,JY,3)*PARG(IDX ,IDY ,ILEV )
+ + PALFA(JX,JY,4)*PARG(IDX+1,IDY ,ILEV ) )
+ + PBETA(JX,JY,4)*( PALFH(JX,JY) *PARG(IDX-1,IDY+1,ILEV )
+ + Z1MAH *PARG(IDX ,IDY+1,ILEV ) ) )
ENDDO
ENDDO
C
ENDIF
C
RETURN
END