------- Comment #2 from jv244 at cam dot ac dot uk 2007-03-05 11:47 -------
(In reply to comment #1)
> We don't unroll non-innermost loops at the moment. I don't know if sccp can
> be taught to handle this case (and if it's worth it).
Such small loops are quite typical of some quantum chemistry integral
routines.
I'm currently experimenting with rewriting the kernel mentioned in PR 31021. If
I do this unrolling by hand I get quite a speedup on the full kernel:
hand unrolled:
# best time 5.260329
loops:
# best time 6.616413
which is quite impressive (about 20% less run time overall) because these loops
take at most 30% of the kernel's total time.
The actual code in question is:
! Looped version: accumulates coef(1:3,1:4) over every (lxa,lxb) pair.
! All loop bounds are compile-time constants or depend only on outer indices.
coef(:,:)=0.0_wp
! lxy is the second subscript of pyx and lx the second subscript of alpha;
! both are running counters that are never reset inside the nest.
lxy=0 ; lx=0
DO lxa=0,1
DO lxb=0,1
! lx takes the values 1..4, one per (lxa,lxb) pair.
lx = lx + 1
! The four partial sums are reset for each (lxa,lxb) pair.
g1=0.0_wp
g2=0.0_wp
g1k=0.0_wp
g2k=0.0_wp
! Inner trip count is (2-lxa)*(2-lxb): 4, 2, 2, 1 for the four outer pairs.
DO lya=0,1-lxa
DO lyb=0,1-lxb
! lxy therefore ends at 4+2+2+1 = 9.
lxy=lxy+1
g1=g1+pyx(1,lxy)*dpy(lyb,lya,jg)
g2=g2+pyx(1,lxy)*dpy(lyb,lya,jg2)
g1k=g1k+pyx(2,lxy)*dpy(lyb,lya,jg)
g2k=g2k+pyx(2,lxy)*dpy(lyb,lya,jg2)
ENDDO
ENDDO
! Scale the partial sums by alpha(:,lx) and add them into coef(:,1:4).
DO icoef=1,3
coef(icoef,1)=coef(icoef,1)+alpha(icoef,lx)*g1
coef(icoef,2)=coef(icoef,2)+alpha(icoef,lx)*g2
coef(icoef,3)=coef(icoef,3)+alpha(icoef,lx)*g1k
coef(icoef,4)=coef(icoef,4)+alpha(icoef,lx)*g2k
ENDDO
ENDDO
ENDDO
The hand-unrolled version just explicitly expands all loops into the loop-free
version of exactly the same statements:
! Loop-free expansion of the loop nest: the counters lx and lxy and the loop
! indices are replaced by their literal values in each of the four sections.
coef(:,:)=0.0_wp
! Section 1 -- lxa=0, lxb=0 in the loop version: lx=1, lxy=1..4 (four terms).
g1=0.0_wp
g2=0.0_wp
g1k=0.0_wp
g2k=0.0_wp
g1=g1+pyx(1,1)*dpy(0,0,jg)
g2=g2+pyx(1,1)*dpy(0,0,jg2)
g1k=g1k+pyx(2,1)*dpy(0,0,jg)
g2k=g2k+pyx(2,1)*dpy(0,0,jg2)
g1=g1+pyx(1,2)*dpy(1,0,jg)
g2=g2+pyx(1,2)*dpy(1,0,jg2)
g1k=g1k+pyx(2,2)*dpy(1,0,jg)
g2k=g2k+pyx(2,2)*dpy(1,0,jg2)
g1=g1+pyx(1,3)*dpy(0,1,jg)
g2=g2+pyx(1,3)*dpy(0,1,jg2)
g1k=g1k+pyx(2,3)*dpy(0,1,jg)
g2k=g2k+pyx(2,3)*dpy(0,1,jg2)
g1=g1+pyx(1,4)*dpy(1,1,jg)
g2=g2+pyx(1,4)*dpy(1,1,jg2)
g1k=g1k+pyx(2,4)*dpy(1,1,jg)
g2k=g2k+pyx(2,4)*dpy(1,1,jg2)
! Expanded icoef=1..3 loop for lx=1.
coef(01,01)=coef(01,01)+alpha(1,1)*g1
coef(01,02)=coef(01,02)+alpha(1,1)*g2
coef(01,03)=coef(01,03)+alpha(1,1)*g1k
coef(01,04)=coef(01,04)+alpha(1,1)*g2k
coef(02,01)=coef(02,01)+alpha(2,1)*g1
coef(02,02)=coef(02,02)+alpha(2,1)*g2
coef(02,03)=coef(02,03)+alpha(2,1)*g1k
coef(02,04)=coef(02,04)+alpha(2,1)*g2k
coef(03,01)=coef(03,01)+alpha(3,1)*g1
coef(03,02)=coef(03,02)+alpha(3,1)*g2
coef(03,03)=coef(03,03)+alpha(3,1)*g1k
coef(03,04)=coef(03,04)+alpha(3,1)*g2k
! Section 2 -- lxa=0, lxb=1: lx=2, lxy=5..6 (lyb is fixed at 0, lya=0..1).
g1=0.0_wp
g2=0.0_wp
g1k=0.0_wp
g2k=0.0_wp
g1=g1+pyx(1,5)*dpy(0,0,jg)
g2=g2+pyx(1,5)*dpy(0,0,jg2)
g1k=g1k+pyx(2,5)*dpy(0,0,jg)
g2k=g2k+pyx(2,5)*dpy(0,0,jg2)
g1=g1+pyx(1,6)*dpy(0,1,jg)
g2=g2+pyx(1,6)*dpy(0,1,jg2)
g1k=g1k+pyx(2,6)*dpy(0,1,jg)
g2k=g2k+pyx(2,6)*dpy(0,1,jg2)
! Expanded icoef=1..3 loop for lx=2.
coef(01,01)=coef(01,01)+alpha(1,2)*g1
coef(01,02)=coef(01,02)+alpha(1,2)*g2
coef(01,03)=coef(01,03)+alpha(1,2)*g1k
coef(01,04)=coef(01,04)+alpha(1,2)*g2k
coef(02,01)=coef(02,01)+alpha(2,2)*g1
coef(02,02)=coef(02,02)+alpha(2,2)*g2
coef(02,03)=coef(02,03)+alpha(2,2)*g1k
coef(02,04)=coef(02,04)+alpha(2,2)*g2k
coef(03,01)=coef(03,01)+alpha(3,2)*g1
coef(03,02)=coef(03,02)+alpha(3,2)*g2
coef(03,03)=coef(03,03)+alpha(3,2)*g1k
coef(03,04)=coef(03,04)+alpha(3,2)*g2k
! Section 3 -- lxa=1, lxb=0: lx=3, lxy=7..8 (lya is fixed at 0, lyb=0..1).
g1=0.0_wp
g2=0.0_wp
g1k=0.0_wp
g2k=0.0_wp
g1=g1+pyx(1,7)*dpy(0,0,jg)
g2=g2+pyx(1,7)*dpy(0,0,jg2)
g1k=g1k+pyx(2,7)*dpy(0,0,jg)
g2k=g2k+pyx(2,7)*dpy(0,0,jg2)
g1=g1+pyx(1,8)*dpy(1,0,jg)
g2=g2+pyx(1,8)*dpy(1,0,jg2)
g1k=g1k+pyx(2,8)*dpy(1,0,jg)
g2k=g2k+pyx(2,8)*dpy(1,0,jg2)
! Expanded icoef=1..3 loop for lx=3.
coef(01,01)=coef(01,01)+alpha(1,3)*g1
coef(01,02)=coef(01,02)+alpha(1,3)*g2
coef(01,03)=coef(01,03)+alpha(1,3)*g1k
coef(01,04)=coef(01,04)+alpha(1,3)*g2k
coef(02,01)=coef(02,01)+alpha(2,3)*g1
coef(02,02)=coef(02,02)+alpha(2,3)*g2
coef(02,03)=coef(02,03)+alpha(2,3)*g1k
coef(02,04)=coef(02,04)+alpha(2,3)*g2k
coef(03,01)=coef(03,01)+alpha(3,3)*g1
coef(03,02)=coef(03,02)+alpha(3,3)*g2
coef(03,03)=coef(03,03)+alpha(3,3)*g1k
coef(03,04)=coef(03,04)+alpha(3,3)*g2k
! Section 4 -- lxa=1, lxb=1: lx=4, lxy=9 (single term).
g1=0.0_wp
g2=0.0_wp
g1k=0.0_wp
g2k=0.0_wp
g1=g1+pyx(1,9)*dpy(0,0,jg)
g2=g2+pyx(1,9)*dpy(0,0,jg2)
g1k=g1k+pyx(2,9)*dpy(0,0,jg)
g2k=g2k+pyx(2,9)*dpy(0,0,jg2)
! Expanded icoef=1..3 loop for lx=4.
coef(01,01)=coef(01,01)+alpha(1,4)*g1
coef(01,02)=coef(01,02)+alpha(1,4)*g2
coef(01,03)=coef(01,03)+alpha(1,4)*g1k
coef(01,04)=coef(01,04)+alpha(1,4)*g2k
coef(02,01)=coef(02,01)+alpha(2,4)*g1
coef(02,02)=coef(02,02)+alpha(2,4)*g2
coef(02,03)=coef(02,03)+alpha(2,4)*g1k
coef(02,04)=coef(02,04)+alpha(2,4)*g2k
coef(03,01)=coef(03,01)+alpha(3,4)*g1
coef(03,02)=coef(03,02)+alpha(3,4)*g2
coef(03,03)=coef(03,03)+alpha(3,4)*g1k
coef(03,04)=coef(03,04)+alpha(3,4)*g2k
--
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=31040