Hello world, attached are a few more speedups for special eoshift cases. This time, nothing fancy, just use memcpy for copying in the contiguous case.
I am still looking at eoshift2 (scalar shift, array boundary) to see if it would be possible to duplicate the speed gains for eoshift0 (scalar shift, scalar boundary), but it won't hurt to do this first. At least the shift along dimension 1 should be faster by about a factor of two. I have also added a few test cases which test eoshift in all the variants touched by this patch. Regression-testing as I write this. I don't expect anything bad (because I tested all test cases containing *eoshift*). OK for trunk if this passes? Regards Thomas 2017-06-03 Thomas Koenig <tkoe...@gcc.gnu.org> * intrinsics/eoshift2.c (eoshift2): Use memcpy for innermost copy where possible. * m4/eoshift1.m4 (eoshift1): Likewise. * m4/eoshift3.m4 (eoshift3): Likewise. * generated/eoshift1_16.c: Regenerated. * generated/eoshift1_4.c: Regenerated. * generated/eoshift1_8.c: Regenerated. * generated/eoshift3_16.c: Regenerated. * generated/eoshift3_4.c: Regenerated. * generated/eoshift3_8.c: Regenerated. 2017-06-03 Thomas Koenig <tkoe...@gcc.gnu.org> * gfortran.dg/eoshift_4.f90: New test. * gfortran.dg/eoshift_5.f90: New test. * gfortran.dg/eoshift_6.f90: New test.
Index: intrinsics/eoshift2.c =================================================================== --- intrinsics/eoshift2.c (Revision 249936) +++ intrinsics/eoshift2.c (Arbeitskopie) @@ -181,12 +181,23 @@ eoshift2 (gfc_array_char *ret, const gfc_array_cha src = sptr; dest = &rptr[-shift * roffset]; } - for (n = 0; n < len; n++) - { - memcpy (dest, src, size); - dest += roffset; - src += soffset; - } + + /* If the elements are contiguous, perform a single block move. */ + if (soffset == size && roffset == size) + { + size_t chunk = size * len; + memcpy (dest, src, chunk); + dest += chunk; + } + else + { + for (n = 0; n < len; n++) + { + memcpy (dest, src, size); + dest += roffset; + src += soffset; + } + } if (shift >= 0) { n = shift; Index: m4/eoshift1.m4 =================================================================== --- m4/eoshift1.m4 (Revision 249936) +++ m4/eoshift1.m4 (Arbeitskopie) @@ -184,12 +184,23 @@ eoshift1 (gfc_array_char * const restrict ret, src = sptr; dest = &rptr[delta * roffset]; } - for (n = 0; n < len - delta; n++) - { - memcpy (dest, src, size); - dest += roffset; - src += soffset; - } + + /* If the elements are contiguous, perform a single block move. */ + if (soffset == size && roffset == size) + { + size_t chunk = size * (len - delta); + memcpy (dest, src, chunk); + dest += chunk; + } + else + { + for (n = 0; n < len - delta; n++) + { + memcpy (dest, src, size); + dest += roffset; + src += soffset; + } + } if (sh < 0) dest = rptr; n = delta; Index: m4/eoshift3.m4 =================================================================== --- m4/eoshift3.m4 (Revision 249936) +++ m4/eoshift3.m4 (Arbeitskopie) @@ -199,12 +199,24 @@ eoshift3 (gfc_array_char * const restrict ret, src = sptr; dest = &rptr[delta * roffset]; } - for (n = 0; n < len - delta; n++) - { - memcpy (dest, src, size); - dest += roffset; - src += soffset; - } + + /* If the elements are contiguous, perform a single block move. */ + if (soffset == size && roffset == size) + { + size_t chunk = size * (len - delta); + memcpy (dest, src, chunk); + dest += chunk; + } + else + { + for (n = 0; n < len - delta; n++) + { + memcpy (dest, src, size); + dest += roffset; + src += soffset; + } + } + if (sh < 0) dest = rptr; n = delta; Index: generated/eoshift1_16.c =================================================================== --- generated/eoshift1_16.c (Revision 249936) +++ generated/eoshift1_16.c (Arbeitskopie) @@ -183,12 +183,23 @@ eoshift1 (gfc_array_char * const restrict ret, src = sptr; dest = &rptr[delta * roffset]; } - for (n = 0; n < len - delta; n++) - { - memcpy (dest, src, size); - dest += roffset; - src += soffset; - } + + /* If the elements are contiguous, perform a single block move. */ + if (soffset == size && roffset == size) + { + size_t chunk = size * (len - delta); + memcpy (dest, src, chunk); + dest += chunk; + } + else + { + for (n = 0; n < len - delta; n++) + { + memcpy (dest, src, size); + dest += roffset; + src += soffset; + } + } if (sh < 0) dest = rptr; n = delta; Index: generated/eoshift1_4.c =================================================================== --- generated/eoshift1_4.c (Revision 249936) +++ generated/eoshift1_4.c (Arbeitskopie) @@ -183,12 +183,23 @@ eoshift1 (gfc_array_char * const restrict ret, src = sptr; dest = &rptr[delta * roffset]; } - for (n = 0; n < len - delta; n++) - { - memcpy (dest, src, size); - dest += roffset; - src += soffset; - } + + /* If the elements are contiguous, perform a single block move. */ + if (soffset == size && roffset == size) + { + size_t chunk = size * (len - delta); + memcpy (dest, src, chunk); + dest += chunk; + } + else + { + for (n = 0; n < len - delta; n++) + { + memcpy (dest, src, size); + dest += roffset; + src += soffset; + } + } if (sh < 0) dest = rptr; n = delta; Index: generated/eoshift1_8.c =================================================================== --- generated/eoshift1_8.c (Revision 249936) +++ generated/eoshift1_8.c (Arbeitskopie) @@ -183,12 +183,23 @@ eoshift1 (gfc_array_char * const restrict ret, src = sptr; dest = &rptr[delta * roffset]; } - for (n = 0; n < len - delta; n++) - { - memcpy (dest, src, size); - dest += roffset; - src += soffset; - } + + /* If the elements are contiguous, perform a single block move. */ + if (soffset == size && roffset == size) + { + size_t chunk = size * (len - delta); + memcpy (dest, src, chunk); + dest += chunk; + } + else + { + for (n = 0; n < len - delta; n++) + { + memcpy (dest, src, size); + dest += roffset; + src += soffset; + } + } if (sh < 0) dest = rptr; n = delta; Index: generated/eoshift3_16.c =================================================================== --- generated/eoshift3_16.c (Revision 249936) +++ generated/eoshift3_16.c (Arbeitskopie) @@ -198,12 +198,24 @@ eoshift3 (gfc_array_char * const restrict ret, src = sptr; dest = &rptr[delta * roffset]; } - for (n = 0; n < len - delta; n++) - { - memcpy (dest, src, size); - dest += roffset; - src += soffset; - } + + /* If the elements are contiguous, perform a single block move. */ + if (soffset == size && roffset == size) + { + size_t chunk = size * (len - delta); + memcpy (dest, src, chunk); + dest += chunk; + } + else + { + for (n = 0; n < len - delta; n++) + { + memcpy (dest, src, size); + dest += roffset; + src += soffset; + } + } + if (sh < 0) dest = rptr; n = delta; Index: generated/eoshift3_4.c =================================================================== --- generated/eoshift3_4.c (Revision 249936) +++ generated/eoshift3_4.c (Arbeitskopie) @@ -198,12 +198,24 @@ eoshift3 (gfc_array_char * const restrict ret, src = sptr; dest = &rptr[delta * roffset]; } - for (n = 0; n < len - delta; n++) - { - memcpy (dest, src, size); - dest += roffset; - src += soffset; - } + + /* If the elements are contiguous, perform a single block move. */ + if (soffset == size && roffset == size) + { + size_t chunk = size * (len - delta); + memcpy (dest, src, chunk); + dest += chunk; + } + else + { + for (n = 0; n < len - delta; n++) + { + memcpy (dest, src, size); + dest += roffset; + src += soffset; + } + } + if (sh < 0) dest = rptr; n = delta; Index: generated/eoshift3_8.c =================================================================== --- generated/eoshift3_8.c (Revision 249936) +++ generated/eoshift3_8.c (Arbeitskopie) @@ -198,12 +198,24 @@ eoshift3 (gfc_array_char * const restrict ret, src = sptr; dest = &rptr[delta * roffset]; } - for (n = 0; n < len - delta; n++) - { - memcpy (dest, src, size); - dest += roffset; - src += soffset; - } + + /* If the elements are contiguous, perform a single block move. */ + if (soffset == size && roffset == size) + { + size_t chunk = size * (len - delta); + memcpy (dest, src, chunk); + dest += chunk; + } + else + { + for (n = 0; n < len - delta; n++) + { + memcpy (dest, src, size); + dest += roffset; + src += soffset; + } + } + if (sh < 0) dest = rptr; n = delta;
! { dg-do run } ! Check that eoshift works for three-dimensional arrays. module x implicit none contains subroutine eoshift_2 (array, shift, boundary, dim, res) real, dimension(:,:,:), intent(in) :: array real, dimension(:,:,:), intent(out) :: res integer, value :: shift real, optional, dimension(:,:), intent(in) :: boundary integer, optional, intent(in) :: dim integer :: s1, s2, s3 integer :: n1, n2, n3 real :: b integer :: d if (present(dim)) then d = dim else d = 1 end if n1 = size(array,1) n2 = size(array,2) n3 = size(array,3) select case(dim) case(1) if (shift > 0) then shift = min(shift, n1) do s3=1,n3 do s2=1,n2 b = boundary(s2,s3) do s1= 1, n1 - shift res(s1,s2,s3) = array(s1+shift,s2,s3) end do do s1 = n1 - shift + 1,n1 res(s1,s2,s3) = b end do end do end do else shift = max(shift, -n1) do s3=1,n3 do s2=1,n2 b = boundary(s2,s3) do s1=1,-shift res(s1,s2,s3) = b end do do s1= 1-shift,n1 res(s1,s2,s3) = array(s1+shift,s2,s3) end do end do end do end if case(2) if (shift > 0) then shift = min(shift, n2) do s3=1,n3 do s2=1, n2 - shift do s1=1,n1 res(s1,s2,s3) = array(s1,s2+shift,s3) end do end do do s2=n2 - shift + 1, n2 do s1=1,n1 b = boundary(s1,s3) res(s1,s2,s3) = b end do end do end do else shift = max(shift, -n2) do s3=1,n3 do s2=1,-shift do s1=1,n1 b = boundary(s1,s3) res(s1,s2,s3) = b end do end do do s2=1-shift,n2 do s1=1,n1 res(s1,s2,s3) = array(s1,s2+shift,s3) end do end do end do end if case(3) if (shift > 0) then shift = min(shift, n3) do s3=1,n3 - shift do s2=1, n2 do s1=1,n1 res(s1,s2,s3) = array(s1,s2,s3+shift) end do end do end do do s3=n3 - shift + 1, n3 do s2=1, n2 do s1=1,n1 b = boundary(s1,s2) res(s1,s2,s3) = b end do end do end do else shift = max(shift, -n3) do s3=1,-shift do s2=1,n2 do s1=1,n1 b = boundary(s1,s2) res(s1,s2,s3) = b end do end do end do do s3=1-shift,n3 do s2=1,n2 do s1=1,n1 res(s1,s2,s3) = array(s1,s2,s3+shift) end do end do end do end if case default stop "Illegal dim" end select end subroutine eoshift_2 end module x program main use x implicit none integer, parameter :: n1=20,n2=30,n3=40 real, dimension(n1,n2,n3) :: a,b,c real, dimension(2*n1,n2,n3) :: a2,c2 integer :: dim, shift, shift_lim real, dimension(n2,n3), target :: b1 real, dimension(n1,n3), target :: b2 real, dimension(n1,n2), target :: b3 real, dimension(:,:), pointer :: bp call random_number(a) call random_number (b1) call random_number (b2) call random_number (b3) do dim=1,3 if (dim == 1) then shift_lim = n1 + 1 bp => b1 else if (dim == 2) then shift_lim = n2 + 1 bp => b2 else shift_lim = n3 + 1 bp => b3 end if do shift=-shift_lim, shift_lim b = eoshift(a,shift,dim=dim, boundary=bp) call eoshift_2 (a, shift=shift, dim=dim, boundary=bp, res=c) if (any (b /= c)) then print *,"dim = ", dim, "shift = ", shift print *,b print *,c call abort end if a2 = 42. a2(1:2*n1:2,:,:) = a b = eoshift(a2(1:2*n1:2,:,:), shift, dim=dim, boundary=bp) if (any (b /= c)) then call abort end if c2 = 43. c2(1:2*n1:2,:,:) = eoshift(a,shift,dim=dim, boundary=bp) if (any(c2(1:2*n1:2,:,:) /= c)) then call abort end if if (any(c2(2:2*n1:2,:,:) /= 43)) then call abort end if end do end do end program main
! { dg-do run } ! Check that eoshift works for three-dimensional arrays. module x implicit none contains subroutine eoshift_1 (array, shift, boundary, dim, res) real, dimension(:,:,:), intent(in) :: array real, dimension(:,:,:), intent(out) :: res integer, dimension(:,:), intent(in) :: shift real, optional, intent(in) :: boundary integer, optional, intent(in) :: dim integer :: s1, s2, s3 integer :: n1, n2, n3 integer :: sh real :: b integer :: d if (present(boundary)) then b = boundary else b = 0.0 end if if (present(dim)) then d = dim else d = 1 end if n1 = size(array,1) n2 = size(array,2) n3 = size(array,3) select case(dim) case(1) do s3=1,n3 do s2=1,n2 sh = shift(s2,s3) if (sh > 0) then sh = min(sh, n1) do s1= 1, n1 - sh res(s1,s2,s3) = array(s1+sh,s2,s3) end do do s1 = n1 - sh + 1,n1 res(s1,s2,s3) = b end do else sh = max(sh, -n1) do s1=1,-sh res(s1,s2,s3) = b end do do s1= 1-sh,n1 res(s1,s2,s3) = array(s1+sh,s2,s3) end do end if end do end do case(2) do s3=1,n3 do s1=1,n1 sh = shift(s1,s3) if (sh > 0) then sh = min (sh, n2) do s2=1, n2 - sh res(s1,s2,s3) = array(s1,s2+sh,s3) end do do s2=n2 - sh + 1, n2 res(s1,s2,s3) = b end do else sh = max(sh, -n2) do s2=1,-sh res(s1,s2,s3) = b end do do s2=1-sh,n2 res(s1,s2,s3) = array(s1,s2+sh,s3) end do end if end do end do case(3) do s2=1, n2 do s1=1,n1 sh = shift(s1, s2) if (sh > 0) then sh = min(sh, n3) do s3=1,n3 - sh res(s1,s2,s3) = array(s1,s2,s3+sh) end do do s3=n3 - sh + 1, n3 res(s1,s2,s3) = b end do else sh = max(sh, -n3) do s3=1,-sh res(s1,s2,s3) = b end do do s3=1-sh,n3 res(s1,s2,s3) = array(s1,s2,s3+sh) end do end if end do end do case default stop "Illegal dim" end select end subroutine eoshift_1 subroutine fill_shift(x, n) integer, intent(out), dimension(:,:) :: x integer, intent(in) :: n integer :: n1, n2, s1, s2 integer :: v v = -n - 1 n1 = size(x,1) n2 = size(x,2) do s2=1,n2 do s1=1,n1 x(s1,s2) = v v = v + 1 if (v > n + 1) v = -n - 1 end do end do end subroutine fill_shift end module x program main use x implicit none integer, parameter :: n1=20,n2=30,n3=40 real, dimension(n1,n2,n3) :: a,b,c real, dimension(2*n1,n2,n3) :: a2, c2 integer :: dim integer, dimension(n2,n3), target :: sh1 integer, dimension(n1,n3), target :: sh2 integer, dimension(n1,n2), target :: sh3 real, dimension(n2,n3), target :: b1 real, dimension(n1,n3), target :: b2 real, dimension(n1,n2), target :: b3 integer, dimension(:,:), pointer :: sp real, dimension(:,:), pointer :: bp call random_number(a) call fill_shift(sh1, n1) call fill_shift(sh2, n2) call fill_shift(sh3, n3) do dim=1,3 if (dim == 1) then sp => sh1 else if (dim == 2) then sp => sh2 else sp => sh3 end if b = eoshift(a,shift=sp,dim=dim,boundary=-0.5) call eoshift_1 (a, shift=sp, dim=dim, boundary=-0.5,res=c) if (any (b /= c)) then print *,"dim = ", dim print *,"sp = ", sp print '(99F8.4)',b print '(99F8.4)',c call abort end if a2 = 42. a2(1:2*n1:2,:,:) = a b = eoshift(a2(1:2*n1:2,:,:), shift=sp, dim=dim, boundary=-0.5) if (any(b /= c)) then call abort end if c2 = 43. c2(1:2*n1:2,:,:) = eoshift(a, shift=sp, dim=dim, boundary=-0.5) if (any(c2(1:2*n1:2,:,:) /= c)) then call abort end if if (any(c2(2:2*n1:2,:,:) /= 43.)) then call abort end if end do end program main
! { dg-do run } ! Check that eoshift works for three-dimensional arrays. module x implicit none contains subroutine eoshift_3 (array, shift, boundary, dim, res) real, dimension(:,:,:), intent(in) :: array real, dimension(:,:,:), intent(out) :: res integer, dimension(:,:), intent(in) :: shift real, optional, dimension(:,:), intent(in) :: boundary integer, optional, intent(in) :: dim integer :: s1, s2, s3 integer :: n1, n2, n3 integer :: sh real :: b integer :: d if (present(dim)) then d = dim else d = 1 end if n1 = size(array,1) n2 = size(array,2) n3 = size(array,3) select case(dim) case(1) do s3=1,n3 do s2=1,n2 sh = shift(s2,s3) b = boundary(s2,s3) if (sh > 0) then sh = min(sh, n1) do s1= 1, n1 - sh res(s1,s2,s3) = array(s1+sh,s2,s3) end do do s1 = n1 - sh + 1,n1 res(s1,s2,s3) = b end do else sh = max(sh, -n1) do s1=1,-sh res(s1,s2,s3) = b end do do s1= 1-sh,n1 res(s1,s2,s3) = array(s1+sh,s2,s3) end do end if end do end do case(2) do s3=1,n3 do s1=1,n1 sh = shift(s1,s3) b = boundary(s1,s3) if (sh > 0) then sh = min (sh, n2) do s2=1, n2 - sh res(s1,s2,s3) = array(s1,s2+sh,s3) end do do s2=n2 - sh + 1, n2 res(s1,s2,s3) = b end do else sh = max(sh, -n2) do s2=1,-sh res(s1,s2,s3) = b end do do s2=1-sh,n2 res(s1,s2,s3) = array(s1,s2+sh,s3) end do end if end do end do case(3) do s2=1, n2 do s1=1,n1 sh = shift(s1, s2) b = boundary(s1, s2) if (sh > 0) then sh = min(sh, n3) do s3=1,n3 - sh res(s1,s2,s3) = array(s1,s2,s3+sh) end do do s3=n3 - sh + 1, n3 res(s1,s2,s3) = b end do else sh = max(sh, -n3) do s3=1,-sh res(s1,s2,s3) = b end do do s3=1-sh,n3 res(s1,s2,s3) = array(s1,s2,s3+sh) end do end if end do end do case default stop "Illegal dim" end select end subroutine eoshift_3 subroutine fill_shift(x, n) integer, intent(out), dimension(:,:) :: x integer, intent(in) :: n integer :: n1, n2, s1, s2 integer :: v v = -n - 1 n1 = size(x,1) n2 = size(x,2) do s2=1,n2 do s1=1,n1 x(s1,s2) = v v = v + 1 if (v > n + 1) v = -n - 1 end do end do end subroutine fill_shift end module x program main use x implicit none integer, parameter :: n1=10,n2=30,n3=40 real, dimension(n1,n2,n3) :: a,b,c real, dimension(2*n1,n2,n3) :: a2, c2 integer :: dim integer, dimension(n2,n3), target :: sh1 integer, dimension(n1,n3), target :: sh2 integer, dimension(n1,n2), target :: sh3 real, dimension(n2,n3), target :: b1 real, dimension(n1,n3), target :: b2 real, dimension(n1,n2), target :: b3 integer, dimension(:,:), pointer :: sp real, dimension(:,:), pointer :: bp call random_number(a) call random_number(b1) call random_number(b2) call random_number(b3) call fill_shift(sh1, n1) call fill_shift(sh2, n2) call fill_shift(sh3, n3) do dim=1,3 if (dim == 1) then sp => sh1 bp => b1 else if (dim == 2) then sp => sh2 bp => b2 else sp => sh3 bp => b3 end if b = eoshift(a,shift=sp,dim=dim,boundary=bp) call eoshift_3 (a, shift=sp, dim=dim, boundary=bp,res=c) if (any (b /= c)) then call abort end if a2 = 42. a2(1:2*n1:2,:,:) = a b = eoshift(a2(1:2*n1:2,:,:), shift=sp, dim=dim, boundary=bp) if (any(b /= c)) then call abort end if c2 = 43. c2(1:2*n1:2,:,:) = eoshift(a, shift=sp, dim=dim, boundary=bp) if (any(c2(1:2*n1:2,:,:) /= c)) then call abort end if if (any(c2(2:2*n1:2,:,:) /= 43.)) then call abort end if end do end program main