On Wed, Apr 7, 2021 at 6:43 PM Richard Biener <rguent...@suse.de> wrote:
>
> When we apply store motion and DSE manually to the bwaves kernel
> in gfortran.dg/pr81303.f loop interchange no longer happens because
> the perfect nest considered covers outer loops we cannot analyze
> strides for.  The following compensates for this by shrinking the
> nest in this analysis which was already possible but on a too coarse
> granularity.  It shares the shrinked nest with the rest of the DRs
> so the complexity overhead should be negligible.
>
> Bootstrapped & tested on x86_64-unknown-linux-gnu, SPEC FP CPU 2017
> build and there's still only the single loop interchange in bwaves.
>
> Queued for stage1.

And applied as g:6ff66d1ea48960fe96bb51a750c01135e65fe452

> 2021-04-07  Richard Biener  <rguent...@suse.de>
>
>         PR tree-optimization/99956
>         * gimple-loop-interchange.cc (compute_access_stride):
>         Try instantiating the access in a shallower loop nest
>         if instantiating failed.
>         (compute_access_strides): Pass adjustable loop_nest
>         to compute_access_stride.
>
>         * gfortran.dg/pr99956.f: New testcase.
> ---
>  gcc/gimple-loop-interchange.cc      | 68 +++++++++++++++++------------
>  gcc/testsuite/gfortran.dg/pr99956.f | 45 +++++++++++++++++++
>  2 files changed, 85 insertions(+), 28 deletions(-)
>  create mode 100644 gcc/testsuite/gfortran.dg/pr99956.f
>
> diff --git a/gcc/gimple-loop-interchange.cc b/gcc/gimple-loop-interchange.cc
> index f45b9364644..80f749b6071 100644
> --- a/gcc/gimple-loop-interchange.cc
> +++ b/gcc/gimple-loop-interchange.cc
> @@ -1280,12 +1280,15 @@ tree_loop_interchange::move_code_to_inner_loop (class 
> loop *outer,
>            arr[i][j - 1][k] = 0;  */
>
>  static void
> -compute_access_stride (class loop *loop_nest, class loop *loop,
> +compute_access_stride (class loop *&loop_nest, class loop *loop,
>                        data_reference_p dr)
>  {
>    vec<tree> *strides = new vec<tree> ();
> -  basic_block bb = gimple_bb (DR_STMT (dr));
> +  dr->aux = strides;
>
> +  basic_block bb = gimple_bb (DR_STMT (dr));
> +  if (!flow_bb_inside_loop_p (loop_nest, bb))
> +    return;
>    while (!flow_bb_inside_loop_p (loop, bb))
>      {
>        strides->safe_push (build_int_cst (sizetype, 0));
> @@ -1313,39 +1316,47 @@ compute_access_stride (class loop *loop_nest, class 
> loop *loop,
>         }
>        /* Otherwise punt.  */
>        else
> -       {
> -         dr->aux = strides;
> -         return;
> -       }
> +       return;
>      }
>    tree scev_base = build_fold_addr_expr (ref);
>    tree scev = analyze_scalar_evolution (loop, scev_base);
> -  scev = instantiate_scev (loop_preheader_edge (loop_nest), loop, scev);
> -  if (! chrec_contains_undetermined (scev))
> +  if (chrec_contains_undetermined (scev))
> +    return;
> +
> +  tree orig_scev = scev;
> +  do
> +    {
> +      scev = instantiate_scev (loop_preheader_edge (loop_nest),
> +                              loop, orig_scev);
> +      if (! chrec_contains_undetermined (scev))
> +       break;
> +
> +      /* If we couldn't instantiate for the desired nest, shrink it.  */
> +      if (loop_nest == loop)
> +       return;
> +      loop_nest = loop_nest->inner;
> +    } while (1);
> +
> +  tree sl = scev;
> +  class loop *expected = loop;
> +  while (TREE_CODE (sl) == POLYNOMIAL_CHREC)
>      {
> -      tree sl = scev;
> -      class loop *expected = loop;
> -      while (TREE_CODE (sl) == POLYNOMIAL_CHREC)
> +      class loop *sl_loop = get_chrec_loop (sl);
> +      while (sl_loop != expected)
>         {
> -         class loop *sl_loop = get_chrec_loop (sl);
> -         while (sl_loop != expected)
> -           {
> -             strides->safe_push (size_int (0));
> -             expected = loop_outer (expected);
> -           }
> -         strides->safe_push (CHREC_RIGHT (sl));
> -         sl = CHREC_LEFT (sl);
> +         strides->safe_push (size_int (0));
>           expected = loop_outer (expected);
>         }
> -      if (! tree_contains_chrecs (sl, NULL))
> -       while (expected != loop_outer (loop_nest))
> -         {
> -           strides->safe_push (size_int (0));
> -           expected = loop_outer (expected);
> -         }
> +      strides->safe_push (CHREC_RIGHT (sl));
> +      sl = CHREC_LEFT (sl);
> +      expected = loop_outer (expected);
>      }
> -
> -  dr->aux = strides;
> +  if (! tree_contains_chrecs (sl, NULL))
> +    while (expected != loop_outer (loop_nest))
> +      {
> +       strides->safe_push (size_int (0));
> +       expected = loop_outer (expected);
> +      }
>  }
>
>  /* Given loop nest LOOP_NEST with innermost LOOP, the function computes
> @@ -1363,9 +1374,10 @@ compute_access_strides (class loop *loop_nest, class 
> loop *loop,
>    data_reference_p dr;
>    vec<tree> *stride;
>
> +  class loop *interesting_loop_nest = loop_nest;
>    for (i = 0; datarefs.iterate (i, &dr); ++i)
>      {
> -      compute_access_stride (loop_nest, loop, dr);
> +      compute_access_stride (interesting_loop_nest, loop, dr);
>        stride = DR_ACCESS_STRIDE (dr);
>        if (stride->length () < num_loops)
>         {
> diff --git a/gcc/testsuite/gfortran.dg/pr99956.f 
> b/gcc/testsuite/gfortran.dg/pr99956.f
> new file mode 100644
> index 00000000000..b5c0be3912d
> --- /dev/null
> +++ b/gcc/testsuite/gfortran.dg/pr99956.f
> @@ -0,0 +1,45 @@
> +! { dg-do compile }
> +! { dg-options "-O3 -ffast-math -floop-interchange 
> -fdump-tree-linterchange-details" }
> +
> +        subroutine mat_times_vec(y,x,a,axp,ayp,azp,axm,aym,azm,
> +     $  nb,nx,ny,nz)
> +        implicit none
> +        integer nb,nx,ny,nz,i,j,k,m,l,kit,im1,ip1,jm1,jp1,km1,kp1
> +
> +        real*8 y(nb,nx,ny,nz),x(nb,nx,ny,nz),tem
> +
> +        real*8 a(nb,nb,nx,ny,nz),
> +     1  axp(nb,nb,nx,ny,nz),ayp(nb,nb,nx,ny,nz),azp(nb,nb,nx,ny,nz),
> +     2  axm(nb,nb,nx,ny,nz),aym(nb,nb,nx,ny,nz),azm(nb,nb,nx,ny,nz)
> +
> +
> +      do k=1,nz
> +         km1=mod(k+nz-2,nz)+1
> +         kp1=mod(k,nz)+1
> +         do j=1,ny
> +            jm1=mod(j+ny-2,ny)+1
> +            jp1=mod(j,ny)+1
> +            do i=1,nx
> +               im1=mod(i+nx-2,nx)+1
> +               ip1=mod(i,nx)+1
> +               do l=1,nb
> +                  tem=0.0
> +                  do m=1,nb
> +                     tem=tem+
> +     1               a(l,m,i,j,k)*x(m,i,j,k)+
> +     2               axp(l,m,i,j,k)*x(m,ip1,j,k)+
> +     3               ayp(l,m,i,j,k)*x(m,i,jp1,k)+
> +     4               azp(l,m,i,j,k)*x(m,i,j,kp1)+
> +     5               axm(l,m,i,j,k)*x(m,im1,j,k)+
> +     6               aym(l,m,i,j,k)*x(m,i,jm1,k)+
> +     7               azm(l,m,i,j,k)*x(m,i,j,km1)
> +                  enddo
> +                  y(l,i,j,k)=tem
> +               enddo
> +            enddo
> +         enddo
> +        enddo
> +        return
> +        end
> +
> +! { dg-final { scan-tree-dump-times "is interchanged" 1 "linterchange" } }
> --
> 2.26.2

Reply via email to