Re: Fix PR90796

Richard Biener Tue, 22 Oct 2019 05:09:06 -0700

On Tue, Oct 22, 2019 at 11:36 AM Michael Matz <m...@suse.de> wrote:
>
> Hi,
>
> this was still collecting dust on my disk, so here it is.  See the
> extensive comment in the patch for what happens, in short invariant IVs
> are affine but still have to be handled more conservative than other
> affine IVs in transformations that reorder instructions.  Making our
> dependence analysis more conservative instead would be too much, we
> wouldn't be able to handle cases that we should handle anymore.
>
> Regstrapped on x86-64-linux without regressions (all languages+Ada).


OK for trunk and affected branches.

Thanks,
Richard.

>
> Ciao,
> Michael.
>
>         PR middle-end/90796
>         * gimple-loop-jam.c (any_access_function_variant_p): New function.
>         (adjust_unroll_factor): Use it to constrain safety, new parameter.
>         (tree_loop_unroll_and_jam): Adjust call and profitable unroll factor.
>
> testsuite/
>         * gcc.dg/unroll-and-jam.c: Add three invalid and one valid case.
>
> diff --git a/gcc/gimple-loop-jam.c b/gcc/gimple-loop-jam.c
> index 11153f5..899653b 100644
> --- a/gcc/gimple-loop-jam.c
> +++ b/gcc/gimple-loop-jam.c
> @@ -360,16 +360,33 @@ fuse_loops (class loop *loop)
>    rewrite_into_loop_closed_ssa_1 (NULL, 0, SSA_OP_USE, loop);
>  }
>
> +/* Return true if any of the access functions for dataref A
> +   isn't invariant with respect to loop LOOP_NEST.  */
> +static bool
> +any_access_function_variant_p (const struct data_reference *a,
> +                              const class loop *loop_nest)
> +{
> +  unsigned int i;
> +  vec<tree> fns = DR_ACCESS_FNS (a);
> +  tree t;
> +
> +  FOR_EACH_VEC_ELT (fns, i, t)
> +    if (!evolution_function_is_invariant_p (t, loop_nest->num))
> +      return true;
> +
> +  return false;
> +}
> +
>  /* Returns true if the distance in DDR can be determined and adjusts
>     the unroll factor in *UNROLL to make unrolling valid for that distance.
> -   Otherwise return false.
> +   Otherwise return false.  DDR is with respect to the outer loop of INNER.
>
>     If this data dep can lead to a removed memory reference, increment
>     *REMOVED and adjust *PROFIT_UNROLL to be the necessary unroll factor
>     for this to happen.  */
>
>  static bool
> -adjust_unroll_factor (struct data_dependence_relation *ddr,
> +adjust_unroll_factor (class loop *inner, struct data_dependence_relation 
> *ddr,
>                       unsigned *unroll, unsigned *profit_unroll,
>                       unsigned *removed)
>  {
> @@ -392,9 +409,59 @@ adjust_unroll_factor (struct data_dependence_relation 
> *ddr,
>             gcc_unreachable ();
>           else if ((unsigned)dist >= *unroll)
>             ;
> -         else if (lambda_vector_lexico_pos (dist_v + 1, DDR_NB_LOOPS (ddr) - 
> 1)
> -                  || (lambda_vector_zerop (dist_v + 1, DDR_NB_LOOPS (ddr) - 
> 1)
> -                      && dist > 0))
> +         else if (lambda_vector_zerop (dist_v + 1, DDR_NB_LOOPS (ddr) - 1))
> +           {
> +             /* We have (a,0) with a < N, so this will be transformed into
> +                (0,0) after unrolling by N.  This might potentially be a
> +                problem, if it's not a read-read dependency.  */
> +             if (DR_IS_READ (DDR_A (ddr)) && DR_IS_READ (DDR_B (ddr)))
> +               ;
> +             else
> +               {
> +                 /* So, at least one is a write, and we might reduce the
> +                    distance vector to (0,0).  This is still no problem
> +                    if both data-refs are affine with respect to the inner
> +                    loops.  But if one of them is invariant with respect
> +                    to an inner loop our reordering implicit in loop fusion
> +                    corrupts the program, as our data dependences don't
> +                    capture this.  E.g. for:
> +                      for (0 <= i < n)
> +                        for (0 <= j < m)
> +                          a[i][0] = a[i+1][0] + 2;    // (1)
> +                          b[i][j] = b[i+1][j] + 2;    // (2)
> +                    the distance vector for both statements is (-1,0),
> +                    but exchanging the order for (2) is okay, while
> +                    for (1) it is not.  To see this, write out the original
> +                    accesses (assume m is 2):
> +                      a i j original
> +                      0 0 0 r a[1][0] b[1][0]
> +                      1 0 0 w a[0][0] b[0][0]
> +                      2 0 1 r a[1][0] b[1][1]
> +                      3 0 1 w a[0][0] b[0][1]
> +                      4 1 0 r a[2][0] b[2][0]
> +                      5 1 0 w a[1][0] b[1][0]
> +                    after unroll-by-2 and fusion the accesses are done in
> +                    this order (from column a): 0,1, 4,5, 2,3, i.e. this:
> +                      a i j transformed
> +                      0 0 0 r a[1][0] b[1][0]
> +                      1 0 0 w a[0][0] b[0][0]
> +                      4 1 0 r a[2][0] b[2][0]
> +                      5 1 0 w a[1][0] b[1][0]
> +                      2 0 1 r a[1][0] b[1][1]
> +                      3 0 1 w a[0][0] b[0][1]
> +                    Note how access 2 accesses the same element as access 5
> +                    for array 'a' but not for array 'b'.  */
> +                 if (any_access_function_variant_p (DDR_A (ddr), inner)
> +                     && any_access_function_variant_p (DDR_B (ddr), inner))
> +                   ;
> +                 else
> +                   /* And if any dataref of this pair is invariant with
> +                      respect to the inner loop, we have no chance than
> +                      to reduce the unroll factor.  */
> +                   *unroll = dist;
> +               }
> +           }
> +         else if (lambda_vector_lexico_pos (dist_v + 1, DDR_NB_LOOPS (ddr) - 
> 1))
>             ;
>           else
>             *unroll = dist;
> @@ -486,7 +553,7 @@ tree_loop_unroll_and_jam (void)
>           /* Now check the distance vector, for determining a sensible
>              outer unroll factor, and for validity of merging the inner
>              loop copies.  */
> -         if (!adjust_unroll_factor (ddr, &unroll_factor, &profit_unroll,
> +         if (!adjust_unroll_factor (loop, ddr, &unroll_factor, 
> &profit_unroll,
>                                      &removed))
>             {
>               /* Couldn't get the distance vector.  For two reads that's
> @@ -506,7 +573,7 @@ tree_loop_unroll_and_jam (void)
>          to ignore all profitability concerns and apply the transformation
>          always.  */
>        if (!PARAM_VALUE (PARAM_UNROLL_JAM_MIN_PERCENT))
> -       profit_unroll = 2;
> +       profit_unroll = MAX(2, profit_unroll);
>        else if (removed * 100 / datarefs.length ()
>           < (unsigned)PARAM_VALUE (PARAM_UNROLL_JAM_MIN_PERCENT))
>         profit_unroll = 1;
> diff --git a/gcc/testsuite/gcc.dg/unroll-and-jam.c 
> b/gcc/testsuite/gcc.dg/unroll-and-jam.c
> index 70910d3..bcfe1bd 100644
> --- a/gcc/testsuite/gcc.dg/unroll-and-jam.c
> +++ b/gcc/testsuite/gcc.dg/unroll-and-jam.c
> @@ -31,10 +31,10 @@ void checkb(void)
>    //printf("  %d\n", sum);
>  }
>
> +unsigned i, j;
>  #define TEST(name, body, test) \
>  static void __attribute__((noinline,noclone)) name (unsigned long n, 
> unsigned long m) \
>  { \
> -  unsigned long i, j; \
>    for (i = 1; i < m; i++) { \
>        for (j = 1; j < n; j++) { \
>           body; \
> @@ -58,9 +58,14 @@ TEST(foo3, aa[i+1][j-1]=aa[i][j] * aa[i][j] / 2, 
> checkaa()) //notok, -1,1
>  TEST(foo4, aa[i][j] = aa[i-1][j+1] * aa[i-1][j+1] / 2, checkaa()) //notok, 
> -1,1
>  TEST(foo5, aa[i][j] = aa[i+1][j+1] * aa[i+1][j+1] / 2, checkaa()) //ok, 1,1
>  TEST(foo6, aa[i][j] = aa[i+1][j] * aa[i+1][j] / 2, checkaa()) //ok, -1,0
> +TEST(foo61, aa[i][0] = aa[i+1][0] * aa[i+1][0] / 2, checkaa()) //notok, -1,0
> +TEST(foo62, aa[i][j/2] = aa[i+1][j/2] * aa[i+1][j/2] / 2, checkaa()) 
> //notok, not affine
> +TEST(foo63, aa[i][j%2] = aa[i+1][j%2] * aa[i+1][j%2] / 2, checkaa()) 
> //notok, not affine
>  TEST(foo7, aa[i+1][j] = aa[i][j] * aa[i][j] / 2, checkaa()) //ok, 1,0
>  TEST(foo9, b[j] = 3*b[j+1] + 1, checkb()) //notok, 0,-1
>  TEST(foo10, b[j] = 3*b[j] + 1, checkb()) //ok, 0,0
> +extern int f;
> +TEST(foo11, f = b[i-1] = 1 + 3* b[i+1], checkb()) //ok, 2,0 but must reduce 
> unroll factor to 2, (it would be incorrect with unroll-by-3, which the 
> profitability would suggest)
>
>  /* foo8 should work as well, but currently doesn't because the distance
>     vectors we compute are too pessimistic.  We compute
> @@ -68,6 +73,7 @@ TEST(foo10, b[j] = 3*b[j] + 1, checkb()) //ok, 0,0
>     and the last one causes us to lose.  */
>  TEST(foo8, b[j+1] = 3*b[j] + 1, checkb()) //ok, 0,1
>
> +int f;
>  unsigned int a[1024];
>  unsigned int b[1024];
>  unsigned int aa[16][1024];
> @@ -88,10 +94,12 @@ void init(void)
>      printf(" %s\n", #name); \
>      init();for(i=0;i<4;i++)name##noopt(32,8); checka = checksum; \
>      init();for(i=0;i<4;i++)name(32,8); \
> +    if (checka != checksum) fail = 1; \
>      printf("%sok %s\n", checka != checksum ? "NOT " : "", #name);
>
>  int main()
>  {
> +  int fail = 0;
>    int i;
>    unsigned checka;
>    RUN(foo1);
> @@ -100,12 +108,18 @@ int main()
>    RUN(foo4);
>    RUN(foo5);
>    RUN(foo6);
> +  RUN(foo61);
> +  RUN(foo62);
> +  RUN(foo63);
>    RUN(foo7);
>    RUN(foo8);
>    RUN(foo9);
>    RUN(foo10);
> -  return 0;
> +  RUN(foo11);
> +  if (fail)
> +    __builtin_abort();
> +  return fail;
>  }
>
> -/* Five loops should be unroll-jammed (actually six, but see above).  */
> -/* { dg-final { scan-tree-dump-times "applying unroll and jam" 5 "unrolljam" 
> } } */
> +/* Six loops should be unroll-jammed (actually seven, but see above).  */
> +/* { dg-final { scan-tree-dump-times "applying unroll and jam" 6 "unrolljam" 
> } } */

Re: Fix PR90796

Reply via email to