On Tue, Oct 22, 2019 at 11:36 AM Michael Matz <m...@suse.de> wrote: > > Hi, > > this was still collecting dust on my disk, so here it is. See the > extensive comment in the patch for what happens, in short invariant IVs > are affine but still have to be handled more conservative than other > affine IVs in transformations that reorder instructions. Making our > dependence analysis more conservative instead would be too much, we > wouldn't be able to handle cases that we should handle anymore. > > Regstrapped on x86-64-linux without regressions (all languages+Ada).
OK for trunk and affected branches. Thanks, Richard. > > Ciao, > Michael. > > PR middle-end/90796 > * gimple-loop-jam.c (any_access_function_variant_p): New function. > (adjust_unroll_factor): Use it to constrain safety, new parameter. > (tree_loop_unroll_and_jam): Adjust call and profitable unroll factor. > > testsuite/ > * gcc.dg/unroll-and-jam.c: Add three invalid and one valid case. > > diff --git a/gcc/gimple-loop-jam.c b/gcc/gimple-loop-jam.c > index 11153f5..899653b 100644 > --- a/gcc/gimple-loop-jam.c > +++ b/gcc/gimple-loop-jam.c > @@ -360,16 +360,33 @@ fuse_loops (class loop *loop) > rewrite_into_loop_closed_ssa_1 (NULL, 0, SSA_OP_USE, loop); > } > > +/* Return true if any of the access functions for dataref A > + isn't invariant with respect to loop LOOP_NEST. */ > +static bool > +any_access_function_variant_p (const struct data_reference *a, > + const class loop *loop_nest) > +{ > + unsigned int i; > + vec<tree> fns = DR_ACCESS_FNS (a); > + tree t; > + > + FOR_EACH_VEC_ELT (fns, i, t) > + if (!evolution_function_is_invariant_p (t, loop_nest->num)) > + return true; > + > + return false; > +} > + > /* Returns true if the distance in DDR can be determined and adjusts > the unroll factor in *UNROLL to make unrolling valid for that distance. > - Otherwise return false. > + Otherwise return false. DDR is with respect to the outer loop of INNER. > > If this data dep can lead to a removed memory reference, increment > *REMOVED and adjust *PROFIT_UNROLL to be the necessary unroll factor > for this to happen. */ > > static bool > -adjust_unroll_factor (struct data_dependence_relation *ddr, > +adjust_unroll_factor (class loop *inner, struct data_dependence_relation > *ddr, > unsigned *unroll, unsigned *profit_unroll, > unsigned *removed) > { > @@ -392,9 +409,59 @@ adjust_unroll_factor (struct data_dependence_relation > *ddr, > gcc_unreachable (); > else if ((unsigned)dist >= *unroll) > ; > - else if (lambda_vector_lexico_pos (dist_v + 1, DDR_NB_LOOPS (ddr) - > 1) > - || (lambda_vector_zerop (dist_v + 1, DDR_NB_LOOPS (ddr) - > 1) > - && dist > 0)) > + else if (lambda_vector_zerop (dist_v + 1, DDR_NB_LOOPS (ddr) - 1)) > + { > + /* We have (a,0) with a < N, so this will be transformed into > + (0,0) after unrolling by N. This might potentially be a > + problem, if it's not a read-read dependency. */ > + if (DR_IS_READ (DDR_A (ddr)) && DR_IS_READ (DDR_B (ddr))) > + ; > + else > + { > + /* So, at least one is a write, and we might reduce the > + distance vector to (0,0). This is still no problem > + if both data-refs are affine with respect to the inner > + loops. But if one of them is invariant with respect > + to an inner loop our reordering implicit in loop fusion > + corrupts the program, as our data dependences don't > + capture this. E.g. for: > + for (0 <= i < n) > + for (0 <= j < m) > + a[i][0] = a[i+1][0] + 2; // (1) > + b[i][j] = b[i+1][j] + 2; // (2) > + the distance vector for both statements is (-1,0), > + but exchanging the order for (2) is okay, while > + for (1) it is not. To see this, write out the original > + accesses (assume m is 2): > + a i j original > + 0 0 0 r a[1][0] b[1][0] > + 1 0 0 w a[0][0] b[0][0] > + 2 0 1 r a[1][0] b[1][1] > + 3 0 1 w a[0][0] b[0][1] > + 4 1 0 r a[2][0] b[2][0] > + 5 1 0 w a[1][0] b[1][0] > + after unroll-by-2 and fusion the accesses are done in > + this order (from column a): 0,1, 4,5, 2,3, i.e. this: > + a i j transformed > + 0 0 0 r a[1][0] b[1][0] > + 1 0 0 w a[0][0] b[0][0] > + 4 1 0 r a[2][0] b[2][0] > + 5 1 0 w a[1][0] b[1][0] > + 2 0 1 r a[1][0] b[1][1] > + 3 0 1 w a[0][0] b[0][1] > + Note how access 2 accesses the same element as access 5 > + for array 'a' but not for array 'b'. */ > + if (any_access_function_variant_p (DDR_A (ddr), inner) > + && any_access_function_variant_p (DDR_B (ddr), inner)) > + ; > + else > + /* And if any dataref of this pair is invariant with > + respect to the inner loop, we have no chance than > + to reduce the unroll factor. */ > + *unroll = dist; > + } > + } > + else if (lambda_vector_lexico_pos (dist_v + 1, DDR_NB_LOOPS (ddr) - > 1)) > ; > else > *unroll = dist; > @@ -486,7 +553,7 @@ tree_loop_unroll_and_jam (void) > /* Now check the distance vector, for determining a sensible > outer unroll factor, and for validity of merging the inner > loop copies. */ > - if (!adjust_unroll_factor (ddr, &unroll_factor, &profit_unroll, > + if (!adjust_unroll_factor (loop, ddr, &unroll_factor, > &profit_unroll, > &removed)) > { > /* Couldn't get the distance vector. For two reads that's > @@ -506,7 +573,7 @@ tree_loop_unroll_and_jam (void) > to ignore all profitability concerns and apply the transformation > always. */ > if (!PARAM_VALUE (PARAM_UNROLL_JAM_MIN_PERCENT)) > - profit_unroll = 2; > + profit_unroll = MAX(2, profit_unroll); > else if (removed * 100 / datarefs.length () > < (unsigned)PARAM_VALUE (PARAM_UNROLL_JAM_MIN_PERCENT)) > profit_unroll = 1; > diff --git a/gcc/testsuite/gcc.dg/unroll-and-jam.c > b/gcc/testsuite/gcc.dg/unroll-and-jam.c > index 70910d3..bcfe1bd 100644 > --- a/gcc/testsuite/gcc.dg/unroll-and-jam.c > +++ b/gcc/testsuite/gcc.dg/unroll-and-jam.c > @@ -31,10 +31,10 @@ void checkb(void) > //printf(" %d\n", sum); > } > > +unsigned i, j; > #define TEST(name, body, test) \ > static void __attribute__((noinline,noclone)) name (unsigned long n, > unsigned long m) \ > { \ > - unsigned long i, j; \ > for (i = 1; i < m; i++) { \ > for (j = 1; j < n; j++) { \ > body; \ > @@ -58,9 +58,14 @@ TEST(foo3, aa[i+1][j-1]=aa[i][j] * aa[i][j] / 2, > checkaa()) //notok, -1,1 > TEST(foo4, aa[i][j] = aa[i-1][j+1] * aa[i-1][j+1] / 2, checkaa()) //notok, > -1,1 > TEST(foo5, aa[i][j] = aa[i+1][j+1] * aa[i+1][j+1] / 2, checkaa()) //ok, 1,1 > TEST(foo6, aa[i][j] = aa[i+1][j] * aa[i+1][j] / 2, checkaa()) //ok, -1,0 > +TEST(foo61, aa[i][0] = aa[i+1][0] * aa[i+1][0] / 2, checkaa()) //notok, -1,0 > +TEST(foo62, aa[i][j/2] = aa[i+1][j/2] * aa[i+1][j/2] / 2, checkaa()) > //notok, not affine > +TEST(foo63, aa[i][j%2] = aa[i+1][j%2] * aa[i+1][j%2] / 2, checkaa()) > //notok, not affine > TEST(foo7, aa[i+1][j] = aa[i][j] * aa[i][j] / 2, checkaa()) //ok, 1,0 > TEST(foo9, b[j] = 3*b[j+1] + 1, checkb()) //notok, 0,-1 > TEST(foo10, b[j] = 3*b[j] + 1, checkb()) //ok, 0,0 > +extern int f; > +TEST(foo11, f = b[i-1] = 1 + 3* b[i+1], checkb()) //ok, 2,0 but must reduce > unroll factor to 2, (it would be incorrect with unroll-by-3, which the > profitability would suggest) > > /* foo8 should work as well, but currently doesn't because the distance > vectors we compute are too pessimistic. We compute > @@ -68,6 +73,7 @@ TEST(foo10, b[j] = 3*b[j] + 1, checkb()) //ok, 0,0 > and the last one causes us to lose. */ > TEST(foo8, b[j+1] = 3*b[j] + 1, checkb()) //ok, 0,1 > > +int f; > unsigned int a[1024]; > unsigned int b[1024]; > unsigned int aa[16][1024]; > @@ -88,10 +94,12 @@ void init(void) > printf(" %s\n", #name); \ > init();for(i=0;i<4;i++)name##noopt(32,8); checka = checksum; \ > init();for(i=0;i<4;i++)name(32,8); \ > + if (checka != checksum) fail = 1; \ > printf("%sok %s\n", checka != checksum ? "NOT " : "", #name); > > int main() > { > + int fail = 0; > int i; > unsigned checka; > RUN(foo1); > @@ -100,12 +108,18 @@ int main() > RUN(foo4); > RUN(foo5); > RUN(foo6); > + RUN(foo61); > + RUN(foo62); > + RUN(foo63); > RUN(foo7); > RUN(foo8); > RUN(foo9); > RUN(foo10); > - return 0; > + RUN(foo11); > + if (fail) > + __builtin_abort(); > + return fail; > } > > -/* Five loops should be unroll-jammed (actually six, but see above). */ > -/* { dg-final { scan-tree-dump-times "applying unroll and jam" 5 "unrolljam" > } } */ > +/* Six loops should be unroll-jammed (actually seven, but see above). */ > +/* { dg-final { scan-tree-dump-times "applying unroll and jam" 6 "unrolljam" > } } */