On Wed, 29 Nov 2017, Richard Biener wrote:

> It turns out that we don't vectorize the 2nd testcase in PR83202
> (or rather we do that in weird ways during BB vectorization) because
> cunrolli decides to peel the inner loop completely based on
> the size of the accessed arrays.  That unfortunately leaves exit
> tests in the outer loop body, which in turn makes us not vectorize
> the loop.
>
> We have a late unrolling pass for these kinds of unrollings so this
> patch simply avoids doing this during cunrolli.
>
> Bootstrapped on x86_64-unknown-linux-gnu, testing in progress.

And this is what I applied after reviewing testsuite regressions of the first. Bootstrapped and tested on x86_64-unknown-linux-gnu. Richard. 2017-11-30 Richard Biener <rguent...@suse.de> PR tree-optimization/83202 * tree-ssa-loop-ivcanon.c (try_unroll_loop_completely): Add allow_peel argument and guard peeling. (canonicalize_loop_induction_variables): Likewise. (canonicalize_induction_variables): Pass false. (tree_unroll_loops_completely_1): Pass unroll_outer to disallow peeling from cunrolli. * gcc.dg/vect/pr83202-1.c: New testcase. * gcc.dg/tree-ssa/pr61743-1.c: Adjust. Index: gcc/tree-ssa-loop-ivcanon.c =================================================================== --- gcc/tree-ssa-loop-ivcanon.c (revision 255201) +++ gcc/tree-ssa-loop-ivcanon.c (working copy) @@ -679,7 +679,7 @@ try_unroll_loop_completely (struct loop edge exit, tree niter, enum unroll_level ul, HOST_WIDE_INT maxiter, - location_t locus) + location_t locus, bool allow_peel) { unsigned HOST_WIDE_INT n_unroll = 0; bool n_unroll_found = false; @@ -711,7 +711,8 @@ try_unroll_loop_completely (struct loop exit = NULL; /* See if we can improve our estimate by using recorded loop bounds. */ - if (maxiter >= 0 + if ((allow_peel || maxiter == 0 || ul == UL_NO_GROWTH) + && maxiter >= 0 && (!n_unroll_found || (unsigned HOST_WIDE_INT)maxiter < n_unroll)) { n_unroll = maxiter; @@ -1139,7 +1140,7 @@ try_peel_loop (struct loop *loop, static bool canonicalize_loop_induction_variables (struct loop *loop, bool create_iv, enum unroll_level ul, - bool try_eval) + bool try_eval, bool allow_peel) { edge exit = NULL; tree niter; @@ -1207,7 +1208,8 @@ canonicalize_loop_induction_variables (s populates the loop bounds. 
*/ modified |= remove_redundant_iv_tests (loop); - if (try_unroll_loop_completely (loop, exit, niter, ul, maxiter, locus)) + if (try_unroll_loop_completely (loop, exit, niter, ul, maxiter, locus, + allow_peel)) return true; if (create_iv @@ -1238,7 +1240,7 @@ canonicalize_induction_variables (void) { changed |= canonicalize_loop_induction_variables (loop, true, UL_SINGLE_ITER, - true); + true, false); } gcc_assert (!need_ssa_update_p (cfun)); @@ -1353,7 +1355,7 @@ tree_unroll_loops_completely_1 (bool may ul = UL_NO_GROWTH; if (canonicalize_loop_induction_variables - (loop, false, ul, !flag_tree_loop_ivcanon)) + (loop, false, ul, !flag_tree_loop_ivcanon, unroll_outer)) { /* If we'll continue unrolling, we need to propagate constants within the new basic blocks to fold away induction variable Index: gcc/testsuite/gcc.dg/vect/pr83202-1.c =================================================================== --- gcc/testsuite/gcc.dg/vect/pr83202-1.c (nonexistent) +++ gcc/testsuite/gcc.dg/vect/pr83202-1.c (working copy) @@ -0,0 +1,19 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target vect_double } */ + +void test(double data[8][8]) +{ + for (int i = 0; i < 8; i++) + { + for (int j = 0; j < i; j+=4) + { + data[i][j] *= data[i][j]; + data[i][j+1] *= data[i][j+1]; + data[i][j+2] *= data[i][j+2]; + data[i][j+3] *= data[i][j+3]; + } + } +} + +/* { dg-final { scan-tree-dump "Loop contains only SLP stmts" "vect" } } */ +/* { dg-final { scan-tree-dump "ectorized 1 loops" "vect" } } */ Index: gcc/testsuite/gcc.dg/tree-ssa/pr61743-1.c =================================================================== --- gcc/testsuite/gcc.dg/tree-ssa/pr61743-1.c (revision 255201) +++ gcc/testsuite/gcc.dg/tree-ssa/pr61743-1.c (working copy) @@ -48,5 +48,6 @@ int foo1 (e_u8 a[4][N], int b1, int b2, return 0; } -/* { dg-final { scan-tree-dump-times "loop with 3 iterations completely unrolled" 8 "cunroll" } } */ -/* { dg-final { scan-tree-dump-times "loop with 8 iterations completely 
unrolled" 2 "cunrolli" } } */ +/* { dg-final { scan-tree-dump-times "loop with 3 iterations completely unrolled" 2 "cunroll" } } */ +/* { dg-final { scan-tree-dump-times "loop with 7 iterations completely unrolled" 2 "cunroll" } } */ +/* { dg-final { scan-tree-dump-not "completely unrolled" "cunrolli" } } */