Hi, this is the version of the patch I intend to commit after re-testing on x86_64-linux with loop peeling enabled at -O3.
It drops -fpeel-all-loops, adds logic to avoid peeling loops multiple times, and fixes profile updating. Bootstrapped/regtested on x86_64-linux. Honza * doc/invoke.texi (-fpeel-loops,-O3): Update documentation. * opts.c (default_options): Enable peel loops at -O3. * tree-ssa-loop-ivcanon.c (peeled_loops): New static var. (try_peel_loop): Do not re-peel already peeled loops; use likely upper bounds; fix profile updating. (pass_complete_unroll::execute): Initialize peeled_loops. * gcc.dg/tree-ssa/peel1.c: New testcase. * gcc.dg/tree-ssa/peel2.c: New testcase. * gcc.dg/tree-ssa/pr61743-1.c: Disable loop peeling. * gcc.dg/tree-ssa/pr61743-2.c: Disable loop peeling. Index: doc/invoke.texi =================================================================== --- doc/invoke.texi (revision 236873) +++ doc/invoke.texi (working copy) @@ -6338,7 +6338,8 @@ by @option{-O2} and also turns on the @o @option{-fgcse-after-reload}, @option{-ftree-loop-vectorize}, @option{-ftree-loop-distribute-patterns}, @option{-fsplit-paths} @option{-ftree-slp-vectorize}, @option{-fvect-cost-model}, -@option{-ftree-partial-pre} and @option{-fipa-cp-clone} options. +@option{-ftree-partial-pre}, @option{-fpeel-loops} +and @option{-fipa-cp-clone} options. @item -O0 @opindex O0 @@ -8661,10 +8662,11 @@ the loop is entered. This usually makes @item -fpeel-loops @opindex fpeel-loops Peels loops for which there is enough information that they do not -roll much (from profile feedback). It also turns on complete loop peeling -(i.e.@: complete removal of loops with small constant number of iterations). +roll much (from profile feedback or static analysis). It also turns on +complete loop peeling (i.e.@: complete removal of loops with small constant +number of iterations). -Enabled with @option{-fprofile-use}. +Enabled with @option{-O3} and/or @option{-fprofile-use}. 
@item -fmove-loop-invariants @opindex fmove-loop-invariants Index: opts.c =================================================================== --- opts.c (revision 236873) +++ opts.c (working copy) @@ -535,6 +535,7 @@ static const struct default_options defa { OPT_LEVELS_3_PLUS, OPT_fvect_cost_model_, NULL, VECT_COST_MODEL_DYNAMIC }, { OPT_LEVELS_3_PLUS, OPT_fipa_cp_clone, NULL, 1 }, { OPT_LEVELS_3_PLUS, OPT_ftree_partial_pre, NULL, 1 }, + { OPT_LEVELS_3_PLUS, OPT_fpeel_loops, NULL, 1 }, /* -Ofast adds optimizations to -O3. */ { OPT_LEVELS_FAST, OPT_ffast_math, NULL, 1 }, Index: testsuite/gcc.dg/tree-ssa/peel1.c =================================================================== --- testsuite/gcc.dg/tree-ssa/peel1.c (revision 0) +++ testsuite/gcc.dg/tree-ssa/peel1.c (working copy) @@ -0,0 +1,11 @@ +/* { dg-do compile } */ +/* { dg-options "-O3 -fdump-tree-cunroll-details" } */ +struct foo {int b; int a[3];} foo; +void add(struct foo *a,int l) +{ + int i; + for (i=0;i<l;i++) + a->a[i]++; +} +/* { dg-final { scan-tree-dump "Loop 1 likely iterates at most 3 times." "cunroll"} } */ +/* { dg-final { scan-tree-dump "Peeled loop 1, 4 times." "cunroll"} } */ Index: testsuite/gcc.dg/tree-ssa/peel2.c =================================================================== --- testsuite/gcc.dg/tree-ssa/peel2.c (revision 0) +++ testsuite/gcc.dg/tree-ssa/peel2.c (working copy) @@ -0,0 +1,9 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -fpeel-all-loops -fdump-tree-cunroll-details --param max-peel-times=16 --param max-peeled-insns=100" } */ +void add(int *a,int l) +{ + int i; + for (i=0;i<l;i++) + a[i]++; +} +/* { dg-final { scan-tree-dump "Peeled loop 1, 16 times." 
"cunroll"} } */ Index: testsuite/gcc.dg/tree-ssa/pr61743-1.c =================================================================== --- testsuite/gcc.dg/tree-ssa/pr61743-1.c (revision 236873) +++ testsuite/gcc.dg/tree-ssa/pr61743-1.c (working copy) @@ -1,5 +1,5 @@ /* { dg-do compile } */ -/* { dg-options "-O3 -funroll-loops -fno-tree-vectorize -fdump-tree-cunroll-details" } */ +/* { dg-options "-O3 -funroll-loops -fno-tree-vectorize -fdump-tree-cunroll-details -fno-peel-loops" } */ #define N 8 #define M 14 Index: testsuite/gcc.dg/tree-ssa/pr61743-2.c =================================================================== --- testsuite/gcc.dg/tree-ssa/pr61743-2.c (revision 236873) +++ testsuite/gcc.dg/tree-ssa/pr61743-2.c (working copy) @@ -1,5 +1,5 @@ /* { dg-do compile } */ -/* { dg-options "-O3 -funroll-loops -fno-tree-vectorize -fdump-tree-cunroll-details" } */ +/* { dg-options "-O3 -funroll-loops -fno-tree-vectorize -fdump-tree-cunroll-details -fno-peel-loops" } */ #define N 8 #define M 14 Index: tree-ssa-loop-ivcanon.c =================================================================== --- tree-ssa-loop-ivcanon.c (revision 236878) +++ tree-ssa-loop-ivcanon.c (working copy) @@ -594,6 +594,8 @@ remove_redundant_iv_tests (struct loop * /* Stores loops that will be unlooped after we process whole loop tree. */ static vec<loop_p> loops_to_unloop; static vec<int> loops_to_unloop_nunroll; +/* Stores loops that has been peeled. */ +static bitmap peeled_loops; /* Cancel all fully unrolled loops by putting __builtin_unreachable on the latch edge. @@ -962,14 +964,16 @@ try_peel_loop (struct loop *loop, vec<edge> to_remove = vNULL; edge e; - /* If the iteration bound is known and large, then we can safely eliminate - the check in peeled copies. 
*/ - if (TREE_CODE (niter) != INTEGER_CST) - exit = NULL; - if (!flag_peel_loops || PARAM_VALUE (PARAM_MAX_PEEL_TIMES) <= 0) return false; + if (bitmap_bit_p (peeled_loops, loop->num)) + { + if (dump_file) + fprintf (dump_file, "Not peeling: loop is already peeled\n"); + return false; + } + /* Peel only innermost loops. While the code is perfectly capable of peeling non-innermost loops, the heuristics would probably need some improvements. */ @@ -990,6 +994,8 @@ try_peel_loop (struct loop *loop, /* Check if there is an estimate on the number of iterations. */ npeel = estimated_loop_iterations_int (loop); if (npeel < 0) + npeel = likely_max_loop_iterations_int (loop); + if (npeel < 0) { if (dump_file) fprintf (dump_file, "Not peeling: number of iterations is not " @@ -1036,8 +1042,7 @@ try_peel_loop (struct loop *loop, && wi::leu_p (npeel, wi::to_widest (niter))) { bitmap_ones (wont_exit); - if (wi::eq_p (wi::to_widest (niter), npeel)) - bitmap_clear_bit (wont_exit, 0); + bitmap_clear_bit (wont_exit, 0); } else { @@ -1074,14 +1079,14 @@ try_peel_loop (struct loop *loop, } if (loop->any_upper_bound) { - if (wi::ltu_p (npeel, loop->nb_iterations_estimate)) + if (wi::ltu_p (npeel, loop->nb_iterations_upper_bound)) loop->nb_iterations_upper_bound -= npeel; else loop->nb_iterations_upper_bound = 0; } if (loop->any_likely_upper_bound) { - if (wi::ltu_p (npeel, loop->nb_iterations_estimate)) + if (wi::ltu_p (npeel, loop->nb_iterations_likely_upper_bound)) loop->nb_iterations_likely_upper_bound -= npeel; else { @@ -1107,6 +1112,7 @@ try_peel_loop (struct loop *loop, else if (loop->header->frequency) scale = RDIV (entry_freq * REG_BR_PROB_BASE, loop->header->frequency); scale_loop_profile (loop, scale, 0); + bitmap_set_bit (peeled_loops, loop->num); return true; } /* Adds a canonical induction variable to LOOP if suitable. 
@@ -1519,9 +1526,20 @@ pass_complete_unroll::execute (function if (number_of_loops (fun) <= 1) return 0; - return tree_unroll_loops_completely (flag_unroll_loops - || flag_peel_loops - || optimize >= 3, true); + /* If we ever decide to run loop peeling more than once, we will need to + track loops already peeled in loop structures themselves to avoid + re-peeling the same loop multiple times. */ + if (flag_peel_loops) + peeled_loops = BITMAP_ALLOC (NULL); + int val = tree_unroll_loops_completely (flag_unroll_loops + || flag_peel_loops + || optimize >= 3, true); + if (peeled_loops) + { + BITMAP_FREE (peeled_loops); + peeled_loops = NULL; + } + return val; } } // anon namespace