On Tue, Apr 24, 2012 at 2:26 PM, Teresa Johnson <tejohn...@google.com> wrote: > This patch adds heuristics to limit unrolling in loops with branches that may > increase > branch mispredictions. It affects loops that are not frequently iterated, and > that are > nested within a hot region of code that already contains many branch > instructions. > > Performance tested with both internal benchmarks and with SPEC 2000/2006 on a > variety > of Intel systems (Core2, Corei7, SandyBridge) and a couple of different AMD > Opteron systems. > This improves performance of an internal search indexing benchmark by close > to 2% on > all the tested Intel platforms. It also consistently improves 445.gobmk > (with FDO feedback > where unrolling kicks in) by close to 1% on AMD Opteron. Other performance > effects are > neutral. > > Bootstrapped and tested on x86_64-unknown-linux-gnu. Is this ok for trunk? > > Thanks, > Teresa > > 2012-04-24 Teresa Johnson <tejohn...@google.com> > > * loop-unroll.c (loop_has_call): New function. > (loop_has_FP_comp): Ditto. > (compute_weighted_branches): Ditto. > (max_unroll_with_branches): Ditto. > (decide_unroll_constant_iterations): Add heuristic to avoid > increasing branch mispredicts when unrolling. > (decide_unroll_runtime_iterations): Ditto. > * params.def (PARAM_MIN_ITER_UNROLL_WITH_BRANCHES): New param. > (PARAM_UNROLL_OUTER_LOOP_BRANCH_BUDGET): Ditto. > > Index: loop-unroll.c > =================================================================== > --- loop-unroll.c (revision 186783) > +++ loop-unroll.c (working copy) > @@ -152,6 +152,180 @@ static void combine_var_copies_in_loop_exit (struc > basic_block); > static rtx get_expansion (struct var_to_expand *); > > +/* Determine whether LOOP contains call. */ > +static bool > +loop_has_call(struct loop *loop) > +{ > + basic_block *body, bb; > + unsigned i; > + rtx insn; > + > + body = get_loop_body (loop); > + for (i = 0; i < loop->num_nodes; i++) > + { > + bb = body[i]; > + > + FOR_BB_INSNS (bb, insn) > + { > + if (CALL_P (insn)) > + { > + free (body); > + return true; > + } > + } > + } > + free (body); > + return false; > +} > + > +/* Determine whether LOOP contains floating-point computation. */ > +static bool > +loop_has_FP_comp(struct loop *loop) > +{ > + rtx set, dest; > + basic_block *body, bb; > + unsigned i; > + rtx insn; > + > + body = get_loop_body (loop); > + for (i = 0; i < loop->num_nodes; i++) > + { > + bb = body[i]; > + > + FOR_BB_INSNS (bb, insn) > + { > + set = single_set (insn); > + if (!set) > + continue; > + > + dest = SET_DEST (set); > + if (FLOAT_MODE_P (GET_MODE (dest))) > + { > + free (body); > + return true; > + } > + } > + } > + free (body); > + return false; > +} > + > +/* Compute the number of branches in LOOP, weighted by execution counts. */ > +static float > +compute_weighted_branches(struct loop *loop) > +{ > + int header_count = loop->header->count; > + unsigned i; > + float n; > + basic_block * body; > + > + /* If no profile feedback data exists, don't limit unrolling */ > + if (header_count == 0) > + return 0.0; > + > + gcc_assert (loop->latch != EXIT_BLOCK_PTR); > + > + body = get_loop_body (loop); > + n = 0.0; > + for (i = 0; i < loop->num_nodes; i++) > + { > + if (EDGE_COUNT (body[i]->succs) >= 2) > + { > + /* If this block is executed less frequently than the header (loop > + entry), then it is weighted based on the ratio of times it is > + executed compared to the header. */ > + if (body[i]->count < header_count) > + n += ((float)body[i]->count)/header_count;
Please don't introduce more floating point usage into the compiler since it could change between different hosts (sse vs x87 for an example). Maybe use a fixed point multiply of 1000 (note use a macro for this special value though) like what is used in the rest of the predict code. Thanks, Andrew Pinski > + > + /* When it is executed more frequently than the header (i.e. it is > + in a nested inner loop), simply weight the branch at 1.0. */ > + else > + n += 1.0; > + } > + } > + free (body); > + > + return n; > +} > + > +/* Compute the maximum number of times LOOP can be unrolled without exceeding > + a branch budget, which can increase branch mispredictions. The number of > + branches is computed by weighting each branch with its expected execution > + probability through the loop based on profile data. If no profile feedback > + data exists, simply return the current NUNROLL factor. */ > +static unsigned > +max_unroll_with_branches(struct loop *loop, unsigned nunroll) > +{ > + struct loop *outer; > + struct niter_desc *outer_desc; > + int outer_niters = 1; > + float weighted_outer_branches = 0.0; > + float weighted_num_branches = compute_weighted_branches (loop); > + > + /* If there was no profile feedback data, weighted_num_branches will be 0.0 > + and we won't limit unrolling. If the weighted_num_branches is at most > 1.0, > + also don't limit unrolling as the back-edge branch will not be > duplicated. */ > + if (weighted_num_branches <= 1.0) > + return nunroll; > + > + /* Walk up the loop tree until we find a hot outer loop in which the > current > + loop is nested. At that point we will compute the number of times the > + current loop can be unrolled based on the number of branches in the hot > + outer loop. */ > + outer = loop_outer(loop); > + /* The loop structure contains a fake outermost loop, so this should always > + be non-NULL for our current loop. */ > + gcc_assert (outer); > + /* Detect if this is the fake outermost loop (at which point we are done) > + by checking its outer loop. */ > + while (loop_outer(outer)) > + { > + outer_desc = get_simple_loop_desc (outer); > + > + if (outer_desc->const_iter) > + outer_niters *= outer_desc->niter; > + else if (outer->header->count) > + outer_niters *= expected_loop_iterations (outer); > + > + weighted_outer_branches = compute_weighted_branches (outer); > + > + /* Should have been checked by caller. */ > + gcc_assert(PARAM_VALUE (PARAM_MIN_ITER_UNROLL_WITH_BRANCHES) != -1); > + > + /* If the outer loop has enough iterations to be considered hot, then > + we can stop our upwards loop tree traversal and examine the current > + outer loop. */ > + if (outer_niters >= PARAM_VALUE (PARAM_MIN_ITER_UNROLL_WITH_BRANCHES)) > + { > + /* Assume that any call will cause the branch budget to be > exceeded, > + and that we can't unroll the current loop without increasing > + mispredicts. */ > + if (loop_has_call(outer)) > + return 0; > + > + /* Otherwise, compute the maximum number of times current loop can > be > + unrolled without exceeding our branch budget. First we subtract > + off the outer loop's weighted branch count from the budget. Note > + that this includes the branches in the current loop. This yields > + the number of branches left in the budget for the unrolled > copies. > + We divide this by the number of branches in the current loop > that > + must be duplicated when we unroll, which is the total weighted > + number of branches minus the back-edge branch. This yields the > + number of new loop body copies that can be created by unrolling > + without exceeding the budget, to which we add 1 to get the > unroll > + factor. */ > + return (PARAM_VALUE (PARAM_UNROLL_OUTER_LOOP_BRANCH_BUDGET) - > + weighted_outer_branches)/(weighted_num_branches - 1) + 1; > + } > + outer = loop_outer(outer); > + } > + > + /* The current loop is not enclosed by a hot enough outer loop in this > + procedure, since the hot outer loop is inter-procedural, assume that > + it already contains a significant number of branches, so don't unroll. > */ > + return 0; > +} > + > /* Unroll and/or peel (depending on FLAGS) LOOPS. */ > void > unroll_and_peel_loops (int flags) > @@ -522,6 +696,7 @@ static void > decide_unroll_constant_iterations (struct loop *loop, int flags) > { > unsigned nunroll, nunroll_by_av, best_copies, best_unroll = 0, n_copies, i; > + unsigned nunroll_branches; > struct niter_desc *desc; > > if (!(flags & UAP_UNROLL)) > @@ -565,6 +740,25 @@ decide_unroll_constant_iterations (struct loop *lo > return; > } > > + /* Be careful when unrolling loops with branches inside -- it can increase > + the number of mispredicts. Ignore loops with FP computation as these > + tend to benefit much more consistently from unrolling. */ > + if (num_loop_branches (loop) > 1 > + && loop_has_FP_comp(loop) > + && PARAM_VALUE (PARAM_MIN_ITER_UNROLL_WITH_BRANCHES) != -1 > + && desc->niter < (unsigned) PARAM_VALUE > (PARAM_MIN_ITER_UNROLL_WITH_BRANCHES)) > + { > + nunroll_branches = max_unroll_with_branches(loop, nunroll); > + if (nunroll > nunroll_branches) > + nunroll = nunroll_branches; > + if (nunroll <= 1) > + { > + if (dump_file) > + fprintf (dump_file, ";; Not unrolling, contains branches\n"); > + return; > + } > + } > + > /* Check whether the loop rolls enough to consider. */ > if (desc->niter < 2 * nunroll) > { > @@ -802,7 +996,7 @@ unroll_loop_constant_iterations (struct loop *loop > static void > decide_unroll_runtime_iterations (struct loop *loop, int flags) > { > - unsigned nunroll, nunroll_by_av, i; > + unsigned nunroll, nunroll_by_av, nunroll_branches, i; > struct niter_desc *desc; > > if (!(flags & UAP_UNROLL)) > @@ -856,6 +1050,25 @@ decide_unroll_runtime_iterations (struct loop *loo > return; > } > > + /* Be careful when unrolling loops with branches inside -- it can increase > + the number of mispredicts. Ignore loops with FP computation as these > + tend to benefit much more consistently from unrolling. */ > + if (num_loop_branches (loop) > 1 > + && loop_has_FP_comp(loop) > + && PARAM_VALUE (PARAM_MIN_ITER_UNROLL_WITH_BRANCHES) != -1 > + && expected_loop_iterations (loop) < (unsigned) PARAM_VALUE > (PARAM_MIN_ITER_UNROLL_WITH_BRANCHES)) > + { > + nunroll_branches = max_unroll_with_branches(loop, nunroll); > + if (nunroll > nunroll_branches) > + nunroll = nunroll_branches; > + if (nunroll <= 1) > + { > + if (dump_file) > + fprintf (dump_file, ";; Not unrolling, contains branches\n"); > + return; > + } > + } > + > /* If we have profile feedback, check whether the loop rolls. */ > if ((loop->header->count > && expected_loop_iterations (loop) < 2 * nunroll) > Index: params.def > =================================================================== > --- params.def (revision 186783) > +++ params.def (working copy) > @@ -312,6 +312,16 @@ DEFPARAM(PARAM_MAX_UNROLL_ITERATIONS, > "The maximum depth of a loop nest we completely peel", > 8, 0, 0) > > +DEFPARAM(PARAM_MIN_ITER_UNROLL_WITH_BRANCHES, > + "min-iter-unroll-with-branches", > + "Minimum iteration count to ignore branch effects when unrolling", > + 50, 0, 0) > + > +DEFPARAM(PARAM_UNROLL_OUTER_LOOP_BRANCH_BUDGET, > + "unroll-outer-loop-branch-budget", > + "Maximum number of branches allowed in hot outer loop region after > unroll", > + 25, 0, 0) > + > /* The maximum number of insns of an unswitched loop. */ > DEFPARAM(PARAM_MAX_UNSWITCH_INSNS, > "max-unswitch-insns", > > -- > This patch is available for review at http://codereview.appspot.com/6099055