Re: [PATCH] Take branch misprediction effects into account when RTL loop unrolling (issue6099055)

Xinliang David Li Wed, 25 Apr 2012 09:12:26 -0700

I think the general mechanism applies to most of the targets. What is
needed is target specific parameter (branch budget) tuning which can
be done separately -- there exist a way to do that already.


David

On Wed, Apr 25, 2012 at 2:03 AM, Richard Guenther
<richard.guent...@gmail.com> wrote:
> On Tue, Apr 24, 2012 at 11:26 PM, Teresa Johnson <tejohn...@google.com> wrote:
>> This patch adds heuristics to limit unrolling in loops with branches that 
>> may increase
>> branch mispredictions. It affects loops that are not frequently iterated, 
>> and that are
>> nested within a hot region of code that already contains many branch 
>> instructions.
>>
>> Performance tested with both internal benchmarks and with SPEC 2000/2006 on 
>> a variety
>> of Intel systems (Core2, Corei7, SandyBridge) and a couple of different AMD 
>> Opteron systems.
>> This improves performance of an internal search indexing benchmark by close 
>> to 2% on
>> all the tested Intel platforms.  It also consistently improves 445.gobmk 
>> (with FDO feedback
>> where unrolling kicks in) by close to 1% on AMD Opteron. Other performance 
>> effects are
>> neutral.
>>
>> Bootstrapped and tested on x86_64-unknown-linux-gnu.  Is this ok for trunk?
>>
>> Thanks,
>> Teresa
>>
>> 2012-04-24   Teresa Johnson  <tejohn...@google.com>
>>
>>        * loop-unroll.c (loop_has_call): New function.
>>        (loop_has_FP_comp): Ditto.
>>        (compute_weighted_branches): Ditto.
>>        (max_unroll_with_branches): Ditto.
>>        (decide_unroll_constant_iterations): Add heuristic to avoid
>>        increasing branch mispredicts when unrolling.
>>        (decide_unroll_runtime_iterations): Ditto.
>>        * params.def (PARAM_MIN_ITER_UNROLL_WITH_BRANCHES): New param.
>>        (PARAM_UNROLL_OUTER_LOOP_BRANCH_BUDGET): Ditto.
>>
>> Index: loop-unroll.c
>> ===================================================================
>> --- loop-unroll.c       (revision 186783)
>> +++ loop-unroll.c       (working copy)
>> @@ -152,6 +152,180 @@ static void combine_var_copies_in_loop_exit (struc
>>                                             basic_block);
>>  static rtx get_expansion (struct var_to_expand *);
>>
>> +/* Determine whether LOOP contains call.  */
>> +static bool
>> +loop_has_call(struct loop *loop)
>> +{
>> +  basic_block *body, bb;
>> +  unsigned i;
>> +  rtx insn;
>> +
>> +  body = get_loop_body (loop);
>
> You repeatedly do this and walk over all blocks.  Please think about
> compile-time
> issues when writing code.
>
> This all looks sort-of target specific to me and I don't see why this
> very specialized
> patch is a good idea when unrolling does a very poor job deciding what and how
> much to unroll generally.
>
> Richard.
>
>> +  for (i = 0; i < loop->num_nodes; i++)
>> +    {
>> +      bb = body[i];
>> +
>> +      FOR_BB_INSNS (bb, insn)
>> +        {
>> +          if (CALL_P (insn))
>> +            {
>> +              free (body);
>> +              return true;
>> +            }
>> +        }
>> +    }
>> +  free (body);
>> +  return false;
>> +}
>> +
>> +/* Determine whether LOOP contains floating-point computation.  */
>> +static bool
>> +loop_has_FP_comp(struct loop *loop)
>> +{
>> +  rtx set, dest;
>> +  basic_block *body, bb;
>> +  unsigned i;
>> +  rtx insn;
>> +
>> +  body = get_loop_body (loop);
>> +  for (i = 0; i < loop->num_nodes; i++)
>> +    {
>> +      bb = body[i];
>> +
>> +      FOR_BB_INSNS (bb, insn)
>> +        {
>> +          set = single_set (insn);
>> +          if (!set)
>> +            continue;
>> +
>> +          dest = SET_DEST (set);
>> +          if (FLOAT_MODE_P (GET_MODE (dest)))
>> +            {
>> +              free (body);
>> +              return true;
>> +            }
>> +        }
>> +    }
>> +  free (body);
>> +  return false;
>> +}
>> +
>> +/* Compute the number of branches in LOOP, weighted by execution counts.  */
>> +static float
>> +compute_weighted_branches(struct loop *loop)
>> +{
>> +  int header_count = loop->header->count;
>> +  unsigned i;
>> +  float n;
>> +  basic_block * body;
>> +
>> +  /* If no profile feedback data exists, don't limit unrolling  */
>> +  if (header_count == 0)
>> +    return 0.0;
>> +
>> +  gcc_assert (loop->latch != EXIT_BLOCK_PTR);
>> +
>> +  body = get_loop_body (loop);
>> +  n = 0.0;
>> +  for (i = 0; i < loop->num_nodes; i++)
>> +    {
>> +      if (EDGE_COUNT (body[i]->succs) >= 2)
>> +        {
>> +          /* If this block is executed less frequently than the header (loop
>> +             entry), then it is weighted based on the ratio of times it is
>> +             executed compared to the header.  */
>> +          if (body[i]->count < header_count)
>> +            n += ((float)body[i]->count)/header_count;
>> +
>> +          /* When it is executed more frequently than the header (i.e. it is
>> +             in a nested inner loop), simply weight the branch at 1.0.  */
>> +          else
>> +            n += 1.0;
>> +        }
>> +    }
>> +  free (body);
>> +
>> +  return n;
>> +}
>> +
>> +/* Compute the maximum number of times LOOP can be unrolled without 
>> exceeding
>> +   a branch budget, which can increase branch mispredictions. The number of
>> +   branches is computed by weighting each branch with its expected execution
>> +   probability through the loop based on profile data. If no profile 
>> feedback
>> +   data exists, simply return the current NUNROLL factor.  */
>> +static unsigned
>> +max_unroll_with_branches(struct loop *loop, unsigned nunroll)
>> +{
>> +  struct loop *outer;
>> +  struct niter_desc *outer_desc;
>> +  int outer_niters = 1;
>> +  float weighted_outer_branches = 0.0;
>> +  float weighted_num_branches = compute_weighted_branches (loop);
>> +
>> +  /* If there was no profile feedback data, weighted_num_branches will be 
>> 0.0
>> +     and we won't limit unrolling. If the weighted_num_branches is at most 
>> 1.0,
>> +     also don't limit unrolling as the back-edge branch will not be 
>> duplicated.  */
>> +  if (weighted_num_branches <= 1.0)
>> +    return nunroll;
>> +
>> +  /* Walk up the loop tree until we find a hot outer loop in which the 
>> current
>> +     loop is nested. At that point we will compute the number of times the
>> +     current loop can be unrolled based on the number of branches in the hot
>> +     outer loop.  */
>> +  outer = loop_outer(loop);
>> +  /* The loop structure contains a fake outermost loop, so this should 
>> always
>> +     be non-NULL for our current loop.  */
>> +  gcc_assert (outer);
>> +  /* Detect if this is the fake outermost loop (at which point we are done)
>> +     by checking its outer loop.  */
>> +  while (loop_outer(outer))
>> +    {
>> +      outer_desc = get_simple_loop_desc (outer);
>> +
>> +      if (outer_desc->const_iter)
>> +        outer_niters *= outer_desc->niter;
>> +      else if (outer->header->count)
>> +        outer_niters *= expected_loop_iterations (outer);
>> +
>> +      weighted_outer_branches = compute_weighted_branches (outer);
>> +
>> +      /* Should have been checked by caller.  */
>> +      gcc_assert(PARAM_VALUE (PARAM_MIN_ITER_UNROLL_WITH_BRANCHES) != -1);
>> +
>> +      /* If the outer loop has enough iterations to be considered hot, then
>> +         we can stop our upwards loop tree traversal and examine the current
>> +         outer loop.  */
>> +      if (outer_niters >= PARAM_VALUE (PARAM_MIN_ITER_UNROLL_WITH_BRANCHES))
>> +        {
>> +          /* Assume that any call will cause the branch budget to be 
>> exceeded,
>> +             and that we can't unroll the current loop without increasing
>> +             mispredicts.  */
>> +          if (loop_has_call(outer))
>> +            return 0;
>> +
>> +          /* Otherwise, compute the maximum number of times current loop 
>> can be
>> +             unrolled without exceeding our branch budget. First we subtract
>> +             off the outer loop's weighted branch count from the budget. 
>> Note
>> +             that this includes the branches in the current loop. This 
>> yields
>> +             the number of branches left in the budget for the unrolled 
>> copies.
>> +             We divide this by the number of branches in the current loop 
>> that
>> +             must be duplicated when we unroll, which is the total weighted
>> +             number of branches minus the back-edge branch. This yields the
>> +             number of new loop body copies that can be created by unrolling
>> +             without exceeding the budget, to which we add 1 to get the 
>> unroll
>> +             factor.  */
>> +          return (PARAM_VALUE (PARAM_UNROLL_OUTER_LOOP_BRANCH_BUDGET) -
>> +                  weighted_outer_branches)/(weighted_num_branches - 1) + 1;
>> +        }
>> +      outer = loop_outer(outer);
>> +    }
>> +
>> +  /* The current loop is not enclosed by a hot enough outer loop in this
>> +     procedure, since the hot outer loop is inter-procedural, assume that
>> +     it already contains a significant number of branches, so don't unroll. 
>>  */
>> +  return 0;
>> +}
>> +
>>  /* Unroll and/or peel (depending on FLAGS) LOOPS.  */
>>  void
>>  unroll_and_peel_loops (int flags)
>> @@ -522,6 +696,7 @@ static void
>>  decide_unroll_constant_iterations (struct loop *loop, int flags)
>>  {
>>   unsigned nunroll, nunroll_by_av, best_copies, best_unroll = 0, n_copies, i;
>> +  unsigned nunroll_branches;
>>   struct niter_desc *desc;
>>
>>   if (!(flags & UAP_UNROLL))
>> @@ -565,6 +740,25 @@ decide_unroll_constant_iterations (struct loop *lo
>>       return;
>>     }
>>
>> +  /* Be careful when unrolling loops with branches inside -- it can increase
>> +     the number of mispredicts. Ignore loops with FP computation as these
>> +     tend to benefit much more consistently from unrolling.  */
>> +  if (num_loop_branches (loop) > 1
>> +      && loop_has_FP_comp(loop)
>> +      && PARAM_VALUE (PARAM_MIN_ITER_UNROLL_WITH_BRANCHES) != -1
>> +      && desc->niter < (unsigned) PARAM_VALUE 
>> (PARAM_MIN_ITER_UNROLL_WITH_BRANCHES))
>> +    {
>> +      nunroll_branches = max_unroll_with_branches(loop, nunroll);
>> +      if (nunroll > nunroll_branches)
>> +        nunroll = nunroll_branches;
>> +      if (nunroll <= 1)
>> +        {
>> +          if (dump_file)
>> +           fprintf (dump_file, ";; Not unrolling, contains branches\n");
>> +          return;
>> +        }
>> +    }
>> +
>>   /* Check whether the loop rolls enough to consider.  */
>>   if (desc->niter < 2 * nunroll)
>>     {
>> @@ -802,7 +996,7 @@ unroll_loop_constant_iterations (struct loop *loop
>>  static void
>>  decide_unroll_runtime_iterations (struct loop *loop, int flags)
>>  {
>> -  unsigned nunroll, nunroll_by_av, i;
>> +  unsigned nunroll, nunroll_by_av, nunroll_branches, i;
>>   struct niter_desc *desc;
>>
>>   if (!(flags & UAP_UNROLL))
>> @@ -856,6 +1050,25 @@ decide_unroll_runtime_iterations (struct loop *loo
>>       return;
>>     }
>>
>> +  /* Be careful when unrolling loops with branches inside -- it can increase
>> +     the number of mispredicts. Ignore loops with FP computation as these
>> +     tend to benefit much more consistently from unrolling.  */
>> +  if (num_loop_branches (loop) > 1
>> +      && loop_has_FP_comp(loop)
>> +      && PARAM_VALUE (PARAM_MIN_ITER_UNROLL_WITH_BRANCHES) != -1
>> +      && expected_loop_iterations (loop) < (unsigned) PARAM_VALUE 
>> (PARAM_MIN_ITER_UNROLL_WITH_BRANCHES))
>> +    {
>> +      nunroll_branches = max_unroll_with_branches(loop, nunroll);
>> +      if (nunroll > nunroll_branches)
>> +        nunroll = nunroll_branches;
>> +      if (nunroll <= 1)
>> +        {
>> +          if (dump_file)
>> +            fprintf (dump_file, ";; Not unrolling, contains branches\n");
>> +          return;
>> +        }
>> +    }
>> +
>>   /* If we have profile feedback, check whether the loop rolls.  */
>>   if ((loop->header->count
>>        && expected_loop_iterations (loop) < 2 * nunroll)
>> Index: params.def
>> ===================================================================
>> --- params.def  (revision 186783)
>> +++ params.def  (working copy)
>> @@ -312,6 +312,16 @@ DEFPARAM(PARAM_MAX_UNROLL_ITERATIONS,
>>         "The maximum depth of a loop nest we completely peel",
>>         8, 0, 0)
>>
>> +DEFPARAM(PARAM_MIN_ITER_UNROLL_WITH_BRANCHES,
>> +       "min-iter-unroll-with-branches",
>> +       "Minimum iteration count to ignore branch effects when unrolling",
>> +       50, 0, 0)
>> +
>> +DEFPARAM(PARAM_UNROLL_OUTER_LOOP_BRANCH_BUDGET,
>> +       "unroll-outer-loop-branch-budget",
>> +       "Maximum number of branches allowed in hot outer loop region after 
>> unroll",
>> +       25, 0, 0)
>> +
>>  /* The maximum number of insns of an unswitched loop.  */
>>  DEFPARAM(PARAM_MAX_UNSWITCH_INSNS,
>>        "max-unswitch-insns",
>>
>> --
>> This patch is available for review at http://codereview.appspot.com/6099055

Re: [PATCH] Take branch misprediction effects into account when RTL loop unrolling (issue6099055)

Reply via email to