Re: [PATCH] unroll: Avoid unnecessary tail loops for constant niters

Richard Biener via Gcc-patches Tue, 20 Jul 2021 11:01:28 -0700

On July 20, 2021 5:31:17 PM GMT+02:00, Richard Sandiford via Gcc-patches 
<gcc-patches@gcc.gnu.org> wrote:
>unroll and jam can decide to unroll the outer loop of a nest like:
>
>  for (int j = 0; j < n; ++j)
>    for (int i = 0; i < n; ++i)
>      x[i] += __builtin_expf (y[j][i]);
>
>It then uses a tail loop to handle any left-over iterations.
>
>However, the code is structured so that this tail loop is always used.
>If n is a multiple of the unroll factor UF, the final UF iterations
>will
>use the tail loop rather than the unrolled loop.
>
>“Fixing” that for variable loop counts would mean introducing another
>runtime test: a branch around the tail loop if there are no more
>iterations.  There's at least an argument that the overhead of doing
>that test might not pay for itself.
>
>But we use this structure even if the iteration count is provably
>a multiple of UF at compile time.  E.g. with s/n/100/ and an
>unroll factor of 2, the first 98 iterations use the unrolled loop
>and the final 2 iterations use the original loop.
>
>This patch makes the unroller avoid a tail loop in that case.
>The end result seemed easier to follow if variables were declared
>at the point of initialisation, so that it's more obvious which
>ones are meaningful even when there's no tail loop.
>
>Tested on aarch64-linux-gnu so far, will test on x86_64-linux-gnu too.
>OK to install if testing passes?


Ok. 

Richard. 

>Richard
>
>
>gcc/
>       * tree-ssa-loop-manip.c (determine_exit_conditions): Return a null
>       exit condition if no tail loop is needed, and if the original exit
>       condition should therefore be kept as-is.
>       (tree_transform_and_unroll_loop): Handle that case here too.
>
>gcc/testsuite/
>       * gcc.dg/unroll-9.c: New test/
>---
> gcc/testsuite/gcc.dg/unroll-9.c |  12 ++
> gcc/tree-ssa-loop-manip.c       | 306 +++++++++++++++++---------------
> 2 files changed, 176 insertions(+), 142 deletions(-)
> create mode 100644 gcc/testsuite/gcc.dg/unroll-9.c
>
>diff --git a/gcc/tree-ssa-loop-manip.c b/gcc/tree-ssa-loop-manip.c
>index 28ae1316fa0..41f9872ca10 100644
>--- a/gcc/tree-ssa-loop-manip.c
>+++ b/gcc/tree-ssa-loop-manip.c
>@@ -997,8 +997,10 @@ can_unroll_loop_p (class loop *loop, unsigned
>factor,
>/* Determines the conditions that control execution of LOOP unrolled
>FACTOR
>    times.  DESC is number of iterations of LOOP.  ENTER_COND is set to
>    condition that must be true if the main loop can be entered.
>+   If the loop does not always iterate an exact multiple of FACTOR
>times,
>EXIT_BASE, EXIT_STEP, EXIT_CMP and EXIT_BOUND are set to values
>describing
>-   how the exit from the unrolled loop should be controlled.  */
>+   how the exit from the unrolled loop should be controlled. 
>Otherwise,
>+   the trees are set to null and EXIT_CMP is set to ERROR_MARK.  */
> 
> static void
>determine_exit_conditions (class loop *loop, class tree_niter_desc
>*desc,
>@@ -1079,6 +1081,16 @@ determine_exit_conditions (class loop *loop,
>class tree_niter_desc *desc,
>   assum = fold_build2 (cmp, boolean_type_node, base, bound);
>   cond = fold_build2 (TRUTH_AND_EXPR, boolean_type_node, assum, cond);
> 
>+  if (integer_nonzerop (cond)
>+      && integer_zerop (desc->may_be_zero))
>+    {
>+      /* Convert the latch count to an iteration count.  */
>+      tree niter = fold_build2 (PLUS_EXPR, type, desc->niter,
>+                              build_one_cst (type));
>+      if (multiple_of_p (type, niter, bigstep))
>+      return;
>+    }
>+
>cond = force_gimple_operand (unshare_expr (cond), &stmts, false,
>NULL_TREE);
>   if (stmts)
>  gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
>@@ -1234,137 +1246,138 @@ tree_transform_and_unroll_loop (class loop
>*loop, unsigned factor,
>                               transform_callback transform,
>                               void *data)
> {
>-  gcond *exit_if;
>-  tree ctr_before, ctr_after;
>-  tree enter_main_cond, exit_base, exit_step, exit_bound;
>-  enum tree_code exit_cmp;
>-  gphi *phi_old_loop, *phi_new_loop, *phi_rest;
>-  gphi_iterator psi_old_loop, psi_new_loop;
>-  tree init, next, new_init;
>-  class loop *new_loop;
>-  basic_block rest, exit_bb;
>-  edge old_entry, new_entry, old_latch, precond_edge, new_exit;
>-  edge new_nonexit, e;
>-  gimple_stmt_iterator bsi;
>-  use_operand_p op;
>-  bool ok;
>-  unsigned i;
>-  profile_probability prob, prob_entry, scale_unrolled;
>-  profile_count freq_e, freq_h;
>   gcov_type new_est_niter = niter_for_unrolled_loop (loop, factor);
>unsigned irr = loop_preheader_edge (loop)->flags &
>EDGE_IRREDUCIBLE_LOOP;
>-  auto_vec<edge> to_remove;
> 
>+  enum tree_code exit_cmp;
>+  tree enter_main_cond, exit_base, exit_step, exit_bound;
>   determine_exit_conditions (loop, desc, factor,
>                            &enter_main_cond, &exit_base, &exit_step,
>                            &exit_cmp, &exit_bound);
>+  bool single_loop_p = !exit_base;
> 
>/* Let us assume that the unrolled loop is quite likely to be entered. 
>*/
>+  profile_probability prob_entry;
>   if (integer_nonzerop (enter_main_cond))
>     prob_entry = profile_probability::always ();
>   else
>     prob_entry = profile_probability::guessed_always ()
>                       .apply_scale (PROB_UNROLLED_LOOP_ENTERED, 100);
> 
>-  /* The values for scales should keep profile consistent, and
>somewhat close
>-     to correct.
>-
>-     TODO: The current value of SCALE_REST makes it appear that the
>loop that
>-     is created by splitting the remaining iterations of the unrolled
>loop is
>-     executed the same number of times as the original loop, and with
>the same
>-     frequencies, which is obviously wrong.  This does not appear to
>cause
>-     problems, so we do not bother with fixing it for now.  To make
>the profile
>-     correct, we would need to change the probability of the exit edge
>of the
>-     loop, and recompute the distribution of frequencies in its body
>because
>-     of this change (scale the frequencies of blocks before and after
>the exit
>-     by appropriate factors).  */
>-  scale_unrolled = prob_entry;
>-
>-  new_loop = loop_version (loop, enter_main_cond, NULL, prob_entry,
>-                         prob_entry.invert (), scale_unrolled,
>-                         profile_probability::guessed_always (),
>-                         true);
>-  gcc_assert (new_loop != NULL);
>-  update_ssa (TODO_update_ssa);
>-
>-  /* Prepare the cfg and update the phi nodes.  Move the loop exit to
>the
>-     loop latch (and make its condition dummy, for the moment).  */
>-  rest = loop_preheader_edge (new_loop)->src;
>-  precond_edge = single_pred_edge (rest);
>-  split_edge (loop_latch_edge (loop));
>-  exit_bb = single_pred (loop->latch);
>-
>-  /* Since the exit edge will be removed, the frequency of all the
>blocks
>-     in the loop that are dominated by it must be scaled by
>-     1 / (1 - exit->probability).  */
>-  if (exit->probability.initialized_p ())
>-    scale_dominated_blocks_in_loop (loop, exit->src,
>-                                  /* We are scaling up here so probability
>-                                     does not fit.  */
>-                                  loop->header->count,
>-                                  loop->header->count
>-                                  - loop->header->count.apply_probability
>-                                       (exit->probability));
>-
>-  bsi = gsi_last_bb (exit_bb);
>-  exit_if = gimple_build_cond (EQ_EXPR, integer_zero_node,
>-                             integer_zero_node,
>-                             NULL_TREE, NULL_TREE);
>-
>-  gsi_insert_after (&bsi, exit_if, GSI_NEW_STMT);
>-  new_exit = make_edge (exit_bb, rest, EDGE_FALSE_VALUE | irr);
>-  rescan_loop_exit (new_exit, true, false);
>-
>-  /* Set the probability of new exit to the same of the old one.  Fix
>-     the frequency of the latch block, by scaling it back by
>-     1 - exit->probability.  */
>-  new_exit->probability = exit->probability;
>-  new_nonexit = single_pred_edge (loop->latch);
>-  new_nonexit->probability = exit->probability.invert ();
>-  new_nonexit->flags = EDGE_TRUE_VALUE;
>-  if (new_nonexit->probability.initialized_p ())
>-    scale_bbs_frequencies (&loop->latch, 1, new_nonexit->probability);
>-
>-  old_entry = loop_preheader_edge (loop);
>-  new_entry = loop_preheader_edge (new_loop);
>-  old_latch = loop_latch_edge (loop);
>-  for (psi_old_loop = gsi_start_phis (loop->header),
>-       psi_new_loop = gsi_start_phis (new_loop->header);
>-       !gsi_end_p (psi_old_loop);
>-       gsi_next (&psi_old_loop), gsi_next (&psi_new_loop))
>+  gcond *exit_if = nullptr;
>+  class loop *new_loop = nullptr;
>+  basic_block rest;
>+  edge new_exit;
>+  if (!single_loop_p)
>     {
>-      phi_old_loop = psi_old_loop.phi ();
>-      phi_new_loop = psi_new_loop.phi ();
>-
>-      init = PHI_ARG_DEF_FROM_EDGE (phi_old_loop, old_entry);
>-      op = PHI_ARG_DEF_PTR_FROM_EDGE (phi_new_loop, new_entry);
>-      gcc_assert (operand_equal_for_phi_arg_p (init, USE_FROM_PTR
>(op)));
>-      next = PHI_ARG_DEF_FROM_EDGE (phi_old_loop, old_latch);
>-
>-      /* Prefer using original variable as a base for the new ssa
>name.
>-       This is necessary for virtual ops, and useful in order to avoid
>-       losing debug info for real ops.  */
>-      if (TREE_CODE (next) == SSA_NAME
>-        && useless_type_conversion_p (TREE_TYPE (next),
>-                                      TREE_TYPE (init)))
>-      new_init = copy_ssa_name (next);
>-      else if (TREE_CODE (init) == SSA_NAME
>-             && useless_type_conversion_p (TREE_TYPE (init),
>-                                           TREE_TYPE (next)))
>-      new_init = copy_ssa_name (init);
>-      else if (useless_type_conversion_p (TREE_TYPE (next), TREE_TYPE
>(init)))
>-      new_init = make_temp_ssa_name (TREE_TYPE (next), NULL, "unrinittmp");
>-      else
>-      new_init = make_temp_ssa_name (TREE_TYPE (init), NULL, "unrinittmp");
>+      /* The values for scales should keep profile consistent, and
>somewhat
>+       close to correct.
>+
>+       TODO: The current value of SCALE_REST makes it appear that the loop
>+       that is created by splitting the remaining iterations of the
>unrolled
>+       loop is executed the same number of times as the original loop, and
>+       with the same frequencies, which is obviously wrong.  This does not
>+       appear to cause problems, so we do not bother with fixing it for
>now.
>+       To make the profile correct, we would need to change the probability
>+       of the exit edge of the loop, and recompute the distribution of
>+       frequencies in its body because of this change (scale the
>frequencies
>+       of blocks before and after the exit by appropriate factors).  */
>+      profile_probability scale_unrolled = prob_entry;
>+      new_loop = loop_version (loop, enter_main_cond, NULL,
>prob_entry,
>+                             prob_entry.invert (), scale_unrolled,
>+                             profile_probability::guessed_always (),
>+                             true);
>+      gcc_assert (new_loop != NULL);
>+      update_ssa (TODO_update_ssa);
>+
>+      /* Prepare the cfg and update the phi nodes.  Move the loop exit
>to the
>+       loop latch (and make its condition dummy, for the moment).  */
>+      rest = loop_preheader_edge (new_loop)->src;
>+      edge precond_edge = single_pred_edge (rest);
>+      split_edge (loop_latch_edge (loop));
>+      basic_block exit_bb = single_pred (loop->latch);
>+
>+      /* Since the exit edge will be removed, the frequency of all the
>blocks
>+       in the loop that are dominated by it must be scaled by
>+       1 / (1 - exit->probability).  */
>+      if (exit->probability.initialized_p ())
>+      scale_dominated_blocks_in_loop (loop, exit->src,
>+                                      /* We are scaling up here so
>+                                         probability does not fit.  */
>+                                      loop->header->count,
>+                                      loop->header->count
>+                                      - loop->header->count.apply_probability
>+                                          (exit->probability));
>+
>+      gimple_stmt_iterator bsi = gsi_last_bb (exit_bb);
>+      exit_if = gimple_build_cond (EQ_EXPR, integer_zero_node,
>+                                 integer_zero_node,
>+                                 NULL_TREE, NULL_TREE);
>+
>+      gsi_insert_after (&bsi, exit_if, GSI_NEW_STMT);
>+      new_exit = make_edge (exit_bb, rest, EDGE_FALSE_VALUE | irr);
>+      rescan_loop_exit (new_exit, true, false);
>+
>+      /* Set the probability of new exit to the same of the old one. 
>Fix
>+       the frequency of the latch block, by scaling it back by
>+       1 - exit->probability.  */
>+      new_exit->probability = exit->probability;
>+      edge new_nonexit = single_pred_edge (loop->latch);
>+      new_nonexit->probability = exit->probability.invert ();
>+      new_nonexit->flags = EDGE_TRUE_VALUE;
>+      if (new_nonexit->probability.initialized_p ())
>+      scale_bbs_frequencies (&loop->latch, 1, new_nonexit->probability);
>+
>+      edge old_entry = loop_preheader_edge (loop);
>+      edge new_entry = loop_preheader_edge (new_loop);
>+      edge old_latch = loop_latch_edge (loop);
>+      for (gphi_iterator psi_old_loop = gsi_start_phis (loop->header),
>+           psi_new_loop = gsi_start_phis (new_loop->header);
>+         !gsi_end_p (psi_old_loop);
>+         gsi_next (&psi_old_loop), gsi_next (&psi_new_loop))
>+      {
>+        gphi *phi_old_loop = psi_old_loop.phi ();
>+        gphi *phi_new_loop = psi_new_loop.phi ();
>+
>+        tree init = PHI_ARG_DEF_FROM_EDGE (phi_old_loop, old_entry);
>+        use_operand_p op
>+          = PHI_ARG_DEF_PTR_FROM_EDGE (phi_new_loop, new_entry);
>+        gcc_assert (operand_equal_for_phi_arg_p (init, USE_FROM_PTR (op)));
>+        tree next = PHI_ARG_DEF_FROM_EDGE (phi_old_loop, old_latch);
>+
>+        /* Prefer using original variable as a base for the new ssa name.
>+           This is necessary for virtual ops, and useful in order to avoid
>+           losing debug info for real ops.  */
>+        tree new_init;
>+        if (TREE_CODE (next) == SSA_NAME
>+            && useless_type_conversion_p (TREE_TYPE (next),
>+                                          TREE_TYPE (init)))
>+          new_init = copy_ssa_name (next);
>+        else if (TREE_CODE (init) == SSA_NAME
>+                 && useless_type_conversion_p (TREE_TYPE (init),
>+                                               TREE_TYPE (next)))
>+          new_init = copy_ssa_name (init);
>+        else if (useless_type_conversion_p (TREE_TYPE (next),
>+                                            TREE_TYPE (init)))
>+          new_init = make_temp_ssa_name (TREE_TYPE (next), NULL,
>+                                         "unrinittmp");
>+        else
>+          new_init = make_temp_ssa_name (TREE_TYPE (init), NULL,
>+                                         "unrinittmp");
> 
>-      phi_rest = create_phi_node (new_init, rest);
>+        gphi *phi_rest = create_phi_node (new_init, rest);
>+        add_phi_arg (phi_rest, init, precond_edge, UNKNOWN_LOCATION);
>+        add_phi_arg (phi_rest, next, new_exit, UNKNOWN_LOCATION);
>+        SET_USE (op, new_init);
>+      }
> 
>-      add_phi_arg (phi_rest, init, precond_edge, UNKNOWN_LOCATION);
>-      add_phi_arg (phi_rest, next, new_exit, UNKNOWN_LOCATION);
>-      SET_USE (op, new_init);
>+      remove_path (exit);
>+    }
>+  else
>+    {
>+      new_exit = exit;
>+      rest = exit->dest;
>     }
>-
>-  remove_path (exit);
> 
>   /* Transform the loop.  */
>   if (transform)
>@@ -1376,57 +1389,66 @@ tree_transform_and_unroll_loop (class loop
>*loop, unsigned factor,
>   bitmap_ones (wont_exit);
>   bitmap_clear_bit (wont_exit, factor - 1);
> 
>-  ok = gimple_duplicate_loop_to_header_edge
>+  auto_vec<edge> to_remove;
>+  bool ok = gimple_duplicate_loop_to_header_edge
>         (loop, loop_latch_edge (loop), factor - 1,
>          wont_exit, new_exit, &to_remove, DLTHE_FLAG_UPDATE_FREQ);
>   gcc_assert (ok);
> 
>-  FOR_EACH_VEC_ELT (to_remove, i, e)
>+  for (edge e : to_remove)
>     {
>       ok = remove_path (e);
>       gcc_assert (ok);
>     }
>   update_ssa (TODO_update_ssa);
> 
>-  /* Ensure that the frequencies in the loop match the new estimated
>-     number of iterations, and change the probability of the new
>-     exit edge.  */
>-
>-  freq_h = loop->header->count;
>-  freq_e = (loop_preheader_edge (loop))->count ();
>-  if (freq_h.nonzero_p ())
>+  if (!single_loop_p)
>     {
>-      /* Avoid dropping loop body profile counter to 0 because of zero
>count
>-       in loop's preheader.  */
>-      if (freq_h.nonzero_p () && !(freq_e == profile_count::zero ()))
>-        freq_e = freq_e.force_nonzero ();
>-      scale_loop_frequencies (loop, freq_e.probability_in (freq_h));
>+      /* Ensure that the frequencies in the loop match the new
>estimated
>+       number of iterations, and change the probability of the new
>+       exit edge.  */
>+
>+      profile_count freq_h = loop->header->count;
>+      profile_count freq_e = (loop_preheader_edge (loop))->count ();
>+      if (freq_h.nonzero_p ())
>+      {
>+        /* Avoid dropping loop body profile counter to 0 because of zero
>+           count in loop's preheader.  */
>+        if (freq_h.nonzero_p () && !(freq_e == profile_count::zero ()))
>+          freq_e = freq_e.force_nonzero ();
>+        scale_loop_frequencies (loop, freq_e.probability_in (freq_h));
>+      }
>     }
> 
>-  exit_bb = single_pred (loop->latch);
>+  basic_block exit_bb = single_pred (loop->latch);
>   new_exit = find_edge (exit_bb, rest);
>   new_exit->probability = profile_probability::always ()
>                               .apply_scale (1, new_est_niter + 1);
> 
>-  rest->count += new_exit->count ();
>+  if (!single_loop_p)
>+    rest->count += new_exit->count ();
> 
>-  new_nonexit = single_pred_edge (loop->latch);
>-  prob = new_nonexit->probability;
>+  edge new_nonexit = single_pred_edge (loop->latch);
>+  profile_probability prob = new_nonexit->probability;
>   new_nonexit->probability = new_exit->probability.invert ();
>   prob = new_nonexit->probability / prob;
>   if (prob.initialized_p ())
>     scale_bbs_frequencies (&loop->latch, 1, prob);
> 
>-  /* Finally create the new counter for number of iterations and add
>the new
>-     exit instruction.  */
>-  bsi = gsi_last_nondebug_bb (exit_bb);
>-  exit_if = as_a <gcond *> (gsi_stmt (bsi));
>-  create_iv (exit_base, exit_step, NULL_TREE, loop,
>-           &bsi, false, &ctr_before, &ctr_after);
>-  gimple_cond_set_code (exit_if, exit_cmp);
>-  gimple_cond_set_lhs (exit_if, ctr_after);
>-  gimple_cond_set_rhs (exit_if, exit_bound);
>-  update_stmt (exit_if);
>+  if (!single_loop_p)
>+    {
>+      /* Finally create the new counter for number of iterations and
>add
>+       the new exit instruction.  */
>+      tree ctr_before, ctr_after;
>+      gimple_stmt_iterator bsi = gsi_last_nondebug_bb (exit_bb);
>+      exit_if = as_a <gcond *> (gsi_stmt (bsi));
>+      create_iv (exit_base, exit_step, NULL_TREE, loop,
>+               &bsi, false, &ctr_before, &ctr_after);
>+      gimple_cond_set_code (exit_if, exit_cmp);
>+      gimple_cond_set_lhs (exit_if, ctr_after);
>+      gimple_cond_set_rhs (exit_if, exit_bound);
>+      update_stmt (exit_if);
>+    }
> 
>   checking_verify_flow_info ();
>   checking_verify_loop_structure ();
>diff --git a/gcc/testsuite/gcc.dg/unroll-9.c
>b/gcc/testsuite/gcc.dg/unroll-9.c
>new file mode 100644
>index 00000000000..2d65ec3691d
>--- /dev/null
>+++ b/gcc/testsuite/gcc.dg/unroll-9.c
>@@ -0,0 +1,12 @@
>+/* { dg-options "-O3 -fdump-tree-unrolljam -fno-math-errno" } */
>+
>+void
>+f (float *restrict x, float y[100][100])
>+{
>+  for (int j = 0; j < 100; ++j)
>+    for (int i = 0; i < 100; ++i)
>+      x[i] += __builtin_expf (y[j][i]);
>+}
>+
>+/* The loop should be unrolled 2 times, without a tail loop.  */
>+/* { dg-final { scan-tree-dump-times "__builtin_expf" 2 "unrolljam" }
>} */

Re: [PATCH] unroll: Avoid unnecessary tail loops for constant niters

Reply via email to