On Fri, 7 Jul 2023, Tamar Christina wrote:

> Hi All,
> 
> This patch builds on the previous patch by fixing another issue with the
> way ifcvt currently picks which branches to test.
> 
> The issue with the current implementation is while it sorts for
> occurrences of the argument, it doesn't check for complexity of the arguments.
> 
> As an example:
> 
>   <bb 15> [local count: 528603100]:
>   ...
>   if (distbb_75 >= 0.0)
>     goto <bb 17>; [59.00%]
>   else
>     goto <bb 16>; [41.00%]
> 
>   <bb 16> [local count: 216727269]:
>   ...
>   goto <bb 19>; [100.00%]
> 
>   <bb 17> [local count: 311875831]:
>   ...
>   if (distbb_75 < iftmp.0_98)
>     goto <bb 18>; [20.00%]
>   else
>     goto <bb 19>; [80.00%]
> 
>   <bb 18> [local count: 62375167]:
>   ...
> 
>   <bb 19> [local count: 528603100]:
>   # prephitmp_175 = PHI <_173(18), 0.0(17), _174(16)>
> 
> All tree arguments to the PHI have the same number of occurrences, namely 1,
> however it makes a big difference which comparison we test first.
> 
> Sorting only on occurrences we'll pick the compares coming from BB 18 and BB 
> 17,
> This means we end up generating 4 comparisons, while 2 would have been enough.
> 
> By keeping track of the "complexity" of the COND in each BB, (i.e. the number
> of comparisons needed to traverse from the start [BB 15] to end [BB 19]) and
> using a key tuple of <occurrences, complexity> we end up selecting the compare
> from BB 16 and BB 18 first.  BB 16 only requires 1 compare, and BB 18, after 
> we
> test BB 16 also only requires one additional compare.  This change paired with
> the one previous above results in the optimal 2 compares.
> 
> For deep nesting, i.e. for
> 
> ...
>   _79 = vr_15 > 20;
>   _80 = _68 & _79;
>   _82 = vr_15 <= 20;
>   _83 = _68 & _82;
>   _84 = vr_15 < -20;
>   _85 = _73 & _84;
>   _87 = vr_15 >= -20;
>   _88 = _73 & _87;
>   _ifc__111 = _55 ? 10 : 12;
>   _ifc__112 = _70 ? 7 : _ifc__111;
>   _ifc__113 = _85 ? 8 : _ifc__112;
>   _ifc__114 = _88 ? 9 : _ifc__113;
>   _ifc__115 = _45 ? 1 : _ifc__114;
>   _ifc__116 = _63 ? 3 : _ifc__115;
>   _ifc__117 = _65 ? 4 : _ifc__116;
>   _ifc__118 = _83 ? 6 : _ifc__117;
>   _ifc__119 = _60 ? 2 : _ifc__118;
>   _ifc__120 = _43 ? 13 : _ifc__119;
>   _ifc__121 = _75 ? 11 : _ifc__120;
>   vw_1 = _80 ? 5 : _ifc__121;
> 
> Most of the comparisons are still needed because the chain of
> occurrences to not negate eachother. i.e. _80 is _73 & vr_15 >= -20 and
> _85 is _73 & vr_15 < -20.  clearly given _73 needs to be true in both 
> branches,
> the only additional test needed is on vr_15, where the one test is the 
> negation
> of the other.  So we don't need to do the comparison of _73 twice.
> 
> The changes in the patch reduces the overall number of compares by one, but 
> has
> a bigger effect on the dependency chain.
> 
> Previously we would generate 5 instructions chain:
> 
>       cmple   p7.s, p4/z, z29.s, z30.s
>       cmpne   p7.s, p7/z, z29.s, #0
>       cmple   p6.s, p7/z, z31.s, z30.s
>       cmpge   p6.s, p6/z, z27.s, z25.s
>       cmplt   p15.s, p6/z, z28.s, z21.s
> 
> as the longest chain.  With this patch we generate 3:
> 
>       cmple   p7.s, p3/z, z27.s, z30.s
>       cmpne   p7.s, p7/z, z27.s, #0
>       cmpgt   p7.s, p7/z, z31.s, z30.s
> 
> and I don't think (x <= y) && (x != 0) && (z > y) can be reduced further.
> 
> Bootstrapped and Regtested on aarch64-none-linux-gnu and no issues.
> 
> Not sure how to write a non-fragile testcase for this as the
> conditionals chosen depends on threading etc. Any Suggestions?
> 
> Ok for master?

OK.

Likewise for the testcase - GIMPLE one starting at fix_loops.

> Thanks,
> Tamar
> 
> gcc/ChangeLog:
> 
>       PR tree-optimization/109154
>       * tree-if-conv.cc (INCLUDE_ALGORITHM): Include.
>       (struct bb_predicate): Add no_predicate_stmts.
>       (set_bb_predicate): Increase predicate count.
>       (set_bb_predicate_gimplified_stmts): Conditionally initialize
>       no_predicate_stmts.
>       (get_bb_num_predicate_stmts): New.
>       (init_bb_predicate): Initialzie no_predicate_stmts.
>       (release_bb_predicate): Cleanup no_predicate_stmts.
>       (insert_gimplified_predicates): Preserve no_predicate_stmts.
> 
> --- inline copy of patch -- 
> diff --git a/gcc/tree-if-conv.cc b/gcc/tree-if-conv.cc
> index 
> 16b36dd8b0226f796c1a3fc6d45a9059385e812b..0ed50d99c46f99a4d1ea0e827ee2b2a3f494b2da
>  100644
> --- a/gcc/tree-if-conv.cc
> +++ b/gcc/tree-if-conv.cc
> @@ -80,6 +80,7 @@ along with GCC; see the file COPYING3.  If not see
>       <L18>:;
>  */
>  
> +#define INCLUDE_ALGORITHM
>  #include "config.h"
>  #include "system.h"
>  #include "coretypes.h"
> @@ -231,6 +232,10 @@ struct bb_predicate {
>       recorded here, in order to avoid the duplication of computations
>       that occur in previous conditions.  See PR44483.  */
>    gimple_seq predicate_gimplified_stmts;
> +
> +  /* Records the number of statements recorded into
> +     PREDICATE_GIMPLIFIED_STMTS.   */
> +  unsigned no_predicate_stmts;
>  };
>  
>  /* Returns true when the basic block BB has a predicate.  */
> @@ -254,10 +259,16 @@ bb_predicate (basic_block bb)
>  static inline void
>  set_bb_predicate (basic_block bb, tree cond)
>  {
> +  auto aux = (struct bb_predicate *) bb->aux;
>    gcc_assert ((TREE_CODE (cond) == TRUTH_NOT_EXPR
>              && is_gimple_val (TREE_OPERAND (cond, 0)))
>             || is_gimple_val (cond));
> -  ((struct bb_predicate *) bb->aux)->predicate = cond;
> +  aux->predicate = cond;
> +  aux->no_predicate_stmts++;
> +
> +  if (dump_file && (dump_flags & TDF_DETAILS))
> +    fprintf (dump_file, "Recording block %d value %d\n", bb->index,
> +          aux->no_predicate_stmts);
>  }
>  
>  /* Returns the sequence of statements of the gimplification of the
> @@ -270,12 +281,16 @@ bb_predicate_gimplified_stmts (basic_block bb)
>  }
>  
>  /* Sets the sequence of statements STMTS of the gimplification of the
> -   predicate for basic block BB.  */
> +   predicate for basic block BB.  If PRESERVE_COUNTS then don't clear the 
> predicate
> +   counts.  */
>  
>  static inline void
> -set_bb_predicate_gimplified_stmts (basic_block bb, gimple_seq stmts)
> +set_bb_predicate_gimplified_stmts (basic_block bb, gimple_seq stmts,
> +                                bool preserve_counts)
>  {
>    ((struct bb_predicate *) bb->aux)->predicate_gimplified_stmts = stmts;
> +  if (stmts == NULL && !preserve_counts)
> +    ((struct bb_predicate *) bb->aux)->no_predicate_stmts = 0;
>  }
>  
>  /* Adds the sequence of statements STMTS to the sequence of statements
> @@ -296,18 +311,28 @@ add_bb_predicate_gimplified_stmts (basic_block bb, 
> gimple_seq stmts)
>        gimple *stmt = gsi_stmt (gsi);
>        delink_stmt_imm_use (stmt);
>        gimple_set_modified (stmt, true);
> +      ((struct bb_predicate *) bb->aux)->no_predicate_stmts++;
>      }
>    gimple_seq_add_seq_without_update
>      (&(((struct bb_predicate *) bb->aux)->predicate_gimplified_stmts), 
> stmts);
>  }
>  
> +/* Return the number of statements the predicate of the basic block consists
> +   of.  */
> +
> +static inline unsigned
> +get_bb_num_predicate_stmts (basic_block bb)
> +{
> +  return ((struct bb_predicate *) bb->aux)->no_predicate_stmts;
> +}
> +
>  /* Initializes to TRUE the predicate of basic block BB.  */
>  
>  static inline void
>  init_bb_predicate (basic_block bb)
>  {
>    bb->aux = XNEW (struct bb_predicate);
> -  set_bb_predicate_gimplified_stmts (bb, NULL);
> +  set_bb_predicate_gimplified_stmts (bb, NULL, false);
>    set_bb_predicate (bb, boolean_true_node);
>  }
>  
> @@ -327,7 +352,7 @@ release_bb_predicate (basic_block bb)
>  
>        /* Discard them.  */
>        gimple_seq_discard (stmts);
> -      set_bb_predicate_gimplified_stmts (bb, NULL);
> +      set_bb_predicate_gimplified_stmts (bb, NULL, false);
>      }
>  }
>  
> @@ -2041,7 +2066,6 @@ predicate_scalar_phi (gphi *phi, gimple_stmt_iterator 
> *gsi)
>    tree rhs, res, arg0, arg1, op0, op1, scev;
>    tree cond;
>    unsigned int index0;
> -  unsigned int max, args_len;
>    edge e;
>    basic_block bb;
>    unsigned int i;
> @@ -2145,7 +2169,6 @@ predicate_scalar_phi (gphi *phi, gimple_stmt_iterator 
> *gsi)
>    bool swap = false;
>    hash_map<tree_operand_hash, auto_vec<int> > phi_arg_map;
>    unsigned int num_args = gimple_phi_num_args (phi);
> -  int max_ind = -1;
>    /* Vector of different PHI argument values.  */
>    auto_vec<tree> args (num_args);
>  
> @@ -2160,28 +2183,38 @@ predicate_scalar_phi (gphi *phi, gimple_stmt_iterator 
> *gsi)
>        phi_arg_map.get_or_insert (arg).safe_push (i);
>      }
>  
> -  /* Determine element with max number of occurrences.  */
> -  max_ind = -1;
> -  max = 1;
> -  args_len = args.length ();
> -  for (i = 0; i < args_len; i++)
> +  /* Determine element with max number of occurrences and complexity.  
> Looking at only
> +     number of occurrences as a measure for complexity isn't enough as all 
> usages can
> +     be unique but the comparisons to reach the PHI node differ per branch.  
> */
> +  typedef std::pair <tree, std::pair <unsigned, unsigned>> ArgEntry;
> +  auto_vec<ArgEntry> argsKV;
> +  for (i = 0; i < args.length (); i++)
>      {
> -      unsigned int len;
> -      if ((len = phi_arg_map.get (args[i])->length ()) > max)
> +      unsigned int len = 0;
> +      for (int index : phi_arg_map.get (args[i]))
>       {
> -       max_ind = (int) i;
> -       max = len;
> +       edge e = gimple_phi_arg_edge (phi, index);
> +       len += get_bb_num_predicate_stmts (e->src);
>       }
> +
> +      unsigned occur = phi_arg_map.get (args[i])->length ();
> +      if (dump_file && (dump_flags & TDF_DETAILS))
> +     fprintf (dump_file, "Ranking %d as len=%d, idx=%d\n", i, len, occur);
> +      argsKV.safe_push ({ args[i], { len, occur }});
>      }
>  
> -  /* Put element with max number of occurences to the end of ARGS.  */
> -  if (max_ind != -1 && max_ind + 1 != (int) args_len)
> -    std::swap (args[args_len - 1], args[max_ind]);
> +  /* Sort elements based on rankings ARGS.  */
> +  std::sort(argsKV.begin(), argsKV.end(), [](ArgEntry &left, ArgEntry 
> &right) {
> +    return left.second < right.second;
> +  });
> +
> +  for (i = 0; i < args.length (); i++)
> +    args[i] = argsKV[i].first;
>  
>    /* Handle one special case when number of arguments with different values
>       is equal 2 and one argument has the only occurrence.  Such PHI can be
>       handled as if would have only 2 arguments.  */
> -  if (args_len == 2 && phi_arg_map.get (args[0])->length () == 1)
> +  if (args.length () == 2 && phi_arg_map.get (args[0])->length () == 1)
>      {
>        vec<int> *indexes;
>        indexes = phi_arg_map.get (args[0]);
> @@ -2317,7 +2350,7 @@ insert_gimplified_predicates (loop_p loop)
>           }
>  
>         /* Once the sequence is code generated, set it to NULL.  */
> -       set_bb_predicate_gimplified_stmts (bb, NULL);
> +       set_bb_predicate_gimplified_stmts (bb, NULL, true);
>       }
>      }
>  }
> 
> 
> 
> 
> 

-- 
Richard Biener <rguent...@suse.de>
SUSE Software Solutions Germany GmbH, Frankenstrasse 146, 90461 Nuernberg,
Germany; GF: Ivo Totev, Andrew Myers, Andrew McDonald, Boudien Moerman;
HRB 36809 (AG Nuernberg)

Reply via email to