On Wed, May 26, 2021 at 8:41 PM Richard Biener
<richard.guent...@gmail.com> wrote:
>
> On Wed, May 26, 2021 at 7:06 AM Hongtao Liu <crazy...@gmail.com> wrote:
> >
> > On Tue, May 25, 2021 at 6:24 PM Richard Biener
> > <richard.guent...@gmail.com> wrote:
> > >
> > > On Mon, May 24, 2021 at 11:52 AM Hongtao Liu <crazy...@gmail.com> wrote:
> > > >
> > > > Hi:
> > > >   Details described in PR.
> > > >   Bootstrapped and regtest on
> > > > x86_64-linux-gnu{-m32,}/x86_64-linux-gnu{-m32\
> > > > -march=cascadelake,-march=cascadelake}
> > > >   Ok for trunk?
> > >
> > > +static tree
> > > +strip_nop_cond_scalar_reduction (bool has_nop, tree op)
> > > +{
> > > +  if (!has_nop)
> > > +    return op;
> > > +
> > > +  if (TREE_CODE (op) != SSA_NAME)
> > > +    return NULL_TREE;
> > > +
> > > +  gimple* stmt = SSA_NAME_DEF_STMT (op);
> > > +  if (!stmt
> > > +      || gimple_code (stmt) != GIMPLE_ASSIGN
> > > +      || gimple_has_volatile_ops (stmt)
> > > +      || gimple_assign_rhs_code (stmt) != NOP_EXPR)
> > > +    return NULL_TREE;
> > > +
> > > +  return gimple_assign_rhs1 (stmt);
> > >
> > > this allows arbitrary conversions where the comment suggests you
> > > only want to allow conversions to the same precision but different sign.
> > > Sth like
> > >
> > >   gassign *stmt = safe_dyn_cast <gassign *> (SSA_NAME_DEF_STMT (op));
> > >   if (!stmt
> > >       || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt))
> > >       || !tree_nop_conversion_p (TREE_TYPE (op), TREE_TYPE
> > > (gimple_assign_rhs1 (stmt))))
> > >     return NULL_TREE;
> > >
Changed.
> > > +      if (gimple_bb (stmt) != gimple_bb (*nop_reduc)
> > > +         || gimple_code (stmt) != GIMPLE_ASSIGN
> > > +         || gimple_has_volatile_ops (stmt))
> > > +       return false;
> > >
> > > !is_gimple_assign (stmt) instead of gimple_code (stmt) != GIMPLE_ASSIGN
> > >
Changed.
> > > the gimple_has_volatile_ops check is superfluous given you restrict
> > > the assign code.
> > >
Changed.
> > > +      /* Check that R_NOP1 is used in nop_stmt or in PHI only.  */
> > > +      FOR_EACH_IMM_USE_FAST (use_p, imm_iter, r_nop1)
> > > +       {
> > > +         gimple *use_stmt = USE_STMT (use_p);
> > > +         if (is_gimple_debug (use_stmt))
> > > +           continue;
> > > +         if (use_stmt == SSA_NAME_DEF_STMT (r_op1))
> > > +           continue;
> > > +         if (gimple_code (use_stmt) != GIMPLE_PHI)
> > > +           return false;
> > >
> > > can the last check be use_stmt == phi since we should have the
> > > PHI readily available?
> > >
Yes, changed.
> > > @@ -1735,6 +1822,23 @@ convert_scalar_cond_reduction (gimple *reduc,
> > > gimple_stmt_iterator *gsi,
> > >    rhs = fold_build2 (gimple_assign_rhs_code (reduc),
> > >                      TREE_TYPE (rhs1), op0, tmp);
> > >
> > > +  if (has_nop)
> > > +    {
> > > +      /* Create assignment nop_rhs = op0 +/- _ifc_.  */
> > > +      tree nop_rhs = make_temp_ssa_name (TREE_TYPE (rhs1), NULL, 
> > > "_nop_");
> > > +      gimple* new_assign2 = gimple_build_assign (nop_rhs, rhs);
> > > +      gsi_insert_before (gsi, new_assign2, GSI_SAME_STMT);
> > > +      /* Rebuild rhs for nop_expr.  */
> > > +      rhs = fold_build1 (NOP_EXPR,
> > > +                        TREE_TYPE (gimple_assign_lhs (nop_reduc)),
> > > +                        nop_rhs);
> > > +
> > > +      /* Delete nop_reduc.  */
> > > +      stmt_it = gsi_for_stmt (nop_reduc);
> > > +      gsi_remove (&stmt_it, true);
> > > +      release_defs (nop_reduc);
> > > +    }
> > > +
> > >
> > > hmm, the whole function could be cleaned up with sth like
> > >
> > >      /* Build rhs for unconditional increment/decrement.  */
> > >      gimple_seq stmts = NULL;
> > >      rhs = gimple_build (&stmts, gimple_assing_rhs_code (reduc),
> > >                                     TREE_TYPE (rhs1), op0, tmp);
> > >      if (has_nop)
> > >        rhs = gimple_convert (&stmts, TREE_TYPE (gimple_assign_lhs
> > > (nop_reduc)), rhs);
> > >      gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
> > >
> > > plus in the caller moving the
> > >
> > >       new_stmt = gimple_build_assign (res, rhs);
> > >       gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
> > >
> > > to the else branch as well as the folding done on new_stmt (maybe return
> > > new_stmt instead of rhs from convert_scalar_cond_reduction.
> > Eventually, we needed to assign rhs to res, and with an extra mov stmt
> > from rhs to res, the vectorizer failed.
> > the only difference in 166t.ifcvt between successfully vectorization
> > and failed vectorization is below
> >    char * _24;
> >    char _25;
> >    unsigned char _ifc__29;
> > +  unsigned char _30;
> >
> >    <bb 2> [local count: 118111600]:
> >    if (n_10(D) != 0)
> > @@ -70,7 +71,8 @@ char foo2 (char * a, char * c, int n)
> >    _5 = c_14(D) + _1;
> >    _6 = *_5;
> >    _ifc__29 = _3 == _6 ? 1 : 0;
> > -  cnt_7 = cnt_18 + _ifc__29;
> > +  _30 = cnt_18 + _ifc__29;
> > +  cnt_7 = _30;
> >    i_16 = i_20 + 1;
> >    if (n_10(D) != i_16)
> >      goto <bb 9>; [89.00%]
> > @@ -110,7 +112,7 @@ char foo2 (char * a, char * c, int n)
> >    goto <bb 12>; [100.00%]
> >
> >    <bb 6> [local count: 105119324]:
> > -  # cnt_19 = PHI <cnt_7(3), cnt_27(15)>
> > +  # cnt_19 = PHI <_30(3), cnt_27(15)>
> >    _21 = (char) cnt_19;
> >
> > if we want to eliminate the extra move, gimple_build and
> > gimple_convert is not suitable since they create a new lhs, is there
> > any interface like gimple_build_assign but accept stmts?
>
> Hmm, OK.  So what you could do is replacing all uses of 'res'
> with 'rhs', alternatively replacing the last generated stmt LHS
> by 'res'.  Both is a bit hackish of course.  Usually the vectorizer
> just ignores copies like this but the reduction matching is
> unnecessarily strict here (but it's also a bit awkward to fix there).
>
> There's redundant_ssa_names which seems to be designed to
> handle propagating those out and there's the do_rpo_vn run,
> so I wonder why the stmt remains.  Now, for redundant_ssa_names
> you'd need to push a std::pair (res, rhs) to it in case rhs is an
> SSA name - does that help?
Yes, it worked.
>
> Richard.
> > >
> > > Richard.
> > >
> > > >   gcc/ChangeLog:
> > > >
> > > >         PR tree-optimization/pr98365
> > > >         * tree-if-conv.c (strip_nop_cond_scalar_reduction): New 
> > > > function.
> > > >         (is_cond_scalar_reduction): Handle nop_expr in cond scalar 
> > > > reduction.
> > > >         (convert_scalar_cond_reduction): Ditto.
> > > >         (predicate_scalar_phi): Ditto.
> > > >
> > > > gcc/testsuite/ChangeLog:
> > > >
> > > >         PR tree-optimization/pr98365
> > > >         * gcc.target/i386/pr98365.c: New test.
> > > >
> > > > --
> > > > BR,
> > > > Hongtao
> >
> >
> >
> > --
> > BR,
> > Hongtao


Updated patch.

-- 
BR,
Hongtao
From f0593d1866af6f752e70e2ef2f1477471c252536 Mon Sep 17 00:00:00 2001
From: liuhongt <hongtao....@intel.com>
Date: Wed, 6 Jan 2021 16:33:27 +0800
Subject: [PATCH] Extend is_cond_scalar_reduction to handle nop_expr
 after/before scalar reduction. [PR98365]

gcc/ChangeLog:

	PR tree-optimization/pr98365
	* tree-if-conv.c (strip_nop_cond_scalar_reduction): New function.
	(is_cond_scalar_reduction): Handle nop_expr in cond scalar reduction.
	(convert_scalar_cond_reduction): Ditto.
	(predicate_scalar_phi): Ditto.

gcc/testsuite/ChangeLog:

	PR tree-optimization/pr98365
	* gcc.target/i386/pr98365.c: New test.
---
 gcc/testsuite/gcc.target/i386/pr98365.c |  22 ++++
 gcc/tree-if-conv.c                      | 142 +++++++++++++++++++++---
 2 files changed, 146 insertions(+), 18 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr98365.c

diff --git a/gcc/testsuite/gcc.target/i386/pr98365.c b/gcc/testsuite/gcc.target/i386/pr98365.c
new file mode 100644
index 00000000000..652210dcdd5
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr98365.c
@@ -0,0 +1,22 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx2 -ftree-vectorize -fdump-tree-vect-details" } */
+/* { dg-final { scan-tree-dump-times "vectorized \[1-3] loops" 2 "vect" } } */
+short foo1 (short* a, short* c, int n)
+{
+  int i;
+  short cnt=0;
+  for (int i = 0;i != n; i++)
+    if (a[i] == c[i])
+      cnt++;
+  return cnt;
+}
+
+char foo2 (char* a, char* c, int n)
+{
+  int i;
+  char cnt=0;
+  for (int i = 0;i != n; i++)
+    if (a[i] == c[i])
+      cnt++;
+  return cnt;
+}
diff --git a/gcc/tree-if-conv.c b/gcc/tree-if-conv.c
index 716eae44a21..5eed087de36 100644
--- a/gcc/tree-if-conv.c
+++ b/gcc/tree-if-conv.c
@@ -1579,6 +1579,31 @@ if_convertible_loop_p (class loop *loop)
   return res;
 }
 
+/* If HAS_NOP, return reduc_1 for OP == tmp1 below (i.e. strip the
+   conversion; NULL_TREE if OP is not one); else return OP unchanged.
+   if (...)
+     tmp1 = (unsigned type) reduc_1;
+     tmp2 = tmp1 + rhs2;
+     reduc_3 = (signed type) tmp2.  */
+static tree
+strip_nop_cond_scalar_reduction (bool has_nop, tree op)
+{
+  if (!has_nop)
+    return op;
+
+  if (TREE_CODE (op) != SSA_NAME)
+    return NULL_TREE;
+
+  gassign *stmt = safe_dyn_cast <gassign *> (SSA_NAME_DEF_STMT (op));
+  if (!stmt
+      || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt))
+      || !tree_nop_conversion_p (TREE_TYPE (op), TREE_TYPE
+				 (gimple_assign_rhs1 (stmt))))
+    return NULL_TREE;
+
+  return gimple_assign_rhs1 (stmt);
+}
+
 /* Returns true if def-stmt for phi argument ARG is simple increment/decrement
    which is in predicated basic block.
    In fact, the following PHI pattern is searching:
@@ -1595,9 +1620,10 @@ if_convertible_loop_p (class loop *loop)
 
 static bool
 is_cond_scalar_reduction (gimple *phi, gimple **reduc, tree arg_0, tree arg_1,
-			  tree *op0, tree *op1, bool extended)
+			  tree *op0, tree *op1, bool extended, bool* has_nop,
+			  gimple **nop_reduc)
 {
-  tree lhs, r_op1, r_op2;
+  tree lhs, r_op1, r_op2, r_nop1, r_nop2;
   gimple *stmt;
   gimple *header_phi = NULL;
   enum tree_code reduction_op;
@@ -1608,7 +1634,7 @@ is_cond_scalar_reduction (gimple *phi, gimple **reduc, tree arg_0, tree arg_1,
   use_operand_p use_p;
   edge e;
   edge_iterator ei;
-  bool result = false;
+  bool result = *has_nop = false;
   if (TREE_CODE (arg_0) != SSA_NAME || TREE_CODE (arg_1) != SSA_NAME)
     return false;
 
@@ -1656,18 +1682,77 @@ is_cond_scalar_reduction (gimple *phi, gimple **reduc, tree arg_0, tree arg_1,
     return false;
 
   reduction_op = gimple_assign_rhs_code (stmt);
+
+  /* Catch something like below
+
+     loop-header:
+     reduc_1 = PHI <..., reduc_2>
+     ...
+     if (...)
+     tmp1 = (unsigned type) reduc_1;
+     tmp2 = tmp1 + rhs2;
+     reduc_3 = (signed type) tmp2;
+
+     reduc_2 = PHI <reduc_1, reduc_3>
+
+     and convert to
+
+     reduc_2 = PHI <0, reduc_3>
+     tmp1 = (unsigned type) reduc_1;
+     ifcvt = cond_expr ? rhs2 : 0;
+     tmp2 = tmp1 +/- ifcvt;
+     reduc_1 = (signed type) tmp2;  */
+
+  if (reduction_op == NOP_EXPR)
+    {
+      lhs = gimple_assign_rhs1 (stmt);
+      if (TREE_CODE (lhs) != SSA_NAME
+	  || !has_single_use (lhs))
+	return false;
+
+      *nop_reduc = stmt;
+      stmt = SSA_NAME_DEF_STMT (lhs);
+      if (gimple_bb (stmt) != gimple_bb (*nop_reduc)
+	  || !is_gimple_assign (stmt))
+	return false;
+
+      *has_nop = true;
+      reduction_op = gimple_assign_rhs_code (stmt);
+    }
+
   if (reduction_op != PLUS_EXPR && reduction_op != MINUS_EXPR)
     return false;
   r_op1 = gimple_assign_rhs1 (stmt);
   r_op2 = gimple_assign_rhs2 (stmt);
 
+  r_nop1 = strip_nop_cond_scalar_reduction (*has_nop, r_op1);
+  r_nop2 = strip_nop_cond_scalar_reduction (*has_nop, r_op2);
+
   /* Make R_OP1 to hold reduction variable.  */
-  if (r_op2 == PHI_RESULT (header_phi)
+  if (r_nop2 == PHI_RESULT (header_phi)
       && reduction_op == PLUS_EXPR)
-    std::swap (r_op1, r_op2);
-  else if (r_op1 != PHI_RESULT (header_phi))
+    {
+      std::swap (r_op1, r_op2);
+      std::swap (r_nop1, r_nop2);
+    }
+  else if (r_nop1 != PHI_RESULT (header_phi))
     return false;
 
+  if (*has_nop)
+    {
+      /* Check that R_NOP1 is used in nop_stmt or in PHI only.  */
+      FOR_EACH_IMM_USE_FAST (use_p, imm_iter, r_nop1)
+	{
+	  gimple *use_stmt = USE_STMT (use_p);
+	  if (is_gimple_debug (use_stmt))
+	    continue;
+	  if (use_stmt == SSA_NAME_DEF_STMT (r_op1))
+	    continue;
+	  if (use_stmt != phi)
+	    return false;
+	}
+    }
+
   /* Check that R_OP1 is used in reduction stmt or in PHI only.  */
   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, r_op1)
     {
@@ -1705,7 +1790,8 @@ is_cond_scalar_reduction (gimple *phi, gimple **reduc, tree arg_0, tree arg_1,
 
 static tree
 convert_scalar_cond_reduction (gimple *reduc, gimple_stmt_iterator *gsi,
-			       tree cond, tree op0, tree op1, bool swap)
+			       tree cond, tree op0, tree op1, bool swap,
+			       bool has_nop, gimple* nop_reduc)
 {
   gimple_stmt_iterator stmt_it;
   gimple *new_assign;
@@ -1714,6 +1800,7 @@ convert_scalar_cond_reduction (gimple *reduc, gimple_stmt_iterator *gsi,
   tree tmp = make_temp_ssa_name (TREE_TYPE (rhs1), NULL, "_ifc_");
   tree c;
   tree zero = build_zero_cst (TREE_TYPE (rhs1));
+  gimple_seq stmts = NULL;
 
   if (dump_file && (dump_flags & TDF_DETAILS))
     {
@@ -1732,8 +1819,18 @@ convert_scalar_cond_reduction (gimple *reduc, gimple_stmt_iterator *gsi,
   new_assign = gimple_build_assign (tmp, c);
   gsi_insert_before (gsi, new_assign, GSI_SAME_STMT);
   /* Build rhs for unconditional increment/decrement.  */
-  rhs = fold_build2 (gimple_assign_rhs_code (reduc),
-		     TREE_TYPE (rhs1), op0, tmp);
+  rhs = gimple_build (&stmts, gimple_assign_rhs_code (reduc),
+		      TREE_TYPE (rhs1), op0, tmp);
+
+  if (has_nop)
+    {
+      rhs = gimple_convert (&stmts,
+			    TREE_TYPE (gimple_assign_lhs (nop_reduc)), rhs);
+      stmt_it = gsi_for_stmt (nop_reduc);
+      gsi_remove (&stmt_it, true);
+      release_defs (nop_reduc);
+    }
+  gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
 
   /* Delete original reduction stmt.  */
   stmt_it = gsi_for_stmt (reduc);
@@ -1808,7 +1905,7 @@ ifcvt_follow_ssa_use_edges (tree val)
 static void
 predicate_scalar_phi (gphi *phi, gimple_stmt_iterator *gsi)
 {
-  gimple *new_stmt = NULL, *reduc;
+  gimple *new_stmt = NULL, *reduc, *nop_reduc;
   tree rhs, res, arg0, arg1, op0, op1, scev;
   tree cond;
   unsigned int index0;
@@ -1816,6 +1913,7 @@ predicate_scalar_phi (gphi *phi, gimple_stmt_iterator *gsi)
   edge e;
   basic_block bb;
   unsigned int i;
+  bool has_nop;
 
   res = gimple_phi_result (phi);
   if (virtual_operand_p (res))
@@ -1876,10 +1974,15 @@ predicate_scalar_phi (gphi *phi, gimple_stmt_iterator *gsi)
 	  arg1 = gimple_phi_arg_def (phi, 1);
 	}
       if (is_cond_scalar_reduction (phi, &reduc, arg0, arg1,
-				    &op0, &op1, false))
-	/* Convert reduction stmt into vectorizable form.  */
-	rhs = convert_scalar_cond_reduction (reduc, gsi, cond, op0, op1,
-					     true_bb != gimple_bb (reduc));
+				    &op0, &op1, false, &has_nop,
+				    &nop_reduc))
+	{
+	  /* Convert reduction stmt into vectorizable form.  */
+	  rhs = convert_scalar_cond_reduction (reduc, gsi, cond, op0, op1,
+					       true_bb != gimple_bb (reduc),
+					       has_nop, nop_reduc);
+	  redundant_ssa_names.safe_push (std::make_pair (res, rhs));
+	}
       else
 	/* Build new RHS using selected condition and arguments.  */
 	rhs = fold_build_cond_expr (TREE_TYPE (res), unshare_expr (cond),
@@ -1961,14 +2064,17 @@ predicate_scalar_phi (gphi *phi, gimple_stmt_iterator *gsi)
 					 is_gimple_condexpr, NULL_TREE,
 					 true, GSI_SAME_STMT);
       if (!(is_cond_scalar_reduction (phi, &reduc, arg0 , arg1,
-				      &op0, &op1, true)))
+				      &op0, &op1, true, &has_nop, &nop_reduc)))
 	rhs = fold_build_cond_expr (TREE_TYPE (res), unshare_expr (cond),
 				    swap? arg1 : arg0,
 				    swap? arg0 : arg1);
       else
-	/* Convert reduction stmt into vectorizable form.  */
-	rhs = convert_scalar_cond_reduction (reduc, gsi, cond, op0, op1,
-					     swap);
+	{
+	  /* Convert reduction stmt into vectorizable form.  */
+	  rhs = convert_scalar_cond_reduction (reduc, gsi, cond, op0, op1,
+					       swap, has_nop, nop_reduc);
+	  redundant_ssa_names.safe_push (std::make_pair (res, rhs));
+	}
       new_stmt = gimple_build_assign (res, rhs);
       gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
       update_stmt (new_stmt);
-- 
2.18.1

Reply via email to