Hi All,

The following example:

void f11(double * restrict z, double * restrict w, double * restrict x,
         double * restrict y, int n)
{
    for (int i = 0; i < n; i++) {
        z[i] = (w[i] > 0) ? w[i] : y[i];
    }
}

currently generates:

        ptrue   p2.b, all
        ld1d    z0.d, p0/z, [x1, x2, lsl 3]
        fcmgt   p1.d, p2/z, z0.d, #0.0
        bic     p3.b, p2/z, p0.b, p1.b
        ld1d    z1.d, p3/z, [x3, x2, lsl 3]

and after the previous patches generates:

        ptrue   p3.b, all
        ld1d    z0.d, p0/z, [x1, x2, lsl 3]
        fcmgt   p1.d, p0/z, z0.d, #0.0
        fcmgt   p2.d, p3/z, z0.d, #0.0
        not     p1.b, p0/z, p1.b
        ld1d    z1.d, p1/z, [x3, x2, lsl 3]

where a duplicate comparison is performed for w[i] > 0.

This is because the vectorizer emits a comparison for both a and ~a, when we
only need to emit one of them and invert it to obtain the other.  After this
patch we generate:

        ld1d    z0.d, p0/z, [x1, x2, lsl 3]
        fcmgt   p1.d, p0/z, z0.d, #0.0
        mov     p2.b, p1.b
        not     p1.b, p0/z, p1.b
        ld1d    z1.d, p1/z, [x3, x2, lsl 3]
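
As a scalar sketch of the masked form (illustrative only, written by hand
rather than taken from vectorizer output; f11_masked is a made-up name), the
mask guarding the load of y[i] is exactly the complement of the mask that
selects w[i], so one comparison plus an inverted predicate is enough:

void f11_masked (double * restrict z, double * restrict w,
                 double * restrict y, int n)
{
    for (int i = 0; i < n; i++) {
        int pred = w[i] > 0;              /* the single comparison (fcmgt) */
        double yval = !pred ? y[i] : 0.0; /* y[i] only needed under the inverted mask */
        z[i] = pred ? w[i] : yval;        /* select under the original mask */
    }
}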

In order to perform the check I have to fully expand the NOT stmts when
recording them, since the SSA names of the top-level expressions differ even
though their arguments do not.  E.g. in _31 = ~_34 the SSA name _34 differs
between the two masks, but the operands of _34's defining comparison are the
same.
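
A minimal stand-alone model of that idea (assumption: cmp_code, cond_key,
key_equal and invert below are made up for illustration and only mimic
scalar_cond_masked_key/invert_tree_comparison; this is not GCC code):

#include <stdio.h>
#include <string.h>
#include <stdbool.h>

/* Stand-ins for GT_EXPR / LE_EXPR.  */
enum cmp_code { CMP_GT, CMP_LE };

/* Simplified condition key: comparison code plus its two operands.  */
struct cond_key { enum cmp_code code; const char *op0, *op1; };

/* Ordered inversion, like invert_tree_comparison (code, false).  */
static enum cmp_code invert (enum cmp_code c)
{
    return c == CMP_GT ? CMP_LE : CMP_GT;
}

static bool key_equal (struct cond_key a, struct cond_key b)
{
    return a.code == b.code
           && strcmp (a.op0, b.op0) == 0
           && strcmp (a.op1, b.op1) == 0;
}

int main (void)
{
    /* GIMPLE shape (illustrative):
         _34 = w_5 > 0.0;   mask selecting w[i]
         _31 = ~_34;        mask guarding the load of y[i]
       Keying on the SSA names _34/_31 alone never matches, so _31 is
       recorded by expanding the NOT, giving the key (LE, w_5, 0.0).  */
    struct cond_key else_mask = { invert (CMP_GT), "w_5", "0.0" };

    /* When vectorizing the GT mask, its ordered inverse is now found to
       be live, so one compare plus a predicated NOT suffices.  */
    struct cond_key then_mask = { CMP_GT, "w_5", "0.0" };
    struct cond_key inverse = { invert (then_mask.code),
                                then_mask.op0, then_mask.op1 };
    printf ("inverse mask already live: %s\n",
            key_equal (inverse, else_mask) ? "yes" : "no");
    return 0;
}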

We only do this when the comparison is an ordered one, because mixing ordered
and unordered comparisons can lead to de-optimized code.
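
A small stand-alone check of why that restriction is needed (assuming default
IEEE semantics, i.e. not compiled with -ffast-math): with NaNs the logical
inverse of an ordered GT is the unordered UNLE, not LE, so GT and LE are not
complements of each other.

#include <math.h>
#include <stdio.h>

int main (void)
{
    double w = NAN;
    printf ("w > 0    -> %d\n", w > 0.0);    /* 0 */
    printf ("!(w > 0) -> %d\n", !(w > 0.0)); /* 1 */
    printf ("w <= 0   -> %d\n", w <= 0.0);   /* 0: not the logical inverse of w > 0 */
    return 0;
}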

Note: This patch series works incrementally, in small steps, towards
      generating the most efficient code for this and other loops.  The mov
      above is created by postreload when it performs a late CSE.

Bootstrapped and regtested on aarch64-none-linux-gnu with no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

        * fold-const.c (tree_comparison_ordered_p): New.
        * fold-const.h (tree_comparison_ordered_p): New.
        * tree-vect-stmts.c (vectorizable_condition): Check if inverse of mask
        is live.
        * tree-vectorizer.c (scalar_cond_masked_key::get_cond_ops_from_tree):
        Register mask inverses.

gcc/testsuite/ChangeLog:

        * gcc.target/aarch64/sve/pred-not-gen.c: Update testcase.

--- inline copy of patch -- 
diff --git a/gcc/fold-const.h b/gcc/fold-const.h
index 7bac84ba33145c17d1dac9afe70bbd1c89a4b3fa..852fc37b25023a108410fcf375604d082357efa2 100644
--- a/gcc/fold-const.h
+++ b/gcc/fold-const.h
@@ -144,6 +144,7 @@ extern enum tree_code swap_tree_comparison (enum tree_code);
 
 extern bool ptr_difference_const (tree, tree, poly_int64_pod *);
 extern enum tree_code invert_tree_comparison (enum tree_code, bool);
+extern bool tree_comparison_ordered_p (enum tree_code);
 extern bool inverse_conditions_p (const_tree, const_tree);
 
 extern bool tree_unary_nonzero_warnv_p (enum tree_code, tree, tree, bool *);
diff --git a/gcc/fold-const.c b/gcc/fold-const.c
index 7dcecc9a5c08d56703075229f762f750ed6c5d93..04991457db7e5166e8ce17d4bfa3b107f619dbc1 100644
--- a/gcc/fold-const.c
+++ b/gcc/fold-const.c
@@ -2669,6 +2669,37 @@ invert_tree_comparison (enum tree_code code, bool honor_nans)
     }
 }
 
+/* Given a tree comparison code return whether the comparison is for an
+   ordered expression or not.  */
+
+bool
+tree_comparison_ordered_p (enum tree_code code)
+{
+  switch (code)
+    {
+    case EQ_EXPR:
+    case NE_EXPR:
+    case GT_EXPR:
+    case GE_EXPR:
+    case LT_EXPR:
+    case LE_EXPR:
+    case LTGT_EXPR:
+      return true;
+    case UNEQ_EXPR:
+    case UNGT_EXPR:
+    case UNGE_EXPR:
+    case UNLT_EXPR:
+    case UNLE_EXPR:
+    case ORDERED_EXPR:
+    case UNORDERED_EXPR:
+      return false;
+    default:
+      gcc_unreachable ();
+    }
+}
+
+
+
 /* Similar, but return the comparison that results if the operands are
    swapped.  This is safe for floating-point.  */
 
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pred-not-gen.c b/gcc/testsuite/gcc.target/aarch64/sve/pred-not-gen.c
index 18d5cf8dcb46e227aecfcbacb833670427ed0586..e4251de32fe347d6193d6f964a74d30e28f5d128 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/pred-not-gen.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/pred-not-gen.c
@@ -24,7 +24,6 @@ void f10(double * restrict z, double * restrict w, double * restrict x, double *
 ** f11:
 ** ...
 ** 	ld1d	z0.d, p0/z, \[x1, x2, lsl 3\]
-** 	fcmgt	p2.d, p3/z, z0.d, #0.0
 ** 	fcmgt	p1.d, p0/z, z0.d, #0.0
 ** 	not	p1.b, p0/z, p1.b
 ** 	ld1d	z1.d, p1/z, \[x3, x2, lsl 3\]
@@ -55,5 +54,3 @@ void f12(int * restrict z, int * restrict w, int * restrict x, int * restrict y,
     }
 }
 
-/* { dg-final { scan-assembler-not {\tbic\t} } } */
-/* { dg-final { scan-assembler-times {\tnot\tp[0-9]+\.b, p[0-9]+/z, p[0-9]+\.b\n} 2 } } */
diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
index 074dfdcf385f31f2ba753012131985544dfd69f8..54cce92066c058d85ad010091c0c0eb6716f8979 100644
--- a/gcc/tree-vect-stmts.c
+++ b/gcc/tree-vect-stmts.c
@@ -10216,6 +10216,7 @@ vectorizable_condition (vec_info *vinfo,
 	  else
 	    {
 	      bool honor_nans = HONOR_NANS (TREE_TYPE (cond.op0));
+	      tree_code orig_code = cond.code;
 	      cond.code = invert_tree_comparison (cond.code, honor_nans);
 	      if (loop_vinfo->scalar_cond_masked_set.contains (cond))
 		{
@@ -10223,6 +10224,21 @@ vectorizable_condition (vec_info *vinfo,
 		  cond_code = cond.code;
 		  swap_cond_operands = true;
 		}
+	      else if (tree_comparison_ordered_p (orig_code))
+		{
+		  /* Try the inverse of the current mask.  We check if the
+		     inverse mask is live and if so we generate a negate of
+		     the current mask such that we still honor NaNs.  */
+		  cond.code = invert_tree_comparison (orig_code, false);
+		  if (loop_vinfo->scalar_cond_masked_set.contains (cond))
+		    {
+		      bitop1 = GT_EXPR;
+		      bitop2 = BIT_NOT_EXPR;
+		      masks = &LOOP_VINFO_MASKS (loop_vinfo);
+		      cond_code = cond.code;
+		      swap_cond_operands = true;
+		    }
+		}
 	    }
 	}
     }
diff --git a/gcc/tree-vectorizer.c b/gcc/tree-vectorizer.c
index b9709a613d557445c060669f5b4517a15058f89d..c2d9970d79f6a9afaf0ad1fbb80a2d5a97bab89e 100644
--- a/gcc/tree-vectorizer.c
+++ b/gcc/tree-vectorizer.c
@@ -1682,6 +1682,22 @@ scalar_cond_masked_key::get_cond_ops_from_tree (tree t)
 	    this->op1 = gimple_assign_rhs2 (stmt);
 	    return;
 	  }
+	else if (code == BIT_NOT_EXPR)
+	  {
+	    tree n_op = gimple_assign_rhs1 (stmt);
+	    if ((stmt = dyn_cast<gassign *> (SSA_NAME_DEF_STMT (n_op))))
+	      {
+		code = gimple_assign_rhs_code (stmt);
+		if (TREE_CODE_CLASS (code) == tcc_comparison
+		    && tree_comparison_ordered_p (code))
+		  {
+		    this->code = invert_tree_comparison (code, false);
+		    this->op0 = gimple_assign_rhs1 (stmt);
+		    this->op1 = gimple_assign_rhs2 (stmt);
+		    return;
+		  }
+	      }
+	  }
       }
 
   this->code = NE_EXPR;
