Hi All,

The following example:

void f11(double * restrict z, double * restrict w, double * restrict x,
         double * restrict y, int n)
{
    for (int i = 0; i < n; i++) {
        z[i] = (w[i] > 0) ? w[i] : y[i];
    }
}

currently generates:

        ptrue   p2.b, all
        ld1d    z0.d, p0/z, [x1, x2, lsl 3]
        fcmgt   p1.d, p2/z, z0.d, #0.0
        bic     p3.b, p2/z, p0.b, p1.b
        ld1d    z1.d, p3/z, [x3, x2, lsl 3]

and after the previous patches generates:

        ptrue   p3.b, all
        ld1d    z0.d, p0/z, [x1, x2, lsl 3]
        fcmgt   p1.d, p0/z, z0.d, #0.0
        fcmgt   p2.d, p3/z, z0.d, #0.0
        not     p1.b, p0/z, p1.b
        ld1d    z1.d, p1/z, [x3, x2, lsl 3]

where a duplicate comparison is performed for w[i] > 0.  This happens because
the vectorizer emits a comparison for both a and ~a, when we only need to emit
one of them and invert it to get the other.  After this patch we generate:

        ld1d    z0.d, p0/z, [x1, x2, lsl 3]
        fcmgt   p1.d, p0/z, z0.d, #0.0
        mov     p2.b, p1.b
        not     p1.b, p0/z, p1.b
        ld1d    z1.d, p1/z, [x3, x2, lsl 3]

In order to perform the check I have to fully expand the NOT stmts when
recording them, because the SSA names of the top-level expressions differ
while their arguments do not: in _31 = ~_34 the SSA name _34 differs between
occurrences, but the operands of the comparison defining _34 are the same.
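To make that concrete, the recorded statements look roughly like this
(simplified GIMPLE from the if-converted loop; the SSA names here are made up
for illustration):

    _34 = w_12 > 0.0;
    _31 = ~_34;
    ... = .MASK_LOAD (..., _31);

For the ~_34 case we therefore look through the BIT_NOT_EXPR and record the
inverted ordered comparison (w_12 <= 0.0) instead, so that the lookup in
vectorizable_condition can find it.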
We only do this when the comparison is an ordered one, because mixing ordered
and unordered comparisons can lead to de-optimized code.
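For illustration (informal notation, not taken from the patch), this is why
the inverse mask cannot simply be rewritten as the ordered inverse comparison
when NaNs are honored:

    w        w > 0    ~(w > 0)    w <= 0 (ordered)
    1.0      true     false       false
   -1.0      false    true        true
    NaN      false    true        false

The two right-hand columns disagree for NaN, so the inverse mask is instead
materialized as a predicate NOT of the original compare, which keeps the NaN
behaviour intact.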
Note: This patch series is working incrementally towards generating the most
      efficient code for this and other loops in small steps.

The mov is created by postreload when it does a late CSE.

Bootstrapped and regtested on aarch64-none-linux-gnu with no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

        * fold-const.c (tree_comparison_ordered_p): New.
        * fold-const.h (tree_comparison_ordered_p): New.
        * tree-vect-stmts.c (vectorizable_condition): Check if inverse of mask
        is live.
        * tree-vectorizer.c (scalar_cond_masked_key::get_cond_ops_from_tree):
        Register mask inverses.

gcc/testsuite/ChangeLog:

        * gcc.target/aarch64/sve/pred-not-gen.c: Update testcase.

--- inline copy of patch --
diff --git a/gcc/fold-const.h b/gcc/fold-const.h
index 7bac84ba33145c17d1dac9afe70bbd1c89a4b3fa..852fc37b25023a108410fcf375604d082357efa2 100644
--- a/gcc/fold-const.h
+++ b/gcc/fold-const.h
@@ -144,6 +144,7 @@ extern enum tree_code swap_tree_comparison (enum tree_code);
 
 extern bool ptr_difference_const (tree, tree, poly_int64_pod *);
 extern enum tree_code invert_tree_comparison (enum tree_code, bool);
+extern bool tree_comparison_ordered_p (enum tree_code);
 extern bool inverse_conditions_p (const_tree, const_tree);
 
 extern bool tree_unary_nonzero_warnv_p (enum tree_code, tree, tree, bool *);
diff --git a/gcc/fold-const.c b/gcc/fold-const.c
index 7dcecc9a5c08d56703075229f762f750ed6c5d93..04991457db7e5166e8ce17d4bfa3b107f619dbc1 100644
--- a/gcc/fold-const.c
+++ b/gcc/fold-const.c
@@ -2669,6 +2669,37 @@ invert_tree_comparison (enum tree_code code, bool honor_nans)
     }
 }
 
+/* Given a tree comparison code return whether the comparison is for an
+   ordered expression or not.  */
+
+bool
+tree_comparison_ordered_p (enum tree_code code)
+{
+  switch (code)
+    {
+    case EQ_EXPR:
+    case NE_EXPR:
+    case GT_EXPR:
+    case GE_EXPR:
+    case LT_EXPR:
+    case LE_EXPR:
+    case LTGT_EXPR:
+      return true;
+    case UNEQ_EXPR:
+    case UNGT_EXPR:
+    case UNGE_EXPR:
+    case UNLT_EXPR:
+    case UNLE_EXPR:
+    case ORDERED_EXPR:
+    case UNORDERED_EXPR:
+      return false;
+    default:
+      gcc_unreachable ();
+    }
+}
+
+
+
 /* Similar, but return the comparison that results if the operands are
    swapped.  This is safe for floating-point.  */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pred-not-gen.c b/gcc/testsuite/gcc.target/aarch64/sve/pred-not-gen.c
index 18d5cf8dcb46e227aecfcbacb833670427ed0586..e4251de32fe347d6193d6f964a74d30e28f5d128 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/pred-not-gen.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/pred-not-gen.c
@@ -24,7 +24,6 @@ void f10(double * restrict z, double * restrict w, double * restrict x, double
 ** f11:
 ** ...
 **      ld1d    z0.d, p0/z, \[x1, x2, lsl 3\]
-**      fcmgt   p2.d, p3/z, z0.d, #0.0
 **      fcmgt   p1.d, p0/z, z0.d, #0.0
 **      not     p1.b, p0/z, p1.b
 **      ld1d    z1.d, p1/z, \[x3, x2, lsl 3\]
@@ -55,5 +54,3 @@ void f12(int * restrict z, int * restrict w, int * restrict x, int * restrict y
   }
 }
 
-/* { dg-final { scan-assembler-not {\tbic\t} } } */
-/* { dg-final { scan-assembler-times {\tnot\tp[0-9]+\.b, p[0-9]+/z, p[0-9]+\.b\n} 2 } } */
diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
index 074dfdcf385f31f2ba753012131985544dfd69f8..54cce92066c058d85ad010091c0c0eb6716f8979 100644
--- a/gcc/tree-vect-stmts.c
+++ b/gcc/tree-vect-stmts.c
@@ -10216,6 +10216,7 @@ vectorizable_condition (vec_info *vinfo,
           else
             {
               bool honor_nans = HONOR_NANS (TREE_TYPE (cond.op0));
+              tree_code orig_code = cond.code;
               cond.code = invert_tree_comparison (cond.code, honor_nans);
               if (loop_vinfo->scalar_cond_masked_set.contains (cond))
                 {
@@ -10223,6 +10224,21 @@ vectorizable_condition (vec_info *vinfo,
                   cond_code = cond.code;
                   swap_cond_operands = true;
                 }
+              else if (tree_comparison_ordered_p (orig_code))
+                {
+                  /* Try the inverse of the current mask.  We check if the
+                     inverse mask is live and if so we generate a negate of
+                     the current mask such that we still honor NaNs.  */
+                  cond.code = invert_tree_comparison (orig_code, false);
+                  if (loop_vinfo->scalar_cond_masked_set.contains (cond))
+                    {
+                      bitop1 = GT_EXPR;
+                      bitop2 = BIT_NOT_EXPR;
+                      masks = &LOOP_VINFO_MASKS (loop_vinfo);
+                      cond_code = cond.code;
+                      swap_cond_operands = true;
+                    }
+                }
             }
         }
     }
diff --git a/gcc/tree-vectorizer.c b/gcc/tree-vectorizer.c
index b9709a613d557445c060669f5b4517a15058f89d..c2d9970d79f6a9afaf0ad1fbb80a2d5a97bab89e 100644
--- a/gcc/tree-vectorizer.c
+++ b/gcc/tree-vectorizer.c
@@ -1682,6 +1682,22 @@ scalar_cond_masked_key::get_cond_ops_from_tree (tree t)
           this->op1 = gimple_assign_rhs2 (stmt);
           return;
         }
+      else if (code == BIT_NOT_EXPR)
+        {
+          tree n_op = gimple_assign_rhs1 (stmt);
+          if ((stmt = dyn_cast<gassign *> (SSA_NAME_DEF_STMT (n_op))))
+            {
+              code = gimple_assign_rhs_code (stmt);
+              if (TREE_CODE_CLASS (code) == tcc_comparison
+                  && tree_comparison_ordered_p (code))
+                {
+                  this->code = invert_tree_comparison (code, false);
+                  this->op0 = gimple_assign_rhs1 (stmt);
+                  this->op1 = gimple_assign_rhs2 (stmt);
+                  return;
+                }
+            }
+        }
     }
 
   this->code = NE_EXPR;

--