On Mon, 9 Sep 2019 at 16:45, Richard Sandiford <richard.sandif...@arm.com> wrote:
>
> Prathamesh Kulkarni <prathamesh.kulka...@linaro.org> writes:
> > With patch, the only following FAIL remains for aarch64-sve.exp:
> > FAIL: gcc.target/aarch64/sve/cond_unary_2.c -march=armv8.2-a+sve scan-assembler-times \\tmovprfx\\t 6
> > which now contains 14.
> > Should I adjust the test, assuming the change isn't a regression?
>
> Well, it is kind-of a regression, but it really just means that the
> integer code is now consistent with the floating-point code in having
> an unnecessary MOVPRFX.  So I think adjusting the count is fine.
> Presumably any future fix for the existing redundant MOVPRFXs will
> apply to the new ones as well.
>
> The patch looks good to me, just some very minor nits:
>
> > @@ -8309,11 +8309,12 @@ vect_double_mask_nunits (tree type)
> >
> >  /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
> >     contain a sequence of NVECTORS masks that each control a vector of type
> > -   VECTYPE.  */
> > +   VECTYPE. SCALAR_MASK if non-null, represents the mask used for corresponding
> > +   load/store stmt.  */
>
> Should be two spaces between sentences.  Maybe:
>
>    VECTYPE.  If SCALAR_MASK is nonnull, the fully-masked loop would AND
>    these vector masks with the vector version of SCALAR_MASK.  */
>
> since the mask isn't necessarily for a load or store statement.
>
> [...]
> > @@ -1879,7 +1879,8 @@ static tree permute_vec_elements (tree, tree, tree, stmt_vec_info,
> >     says how the load or store is going to be implemented and GROUP_SIZE
> >     is the number of load or store statements in the containing group.
> >     If the access is a gather load or scatter store, GS_INFO describes
> > -   its arguments.
> > +   its arguments. SCALAR_MASK is the scalar mask used for corresponding
> > +   load or store stmt.
>
> Maybe:
>
>    its arguments.  If the load or store is conditional, SCALAR_MASK is the
>    condition under which it occurs.
>
> since SCALAR_MASK can be null here too.
>
> [...]
> > @@ -9975,6 +9978,31 @@ vectorizable_condition (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
> >    /* Handle cond expr.  */
> >    for (j = 0; j < ncopies; j++)
> >      {
> > +      tree loop_mask = NULL_TREE;
> > +      bool swap_cond_operands = false;
> > +
> > +      if (loop_vinfo && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
> > +        {
> > +          scalar_cond_masked_key cond (cond_expr, ncopies);
> > +          if (loop_vinfo->scalar_cond_masked_set.contains (cond))
> > +            {
> > +              vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
> > +              loop_mask = vect_get_loop_mask (gsi, masks, ncopies, vectype, j);
> > +            }
> > +          else
> > +            {
> > +              cond.code = invert_tree_comparison (cond.code,
> > +                                                  HONOR_NANS (TREE_TYPE (cond.op0)));
>
> Long line.  Maybe just split it out into a separate assignment:
>
>   bool honor_nans = HONOR_NANS (TREE_TYPE (cond.op0));
>   cond.code = invert_tree_comparison (cond.code, honor_nans);
>
> > +              if (loop_vinfo->scalar_cond_masked_set.contains (cond))
> > +                {
> > +                  vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
> > +                  loop_mask = vect_get_loop_mask (gsi, masks, ncopies, vectype, j);
>
> Long line here too.
>
> [...]
> > @@ -10090,6 +10121,26 @@ vectorizable_condition (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
> >              }
> >          }
> >      }
> > +
> > +      if (loop_mask)
> > +        {
> > +          if (COMPARISON_CLASS_P (vec_compare))
> > +            {
> > +              tree tmp = make_ssa_name (vec_cmp_type);
> > +              gassign *g = gimple_build_assign (tmp,
> > +                                                TREE_CODE (vec_compare),
> > +                                                TREE_OPERAND (vec_compare, 0),
> > +                                                TREE_OPERAND (vec_compare, 1));
>
> Two long lines.
>
> > +              vect_finish_stmt_generation (stmt_info, g, gsi);
> > +              vec_compare = tmp;
> > +            }
> > +
> > +          tree tmp2 = make_ssa_name (vec_cmp_type);
> > +          gassign *g = gimple_build_assign (tmp2, BIT_AND_EXPR, vec_compare, loop_mask);
>
> Long line here too.
>
> [...]
> > diff --git a/gcc/tree-vectorizer.c b/gcc/tree-vectorizer.c
> > index dc181524744..c4b2d8e8647 100644
> > --- a/gcc/tree-vectorizer.c
> > +++ b/gcc/tree-vectorizer.c
> > @@ -1513,3 +1513,39 @@ make_pass_ipa_increase_alignment (gcc::context *ctxt)
> >  {
> >    return new pass_ipa_increase_alignment (ctxt);
> >  }
> > +
> > +/* If code(T) is comparison op or def of comparison stmt,
> > +   extract it's operands.
> > +   Else return <NE_EXPR, T, 0>.  */
> > +
> > +void
> > +scalar_cond_masked_key::get_cond_ops_from_tree (tree t)
> > +{
> > +  if (TREE_CODE_CLASS (TREE_CODE (t)) == tcc_comparison)
> > +    {
> > +      this->code = TREE_CODE (t);
> > +      this->op0 = TREE_OPERAND (t, 0);
> > +      this->op1 = TREE_OPERAND (t, 1);
> > +      return;
> > +    }
> > +
> > +  if (TREE_CODE (t) == SSA_NAME)
> > +    {
> > +      gassign *stmt = dyn_cast<gassign *> (SSA_NAME_DEF_STMT (t));
> > +      if (stmt)
> > +        {
>
> Might as well do this as:
>
>   if (TREE_CODE (t) == SSA_NAME)
>     if (gassign *stmt = dyn_cast<gassign *> (SSA_NAME_DEF_STMT (t)))
>       {
>
> The patch (as hoped) introduces some XPASSes:
>
> XPASS: gcc.target/aarch64/sve/cond_cnot_2.c scan-assembler-not \\tsel\\t
> XPASS: gcc.target/aarch64/sve/vcond_4.c scan-assembler-times \\tfcmge\\tp[0-9]+\\.d, p[0-7]/z, z[0-9]+\\.d, #0\\.0\\n 21
> XPASS: gcc.target/aarch64/sve/vcond_4.c scan-assembler-times \\tfcmge\\tp[0-9]+\\.d, p[0-7]/z, z[0-9]+\\.d, z[0-9]+\\.d\\n 42
> XPASS: gcc.target/aarch64/sve/vcond_4.c scan-assembler-times \\tfcmge\\tp[0-9]+\\.s, p[0-7]/z, z[0-9]+\\.s, #0\\.0\\n 15
> XPASS: gcc.target/aarch64/sve/vcond_4.c scan-assembler-times \\tfcmge\\tp[0-9]+\\.s, p[0-7]/z, z[0-9]+\\.s, z[0-9]+\\.s\\n 30
> XPASS: gcc.target/aarch64/sve/vcond_4.c scan-assembler-times \\tfcmgt\\tp[0-9]+\\.d, p[0-7]/z, z[0-9]+\\.d, #0\\.0\\n 21
> XPASS: gcc.target/aarch64/sve/vcond_4.c scan-assembler-times \\tfcmgt\\tp[0-9]+\\.d, p[0-7]/z, z[0-9]+\\.d, z[0-9]+\\.d\\n 42
> XPASS: gcc.target/aarch64/sve/vcond_4.c scan-assembler-times \\tfcmgt\\tp[0-9]+\\.s, p[0-7]/z, z[0-9]+\\.s, #0\\.0\\n 15
> XPASS: gcc.target/aarch64/sve/vcond_4.c scan-assembler-times \\tfcmgt\\tp[0-9]+\\.s, p[0-7]/z, z[0-9]+\\.s, z[0-9]+\\.s\\n 30
> XPASS: gcc.target/aarch64/sve/vcond_4.c scan-assembler-times \\tfcmle\\tp[0-9]+\\.d, p[0-7]/z, z[0-9]+\\.d, #0\\.0\\n 21
> XPASS: gcc.target/aarch64/sve/vcond_4.c scan-assembler-times \\tfcmle\\tp[0-9]+\\.d, p[0-7]/z, z[0-9]+\\.d, z[0-9]+\\.d\\n 42
> XPASS: gcc.target/aarch64/sve/vcond_4.c scan-assembler-times \\tfcmle\\tp[0-9]+\\.s, p[0-7]/z, z[0-9]+\\.s, #0\\.0\\n 15
> XPASS: gcc.target/aarch64/sve/vcond_4.c scan-assembler-times \\tfcmle\\tp[0-9]+\\.s, p[0-7]/z, z[0-9]+\\.s, z[0-9]+\\.s\\n 30
> XPASS: gcc.target/aarch64/sve/vcond_4.c scan-assembler-times \\tfcmlt\\tp[0-9]+\\.d, p[0-7]/z, z[0-9]+\\.d, #0\\.0\\n 21
> XPASS: gcc.target/aarch64/sve/vcond_4.c scan-assembler-times \\tfcmlt\\tp[0-9]+\\.d, p[0-7]/z, z[0-9]+\\.d, z[0-9]+\\.d\\n 42
> XPASS: gcc.target/aarch64/sve/vcond_4.c scan-assembler-times \\tfcmlt\\tp[0-9]+\\.s, p[0-7]/z, z[0-9]+\\.s, #0\\.0\\n 15
> XPASS: gcc.target/aarch64/sve/vcond_4.c scan-assembler-times \\tfcmlt\\tp[0-9]+\\.s, p[0-7]/z, z[0-9]+\\.s, z[0-9]+\\.s\\n 30
> XPASS: gcc.target/aarch64/sve/vcond_4.c scan-assembler-times \\tfcmuo\\tp[0-9]+\\.d, p[0-7]/z, z[0-9]+\\.d, z[0-9]+\\.d\\n 252
> XPASS: gcc.target/aarch64/sve/vcond_4.c scan-assembler-times \\tfcmuo\\tp[0-9]+\\.s, p[0-7]/z, z[0-9]+\\.s, z[0-9]+\\.s\\n 180
> XPASS: gcc.target/aarch64/sve/vcond_5.c scan-assembler-times \\tfcmge\\tp[0-9]+\\.d, p[0-7]/z, z[0-9]+\\.d, #0\\.0 21
> XPASS: gcc.target/aarch64/sve/vcond_5.c scan-assembler-times \\tfcmge\\tp[0-9]+\\.d, p[0-7]/z, z[0-9]+\\.d, z[0-9]+\\.d 42
> XPASS: gcc.target/aarch64/sve/vcond_5.c scan-assembler-times \\tfcmge\\tp[0-9]+\\.s, p[0-7]/z, z[0-9]+\\.s, #0\\.0 15
> XPASS: gcc.target/aarch64/sve/vcond_5.c scan-assembler-times \\tfcmge\\tp[0-9]+\\.s, p[0-7]/z, z[0-9]+\\.s, z[0-9]+\\.s 30
> XPASS: gcc.target/aarch64/sve/vcond_5.c scan-assembler-times \\tfcmle\\tp[0-9]+\\.d, p[0-7]/z, z[0-9]+\\.d, #0\\.0 21
> XPASS: gcc.target/aarch64/sve/vcond_5.c scan-assembler-times \\tfcmle\\tp[0-9]+\\.d, p[0-7]/z, z[0-9]+\\.d, z[0-9]+\\.d 42
> XPASS: gcc.target/aarch64/sve/vcond_5.c scan-assembler-times \\tfcmle\\tp[0-9]+\\.s, p[0-7]/z, z[0-9]+\\.s, #0\\.0 15
> XPASS: gcc.target/aarch64/sve/vcond_5.c scan-assembler-times \\tfcmle\\tp[0-9]+\\.s, p[0-7]/z, z[0-9]+\\.s, z[0-9]+\\.s 30
>
> Could you remove the associated xfails (and comments above them where
> appropriate)?
>
> OK with those changes from my POV, but please give Richi a day or so
> to object.
>
> Thanks for doing this.

Thanks for the suggestions; I have updated the patch accordingly.
Bootstrap+test in progress on x86_64-unknown-linux-gnu and aarch64-linux-gnu.

Richi, does the patch look OK to you?
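
For reference, the shape of loop the change targets is the testsuite's
DEF_LOOP pattern.  A concrete instance (a sketch only: cond_neg is a
hypothetical instantiation with TYPE = int and OP = negation, mirroring
the cond_unary_2.c hunk below, which now reads b[i] unconditionally):

/* Hypothetical instance of the testsuite's DEF_LOOP pattern with
   TYPE = int and OP = negation.  After if-conversion the load of
   a[i] becomes a masked load guarded by pred[i] != 0, so the same
   scalar condition is recorded in scalar_cond_masked_set; the
   vectorized ?: can then AND its comparison with the loop mask
   instead of needing a separate SEL (which is what the
   scan-assembler-not {\tsel\t} directives below check).  */
void
cond_neg (int *__restrict r, int *__restrict a, int *__restrict b,
          int *__restrict pred, int n)
{
  for (int i = 0; i < n; ++i)
    {
      int bi = b[i];
      r[i] = pred[i] ? -a[i] : bi;
    }
}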
Thanks,
Prathamesh

>
> Richard
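
Before diving into the hunks, here is a standalone sketch of the core
lookup the patch performs, modelled with std::unordered_set and strings
in place of GCC's hash_set and trees.  cmp_code, scalar_cond_key and
key_hash are hypothetical stand-ins, not GCC types, and the toy invert
function ignores NaNs, whereas the real code asks invert_tree_comparison
with HONOR_NANS:

// Standalone sketch (not GCC code): the scalar_cond_masked_set lookup,
// including the inverted-comparison fallback, modelled with std types.
#include <cstddef>
#include <functional>
#include <string>
#include <unordered_set>

enum cmp_code { LT_EXPR, GE_EXPR, NE_EXPR };

struct scalar_cond_key
{
  unsigned ncopies;
  cmp_code code;
  std::string op0, op1;   // stand-ins for GCC trees

  bool operator== (const scalar_cond_key &o) const
  {
    return (ncopies == o.ncopies && code == o.code
            && op0 == o.op0 && op1 == o.op1);
  }
};

struct key_hash
{
  std::size_t operator() (const scalar_cond_key &k) const
  {
    std::size_t h = std::hash<std::string> () (k.op0);
    h = h * 131 + std::hash<std::string> () (k.op1);
    h = h * 131 + static_cast<std::size_t> (k.code);
    return h * 131 + k.ncopies;
  }
};

// Toy analogue of invert_tree_comparison for this comparison set.
static cmp_code
invert (cmp_code c)
{
  return c == LT_EXPR ? GE_EXPR : c == GE_EXPR ? LT_EXPR : NE_EXPR;
}

int
main ()
{
  std::unordered_set<scalar_cond_key, key_hash> masked_set;

  // Recorded while masking a conditional access guarded by "a < b"
  // (the vect_record_loop_mask side of the patch).
  masked_set.insert ({1, LT_EXPR, "a", "b"});

  // Later, a COND_EXPR tests "a >= b".  It is not in the set directly,
  // but its inverse is, so the select can reuse the recorded mask after
  // swapping its then/else operands (the vectorizable_condition side).
  scalar_cond_key cond {1, GE_EXPR, "a", "b"};
  bool swap_cond_operands = false;
  if (masked_set.count (cond) == 0)
    {
      cond.code = invert (cond.code);
      if (masked_set.count (cond))
        swap_cond_operands = true;
    }
  return swap_cond_operands ? 0 : 1;
}

ncopies participates in the key so that, presumably, a recorded condition
only matches a COND_EXPR that needs the same rgroup of loop masks.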
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_cnot_2.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_cnot_2.c
index d689e21dc11..3df2431be38 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/cond_cnot_2.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_cnot_2.c
@@ -32,4 +32,4 @@ TEST_ALL (DEF_LOOP)
 /* { dg-final { scan-assembler-not {\tmov\tz} } } */
 /* { dg-final { scan-assembler-not {\tmovprfx\t} } } */
 /* Currently we canonicalize the ?: so that !b[i] is the "false" value.  */
-/* { dg-final { scan-assembler-not {\tsel\t} { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-not {\tsel\t} } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_convert_1.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_convert_1.c
index 69468eb69be..d2ffcc758f3 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/cond_convert_1.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_convert_1.c
@@ -11,7 +11,10 @@
 	   INT_TYPE *__restrict pred, int n)			\
   {								\
     for (int i = 0; i < n; ++i)					\
-      r[i] = pred[i] ? (FLOAT_TYPE) a[i] : b[i];		\
+      {								\
+	FLOAT_TYPE bi = b[i];					\
+	r[i] = pred[i] ? (FLOAT_TYPE) a[i] : bi;		\
+      }								\
   }
 
 #define TEST_ALL(T) \
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_convert_4.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_convert_4.c
index 55b535fa0cf..d55aef0bb9a 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/cond_convert_4.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_convert_4.c
@@ -11,7 +11,10 @@
 	   INT_TYPE *__restrict pred, int n)			\
   {								\
     for (int i = 0; i < n; ++i)					\
-      r[i] = pred[i] ? (INT_TYPE) a[i] : b[i];			\
+      {								\
+	INT_TYPE bi = b[i];					\
+	r[i] = pred[i] ? (INT_TYPE) a[i] : bi;			\
+      }								\
   }
 
 #define TEST_ALL(T) \
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_unary_2.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_unary_2.c
index adf828398bb..68a9d2c3b6c 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/cond_unary_2.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_unary_2.c
@@ -13,7 +13,10 @@
 	   TYPE *__restrict pred, int n)			\
   {								\
     for (int i = 0; i < n; ++i)					\
-      r[i] = pred[i] ? OP (a[i]) : b[i];			\
+      {								\
+	TYPE bi = b[i];						\
+	r[i] = pred[i] ? OP (a[i]) : bi;			\
+      }								\
   }
 
 #define TEST_INT_TYPE(T, TYPE) \
@@ -57,5 +60,5 @@ TEST_ALL (DEF_LOOP)
 /* At the moment we don't manage to avoid using MOVPRFX for the
    floating-point functions.  */
 /* { dg-final { scan-assembler-not {\tmovprfx\t} { xfail *-*-* } } } */
-/* { dg-final { scan-assembler-times {\tmovprfx\t} 6 } } */
+/* { dg-final { scan-assembler-times {\tmovprfx\t} 14 } } */
 /* { dg-final { scan-assembler-not {\tsel\t} } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/fmla_2.c b/gcc/testsuite/gcc.target/aarch64/sve/fmla_2.c
index 5c04bcdb3f5..a1b0667dab5 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/fmla_2.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/fmla_2.c
@@ -15,5 +15,9 @@ f (double *restrict a, double *restrict b, double *restrict c,
     }
 }
 
-/* { dg-final { scan-assembler-times {\tfmla\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, z[0-9]+\.d\n} 2 } } */
+/* See https://gcc.gnu.org/ml/gcc-patches/2019-08/msg01644.html
+   for XFAILing the below test.  */
+
+/* { dg-final { scan-assembler-times {\tfmla\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, z[0-9]+\.d\n} 2 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times {\tfmla\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, z[0-9]+\.d\n} 3 } } */
 /* { dg-final { scan-assembler-not {\tfmad\t} } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/vcond_4.c b/gcc/testsuite/gcc.target/aarch64/sve/vcond_4.c
index 00d84760a19..b38f23e87ba 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/vcond_4.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/vcond_4.c
@@ -98,24 +98,24 @@ TEST_CMP (nugt)
 /* { dg-final { scan-assembler-times {\tfcmne\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s\n} 30 { xfail *-*-* } } } */
 
 /* 5 for lt, 5 for ult and 5 for nult.  */
-/* { dg-final { scan-assembler-times {\tfcmlt\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, #0\.0\n} 15 { xfail *-*-* } } } */
-/* { dg-final { scan-assembler-times {\tfcmlt\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s\n} 30 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times {\tfcmlt\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, #0\.0\n} 15 } } */
+/* { dg-final { scan-assembler-times {\tfcmlt\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s\n} 30 } } */
 
 /* 5 for le, 5 for ule and 5 for nule.  */
-/* { dg-final { scan-assembler-times {\tfcmle\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, #0\.0\n} 15 { xfail *-*-* } } } */
-/* { dg-final { scan-assembler-times {\tfcmle\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s\n} 30 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times {\tfcmle\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, #0\.0\n} 15 } } */
+/* { dg-final { scan-assembler-times {\tfcmle\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s\n} 30 } } */
 
 /* 5 for gt, 5 for ugt and 5 for nugt.  */
-/* { dg-final { scan-assembler-times {\tfcmgt\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, #0\.0\n} 15 { xfail *-*-* } } } */
-/* { dg-final { scan-assembler-times {\tfcmgt\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s\n} 30 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times {\tfcmgt\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, #0\.0\n} 15 } } */
+/* { dg-final { scan-assembler-times {\tfcmgt\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s\n} 30 } } */
 
 /* 5 for ge, 5 for uge and 5 for nuge.  */
-/* { dg-final { scan-assembler-times {\tfcmge\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, #0\.0\n} 15 { xfail *-*-* } } } */
-/* { dg-final { scan-assembler-times {\tfcmge\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s\n} 30 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times {\tfcmge\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, #0\.0\n} 15 } } */
+/* { dg-final { scan-assembler-times {\tfcmge\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s\n} 30 } } */
 
 /* { dg-final { scan-assembler-not {\tfcmuo\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, #0\.0\n} } } */
 
 /* 3 loops * 5 invocations for all 12 unordered comparisons.  */
-/* { dg-final { scan-assembler-times {\tfcmuo\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s\n} 180 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times {\tfcmuo\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s\n} 180 } } */
 
 /* { dg-final { scan-assembler-times {\tfcmeq\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, #0\.0\n} 7 { xfail *-*-* } } } */
 /* { dg-final { scan-assembler-times {\tfcmeq\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d\n} 14 { xfail *-*-* } } } */
@@ -123,19 +123,19 @@ TEST_CMP (nugt)
 /* { dg-final { scan-assembler-times {\tfcmne\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, #0\.0\n} 21 { xfail *-*-* } } } */
 /* { dg-final { scan-assembler-times {\tfcmne\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d\n} 42 { xfail *-*-* } } } */
 
-/* { dg-final { scan-assembler-times {\tfcmlt\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, #0\.0\n} 21 { xfail *-*-* } } } */
-/* { dg-final { scan-assembler-times {\tfcmlt\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d\n} 42 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times {\tfcmlt\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, #0\.0\n} 21 } } */
+/* { dg-final { scan-assembler-times {\tfcmlt\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d\n} 42 } } */
 
-/* { dg-final { scan-assembler-times {\tfcmle\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, #0\.0\n} 21 { xfail *-*-* } } } */
-/* { dg-final { scan-assembler-times {\tfcmle\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d\n} 42 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times {\tfcmle\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, #0\.0\n} 21 } } */
+/* { dg-final { scan-assembler-times {\tfcmle\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d\n} 42 } } */
 
-/* { dg-final { scan-assembler-times {\tfcmgt\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, #0\.0\n} 21 { xfail *-*-* } } } */
-/* { dg-final { scan-assembler-times {\tfcmgt\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d\n} 42 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times {\tfcmgt\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, #0\.0\n} 21 } } */
+/* { dg-final { scan-assembler-times {\tfcmgt\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d\n} 42 } } */
 
-/* { dg-final { scan-assembler-times {\tfcmge\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, #0\.0\n} 21 { xfail *-*-* } } } */
-/* { dg-final { scan-assembler-times {\tfcmge\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d\n} 42 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times {\tfcmge\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, #0\.0\n} 21 } } */
+/* { dg-final { scan-assembler-times {\tfcmge\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d\n} 42 } } */
 
 /* { dg-final { scan-assembler-not {\tfcmuo\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, #0\.0\n} } } */
 
 /* 3 loops * 5 invocations, with 2 invocations having ncopies == 2, for
    all 12 unordered comparisons.  */
-/* { dg-final { scan-assembler-times {\tfcmuo\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d\n} 252 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times {\tfcmuo\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d\n} 252 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/vcond_5.c b/gcc/testsuite/gcc.target/aarch64/sve/vcond_5.c
index 23bfb7b2649..2f16fbff522 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/vcond_5.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/vcond_5.c
@@ -19,16 +19,16 @@
 /* { dg-final { scan-assembler-times {\tfcmlt\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s} 40 { xfail *-*-* } } } */
 
 /* 5 for le, 5 for ule and 5 for nule.  */
-/* { dg-final { scan-assembler-times {\tfcmle\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, #0\.0} 15 { xfail *-*-* } } } */
-/* { dg-final { scan-assembler-times {\tfcmle\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s} 30 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times {\tfcmle\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, #0\.0} 15 } } */
+/* { dg-final { scan-assembler-times {\tfcmle\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s} 30 } } */
 
 /* 5 for gt, 5 for ugt, 5 for nueq and 5 for nugt.  */
 /* { dg-final { scan-assembler-times {\tfcmgt\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, #0\.0} 20 { xfail *-*-* } } } */
 /* { dg-final { scan-assembler-times {\tfcmgt\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s} 40 { xfail *-*-* } } } */
 
 /* 5 for ge, 5 for uge and 5 for nuge.  */
-/* { dg-final { scan-assembler-times {\tfcmge\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, #0\.0} 15 { xfail *-*-* } } } */
-/* { dg-final { scan-assembler-times {\tfcmge\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s} 30 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times {\tfcmge\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, #0\.0} 15 } } */
+/* { dg-final { scan-assembler-times {\tfcmge\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s} 30 } } */
 
 /* { dg-final { scan-assembler-not {\tfcmuo\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, #0\.0} } } */
 
 /* 3 loops * 5 invocations for ordered, unordered amd ueq.  */
@@ -43,14 +43,14 @@
 /* { dg-final { scan-assembler-times {\tfcmlt\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, #0\.0} 28 { xfail *-*-* } } } */
 /* { dg-final { scan-assembler-times {\tfcmlt\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d} 56 { xfail *-*-* } } } */
 
-/* { dg-final { scan-assembler-times {\tfcmle\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, #0\.0} 21 { xfail *-*-* } } } */
-/* { dg-final { scan-assembler-times {\tfcmle\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d} 42 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times {\tfcmle\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, #0\.0} 21 } } */
+/* { dg-final { scan-assembler-times {\tfcmle\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d} 42 } } */
 
 /* { dg-final { scan-assembler-times {\tfcmgt\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, #0\.0} 28 { xfail *-*-* } } } */
 /* { dg-final { scan-assembler-times {\tfcmgt\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d} 56 { xfail *-*-* } } } */
 
-/* { dg-final { scan-assembler-times {\tfcmge\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, #0\.0} 21 { xfail *-*-* } } } */
-/* { dg-final { scan-assembler-times {\tfcmge\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d} 42 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times {\tfcmge\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, #0\.0} 21 } } */
+/* { dg-final { scan-assembler-times {\tfcmge\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d} 42 } } */
 
 /* { dg-final { scan-assembler-not {\tfcmuo\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, #0\.0} } } */
 
 /* 3 loops * 5 invocations, with 2 invocations having ncopies == 2,
diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
index b0cbbac0cb5..acd8d67d2a1 100644
--- a/gcc/tree-vect-loop.c
+++ b/gcc/tree-vect-loop.c
@@ -7197,7 +7197,7 @@ vectorizable_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
 	}
       else
 	vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
-			       vectype_in);
+			       vectype_in, NULL);
     }
 
   if (dump_enabled_p () && reduction_type == FOLD_LEFT_REDUCTION)
@@ -8110,7 +8110,7 @@ vectorizable_live_operation (stmt_vec_info stmt_info,
 	  gcc_assert (ncopies == 1 && !slp_node);
 	  vect_record_loop_mask (loop_vinfo, &LOOP_VINFO_MASKS (loop_vinfo),
-				 1, vectype);
+				 1, vectype, NULL);
 	}
     }
   return true;
@@ -8309,11 +8309,12 @@ vect_double_mask_nunits (tree type)
 
 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
    contain a sequence of NVECTORS masks that each control a vector of type
-   VECTYPE.  */
+   VECTYPE.  If SCALAR_MASK is nonnull, the fully-masked loop would AND
+   these vector masks with the vector version of SCALAR_MASK.  */
 
 void
 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
-		       unsigned int nvectors, tree vectype)
+		       unsigned int nvectors, tree vectype, tree scalar_mask)
 {
   gcc_assert (nvectors != 0);
   if (masks->length () < nvectors)
@@ -8329,6 +8330,12 @@ vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
       rgm->max_nscalars_per_iter = nscalars_per_iter;
       rgm->mask_type = build_same_sized_truth_vector_type (vectype);
     }
+
+  if (scalar_mask)
+    {
+      scalar_cond_masked_key cond (scalar_mask, nvectors);
+      loop_vinfo->scalar_cond_masked_set.add (cond);
+    }
 }
 
 /* Given a complete set of masks MASKS, extract mask number INDEX
diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
index dd9d45a9547..a5d4902e140 100644
--- a/gcc/tree-vect-stmts.c
+++ b/gcc/tree-vect-stmts.c
@@ -1879,7 +1879,8 @@ static tree permute_vec_elements (tree, tree, tree, stmt_vec_info,
    says how the load or store is going to be implemented and GROUP_SIZE
    is the number of load or store statements in the containing group.
    If the access is a gather load or scatter store, GS_INFO describes
-   its arguments.
+   its arguments.  If the load or store is conditional, SCALAR_MASK is the
+   condition under which it occurs.
 
    Clear LOOP_VINFO_CAN_FULLY_MASK_P if a fully-masked loop is not
    supported, otherwise record the required mask types.  */
@@ -1888,7 +1889,7 @@ static void
 check_load_store_masking (loop_vec_info loop_vinfo, tree vectype,
 			  vec_load_store_type vls_type, int group_size,
 			  vect_memory_access_type memory_access_type,
-			  gather_scatter_info *gs_info)
+			  gather_scatter_info *gs_info, tree scalar_mask)
 {
   /* Invariant loads need no special support.  */
   if (memory_access_type == VMAT_INVARIANT)
@@ -1912,7 +1913,7 @@ check_load_store_masking (loop_vec_info loop_vinfo, tree vectype,
 	  return;
 	}
       unsigned int ncopies = vect_get_num_copies (loop_vinfo, vectype);
-      vect_record_loop_mask (loop_vinfo, masks, ncopies, vectype);
+      vect_record_loop_mask (loop_vinfo, masks, ncopies, vectype, scalar_mask);
       return;
     }
@@ -1936,7 +1937,7 @@ check_load_store_masking (loop_vec_info loop_vinfo, tree vectype,
 	  return;
 	}
       unsigned int ncopies = vect_get_num_copies (loop_vinfo, vectype);
-      vect_record_loop_mask (loop_vinfo, masks, ncopies, vectype);
+      vect_record_loop_mask (loop_vinfo, masks, ncopies, vectype, scalar_mask);
       return;
     }
@@ -1974,7 +1975,7 @@ check_load_store_masking (loop_vec_info loop_vinfo, tree vectype,
   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
   unsigned int nvectors;
   if (can_div_away_from_zero_p (group_size * vf, nunits, &nvectors))
-    vect_record_loop_mask (loop_vinfo, masks, nvectors, vectype);
+    vect_record_loop_mask (loop_vinfo, masks, nvectors, vectype, scalar_mask);
   else
     gcc_unreachable ();
 }
@@ -3436,7 +3437,9 @@ vectorizable_call (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
 	  unsigned int nvectors = (slp_node
 				   ? SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node)
 				   : ncopies);
-	  vect_record_loop_mask (loop_vinfo, masks, nvectors, vectype_out);
+	  tree scalar_mask = gimple_call_arg (stmt_info->stmt, mask_opno);
+	  vect_record_loop_mask (loop_vinfo, masks, nvectors,
+				 vectype_out, scalar_mask);
 	}
       return true;
     }
@@ -7390,7 +7393,7 @@ vectorizable_store (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
   if (loop_vinfo
       && LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
     check_load_store_masking (loop_vinfo, vectype, vls_type, group_size,
-			      memory_access_type, &gs_info);
+			      memory_access_type, &gs_info, mask);
 
   STMT_VINFO_TYPE (stmt_info) = store_vec_info_type;
   vect_model_store_cost (stmt_info, ncopies, rhs_dt, memory_access_type,
@@ -8637,7 +8640,7 @@ vectorizable_load (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
   if (loop_vinfo
       && LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
     check_load_store_masking (loop_vinfo, vectype, VLS_LOAD, group_size,
-			      memory_access_type, &gs_info);
+			      memory_access_type, &gs_info, mask);
 
   STMT_VINFO_TYPE (stmt_info) = load_vec_info_type;
   vect_model_load_cost (stmt_info, ncopies, memory_access_type,
@@ -9975,6 +9978,32 @@ vectorizable_condition (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
   /* Handle cond expr.  */
   for (j = 0; j < ncopies; j++)
     {
+      tree loop_mask = NULL_TREE;
+      bool swap_cond_operands = false;
+
+      if (loop_vinfo && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
+	{
+	  scalar_cond_masked_key cond (cond_expr, ncopies);
+	  if (loop_vinfo->scalar_cond_masked_set.contains (cond))
+	    {
+	      vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
+	      loop_mask = vect_get_loop_mask (gsi, masks, ncopies, vectype, j);
+	    }
+	  else
+	    {
+	      bool honor_nans = HONOR_NANS (TREE_TYPE (cond.op0));
+	      cond.code = invert_tree_comparison (cond.code, honor_nans);
+	      if (loop_vinfo->scalar_cond_masked_set.contains (cond))
+		{
+		  vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
+		  loop_mask = vect_get_loop_mask (gsi, masks, ncopies,
						  vectype, j);
+		  cond_code = cond.code;
+		  swap_cond_operands = true;
+		}
+	    }
+	}
+
       stmt_vec_info new_stmt_info = NULL;
       if (j == 0)
 	{
@@ -10052,6 +10081,9 @@ vectorizable_condition (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
 	  vec_then_clause = vec_oprnds2[i];
 	  vec_else_clause = vec_oprnds3[i];
 
+	  if (swap_cond_operands)
+	    std::swap (vec_then_clause, vec_else_clause);
+
 	  if (masked)
 	    vec_compare = vec_cond_lhs;
 	  else
@@ -10090,6 +10122,28 @@ vectorizable_condition (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
 		    }
 		}
 	    }
+
+	  if (loop_mask)
+	    {
+	      if (COMPARISON_CLASS_P (vec_compare))
+		{
+		  tree tmp = make_ssa_name (vec_cmp_type);
+		  tree op0 = TREE_OPERAND (vec_compare, 0);
+		  tree op1 = TREE_OPERAND (vec_compare, 1);
+		  gassign *g = gimple_build_assign (tmp,
						    TREE_CODE (vec_compare),
						    op0, op1);
+		  vect_finish_stmt_generation (stmt_info, g, gsi);
+		  vec_compare = tmp;
+		}
+
+	      tree tmp2 = make_ssa_name (vec_cmp_type);
+	      gassign *g = gimple_build_assign (tmp2, BIT_AND_EXPR,
						vec_compare, loop_mask);
+	      vect_finish_stmt_generation (stmt_info, g, gsi);
+	      vec_compare = tmp2;
+	    }
+
 	  if (reduction_type == EXTRACT_LAST_REDUCTION)
 	    {
 	      if (!is_gimple_val (vec_compare))
diff --git a/gcc/tree-vectorizer.c b/gcc/tree-vectorizer.c
index dc181524744..e44eb3dda07 100644
--- a/gcc/tree-vectorizer.c
+++ b/gcc/tree-vectorizer.c
@@ -1513,3 +1513,36 @@ make_pass_ipa_increase_alignment (gcc::context *ctxt)
 {
   return new pass_ipa_increase_alignment (ctxt);
 }
+
+/* If code(T) is comparison op or def of comparison stmt,
+   extract its operands.
+   Else return <NE_EXPR, T, 0>.  */
+
+void
+scalar_cond_masked_key::get_cond_ops_from_tree (tree t)
+{
+  if (TREE_CODE_CLASS (TREE_CODE (t)) == tcc_comparison)
+    {
+      this->code = TREE_CODE (t);
+      this->op0 = TREE_OPERAND (t, 0);
+      this->op1 = TREE_OPERAND (t, 1);
+      return;
+    }
+
+  if (TREE_CODE (t) == SSA_NAME)
+    if (gassign *stmt = dyn_cast<gassign *> (SSA_NAME_DEF_STMT (t)))
+      {
+	tree_code code = gimple_assign_rhs_code (stmt);
+	if (TREE_CODE_CLASS (code) == tcc_comparison)
+	  {
+	    this->code = code;
+	    this->op0 = gimple_assign_rhs1 (stmt);
+	    this->op1 = gimple_assign_rhs2 (stmt);
+	    return;
+	  }
+      }
+
+  this->code = NE_EXPR;
+  this->op0 = t;
+  this->op1 = build_zero_cst (TREE_TYPE (t));
+}
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index 1456cde4c2c..e20a61ee33f 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -26,6 +26,7 @@ typedef class _stmt_vec_info *stmt_vec_info;
 #include "tree-data-ref.h"
 #include "tree-hash-traits.h"
 #include "target.h"
+#include "hash-set.h"
 
 /* Used for naming of new temporaries.  */
 enum vect_var_kind {
@@ -174,7 +175,71 @@ public:
 #define SLP_TREE_TWO_OPERATORS(S)		 (S)->two_operators
 #define SLP_TREE_DEF_TYPE(S)			 (S)->def_type
 
+struct scalar_cond_masked_key
+{
+  scalar_cond_masked_key (tree t, unsigned ncopies_)
+    : ncopies (ncopies_)
+  {
+    get_cond_ops_from_tree (t);
+  }
+
+  void get_cond_ops_from_tree (tree);
+
+  unsigned ncopies;
+  tree_code code;
+  tree op0;
+  tree op1;
+};
+template<>
+struct default_hash_traits<scalar_cond_masked_key>
+{
+  typedef scalar_cond_masked_key compare_type;
+  typedef scalar_cond_masked_key value_type;
+
+  static inline hashval_t
+  hash (value_type v)
+  {
+    inchash::hash h;
+    h.add_int (v.code);
+    inchash::add_expr (v.op0, h, 0);
+    inchash::add_expr (v.op1, h, 0);
+    h.add_int (v.ncopies);
+    return h.end ();
+  }
+
+  static inline bool
+  equal (value_type existing, value_type candidate)
+  {
+    return (existing.ncopies == candidate.ncopies
+	    && existing.code == candidate.code
+	    && operand_equal_p (existing.op0, candidate.op0, 0)
+	    && operand_equal_p (existing.op1, candidate.op1, 0));
+  }
+
+  static inline void
+  mark_empty (value_type &v)
+  {
+    v.ncopies = 0;
+  }
+
+  static inline bool
+  is_empty (value_type v)
+  {
+    return v.ncopies == 0;
+  }
+
+  static inline void mark_deleted (value_type &) {}
+
+  static inline bool is_deleted (const value_type &)
+  {
+    return false;
+  }
+
+  static inline void remove (value_type &) {}
+};
+
+typedef hash_set<scalar_cond_masked_key> scalar_cond_masked_set_type;
 
 /* Describes two objects whose addresses must be unequal for the vectorized
    loop to be valid.  */
@@ -255,6 +320,9 @@ public:
   /* Cost data used by the target cost model.  */
   void *target_cost_data;
 
+  /* Set of scalar conditions that have loop mask applied.  */
+  scalar_cond_masked_set_type scalar_cond_masked_set;
+
 private:
   stmt_vec_info new_stmt_vec_info (gimple *stmt);
   void set_vinfo_for_stmt (gimple *, stmt_vec_info);
@@ -1617,7 +1685,7 @@ extern void vect_gen_vector_loop_niters (loop_vec_info, tree, tree *,
 extern tree vect_halve_mask_nunits (tree);
 extern tree vect_double_mask_nunits (tree);
 extern void vect_record_loop_mask (loop_vec_info, vec_loop_masks *,
-				   unsigned int, tree);
+				   unsigned int, tree, tree);
 extern tree vect_get_loop_mask (gimple_stmt_iterator *, vec_loop_masks *,
 				unsigned int, tree, unsigned int);
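
A closing note on the default_hash_traits specialization above: GCC's
hash tables require the traits to designate an "empty" slot value, and
the patch reuses ncopies == 0 for that purpose, which works because a
live key always has ncopies >= 1; since the set is only ever added to,
the deleted-slot hooks can be no-ops.  A minimal standalone analogue of
that sentinel convention (hypothetical names, not GCC code):

// Standalone analogue (not GCC code) of the empty-slot convention used
// by default_hash_traits<scalar_cond_masked_key>: ncopies == 0 marks an
// empty slot, so live keys must have ncopies >= 1.
#include <cassert>
#include <vector>

struct toy_key
{
  unsigned ncopies;  // 0 is reserved as the "empty slot" sentinel
  int code;
};

static void mark_empty (toy_key &k) { k.ncopies = 0; }
static bool is_empty (const toy_key &k) { return k.ncopies == 0; }

int
main ()
{
  std::vector<toy_key> table (8);
  for (toy_key &slot : table)
    mark_empty (slot);		// a fresh table starts all-empty

  assert (is_empty (table[0]));
  table[3] = {2, 42};		// insert a live key: ncopies >= 1
  assert (!is_empty (table[3]));
  return 0;
}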