Thanks for looking at this. Prathamesh Kulkarni <prathamesh.kulka...@linaro.org> writes: > Hi, > As mentioned in PR, for the following test-case: > > typedef unsigned char uint8_t; > > static inline uint8_t > x264_clip_uint8(uint8_t x) > { > uint8_t t = -x; > uint8_t t1 = x & ~63; > return (t1 != 0) ? t : x; > } > > void > mc_weight(uint8_t *restrict dst, uint8_t *restrict src, int n) > { > for (int x = 0; x < n*16; x++) > dst[x] = x264_clip_uint8(src[x]); > } > > -O3 -mcpu=generic+sve generates the following code for the inner loop: > > .L3: > ld1b z0.b, p0/z, [x1, x2] > movprfx z2, z0 > and z2.b, z2.b, #0xc0 > movprfx z1, z0 > neg z1.b, p1/m, z0.b > cmpeq p2.b, p1/z, z2.b, #0 > sel z0.b, p2, z0.b, z1.b > st1b z0.b, p0, [x0, x2] > add x2, x2, x4 > whilelo p0.b, w2, w3 > b.any .L3 > > The sel is redundant since we could conditionally negate z0 based on > the predicate > comparing z2 with 0. > > As suggested in the PR, the attached patch introduces a new > conditional internal function .COND_NEG, and in gimple-isel replaces > the following sequence: > op2 = -op1 > op0 = A cmp B > lhs = op0 ? op1 : op2 > > with: > op0 = A inverted_cmp B > lhs = .COND_NEG (op0, op1, op1). > > lhs = .COND_NEG (op0, op1, op1) > implies > lhs = neg (op1) if cond is true OR fall back to op1 if cond is false. > > With patch, it generates the following code-gen: > .L3: > ld1b z0.b, p0/z, [x1, x2] > movprfx z1, z0 > and z1.b, z1.b, #0xc0 > cmpne p1.b, p2/z, z1.b, #0 > neg z0.b, p1/m, z0.b > st1b z0.b, p0, [x0, x2] > add x2, x2, x4 > whilelo p0.b, w2, w3 > b.any .L3 > > While it seems to work for this test-case, I am not entirely sure if > the patch is correct. Does it look in the right direction?
For binary ops we use match.pd rather than isel: (for uncond_op (UNCOND_BINARY) cond_op (COND_BINARY) (simplify (vec_cond @0 (view_convert? (uncond_op@4 @1 @2)) @3) (with { tree op_type = TREE_TYPE (@4); } (if (vectorized_internal_fn_supported_p (as_internal_fn (cond_op), op_type) && is_truth_type_for (op_type, TREE_TYPE (@0))) (view_convert (cond_op @0 @1 @2 (view_convert:op_type @3)))))) (simplify (vec_cond @0 @1 (view_convert? (uncond_op@4 @2 @3))) (with { tree op_type = TREE_TYPE (@4); } (if (vectorized_internal_fn_supported_p (as_internal_fn (cond_op), op_type) && is_truth_type_for (op_type, TREE_TYPE (@0))) (view_convert (cond_op (bit_not @0) @2 @3 (view_convert:op_type @1))))))) I think it'd be good to do the same here, using new (UN)COND_UNARY iterators. (The iterators will only have one value to start with, but other unary ops could get the same treatment in future.) Richard > > Thanks, > Prathamesh > > diff --git a/gcc/gimple-isel.cc b/gcc/gimple-isel.cc > index 38e90933c3e..5b0dd3c1993 100644 > --- a/gcc/gimple-isel.cc > +++ b/gcc/gimple-isel.cc > @@ -39,6 +39,8 @@ along with GCC; see the file COPYING3. If not see > #include "optabs.h" > #include "gimple-fold.h" > #include "internal-fn.h" > +#include "fold-const.h" > +#include "tree-pretty-print.h" > > /* Expand all ARRAY_REF(VIEW_CONVERT_EXPR) gimple assignments into calls to > internal function based on vector type of selected expansion. > @@ -203,6 +205,35 @@ gimple_expand_vec_cond_expr (gimple_stmt_iterator *gsi, > return new_stmt; > } > > + /* Replace: > + op2 = -op1 > + op0 = A cmp B > + lhs = op0 ? op1 : op2 > + > + with: > + op0 = A inverted_cmp B > + lhs = .COND_NEG (op0, op1, op1). 
*/ > + > + gassign *op1_def = nullptr; > + if (TREE_CODE (op1) == SSA_NAME) > + op1_def = static_cast<gassign *> (SSA_NAME_DEF_STMT (op1)); > + > + gassign *op2_def = nullptr; > + if (TREE_CODE (op2) == SSA_NAME) > + op2_def = static_cast<gassign *> (SSA_NAME_DEF_STMT (op2)); > + > + if (can_compute_op0 && op1_def && op2_def > + && gimple_assign_rhs_code (op2_def) == NEGATE_EXPR > + && operand_equal_p (gimple_assign_rhs1 (op2_def), op1, 0)) > + { > + auto inverted_code > + = invert_tree_comparison (gimple_assign_rhs_code (def_stmt), > true); > + gimple_assign_set_rhs_code (def_stmt, inverted_code); > + auto gsi2 = gsi_for_stmt (op2_def); > + gsi_remove (&gsi2, true); > + return gimple_build_call_internal (IFN_COND_NEG, 3, op0, op1, > op1); > + } > + > if (can_compute_op0 > && used_vec_cond_exprs >= 2 > && (get_vcond_mask_icode (mode, TYPE_MODE (op0_type)) > diff --git a/gcc/internal-fn.c b/gcc/internal-fn.c > index 78db25bbac4..b57c7a4ed3e 100644 > --- a/gcc/internal-fn.c > +++ b/gcc/internal-fn.c > @@ -3877,7 +3877,8 @@ static void (*const internal_fn_expanders[]) > (internal_fn, gcall *) = { > T (BIT_IOR_EXPR, IFN_COND_IOR) \ > T (BIT_XOR_EXPR, IFN_COND_XOR) \ > T (LSHIFT_EXPR, IFN_COND_SHL) \ > - T (RSHIFT_EXPR, IFN_COND_SHR) > + T (RSHIFT_EXPR, IFN_COND_SHR) \ > + T (NEGATE_EXPR, IFN_COND_NEG) > > /* Return a function that only performs CODE when a certain condition is met > and that uses a given fallback value otherwise. 
For example, if CODE is > diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def > index 88169ef4656..db40d1bfd18 100644 > --- a/gcc/internal-fn.def > +++ b/gcc/internal-fn.def > @@ -204,6 +204,8 @@ DEF_INTERNAL_OPTAB_FN (COND_FMS, ECF_CONST, cond_fms, > cond_ternary) > DEF_INTERNAL_OPTAB_FN (COND_FNMA, ECF_CONST, cond_fnma, cond_ternary) > DEF_INTERNAL_OPTAB_FN (COND_FNMS, ECF_CONST, cond_fnms, cond_ternary) > > +DEF_INTERNAL_OPTAB_FN (COND_NEG, ECF_CONST | ECF_NOTHROW, cond_neg, > cond_unary) > + > DEF_INTERNAL_OPTAB_FN (RSQRT, ECF_CONST, rsqrt, unary) > > DEF_INTERNAL_OPTAB_FN (REDUC_PLUS, ECF_CONST | ECF_NOTHROW, > diff --git a/gcc/optabs.def b/gcc/optabs.def > index 201b8aae1c0..fc65a3a7c23 100644 > --- a/gcc/optabs.def > +++ b/gcc/optabs.def > @@ -245,6 +245,7 @@ OPTAB_D (cond_fma_optab, "cond_fma$a") > OPTAB_D (cond_fms_optab, "cond_fms$a") > OPTAB_D (cond_fnma_optab, "cond_fnma$a") > OPTAB_D (cond_fnms_optab, "cond_fnms$a") > +OPTAB_D (cond_neg_optab, "cond_neg$a") > OPTAB_D (cmov_optab, "cmov$a6") > OPTAB_D (cstore_optab, "cstore$a4") > OPTAB_D (ctrap_optab, "ctrap$a4")