This patch would add a new middle-end representation for matching the x264 narrowing clip idiom:

  inline U_NT clip_uint8 (S_WT x) { return x & (~((U_NT)-1)) ? (-x) >> 31 : x; }

which would be accessible through the define_expand <us>clip<m1><m2>2 optabs.
For example, truncating int32_t to uint8_t would produce the following results:

* .NARROW_CLIP (254) => 254
* .NARROW_CLIP (255) => 255
* .NARROW_CLIP (65535) => 255
* .NARROW_CLIP (-1) => 0

Currently this patch only supports clipping to and returning an unsigned
narrow type.  I'm unsure whether this is the best way to approach the
problem, since there is an existing optab, .SAT_TRUNC, which performs a
similar operation.  The main difference between .NARROW_CLIP and .SAT_TRUNC
shows up in the example above (clipping int32_t to uint8_t):

* .SAT_TRUNC (-1) => 255
* .NARROW_CLIP (-1) => 0

Using .SAT_TRUNC here would break the intended semantics of the original
code, which is why I thought a separate optab would make sense.  If there is
a better way to approach this which would reuse more of the .SAT_TRUNC
machinery, please let me know.
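For reference, below is a minimal, self-contained scalar sketch (not part of
the patch) contrasting the two behaviours on the values above.  clip_uint8 is
the x264 idiom; sat_trunc_uint8 is only a hand-written stand-in for the
saturating truncation described above, not the .SAT_TRUNC internal function
itself, and the idiom assumes an arithmetic right shift of negative values:

  #include <stdint.h>
  #include <stdio.h>

  /* The x264 narrowing clip idiom: negative inputs clip to 0, inputs above
     255 clip to 255 (assumes arithmetic right shift, as the x264 code does).  */
  static inline uint8_t
  clip_uint8 (int x)
  {
    return x & (~255) ? (-x) >> 31 : x;
  }

  /* Hand-written model of the unsigned saturating truncation described
     above: the value is treated as unsigned and saturated at 255, so -1
     becomes 255.  Illustration only, not the internal function.  */
  static inline uint8_t
  sat_trunc_uint8 (int x)
  {
    return (unsigned) x > 255 ? 255 : (uint8_t) x;
  }

  int
  main (void)
  {
    int vals[] = { 254, 255, 65535, -1 };
    for (int i = 0; i < 4; i++)
      printf ("%6d: narrow_clip=%3d sat_trunc=%3d\n", vals[i],
              clip_uint8 (vals[i]), sat_trunc_uint8 (vals[i]));
    return 0;
  }

Compiled and run, this prints 0 for the clip of -1 but 255 for the saturating
truncation of -1, which is exactly the semantic difference the new optab is
meant to preserve.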
PR target/120378

gcc/ChangeLog:

        * config/riscv/autovec.md (sclip<mode><v_oct_trunc>2): New pattern.
        (uclip<mode><v_oct_trunc>2): Ditto.
        (sclip<mode><v_quad_trunc>2): Ditto.
        (uclip<mode><v_quad_trunc>2): Ditto.
        (sclip<mode><v_double_trunc>2): Ditto.
        (uclip<mode><v_double_trunc>2): Ditto.
        * internal-fn.def (NARROW_CLIP): New ifn.
        * match.pd: Match narrow clip idiom.
        * optabs.def (OPTAB_CL): Add (un)signed narrow clip optab.
        * rtl.def (S_NARROW_CLIP): Match for narrow clip rtl.
        (U_NARROW_CLIP): Ditto.
        * simplify-rtx.cc (simplify_const_unary_operation): New case.
        * tree-vect-patterns.cc (gimple_unsigned_integer_narrow_clip): New pattern.
        (gimple_signed_integer_narrow_clip): New pattern.
        (vect_recog_narrow_clip_pattern): New pattern.

gcc/testsuite/ChangeLog:

        * gcc.target/riscv/rvv/autovec/pr120378.c: New test.

Signed-off-by: Edwin Lu <e...@rivosinc.com>
---
 gcc/config/riscv/autovec.md                   | 73 +++++++++++++++++
 gcc/internal-fn.def                           |  2 +
 gcc/match.pd                                  | 24 ++++++
 gcc/optabs.def                                |  3 +
 gcc/rtl.def                                   |  6 ++
 gcc/simplify-rtx.cc                           | 16 ++++
 .../gcc.target/riscv/rvv/autovec/pr120378.c   | 20 +++++
 gcc/tree-vect-patterns.cc                     | 82 +++++++++++++++++++
 8 files changed, 226 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/pr120378.c

diff --git a/gcc/config/riscv/autovec.md b/gcc/config/riscv/autovec.md
index 1fff8ac2fc4..95394f4dd15 100644
--- a/gcc/config/riscv/autovec.md
+++ b/gcc/config/riscv/autovec.md
@@ -3029,3 +3029,76 @@ (define_expand "uabd<mode>3"
     DONE;
   });
+
+; ========
+; == Narrow clip
+; ========
+
+(define_expand "sclip<mode><v_oct_trunc>2"
+  [(match_operand:<V_OCT_TRUNC> 0 "register_operand")
+   (match_operand:VOEXTI 1 "register_operand")]
+  "TARGET_VECTOR && 0"
+  {
+    gcc_assert(0);
+  });
+
+(define_expand "uclip<mode><v_oct_trunc>2"
+  [(match_operand:<V_OCT_TRUNC> 0 "register_operand")
+   (match_operand:VOEXTI 1 "register_operand") ]
+  "TARGET_VECTOR"
+  {
+    rtx max = gen_reg_rtx (<MODE>mode);
+    insn_code icode = code_for_pred (SMAX, <MODE>mode);
+    rtx ops1[] = {max, operands[1], CONST0_RTX (<MODE>mode)};
+    riscv_vector::emit_vlmax_insn (icode, riscv_vector::BINARY_OP, ops1);
+
+    riscv_vector::expand_vec_oct_ustrunc (operands[0], max, <MODE>mode,
+                                          <V_DOUBLE_TRUNC>mode,
+                                          <V_QUAD_TRUNC>mode);
+    DONE;
+  });
+
+(define_expand "sclip<mode><v_quad_trunc>2"
+  [(match_operand:<V_QUAD_TRUNC> 0 "register_operand")
+   (match_operand:VQEXTI 1 "register_operand")]
+  "TARGET_VECTOR && 0"
+  {
+    gcc_assert(0);
+  });
+
+(define_expand "uclip<mode><v_quad_trunc>2"
+  [(match_operand:<V_QUAD_TRUNC> 0 "register_operand")
+   (match_operand:VQEXTI 1 "register_operand") ]
+  "TARGET_VECTOR"
+  {
+    rtx max = gen_reg_rtx (<MODE>mode);
+    insn_code icode = code_for_pred (SMAX, <MODE>mode);
+    rtx ops1[] = {max, operands[1], CONST0_RTX (<MODE>mode)};
+    riscv_vector::emit_vlmax_insn (icode, riscv_vector::BINARY_OP, ops1);
+
+    riscv_vector::expand_vec_quad_ustrunc (operands[0], max, <MODE>mode,
+                                           <V_DOUBLE_TRUNC>mode);
+    DONE;
+  });
+
+(define_expand "sclip<mode><v_double_trunc>2"
+  [(match_operand:<V_DOUBLE_TRUNC> 0 "register_operand")
+   (match_operand:VWEXTI 1 "register_operand")]
+  "TARGET_VECTOR && 0"
+  {
+    gcc_assert(0);
+  });
+
+(define_expand "uclip<mode><v_double_trunc>2"
+  [(match_operand:<V_DOUBLE_TRUNC> 0 "register_operand")
+   (match_operand:VWEXTI 1 "register_operand") ]
+  "TARGET_VECTOR"
+  {
+    rtx max = gen_reg_rtx (<MODE>mode);
+    insn_code icode = code_for_pred (SMAX, <MODE>mode);
+    rtx ops1[] = {max, operands[1], CONST0_RTX (<MODE>mode)};
+    riscv_vector::emit_vlmax_insn (icode, riscv_vector::BINARY_OP, ops1);
+
+    riscv_vector::expand_vec_double_ustrunc (operands[0], max, <MODE>mode);
+    DONE;
+  });
diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def
index d2480a1bf79..85f44a53729 100644
--- a/gcc/internal-fn.def
+++ b/gcc/internal-fn.def
@@ -286,6 +286,8 @@ DEF_INTERNAL_SIGNED_OPTAB_FN (SAT_MUL, ECF_CONST, first, ssmul, usmul, binary)
 DEF_INTERNAL_SIGNED_OPTAB_FN (SAT_TRUNC, ECF_CONST, first, sstrunc, ustrunc, unary_convert)
 
+DEF_INTERNAL_SIGNED_OPTAB_FN (NARROW_CLIP, ECF_CONST, first, sclip, uclip, unary_convert)
+
 DEF_INTERNAL_COND_FN (ADD, ECF_CONST, add, binary)
 DEF_INTERNAL_COND_FN (SUB, ECF_CONST, sub, binary)
 DEF_INTERNAL_COND_FN (MUL, ECF_CONST, smul, binary)
diff --git a/gcc/match.pd b/gcc/match.pd
index 4903552c82a..73013bc1e29 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -3360,6 +3360,30 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
    }
    (if (wi::eq_p (sum, wi::uhwi (0, precision))))))))
 
+/* Narrow clip for unsigned integer.  */
+(if (INTEGRAL_TYPE_P (type) && TYPE_UNSIGNED (type))
+ (match (unsigned_integer_narrow_clip @0)
+  /* NARROW_CLIP = (T)X & (NT)(-1) ? (-X) >> 31 : X
+
+     The gimple representation uses X > (NT)(-1) instead of
+     using & so match on gt instead of bit_and.  */
+  (convert (cond^ (gt (nop_convert? @0) INTEGER_CST@1)
+                  (rshift:s (nop_convert? (negate (nop_convert? @0))) INTEGER_CST@2)
+                  @0))
+  (if (! TYPE_UNSIGNED (TREE_TYPE (@0)))
+   (with
+    {
+      unsigned itype_precision = TYPE_PRECISION (TREE_TYPE (@0));
+      unsigned otype_precision = TYPE_PRECISION (type);
+      wide_int trunc_max = wi::mask (otype_precision, false, itype_precision);
+      wide_int int_cst_1 = wi::to_wide (@1, itype_precision);
+      wide_int int_cst_2 = wi::to_wide (@2, itype_precision);
+      wide_int shift_amount = wi::uhwi ((HOST_WIDE_INT_1U << 5) - 1,
+                                        itype_precision); // Aka 31
+    }
+    (if (otype_precision < itype_precision && wi::eq_p (trunc_max,
+         int_cst_1) && wi::eq_p(int_cst_2, shift_amount)))))))
+
 /* Saturation truncate for unsigned integer.  */
 (if (INTEGRAL_TYPE_P (type) && TYPE_UNSIGNED (type))
  (match (unsigned_integer_sat_trunc @0)
diff --git a/gcc/optabs.def b/gcc/optabs.def
index 87a8b85da15..b56e9e75a75 100644
--- a/gcc/optabs.def
+++ b/gcc/optabs.def
@@ -70,6 +70,9 @@ OPTAB_CL(satfractuns_optab, "satfractuns$I$b$Q$a2", UNSIGNED_SAT_FRACT, "satfrac
 OPTAB_CL(ustrunc_optab, "ustrunc$b$a2", US_TRUNCATE, "ustrunc", NULL)
 OPTAB_CL(sstrunc_optab, "sstrunc$b$a2", SS_TRUNCATE, "sstrunc", NULL)
 
+OPTAB_CL(uclip_optab, "uclip$b$a2", U_NARROW_CLIP, "uclip", NULL)
+OPTAB_CL(sclip_optab, "sclip$b$a2", S_NARROW_CLIP, "sclip", NULL)
+
 OPTAB_CD(sfixtrunc_optab, "fix_trunc$F$b$I$a2")
 OPTAB_CD(ufixtrunc_optab, "fixuns_trunc$F$b$I$a2")
diff --git a/gcc/rtl.def b/gcc/rtl.def
index 15ae7d10fcc..f3387aa8ea7 100644
--- a/gcc/rtl.def
+++ b/gcc/rtl.def
@@ -753,6 +753,12 @@ DEF_RTL_EXPR(SS_TRUNCATE, "ss_truncate", "e", RTX_UNARY)
 /* Unsigned saturating truncate.  */
 DEF_RTL_EXPR(US_TRUNCATE, "us_truncate", "e", RTX_UNARY)
 
+/* Signed narrowing clip.  */
+DEF_RTL_EXPR(S_NARROW_CLIP, "s_narrow_clip", "e", RTX_UNARY)
+
+/* Unsigned narrowing clip.  */
+DEF_RTL_EXPR(U_NARROW_CLIP, "u_narrow_clip", "e", RTX_UNARY)
+
 /* Floating point multiply/add combined instruction.  */
 DEF_RTL_EXPR(FMA, "fma", "eee", RTX_TERNARY)
diff --git a/gcc/simplify-rtx.cc b/gcc/simplify-rtx.cc
index cbe61b49bf6..a195ec502c5 100644
--- a/gcc/simplify-rtx.cc
+++ b/gcc/simplify-rtx.cc
@@ -2179,6 +2179,22 @@ simplify_const_unary_operation (enum rtx_code code, machine_mode mode,
          result = wide_int::from (result, width, sgn);
          break;
        }
+
+      case U_NARROW_CLIP:
+      case S_NARROW_CLIP:
+       {
+         signop sgn = code == U_NARROW_CLIP ? UNSIGNED : SIGNED;
+         wide_int nmax
+           = wide_int::from (wi::max_value (width, sgn),
+                             GET_MODE_PRECISION (imode), sgn);
+         wide_int nmin
+           = wide_int::from (wi::min_value (width, sgn),
+                             GET_MODE_PRECISION (imode), sgn);
+         result = wi::min (wi::max (op0, nmin, sgn), nmax, sgn);
+         result = wide_int::from (result, width, sgn);
+         break;
+       }
+
       case SIGN_EXTEND:
        result = wide_int::from (op0, width, SIGNED);
        break;
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr120378.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr120378.c
new file mode 100644
index 00000000000..4cfedde99ee
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr120378.c
@@ -0,0 +1,20 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv -mabi=lp64d -O3 -fdump-tree-optimized" } */
+
+#include <stdint.h>
+
+inline uint8_t
+clip_uint8 (int x)
+{
+  return x & (~255) ? (-x) >> 31 : x;
+}
+
+void __attribute__ ((noipa))
+clip_loop (uint8_t *res, int *x, int w)
+{
+  for (int i = 0; i < w; i++)
+    res[i] = clip_uint8 (x[i]);
+}
+
+/* { dg-final { scan-tree-dump-times ".NARROW_CLIP " 1 "optimized" } } */
+/* { dg-final { scan-assembler-times {vnclipu\.wi} 2 } } */
diff --git a/gcc/tree-vect-patterns.cc b/gcc/tree-vect-patterns.cc
index 0f6d6b77ea1..31629a31b93 100644
--- a/gcc/tree-vect-patterns.cc
+++ b/gcc/tree-vect-patterns.cc
@@ -3675,6 +3675,87 @@ vect_recog_cast_forwprop_pattern (vec_info *vinfo,
   return pattern_stmt;
 }
 
+extern bool gimple_unsigned_integer_narrow_clip (tree, tree*, tree (*)(tree));
+extern bool gimple_signed_integer_narrow_clip (tree, tree*, tree (*)(tree));
+
+/* Function vect_recog_narrow_clip_pattern
+
+   Try to find the following narrow clip pattern:
+
+     type x_t;
+     TYPE x_T, clip = init;
+   loop:
+     clip_0 = phi <init, clip_1>
+     S1  x_t = *p;
+     S2  temp_t = type_u x_t;
+     S3  neg_x_t = -temp_t;
+     S4  neg_signed_x_t = (type) neg_x_t;
+     S5  x_shifted = neg_signed_x_t >> 31;
+     S6  is_greater = x_shifted > 255;
+     S7  cond = is_greater ? x_shifted : x_t;
+     S8  clip_1 = (TYPE) cond;
+
+   where type 'TYPE' is at most half the size of type 'type'.
+
+   Input:
+
+   * STMT_VINFO: The stmt from which the pattern search begins.  In the
+   example, when this function is called with S8, the pattern
+   {S3,S4,S5,S6,S7,S8} will be detected.
+
+   Output:
+
+   * TYPE_OUT: The type of the output of this pattern.
+
+   * Return value: A new stmt that will be used to replace the sequence of
+   stmts that constitute the pattern.  In this case it will be:
+   .NARROW_CLIP (x_t)
+ */
+
+static gimple *
+vect_recog_narrow_clip_pattern (vec_info *vinfo,
+                                stmt_vec_info stmt_vinfo, tree *type_out)
+{
+
+  gimple *last_stmt = STMT_VINFO_STMT (stmt_vinfo);
+
+  if (!is_gimple_assign (last_stmt))
+    return NULL;
+
+  tree ops[1];
+  tree lhs = gimple_assign_lhs (last_stmt);
+  tree otype = TREE_TYPE (lhs);
+
+  if ((gimple_unsigned_integer_narrow_clip (lhs, ops, NULL))
+      // || gimple_signed_integer_narrow_clip (lhs, ops, NULL))
+      && type_has_mode_precision_p (otype))
+    {
+      tree itype = TREE_TYPE (ops[0]);
+      tree v_itype = get_vectype_for_scalar_type (vinfo, itype);
+      tree v_otype = get_vectype_for_scalar_type (vinfo, otype);
+      internal_fn fn = IFN_NARROW_CLIP;
+
+      if (v_itype != NULL_TREE && v_otype != NULL_TREE
+          && direct_internal_fn_supported_p (fn, tree_pair (v_otype, v_itype),
+                                             OPTIMIZE_FOR_BOTH))
+        {
+          gcall *call = gimple_build_call_internal (fn, 1, ops[0]);
+          tree out_ssa = vect_recog_temp_ssa_var (otype, NULL);
+
+          gimple_call_set_lhs (call, out_ssa);
+          gimple_call_set_nothrow (call, /* nothrow_p */ false);
+          gimple_set_location (call, gimple_location (last_stmt));
+
+          *type_out = v_otype;
+          vect_pattern_detected ("vect_recog_narrow_clip_pattern", stmt_vinfo->stmt);
+
+          return call;
+        }
+    }
+
+  return NULL;
+}
+
 /* Try to detect a shift left of a widened input, converting LSHIFT_EXPR
    to WIDEN_LSHIFT_EXPR.  See vect_recog_widen_op_pattern for details.  */
@@ -6917,6 +6998,7 @@ static vect_recog_func vect_vect_recog_func_ptrs[] = {
   { vect_recog_sat_add_pattern, "sat_add" },
   { vect_recog_sat_sub_pattern, "sat_sub" },
   { vect_recog_sat_trunc_pattern, "sat_trunc" },
+  { vect_recog_narrow_clip_pattern, "narrow_clip" },
   { vect_recog_gcond_pattern, "gcond" },
   { vect_recog_bool_pattern, "bool" },
   /* This must come before mask conversion, and includes the parts
-- 
2.43.0