This patch would add a new middle-end representation for matching the
x264 narrowing clip idiom:

inline
U_NT clip_uint8 (S_WT x)
{
  return x & (~((U_NT)-1)) ? (-x) >> 31 : x;
}

which would be accessible through the define_expand <us>clip<m1><m2>2
optabs.

For example, truncating int32_t to uint8_t would produce the following
results:

* .NARROW_CLIP (254)    => 254
* .NARROW_CLIP (255)    => 255
* .NARROW_CLIP (65535)  => 255
* .NARROW_CLIP (-1)     => 0

Currently this patch only supports clipping and returning an unsigned
narrow type. I'm unsure if this is the best way to approach the problem,
as the existing .SAT_TRUNC optab performs a closely related operation.
The main difference between .NARROW_CLIP and .SAT_TRUNC can be seen in
the example above (clipping int32_t to uint8_t):

* .SAT_TRUNC (-1)       => 255
* .NARROW_CLIP (-1)     => 0

The wraparound result from .SAT_TRUNC breaks the intended semantics of
the code, which is why I thought another optab would make sense. If
there is a better way to approach this which would utilize more of the
.SAT_TRUNC optab, please let me know.

        PR target/120378

gcc/ChangeLog:

        * config/riscv/autovec.md (sclip<mode><v_oct_trunc>2): New
        pattern.
        (uclip<mode><v_oct_trunc>2): Ditto.
        (sclip<mode><v_quad_trunc>2): Ditto.
        (uclip<mode><v_quad_trunc>2): Ditto.
        (sclip<mode><v_double_trunc>2): Ditto.
        (uclip<mode><v_double_trunc>2): Ditto.
        * internal-fn.def (NARROW_CLIP): New ifn.
        * match.pd: Match narrow clip idiom.
        * optabs.def (OPTAB_CL): Add (un)signed narrow clip optab.
        * rtl.def (S_NARROW_CLIP): Match for narrow clip rtl.
        (U_NARROW_CLIP): Ditto.
        * simplify-rtx.cc (simplify_const_unary_operation): New case.
        * tree-vect-patterns.cc (gimple_unsigned_integer_narrow_clip):
        New pattern.
        (gimple_signed_integer_narrow_clip): New pattern.
        (vect_recog_narrow_clip_pattern): New pattern.

gcc/testsuite/ChangeLog:

        * gcc.target/riscv/rvv/autovec/pr120378.c: New test.

Signed-off-by: Edwin Lu <e...@rivosinc.com>
---
 gcc/config/riscv/autovec.md                   | 73 +++++++++++++++++
 gcc/internal-fn.def                           |  2 +
 gcc/match.pd                                  | 24 ++++++
 gcc/optabs.def                                |  3 +
 gcc/rtl.def                                   |  6 ++
 gcc/simplify-rtx.cc                           | 16 ++++
 .../gcc.target/riscv/rvv/autovec/pr120378.c   | 20 +++++
 gcc/tree-vect-patterns.cc                     | 82 +++++++++++++++++++
 8 files changed, 226 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/pr120378.c

diff --git a/gcc/config/riscv/autovec.md b/gcc/config/riscv/autovec.md
index 1fff8ac2fc4..95394f4dd15 100644
--- a/gcc/config/riscv/autovec.md
+++ b/gcc/config/riscv/autovec.md
@@ -3029,3 +3029,76 @@ (define_expand "uabd<mode>3"

     DONE;
   });
+
+; ========
+; == Narrow clip
+; ========
+
+(define_expand "sclip<mode><v_oct_trunc>2"
+  [(match_operand:<V_OCT_TRUNC> 0 "register_operand")
+   (match_operand:VOEXTI 1 "register_operand")]
+  "TARGET_VECTOR && 0"
+  {
+    gcc_assert(0);
+  });
+
+(define_expand "uclip<mode><v_oct_trunc>2"
+  [(match_operand:<V_OCT_TRUNC> 0 "register_operand")
+   (match_operand:VOEXTI 1 "register_operand") ]
+  "TARGET_VECTOR"
+  {
+    rtx max = gen_reg_rtx (<MODE>mode);
+    insn_code icode = code_for_pred (SMAX, <MODE>mode);
+    rtx ops1[] = {max, operands[1], CONST0_RTX (<MODE>mode)};
+    riscv_vector::emit_vlmax_insn (icode, riscv_vector::BINARY_OP, ops1);
+
+    riscv_vector::expand_vec_oct_ustrunc (operands[0], max, <MODE>mode,
+                                         <V_DOUBLE_TRUNC>mode,
+                                         <V_QUAD_TRUNC>mode);
+    DONE;
+  });
+
+(define_expand "sclip<mode><v_quad_trunc>2"
+  [(match_operand:<V_QUAD_TRUNC> 0 "register_operand")
+   (match_operand:VQEXTI 1 "register_operand")]
+  "TARGET_VECTOR && 0"
+  {
+    gcc_assert(0);
+  });
+
+(define_expand "uclip<mode><v_quad_trunc>2"
+  [(match_operand:<V_QUAD_TRUNC> 0 "register_operand")
+   (match_operand:VQEXTI 1 "register_operand") ]
+  "TARGET_VECTOR"
+  {
+    rtx max = gen_reg_rtx (<MODE>mode);
+    insn_code icode = code_for_pred (SMAX, <MODE>mode);
+    rtx ops1[] = {max, operands[1], CONST0_RTX (<MODE>mode)};
+    riscv_vector::emit_vlmax_insn (icode, riscv_vector::BINARY_OP, ops1);
+
+    riscv_vector::expand_vec_quad_ustrunc (operands[0], max, <MODE>mode,
+                                          <V_DOUBLE_TRUNC>mode);
+    DONE;
+  });
+
+(define_expand "sclip<mode><v_double_trunc>2"
+  [(match_operand:<V_DOUBLE_TRUNC> 0 "register_operand")
+   (match_operand:VWEXTI 1 "register_operand")]
+  "TARGET_VECTOR && 0"
+  {
+    gcc_assert(0);
+  });
+
+(define_expand "uclip<mode><v_double_trunc>2"
+  [(match_operand:<V_DOUBLE_TRUNC> 0 "register_operand")
+   (match_operand:VWEXTI 1 "register_operand") ]
+  "TARGET_VECTOR"
+  {
+    rtx max = gen_reg_rtx (<MODE>mode);
+    insn_code icode = code_for_pred (SMAX, <MODE>mode);
+    rtx ops1[] = {max, operands[1], CONST0_RTX (<MODE>mode)};
+    riscv_vector::emit_vlmax_insn (icode, riscv_vector::BINARY_OP, ops1);
+
+    riscv_vector::expand_vec_double_ustrunc (operands[0], max, <MODE>mode);
+    DONE;
+  });
diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def
index d2480a1bf79..85f44a53729 100644
--- a/gcc/internal-fn.def
+++ b/gcc/internal-fn.def
@@ -286,6 +286,8 @@ DEF_INTERNAL_SIGNED_OPTAB_FN (SAT_MUL, ECF_CONST, first, 
ssmul, usmul, binary)

 DEF_INTERNAL_SIGNED_OPTAB_FN (SAT_TRUNC, ECF_CONST, first, sstrunc, ustrunc, 
unary_convert)

+DEF_INTERNAL_SIGNED_OPTAB_FN (NARROW_CLIP, ECF_CONST, first, sclip, uclip, 
unary_convert)
+
 DEF_INTERNAL_COND_FN (ADD, ECF_CONST, add, binary)
 DEF_INTERNAL_COND_FN (SUB, ECF_CONST, sub, binary)
 DEF_INTERNAL_COND_FN (MUL, ECF_CONST, smul, binary)
diff --git a/gcc/match.pd b/gcc/match.pd
index 4903552c82a..73013bc1e29 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -3360,6 +3360,30 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
     }
     (if (wi::eq_p (sum, wi::uhwi (0, precision))))))))

+/* Narrow clip for unsigned integer.  */
+(if (INTEGRAL_TYPE_P (type) && TYPE_UNSIGNED (type))
+  (match (unsigned_integer_narrow_clip @0)
+  /* NARROW_CLIP = X & ~((NT)-1) ? (-X) >> 31 : X
+
+     The gimple representation uses X > (NT)(-1) instead of
+     using & so match on gt instead of bit_and.  */
+  (convert (cond^ (gt (nop_convert? @0) INTEGER_CST@1)
+        (rshift:s (nop_convert? (negate (nop_convert? @0))) INTEGER_CST@2)
+        @0))
+  (if (! TYPE_UNSIGNED (TREE_TYPE (@0)))
+   (with
+    {
+     unsigned itype_precision = TYPE_PRECISION (TREE_TYPE (@0));
+     unsigned otype_precision = TYPE_PRECISION (type);
+     wide_int trunc_max = wi::mask (otype_precision, false, itype_precision);
+     wide_int int_cst_1 = wi::to_wide (@1, itype_precision);
+     wide_int int_cst_2 = wi::to_wide (@2, itype_precision);
+     wide_int shift_amount = wi::uhwi ((HOST_WIDE_INT_1U << 5) - 1,
+                                 itype_precision); // Aka 31
+    }
+    (if (otype_precision < itype_precision && wi::eq_p (trunc_max,
+    int_cst_1) && wi::eq_p(int_cst_2, shift_amount)))))))
+
 /* Saturation truncate for unsigned integer.  */
 (if (INTEGRAL_TYPE_P (type) && TYPE_UNSIGNED (type))
  (match (unsigned_integer_sat_trunc @0)
diff --git a/gcc/optabs.def b/gcc/optabs.def
index 87a8b85da15..b56e9e75a75 100644
--- a/gcc/optabs.def
+++ b/gcc/optabs.def
@@ -70,6 +70,9 @@ OPTAB_CL(satfractuns_optab, "satfractuns$I$b$Q$a2", 
UNSIGNED_SAT_FRACT, "satfrac
 OPTAB_CL(ustrunc_optab, "ustrunc$b$a2", US_TRUNCATE, "ustrunc", NULL)
 OPTAB_CL(sstrunc_optab, "sstrunc$b$a2", SS_TRUNCATE, "sstrunc", NULL)

+OPTAB_CL(uclip_optab, "uclip$b$a2", U_NARROW_CLIP, "uclip", NULL)
+OPTAB_CL(sclip_optab, "sclip$b$a2", S_NARROW_CLIP, "sclip", NULL)
+
 OPTAB_CD(sfixtrunc_optab, "fix_trunc$F$b$I$a2")
 OPTAB_CD(ufixtrunc_optab, "fixuns_trunc$F$b$I$a2")

diff --git a/gcc/rtl.def b/gcc/rtl.def
index 15ae7d10fcc..f3387aa8ea7 100644
--- a/gcc/rtl.def
+++ b/gcc/rtl.def
@@ -753,6 +753,12 @@ DEF_RTL_EXPR(SS_TRUNCATE, "ss_truncate", "e", RTX_UNARY)
 /* Unsigned saturating truncate.  */
 DEF_RTL_EXPR(US_TRUNCATE, "us_truncate", "e", RTX_UNARY)

+/* Signed narrowing clip.  */
+DEF_RTL_EXPR(S_NARROW_CLIP, "s_narrow_clip", "e", RTX_UNARY)
+
+/* Unsigned narrowing clip.  */
+DEF_RTL_EXPR(U_NARROW_CLIP, "u_narrow_clip", "e", RTX_UNARY)
+
 /* Floating point multiply/add combined instruction.  */
 DEF_RTL_EXPR(FMA, "fma", "eee", RTX_TERNARY)

diff --git a/gcc/simplify-rtx.cc b/gcc/simplify-rtx.cc
index cbe61b49bf6..a195ec502c5 100644
--- a/gcc/simplify-rtx.cc
+++ b/gcc/simplify-rtx.cc
@@ -2179,6 +2179,22 @@ simplify_const_unary_operation (enum rtx_code code, 
machine_mode mode,
            result = wide_int::from (result, width, sgn);
            break;
          }
+
+       case U_NARROW_CLIP:
+       case S_NARROW_CLIP:
+         {
+           signop sgn = code == U_NARROW_CLIP ? UNSIGNED : SIGNED;
+           wide_int nmax
+             = wide_int::from (wi::max_value (width, sgn),
+                               GET_MODE_PRECISION (imode), sgn);
+           wide_int nmin
+             = wide_int::from (wi::min_value (width, sgn),
+                               GET_MODE_PRECISION (imode), sgn);
+           result = wi::min (wi::max (op0, nmin, sgn), nmax, sgn);
+           result = wide_int::from (result, width, sgn);
+           break;
+         }
+
        case SIGN_EXTEND:
          result = wide_int::from (op0, width, SIGNED);
          break;
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr120378.c 
b/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr120378.c
new file mode 100644
index 00000000000..4cfedde99ee
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr120378.c
@@ -0,0 +1,20 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv -mabi=lp64d -O3 -fdump-tree-optimized" } */
+
+#include <stdint.h>
+
+inline uint8_t
+clip_uint8 (int x)
+{
+  return x & (~255) ? (-x) >> 31 : x;
+}
+
+void __attribute__ ((noipa))
+clip_loop (uint8_t *res, int *x, int w)
+{
+  for (int i = 0; i < w; i++)
+    res[i] = clip_uint8 (x[i]);
+}
+
+/* { dg-final { scan-tree-dump-times ".NARROW_CLIP " 1 "optimized" } } */
+/* { dg-final { scan-assembler-times {vnclipu\.wi} 2 } } */
diff --git a/gcc/tree-vect-patterns.cc b/gcc/tree-vect-patterns.cc
index 0f6d6b77ea1..31629a31b93 100644
--- a/gcc/tree-vect-patterns.cc
+++ b/gcc/tree-vect-patterns.cc
@@ -3675,6 +3675,87 @@ vect_recog_cast_forwprop_pattern (vec_info *vinfo,
   return pattern_stmt;
 }

+extern bool gimple_unsigned_integer_narrow_clip (tree, tree*, tree (*)(tree));
+extern bool gimple_signed_integer_narrow_clip (tree, tree*, tree (*)(tree));
+
+/* Function vect_recog_narrow_clip_pattern
+
+   Try to find the following narrow clip pattern:
+
+     type x_t;
+     TYPE x_T, clip = init;
+   loop:
+     clip_0 = phi <init, clip_1>
+     S1  x_t = *p;
+     S2  temp_t = type_u x_t;
+     S3  neg_x_t = -temp_t;
+     S4  neg_signed_x_t = (type) neg_x_t;
+     S5  x_shifted = neg_signed_x_t >> 31;
+     S6  is_greater = x_t > 255;
+     S7  cond = is_greater ? x_shifted : x_t;
+     S8  clip_1 = (TYPE) cond;
+
+   where 'TYPE' is at most half the size of type 'type'.
+
+   Input:
+
+   * STMT_VINFO: The stmt from which the pattern search begins.  In the
+   example, when this function is called with S8, the pattern
+   {S3,S4,S5,S6,S7,S8} will be detected.
+
+   Output:
+
+   * TYPE_OUT: The type of the output of this pattern.
+
+   * Return value: A new stmt that will be used to replace the sequence of
+   stmts that constitute the pattern. In this case it will be:
+        clip_1 = .NARROW_CLIP (x_t)
+  */
+
+static gimple *
+vect_recog_narrow_clip_pattern (vec_info *vinfo,
+                       stmt_vec_info stmt_vinfo, tree *type_out)
+{
+
+  gimple *last_stmt = STMT_VINFO_STMT (stmt_vinfo);
+
+  if (!is_gimple_assign (last_stmt))
+    return NULL;
+
+  tree ops[1];
+  tree lhs = gimple_assign_lhs (last_stmt);
+  tree otype = TREE_TYPE (lhs);
+
+  if ((gimple_unsigned_integer_narrow_clip (lhs, ops, NULL))
+       // || gimple_signed_integer_narrow_clip (lhs, ops, NULL))
+      && type_has_mode_precision_p (otype))
+    {
+      tree itype = TREE_TYPE (ops[0]);
+      tree v_itype = get_vectype_for_scalar_type (vinfo, itype);
+      tree v_otype = get_vectype_for_scalar_type (vinfo, otype);
+      internal_fn fn = IFN_NARROW_CLIP;
+
+      if (v_itype != NULL_TREE && v_otype != NULL_TREE
+       && direct_internal_fn_supported_p (fn, tree_pair (v_otype, v_itype),
+                                          OPTIMIZE_FOR_BOTH))
+       {
+         gcall *call = gimple_build_call_internal (fn, 1, ops[0]);
+         tree out_ssa = vect_recog_temp_ssa_var (otype, NULL);
+
+         gimple_call_set_lhs (call, out_ssa);
+         gimple_call_set_nothrow (call, /* nothrow_p */ false);
+         gimple_set_location (call, gimple_location (last_stmt));
+
+         *type_out = v_otype;
+         vect_pattern_detected ("vect_recog_narrow_clip_pattern", 
stmt_vinfo->stmt);
+
+         return call;
+       }
+    }
+
+  return NULL;
+}
+
 /* Try to detect a shift left of a widened input, converting LSHIFT_EXPR
    to WIDEN_LSHIFT_EXPR.  See vect_recog_widen_op_pattern for details.  */

@@ -6917,6 +6998,7 @@ static vect_recog_func vect_vect_recog_func_ptrs[] = {
   { vect_recog_sat_add_pattern, "sat_add" },
   { vect_recog_sat_sub_pattern, "sat_sub" },
   { vect_recog_sat_trunc_pattern, "sat_trunc" },
+  { vect_recog_narrow_clip_pattern, "narrow_clip" },
   { vect_recog_gcond_pattern, "gcond" },
   { vect_recog_bool_pattern, "bool" },
   /* This must come before mask conversion, and includes the parts
--
2.43.0

Reply via email to