Hi Richard and Tamar, I gave DEF_INTERNAL_SIGNED_OPTAB_FN a try for SAT_ADD/SUB/MUL but ran into some problems in match.pd.
For unsigned SAT_ADD = (x + y) | - ((x + y) < x), the match.pd pattern can be (bit_ior:c (plus:c@2 @0 @1) (negate (convert (lt @2 @0)))). For unsigned SAT_SUB = x >= y ? x - y : 0, the match.pd pattern can be (cond (ge @0 @1) (minus @0 @1) integer_zerop). For signed SAT_ADD/SAT_SUB as below, it does not seem easy to make the simplify pattern work as well as expected. sint64_t sat_add (sint64_t x, sint64_t y) { sint64_t a = x ^ y; sint64_t add = x + y; sint64_t b = add ^ x; return (a < 0 || (a >= 0 && b >= 0)) ? add : (MAX_INT64 + (x < 0)); } sint64_t sat_sub (sint64_t x, sint64_t y) { sint64_t a = x ^ y; sint64_t sub = x - y; sint64_t b = sub ^ x; return (a >= 0 || (a < 0 && b >= 0)) ? sub : (MAX_INT64 + (x < 0)); } For SAT_MUL as below, it looks like we may need a widened type. I am not sure if we can leverage MUL_OVERFLOW or not in match.pd. uint32_t sat_mul (uint32_t x, uint32_t y) { uint64_t mul = (uint64_t)x * (uint64_t)y; return mul > UINT32_MAX ? UINT32_MAX : (uint32_t)mul; } sint32_t sat_mul (sint32_t x, sint32_t y) { sint64_t mul = (sint64_t)x * (sint64_t)y; return mul <= MAX_INT32 && mul >= MIN_INT32 ? mul : MAX_INT32 + ((x ^ y) < 0); } The diff below only contains unsigned SAT_ADD and SAT_SUB for prototype validation. I will continue to try the remaining parts in match.pd and keep you posted. 
------------------------------------------------------------------------------------------------------------------------------------------------------------------------- diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h index 80efdf2b7e5..d9ad6fe2b58 100644 --- a/gcc/config/riscv/riscv-protos.h +++ b/gcc/config/riscv/riscv-protos.h @@ -132,6 +132,9 @@ extern void riscv_asm_output_external (FILE *, const tree, const char *); extern bool riscv_zcmp_valid_stack_adj_bytes_p (HOST_WIDE_INT, int); extern void riscv_legitimize_poly_move (machine_mode, rtx, rtx, rtx); +extern void riscv_expand_usadd (rtx, rtx, rtx); +extern void riscv_expand_ussub (rtx, rtx, rtx); #ifdef RTX_CODE extern void riscv_expand_int_scc (rtx, enum rtx_code, rtx, rtx, bool *invert_ptr = 0); diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc index 5e984ee2a55..795462526df 100644 --- a/gcc/config/riscv/riscv.cc +++ b/gcc/config/riscv/riscv.cc @@ -10655,6 +10655,28 @@ riscv_vector_mode_supported_any_target_p (machine_mode) return true; } +/* Emit insn for the saturation addu, aka (x + y) | - ((x + y) < x). */ +void +riscv_expand_usadd (rtx dest, rtx x, rtx y) +{ + fprintf (stdout, "Hit riscv_expand_usadd.\n"); + // ToDo +} + +void +riscv_expand_ussub (rtx dest, rtx x, rtx y) +{ + fprintf (stdout, "Hit riscv_expand_ussub.\n"); + // ToDo +} + /* Initialize the GCC target structure. 
*/ #undef TARGET_ASM_ALIGNED_HI_OP #define TARGET_ASM_ALIGNED_HI_OP "\t.half\t" diff --git a/gcc/config/riscv/riscv.md b/gcc/config/riscv/riscv.md index 1fec13092e2..e2dbadb3ead 100644 --- a/gcc/config/riscv/riscv.md +++ b/gcc/config/riscv/riscv.md @@ -3839,6 +3839,39 @@ (define_insn "*large_load_address" [(set_attr "type" "load") (set (attr "length") (const_int 8))]) +(define_expand "usadd<mode>3" + [(match_operand:ANYI 0 "register_operand") + (match_operand:ANYI 1 "register_operand") + (match_operand:ANYI 2 "register_operand")] + "" + { + riscv_expand_usadd (operands[0], operands[1], operands[2]); + DONE; + } +) + +(define_expand "ussub<mode>3" + [(match_operand:ANYI 0 "register_operand") + (match_operand:ANYI 1 "register_operand") + (match_operand:ANYI 2 "register_operand")] + "" + { + riscv_expand_ussub (operands[0], operands[1], operands[2]); + DONE; + } +) + (include "bitmanip.md") (include "crypto.md") (include "sync.md") diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def index 848bb9dbff3..0fff19c875f 100644 --- a/gcc/internal-fn.def +++ b/gcc/internal-fn.def @@ -275,6 +275,13 @@ DEF_INTERNAL_SIGNED_OPTAB_FN (MULHS, ECF_CONST | ECF_NOTHROW, first, DEF_INTERNAL_SIGNED_OPTAB_FN (MULHRS, ECF_CONST | ECF_NOTHROW, first, smulhrs, umulhrs, binary) +DEF_INTERNAL_SIGNED_OPTAB_FN (SAT_ADD, ECF_CONST | ECF_NOTHROW, first, + ssadd, usadd, binary) +DEF_INTERNAL_SIGNED_OPTAB_FN (SAT_SUB, ECF_CONST | ECF_NOTHROW, first, + sssub, ussub, binary) + DEF_INTERNAL_COND_FN (ADD, ECF_CONST, add, binary) DEF_INTERNAL_COND_FN (SUB, ECF_CONST, sub, binary) DEF_INTERNAL_COND_FN (MUL, ECF_CONST, smul, binary) diff --git a/gcc/match.pd b/gcc/match.pd index f3fffd8dec2..6592dea643a 100644 --- a/gcc/match.pd +++ b/gcc/match.pd @@ -10276,3 +10276,32 @@ and, } (if (full_perm_p) (vec_perm (op@3 @0 @1) @3 @2)))))) + +#if GIMPLE + +/* Unsigned saturation add, aka: + SAT_ADDU = (X + Y) | - ((X + Y) < X) or + SAT_ADDU = (X + Y) | - ((X + Y) < Y). 
*/ +(simplify + (bit_ior:c (plus:c@2 @0 @1) (negate (convert (lt @2 @0)))) + (if (optimize + && INTEGRAL_TYPE_P (type) + && types_match (type, TREE_TYPE (@0)) + && types_match (type, TREE_TYPE (@1)) + && TYPE_UNSIGNED (TREE_TYPE (@0)) + && direct_internal_fn_supported_p (IFN_SAT_ADD, type, OPTIMIZE_FOR_BOTH)) + (IFN_SAT_ADD @0 @1))) + +/* Unsigned saturation sub , aka + SAT_SUBU = x >= y ? x - y : 0. */ +(simplify + (cond (ge @0 @1) (minus @0 @1) integer_zerop) + (if (optimize + && INTEGRAL_TYPE_P (type) + && TYPE_UNSIGNED (TREE_TYPE (@0)) + && types_match (type, TREE_TYPE (@0)) + && types_match (type, TREE_TYPE (@1)) + && direct_internal_fn_supported_p (IFN_SAT_SUB, type, OPTIMIZE_FOR_BOTH)) + (IFN_SAT_SUB @0 @1))) + +#endif diff --git a/gcc/optabs.def b/gcc/optabs.def index ad14f9328b9..bebe38c888b 100644 --- a/gcc/optabs.def +++ b/gcc/optabs.def @@ -111,15 +111,15 @@ OPTAB_NX(add_optab, "add$F$a3") OPTAB_NX(add_optab, "add$Q$a3") OPTAB_VL(addv_optab, "addv$I$a3", PLUS, "add", '3', gen_intv_fp_libfunc) OPTAB_VX(addv_optab, "add$F$a3") -OPTAB_NL(ssadd_optab, "ssadd$Q$a3", SS_PLUS, "ssadd", '3', gen_signed_fixed_libfunc) -OPTAB_NL(usadd_optab, "usadd$Q$a3", US_PLUS, "usadd", '3', gen_unsigned_fixed_libfunc) +OPTAB_NL(ssadd_optab, "ssadd$a3", SS_PLUS, "ssadd", '3', gen_int_libfunc) +OPTAB_NL(usadd_optab, "usadd$a3", US_PLUS, "usadd", '3', gen_int_libfunc) OPTAB_NL(sub_optab, "sub$P$a3", MINUS, "sub", '3', gen_int_fp_fixed_libfunc) OPTAB_NX(sub_optab, "sub$F$a3") OPTAB_NX(sub_optab, "sub$Q$a3") OPTAB_VL(subv_optab, "subv$I$a3", MINUS, "sub", '3', gen_intv_fp_libfunc) OPTAB_VX(subv_optab, "sub$F$a3") -OPTAB_NL(sssub_optab, "sssub$Q$a3", SS_MINUS, "sssub", '3', gen_signed_fixed_libfunc) -OPTAB_NL(ussub_optab, "ussub$Q$a3", US_MINUS, "ussub", '3', gen_unsigned_fixed_libfunc) +OPTAB_NL(sssub_optab, "sssub$a3", SS_MINUS, "sssub", '3', gen_int_libfunc) +OPTAB_NL(ussub_optab, "ussub$a3", US_MINUS, "ussub", '3', gen_int_libfunc) OPTAB_NL(smul_optab, "mul$Q$a3", MULT, "mul", 
'3', gen_int_fp_fixed_libfunc) OPTAB_NX(smul_optab, "mul$P$a3") OPTAB_NX(smul_optab, "mul$F$a3") Pan -----Original Message----- From: Li, Pan2 <pan2...@intel.com> Sent: Tuesday, February 27, 2024 10:36 PM To: Richard Biener <richard.guent...@gmail.com>; Tamar Christina <tamar.christ...@arm.com> Cc: gcc-patches@gcc.gnu.org; juzhe.zh...@rivai.ai; Wang, Yanzhang <yanzhang.w...@intel.com>; kito.ch...@gmail.com; richard.sandiford@arm.com2; jeffreya...@gmail.com Subject: RE: [PATCH v2] Draft|Internal-fn: Introduce internal fn saturation US_PLUS Thanks Richard and Tamar for moving this forward. > That said, I would like to see the bigger picture to be kept in mind > before altering the GIMPLE IL. > Adding an internal function for an already present optab is a > no-brainer. Adding a vectorizer > and/or if-conversion pattern to make use of this during vectorization > is existing practice. > Adding pattern recognition to ISEL or widening-mul passes for > instructions the CPU can do > is existing practice and OK. Thanks for explaining, got the point here. > So I'd suggest writing some example of both signed and unsigned saturating > add and multiply > Because signed addition, will likely require a branch and signed > multiplication would require a > larger type. Ack, will prepare one prototype validation patch for add, sub and mul (both unsigned and signed) soon. Pan -----Original Message----- From: Richard Biener <richard.guent...@gmail.com> Sent: Tuesday, February 27, 2024 9:42 PM To: Tamar Christina <tamar.christ...@arm.com> Cc: Li, Pan2 <pan2...@intel.com>; gcc-patches@gcc.gnu.org; juzhe.zh...@rivai.ai; Wang, Yanzhang <yanzhang.w...@intel.com>; kito.ch...@gmail.com; richard.sandiford@arm.com2; jeffreya...@gmail.com Subject: Re: [PATCH v2] Draft|Internal-fn: Introduce internal fn saturation US_PLUS On Tue, Feb 27, 2024 at 1:57 PM Tamar Christina <tamar.christ...@arm.com> wrote: > > > Thanks Tamar. 
> > > > > Those two cases also *completely* stop vectorization because of either the > > > control flow or the fact the vectorizer can't handle complex types. > > > > Yes, we eventually would like to vectorize the SAT ALU but we start with > > scalar part > > first. > > I tried the DEF_INTERNAL_SIGNED_OPTAB_EXT_FN as your suggestion. It works > > well with some additions as below. > > Feel free to correct me if any misunderstandings. > > > > 1. usadd$Q$a3 are restricted to fixed point and we need to change it to > > usadd$a3(as well as gen_int_libfunc) for int. > > 2. We need to implement a default implementation of SAT_ADD if > > direct_binary_optab_supported_p is false. > > It looks like the default implementation is difficult to make every > > backend happy. > > That is why you suggest just normal > > DEF_INTERNAL_SIGNED_OPTAB_FN in another thread. > > > > Thanks Richard. > > > > > But what I'd like to see is that we do more instruction selection on > > > GIMPLE > > > but _late_ (there's the pass_optimize_widening_mul and pass_gimple_isel > > > passes doing what I'd call instruction selection). But that means not > > > adding > > > match.pd patterns for that or at least have a separate isel-match.pd > > > machinery for that. > > > > > So as a start I would go for a direct optab and see to recognize it during > > > ISEL? > > > > Looks we have sorts of SAT alu like PLUS/MINUS/MULT/DIV/SHIFT/NEG/ABS, good > > to know isel and I am happy to > > try that once we have conclusion. > > > > So after a lively discussion on IRC, the conclusion is that before we proceed > Richi would > like to see some examples of various operations. The problem is that > unsigned saturating > addition is the simplest example and it may lead to an implementation > strategy that doesn't > scale. 
> > So I'd suggest writing some example of both signed and unsigned saturating > add and multiply > > Because signed addition, will likely require a branch and signed > multiplication would require a > larger type. > > This would allow us to better understand what kind of gimple would have to to > deal with in > ISEL and VECT if we decide not to lower early. More specifically before making something like .SAT_ADD a core part of GIMPLE I'd like to point out that we have saturating PLUS_EXPR but just for fixed-point types. I realize Joseph thinks that keying this on the type was wrong and it should have used integer types and special saturating operations. Still having both, type-keyed saturating PLUS_EXPR and "code"-keyed .SAT_ADD (on integer types only?) looks like a mess. It might be that the way to go is to turn all existing saturating type *_EXPR into .SAT_* internal function calls, in the end mapping to the optabs and eventual RTX codes. That could work for both integer types and fixed-point types. I'll also note that "saturating" is just another variant of overflow behavior of which we have trapping (-ftrapv), wrapping (-fwrapv), signed-undefined (default) and also (kind-of) sanitized. We do lack direct IL representation of -ftrapv and -fwrapv, the semantics on a PLUS_EXPR depend on per-function flags. Eventually a common representation could be found here. For saturating I was thinking of .ADD_OVERFLOW (a, b, saturation-value), a "trap" could be a "trapping" saturation value, "undefined" could be a "not-a-thing". But I didn't think much about this. That said, I would like to see the bigger picture to be kept in mind before altering the GIMPLE IL. Adding an internal function for an already present optab is a no-brainer. Adding a vectorizer and/or if-conversion pattern to make use of this during vectorization is existing practice. Adding pattern recognition to ISEL or widening-mul passes for instructions the CPU can do is existing practice and OK. 
Thanks, Richard. > Thanks, > Tamar > > > Pan > > > > -----Original Message----- > > From: Tamar Christina <tamar.christ...@arm.com> > > Sent: Tuesday, February 27, 2024 5:57 PM > > To: Richard Biener <richard.guent...@gmail.com> > > Cc: Li, Pan2 <pan2...@intel.com>; gcc-patches@gcc.gnu.org; > > juzhe.zh...@rivai.ai; > > Wang, Yanzhang <yanzhang.w...@intel.com>; kito.ch...@gmail.com; > > richard.sandiford@arm.com2; jeffreya...@gmail.com > > Subject: RE: [PATCH v2] Draft|Internal-fn: Introduce internal fn saturation > > US_PLUS > > > > > -----Original Message----- > > > From: Richard Biener <richard.guent...@gmail.com> > > > Sent: Tuesday, February 27, 2024 9:44 AM > > > To: Tamar Christina <tamar.christ...@arm.com> > > > Cc: pan2...@intel.com; gcc-patches@gcc.gnu.org; juzhe.zh...@rivai.ai; > > > yanzhang.w...@intel.com; kito.ch...@gmail.com; > > > richard.sandiford@arm.com2; jeffreya...@gmail.com > > > Subject: Re: [PATCH v2] Draft|Internal-fn: Introduce internal fn > > > saturation > > > US_PLUS > > > > > > On Sun, Feb 25, 2024 at 10:01 AM Tamar Christina > > > <tamar.christ...@arm.com> wrote: > > > > > > > > Hi Pan, > > > > > > > > > From: Pan Li <pan2...@intel.com> > > > > > > > > > > Hi Richard & Tamar, > > > > > > > > > > Try the DEF_INTERNAL_INT_EXT_FN as your suggestion. By mapping > > > > > us_plus$a3 to the RTL representation (us_plus:m x y) in optabs.def. > > > > > And then expand_US_PLUS in internal-fn.cc. Not very sure if my > > > > > understanding is correct for DEF_INTERNAL_INT_EXT_FN. > > > > > > > > > > I am not sure if we still need DEF_INTERNAL_SIGNED_OPTAB_FN here, > > > > > given > > > > > the RTL representation has (ss_plus:m x y) and (us_plus:m x y) > > > > > already. > > > > > > > > > > > > > I think a couple of things are being confused here. So lets break it > > > > down: > > > > > > > > The reason for DEF_INTERNAL_SIGNED_OPTAB_FN is because in GIMPLE > > > > we only want one internal function for both signed and unsigned SAT_ADD. 
> > > > with this definition we don't need SAT_UADD and SAT_SADD but instead > > > > we will only have SAT_ADD, which will expand to us_plus or ss_plus. > > > > > > > > Now the downside of this is that this is a direct internal optab. This > > > > means > > > > that for the representation to be used the target *must* have the optab > > > > implemented. This is a bit annoying because it doesn't allow us to > > > > generically > > > > assume that all targets use SAT_ADD for saturating add and thus only > > > > have to > > > > write optimization for this representation. > > > > > > > > This is why Richi said we may need to use a new tree_code because we can > > > > override tree code expansions. However the same can be done with the > > _EXT_FN > > > > internal functions. > > > > > > > > So what I meant was that we want to have a combination of the two. i.e. > > > > a > > > > DEF_INTERNAL_SIGNED_OPTAB_EXT_FN. > > > > > > Whether we want/need _EXT or only direct depends mainly on how we want to > > > leverage support. If it's only during vectorization and possibly > > > instruction > > > selection a direct optab is IMO the way to go. Generic optimization only > > > marginally improves when you explode the number of basic operations you > > > expose - in fact it gets quite unwieldly to support all of them in > > > simplifications > > > and/or canonicalization and you possibly need to translate them back to > > > what > > > the target CPU supports. > > > > > > We already do have too many (IMO) "special" operations exposed "early" > > > in the GIMPLE pipeline. > > > > > > But what I'd like to see is that we do more instruction selection on > > > GIMPLE > > > but _late_ (there's the pass_optimize_widening_mul and pass_gimple_isel > > > passes doing what I'd call instruction selection). But that means not > > > adding > > > match.pd patterns for that or at least have a separate isel-match.pd > > > machinery for that. 
> > > > > > So as a start I would go for a direct optab and see to recognize it during > > > ISEL? > > > > > > > The problem with ISEL and the reason I suggested an indirect IFN is that > > there > > Are benefit to be had from recognizing it early. Saturating arithmetic can > > be > > optimized > > Differently from non-saturating ones. > > > > But additionally a common way of specifying them decomposes to branches > > and/or using COMPLEX_EXPR (see the various PRs on saturating arithmetic). > > > > These two representation can be detected in PHI-opts and it's beneficial to > > all > > targets to canonicalize them to the branchless code. > > > > Those two cases also *completely* stop vectorization because of either the > > control flow or the fact the vectorizer can't handle complex types. > > > > So really, gimple ISEL would fix just 1 of the 3 very common cases, and then > > We'd still need to hack the vectorizer cost models for targets with > > saturating > > vector instructions. > > > > I of course defer to you, but it seems quite suboptimal to do it this way > > and > > doesn't get us first class saturation support. > > > > Additionally there have been discussions whether both clang and gcc should > > provide __builtin_saturate_* methods, which the non-direct IFN would help > > support. > > > > Tamar. > > > > > > If Richi agrees, the below is what I meant. It creates the > > > > infrastructure for this > > > > and for now only allows a default fallback for unsigned saturating add > > > > and > > makes > > > > it easier for us to add the rest later > > > > > > > > Also, unless I'm wrong (and Richi can correct me here), us_plus and > > > > ss_plus are > > > the > > > > RTL expression, but the optab for saturation are ssadd and usadd. So > > > > you don't > > > > need to make new us_plus and ss_plus ones. 
> > > > > > > > diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc > > > > index a07f25f3aee..aaf9f8991b3 100644 > > > > --- a/gcc/internal-fn.cc > > > > +++ b/gcc/internal-fn.cc > > > > @@ -4103,6 +4103,17 @@ direct_internal_fn_supported_p (internal_fn fn, > > > tree_pair types, > > > > return direct_##TYPE##_optab_supported_p (which_optab, types, > > > > \ > > > > opt_type); > > > > \ > > > > } > > > > +#define DEF_INTERNAL_SIGNED_OPTAB_EXT_FN(CODE, FLAGS, SELECTOR, > > > SIGNED_OPTAB, \ > > > > + UNSIGNED_OPTAB, TYPE) > > > > \ > > > > + case IFN_##CODE: > > > > \ > > > > + { > > > > \ > > > > + optab which_optab = (TYPE_UNSIGNED (types.SELECTOR) > > > > \ > > > > + ? UNSIGNED_OPTAB ## _optab > > > > \ > > > > + : SIGNED_OPTAB ## _optab); > > > > \ > > > > + return direct_##TYPE##_optab_supported_p (which_optab, types, > > > > \ > > > > + opt_type) > > > > \ > > > > + || internal_##CODE##_fn_supported_p (types.SELECTOR, > > > > opt_type); \ > > > > + } > > > > #include "internal-fn.def" > > > > > > > > case IFN_LAST: > > > > @@ -4303,6 +4314,8 @@ set_edom_supported_p (void) > > > > optab which_optab = direct_internal_fn_optab (fn, types); > > > > \ > > > > expand_##TYPE##_optab_fn (fn, stmt, which_optab); > > > > \ > > > > } > > > > +#define DEF_INTERNAL_SIGNED_OPTAB_EXT_FN(CODE, FLAGS, SELECTOR, > > > SIGNED_OPTAB, \ > > > > + UNSIGNED_OPTAB, TYPE) > > > > #include "internal-fn.def" > > > > > > > > /* Routines to expand each internal function, indexed by function > > > > number. > > > > @@ -5177,3 +5190,45 @@ expand_POPCOUNT (internal_fn fn, gcall *stmt) > > > > emit_move_insn (plhs, cmp); > > > > } > > > > } > > > > + > > > > +void > > > > +expand_SAT_ADD (internal_fn fn, gcall *stmt) > > > > +{ > > > > + /* Check if the target supports the expansion through an IFN. 
*/ > > > > + tree_pair types = direct_internal_fn_types (fn, stmt); > > > > + optab which_optab = direct_internal_fn_optab (fn, types); > > > > + if (direct_binary_optab_supported_p (which_optab, types, > > > > + insn_optimization_type ())) > > > > + { > > > > + expand_binary_optab_fn (fn, stmt, which_optab); > > > > + return; > > > > + } > > > > + > > > > + /* Target does not support the optab, but we can de-compose it. */ > > > > + /* > > > > + ... decompose to a canonical representation ... > > > > + if (TYPE_UNSIGNED (types.SELECTOR)) > > > > + { > > > > + ... > > > > + decompose back to (X + Y) | - ((X + Y) < X) > > > > + } > > > > + else > > > > + { > > > > + ... > > > > + } > > > > + */ > > > > +} > > > > + > > > > +bool internal_SAT_ADD_fn_supported_p (tree type, optimization_type /* > > > optype */) > > > > +{ > > > > + /* For now, don't support decomposing vector ops. */ > > > > + if (VECTOR_TYPE_P (type)) > > > > + return false; > > > > + > > > > + /* Signed saturating arithmetic is harder to do since we'll so for > > > > now > > > > + lets ignore. */ > > > > + if (!TYPE_UNSIGNED (type)) > > > > + return false; > > > > + > > > > + return TREE_CODE (type) == INTEGER_TYPE; > > > > +} > > > > \ No newline at end of file > > > > diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def > > > > index c14d30365c1..5a2491228d5 100644 > > > > --- a/gcc/internal-fn.def > > > > +++ b/gcc/internal-fn.def > > > > @@ -92,6 +92,10 @@ along with GCC; see the file COPYING3. If not see > > > > unsigned inputs respectively, both without the trailing "_optab". > > > > SELECTOR says which type in the tree_pair determines the signedness. > > > > > > > > + DEF_INTERNAL_SIGNED_OPTAB_EXT_FN is like > > > DEF_INTERNAL_SIGNED_OPTAB_FN, except > > > > + that it has expand_##NAME defined in internal-fn.cc to override the > > > > + DEF_INTERNAL_SIGNED_OPTAB_FN expansion behavior. 
> > > > + > > > > DEF_INTERNAL_FLT_FN is like DEF_INTERNAL_OPTAB_FN, but in addition, > > > > the function implements the computational part of a built-in math > > > > function BUILT_IN_<NAME>{F,,L}. Unlike some built-in functions, > > > > @@ -153,6 +157,13 @@ along with GCC; see the file COPYING3. If not see > > > > DEF_INTERNAL_FN (NAME, FLAGS | ECF_LEAF, NULL) > > > > #endif > > > > > > > > +#ifndef DEF_INTERNAL_SIGNED_OPTAB_EXT_FN > > > > +#define DEF_INTERNAL_SIGNED_OPTAB_EXT_FN(NAME, FLAGS, SELECTOR, > > > SIGNED_OPTAB, \ > > > > + UNSIGNED_OPTAB, TYPE) \ > > > > + DEF_INTERNAL_SIGNED_OPTAB_FN (NAME, FLAGS, SELECTOR, > > > SIGNED_OPTAB, \ > > > > + UNSIGNED_OPTAB, TYPE) > > > > +#endif > > > > + > > > > #ifndef DEF_INTERNAL_FLT_FN > > > > #define DEF_INTERNAL_FLT_FN(NAME, FLAGS, OPTAB, TYPE) \ > > > > DEF_INTERNAL_OPTAB_FN (NAME, FLAGS, OPTAB, TYPE) > > > > @@ -274,6 +285,8 @@ DEF_INTERNAL_SIGNED_OPTAB_FN (MULHS, > > > ECF_CONST | ECF_NOTHROW, first, > > > > smulhs, umulhs, binary) > > > > DEF_INTERNAL_SIGNED_OPTAB_FN (MULHRS, ECF_CONST | ECF_NOTHROW, > > > first, > > > > smulhrs, umulhrs, binary) > > > > +DEF_INTERNAL_SIGNED_OPTAB_EXT_FN (SAT_ADD, ECF_CONST | > > > ECF_NOTHROW, first, > > > > + ssadd, usadd, binary) > > > > > > > > DEF_INTERNAL_COND_FN (ADD, ECF_CONST, add, binary) > > > > DEF_INTERNAL_COND_FN (SUB, ECF_CONST, sub, binary) > > > > @@ -593,5 +606,6 @@ DEF_INTERNAL_FN (BITINTTOFLOAT, ECF_PURE | > > > ECF_LEAF, ". R . 
") > > > > #undef DEF_INTERNAL_FLT_FN > > > > #undef DEF_INTERNAL_FLT_FLOATN_FN > > > > #undef DEF_INTERNAL_SIGNED_OPTAB_FN > > > > +#undef DEF_INTERNAL_SIGNED_OPTAB_EXT_FN > > > > #undef DEF_INTERNAL_OPTAB_FN > > > > #undef DEF_INTERNAL_FN > > > > diff --git a/gcc/internal-fn.h b/gcc/internal-fn.h > > > > index bccee1c3e09..dbdb1e6bad2 100644 > > > > --- a/gcc/internal-fn.h > > > > +++ b/gcc/internal-fn.h > > > > @@ -263,6 +263,8 @@ extern void expand_DIVMODBITINT (internal_fn, gcall > > > *); > > > > extern void expand_FLOATTOBITINT (internal_fn, gcall *); > > > > extern void expand_BITINTTOFLOAT (internal_fn, gcall *); > > > > extern void expand_POPCOUNT (internal_fn, gcall *); > > > > +extern void expand_SAT_ADD (internal_fn, gcall *); > > > > +extern bool internal_SAT_ADD_fn_supported_p (tree, optimization_type); > > > > > > > > extern bool vectorized_internal_fn_supported_p (internal_fn, tree); > > > > > > > > > Note this patch is a draft for validation, no test are invovled here. > > > > > > > > > > gcc/ChangeLog: > > > > > > > > > > * builtins.def (BUILT_IN_US_PLUS): Add builtin def. > > > > > (BUILT_IN_US_PLUSIMAX): Ditto. > > > > > (BUILT_IN_US_PLUSL): Ditto. > > > > > (BUILT_IN_US_PLUSLL): Ditto. > > > > > (BUILT_IN_US_PLUSG): Ditto. > > > > > * config/riscv/riscv-protos.h (riscv_expand_us_plus): Add new > > > > > func decl for expanding us_plus. > > > > > * config/riscv/riscv.cc (riscv_expand_us_plus): Add new func > > > > > impl for expanding us_plus. > > > > > * config/riscv/riscv.md (us_plus<mode>3): Add new pattern impl > > > > > us_plus<mode>3. > > > > > * internal-fn.cc (expand_US_PLUS): Add new func impl to expand > > > > > US_PLUS. > > > > > * internal-fn.def (US_PLUS): Add new INT_EXT_FN. > > > > > * internal-fn.h (expand_US_PLUS): Add new func decl. > > > > > * match.pd: Add new simplify pattern for us_plus. > > > > > * optabs.def (OPTAB_NL): Add new OPTAB_NL to US_PLUS rtl. 
> > > > > > > > > > Signed-off-by: Pan Li <pan2...@intel.com> > > > > > --- > > > > > gcc/builtins.def | 7 +++++ > > > > > gcc/config/riscv/riscv-protos.h | 1 + > > > > > gcc/config/riscv/riscv.cc | 46 > > > > > +++++++++++++++++++++++++++++++++ > > > > > gcc/config/riscv/riscv.md | 11 ++++++++ > > > > > gcc/internal-fn.cc | 26 +++++++++++++++++++ > > > > > gcc/internal-fn.def | 3 +++ > > > > > gcc/internal-fn.h | 1 + > > > > > gcc/match.pd | 17 ++++++++++++ > > > > > gcc/optabs.def | 2 ++ > > > > > 9 files changed, 114 insertions(+) > > > > > > > > > > diff --git a/gcc/builtins.def b/gcc/builtins.def > > > > > index f6f3e104f6a..0777b912cfa 100644 > > > > > --- a/gcc/builtins.def > > > > > +++ b/gcc/builtins.def > > > > > @@ -1055,6 +1055,13 @@ DEF_GCC_BUILTIN > > > (BUILT_IN_POPCOUNTIMAX, > > > > > "popcountimax", BT_FN_INT_UINTMAX > > > > > DEF_GCC_BUILTIN (BUILT_IN_POPCOUNTL, "popcountl", > > > BT_FN_INT_ULONG, > > > > > ATTR_CONST_NOTHROW_LEAF_LIST) > > > > > DEF_GCC_BUILTIN (BUILT_IN_POPCOUNTLL, "popcountll", > > > > > BT_FN_INT_ULONGLONG, ATTR_CONST_NOTHROW_LEAF_LIST) > > > > > DEF_GCC_BUILTIN (BUILT_IN_POPCOUNTG, "popcountg", > > > BT_FN_INT_VAR, > > > > > ATTR_CONST_NOTHROW_TYPEGENERIC_LEAF) > > > > > + > > > > > +DEF_GCC_BUILTIN (BUILT_IN_US_PLUS, "us_plus", BT_FN_INT_UINT, > > > > > ATTR_CONST_NOTHROW_LEAF_LIST) > > > > > +DEF_GCC_BUILTIN (BUILT_IN_US_PLUSIMAX, "us_plusimax", > > > > > BT_FN_INT_UINTMAX, ATTR_CONST_NOTHROW_LEAF_LIST) > > > > > +DEF_GCC_BUILTIN (BUILT_IN_US_PLUSL, "us_plusl", > > BT_FN_INT_ULONG, > > > > > ATTR_CONST_NOTHROW_LEAF_LIST) > > > > > +DEF_GCC_BUILTIN (BUILT_IN_US_PLUSLL, "us_plusll", > > > > > BT_FN_INT_ULONGLONG, ATTR_CONST_NOTHROW_LEAF_LIST) > > > > > +DEF_GCC_BUILTIN (BUILT_IN_US_PLUSG, "us_plusg", BT_FN_INT_VAR, > > > > > ATTR_CONST_NOTHROW_TYPEGENERIC_LEAF) > > > > > + > > > > > DEF_EXT_LIB_BUILTIN (BUILT_IN_POSIX_MEMALIGN, "posix_memalign", > > > > > BT_FN_INT_PTRPTR_SIZE_SIZE, ATTR_NOTHROW_NONNULL_LEAF) > > > > > 
DEF_GCC_BUILTIN (BUILT_IN_PREFETCH, "prefetch", > > > > > BT_FN_VOID_CONST_PTR_VAR, ATTR_NOVOPS_LEAF_LIST) > > > > > DEF_LIB_BUILTIN (BUILT_IN_REALLOC, "realloc", > > > > > BT_FN_PTR_PTR_SIZE, > > > > > ATTR_ALLOC_WARN_UNUSED_RESULT_SIZE_2_NOTHROW_LEAF_LIST) > > > > > diff --git a/gcc/config/riscv/riscv-protos.h > > > > > b/gcc/config/riscv/riscv-protos.h > > > > > index 80efdf2b7e5..ba6086f1f25 100644 > > > > > --- a/gcc/config/riscv/riscv-protos.h > > > > > +++ b/gcc/config/riscv/riscv-protos.h > > > > > @@ -132,6 +132,7 @@ extern void riscv_asm_output_external (FILE *, > > > > > const > > > tree, > > > > > const char *); > > > > > extern bool > > > > > riscv_zcmp_valid_stack_adj_bytes_p (HOST_WIDE_INT, int); > > > > > extern void riscv_legitimize_poly_move (machine_mode, rtx, rtx, rtx); > > > > > +extern void riscv_expand_us_plus (rtx, rtx, rtx); > > > > > > > > > > #ifdef RTX_CODE > > > > > extern void riscv_expand_int_scc (rtx, enum rtx_code, rtx, rtx, bool > > *invert_ptr > > > = > > > > > 0); > > > > > diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc > > > > > index 4100abc9dd1..23f08974f07 100644 > > > > > --- a/gcc/config/riscv/riscv.cc > > > > > +++ b/gcc/config/riscv/riscv.cc > > > > > @@ -10657,6 +10657,52 @@ riscv_vector_mode_supported_any_target_p > > > > > (machine_mode) > > > > > return true; > > > > > } > > > > > > > > > > +/* Emit insn for the saturation addu, aka (x + y) | - ((x + y) < x). > > > > > */ > > > > > +void > > > > > +riscv_expand_us_plus (rtx dest, rtx x, rtx y) > > > > > +{ > > > > > + machine_mode mode = GET_MODE (dest); > > > > > + rtx pmode_sum = gen_reg_rtx (Pmode); > > > > > + rtx pmode_lt = gen_reg_rtx (Pmode); > > > > > + rtx pmode_x = gen_lowpart (Pmode, x); > > > > > + rtx pmode_y = gen_lowpart (Pmode, y); > > > > > + rtx pmode_dest = gen_reg_rtx (Pmode); > > > > > + > > > > > + /* Step-1: sum = x + y */ > > > > > + if (mode == SImode && mode != Pmode) > > > > > + { /* Take addw to avoid the sum truncate. 
*/ > > > > > + rtx simode_sum = gen_reg_rtx (SImode); > > > > > + riscv_emit_binary (PLUS, simode_sum, x, y); > > > > > + emit_move_insn (pmode_sum, gen_lowpart (Pmode, simode_sum)); > > > > > + } > > > > > + else > > > > > + riscv_emit_binary (PLUS, pmode_sum, pmode_x, pmode_y); > > > > > + > > > > > + /* Step-1.1: truncate sum for HI and QI as we have no insn for add > > > > > QI/HI. > > */ > > > > > + if (mode == HImode || mode == QImode) > > > > > + { > > > > > + int mode_bits = GET_MODE_BITSIZE (mode).to_constant (); > > > > > + int shift_bits = GET_MODE_BITSIZE (Pmode) - mode_bits; > > > > > + > > > > > + gcc_assert (shift_bits > 0); > > > > > + > > > > > + riscv_emit_binary (ASHIFT, pmode_sum, pmode_sum, GEN_INT > > > (shift_bits)); > > > > > + riscv_emit_binary (LSHIFTRT, pmode_sum, pmode_sum, GEN_INT > > > > > (shift_bits)); > > > > > + } > > > > > + > > > > > + /* Step-2: lt = sum < x */ > > > > > + riscv_emit_binary (LTU, pmode_lt, pmode_sum, pmode_x); > > > > > + > > > > > + /* Step-3: lt = -lt */ > > > > > + riscv_emit_unary (NEG, pmode_lt, pmode_lt); > > > > > + > > > > > + /* Step-4: pmode_dest = sum | lt */ > > > > > + riscv_emit_binary (IOR, pmode_dest, pmode_lt, pmode_sum); > > > > > + > > > > > + /* Step-5: dest = pmode_dest */ > > > > > + emit_move_insn (dest, gen_lowpart (mode, pmode_dest)); > > > > > +} > > > > > + > > > > > /* Initialize the GCC target structure. 
*/ > > > > > #undef TARGET_ASM_ALIGNED_HI_OP > > > > > #define TARGET_ASM_ALIGNED_HI_OP "\t.half\t" > > > > > diff --git a/gcc/config/riscv/riscv.md b/gcc/config/riscv/riscv.md > > > > > index 3f7a023d941..eaa9867023c 100644 > > > > > --- a/gcc/config/riscv/riscv.md > > > > > +++ b/gcc/config/riscv/riscv.md > > > > > @@ -3841,6 +3841,17 @@ (define_insn "*large_load_address" > > > > > [(set_attr "type" "load") > > > > > (set (attr "length") (const_int 8))]) > > > > > > > > > > +(define_expand "us_plus<mode>3" > > > > > + [(match_operand:ANYI 0 "register_operand") > > > > > + (match_operand:ANYI 1 "register_operand") > > > > > + (match_operand:ANYI 2 "register_operand")] > > > > > + "" > > > > > + { > > > > > + riscv_expand_us_plus (operands[0], operands[1], operands[2]); > > > > > + DONE; > > > > > + } > > > > > +) > > > > > + > > > > > (include "bitmanip.md") > > > > > (include "crypto.md") > > > > > (include "sync.md") > > > > > diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc > > > > > index a07f25f3aee..a7341a57ffa 100644 > > > > > --- a/gcc/internal-fn.cc > > > > > +++ b/gcc/internal-fn.cc > > > > > @@ -5177,3 +5177,29 @@ expand_POPCOUNT (internal_fn fn, gcall *stmt) > > > > > emit_move_insn (plhs, cmp); > > > > > } > > > > > } > > > > > + > > > > > +void > > > > > +expand_US_PLUS (internal_fn fn, gcall *stmt) > > > > > +{ > > > > > + tree lhs = gimple_call_lhs (stmt); > > > > > + tree rhs_0 = gimple_call_arg (stmt, 0); > > > > > + tree rhs_1 = gimple_call_arg (stmt, 1); > > > > > + > > > > > + do_pending_stack_adjust (); > > > > > + > > > > > + rtx target = expand_expr (lhs, NULL_RTX, VOIDmode, EXPAND_WRITE); > > > > > + rtx op_0 = expand_normal (rhs_0); > > > > > + rtx op_1 = expand_normal (rhs_1); > > > > > + > > > > > + class expand_operand ops[3]; > > > > > + > > > > > + create_output_operand (&ops[0], target, TYPE_MODE (TREE_TYPE > > > > > (lhs))); > > > > > + create_output_operand (&ops[1], op_0, TYPE_MODE (TREE_TYPE > > (rhs_0))); > > > > > + 
create_output_operand (&ops[2], op_1, TYPE_MODE (TREE_TYPE > > (rhs_1))); > > > > > + > > > > > + insn_code code = optab_handler (us_plus_optab, TYPE_MODE (TREE_TYPE > > > > > (rhs_0))); > > > > > + expand_insn (code, 3, ops); > > > > > + > > > > > + if (!rtx_equal_p (target, ops[0].value)) > > > > > + emit_move_insn (target, ops[0].value); > > > > > +} > > > > > > > > This can be simplified by calling expand_binary_optab_fn instead. See my template. > > > > > > > > > diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def > > > > > index c14d30365c1..b1d7b5a0307 100644 > > > > > --- a/gcc/internal-fn.def > > > > > +++ b/gcc/internal-fn.def > > > > > @@ -447,6 +447,9 @@ DEF_INTERNAL_INT_FN (FFS, ECF_CONST | > > > > > ECF_NOTHROW, ffs, unary) > > > > > DEF_INTERNAL_INT_FN (PARITY, ECF_CONST | ECF_NOTHROW, parity, > > unary) > > > > > DEF_INTERNAL_INT_EXT_FN (POPCOUNT, ECF_CONST | ECF_NOTHROW, > > > > > popcount, unary) > > > > > > > > > > +/* Binary integer ops. */ > > > > > +DEF_INTERNAL_INT_EXT_FN (US_PLUS, ECF_CONST | ECF_NOTHROW, > > > us_plus, > > > > > binary) > > > > > + > > > > > DEF_INTERNAL_FN (GOMP_TARGET_REV, ECF_NOVOPS | ECF_LEAF | > > > > > ECF_NOTHROW, NULL) > > > > > DEF_INTERNAL_FN (GOMP_USE_SIMT, ECF_NOVOPS | ECF_LEAF | > > > > > ECF_NOTHROW, NULL) > > > > > DEF_INTERNAL_FN (GOMP_SIMT_ENTER, ECF_LEAF | ECF_NOTHROW, > > NULL) > > > > > diff --git a/gcc/internal-fn.h b/gcc/internal-fn.h > > > > > index bccee1c3e09..46e404b4a49 100644 > > > > > --- a/gcc/internal-fn.h > > > > > +++ b/gcc/internal-fn.h > > > > > @@ -263,6 +263,7 @@ extern void expand_DIVMODBITINT (internal_fn, > > gcall > > > *); > > > > > extern void expand_FLOATTOBITINT (internal_fn, gcall *); > > > > > extern void expand_BITINTTOFLOAT (internal_fn, gcall *); > > > > > extern void expand_POPCOUNT (internal_fn, gcall *); > > > > > +extern void expand_US_PLUS (internal_fn, gcall *); > > > > > > > > > > extern bool vectorized_internal_fn_supported_p (internal_fn, tree); > > > > > >
> > > > diff --git a/gcc/match.pd b/gcc/match.pd > > > > > index c5b6540f939..f45fd58ad23 100644 > > > > > --- a/gcc/match.pd > > > > > +++ b/gcc/match.pd > > > > > @@ -10265,3 +10265,20 @@ and, > > > > > } > > > > > (if (full_perm_p) > > > > > (vec_perm (op@3 @0 @1) @3 @2)))))) > > > > > + > > > > > +#if GIMPLE > > > > > + > > > > > +/* Unsigned saturation add, aka: > > > > > + SAT_ADDU = (X + Y) | - ((X + Y) < X) or > > > > > + SAT_ADDU = (X + Y) | - ((X + Y) < Y). */ > > > > > +(simplify > > > > > + (bit_ior:c (plus:c@2 @0 @1) (negate (convert (lt @2 @0)))) > > > > > + (if (optimize > > > > > + && INTEGRAL_TYPE_P (type) > > > > > + && TYPE_UNSIGNED (TREE_TYPE (@0)) > > > > > + && types_match (type, TREE_TYPE (@0)) > > > > > + && types_match (type, TREE_TYPE (@1)) > > > > > + && direct_internal_fn_supported_p (IFN_US_PLUS, type, > > > > > OPTIMIZE_FOR_BOTH)) > > > > > + (IFN_US_PLUS @0 @1))) > > > > > + > > > > > +#endif > > > > > > > > With the version above you can drop the #if GIMPLE and the > > > > > + && direct_internal_fn_supported_p (IFN_US_PLUS, type, > > > > > > > > Check. > > > > > > > > Thanks, > > > > Tamar > > > > > > > > > diff --git a/gcc/optabs.def b/gcc/optabs.def > > > > > index ad14f9328b9..5855c4e0834 100644 > > > > > --- a/gcc/optabs.def > > > > > +++ b/gcc/optabs.def > > > > > @@ -179,6 +179,8 @@ OPTAB_NL(clrsb_optab, "clrsb$a2", CLRSB, "clrsb", > > '2', > > > > > gen_int_libfunc) > > > > > OPTAB_NL(popcount_optab, "popcount$a2", POPCOUNT, "popcount", '2', > > > > > gen_int_libfunc) > > > > > OPTAB_NL(parity_optab, "parity$a2", PARITY, "parity", '2', > > > > > gen_int_libfunc) > > > > > > > > > > +OPTAB_NL(us_plus_optab, "us_plus$a3", US_PLUS, "us_plus", '3', > > > > > gen_int_libfunc) > > > > > + > > > > > /* Comparison libcalls for integers MUST come in pairs, > > > > > signed/unsigned. 
*/ > > > > > OPTAB_NL(cmp_optab, NULL, UNKNOWN, "cmp", '2', > > > gen_int_fp_fixed_libfunc) > > > > > OPTAB_NL(ucmp_optab, NULL, UNKNOWN, "ucmp", '2', gen_int_libfunc) > > > > > -- > > > > > 2.34.1 > > > >