Hi Akram,

> On 14 Nov 2024, at 16:53, Akram Ahmad <akram.ah...@arm.com> wrote:
>
> This renames the existing {s,u}q{add,sub} instructions to use the
> standard names {s,u}s{add,sub}3 which are used by IFN_SAT_ADD and
> IFN_SAT_SUB.
>
> The NEON intrinsics for saturating arithmetic and their corresponding
> builtins are changed to use these standard names too.
>
> Using the standard names for the instructions causes 32 and 64-bit
> unsigned scalar saturating arithmetic to use the NEON instructions,
> resulting in an additional (and inefficient) FMOV to be generated when
> the original operands are in GP registers. This patch therefore also
> restores the original behaviour of using the adds/subs instructions
> in this circumstance.
>
> Furthermore, this patch introduces a new optimisation for signed 32
> and 64-bit scalar saturating arithmetic which uses adds/subs in place
> of the NEON instruction.
>
> Addition, before:
>	fmov	d0, x0
>	fmov	d1, x1
>	sqadd	d0, d0, d1
>	fmov	x0, d0
>
> Addition, after:
>	asr	x2, x1, 63
>	adds	x0, x0, x1
>	eor	x2, x2, 0x8000000000000000
>	csinv	x0, x0, x2, vc
>
> In the above example, subtraction replaces the adds with subs and the
> csinv with csel. The 32-bit case follows the same approach. Arithmetic
> with a constant operand is simplified further by directly storing the
> saturating limit in the temporary register, resulting in only three
> instructions being used. It is important to note that this only works
> when early-ra is disabled due to an early-ra bug which erroneously
> assigns FP registers to the operands; if early-ra is enabled, then the
> original behaviour (NEON instruction) occurs.
>
> Additional tests are written for the scalar and Adv. SIMD cases to
> ensure that the correct instructions are used. The NEON intrinsics are
> already tested elsewhere. The signed scalar case is also tested with
> an execution test to check the results.
>
> gcc/ChangeLog:
>
>	* config/aarch64/aarch64-builtins.cc: Expand iterators.
>	* config/aarch64/aarch64-simd-builtins.def: Use standard names.
>	* config/aarch64/aarch64-simd.md: Use standard names, split insn
>	definitions on signedness of operator and type of operands.
>	* config/aarch64/arm_neon.h: Use standard builtin names.
>	* config/aarch64/iterators.md: Add VSDQ_I_QI_HI iterator to
>	simplify splitting of insn for scalar arithmetic.
>
> gcc/testsuite/ChangeLog:
>
>	* gcc.target/aarch64/advsimd-intrinsics/saturating_arithmetic_autovect.inc:
>	Template file for unsigned vector saturating arithmetic tests.
>	* gcc.target/aarch64/advsimd-intrinsics/saturating_arithmetic_autovect_1.c:
>	8-bit vector type tests.
>	* gcc.target/aarch64/advsimd-intrinsics/saturating_arithmetic_autovect_2.c:
>	16-bit vector type tests.
>	* gcc.target/aarch64/advsimd-intrinsics/saturating_arithmetic_autovect_3.c:
>	32-bit vector type tests.
>	* gcc.target/aarch64/advsimd-intrinsics/saturating_arithmetic_autovect_4.c:
>	64-bit vector type tests.
>	* gcc.target/aarch64/saturating_arithmetic.inc: Template file
>	for scalar saturating arithmetic tests.
>	* gcc.target/aarch64/saturating_arithmetic_1.c: 8-bit tests.
>	* gcc.target/aarch64/saturating_arithmetic_2.c: 16-bit tests.
>	* gcc.target/aarch64/saturating_arithmetic_3.c: 32-bit tests.
>	* gcc.target/aarch64/saturating_arithmetic_4.c: 64-bit tests.
>	* gcc.target/aarch64/saturating_arithmetic_signed.c: Signed tests.
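Just to check my understanding of the new sequence: the asr/eor pair
materialises the complement of the saturation bound from the sign of the
second operand, and the csinv re-inverts it only when V is set. In C this is
roughly the following (my own untested paraphrase, not part of the patch):

  #include <stdint.h>

  /* Untested paraphrase of the signed DImode sequence above.  */
  int64_t
  sadd64_equiv (int64_t a, int64_t b)
  {
    int64_t sum;
    if (!__builtin_add_overflow (a, b, &sum))
      return sum;
    /* (b >> 63) ^ INT64_MIN is INT64_MIN for b >= 0 and INT64_MAX for
       b < 0, i.e. the inverse of the saturation value in each case.  */
    return ~((b >> 63) ^ INT64_MIN);
  }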
> ---
> gcc/config/aarch64/aarch64-builtins.cc        |  13 +
> gcc/config/aarch64/aarch64-simd-builtins.def  |   8 +-
> gcc/config/aarch64/aarch64-simd.md            | 209 ++++++++++++++-
> gcc/config/aarch64/arm_neon.h                 |  96 +++----
> gcc/config/aarch64/iterators.md               |   4 +
> .../saturating_arithmetic_autovect.inc        |  58 +++++
> .../saturating_arithmetic_autovect_1.c        |  79 ++++++
> .../saturating_arithmetic_autovect_2.c        |  79 ++++++
> .../saturating_arithmetic_autovect_3.c        |  75 ++++++
> .../saturating_arithmetic_autovect_4.c        |  77 ++++++
> .../aarch64/saturating-arithmetic-signed.c    | 244 ++++++++++++++++++
> .../aarch64/saturating_arithmetic.inc         |  39 +++
> .../aarch64/saturating_arithmetic_1.c         |  36 +++
> .../aarch64/saturating_arithmetic_2.c         |  36 +++
> .../aarch64/saturating_arithmetic_3.c         |  30 +++
> .../aarch64/saturating_arithmetic_4.c         |  30 +++
> 16 files changed, 1057 insertions(+), 56 deletions(-)
> create mode 100644 gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/saturating_arithmetic_autovect.inc
> create mode 100644 gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/saturating_arithmetic_autovect_1.c
> create mode 100644 gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/saturating_arithmetic_autovect_2.c
> create mode 100644 gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/saturating_arithmetic_autovect_3.c
> create mode 100644 gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/saturating_arithmetic_autovect_4.c
> create mode 100644 gcc/testsuite/gcc.target/aarch64/saturating-arithmetic-signed.c
> create mode 100644 gcc/testsuite/gcc.target/aarch64/saturating_arithmetic.inc
> create mode 100644 gcc/testsuite/gcc.target/aarch64/saturating_arithmetic_1.c
> create mode 100644 gcc/testsuite/gcc.target/aarch64/saturating_arithmetic_2.c
> create mode 100644 gcc/testsuite/gcc.target/aarch64/saturating_arithmetic_3.c
> create mode 100644 gcc/testsuite/gcc.target/aarch64/saturating_arithmetic_4.c
>
> diff --git a/gcc/config/aarch64/aarch64-builtins.cc b/gcc/config/aarch64/aarch64-builtins.cc
> index 86d96e47f01..79e43d0c0b3 100644
> --- a/gcc/config/aarch64/aarch64-builtins.cc
> +++ b/gcc/config/aarch64/aarch64-builtins.cc
> @@ -3863,6 +3863,19 @@ aarch64_general_gimple_fold_builtin (unsigned int fcode, gcall *stmt,
>	new_stmt = gimple_build_assign (gimple_call_lhs (stmt),
>					LSHIFT_EXPR, args[0], args[1]);
>	break;
> +
> +      /* lower saturating add/sub neon builtins to gimple.  */
> +    BUILTIN_VSDQ_I (BINOP, ssadd, 3, NONE)
> +    BUILTIN_VSDQ_I (BINOPU, usadd, 3, NONE)
> +      new_stmt = gimple_build_call_internal (IFN_SAT_ADD, 2, args[0], args[1]);
> +      gimple_call_set_lhs (new_stmt, gimple_call_lhs (stmt));
> +      break;
> +    BUILTIN_VSDQ_I (BINOP, sssub, 3, NONE)
> +    BUILTIN_VSDQ_I (BINOPU, ussub, 3, NONE)
> +      new_stmt = gimple_build_call_internal (IFN_SAT_SUB, 2, args[0], args[1]);
> +      gimple_call_set_lhs (new_stmt, gimple_call_lhs (stmt));
> +      break;
> +
>     BUILTIN_VSDQ_I_DI (BINOP, sshl, 0, NONE)
>     BUILTIN_VSDQ_I_DI (BINOP_UUS, ushl, 0, NONE)
>       {
> diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
> index 0814f8ba14f..43a0a62caee 100644
> --- a/gcc/config/aarch64/aarch64-simd-builtins.def
> +++ b/gcc/config/aarch64/aarch64-simd-builtins.def
> @@ -71,10 +71,10 @@
>   BUILTIN_VSDQ_I (BINOP, sqrshl, 0, NONE)
>   BUILTIN_VSDQ_I (BINOP_UUS, uqrshl, 0, NONE)
>   /* Implemented by aarch64_<su_optab><optab><mode>.  */
> -  BUILTIN_VSDQ_I (BINOP, sqadd, 0, NONE)
> -  BUILTIN_VSDQ_I (BINOPU, uqadd, 0, NONE)
> -  BUILTIN_VSDQ_I (BINOP, sqsub, 0, NONE)
> -  BUILTIN_VSDQ_I (BINOPU, uqsub, 0, NONE)
> +  BUILTIN_VSDQ_I (BINOP, ssadd, 3, NONE)
> +  BUILTIN_VSDQ_I (BINOPU, usadd, 3, NONE)
> +  BUILTIN_VSDQ_I (BINOP, sssub, 3, NONE)
> +  BUILTIN_VSDQ_I (BINOPU, ussub, 3, NONE)
>   /* Implemented by aarch64_<sur>qadd<mode>.  */
>   BUILTIN_VSDQ_I (BINOP_SSU, suqadd, 0, NONE)
>   BUILTIN_VSDQ_I (BINOP_UUS, usqadd, 0, NONE)
> diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
> index e456f693d2f..fc18a822c52 100644
> --- a/gcc/config/aarch64/aarch64-simd.md
> +++ b/gcc/config/aarch64/aarch64-simd.md
> @@ -5230,15 +5230,216 @@
> )
> ;; <su>q<addsub>
>
> -(define_insn "aarch64_<su_optab>q<addsub><mode><vczle><vczbe>"
> -  [(set (match_operand:VSDQ_I 0 "register_operand" "=w")
> -	(BINQOPS:VSDQ_I (match_operand:VSDQ_I 1 "register_operand" "w")
> -			(match_operand:VSDQ_I 2 "register_operand" "w")))]
> +(define_insn "<su_optab>s<addsub><mode>3<vczle><vczbe>"
> +  [(set (match_operand:VSDQ_I_QI_HI 0 "register_operand" "=w")
> +	(BINQOPS:VSDQ_I_QI_HI (match_operand:VSDQ_I_QI_HI 1 "register_operand" "w")
> +			      (match_operand:VSDQ_I_QI_HI 2 "register_operand" "w")))]
>   "TARGET_SIMD"
>   "<su_optab>q<addsub>\\t%<v>0<Vmtype>, %<v>1<Vmtype>, %<v>2<Vmtype>"
>   [(set_attr "type" "neon_q<addsub><q>")]
> )
>
> +(define_expand "<su_optab>s<addsub><mode>3<vczle><vczbe>"
You shouldn’t need the <vczle><vczbe> for the define_expand; that only does
something useful for define_insns.

> +  [(parallel [(set (match_operand:GPI 0 "register_operand")
> +		   (SBINQOPS:GPI (match_operand:GPI 1 "register_operand")
> +				 (match_operand:GPI 2 "aarch64_plus_operand")))
> +	      (clobber (scratch:GPI))
> +	      (clobber (reg:CC CC_REGNUM))])]
> +)
> +
> +;; Signed saturating arithmetic with GPR operands can be calculated without
> +;; moving these operands to and from FP regs if we introduce an additional
> +;; temporary GP reg.  This uses asr and xor to calculate the saturating limit
> +;; based on the sign of the second (register) operand, with adds/subs and csinv
> +;; or csel being used respectively to select the saturating limit if the
> +;; overflow flag is set.  The additional asr, xor instructions are cheaper than
> +;; introducing the three fmov instructions that would be needed to calculate
> +;; this result using the NEON instruction.  If operand2 is a constant

Minor nit, but we prefer to refer to NEON as “Advanced SIMD” in the AArch64
world. Generally, it would be good to have an example sequence for the
=r,r,JIr,=&r alternative in the comment here, as it’s quite specific.

> +;; value, then the temporary register is used to store the saturating limit
> +;; without the need for asr, xor.
> +
> +(define_insn_and_split "aarch64_<su_optab>s<addsub><mode>3<vczle><vczbe>"
> +  [(set (match_operand:GPI 0 "register_operand")
> +	(SBINQOPS:GPI (match_operand:GPI 1 "register_operand")
> +		      (match_operand:GPI 2 "aarch64_plus_operand")))
> +   (clobber (match_scratch:GPI 3))
> +   (clobber (reg:CC CC_REGNUM))]
> +  ""
> +  {@ [ cons: =0, 1 , 2   , =3 ; attrs: type      , arch, length ]
> +     [ w       , w , w   , X  ; neon_q<addsub><q>, *   , 4      ] <su_optab>q<addsub>\\t%<v>0<Vmtype>, %<v>1<Vmtype>, %<v>2<Vmtype>

The arch attribute for this alternative should be simd.
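On the example sequence: reusing the one from your commit message would be
enough, e.g. something like (just a suggestion, untested wording):

;; For example, a signed saturating DImode addition with both operands in
;; GP registers splits into:
;;	asr	x2, x1, 63
;;	adds	x0, x0, x1
;;	eor	x2, x2, 0x8000000000000000
;;	csinv	x0, x0, x2, vc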
> +     [ r       , r , JIr , &r ; *                , *   , 8      ] #
> +  }
> +  "&& reload_completed && GP_REGNUM_P (REGNO (operands[0]))"
> +  [(set (match_dup 0)
> +	(if_then_else:GPI
> +	  (match_operator 4 "comparison_operator" [(reg:CC_V CC_REGNUM) (const_int 0)])
> +	  (match_dup 5)
> +	  (match_dup 6)))]
> +  {
> +    if (REG_P (operands[2]))
> +      {
> +	switch (<MODE>mode)
> +	  {
> +	  case SImode:
> +	    emit_insn (gen_ashr<mode>3 (operands[3], operands[2],
> +					gen_int_mode (31, <MODE>mode)));
> +	    emit_insn (gen_xor<mode>3 (operands[3], operands[3],
> +				       gen_int_mode (0x80000000, <MODE>mode)));
> +	    break;
> +	  case DImode:
> +	    emit_insn (gen_ashr<mode>3 (operands[3], operands[2],
> +					gen_int_mode (63, <MODE>mode)));
> +	    emit_insn (gen_xor<mode>3 (operands[3], operands[3],
> +				       gen_int_mode (0x8000000000000000,
> +						     <MODE>mode)));
> +	    break;
> +	  default:
> +	    break;
> +	  }
> +	switch (<CODE>)
> +	  {
> +	  case SS_MINUS:
> +	    emit_insn (gen_sub<mode>3_compare1 (operands[0], operands[1],
> +						operands[2]));
> +	    break;
> +	  case SS_PLUS:
> +	    emit_insn (gen_add<mode>3_compare0 (operands[0], operands[1],
> +						operands[2]));
> +	    break;
> +	  default:
> +	    break;
> +	  }
> +
> +	rtx ccin = gen_rtx_REG (E_CC_Vmode, CC_REGNUM);
> +	switch (<CODE>)
> +	  {
> +	  case SS_PLUS:
> +	    operands[4] = gen_rtx_NE (<MODE>mode, ccin, const0_rtx);
> +	    operands[5] = gen_rtx_NOT (<MODE>mode, operands[3]);
> +	    operands[6] = operands[0];
> +	    break;
> +	  case SS_MINUS:
> +	    operands[4] = gen_rtx_EQ (<MODE>mode, ccin, const0_rtx);
> +	    operands[5] = operands[0];
> +	    operands[6] = operands[3];
> +	    break;
> +	  default:
> +	    break;
> +	  }
> +      }
> +    else
> +      {
> +	long imm = INTVAL (operands[2]);
> +	gcc_assert (imm != 0);
> +	rtx neg_imm = gen_int_mode (-imm, <MODE>mode);
> +	wide_int limit;
> +
> +	switch (<CODE>)
> +	  {
> +	  case SS_MINUS:
> +	    emit_insn (gen_sub<mode>3_compare1_imm (operands[0], operands[1],
> +						    operands[2], neg_imm));
> +	    limit = (imm >> 63) + 1 ? wi::min_value (<MODE>mode, SIGNED)
> +				    : wi::max_value (<MODE>mode, SIGNED);
> +	    break;
> +	  case SS_PLUS:
> +	    emit_insn (gen_sub<mode>3_compare1_imm (operands[0], operands[1],
> +						    neg_imm, operands[2]));
> +	    limit = (imm >> 63) + 1 ? wi::max_value (<MODE>mode, SIGNED)
> +				    : wi::min_value (<MODE>mode, SIGNED);
> +	    break;
> +	  default:
> +	    break;
> +	  }
> +
> +	rtx sat_limit = immed_wide_int_const (limit, <MODE>mode);
> +	emit_insn (gen_rtx_SET (operands[3], sat_limit));
> +
> +	rtx ccin = gen_rtx_REG (E_CC_Vmode, CC_REGNUM);
> +	operands[4] = gen_rtx_EQ (<MODE>mode, ccin, const0_rtx);
> +	operands[5] = operands[0];
> +	operands[6] = operands[3];
> +      }
> +  }
> +)
> +
> +;; If this is unsigned saturating arithmetic and the operands arrive in GP
> +;; registers, then it is possible to perform this arithmetic without using the
> +;; NEON instructions.  This avoids using unnecessary fmov instructions to move
> +;; either the operands or the result between GP and FP regs.  This is
> +;; only possible with SImode and DImode.
> +
> +(define_insn_and_split "<su_optab>s<addsub><mode>3<vczle><vczbe>"
> +  [(set (match_operand:GPI 0 "register_operand")
> +	(UBINQOPS:GPI (match_operand:GPI 1 "register_operand")
> +		      (match_operand:GPI 2 "aarch64_plus_operand")))
> +   (clobber (reg:CC CC_REGNUM))]
> +  ""
> +  {@ [ cons: =0, 1 , 2   ; attrs: type      , arch, length ]
> +     [ w       , w , w   ; neon_q<addsub><q>, *   , 4      ] <su_optab>q<addsub>\\t%<v>0<Vmtype>, %<v>1<Vmtype>, %<v>2<Vmtype>
> +     [ r       , r , JIr ; *                , *   , 8      ] #
> +  }
> +  "&& reload_completed && GP_REGNUM_P (REGNO (operands[0]))"
> +  [(set (match_dup 0)
> +	(if_then_else:GPI
> +	  (match_operator 3 "comparison_operator" [(reg:CC CC_REGNUM) (const_int 0)])
> +	  (match_dup 0)
> +	  (match_operand:GPI 4 "immediate_operand" "i")))]
> +  {
> +
> +    if (REG_P (operands[2]))
> +      {
> +	switch (<CODE>)
> +	  {
> +	  case US_MINUS:
> +	    emit_insn (gen_sub<mode>3_compare1 (operands[0], operands[1],
> +						operands[2]));
> +	    break;
> +	  case US_PLUS:
> +	    emit_insn (gen_add<mode>3_compare0 (operands[0], operands[1],
> +						operands[2]));
> +	    break;
> +	  default:
> +	    break;
> +	  }
> +      }
> +    else
> +      {
> +	unsigned long imm = UINTVAL (operands[2]);
> +	gcc_assert (imm != 0);
> +	rtx neg_imm = gen_int_mode (-imm, <MODE>mode);
> +	switch (<CODE>)
> +	  {
> +	  case US_MINUS:
> +	    emit_insn (gen_sub<mode>3_compare1_imm (operands[0], operands[1],
> +						    operands[2], neg_imm));
> +	    break;
> +	  case US_PLUS:
> +	    emit_insn (gen_sub<mode>3_compare1_imm (operands[0], operands[1],
> +						    neg_imm, operands[2]));
> +	    break;
> +	  default:
> +	    break;
> +	  }
> +      }
> +
> +    rtx ccin = gen_rtx_REG (CC_Cmode, CC_REGNUM);
> +    switch (<CODE>)
> +      {
> +      case US_PLUS:
> +	operands[3] = gen_rtx_LTU (<MODE>mode, ccin, const0_rtx);
> +	operands[4] = gen_int_mode (-1, <MODE>mode);
> +	break;
> +      case US_MINUS:
> +	operands[3] = gen_rtx_GEU (<MODE>mode, ccin, const0_rtx);
> +	operands[4] = const0_rtx;
> +	break;
> +      default:
> +	break;
> +      }
> +  }
> +)
> +
> ;; suqadd and usqadd
>
> (define_insn "aarch64_<sur>qadd<mode><vczle><vczbe>"

….

> diff --git a/gcc/testsuite/gcc.target/aarch64/saturating-arithmetic-signed.c b/gcc/testsuite/gcc.target/aarch64/saturating-arithmetic-signed.c
> new file mode 100644
> index 00000000000..429a2f9ed28
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/saturating-arithmetic-signed.c
> @@ -0,0 +1,244 @@
> +/* { dg-do run } */
> +/* { dg-options "-O2 --save-temps -mearly-ra=none" } */
> +/* { dg-final { check-function-bodies "**" "" "" } } */
> +
> +#include <stdint.h>
> +
> +/*
> +** sadd32:
> +**	asr	w([0-9]+), w1, 31
> +**	adds	w([0-9]+), (?:w0, w1|w1, w0)
> +**	eor	w\1, w\1, -2147483648
> +**	csinv	w0, w\2, w\1, vc
> +**	ret
> +*/
> +int32_t __attribute__((noipa))
> +sadd32 (int32_t __a, int32_t __b)
> +{
> +  return __builtin_aarch64_ssaddsi (__a, __b);
> +}

We avoid using the __builtin_aarch64_* builtins in test cases as they are
undocumented and we don’t make any guarantees about their stability to users.
I’d prefer if the saturating operation was open-coded in C. I expect the
midend machinery is smart enough to recognize the saturating logic for
scalars by now?
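Something along these lines (an untested sketch of mine, in the style of the
unsigned helpers in saturating_arithmetic.inc later in this patch) ought to
be picked up these days:

  /* Untested sketch: open-coded signed saturating addition.  On overflow
     a and b necessarily have the same sign, so b's sign selects the bound.  */
  int32_t __attribute__((noipa))
  sadd32 (int32_t a, int32_t b)
  {
    int32_t sum;
    if (!__builtin_add_overflow (a, b, &sum))
      return sum;
    return b < 0 ? INT32_MIN : INT32_MAX;
  }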
Thanks,
Kyrill

> +
> +/*
> +** sadd32_imm:
> +**	adds	w([0-9]+), w0, #67
> +**	mov	w([0-9]+), 2147483647
> +**	csel	w0, w\1, w\2, vc
> +**	ret
> +*/
> +int32_t __attribute__((noipa))
> +sadd32_imm (int32_t __a)
> +{
> +  return __builtin_aarch64_ssaddsi (__a, 67);
> +}
> +
> +/*
> +** sadd32_imm2:
> +**	subs	w([0-9]+), w0, 67
> +**	mov	w([0-9]+), -2147483648
> +**	csel	w0, w\1, w\2, vc
> +**	ret
> +*/
> +int32_t __attribute__((noipa))
> +sadd32_imm2 (int32_t __a)
> +{
> +  return __builtin_aarch64_ssaddsi (__a, -67);
> +}
> +
> +/*
> +** ssub32:
> +**	asr	w([0-9]+), w1, 31
> +**	subs	w([0-9]+), w0, w1
> +**	eor	w\1, w\1, -2147483648
> +**	csel	w0, w\2, w\1, vc
> +**	ret
> +*/
> +int32_t __attribute__((noipa))
> +ssub32 (int32_t __a, int32_t __b)
> +{
> +  return __builtin_aarch64_sssubsi (__a, __b);
> +}
> +
> +/*
> +** ssub32_imm:
> +**	subs	w([0-9]+), w0, 67
> +**	mov	w([0-9]+), -2147483648
> +**	csel	w0, w\1, w\2, vc
> +**	ret
> +*/
> +int32_t __attribute__((noipa))
> +ssub32_imm (int32_t __a)
> +{
> +  return __builtin_aarch64_sssubsi (__a, 67);
> +}
> +
> +/*
> +** ssub32_imm2:
> +**	adds	w([0-9]+), w0, #67
> +**	mov	w([0-9]+), 2147483647
> +**	csel	w0, w\1, w\2, vc
> +**	ret
> +*/
> +int32_t __attribute__((noipa))
> +ssub32_imm2 (int32_t __a)
> +{
> +  return __builtin_aarch64_sssubsi (__a, -67);
> +}
> +
> +/*
> +** sadd64:
> +**	asr	x([0-9]+), x1, 63
> +**	adds	x([0-9]+), (?:x0, x1|x1, x0)
> +**	eor	x\1, x\1, -9223372036854775808
> +**	csinv	x0, x\2, x\1, vc
> +**	ret
> +*/
> +int64_t __attribute__((noipa))
> +sadd64 (int64_t __a, int64_t __b)
> +{
> +  return __builtin_aarch64_ssadddi (__a, __b);
> +}
> +
> +/*
> +** sadd64_imm:
> +**	adds	x([0-9]+), x0, #67
> +**	mov	x([0-9]+), 9223372036854775807
> +**	csel	x0, x\1, x\2, vc
> +**	ret
> +*/
> +int64_t __attribute__((noipa))
> +sadd64_imm (int64_t __a)
> +{
> +  return __builtin_aarch64_ssadddi (__a, (int64_t) 67);
> +}
> +
> +/*
> +** sadd64_imm2:
> +**	subs	x([0-9]+), x0, 67
> +**	mov	x([0-9]+), -9223372036854775808
> +**	csel	x0, x\1, x\2, vc
> +**	ret
> +*/
> +int64_t __attribute__((noipa))
> +sadd64_imm2 (int64_t __a)
> +{
> +  return __builtin_aarch64_ssadddi (__a, (int64_t) -67);
> +}
> +
> +/*
> +** ssub64:
> +**	asr	x([0-9]+), x1, 63
> +**	subs	x([0-9]+), x0, x1
> +**	eor	x\1, x\1, -9223372036854775808
> +**	csel	x0, x\2, x\1, vc
> +**	ret
> +*/
> +int64_t __attribute__((noipa))
> +ssub64 (int64_t __a, int64_t __b)
> +{
> +  return __builtin_aarch64_sssubdi (__a, __b);
> +}
> +
> +/*
> +** ssub64_imm:
> +**	subs	x([0-9]+), x0, 67
> +**	mov	x([0-9]+), -9223372036854775808
> +**	csel	x0, x\1, x\2, vc
> +**	ret
> +*/
> +int64_t __attribute__((noipa))
> +ssub64_imm (int64_t __a)
> +{
> +  return __builtin_aarch64_sssubdi (__a, (int64_t) 67);
> +}
> +
> +/*
> +** ssub64_imm2:
> +**	adds	x([0-9]+), x0, #67
> +**	mov	x([0-9]+), 9223372036854775807
> +**	csel	x0, x\1, x\2, vc
> +**	ret
> +*/
> +int64_t __attribute__((noipa))
> +ssub64_imm2 (int64_t __a)
> +{
> +  return __builtin_aarch64_sssubdi (__a, (int64_t) -67);
> +}
> +
> +int
> +main (void)
> +{
> +  /* Addition:
> +     SAT_ADD(x, +ve), non-saturating
> +     SAT_ADD(x, +ve), saturating
> +     SAT_ADD(x, immediate +ve)
> +     SAT_ADD(x, immediate -ve)
> +     SAT_ADD(x, -ve), non-saturating
> +     SAT_ADD(x, -ve), saturating
> +
> +     Subtraction:
> +     SAT_SUB(x, +ve), non-saturating
> +     SAT_SUB(x, +ve), saturating
> +     SAT_SUB(x, immediate +ve)
> +     SAT_SUB(x, immediate -ve)
> +     SAT_SUB(x, -ve), non-saturating  */
> +
> +  int32_t a = 4;
> +  int32_t b = 70;
> +  int32_t c = 2147483647;
> +  int32_t d = (int32_t) -2147483648;
> +
> +  if (sadd32 (a, b) != (a + b))
> +    __builtin_abort ();
> +  if (sadd32 (a, c) != c)
> +    __builtin_abort ();
> +  if (sadd32_imm (a) != (a + 67))
> +    __builtin_abort ();
> +  if (sadd32_imm2 (a) != (a - 67))
> +    __builtin_abort ();
> +  if (sadd32 (a, -b) != (a - b))
> +    __builtin_abort ();
> +  if (sadd32 (a, d) != (d + 4))
> +    __builtin_abort ();
> +
> +  if (ssub32 (a, b) != (a - b))
> +    __builtin_abort ();
> +  if (ssub32 (-a, c) != d)
> +    __builtin_abort ();
> +  if (ssub32_imm (a) != (a - 67))
> +    __builtin_abort ();
> +  if (ssub32_imm2 (a) != (a + 67))
> +    __builtin_abort ();
> +  if (ssub32 (a, -b) != (a + b))
> +    __builtin_abort ();
> +
> +  int64_t a_64 = a;
> +  int64_t b_64 = b;
> +  int64_t c_64 = (int64_t) 9223372036854775807;
> +  int64_t d_64 = (int64_t) 0x8000000000000000;
> +
> +  if (sadd64 (a_64, b_64) != (a_64 + b_64))
> +    __builtin_abort ();
> +  if (sadd64 (a_64, c_64) != c_64)
> +    __builtin_abort ();
> +  if (sadd64_imm (a_64) != (a_64 + 67))
> +    __builtin_abort ();
> +  if (sadd64_imm2 (a_64) != (a_64 - 67))
> +    __builtin_abort ();
> +  if (sadd64 (a_64, -b_64) != (a_64 - b_64))
> +    __builtin_abort ();
> +  if (sadd64 (a_64, d_64) != (d_64 + 4))
> +    __builtin_abort ();
> +
> +  if (ssub64 (a_64, b_64) != (a_64 - b_64))
> +    __builtin_abort ();
> +  if (ssub64 (-a_64, c_64) != d_64)
> +    __builtin_abort ();
> +  if (ssub64_imm (a_64) != (a_64 - 67))
> +    __builtin_abort ();
> +  if (ssub64_imm2 (a_64) != (a_64 + 67))
> +    __builtin_abort ();
> +  if (ssub64 (a_64, -b_64) != (a_64 + b_64))
> +    __builtin_abort ();
> +
> +  return 0;
> +}
> \ No newline at end of file
> diff --git a/gcc/testsuite/gcc.target/aarch64/saturating_arithmetic.inc b/gcc/testsuite/gcc.target/aarch64/saturating_arithmetic.inc
> new file mode 100644
> index 00000000000..e979d535405
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/saturating_arithmetic.inc
> @@ -0,0 +1,39 @@
> +/* Template file for scalar saturating arithmetic validation.
> +
> +   This file defines scalar saturating addition and subtraction functions for a
> +   given type. This type, along with the corresponding minimum and maximum
> +   values for that type, must be defined by any test file which includes this
> +   template file.  */
> +
> +#ifndef SAT_ARIT_INC
> +#define SAT_ARIT_INC
> +
> +#include <limits.h>
> +
> +#ifndef UT
> +#define UT unsigned int
> +#define UMAX UINT_MAX
> +#define UMIN 0
> +#endif
> +
> +UT uadd (UT a, UT b)
> +{
> +  UT sum = a + b;
> +  return sum < a ? UMAX : sum;
> +}
> +
> +UT uadd2 (UT a, UT b)
> +{
> +  UT c;
> +  if (!__builtin_add_overflow(a, b, &c))
> +    return c;
> +  return UMAX;
> +}
> +
> +UT usub (UT a, UT b)
> +{
> +  UT sum = a - b;
> +  return sum > a ? UMIN : sum;
> +}
> +
> +#endif
> \ No newline at end of file
> diff --git a/gcc/testsuite/gcc.target/aarch64/saturating_arithmetic_1.c b/gcc/testsuite/gcc.target/aarch64/saturating_arithmetic_1.c
> new file mode 100644
> index 00000000000..56873f99b81
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/saturating_arithmetic_1.c
> @@ -0,0 +1,36 @@
> +/* { dg-do-compile } */
> +/* { dg-options "-O2 --save-temps" } */
> +/* { dg-final { check-function-bodies "**" "" "" } } */
> +
> +/*
> +** uadd:
> +**	dup	v([0-9]+).8b, w0
> +**	dup	v([0-9]+).8b, w1
> +**	uqadd	b([0-9]+), (?:b\2, b\1|b\1, b\2)
> +**	umov	w0, v\3.b\[0\]
> +**	ret
> +*/
> +/*
> +** uadd2:
> +**	dup	v([0-9]+).8b, w0
> +**	dup	v([0-9]+).8b, w1
> +**	uqadd	b([0-9]+), (?:b\2, b\1|b\1, b\2)
> +**	umov	w0, v\3.b\[0\]
> +**	ret
> +*/
> +/*
> +** usub: { xfail *-*-* }
> +**	dup	v([0-9]+).8b, w0
> +**	dup	v([0-9]+).8b, w1
> +**	uqsub	b([0-9]+), b\1, b\2
> +**	umov	w0, v\3.b\[0\]
> +**	ret
> +*/
> +
> +#include <limits.h>
> +
> +#define UT unsigned char
> +#define UMAX UCHAR_MAX
> +#define UMIN 0
> +
> +#include "saturating_arithmetic.inc"
> \ No newline at end of file
> diff --git a/gcc/testsuite/gcc.target/aarch64/saturating_arithmetic_2.c b/gcc/testsuite/gcc.target/aarch64/saturating_arithmetic_2.c
> new file mode 100644
> index 00000000000..a719aebbcf3
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/saturating_arithmetic_2.c
> @@ -0,0 +1,36 @@
> +/* { dg-do-compile } */
> +/* { dg-options "-O2 --save-temps" } */
> +/* { dg-final { check-function-bodies "**" "" "" } } */
> +
> +/*
> +** uadd:
> +**	dup	v([0-9]+).4h, w0
> +**	dup	v([0-9]+).4h, w1
> +**	uqadd	h([0-9]+), (?:h\2, h\1|h\1, h\2)
> +**	umov	w0, v\3.h\[0\]
> +**	ret
> +*/
> +/*
> +** uadd2:
> +**	dup	v([0-9]+).4h, w0
> +**	dup	v([0-9]+).4h, w1
> +**	uqadd	h([0-9]+), (?:h\2, h\1|h\1, h\2)
> +**	umov	w0, v\3.h\[0\]
> +**	ret
> +*/
> +/*
> +** usub: { xfail *-*-* }
> +**	dup	v([0-9]+).4h, w0
> +**	dup	v([0-9]+).4h, w1
> +**	uqsub	h([0-9]+), h\1, h\2
> +**	umov	w0, v\3.h\[0\]
> +**	ret
> +*/
> +
> +#include <limits.h>
> +
> +#define UT unsigned short
> +#define UMAX USHRT_MAX
> +#define UMIN 0
> +
> +#include "saturating_arithmetic.inc"
> \ No newline at end of file
> diff --git a/gcc/testsuite/gcc.target/aarch64/saturating_arithmetic_3.c b/gcc/testsuite/gcc.target/aarch64/saturating_arithmetic_3.c
> new file mode 100644
> index 00000000000..21517254519
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/saturating_arithmetic_3.c
> @@ -0,0 +1,30 @@
> +/* { dg-do compile { target { aarch64*-*-* } } } */
> +/* { dg-options "-O2 --save-temps -ftree-vectorize" } */
> +/* { dg-final { check-function-bodies "**" "" "" } } */
> +
> +/*
> +** uadd:
> +**	adds\tw([0-9]+), w([0-9]+), w([0-9]+)
> +**	csinv\tw\1, w\1, wzr, cc
> +**	ret
> +*/
> +/*
> +** uadd2:
> +**	adds\tw([0-9]+), w([0-9]+), w([0-9]+)
> +**	csinv\tw\1, w\1, wzr, cc
> +**	ret
> +*/
> +/*
> +** usub:
> +**	subs\tw([0-9]+), w([0-9]+), w([0-9]+)
> +**	csel\tw\1, w\1, wzr, cs
> +**	ret
> +*/
> +
> +#include <limits.h>
> +
> +#define UT unsigned int
> +#define UMAX UINT_MAX
> +#define UMIN 0
> +
> +#include "saturating_arithmetic.inc"
> \ No newline at end of file
> diff --git a/gcc/testsuite/gcc.target/aarch64/saturating_arithmetic_4.c b/gcc/testsuite/gcc.target/aarch64/saturating_arithmetic_4.c
> new file mode 100644
> index 00000000000..363d0a79a73
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/saturating_arithmetic_4.c
> @@ -0,0 +1,30 @@
> +/* { dg-do compile { target { aarch64*-*-* } } } */
> +/* { dg-options "-O2 --save-temps -ftree-vectorize" } */
> +/* { dg-final { check-function-bodies "**" "" "" } } */
> +
> +/*
> +** uadd:
> +**	adds\tx([0-9]+), x([0-9]+), x([0-9]+)
> +**	csinv\tx\1, x\1, xzr, cc
> +**	ret
> +*/
> +/*
> +** uadd2:
> +**	adds\tx([0-9]+), x([0-9]+), x([0-9]+)
> +**	csinv\tx\1, x\1, xzr, cc
> +**	ret
> +*/
> +/*
> +** usub:
> +**	subs\tx([0-9]+), x([0-9]+), x([0-9]+)
> +**	csel\tx\1, x\1, xzr, cs
> +**	ret
> +*/
> +
> +#include <limits.h>
> +
> +#define UT unsigned long
> +#define UMAX ULONG_MAX
> +#define UMIN 0
> +
> +#include "saturating_arithmetic.inc"
> \ No newline at end of file
> --
> 2.34.1
>