> -----Original Message----- > From: Richard Sandiford <richard.sandif...@arm.com> > Sent: Friday, January 10, 2025 4:50 PM > To: Akram Ahmad <akram.ah...@arm.com> > Cc: ktkac...@nvidia.com; gcc-patches@gcc.gnu.org > Subject: Re: [PATCH v3 1/2] aarch64: Use standard names for saturating > arithmetic > > Akram Ahmad <akram.ah...@arm.com> writes: > > Ah whoops- I didn't see this before sending off V4 just now, my apologies. > > I'll try my best to get this implemented before the end of the day so that > > it doesn't miss the deadline. > > No rush! The delay here is entirely my fault, so no problem if the > patch lands early stage 4. >
Hi, I'm picking up the remainder of the patch for Akram. I believe I have addressed all the review comments, and I've also rebased to master which needed some small changes. Bootstrapped and regtested on aarch64-none-linux-gnu and no issues. Ok for master? Thanks, Tamar gcc/ChangeLog: * config/aarch64/aarch64-builtins.cc: Expand iterators. * config/aarch64/aarch64-simd-builtins.def: Use standard names. * config/aarch64/aarch64-simd.md: Use standard names, split insn definitions on signedness of operator and type of operands. * config/aarch64/arm_neon.h: Use standard builtin names. * config/aarch64/iterators.md: Add VSDQ_I_QI_HI iterator to simplify splitting of insn for unsigned scalar arithmetic. gcc/testsuite/ChangeLog: * gcc.target/aarch64/scalar_intrinsics.c: Update testcases. * gcc.target/aarch64/advsimd-intrinsics/saturating_arithmetic_autovect.inc: Template file for unsigned vector saturating arithmetic tests. * gcc.target/aarch64/advsimd-intrinsics/saturating_arithmetic_autovect_1.c: 8-bit vector type tests. * gcc.target/aarch64/advsimd-intrinsics/saturating_arithmetic_autovect_2.c: 16-bit vector type tests. * gcc.target/aarch64/advsimd-intrinsics/saturating_arithmetic_autovect_3.c: 32-bit vector type tests. * gcc.target/aarch64/advsimd-intrinsics/saturating_arithmetic_autovect_4.c: 64-bit vector type tests. * gcc.target/aarch64/saturating_arithmetic.inc: Template file for scalar saturating arithmetic tests. * gcc.target/aarch64/saturating_arithmetic_1.c: 8-bit tests. * gcc.target/aarch64/saturating_arithmetic_2.c: 16-bit tests. * gcc.target/aarch64/saturating_arithmetic_3.c: 32-bit tests. * gcc.target/aarch64/saturating_arithmetic_4.c: 64-bit tests. 
Co-authored-by: Tamar Christina <tamar.christ...@arm.com> -- inline copy -- diff --git a/gcc/config/aarch64/aarch64-builtins.cc b/gcc/config/aarch64/aarch64-builtins.cc index 86eebc16885976387d941efc61c55975359b6099..6d5479c2e4492078312b05561d682ead9e9c2d13 100644 --- a/gcc/config/aarch64/aarch64-builtins.cc +++ b/gcc/config/aarch64/aarch64-builtins.cc @@ -5039,6 +5039,18 @@ aarch64_general_gimple_fold_builtin (unsigned int fcode, gcall *stmt, new_stmt = gimple_build_assign (gimple_call_lhs (stmt), LSHIFT_EXPR, args[0], args[1]); break; + /* lower saturating add/sub neon builtins to gimple. */ + BUILTIN_VSDQ_I (BINOP, ssadd, 3, DEFAULT) + BUILTIN_VSDQ_I (BINOPU, usadd, 3, DEFAULT) + new_stmt = gimple_build_call_internal (IFN_SAT_ADD, 2, args[0], args[1]); + gimple_call_set_lhs (new_stmt, gimple_call_lhs (stmt)); + break; + BUILTIN_VSDQ_I (BINOP, sssub, 3, DEFAULT) + BUILTIN_VSDQ_I (BINOPU, ussub, 3, DEFAULT) + new_stmt = gimple_build_call_internal (IFN_SAT_SUB, 2, args[0], args[1]); + gimple_call_set_lhs (new_stmt, gimple_call_lhs (stmt)); + break; + BUILTIN_VSDQ_I_DI (BINOP, sshl, 0, DEFAULT) BUILTIN_VSDQ_I_DI (BINOP_UUS, ushl, 0, DEFAULT) { diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def index 286272a331180d7682826dc3ec9e921b1ebbeab6..6cc45b18a723fc9621e2da6220231e61b621185a 100644 --- a/gcc/config/aarch64/aarch64-simd-builtins.def +++ b/gcc/config/aarch64/aarch64-simd-builtins.def @@ -71,10 +71,10 @@ BUILTIN_VSDQ_I (BINOP, sqrshl, 0, DEFAULT) BUILTIN_VSDQ_I (BINOP_UUS, uqrshl, 0, DEFAULT) /* Implemented by aarch64_<su_optab><optab><mode>. 
*/ - BUILTIN_VSDQ_I (BINOP, sqadd, 0, DEFAULT) - BUILTIN_VSDQ_I (BINOPU, uqadd, 0, DEFAULT) - BUILTIN_VSDQ_I (BINOP, sqsub, 0, DEFAULT) - BUILTIN_VSDQ_I (BINOPU, uqsub, 0, DEFAULT) + BUILTIN_VSDQ_I (BINOP, ssadd, 3, DEFAULT) + BUILTIN_VSDQ_I (BINOPU, usadd, 3, DEFAULT) + BUILTIN_VSDQ_I (BINOP, sssub, 3, DEFAULT) + BUILTIN_VSDQ_I (BINOPU, ussub, 3, DEFAULT) /* Implemented by aarch64_<sur>qadd<mode>. */ BUILTIN_VSDQ_I (BINOP_SSU, suqadd, 0, DEFAULT) BUILTIN_VSDQ_I (BINOP_UUS, usqadd, 0, DEFAULT) diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md index eeb626f129a812d11de8c7b90cb20633739baac6..e2afe87e5130cc066b8348659209ab40747327e5 100644 --- a/gcc/config/aarch64/aarch64-simd.md +++ b/gcc/config/aarch64/aarch64-simd.md @@ -5162,15 +5162,214 @@ (define_insn "*aarch64_vgetfmulx<mode>" ) ;; <su>q<addsub> -(define_insn "aarch64_<su_optab>q<addsub><mode><vczle><vczbe>" - [(set (match_operand:VSDQ_I 0 "register_operand" "=w") - (BINQOPS:VSDQ_I (match_operand:VSDQ_I 1 "register_operand" "w") - (match_operand:VSDQ_I 2 "register_operand" "w")))] +(define_insn "<su_optab>s<addsub><mode>3<vczle><vczbe>" + [(set (match_operand:VSDQ_I_QI_HI 0 "register_operand" "=w") + (BINQOPS:VSDQ_I_QI_HI + (match_operand:VSDQ_I_QI_HI 1 "register_operand" "w") + (match_operand:VSDQ_I_QI_HI 2 "register_operand" "w")))] "TARGET_SIMD" "<su_optab>q<addsub>\\t%<v>0<Vmtype>, %<v>1<Vmtype>, %<v>2<Vmtype>" [(set_attr "type" "neon_q<addsub><q>")] ) +(define_expand "<su_optab>s<addsub><mode>3" + [(parallel + [(set (match_operand:GPI 0 "register_operand") + (SBINQOPS:GPI (match_operand:GPI 1 "register_operand") + (match_operand:GPI 2 "aarch64_plus_operand"))) + (clobber (scratch:GPI)) + (clobber (reg:CC CC_REGNUM))])] +) + +;; Introducing a temporary GP reg allows signed saturating arithmetic with GPR +;; operands to be calculated without the use of costly transfers to and from FP +;; registers. 
For example, saturating addition usually uses three FMOVs: +;; +;; fmov d0, x0 +;; fmov d1, x1 +;; sqadd d0, d0, d1 +;; fmov x0, d0 +;; +;; Using a temporary register results in three cheaper instructions being used +;; in place of the three FMOVs, which calculate the saturating limit accounting +;; for the signedness of operand2: +;; +;; asr x2, x1, 63 +;; adds x0, x0, x1 +;; eor x2, x2, 0x8000000000000000 +;; csinv x0, x0, x2, vc +;; +;; If operand2 is a constant value, the temporary register can be used to store +;; the saturating limit without the need for asr, xor to calculate said limit. + +(define_insn_and_split "aarch64_<su_optab>s<addsub><mode>3<vczle><vczbe>" + [(set (match_operand:GPI 0 "register_operand") + (SBINQOPS:GPI (match_operand:GPI 1 "register_operand") + (match_operand:GPI 2 "aarch64_plus_operand"))) + (clobber (match_scratch:GPI 3)) + (clobber (reg:CC CC_REGNUM))] + "" + {@ [ cons: =0, 1 , 2 , =3 ; attrs: type , arch , length ] + [ w , w , w , X ; neon_q<addsub><q> , simd , 4 ] <su_optab>q<addsub>\\t%<v>0<Vmtype>, %<v>1<Vmtype>, %<v>2<Vmtype> + [ r , r , JIr , &r ; * , * , 8 ] # + } + "&& reload_completed && GP_REGNUM_P (REGNO (operands[0]))" + [(set (match_dup 0) + (if_then_else:GPI + (match_dup 4) + (match_dup 5) + (match_dup 6)))] + { + if (REG_P (operands[2])) + { + rtx shift_constant = gen_int_mode (GET_MODE_BITSIZE (<MODE>mode) - 1, + <MODE>mode); + auto limit = HOST_WIDE_INT_1U << (GET_MODE_BITSIZE (<MODE>mode) - 1); + rtx limit_constant = gen_int_mode (limit, <MODE>mode); + emit_insn (gen_ashr<mode>3 (operands[3], operands[2], shift_constant)); + emit_insn (gen_xor<mode>3 (operands[3], operands[3], limit_constant)); + + switch (<CODE>) + { + case SS_MINUS: + emit_insn (gen_sub<mode>3_compare1 (operands[0], operands[1], + operands[2])); + break; + case SS_PLUS: + emit_insn (gen_add<mode>3_compare0 (operands[0], operands[1], + operands[2])); + break; + default: + gcc_unreachable (); + } + + rtx ccin = gen_rtx_REG (E_CC_Vmode, CC_REGNUM); 
+ switch (<CODE>) + { + case SS_PLUS: + operands[4] = gen_rtx_NE (<MODE>mode, ccin, const0_rtx); + operands[5] = gen_rtx_NOT (<MODE>mode, operands[3]); + operands[6] = operands[0]; + break; + case SS_MINUS: + operands[4] = gen_rtx_EQ (<MODE>mode, ccin, const0_rtx); + operands[5] = operands[0]; + operands[6] = operands[3]; + break; + default: + gcc_unreachable (); + } + } + else + { + auto imm = INTVAL (operands[2]); + rtx neg_imm = gen_int_mode (-imm, <MODE>mode); + wide_int limit; + + switch (<CODE>) + { + case SS_MINUS: + emit_insn (gen_sub<mode>3_compare1_imm (operands[0], operands[1], + operands[2], neg_imm)); + limit = imm >= 0 ? wi::min_value (<MODE>mode, SIGNED) + : wi::max_value (<MODE>mode, SIGNED); + break; + case SS_PLUS: + emit_insn (gen_sub<mode>3_compare1_imm (operands[0], operands[1], + neg_imm, operands[2])); + limit = imm >= 0 ? wi::max_value (<MODE>mode, SIGNED) + : wi::min_value (<MODE>mode, SIGNED); + break; + default: + gcc_unreachable (); + } + + rtx sat_limit = immed_wide_int_const (limit, <MODE>mode); + emit_insn (gen_rtx_SET (operands[3], sat_limit)); + + rtx ccin = gen_rtx_REG (E_CC_Vmode, CC_REGNUM); + operands[4] = gen_rtx_EQ (<MODE>mode, ccin, const0_rtx); + operands[5] = operands[0]; + operands[6] = operands[3]; + } + } +) + +;; Unsigned saturating arithmetic with GPR operands can be optimised similarly +;; to the signed case, albeit without the need for a temporary register as the +;; saturating limit can be inferred from the <addsub> code. This applies only +;; to SImode and DImode. 
+ +(define_insn_and_split "<su_optab>s<addsub><mode>3<vczle><vczbe>" + [(set (match_operand:GPI 0 "register_operand") + (UBINQOPS:GPI (match_operand:GPI 1 "register_operand") + (match_operand:GPI 2 "aarch64_plus_operand"))) + (clobber (reg:CC CC_REGNUM))] + "" + {@ [ cons: =0, 1 , 2 ; attrs: type , arch , length ] + [ w , w , w ; neon_q<addsub><q> , simd , 4 ] <su_optab>q<addsub>\\t%<v>0<Vmtype>, %<v>1<Vmtype>, %<v>2<Vmtype> + [ r , r , JIr ; * , * , 8 ] # + } + "&& reload_completed && GP_REGNUM_P (REGNO (operands[0]))" + [(set (match_dup 0) + (if_then_else:GPI + (match_dup 3) + (match_dup 0) + (match_dup 4)))] + { + + if (REG_P (operands[2])) + { + switch (<CODE>) + { + case US_MINUS: + emit_insn (gen_sub<mode>3_compare1 (operands[0], operands[1], + operands[2])); + break; + case US_PLUS: + emit_insn (gen_add<mode>3_compare0 (operands[0], operands[1], + operands[2])); + break; + default: + gcc_unreachable (); + } + } + else + { + auto imm = UINTVAL (operands[2]); + rtx neg_imm = gen_int_mode (-imm, <MODE>mode); + switch (<CODE>) + { + case US_MINUS: + emit_insn (gen_sub<mode>3_compare1_imm (operands[0], operands[1], + operands[2], neg_imm)); + break; + case US_PLUS: + emit_insn (gen_sub<mode>3_compare1_imm (operands[0], operands[1], + neg_imm, operands[2])); + break; + default: + gcc_unreachable (); + } + } + + rtx ccin = gen_rtx_REG (CCmode, CC_REGNUM); + switch (<CODE>) + { + case US_PLUS: + operands[3] = gen_rtx_LTU (<MODE>mode, ccin, const0_rtx); + operands[4] = gen_int_mode (-1, <MODE>mode); + break; + case US_MINUS: + operands[3] = gen_rtx_GEU (<MODE>mode, ccin, const0_rtx); + operands[4] = const0_rtx; + break; + default: + gcc_unreachable (); + } + } +) + ;; suqadd and usqadd (define_insn "aarch64_<sur>qadd<mode><vczle><vczbe>" diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h index 33594cb65d28a77f95554e6188d5f5ea10ebe60b..4899acead9b7c04a7a8f45a7d308e94e369e2186 100644 --- a/gcc/config/aarch64/arm_neon.h +++ 
b/gcc/config/aarch64/arm_neon.h @@ -1864,35 +1864,35 @@ __extension__ extern __inline int8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqadd_s8 (int8x8_t __a, int8x8_t __b) { - return (int8x8_t) __builtin_aarch64_sqaddv8qi (__a, __b); + return (int8x8_t) __builtin_aarch64_ssaddv8qi (__a, __b); } __extension__ extern __inline int16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqadd_s16 (int16x4_t __a, int16x4_t __b) { - return (int16x4_t) __builtin_aarch64_sqaddv4hi (__a, __b); + return (int16x4_t) __builtin_aarch64_ssaddv4hi (__a, __b); } __extension__ extern __inline int32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqadd_s32 (int32x2_t __a, int32x2_t __b) { - return (int32x2_t) __builtin_aarch64_sqaddv2si (__a, __b); + return (int32x2_t) __builtin_aarch64_ssaddv2si (__a, __b); } __extension__ extern __inline int64x1_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqadd_s64 (int64x1_t __a, int64x1_t __b) { - return (int64x1_t) {__builtin_aarch64_sqadddi (__a[0], __b[0])}; + return (int64x1_t) {__builtin_aarch64_ssadddi (__a[0], __b[0])}; } __extension__ extern __inline uint8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqadd_u8 (uint8x8_t __a, uint8x8_t __b) { - return __builtin_aarch64_uqaddv8qi_uuu (__a, __b); + return __builtin_aarch64_usaddv8qi_uuu (__a, __b); } __extension__ extern __inline int8x8_t @@ -2151,189 +2151,189 @@ __extension__ extern __inline uint16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqadd_u16 (uint16x4_t __a, uint16x4_t __b) { - return __builtin_aarch64_uqaddv4hi_uuu (__a, __b); + return __builtin_aarch64_usaddv4hi_uuu (__a, __b); } __extension__ extern __inline uint32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqadd_u32 (uint32x2_t __a, uint32x2_t __b) { - return __builtin_aarch64_uqaddv2si_uuu (__a, __b); + return __builtin_aarch64_usaddv2si_uuu 
(__a, __b); } __extension__ extern __inline uint64x1_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqadd_u64 (uint64x1_t __a, uint64x1_t __b) { - return (uint64x1_t) {__builtin_aarch64_uqadddi_uuu (__a[0], __b[0])}; + return (uint64x1_t) {__builtin_aarch64_usadddi_uuu (__a[0], __b[0])}; } __extension__ extern __inline int8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqaddq_s8 (int8x16_t __a, int8x16_t __b) { - return (int8x16_t) __builtin_aarch64_sqaddv16qi (__a, __b); + return (int8x16_t) __builtin_aarch64_ssaddv16qi (__a, __b); } __extension__ extern __inline int16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqaddq_s16 (int16x8_t __a, int16x8_t __b) { - return (int16x8_t) __builtin_aarch64_sqaddv8hi (__a, __b); + return (int16x8_t) __builtin_aarch64_ssaddv8hi (__a, __b); } __extension__ extern __inline int32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqaddq_s32 (int32x4_t __a, int32x4_t __b) { - return (int32x4_t) __builtin_aarch64_sqaddv4si (__a, __b); + return (int32x4_t) __builtin_aarch64_ssaddv4si (__a, __b); } __extension__ extern __inline int64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqaddq_s64 (int64x2_t __a, int64x2_t __b) { - return (int64x2_t) __builtin_aarch64_sqaddv2di (__a, __b); + return (int64x2_t) __builtin_aarch64_ssaddv2di (__a, __b); } __extension__ extern __inline uint8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqaddq_u8 (uint8x16_t __a, uint8x16_t __b) { - return __builtin_aarch64_uqaddv16qi_uuu (__a, __b); + return __builtin_aarch64_usaddv16qi_uuu (__a, __b); } __extension__ extern __inline uint16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqaddq_u16 (uint16x8_t __a, uint16x8_t __b) { - return __builtin_aarch64_uqaddv8hi_uuu (__a, __b); + return __builtin_aarch64_usaddv8hi_uuu (__a, __b); } __extension__ extern __inline uint32x4_t 
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqaddq_u32 (uint32x4_t __a, uint32x4_t __b) { - return __builtin_aarch64_uqaddv4si_uuu (__a, __b); + return __builtin_aarch64_usaddv4si_uuu (__a, __b); } __extension__ extern __inline uint64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqaddq_u64 (uint64x2_t __a, uint64x2_t __b) { - return __builtin_aarch64_uqaddv2di_uuu (__a, __b); + return __builtin_aarch64_usaddv2di_uuu (__a, __b); } __extension__ extern __inline int8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqsub_s8 (int8x8_t __a, int8x8_t __b) { - return (int8x8_t) __builtin_aarch64_sqsubv8qi (__a, __b); + return (int8x8_t) __builtin_aarch64_sssubv8qi (__a, __b); } __extension__ extern __inline int16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqsub_s16 (int16x4_t __a, int16x4_t __b) { - return (int16x4_t) __builtin_aarch64_sqsubv4hi (__a, __b); + return (int16x4_t) __builtin_aarch64_sssubv4hi (__a, __b); } __extension__ extern __inline int32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqsub_s32 (int32x2_t __a, int32x2_t __b) { - return (int32x2_t) __builtin_aarch64_sqsubv2si (__a, __b); + return (int32x2_t) __builtin_aarch64_sssubv2si (__a, __b); } __extension__ extern __inline int64x1_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqsub_s64 (int64x1_t __a, int64x1_t __b) { - return (int64x1_t) {__builtin_aarch64_sqsubdi (__a[0], __b[0])}; + return (int64x1_t) {__builtin_aarch64_sssubdi (__a[0], __b[0])}; } __extension__ extern __inline uint8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqsub_u8 (uint8x8_t __a, uint8x8_t __b) { - return __builtin_aarch64_uqsubv8qi_uuu (__a, __b); + return __builtin_aarch64_ussubv8qi_uuu (__a, __b); } __extension__ extern __inline uint16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqsub_u16 (uint16x4_t __a, uint16x4_t 
__b) { - return __builtin_aarch64_uqsubv4hi_uuu (__a, __b); + return __builtin_aarch64_ussubv4hi_uuu (__a, __b); } __extension__ extern __inline uint32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqsub_u32 (uint32x2_t __a, uint32x2_t __b) { - return __builtin_aarch64_uqsubv2si_uuu (__a, __b); + return __builtin_aarch64_ussubv2si_uuu (__a, __b); } __extension__ extern __inline uint64x1_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqsub_u64 (uint64x1_t __a, uint64x1_t __b) { - return (uint64x1_t) {__builtin_aarch64_uqsubdi_uuu (__a[0], __b[0])}; + return (uint64x1_t) {__builtin_aarch64_ussubdi_uuu (__a[0], __b[0])}; } __extension__ extern __inline int8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqsubq_s8 (int8x16_t __a, int8x16_t __b) { - return (int8x16_t) __builtin_aarch64_sqsubv16qi (__a, __b); + return (int8x16_t) __builtin_aarch64_sssubv16qi (__a, __b); } __extension__ extern __inline int16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqsubq_s16 (int16x8_t __a, int16x8_t __b) { - return (int16x8_t) __builtin_aarch64_sqsubv8hi (__a, __b); + return (int16x8_t) __builtin_aarch64_sssubv8hi (__a, __b); } __extension__ extern __inline int32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqsubq_s32 (int32x4_t __a, int32x4_t __b) { - return (int32x4_t) __builtin_aarch64_sqsubv4si (__a, __b); + return (int32x4_t) __builtin_aarch64_sssubv4si (__a, __b); } __extension__ extern __inline int64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqsubq_s64 (int64x2_t __a, int64x2_t __b) { - return (int64x2_t) __builtin_aarch64_sqsubv2di (__a, __b); + return (int64x2_t) __builtin_aarch64_sssubv2di (__a, __b); } __extension__ extern __inline uint8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqsubq_u8 (uint8x16_t __a, uint8x16_t __b) { - return __builtin_aarch64_uqsubv16qi_uuu (__a, __b); + return 
__builtin_aarch64_ussubv16qi_uuu (__a, __b); } __extension__ extern __inline uint16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqsubq_u16 (uint16x8_t __a, uint16x8_t __b) { - return __builtin_aarch64_uqsubv8hi_uuu (__a, __b); + return __builtin_aarch64_ussubv8hi_uuu (__a, __b); } __extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqsubq_u32 (uint32x4_t __a, uint32x4_t __b) { - return __builtin_aarch64_uqsubv4si_uuu (__a, __b); + return __builtin_aarch64_ussubv4si_uuu (__a, __b); } __extension__ extern __inline uint64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqsubq_u64 (uint64x2_t __a, uint64x2_t __b) { - return __builtin_aarch64_uqsubv2di_uuu (__a, __b); + return __builtin_aarch64_ussubv2di_uuu (__a, __b); } __extension__ extern __inline int8x8_t @@ -17543,56 +17543,56 @@ __extension__ extern __inline int8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqaddb_s8 (int8_t __a, int8_t __b) { - return (int8_t) __builtin_aarch64_sqaddqi (__a, __b); + return (int8_t) __builtin_aarch64_ssaddqi (__a, __b); } __extension__ extern __inline int16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqaddh_s16 (int16_t __a, int16_t __b) { - return (int16_t) __builtin_aarch64_sqaddhi (__a, __b); + return (int16_t) __builtin_aarch64_ssaddhi (__a, __b); } __extension__ extern __inline int32_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqadds_s32 (int32_t __a, int32_t __b) { - return (int32_t) __builtin_aarch64_sqaddsi (__a, __b); + return (int32_t) __builtin_aarch64_ssaddsi (__a, __b); } __extension__ extern __inline int64_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqaddd_s64 (int64_t __a, int64_t __b) { - return __builtin_aarch64_sqadddi (__a, __b); + return __builtin_aarch64_ssadddi (__a, __b); } __extension__ extern __inline uint8_t __attribute__ ((__always_inline__, 
__gnu_inline__, __artificial__)) vqaddb_u8 (uint8_t __a, uint8_t __b) { - return (uint8_t) __builtin_aarch64_uqaddqi_uuu (__a, __b); + return (uint8_t) __builtin_aarch64_usaddqi_uuu (__a, __b); } __extension__ extern __inline uint16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqaddh_u16 (uint16_t __a, uint16_t __b) { - return (uint16_t) __builtin_aarch64_uqaddhi_uuu (__a, __b); + return (uint16_t) __builtin_aarch64_usaddhi_uuu (__a, __b); } __extension__ extern __inline uint32_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqadds_u32 (uint32_t __a, uint32_t __b) { - return (uint32_t) __builtin_aarch64_uqaddsi_uuu (__a, __b); + return (uint32_t) __builtin_aarch64_usaddsi_uuu (__a, __b); } __extension__ extern __inline uint64_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqaddd_u64 (uint64_t __a, uint64_t __b) { - return __builtin_aarch64_uqadddi_uuu (__a, __b); + return __builtin_aarch64_usadddi_uuu (__a, __b); } /* vqdmlal */ @@ -19242,56 +19242,56 @@ __extension__ extern __inline int8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqsubb_s8 (int8_t __a, int8_t __b) { - return (int8_t) __builtin_aarch64_sqsubqi (__a, __b); + return (int8_t) __builtin_aarch64_sssubqi (__a, __b); } __extension__ extern __inline int16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqsubh_s16 (int16_t __a, int16_t __b) { - return (int16_t) __builtin_aarch64_sqsubhi (__a, __b); + return (int16_t) __builtin_aarch64_sssubhi (__a, __b); } __extension__ extern __inline int32_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqsubs_s32 (int32_t __a, int32_t __b) { - return (int32_t) __builtin_aarch64_sqsubsi (__a, __b); + return (int32_t) __builtin_aarch64_sssubsi (__a, __b); } __extension__ extern __inline int64_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqsubd_s64 (int64_t __a, int64_t __b) { - return __builtin_aarch64_sqsubdi 
(__a, __b); + return __builtin_aarch64_sssubdi (__a, __b); } __extension__ extern __inline uint8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqsubb_u8 (uint8_t __a, uint8_t __b) { - return (uint8_t) __builtin_aarch64_uqsubqi_uuu (__a, __b); + return (uint8_t) __builtin_aarch64_ussubqi_uuu (__a, __b); } __extension__ extern __inline uint16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqsubh_u16 (uint16_t __a, uint16_t __b) { - return (uint16_t) __builtin_aarch64_uqsubhi_uuu (__a, __b); + return (uint16_t) __builtin_aarch64_ussubhi_uuu (__a, __b); } __extension__ extern __inline uint32_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqsubs_u32 (uint32_t __a, uint32_t __b) { - return (uint32_t) __builtin_aarch64_uqsubsi_uuu (__a, __b); + return (uint32_t) __builtin_aarch64_ussubsi_uuu (__a, __b); } __extension__ extern __inline uint64_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqsubd_u64 (uint64_t __a, uint64_t __b) { - return __builtin_aarch64_uqsubdi_uuu (__a, __b); + return __builtin_aarch64_ussubdi_uuu (__a, __b); } /* vqtbl2 */ diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md index ff0f34dd0430d73833149d20f4cc0d1f84c3fd2f..2f7aa489ae8ba4b0101ad168cc61d592a9b7b660 100644 --- a/gcc/config/aarch64/iterators.md +++ b/gcc/config/aarch64/iterators.md @@ -95,6 +95,10 @@ (define_mode_iterator VSDQ_I [V8QI V16QI V4HI V8HI V2SI V4SI V2DI QI HI SI DI]) ;; integer modes; 64-bit scalar integer mode. (define_mode_iterator VSDQ_I_DI [V8QI V16QI V4HI V8HI V2SI V4SI V2DI DI]) +;; Advanced SIMD and scalar, 64 & 128-bit container; 8 and 16-bit scalar +;; integer modes. +(define_mode_iterator VSDQ_I_QI_HI [VDQ_I HI QI]) + ;; Double vector modes. 
(define_mode_iterator VD [V8QI V4HI V4HF V2SI V2SF V4BF]) diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/saturating_arithmetic_autovect.inc b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/saturating_arithmetic_autovect.inc new file mode 100644 index 0000000000000000000000000000000000000000..1fadfd587555f0a0e5c390f3a46442a05ec675e6 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/saturating_arithmetic_autovect.inc @@ -0,0 +1,58 @@ +/* Template file for vector saturating arithmetic validation. + + This file defines saturating addition and subtraction functions for a given + scalar type, testing the auto-vectorization of these two operators. This + type, along with the corresponding minimum and maximum values for that type, + must be defined by any test file which includes this template file. */ + +#ifndef SAT_ARIT_AUTOVEC_INC +#define SAT_ARIT_AUTOVEC_INC + +#include <limits.h> +#include <arm_neon.h> + +#ifndef UT +#define UT unsigned int +#define VT uint32x4_t +#define UMAX UINT_MAX +#define UMIN 0 +#endif + + +UT uadd_lane (UT a, VT b) +{ + UT sum = a + b[0]; + return sum < a ? UMAX : sum; +} + +void uaddq (UT *out, UT *a, UT *b, int n) +{ + for (int i = 0; i < n; i++) + { + UT sum = a[i] + b[i]; + out[i] = sum < a[i] ? UMAX : sum; + } +} + +void uaddq2 (UT *out, UT *a, UT *b, int n) +{ + for (int i = 0; i < n; i++) + { + UT sum; + if (!__builtin_add_overflow(a[i], b[i], &sum)) + out[i] = sum; + else + out[i] = UMAX; + } +} + +void usubq (UT *out, UT *a, UT *b, int n) +{ + for (int i = 0; i < n; i++) + { + UT sum = a[i] - b[i]; + out[i] = sum > a[i] ? 
UMIN : sum; + } +} + +#endif \ No newline at end of file diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/saturating_arithmetic_autovect_1.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/saturating_arithmetic_autovect_1.c new file mode 100644 index 0000000000000000000000000000000000000000..2b72be7b0d7c6b23d9b7431a70ae2eb79f4c4bf8 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/saturating_arithmetic_autovect_1.c @@ -0,0 +1,79 @@ +/* { dg-do assemble { target { aarch64*-*-* } } } */ +/* { dg-options "-O2 --save-temps -ftree-vectorize" } */ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +/* +** uadd_lane: { xfail *-*-* } +** dup\tv([0-9]+).8b, w0 +** uqadd\tb([0-9]+), (?:b\1, b0|b0, b\1) +** umov\tw0, v\2.b\[0\] +** ret +*/ +/* +** uaddq: +** ... +** ldr\tq([0-9]+), .* +** ldr\tq([0-9]+), .* +** uqadd\tv[0-9]+.16b, (?:v\1.16b, v\2.16b|v\2.16b, v\1.16b) +** ... +** ldr\td([0-9]+), .* +** ldr\td([0-9]+), .* +** uqadd\tv[0-9]+.8b, (?:v\3.8b, v\4.8b|v\4.8b, v\3.8b) +** ... +** ldr\tb([0-9]+), .* +** ldr\tb([0-9]+), .* +** uqadd\tb[0-9]+, (?:b\5, b\6|b\6, b\5) +** ... +** ldr\tb([0-9]+), .* +** ldr\tb([0-9]+), .* +** uqadd\tb[0-9]+, (?:b\7, b\8|b\8, b\7) +** ... +*/ +/* +** uaddq2: +** ... +** ldr\tq([0-9]+), .* +** ldr\tq([0-9]+), .* +** uqadd\tv[0-9]+.16b, (?:v\1.16b, v\2.16b|v\2.16b, v\1.16b) +** ... +** ldr\td([0-9]+), .* +** ldr\td([0-9]+), .* +** uqadd\tv[0-9]+.8b, (?:v\3.8b, v\4.8b|v\4.8b, v\3.8b) +** ... +** ldr\tb([0-9]+), .* +** ldr\tb([0-9]+), .* +** uqadd\tb[0-9]+, (?:b\5, b\6|b\6, b\5) +** ... +** uqadd\tb([0-9]+), (?:b[0-9]+, b\7|b\7, b[0-9]+) +** ... +*/ +/* +** usubq: { xfail *-*-* } +** ... +** ldr\tq([0-9]+), .* +** ldr\tq([0-9]+), .* +** uqsub\tv[0-9]+.16b, v\1.16b, v\2.16b +** ... +** ldr\td([0-9]+), .* +** ldr\td([0-9]+), .* +** uqsub\tv[0-9]+.8b, v\3.8b, v\4.8b +** ... +** ldr\tb([0-9]+), .* +** ldr\tb([0-9]+), .* +** uqsub\tb[0-9]+, b\5, b\6 +** ... 
+** ldr\tb([0-9]+), .* +** ldr\tb([0-9]+), .* +** uqsub\tb[0-9]+, b\7, b\8 +** ... +*/ + +#include <limits.h> +#include <arm_neon.h> + +#define UT unsigned char +#define VT uint8x8_t +#define UMAX UCHAR_MAX +#define UMIN 0 + +#include "saturating_arithmetic_autovect.inc" \ No newline at end of file diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/saturating_arithmetic_autovect_2.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/saturating_arithmetic_autovect_2.c new file mode 100644 index 0000000000000000000000000000000000000000..0640361498f020284910d2370f9c8a6a3e898022 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/saturating_arithmetic_autovect_2.c @@ -0,0 +1,79 @@ +/* { dg-do assemble { target { aarch64*-*-* } } } */ +/* { dg-options "-O2 --save-temps -ftree-vectorize" } */ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +/* +** uadd_lane: { xfail *-*-* } +** dup\tv([0-9]+).4h, w0 +** uqadd\th([0-9]+), (?:h\1, h0|h0, h\1) +** umov\tw0, v\2.h\[0\] +** ret +*/ +/* +** uaddq: +** ... +** ldr\tq([0-9]+), .* +** ldr\tq([0-9]+), .* +** uqadd\tv[0-9]+.8h, (?:v\1.8h, v\2.8h|v\2.8h, v\1.8h) +** ... +** ldr\td([0-9]+), .* +** ldr\td([0-9]+), .* +** uqadd\tv[0-9]+.4h, (?:v\3.4h, v\4.4h|v\4.4h, v\3.4h) +** ... +** ldr\th([0-9]+), .* +** ldr\th([0-9]+), .* +** uqadd\th[0-9]+, (?:h\5, h\6|h\6, h\5) +** ... +** ldr\th([0-9]+), .* +** ldr\th([0-9]+), .* +** uqadd\th[0-9]+, (?:h\7, h\8|h\8, h\7) +** ... +*/ +/* +** uaddq2: +** ... +** ldr\tq([0-9]+), .* +** ldr\tq([0-9]+), .* +** uqadd\tv[0-9]+.8h, (?:v\1.8h, v\2.8h|v\2.8h, v\1.8h) +** ... +** ldr\td([0-9]+), .* +** ldr\td([0-9]+), .* +** uqadd\tv[0-9]+.4h, (?:v\3.4h, v\4.4h|v\4.4h, v\3.4h) +** ... +** ldr\th([0-9]+), .* +** ldr\th([0-9]+), .* +** uqadd\th[0-9]+, (?:h\5, h\6|h\6, h\5) +** ... +** uqadd\th([0-9]+), (?:h[0-9]+, h\7|h\7, h[0-9]+) +** ... +*/ +/* +** usubq: { xfail *-*-* } +** ... 
+** ldr\tq([0-9]+), .* +** ldr\tq([0-9]+), .* +** uqsub\tv[0-9]+.8h, v\1.8h, v\2.8h +** ... +** ldr\td([0-9]+), .* +** ldr\td([0-9]+), .* +** uqsub\tv[0-9]+.4h, v\3.4h, v\4.4h +** ... +** ldr\th([0-9]+), .* +** ldr\th([0-9]+), .* +** uqsub\th[0-9]+, h\5, h\6 +** ... +** ldr\th([0-9]+), .* +** ldr\th([0-9]+), .* +** uqsub\th[0-9]+, h\7, h\8 +** ... +*/ + +#include <limits.h> +#include <arm_neon.h> + +#define UT unsigned short +#define VT uint16x4_t +#define UMAX USHRT_MAX +#define UMIN 0 + +#include "saturating_arithmetic_autovect.inc" \ No newline at end of file diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/saturating_arithmetic_autovect_3.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/saturating_arithmetic_autovect_3.c new file mode 100644 index 0000000000000000000000000000000000000000..ea6e0c78d7860700d750d4d6055ea2bf636cd2f1 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/saturating_arithmetic_autovect_3.c @@ -0,0 +1,75 @@ +/* { dg-do assemble { target { aarch64*-*-* } } } */ +/* { dg-options "-O2 --save-temps -ftree-vectorize" } */ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +/* +** uadd_lane: +** fmov\tw([0-9]+), s0 +** adds\tw([0-9]+), (?:w\1, w0|w0, w\1) +** csinv\tw\2, w\2, wzr, cc +** ret +*/ +/* +** uaddq: +** ... +** ldr\tq([0-9]+), .* +** ldr\tq([0-9]+), .* +** uqadd\tv[0-9]+.4s, (?:v\1.4s, v\2.4s|v\2.4s, v\1.4s) +** ... +** ldr\tw([0-9]+), .* +** ldr\tw([0-9]+), .* +** adds\tw([0-9]+), (?:w\3, w\4|w\4, w\3) +** csinv\tw\5, w\5, wzr, cc +** ... +** ldr\tw([0-9]+), .* +** ldr\tw([0-9]+), .* +** adds\tw([0-9]+), (?:w\6, w\7|w\7, w\6) +** csinv\tw\8, w\8, wzr, cc +** ... +*/ +/* +** uaddq2: +** ... +** ldr\tq([0-9]+), .* +** ldr\tq([0-9]+), .* +** uqadd\tv[0-9]+.4s, (?:v\1.4s, v\2.4s|v\2.4s, v\1.4s) +** ... +** ldr\tw([0-9]+), .* +** ldr\tw([0-9]+), .* +** adds\tw([0-9]+), (?:w\3, w\4|w\4, w\3) +** csinv\tw\5, w\5, wzr, cc +** ... 
+** ldr\tw([0-9]+), .* +** ldr\tw([0-9]+), .* +** adds\tw([0-9]+), (?:w\6, w\7|w\7, w\6) +** csinv\tw\8, w\8, wzr, cc +** ... +*/ +/* +** usubq: { xfail *-*-* } +** ... +** ldr\tq([0-9]+), .* +** ldr\tq([0-9]+), .* +** uqsub\tv[0-9]+.4s, v\1.4s, v\2.4s +** ... +** ldr\tw([0-9]+), .* +** ldr\tw([0-9]+), .* +** subs\tw([0-9]+), w\3, w\4 +** csel\tw\5, w\5, wzr, cs +** ... +** ldr\tw([0-9]+), .* +** ldr\tw([0-9]+), .* +** subs\tw([0-9]+), w\6, w\7 +** csel\tw\8, w\8, wzr, cs +** ... +*/ + +#include <limits.h> +#include <arm_neon.h> + +#define UT unsigned int +#define VT uint32x2_t +#define UMAX UINT_MAX +#define UMIN 0 + +#include "saturating_arithmetic_autovect.inc" \ No newline at end of file diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/saturating_arithmetic_autovect_4.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/saturating_arithmetic_autovect_4.c new file mode 100644 index 0000000000000000000000000000000000000000..01390637b5ca4accd306ce3a9aa4fc991d3f97b7 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/saturating_arithmetic_autovect_4.c @@ -0,0 +1,77 @@ +/* { dg-do assemble { target { aarch64*-*-* } } } */ +/* { dg-options "-O2 --save-temps -ftree-vectorize" } */ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +/* +** uadd_lane: +** ... +** (?:fmov|ldr)\tx([0-9]+), .* +** ... +** adds\tx([0-9]+), (?:x\1, x0|x0, x\1) +** csinv\tx\2, x\2, xzr, cc +** ret +*/ +/* +** uaddq: +** ... +** ldr\tq([0-9]+), .* +** ldr\tq([0-9]+), .* +** uqadd\tv[0-9]+.2d, (?:v\1.2d, v\2.2d|v\2.2d, v\1.2d) +** ... +** ldr\tx([0-9]+), .* +** ldr\tx([0-9]+), .* +** adds\tx([0-9]+), (?:x\3, x\4|x\4, x\3) +** csinv\tx\5, x\5, xzr, cc +** ... +** ldr\tx([0-9]+), .* +** ldr\tx([0-9]+), .* +** adds\tx([0-9]+), (?:x\6, x\7|x\7, x\6) +** csinv\tx\8, x\8, xzr, cc +** ... +*/ +/* +** uaddq2: +** ... +** ldr\tq([0-9]+), .* +** ldr\tq([0-9]+), .* +** uqadd\tv[0-9]+.2d, (?:v\1.2d, v\2.2d|v\2.2d, v\1.2d) +** ... 
+** ldr\tx([0-9]+), .* +** ldr\tx([0-9]+), .* +** adds\tx([0-9]+), (?:x\3, x\4|x\4, x\3) +** csinv\tx\5, x\5, xzr, cc +** ... +** ldr\tx([0-9]+), .* +** ldr\tx([0-9]+), .* +** adds\tx([0-9]+), (?:x\6, x\7|x\7, x\6) +** csinv\tx\8, x\8, xzr, cc +** ... +*/ +/* +** usubq: { xfail *-*-* } +** ... +** ldr\tq([0-9]+), .* +** ldr\tq([0-9]+), .* +** uqsub\tv[0-9]+.2d, v\1.2d, v\2.2d +** ... +** ldr\tx([0-9]+), .* +** ldr\tx([0-9]+), .* +** subs\tx([0-9]+), x\3, x\4 +** csel\tx\5, x\5, xzr, cs +** ... +** ldr\tx([0-9]+), .* +** ldr\tx([0-9]+), .* +** subs\tx([0-9]+), x\6, x\7 +** csel\tx\8, x\8, xzr, cs +** ... +*/ + +#include <limits.h> +#include <arm_neon.h> + +#define UT unsigned long +#define VT uint64x2_t +#define UMAX ULONG_MAX +#define UMIN 0 + +#include "saturating_arithmetic_autovect.inc" \ No newline at end of file diff --git a/gcc/testsuite/gcc.target/aarch64/saturating-arithmetic-signed.c b/gcc/testsuite/gcc.target/aarch64/saturating-arithmetic-signed.c new file mode 100644 index 0000000000000000000000000000000000000000..de652bf1d9e6e59f088478b0e63ad7a027ac7782 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/saturating-arithmetic-signed.c @@ -0,0 +1,270 @@ +/* { dg-do run } */ +/* { dg-options "-O2 --save-temps -mearly-ra=none -fno-schedule-insns2" } */ +/* { dg-final { check-function-bodies "**" "" "" } } */ + +#include <limits.h> +#include <stdbool.h> +#include <stdint.h> + +/* +** sadd32: +** asr w([0-9]+), w1, 31 +** eor w\1, w\1, -2147483648 +** adds w([0-9]+), (?:w0, w1|w1, w0) +** csinv w0, w\2, w\1, vc +** ret +*/ +int32_t __attribute__((noipa)) +sadd32 (int32_t __a, int32_t __b) +{ + int32_t sum; + bool overflow = __builtin_add_overflow (__a, __b, &sum); + return !overflow ? sum : __a < 0 ? 
INT_MIN : INT_MAX; +} + +/* +** sadd32_imm: +** adds w([0-9]+), w0, #67 +** mov w([0-9]+), 2147483647 +** csel w0, w\1, w\2, vc +** ret +*/ +int32_t __attribute__((noipa)) +sadd32_imm (int32_t __a) +{ + int32_t sum; + bool overflow = __builtin_add_overflow (__a, 67, &sum); + return !overflow ? sum : __a < 0 ? INT_MIN : INT_MAX; +} + +/* +** sadd32_imm2: +** subs w([0-9]+), w0, 67 +** mov w([0-9]+), -2147483648 +** csel w0, w\1, w\2, vc +** ret +*/ +int32_t __attribute__((noipa)) +sadd32_imm2 (int32_t __a) +{ + int32_t sum; + bool overflow = __builtin_add_overflow (__a, -67, &sum); + return !overflow ? sum : __a < 0 ? INT_MIN : INT_MAX; +} + +/* +** ssub32: +** asr w([0-9]+), w1, 31 +** eor w\1, w\1, -2147483648 +** subs w([0-9]+), w0, w1 +** csel w0, w\2, w\1, vc +** ret +*/ +int32_t __attribute__((noipa)) +ssub32 (int32_t __a, int32_t __b) +{ + int32_t result; + bool overflow = __builtin_sub_overflow (__a, __b, &result); + return !overflow ? result : __a < 0 ? INT_MIN : INT_MAX; +} + +/* +** ssub32_imm: +** subs w([0-9]+), w0, 67 +** mov w([0-9]+), -2147483648 +** csel w0, w\1, w\2, vc +** ret +*/ +int32_t __attribute__((noipa)) +ssub32_imm (int32_t __a) +{ + int32_t result; + bool overflow = __builtin_sub_overflow (__a, 67, &result); + return !overflow ? result : __a < 0 ? INT_MIN : INT_MAX; +} + +/* +** ssub32_imm2: +** adds w([0-9]+), w0, #67 +** mov w([0-9]+), 2147483647 +** csel w0, w\1, w\2, vc +** ret +*/ +int32_t __attribute__((noipa)) +ssub32_imm2 (int32_t __a) +{ + int32_t result; + bool overflow = __builtin_sub_overflow (__a, -67, &result); + return !overflow ? result : __a < 0 ? INT_MIN : INT_MAX; +} + +/* +** sadd64: +** asr x([0-9]+), x1, 63 +** eor x\1, x\1, -9223372036854775808 +** adds x([0-9]+), (?:x0, x1|x1, x0) +** csinv x0, x\2, x\1, vc +** ret +*/ +int64_t __attribute__((noipa)) +sadd64 (int64_t __a, int64_t __b) +{ + int64_t sum; + bool overflow = __builtin_add_overflow (__a, __b, &sum); + return !overflow ? sum : __a < 0 ? 
LONG_MIN : LONG_MAX; +} + +/* +** sadd64_imm: +** adds x([0-9]+), x0, #67 +** mov x([0-9]+), 9223372036854775807 +** csel x0, x\1, x\2, vc +** ret +*/ +int64_t __attribute__((noipa)) +sadd64_imm (int64_t __a) +{ + int64_t sum; + bool overflow = __builtin_add_overflow (__a, (int64_t)67, &sum); + return !overflow ? sum : __a < 0 ? LONG_MIN : LONG_MAX; +} + +/* +** sadd64_imm2: +** subs x([0-9]+), x0, 67 +** mov x([0-9]+), -9223372036854775808 +** csel x0, x\1, x\2, vc +** ret +*/ +int64_t __attribute__((noipa)) +sadd64_imm2 (int64_t __a) +{ + int64_t sum; + bool overflow = __builtin_add_overflow (__a, (int64_t)-67, &sum); + return !overflow ? sum : __a < 0 ? LONG_MIN : LONG_MAX; +} + +/* +** ssub64: +** asr x([0-9]+), x1, 63 +** eor x\1, x\1, -9223372036854775808 +** subs x([0-9]+), x0, x1 +** csel x0, x\2, x\1, vc +** ret +*/ +int64_t __attribute__((noipa)) +ssub64 (int64_t __a, int64_t __b) +{ + int64_t result; + bool overflow = __builtin_sub_overflow (__a, __b, &result); + return !overflow ? result : __a < 0 ? LONG_MIN : LONG_MAX; +} + +/* +** ssub64_imm: +** subs x([0-9]+), x0, 67 +** mov x([0-9]+), -9223372036854775808 +** csel x0, x\1, x\2, vc +** ret +*/ +int64_t __attribute__((noipa)) +ssub64_imm (int64_t __a) +{ + int64_t result; + bool overflow = __builtin_sub_overflow (__a, (int64_t) 67, &result); + return !overflow ? result : __a < 0 ? LONG_MIN : LONG_MAX; +} + +/* +** ssub64_imm2: +** adds x([0-9]+), x0, #67 +** mov x([0-9]+), 9223372036854775807 +** csel x0, x\1, x\2, vc +** ret +*/ +int64_t __attribute__((noipa)) +ssub64_imm2 (int64_t __a) +{ + int64_t result; + bool overflow = __builtin_sub_overflow (__a, (int64_t) -67, &result); + return !overflow ? result : __a < 0 ? 
LONG_MIN : LONG_MAX; +} + +int +main (void) +{ + /* Addition: + SAT_ADD(x, +ve), non-saturating + SAT_ADD(x, +ve), saturating + SAT_ADD(x, immediate +ve) + SAT_ADD(x, immediate -ve) + SAT_ADD(x, -ve), non-saturating + SAT_ADD(x, -ve), saturating + + Subtraction: + SAT_SUB(x, +ve), non-saturating + SAT_SUB(x, +ve), saturating + SAT_SUB(x, immediate +ve) + SAT_SUB(x, immediate -ve) + SAT_SUB(x, -ve), non-saturating */ + + int32_t a = 4; + int32_t b = 70; + int32_t c = 2147483647; + int32_t d = (int32_t) -2147483648; + + if (sadd32 (a, b) != (a + b)) + __builtin_abort (); + if (sadd32 (a, c) != c) + __builtin_abort (); + if (sadd32_imm (a) != (a + 67)) + __builtin_abort (); + if (sadd32_imm2 (a) != (a - 67)) + __builtin_abort (); + if (sadd32 (a, -b) != (a - b)) + __builtin_abort (); + if (sadd32 (a, d) != (d + 4)) + __builtin_abort (); + + if (ssub32 (a, b) != (a - b)) + __builtin_abort (); + if (ssub32 (-a, c) != d) + __builtin_abort (); + if (ssub32_imm (a) != (a - 67)) + __builtin_abort (); + if (ssub32_imm2 (a) != (a + 67)) + __builtin_abort (); + if (ssub32 (a, -b) != (a + b)) + __builtin_abort (); + + int64_t a_64 = a; + int64_t b_64 = b; + int64_t c_64 = (int64_t) 9223372036854775807; + int64_t d_64 = (int64_t) 0x8000000000000000; + + if (sadd64 (a_64, b_64) != (a_64 + b_64)) + __builtin_abort (); + if (sadd64 (a_64, c_64) != c_64) + __builtin_abort (); + if (sadd64_imm (a_64) != (a_64 + 67)) + __builtin_abort (); + if (sadd64_imm2 (a_64) != (a_64 - 67)) + __builtin_abort (); + if (sadd64 (a_64, -b_64) != (a_64 - b_64)) + __builtin_abort (); + if (sadd64 (a_64, d_64) != (d_64 + 4)) + __builtin_abort (); + + if (ssub64 (a_64, b_64) != (a_64 - b_64)) + __builtin_abort (); + if (ssub64 (-a_64, c_64) != d_64) + __builtin_abort (); + if (ssub64_imm (a_64) != (a_64 - 67)) + __builtin_abort (); + if (ssub64_imm2 (a_64) != (a_64 + 67)) + __builtin_abort (); + if (ssub64 (a_64, -b_64) != (a_64 + b_64)) + __builtin_abort (); + + return 0; +} \ No newline at end of file 
diff --git a/gcc/testsuite/gcc.target/aarch64/saturating_arithmetic.inc b/gcc/testsuite/gcc.target/aarch64/saturating_arithmetic.inc new file mode 100644 index 0000000000000000000000000000000000000000..e979d5354057bc3f5861b386ea28a7318c7c533c --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/saturating_arithmetic.inc @@ -0,0 +1,39 @@ +/* Template file for scalar saturating arithmetic validation. + + This file defines scalar saturating addition and subtraction functions for a + given type. This type, along with the corresponding minimum and maximum + values for that type, must be defined by any test file which includes this + template file. */ + +#ifndef SAT_ARIT_INC +#define SAT_ARIT_INC + +#include <limits.h> + +#ifndef UT +#define UT unsigned int +#define UMAX UINT_MAX +#define UMIN 0 +#endif + +UT uadd (UT a, UT b) +{ + UT sum = a + b; + return sum < a ? UMAX : sum; +} + +UT uadd2 (UT a, UT b) +{ + UT c; + if (!__builtin_add_overflow(a, b, &c)) + return c; + return UMAX; +} + +UT usub (UT a, UT b) +{ + UT sum = a - b; + return sum > a ? 
UMIN : sum; +} + +#endif \ No newline at end of file diff --git a/gcc/testsuite/gcc.target/aarch64/saturating_arithmetic_1.c b/gcc/testsuite/gcc.target/aarch64/saturating_arithmetic_1.c new file mode 100644 index 0000000000000000000000000000000000000000..2ac0c376d126ecc15b37015b1a2e098dfa7f5543 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/saturating_arithmetic_1.c @@ -0,0 +1,36 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 --save-temps -fno-schedule-insns2" } */ +/* { dg-final { check-function-bodies "**" "" "" } } */ + +/* +** uadd: +** dup v([0-9]+).8b, w1 +** dup v([0-9]+).8b, w0 +** uqadd b([0-9]+), (?:b\2, b\1|b\1, b\2) +** umov w0, v\3.b\[0\] +** ret +*/ +/* +** uadd2: +** dup v([0-9]+).8b, w1 +** dup v([0-9]+).8b, w0 +** uqadd b([0-9]+), (?:b\2, b\1|b\1, b\2) +** umov w0, v\3.b\[0\] +** ret +*/ +/* +** usub: { xfail *-*-* } +** dup v([0-9]+).8b, w1 +** dup v([0-9]+).8b, w0 +** uqsub b([0-9]+), b\1, b\2 +** umov w0, v\3.b\[0\] +** ret +*/ + +#include <limits.h> + +#define UT unsigned char +#define UMAX UCHAR_MAX +#define UMIN 0 + +#include "saturating_arithmetic.inc" \ No newline at end of file diff --git a/gcc/testsuite/gcc.target/aarch64/saturating_arithmetic_2.c b/gcc/testsuite/gcc.target/aarch64/saturating_arithmetic_2.c new file mode 100644 index 0000000000000000000000000000000000000000..2a55aa9f2218a168becf3b6b13905c6cd228cacc --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/saturating_arithmetic_2.c @@ -0,0 +1,36 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 --save-temps -fno-schedule-insns2" } */ +/* { dg-final { check-function-bodies "**" "" "" } } */ + +/* +** uadd: +** dup v([0-9]+).4h, w1 +** dup v([0-9]+).4h, w0 +** uqadd h([0-9]+), (?:h\2, h\1|h\1, h\2) +** umov w0, v\3.h\[0\] +** ret +*/ +/* +** uadd2: +** dup v([0-9]+).4h, w1 +** dup v([0-9]+).4h, w0 +** uqadd h([0-9]+), (?:h\2, h\1|h\1, h\2) +** umov w0, v\3.h\[0\] +** ret +*/ +/* +** usub: { xfail *-*-* } +** dup v([0-9]+).4h, w1 +** dup v([0-9]+).4h, w0 +** uqsub
h([0-9]+), h\1, h\2 +** umov w0, v\3.h\[0\] +** ret +*/ + +#include <limits.h> + +#define UT unsigned short +#define UMAX USHRT_MAX +#define UMIN 0 + +#include "saturating_arithmetic.inc" \ No newline at end of file diff --git a/gcc/testsuite/gcc.target/aarch64/saturating_arithmetic_3.c b/gcc/testsuite/gcc.target/aarch64/saturating_arithmetic_3.c new file mode 100644 index 0000000000000000000000000000000000000000..215172545196678b759b9fb779b4473f86b2805c --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/saturating_arithmetic_3.c @@ -0,0 +1,30 @@ +/* { dg-do compile { target { aarch64*-*-* } } } */ +/* { dg-options "-O2 --save-temps -ftree-vectorize" } */ +/* { dg-final { check-function-bodies "**" "" "" } } */ + +/* +** uadd: +** adds\tw([0-9]+), w([0-9]+), w([0-9]+) +** csinv\tw\1, w\1, wzr, cc +** ret +*/ +/* +** uadd2: +** adds\tw([0-9]+), w([0-9]+), w([0-9]+) +** csinv\tw\1, w\1, wzr, cc +** ret +*/ +/* +** usub: +** subs\tw([0-9]+), w([0-9]+), w([0-9]+) +** csel\tw\1, w\1, wzr, cs +** ret +*/ + +#include <limits.h> + +#define UT unsigned int +#define UMAX UINT_MAX +#define UMIN 0 + +#include "saturating_arithmetic.inc" \ No newline at end of file diff --git a/gcc/testsuite/gcc.target/aarch64/saturating_arithmetic_4.c b/gcc/testsuite/gcc.target/aarch64/saturating_arithmetic_4.c new file mode 100644 index 0000000000000000000000000000000000000000..363d0a79a730eaf4401f88d66bfea0c61dfc5c82 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/saturating_arithmetic_4.c @@ -0,0 +1,30 @@ +/* { dg-do compile { target { aarch64*-*-* } } } */ +/* { dg-options "-O2 --save-temps -ftree-vectorize" } */ +/* { dg-final { check-function-bodies "**" "" "" } } */ + +/* +** uadd: +** adds\tx([0-9]+), x([0-9]+), x([0-9]+) +** csinv\tx\1, x\1, xzr, cc +** ret +*/ +/* +** uadd2: +** adds\tx([0-9]+), x([0-9]+), x([0-9]+) +** csinv\tx\1, x\1, xzr, cc +** ret +*/ +/* +** usub: +** subs\tx([0-9]+), x([0-9]+), x([0-9]+) +** csel\tx\1, x\1, xzr, cs +** ret +*/ + +#include <limits.h> + 
+#define UT unsigned long +#define UMAX ULONG_MAX +#define UMIN 0 + +#include "saturating_arithmetic.inc" \ No newline at end of file diff --git a/gcc/testsuite/gcc.target/aarch64/scalar_intrinsics.c b/gcc/testsuite/gcc.target/aarch64/scalar_intrinsics.c index c2e13b651e96fdc9c5abeaf154905dce177d9e2f..dcf9dc777adea528c7aa3c96343668f44b7941cb 100644 --- a/gcc/testsuite/gcc.target/aarch64/scalar_intrinsics.c +++ b/gcc/testsuite/gcc.target/aarch64/scalar_intrinsics.c @@ -318,33 +318,33 @@ test_vpaddd_u64 (uint64x2_t a) /* { dg-final { scan-assembler-times "\\tuqadd\\td\[0-9\]+" 1 } } */ uint64_t -test_vqaddd_u64 (uint64_t a, uint64_t b) +test_vqaddd_u64 (uint64x1_t a, uint64x1_t b) { - return vqaddd_u64 (a, b); + return vqaddd_u64 (a[0], b[0]); } /* { dg-final { scan-assembler-times "\\tuqadd\\ts\[0-9\]+" 1 } } */ uint32_t -test_vqadds_u32 (uint32_t a, uint32_t b) +test_vqadds_u32 (uint32x4_t a, uint32x4_t b) { - return vqadds_u32 (a, b); + return vqadds_u32 (a[0], b[0]); } /* { dg-final { scan-assembler-times "\\tuqadd\\th\[0-9\]+" 1 } } */ uint16_t -test_vqaddh_u16 (uint16_t a, uint16_t b) +test_vqaddh_u16 (uint16x8_t a, uint16x8_t b) { - return vqaddh_u16 (a, b); + return vqaddh_u16 (a[0], b[0]); } /* { dg-final { scan-assembler-times "\\tuqadd\\tb\[0-9\]+" 1 } } */ uint8_t -test_vqaddb_u8 (uint8_t a, uint8_t b) +test_vqaddb_u8 (uint8x16_t a, uint8x16_t b) { - return vqaddb_u8 (a, b); + return vqaddb_u8 (a[0], b[0]); } /* { dg-final { scan-assembler-times "\\tsqadd\\td\[0-9\]+" 1 } } */ @@ -761,33 +761,33 @@ test_vsubd_s64_2 (int64_t a, int64_t b) /* { dg-final { scan-assembler-times "\\tuqsub\\td\[0-9\]+" 1 } } */ uint64_t -test_vqsubd_u64 (uint64_t a, uint64_t b) +test_vqsubd_u64 (uint64x1_t a, uint64x1_t b) { - return vqsubd_u64 (a, b); + return vqsubd_u64 (a[0], b[0]); } /* { dg-final { scan-assembler-times "\\tuqsub\\ts\[0-9\]+" 1 } } */ uint32_t -test_vqsubs_u32 (uint32_t a, uint32_t b) +test_vqsubs_u32 (uint32x4_t a, uint32x4_t b) { - return vqsubs_u32 (a, 
b); + return vqsubs_u32 (a[0], b[0]); } /* { dg-final { scan-assembler-times "\\tuqsub\\th\[0-9\]+" 1 } } */ uint16_t -test_vqsubh_u16 (uint16_t a, uint16_t b) +test_vqsubh_u16 (uint16x8_t a, uint16x8_t b) { - return vqsubh_u16 (a, b); + return vqsubh_u16 (a[0], b[0]); } /* { dg-final { scan-assembler-times "\\tuqsub\\tb\[0-9\]+" 1 } } */ uint8_t -test_vqsubb_u8 (uint8_t a, uint8_t b) +test_vqsubb_u8 (uint8x16_t a, uint8x16_t b) { - return vqsubb_u8 (a, b); + return vqsubb_u8 (a[0], b[0]); } /* { dg-final { scan-assembler-times "\\tsqsub\\td\[0-9\]+" 1 } } */
rb19159.patch
Description: rb19159.patch