> -----Original Message-----
> From: Richard Sandiford <richard.sandif...@arm.com>
> Sent: Friday, January 10, 2025 4:50 PM
> To: Akram Ahmad <akram.ah...@arm.com>
> Cc: ktkac...@nvidia.com; gcc-patches@gcc.gnu.org
> Subject: Re: [PATCH v3 1/2] aarch64: Use standard names for saturating 
> arithmetic
> 
> Akram Ahmad <akram.ah...@arm.com> writes:
> > Ah whoops- I didn't see this before sending off V4 just now, my apologies.
> > I'll try my best to get this implemented before the end of the day so that
> > it doesn't miss the deadline.
> 
> No rush!  The delay here is entirely my fault, so no problem if the
> patch lands early stage 4.
> 

Hi,  I'm picking up the remainder of the patch for Akram.

I believe I have addressed all the review comments, and I've also rebased to
master which needed some small changes.

Bootstrapped and regtested on aarch64-none-linux-gnu with no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

        * config/aarch64/aarch64-builtins.cc: Expand iterators.
        * config/aarch64/aarch64-simd-builtins.def: Use standard names.
        * config/aarch64/aarch64-simd.md: Use standard names, split insn
        definitions on signedness of operator and type of operands.
        * config/aarch64/arm_neon.h: Use standard builtin names.
        * config/aarch64/iterators.md: Add VSDQ_I_QI_HI iterator to
        simplify splitting of insn for unsigned scalar arithmetic.

gcc/testsuite/ChangeLog:

        * gcc.target/aarch64/scalar_intrinsics.c: Update testcases.
        * 
gcc.target/aarch64/advsimd-intrinsics/saturating_arithmetic_autovect.inc:
        Template file for unsigned vector saturating arithmetic tests.
        * 
gcc.target/aarch64/advsimd-intrinsics/saturating_arithmetic_autovect_1.c:
        8-bit vector type tests.
        * 
gcc.target/aarch64/advsimd-intrinsics/saturating_arithmetic_autovect_2.c:
        16-bit vector type tests.
        * 
gcc.target/aarch64/advsimd-intrinsics/saturating_arithmetic_autovect_3.c:
        32-bit vector type tests.
        * 
gcc.target/aarch64/advsimd-intrinsics/saturating_arithmetic_autovect_4.c:
        64-bit vector type tests.
        * gcc.target/aarch64/saturating_arithmetic.inc: Template file
        for scalar saturating arithmetic tests.
        * gcc.target/aarch64/saturating_arithmetic_1.c: 8-bit tests.
        * gcc.target/aarch64/saturating_arithmetic_2.c: 16-bit tests.
        * gcc.target/aarch64/saturating_arithmetic_3.c: 32-bit tests.
        * gcc.target/aarch64/saturating_arithmetic_4.c: 64-bit tests.

Co-authored-by: Tamar Christina <tamar.christ...@arm.com>

-- inline copy --

diff --git a/gcc/config/aarch64/aarch64-builtins.cc 
b/gcc/config/aarch64/aarch64-builtins.cc
index 
86eebc16885976387d941efc61c55975359b6099..6d5479c2e4492078312b05561d682ead9e9c2d13
 100644
--- a/gcc/config/aarch64/aarch64-builtins.cc
+++ b/gcc/config/aarch64/aarch64-builtins.cc
@@ -5039,6 +5039,18 @@ aarch64_general_gimple_fold_builtin (unsigned int fcode, 
gcall *stmt,
          new_stmt = gimple_build_assign (gimple_call_lhs (stmt),
                                          LSHIFT_EXPR, args[0], args[1]);
        break;
+      /* Lower saturating add/sub Neon builtins to gimple.  */
+      BUILTIN_VSDQ_I (BINOP, ssadd, 3, DEFAULT)
+      BUILTIN_VSDQ_I (BINOPU, usadd, 3, DEFAULT)
+       new_stmt = gimple_build_call_internal (IFN_SAT_ADD, 2, args[0], 
args[1]);
+       gimple_call_set_lhs (new_stmt, gimple_call_lhs (stmt));
+       break;
+      BUILTIN_VSDQ_I (BINOP, sssub, 3, DEFAULT)
+      BUILTIN_VSDQ_I (BINOPU, ussub, 3, DEFAULT)
+       new_stmt = gimple_build_call_internal (IFN_SAT_SUB, 2, args[0], 
args[1]);
+       gimple_call_set_lhs (new_stmt, gimple_call_lhs (stmt));
+       break;
+
       BUILTIN_VSDQ_I_DI (BINOP, sshl, 0, DEFAULT)
       BUILTIN_VSDQ_I_DI (BINOP_UUS, ushl, 0, DEFAULT)
        {
diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def 
b/gcc/config/aarch64/aarch64-simd-builtins.def
index 
286272a331180d7682826dc3ec9e921b1ebbeab6..6cc45b18a723fc9621e2da6220231e61b621185a
 100644
--- a/gcc/config/aarch64/aarch64-simd-builtins.def
+++ b/gcc/config/aarch64/aarch64-simd-builtins.def
@@ -71,10 +71,10 @@
   BUILTIN_VSDQ_I (BINOP, sqrshl, 0, DEFAULT)
   BUILTIN_VSDQ_I (BINOP_UUS, uqrshl, 0, DEFAULT)
   /* Implemented by aarch64_<su_optab><optab><mode>.  */
-  BUILTIN_VSDQ_I (BINOP, sqadd, 0, DEFAULT)
-  BUILTIN_VSDQ_I (BINOPU, uqadd, 0, DEFAULT)
-  BUILTIN_VSDQ_I (BINOP, sqsub, 0, DEFAULT)
-  BUILTIN_VSDQ_I (BINOPU, uqsub, 0, DEFAULT)
+  BUILTIN_VSDQ_I (BINOP, ssadd, 3, DEFAULT)
+  BUILTIN_VSDQ_I (BINOPU, usadd, 3, DEFAULT)
+  BUILTIN_VSDQ_I (BINOP, sssub, 3, DEFAULT)
+  BUILTIN_VSDQ_I (BINOPU, ussub, 3, DEFAULT)
   /* Implemented by aarch64_<sur>qadd<mode>.  */
   BUILTIN_VSDQ_I (BINOP_SSU, suqadd, 0, DEFAULT)
   BUILTIN_VSDQ_I (BINOP_UUS, usqadd, 0, DEFAULT)
diff --git a/gcc/config/aarch64/aarch64-simd.md 
b/gcc/config/aarch64/aarch64-simd.md
index 
eeb626f129a812d11de8c7b90cb20633739baac6..e2afe87e5130cc066b8348659209ab40747327e5
 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -5162,15 +5162,214 @@ (define_insn "*aarch64_vgetfmulx<mode>"
 )
 ;; <su>q<addsub>
 
-(define_insn "aarch64_<su_optab>q<addsub><mode><vczle><vczbe>"
-  [(set (match_operand:VSDQ_I 0 "register_operand" "=w")
-       (BINQOPS:VSDQ_I (match_operand:VSDQ_I 1 "register_operand" "w")
-                       (match_operand:VSDQ_I 2 "register_operand" "w")))]
+(define_insn "<su_optab>s<addsub><mode>3<vczle><vczbe>"
+  [(set (match_operand:VSDQ_I_QI_HI 0 "register_operand" "=w")
+       (BINQOPS:VSDQ_I_QI_HI
+         (match_operand:VSDQ_I_QI_HI 1 "register_operand" "w")
+         (match_operand:VSDQ_I_QI_HI 2 "register_operand" "w")))]
   "TARGET_SIMD"
   "<su_optab>q<addsub>\\t%<v>0<Vmtype>, %<v>1<Vmtype>, %<v>2<Vmtype>"
   [(set_attr "type" "neon_q<addsub><q>")]
 )
 
+(define_expand "<su_optab>s<addsub><mode>3"
+  [(parallel
+    [(set (match_operand:GPI 0 "register_operand")
+         (SBINQOPS:GPI (match_operand:GPI 1 "register_operand")
+                       (match_operand:GPI 2 "aarch64_plus_operand")))
+    (clobber (scratch:GPI))
+    (clobber (reg:CC CC_REGNUM))])]
+)
+
+;; Introducing a temporary GP reg allows signed saturating arithmetic with GPR
+;; operands to be calculated without the use of costly transfers to and from FP
+;; registers.  For example, saturating addition usually uses three FMOVs:
+;;
+;;   fmov      d0, x0
+;;   fmov      d1, x1
+;;   sqadd     d0, d0, d1
+;;   fmov      x0, d0
+;;
+;; Using a temporary register results in three cheaper instructions being used
+;; in place of the three FMOVs, which calculate the saturating limit accounting
+;; for the signedness of operand2:
+;;
+;;   asr       x2, x1, 63
+;;   adds      x0, x0, x1
+;;   eor       x2, x2, 0x8000000000000000
+;;   csinv     x0, x0, x2, vc
+;;
+;; If operand2 is a constant value, the temporary register can be used to store
+;; the saturating limit without the need for asr, xor to calculate said limit.
+
+(define_insn_and_split "aarch64_<su_optab>s<addsub><mode>3<vczle><vczbe>"
+  [(set (match_operand:GPI 0 "register_operand")
+       (SBINQOPS:GPI (match_operand:GPI 1 "register_operand")
+                     (match_operand:GPI 2 "aarch64_plus_operand")))
+    (clobber (match_scratch:GPI 3))
+    (clobber (reg:CC CC_REGNUM))]
+  ""
+  {@ [ cons: =0, 1 , 2   , =3 ; attrs: type       , arch , length ]
+     [ w       , w , w   , X  ; neon_q<addsub><q> , simd , 4      ] 
<su_optab>q<addsub>\\t%<v>0<Vmtype>, %<v>1<Vmtype>, %<v>2<Vmtype>
+     [ r       , r , JIr , &r ; *                , *    , 8      ] #
+  }
+  "&& reload_completed && GP_REGNUM_P (REGNO (operands[0]))"
+  [(set (match_dup 0)
+       (if_then_else:GPI
+         (match_dup 4)
+         (match_dup 5)
+         (match_dup 6)))]
+  {
+    if (REG_P (operands[2]))
+      {
+      rtx shift_constant = gen_int_mode (GET_MODE_BITSIZE (<MODE>mode) - 1,
+                                        <MODE>mode);
+      auto limit = HOST_WIDE_INT_1U << (GET_MODE_BITSIZE (<MODE>mode) - 1);
+      rtx limit_constant = gen_int_mode (limit, <MODE>mode);
+      emit_insn (gen_ashr<mode>3 (operands[3], operands[2], shift_constant));
+      emit_insn (gen_xor<mode>3 (operands[3], operands[3], limit_constant));
+
+      switch (<CODE>)
+       {
+       case SS_MINUS:
+         emit_insn (gen_sub<mode>3_compare1 (operands[0], operands[1],
+                                             operands[2]));
+       break;
+       case SS_PLUS:
+         emit_insn (gen_add<mode>3_compare0 (operands[0], operands[1],
+                                             operands[2]));
+         break;
+       default:
+         gcc_unreachable ();
+       }
+
+      rtx ccin = gen_rtx_REG (E_CC_Vmode, CC_REGNUM);
+      switch (<CODE>)
+       {
+       case SS_PLUS:
+         operands[4] = gen_rtx_NE (<MODE>mode, ccin, const0_rtx);
+         operands[5] = gen_rtx_NOT (<MODE>mode, operands[3]);
+         operands[6] = operands[0];
+         break;
+       case SS_MINUS:
+         operands[4] = gen_rtx_EQ (<MODE>mode, ccin, const0_rtx);
+         operands[5] = operands[0];
+         operands[6] = operands[3];
+         break;
+       default:
+         gcc_unreachable ();
+       }
+      }
+    else
+      {
+       auto imm = INTVAL (operands[2]);
+       rtx neg_imm = gen_int_mode (-imm, <MODE>mode);
+       wide_int limit;
+
+       switch (<CODE>)
+         {
+         case SS_MINUS:
+           emit_insn (gen_sub<mode>3_compare1_imm (operands[0], operands[1],
+                                                   operands[2], neg_imm));
+           limit = imm >= 0 ? wi::min_value (<MODE>mode, SIGNED)
+                            : wi::max_value (<MODE>mode, SIGNED);
+           break;
+         case SS_PLUS:
+           emit_insn (gen_sub<mode>3_compare1_imm (operands[0], operands[1],
+                                                   neg_imm, operands[2]));
+           limit = imm >= 0 ? wi::max_value (<MODE>mode, SIGNED)
+                            : wi::min_value (<MODE>mode, SIGNED);
+           break;
+         default:
+           gcc_unreachable ();
+         }
+
+      rtx sat_limit = immed_wide_int_const (limit, <MODE>mode);
+      emit_insn (gen_rtx_SET (operands[3], sat_limit));
+
+      rtx ccin = gen_rtx_REG (E_CC_Vmode, CC_REGNUM);
+      operands[4] = gen_rtx_EQ (<MODE>mode, ccin, const0_rtx);
+      operands[5] = operands[0];
+      operands[6] = operands[3];
+      }
+  }
+)
+
+;; Unsigned saturating arithmetic with GPR operands can be optimised similarly
+;; to the signed case, albeit without the need for a temporary register as the
+;; saturating limit can be inferred from the <addsub> code.  This applies only
+;; to SImode and DImode.
+
+(define_insn_and_split "<su_optab>s<addsub><mode>3<vczle><vczbe>"
+  [(set (match_operand:GPI 0 "register_operand")
+       (UBINQOPS:GPI (match_operand:GPI 1 "register_operand")
+                     (match_operand:GPI 2 "aarch64_plus_operand")))
+    (clobber (reg:CC CC_REGNUM))]
+  ""
+  {@ [ cons: =0, 1 , 2   ; attrs: type       , arch , length ]
+     [ w       , w , w   ; neon_q<addsub><q> , simd , 4      ] 
<su_optab>q<addsub>\\t%<v>0<Vmtype>, %<v>1<Vmtype>, %<v>2<Vmtype>
+     [ r       , r , JIr ; *                , *    , 8      ] #
+  }
+  "&& reload_completed && GP_REGNUM_P (REGNO (operands[0]))"
+  [(set (match_dup 0)
+       (if_then_else:GPI
+         (match_dup 3)
+         (match_dup 0)
+         (match_dup 4)))]
+  {
+
+    if (REG_P (operands[2]))
+      {
+       switch (<CODE>)
+         {
+         case US_MINUS:
+           emit_insn (gen_sub<mode>3_compare1 (operands[0], operands[1],
+                                               operands[2]));
+           break;
+         case US_PLUS:
+           emit_insn (gen_add<mode>3_compare0 (operands[0], operands[1],
+                                               operands[2]));
+           break;
+         default:
+           gcc_unreachable ();
+         }
+      }
+    else
+      {
+       auto imm = UINTVAL (operands[2]);
+       rtx neg_imm = gen_int_mode (-imm, <MODE>mode);
+       switch (<CODE>)
+         {
+         case US_MINUS:
+           emit_insn (gen_sub<mode>3_compare1_imm (operands[0], operands[1],
+                                                   operands[2], neg_imm));
+           break;
+         case US_PLUS:
+           emit_insn (gen_sub<mode>3_compare1_imm (operands[0], operands[1],
+                                                   neg_imm, operands[2]));
+           break;
+         default:
+           gcc_unreachable ();
+         }
+      }
+
+    rtx ccin = gen_rtx_REG (CCmode, CC_REGNUM);
+    switch (<CODE>)
+      {
+      case US_PLUS:
+       operands[3] = gen_rtx_LTU (<MODE>mode, ccin, const0_rtx);
+       operands[4] = gen_int_mode (-1, <MODE>mode);
+       break;
+      case US_MINUS:
+       operands[3] = gen_rtx_GEU (<MODE>mode, ccin, const0_rtx);
+       operands[4] = const0_rtx;
+       break;
+      default:
+       gcc_unreachable ();
+      }
+  }
+)
+
 ;; suqadd and usqadd
 
 (define_insn "aarch64_<sur>qadd<mode><vczle><vczbe>"
diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index 
33594cb65d28a77f95554e6188d5f5ea10ebe60b..4899acead9b7c04a7a8f45a7d308e94e369e2186
 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -1864,35 +1864,35 @@ __extension__ extern __inline int8x8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqadd_s8 (int8x8_t __a, int8x8_t __b)
 {
-  return (int8x8_t) __builtin_aarch64_sqaddv8qi (__a, __b);
+  return (int8x8_t) __builtin_aarch64_ssaddv8qi (__a, __b);
 }
 
 __extension__ extern __inline int16x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqadd_s16 (int16x4_t __a, int16x4_t __b)
 {
-  return (int16x4_t) __builtin_aarch64_sqaddv4hi (__a, __b);
+  return (int16x4_t) __builtin_aarch64_ssaddv4hi (__a, __b);
 }
 
 __extension__ extern __inline int32x2_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqadd_s32 (int32x2_t __a, int32x2_t __b)
 {
-  return (int32x2_t) __builtin_aarch64_sqaddv2si (__a, __b);
+  return (int32x2_t) __builtin_aarch64_ssaddv2si (__a, __b);
 }
 
 __extension__ extern __inline int64x1_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqadd_s64 (int64x1_t __a, int64x1_t __b)
 {
-  return (int64x1_t) {__builtin_aarch64_sqadddi (__a[0], __b[0])};
+  return (int64x1_t) {__builtin_aarch64_ssadddi (__a[0], __b[0])};
 }
 
 __extension__ extern __inline uint8x8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqadd_u8 (uint8x8_t __a, uint8x8_t __b)
 {
-  return __builtin_aarch64_uqaddv8qi_uuu (__a, __b);
+  return __builtin_aarch64_usaddv8qi_uuu (__a, __b);
 }
 
 __extension__ extern __inline int8x8_t
@@ -2151,189 +2151,189 @@ __extension__ extern __inline uint16x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqadd_u16 (uint16x4_t __a, uint16x4_t __b)
 {
-  return __builtin_aarch64_uqaddv4hi_uuu (__a, __b);
+  return __builtin_aarch64_usaddv4hi_uuu (__a, __b);
 }
 
 __extension__ extern __inline uint32x2_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqadd_u32 (uint32x2_t __a, uint32x2_t __b)
 {
-  return __builtin_aarch64_uqaddv2si_uuu (__a, __b);
+  return __builtin_aarch64_usaddv2si_uuu (__a, __b);
 }
 
 __extension__ extern __inline uint64x1_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqadd_u64 (uint64x1_t __a, uint64x1_t __b)
 {
-  return (uint64x1_t) {__builtin_aarch64_uqadddi_uuu (__a[0], __b[0])};
+  return (uint64x1_t) {__builtin_aarch64_usadddi_uuu (__a[0], __b[0])};
 }
 
 __extension__ extern __inline int8x16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqaddq_s8 (int8x16_t __a, int8x16_t __b)
 {
-  return (int8x16_t) __builtin_aarch64_sqaddv16qi (__a, __b);
+  return (int8x16_t) __builtin_aarch64_ssaddv16qi (__a, __b);
 }
 
 __extension__ extern __inline int16x8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqaddq_s16 (int16x8_t __a, int16x8_t __b)
 {
-  return (int16x8_t) __builtin_aarch64_sqaddv8hi (__a, __b);
+  return (int16x8_t) __builtin_aarch64_ssaddv8hi (__a, __b);
 }
 
 __extension__ extern __inline int32x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqaddq_s32 (int32x4_t __a, int32x4_t __b)
 {
-  return (int32x4_t) __builtin_aarch64_sqaddv4si (__a, __b);
+  return (int32x4_t) __builtin_aarch64_ssaddv4si (__a, __b);
 }
 
 __extension__ extern __inline int64x2_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqaddq_s64 (int64x2_t __a, int64x2_t __b)
 {
-  return (int64x2_t) __builtin_aarch64_sqaddv2di (__a, __b);
+  return (int64x2_t) __builtin_aarch64_ssaddv2di (__a, __b);
 }
 
 __extension__ extern __inline uint8x16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqaddq_u8 (uint8x16_t __a, uint8x16_t __b)
 {
-  return __builtin_aarch64_uqaddv16qi_uuu (__a, __b);
+  return __builtin_aarch64_usaddv16qi_uuu (__a, __b);
 }
 
 __extension__ extern __inline uint16x8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqaddq_u16 (uint16x8_t __a, uint16x8_t __b)
 {
-  return __builtin_aarch64_uqaddv8hi_uuu (__a, __b);
+  return __builtin_aarch64_usaddv8hi_uuu (__a, __b);
 }
 
 __extension__ extern __inline uint32x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqaddq_u32 (uint32x4_t __a, uint32x4_t __b)
 {
-  return __builtin_aarch64_uqaddv4si_uuu (__a, __b);
+  return __builtin_aarch64_usaddv4si_uuu (__a, __b);
 }
 
 __extension__ extern __inline uint64x2_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqaddq_u64 (uint64x2_t __a, uint64x2_t __b)
 {
-  return __builtin_aarch64_uqaddv2di_uuu (__a, __b);
+  return __builtin_aarch64_usaddv2di_uuu (__a, __b);
 }
 
 __extension__ extern __inline int8x8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqsub_s8 (int8x8_t __a, int8x8_t __b)
 {
-  return (int8x8_t) __builtin_aarch64_sqsubv8qi (__a, __b);
+  return (int8x8_t) __builtin_aarch64_sssubv8qi (__a, __b);
 }
 
 __extension__ extern __inline int16x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqsub_s16 (int16x4_t __a, int16x4_t __b)
 {
-  return (int16x4_t) __builtin_aarch64_sqsubv4hi (__a, __b);
+  return (int16x4_t) __builtin_aarch64_sssubv4hi (__a, __b);
 }
 
 __extension__ extern __inline int32x2_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqsub_s32 (int32x2_t __a, int32x2_t __b)
 {
-  return (int32x2_t) __builtin_aarch64_sqsubv2si (__a, __b);
+  return (int32x2_t) __builtin_aarch64_sssubv2si (__a, __b);
 }
 
 __extension__ extern __inline int64x1_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqsub_s64 (int64x1_t __a, int64x1_t __b)
 {
-  return (int64x1_t) {__builtin_aarch64_sqsubdi (__a[0], __b[0])};
+  return (int64x1_t) {__builtin_aarch64_sssubdi (__a[0], __b[0])};
 }
 
 __extension__ extern __inline uint8x8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqsub_u8 (uint8x8_t __a, uint8x8_t __b)
 {
-  return __builtin_aarch64_uqsubv8qi_uuu (__a, __b);
+  return __builtin_aarch64_ussubv8qi_uuu (__a, __b);
 }
 
 __extension__ extern __inline uint16x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqsub_u16 (uint16x4_t __a, uint16x4_t __b)
 {
-  return __builtin_aarch64_uqsubv4hi_uuu (__a, __b);
+  return __builtin_aarch64_ussubv4hi_uuu (__a, __b);
 }
 
 __extension__ extern __inline uint32x2_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqsub_u32 (uint32x2_t __a, uint32x2_t __b)
 {
-  return __builtin_aarch64_uqsubv2si_uuu (__a, __b);
+  return __builtin_aarch64_ussubv2si_uuu (__a, __b);
 }
 
 __extension__ extern __inline uint64x1_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqsub_u64 (uint64x1_t __a, uint64x1_t __b)
 {
-  return (uint64x1_t) {__builtin_aarch64_uqsubdi_uuu (__a[0], __b[0])};
+  return (uint64x1_t) {__builtin_aarch64_ussubdi_uuu (__a[0], __b[0])};
 }
 
 __extension__ extern __inline int8x16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqsubq_s8 (int8x16_t __a, int8x16_t __b)
 {
-  return (int8x16_t) __builtin_aarch64_sqsubv16qi (__a, __b);
+  return (int8x16_t) __builtin_aarch64_sssubv16qi (__a, __b);
 }
 
 __extension__ extern __inline int16x8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqsubq_s16 (int16x8_t __a, int16x8_t __b)
 {
-  return (int16x8_t) __builtin_aarch64_sqsubv8hi (__a, __b);
+  return (int16x8_t) __builtin_aarch64_sssubv8hi (__a, __b);
 }
 
 __extension__ extern __inline int32x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqsubq_s32 (int32x4_t __a, int32x4_t __b)
 {
-  return (int32x4_t) __builtin_aarch64_sqsubv4si (__a, __b);
+  return (int32x4_t) __builtin_aarch64_sssubv4si (__a, __b);
 }
 
 __extension__ extern __inline int64x2_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqsubq_s64 (int64x2_t __a, int64x2_t __b)
 {
-  return (int64x2_t) __builtin_aarch64_sqsubv2di (__a, __b);
+  return (int64x2_t) __builtin_aarch64_sssubv2di (__a, __b);
 }
 
 __extension__ extern __inline uint8x16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqsubq_u8 (uint8x16_t __a, uint8x16_t __b)
 {
-  return __builtin_aarch64_uqsubv16qi_uuu (__a, __b);
+  return __builtin_aarch64_ussubv16qi_uuu (__a, __b);
 }
 
 __extension__ extern __inline uint16x8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqsubq_u16 (uint16x8_t __a, uint16x8_t __b)
 {
-  return __builtin_aarch64_uqsubv8hi_uuu (__a, __b);
+  return __builtin_aarch64_ussubv8hi_uuu (__a, __b);
 }
 
 __extension__ extern __inline uint32x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqsubq_u32 (uint32x4_t __a, uint32x4_t __b)
 {
-  return __builtin_aarch64_uqsubv4si_uuu (__a, __b);
+  return __builtin_aarch64_ussubv4si_uuu (__a, __b);
 }
 
 __extension__ extern __inline uint64x2_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqsubq_u64 (uint64x2_t __a, uint64x2_t __b)
 {
-  return __builtin_aarch64_uqsubv2di_uuu (__a, __b);
+  return __builtin_aarch64_ussubv2di_uuu (__a, __b);
 }
 
 __extension__ extern __inline int8x8_t
@@ -17543,56 +17543,56 @@ __extension__ extern __inline int8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqaddb_s8 (int8_t __a, int8_t __b)
 {
-  return (int8_t) __builtin_aarch64_sqaddqi (__a, __b);
+  return (int8_t) __builtin_aarch64_ssaddqi (__a, __b);
 }
 
 __extension__ extern __inline int16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqaddh_s16 (int16_t __a, int16_t __b)
 {
-  return (int16_t) __builtin_aarch64_sqaddhi (__a, __b);
+  return (int16_t) __builtin_aarch64_ssaddhi (__a, __b);
 }
 
 __extension__ extern __inline int32_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqadds_s32 (int32_t __a, int32_t __b)
 {
-  return (int32_t) __builtin_aarch64_sqaddsi (__a, __b);
+  return (int32_t) __builtin_aarch64_ssaddsi (__a, __b);
 }
 
 __extension__ extern __inline int64_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqaddd_s64 (int64_t __a, int64_t __b)
 {
-  return __builtin_aarch64_sqadddi (__a, __b);
+  return __builtin_aarch64_ssadddi (__a, __b);
 }
 
 __extension__ extern __inline uint8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqaddb_u8 (uint8_t __a, uint8_t __b)
 {
-  return (uint8_t) __builtin_aarch64_uqaddqi_uuu (__a, __b);
+  return (uint8_t) __builtin_aarch64_usaddqi_uuu (__a, __b);
 }
 
 __extension__ extern __inline uint16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqaddh_u16 (uint16_t __a, uint16_t __b)
 {
-  return (uint16_t) __builtin_aarch64_uqaddhi_uuu (__a, __b);
+  return (uint16_t) __builtin_aarch64_usaddhi_uuu (__a, __b);
 }
 
 __extension__ extern __inline uint32_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqadds_u32 (uint32_t __a, uint32_t __b)
 {
-  return (uint32_t) __builtin_aarch64_uqaddsi_uuu (__a, __b);
+  return (uint32_t) __builtin_aarch64_usaddsi_uuu (__a, __b);
 }
 
 __extension__ extern __inline uint64_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqaddd_u64 (uint64_t __a, uint64_t __b)
 {
-  return __builtin_aarch64_uqadddi_uuu (__a, __b);
+  return __builtin_aarch64_usadddi_uuu (__a, __b);
 }
 
 /* vqdmlal */
@@ -19242,56 +19242,56 @@ __extension__ extern __inline int8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqsubb_s8 (int8_t __a, int8_t __b)
 {
-  return (int8_t) __builtin_aarch64_sqsubqi (__a, __b);
+  return (int8_t) __builtin_aarch64_sssubqi (__a, __b);
 }
 
 __extension__ extern __inline int16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqsubh_s16 (int16_t __a, int16_t __b)
 {
-  return (int16_t) __builtin_aarch64_sqsubhi (__a, __b);
+  return (int16_t) __builtin_aarch64_sssubhi (__a, __b);
 }
 
 __extension__ extern __inline int32_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqsubs_s32 (int32_t __a, int32_t __b)
 {
-  return (int32_t) __builtin_aarch64_sqsubsi (__a, __b);
+  return (int32_t) __builtin_aarch64_sssubsi (__a, __b);
 }
 
 __extension__ extern __inline int64_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqsubd_s64 (int64_t __a, int64_t __b)
 {
-  return __builtin_aarch64_sqsubdi (__a, __b);
+  return __builtin_aarch64_sssubdi (__a, __b);
 }
 
 __extension__ extern __inline uint8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqsubb_u8 (uint8_t __a, uint8_t __b)
 {
-  return (uint8_t) __builtin_aarch64_uqsubqi_uuu (__a, __b);
+  return (uint8_t) __builtin_aarch64_ussubqi_uuu (__a, __b);
 }
 
 __extension__ extern __inline uint16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqsubh_u16 (uint16_t __a, uint16_t __b)
 {
-  return (uint16_t) __builtin_aarch64_uqsubhi_uuu (__a, __b);
+  return (uint16_t) __builtin_aarch64_ussubhi_uuu (__a, __b);
 }
 
 __extension__ extern __inline uint32_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqsubs_u32 (uint32_t __a, uint32_t __b)
 {
-  return (uint32_t) __builtin_aarch64_uqsubsi_uuu (__a, __b);
+  return (uint32_t) __builtin_aarch64_ussubsi_uuu (__a, __b);
 }
 
 __extension__ extern __inline uint64_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqsubd_u64 (uint64_t __a, uint64_t __b)
 {
-  return __builtin_aarch64_uqsubdi_uuu (__a, __b);
+  return __builtin_aarch64_ussubdi_uuu (__a, __b);
 }
 
 /* vqtbl2 */
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index 
ff0f34dd0430d73833149d20f4cc0d1f84c3fd2f..2f7aa489ae8ba4b0101ad168cc61d592a9b7b660
 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -95,6 +95,10 @@ (define_mode_iterator VSDQ_I [V8QI V16QI V4HI V8HI V2SI V4SI 
V2DI QI HI SI DI])
 ;; integer modes; 64-bit scalar integer mode.
 (define_mode_iterator VSDQ_I_DI [V8QI V16QI V4HI V8HI V2SI V4SI V2DI DI])
 
+;; Advanced SIMD and scalar, 64 & 128-bit container; 8 and 16-bit scalar
+;; integer modes.
+(define_mode_iterator VSDQ_I_QI_HI [VDQ_I HI QI])
+
 ;; Double vector modes.
 (define_mode_iterator VD [V8QI V4HI V4HF V2SI V2SF V4BF])
 
diff --git 
a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/saturating_arithmetic_autovect.inc
 
b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/saturating_arithmetic_autovect.inc
new file mode 100644
index 
0000000000000000000000000000000000000000..1fadfd587555f0a0e5c390f3a46442a05ec675e6
--- /dev/null
+++ 
b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/saturating_arithmetic_autovect.inc
@@ -0,0 +1,58 @@
+/* Template file for vector saturating arithmetic validation.
+
+   This file defines saturating addition and subtraction functions for a given
+   scalar type, testing the auto-vectorization of these two operators. This
+   type, along with the corresponding minimum and maximum values for that type,
+   must be defined by any test file which includes this template file.  */
+
+#ifndef SAT_ARIT_AUTOVEC_INC
+#define SAT_ARIT_AUTOVEC_INC
+
+#include <limits.h>
+#include <arm_neon.h>
+
+#ifndef UT
+#define UT unsigned int
+#define VT uint32x4_t
+#define UMAX UINT_MAX
+#define UMIN 0
+#endif
+
+
+UT uadd_lane (UT a, VT b)
+{
+  UT sum = a + b[0];
+  return sum < a ? UMAX : sum;
+}
+
+void uaddq (UT *out, UT *a, UT *b, int n)
+{
+  for (int i = 0; i < n; i++)
+    {
+      UT sum = a[i] + b[i];
+      out[i] = sum < a[i] ? UMAX : sum;
+    }
+}
+
+void uaddq2 (UT *out, UT *a, UT *b, int n)
+{
+  for (int i = 0; i < n; i++)
+    {
+      UT sum;
+      if (!__builtin_add_overflow(a[i], b[i], &sum))
+        out[i] = sum;
+      else
+        out[i] = UMAX;
+    }
+}
+
+void usubq (UT *out, UT *a, UT *b, int n)
+{
+  for (int i = 0; i < n; i++)
+    {
+      UT sum = a[i] - b[i];
+      out[i] = sum > a[i] ? UMIN : sum;
+    }
+}
+
+#endif
\ No newline at end of file
diff --git 
a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/saturating_arithmetic_autovect_1.c
 
b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/saturating_arithmetic_autovect_1.c
new file mode 100644
index 
0000000000000000000000000000000000000000..2b72be7b0d7c6b23d9b7431a70ae2eb79f4c4bf8
--- /dev/null
+++ 
b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/saturating_arithmetic_autovect_1.c
@@ -0,0 +1,79 @@
+/* { dg-do assemble { target { aarch64*-*-* } } } */
+/* { dg-options "-O2 --save-temps -ftree-vectorize" } */
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+/*
+** uadd_lane: { xfail *-*-* }
+**     dup\tv([0-9]+).8b, w0
+**     uqadd\tb([0-9]+), (?:b\1, b0|b0, b\1)
+**     umov\tw0, v\2.b\[0\]
+**     ret
+*/
+/*
+** uaddq:
+** ...
+**     ldr\tq([0-9]+), .*
+**     ldr\tq([0-9]+), .*
+**     uqadd\tv[0-9]+.16b, (?:v\1.16b, v\2.16b|v\2.16b, v\1.16b)
+** ...
+**     ldr\td([0-9]+), .*
+**     ldr\td([0-9]+), .*
+**     uqadd\tv[0-9]+.8b, (?:v\3.8b, v\4.8b|v\4.8b, v\3.8b)
+** ...
+**     ldr\tb([0-9]+), .*
+**     ldr\tb([0-9]+), .*
+**     uqadd\tb[0-9]+, (?:b\5, b\6|b\6, b\5)
+** ...
+**     ldr\tb([0-9]+), .*
+**     ldr\tb([0-9]+), .*
+**     uqadd\tb[0-9]+, (?:b\7, b\8|b\8, b\7)
+** ...
+*/
+/*
+** uaddq2:
+** ...
+**     ldr\tq([0-9]+), .*
+**     ldr\tq([0-9]+), .*
+**     uqadd\tv[0-9]+.16b, (?:v\1.16b, v\2.16b|v\2.16b, v\1.16b)
+** ...
+**     ldr\td([0-9]+), .*
+**     ldr\td([0-9]+), .*
+**     uqadd\tv[0-9]+.8b, (?:v\3.8b, v\4.8b|v\4.8b, v\3.8b)
+** ...
+**     ldr\tb([0-9]+), .*
+**     ldr\tb([0-9]+), .*
+**     uqadd\tb[0-9]+, (?:b\5, b\6|b\6, b\5)
+** ...
+**     uqadd\tb([0-9]+), (?:b[0-9]+, b\7|b\7, b[0-9]+)
+** ...
+*/
+/*
+** usubq: { xfail *-*-* }
+** ...
+**     ldr\tq([0-9]+), .*
+**     ldr\tq([0-9]+), .*
+**     uqsub\tv[0-9]+.16b, v\1.16b, v\2.16b
+** ...
+**     ldr\td([0-9]+), .*
+**     ldr\td([0-9]+), .*
+**     uqsub\tv[0-9]+.8b, v\3.8b, v\4.8b
+** ...
+**     ldr\tb([0-9]+), .*
+**     ldr\tb([0-9]+), .*
+**     uqsub\tb[0-9]+, b\5, b\6
+** ...
+**     ldr\tb([0-9]+), .*
+**     ldr\tb([0-9]+), .*
+**     uqsub\tb[0-9]+, b\7, b\8
+** ...
+*/
+
+#include <limits.h>
+#include <arm_neon.h>
+
+#define UT unsigned char
+#define VT uint8x8_t
+#define UMAX UCHAR_MAX
+#define UMIN 0
+
+#include "saturating_arithmetic_autovect.inc"
\ No newline at end of file
diff --git 
a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/saturating_arithmetic_autovect_2.c
 
b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/saturating_arithmetic_autovect_2.c
new file mode 100644
index 
0000000000000000000000000000000000000000..0640361498f020284910d2370f9c8a6a3e898022
--- /dev/null
+++ 
b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/saturating_arithmetic_autovect_2.c
@@ -0,0 +1,79 @@
+/* { dg-do assemble { target { aarch64*-*-* } } } */
+/* { dg-options "-O2 --save-temps -ftree-vectorize" } */
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+/*
+** uadd_lane: { xfail *-*-* }
+**     dup\tv([0-9]+).4h, w0
+**     uqadd\th([0-9]+), (?:h\1, h0|h0, h\1)
+**     umov\tw0, v\2.h\[0\]
+**     ret
+*/
+/*
+** uaddq:
+** ...
+**     ldr\tq([0-9]+), .*
+**     ldr\tq([0-9]+), .*
+**     uqadd\tv[0-9]+.8h, (?:v\1.8h, v\2.8h|v\2.8h, v\1.8h)
+** ...
+**     ldr\td([0-9]+), .*
+**     ldr\td([0-9]+), .*
+**     uqadd\tv[0-9]+.4h, (?:v\3.4h, v\4.4h|v\4.4h, v\3.4h)
+** ...
+**     ldr\th([0-9]+), .*
+**     ldr\th([0-9]+), .*
+**     uqadd\th[0-9]+, (?:h\5, h\6|h\6, h\5)
+** ...
+**     ldr\th([0-9]+), .*
+**     ldr\th([0-9]+), .*
+**     uqadd\th[0-9]+, (?:h\7, h\8|h\8, h\7)
+** ...
+*/
+/*
+** uaddq2:
+** ...
+**     ldr\tq([0-9]+), .*
+**     ldr\tq([0-9]+), .*
+**     uqadd\tv[0-9]+.8h, (?:v\1.8h, v\2.8h|v\2.8h, v\1.8h)
+** ...
+**     ldr\td([0-9]+), .*
+**     ldr\td([0-9]+), .*
+**     uqadd\tv[0-9]+.4h, (?:v\3.4h, v\4.4h|v\4.4h, v\3.4h)
+** ...
+**     ldr\th([0-9]+), .*
+**     ldr\th([0-9]+), .*
+**     uqadd\th[0-9]+, (?:h\5, h\6|h\6, h\5)
+** ...
+**     uqadd\th([0-9]+), (?:h[0-9]+, h\7|h\7, h[0-9]+)
+** ...
+*/
+/*
+** usubq: { xfail *-*-* }
+** ...
+**     ldr\tq([0-9]+), .*
+**     ldr\tq([0-9]+), .*
+**     uqsub\tv[0-9]+.8h, v\1.8h, v\2.8h
+** ...
+**     ldr\td([0-9]+), .*
+**     ldr\td([0-9]+), .*
+**     uqsub\tv[0-9]+.4h, v\3.4h, v\4.4h
+** ...
+**     ldr\th([0-9]+), .*
+**     ldr\th([0-9]+), .*
+**     uqsub\th[0-9]+, h\5, h\6
+** ...
+**     ldr\th([0-9]+), .*
+**     ldr\th([0-9]+), .*
+**     uqsub\th[0-9]+, h\7, h\8
+** ...
+*/
+
+#include <limits.h>
+#include <arm_neon.h>
+
+#define UT unsigned short
+#define VT uint16x4_t
+#define UMAX USHRT_MAX
+#define UMIN 0
+
+#include "saturating_arithmetic_autovect.inc"
\ No newline at end of file
diff --git 
a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/saturating_arithmetic_autovect_3.c
 
b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/saturating_arithmetic_autovect_3.c
new file mode 100644
index 
0000000000000000000000000000000000000000..ea6e0c78d7860700d750d4d6055ea2bf636cd2f1
--- /dev/null
+++ 
b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/saturating_arithmetic_autovect_3.c
@@ -0,0 +1,75 @@
+/* { dg-do assemble { target { aarch64*-*-* } } } */
+/* { dg-options "-O2 --save-temps -ftree-vectorize" } */
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+/*
+** uadd_lane:
+**     fmov\tw([0-9]+), s0
+**     adds\tw([0-9]+), (?:w\1, w0|w0, w\1)
+**     csinv\tw\2, w\2, wzr, cc
+**     ret
+*/
+/*
+** uaddq:
+** ...
+**     ldr\tq([0-9]+), .*
+**     ldr\tq([0-9]+), .*
+**     uqadd\tv[0-9]+.4s, (?:v\1.4s, v\2.4s|v\2.4s, v\1.4s)
+** ...
+**     ldr\tw([0-9]+), .*
+**     ldr\tw([0-9]+), .*
+**     adds\tw([0-9]+), (?:w\3, w\4|w\4, w\3)
+**     csinv\tw\5, w\5, wzr, cc
+** ...
+**     ldr\tw([0-9]+), .*
+**     ldr\tw([0-9]+), .*
+**     adds\tw([0-9]+), (?:w\6, w\7|w\7, w\6)
+**     csinv\tw\8, w\8, wzr, cc
+** ...
+*/
+/*
+** uaddq2:
+** ...
+**     ldr\tq([0-9]+), .*
+**     ldr\tq([0-9]+), .*
+**     uqadd\tv[0-9]+.4s, (?:v\1.4s, v\2.4s|v\2.4s, v\1.4s)
+** ...
+**     ldr\tw([0-9]+), .*
+**     ldr\tw([0-9]+), .*
+**     adds\tw([0-9]+), (?:w\3, w\4|w\4, w\3)
+**     csinv\tw\5, w\5, wzr, cc
+** ...
+**     ldr\tw([0-9]+), .*
+**     ldr\tw([0-9]+), .*
+**     adds\tw([0-9]+), (?:w\6, w\7|w\7, w\6)
+**     csinv\tw\8, w\8, wzr, cc
+** ...
+*/
+/*
+** usubq: { xfail *-*-* }
+** ...
+**     ldr\tq([0-9]+), .*
+**     ldr\tq([0-9]+), .*
+**     uqsub\tv[0-9]+.4s, v\1.4s, v\2.4s
+** ...
+**     ldr\tw([0-9]+), .*
+**     ldr\tw([0-9]+), .*
+**     subs\tw([0-9]+), w\3, w\4
+**     csel\tw\5, w\5, wzr, cs
+** ...
+**     ldr\tw([0-9]+), .*
+**     ldr\tw([0-9]+), .*
+**     subs\tw([0-9]+), w\6, w\7
+**     csel\tw\8, w\8, wzr, cs
+** ...
+*/
+
+#include <limits.h>
+#include <arm_neon.h>
+
+#define UT unsigned int
+#define VT uint32x2_t
+#define UMAX UINT_MAX
+#define UMIN 0
+
+#include "saturating_arithmetic_autovect.inc"
\ No newline at end of file
diff --git 
a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/saturating_arithmetic_autovect_4.c
 
b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/saturating_arithmetic_autovect_4.c
new file mode 100644
index 
0000000000000000000000000000000000000000..01390637b5ca4accd306ce3a9aa4fc991d3f97b7
--- /dev/null
+++ 
b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/saturating_arithmetic_autovect_4.c
@@ -0,0 +1,77 @@
+/* { dg-do assemble { target { aarch64*-*-* } } } */
+/* { dg-options "-O2 --save-temps -ftree-vectorize" } */
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+/*
+** uadd_lane:
+** ...
+**     (?:fmov|ldr)\tx([0-9]+), .*
+** ...
+**     adds\tx([0-9]+), (?:x\1, x0|x0, x\1)
+**     csinv\tx\2, x\2, xzr, cc
+**     ret
+*/
+/*
+** uaddq:
+** ...
+**     ldr\tq([0-9]+), .*
+**     ldr\tq([0-9]+), .*
+**     uqadd\tv[0-9]+.2d, (?:v\1.2d, v\2.2d|v\2.2d, v\1.2d)
+** ...
+**     ldr\tx([0-9]+), .*
+**     ldr\tx([0-9]+), .*
+**     adds\tx([0-9]+), (?:x\3, x\4|x\4, x\3)
+**     csinv\tx\5, x\5, xzr, cc
+** ...
+**     ldr\tx([0-9]+), .*
+**     ldr\tx([0-9]+), .*
+**     adds\tx([0-9]+), (?:x\6, x\7|x\7, x\6)
+**     csinv\tx\8, x\8, xzr, cc
+** ...
+*/
+/*
+** uaddq2:
+** ...
+**     ldr\tq([0-9]+), .*
+**     ldr\tq([0-9]+), .*
+**     uqadd\tv[0-9]+.2d, (?:v\1.2d, v\2.2d|v\2.2d, v\1.2d)
+** ...
+**     ldr\tx([0-9]+), .*
+**     ldr\tx([0-9]+), .*
+**     adds\tx([0-9]+), (?:x\3, x\4|x\4, x\3)
+**     csinv\tx\5, x\5, xzr, cc
+** ...
+**     ldr\tx([0-9]+), .*
+**     ldr\tx([0-9]+), .*
+**     adds\tx([0-9]+), (?:x\6, x\7|x\7, x\6)
+**     csinv\tx\8, x\8, xzr, cc
+** ...
+*/
+/*
+** usubq: { xfail *-*-* }
+** ...
+**     ldr\tq([0-9]+), .*
+**     ldr\tq([0-9]+), .*
+**     uqsub\tv[0-9]+.2d, v\1.2d, v\2.2d
+** ...
+**     ldr\tx([0-9]+), .*
+**     ldr\tx([0-9]+), .*
+**     subs\tx([0-9]+), x\3, x\4
+**     csel\tx\5, x\5, xzr, cs
+** ...
+**     ldr\tx([0-9]+), .*
+**     ldr\tx([0-9]+), .*
+**     subs\tx([0-9]+), x\6, x\7
+**     csel\tx\8, x\8, xzr, cs
+** ...
+*/
+
+#include <limits.h>
+#include <arm_neon.h>
+
+#define UT unsigned long
+#define VT uint64x2_t
+#define UMAX ULONG_MAX
+#define UMIN 0
+
+#include "saturating_arithmetic_autovect.inc"
\ No newline at end of file
diff --git a/gcc/testsuite/gcc.target/aarch64/saturating-arithmetic-signed.c 
b/gcc/testsuite/gcc.target/aarch64/saturating-arithmetic-signed.c
new file mode 100644
index 
0000000000000000000000000000000000000000..de652bf1d9e6e59f088478b0e63ad7a027ac7782
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/saturating-arithmetic-signed.c
@@ -0,0 +1,270 @@
+/* { dg-do run } */
+/* { dg-options "-O2 --save-temps -mearly-ra=none -fno-schedule-insns2" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+#include <limits.h>
+#include <stdbool.h>
+#include <stdint.h>
+
+/*
+** sadd32:
+**     asr     w([0-9]+), w1, 31
+**     eor     w\1, w\1, -2147483648
+**     adds    w([0-9]+), (?:w0, w1|w1, w0)
+**     csinv   w0, w\2, w\1, vc
+**     ret
+*/
+int32_t __attribute__((noipa))
+sadd32 (int32_t __a, int32_t __b)
+{
+  int32_t sum;
+  bool overflow = __builtin_add_overflow (__a, __b, &sum);
+  return !overflow ? sum : __a < 0 ? INT_MIN : INT_MAX;
+}
+
+/*
+** sadd32_imm:
+**     adds    w([0-9]+), w0, #67
+**     mov     w([0-9]+), 2147483647
+**     csel    w0, w\1, w\2, vc
+**     ret
+*/
+int32_t __attribute__((noipa))
+sadd32_imm (int32_t __a)
+{
+  int32_t sum;
+  bool overflow = __builtin_add_overflow (__a, 67, &sum);
+  return !overflow ? sum : __a < 0 ? INT_MIN : INT_MAX;
+}
+
+/*
+** sadd32_imm2:
+**     subs    w([0-9]+), w0, 67
+**     mov     w([0-9]+), -2147483648
+**     csel    w0, w\1, w\2, vc
+**     ret
+*/
+int32_t  __attribute__((noipa))
+sadd32_imm2 (int32_t __a)
+{
+  int32_t sum;
+  bool overflow = __builtin_add_overflow (__a, -67, &sum);
+  return !overflow ? sum : __a < 0 ? INT_MIN : INT_MAX;
+}
+
+/*
+** ssub32:
+**     asr     w([0-9]+), w1, 31
+**     eor     w\1, w\1, -2147483648
+**     subs    w([0-9]+), w0, w1
+**     csel    w0, w\2, w\1, vc
+**     ret
+*/
+int32_t  __attribute__((noipa))
+ssub32 (int32_t __a, int32_t __b)
+{
+  int32_t result;
+  bool overflow = __builtin_sub_overflow (__a, __b, &result);
+  return !overflow ? result : __a < 0 ? INT_MIN : INT_MAX;
+}
+
+/*
+** ssub32_imm:
+**     subs    w([0-9]+), w0, 67
+**     mov     w([0-9]+), -2147483648
+**     csel    w0, w\1, w\2, vc
+**     ret
+*/
+int32_t  __attribute__((noipa))
+ssub32_imm (int32_t __a)
+{
+  int32_t result;
+  bool overflow = __builtin_sub_overflow (__a, 67, &result);
+  return !overflow ? result : __a < 0 ? INT_MIN : INT_MAX;
+}
+
+/*
+** ssub32_imm2:
+**     adds    w([0-9]+), w0, #67
+**     mov     w([0-9]+), 2147483647
+**     csel    w0, w\1, w\2, vc
+**     ret
+*/
+int32_t  __attribute__((noipa))
+ssub32_imm2 (int32_t __a)
+{
+  int32_t result;
+  bool overflow = __builtin_sub_overflow (__a, -67, &result);
+  return !overflow ? result : __a < 0 ? INT_MIN : INT_MAX;
+}
+
+/*
+** sadd64:
+**     asr     x([0-9]+), x1, 63
+**     eor     x\1, x\1, -9223372036854775808
+**     adds    x([0-9]+), (?:x0, x1|x1, x0)
+**     csinv   x0, x\2, x\1, vc
+**     ret
+*/
+int64_t  __attribute__((noipa))
+sadd64 (int64_t __a, int64_t __b)
+{
+  int64_t sum;
+  bool overflow = __builtin_add_overflow (__a, __b, &sum);
+  return !overflow ? sum : __a < 0 ? LONG_MIN : LONG_MAX;
+}
+
+/*
+** sadd64_imm:
+**     adds    x([0-9]+), x0, #67
+**     mov     x([0-9]+), 9223372036854775807
+**     csel    x0, x\1, x\2, vc
+**     ret
+*/
+int64_t  __attribute__((noipa))
+sadd64_imm (int64_t __a)
+{
+  int64_t sum;
+  bool overflow = __builtin_add_overflow (__a, (int64_t)67, &sum);
+  return !overflow ? sum : __a < 0 ? LONG_MIN : LONG_MAX;
+}
+
+/*
+** sadd64_imm2:
+**     subs    x([0-9]+), x0, 67
+**     mov     x([0-9]+), -9223372036854775808
+**     csel    x0, x\1, x\2, vc
+**     ret
+*/
+int64_t  __attribute__((noipa))
+sadd64_imm2 (int64_t __a)
+{
+  int64_t sum;
+  bool overflow = __builtin_add_overflow (__a, (int64_t)-67, &sum);
+  return !overflow ? sum : __a < 0 ? LONG_MIN : LONG_MAX;
+}
+
+/*
+** ssub64:
+**     asr     x([0-9]+), x1, 63
+**     eor     x\1, x\1, -9223372036854775808
+**     subs    x([0-9]+), x0, x1
+**     csel    x0, x\2, x\1, vc
+**     ret
+*/
+int64_t  __attribute__((noipa))
+ssub64 (int64_t __a, int64_t __b)
+{
+  int64_t result;
+  bool overflow = __builtin_sub_overflow (__a, __b, &result);
+  return !overflow ? result : __a < 0 ? LONG_MIN : LONG_MAX;
+}
+
+/*
+** ssub64_imm:
+**     subs    x([0-9]+), x0, 67
+**     mov     x([0-9]+), -9223372036854775808
+**     csel    x0, x\1, x\2, vc
+**     ret
+*/
+int64_t  __attribute__((noipa))
+ssub64_imm (int64_t __a)
+{
+  int64_t result;
+  bool overflow = __builtin_sub_overflow (__a, (int64_t) 67, &result);
+  return !overflow ? result : __a < 0 ? LONG_MIN : LONG_MAX;
+}
+
+/*
+** ssub64_imm2:
+**     adds    x([0-9]+), x0, #67
+**     mov     x([0-9]+), 9223372036854775807
+**     csel    x0, x\1, x\2, vc
+**     ret
+*/
+int64_t  __attribute__((noipa))
+ssub64_imm2 (int64_t __a)
+{
+  int64_t result;
+  bool overflow = __builtin_sub_overflow (__a, (int64_t) -67, &result);
+  return !overflow ? result : __a < 0 ? LONG_MIN : LONG_MAX;
+}
+
+int
+main (void)
+{
+  /* Addition:
+  SAT_ADD(x, +ve), non-saturating
+  SAT_ADD(x, +ve), saturating
+  SAT_ADD(x, immediate +ve)
+  SAT_ADD(x, immediate -ve)
+  SAT_ADD(x, -ve), non-saturating
+  SAT_ADD(x, -ve), saturating
+
+  Subtraction:
+  SAT_SUB(x, +ve), non-saturating
+  SAT_SUB(x, +ve), saturating
+  SAT_SUB(x, immediate +ve)
+  SAT_SUB(x, immediate -ve)
+  SAT_SUB(x, -ve), non-saturating  */
+
+  int32_t a = 4;
+  int32_t b = 70;
+  int32_t c = 2147483647;
+  int32_t d = (int32_t) -2147483648;
+
+  if (sadd32 (a, b) != (a + b))
+    __builtin_abort ();
+  if (sadd32 (a, c) != c)
+    __builtin_abort ();
+  if (sadd32_imm (a) != (a + 67))
+    __builtin_abort ();
+  if (sadd32_imm2 (a) != (a - 67))
+    __builtin_abort ();
+  if (sadd32 (a, -b) != (a - b))
+    __builtin_abort ();
+  if (sadd32 (a, d) != (d + 4))
+    __builtin_abort ();
+
+  if (ssub32 (a, b) != (a - b))
+    __builtin_abort ();
+  if (ssub32 (-a, c) != d)
+    __builtin_abort ();
+  if (ssub32_imm (a) != (a - 67))
+    __builtin_abort ();
+  if (ssub32_imm2 (a) != (a + 67))
+    __builtin_abort ();
+  if (ssub32 (a, -b) != (a + b))
+    __builtin_abort ();
+
+  int64_t a_64 = a;
+  int64_t b_64 = b;
+  int64_t c_64 = (int64_t) 9223372036854775807;
+  int64_t d_64 = (int64_t) 0x8000000000000000;
+
+  if (sadd64 (a_64, b_64) != (a_64 + b_64))
+    __builtin_abort ();
+  if (sadd64 (a_64, c_64) != c_64)
+    __builtin_abort ();
+  if (sadd64_imm (a_64) != (a_64 + 67))
+    __builtin_abort ();
+  if (sadd64_imm2 (a_64) != (a_64 - 67))
+    __builtin_abort ();
+  if (sadd64 (a_64, -b_64) != (a_64 - b_64))
+    __builtin_abort ();
+  if (sadd64 (a_64, d_64) != (d_64 + 4))
+    __builtin_abort ();
+
+  if (ssub64 (a_64, b_64) != (a_64 - b_64))
+    __builtin_abort ();
+  if (ssub64 (-a_64, c_64) != d_64)
+    __builtin_abort ();
+  if (ssub64_imm (a_64) != (a_64 - 67))
+    __builtin_abort ();
+  if (ssub64_imm2 (a_64) != (a_64 + 67))
+    __builtin_abort ();
+  if (ssub64 (a_64, -b_64) != (a_64 + b_64))
+    __builtin_abort ();
+
+  return 0;
+}
\ No newline at end of file
diff --git a/gcc/testsuite/gcc.target/aarch64/saturating_arithmetic.inc 
b/gcc/testsuite/gcc.target/aarch64/saturating_arithmetic.inc
new file mode 100644
index 
0000000000000000000000000000000000000000..e979d5354057bc3f5861b386ea28a7318c7c533c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/saturating_arithmetic.inc
@@ -0,0 +1,39 @@
+/* Template file for scalar saturating arithmetic validation.
+
+   This file defines scalar saturating addition and subtraction functions for a
+   given type. This type, along with the corresponding minimum and maximum
+   values for that type, must be defined by any test file which includes this
+   template file.  */
+
+#ifndef SAT_ARIT_INC
+#define SAT_ARIT_INC
+
+#include <limits.h>
+
+#ifndef UT
+#define UT unsigned int
+#define UMAX UINT_MAX
+#define UMIN 0
+#endif
+
+UT uadd (UT a, UT b)
+{
+        UT sum = a + b;
+        return sum < a ? UMAX : sum;
+}
+
+UT uadd2 (UT a, UT b)
+{
+        UT c;
+        if (!__builtin_add_overflow(a, b, &c))
+                return c;
+        return UMAX;
+}
+
+UT usub (UT a, UT b)
+{
+        UT sum = a - b;
+        return sum > a ? UMIN : sum;
+}
+
+#endif
\ No newline at end of file
diff --git a/gcc/testsuite/gcc.target/aarch64/saturating_arithmetic_1.c 
b/gcc/testsuite/gcc.target/aarch64/saturating_arithmetic_1.c
new file mode 100644
index 
0000000000000000000000000000000000000000..2ac0c376d126ecc15b37015b1a2e098dfa7f5543
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/saturating_arithmetic_1.c
@@ -0,0 +1,36 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 --save-temps -fno-schedule-insns2" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+/*
+** uadd:
+**     dup     v([0-9]+).8b, w1
+**     dup     v([0-9]+).8b, w0
+**     uqadd   b([0-9]+), (?:b\2, b\1|b\1, b\2)
+**     umov    w0, v\3.b\[0\]
+**     ret
+*/
+/*
+** uadd2:
+**     dup     v([0-9]+).8b, w1
+**     dup     v([0-9]+).8b, w0
+**     uqadd   b([0-9]+), (?:b\2, b\1|b\1, b\2)
+**     umov    w0, v\3.b\[0\]
+**     ret
+*/
+/*
+** usub: { xfail *-*-* }
+**     dup     v([0-9]+).8b, w1
+**     dup     v([0-9]+).8b, w0
+**     uqsub   b([0-9]+), b\1, b\2
+**     umov    w0, v\3.b\[0\]
+**     ret
+*/
+
+#include <limits.h>
+
+#define UT unsigned char
+#define UMAX UCHAR_MAX
+#define UMIN 0
+
+#include "saturating_arithmetic.inc"
\ No newline at end of file
diff --git a/gcc/testsuite/gcc.target/aarch64/saturating_arithmetic_2.c 
b/gcc/testsuite/gcc.target/aarch64/saturating_arithmetic_2.c
new file mode 100644
index 
0000000000000000000000000000000000000000..2a55aa9f2218a168becf3b6b13905c6cd228cacc
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/saturating_arithmetic_2.c
@@ -0,0 +1,36 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 --save-temps -fno-schedule-insns2" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+/*
+** uadd:
+**     dup     v([0-9]+).4h, w1
+**     dup     v([0-9]+).4h, w0
+**     uqadd   h([0-9]+), (?:h\2, h\1|h\1, h\2)
+**     umov    w0, v\3.h\[0\]
+**     ret
+*/
+/*
+** uadd2:
+**     dup     v([0-9]+).4h, w1
+**     dup     v([0-9]+).4h, w0
+**     uqadd   h([0-9]+), (?:h\2, h\1|h\1, h\2)
+**     umov    w0, v\3.h\[0\]
+**     ret
+*/
+/*
+** usub: { xfail *-*-* }
+**     dup     v([0-9]+).4h, w1
+**     dup     v([0-9]+).4h, w0
+**     uqsub   h([0-9]+), h\1, h\2
+**     umov    w0, v\3.h\[0\]
+**     ret
+*/
+
+#include <limits.h>
+
+#define UT unsigned short
+#define UMAX USHRT_MAX
+#define UMIN 0
+
+#include "saturating_arithmetic.inc"
\ No newline at end of file
diff --git a/gcc/testsuite/gcc.target/aarch64/saturating_arithmetic_3.c 
b/gcc/testsuite/gcc.target/aarch64/saturating_arithmetic_3.c
new file mode 100644
index 
0000000000000000000000000000000000000000..215172545196678b759b9fb779b4473f86b2805c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/saturating_arithmetic_3.c
@@ -0,0 +1,30 @@
+/* { dg-do compile { target { aarch64*-*-* } } } */
+/* { dg-options "-O2 --save-temps -ftree-vectorize" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+/*
+** uadd:
+**     adds\tw([0-9]+), w([0-9]+), w([0-9]+)
+**     csinv\tw\1, w\1, wzr, cc
+**     ret
+*/
+/*
+** uadd2:
+**     adds\tw([0-9]+), w([0-9]+), w([0-9]+)
+**     csinv\tw\1, w\1, wzr, cc
+**     ret
+*/
+/*
+** usub:
+**     subs\tw([0-9]+), w([0-9]+), w([0-9]+)
+**     csel\tw\1, w\1, wzr, cs
+**     ret
+*/
+
+#include <limits.h>
+
+#define UT unsigned int
+#define UMAX UINT_MAX
+#define UMIN 0
+
+#include "saturating_arithmetic.inc"
\ No newline at end of file
diff --git a/gcc/testsuite/gcc.target/aarch64/saturating_arithmetic_4.c 
b/gcc/testsuite/gcc.target/aarch64/saturating_arithmetic_4.c
new file mode 100644
index 
0000000000000000000000000000000000000000..363d0a79a730eaf4401f88d66bfea0c61dfc5c82
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/saturating_arithmetic_4.c
@@ -0,0 +1,30 @@
+/* { dg-do compile { target { aarch64*-*-* } } } */
+/* { dg-options "-O2 --save-temps -ftree-vectorize" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+/*
+** uadd:
+**     adds\tx([0-9]+), x([0-9]+), x([0-9]+)
+**     csinv\tx\1, x\1, xzr, cc
+**     ret
+*/
+/*
+** uadd2:
+**     adds\tx([0-9]+), x([0-9]+), x([0-9]+)
+**     csinv\tx\1, x\1, xzr, cc
+**     ret
+*/
+/*
+** usub:
+**     subs\tx([0-9]+), x([0-9]+), x([0-9]+)
+**     csel\tx\1, x\1, xzr, cs
+**     ret
+*/
+
+#include <limits.h>
+
+#define UT unsigned long
+#define UMAX ULONG_MAX
+#define UMIN 0
+
+#include "saturating_arithmetic.inc"
\ No newline at end of file
diff --git a/gcc/testsuite/gcc.target/aarch64/scalar_intrinsics.c 
b/gcc/testsuite/gcc.target/aarch64/scalar_intrinsics.c
index 
c2e13b651e96fdc9c5abeaf154905dce177d9e2f..dcf9dc777adea528c7aa3c96343668f44b7941cb
 100644
--- a/gcc/testsuite/gcc.target/aarch64/scalar_intrinsics.c
+++ b/gcc/testsuite/gcc.target/aarch64/scalar_intrinsics.c
@@ -318,33 +318,33 @@ test_vpaddd_u64 (uint64x2_t a)
 /* { dg-final { scan-assembler-times "\\tuqadd\\td\[0-9\]+" 1 } } */
 
 uint64_t
-test_vqaddd_u64 (uint64_t a, uint64_t b)
+test_vqaddd_u64 (uint64x1_t a, uint64x1_t b)
 {
-  return vqaddd_u64 (a, b);
+  return vqaddd_u64 (a[0], b[0]);
 }
 
 /* { dg-final { scan-assembler-times "\\tuqadd\\ts\[0-9\]+" 1 } } */
 
 uint32_t
-test_vqadds_u32 (uint32_t a, uint32_t b)
+test_vqadds_u32 (uint32x4_t a, uint32x4_t b)
 {
-  return vqadds_u32 (a, b);
+  return vqadds_u32 (a[0], b[0]);
 }
 
 /* { dg-final { scan-assembler-times "\\tuqadd\\th\[0-9\]+" 1 } } */
 
 uint16_t
-test_vqaddh_u16 (uint16_t a, uint16_t b)
+test_vqaddh_u16 (uint16x8_t a, uint16x8_t b)
 {
-  return vqaddh_u16 (a, b);
+  return vqaddh_u16 (a[0], b[0]);
 }
 
 /* { dg-final { scan-assembler-times "\\tuqadd\\tb\[0-9\]+" 1 } } */
 
 uint8_t
-test_vqaddb_u8 (uint8_t a, uint8_t b)
+test_vqaddb_u8 (uint8x16_t a, uint8x16_t b)
 {
-  return vqaddb_u8 (a, b);
+  return vqaddb_u8 (a[0], b[0]);
 }
 
 /* { dg-final { scan-assembler-times "\\tsqadd\\td\[0-9\]+" 1 } } */
@@ -761,33 +761,33 @@ test_vsubd_s64_2 (int64_t a, int64_t b)
 /* { dg-final { scan-assembler-times "\\tuqsub\\td\[0-9\]+" 1 } } */
 
 uint64_t
-test_vqsubd_u64 (uint64_t a, uint64_t b)
+test_vqsubd_u64 (uint64x1_t a, uint64x1_t b)
 {
-  return vqsubd_u64 (a, b);
+  return vqsubd_u64 (a[0], b[0]);
 }
 
 /* { dg-final { scan-assembler-times "\\tuqsub\\ts\[0-9\]+" 1 } } */
 
 uint32_t
-test_vqsubs_u32 (uint32_t a, uint32_t b)
+test_vqsubs_u32 (uint32x4_t a, uint32x4_t b)
 {
-  return vqsubs_u32 (a, b);
+  return vqsubs_u32 (a[0], b[0]);
 }
 
 /* { dg-final { scan-assembler-times "\\tuqsub\\th\[0-9\]+" 1 } } */
 
 uint16_t
-test_vqsubh_u16 (uint16_t a, uint16_t b)
+test_vqsubh_u16 (uint16x8_t a, uint16x8_t b)
 {
-  return vqsubh_u16 (a, b);
+  return vqsubh_u16 (a[0], b[0]);
 }
 
 /* { dg-final { scan-assembler-times "\\tuqsub\\tb\[0-9\]+" 1 } } */
 
 uint8_t
-test_vqsubb_u8 (uint8_t a, uint8_t b)
+test_vqsubb_u8 (uint8x16_t a, uint8x16_t b)
 {
-  return vqsubb_u8 (a, b);
+  return vqsubb_u8 (a[0], b[0]);
 }
 
 /* { dg-final { scan-assembler-times "\\tsqsub\\td\[0-9\]+" 1 } } */

Attachment: rb19159.patch
Description: rb19159.patch

Reply via email to