[gcc r15-7003] AArch64: Use standard names for saturating arithmetic

Tamar Christina via Gcc-cvs Fri, 17 Jan 2025 09:47:43 -0800

https://gcc.gnu.org/g:5f5833a4107ddfbcd87651bf140151de043f4c36


commit r15-7003-g5f5833a4107ddfbcd87651bf140151de043f4c36
Author: Tamar Christina <tamar.christ...@arm.com>
Date:   Fri Jan 17 17:43:49 2025 +0000

    AArch64: Use standard names for saturating arithmetic
    
    This renames the existing {s,u}q{add,sub} instructions to use the
    standard names {s,u}s{add,sub}3 which are used by IFN_SAT_ADD and
    IFN_SAT_SUB.
    
    The NEON intrinsics for saturating arithmetic and their corresponding
    builtins are changed to use these standard names too.
    
    Using the standard names for the instructions causes 32 and 64-bit
    unsigned scalar saturating arithmetic to use the NEON instructions,
    resulting in an additional (and inefficient) FMOV being generated when
    the original operands are in GP registers. This patch therefore also
    restores the original behaviour of using the adds/subs instructions
    in this circumstance.
    
    Additional tests are written for the scalar and Adv. SIMD cases to
    ensure that the correct instructions are used. The NEON intrinsics are
    already tested elsewhere.
    
    gcc/ChangeLog:
    
            * config/aarch64/aarch64-builtins.cc: Expand iterators.
            * config/aarch64/aarch64-simd-builtins.def: Use standard names.
            * config/aarch64/aarch64-simd.md: Use standard names, split insn
            definitions on signedness of operator and type of operands.
            * config/aarch64/arm_neon.h: Use standard builtin names.
            * config/aarch64/iterators.md: Add VSDQ_I_QI_HI iterator to
            simplify splitting of insn for unsigned scalar arithmetic.
    
    gcc/testsuite/ChangeLog:
    
            * gcc.target/aarch64/scalar_intrinsics.c: Update testcases.
            * gcc.target/aarch64/advsimd-intrinsics/saturating_arithmetic_autovect.inc:
            Template file for unsigned vector saturating arithmetic tests.
            * gcc.target/aarch64/advsimd-intrinsics/saturating_arithmetic_autovect_1.c:
            8-bit vector type tests.
            * gcc.target/aarch64/advsimd-intrinsics/saturating_arithmetic_autovect_2.c:
            16-bit vector type tests.
            * gcc.target/aarch64/advsimd-intrinsics/saturating_arithmetic_autovect_3.c:
            32-bit vector type tests.
            * gcc.target/aarch64/advsimd-intrinsics/saturating_arithmetic_autovect_4.c:
            64-bit vector type tests.
            * gcc.target/aarch64/saturating_arithmetic.inc: Template file
            for scalar saturating arithmetic tests.
            * gcc.target/aarch64/saturating_arithmetic_1.c: 8-bit tests.
            * gcc.target/aarch64/saturating_arithmetic_2.c: 16-bit tests.
            * gcc.target/aarch64/saturating_arithmetic_3.c: 32-bit tests.
            * gcc.target/aarch64/saturating_arithmetic_4.c: 64-bit tests.
    
    Co-authored-by: Tamar Christina <tamar.christ...@arm.com>

Diff:
---
 gcc/config/aarch64/aarch64-builtins.cc             |  12 +
 gcc/config/aarch64/aarch64-simd-builtins.def       |   8 +-
 gcc/config/aarch64/aarch64-simd.md                 | 207 +++++++++++++++-
 gcc/config/aarch64/arm_neon.h                      |  96 ++++----
 gcc/config/aarch64/iterators.md                    |   4 +
 .../saturating_arithmetic_autovect.inc             |  58 +++++
 .../saturating_arithmetic_autovect_1.c             |  79 ++++++
 .../saturating_arithmetic_autovect_2.c             |  79 ++++++
 .../saturating_arithmetic_autovect_3.c             |  75 ++++++
 .../saturating_arithmetic_autovect_4.c             |  77 ++++++
 .../aarch64/saturating-arithmetic-signed.c         | 270 +++++++++++++++++++++
 .../gcc.target/aarch64/saturating_arithmetic.inc   |  39 +++
 .../gcc.target/aarch64/saturating_arithmetic_1.c   |  36 +++
 .../gcc.target/aarch64/saturating_arithmetic_2.c   |  36 +++
 .../gcc.target/aarch64/saturating_arithmetic_3.c   |  30 +++
 .../gcc.target/aarch64/saturating_arithmetic_4.c   |  30 +++
 .../gcc.target/aarch64/scalar_intrinsics.c         |  32 +--
 17 files changed, 1096 insertions(+), 72 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-builtins.cc 
b/gcc/config/aarch64/aarch64-builtins.cc
index 86eebc168859..6d5479c2e449 100644
--- a/gcc/config/aarch64/aarch64-builtins.cc
+++ b/gcc/config/aarch64/aarch64-builtins.cc
@@ -5039,6 +5039,18 @@ aarch64_general_gimple_fold_builtin (unsigned int fcode, 
gcall *stmt,
          new_stmt = gimple_build_assign (gimple_call_lhs (stmt),
                                          LSHIFT_EXPR, args[0], args[1]);
        break;
+      /* lower saturating add/sub neon builtins to gimple.  */
+      BUILTIN_VSDQ_I (BINOP, ssadd, 3, DEFAULT)
+      BUILTIN_VSDQ_I (BINOPU, usadd, 3, DEFAULT)
+       new_stmt = gimple_build_call_internal (IFN_SAT_ADD, 2, args[0], 
args[1]);
+       gimple_call_set_lhs (new_stmt, gimple_call_lhs (stmt));
+       break;
+      BUILTIN_VSDQ_I (BINOP, sssub, 3, DEFAULT)
+      BUILTIN_VSDQ_I (BINOPU, ussub, 3, DEFAULT)
+       new_stmt = gimple_build_call_internal (IFN_SAT_SUB, 2, args[0], 
args[1]);
+       gimple_call_set_lhs (new_stmt, gimple_call_lhs (stmt));
+       break;
+
       BUILTIN_VSDQ_I_DI (BINOP, sshl, 0, DEFAULT)
       BUILTIN_VSDQ_I_DI (BINOP_UUS, ushl, 0, DEFAULT)
        {
diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def 
b/gcc/config/aarch64/aarch64-simd-builtins.def
index 286272a33118..6cc45b18a723 100644
--- a/gcc/config/aarch64/aarch64-simd-builtins.def
+++ b/gcc/config/aarch64/aarch64-simd-builtins.def
@@ -71,10 +71,10 @@
   BUILTIN_VSDQ_I (BINOP, sqrshl, 0, DEFAULT)
   BUILTIN_VSDQ_I (BINOP_UUS, uqrshl, 0, DEFAULT)
   /* Implemented by aarch64_<su_optab><optab><mode>.  */
-  BUILTIN_VSDQ_I (BINOP, sqadd, 0, DEFAULT)
-  BUILTIN_VSDQ_I (BINOPU, uqadd, 0, DEFAULT)
-  BUILTIN_VSDQ_I (BINOP, sqsub, 0, DEFAULT)
-  BUILTIN_VSDQ_I (BINOPU, uqsub, 0, DEFAULT)
+  BUILTIN_VSDQ_I (BINOP, ssadd, 3, DEFAULT)
+  BUILTIN_VSDQ_I (BINOPU, usadd, 3, DEFAULT)
+  BUILTIN_VSDQ_I (BINOP, sssub, 3, DEFAULT)
+  BUILTIN_VSDQ_I (BINOPU, ussub, 3, DEFAULT)
   /* Implemented by aarch64_<sur>qadd<mode>.  */
   BUILTIN_VSDQ_I (BINOP_SSU, suqadd, 0, DEFAULT)
   BUILTIN_VSDQ_I (BINOP_UUS, usqadd, 0, DEFAULT)
diff --git a/gcc/config/aarch64/aarch64-simd.md 
b/gcc/config/aarch64/aarch64-simd.md
index eeb626f129a8..e2afe87e5130 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -5162,15 +5162,214 @@
 )
 ;; <su>q<addsub>
 
-(define_insn "aarch64_<su_optab>q<addsub><mode><vczle><vczbe>"
-  [(set (match_operand:VSDQ_I 0 "register_operand" "=w")
-       (BINQOPS:VSDQ_I (match_operand:VSDQ_I 1 "register_operand" "w")
-                       (match_operand:VSDQ_I 2 "register_operand" "w")))]
+(define_insn "<su_optab>s<addsub><mode>3<vczle><vczbe>"
+  [(set (match_operand:VSDQ_I_QI_HI 0 "register_operand" "=w")
+       (BINQOPS:VSDQ_I_QI_HI
+         (match_operand:VSDQ_I_QI_HI 1 "register_operand" "w")
+         (match_operand:VSDQ_I_QI_HI 2 "register_operand" "w")))]
   "TARGET_SIMD"
   "<su_optab>q<addsub>\\t%<v>0<Vmtype>, %<v>1<Vmtype>, %<v>2<Vmtype>"
   [(set_attr "type" "neon_q<addsub><q>")]
 )
 
+(define_expand "<su_optab>s<addsub><mode>3"
+  [(parallel
+    [(set (match_operand:GPI 0 "register_operand")
+         (SBINQOPS:GPI (match_operand:GPI 1 "register_operand")
+                       (match_operand:GPI 2 "aarch64_plus_operand")))
+    (clobber (scratch:GPI))
+    (clobber (reg:CC CC_REGNUM))])]
+)
+
+;; Introducing a temporary GP reg allows signed saturating arithmetic with GPR
+;; operands to be calculated without the use of costly transfers to and from FP
+;; registers.  For example, saturating addition usually uses three FMOVs:
+;;
+;;   fmov      d0, x0
+;;   fmov      d1, x1
+;;   sqadd     d0, d0, d1
+;;   fmov      x0, d0
+;;
+;; Using a temporary register instead replaces the three FMOVs with three
+;; cheaper instructions, which calculate the saturating limit accounting for
+;; the signedness of operand2:
+;;
+;;   asr       x2, x1, 63
+;;   adds      x0, x0, x1
+;;   eor       x2, x2, 0x8000000000000000
+;;   csinv     x0, x0, x2, vc
+;;
+;; If operand2 is a constant value, the temporary register can be used to store
+;; the saturating limit without the need for asr, xor to calculate said limit.
+
+(define_insn_and_split "aarch64_<su_optab>s<addsub><mode>3<vczle><vczbe>"
+  [(set (match_operand:GPI 0 "register_operand")
+       (SBINQOPS:GPI (match_operand:GPI 1 "register_operand")
+                     (match_operand:GPI 2 "aarch64_plus_operand")))
+    (clobber (match_scratch:GPI 3))
+    (clobber (reg:CC CC_REGNUM))]
+  ""
+  {@ [ cons: =0, 1 , 2   , =3 ; attrs: type       , arch , length ]
+     [ w       , w , w   , X  ; neon_q<addsub><q> , simd , 4      ] 
<su_optab>q<addsub>\\t%<v>0<Vmtype>, %<v>1<Vmtype>, %<v>2<Vmtype>
+     [ r       , r , JIr , &r ; *                , *    , 8      ] #
+  }
+  "&& reload_completed && GP_REGNUM_P (REGNO (operands[0]))"
+  [(set (match_dup 0)
+       (if_then_else:GPI
+         (match_dup 4)
+         (match_dup 5)
+         (match_dup 6)))]
+  {
+    if (REG_P (operands[2]))
+      {
+      rtx shift_constant = gen_int_mode (GET_MODE_BITSIZE (<MODE>mode) - 1,
+                                        <MODE>mode);
+      auto limit = HOST_WIDE_INT_1U << (GET_MODE_BITSIZE (<MODE>mode) - 1);
+      rtx limit_constant = gen_int_mode (limit, <MODE>mode);
+      emit_insn (gen_ashr<mode>3 (operands[3], operands[2], shift_constant));
+      emit_insn (gen_xor<mode>3 (operands[3], operands[3], limit_constant));
+
+      switch (<CODE>)
+       {
+       case SS_MINUS:
+         emit_insn (gen_sub<mode>3_compare1 (operands[0], operands[1],
+                                             operands[2]));
+       break;
+       case SS_PLUS:
+         emit_insn (gen_add<mode>3_compare0 (operands[0], operands[1],
+                                             operands[2]));
+         break;
+       default:
+         gcc_unreachable ();
+       }
+
+      rtx ccin = gen_rtx_REG (E_CC_Vmode, CC_REGNUM);
+      switch (<CODE>)
+       {
+       case SS_PLUS:
+         operands[4] = gen_rtx_NE (<MODE>mode, ccin, const0_rtx);
+         operands[5] = gen_rtx_NOT (<MODE>mode, operands[3]);
+         operands[6] = operands[0];
+         break;
+       case SS_MINUS:
+         operands[4] = gen_rtx_EQ (<MODE>mode, ccin, const0_rtx);
+         operands[5] = operands[0];
+         operands[6] = operands[3];
+         break;
+       default:
+         gcc_unreachable ();
+       }
+      }
+    else
+      {
+       auto imm = INTVAL (operands[2]);
+       rtx neg_imm = gen_int_mode (-imm, <MODE>mode);
+       wide_int limit;
+
+       switch (<CODE>)
+         {
+         case SS_MINUS:
+           emit_insn (gen_sub<mode>3_compare1_imm (operands[0], operands[1],
+                                                   operands[2], neg_imm));
+           limit = imm >= 0 ? wi::min_value (<MODE>mode, SIGNED)
+                            : wi::max_value (<MODE>mode, SIGNED);
+           break;
+         case SS_PLUS:
+           emit_insn (gen_sub<mode>3_compare1_imm (operands[0], operands[1],
+                                                   neg_imm, operands[2]));
+           limit = imm >= 0 ? wi::max_value (<MODE>mode, SIGNED)
+                            : wi::min_value (<MODE>mode, SIGNED);
+           break;
+         default:
+           gcc_unreachable ();
+         }
+
+      rtx sat_limit = immed_wide_int_const (limit, <MODE>mode);
+      emit_insn (gen_rtx_SET (operands[3], sat_limit));
+
+      rtx ccin = gen_rtx_REG (E_CC_Vmode, CC_REGNUM);
+      operands[4] = gen_rtx_EQ (<MODE>mode, ccin, const0_rtx);
+      operands[5] = operands[0];
+      operands[6] = operands[3];
+      }
+  }
+)
+
+;; Unsigned saturating arithmetic with GPR operands can be optimised similarly
+;; to the signed case, albeit without the need for a temporary register as the
+;; saturating limit can be inferred from the <addsub> code.  This applies only
+;; to SImode and DImode.
+
+(define_insn_and_split "<su_optab>s<addsub><mode>3<vczle><vczbe>"
+  [(set (match_operand:GPI 0 "register_operand")
+       (UBINQOPS:GPI (match_operand:GPI 1 "register_operand")
+                     (match_operand:GPI 2 "aarch64_plus_operand")))
+    (clobber (reg:CC CC_REGNUM))]
+  ""
+  {@ [ cons: =0, 1 , 2   ; attrs: type       , arch , length ]
+     [ w       , w , w   ; neon_q<addsub><q> , simd , 4      ] 
<su_optab>q<addsub>\\t%<v>0<Vmtype>, %<v>1<Vmtype>, %<v>2<Vmtype>
+     [ r       , r , JIr ; *                , *    , 8      ] #
+  }
+  "&& reload_completed && GP_REGNUM_P (REGNO (operands[0]))"
+  [(set (match_dup 0)
+       (if_then_else:GPI
+         (match_dup 3)
+         (match_dup 0)
+         (match_dup 4)))]
+  {
+
+    if (REG_P (operands[2]))
+      {
+       switch (<CODE>)
+         {
+         case US_MINUS:
+           emit_insn (gen_sub<mode>3_compare1 (operands[0], operands[1],
+                                               operands[2]));
+           break;
+         case US_PLUS:
+           emit_insn (gen_add<mode>3_compare0 (operands[0], operands[1],
+                                               operands[2]));
+           break;
+         default:
+           gcc_unreachable ();
+         }
+      }
+    else
+      {
+       auto imm = UINTVAL (operands[2]);
+       rtx neg_imm = gen_int_mode (-imm, <MODE>mode);
+       switch (<CODE>)
+         {
+         case US_MINUS:
+           emit_insn (gen_sub<mode>3_compare1_imm (operands[0], operands[1],
+                                                   operands[2], neg_imm));
+           break;
+         case US_PLUS:
+           emit_insn (gen_sub<mode>3_compare1_imm (operands[0], operands[1],
+                                                   neg_imm, operands[2]));
+           break;
+         default:
+           gcc_unreachable ();
+         }
+      }
+
+    rtx ccin = gen_rtx_REG (CCmode, CC_REGNUM);
+    switch (<CODE>)
+      {
+      case US_PLUS:
+       operands[3] = gen_rtx_LTU (<MODE>mode, ccin, const0_rtx);
+       operands[4] = gen_int_mode (-1, <MODE>mode);
+       break;
+      case US_MINUS:
+       operands[3] = gen_rtx_GEU (<MODE>mode, ccin, const0_rtx);
+       operands[4] = const0_rtx;
+       break;
+      default:
+       gcc_unreachable ();
+      }
+  }
+)
+
 ;; suqadd and usqadd
 
 (define_insn "aarch64_<sur>qadd<mode><vczle><vczbe>"
diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index 33594cb65d28..4899acead9b7 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -1864,35 +1864,35 @@ __extension__ extern __inline int8x8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqadd_s8 (int8x8_t __a, int8x8_t __b)
 {
-  return (int8x8_t) __builtin_aarch64_sqaddv8qi (__a, __b);
+  return (int8x8_t) __builtin_aarch64_ssaddv8qi (__a, __b);
 }
 
 __extension__ extern __inline int16x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqadd_s16 (int16x4_t __a, int16x4_t __b)
 {
-  return (int16x4_t) __builtin_aarch64_sqaddv4hi (__a, __b);
+  return (int16x4_t) __builtin_aarch64_ssaddv4hi (__a, __b);
 }
 
 __extension__ extern __inline int32x2_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqadd_s32 (int32x2_t __a, int32x2_t __b)
 {
-  return (int32x2_t) __builtin_aarch64_sqaddv2si (__a, __b);
+  return (int32x2_t) __builtin_aarch64_ssaddv2si (__a, __b);
 }
 
 __extension__ extern __inline int64x1_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqadd_s64 (int64x1_t __a, int64x1_t __b)
 {
-  return (int64x1_t) {__builtin_aarch64_sqadddi (__a[0], __b[0])};
+  return (int64x1_t) {__builtin_aarch64_ssadddi (__a[0], __b[0])};
 }
 
 __extension__ extern __inline uint8x8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqadd_u8 (uint8x8_t __a, uint8x8_t __b)
 {
-  return __builtin_aarch64_uqaddv8qi_uuu (__a, __b);
+  return __builtin_aarch64_usaddv8qi_uuu (__a, __b);
 }
 
 __extension__ extern __inline int8x8_t
@@ -2151,189 +2151,189 @@ __extension__ extern __inline uint16x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqadd_u16 (uint16x4_t __a, uint16x4_t __b)
 {
-  return __builtin_aarch64_uqaddv4hi_uuu (__a, __b);
+  return __builtin_aarch64_usaddv4hi_uuu (__a, __b);
 }
 
 __extension__ extern __inline uint32x2_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqadd_u32 (uint32x2_t __a, uint32x2_t __b)
 {
-  return __builtin_aarch64_uqaddv2si_uuu (__a, __b);
+  return __builtin_aarch64_usaddv2si_uuu (__a, __b);
 }
 
 __extension__ extern __inline uint64x1_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqadd_u64 (uint64x1_t __a, uint64x1_t __b)
 {
-  return (uint64x1_t) {__builtin_aarch64_uqadddi_uuu (__a[0], __b[0])};
+  return (uint64x1_t) {__builtin_aarch64_usadddi_uuu (__a[0], __b[0])};
 }
 
 __extension__ extern __inline int8x16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqaddq_s8 (int8x16_t __a, int8x16_t __b)
 {
-  return (int8x16_t) __builtin_aarch64_sqaddv16qi (__a, __b);
+  return (int8x16_t) __builtin_aarch64_ssaddv16qi (__a, __b);
 }
 
 __extension__ extern __inline int16x8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqaddq_s16 (int16x8_t __a, int16x8_t __b)
 {
-  return (int16x8_t) __builtin_aarch64_sqaddv8hi (__a, __b);
+  return (int16x8_t) __builtin_aarch64_ssaddv8hi (__a, __b);
 }
 
 __extension__ extern __inline int32x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqaddq_s32 (int32x4_t __a, int32x4_t __b)
 {
-  return (int32x4_t) __builtin_aarch64_sqaddv4si (__a, __b);
+  return (int32x4_t) __builtin_aarch64_ssaddv4si (__a, __b);
 }
 
 __extension__ extern __inline int64x2_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqaddq_s64 (int64x2_t __a, int64x2_t __b)
 {
-  return (int64x2_t) __builtin_aarch64_sqaddv2di (__a, __b);
+  return (int64x2_t) __builtin_aarch64_ssaddv2di (__a, __b);
 }
 
 __extension__ extern __inline uint8x16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqaddq_u8 (uint8x16_t __a, uint8x16_t __b)
 {
-  return __builtin_aarch64_uqaddv16qi_uuu (__a, __b);
+  return __builtin_aarch64_usaddv16qi_uuu (__a, __b);
 }
 
 __extension__ extern __inline uint16x8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqaddq_u16 (uint16x8_t __a, uint16x8_t __b)
 {
-  return __builtin_aarch64_uqaddv8hi_uuu (__a, __b);
+  return __builtin_aarch64_usaddv8hi_uuu (__a, __b);
 }
 
 __extension__ extern __inline uint32x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqaddq_u32 (uint32x4_t __a, uint32x4_t __b)
 {
-  return __builtin_aarch64_uqaddv4si_uuu (__a, __b);
+  return __builtin_aarch64_usaddv4si_uuu (__a, __b);
 }
 
 __extension__ extern __inline uint64x2_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqaddq_u64 (uint64x2_t __a, uint64x2_t __b)
 {
-  return __builtin_aarch64_uqaddv2di_uuu (__a, __b);
+  return __builtin_aarch64_usaddv2di_uuu (__a, __b);
 }
 
 __extension__ extern __inline int8x8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqsub_s8 (int8x8_t __a, int8x8_t __b)
 {
-  return (int8x8_t) __builtin_aarch64_sqsubv8qi (__a, __b);
+  return (int8x8_t) __builtin_aarch64_sssubv8qi (__a, __b);
 }
 
 __extension__ extern __inline int16x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqsub_s16 (int16x4_t __a, int16x4_t __b)
 {
-  return (int16x4_t) __builtin_aarch64_sqsubv4hi (__a, __b);
+  return (int16x4_t) __builtin_aarch64_sssubv4hi (__a, __b);
 }
 
 __extension__ extern __inline int32x2_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqsub_s32 (int32x2_t __a, int32x2_t __b)
 {
-  return (int32x2_t) __builtin_aarch64_sqsubv2si (__a, __b);
+  return (int32x2_t) __builtin_aarch64_sssubv2si (__a, __b);
 }
 
 __extension__ extern __inline int64x1_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqsub_s64 (int64x1_t __a, int64x1_t __b)
 {
-  return (int64x1_t) {__builtin_aarch64_sqsubdi (__a[0], __b[0])};
+  return (int64x1_t) {__builtin_aarch64_sssubdi (__a[0], __b[0])};
 }
 
 __extension__ extern __inline uint8x8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqsub_u8 (uint8x8_t __a, uint8x8_t __b)
 {
-  return __builtin_aarch64_uqsubv8qi_uuu (__a, __b);
+  return __builtin_aarch64_ussubv8qi_uuu (__a, __b);
 }
 
 __extension__ extern __inline uint16x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqsub_u16 (uint16x4_t __a, uint16x4_t __b)
 {
-  return __builtin_aarch64_uqsubv4hi_uuu (__a, __b);
+  return __builtin_aarch64_ussubv4hi_uuu (__a, __b);
 }
 
 __extension__ extern __inline uint32x2_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqsub_u32 (uint32x2_t __a, uint32x2_t __b)
 {
-  return __builtin_aarch64_uqsubv2si_uuu (__a, __b);
+  return __builtin_aarch64_ussubv2si_uuu (__a, __b);
 }
 
 __extension__ extern __inline uint64x1_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqsub_u64 (uint64x1_t __a, uint64x1_t __b)
 {
-  return (uint64x1_t) {__builtin_aarch64_uqsubdi_uuu (__a[0], __b[0])};
+  return (uint64x1_t) {__builtin_aarch64_ussubdi_uuu (__a[0], __b[0])};
 }
 
 __extension__ extern __inline int8x16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqsubq_s8 (int8x16_t __a, int8x16_t __b)
 {
-  return (int8x16_t) __builtin_aarch64_sqsubv16qi (__a, __b);
+  return (int8x16_t) __builtin_aarch64_sssubv16qi (__a, __b);
 }
 
 __extension__ extern __inline int16x8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqsubq_s16 (int16x8_t __a, int16x8_t __b)
 {
-  return (int16x8_t) __builtin_aarch64_sqsubv8hi (__a, __b);
+  return (int16x8_t) __builtin_aarch64_sssubv8hi (__a, __b);
 }
 
 __extension__ extern __inline int32x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqsubq_s32 (int32x4_t __a, int32x4_t __b)
 {
-  return (int32x4_t) __builtin_aarch64_sqsubv4si (__a, __b);
+  return (int32x4_t) __builtin_aarch64_sssubv4si (__a, __b);
 }
 
 __extension__ extern __inline int64x2_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqsubq_s64 (int64x2_t __a, int64x2_t __b)
 {
-  return (int64x2_t) __builtin_aarch64_sqsubv2di (__a, __b);
+  return (int64x2_t) __builtin_aarch64_sssubv2di (__a, __b);
 }
 
 __extension__ extern __inline uint8x16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqsubq_u8 (uint8x16_t __a, uint8x16_t __b)
 {
-  return __builtin_aarch64_uqsubv16qi_uuu (__a, __b);
+  return __builtin_aarch64_ussubv16qi_uuu (__a, __b);
 }
 
 __extension__ extern __inline uint16x8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqsubq_u16 (uint16x8_t __a, uint16x8_t __b)
 {
-  return __builtin_aarch64_uqsubv8hi_uuu (__a, __b);
+  return __builtin_aarch64_ussubv8hi_uuu (__a, __b);
 }
 
 __extension__ extern __inline uint32x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqsubq_u32 (uint32x4_t __a, uint32x4_t __b)
 {
-  return __builtin_aarch64_uqsubv4si_uuu (__a, __b);
+  return __builtin_aarch64_ussubv4si_uuu (__a, __b);
 }
 
 __extension__ extern __inline uint64x2_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqsubq_u64 (uint64x2_t __a, uint64x2_t __b)
 {
-  return __builtin_aarch64_uqsubv2di_uuu (__a, __b);
+  return __builtin_aarch64_ussubv2di_uuu (__a, __b);
 }
 
 __extension__ extern __inline int8x8_t
@@ -17543,56 +17543,56 @@ __extension__ extern __inline int8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqaddb_s8 (int8_t __a, int8_t __b)
 {
-  return (int8_t) __builtin_aarch64_sqaddqi (__a, __b);
+  return (int8_t) __builtin_aarch64_ssaddqi (__a, __b);
 }
 
 __extension__ extern __inline int16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqaddh_s16 (int16_t __a, int16_t __b)
 {
-  return (int16_t) __builtin_aarch64_sqaddhi (__a, __b);
+  return (int16_t) __builtin_aarch64_ssaddhi (__a, __b);
 }
 
 __extension__ extern __inline int32_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqadds_s32 (int32_t __a, int32_t __b)
 {
-  return (int32_t) __builtin_aarch64_sqaddsi (__a, __b);
+  return (int32_t) __builtin_aarch64_ssaddsi (__a, __b);
 }
 
 __extension__ extern __inline int64_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqaddd_s64 (int64_t __a, int64_t __b)
 {
-  return __builtin_aarch64_sqadddi (__a, __b);
+  return __builtin_aarch64_ssadddi (__a, __b);
 }
 
 __extension__ extern __inline uint8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqaddb_u8 (uint8_t __a, uint8_t __b)
 {
-  return (uint8_t) __builtin_aarch64_uqaddqi_uuu (__a, __b);
+  return (uint8_t) __builtin_aarch64_usaddqi_uuu (__a, __b);
 }
 
 __extension__ extern __inline uint16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqaddh_u16 (uint16_t __a, uint16_t __b)
 {
-  return (uint16_t) __builtin_aarch64_uqaddhi_uuu (__a, __b);
+  return (uint16_t) __builtin_aarch64_usaddhi_uuu (__a, __b);
 }
 
 __extension__ extern __inline uint32_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqadds_u32 (uint32_t __a, uint32_t __b)
 {
-  return (uint32_t) __builtin_aarch64_uqaddsi_uuu (__a, __b);
+  return (uint32_t) __builtin_aarch64_usaddsi_uuu (__a, __b);
 }
 
 __extension__ extern __inline uint64_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqaddd_u64 (uint64_t __a, uint64_t __b)
 {
-  return __builtin_aarch64_uqadddi_uuu (__a, __b);
+  return __builtin_aarch64_usadddi_uuu (__a, __b);
 }
 
 /* vqdmlal */
@@ -19242,56 +19242,56 @@ __extension__ extern __inline int8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqsubb_s8 (int8_t __a, int8_t __b)
 {
-  return (int8_t) __builtin_aarch64_sqsubqi (__a, __b);
+  return (int8_t) __builtin_aarch64_sssubqi (__a, __b);
 }
 
 __extension__ extern __inline int16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqsubh_s16 (int16_t __a, int16_t __b)
 {
-  return (int16_t) __builtin_aarch64_sqsubhi (__a, __b);
+  return (int16_t) __builtin_aarch64_sssubhi (__a, __b);
 }
 
 __extension__ extern __inline int32_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqsubs_s32 (int32_t __a, int32_t __b)
 {
-  return (int32_t) __builtin_aarch64_sqsubsi (__a, __b);
+  return (int32_t) __builtin_aarch64_sssubsi (__a, __b);
 }
 
 __extension__ extern __inline int64_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqsubd_s64 (int64_t __a, int64_t __b)
 {
-  return __builtin_aarch64_sqsubdi (__a, __b);
+  return __builtin_aarch64_sssubdi (__a, __b);
 }
 
 __extension__ extern __inline uint8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqsubb_u8 (uint8_t __a, uint8_t __b)
 {
-  return (uint8_t) __builtin_aarch64_uqsubqi_uuu (__a, __b);
+  return (uint8_t) __builtin_aarch64_ussubqi_uuu (__a, __b);
 }
 
 __extension__ extern __inline uint16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqsubh_u16 (uint16_t __a, uint16_t __b)
 {
-  return (uint16_t) __builtin_aarch64_uqsubhi_uuu (__a, __b);
+  return (uint16_t) __builtin_aarch64_ussubhi_uuu (__a, __b);
 }
 
 __extension__ extern __inline uint32_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqsubs_u32 (uint32_t __a, uint32_t __b)
 {
-  return (uint32_t) __builtin_aarch64_uqsubsi_uuu (__a, __b);
+  return (uint32_t) __builtin_aarch64_ussubsi_uuu (__a, __b);
 }
 
 __extension__ extern __inline uint64_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqsubd_u64 (uint64_t __a, uint64_t __b)
 {
-  return __builtin_aarch64_uqsubdi_uuu (__a, __b);
+  return __builtin_aarch64_ussubdi_uuu (__a, __b);
 }
 
 /* vqtbl2 */
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index ff0f34dd0430..2f7aa489ae8b 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -95,6 +95,10 @@
 ;; integer modes; 64-bit scalar integer mode.
 (define_mode_iterator VSDQ_I_DI [V8QI V16QI V4HI V8HI V2SI V4SI V2DI DI])
 
+;; Advanced SIMD and scalar, 64 & 128-bit container; 8 and 16-bit scalar
+;; integer modes.
+(define_mode_iterator VSDQ_I_QI_HI [VDQ_I HI QI])
+
 ;; Double vector modes.
 (define_mode_iterator VD [V8QI V4HI V4HF V2SI V2SF V4BF])
 
diff --git 
a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/saturating_arithmetic_autovect.inc
 
b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/saturating_arithmetic_autovect.inc
new file mode 100644
index 000000000000..1fadfd587555
--- /dev/null
+++ 
b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/saturating_arithmetic_autovect.inc
@@ -0,0 +1,58 @@
+/* Template file for vector saturating arithmetic validation.
+
+   This file defines saturating addition and subtraction functions for a given
+   scalar type, testing the auto-vectorization of these two operators. This
+   type, along with the corresponding minimum and maximum values for that type,
+   must be defined by any test file which includes this template file.  */
+
+#ifndef SAT_ARIT_AUTOVEC_INC
+#define SAT_ARIT_AUTOVEC_INC
+
+#include <limits.h>
+#include <arm_neon.h>
+
+#ifndef UT
+#define UT unsigned int
+#define VT uint32x4_t
+#define UMAX UINT_MAX
+#define UMIN 0
+#endif
+
+
+UT uadd_lane (UT a, VT b)
+{
+  UT sum = a + b[0];
+  return sum < a ? UMAX : sum;
+}
+
+void uaddq (UT *out, UT *a, UT *b, int n)
+{
+  for (int i = 0; i < n; i++)
+    {
+      UT sum = a[i] + b[i];
+      out[i] = sum < a[i] ? UMAX : sum;
+    }
+}
+
+void uaddq2 (UT *out, UT *a, UT *b, int n)
+{
+  for (int i = 0; i < n; i++)
+    {
+      UT sum;
+      if (!__builtin_add_overflow(a[i], b[i], &sum))
+        out[i] = sum;
+      else
+        out[i] = UMAX;
+    }
+}
+
+void usubq (UT *out, UT *a, UT *b, int n)
+{
+  for (int i = 0; i < n; i++)
+    {
+      UT sum = a[i] - b[i];
+      out[i] = sum > a[i] ? UMIN : sum;
+    }
+}
+
+#endif
\ No newline at end of file
diff --git 
a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/saturating_arithmetic_autovect_1.c
 
b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/saturating_arithmetic_autovect_1.c
new file mode 100644
index 000000000000..2b72be7b0d7c
--- /dev/null
+++ 
b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/saturating_arithmetic_autovect_1.c
@@ -0,0 +1,79 @@
+/* { dg-do assemble { target { aarch64*-*-* } } } */
+/* { dg-options "-O2 --save-temps -ftree-vectorize" } */
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+/*
+** uadd_lane: { xfail *-*-* }
+**     dup\tv([0-9]+).8b, w0
+**     uqadd\tb([0-9]+), (?:b\1, b0|b0, b\1)
+**     umov\tw0, v\2.b\[0\]
+**     ret
+*/
+/*
+** uaddq:
+** ...
+**     ldr\tq([0-9]+), .*
+**     ldr\tq([0-9]+), .*
+**     uqadd\tv[0-9]+.16b, (?:v\1.16b, v\2.16b|v\2.16b, v\1.16b)
+** ...
+**     ldr\td([0-9]+), .*
+**     ldr\td([0-9]+), .*
+**     uqadd\tv[0-9]+.8b, (?:v\3.8b, v\4.8b|v\4.8b, v\3.8b)
+** ...
+**     ldr\tb([0-9]+), .*
+**     ldr\tb([0-9]+), .*
+**     uqadd\tb[0-9]+, (?:b\5, b\6|b\6, b\5)
+** ...
+**     ldr\tb([0-9]+), .*
+**     ldr\tb([0-9]+), .*
+**     uqadd\tb[0-9]+, (?:b\7, b\8|b\8, b\7)
+** ...
+*/
+/*
+** uaddq2:
+** ...
+**     ldr\tq([0-9]+), .*
+**     ldr\tq([0-9]+), .*
+**     uqadd\tv[0-9]+.16b, (?:v\1.16b, v\2.16b|v\2.16b, v\1.16b)
+** ...
+**     ldr\td([0-9]+), .*
+**     ldr\td([0-9]+), .*
+**     uqadd\tv[0-9]+.8b, (?:v\3.8b, v\4.8b|v\4.8b, v\3.8b)
+** ...
+**     ldr\tb([0-9]+), .*
+**     ldr\tb([0-9]+), .*
+**     uqadd\tb[0-9]+, (?:b\5, b\6|b\6, b\5)
+** ...
+**     uqadd\tb([0-9]+), (?:b[0-9]+, b\7|b\7, b[0-9]+)
+** ...
+*/
+/*
+** usubq: { xfail *-*-* }
+** ...
+**     ldr\tq([0-9]+), .*
+**     ldr\tq([0-9]+), .*
+**     uqsub\tv[0-9]+.16b, v\1.16b, v\2.16b
+** ...
+**     ldr\td([0-9]+), .*
+**     ldr\td([0-9]+), .*
+**     uqsub\tv[0-9]+.8b, v\3.8b, v\4.8b
+** ...
+**     ldr\tb([0-9]+), .*
+**     ldr\tb([0-9]+), .*
+**     uqsub\tb[0-9]+, b\5, b\6
+** ...
+**     ldr\tb([0-9]+), .*
+**     ldr\tb([0-9]+), .*
+**     uqsub\tb[0-9]+, b\7, b\8
+** ...
+*/
+
+#include <limits.h>
+#include <arm_neon.h>
+
+#define UT unsigned char
+#define VT uint8x8_t
+#define UMAX UCHAR_MAX
+#define UMIN 0
+
+#include "saturating_arithmetic_autovect.inc"
\ No newline at end of file
diff --git 
a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/saturating_arithmetic_autovect_2.c
 
b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/saturating_arithmetic_autovect_2.c
new file mode 100644
index 000000000000..0640361498f0
--- /dev/null
+++ 
b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/saturating_arithmetic_autovect_2.c
@@ -0,0 +1,79 @@
+/* { dg-do assemble { target { aarch64*-*-* } } } */
+/* { dg-options "-O2 --save-temps -ftree-vectorize" } */
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+/*
+** uadd_lane: { xfail *-*-* }
+**     dup\tv([0-9]+).4h, w0
+**     uqadd\th([0-9]+), (?:h\1, h0|h0, h\1)
+**     umov\tw0, v\2.h\[0\]
+**     ret
+*/
+/*
+** uaddq:
+** ...
+**     ldr\tq([0-9]+), .*
+**     ldr\tq([0-9]+), .*
+**     uqadd\tv[0-9]+.8h, (?:v\1.8h, v\2.8h|v\2.8h, v\1.8h)
+** ...
+**     ldr\td([0-9]+), .*
+**     ldr\td([0-9]+), .*
+**     uqadd\tv[0-9]+.4h, (?:v\3.4h, v\4.4h|v\4.4h, v\3.4h)
+** ...
+**     ldr\th([0-9]+), .*
+**     ldr\th([0-9]+), .*
+**     uqadd\th[0-9]+, (?:h\5, h\6|h\6, h\5)
+** ...
+**     ldr\th([0-9]+), .*
+**     ldr\th([0-9]+), .*
+**     uqadd\th[0-9]+, (?:h\7, h\8|h\8, h\7)
+** ...
+*/
+/*
+** uaddq2:
+** ...
+**     ldr\tq([0-9]+), .*
+**     ldr\tq([0-9]+), .*
+**     uqadd\tv[0-9]+.8h, (?:v\1.8h, v\2.8h|v\2.8h, v\1.8h)
+** ...
+**     ldr\td([0-9]+), .*
+**     ldr\td([0-9]+), .*
+**     uqadd\tv[0-9]+.4h, (?:v\3.4h, v\4.4h|v\4.4h, v\3.4h)
+** ...
+**     ldr\th([0-9]+), .*
+**     ldr\th([0-9]+), .*
+**     uqadd\th[0-9]+, (?:h\5, h\6|h\6, h\5)
+** ...
+**     uqadd\th([0-9]+), (?:h[0-9]+, h\7|h\7, h[0-9]+)
+** ...
+*/
+/*
+** usubq: { xfail *-*-* }
+** ...
+**     ldr\tq([0-9]+), .*
+**     ldr\tq([0-9]+), .*
+**     uqsub\tv[0-9]+.8h, v\1.8h, v\2.8h
+** ...
+**     ldr\td([0-9]+), .*
+**     ldr\td([0-9]+), .*
+**     uqsub\tv[0-9]+.4h, v\3.4h, v\4.4h
+** ...
+**     ldr\th([0-9]+), .*
+**     ldr\th([0-9]+), .*
+**     uqsub\th[0-9]+, h\5, h\6
+** ...
+**     ldr\th([0-9]+), .*
+**     ldr\th([0-9]+), .*
+**     uqsub\th[0-9]+, h\7, h\8
+** ...
+*/
+
+#include <limits.h>
+#include <arm_neon.h>
+
+#define UT unsigned short
+#define VT uint16x4_t
+#define UMAX USHRT_MAX
+#define UMIN 0
+
+#include "saturating_arithmetic_autovect.inc"
\ No newline at end of file
diff --git 
a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/saturating_arithmetic_autovect_3.c
 
b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/saturating_arithmetic_autovect_3.c
new file mode 100644
index 000000000000..ea6e0c78d786
--- /dev/null
+++ 
b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/saturating_arithmetic_autovect_3.c
@@ -0,0 +1,75 @@
+/* { dg-do assemble { target { aarch64*-*-* } } } */
+/* { dg-options "-O2 --save-temps -ftree-vectorize" } */
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+/*
+** uadd_lane:
+**     fmov\tw([0-9]+), s0
+**     adds\tw([0-9]+), (?:w\1, w0|w0, w\1)
+**     csinv\tw\2, w\2, wzr, cc
+**     ret
+*/
+/*
+** uaddq:
+** ...
+**     ldr\tq([0-9]+), .*
+**     ldr\tq([0-9]+), .*
+**     uqadd\tv[0-9]+.4s, (?:v\1.4s, v\2.4s|v\2.4s, v\1.4s)
+** ...
+**     ldr\tw([0-9]+), .*
+**     ldr\tw([0-9]+), .*
+**     adds\tw([0-9]+), (?:w\3, w\4|w\4, w\3)
+**     csinv\tw\5, w\5, wzr, cc
+** ...
+**     ldr\tw([0-9]+), .*
+**     ldr\tw([0-9]+), .*
+**     adds\tw([0-9]+), (?:w\6, w\7|w\7, w\6)
+**     csinv\tw\8, w\8, wzr, cc
+** ...
+*/
+/*
+** uaddq2:
+** ...
+**     ldr\tq([0-9]+), .*
+**     ldr\tq([0-9]+), .*
+**     uqadd\tv[0-9]+.4s, (?:v\1.4s, v\2.4s|v\2.4s, v\1.4s)
+** ...
+**     ldr\tw([0-9]+), .*
+**     ldr\tw([0-9]+), .*
+**     adds\tw([0-9]+), (?:w\3, w\4|w\4, w\3)
+**     csinv\tw\5, w\5, wzr, cc
+** ...
+**     ldr\tw([0-9]+), .*
+**     ldr\tw([0-9]+), .*
+**     adds\tw([0-9]+), (?:w\6, w\7|w\7, w\6)
+**     csinv\tw\8, w\8, wzr, cc
+** ...
+*/
+/*
+** usubq: { xfail *-*-* }
+** ...
+**     ldr\tq([0-9]+), .*
+**     ldr\tq([0-9]+), .*
+**     uqsub\tv[0-9]+.4s, v\1.4s, v\2.4s
+** ...
+**     ldr\tw([0-9]+), .*
+**     ldr\tw([0-9]+), .*
+**     subs\tw([0-9]+), w\3, w\4
+**     csel\tw\5, w\5, wzr, cs
+** ...
+**     ldr\tw([0-9]+), .*
+**     ldr\tw([0-9]+), .*
+**     subs\tw([0-9]+), w\6, w\7
+**     csel\tw\8, w\8, wzr, cs
+** ...
+*/
+
+#include <limits.h>
+#include <arm_neon.h>
+
+#define UT unsigned int
+#define VT uint32x2_t
+#define UMAX UINT_MAX
+#define UMIN 0
+
+#include "saturating_arithmetic_autovect.inc"
\ No newline at end of file
diff --git 
a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/saturating_arithmetic_autovect_4.c
 
b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/saturating_arithmetic_autovect_4.c
new file mode 100644
index 000000000000..01390637b5ca
--- /dev/null
+++ 
b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/saturating_arithmetic_autovect_4.c
@@ -0,0 +1,77 @@
+/* { dg-do assemble { target { aarch64*-*-* } } } */
+/* { dg-options "-O2 --save-temps -ftree-vectorize" } */
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+/*
+** uadd_lane:
+** ...
+**     (?:fmov|ldr)\tx([0-9]+), .*
+** ...
+**     adds\tx([0-9]+), (?:x\1, x0|x0, x\1)
+**     csinv\tx\2, x\2, xzr, cc
+**     ret
+*/
+/*
+** uaddq:
+** ...
+**     ldr\tq([0-9]+), .*
+**     ldr\tq([0-9]+), .*
+**     uqadd\tv[0-9]+.2d, (?:v\1.2d, v\2.2d|v\2.2d, v\1.2d)
+** ...
+**     ldr\tx([0-9]+), .*
+**     ldr\tx([0-9]+), .*
+**     adds\tx([0-9]+), (?:x\3, x\4|x\4, x\3)
+**     csinv\tx\5, x\5, xzr, cc
+** ...
+**     ldr\tx([0-9]+), .*
+**     ldr\tx([0-9]+), .*
+**     adds\tx([0-9]+), (?:x\6, x\7|x\7, x\6)
+**     csinv\tx\8, x\8, xzr, cc
+** ...
+*/
+/*
+** uaddq2:
+** ...
+**     ldr\tq([0-9]+), .*
+**     ldr\tq([0-9]+), .*
+**     uqadd\tv[0-9]+.2d, (?:v\1.2d, v\2.2d|v\2.2d, v\1.2d)
+** ...
+**     ldr\tx([0-9]+), .*
+**     ldr\tx([0-9]+), .*
+**     adds\tx([0-9]+), (?:x\3, x\4|x\4, x\3)
+**     csinv\tx\5, x\5, xzr, cc
+** ...
+**     ldr\tx([0-9]+), .*
+**     ldr\tx([0-9]+), .*
+**     adds\tx([0-9]+), (?:x\6, x\7|x\7, x\6)
+**     csinv\tx\8, x\8, xzr, cc
+** ...
+*/
+/*
+** usubq: { xfail *-*-* }
+** ...
+**     ldr\tq([0-9]+), .*
+**     ldr\tq([0-9]+), .*
+**     uqsub\tv[0-9]+.2d, v\1.2d, v\2.2d
+** ...
+**     ldr\tx([0-9]+), .*
+**     ldr\tx([0-9]+), .*
+**     subs\tx([0-9]+), x\3, x\4
+**     csel\tx\5, x\5, xzr, cs
+** ...
+**     ldr\tx([0-9]+), .*
+**     ldr\tx([0-9]+), .*
+**     subs\tx([0-9]+), x\6, x\7
+**     csel\tx\8, x\8, xzr, cs
+** ...
+*/
+
+#include <limits.h>
+#include <arm_neon.h>
+
+#define UT unsigned long
+#define VT uint64x2_t
+#define UMAX ULONG_MAX
+#define UMIN 0
+
+#include "saturating_arithmetic_autovect.inc"
\ No newline at end of file
diff --git a/gcc/testsuite/gcc.target/aarch64/saturating-arithmetic-signed.c 
b/gcc/testsuite/gcc.target/aarch64/saturating-arithmetic-signed.c
new file mode 100644
index 000000000000..de652bf1d9e6
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/saturating-arithmetic-signed.c
@@ -0,0 +1,270 @@
+/* { dg-do run } */
+/* { dg-options "-O2 --save-temps -mearly-ra=none -fno-schedule-insns2" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+#include <limits.h>
+#include <stdbool.h>
+#include <stdint.h>
+
+/*
+** sadd32:
+**     asr     w([0-9]+), w1, 31
+**     eor     w\1, w\1, -2147483648
+**     adds    w([0-9]+), (?:w0, w1|w1, w0)
+**     csinv   w0, w\2, w\1, vc
+**     ret
+*/
+int32_t __attribute__((noipa))  /* Signed 32-bit saturating add: __a + __b clamped to [INT_MIN, INT_MAX].  */
+sadd32 (int32_t __a, int32_t __b)
+{
+  int32_t sum;
+  bool overflow = __builtin_add_overflow (__a, __b, &sum);  /* True when __a + __b does not fit in int32_t.  */
+  return !overflow ? sum : __a < 0 ? INT_MIN : INT_MAX;  /* On overflow clamp by the sign of __a.  */
+}
+
+/*
+** sadd32_imm:
+**     adds    w([0-9]+), w0, #67
+**     mov     w([0-9]+), 2147483647
+**     csel    w0, w\1, w\2, vc
+**     ret
+*/
+int32_t __attribute__((noipa))  /* Signed 32-bit saturating add of the constant +67 to __a.  */
+sadd32_imm (int32_t __a)
+{
+  int32_t sum;
+  bool overflow = __builtin_add_overflow (__a, 67, &sum);  /* True when __a + 67 does not fit in int32_t.  */
+  return !overflow ? sum : __a < 0 ? INT_MIN : INT_MAX;  /* Adding +67 can only overflow upwards, i.e. to INT_MAX.  */
+}
+
+/*
+** sadd32_imm2:
+**     subs    w([0-9]+), w0, 67
+**     mov     w([0-9]+), -2147483648
+**     csel    w0, w\1, w\2, vc
+**     ret
+*/
+int32_t  __attribute__((noipa))  /* Signed 32-bit saturating add of the constant -67 to __a.  */
+sadd32_imm2 (int32_t __a)
+{
+  int32_t sum;
+  bool overflow = __builtin_add_overflow (__a, -67, &sum);  /* True when __a - 67 does not fit in int32_t.  */
+  return !overflow ? sum : __a < 0 ? INT_MIN : INT_MAX;  /* Adding -67 can only overflow downwards, i.e. to INT_MIN.  */
+}
+
+/*
+** ssub32:
+**     asr     w([0-9]+), w1, 31
+**     eor     w\1, w\1, -2147483648
+**     subs    w([0-9]+), w0, w1
+**     csel    w0, w\2, w\1, vc
+**     ret
+*/
+int32_t  __attribute__((noipa))  /* Signed 32-bit saturating subtract: __a - __b clamped to [INT_MIN, INT_MAX].  */
+ssub32 (int32_t __a, int32_t __b)
+{
+  int32_t result;
+  bool overflow = __builtin_sub_overflow (__a, __b, &result);  /* True when __a - __b does not fit in int32_t.  */
+  return !overflow ? result : __a < 0 ? INT_MIN : INT_MAX;  /* On overflow clamp by the sign of __a.  */
+}
+
+/*
+** ssub32_imm:
+**     subs    w([0-9]+), w0, 67
+**     mov     w([0-9]+), -2147483648
+**     csel    w0, w\1, w\2, vc
+**     ret
+*/
+int32_t  __attribute__((noipa))  /* Signed 32-bit saturating subtract of the constant +67 from __a.  */
+ssub32_imm (int32_t __a)
+{
+  int32_t result;
+  bool overflow = __builtin_sub_overflow (__a, 67, &result);  /* True when __a - 67 does not fit in int32_t.  */
+  return !overflow ? result : __a < 0 ? INT_MIN : INT_MAX;  /* Subtracting +67 can only overflow downwards, i.e. to INT_MIN.  */
+}
+
+/*
+** ssub32_imm2:
+**     adds    w([0-9]+), w0, #67
+**     mov     w([0-9]+), 2147483647
+**     csel    w0, w\1, w\2, vc
+**     ret
+*/
+int32_t  __attribute__((noipa))  /* Signed 32-bit saturating subtract of the constant -67 from __a.  */
+ssub32_imm2 (int32_t __a)
+{
+  int32_t result;
+  bool overflow = __builtin_sub_overflow (__a, -67, &result);  /* True when __a + 67 does not fit in int32_t.  */
+  return !overflow ? result : __a < 0 ? INT_MIN : INT_MAX;  /* Subtracting -67 can only overflow upwards, i.e. to INT_MAX.  */
+}
+
+/*
+** sadd64:
+**     asr     x([0-9]+), x1, 63
+**     eor     x\1, x\1, -9223372036854775808
+**     adds    x([0-9]+), (?:x0, x1|x1, x0)
+**     csinv   x0, x\2, x\1, vc
+**     ret
+*/
+int64_t  __attribute__((noipa))  /* Signed 64-bit saturating add: __a + __b clamped to [INT64_MIN, INT64_MAX].  */
+sadd64 (int64_t __a, int64_t __b)
+{
+  int64_t sum;
+  bool overflow = __builtin_add_overflow (__a, __b, &sum);  /* True when __a + __b does not fit in int64_t.  */
+  return !overflow ? sum : __a < 0 ? INT64_MIN : INT64_MAX;  /* int64_t limits (not LONG_*): right even if long is 32 bits.  */
+}
+
+/*
+** sadd64_imm:
+**     adds    x([0-9]+), x0, #67
+**     mov     x([0-9]+), 9223372036854775807
+**     csel    x0, x\1, x\2, vc
+**     ret
+*/
+int64_t  __attribute__((noipa))  /* Signed 64-bit saturating add of the constant +67 to __a.  */
+sadd64_imm (int64_t __a)
+{
+  int64_t sum;
+  bool overflow = __builtin_add_overflow (__a, (int64_t)67, &sum);  /* True when __a + 67 does not fit in int64_t.  */
+  return !overflow ? sum : __a < 0 ? INT64_MIN : INT64_MAX;  /* int64_t limits (not LONG_*); +67 only overflows upwards.  */
+}
+
+/*
+** sadd64_imm2:
+**     subs    x([0-9]+), x0, 67
+**     mov     x([0-9]+), -9223372036854775808
+**     csel    x0, x\1, x\2, vc
+**     ret
+*/
+int64_t  __attribute__((noipa))  /* Signed 64-bit saturating add of the constant -67 to __a.  */
+sadd64_imm2 (int64_t __a)
+{
+  int64_t sum;
+  bool overflow = __builtin_add_overflow (__a, (int64_t)-67, &sum);  /* True when __a - 67 does not fit in int64_t.  */
+  return !overflow ? sum : __a < 0 ? INT64_MIN : INT64_MAX;  /* int64_t limits (not LONG_*); -67 only overflows downwards.  */
+}
+
+/*
+** ssub64:
+**     asr     x([0-9]+), x1, 63
+**     eor     x\1, x\1, -9223372036854775808
+**     subs    x([0-9]+), x0, x1
+**     csel    x0, x\2, x\1, vc
+**     ret
+*/
+int64_t  __attribute__((noipa))  /* Signed 64-bit saturating subtract: __a - __b clamped to [INT64_MIN, INT64_MAX].  */
+ssub64 (int64_t __a, int64_t __b)
+{
+  int64_t result;
+  bool overflow = __builtin_sub_overflow (__a, __b, &result);  /* True when __a - __b does not fit in int64_t.  */
+  return !overflow ? result : __a < 0 ? INT64_MIN : INT64_MAX;  /* int64_t limits (not LONG_*): right even if long is 32 bits.  */
+}
+
+/*
+** ssub64_imm:
+**     subs    x([0-9]+), x0, 67
+**     mov     x([0-9]+), -9223372036854775808
+**     csel    x0, x\1, x\2, vc
+**     ret
+*/
+int64_t  __attribute__((noipa))  /* Signed 64-bit saturating subtract of the constant +67 from __a.  */
+ssub64_imm (int64_t __a)
+{
+  int64_t result;
+  bool overflow = __builtin_sub_overflow (__a, (int64_t) 67, &result);  /* True when __a - 67 does not fit in int64_t.  */
+  return !overflow ? result : __a < 0 ? INT64_MIN : INT64_MAX;  /* int64_t limits (not LONG_*); only overflows downwards.  */
+}
+
+/*
+** ssub64_imm2:
+**     adds    x([0-9]+), x0, #67
+**     mov     x([0-9]+), 9223372036854775807
+**     csel    x0, x\1, x\2, vc
+**     ret
+*/
+int64_t  __attribute__((noipa))  /* Signed 64-bit saturating subtract of the constant -67 from __a.  */
+ssub64_imm2 (int64_t __a)
+{
+  int64_t result;
+  bool overflow = __builtin_sub_overflow (__a, (int64_t) -67, &result);  /* True when __a + 67 does not fit in int64_t.  */
+  return !overflow ? result : __a < 0 ? INT64_MIN : INT64_MAX;  /* int64_t limits (not LONG_*); only overflows upwards.  */
+}
+
+int
+main (void)
+{
+  /* Cases checked for each width, in this order.  Addition:
+  SAT_ADD(x, +ve), non-saturating
+  SAT_ADD(x, +ve), saturating
+  SAT_ADD(x, immediate +ve)
+  SAT_ADD(x, immediate -ve)
+  SAT_ADD(x, -ve), non-saturating
+  SAT_ADD(x, -ve), saturating
+
+  Subtraction:
+  SAT_SUB(x, +ve), non-saturating
+  SAT_SUB(x, +ve), saturating
+  SAT_SUB(x, immediate +ve)
+  SAT_SUB(x, immediate -ve)
+  SAT_SUB(x, -ve), non-saturating  */
+
+  int32_t a = 4;
+  int32_t b = 70;
+  int32_t c = 2147483647;  /* Largest int32_t value.  */
+  int32_t d = (int32_t) -2147483648;  /* Smallest int32_t value.  */
+
+  if (sadd32 (a, b) != (a + b))
+    __builtin_abort ();
+  if (sadd32 (a, c) != c)  /* 4 + max overflows upwards and must clamp to max.  */
+    __builtin_abort ();
+  if (sadd32_imm (a) != (a + 67))
+    __builtin_abort ();
+  if (sadd32_imm2 (a) != (a - 67))
+    __builtin_abort ();
+  if (sadd32 (a, -b) != (a - b))
+    __builtin_abort ();
+  if (sadd32 (-a, d) != d)  /* -4 + min overflows downwards and must clamp to min.  */
+    __builtin_abort ();
+
+  if (ssub32 (a, b) != (a - b))
+    __builtin_abort ();
+  if (ssub32 (-a, c) != d)  /* -4 - max overflows downwards and must clamp to min.  */
+    __builtin_abort ();
+  if (ssub32_imm (a) != (a - 67))
+    __builtin_abort ();
+  if (ssub32_imm2 (a) != (a + 67))
+    __builtin_abort ();
+  if (ssub32 (a, -b) != (a + b))
+    __builtin_abort ();
+
+  int64_t a_64 = a;
+  int64_t b_64 = b;
+  int64_t c_64 = (int64_t) 9223372036854775807;  /* Largest int64_t value.  */
+  int64_t d_64 = (int64_t) 0x8000000000000000;  /* Bit pattern of the smallest int64_t value.  */
+
+  if (sadd64 (a_64, b_64) != (a_64 + b_64))
+    __builtin_abort ();
+  if (sadd64 (a_64, c_64) != c_64)  /* 4 + max overflows upwards and must clamp to max.  */
+    __builtin_abort ();
+  if (sadd64_imm (a_64) != (a_64 + 67))
+    __builtin_abort ();
+  if (sadd64_imm2 (a_64) != (a_64 - 67))
+    __builtin_abort ();
+  if (sadd64 (a_64, -b_64) != (a_64 - b_64))
+    __builtin_abort ();
+  if (sadd64 (-a_64, d_64) != d_64)  /* -4 + min overflows downwards and must clamp to min.  */
+    __builtin_abort ();
+
+  if (ssub64 (a_64, b_64) != (a_64 - b_64))
+    __builtin_abort ();
+  if (ssub64 (-a_64, c_64) != d_64)  /* -4 - max overflows downwards and must clamp to min.  */
+    __builtin_abort ();
+  if (ssub64_imm (a_64) != (a_64 - 67))
+    __builtin_abort ();
+  if (ssub64_imm2 (a_64) != (a_64 + 67))
+    __builtin_abort ();
+  if (ssub64 (a_64, -b_64) != (a_64 + b_64))
+    __builtin_abort ();
+
+  return 0;
+}
\ No newline at end of file
diff --git a/gcc/testsuite/gcc.target/aarch64/saturating_arithmetic.inc 
b/gcc/testsuite/gcc.target/aarch64/saturating_arithmetic.inc
new file mode 100644
index 000000000000..e979d5354057
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/saturating_arithmetic.inc
@@ -0,0 +1,39 @@
+/* Template file for scalar saturating arithmetic validation.
+
+   This file defines scalar saturating addition and subtraction functions for a
+   given type UT.  UT and its limits UMAX/UMIN should be defined by the test
+   file including this template; if UT is not defined, unsigned int defaults
+   are used.  */
+
+#ifndef SAT_ARIT_INC
+#define SAT_ARIT_INC
+
+#include <limits.h>
+
+#ifndef UT
+#define UT unsigned int
+#define UMAX UINT_MAX
+#define UMIN 0
+#endif
+
+UT uadd (UT a, UT b)  /* Saturating a + b, written as a wrap-around check.  */
+{
+        UT sum = a + b;  /* Stored back into UT (expected unsigned), so it wraps on overflow.  */
+        return sum < a ? UMAX : sum;  /* A wrapped sum is smaller than A: clamp to UMAX.  */
+}
+
+UT uadd2 (UT a, UT b)  /* Saturating a + b, written with __builtin_add_overflow.  */
+{
+        UT c;
+        if (!__builtin_add_overflow(a, b, &c))  /* Builtin returns false when the sum fits in UT.  */
+                return c;
+        return UMAX;  /* Overflow: clamp to UMAX.  */
+}
+
+UT usub (UT a, UT b)  /* Saturating a - b, written as a wrap-around check.  */
+{
+        UT sum = a - b;  /* Despite its name this holds the difference; stored back into UT (expected unsigned).  */
+        return sum > a ? UMIN : sum;  /* Wrapped (difference > a): clamp to UMIN.  */
+}
+
+#endif
\ No newline at end of file
diff --git a/gcc/testsuite/gcc.target/aarch64/saturating_arithmetic_1.c 
b/gcc/testsuite/gcc.target/aarch64/saturating_arithmetic_1.c
new file mode 100644
index 000000000000..2ac0c376d126
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/saturating_arithmetic_1.c
@@ -0,0 +1,36 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 --save-temps -fno-schedule-insns2" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+/*
+** uadd:
+**     dup     v([0-9]+).8b, w1
+**     dup     v([0-9]+).8b, w0
+**     uqadd   b([0-9]+), (?:b\2, b\1|b\1, b\2)
+**     umov    w0, v\3.b\[0\]
+**     ret
+*/
+/*
+** uadd2:
+**     dup     v([0-9]+).8b, w1
+**     dup     v([0-9]+).8b, w0
+**     uqadd   b([0-9]+), (?:b\2, b\1|b\1, b\2)
+**     umov    w0, v\3.b\[0\]
+**     ret
+*/
+/*
+** usub: { xfail *-*-* }
+**     dup     v([0-9]+).8b, w1
+**     dup     v([0-9]+).8b, w0
+**     uqsub   b([0-9]+), b\1, b\2
+**     umov    w0, v\3.b\[0\]
+**     ret
+*/
+
+#include <limits.h>
+
+#define UT unsigned char
+#define UMAX UCHAR_MAX
+#define UMIN 0
+
+#include "saturating_arithmetic.inc"
\ No newline at end of file
diff --git a/gcc/testsuite/gcc.target/aarch64/saturating_arithmetic_2.c 
b/gcc/testsuite/gcc.target/aarch64/saturating_arithmetic_2.c
new file mode 100644
index 000000000000..2a55aa9f2218
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/saturating_arithmetic_2.c
@@ -0,0 +1,36 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 --save-temps -fno-schedule-insns2" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+/*
+** uadd:
+**     dup     v([0-9]+).4h, w1
+**     dup     v([0-9]+).4h, w0
+**     uqadd   h([0-9]+), (?:h\2, h\1|h\1, h\2)
+**     umov    w0, v\3.h\[0\]
+**     ret
+*/
+/*
+** uadd2:
+**     dup     v([0-9]+).4h, w1
+**     dup     v([0-9]+).4h, w0
+**     uqadd   h([0-9]+), (?:h\2, h\1|h\1, h\2)
+**     umov    w0, v\3.h\[0\]
+**     ret
+*/
+/*
+** usub: { xfail *-*-* }
+**     dup     v([0-9]+).4h, w1
+**     dup     v([0-9]+).4h, w0
+**     uqsub   h([0-9]+), h\1, h\2
+**     umov    w0, v\3.h\[0\]
+**     ret
+*/
+
+#include <limits.h>
+
+#define UT unsigned short
+#define UMAX USHRT_MAX
+#define UMIN 0
+
+#include "saturating_arithmetic.inc"
\ No newline at end of file
diff --git a/gcc/testsuite/gcc.target/aarch64/saturating_arithmetic_3.c 
b/gcc/testsuite/gcc.target/aarch64/saturating_arithmetic_3.c
new file mode 100644
index 000000000000..215172545196
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/saturating_arithmetic_3.c
@@ -0,0 +1,30 @@
+/* { dg-do compile { target { aarch64*-*-* } } } */
+/* { dg-options "-O2 --save-temps -ftree-vectorize" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+/*
+** uadd:
+**     adds\tw([0-9]+), w([0-9]+), w([0-9]+)
+**     csinv\tw\1, w\1, wzr, cc
+**     ret
+*/
+/*
+** uadd2:
+**     adds\tw([0-9]+), w([0-9]+), w([0-9]+)
+**     csinv\tw\1, w\1, wzr, cc
+**     ret
+*/
+/*
+** usub:
+**     subs\tw([0-9]+), w([0-9]+), w([0-9]+)
+**     csel\tw\1, w\1, wzr, cs
+**     ret
+*/
+
+#include <limits.h>
+
+#define UT unsigned int
+#define UMAX UINT_MAX
+#define UMIN 0
+
+#include "saturating_arithmetic.inc"
\ No newline at end of file
diff --git a/gcc/testsuite/gcc.target/aarch64/saturating_arithmetic_4.c 
b/gcc/testsuite/gcc.target/aarch64/saturating_arithmetic_4.c
new file mode 100644
index 000000000000..363d0a79a730
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/saturating_arithmetic_4.c
@@ -0,0 +1,30 @@
+/* { dg-do compile { target { aarch64*-*-* } } } */
+/* { dg-options "-O2 --save-temps -ftree-vectorize" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+/*
+** uadd:
+**     adds\tx([0-9]+), x([0-9]+), x([0-9]+)
+**     csinv\tx\1, x\1, xzr, cc
+**     ret
+*/
+/*
+** uadd2:
+**     adds\tx([0-9]+), x([0-9]+), x([0-9]+)
+**     csinv\tx\1, x\1, xzr, cc
+**     ret
+*/
+/*
+** usub:
+**     subs\tx([0-9]+), x([0-9]+), x([0-9]+)
+**     csel\tx\1, x\1, xzr, cs
+**     ret
+*/
+
+#include <limits.h>
+
+#define UT unsigned long
+#define UMAX ULONG_MAX
+#define UMIN 0
+
+#include "saturating_arithmetic.inc"
\ No newline at end of file
diff --git a/gcc/testsuite/gcc.target/aarch64/scalar_intrinsics.c 
b/gcc/testsuite/gcc.target/aarch64/scalar_intrinsics.c
index c2e13b651e96..dcf9dc777ade 100644
--- a/gcc/testsuite/gcc.target/aarch64/scalar_intrinsics.c
+++ b/gcc/testsuite/gcc.target/aarch64/scalar_intrinsics.c
@@ -318,33 +318,33 @@ test_vpaddd_u64 (uint64x2_t a)
 /* { dg-final { scan-assembler-times "\\tuqadd\\td\[0-9\]+" 1 } } */
 
 uint64_t
-test_vqaddd_u64 (uint64_t a, uint64_t b)
+test_vqaddd_u64 (uint64x1_t a, uint64x1_t b)
 {
-  return vqaddd_u64 (a, b);
+  return vqaddd_u64 (a[0], b[0]);
 }
 
 /* { dg-final { scan-assembler-times "\\tuqadd\\ts\[0-9\]+" 1 } } */
 
 uint32_t
-test_vqadds_u32 (uint32_t a, uint32_t b)
+test_vqadds_u32 (uint32x4_t a, uint32x4_t b)
 {
-  return vqadds_u32 (a, b);
+  return vqadds_u32 (a[0], b[0]);
 }
 
 /* { dg-final { scan-assembler-times "\\tuqadd\\th\[0-9\]+" 1 } } */
 
 uint16_t
-test_vqaddh_u16 (uint16_t a, uint16_t b)
+test_vqaddh_u16 (uint16x8_t a, uint16x8_t b)
 {
-  return vqaddh_u16 (a, b);
+  return vqaddh_u16 (a[0], b[0]);
 }
 
 /* { dg-final { scan-assembler-times "\\tuqadd\\tb\[0-9\]+" 1 } } */
 
 uint8_t
-test_vqaddb_u8 (uint8_t a, uint8_t b)
+test_vqaddb_u8 (uint8x16_t a, uint8x16_t b)
 {
-  return vqaddb_u8 (a, b);
+  return vqaddb_u8 (a[0], b[0]);
 }
 
 /* { dg-final { scan-assembler-times "\\tsqadd\\td\[0-9\]+" 1 } } */
@@ -761,33 +761,33 @@ test_vsubd_s64_2 (int64_t a, int64_t b)
 /* { dg-final { scan-assembler-times "\\tuqsub\\td\[0-9\]+" 1 } } */
 
 uint64_t
-test_vqsubd_u64 (uint64_t a, uint64_t b)
+test_vqsubd_u64 (uint64x1_t a, uint64x1_t b)
 {
-  return vqsubd_u64 (a, b);
+  return vqsubd_u64 (a[0], b[0]);
 }
 
 /* { dg-final { scan-assembler-times "\\tuqsub\\ts\[0-9\]+" 1 } } */
 
 uint32_t
-test_vqsubs_u32 (uint32_t a, uint32_t b)
+test_vqsubs_u32 (uint32x4_t a, uint32x4_t b)
 {
-  return vqsubs_u32 (a, b);
+  return vqsubs_u32 (a[0], b[0]);
 }
 
 /* { dg-final { scan-assembler-times "\\tuqsub\\th\[0-9\]+" 1 } } */
 
 uint16_t
-test_vqsubh_u16 (uint16_t a, uint16_t b)
+test_vqsubh_u16 (uint16x8_t a, uint16x8_t b)
 {
-  return vqsubh_u16 (a, b);
+  return vqsubh_u16 (a[0], b[0]);
 }
 
 /* { dg-final { scan-assembler-times "\\tuqsub\\tb\[0-9\]+" 1 } } */
 
 uint8_t
-test_vqsubb_u8 (uint8_t a, uint8_t b)
+test_vqsubb_u8 (uint8x16_t a, uint8x16_t b)
 {
-  return vqsubb_u8 (a, b);
+  return vqsubb_u8 (a[0], b[0]);
 }
 
 /* { dg-final { scan-assembler-times "\\tsqsub\\td\[0-9\]+" 1 } } */

[gcc r15-7003] AArch64: Use standard names for saturating arithmetic

Reply via email to