The use of vqshrn_high_n_s32 was triggering an unneeded register move:
sqshrn2 is destructive (it writes the narrowed result into the upper half
of its destination while leaving the lower half intact), but because it
was implemented as an inline assembly macro in arm_neon.h, the compiler
could not see this and emitted an extra move. This patch implements the
saturating shift right narrow high operations (sqshrn2, sqrshrn2,
sqshrun2, sqrshrun2, uqshrn2 and uqrshrn2) as actual intrinsics that do
not trigger the unnecessary move, along with new tests to cover them.
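
For illustration, here is a minimal reproducer (hypothetical, not part of
the testsuite) of the pattern that previously forced the extra move:

#include <arm_neon.h>

int16x8_t
foo (int16x4_t a, int32x4_t b)
{
  /* sqshrn2 narrows B into the upper half of its destination and so
     needs A already sitting in the lower half.  The old macro
     materialised that with vcombine (a separate move of A into a
     fresh register) before running the asm destructively on it; with
     the builtin, operand 1 of the new insn pattern is tied to the
     output (the "0" constraint), so the register allocator can place
     the result straight into A's register and the move disappears.  */
  return vqshrn_high_n_s32 (a, b, 5);
}

With the patch this should compile to a single sqshrn2 v0.8h, v1.4s, #5
followed by ret, whereas the macro version needed a mov first.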
Bootstrapped and regression tested on aarch64-none-linux-gnu.
gcc/ChangeLog
2020-11-03 David Candler <david.cand...@arm.com>
* config/aarch64/aarch64-builtins.c
(TYPES_SHIFT2IMM): Add define.
(TYPES_SHIFT2IMM_UUSS): Add define.
* config/aarch64/aarch64-simd.md
(aarch64_<sur>q<r>shr<u>n2_n<mode>): Add new insn for upper saturating
shift right narrow.
* config/aarch64/aarch64-simd-builtins.def: Add intrinsics.
* config/aarch64/arm_neon.h
(vqrshrn_high_n_s16): Expand using intrinsic rather than inline asm.
(vqrshrn_high_n_s32): Likewise.
(vqrshrn_high_n_s64): Likewise.
(vqrshrn_high_n_u16): Likewise.
(vqrshrn_high_n_u32): Likewise.
(vqrshrn_high_n_u64): Likewise.
(vqrshrun_high_n_s16): Likewise.
(vqrshrun_high_n_s32): Likewise.
(vqrshrun_high_n_s64): Likewise.
(vqshrn_high_n_s16): Likewise.
(vqshrn_high_n_s32): Likewise.
(vqshrn_high_n_s64): Likewise.
(vqshrn_high_n_u16): Likewise.
(vqshrn_high_n_u32): Likewise.
(vqshrn_high_n_u64): Likewise.
(vqshrun_high_n_s16): Likewise.
(vqshrun_high_n_s32): Likewise.
(vqshrun_high_n_s64): Likewise.
gcc/testsuite/ChangeLog
2020-11-03 David Candler <david.cand...@arm.com>
* gcc.target/aarch64/advsimd-intrinsics/vqrshrn_high_n.c: New testcase.
* gcc.target/aarch64/advsimd-intrinsics/vqrshrun_high_n.c: Likewise.
* gcc.target/aarch64/advsimd-intrinsics/vqshrn_high_n.c: Likewise.
* gcc.target/aarch64/advsimd-intrinsics/vqshrun_high_n.c: Likewise.
* gcc.target/aarch64/narrow_high-intrinsics.c: Update expected assembler
for sqshrun2, sqrshrun2, sqshrn2, uqshrn2, sqrshrn2 and uqrshrn2.
diff --git a/gcc/config/aarch64/aarch64-builtins.c b/gcc/config/aarch64/aarch64-builtins.c
index 4f33dd936c7..f93f4e29c89 100644
--- a/gcc/config/aarch64/aarch64-builtins.c
+++ b/gcc/config/aarch64/aarch64-builtins.c
@@ -254,6 +254,10 @@ aarch64_types_binop_imm_qualifiers[SIMD_MAX_BUILTIN_ARGS]
#define TYPES_GETREG (aarch64_types_binop_imm_qualifiers)
#define TYPES_SHIFTIMM (aarch64_types_binop_imm_qualifiers)
static enum aarch64_type_qualifiers
+aarch64_types_ternop_s_imm_qualifiers[SIMD_MAX_BUILTIN_ARGS]
+ = { qualifier_none, qualifier_none, qualifier_none, qualifier_immediate};
+#define TYPES_SHIFT2IMM (aarch64_types_ternop_s_imm_qualifiers)
+static enum aarch64_type_qualifiers
aarch64_types_shift_to_unsigned_qualifiers[SIMD_MAX_BUILTIN_ARGS]
= { qualifier_unsigned, qualifier_none, qualifier_immediate };
#define TYPES_SHIFTIMM_USS (aarch64_types_shift_to_unsigned_qualifiers)
@@ -265,14 +269,16 @@ static enum aarch64_type_qualifiers
aarch64_types_unsigned_shift_qualifiers[SIMD_MAX_BUILTIN_ARGS]
= { qualifier_unsigned, qualifier_unsigned, qualifier_immediate };
#define TYPES_USHIFTIMM (aarch64_types_unsigned_shift_qualifiers)
+#define TYPES_USHIFT2IMM (aarch64_types_ternopu_imm_qualifiers)
+static enum aarch64_type_qualifiers
+aarch64_types_shift2_to_unsigned_qualifiers[SIMD_MAX_BUILTIN_ARGS]
+ = { qualifier_unsigned, qualifier_unsigned, qualifier_none, qualifier_immediate };
+#define TYPES_SHIFT2IMM_UUSS (aarch64_types_shift2_to_unsigned_qualifiers)
static enum aarch64_type_qualifiers
aarch64_types_ternop_s_imm_p_qualifiers[SIMD_MAX_BUILTIN_ARGS]
= { qualifier_none, qualifier_none, qualifier_poly, qualifier_immediate};
#define TYPES_SETREGP (aarch64_types_ternop_s_imm_p_qualifiers)
-static enum aarch64_type_qualifiers
-aarch64_types_ternop_s_imm_qualifiers[SIMD_MAX_BUILTIN_ARGS]
- = { qualifier_none, qualifier_none, qualifier_none, qualifier_immediate};
#define TYPES_SETREG (aarch64_types_ternop_s_imm_qualifiers)
#define TYPES_SHIFTINSERT (aarch64_types_ternop_s_imm_qualifiers)
#define TYPES_SHIFTACC (aarch64_types_ternop_s_imm_qualifiers)
diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
index d1b21102b2f..0b82b9c072b 100644
--- a/gcc/config/aarch64/aarch64-simd-builtins.def
+++ b/gcc/config/aarch64/aarch64-simd-builtins.def
@@ -285,6 +285,13 @@
BUILTIN_VSQN_HSDI (USHIFTIMM, uqshrn_n, 0, ALL)
BUILTIN_VSQN_HSDI (SHIFTIMM, sqrshrn_n, 0, ALL)
BUILTIN_VSQN_HSDI (USHIFTIMM, uqrshrn_n, 0, ALL)
+ /* Implemented by aarch64_<sur>q<r>shr<u>n2_n<mode>. */
+ BUILTIN_VQN (SHIFT2IMM_UUSS, sqshrun2_n, 0, ALL)
+ BUILTIN_VQN (SHIFT2IMM_UUSS, sqrshrun2_n, 0, ALL)
+ BUILTIN_VQN (SHIFT2IMM, sqshrn2_n, 0, ALL)
+ BUILTIN_VQN (USHIFT2IMM, uqshrn2_n, 0, ALL)
+ BUILTIN_VQN (SHIFT2IMM, sqrshrn2_n, 0, ALL)
+ BUILTIN_VQN (USHIFT2IMM, uqrshrn2_n, 0, ALL)
/* Implemented by aarch64_<sur>s<lr>i_n<mode>. */
BUILTIN_VSDQ_I_DI (SHIFTINSERT, ssri_n, 0, ALL)
BUILTIN_VSDQ_I_DI (USHIFTACC, usri_n, 0, ALL)
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 381a702eba0..76de3f50e48 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -4720,6 +4720,17 @@
[(set_attr "type" "neon_sat_shift_imm_narrow_q")]
)
+(define_insn "aarch64_<sur>q<r>shr<u>n2_n<mode>"
+ [(set (match_operand:<VNARROWQ2> 0 "register_operand" "=w")
+ (unspec:<VNARROWQ2> [(match_operand:<VNARROWQ> 1 "register_operand" "0")
+ (match_operand:VQN 2 "register_operand" "w")
+ (match_operand:SI 3 "aarch64_simd_shift_imm_offset_<ve_mode>" "i")]
+ VQSHRN_N))]
+ "TARGET_SIMD"
+ "<sur>q<r>shr<u>n2\\t%<vn2>0.<V2ntype>, %<v>2.<Vtype>, %3"
+ [(set_attr "type" "neon_sat_shift_imm_narrow_q")]
+)
+
;; cm(eq|ge|gt|lt|le)
;; Note, we have constraints for Dz and Z as different expanders
diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index 50f8b23bc17..e918c86ff78 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -9979,275 +9979,131 @@ vqrdmulhq_n_s32 (int32x4_t __a, int32_t __b)
return __result;
}
-#define vqrshrn_high_n_s16(a, b, c) \
- __extension__ \
- ({ \
- int16x8_t b_ = (b); \
- int8x8_t a_ = (a); \
- int8x16_t result = vcombine_s8 \
- (a_, vcreate_s8 \
- (__AARCH64_UINT64_C (0x0))); \
- __asm__ ("sqrshrn2 %0.16b, %1.8h, #%2" \
- : "+w"(result) \
- : "w"(b_), "i"(c) \
- : /* No clobbers */); \
- result; \
- })
+__extension__ extern __inline int8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vqrshrn_high_n_s16 (int8x8_t __a, int16x8_t __b, const int __c)
+{
+ return __builtin_aarch64_sqrshrn2_nv8hi (__a, __b, __c);
+}
-#define vqrshrn_high_n_s32(a, b, c) \
- __extension__ \
- ({ \
- int32x4_t b_ = (b); \
- int16x4_t a_ = (a); \
- int16x8_t result = vcombine_s16 \
- (a_, vcreate_s16 \
- (__AARCH64_UINT64_C (0x0))); \
- __asm__ ("sqrshrn2 %0.8h, %1.4s, #%2" \
- : "+w"(result) \
- : "w"(b_), "i"(c) \
- : /* No clobbers */); \
- result; \
- })
+__extension__ extern __inline int16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vqrshrn_high_n_s32 (int16x4_t __a, int32x4_t __b, const int __c)
+{
+ return __builtin_aarch64_sqrshrn2_nv4si (__a, __b, __c);
+}
-#define vqrshrn_high_n_s64(a, b, c) \
- __extension__ \
- ({ \
- int64x2_t b_ = (b); \
- int32x2_t a_ = (a); \
- int32x4_t result = vcombine_s32 \
- (a_, vcreate_s32 \
- (__AARCH64_UINT64_C (0x0))); \
- __asm__ ("sqrshrn2 %0.4s, %1.2d, #%2" \
- : "+w"(result) \
- : "w"(b_), "i"(c) \
- : /* No clobbers */); \
- result; \
- })
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vqrshrn_high_n_s64 (int32x2_t __a, int64x2_t __b, const int __c)
+{
+ return __builtin_aarch64_sqrshrn2_nv2di (__a, __b, __c);
+}
-#define vqrshrn_high_n_u16(a, b, c) \
- __extension__ \
- ({ \
- uint16x8_t b_ = (b); \
- uint8x8_t a_ = (a); \
- uint8x16_t result = vcombine_u8 \
- (a_, vcreate_u8 \
- (__AARCH64_UINT64_C (0x0))); \
- __asm__ ("uqrshrn2 %0.16b, %1.8h, #%2" \
- : "+w"(result) \
- : "w"(b_), "i"(c) \
- : /* No clobbers */); \
- result; \
- })
+__extension__ extern __inline uint8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vqrshrn_high_n_u16 (uint8x8_t __a, uint16x8_t __b, const int __c)
+{
+ return __builtin_aarch64_uqrshrn2_nv8hi_uuus (__a, __b, __c);
+}
-#define vqrshrn_high_n_u32(a, b, c) \
- __extension__ \
- ({ \
- uint32x4_t b_ = (b); \
- uint16x4_t a_ = (a); \
- uint16x8_t result = vcombine_u16 \
- (a_, vcreate_u16 \
- (__AARCH64_UINT64_C (0x0))); \
- __asm__ ("uqrshrn2 %0.8h, %1.4s, #%2" \
- : "+w"(result) \
- : "w"(b_), "i"(c) \
- : /* No clobbers */); \
- result; \
- })
+__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vqrshrn_high_n_u32 (uint16x4_t __a, uint32x4_t __b, const int __c)
+{
+ return __builtin_aarch64_uqrshrn2_nv4si_uuus (__a, __b, __c);
+}
-#define vqrshrn_high_n_u64(a, b, c) \
- __extension__ \
- ({ \
- uint64x2_t b_ = (b); \
- uint32x2_t a_ = (a); \
- uint32x4_t result = vcombine_u32 \
- (a_, vcreate_u32 \
- (__AARCH64_UINT64_C (0x0))); \
- __asm__ ("uqrshrn2 %0.4s, %1.2d, #%2" \
- : "+w"(result) \
- : "w"(b_), "i"(c) \
- : /* No clobbers */); \
- result; \
- })
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vqrshrn_high_n_u64 (uint32x2_t __a, uint64x2_t __b, const int __c)
+{
+ return __builtin_aarch64_uqrshrn2_nv2di_uuus (__a, __b, __c);
+}
-#define vqrshrun_high_n_s16(a, b, c) \
- __extension__ \
- ({ \
- int16x8_t b_ = (b); \
- uint8x8_t a_ = (a); \
- uint8x16_t result = vcombine_u8 \
- (a_, vcreate_u8 \
- (__AARCH64_UINT64_C (0x0))); \
- __asm__ ("sqrshrun2 %0.16b, %1.8h, #%2" \
- : "+w"(result) \
- : "w"(b_), "i"(c) \
- : /* No clobbers */); \
- result; \
- })
+__extension__ extern __inline uint8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vqrshrun_high_n_s16 (uint8x8_t __a, int16x8_t __b, const int __c)
+{
+ return __builtin_aarch64_sqrshrun2_nv8hi_uuss (__a, __b, __c);
+}
-#define vqrshrun_high_n_s32(a, b, c) \
- __extension__ \
- ({ \
- int32x4_t b_ = (b); \
- uint16x4_t a_ = (a); \
- uint16x8_t result = vcombine_u16 \
- (a_, vcreate_u16 \
- (__AARCH64_UINT64_C (0x0))); \
- __asm__ ("sqrshrun2 %0.8h, %1.4s, #%2" \
- : "+w"(result) \
- : "w"(b_), "i"(c) \
- : /* No clobbers */); \
- result; \
- })
+__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vqrshrun_high_n_s32 (uint16x4_t __a, int32x4_t __b, const int __c)
+{
+ return __builtin_aarch64_sqrshrun2_nv4si_uuss (__a, __b, __c);
+}
-#define vqrshrun_high_n_s64(a, b, c) \
- __extension__ \
- ({ \
- int64x2_t b_ = (b); \
- uint32x2_t a_ = (a); \
- uint32x4_t result = vcombine_u32 \
- (a_, vcreate_u32 \
- (__AARCH64_UINT64_C (0x0))); \
- __asm__ ("sqrshrun2 %0.4s, %1.2d, #%2" \
- : "+w"(result) \
- : "w"(b_), "i"(c) \
- : /* No clobbers */); \
- result; \
- })
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vqrshrun_high_n_s64 (uint32x2_t __a, int64x2_t __b, const int __c)
+{
+ return __builtin_aarch64_sqrshrun2_nv2di_uuss (__a, __b, __c);
+}
-#define vqshrn_high_n_s16(a, b, c) \
- __extension__ \
- ({ \
- int16x8_t b_ = (b); \
- int8x8_t a_ = (a); \
- int8x16_t result = vcombine_s8 \
- (a_, vcreate_s8 \
- (__AARCH64_UINT64_C (0x0))); \
- __asm__ ("sqshrn2 %0.16b, %1.8h, #%2" \
- : "+w"(result) \
- : "w"(b_), "i"(c) \
- : /* No clobbers */); \
- result; \
- })
+__extension__ extern __inline int8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vqshrn_high_n_s16 (int8x8_t __a, int16x8_t __b, const int __c)
+{
+ return __builtin_aarch64_sqshrn2_nv8hi (__a, __b, __c);
+}
-#define vqshrn_high_n_s32(a, b, c) \
- __extension__ \
- ({ \
- int32x4_t b_ = (b); \
- int16x4_t a_ = (a); \
- int16x8_t result = vcombine_s16 \
- (a_, vcreate_s16 \
- (__AARCH64_UINT64_C (0x0))); \
- __asm__ ("sqshrn2 %0.8h, %1.4s, #%2" \
- : "+w"(result) \
- : "w"(b_), "i"(c) \
- : /* No clobbers */); \
- result; \
- })
+__extension__ extern __inline int16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vqshrn_high_n_s32 (int16x4_t __a, int32x4_t __b, const int __c)
+{
+ return __builtin_aarch64_sqshrn2_nv4si (__a, __b, __c);
+}
-#define vqshrn_high_n_s64(a, b, c) \
- __extension__ \
- ({ \
- int64x2_t b_ = (b); \
- int32x2_t a_ = (a); \
- int32x4_t result = vcombine_s32 \
- (a_, vcreate_s32 \
- (__AARCH64_UINT64_C (0x0))); \
- __asm__ ("sqshrn2 %0.4s, %1.2d, #%2" \
- : "+w"(result) \
- : "w"(b_), "i"(c) \
- : /* No clobbers */); \
- result; \
- })
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vqshrn_high_n_s64 (int32x2_t __a, int64x2_t __b, const int __c)
+{
+ return __builtin_aarch64_sqshrn2_nv2di (__a, __b, __c);
+}
-#define vqshrn_high_n_u16(a, b, c) \
- __extension__ \
- ({ \
- uint16x8_t b_ = (b); \
- uint8x8_t a_ = (a); \
- uint8x16_t result = vcombine_u8 \
- (a_, vcreate_u8 \
- (__AARCH64_UINT64_C (0x0))); \
- __asm__ ("uqshrn2 %0.16b, %1.8h, #%2" \
- : "+w"(result) \
- : "w"(b_), "i"(c) \
- : /* No clobbers */); \
- result; \
- })
+__extension__ extern __inline uint8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vqshrn_high_n_u16 (uint8x8_t __a, uint16x8_t __b, const int __c)
+{
+ return __builtin_aarch64_uqshrn2_nv8hi_uuus (__a, __b, __c);
+}
-#define vqshrn_high_n_u32(a, b, c) \
- __extension__ \
- ({ \
- uint32x4_t b_ = (b); \
- uint16x4_t a_ = (a); \
- uint16x8_t result = vcombine_u16 \
- (a_, vcreate_u16 \
- (__AARCH64_UINT64_C (0x0))); \
- __asm__ ("uqshrn2 %0.8h, %1.4s, #%2" \
- : "+w"(result) \
- : "w"(b_), "i"(c) \
- : /* No clobbers */); \
- result; \
- })
+__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vqshrn_high_n_u32 (uint16x4_t __a, uint32x4_t __b, const int __c)
+{
+ return __builtin_aarch64_uqshrn2_nv4si_uuus (__a, __b, __c);
+}
-#define vqshrn_high_n_u64(a, b, c) \
- __extension__ \
- ({ \
- uint64x2_t b_ = (b); \
- uint32x2_t a_ = (a); \
- uint32x4_t result = vcombine_u32 \
- (a_, vcreate_u32 \
- (__AARCH64_UINT64_C (0x0))); \
- __asm__ ("uqshrn2 %0.4s, %1.2d, #%2" \
- : "+w"(result) \
- : "w"(b_), "i"(c) \
- : /* No clobbers */); \
- result; \
- })
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vqshrn_high_n_u64 (uint32x2_t __a, uint64x2_t __b, const int __c)
+{
+ return __builtin_aarch64_uqshrn2_nv2di_uuus (__a, __b, __c);
+}
-#define vqshrun_high_n_s16(a, b, c) \
- __extension__ \
- ({ \
- int16x8_t b_ = (b); \
- uint8x8_t a_ = (a); \
- uint8x16_t result = vcombine_u8 \
- (a_, vcreate_u8 \
- (__AARCH64_UINT64_C (0x0))); \
- __asm__ ("sqshrun2 %0.16b, %1.8h, #%2" \
- : "+w"(result) \
- : "w"(b_), "i"(c) \
- : /* No clobbers */); \
- result; \
- })
+__extension__ extern __inline uint8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vqshrun_high_n_s16 (uint8x8_t __a, int16x8_t __b, const int __c)
+{
+ return __builtin_aarch64_sqshrun2_nv8hi_uuss (__a, __b, __c);
+}
-#define vqshrun_high_n_s32(a, b, c) \
- __extension__ \
- ({ \
- int32x4_t b_ = (b); \
- uint16x4_t a_ = (a); \
- uint16x8_t result = vcombine_u16 \
- (a_, vcreate_u16 \
- (__AARCH64_UINT64_C (0x0))); \
- __asm__ ("sqshrun2 %0.8h, %1.4s, #%2" \
- : "+w"(result) \
- : "w"(b_), "i"(c) \
- : /* No clobbers */); \
- result; \
- })
+__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vqshrun_high_n_s32 (uint16x4_t __a, int32x4_t __b, const int __c)
+{
+ return __builtin_aarch64_sqshrun2_nv4si_uuss (__a, __b, __c);
+}
-#define vqshrun_high_n_s64(a, b, c) \
- __extension__ \
- ({ \
- int64x2_t b_ = (b); \
- uint32x2_t a_ = (a); \
- uint32x4_t result = vcombine_u32 \
- (a_, vcreate_u32 \
- (__AARCH64_UINT64_C (0x0))); \
- __asm__ ("sqshrun2 %0.4s, %1.2d, #%2" \
- : "+w"(result) \
- : "w"(b_), "i"(c) \
- : /* No clobbers */); \
- result; \
- })
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vqshrun_high_n_s64 (uint32x2_t __a, int64x2_t __b, const int __c)
+{
+ return __builtin_aarch64_sqshrun2_nv2di_uuss (__a, __b, __c);
+}
#define vrshrn_high_n_s16(a, b, c) \
__extension__ \
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vqrshrn_high_n.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vqrshrn_high_n.c
new file mode 100644
index 00000000000..22bad2a22eb
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vqrshrn_high_n.c
@@ -0,0 +1,219 @@
+#include <arm_neon.h>
+#include "arm-neon-ref.h"
+#include "compute-ref-data.h"
+
+/* Expected values of cumulative_saturation flag. */
+int VECT_VAR(expected_cumulative_sat,int,16,8) = 0;
+int VECT_VAR(expected_cumulative_sat,int,32,4) = 0;
+int VECT_VAR(expected_cumulative_sat,int,64,2) = 0;
+int VECT_VAR(expected_cumulative_sat,uint,16,8) = 1;
+int VECT_VAR(expected_cumulative_sat,uint,32,4) = 1;
+int VECT_VAR(expected_cumulative_sat,uint,64,2) = 1;
+
+/* Expected results. */
+VECT_VAR_DECL(expected,int,8,16) [] = { 0xf0, 0xf1, 0xf2, 0xf3,
+ 0xf4, 0xf5, 0xf6, 0xf7,
+ 0xf8, 0xf9, 0xf9, 0xfa,
+ 0xfa, 0xfb, 0xfb, 0xfc };
+VECT_VAR_DECL(expected,int,16,8) [] = { 0xfff0, 0xfff1, 0xfff2, 0xfff3,
+ 0xfff8, 0xfff9, 0xfff9, 0xfffa };
+VECT_VAR_DECL(expected,int,32,4) [] = { 0xfffffff0, 0xfffffff1,
+ 0xfffffffc, 0xfffffffc };
+VECT_VAR_DECL(expected,uint,8,16) [] = { 0xf0, 0xf1, 0xf2, 0xf3,
+ 0xf4, 0xf5, 0xf6, 0xf7,
+ 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff };
+VECT_VAR_DECL(expected,uint,16,8) [] = { 0xfff0, 0xfff1, 0xfff2, 0xfff3,
+ 0xffff, 0xffff, 0xffff, 0xffff };
+VECT_VAR_DECL(expected,uint,32,4) [] = { 0xfffffff0, 0xfffffff1,
+ 0xffffffff, 0xffffffff };
+
+/* Expected values of cumulative_saturation flag with shift by 3. */
+int VECT_VAR(expected_cumulative_sat_sh3,int,16,8) = 1;
+int VECT_VAR(expected_cumulative_sat_sh3,int,32,4) = 1;
+int VECT_VAR(expected_cumulative_sat_sh3,int,64,2) = 1;
+int VECT_VAR(expected_cumulative_sat_sh3,uint,16,8) = 1;
+int VECT_VAR(expected_cumulative_sat_sh3,uint,32,4) = 1;
+int VECT_VAR(expected_cumulative_sat_sh3,uint,64,2) = 1;
+
+/* Expected results with shift by 3. */
+VECT_VAR_DECL(expected_sh3,int,8,16) [] = { 0x7f, 0x7f, 0x7f, 0x7f,
+ 0x7f, 0x7f, 0x7f, 0x7f,
+ 0x7f, 0x7f, 0x7f, 0x7f,
+ 0x7f, 0x7f, 0x7f, 0x7f };
+VECT_VAR_DECL(expected_sh3,int,16,8) [] = { 0x7fff, 0x7fff, 0x7fff, 0x7fff,
+ 0x7fff, 0x7fff, 0x7fff, 0x7fff };
+VECT_VAR_DECL(expected_sh3,int,32,4) [] = { 0x7fffffff, 0x7fffffff,
+ 0x7fffffff, 0x7fffffff };
+VECT_VAR_DECL(expected_sh3,uint,8,16) [] = { 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff };
+VECT_VAR_DECL(expected_sh3,uint,16,8) [] = { 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff };
+VECT_VAR_DECL(expected_sh3,uint,32,4) [] = { 0xffffffff, 0xffffffff,
+ 0xffffffff, 0xffffffff };
+
+/* Expected values of cumulative_saturation flag with shift by max
+ amount. */
+int VECT_VAR(expected_cumulative_sat_shmax,int,16,8) = 1;
+int VECT_VAR(expected_cumulative_sat_shmax,int,32,4) = 1;
+int VECT_VAR(expected_cumulative_sat_shmax,int,64,2) = 1;
+int VECT_VAR(expected_cumulative_sat_shmax,uint,16,8) = 1;
+int VECT_VAR(expected_cumulative_sat_shmax,uint,32,4) = 1;
+int VECT_VAR(expected_cumulative_sat_shmax,uint,64,2) = 1;
+
+/* Expected results with shift by max amount. */
+VECT_VAR_DECL(expected_shmax,int,8,16) [] = { 0x7f, 0x7f, 0x7f, 0x7f,
+ 0x7f, 0x7f, 0x7f, 0x7f,
+ 0x7f, 0x7f, 0x7f, 0x7f,
+ 0x7f, 0x7f, 0x7f, 0x7f };
+VECT_VAR_DECL(expected_shmax,int,16,8) [] = { 0x7fff, 0x7fff, 0x7fff, 0x7fff,
+ 0x7fff, 0x7fff, 0x7fff, 0x7fff };
+VECT_VAR_DECL(expected_shmax,int,32,4) [] = { 0x7fffffff, 0x7fffffff,
+ 0x7fffffff, 0x7fffffff };
+VECT_VAR_DECL(expected_shmax,uint,8,16) [] = { 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff };
+VECT_VAR_DECL(expected_shmax,uint,16,8) [] = { 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff };
+VECT_VAR_DECL(expected_shmax,uint,32,4) [] = { 0xffffffff, 0xffffffff,
+ 0xffffffff, 0xffffffff };
+
+#define INSN vqrshrn_high_n
+#define TEST_MSG "VQRSHRN_HIGH_N"
+
+#define FNNAME1(NAME) void exec_ ## NAME (void)
+#define FNNAME(NAME) FNNAME1(NAME)
+
+FNNAME (INSN)
+{
+ /* Basic test: y=vqrshrn_high_n(x1,x2,v), then store the result. */
+#define TEST_VQRSHRN_HIGH_N2(INSN, T1, T2, W, W2, N, N2, V, EXPECTED_CUMULATIVE_SAT, CMT) \
+ Set_Neon_Cumulative_Sat(0, VECT_VAR(vector_res, T1, W2, N2)); \
+ VECT_VAR(vector_res, T1, W2, N2) = \
+ INSN##_##T2##W(VECT_VAR(vector1, T1, W2, N), \
+ VECT_VAR(vector2, T1, W, N), V); \
+ vst1q_##T2##W2(VECT_VAR(result, T1, W2, N2), \
+ VECT_VAR(vector_res, T1, W2, N2)); \
+ CHECK_CUMULATIVE_SAT(TEST_MSG, T1, W, N, EXPECTED_CUMULATIVE_SAT, CMT)
+
+ /* Two auxiliary macros are necessary to expand INSN.  */
+#define TEST_VQRSHRN_HIGH_N1(INSN, T1, T2, W, W2, N, N2, V, EXPECTED_CUMULATIVE_SAT, CMT) \
+ TEST_VQRSHRN_HIGH_N2(INSN, T1, T2, W, W2, N, N2, V, EXPECTED_CUMULATIVE_SAT, CMT)
+
+#define TEST_VQRSHRN_HIGH_N(T1, T2, W, W2, N, N2, V, EXPECTED_CUMULATIVE_SAT, CMT) \
+ TEST_VQRSHRN_HIGH_N1(INSN, T1, T2, W, W2, N, N2, V, EXPECTED_CUMULATIVE_SAT, CMT)
+
+
+ DECL_VARIABLE(vector1, int, 8, 8);
+ DECL_VARIABLE(vector1, int, 16, 4);
+ DECL_VARIABLE(vector1, int, 32, 2);
+ DECL_VARIABLE(vector1, uint, 8, 8);
+ DECL_VARIABLE(vector1, uint, 16, 4);
+ DECL_VARIABLE(vector1, uint, 32, 2);
+
+ /* vector is twice as large as vector_res. */
+ DECL_VARIABLE(vector2, int, 16, 8);
+ DECL_VARIABLE(vector2, int, 32, 4);
+ DECL_VARIABLE(vector2, int, 64, 2);
+ DECL_VARIABLE(vector2, uint, 16, 8);
+ DECL_VARIABLE(vector2, uint, 32, 4);
+ DECL_VARIABLE(vector2, uint, 64, 2);
+
+ DECL_VARIABLE(vector_res, int, 8, 16);
+ DECL_VARIABLE(vector_res, int, 16, 8);
+ DECL_VARIABLE(vector_res, int, 32, 4);
+ DECL_VARIABLE(vector_res, uint, 8, 16);
+ DECL_VARIABLE(vector_res, uint, 16, 8);
+ DECL_VARIABLE(vector_res, uint, 32, 4);
+
+ clean_results ();
+
+ VLOAD(vector1, buffer, , int, s, 8, 8);
+ VLOAD(vector1, buffer, , int, s, 16, 4);
+ VLOAD(vector1, buffer, , int, s, 32, 2);
+ VLOAD(vector1, buffer, , uint, u, 8, 8);
+ VLOAD(vector1, buffer, , uint, u, 16, 4);
+ VLOAD(vector1, buffer, , uint, u, 32, 2);
+
+ VLOAD(vector2, buffer, q, int, s, 16, 8);
+ VLOAD(vector2, buffer, q, int, s, 32, 4);
+ VLOAD(vector2, buffer, q, int, s, 64, 2);
+ VLOAD(vector2, buffer, q, uint, u, 16, 8);
+ VLOAD(vector2, buffer, q, uint, u, 32, 4);
+ VLOAD(vector2, buffer, q, uint, u, 64, 2);
+
+ /* Choose shift amount arbitrarily. */
+#define CMT ""
+ TEST_VQRSHRN_HIGH_N(int, s, 16, 8, 8, 16, 1, expected_cumulative_sat, CMT);
+ TEST_VQRSHRN_HIGH_N(int, s, 32, 16, 4, 8, 1, expected_cumulative_sat, CMT);
+ TEST_VQRSHRN_HIGH_N(int, s, 64, 32, 2, 4, 2, expected_cumulative_sat, CMT);
+ TEST_VQRSHRN_HIGH_N(uint, u, 16, 8, 8, 16, 2, expected_cumulative_sat, CMT);
+ TEST_VQRSHRN_HIGH_N(uint, u, 32, 16, 4, 8, 3, expected_cumulative_sat, CMT);
+ TEST_VQRSHRN_HIGH_N(uint, u, 64, 32, 2, 4, 3, expected_cumulative_sat, CMT);
+
+ CHECK(TEST_MSG, int, 8, 16, PRIx8, expected, CMT);
+ CHECK(TEST_MSG, int, 16, 8, PRIx16, expected, CMT);
+ CHECK(TEST_MSG, int, 32, 4, PRIx32, expected, CMT);
+ CHECK(TEST_MSG, uint, 8, 16, PRIx8, expected, CMT);
+ CHECK(TEST_MSG, uint, 16, 8, PRIx16, expected, CMT);
+ CHECK(TEST_MSG, uint, 32, 4, PRIx32, expected, CMT);
+
+
+ /* Another set of tests, shifting max value by 3. */
+ VDUP(vector1, , int, s, 8, 8, 0x7F);
+ VDUP(vector1, , int, s, 16, 4, 0x7FFF);
+ VDUP(vector1, , int, s, 32, 2, 0x7FFFFFFFLL);
+ VDUP(vector1, , uint, u, 8, 8, 0xFF);
+ VDUP(vector1, , uint, u, 16, 4, 0xFFFF);
+ VDUP(vector1, , uint, u, 32, 2, 0xFFFFFFFFULL);
+
+ VDUP(vector2, q, int, s, 16, 8, 0x7FFF);
+ VDUP(vector2, q, int, s, 32, 4, 0x7FFFFFFF);
+ VDUP(vector2, q, int, s, 64, 2, 0x7FFFFFFFFFFFFFFFLL);
+ VDUP(vector2, q, uint, u, 16, 8, 0xFFFF);
+ VDUP(vector2, q, uint, u, 32, 4, 0xFFFFFFFF);
+ VDUP(vector2, q, uint, u, 64, 2, 0xFFFFFFFFFFFFFFFFULL);
+
+#undef CMT
+#define CMT " (check saturation: shift by 3)"
+ TEST_VQRSHRN_HIGH_N(int, s, 16, 8, 8, 16, 3, expected_cumulative_sat_sh3, CMT);
+ TEST_VQRSHRN_HIGH_N(int, s, 32, 16, 4, 8, 3, expected_cumulative_sat_sh3, CMT);
+ TEST_VQRSHRN_HIGH_N(int, s, 64, 32, 2, 4, 3, expected_cumulative_sat_sh3, CMT);
+ TEST_VQRSHRN_HIGH_N(uint, u, 16, 8, 8, 16, 3, expected_cumulative_sat_sh3, CMT);
+ TEST_VQRSHRN_HIGH_N(uint, u, 32, 16, 4, 8, 3, expected_cumulative_sat_sh3, CMT);
+ TEST_VQRSHRN_HIGH_N(uint, u, 64, 32, 2, 4, 3, expected_cumulative_sat_sh3, CMT);
+
+ CHECK(TEST_MSG, int, 8, 16, PRIx8, expected_sh3, CMT);
+ CHECK(TEST_MSG, int, 16, 8, PRIx16, expected_sh3, CMT);
+ CHECK(TEST_MSG, int, 32, 4, PRIx32, expected_sh3, CMT);
+ CHECK(TEST_MSG, uint, 8, 16, PRIx8, expected_sh3, CMT);
+ CHECK(TEST_MSG, uint, 16, 8, PRIx16, expected_sh3, CMT);
+ CHECK(TEST_MSG, uint, 32, 4, PRIx32, expected_sh3, CMT);
+
+
+ /* Shift by max amount. */
+#undef CMT
+#define CMT " (check saturation: shift by max)"
+ TEST_VQRSHRN_HIGH_N(int, s, 16, 8, 8, 16, 8, expected_cumulative_sat_shmax, CMT);
+ TEST_VQRSHRN_HIGH_N(int, s, 32, 16, 4, 8, 16, expected_cumulative_sat_shmax, CMT);
+ TEST_VQRSHRN_HIGH_N(int, s, 64, 32, 2, 4, 32, expected_cumulative_sat_shmax, CMT);
+ TEST_VQRSHRN_HIGH_N(uint, u, 16, 8, 8, 16, 8, expected_cumulative_sat_shmax, CMT);
+ TEST_VQRSHRN_HIGH_N(uint, u, 32, 16, 4, 8, 16, expected_cumulative_sat_shmax, CMT);
+ TEST_VQRSHRN_HIGH_N(uint, u, 64, 32, 2, 4, 32, expected_cumulative_sat_shmax, CMT);
+
+ CHECK(TEST_MSG, int, 8, 16, PRIx8, expected_shmax, CMT);
+ CHECK(TEST_MSG, int, 16, 8, PRIx16, expected_shmax, CMT);
+ CHECK(TEST_MSG, int, 32, 4, PRIx32, expected_shmax, CMT);
+ CHECK(TEST_MSG, uint, 8, 16, PRIx8, expected_shmax, CMT);
+ CHECK(TEST_MSG, uint, 16, 8, PRIx16, expected_shmax, CMT);
+ CHECK(TEST_MSG, uint, 32, 4, PRIx32, expected_shmax, CMT);
+}
+
+int main (void)
+{
+ exec_vqrshrn_high_n ();
+ return 0;
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vqrshrun_high_n.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vqrshrun_high_n.c
new file mode 100644
index 00000000000..983d060d67d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vqrshrun_high_n.c
@@ -0,0 +1,225 @@
+#include <arm_neon.h>
+#include "arm-neon-ref.h"
+#include "compute-ref-data.h"
+
+/* Expected values of cumulative_saturation flag with negative input. */
+int VECT_VAR(expected_cumulative_sat_neg,int,16,8) = 0;
+int VECT_VAR(expected_cumulative_sat_neg,int,32,4) = 0;
+int VECT_VAR(expected_cumulative_sat_neg,int,64,2) = 1;
+
+/* Expected results with negative input. */
+VECT_VAR_DECL(expected_neg,uint,8,16) [] = { 0xfe, 0xfe, 0xfe, 0xfe,
+ 0xfe, 0xfe, 0xfe, 0xfe,
+ 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x0 };
+VECT_VAR_DECL(expected_neg,uint,16,8) [] = { 0xfffd, 0xfffd, 0xfffd, 0xfffd,
+ 0x0, 0x0, 0x0, 0x0 };
+VECT_VAR_DECL(expected_neg,uint,32,4) [] = { 0xfffffffc, 0xfffffffc, 0x0, 0x0 };
+
+/* Expected values of cumulative_saturation flag with max input value
+ shifted by 1. */
+int VECT_VAR(expected_cumulative_sat_max_sh1,int,16,8) = 1;
+int VECT_VAR(expected_cumulative_sat_max_sh1,int,32,4) = 1;
+int VECT_VAR(expected_cumulative_sat_max_sh1,int,64,2) = 1;
+
+/* Expected results with max input value shifted by 1. */
+VECT_VAR_DECL(expected_max_sh1,uint,8,16) [] = { 0x7f, 0x7f, 0x7f, 0x7f,
+ 0x7f, 0x7f, 0x7f, 0x7f,
+ 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff };
+VECT_VAR_DECL(expected_max_sh1,uint,16,8) [] = { 0x7fff, 0x7fff, 0x7fff, 0x7fff,
+ 0xffff, 0xffff, 0xffff, 0xffff };
+VECT_VAR_DECL(expected_max_sh1,uint,32,4) [] = { 0x7fffffff, 0x7fffffff,
+ 0xffffffff, 0xffffffff };
+
+/* Expected values of cumulative_saturation flag with max input value
+ shifted by max amount. */
+int VECT_VAR(expected_cumulative_sat_max_shmax,int,16,8) = 0;
+int VECT_VAR(expected_cumulative_sat_max_shmax,int,32,4) = 0;
+int VECT_VAR(expected_cumulative_sat_max_shmax,int,64,2) = 0;
+
+/* Expected results with max input value shifted by max amount. */
+VECT_VAR_DECL(expected_max_shmax,uint,8,16) [] = { 0x7f, 0x7f, 0x7f, 0x7f,
+ 0x7f, 0x7f, 0x7f, 0x7f,
+ 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80 };
+VECT_VAR_DECL(expected_max_shmax,uint,16,8) [] = { 0x7fff, 0x7fff, 0x7fff, 0x7fff,
+ 0x8000, 0x8000, 0x8000, 0x8000 };
+VECT_VAR_DECL(expected_max_shmax,uint,32,4) [] = { 0x7fffffff, 0x7fffffff,
+ 0x80000000, 0x80000000 };
+
+/* Expected values of cumulative_saturation flag with min input value
+ shifted by max amount. */
+int VECT_VAR(expected_cumulative_sat_min_shmax,int,16,8) = 1;
+int VECT_VAR(expected_cumulative_sat_min_shmax,int,32,4) = 1;
+int VECT_VAR(expected_cumulative_sat_min_shmax,int,64,2) = 1;
+
+/* Expected results with min input value shifted by max amount. */
+VECT_VAR_DECL(expected_min_shmax,uint,8,16) [] = { 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80,
+ 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x0 };
+VECT_VAR_DECL(expected_min_shmax,uint,16,8) [] = { 0x8000, 0x8000, 0x8000, 0x8000,
+ 0x0, 0x0, 0x0, 0x0 };
+VECT_VAR_DECL(expected_min_shmax,uint,32,4) [] = { 0x80000000, 0x80000000,
+ 0x0, 0x0 };
+
+/* Expected values of cumulative_saturation flag with inputs in usual
+ range. */
+int VECT_VAR(expected_cumulative_sat,int,16,8) = 0;
+int VECT_VAR(expected_cumulative_sat,int,32,4) = 1;
+int VECT_VAR(expected_cumulative_sat,int,64,2) = 0;
+
+/* Expected results with inputs in usual range. */
+VECT_VAR_DECL(expected,uint,8,16) [] = { 0x12, 0x12, 0x12, 0x12,
+ 0x12, 0x12, 0x12, 0x12,
+ 0x49, 0x49, 0x49, 0x49,
+ 0x49, 0x49, 0x49, 0x49 };
+VECT_VAR_DECL(expected,uint,16,8) [] = { 0x4321, 0x4321, 0x4321, 0x4321,
+ 0x0, 0x0, 0x0, 0x0 };
+VECT_VAR_DECL(expected,uint,32,4) [] = { 0xdeadbeef, 0xdeadbeef,
+ 0xdeadbf, 0xdeadbf };
+
+#define INSN vqrshrun_high_n
+#define TEST_MSG "VQRSHRUN_HIGH_N"
+
+#define FNNAME1(NAME) void exec_ ## NAME (void)
+#define FNNAME(NAME) FNNAME1(NAME)
+
+FNNAME (INSN)
+{
+ /* Basic test: y=vqrshrun_high_n(x1,x2,v), then store the result. */
+#define TEST_VQRSHRUN_HIGH_N2(INSN, T1, T2, W, W2, N, N2, V, EXPECTED_CUMULATIVE_SAT, CMT) \
+ Set_Neon_Cumulative_Sat(0, VECT_VAR(vector_res, uint, W2, N2)); \
+ VECT_VAR(vector_res, uint, W2, N2) = \
+ INSN##_##T2##W(VECT_VAR(vector1, uint, W2, N), \
+ VECT_VAR(vector2, T1, W, N), V); \
+ vst1q_u##W2(VECT_VAR(result, uint, W2, N2), \
+ VECT_VAR(vector_res, uint, W2, N2)); \
+ CHECK_CUMULATIVE_SAT(TEST_MSG, T1, W, N, EXPECTED_CUMULATIVE_SAT, CMT)
+
+ /* Two auxiliary macros are necessary to expand INSN.  */
+#define TEST_VQRSHRUN_HIGH_N1(INSN, T1, T2, W, W2, N, N2, V, EXPECTED_CUMULATIVE_SAT, CMT) \
+ TEST_VQRSHRUN_HIGH_N2(INSN, T1, T2, W, W2, N, N2, V, EXPECTED_CUMULATIVE_SAT, CMT)
+
+#define TEST_VQRSHRUN_HIGH_N(T1, T2, W, W2, N, N2, V, EXPECTED_CUMULATIVE_SAT, CMT) \
+ TEST_VQRSHRUN_HIGH_N1(INSN, T1, T2, W, W2, N, N2, V, EXPECTED_CUMULATIVE_SAT, CMT)
+
+
+ DECL_VARIABLE(vector1, uint, 8, 8);
+ DECL_VARIABLE(vector1, uint, 16, 4);
+ DECL_VARIABLE(vector1, uint, 32, 2);
+
+ /* vector is twice as large as vector_res. */
+ DECL_VARIABLE(vector2, int, 16, 8);
+ DECL_VARIABLE(vector2, int, 32, 4);
+ DECL_VARIABLE(vector2, int, 64, 2);
+
+ DECL_VARIABLE(vector_res, uint, 8, 16);
+ DECL_VARIABLE(vector_res, uint, 16, 8);
+ DECL_VARIABLE(vector_res, uint, 32, 4);
+
+ clean_results ();
+
+ /* Fill input vector with negative values, to check saturation on
+ limits. */
+ VDUP(vector1, , uint, u, 8, 8, -2);
+ VDUP(vector1, , uint, u, 16, 4, -3);
+ VDUP(vector1, , uint, u, 32, 2, -4);
+
+ VDUP(vector2, q, int, s, 16, 8, -2);
+ VDUP(vector2, q, int, s, 32, 4, -3);
+ VDUP(vector2, q, int, s, 64, 2, -4);
+
+ /* Choose shift amount arbitrarily. */
+#define CMT " (negative input)"
+ TEST_VQRSHRUN_HIGH_N(int, s, 16, 8, 8, 16, 3, expected_cumulative_sat_neg, CMT);
+ TEST_VQRSHRUN_HIGH_N(int, s, 32, 16, 4, 8, 4, expected_cumulative_sat_neg, CMT);
+ TEST_VQRSHRUN_HIGH_N(int, s, 64, 32, 2, 4, 2, expected_cumulative_sat_neg, CMT);
+
+ CHECK(TEST_MSG, uint, 8, 16, PRIx8, expected_neg, CMT);
+ CHECK(TEST_MSG, uint, 16, 8, PRIx16, expected_neg, CMT);
+ CHECK(TEST_MSG, uint, 32, 4, PRIx32, expected_neg, CMT);
+
+
+ /* Fill input vector with max value, to check saturation on
+ limits. */
+ VDUP(vector1, , uint, u, 8, 8, 0x7F);
+ VDUP(vector1, , uint, u, 16, 4, 0x7FFF);
+ VDUP(vector1, , uint, u, 32, 2, 0x7FFFFFFFLL);
+
+ VDUP(vector2, q, int, s, 16, 8, 0x7FFF);
+ VDUP(vector2, q, int, s, 32, 4, 0x7FFFFFFF);
+ VDUP(vector2, q, int, s, 64, 2, 0x7FFFFFFFFFFFFFFFLL);
+
+ /* shift by 1. */
+#undef CMT
+#define CMT " (check cumulative saturation: shift by 1)"
+ TEST_VQRSHRUN_HIGH_N(int, s, 16, 8, 8, 16, 1, expected_cumulative_sat_max_sh1, CMT);
+ TEST_VQRSHRUN_HIGH_N(int, s, 32, 16, 4, 8, 1, expected_cumulative_sat_max_sh1, CMT);
+ TEST_VQRSHRUN_HIGH_N(int, s, 64, 32, 2, 4, 1, expected_cumulative_sat_max_sh1, CMT);
+
+ CHECK(TEST_MSG, uint, 8, 16, PRIx8, expected_max_sh1, CMT);
+ CHECK(TEST_MSG, uint, 16, 8, PRIx16, expected_max_sh1, CMT);
+ CHECK(TEST_MSG, uint, 32, 4, PRIx32, expected_max_sh1, CMT);
+
+
+ /* shift by max. */
+#undef CMT
+#define CMT " (check cumulative saturation: shift by max, positive input)"
+ TEST_VQRSHRUN_HIGH_N(int, s, 16, 8, 8, 16, 8, expected_cumulative_sat_max_shmax, CMT);
+ TEST_VQRSHRUN_HIGH_N(int, s, 32, 16, 4, 8, 16, expected_cumulative_sat_max_shmax, CMT);
+ TEST_VQRSHRUN_HIGH_N(int, s, 64, 32, 2, 4, 32, expected_cumulative_sat_max_shmax, CMT);
+
+ CHECK(TEST_MSG, uint, 8, 16, PRIx8, expected_max_shmax, CMT);
+ CHECK(TEST_MSG, uint, 16, 8, PRIx16, expected_max_shmax, CMT);
+ CHECK(TEST_MSG, uint, 32, 4, PRIx32, expected_max_shmax, CMT);
+
+
+ /* Fill input vector with min value, to check saturation on limits. */
+ VDUP(vector1, , uint, u, 8, 8, 0x80);
+ VDUP(vector1, , uint, u, 16, 4, 0x8000);
+ VDUP(vector1, , uint, u, 32, 2, 0x80000000LL);
+
+ VDUP(vector2, q, int, s, 16, 8, 0x8000);
+ VDUP(vector2, q, int, s, 32, 4, 0x80000000);
+ VDUP(vector2, q, int, s, 64, 2, 0x8000000000000000LL);
+
+ /* shift by max. */
+#undef CMT
+#define CMT " (check cumulative saturation: shift by max, negative input)"
+ TEST_VQRSHRUN_HIGH_N(int, s, 16, 8, 8, 16, 8, expected_cumulative_sat_min_shmax, CMT);
+ TEST_VQRSHRUN_HIGH_N(int, s, 32, 16, 4, 8, 16, expected_cumulative_sat_min_shmax, CMT);
+ TEST_VQRSHRUN_HIGH_N(int, s, 64, 32, 2, 4, 32, expected_cumulative_sat_min_shmax, CMT);
+
+ CHECK(TEST_MSG, uint, 8, 16, PRIx8, expected_min_shmax, CMT);
+ CHECK(TEST_MSG, uint, 16, 8, PRIx16, expected_min_shmax, CMT);
+ CHECK(TEST_MSG, uint, 32, 4, PRIx32, expected_min_shmax, CMT);
+
+
+ /* Fill input vector with positive values, to check normal case. */
+ VDUP(vector1, , uint, u, 8, 8, 0x12);
+ VDUP(vector1, , uint, u, 16, 4, 0x4321);
+ VDUP(vector1, , uint, u, 32, 2, 0xDEADBEEF);
+
+ VDUP(vector2, q, int, s, 16, 8, 0x1234);
+ VDUP(vector2, q, int, s, 32, 4, 0x87654321);
+ VDUP(vector2, q, int, s, 64, 2, 0xDEADBEEF);
+
+ /* shift arbitrary amount. */
+#undef CMT
+#define CMT ""
+ TEST_VQRSHRUN_HIGH_N(int, s, 16, 8, 8, 16, 6, expected_cumulative_sat, CMT);
+ TEST_VQRSHRUN_HIGH_N(int, s, 32, 16, 4, 8, 7, expected_cumulative_sat, CMT);
+ TEST_VQRSHRUN_HIGH_N(int, s, 64, 32, 2, 4, 8, expected_cumulative_sat, CMT);
+
+ CHECK(TEST_MSG, uint, 8, 16, PRIx8, expected, CMT);
+ CHECK(TEST_MSG, uint, 16, 8, PRIx16, expected, CMT);
+ CHECK(TEST_MSG, uint, 32, 4, PRIx32, expected, CMT);
+}
+
+int main (void)
+{
+ exec_vqrshrun_high_n ();
+ return 0;
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vqshrn_high_n.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vqshrn_high_n.c
new file mode 100644
index 00000000000..fa9b1713bd2
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vqshrn_high_n.c
@@ -0,0 +1,218 @@
+#include <arm_neon.h>
+#include "arm-neon-ref.h"
+#include "compute-ref-data.h"
+
+/* Expected values of cumulative_saturation flag. */
+int VECT_VAR(expected_cumulative_sat,int,16,8) = 0;
+int VECT_VAR(expected_cumulative_sat,int,32,4) = 0;
+int VECT_VAR(expected_cumulative_sat,int,64,2) = 0;
+int VECT_VAR(expected_cumulative_sat,uint,16,8) = 1;
+int VECT_VAR(expected_cumulative_sat,uint,32,4) = 1;
+int VECT_VAR(expected_cumulative_sat,uint,64,2) = 1;
+
+/* Expected results. */
+VECT_VAR_DECL(expected,int,8,16) [] = { 0xf0, 0xf1, 0xf2, 0xf3,
+ 0xf4, 0xf5, 0xf6, 0xf7,
+ 0xf8, 0xf8, 0xf9, 0xf9,
+ 0xfa, 0xfa, 0xfb, 0xfb };
+VECT_VAR_DECL(expected,int,16,8) [] = { 0xfff0, 0xfff1, 0xfff2, 0xfff3,
+ 0xfff8, 0xfff8, 0xfff9, 0xfff9 };
+VECT_VAR_DECL(expected,int,32,4) [] = { 0xfffffff0, 0xfffffff1,
+ 0xfffffffc, 0xfffffffc };
+VECT_VAR_DECL(expected,uint,8,16) [] = { 0xf0, 0xf1, 0xf2, 0xf3,
+ 0xf4, 0xf5, 0xf6, 0xf7,
+ 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff };
+VECT_VAR_DECL(expected,uint,16,8) [] = { 0xfff0, 0xfff1, 0xfff2, 0xfff3,
+ 0xffff, 0xffff, 0xffff, 0xffff };
+VECT_VAR_DECL(expected,uint,32,4) [] = { 0xfffffff0, 0xfffffff1,
+ 0xffffffff, 0xffffffff };
+
+/* Expected values of cumulative_saturation flag with max input value
+ shifted by 3. */
+int VECT_VAR(expected_cumulative_sat_max_sh3,int,16,8) = 1;
+int VECT_VAR(expected_cumulative_sat_max_sh3,int,32,4) = 1;
+int VECT_VAR(expected_cumulative_sat_max_sh3,int,64,2) = 1;
+int VECT_VAR(expected_cumulative_sat_max_sh3,uint,16,8) = 1;
+int VECT_VAR(expected_cumulative_sat_max_sh3,uint,32,4) = 1;
+int VECT_VAR(expected_cumulative_sat_max_sh3,uint,64,2) = 1;
+
+/* Expected results with max input value shifted by 3. */
+VECT_VAR_DECL(expected_max_sh3,int,8,16) [] = { 0x7f, 0x7f, 0x7f, 0x7f,
+ 0x7f, 0x7f, 0x7f, 0x7f,
+ 0x7f, 0x7f, 0x7f, 0x7f,
+ 0x7f, 0x7f, 0x7f, 0x7f };
+VECT_VAR_DECL(expected_max_sh3,int,16,8) [] = { 0x7fff, 0x7fff, 0x7fff, 0x7fff,
+ 0x7fff, 0x7fff, 0x7fff, 0x7fff };
+VECT_VAR_DECL(expected_max_sh3,int,32,4) [] = { 0x7fffffff, 0x7fffffff,
+ 0x7fffffff, 0x7fffffff };
+VECT_VAR_DECL(expected_max_sh3,uint,8,16) [] = { 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff };
+VECT_VAR_DECL(expected_max_sh3,uint,16,8) [] = { 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff };
+VECT_VAR_DECL(expected_max_sh3,uint,32,4) [] = { 0xffffffff, 0xffffffff,
+ 0xffffffff, 0xffffffff };
+
+/* Expected values of cumulative_saturation flag with max input value
+ shifted by type size. */
+int VECT_VAR(expected_cumulative_sat_max_shmax,int,16,8) = 0;
+int VECT_VAR(expected_cumulative_sat_max_shmax,int,32,4) = 0;
+int VECT_VAR(expected_cumulative_sat_max_shmax,int,64,2) = 0;
+int VECT_VAR(expected_cumulative_sat_max_shmax,uint,16,8) = 0;
+int VECT_VAR(expected_cumulative_sat_max_shmax,uint,32,4) = 0;
+int VECT_VAR(expected_cumulative_sat_max_shmax,uint,64,2) = 0;
+
+/* Expected results with max input value shifted by type size. */
+VECT_VAR_DECL(expected_max_shmax,int,8,16) [] = { 0x7f, 0x7f, 0x7f, 0x7f,
+ 0x7f, 0x7f, 0x7f, 0x7f,
+ 0x7f, 0x7f, 0x7f, 0x7f,
+ 0x7f, 0x7f, 0x7f, 0x7f };
+VECT_VAR_DECL(expected_max_shmax,int,16,8) [] = { 0x7fff, 0x7fff, 0x7fff, 0x7fff,
+ 0x7fff, 0x7fff, 0x7fff, 0x7fff };
+VECT_VAR_DECL(expected_max_shmax,int,32,4) [] = { 0x7fffffff, 0x7fffffff,
+ 0x7fffffff, 0x7fffffff };
+VECT_VAR_DECL(expected_max_shmax,uint,8,16) [] = { 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff };
+VECT_VAR_DECL(expected_max_shmax,uint,16,8) [] = { 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff };
+VECT_VAR_DECL(expected_max_shmax,uint,32,4) [] = { 0xffffffff, 0xffffffff,
+ 0xffffffff, 0xffffffff };
+
+#define INSN vqshrn_high_n
+#define TEST_MSG "VQSHRN_HIGH_N"
+
+#define FNNAME1(NAME) void exec_ ## NAME (void)
+#define FNNAME(NAME) FNNAME1(NAME)
+
+FNNAME (INSN)
+{
+ /* Basic test: y=vqshrn_high_n(x1,x2,v), then store the result. */
+#define TEST_VQSHRN_HIGH_N2(INSN, T1, T2, W, W2, N, N2, V, EXPECTED_CUMULATIVE_SAT, CMT) \
+ Set_Neon_Cumulative_Sat(0, VECT_VAR(vector_res, T1, W2, N2)); \
+ VECT_VAR(vector_res, T1, W2, N2) = \
+ INSN##_##T2##W(VECT_VAR(vector1, T1, W2, N), \
+ VECT_VAR(vector2, T1, W, N), V); \
+ vst1q_##T2##W2(VECT_VAR(result, T1, W2, N2), \
+ VECT_VAR(vector_res, T1, W2, N2)); \
+ CHECK_CUMULATIVE_SAT(TEST_MSG, T1, W, N, EXPECTED_CUMULATIVE_SAT, CMT)
+
+ /* Two auxiliary macros are necessary to expand INSN.  */
+#define TEST_VQSHRN_HIGH_N1(INSN, T1, T2, W, W2, N, N2, V, EXPECTED_CUMULATIVE_SAT, CMT) \
+ TEST_VQSHRN_HIGH_N2(INSN, T1, T2, W, W2, N, N2, V, EXPECTED_CUMULATIVE_SAT, CMT)
+
+#define TEST_VQSHRN_HIGH_N(T1, T2, W, W2, N, N2, V, EXPECTED_CUMULATIVE_SAT, CMT) \
+ TEST_VQSHRN_HIGH_N1(INSN, T1, T2, W, W2, N, N2, V, EXPECTED_CUMULATIVE_SAT, CMT)
+
+
+ DECL_VARIABLE(vector1, int, 8, 8);
+ DECL_VARIABLE(vector1, int, 16, 4);
+ DECL_VARIABLE(vector1, int, 32, 2);
+ DECL_VARIABLE(vector1, uint, 8, 8);
+ DECL_VARIABLE(vector1, uint, 16, 4);
+ DECL_VARIABLE(vector1, uint, 32, 2);
+
+ /* vector is twice as large as vector_res. */
+ DECL_VARIABLE(vector2, int, 16, 8);
+ DECL_VARIABLE(vector2, int, 32, 4);
+ DECL_VARIABLE(vector2, int, 64, 2);
+ DECL_VARIABLE(vector2, uint, 16, 8);
+ DECL_VARIABLE(vector2, uint, 32, 4);
+ DECL_VARIABLE(vector2, uint, 64, 2);
+
+ DECL_VARIABLE(vector_res, int, 8, 16);
+ DECL_VARIABLE(vector_res, int, 16, 8);
+ DECL_VARIABLE(vector_res, int, 32, 4);
+ DECL_VARIABLE(vector_res, uint, 8, 16);
+ DECL_VARIABLE(vector_res, uint, 16, 8);
+ DECL_VARIABLE(vector_res, uint, 32, 4);
+
+ clean_results ();
+
+ VLOAD(vector1, buffer, , int, s, 8, 8);
+ VLOAD(vector1, buffer, , int, s, 16, 4);
+ VLOAD(vector1, buffer, , int, s, 32, 2);
+ VLOAD(vector1, buffer, , uint, u, 8, 8);
+ VLOAD(vector1, buffer, , uint, u, 16, 4);
+ VLOAD(vector1, buffer, , uint, u, 32, 2);
+
+ VLOAD(vector2, buffer, q, int, s, 16, 8);
+ VLOAD(vector2, buffer, q, int, s, 32, 4);
+ VLOAD(vector2, buffer, q, int, s, 64, 2);
+ VLOAD(vector2, buffer, q, uint, u, 16, 8);
+ VLOAD(vector2, buffer, q, uint, u, 32, 4);
+ VLOAD(vector2, buffer, q, uint, u, 64, 2);
+
+ /* Choose shift amount arbitrarily. */
+#define CMT ""
+ TEST_VQSHRN_HIGH_N(int, s, 16, 8, 8, 16, 1, expected_cumulative_sat, CMT);
+ TEST_VQSHRN_HIGH_N(int, s, 32, 16, 4, 8, 1, expected_cumulative_sat, CMT);
+ TEST_VQSHRN_HIGH_N(int, s, 64, 32, 2, 4, 2, expected_cumulative_sat, CMT);
+ TEST_VQSHRN_HIGH_N(uint, u, 16, 8, 8, 16, 2, expected_cumulative_sat, CMT);
+ TEST_VQSHRN_HIGH_N(uint, u, 32, 16, 4, 8, 3, expected_cumulative_sat, CMT);
+ TEST_VQSHRN_HIGH_N(uint, u, 64, 32, 2, 4, 3, expected_cumulative_sat, CMT);
+
+ CHECK(TEST_MSG, int, 8, 16, PRIx8, expected, CMT);
+ CHECK(TEST_MSG, int, 16, 8, PRIx16, expected, CMT);
+ CHECK(TEST_MSG, int, 32, 4, PRIx32, expected, CMT);
+ CHECK(TEST_MSG, uint, 8, 16, PRIx8, expected, CMT);
+ CHECK(TEST_MSG, uint, 16, 8, PRIx16, expected, CMT);
+ CHECK(TEST_MSG, uint, 32, 4, PRIx32, expected, CMT);
+
+ /* Use max possible value as input. */
+ VDUP(vector1, , int, s, 8, 8, 0x7F);
+ VDUP(vector1, , int, s, 16, 4, 0x7FFF);
+ VDUP(vector1, , int, s, 32, 2, 0x7FFFFFFFLL);
+ VDUP(vector1, , uint, u, 8, 8, 0xFF);
+ VDUP(vector1, , uint, u, 16, 4, 0xFFFF);
+ VDUP(vector1, , uint, u, 32, 2, 0xFFFFFFFFULL);
+
+ VDUP(vector2, q, int, s, 16, 8, 0x7FFF);
+ VDUP(vector2, q, int, s, 32, 4, 0x7FFFFFFF);
+ VDUP(vector2, q, int, s, 64, 2, 0x7FFFFFFFFFFFFFFFLL);
+ VDUP(vector2, q, uint, u, 16, 8, 0xFFFF);
+ VDUP(vector2, q, uint, u, 32, 4, 0xFFFFFFFF);
+ VDUP(vector2, q, uint, u, 64, 2, 0xFFFFFFFFFFFFFFFFULL);
+
+#undef CMT
+#define CMT " (check saturation: shift by 3)"
+ TEST_VQSHRN_HIGH_N(int, s, 16, 8, 8, 16, 3, expected_cumulative_sat_max_sh3, CMT);
+ TEST_VQSHRN_HIGH_N(int, s, 32, 16, 4, 8, 3, expected_cumulative_sat_max_sh3, CMT);
+ TEST_VQSHRN_HIGH_N(int, s, 64, 32, 2, 4, 3, expected_cumulative_sat_max_sh3, CMT);
+ TEST_VQSHRN_HIGH_N(uint, u, 16, 8, 8, 16, 3, expected_cumulative_sat_max_sh3, CMT);
+ TEST_VQSHRN_HIGH_N(uint, u, 32, 16, 4, 8, 3, expected_cumulative_sat_max_sh3, CMT);
+ TEST_VQSHRN_HIGH_N(uint, u, 64, 32, 2, 4, 3, expected_cumulative_sat_max_sh3, CMT);
+
+ CHECK(TEST_MSG, int, 8, 16, PRIx8, expected_max_sh3, CMT);
+ CHECK(TEST_MSG, int, 16, 8, PRIx16, expected_max_sh3, CMT);
+ CHECK(TEST_MSG, int, 32, 4, PRIx32, expected_max_sh3, CMT);
+ CHECK(TEST_MSG, uint, 8, 16, PRIx8, expected_max_sh3, CMT);
+ CHECK(TEST_MSG, uint, 16, 8, PRIx16, expected_max_sh3, CMT);
+ CHECK(TEST_MSG, uint, 32, 4, PRIx32, expected_max_sh3, CMT);
+
+
+#undef CMT
+#define CMT " (check saturation: shift by max)"
+ TEST_VQSHRN_HIGH_N(int, s, 16, 8, 8, 16, 8, expected_cumulative_sat_max_shmax, CMT);
+ TEST_VQSHRN_HIGH_N(int, s, 32, 16, 4, 8, 16, expected_cumulative_sat_max_shmax, CMT);
+ TEST_VQSHRN_HIGH_N(int, s, 64, 32, 2, 4, 32, expected_cumulative_sat_max_shmax, CMT);
+ TEST_VQSHRN_HIGH_N(uint, u, 16, 8, 8, 16, 8, expected_cumulative_sat_max_shmax, CMT);
+ TEST_VQSHRN_HIGH_N(uint, u, 32, 16, 4, 8, 16, expected_cumulative_sat_max_shmax, CMT);
+ TEST_VQSHRN_HIGH_N(uint, u, 64, 32, 2, 4, 32, expected_cumulative_sat_max_shmax, CMT);
+
+ CHECK(TEST_MSG, int, 8, 16, PRIx8, expected_max_shmax, CMT);
+ CHECK(TEST_MSG, int, 16, 8, PRIx16, expected_max_shmax, CMT);
+ CHECK(TEST_MSG, int, 32, 4, PRIx32, expected_max_shmax, CMT);
+ CHECK(TEST_MSG, uint, 8, 16, PRIx8, expected_max_shmax, CMT);
+ CHECK(TEST_MSG, uint, 16, 8, PRIx16, expected_max_shmax, CMT);
+ CHECK(TEST_MSG, uint, 32, 4, PRIx32, expected_max_shmax, CMT);
+}
+
+int main (void)
+{
+ exec_vqshrn_high_n ();
+ return 0;
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vqshrun_high_n.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vqshrun_high_n.c
new file mode 100644
index 00000000000..61ad832486a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vqshrun_high_n.c
@@ -0,0 +1,159 @@
+#include <arm_neon.h>
+#include "arm-neon-ref.h"
+#include "compute-ref-data.h"
+
+/* Expected values of cumulative_saturation flag with negative input. */
+int VECT_VAR(expected_cumulative_sat_neg,int,16,8) = 1;
+int VECT_VAR(expected_cumulative_sat_neg,int,32,4) = 1;
+int VECT_VAR(expected_cumulative_sat_neg,int,64,2) = 1;
+
+/* Expected results with negative input. */
+VECT_VAR_DECL(expected_neg,uint,8,16) [] = { 0xfe, 0xfe, 0xfe, 0xfe,
+ 0xfe, 0xfe, 0xfe, 0xfe,
+ 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x0 };
+VECT_VAR_DECL(expected_neg,uint,16,8) [] = { 0xfffd, 0xfffd, 0xfffd, 0xfffd,
+ 0x0, 0x0, 0x0, 0x0 };
+VECT_VAR_DECL(expected_neg,uint,32,4) [] = { 0xfffffffc, 0xfffffffc,
+ 0x0, 0x0 };
+
+/* Expected values of cumulative_saturation flag with max input value
+ shifted by 1. */
+int VECT_VAR(expected_cumulative_sat_max_sh1,int,16,8) = 1;
+int VECT_VAR(expected_cumulative_sat_max_sh1,int,32,4) = 1;
+int VECT_VAR(expected_cumulative_sat_max_sh1,int,64,2) = 1;
+
+/* Expected results with max input value shifted by 1. */
+VECT_VAR_DECL(expected_max_sh1,uint,8,16) [] = { 0x7f, 0x7f, 0x7f, 0x7f,
+ 0x7f, 0x7f, 0x7f, 0x7f,
+ 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff };
+VECT_VAR_DECL(expected_max_sh1,uint,16,8) [] = { 0x7fff, 0x7fff, 0x7fff, 0x7fff,
+ 0xffff, 0xffff, 0xffff, 0xffff };
+VECT_VAR_DECL(expected_max_sh1,uint,32,4) [] = { 0x7fffffff, 0x7fffffff,
+ 0xffffffff, 0xffffffff };
+
+/* Expected values of cumulative_saturation flag. */
+int VECT_VAR(expected_cumulative_sat,int,16,8) = 0;
+int VECT_VAR(expected_cumulative_sat,int,32,4) = 1;
+int VECT_VAR(expected_cumulative_sat,int,64,2) = 0;
+
+/* Expected results. */
+VECT_VAR_DECL(expected,uint,8,16) [] = { 0x12, 0x12, 0x12, 0x12,
+ 0x12, 0x12, 0x12, 0x12,
+ 0x48, 0x48, 0x48, 0x48,
+ 0x48, 0x48, 0x48, 0x48 };
+VECT_VAR_DECL(expected,uint,16,8) [] = { 0x4321, 0x4321, 0x4321, 0x4321,
+ 0x0, 0x0, 0x0, 0x0 };
+VECT_VAR_DECL(expected,uint,32,4) [] = { 0xdeadbeef, 0xdeadbeef,
+ 0xdeadbe, 0xdeadbe };
+
+
+#define INSN vqshrun_high_n
+#define TEST_MSG "VQSHRUN_HIGH_N"
+
+#define FNNAME1(NAME) void exec_ ## NAME (void)
+#define FNNAME(NAME) FNNAME1(NAME)
+
+FNNAME (INSN)
+{
+ /* Basic test: y=vqshrun_high_n(x1,x2,v), then store the result. */
+#define TEST_VQSHRUN_HIGH_N2(INSN, T1, T2, W, W2, N, N2, V, EXPECTED_CUMULATIVE_SAT, CMT) \
+ Set_Neon_Cumulative_Sat(0, VECT_VAR(vector_res, uint, W2, N2)); \
+ VECT_VAR(vector_res, uint, W2, N2) = \
+ INSN##_##T2##W(VECT_VAR(vector1, uint, W2, N), \
+ VECT_VAR(vector2, T1, W, N), V); \
+ vst1q_u##W2(VECT_VAR(result, uint, W2, N2), \
+ VECT_VAR(vector_res, uint, W2, N2)); \
+ CHECK_CUMULATIVE_SAT(TEST_MSG, T1, W, N, EXPECTED_CUMULATIVE_SAT, CMT)
+
+ /* Two auxiliary macros are necessary to expand INSN.  */
+#define TEST_VQSHRUN_HIGH_N1(INSN, T1, T2, W, W2, N, N2, V, EXPECTED_CUMULATIVE_SAT, CMT) \
+ TEST_VQSHRUN_HIGH_N2(INSN, T1, T2, W, W2, N, N2, V, EXPECTED_CUMULATIVE_SAT, CMT)
+
+#define TEST_VQSHRUN_HIGH_N(T1, T2, W, W2, N, N2, V, EXPECTED_CUMULATIVE_SAT, CMT) \
+ TEST_VQSHRUN_HIGH_N1(INSN, T1, T2, W, W2, N, N2, V, EXPECTED_CUMULATIVE_SAT, CMT)
+
+
+ DECL_VARIABLE(vector1, uint, 8, 8);
+ DECL_VARIABLE(vector1, uint, 16, 4);
+ DECL_VARIABLE(vector1, uint, 32, 2);
+
+ /* vector is twice as large as vector_res. */
+ DECL_VARIABLE(vector2, int, 16, 8);
+ DECL_VARIABLE(vector2, int, 32, 4);
+ DECL_VARIABLE(vector2, int, 64, 2);
+
+ DECL_VARIABLE(vector_res, uint, 8, 16);
+ DECL_VARIABLE(vector_res, uint, 16, 8);
+ DECL_VARIABLE(vector_res, uint, 32, 4);
+
+ clean_results ();
+
+ /* Fill input vector with negative values, to check saturation on
+ limits. */
+ VDUP(vector1, , uint, u, 8, 8, -2);
+ VDUP(vector1, , uint, u, 16, 4, -3);
+ VDUP(vector1, , uint, u, 32, 2, -4);
+
+ VDUP(vector2, q, int, s, 16, 8, -2);
+ VDUP(vector2, q, int, s, 32, 4, -3);
+ VDUP(vector2, q, int, s, 64, 2, -4);
+
+ /* Choose shift amount arbitrarily. */
+#define CMT " (negative input)"
+ TEST_VQSHRUN_HIGH_N(int, s, 16, 8, 8, 16, 3, expected_cumulative_sat_neg, CMT);
+ TEST_VQSHRUN_HIGH_N(int, s, 32, 16, 4, 8, 4, expected_cumulative_sat_neg, CMT);
+ TEST_VQSHRUN_HIGH_N(int, s, 64, 32, 2, 4, 2, expected_cumulative_sat_neg, CMT);
+
+ CHECK(TEST_MSG, uint, 8, 16, PRIx8, expected_neg, CMT);
+ CHECK(TEST_MSG, uint, 16, 8, PRIx16, expected_neg, CMT);
+ CHECK(TEST_MSG, uint, 32, 4, PRIx32, expected_neg, CMT);
+
+
+ /* Fill input vector with max value, to check saturation on
+ limits. */
+ VDUP(vector1, , uint, u, 8, 8, 0x7F);
+ VDUP(vector1, , uint, u, 16, 4, 0x7FFF);
+ VDUP(vector1, , uint, u, 32, 2, 0x7FFFFFFFLL);
+
+ VDUP(vector2, q, int, s, 16, 8, 0x7FFF);
+ VDUP(vector2, q, int, s, 32, 4, 0x7FFFFFFF);
+ VDUP(vector2, q, int, s, 64, 2, 0x7FFFFFFFFFFFFFFFLL);
+
+#undef CMT
+#define CMT " (check cumulative saturation)"
+ TEST_VQSHRUN_HIGH_N(int, s, 16, 8, 8, 16, 1, expected_cumulative_sat_max_sh1, CMT);
+ TEST_VQSHRUN_HIGH_N(int, s, 32, 16, 4, 8, 1, expected_cumulative_sat_max_sh1, CMT);
+ TEST_VQSHRUN_HIGH_N(int, s, 64, 32, 2, 4, 1, expected_cumulative_sat_max_sh1, CMT);
+
+ CHECK(TEST_MSG, uint, 8, 16, PRIx8, expected_max_sh1, CMT);
+ CHECK(TEST_MSG, uint, 16, 8, PRIx16, expected_max_sh1, CMT);
+ CHECK(TEST_MSG, uint, 32, 4, PRIx32, expected_max_sh1, CMT);
+
+
+ /* Fill input vector with positive values, to check normal case. */
+ VDUP(vector1, , uint, u, 8, 8, 0x12);
+ VDUP(vector1, , uint, u, 16, 4, 0x4321);
+ VDUP(vector1, , uint, u, 32, 2, 0xDEADBEEF);
+
+ VDUP(vector2, q, int, s, 16, 8, 0x1234);
+ VDUP(vector2, q, int, s, 32, 4, 0x87654321);
+ VDUP(vector2, q, int, s, 64, 2, 0xDEADBEEF);
+
+#undef CMT
+#define CMT ""
+ TEST_VQSHRUN_HIGH_N(int, s, 16, 8, 8, 16, 6, expected_cumulative_sat, CMT);
+ TEST_VQSHRUN_HIGH_N(int, s, 32, 16, 4, 8, 7, expected_cumulative_sat, CMT);
+ TEST_VQSHRUN_HIGH_N(int, s, 64, 32, 2, 4, 8, expected_cumulative_sat, CMT);
+
+ CHECK(TEST_MSG, uint, 8, 16, PRIx8, expected, CMT);
+ CHECK(TEST_MSG, uint, 16, 8, PRIx16, expected, CMT);
+ CHECK(TEST_MSG, uint, 32, 4, PRIx32, expected, CMT);
+}
+
+int main (void)
+{
+ exec_vqshrun_high_n ();
+ return 0;
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/narrow_high-intrinsics.c b/gcc/testsuite/gcc.target/aarch64/narrow_high-intrinsics.c
index 8b8a6302692..07d78030058 100644
--- a/gcc/testsuite/gcc.target/aarch64/narrow_high-intrinsics.c
+++ b/gcc/testsuite/gcc.target/aarch64/narrow_high-intrinsics.c
@@ -113,12 +113,12 @@ ONE (vmovn_high, uint32x4_t, uint32x2_t, uint64x2_t, u64)
/* { dg-final { scan-assembler-times "raddhn2\\tv" 6} } */
/* { dg-final { scan-assembler-times "\\trshrn2 v" 6} } */
/* { dg-final { scan-assembler-times "\\tshrn2 v" 6} } */
-/* { dg-final { scan-assembler-times "sqshrun2 v" 3} } */
-/* { dg-final { scan-assembler-times "sqrshrun2 v" 3} } */
-/* { dg-final { scan-assembler-times "sqshrn2 v" 3} } */
-/* { dg-final { scan-assembler-times "uqshrn2 v" 3} } */
-/* { dg-final { scan-assembler-times "sqrshrn2 v" 3} } */
-/* { dg-final { scan-assembler-times "uqrshrn2 v" 3} } */
+/* { dg-final { scan-assembler-times "sqshrun2\\tv" 3} } */
+/* { dg-final { scan-assembler-times "sqrshrun2\\tv" 3} } */
+/* { dg-final { scan-assembler-times "sqshrn2\\tv" 3} } */
+/* { dg-final { scan-assembler-times "uqshrn2\\tv" 3} } */
+/* { dg-final { scan-assembler-times "sqrshrn2\\tv" 3} } */
+/* { dg-final { scan-assembler-times "uqrshrn2\\tv" 3} } */
/* { dg-final { scan-assembler-times "uqxtn2 v" 3} } */
/* { dg-final { scan-assembler-times "sqxtn2 v" 3} } */
/* { dg-final { scan-assembler-times "sqxtun2 v" 3} } */