Implement vst2q, vst4q, vld2q and vld4q using the new MVE builtins
framework.

Since MVE now uses different tuple modes than Neon,
VALID_NEON_STRUCT_MODE is no longer a superset of
VALID_MVE_STRUCT_MODE, so we need to check VALID_MVE_STRUCT_MODE
explicitly, for instance in output_move_neon and
arm_print_operand_address.

In arm_hard_regno_mode_ok, the change is similar but a bit more
intrusive.

Expand the VSTRUCT iterator, so that mov<mode> and neon_mov<mode>
patterns from neon.md still work for MVE.

Besides the small updates to the patterns in mve.md, we have to update
vec_load_lanes and vec_store_lanes in vec-common.md so that the
vectorizer can handle the new modes. These patterns are now different
from Neon's, so maybe we should move them back to neon.md and mve.md.

The patch adds arm_array_mode, which is used by build_array_type_nelts
and makes it possible to support the new assert in
register_builtin_tuple_types.

gcc/ChangeLog:

        * config/arm/arm-mve-builtins-base.cc (class vst24_impl): New.
        (class vld24_impl): New.
        (vld2q, vld4q, vst2q, vst4q): New.
        * config/arm/arm-mve-builtins-base.def (vld2q, vld4q, vst2q)
        (vst4q): New.
        * config/arm/arm-mve-builtins-base.h (vld2q, vld4q, vst2q, vst4q):
        New.
        * config/arm/arm-mve-builtins.cc (register_builtin_tuple_types):
        Add more asserts.
        * config/arm/arm.cc (TARGET_ARRAY_MODE): New.
        (output_move_neon): Handle MVE struct modes.
        (arm_print_operand_address): Likewise.
        (arm_hard_regno_mode_ok): Likewise.
        (arm_array_mode): New.
        * config/arm/arm.h (VALID_MVE_STRUCT_MODE): Use the MVE tuple modes.
        * config/arm/arm_mve.h (vst4q): Delete.
        (vst2q): Delete.
        (vld2q): Delete.
        (vld4q): Delete.
        (vst4q_s8): Delete.
        (vst4q_s16): Delete.
        (vst4q_s32): Delete.
        (vst4q_u8): Delete.
        (vst4q_u16): Delete.
        (vst4q_u32): Delete.
        (vst4q_f16): Delete.
        (vst4q_f32): Delete.
        (vst2q_s8): Delete.
        (vst2q_u8): Delete.
        (vld2q_s8): Delete.
        (vld2q_u8): Delete.
        (vld4q_s8): Delete.
        (vld4q_u8): Delete.
        (vst2q_s16): Delete.
        (vst2q_u16): Delete.
        (vld2q_s16): Delete.
        (vld2q_u16): Delete.
        (vld4q_s16): Delete.
        (vld4q_u16): Delete.
        (vst2q_s32): Delete.
        (vst2q_u32): Delete.
        (vld2q_s32): Delete.
        (vld2q_u32): Delete.
        (vld4q_s32): Delete.
        (vld4q_u32): Delete.
        (vld4q_f16): Delete.
        (vld2q_f16): Delete.
        (vst2q_f16): Delete.
        (vld4q_f32): Delete.
        (vld2q_f32): Delete.
        (vst2q_f32): Delete.
        (__arm_vst4q_s8): Delete.
        (__arm_vst4q_s16): Delete.
        (__arm_vst4q_s32): Delete.
        (__arm_vst4q_u8): Delete.
        (__arm_vst4q_u16): Delete.
        (__arm_vst4q_u32): Delete.
        (__arm_vst2q_s8): Delete.
        (__arm_vst2q_u8): Delete.
        (__arm_vld2q_s8): Delete.
        (__arm_vld2q_u8): Delete.
        (__arm_vld4q_s8): Delete.
        (__arm_vld4q_u8): Delete.
        (__arm_vst2q_s16): Delete.
        (__arm_vst2q_u16): Delete.
        (__arm_vld2q_s16): Delete.
        (__arm_vld2q_u16): Delete.
        (__arm_vld4q_s16): Delete.
        (__arm_vld4q_u16): Delete.
        (__arm_vst2q_s32): Delete.
        (__arm_vst2q_u32): Delete.
        (__arm_vld2q_s32): Delete.
        (__arm_vld2q_u32): Delete.
        (__arm_vld4q_s32): Delete.
        (__arm_vld4q_u32): Delete.
        (__arm_vst4q_f16): Delete.
        (__arm_vst4q_f32): Delete.
        (__arm_vld4q_f16): Delete.
        (__arm_vld2q_f16): Delete.
        (__arm_vst2q_f16): Delete.
        (__arm_vld4q_f32): Delete.
        (__arm_vld2q_f32): Delete.
        (__arm_vst2q_f32): Delete.
        (__arm_vst4q): Delete.
        (__arm_vst2q): Delete.
        (__arm_vld2q): Delete.
        (__arm_vld4q): Delete.
        * config/arm/arm_mve_builtins.def (vst4q, vst2q, vld4q, vld2q):
        Delete.
        * config/arm/iterators.md (VSTRUCT): Add V2x16QI, V2x8HI, V2x4SI,
        V2x8HF, V2x4SF, V4x16QI, V4x8HI, V4x4SI, V4x8HF, V4x4SF.
        (MVE_VLD2_VST2, MVE_vld2_vst2, MVE_VLD4_VST4, MVE_vld4_vst4): New.
        * config/arm/mve.md (mve_vst4q<mode>): Update into ...
        (@mve_vst4q<mode>): ... this.
        (mve_vst2q<mode>): Update into ...
        (@mve_vst2q<mode>): ... this.
        (mve_vld2q<mode>): Update into ...
        (@mve_vld2q<mode>): ... this.
        (mve_vld4q<mode>): Update into ...
        (@mve_vld4q<mode>): ... this.
        * config/arm/vec-common.md (vec_load_lanesoi<mode>): Remove MVE
        support.
        (vec_load_lanesxi<mode>): Likewise.
        (vec_store_lanesoi<mode>): Likewise.
        (vec_store_lanesxi<mode>): Likewise.
        (vec_load_lanes<MVE_vld2_vst2><mode>): New.
        (vec_store_lanes<MVE_vld2_vst2><mode>): New.
        (vec_load_lanes<MVE_vld4_vst4><mode>): New.
        (vec_store_lanes<MVE_vld4_vst4><mode>): New.
---
 gcc/config/arm/arm-mve-builtins-base.cc  |  71 +++
 gcc/config/arm/arm-mve-builtins-base.def |   8 +
 gcc/config/arm/arm-mve-builtins-base.h   |   4 +
 gcc/config/arm/arm-mve-builtins.cc       |   6 +-
 gcc/config/arm/arm.cc                    |  43 +-
 gcc/config/arm/arm.h                     |  13 +-
 gcc/config/arm/arm_mve.h                 | 628 -----------------------
 gcc/config/arm/arm_mve_builtins.def      |   4 -
 gcc/config/arm/iterators.md              |  36 +-
 gcc/config/arm/mve.md                    |  47 +-
 gcc/config/arm/vec-common.md             |  76 ++-
 11 files changed, 253 insertions(+), 683 deletions(-)

diff --git a/gcc/config/arm/arm-mve-builtins-base.cc 
b/gcc/config/arm/arm-mve-builtins-base.cc
index 737403527a9..723004b53d7 100644
--- a/gcc/config/arm/arm-mve-builtins-base.cc
+++ b/gcc/config/arm/arm-mve-builtins-base.cc
@@ -1100,6 +1100,73 @@ public:
   }
 };
 
+
+/* Implements vst2 and vst4.  */
+class vst24_impl : public full_width_access
+{
+public:
+  using full_width_access::full_width_access;
+
+  unsigned int
+  call_properties (const function_instance &) const override
+  {
+    return CP_WRITE_MEMORY;
+  }
+
+  rtx
+  expand (function_expander &e) const override
+  {
+    insn_code icode;
+    switch (vectors_per_tuple ())
+      {
+      case 2:
+       icode = code_for_mve_vst2q (e.vector_mode (0));
+       break;
+
+      case 4:
+       icode = code_for_mve_vst4q (e.vector_mode (0));
+       break;
+
+      default:
+       gcc_unreachable ();
+      }
+    return e.use_contiguous_store_insn (icode);
+  }
+};
+
+/* Implements vld2 and vld4.  */
+class vld24_impl : public full_width_access
+{
+public:
+  using full_width_access::full_width_access;
+
+  unsigned int
+  call_properties (const function_instance &) const override
+  {
+    return CP_READ_MEMORY;
+  }
+
+  rtx
+  expand (function_expander &e) const override
+  {
+    insn_code icode;
+    switch (vectors_per_tuple ())
+      {
+      case 2:
+       icode = code_for_mve_vld2q (e.vector_mode (0));
+       break;
+
+      case 4:
+       icode = code_for_mve_vld4q (e.vector_mode (0));
+       break;
+
+      default:
+       gcc_unreachable ();
+      }
+    return e.use_contiguous_load_insn (icode);
+  }
+};
+
 } /* end anonymous namespace */
 
 namespace arm_mve {
@@ -1326,6 +1393,8 @@ FUNCTION (vfmsq, unspec_mve_function_exact_insn, (-1, -1, 
VFMSQ_F, -1, -1, -1, -
 FUNCTION_WITH_M_N_NO_F (vhaddq, VHADDQ)
 FUNCTION_WITH_M_N_NO_F (vhsubq, VHSUBQ)
 FUNCTION (vld1q, vld1_impl,)
+FUNCTION (vld2q, vld24_impl, (2))
+FUNCTION (vld4q, vld24_impl, (4))
 FUNCTION (vldrbq, vldrq_impl, (TYPE_SUFFIX_s8, TYPE_SUFFIX_u8))
 FUNCTION (vldrbq_gather, vldrq_gather_impl, (false, TYPE_SUFFIX_s8, 
TYPE_SUFFIX_u8))
 FUNCTION (vldrdq_gather, vldrq_gather_impl, (false, TYPE_SUFFIX_s64, 
TYPE_SUFFIX_u64, NUM_TYPE_SUFFIXES))
@@ -1458,6 +1527,8 @@ FUNCTION_ONLY_N_NO_F (vshrq, VSHRQ)
 FUNCTION_ONLY_N_NO_F (vsliq, VSLIQ)
 FUNCTION_ONLY_N_NO_F (vsriq, VSRIQ)
 FUNCTION (vst1q, vst1_impl,)
+FUNCTION (vst2q, vst24_impl, (2))
+FUNCTION (vst4q, vst24_impl, (4))
 FUNCTION (vstrbq, vstrq_impl, (QImode, opt_scalar_mode ()))
 FUNCTION (vstrbq_scatter, vstrq_scatter_impl, (false, QImode, opt_scalar_mode 
()))
 FUNCTION (vstrdq_scatter, vstrq_scatter_impl, (false, DImode, opt_scalar_mode 
()))
diff --git a/gcc/config/arm/arm-mve-builtins-base.def 
b/gcc/config/arm/arm-mve-builtins-base.def
index 223d20436e0..73d70af1072 100644
--- a/gcc/config/arm/arm-mve-builtins-base.def
+++ b/gcc/config/arm/arm-mve-builtins-base.def
@@ -59,6 +59,8 @@ DEF_MVE_FUNCTION (vhsubq, binary_opt_n, all_integer, 
mx_or_none)
 DEF_MVE_FUNCTION (vidupq, viddup, all_unsigned, mx_or_none)
 DEF_MVE_FUNCTION (viwdupq, vidwdup, all_unsigned, mx_or_none)
 DEF_MVE_FUNCTION (vld1q, load, all_integer, z_or_none)
+DEF_MVE_FUNCTION (vld2q, load, all_integer, none)
+DEF_MVE_FUNCTION (vld4q, load, all_integer, none)
 DEF_MVE_FUNCTION (vldrbq, load_ext, all_integer, z_or_none)
 DEF_MVE_FUNCTION (vldrbq_gather, load_ext_gather_offset, all_integer, 
z_or_none)
 DEF_MVE_FUNCTION (vldrdq_gather, load_ext_gather_offset, integer_64, z_or_none)
@@ -179,6 +181,8 @@ DEF_MVE_FUNCTION (vshrq, binary_rshift, all_integer, 
mx_or_none)
 DEF_MVE_FUNCTION (vsliq, ternary_lshift, all_integer, m_or_none)
 DEF_MVE_FUNCTION (vsriq, ternary_rshift, all_integer, m_or_none)
 DEF_MVE_FUNCTION (vst1q, store, all_integer, p_or_none)
+DEF_MVE_FUNCTION (vst2q, store, all_integer, none)
+DEF_MVE_FUNCTION (vst4q, store, all_integer, none)
 DEF_MVE_FUNCTION (vstrbq, store, all_integer, p_or_none)
 DEF_MVE_FUNCTION (vstrbq_scatter, store_scatter_offset, all_integer, p_or_none)
 DEF_MVE_FUNCTION (vstrhq, store, integer_16_32, p_or_none)
@@ -234,6 +238,8 @@ DEF_MVE_FUNCTION (vfmaq, ternary_opt_n, all_float, 
m_or_none)
 DEF_MVE_FUNCTION (vfmasq, ternary_n, all_float, m_or_none)
 DEF_MVE_FUNCTION (vfmsq, ternary, all_float, m_or_none)
 DEF_MVE_FUNCTION (vld1q, load, all_float, z_or_none)
+DEF_MVE_FUNCTION (vld2q, load, all_float, none)
+DEF_MVE_FUNCTION (vld4q, load, all_float, none)
 DEF_MVE_FUNCTION (vldrhq, load_ext, float_16, z_or_none)
 DEF_MVE_FUNCTION (vldrhq_gather, load_ext_gather_offset, float_16, z_or_none)
 DEF_MVE_FUNCTION (vldrhq_gather_shifted, load_ext_gather_offset, float_16, 
z_or_none)
@@ -264,6 +270,8 @@ DEF_MVE_FUNCTION (vrndpq, unary, all_float, mx_or_none)
 DEF_MVE_FUNCTION (vrndq, unary, all_float, mx_or_none)
 DEF_MVE_FUNCTION (vrndxq, unary, all_float, mx_or_none)
 DEF_MVE_FUNCTION (vst1q, store, all_float, p_or_none)
+DEF_MVE_FUNCTION (vst2q, store, all_float, none)
+DEF_MVE_FUNCTION (vst4q, store, all_float, none)
 DEF_MVE_FUNCTION (vstrhq, store, float_16, p_or_none)
 DEF_MVE_FUNCTION (vstrhq_scatter, store_scatter_offset, float_16, p_or_none)
 DEF_MVE_FUNCTION (vstrhq_scatter_shifted, store_scatter_offset, float_16, 
p_or_none)
diff --git a/gcc/config/arm/arm-mve-builtins-base.h 
b/gcc/config/arm/arm-mve-builtins-base.h
index 3bc1e933bfc..362eef5940a 100644
--- a/gcc/config/arm/arm-mve-builtins-base.h
+++ b/gcc/config/arm/arm-mve-builtins-base.h
@@ -82,6 +82,8 @@ extern const function_base *const vhsubq;
 extern const function_base *const vidupq;
 extern const function_base *const viwdupq;
 extern const function_base *const vld1q;
+extern const function_base *const vld2q;
+extern const function_base *const vld4q;
 extern const function_base *const vldrbq;
 extern const function_base *const vldrbq_gather;
 extern const function_base *const vldrdq_gather;
@@ -214,6 +216,8 @@ extern const function_base *const vshrq;
 extern const function_base *const vsliq;
 extern const function_base *const vsriq;
 extern const function_base *const vst1q;
+extern const function_base *const vst2q;
+extern const function_base *const vst4q;
 extern const function_base *const vstrbq;
 extern const function_base *const vstrbq_scatter;
 extern const function_base *const vstrdq_scatter;
diff --git a/gcc/config/arm/arm-mve-builtins.cc 
b/gcc/config/arm/arm-mve-builtins.cc
index 0a7ffcfa546..8570e18fd96 100644
--- a/gcc/config/arm/arm-mve-builtins.cc
+++ b/gcc/config/arm/arm-mve-builtins.cc
@@ -535,11 +535,13 @@ register_builtin_tuple_types (vector_type_index type)
 
       tree vectype = acle_vector_types[0][type];
       tree arrtype = build_array_type_nelts (vectype, num_vectors);
-      gcc_assert (TYPE_MODE_RAW (arrtype) == TYPE_MODE (arrtype)
+      gcc_assert (VECTOR_MODE_P (TYPE_MODE (arrtype))
+                 && TYPE_MODE_RAW (arrtype) == TYPE_MODE (arrtype)
                  && TYPE_ALIGN (arrtype) == 64);
 
       tree tuple_type = wrap_type_in_struct (arrtype);
-      gcc_assert (TYPE_MODE_RAW (tuple_type) == TYPE_MODE (tuple_type)
+      gcc_assert (VECTOR_MODE_P (TYPE_MODE (tuple_type))
+                 && TYPE_MODE_RAW (tuple_type) == TYPE_MODE (tuple_type)
                  && TYPE_ALIGN (tuple_type) == 64);
 
       register_type_decl (tuple_type, buffer);
diff --git a/gcc/config/arm/arm.cc b/gcc/config/arm/arm.cc
index 4ee6fc9d670..777c737d1ff 100644
--- a/gcc/config/arm/arm.cc
+++ b/gcc/config/arm/arm.cc
@@ -278,6 +278,7 @@ static rtx_insn *arm_pic_static_addr (rtx orig, rtx reg);
 static bool cortex_a9_sched_adjust_cost (rtx_insn *, int, rtx_insn *, int *);
 static bool xscale_sched_adjust_cost (rtx_insn *, int, rtx_insn *, int *);
 static bool fa726te_sched_adjust_cost (rtx_insn *, int, rtx_insn *, int *);
+static opt_machine_mode arm_array_mode (machine_mode, unsigned HOST_WIDE_INT);
 static bool arm_array_mode_supported_p (machine_mode,
                                        unsigned HOST_WIDE_INT);
 static machine_mode arm_preferred_simd_mode (scalar_mode);
@@ -515,6 +516,8 @@ static const scoped_attribute_specs *const 
arm_attribute_table[] =
 #define TARGET_SHIFT_TRUNCATION_MASK arm_shift_truncation_mask
 #undef TARGET_VECTOR_MODE_SUPPORTED_P
 #define TARGET_VECTOR_MODE_SUPPORTED_P arm_vector_mode_supported_p
+#undef TARGET_ARRAY_MODE
+#define TARGET_ARRAY_MODE arm_array_mode
 #undef TARGET_ARRAY_MODE_SUPPORTED_P
 #define TARGET_ARRAY_MODE_SUPPORTED_P arm_array_mode_supported_p
 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
@@ -20774,7 +20777,9 @@ output_move_neon (rtx *operands)
              || NEON_REGNO_OK_FOR_QUAD (regno));
   gcc_assert (VALID_NEON_DREG_MODE (mode)
              || VALID_NEON_QREG_MODE (mode)
-             || VALID_NEON_STRUCT_MODE (mode));
+             || VALID_NEON_STRUCT_MODE (mode)
+             || (TARGET_HAVE_MVE
+                 && VALID_MVE_STRUCT_MODE (mode)));
   gcc_assert (MEM_P (mem));
 
   addr = XEXP (mem, 0);
@@ -24949,7 +24954,8 @@ arm_print_operand_address (FILE *stream, machine_mode 
mode, rtx x)
                         REGNO (XEXP (x, 0)),
                         GET_CODE (x) == PRE_DEC ? "-" : "",
                         GET_MODE_SIZE (mode));
-         else if (TARGET_HAVE_MVE && (mode == OImode || mode == XImode))
+         else if (TARGET_HAVE_MVE
+                  && VALID_MVE_STRUCT_MODE (mode))
            asm_fprintf (stream, "[%r]!", REGNO (XEXP (x,0)));
          else
            asm_fprintf (stream, "[%r], #%s%d", REGNO (XEXP (x, 0)),
@@ -25839,7 +25845,17 @@ arm_hard_regno_mode_ok (unsigned int regno, 
machine_mode mode)
      if (TARGET_HAVE_MVE)
        return ((VALID_MVE_MODE (mode) && NEON_REGNO_OK_FOR_QUAD (regno))
               || (mode == OImode && NEON_REGNO_OK_FOR_NREGS (regno, 4))
-              || (mode == XImode && NEON_REGNO_OK_FOR_NREGS (regno, 8)));
+              || (mode == V2x16QImode && NEON_REGNO_OK_FOR_NREGS (regno, 4))
+              || (mode == V2x8HImode && NEON_REGNO_OK_FOR_NREGS (regno, 4))
+              || (mode == V2x4SImode && NEON_REGNO_OK_FOR_NREGS (regno, 4))
+              || (mode == V2x8HFmode && NEON_REGNO_OK_FOR_NREGS (regno, 4))
+              || (mode == V2x4SFmode && NEON_REGNO_OK_FOR_NREGS (regno, 4))
+              || (mode == XImode && NEON_REGNO_OK_FOR_NREGS (regno, 8))
+              || (mode == V4x16QImode && NEON_REGNO_OK_FOR_NREGS (regno, 8))
+              || (mode == V4x8HImode && NEON_REGNO_OK_FOR_NREGS (regno, 8))
+              || (mode == V4x4SImode && NEON_REGNO_OK_FOR_NREGS (regno, 8))
+              || (mode == V4x8HFmode && NEON_REGNO_OK_FOR_NREGS (regno, 8))
+              || (mode == V4x4SFmode && NEON_REGNO_OK_FOR_NREGS (regno, 8)));
 
       return false;
     }
@@ -29785,6 +29801,27 @@ arm_vector_mode_supported_p (machine_mode mode)
   return false;
 }
 
+/* Implements target hook array_mode.  */
+static opt_machine_mode
+arm_array_mode (machine_mode mode, unsigned HOST_WIDE_INT nelems)
+{
+  if (TARGET_HAVE_MVE
+      /* MVE accepts only tuples of 2 or 4 vectors.  */
+      && (nelems == 2
+         || nelems == 4))
+    {
+      machine_mode struct_mode;
+      FOR_EACH_MODE_IN_CLASS (struct_mode, GET_MODE_CLASS (mode))
+       {
+         if (GET_MODE_INNER (struct_mode) == GET_MODE_INNER (mode)
+             && known_eq (GET_MODE_NUNITS (struct_mode),
+                          GET_MODE_NUNITS (mode) * nelems))
+           return struct_mode;
+       }
+    }
+  return opt_machine_mode ();
+}
+
 /* Implements target hook array_mode_supported_p.  */
 
 static bool
diff --git a/gcc/config/arm/arm.h b/gcc/config/arm/arm.h
index 13a90d854d2..b2044db938b 100644
--- a/gcc/config/arm/arm.h
+++ b/gcc/config/arm/arm.h
@@ -1127,8 +1127,17 @@ extern const int arm_arch_cde_coproc_bits[];
   ((MODE) == TImode || (MODE) == EImode || (MODE) == OImode \
    || (MODE) == CImode || (MODE) == XImode)
 
-#define VALID_MVE_STRUCT_MODE(MODE) \
-  ((MODE) == TImode || (MODE) == OImode || (MODE) == XImode)
+#define VALID_MVE_STRUCT_MODE(MODE)                        \
+  ((MODE) == V2x16QImode                                   \
+   || (MODE) == V2x8HImode                                 \
+   || (MODE) == V2x4SImode                                 \
+   || (MODE) == V2x8HFmode                                 \
+   || (MODE) == V2x4SFmode                                 \
+   || (MODE) == V4x16QImode                                \
+   || (MODE) == V4x8HImode                                 \
+   || (MODE) == V4x4SImode                                 \
+   || (MODE) == V4x8HFmode                                 \
+   || (MODE) == V4x4SFmode)
 
 /* The conditions under which vector modes are supported for general
    arithmetic using Neon.  */
diff --git a/gcc/config/arm/arm_mve.h b/gcc/config/arm/arm_mve.h
index 45b27ed9fb8..d2e382ee347 100644
--- a/gcc/config/arm/arm_mve.h
+++ b/gcc/config/arm/arm_mve.h
@@ -45,23 +45,11 @@
 #endif
 
 #ifndef __ARM_MVE_PRESERVE_USER_NAMESPACE
-#define vst4q(__addr, __value) __arm_vst4q(__addr, __value)
 #define vuninitializedq(__v) __arm_vuninitializedq(__v)
-#define vst2q(__addr, __value) __arm_vst2q(__addr, __value)
-#define vld2q(__addr) __arm_vld2q(__addr)
-#define vld4q(__addr) __arm_vld4q(__addr)
 #define vsetq_lane(__a, __b, __idx) __arm_vsetq_lane(__a, __b, __idx)
 #define vgetq_lane(__a, __idx) __arm_vgetq_lane(__a, __idx)
 
 
-#define vst4q_s8( __addr, __value) __arm_vst4q_s8( __addr, __value)
-#define vst4q_s16( __addr, __value) __arm_vst4q_s16( __addr, __value)
-#define vst4q_s32( __addr, __value) __arm_vst4q_s32( __addr, __value)
-#define vst4q_u8( __addr, __value) __arm_vst4q_u8( __addr, __value)
-#define vst4q_u16( __addr, __value) __arm_vst4q_u16( __addr, __value)
-#define vst4q_u32( __addr, __value) __arm_vst4q_u32( __addr, __value)
-#define vst4q_f16( __addr, __value) __arm_vst4q_f16( __addr, __value)
-#define vst4q_f32( __addr, __value) __arm_vst4q_f32( __addr, __value)
 #define vpnot(__a) __arm_vpnot(__a)
 #define vuninitializedq_u8(void) __arm_vuninitializedq_u8(void)
 #define vuninitializedq_u16(void) __arm_vuninitializedq_u16(void)
@@ -73,30 +61,6 @@
 #define vuninitializedq_s64(void) __arm_vuninitializedq_s64(void)
 #define vuninitializedq_f16(void) __arm_vuninitializedq_f16(void)
 #define vuninitializedq_f32(void) __arm_vuninitializedq_f32(void)
-#define vst2q_s8(__addr, __value) __arm_vst2q_s8(__addr, __value)
-#define vst2q_u8(__addr, __value) __arm_vst2q_u8(__addr, __value)
-#define vld2q_s8(__addr) __arm_vld2q_s8(__addr)
-#define vld2q_u8(__addr) __arm_vld2q_u8(__addr)
-#define vld4q_s8(__addr) __arm_vld4q_s8(__addr)
-#define vld4q_u8(__addr) __arm_vld4q_u8(__addr)
-#define vst2q_s16(__addr, __value) __arm_vst2q_s16(__addr, __value)
-#define vst2q_u16(__addr, __value) __arm_vst2q_u16(__addr, __value)
-#define vld2q_s16(__addr) __arm_vld2q_s16(__addr)
-#define vld2q_u16(__addr) __arm_vld2q_u16(__addr)
-#define vld4q_s16(__addr) __arm_vld4q_s16(__addr)
-#define vld4q_u16(__addr) __arm_vld4q_u16(__addr)
-#define vst2q_s32(__addr, __value) __arm_vst2q_s32(__addr, __value)
-#define vst2q_u32(__addr, __value) __arm_vst2q_u32(__addr, __value)
-#define vld2q_s32(__addr) __arm_vld2q_s32(__addr)
-#define vld2q_u32(__addr) __arm_vld2q_u32(__addr)
-#define vld4q_s32(__addr) __arm_vld4q_s32(__addr)
-#define vld4q_u32(__addr) __arm_vld4q_u32(__addr)
-#define vld4q_f16(__addr) __arm_vld4q_f16(__addr)
-#define vld2q_f16(__addr) __arm_vld2q_f16(__addr)
-#define vst2q_f16(__addr, __value) __arm_vst2q_f16(__addr, __value)
-#define vld4q_f32(__addr) __arm_vld4q_f32(__addr)
-#define vld2q_f32(__addr) __arm_vld2q_f32(__addr)
-#define vst2q_f32(__addr, __value) __arm_vst2q_f32(__addr, __value)
 #define vsetq_lane_f16(__a, __b,  __idx) __arm_vsetq_lane_f16(__a, __b,  __idx)
 #define vsetq_lane_f32(__a, __b,  __idx) __arm_vsetq_lane_f32(__a, __b,  __idx)
 #define vsetq_lane_s16(__a, __b,  __idx) __arm_vsetq_lane_s16(__a, __b,  __idx)
@@ -147,60 +111,6 @@
   __builtin_arm_lane_check (__ARM_NUM_LANES(__vec),     \
                            __ARM_LANEQ(__vec, __idx))
 
-__extension__ extern __inline void
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vst4q_s8 (int8_t * __addr, int8x16x4_t __value)
-{
-  union { int8x16x4_t __i; __builtin_neon_xi __o; } __rv;
-  __rv.__i = __value;
-  __builtin_mve_vst4qv16qi ((__builtin_neon_qi *) __addr, __rv.__o);
-}
-
-__extension__ extern __inline void
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vst4q_s16 (int16_t * __addr, int16x8x4_t __value)
-{
-  union { int16x8x4_t __i; __builtin_neon_xi __o; } __rv;
-  __rv.__i = __value;
-  __builtin_mve_vst4qv8hi ((__builtin_neon_hi *) __addr, __rv.__o);
-}
-
-__extension__ extern __inline void
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vst4q_s32 (int32_t * __addr, int32x4x4_t __value)
-{
-  union { int32x4x4_t __i; __builtin_neon_xi __o; } __rv;
-  __rv.__i = __value;
-  __builtin_mve_vst4qv4si ((__builtin_neon_si *) __addr, __rv.__o);
-}
-
-__extension__ extern __inline void
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vst4q_u8 (uint8_t * __addr, uint8x16x4_t __value)
-{
-  union { uint8x16x4_t __i; __builtin_neon_xi __o; } __rv;
-  __rv.__i = __value;
-  __builtin_mve_vst4qv16qi ((__builtin_neon_qi *) __addr, __rv.__o);
-}
-
-__extension__ extern __inline void
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vst4q_u16 (uint16_t * __addr, uint16x8x4_t __value)
-{
-  union { uint16x8x4_t __i; __builtin_neon_xi __o; } __rv;
-  __rv.__i = __value;
-  __builtin_mve_vst4qv8hi ((__builtin_neon_hi *) __addr, __rv.__o);
-}
-
-__extension__ extern __inline void
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vst4q_u32 (uint32_t * __addr, uint32x4x4_t __value)
-{
-  union { uint32x4x4_t __i; __builtin_neon_xi __o; } __rv;
-  __rv.__i = __value;
-  __builtin_mve_vst4qv4si ((__builtin_neon_si *) __addr, __rv.__o);
-}
-
 __extension__ extern __inline mve_pred16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vpnot (mve_pred16_t __a)
@@ -208,168 +118,6 @@ __arm_vpnot (mve_pred16_t __a)
   return __builtin_mve_vpnotv16bi (__a);
 }
 
-__extension__ extern __inline void
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vst2q_s8 (int8_t * __addr, int8x16x2_t __value)
-{
-  union { int8x16x2_t __i; __builtin_neon_oi __o; } __rv;
-  __rv.__i = __value;
-  __builtin_mve_vst2qv16qi ((__builtin_neon_qi *) __addr, __rv.__o);
-}
-
-__extension__ extern __inline void
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vst2q_u8 (uint8_t * __addr, uint8x16x2_t __value)
-{
-  union { uint8x16x2_t __i; __builtin_neon_oi __o; } __rv;
-  __rv.__i = __value;
-  __builtin_mve_vst2qv16qi ((__builtin_neon_qi *) __addr, __rv.__o);
-}
-
-__extension__ extern __inline int8x16x2_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vld2q_s8 (int8_t const * __addr)
-{
-  union { int8x16x2_t __i; __builtin_neon_oi __o; } __rv;
-  __rv.__o = __builtin_mve_vld2qv16qi ((__builtin_neon_qi *) __addr);
-  return __rv.__i;
-}
-
-__extension__ extern __inline uint8x16x2_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vld2q_u8 (uint8_t const * __addr)
-{
-  union { uint8x16x2_t __i; __builtin_neon_oi __o; } __rv;
-  __rv.__o = __builtin_mve_vld2qv16qi ((__builtin_neon_qi *) __addr);
-  return __rv.__i;
-}
-
-__extension__ extern __inline int8x16x4_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vld4q_s8 (int8_t const * __addr)
-{
-  union { int8x16x4_t __i; __builtin_neon_xi __o; } __rv;
-  __rv.__o = __builtin_mve_vld4qv16qi ((__builtin_neon_qi *) __addr);
-  return __rv.__i;
-}
-
-__extension__ extern __inline uint8x16x4_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vld4q_u8 (uint8_t const * __addr)
-{
-  union { uint8x16x4_t __i; __builtin_neon_xi __o; } __rv;
-  __rv.__o = __builtin_mve_vld4qv16qi ((__builtin_neon_qi *) __addr);
-  return __rv.__i;
-}
-
-__extension__ extern __inline void
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vst2q_s16 (int16_t * __addr, int16x8x2_t __value)
-{
-  union { int16x8x2_t __i; __builtin_neon_oi __o; } __rv;
-  __rv.__i = __value;
-  __builtin_mve_vst2qv8hi ((__builtin_neon_hi *) __addr, __rv.__o);
-}
-
-__extension__ extern __inline void
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vst2q_u16 (uint16_t * __addr, uint16x8x2_t __value)
-{
-  union { uint16x8x2_t __i; __builtin_neon_oi __o; } __rv;
-  __rv.__i = __value;
-  __builtin_mve_vst2qv8hi ((__builtin_neon_hi *) __addr, __rv.__o);
-}
-
-__extension__ extern __inline int16x8x2_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vld2q_s16 (int16_t const * __addr)
-{
-  union { int16x8x2_t __i; __builtin_neon_oi __o; } __rv;
-  __rv.__o = __builtin_mve_vld2qv8hi ((__builtin_neon_hi *) __addr);
-  return __rv.__i;
-}
-
-__extension__ extern __inline uint16x8x2_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vld2q_u16 (uint16_t const * __addr)
-{
-  union { uint16x8x2_t __i; __builtin_neon_oi __o; } __rv;
-  __rv.__o = __builtin_mve_vld2qv8hi ((__builtin_neon_hi *) __addr);
-  return __rv.__i;
-}
-
-__extension__ extern __inline int16x8x4_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vld4q_s16 (int16_t const * __addr)
-{
-  union { int16x8x4_t __i; __builtin_neon_xi __o; } __rv;
-  __rv.__o = __builtin_mve_vld4qv8hi ((__builtin_neon_hi *) __addr);
-  return __rv.__i;
-}
-
-__extension__ extern __inline uint16x8x4_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vld4q_u16 (uint16_t const * __addr)
-{
-  union { uint16x8x4_t __i; __builtin_neon_xi __o; } __rv;
-  __rv.__o = __builtin_mve_vld4qv8hi ((__builtin_neon_hi *) __addr);
-  return __rv.__i;
-}
-
-__extension__ extern __inline void
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vst2q_s32 (int32_t * __addr, int32x4x2_t __value)
-{
-  union { int32x4x2_t __i; __builtin_neon_oi __o; } __rv;
-  __rv.__i = __value;
-  __builtin_mve_vst2qv4si ((__builtin_neon_si *) __addr, __rv.__o);
-}
-
-__extension__ extern __inline void
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vst2q_u32 (uint32_t * __addr, uint32x4x2_t __value)
-{
-  union { uint32x4x2_t __i; __builtin_neon_oi __o; } __rv;
-  __rv.__i = __value;
-  __builtin_mve_vst2qv4si ((__builtin_neon_si *) __addr, __rv.__o);
-}
-
-__extension__ extern __inline int32x4x2_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vld2q_s32 (int32_t const * __addr)
-{
-  union { int32x4x2_t __i; __builtin_neon_oi __o; } __rv;
-  __rv.__o = __builtin_mve_vld2qv4si ((__builtin_neon_si *) __addr);
-  return __rv.__i;
-}
-
-__extension__ extern __inline uint32x4x2_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vld2q_u32 (uint32_t const * __addr)
-{
-  union { uint32x4x2_t __i; __builtin_neon_oi __o; } __rv;
-  __rv.__o = __builtin_mve_vld2qv4si ((__builtin_neon_si *) __addr);
-  return __rv.__i;
-}
-
-__extension__ extern __inline int32x4x4_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vld4q_s32 (int32_t const * __addr)
-{
-  union { int32x4x4_t __i; __builtin_neon_xi __o; } __rv;
-  __rv.__o = __builtin_mve_vld4qv4si ((__builtin_neon_si *) __addr);
-  return __rv.__i;
-}
-
-__extension__ extern __inline uint32x4x4_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vld4q_u32 (uint32_t const * __addr)
-{
-  union { uint32x4x4_t __i; __builtin_neon_xi __o; } __rv;
-  __rv.__o = __builtin_mve_vld4qv4si ((__builtin_neon_si *) __addr);
-  return __rv.__i;
-}
-
 __extension__ extern __inline int16x8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vsetq_lane_s16 (int16_t __a, int16x8_t __b, const int __idx)
@@ -620,78 +368,6 @@ __arm_srshr (int32_t value, const int shift)
 
 #if (__ARM_FEATURE_MVE & 2) /* MVE Floating point.  */
 
-__extension__ extern __inline void
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vst4q_f16 (float16_t * __addr, float16x8x4_t __value)
-{
-  union { float16x8x4_t __i; __builtin_neon_xi __o; } __rv;
-  __rv.__i = __value;
-  __builtin_mve_vst4qv8hf (__addr, __rv.__o);
-}
-
-__extension__ extern __inline void
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vst4q_f32 (float32_t * __addr, float32x4x4_t __value)
-{
-  union { float32x4x4_t __i; __builtin_neon_xi __o; } __rv;
-  __rv.__i = __value;
-  __builtin_mve_vst4qv4sf (__addr, __rv.__o);
-}
-
-__extension__ extern __inline float16x8x4_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vld4q_f16 (float16_t const * __addr)
-{
-  union { float16x8x4_t __i; __builtin_neon_xi __o; } __rv;
-  __rv.__o = __builtin_mve_vld4qv8hf (__addr);
-  return __rv.__i;
-}
-
-__extension__ extern __inline float16x8x2_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vld2q_f16 (float16_t const * __addr)
-{
-  union { float16x8x2_t __i; __builtin_neon_oi __o; } __rv;
-  __rv.__o = __builtin_mve_vld2qv8hf (__addr);
-  return __rv.__i;
-}
-
-__extension__ extern __inline void
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vst2q_f16 (float16_t * __addr, float16x8x2_t __value)
-{
-  union { float16x8x2_t __i; __builtin_neon_oi __o; } __rv;
-  __rv.__i = __value;
-  __builtin_mve_vst2qv8hf (__addr, __rv.__o);
-}
-
-__extension__ extern __inline float32x4x4_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vld4q_f32 (float32_t const * __addr)
-{
-  union { float32x4x4_t __i; __builtin_neon_xi __o; } __rv;
-  __rv.__o = __builtin_mve_vld4qv4sf (__addr);
-  return __rv.__i;
-}
-
-__extension__ extern __inline float32x4x2_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vld2q_f32 (float32_t const * __addr)
-{
-  union { float32x4x2_t __i; __builtin_neon_oi __o; } __rv;
-  __rv.__o = __builtin_mve_vld2qv4sf (__addr);
-  return __rv.__i;
-}
-
-__extension__ extern __inline void
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vst2q_f32 (float32_t * __addr, float32x4x2_t __value)
-{
-  union { float32x4x2_t __i; __builtin_neon_oi __o; } __rv;
-  __rv.__i = __value;
-  __builtin_mve_vst2qv4sf (__addr, __rv.__o);
-}
-
 __extension__ extern __inline float16x8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vsetq_lane_f16 (float16_t __a, float16x8_t __b, const int __idx)
@@ -728,173 +404,6 @@ __arm_vgetq_lane_f32 (float32x4_t __a, const int __idx)
 #endif
 
 #ifdef __cplusplus
-__extension__ extern __inline void
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vst4q (int8_t * __addr, int8x16x4_t __value)
-{
- __arm_vst4q_s8 (__addr, __value);
-}
-
-__extension__ extern __inline void
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vst4q (int16_t * __addr, int16x8x4_t __value)
-{
- __arm_vst4q_s16 (__addr, __value);
-}
-
-__extension__ extern __inline void
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vst4q (int32_t * __addr, int32x4x4_t __value)
-{
- __arm_vst4q_s32 (__addr, __value);
-}
-
-__extension__ extern __inline void
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vst4q (uint8_t * __addr, uint8x16x4_t __value)
-{
- __arm_vst4q_u8 (__addr, __value);
-}
-
-__extension__ extern __inline void
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vst4q (uint16_t * __addr, uint16x8x4_t __value)
-{
- __arm_vst4q_u16 (__addr, __value);
-}
-
-__extension__ extern __inline void
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vst4q (uint32_t * __addr, uint32x4x4_t __value)
-{
- __arm_vst4q_u32 (__addr, __value);
-}
-
-__extension__ extern __inline void
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vst2q (int8_t * __addr, int8x16x2_t __value)
-{
- __arm_vst2q_s8 (__addr, __value);
-}
-
-__extension__ extern __inline void
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vst2q (uint8_t * __addr, uint8x16x2_t __value)
-{
- __arm_vst2q_u8 (__addr, __value);
-}
-
-__extension__ extern __inline int8x16x2_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vld2q (int8_t const * __addr)
-{
- return __arm_vld2q_s8 (__addr);
-}
-
-__extension__ extern __inline uint8x16x2_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vld2q (uint8_t const * __addr)
-{
- return __arm_vld2q_u8 (__addr);
-}
-
-__extension__ extern __inline int8x16x4_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vld4q (int8_t const * __addr)
-{
- return __arm_vld4q_s8 (__addr);
-}
-
-__extension__ extern __inline uint8x16x4_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vld4q (uint8_t const * __addr)
-{
- return __arm_vld4q_u8 (__addr);
-}
-
-__extension__ extern __inline void
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vst2q (int16_t * __addr, int16x8x2_t __value)
-{
- __arm_vst2q_s16 (__addr, __value);
-}
-
-__extension__ extern __inline void
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vst2q (uint16_t * __addr, uint16x8x2_t __value)
-{
- __arm_vst2q_u16 (__addr, __value);
-}
-
-__extension__ extern __inline int16x8x2_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vld2q (int16_t const * __addr)
-{
- return __arm_vld2q_s16 (__addr);
-}
-
-__extension__ extern __inline uint16x8x2_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vld2q (uint16_t const * __addr)
-{
- return __arm_vld2q_u16 (__addr);
-}
-
-__extension__ extern __inline int16x8x4_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vld4q (int16_t const * __addr)
-{
- return __arm_vld4q_s16 (__addr);
-}
-
-__extension__ extern __inline uint16x8x4_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vld4q (uint16_t const * __addr)
-{
- return __arm_vld4q_u16 (__addr);
-}
-
-__extension__ extern __inline void
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vst2q (int32_t * __addr, int32x4x2_t __value)
-{
- __arm_vst2q_s32 (__addr, __value);
-}
-
-__extension__ extern __inline void
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vst2q (uint32_t * __addr, uint32x4x2_t __value)
-{
- __arm_vst2q_u32 (__addr, __value);
-}
-
-__extension__ extern __inline int32x4x2_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vld2q (int32_t const * __addr)
-{
- return __arm_vld2q_s32 (__addr);
-}
-
-__extension__ extern __inline uint32x4x2_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vld2q (uint32_t const * __addr)
-{
- return __arm_vld2q_u32 (__addr);
-}
-
-__extension__ extern __inline int32x4x4_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vld4q (int32_t const * __addr)
-{
- return __arm_vld4q_s32 (__addr);
-}
-
-__extension__ extern __inline uint32x4x4_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vld4q (uint32_t const * __addr)
-{
- return __arm_vld4q_u32 (__addr);
-}
 
 __extension__ extern __inline int16x8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
@@ -1010,62 +519,6 @@ __arm_vgetq_lane (uint64x2_t __a, const int __idx)
 
 #if (__ARM_FEATURE_MVE & 2)  /* MVE Floating point.  */
 
-__extension__ extern __inline void
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vst4q (float16_t * __addr, float16x8x4_t __value)
-{
- __arm_vst4q_f16 (__addr, __value);
-}
-
-__extension__ extern __inline void
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vst4q (float32_t * __addr, float32x4x4_t __value)
-{
- __arm_vst4q_f32 (__addr, __value);
-}
-
-__extension__ extern __inline float16x8x4_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vld4q (float16_t const * __addr)
-{
- return __arm_vld4q_f16 (__addr);
-}
-
-__extension__ extern __inline float16x8x2_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vld2q (float16_t const * __addr)
-{
- return __arm_vld2q_f16 (__addr);
-}
-
-__extension__ extern __inline void
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vst2q (float16_t * __addr, float16x8x2_t __value)
-{
- __arm_vst2q_f16 (__addr, __value);
-}
-
-__extension__ extern __inline float32x4x4_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vld4q (float32_t const * __addr)
-{
- return __arm_vld4q_f32 (__addr);
-}
-
-__extension__ extern __inline float32x4x2_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vld2q (float32_t const * __addr)
-{
- return __arm_vld2q_f32 (__addr);
-}
-
-__extension__ extern __inline void
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vst2q (float32_t * __addr, float32x4x2_t __value)
-{
- __arm_vst2q_f32 (__addr, __value);
-}
-
 __extension__ extern __inline float16x8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vsetq_lane (float16_t __a, float16x8_t __b, const int __idx)
@@ -1405,51 +858,6 @@ extern void *__ARM_undef;
 
 #if (__ARM_FEATURE_MVE & 2) /* MVE Floating point.  */
 
-#define __arm_vst4q(p0,p1) ({ __typeof(p0) __p0 = (p0); \
-  __typeof(p1) __p1 = (p1); \
-  _Generic( (int (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)])0, \
-  int (*)[__ARM_mve_type_int8_t_ptr][__ARM_mve_type_int8x16x4_t]: __arm_vst4q_s8 (__ARM_mve_coerce_s8_ptr(__p0, int8_t *), __ARM_mve_coerce(__p1, int8x16x4_t)), \
-  int (*)[__ARM_mve_type_int16_t_ptr][__ARM_mve_type_int16x8x4_t]: __arm_vst4q_s16 (__ARM_mve_coerce_s16_ptr(__p0, int16_t *), __ARM_mve_coerce(__p1, int16x8x4_t)), \
-  int (*)[__ARM_mve_type_int32_t_ptr][__ARM_mve_type_int32x4x4_t]: __arm_vst4q_s32 (__ARM_mve_coerce_s32_ptr(__p0, int32_t *), __ARM_mve_coerce(__p1, int32x4x4_t)), \
-  int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint8x16x4_t]: __arm_vst4q_u8 (__ARM_mve_coerce_u8_ptr(__p0, uint8_t *), __ARM_mve_coerce(__p1, uint8x16x4_t)), \
-  int (*)[__ARM_mve_type_uint16_t_ptr][__ARM_mve_type_uint16x8x4_t]: __arm_vst4q_u16 (__ARM_mve_coerce_u16_ptr(__p0, uint16_t *), __ARM_mve_coerce(__p1, uint16x8x4_t)), \
-  int (*)[__ARM_mve_type_uint32_t_ptr][__ARM_mve_type_uint32x4x4_t]: __arm_vst4q_u32 (__ARM_mve_coerce_u32_ptr(__p0, uint32_t *), __ARM_mve_coerce(__p1, uint32x4x4_t)), \
-  int (*)[__ARM_mve_type_float16_t_ptr][__ARM_mve_type_float16x8x4_t]: __arm_vst4q_f16 (__ARM_mve_coerce_f16_ptr(__p0, float16_t *), __ARM_mve_coerce(__p1, float16x8x4_t)), \
-  int (*)[__ARM_mve_type_float32_t_ptr][__ARM_mve_type_float32x4x4_t]: __arm_vst4q_f32 (__ARM_mve_coerce_f32_ptr(__p0, float32_t *), __ARM_mve_coerce(__p1, float32x4x4_t)));})
-
-#define __arm_vld2q(p0) ( \
-  _Generic( (int (*)[__ARM_mve_typeid(p0)])0, \
-  int (*)[__ARM_mve_type_int8_t_ptr]: __arm_vld2q_s8 (__ARM_mve_coerce_s8_ptr(p0, int8_t *)), \
-  int (*)[__ARM_mve_type_int16_t_ptr]: __arm_vld2q_s16 (__ARM_mve_coerce_s16_ptr(p0, int16_t *)), \
-  int (*)[__ARM_mve_type_int32_t_ptr]: __arm_vld2q_s32 (__ARM_mve_coerce_s32_ptr(p0, int32_t *)), \
-  int (*)[__ARM_mve_type_uint8_t_ptr]: __arm_vld2q_u8 (__ARM_mve_coerce_u8_ptr(p0, uint8_t *)), \
-  int (*)[__ARM_mve_type_uint16_t_ptr]: __arm_vld2q_u16 (__ARM_mve_coerce_u16_ptr(p0, uint16_t *)), \
-  int (*)[__ARM_mve_type_uint32_t_ptr]: __arm_vld2q_u32 (__ARM_mve_coerce_u32_ptr(p0, uint32_t *)), \
-  int (*)[__ARM_mve_type_float16_t_ptr]: __arm_vld2q_f16 (__ARM_mve_coerce_f16_ptr(p0, float16_t *)), \
-  int (*)[__ARM_mve_type_float32_t_ptr]: __arm_vld2q_f32 (__ARM_mve_coerce_f32_ptr(p0, float32_t *))))
-
-#define __arm_vld4q(p0) ( \
-  _Generic( (int (*)[__ARM_mve_typeid(p0)])0, \
-  int (*)[__ARM_mve_type_int8_t_ptr]: __arm_vld4q_s8 (__ARM_mve_coerce_s8_ptr(p0, int8_t *)), \
-  int (*)[__ARM_mve_type_int16_t_ptr]: __arm_vld4q_s16 (__ARM_mve_coerce_s16_ptr(p0, int16_t *)), \
-  int (*)[__ARM_mve_type_int32_t_ptr]: __arm_vld4q_s32 (__ARM_mve_coerce_s32_ptr(p0, int32_t *)), \
-  int (*)[__ARM_mve_type_uint8_t_ptr]: __arm_vld4q_u8 (__ARM_mve_coerce_u8_ptr(p0, uint8_t *)), \
-  int (*)[__ARM_mve_type_uint16_t_ptr]: __arm_vld4q_u16 (__ARM_mve_coerce_u16_ptr(p0, uint16_t *)), \
-  int (*)[__ARM_mve_type_uint32_t_ptr]: __arm_vld4q_u32 (__ARM_mve_coerce_u32_ptr(p0, uint32_t *)), \
-  int (*)[__ARM_mve_type_float16_t_ptr]: __arm_vld4q_f16 (__ARM_mve_coerce_f16_ptr(p0, float16_t *)), \
-  int (*)[__ARM_mve_type_float32_t_ptr]: __arm_vld4q_f32 (__ARM_mve_coerce_f32_ptr(p0, float32_t *))))
-
-#define __arm_vst2q(p0,p1) ({ __typeof(p1) __p1 = (p1); \
-  _Generic( (int (*)[__ARM_mve_typeid(p0)][__ARM_mve_typeid(__p1)])0, \
-  int (*)[__ARM_mve_type_int8_t_ptr][__ARM_mve_type_int8x16x2_t]: __arm_vst2q_s8 (__ARM_mve_coerce_s8_ptr(p0, int8_t *), __ARM_mve_coerce(__p1, int8x16x2_t)), \
-  int (*)[__ARM_mve_type_int16_t_ptr][__ARM_mve_type_int16x8x2_t]: __arm_vst2q_s16 (__ARM_mve_coerce_s16_ptr(p0, int16_t *), __ARM_mve_coerce(__p1, int16x8x2_t)), \
-  int (*)[__ARM_mve_type_int32_t_ptr][__ARM_mve_type_int32x4x2_t]: __arm_vst2q_s32 (__ARM_mve_coerce_s32_ptr(p0, int32_t *), __ARM_mve_coerce(__p1, int32x4x2_t)), \
-  int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint8x16x2_t]: __arm_vst2q_u8 (__ARM_mve_coerce_u8_ptr(p0, uint8_t *), __ARM_mve_coerce(__p1, uint8x16x2_t)), \
-  int (*)[__ARM_mve_type_uint16_t_ptr][__ARM_mve_type_uint16x8x2_t]: __arm_vst2q_u16 (__ARM_mve_coerce_u16_ptr(p0, uint16_t *), __ARM_mve_coerce(__p1, uint16x8x2_t)), \
-  int (*)[__ARM_mve_type_uint32_t_ptr][__ARM_mve_type_uint32x4x2_t]: __arm_vst2q_u32 (__ARM_mve_coerce_u32_ptr(p0, uint32_t *), __ARM_mve_coerce(__p1, uint32x4x2_t)), \
-  int (*)[__ARM_mve_type_float16_t_ptr][__ARM_mve_type_float16x8x2_t]: __arm_vst2q_f16 (__ARM_mve_coerce_f16_ptr(p0, float16_t *), __ARM_mve_coerce(__p1, float16x8x2_t)), \
-  int (*)[__ARM_mve_type_float32_t_ptr][__ARM_mve_type_float32x4x2_t]: __arm_vst2q_f32 (__ARM_mve_coerce_f32_ptr(p0, float32_t *), __ARM_mve_coerce(__p1, float32x4x2_t)));})
-
 #define __arm_vuninitializedq(p0) ({ __typeof(p0) __p0 = (p0); \
   _Generic( (int (*)[__ARM_mve_typeid(__p0)])0, \
   int (*)[__ARM_mve_type_int8x16_t]: __arm_vuninitializedq_s8 (), \
@@ -1492,25 +900,6 @@ extern void *__ARM_undef;
 
 #else /* MVE Integer.  */
 
-#define __arm_vst4q(p0,p1) ({ __typeof(p1) __p1 = (p1); \
-  _Generic( (int (*)[__ARM_mve_typeid(p0)][__ARM_mve_typeid(__p1)])0, \
-  int (*)[__ARM_mve_type_int8_t_ptr][__ARM_mve_type_int8x16x4_t]: __arm_vst4q_s8 (__ARM_mve_coerce_s8_ptr(p0, int8_t *), __ARM_mve_coerce(__p1, int8x16x4_t)), \
-  int (*)[__ARM_mve_type_int16_t_ptr][__ARM_mve_type_int16x8x4_t]: __arm_vst4q_s16 (__ARM_mve_coerce_s16_ptr(p0, int16_t *), __ARM_mve_coerce(__p1, int16x8x4_t)), \
-  int (*)[__ARM_mve_type_int32_t_ptr][__ARM_mve_type_int32x4x4_t]: __arm_vst4q_s32 (__ARM_mve_coerce_s32_ptr(p0, int32_t *), __ARM_mve_coerce(__p1, int32x4x4_t)), \
-  int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint8x16x4_t]: __arm_vst4q_u8 (__ARM_mve_coerce_u8_ptr(p0, uint8_t *), __ARM_mve_coerce(__p1, uint8x16x4_t)), \
-  int (*)[__ARM_mve_type_uint16_t_ptr][__ARM_mve_type_uint16x8x4_t]: __arm_vst4q_u16 (__ARM_mve_coerce_u16_ptr(p0, uint16_t *), __ARM_mve_coerce(__p1, uint16x8x4_t)), \
-  int (*)[__ARM_mve_type_uint32_t_ptr][__ARM_mve_type_uint32x4x4_t]: __arm_vst4q_u32 (__ARM_mve_coerce_u32_ptr(p0, uint32_t *), __ARM_mve_coerce(__p1, uint32x4x4_t)));})
-
-#define __arm_vst2q(p0,p1) ({ __typeof(p1) __p1 = (p1); \
-  _Generic( (int (*)[__ARM_mve_typeid(p0)][__ARM_mve_typeid(__p1)])0, \
-  int (*)[__ARM_mve_type_int8_t_ptr][__ARM_mve_type_int8x16x2_t]: __arm_vst2q_s8 (__ARM_mve_coerce_s8_ptr(p0, int8_t *), __ARM_mve_coerce(__p1, int8x16x2_t)), \
-  int (*)[__ARM_mve_type_int16_t_ptr][__ARM_mve_type_int16x8x2_t]: __arm_vst2q_s16 (__ARM_mve_coerce_s16_ptr(p0, int16_t *), __ARM_mve_coerce(__p1, int16x8x2_t)), \
-  int (*)[__ARM_mve_type_int32_t_ptr][__ARM_mve_type_int32x4x2_t]: __arm_vst2q_s32 (__ARM_mve_coerce_s32_ptr(p0, int32_t *), __ARM_mve_coerce(__p1, int32x4x2_t)), \
-  int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint8x16x2_t]: __arm_vst2q_u8 (__ARM_mve_coerce_u8_ptr(p0, uint8_t *), __ARM_mve_coerce(__p1, uint8x16x2_t)), \
-  int (*)[__ARM_mve_type_uint16_t_ptr][__ARM_mve_type_uint16x8x2_t]: __arm_vst2q_u16 (__ARM_mve_coerce_u16_ptr(p0, uint16_t *), __ARM_mve_coerce(__p1, uint16x8x2_t)), \
-  int (*)[__ARM_mve_type_uint32_t_ptr][__ARM_mve_type_uint32x4x2_t]: __arm_vst2q_u32 (__ARM_mve_coerce_u32_ptr(p0, uint32_t *), __ARM_mve_coerce(__p1, uint32x4x2_t)));})
-
-
 #define __arm_vuninitializedq(p0) ({ __typeof(p0) __p0 = (p0); \
   _Generic( (int (*)[__ARM_mve_typeid(__p0)])0, \
   int (*)[__ARM_mve_type_int8x16_t]: __arm_vuninitializedq_s8 (), \
@@ -1522,23 +911,6 @@ extern void *__ARM_undef;
   int (*)[__ARM_mve_type_uint32x4_t]: __arm_vuninitializedq_u32 (), \
   int (*)[__ARM_mve_type_uint64x2_t]: __arm_vuninitializedq_u64 ());})
 
-#define __arm_vld2q(p0) ( _Generic( (int (*)[__ARM_mve_typeid(p0)])0, \
-  int (*)[__ARM_mve_type_int8_t_ptr]: __arm_vld2q_s8 (__ARM_mve_coerce_s8_ptr(p0, int8_t *)), \
-  int (*)[__ARM_mve_type_int16_t_ptr]: __arm_vld2q_s16 (__ARM_mve_coerce_s16_ptr(p0, int16_t *)), \
-  int (*)[__ARM_mve_type_int32_t_ptr]: __arm_vld2q_s32 (__ARM_mve_coerce_s32_ptr(p0, int32_t *)), \
-  int (*)[__ARM_mve_type_uint8_t_ptr]: __arm_vld2q_u8 (__ARM_mve_coerce_u8_ptr(p0, uint8_t *)), \
-  int (*)[__ARM_mve_type_uint16_t_ptr]: __arm_vld2q_u16 (__ARM_mve_coerce_u16_ptr(p0, uint16_t *)), \
-  int (*)[__ARM_mve_type_uint32_t_ptr]: __arm_vld2q_u32 (__ARM_mve_coerce_u32_ptr(p0, uint32_t *))))
-
-
-#define __arm_vld4q(p0) ( _Generic( (int (*)[__ARM_mve_typeid(p0)])0, \
-  int (*)[__ARM_mve_type_int8_t_ptr]: __arm_vld4q_s8 (__ARM_mve_coerce_s8_ptr(p0, int8_t *)), \
-  int (*)[__ARM_mve_type_int16_t_ptr]: __arm_vld4q_s16 (__ARM_mve_coerce_s16_ptr(p0, int16_t *)), \
-  int (*)[__ARM_mve_type_int32_t_ptr]: __arm_vld4q_s32 (__ARM_mve_coerce_s32_ptr(p0, int32_t *)), \
-  int (*)[__ARM_mve_type_uint8_t_ptr]: __arm_vld4q_u8 (__ARM_mve_coerce_u8_ptr(p0, uint8_t *)), \
-  int (*)[__ARM_mve_type_uint16_t_ptr]: __arm_vld4q_u16 (__ARM_mve_coerce_u16_ptr(p0, uint16_t *)), \
-  int (*)[__ARM_mve_type_uint32_t_ptr]: __arm_vld4q_u32 (__ARM_mve_coerce_u32_ptr(p0, uint32_t *))))
-
 #define __arm_vgetq_lane(p0,p1) ({ __typeof(p0) __p0 = (p0); \
   _Generic( (int (*)[__ARM_mve_typeid(__p0)])0, \
   int (*)[__ARM_mve_type_int8x16_t]: __arm_vgetq_lane_s8 (__ARM_mve_coerce(__p0, int8x16_t), p1), \
diff --git a/gcc/config/arm/arm_mve_builtins.def b/gcc/config/arm/arm_mve_builtins.def
index b85b334a81e..90d8f90b98f 100644
--- a/gcc/config/arm/arm_mve_builtins.def
+++ b/gcc/config/arm/arm_mve_builtins.def
@@ -18,7 +18,6 @@
     along with GCC; see the file COPYING3.  If not see
     <http://www.gnu.org/licenses/>.  */
 
-VAR5 (STORE1, vst4q, v16qi, v8hi, v4si, v8hf, v4sf)
 VAR2 (UNOP_NONE_NONE, vrndxq_f, v8hf, v4sf)
 VAR2 (UNOP_NONE_NONE, vrndq_f, v8hf, v4sf)
 VAR2 (UNOP_NONE_NONE, vrndpq_f, v8hf, v4sf)
@@ -679,9 +678,6 @@ VAR1 (QUADOP_NONE_NONE_NONE_NONE_PRED, vsbciq_m_s, v4si)
 VAR1 (QUADOP_UNONE_UNONE_UNONE_UNONE_PRED, vsbciq_m_u, v4si)
 VAR1 (QUADOP_NONE_NONE_NONE_NONE_PRED, vsbcq_m_s, v4si)
 VAR1 (QUADOP_UNONE_UNONE_UNONE_UNONE_PRED, vsbcq_m_u, v4si)
-VAR5 (STORE1, vst2q, v16qi, v8hi, v4si, v8hf, v4sf)
-VAR5 (LOAD1, vld4q, v16qi, v8hi, v4si, v8hf, v4sf)
-VAR5 (LOAD1, vld2q, v16qi, v8hi, v4si, v8hf, v4sf)
 VAR1 (ASRL, sqrshr_,si)
 VAR1 (ASRL, sqrshrl_sat64_,di)
 VAR1 (ASRL, sqrshrl_sat48_,di)
diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md
index 1caf5d18ad6..cfe712ceda9 100644
--- a/gcc/config/arm/iterators.md
+++ b/gcc/config/arm/iterators.md
@@ -139,7 +139,18 @@ (define_mode_iterator VQXMOV [V16QI V8HI V8HF V8BF V4SI V4SF V2DI TI])
 
 ;; Opaque structure types wider than TImode.
 (define_mode_iterator VSTRUCT [(EI "!TARGET_HAVE_MVE") OI
-                              (CI "!TARGET_HAVE_MVE") XI])
+                              (CI "!TARGET_HAVE_MVE") XI
+                              (V2x16QI "TARGET_HAVE_MVE")
+                              (V2x8HI "TARGET_HAVE_MVE")
+                              (V2x4SI "TARGET_HAVE_MVE")
+                              (V2x8HF "TARGET_HAVE_MVE_FLOAT")
+                              (V2x4SF "TARGET_HAVE_MVE_FLOAT")
+                              (V4x16QI "TARGET_HAVE_MVE")
+                              (V4x8HI "TARGET_HAVE_MVE")
+                              (V4x4SI "TARGET_HAVE_MVE")
+                              (V4x8HF "TARGET_HAVE_MVE_FLOAT")
+                              (V4x4SF "TARGET_HAVE_MVE_FLOAT")
+                              ])
 
 ;; Opaque structure types used in table lookups (except vtbl1/vtbx1).
 (define_mode_iterator VTAB [TI EI OI])
@@ -286,6 +297,29 @@ (define_mode_iterator MVE_7_HI [HI V16BI V8BI V4BI V2QI])
 (define_mode_iterator MVE_V8HF [V8HF])
 (define_mode_iterator MVE_V16QI [V16QI])
 
+(define_mode_attr MVE_VLD2_VST2 [(V16QI "V2x16QI")
+                                (V8HI "V2x8HI")
+                                (V4SI "V2x4SI")
+                                (V8HF "V2x8HF")
+                                (V4SF "V2x4SF")])
+(define_mode_attr MVE_vld2_vst2 [(V16QI "v2x16qi")
+                                (V8HI "v2x8hi")
+                                (V4SI "v2x4si")
+                                (V8HF "v2x8hf")
+                                (V4SF "v2x4sf")])
+
+(define_mode_attr MVE_VLD4_VST4 [(V16QI "V4x16QI")
+                                (V8HI "V4x8HI")
+                                (V4SI "V4x4SI")
+                                (V8HF "V4x8HF")
+                                (V4SF "V4x4SF")])
+
+(define_mode_attr MVE_vld4_vst4 [(V16QI "v4x16qi")
+                                (V8HI "v4x8hi")
+                                (V4SI "v4x4si")
+                                (V8HF "v4x8hf")
+                                (V4SF "v4x4sf")])
+
 ;; Types for MVE truncating stores and widening loads
 (define_mode_iterator MVE_w_narrow_TYPE [V8QI V4QI V4HI])
 (define_mode_attr MVE_w_narrow_type [(V8QI "v8qi") (V4QI "v4qi") (V4HI "v4hi")])
diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md
index 70f6ec6c2cc..325dad87833 100644
--- a/gcc/config/arm/mve.md
+++ b/gcc/config/arm/mve.md
@@ -110,13 +110,14 @@ (define_insn "@mve_vdupq_n<mode>"
 ;;
 ;; [vst4q])
 ;;
-(define_insn "mve_vst4q<mode>"
-  [(set (match_operand:XI 0 "mve_struct_operand" "=Ug")
-       (unspec:XI [(match_operand:XI 1 "s_register_operand" "w")
-                   (unspec:MVE_VLD_ST [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+(define_insn "@mve_vst4q<mode>"
+  [(set (match_operand:<MVE_VLD4_VST4> 0 "mve_struct_operand" "=Ug")
+       (unspec:<MVE_VLD4_VST4>
+               [(match_operand:<MVE_VLD4_VST4> 1 "s_register_operand" "w")
+                (unspec:MVE_VLD_ST [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
         VST4Q))
   ]
-  "TARGET_HAVE_MVE"
+  "(TARGET_HAVE_MVE && VALID_MVE_STRUCT_MODE (<MVE_VLD4_VST4>mode))"
 {
    rtx ops[6];
    int regno = REGNO (operands[1]);
@@ -4061,14 +4062,14 @@ (define_insn "@mve_<mve_insn>q_m_<supf>v4si"
 ;;
 ;; [vst2q])
 ;;
-(define_insn "mve_vst2q<mode>"
-  [(set (match_operand:OI 0 "mve_struct_operand" "=Ug")
-       (unspec:OI [(match_operand:OI 1 "s_register_operand" "w")
-                   (unspec:MVE_VLD_ST [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+(define_insn "@mve_vst2q<mode>"
+  [(set (match_operand:<MVE_VLD2_VST2> 0 "mve_struct_operand" "=Ug")
+       (unspec:<MVE_VLD2_VST2>
+               [(match_operand:<MVE_VLD2_VST2> 1 "s_register_operand" "w")
+                (unspec:MVE_VLD_ST [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
         VST2Q))
   ]
-  "(TARGET_HAVE_MVE && VALID_MVE_SI_MODE (<MODE>mode))
-   || (TARGET_HAVE_MVE_FLOAT && VALID_MVE_SF_MODE (<MODE>mode))"
+  "(TARGET_HAVE_MVE && VALID_MVE_STRUCT_MODE (<MVE_VLD2_VST2>mode))"
 {
    rtx ops[4];
    int regno = REGNO (operands[1]);
@@ -4089,14 +4090,14 @@ (define_insn "mve_vst2q<mode>"
 ;;
 ;; [vld2q])
 ;;
-(define_insn "mve_vld2q<mode>"
-  [(set (match_operand:OI 0 "s_register_operand" "=w")
-       (unspec:OI [(match_operand:OI 1 "mve_struct_operand" "Ug")
-                   (unspec:MVE_VLD_ST [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+(define_insn "@mve_vld2q<mode>"
+  [(set (match_operand:<MVE_VLD2_VST2> 0 "s_register_operand" "=w")
+       (unspec:<MVE_VLD2_VST2>
+               [(match_operand:<MVE_VLD2_VST2> 1 "mve_struct_operand" "Ug")
+                (unspec:MVE_VLD_ST [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
         VLD2Q))
   ]
-  "(TARGET_HAVE_MVE && VALID_MVE_SI_MODE (<MODE>mode))
-   || (TARGET_HAVE_MVE_FLOAT && VALID_MVE_SF_MODE (<MODE>mode))"
+  "(TARGET_HAVE_MVE && VALID_MVE_STRUCT_MODE (<MVE_VLD2_VST2>mode))"
 {
    rtx ops[4];
    int regno = REGNO (operands[0]);
@@ -4117,14 +4118,14 @@ (define_insn "mve_vld2q<mode>"
 ;;
 ;; [vld4q])
 ;;
-(define_insn "mve_vld4q<mode>"
-  [(set (match_operand:XI 0 "s_register_operand" "=w")
-       (unspec:XI [(match_operand:XI 1 "mve_struct_operand" "Ug")
-                   (unspec:MVE_VLD_ST [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+(define_insn "@mve_vld4q<mode>"
+  [(set (match_operand:<MVE_VLD4_VST4> 0 "s_register_operand" "=w")
+       (unspec:<MVE_VLD4_VST4>
+               [(match_operand:<MVE_VLD4_VST4> 1 "mve_struct_operand" "Ug")
+                (unspec:MVE_VLD_ST [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
         VLD4Q))
   ]
-  "(TARGET_HAVE_MVE && VALID_MVE_SI_MODE (<MODE>mode))
-   || (TARGET_HAVE_MVE_FLOAT && VALID_MVE_SF_MODE (<MODE>mode))"
+  "(TARGET_HAVE_MVE && VALID_MVE_STRUCT_MODE (<MVE_VLD4_VST4>mode))"
 {
    rtx ops[6];
    int regno = REGNO (operands[0]);
diff --git a/gcc/config/arm/vec-common.md b/gcc/config/arm/vec-common.md
index ff1c27a0d71..03a5cf9e7e3 100644
--- a/gcc/config/arm/vec-common.md
+++ b/gcc/config/arm/vec-common.md
@@ -492,12 +492,21 @@ (define_expand "vec_load_lanesoi<mode>"
         (unspec:OI [(match_operand:OI 1 "neon_struct_operand")
                     (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
                   UNSPEC_VLD2))]
-  "TARGET_NEON || TARGET_HAVE_MVE"
+  "TARGET_NEON"
 {
-  if (TARGET_NEON)
-    emit_insn (gen_neon_vld2<mode> (operands[0], operands[1]));
-  else
-    emit_insn (gen_mve_vld2q<mode> (operands[0], operands[1]));
+  emit_insn (gen_neon_vld2<mode> (operands[0], operands[1]));
+  DONE;
+})
+
+;;; On MVE we use V2xYYY modes instead of OI
+(define_expand "vec_load_lanes<MVE_vld2_vst2><mode>"
+  [(set (match_operand:<MVE_VLD2_VST2> 0 "s_register_operand")
+        (unspec:<MVE_VLD2_VST2> [(match_operand:<MVE_VLD2_VST2> 1 "neon_struct_operand")
+                    (unspec:MVE_VLD_ST [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+                  UNSPEC_VLD2))]
+  "(TARGET_HAVE_MVE && VALID_MVE_STRUCT_MODE (<MVE_VLD2_VST2>mode))"
+{
+  emit_insn (gen_mve_vld2q<mode> (operands[0], operands[1]));
   DONE;
 })
 
@@ -506,12 +515,21 @@ (define_expand "vec_store_lanesoi<mode>"
        (unspec:OI [(match_operand:OI 1 "s_register_operand")
                     (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
                    UNSPEC_VST2))]
-  "TARGET_NEON || TARGET_HAVE_MVE"
+  "TARGET_NEON"
 {
-  if (TARGET_NEON)
-    emit_insn (gen_neon_vst2<mode> (operands[0], operands[1]));
-  else
-    emit_insn (gen_mve_vst2q<mode> (operands[0], operands[1]));
+  emit_insn (gen_neon_vst2<mode> (operands[0], operands[1]));
+  DONE;
+})
+
+;;; On MVE we use V2xYYY modes instead of OI
+(define_expand "vec_store_lanes<MVE_vld2_vst2><mode>"
+  [(set (match_operand:<MVE_VLD2_VST2> 0 "neon_struct_operand")
+       (unspec:<MVE_VLD2_VST2> [(match_operand:<MVE_VLD2_VST2> 1 "s_register_operand")
+                    (unspec:MVE_VLD_ST [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+                   UNSPEC_VST2))]
+  "(TARGET_HAVE_MVE && VALID_MVE_STRUCT_MODE (<MVE_VLD2_VST2>mode))"
+{
+  emit_insn (gen_mve_vst2q<mode> (operands[0], operands[1]));
   DONE;
 })
 
@@ -519,12 +537,21 @@ (define_expand "vec_load_lanesxi<mode>"
   [(match_operand:XI 0 "s_register_operand")
    (match_operand:XI 1 "neon_struct_operand")
    (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
-  "TARGET_NEON || TARGET_HAVE_MVE"
+  "TARGET_NEON"
 {
-  if (TARGET_NEON)
-    emit_insn (gen_neon_vld4<mode> (operands[0], operands[1]));
-  else
-    emit_insn (gen_mve_vld4q<mode> (operands[0], operands[1]));
+  emit_insn (gen_neon_vld4<mode> (operands[0], operands[1]));
+  DONE;
+})
+
+;;; On MVE we use V4xYYY modes instead of XI
+(define_expand "vec_load_lanes<MVE_vld4_vst4><mode>"
+  [(set (match_operand:<MVE_VLD4_VST4> 0 "s_register_operand")
+        (unspec:<MVE_VLD4_VST4> [(match_operand:<MVE_VLD4_VST4> 1 "neon_struct_operand")
+                    (unspec:MVE_VLD_ST [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+                  UNSPEC_VLD4))]
+  "(TARGET_HAVE_MVE && VALID_MVE_STRUCT_MODE (<MVE_VLD4_VST4>mode))"
+{
+  emit_insn (gen_mve_vld4q<mode> (operands[0], operands[1]));
   DONE;
 })
 
@@ -532,12 +559,21 @@ (define_expand "vec_store_lanesxi<mode>"
   [(match_operand:XI 0 "neon_struct_operand")
    (match_operand:XI 1 "s_register_operand")
    (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
-  "TARGET_NEON || TARGET_HAVE_MVE"
+  "TARGET_NEON"
 {
-  if (TARGET_NEON)
-    emit_insn (gen_neon_vst4<mode> (operands[0], operands[1]));
-  else
-    emit_insn (gen_mve_vst4q<mode> (operands[0], operands[1]));
+  emit_insn (gen_neon_vst4<mode> (operands[0], operands[1]));
+  DONE;
+})
+
+;;; On MVE we use V4xYYY modes instead of XI
+(define_expand "vec_store_lanes<MVE_vld4_vst4><mode>"
+  [(set (match_operand:<MVE_VLD4_VST4> 0 "neon_struct_operand")
+       (unspec:<MVE_VLD4_VST4> [(match_operand:<MVE_VLD4_VST4> 1 "s_register_operand")
+                    (unspec:MVE_VLD_ST [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+                   UNSPEC_VST4))]
+  "(TARGET_HAVE_MVE && VALID_MVE_STRUCT_MODE (<MVE_VLD4_VST4>mode))"
+{
+  emit_insn (gen_mve_vst4q<mode> (operands[0], operands[1]));
   DONE;
 })
 
-- 
2.34.1

Reply via email to