Implement vst2q, vst4q, vld2q and vld4q using the new MVE builtins framework.
Since MVE uses different tuple modes than Neon, we need to use VALID_MVE_STRUCT_MODE because VALID_NEON_STRUCT_MODE is no longer a superset of it, for instance in output_move_neon and arm_print_operand_address. In arm_hard_regno_mode_ok, the change is similar but a bit more intrusive. Expand the VSTRUCT iterator, so that mov<mode> and neon_mov<mode> patterns from neon.md still work for MVE. Besides the small updates to the patterns in mve.md, we have to update vec_load_lanes and vec_store_lanes in vec-common.md so that the vectorizer can handle the new modes. These patterns are now different from Neon's, so maybe we should move them back to neon.md and mve.md. The patch adds arm_array_mode, which is used by build_array_type_nelts and makes it possible to support the new assert in register_builtin_tuple_types. gcc/ChangeLog: * config/arm/arm-mve-builtins-base.cc (class vst24_impl): New. (class vld24_impl): New. (vld2q, vld4q, vst2q, vst4q): New. * config/arm/arm-mve-builtins-base.def (vld2q, vld4q, vst2q) (vst4q): New. * config/arm/arm-mve-builtins-base.h (vld2q, vld4q, vst2q, vst4q): New. * config/arm/arm-mve-builtins.cc (register_builtin_tuple_types): Add more asserts. * config/arm/arm.cc (TARGET_ARRAY_MODE): New. (output_move_neon): Handle MVE struct modes. (arm_print_operand_address): Likewise. (arm_hard_regno_mode_ok): Likewise. (arm_array_mode): New. * config/arm/arm.h (VALID_MVE_STRUCT_MODE): Update to list the new MVE tuple modes. * config/arm/arm_mve.h (vst4q): Delete. (vst2q): Delete. (vld2q): Delete. (vld4q): Delete. (vst4q_s8): Delete. (vst4q_s16): Delete. (vst4q_s32): Delete. (vst4q_u8): Delete. (vst4q_u16): Delete. (vst4q_u32): Delete. (vst4q_f16): Delete. (vst4q_f32): Delete. (vst2q_s8): Delete. (vst2q_u8): Delete. (vld2q_s8): Delete. (vld2q_u8): Delete. (vld4q_s8): Delete. (vld4q_u8): Delete. (vst2q_s16): Delete. (vst2q_u16): Delete. (vld2q_s16): Delete. (vld2q_u16): Delete. (vld4q_s16): Delete. (vld4q_u16): Delete. (vst2q_s32): Delete. (vst2q_u32): Delete. (vld2q_s32): Delete. 
(vld2q_u32): Delete. (vld4q_s32): Delete. (vld4q_u32): Delete. (vld4q_f16): Delete. (vld2q_f16): Delete. (vst2q_f16): Delete. (vld4q_f32): Delete. (vld2q_f32): Delete. (vst2q_f32): Delete. (__arm_vst4q_s8): Delete. (__arm_vst4q_s16): Delete. (__arm_vst4q_s32): Delete. (__arm_vst4q_u8): Delete. (__arm_vst4q_u16): Delete. (__arm_vst4q_u32): Delete. (__arm_vst2q_s8): Delete. (__arm_vst2q_u8): Delete. (__arm_vld2q_s8): Delete. (__arm_vld2q_u8): Delete. (__arm_vld4q_s8): Delete. (__arm_vld4q_u8): Delete. (__arm_vst2q_s16): Delete. (__arm_vst2q_u16): Delete. (__arm_vld2q_s16): Delete. (__arm_vld2q_u16): Delete. (__arm_vld4q_s16): Delete. (__arm_vld4q_u16): Delete. (__arm_vst2q_s32): Delete. (__arm_vst2q_u32): Delete. (__arm_vld2q_s32): Delete. (__arm_vld2q_u32): Delete. (__arm_vld4q_s32): Delete. (__arm_vld4q_u32): Delete. (__arm_vst4q_f16): Delete. (__arm_vst4q_f32): Delete. (__arm_vld4q_f16): Delete. (__arm_vld2q_f16): Delete. (__arm_vst2q_f16): Delete. (__arm_vld4q_f32): Delete. (__arm_vld2q_f32): Delete. (__arm_vst2q_f32): Delete. (__arm_vst4q): Delete. (__arm_vst2q): Delete. (__arm_vld2q): Delete. (__arm_vld4q): Delete. * config/arm/arm_mve_builtins.def (vst4q, vst2q, vld4q, vld2q): Delete. * config/arm/iterators.md (VSTRUCT): Add V2x16QI, V2x8HI, V2x4SI, V2x8HF, V2x4SF, V4x16QI, V4x8HI, V4x4SI, V4x8HF, V4x4SF. (MVE_VLD2_VST2, MVE_vld2_vst2, MVE_VLD4_VST4, MVE_vld4_vst4): New. * config/arm/mve.md (mve_vst4q<mode>): Update into ... (@mve_vst4q<mode>): ... this. (mve_vst2q<mode>): Update into ... (@mve_vst2q<mode>): ... this. (mve_vld2q<mode>): Update into ... (@mve_vld2q<mode>): ... this. (mve_vld4q<mode>): Update into ... (@mve_vld4q<mode>): ... this. * config/arm/vec-common.md (vec_load_lanesoi<mode>): Remove MVE support. (vec_load_lanesxi<mode>): Likewise. (vec_store_lanesoi<mode>): Likewise. (vec_store_lanesxi<mode>): Likewise. (vec_load_lanes<MVE_vld2_vst2><mode>): New. (vec_store_lanes<MVE_vld2_vst2><mode>): New. (vec_load_lanes<MVE_vld4_vst4><mode>): New. 
(vec_store_lanes<MVE_vld4_vst4><mode>): New. --- gcc/config/arm/arm-mve-builtins-base.cc | 71 +++ gcc/config/arm/arm-mve-builtins-base.def | 8 + gcc/config/arm/arm-mve-builtins-base.h | 4 + gcc/config/arm/arm-mve-builtins.cc | 6 +- gcc/config/arm/arm.cc | 43 +- gcc/config/arm/arm.h | 13 +- gcc/config/arm/arm_mve.h | 628 ----------------------- gcc/config/arm/arm_mve_builtins.def | 4 - gcc/config/arm/iterators.md | 36 +- gcc/config/arm/mve.md | 47 +- gcc/config/arm/vec-common.md | 76 ++- 11 files changed, 253 insertions(+), 683 deletions(-) diff --git a/gcc/config/arm/arm-mve-builtins-base.cc b/gcc/config/arm/arm-mve-builtins-base.cc index 737403527a9..723004b53d7 100644 --- a/gcc/config/arm/arm-mve-builtins-base.cc +++ b/gcc/config/arm/arm-mve-builtins-base.cc @@ -1100,6 +1100,73 @@ public: } }; + +/* Implements vst2 and vst4. */ +class vst24_impl : public full_width_access +{ +public: + using full_width_access::full_width_access; + + unsigned int + call_properties (const function_instance &) const override + { + return CP_WRITE_MEMORY; + } + + rtx + expand (function_expander &e) const override + { + insn_code icode; + switch (vectors_per_tuple ()) + { + case 2: + icode = code_for_mve_vst2q (e.vector_mode (0)); + break; + + case 4: + icode = code_for_mve_vst4q (e.vector_mode (0)); + break; + + default: + gcc_unreachable (); + } + return e.use_contiguous_store_insn (icode); + } +}; + +/* Implements vld2 and vld4. 
*/ +class vld24_impl : public full_width_access +{ +public: + using full_width_access::full_width_access; + + unsigned int + call_properties (const function_instance &) const override + { + return CP_READ_MEMORY; + } + + rtx + expand (function_expander &e) const override + { + insn_code icode; + switch (vectors_per_tuple ()) + { + case 2: + icode = code_for_mve_vld2q (e.vector_mode (0)); + break; + + case 4: + icode = code_for_mve_vld4q (e.vector_mode (0)); + break; + + default: + gcc_unreachable (); + } + return e.use_contiguous_load_insn (icode); + } +}; + } /* end anonymous namespace */ namespace arm_mve { @@ -1326,6 +1393,8 @@ FUNCTION (vfmsq, unspec_mve_function_exact_insn, (-1, -1, VFMSQ_F, -1, -1, -1, - FUNCTION_WITH_M_N_NO_F (vhaddq, VHADDQ) FUNCTION_WITH_M_N_NO_F (vhsubq, VHSUBQ) FUNCTION (vld1q, vld1_impl,) +FUNCTION (vld2q, vld24_impl, (2)) +FUNCTION (vld4q, vld24_impl, (4)) FUNCTION (vldrbq, vldrq_impl, (TYPE_SUFFIX_s8, TYPE_SUFFIX_u8)) FUNCTION (vldrbq_gather, vldrq_gather_impl, (false, TYPE_SUFFIX_s8, TYPE_SUFFIX_u8)) FUNCTION (vldrdq_gather, vldrq_gather_impl, (false, TYPE_SUFFIX_s64, TYPE_SUFFIX_u64, NUM_TYPE_SUFFIXES)) @@ -1458,6 +1527,8 @@ FUNCTION_ONLY_N_NO_F (vshrq, VSHRQ) FUNCTION_ONLY_N_NO_F (vsliq, VSLIQ) FUNCTION_ONLY_N_NO_F (vsriq, VSRIQ) FUNCTION (vst1q, vst1_impl,) +FUNCTION (vst2q, vst24_impl, (2)) +FUNCTION (vst4q, vst24_impl, (4)) FUNCTION (vstrbq, vstrq_impl, (QImode, opt_scalar_mode ())) FUNCTION (vstrbq_scatter, vstrq_scatter_impl, (false, QImode, opt_scalar_mode ())) FUNCTION (vstrdq_scatter, vstrq_scatter_impl, (false, DImode, opt_scalar_mode ())) diff --git a/gcc/config/arm/arm-mve-builtins-base.def b/gcc/config/arm/arm-mve-builtins-base.def index 223d20436e0..73d70af1072 100644 --- a/gcc/config/arm/arm-mve-builtins-base.def +++ b/gcc/config/arm/arm-mve-builtins-base.def @@ -59,6 +59,8 @@ DEF_MVE_FUNCTION (vhsubq, binary_opt_n, all_integer, mx_or_none) DEF_MVE_FUNCTION (vidupq, viddup, all_unsigned, mx_or_none) DEF_MVE_FUNCTION 
(viwdupq, vidwdup, all_unsigned, mx_or_none) DEF_MVE_FUNCTION (vld1q, load, all_integer, z_or_none) +DEF_MVE_FUNCTION (vld2q, load, all_integer, none) +DEF_MVE_FUNCTION (vld4q, load, all_integer, none) DEF_MVE_FUNCTION (vldrbq, load_ext, all_integer, z_or_none) DEF_MVE_FUNCTION (vldrbq_gather, load_ext_gather_offset, all_integer, z_or_none) DEF_MVE_FUNCTION (vldrdq_gather, load_ext_gather_offset, integer_64, z_or_none) @@ -179,6 +181,8 @@ DEF_MVE_FUNCTION (vshrq, binary_rshift, all_integer, mx_or_none) DEF_MVE_FUNCTION (vsliq, ternary_lshift, all_integer, m_or_none) DEF_MVE_FUNCTION (vsriq, ternary_rshift, all_integer, m_or_none) DEF_MVE_FUNCTION (vst1q, store, all_integer, p_or_none) +DEF_MVE_FUNCTION (vst2q, store, all_integer, none) +DEF_MVE_FUNCTION (vst4q, store, all_integer, none) DEF_MVE_FUNCTION (vstrbq, store, all_integer, p_or_none) DEF_MVE_FUNCTION (vstrbq_scatter, store_scatter_offset, all_integer, p_or_none) DEF_MVE_FUNCTION (vstrhq, store, integer_16_32, p_or_none) @@ -234,6 +238,8 @@ DEF_MVE_FUNCTION (vfmaq, ternary_opt_n, all_float, m_or_none) DEF_MVE_FUNCTION (vfmasq, ternary_n, all_float, m_or_none) DEF_MVE_FUNCTION (vfmsq, ternary, all_float, m_or_none) DEF_MVE_FUNCTION (vld1q, load, all_float, z_or_none) +DEF_MVE_FUNCTION (vld2q, load, all_float, none) +DEF_MVE_FUNCTION (vld4q, load, all_float, none) DEF_MVE_FUNCTION (vldrhq, load_ext, float_16, z_or_none) DEF_MVE_FUNCTION (vldrhq_gather, load_ext_gather_offset, float_16, z_or_none) DEF_MVE_FUNCTION (vldrhq_gather_shifted, load_ext_gather_offset, float_16, z_or_none) @@ -264,6 +270,8 @@ DEF_MVE_FUNCTION (vrndpq, unary, all_float, mx_or_none) DEF_MVE_FUNCTION (vrndq, unary, all_float, mx_or_none) DEF_MVE_FUNCTION (vrndxq, unary, all_float, mx_or_none) DEF_MVE_FUNCTION (vst1q, store, all_float, p_or_none) +DEF_MVE_FUNCTION (vst2q, store, all_float, none) +DEF_MVE_FUNCTION (vst4q, store, all_float, none) DEF_MVE_FUNCTION (vstrhq, store, float_16, p_or_none) DEF_MVE_FUNCTION (vstrhq_scatter, 
store_scatter_offset, float_16, p_or_none) DEF_MVE_FUNCTION (vstrhq_scatter_shifted, store_scatter_offset, float_16, p_or_none) diff --git a/gcc/config/arm/arm-mve-builtins-base.h b/gcc/config/arm/arm-mve-builtins-base.h index 3bc1e933bfc..362eef5940a 100644 --- a/gcc/config/arm/arm-mve-builtins-base.h +++ b/gcc/config/arm/arm-mve-builtins-base.h @@ -82,6 +82,8 @@ extern const function_base *const vhsubq; extern const function_base *const vidupq; extern const function_base *const viwdupq; extern const function_base *const vld1q; +extern const function_base *const vld2q; +extern const function_base *const vld4q; extern const function_base *const vldrbq; extern const function_base *const vldrbq_gather; extern const function_base *const vldrdq_gather; @@ -214,6 +216,8 @@ extern const function_base *const vshrq; extern const function_base *const vsliq; extern const function_base *const vsriq; extern const function_base *const vst1q; +extern const function_base *const vst2q; +extern const function_base *const vst4q; extern const function_base *const vstrbq; extern const function_base *const vstrbq_scatter; extern const function_base *const vstrdq_scatter; diff --git a/gcc/config/arm/arm-mve-builtins.cc b/gcc/config/arm/arm-mve-builtins.cc index 0a7ffcfa546..8570e18fd96 100644 --- a/gcc/config/arm/arm-mve-builtins.cc +++ b/gcc/config/arm/arm-mve-builtins.cc @@ -535,11 +535,13 @@ register_builtin_tuple_types (vector_type_index type) tree vectype = acle_vector_types[0][type]; tree arrtype = build_array_type_nelts (vectype, num_vectors); - gcc_assert (TYPE_MODE_RAW (arrtype) == TYPE_MODE (arrtype) + gcc_assert (VECTOR_MODE_P (TYPE_MODE (arrtype)) + && TYPE_MODE_RAW (arrtype) == TYPE_MODE (arrtype) && TYPE_ALIGN (arrtype) == 64); tree tuple_type = wrap_type_in_struct (arrtype); - gcc_assert (TYPE_MODE_RAW (tuple_type) == TYPE_MODE (tuple_type) + gcc_assert (VECTOR_MODE_P (TYPE_MODE (tuple_type)) + && TYPE_MODE_RAW (tuple_type) == TYPE_MODE (tuple_type) && TYPE_ALIGN 
(tuple_type) == 64); register_type_decl (tuple_type, buffer); diff --git a/gcc/config/arm/arm.cc b/gcc/config/arm/arm.cc index 4ee6fc9d670..777c737d1ff 100644 --- a/gcc/config/arm/arm.cc +++ b/gcc/config/arm/arm.cc @@ -278,6 +278,7 @@ static rtx_insn *arm_pic_static_addr (rtx orig, rtx reg); static bool cortex_a9_sched_adjust_cost (rtx_insn *, int, rtx_insn *, int *); static bool xscale_sched_adjust_cost (rtx_insn *, int, rtx_insn *, int *); static bool fa726te_sched_adjust_cost (rtx_insn *, int, rtx_insn *, int *); +static opt_machine_mode arm_array_mode (machine_mode, unsigned HOST_WIDE_INT); static bool arm_array_mode_supported_p (machine_mode, unsigned HOST_WIDE_INT); static machine_mode arm_preferred_simd_mode (scalar_mode); @@ -515,6 +516,8 @@ static const scoped_attribute_specs *const arm_attribute_table[] = #define TARGET_SHIFT_TRUNCATION_MASK arm_shift_truncation_mask #undef TARGET_VECTOR_MODE_SUPPORTED_P #define TARGET_VECTOR_MODE_SUPPORTED_P arm_vector_mode_supported_p +#undef TARGET_ARRAY_MODE +#define TARGET_ARRAY_MODE arm_array_mode #undef TARGET_ARRAY_MODE_SUPPORTED_P #define TARGET_ARRAY_MODE_SUPPORTED_P arm_array_mode_supported_p #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE @@ -20774,7 +20777,9 @@ output_move_neon (rtx *operands) || NEON_REGNO_OK_FOR_QUAD (regno)); gcc_assert (VALID_NEON_DREG_MODE (mode) || VALID_NEON_QREG_MODE (mode) - || VALID_NEON_STRUCT_MODE (mode)); + || VALID_NEON_STRUCT_MODE (mode) + || (TARGET_HAVE_MVE + && VALID_MVE_STRUCT_MODE (mode))); gcc_assert (MEM_P (mem)); addr = XEXP (mem, 0); @@ -24949,7 +24954,8 @@ arm_print_operand_address (FILE *stream, machine_mode mode, rtx x) REGNO (XEXP (x, 0)), GET_CODE (x) == PRE_DEC ? 
"-" : "", GET_MODE_SIZE (mode)); - else if (TARGET_HAVE_MVE && (mode == OImode || mode == XImode)) + else if (TARGET_HAVE_MVE + && VALID_MVE_STRUCT_MODE (mode)) asm_fprintf (stream, "[%r]!", REGNO (XEXP (x,0))); else asm_fprintf (stream, "[%r], #%s%d", REGNO (XEXP (x, 0)), @@ -25839,7 +25845,17 @@ arm_hard_regno_mode_ok (unsigned int regno, machine_mode mode) if (TARGET_HAVE_MVE) return ((VALID_MVE_MODE (mode) && NEON_REGNO_OK_FOR_QUAD (regno)) || (mode == OImode && NEON_REGNO_OK_FOR_NREGS (regno, 4)) - || (mode == XImode && NEON_REGNO_OK_FOR_NREGS (regno, 8))); + || (mode == V2x16QImode && NEON_REGNO_OK_FOR_NREGS (regno, 4)) + || (mode == V2x8HImode && NEON_REGNO_OK_FOR_NREGS (regno, 4)) + || (mode == V2x4SImode && NEON_REGNO_OK_FOR_NREGS (regno, 4)) + || (mode == V2x8HFmode && NEON_REGNO_OK_FOR_NREGS (regno, 4)) + || (mode == V2x4SFmode && NEON_REGNO_OK_FOR_NREGS (regno, 4)) + || (mode == XImode && NEON_REGNO_OK_FOR_NREGS (regno, 8)) + || (mode == V4x16QImode && NEON_REGNO_OK_FOR_NREGS (regno, 8)) + || (mode == V4x8HImode && NEON_REGNO_OK_FOR_NREGS (regno, 8)) + || (mode == V4x4SImode && NEON_REGNO_OK_FOR_NREGS (regno, 8)) + || (mode == V4x8HFmode && NEON_REGNO_OK_FOR_NREGS (regno, 8)) + || (mode == V4x4SFmode && NEON_REGNO_OK_FOR_NREGS (regno, 8))); return false; } @@ -29785,6 +29801,27 @@ arm_vector_mode_supported_p (machine_mode mode) return false; } +/* Implements target hook array_mode. */ +static opt_machine_mode +arm_array_mode (machine_mode mode, unsigned HOST_WIDE_INT nelems) +{ + if (TARGET_HAVE_MVE + /* MVE accepts only tuples of 2 or 4 vectors. */ + && (nelems == 2 + || nelems == 4)) + { + machine_mode struct_mode; + FOR_EACH_MODE_IN_CLASS (struct_mode, GET_MODE_CLASS (mode)) + { + if (GET_MODE_INNER (struct_mode) == GET_MODE_INNER (mode) + && known_eq (GET_MODE_NUNITS (struct_mode), + GET_MODE_NUNITS (mode) * nelems)) + return struct_mode; + } + } + return opt_machine_mode (); +} + /* Implements target hook array_mode_supported_p. 
*/ static bool diff --git a/gcc/config/arm/arm.h b/gcc/config/arm/arm.h index 13a90d854d2..b2044db938b 100644 --- a/gcc/config/arm/arm.h +++ b/gcc/config/arm/arm.h @@ -1127,8 +1127,17 @@ extern const int arm_arch_cde_coproc_bits[]; ((MODE) == TImode || (MODE) == EImode || (MODE) == OImode \ || (MODE) == CImode || (MODE) == XImode) -#define VALID_MVE_STRUCT_MODE(MODE) \ - ((MODE) == TImode || (MODE) == OImode || (MODE) == XImode) +#define VALID_MVE_STRUCT_MODE(MODE) \ + ((MODE) == V2x16QImode \ + || (MODE) == V2x8HImode \ + || (MODE) == V2x4SImode \ + || (MODE) == V2x8HFmode \ + || (MODE) == V2x4SFmode \ + || (MODE) == V4x16QImode \ + || (MODE) == V4x8HImode \ + || (MODE) == V4x4SImode \ + || (MODE) == V4x8HFmode \ + || (MODE) == V4x4SFmode) /* The conditions under which vector modes are supported for general arithmetic using Neon. */ diff --git a/gcc/config/arm/arm_mve.h b/gcc/config/arm/arm_mve.h index 45b27ed9fb8..d2e382ee347 100644 --- a/gcc/config/arm/arm_mve.h +++ b/gcc/config/arm/arm_mve.h @@ -45,23 +45,11 @@ #endif #ifndef __ARM_MVE_PRESERVE_USER_NAMESPACE -#define vst4q(__addr, __value) __arm_vst4q(__addr, __value) #define vuninitializedq(__v) __arm_vuninitializedq(__v) -#define vst2q(__addr, __value) __arm_vst2q(__addr, __value) -#define vld2q(__addr) __arm_vld2q(__addr) -#define vld4q(__addr) __arm_vld4q(__addr) #define vsetq_lane(__a, __b, __idx) __arm_vsetq_lane(__a, __b, __idx) #define vgetq_lane(__a, __idx) __arm_vgetq_lane(__a, __idx) -#define vst4q_s8( __addr, __value) __arm_vst4q_s8( __addr, __value) -#define vst4q_s16( __addr, __value) __arm_vst4q_s16( __addr, __value) -#define vst4q_s32( __addr, __value) __arm_vst4q_s32( __addr, __value) -#define vst4q_u8( __addr, __value) __arm_vst4q_u8( __addr, __value) -#define vst4q_u16( __addr, __value) __arm_vst4q_u16( __addr, __value) -#define vst4q_u32( __addr, __value) __arm_vst4q_u32( __addr, __value) -#define vst4q_f16( __addr, __value) __arm_vst4q_f16( __addr, __value) -#define vst4q_f32( __addr, 
__value) __arm_vst4q_f32( __addr, __value) #define vpnot(__a) __arm_vpnot(__a) #define vuninitializedq_u8(void) __arm_vuninitializedq_u8(void) #define vuninitializedq_u16(void) __arm_vuninitializedq_u16(void) @@ -73,30 +61,6 @@ #define vuninitializedq_s64(void) __arm_vuninitializedq_s64(void) #define vuninitializedq_f16(void) __arm_vuninitializedq_f16(void) #define vuninitializedq_f32(void) __arm_vuninitializedq_f32(void) -#define vst2q_s8(__addr, __value) __arm_vst2q_s8(__addr, __value) -#define vst2q_u8(__addr, __value) __arm_vst2q_u8(__addr, __value) -#define vld2q_s8(__addr) __arm_vld2q_s8(__addr) -#define vld2q_u8(__addr) __arm_vld2q_u8(__addr) -#define vld4q_s8(__addr) __arm_vld4q_s8(__addr) -#define vld4q_u8(__addr) __arm_vld4q_u8(__addr) -#define vst2q_s16(__addr, __value) __arm_vst2q_s16(__addr, __value) -#define vst2q_u16(__addr, __value) __arm_vst2q_u16(__addr, __value) -#define vld2q_s16(__addr) __arm_vld2q_s16(__addr) -#define vld2q_u16(__addr) __arm_vld2q_u16(__addr) -#define vld4q_s16(__addr) __arm_vld4q_s16(__addr) -#define vld4q_u16(__addr) __arm_vld4q_u16(__addr) -#define vst2q_s32(__addr, __value) __arm_vst2q_s32(__addr, __value) -#define vst2q_u32(__addr, __value) __arm_vst2q_u32(__addr, __value) -#define vld2q_s32(__addr) __arm_vld2q_s32(__addr) -#define vld2q_u32(__addr) __arm_vld2q_u32(__addr) -#define vld4q_s32(__addr) __arm_vld4q_s32(__addr) -#define vld4q_u32(__addr) __arm_vld4q_u32(__addr) -#define vld4q_f16(__addr) __arm_vld4q_f16(__addr) -#define vld2q_f16(__addr) __arm_vld2q_f16(__addr) -#define vst2q_f16(__addr, __value) __arm_vst2q_f16(__addr, __value) -#define vld4q_f32(__addr) __arm_vld4q_f32(__addr) -#define vld2q_f32(__addr) __arm_vld2q_f32(__addr) -#define vst2q_f32(__addr, __value) __arm_vst2q_f32(__addr, __value) #define vsetq_lane_f16(__a, __b, __idx) __arm_vsetq_lane_f16(__a, __b, __idx) #define vsetq_lane_f32(__a, __b, __idx) __arm_vsetq_lane_f32(__a, __b, __idx) #define vsetq_lane_s16(__a, __b, __idx) 
__arm_vsetq_lane_s16(__a, __b, __idx) @@ -147,60 +111,6 @@ __builtin_arm_lane_check (__ARM_NUM_LANES(__vec), \ __ARM_LANEQ(__vec, __idx)) -__extension__ extern __inline void -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vst4q_s8 (int8_t * __addr, int8x16x4_t __value) -{ - union { int8x16x4_t __i; __builtin_neon_xi __o; } __rv; - __rv.__i = __value; - __builtin_mve_vst4qv16qi ((__builtin_neon_qi *) __addr, __rv.__o); -} - -__extension__ extern __inline void -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vst4q_s16 (int16_t * __addr, int16x8x4_t __value) -{ - union { int16x8x4_t __i; __builtin_neon_xi __o; } __rv; - __rv.__i = __value; - __builtin_mve_vst4qv8hi ((__builtin_neon_hi *) __addr, __rv.__o); -} - -__extension__ extern __inline void -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vst4q_s32 (int32_t * __addr, int32x4x4_t __value) -{ - union { int32x4x4_t __i; __builtin_neon_xi __o; } __rv; - __rv.__i = __value; - __builtin_mve_vst4qv4si ((__builtin_neon_si *) __addr, __rv.__o); -} - -__extension__ extern __inline void -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vst4q_u8 (uint8_t * __addr, uint8x16x4_t __value) -{ - union { uint8x16x4_t __i; __builtin_neon_xi __o; } __rv; - __rv.__i = __value; - __builtin_mve_vst4qv16qi ((__builtin_neon_qi *) __addr, __rv.__o); -} - -__extension__ extern __inline void -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vst4q_u16 (uint16_t * __addr, uint16x8x4_t __value) -{ - union { uint16x8x4_t __i; __builtin_neon_xi __o; } __rv; - __rv.__i = __value; - __builtin_mve_vst4qv8hi ((__builtin_neon_hi *) __addr, __rv.__o); -} - -__extension__ extern __inline void -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vst4q_u32 (uint32_t * __addr, uint32x4x4_t __value) -{ - union { uint32x4x4_t __i; __builtin_neon_xi __o; } __rv; - __rv.__i = __value; - 
__builtin_mve_vst4qv4si ((__builtin_neon_si *) __addr, __rv.__o); -} - __extension__ extern __inline mve_pred16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vpnot (mve_pred16_t __a) @@ -208,168 +118,6 @@ __arm_vpnot (mve_pred16_t __a) return __builtin_mve_vpnotv16bi (__a); } -__extension__ extern __inline void -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vst2q_s8 (int8_t * __addr, int8x16x2_t __value) -{ - union { int8x16x2_t __i; __builtin_neon_oi __o; } __rv; - __rv.__i = __value; - __builtin_mve_vst2qv16qi ((__builtin_neon_qi *) __addr, __rv.__o); -} - -__extension__ extern __inline void -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vst2q_u8 (uint8_t * __addr, uint8x16x2_t __value) -{ - union { uint8x16x2_t __i; __builtin_neon_oi __o; } __rv; - __rv.__i = __value; - __builtin_mve_vst2qv16qi ((__builtin_neon_qi *) __addr, __rv.__o); -} - -__extension__ extern __inline int8x16x2_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vld2q_s8 (int8_t const * __addr) -{ - union { int8x16x2_t __i; __builtin_neon_oi __o; } __rv; - __rv.__o = __builtin_mve_vld2qv16qi ((__builtin_neon_qi *) __addr); - return __rv.__i; -} - -__extension__ extern __inline uint8x16x2_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vld2q_u8 (uint8_t const * __addr) -{ - union { uint8x16x2_t __i; __builtin_neon_oi __o; } __rv; - __rv.__o = __builtin_mve_vld2qv16qi ((__builtin_neon_qi *) __addr); - return __rv.__i; -} - -__extension__ extern __inline int8x16x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vld4q_s8 (int8_t const * __addr) -{ - union { int8x16x4_t __i; __builtin_neon_xi __o; } __rv; - __rv.__o = __builtin_mve_vld4qv16qi ((__builtin_neon_qi *) __addr); - return __rv.__i; -} - -__extension__ extern __inline uint8x16x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vld4q_u8 
(uint8_t const * __addr) -{ - union { uint8x16x4_t __i; __builtin_neon_xi __o; } __rv; - __rv.__o = __builtin_mve_vld4qv16qi ((__builtin_neon_qi *) __addr); - return __rv.__i; -} - -__extension__ extern __inline void -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vst2q_s16 (int16_t * __addr, int16x8x2_t __value) -{ - union { int16x8x2_t __i; __builtin_neon_oi __o; } __rv; - __rv.__i = __value; - __builtin_mve_vst2qv8hi ((__builtin_neon_hi *) __addr, __rv.__o); -} - -__extension__ extern __inline void -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vst2q_u16 (uint16_t * __addr, uint16x8x2_t __value) -{ - union { uint16x8x2_t __i; __builtin_neon_oi __o; } __rv; - __rv.__i = __value; - __builtin_mve_vst2qv8hi ((__builtin_neon_hi *) __addr, __rv.__o); -} - -__extension__ extern __inline int16x8x2_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vld2q_s16 (int16_t const * __addr) -{ - union { int16x8x2_t __i; __builtin_neon_oi __o; } __rv; - __rv.__o = __builtin_mve_vld2qv8hi ((__builtin_neon_hi *) __addr); - return __rv.__i; -} - -__extension__ extern __inline uint16x8x2_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vld2q_u16 (uint16_t const * __addr) -{ - union { uint16x8x2_t __i; __builtin_neon_oi __o; } __rv; - __rv.__o = __builtin_mve_vld2qv8hi ((__builtin_neon_hi *) __addr); - return __rv.__i; -} - -__extension__ extern __inline int16x8x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vld4q_s16 (int16_t const * __addr) -{ - union { int16x8x4_t __i; __builtin_neon_xi __o; } __rv; - __rv.__o = __builtin_mve_vld4qv8hi ((__builtin_neon_hi *) __addr); - return __rv.__i; -} - -__extension__ extern __inline uint16x8x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vld4q_u16 (uint16_t const * __addr) -{ - union { uint16x8x4_t __i; __builtin_neon_xi __o; } __rv; - __rv.__o = 
__builtin_mve_vld4qv8hi ((__builtin_neon_hi *) __addr); - return __rv.__i; -} - -__extension__ extern __inline void -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vst2q_s32 (int32_t * __addr, int32x4x2_t __value) -{ - union { int32x4x2_t __i; __builtin_neon_oi __o; } __rv; - __rv.__i = __value; - __builtin_mve_vst2qv4si ((__builtin_neon_si *) __addr, __rv.__o); -} - -__extension__ extern __inline void -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vst2q_u32 (uint32_t * __addr, uint32x4x2_t __value) -{ - union { uint32x4x2_t __i; __builtin_neon_oi __o; } __rv; - __rv.__i = __value; - __builtin_mve_vst2qv4si ((__builtin_neon_si *) __addr, __rv.__o); -} - -__extension__ extern __inline int32x4x2_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vld2q_s32 (int32_t const * __addr) -{ - union { int32x4x2_t __i; __builtin_neon_oi __o; } __rv; - __rv.__o = __builtin_mve_vld2qv4si ((__builtin_neon_si *) __addr); - return __rv.__i; -} - -__extension__ extern __inline uint32x4x2_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vld2q_u32 (uint32_t const * __addr) -{ - union { uint32x4x2_t __i; __builtin_neon_oi __o; } __rv; - __rv.__o = __builtin_mve_vld2qv4si ((__builtin_neon_si *) __addr); - return __rv.__i; -} - -__extension__ extern __inline int32x4x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vld4q_s32 (int32_t const * __addr) -{ - union { int32x4x4_t __i; __builtin_neon_xi __o; } __rv; - __rv.__o = __builtin_mve_vld4qv4si ((__builtin_neon_si *) __addr); - return __rv.__i; -} - -__extension__ extern __inline uint32x4x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vld4q_u32 (uint32_t const * __addr) -{ - union { uint32x4x4_t __i; __builtin_neon_xi __o; } __rv; - __rv.__o = __builtin_mve_vld4qv4si ((__builtin_neon_si *) __addr); - return __rv.__i; -} - __extension__ extern __inline int16x8_t 
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vsetq_lane_s16 (int16_t __a, int16x8_t __b, const int __idx) @@ -620,78 +368,6 @@ __arm_srshr (int32_t value, const int shift) #if (__ARM_FEATURE_MVE & 2) /* MVE Floating point. */ -__extension__ extern __inline void -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vst4q_f16 (float16_t * __addr, float16x8x4_t __value) -{ - union { float16x8x4_t __i; __builtin_neon_xi __o; } __rv; - __rv.__i = __value; - __builtin_mve_vst4qv8hf (__addr, __rv.__o); -} - -__extension__ extern __inline void -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vst4q_f32 (float32_t * __addr, float32x4x4_t __value) -{ - union { float32x4x4_t __i; __builtin_neon_xi __o; } __rv; - __rv.__i = __value; - __builtin_mve_vst4qv4sf (__addr, __rv.__o); -} - -__extension__ extern __inline float16x8x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vld4q_f16 (float16_t const * __addr) -{ - union { float16x8x4_t __i; __builtin_neon_xi __o; } __rv; - __rv.__o = __builtin_mve_vld4qv8hf (__addr); - return __rv.__i; -} - -__extension__ extern __inline float16x8x2_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vld2q_f16 (float16_t const * __addr) -{ - union { float16x8x2_t __i; __builtin_neon_oi __o; } __rv; - __rv.__o = __builtin_mve_vld2qv8hf (__addr); - return __rv.__i; -} - -__extension__ extern __inline void -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vst2q_f16 (float16_t * __addr, float16x8x2_t __value) -{ - union { float16x8x2_t __i; __builtin_neon_oi __o; } __rv; - __rv.__i = __value; - __builtin_mve_vst2qv8hf (__addr, __rv.__o); -} - -__extension__ extern __inline float32x4x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vld4q_f32 (float32_t const * __addr) -{ - union { float32x4x4_t __i; __builtin_neon_xi __o; } __rv; - __rv.__o = 
__builtin_mve_vld4qv4sf (__addr); - return __rv.__i; -} - -__extension__ extern __inline float32x4x2_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vld2q_f32 (float32_t const * __addr) -{ - union { float32x4x2_t __i; __builtin_neon_oi __o; } __rv; - __rv.__o = __builtin_mve_vld2qv4sf (__addr); - return __rv.__i; -} - -__extension__ extern __inline void -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vst2q_f32 (float32_t * __addr, float32x4x2_t __value) -{ - union { float32x4x2_t __i; __builtin_neon_oi __o; } __rv; - __rv.__i = __value; - __builtin_mve_vst2qv4sf (__addr, __rv.__o); -} - __extension__ extern __inline float16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vsetq_lane_f16 (float16_t __a, float16x8_t __b, const int __idx) @@ -728,173 +404,6 @@ __arm_vgetq_lane_f32 (float32x4_t __a, const int __idx) #endif #ifdef __cplusplus -__extension__ extern __inline void -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vst4q (int8_t * __addr, int8x16x4_t __value) -{ - __arm_vst4q_s8 (__addr, __value); -} - -__extension__ extern __inline void -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vst4q (int16_t * __addr, int16x8x4_t __value) -{ - __arm_vst4q_s16 (__addr, __value); -} - -__extension__ extern __inline void -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vst4q (int32_t * __addr, int32x4x4_t __value) -{ - __arm_vst4q_s32 (__addr, __value); -} - -__extension__ extern __inline void -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vst4q (uint8_t * __addr, uint8x16x4_t __value) -{ - __arm_vst4q_u8 (__addr, __value); -} - -__extension__ extern __inline void -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vst4q (uint16_t * __addr, uint16x8x4_t __value) -{ - __arm_vst4q_u16 (__addr, __value); -} - -__extension__ extern __inline void 
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vst4q (uint32_t * __addr, uint32x4x4_t __value) -{ - __arm_vst4q_u32 (__addr, __value); -} - -__extension__ extern __inline void -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vst2q (int8_t * __addr, int8x16x2_t __value) -{ - __arm_vst2q_s8 (__addr, __value); -} - -__extension__ extern __inline void -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vst2q (uint8_t * __addr, uint8x16x2_t __value) -{ - __arm_vst2q_u8 (__addr, __value); -} - -__extension__ extern __inline int8x16x2_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vld2q (int8_t const * __addr) -{ - return __arm_vld2q_s8 (__addr); -} - -__extension__ extern __inline uint8x16x2_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vld2q (uint8_t const * __addr) -{ - return __arm_vld2q_u8 (__addr); -} - -__extension__ extern __inline int8x16x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vld4q (int8_t const * __addr) -{ - return __arm_vld4q_s8 (__addr); -} - -__extension__ extern __inline uint8x16x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vld4q (uint8_t const * __addr) -{ - return __arm_vld4q_u8 (__addr); -} - -__extension__ extern __inline void -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vst2q (int16_t * __addr, int16x8x2_t __value) -{ - __arm_vst2q_s16 (__addr, __value); -} - -__extension__ extern __inline void -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vst2q (uint16_t * __addr, uint16x8x2_t __value) -{ - __arm_vst2q_u16 (__addr, __value); -} - -__extension__ extern __inline int16x8x2_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vld2q (int16_t const * __addr) -{ - return __arm_vld2q_s16 (__addr); -} - -__extension__ extern __inline uint16x8x2_t 
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vld2q (uint16_t const * __addr) -{ - return __arm_vld2q_u16 (__addr); -} - -__extension__ extern __inline int16x8x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vld4q (int16_t const * __addr) -{ - return __arm_vld4q_s16 (__addr); -} - -__extension__ extern __inline uint16x8x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vld4q (uint16_t const * __addr) -{ - return __arm_vld4q_u16 (__addr); -} - -__extension__ extern __inline void -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vst2q (int32_t * __addr, int32x4x2_t __value) -{ - __arm_vst2q_s32 (__addr, __value); -} - -__extension__ extern __inline void -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vst2q (uint32_t * __addr, uint32x4x2_t __value) -{ - __arm_vst2q_u32 (__addr, __value); -} - -__extension__ extern __inline int32x4x2_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vld2q (int32_t const * __addr) -{ - return __arm_vld2q_s32 (__addr); -} - -__extension__ extern __inline uint32x4x2_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vld2q (uint32_t const * __addr) -{ - return __arm_vld2q_u32 (__addr); -} - -__extension__ extern __inline int32x4x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vld4q (int32_t const * __addr) -{ - return __arm_vld4q_s32 (__addr); -} - -__extension__ extern __inline uint32x4x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vld4q (uint32_t const * __addr) -{ - return __arm_vld4q_u32 (__addr); -} __extension__ extern __inline int16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) @@ -1010,62 +519,6 @@ __arm_vgetq_lane (uint64x2_t __a, const int __idx) #if (__ARM_FEATURE_MVE & 2) /* MVE Floating point. 
*/ -__extension__ extern __inline void -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vst4q (float16_t * __addr, float16x8x4_t __value) -{ - __arm_vst4q_f16 (__addr, __value); -} - -__extension__ extern __inline void -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vst4q (float32_t * __addr, float32x4x4_t __value) -{ - __arm_vst4q_f32 (__addr, __value); -} - -__extension__ extern __inline float16x8x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vld4q (float16_t const * __addr) -{ - return __arm_vld4q_f16 (__addr); -} - -__extension__ extern __inline float16x8x2_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vld2q (float16_t const * __addr) -{ - return __arm_vld2q_f16 (__addr); -} - -__extension__ extern __inline void -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vst2q (float16_t * __addr, float16x8x2_t __value) -{ - __arm_vst2q_f16 (__addr, __value); -} - -__extension__ extern __inline float32x4x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vld4q (float32_t const * __addr) -{ - return __arm_vld4q_f32 (__addr); -} - -__extension__ extern __inline float32x4x2_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vld2q (float32_t const * __addr) -{ - return __arm_vld2q_f32 (__addr); -} - -__extension__ extern __inline void -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vst2q (float32_t * __addr, float32x4x2_t __value) -{ - __arm_vst2q_f32 (__addr, __value); -} - __extension__ extern __inline float16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vsetq_lane (float16_t __a, float16x8_t __b, const int __idx) @@ -1405,51 +858,6 @@ extern void *__ARM_undef; #if (__ARM_FEATURE_MVE & 2) /* MVE Floating point. 
*/ -#define __arm_vst4q(p0,p1) ({ __typeof(p0) __p0 = (p0); \ - __typeof(p1) __p1 = (p1); \ - _Generic( (int (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)])0, \ - int (*)[__ARM_mve_type_int8_t_ptr][__ARM_mve_type_int8x16x4_t]: __arm_vst4q_s8 (__ARM_mve_coerce_s8_ptr(__p0, int8_t *), __ARM_mve_coerce(__p1, int8x16x4_t)), \ - int (*)[__ARM_mve_type_int16_t_ptr][__ARM_mve_type_int16x8x4_t]: __arm_vst4q_s16 (__ARM_mve_coerce_s16_ptr(__p0, int16_t *), __ARM_mve_coerce(__p1, int16x8x4_t)), \ - int (*)[__ARM_mve_type_int32_t_ptr][__ARM_mve_type_int32x4x4_t]: __arm_vst4q_s32 (__ARM_mve_coerce_s32_ptr(__p0, int32_t *), __ARM_mve_coerce(__p1, int32x4x4_t)), \ - int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint8x16x4_t]: __arm_vst4q_u8 (__ARM_mve_coerce_u8_ptr(__p0, uint8_t *), __ARM_mve_coerce(__p1, uint8x16x4_t)), \ - int (*)[__ARM_mve_type_uint16_t_ptr][__ARM_mve_type_uint16x8x4_t]: __arm_vst4q_u16 (__ARM_mve_coerce_u16_ptr(__p0, uint16_t *), __ARM_mve_coerce(__p1, uint16x8x4_t)), \ - int (*)[__ARM_mve_type_uint32_t_ptr][__ARM_mve_type_uint32x4x4_t]: __arm_vst4q_u32 (__ARM_mve_coerce_u32_ptr(__p0, uint32_t *), __ARM_mve_coerce(__p1, uint32x4x4_t)), \ - int (*)[__ARM_mve_type_float16_t_ptr][__ARM_mve_type_float16x8x4_t]: __arm_vst4q_f16 (__ARM_mve_coerce_f16_ptr(__p0, float16_t *), __ARM_mve_coerce(__p1, float16x8x4_t)), \ - int (*)[__ARM_mve_type_float32_t_ptr][__ARM_mve_type_float32x4x4_t]: __arm_vst4q_f32 (__ARM_mve_coerce_f32_ptr(__p0, float32_t *), __ARM_mve_coerce(__p1, float32x4x4_t)));}) - -#define __arm_vld2q(p0) ( \ - _Generic( (int (*)[__ARM_mve_typeid(p0)])0, \ - int (*)[__ARM_mve_type_int8_t_ptr]: __arm_vld2q_s8 (__ARM_mve_coerce_s8_ptr(p0, int8_t *)), \ - int (*)[__ARM_mve_type_int16_t_ptr]: __arm_vld2q_s16 (__ARM_mve_coerce_s16_ptr(p0, int16_t *)), \ - int (*)[__ARM_mve_type_int32_t_ptr]: __arm_vld2q_s32 (__ARM_mve_coerce_s32_ptr(p0, int32_t *)), \ - int (*)[__ARM_mve_type_uint8_t_ptr]: __arm_vld2q_u8 (__ARM_mve_coerce_u8_ptr(p0, uint8_t *)), \ - 
int (*)[__ARM_mve_type_uint16_t_ptr]: __arm_vld2q_u16 (__ARM_mve_coerce_u16_ptr(p0, uint16_t *)), \ - int (*)[__ARM_mve_type_uint32_t_ptr]: __arm_vld2q_u32 (__ARM_mve_coerce_u32_ptr(p0, uint32_t *)), \ - int (*)[__ARM_mve_type_float16_t_ptr]: __arm_vld2q_f16 (__ARM_mve_coerce_f16_ptr(p0, float16_t *)), \ - int (*)[__ARM_mve_type_float32_t_ptr]: __arm_vld2q_f32 (__ARM_mve_coerce_f32_ptr(p0, float32_t *)))) - -#define __arm_vld4q(p0) ( \ - _Generic( (int (*)[__ARM_mve_typeid(p0)])0, \ - int (*)[__ARM_mve_type_int8_t_ptr]: __arm_vld4q_s8 (__ARM_mve_coerce_s8_ptr(p0, int8_t *)), \ - int (*)[__ARM_mve_type_int16_t_ptr]: __arm_vld4q_s16 (__ARM_mve_coerce_s16_ptr(p0, int16_t *)), \ - int (*)[__ARM_mve_type_int32_t_ptr]: __arm_vld4q_s32 (__ARM_mve_coerce_s32_ptr(p0, int32_t *)), \ - int (*)[__ARM_mve_type_uint8_t_ptr]: __arm_vld4q_u8 (__ARM_mve_coerce_u8_ptr(p0, uint8_t *)), \ - int (*)[__ARM_mve_type_uint16_t_ptr]: __arm_vld4q_u16 (__ARM_mve_coerce_u16_ptr(p0, uint16_t *)), \ - int (*)[__ARM_mve_type_uint32_t_ptr]: __arm_vld4q_u32 (__ARM_mve_coerce_u32_ptr(p0, uint32_t *)), \ - int (*)[__ARM_mve_type_float16_t_ptr]: __arm_vld4q_f16 (__ARM_mve_coerce_f16_ptr(p0, float16_t *)), \ - int (*)[__ARM_mve_type_float32_t_ptr]: __arm_vld4q_f32 (__ARM_mve_coerce_f32_ptr(p0, float32_t *)))) - -#define __arm_vst2q(p0,p1) ({ __typeof(p1) __p1 = (p1); \ - _Generic( (int (*)[__ARM_mve_typeid(p0)][__ARM_mve_typeid(__p1)])0, \ - int (*)[__ARM_mve_type_int8_t_ptr][__ARM_mve_type_int8x16x2_t]: __arm_vst2q_s8 (__ARM_mve_coerce_s8_ptr(p0, int8_t *), __ARM_mve_coerce(__p1, int8x16x2_t)), \ - int (*)[__ARM_mve_type_int16_t_ptr][__ARM_mve_type_int16x8x2_t]: __arm_vst2q_s16 (__ARM_mve_coerce_s16_ptr(p0, int16_t *), __ARM_mve_coerce(__p1, int16x8x2_t)), \ - int (*)[__ARM_mve_type_int32_t_ptr][__ARM_mve_type_int32x4x2_t]: __arm_vst2q_s32 (__ARM_mve_coerce_s32_ptr(p0, int32_t *), __ARM_mve_coerce(__p1, int32x4x2_t)), \ - int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint8x16x2_t]: __arm_vst2q_u8 
(__ARM_mve_coerce_u8_ptr(p0, uint8_t *), __ARM_mve_coerce(__p1, uint8x16x2_t)), \ - int (*)[__ARM_mve_type_uint16_t_ptr][__ARM_mve_type_uint16x8x2_t]: __arm_vst2q_u16 (__ARM_mve_coerce_u16_ptr(p0, uint16_t *), __ARM_mve_coerce(__p1, uint16x8x2_t)), \ - int (*)[__ARM_mve_type_uint32_t_ptr][__ARM_mve_type_uint32x4x2_t]: __arm_vst2q_u32 (__ARM_mve_coerce_u32_ptr(p0, uint32_t *), __ARM_mve_coerce(__p1, uint32x4x2_t)), \ - int (*)[__ARM_mve_type_float16_t_ptr][__ARM_mve_type_float16x8x2_t]: __arm_vst2q_f16 (__ARM_mve_coerce_f16_ptr(p0, float16_t *), __ARM_mve_coerce(__p1, float16x8x2_t)), \ - int (*)[__ARM_mve_type_float32_t_ptr][__ARM_mve_type_float32x4x2_t]: __arm_vst2q_f32 (__ARM_mve_coerce_f32_ptr(p0, float32_t *), __ARM_mve_coerce(__p1, float32x4x2_t)));}) - #define __arm_vuninitializedq(p0) ({ __typeof(p0) __p0 = (p0); \ _Generic( (int (*)[__ARM_mve_typeid(__p0)])0, \ int (*)[__ARM_mve_type_int8x16_t]: __arm_vuninitializedq_s8 (), \ @@ -1492,25 +900,6 @@ extern void *__ARM_undef; #else /* MVE Integer. 
*/ -#define __arm_vst4q(p0,p1) ({ __typeof(p1) __p1 = (p1); \ - _Generic( (int (*)[__ARM_mve_typeid(p0)][__ARM_mve_typeid(__p1)])0, \ - int (*)[__ARM_mve_type_int8_t_ptr][__ARM_mve_type_int8x16x4_t]: __arm_vst4q_s8 (__ARM_mve_coerce_s8_ptr(p0, int8_t *), __ARM_mve_coerce(__p1, int8x16x4_t)), \ - int (*)[__ARM_mve_type_int16_t_ptr][__ARM_mve_type_int16x8x4_t]: __arm_vst4q_s16 (__ARM_mve_coerce_s16_ptr(p0, int16_t *), __ARM_mve_coerce(__p1, int16x8x4_t)), \ - int (*)[__ARM_mve_type_int32_t_ptr][__ARM_mve_type_int32x4x4_t]: __arm_vst4q_s32 (__ARM_mve_coerce_s32_ptr(p0, int32_t *), __ARM_mve_coerce(__p1, int32x4x4_t)), \ - int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint8x16x4_t]: __arm_vst4q_u8 (__ARM_mve_coerce_u8_ptr(p0, uint8_t *), __ARM_mve_coerce(__p1, uint8x16x4_t)), \ - int (*)[__ARM_mve_type_uint16_t_ptr][__ARM_mve_type_uint16x8x4_t]: __arm_vst4q_u16 (__ARM_mve_coerce_u16_ptr(p0, uint16_t *), __ARM_mve_coerce(__p1, uint16x8x4_t)), \ - int (*)[__ARM_mve_type_uint32_t_ptr][__ARM_mve_type_uint32x4x4_t]: __arm_vst4q_u32 (__ARM_mve_coerce_u32_ptr(p0, uint32_t *), __ARM_mve_coerce(__p1, uint32x4x4_t)));}) - -#define __arm_vst2q(p0,p1) ({ __typeof(p1) __p1 = (p1); \ - _Generic( (int (*)[__ARM_mve_typeid(p0)][__ARM_mve_typeid(__p1)])0, \ - int (*)[__ARM_mve_type_int8_t_ptr][__ARM_mve_type_int8x16x2_t]: __arm_vst2q_s8 (__ARM_mve_coerce_s8_ptr(p0, int8_t *), __ARM_mve_coerce(__p1, int8x16x2_t)), \ - int (*)[__ARM_mve_type_int16_t_ptr][__ARM_mve_type_int16x8x2_t]: __arm_vst2q_s16 (__ARM_mve_coerce_s16_ptr(p0, int16_t *), __ARM_mve_coerce(__p1, int16x8x2_t)), \ - int (*)[__ARM_mve_type_int32_t_ptr][__ARM_mve_type_int32x4x2_t]: __arm_vst2q_s32 (__ARM_mve_coerce_s32_ptr(p0, int32_t *), __ARM_mve_coerce(__p1, int32x4x2_t)), \ - int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint8x16x2_t]: __arm_vst2q_u8 (__ARM_mve_coerce_u8_ptr(p0, uint8_t *), __ARM_mve_coerce(__p1, uint8x16x2_t)), \ - int (*)[__ARM_mve_type_uint16_t_ptr][__ARM_mve_type_uint16x8x2_t]: 
__arm_vst2q_u16 (__ARM_mve_coerce_u16_ptr(p0, uint16_t *), __ARM_mve_coerce(__p1, uint16x8x2_t)), \ - int (*)[__ARM_mve_type_uint32_t_ptr][__ARM_mve_type_uint32x4x2_t]: __arm_vst2q_u32 (__ARM_mve_coerce_u32_ptr(p0, uint32_t *), __ARM_mve_coerce(__p1, uint32x4x2_t)));}) - - #define __arm_vuninitializedq(p0) ({ __typeof(p0) __p0 = (p0); \ _Generic( (int (*)[__ARM_mve_typeid(__p0)])0, \ int (*)[__ARM_mve_type_int8x16_t]: __arm_vuninitializedq_s8 (), \ @@ -1522,23 +911,6 @@ extern void *__ARM_undef; int (*)[__ARM_mve_type_uint32x4_t]: __arm_vuninitializedq_u32 (), \ int (*)[__ARM_mve_type_uint64x2_t]: __arm_vuninitializedq_u64 ());}) -#define __arm_vld2q(p0) ( _Generic( (int (*)[__ARM_mve_typeid(p0)])0, \ - int (*)[__ARM_mve_type_int8_t_ptr]: __arm_vld2q_s8 (__ARM_mve_coerce_s8_ptr(p0, int8_t *)), \ - int (*)[__ARM_mve_type_int16_t_ptr]: __arm_vld2q_s16 (__ARM_mve_coerce_s16_ptr(p0, int16_t *)), \ - int (*)[__ARM_mve_type_int32_t_ptr]: __arm_vld2q_s32 (__ARM_mve_coerce_s32_ptr(p0, int32_t *)), \ - int (*)[__ARM_mve_type_uint8_t_ptr]: __arm_vld2q_u8 (__ARM_mve_coerce_u8_ptr(p0, uint8_t *)), \ - int (*)[__ARM_mve_type_uint16_t_ptr]: __arm_vld2q_u16 (__ARM_mve_coerce_u16_ptr(p0, uint16_t *)), \ - int (*)[__ARM_mve_type_uint32_t_ptr]: __arm_vld2q_u32 (__ARM_mve_coerce_u32_ptr(p0, uint32_t *)))) - - -#define __arm_vld4q(p0) ( _Generic( (int (*)[__ARM_mve_typeid(p0)])0, \ - int (*)[__ARM_mve_type_int8_t_ptr]: __arm_vld4q_s8 (__ARM_mve_coerce_s8_ptr(p0, int8_t *)), \ - int (*)[__ARM_mve_type_int16_t_ptr]: __arm_vld4q_s16 (__ARM_mve_coerce_s16_ptr(p0, int16_t *)), \ - int (*)[__ARM_mve_type_int32_t_ptr]: __arm_vld4q_s32 (__ARM_mve_coerce_s32_ptr(p0, int32_t *)), \ - int (*)[__ARM_mve_type_uint8_t_ptr]: __arm_vld4q_u8 (__ARM_mve_coerce_u8_ptr(p0, uint8_t *)), \ - int (*)[__ARM_mve_type_uint16_t_ptr]: __arm_vld4q_u16 (__ARM_mve_coerce_u16_ptr(p0, uint16_t *)), \ - int (*)[__ARM_mve_type_uint32_t_ptr]: __arm_vld4q_u32 (__ARM_mve_coerce_u32_ptr(p0, uint32_t *)))) - #define 
__arm_vgetq_lane(p0,p1) ({ __typeof(p0) __p0 = (p0); \ _Generic( (int (*)[__ARM_mve_typeid(__p0)])0, \ int (*)[__ARM_mve_type_int8x16_t]: __arm_vgetq_lane_s8 (__ARM_mve_coerce(__p0, int8x16_t), p1), \ diff --git a/gcc/config/arm/arm_mve_builtins.def b/gcc/config/arm/arm_mve_builtins.def index b85b334a81e..90d8f90b98f 100644 --- a/gcc/config/arm/arm_mve_builtins.def +++ b/gcc/config/arm/arm_mve_builtins.def @@ -18,7 +18,6 @@ along with GCC; see the file COPYING3. If not see <http://www.gnu.org/licenses/>. */ -VAR5 (STORE1, vst4q, v16qi, v8hi, v4si, v8hf, v4sf) VAR2 (UNOP_NONE_NONE, vrndxq_f, v8hf, v4sf) VAR2 (UNOP_NONE_NONE, vrndq_f, v8hf, v4sf) VAR2 (UNOP_NONE_NONE, vrndpq_f, v8hf, v4sf) @@ -679,9 +678,6 @@ VAR1 (QUADOP_NONE_NONE_NONE_NONE_PRED, vsbciq_m_s, v4si) VAR1 (QUADOP_UNONE_UNONE_UNONE_UNONE_PRED, vsbciq_m_u, v4si) VAR1 (QUADOP_NONE_NONE_NONE_NONE_PRED, vsbcq_m_s, v4si) VAR1 (QUADOP_UNONE_UNONE_UNONE_UNONE_PRED, vsbcq_m_u, v4si) -VAR5 (STORE1, vst2q, v16qi, v8hi, v4si, v8hf, v4sf) -VAR5 (LOAD1, vld4q, v16qi, v8hi, v4si, v8hf, v4sf) -VAR5 (LOAD1, vld2q, v16qi, v8hi, v4si, v8hf, v4sf) VAR1 (ASRL, sqrshr_,si) VAR1 (ASRL, sqrshrl_sat64_,di) VAR1 (ASRL, sqrshrl_sat48_,di) diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md index 1caf5d18ad6..cfe712ceda9 100644 --- a/gcc/config/arm/iterators.md +++ b/gcc/config/arm/iterators.md @@ -139,7 +139,18 @@ (define_mode_iterator VQXMOV [V16QI V8HI V8HF V8BF V4SI V4SF V2DI TI]) ;; Opaque structure types wider than TImode. 
(define_mode_iterator VSTRUCT [(EI "!TARGET_HAVE_MVE") OI - (CI "!TARGET_HAVE_MVE") XI]) + (CI "!TARGET_HAVE_MVE") XI + (V2x16QI "TARGET_HAVE_MVE") + (V2x8HI "TARGET_HAVE_MVE") + (V2x4SI "TARGET_HAVE_MVE") + (V2x8HF "TARGET_HAVE_MVE_FLOAT") + (V2x4SF "TARGET_HAVE_MVE_FLOAT") + (V4x16QI "TARGET_HAVE_MVE") + (V4x8HI "TARGET_HAVE_MVE") + (V4x4SI "TARGET_HAVE_MVE") + (V4x8HF "TARGET_HAVE_MVE_FLOAT") + (V4x4SF "TARGET_HAVE_MVE_FLOAT") + ]) ;; Opaque structure types used in table lookups (except vtbl1/vtbx1). (define_mode_iterator VTAB [TI EI OI]) @@ -286,6 +297,29 @@ (define_mode_iterator MVE_7_HI [HI V16BI V8BI V4BI V2QI]) (define_mode_iterator MVE_V8HF [V8HF]) (define_mode_iterator MVE_V16QI [V16QI]) +(define_mode_attr MVE_VLD2_VST2 [(V16QI "V2x16QI") + (V8HI "V2x8HI") + (V4SI "V2x4SI") + (V8HF "V2x8HF") + (V4SF "V2x4SF")]) +(define_mode_attr MVE_vld2_vst2 [(V16QI "v2x16qi") + (V8HI "v2x8hi") + (V4SI "v2x4si") + (V8HF "v2x8hf") + (V4SF "v2x4sf")]) + +(define_mode_attr MVE_VLD4_VST4 [(V16QI "V4x16QI") + (V8HI "V4x8HI") + (V4SI "V4x4SI") + (V8HF "V4x8HF") + (V4SF "V4x4SF")]) + +(define_mode_attr MVE_vld4_vst4 [(V16QI "v4x16qi") + (V8HI "v4x8hi") + (V4SI "v4x4si") + (V8HF "v4x8hf") + (V4SF "v4x4sf")]) + ;; Types for MVE truncating stores and widening loads (define_mode_iterator MVE_w_narrow_TYPE [V8QI V4QI V4HI]) (define_mode_attr MVE_w_narrow_type [(V8QI "v8qi") (V4QI "v4qi") (V4HI "v4hi")]) diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md index 70f6ec6c2cc..325dad87833 100644 --- a/gcc/config/arm/mve.md +++ b/gcc/config/arm/mve.md @@ -110,13 +110,14 @@ (define_insn "@mve_vdupq_n<mode>" ;; ;; [vst4q]) ;; -(define_insn "mve_vst4q<mode>" - [(set (match_operand:XI 0 "mve_struct_operand" "=Ug") - (unspec:XI [(match_operand:XI 1 "s_register_operand" "w") - (unspec:MVE_VLD_ST [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] +(define_insn "@mve_vst4q<mode>" + [(set (match_operand:<MVE_VLD4_VST4> 0 "mve_struct_operand" "=Ug") + (unspec:<MVE_VLD4_VST4> + 
[(match_operand:<MVE_VLD4_VST4> 1 "s_register_operand" "w") + (unspec:MVE_VLD_ST [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] VST4Q)) ] - "TARGET_HAVE_MVE" + "(TARGET_HAVE_MVE && VALID_MVE_STRUCT_MODE (<MVE_VLD4_VST4>mode))" { rtx ops[6]; int regno = REGNO (operands[1]); @@ -4061,14 +4062,14 @@ (define_insn "@mve_<mve_insn>q_m_<supf>v4si" ;; ;; [vst2q]) ;; -(define_insn "mve_vst2q<mode>" - [(set (match_operand:OI 0 "mve_struct_operand" "=Ug") - (unspec:OI [(match_operand:OI 1 "s_register_operand" "w") - (unspec:MVE_VLD_ST [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] +(define_insn "@mve_vst2q<mode>" + [(set (match_operand:<MVE_VLD2_VST2> 0 "mve_struct_operand" "=Ug") + (unspec:<MVE_VLD2_VST2> + [(match_operand:<MVE_VLD2_VST2> 1 "s_register_operand" "w") + (unspec:MVE_VLD_ST [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] VST2Q)) ] - "(TARGET_HAVE_MVE && VALID_MVE_SI_MODE (<MODE>mode)) - || (TARGET_HAVE_MVE_FLOAT && VALID_MVE_SF_MODE (<MODE>mode))" + "(TARGET_HAVE_MVE && VALID_MVE_STRUCT_MODE (<MVE_VLD2_VST2>mode))" { rtx ops[4]; int regno = REGNO (operands[1]); @@ -4089,14 +4090,14 @@ (define_insn "mve_vst2q<mode>" ;; ;; [vld2q]) ;; -(define_insn "mve_vld2q<mode>" - [(set (match_operand:OI 0 "s_register_operand" "=w") - (unspec:OI [(match_operand:OI 1 "mve_struct_operand" "Ug") - (unspec:MVE_VLD_ST [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] +(define_insn "@mve_vld2q<mode>" + [(set (match_operand:<MVE_VLD2_VST2> 0 "s_register_operand" "=w") + (unspec:<MVE_VLD2_VST2> + [(match_operand:<MVE_VLD2_VST2> 1 "mve_struct_operand" "Ug") + (unspec:MVE_VLD_ST [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] VLD2Q)) ] - "(TARGET_HAVE_MVE && VALID_MVE_SI_MODE (<MODE>mode)) - || (TARGET_HAVE_MVE_FLOAT && VALID_MVE_SF_MODE (<MODE>mode))" + "(TARGET_HAVE_MVE && VALID_MVE_STRUCT_MODE (<MVE_VLD2_VST2>mode))" { rtx ops[4]; int regno = REGNO (operands[0]); @@ -4117,14 +4118,14 @@ (define_insn "mve_vld2q<mode>" ;; ;; [vld4q]) ;; -(define_insn "mve_vld4q<mode>" - [(set (match_operand:XI 0 "s_register_operand" "=w") - (unspec:XI 
[(match_operand:XI 1 "mve_struct_operand" "Ug") - (unspec:MVE_VLD_ST [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] +(define_insn "@mve_vld4q<mode>" + [(set (match_operand:<MVE_VLD4_VST4> 0 "s_register_operand" "=w") + (unspec:<MVE_VLD4_VST4> + [(match_operand:<MVE_VLD4_VST4> 1 "mve_struct_operand" "Ug") + (unspec:MVE_VLD_ST [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] VLD4Q)) ] - "(TARGET_HAVE_MVE && VALID_MVE_SI_MODE (<MODE>mode)) - || (TARGET_HAVE_MVE_FLOAT && VALID_MVE_SF_MODE (<MODE>mode))" + "(TARGET_HAVE_MVE && VALID_MVE_STRUCT_MODE (<MVE_VLD4_VST4>mode))" { rtx ops[6]; int regno = REGNO (operands[0]); diff --git a/gcc/config/arm/vec-common.md b/gcc/config/arm/vec-common.md index ff1c27a0d71..03a5cf9e7e3 100644 --- a/gcc/config/arm/vec-common.md +++ b/gcc/config/arm/vec-common.md @@ -492,12 +492,21 @@ (define_expand "vec_load_lanesoi<mode>" (unspec:OI [(match_operand:OI 1 "neon_struct_operand") (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] UNSPEC_VLD2))] - "TARGET_NEON || TARGET_HAVE_MVE" + "TARGET_NEON" { - if (TARGET_NEON) - emit_insn (gen_neon_vld2<mode> (operands[0], operands[1])); - else - emit_insn (gen_mve_vld2q<mode> (operands[0], operands[1])); + emit_insn (gen_neon_vld2<mode> (operands[0], operands[1])); + DONE; +}) + +;;; On MVE we use V2xYYY modes instead of OI +(define_expand "vec_load_lanes<MVE_vld2_vst2><mode>" + [(set (match_operand:<MVE_VLD2_VST2> 0 "s_register_operand") + (unspec:<MVE_VLD2_VST2> [(match_operand:<MVE_VLD2_VST2> 1 "neon_struct_operand") + (unspec:MVE_VLD_ST [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + UNSPEC_VLD2))] + "(TARGET_HAVE_MVE && VALID_MVE_STRUCT_MODE (<MVE_VLD2_VST2>mode))" +{ + emit_insn (gen_mve_vld2q<mode> (operands[0], operands[1])); DONE; }) @@ -506,12 +515,21 @@ (define_expand "vec_store_lanesoi<mode>" (unspec:OI [(match_operand:OI 1 "s_register_operand") (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] UNSPEC_VST2))] - "TARGET_NEON || TARGET_HAVE_MVE" + "TARGET_NEON" { - if (TARGET_NEON) - emit_insn (gen_neon_vst2<mode> 
(operands[0], operands[1])); - else - emit_insn (gen_mve_vst2q<mode> (operands[0], operands[1])); + emit_insn (gen_neon_vst2<mode> (operands[0], operands[1])); + DONE; +}) + +;;; On MVE we use V2xYYY modes instead of OI +(define_expand "vec_store_lanes<MVE_vld2_vst2><mode>" + [(set (match_operand:<MVE_VLD2_VST2> 0 "neon_struct_operand") + (unspec:<MVE_VLD2_VST2> [(match_operand:<MVE_VLD2_VST2> 1 "s_register_operand") + (unspec:MVE_VLD_ST [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + UNSPEC_VST2))] + "(TARGET_HAVE_MVE && VALID_MVE_STRUCT_MODE (<MVE_VLD2_VST2>mode))" +{ + emit_insn (gen_mve_vst2q<mode> (operands[0], operands[1])); DONE; }) @@ -519,12 +537,21 @@ (define_expand "vec_load_lanesxi<mode>" [(match_operand:XI 0 "s_register_operand") (match_operand:XI 1 "neon_struct_operand") (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] - "TARGET_NEON || TARGET_HAVE_MVE" + "TARGET_NEON" { - if (TARGET_NEON) - emit_insn (gen_neon_vld4<mode> (operands[0], operands[1])); - else - emit_insn (gen_mve_vld4q<mode> (operands[0], operands[1])); + emit_insn (gen_neon_vld4<mode> (operands[0], operands[1])); + DONE; +}) + +;;; On MVE we use V4xYYY modes instead of XI +(define_expand "vec_load_lanes<MVE_vld4_vst4><mode>" + [(set (match_operand:<MVE_VLD4_VST4> 0 "s_register_operand") + (unspec:<MVE_VLD4_VST4> [(match_operand:<MVE_VLD4_VST4> 1 "neon_struct_operand") + (unspec:MVE_VLD_ST [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + UNSPEC_VLD4))] + "(TARGET_HAVE_MVE && VALID_MVE_STRUCT_MODE (<MVE_VLD4_VST4>mode))" +{ + emit_insn (gen_mve_vld4q<mode> (operands[0], operands[1])); DONE; }) @@ -532,12 +559,21 @@ (define_expand "vec_store_lanesxi<mode>" [(match_operand:XI 0 "neon_struct_operand") (match_operand:XI 1 "s_register_operand") (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] - "TARGET_NEON || TARGET_HAVE_MVE" + "TARGET_NEON" { - if (TARGET_NEON) - emit_insn (gen_neon_vst4<mode> (operands[0], operands[1])); - else - emit_insn (gen_mve_vst4q<mode> (operands[0], operands[1])); + emit_insn 
(gen_neon_vst4<mode> (operands[0], operands[1])); + DONE; +}) + +;;; On MVE we use V4xYYY modes instead of XI +(define_expand "vec_store_lanes<MVE_vld4_vst4><mode>" + [(set (match_operand:<MVE_VLD4_VST4> 0 "neon_struct_operand") + (unspec:<MVE_VLD4_VST4> [(match_operand:<MVE_VLD4_VST4> 1 "s_register_operand") + (unspec:MVE_VLD_ST [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + UNSPEC_VST4))] + "(TARGET_HAVE_MVE && VALID_MVE_STRUCT_MODE (<MVE_VLD4_VST4>mode))" +{ + emit_insn (gen_mve_vst4q<mode> (operands[0], operands[1])); DONE; }) -- 2.34.1