On 05/05/2021 13:39, Srinath Parvathaneni via Gcc-patches wrote:
Hi Richard,

-----Original Message-----
From: Richard Earnshaw <richard.earns...@foss.arm.com>
Sent: 05 May 2021 11:15
To: Srinath Parvathaneni <srinath.parvathan...@arm.com>; gcc-
patc...@gcc.gnu.org
Cc: Richard Earnshaw <richard.earns...@arm.com>
Subject: Re: [GCC][PATCH] arm: Remove duplicate definitions from
arm_mve.h (pr100419).



On 05/05/2021 10:56, Srinath Parvathaneni via Gcc-patches wrote:
Hi All,

This patch removes several duplicated intrinsic definitions from
arm_mve.h mentioned in PR100419 and also fixes the wrong arguments
in a few of the intrinsics' polymorphic variants.

Regression tested and found no issues.

OK for master, and for backports to the GCC-11 and GCC-10 branches?
gcc/ChangeLog:

2021-05-04  Srinath Parvathaneni  <srinath.parvathan...@arm.com>

          PR target/100419
          * config/arm/arm_mve.h (__arm_vstrwq_scatter_offset): Fix wrong
arguments.
          (__arm_vcmpneq): Remove duplicate definition.
          (__arm_vstrwq_scatter_offset_p): Likewise.
          (__arm_vmaxq_x): Likewise.
          (__arm_vmlsdavaq): Likewise.
          (__arm_vmlsdavaxq): Likewise.
          (__arm_vmlsdavq_p): Likewise.
          (__arm_vmlsdavxq_p): Likewise.
          (__arm_vrmlaldavhaq): Likewise.
          (__arm_vstrbq_p): Likewise.
          (__arm_vstrbq_scatter_offset): Likewise.
          (__arm_vstrbq_scatter_offset_p): Likewise.
          (__arm_vstrdq_scatter_offset): Likewise.
          (__arm_vstrdq_scatter_offset_p): Likewise.
          (__arm_vstrdq_scatter_shifted_offset): Likewise.
          (__arm_vstrdq_scatter_shifted_offset_p): Likewise.

Co-authored-by: Joe Ramsay  <joe.ram...@arm.com>

Let's take this example:

-#define __arm_vstrwq_scatter_offset(p0,p1,p2) ({ __typeof(p1) __p1 =
(p1); \
+#define __arm_vstrwq_scatter_offset(p0,p1,p2) ({ __typeof(p0) __p0 =
(p0); \
     __typeof(p2) __p2 = (p2); \
-  _Generic( (int (*)[__ARM_mve_typeid(p0)][__ARM_mve_typeid(__p2)])0, \
-  int (*)[__ARM_mve_type_int32_t_ptr][__ARM_mve_type_int32x4_t]:
__arm_vstrwq_scatter_offset_s32 (__ARM_mve_coerce(p0, int32_t *), __p1,
__ARM_mve_coerce(__p2, int32x4_t)), \
-  int (*)[__ARM_mve_type_uint32_t_ptr][__ARM_mve_type_uint32x4_t]:
__arm_vstrwq_scatter_offset_u32 (__ARM_mve_coerce(p0, uint32_t *),
__p1,
__ARM_mve_coerce(__p2, uint32x4_t)));})
+  _Generic( (int
(*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p2)])0, \
+  int (*)[__ARM_mve_type_int32_t_ptr][__ARM_mve_type_int32x4_t]:
__arm_vstrwq_scatter_offset_s32 (__ARM_mve_coerce(__p0, int32_t *), p1,
__ARM_mve_coerce(__p2, int32x4_t)), \
+  int (*)[__ARM_mve_type_uint32_t_ptr][__ARM_mve_type_uint32x4_t]:
__arm_vstrwq_scatter_offset_u32 (__ARM_mve_coerce(__p0, uint32_t *),
p1,
__ARM_mve_coerce(__p2, uint32x4_t)));})

It removes the safe shadow copy of p1 but adds a safe shadow copy of p0.
   Why?  Isn't it better (and safer) to just create shadow copies of all
the arguments and let the compiler worry about when it's safe to
eliminate them?

As you already know, polymorphic variants are used to select the intrinsic
based on the types of their arguments.

Consider the following code from arm_mve.h:
__extension__ extern __inline void
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
__arm_vstrwq_scatter_offset_s32 (int32_t * __base, uint32x4_t __offset, 
int32x4_t __value)
{
   __builtin_mve_vstrwq_scatter_offset_sv4si ((__builtin_neon_si *) __base, 
__offset, __value);
}

__extension__ extern __inline void
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
__arm_vstrwq_scatter_offset_u32 (uint32_t * __base, uint32x4_t __offset, 
uint32x4_t __value)
{
   __builtin_mve_vstrwq_scatter_offset_uv4si ((__builtin_neon_si *) __base, 
__offset, __value);
}

__extension__ extern __inline void
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
__arm_vstrwq_scatter_offset_f32 (float32_t * __base, uint32x4_t __offset, 
float32x4_t __value)
{
   __builtin_mve_vstrwq_scatter_offset_fv4sf ((__builtin_neon_si *) __base, 
__offset, __value);
}

Which of the above 3 functions is called from the following
polymorphic variant is
decided based on the types of the arguments p0, p1 and p2.
#define __arm_vstrwq_scatter_offset(p0,p1,p2)

For the 3 function definitions mentioned above, only the types of arguments 1 (p0)
and 3 (p2) vary,
whereas the type of the second argument (p1) is the same (uint32x4_t).

This is the reason we need shadow copies of only p0 and p2 to determine the
actual function to be called;
the type of p1 is irrelevant. Previously, p1 was wrongly used to determine the
function instead of p0.
That was a bug, which is fixed in this patch.

Since the type of p1 is irrelevant in deciding the function to be called, I
believe adding a shadow copy
for p1 (__typeof(p1) __p1 = (p1)) in this macro expansion is of no use.
Considering we have more than
250 polymorphic variants defined in the arm_mve.h header, this would result in more
than 250 lines of extra code.


Ah sorry, I'd missed that this was using the _Generic() feature and that p1 was only being used once in each variant.

On that basis, OK.

R.

Regards,
Srinath.

R.



###############     Attachment also inlined for ease of reply
###############


diff --git a/gcc/config/arm/arm_mve.h b/gcc/config/arm/arm_mve.h
index
3a40c6e68161b64319b071f57a5b0d8393303cfd..dc1d874a6366eb5fe755a70c
72ed371c915bd04b 100644
--- a/gcc/config/arm/arm_mve.h
+++ b/gcc/config/arm/arm_mve.h
@@ -37808,33 +37808,19 @@ extern void *__ARM_undef;
     int (*)[__ARM_mve_type_uint32x4_t]:
__arm_vstrwq_scatter_base_p_u32(p0, p1, __ARM_mve_coerce(__p2,
uint32x4_t), p3), \
     int (*)[__ARM_mve_type_float32x4_t]:
__arm_vstrwq_scatter_base_p_f32(p0, p1, __ARM_mve_coerce(__p2,
float32x4_t), p3));})

-#define __arm_vstrwq_scatter_offset(p0,p1,p2) ({ __typeof(p1) __p1 =
(p1); \
+#define __arm_vstrwq_scatter_offset(p0,p1,p2) ({ __typeof(p0) __p0 =
(p0); \
     __typeof(p2) __p2 = (p2); \
-  _Generic( (int (*)[__ARM_mve_typeid(p0)][__ARM_mve_typeid(__p2)])0,
\
-  int (*)[__ARM_mve_type_int32_t_ptr][__ARM_mve_type_int32x4_t]:
__arm_vstrwq_scatter_offset_s32 (__ARM_mve_coerce(p0, int32_t *), __p1,
__ARM_mve_coerce(__p2, int32x4_t)), \
-  int (*)[__ARM_mve_type_uint32_t_ptr][__ARM_mve_type_uint32x4_t]:
__arm_vstrwq_scatter_offset_u32 (__ARM_mve_coerce(p0, uint32_t *),
__p1, __ARM_mve_coerce(__p2, uint32x4_t)), \
-  int (*)[__ARM_mve_type_float32_t_ptr][__ARM_mve_type_float32x4_t]:
__arm_vstrwq_scatter_offset_f32 (__ARM_mve_coerce(p0, float32_t *),
__p1, __ARM_mve_coerce(__p2, float32x4_t)));})
+  _Generic( (int
(*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p2)])0, \
+  int (*)[__ARM_mve_type_int32_t_ptr][__ARM_mve_type_int32x4_t]:
__arm_vstrwq_scatter_offset_s32 (__ARM_mve_coerce(__p0, int32_t *), p1,
__ARM_mve_coerce(__p2, int32x4_t)), \
+  int (*)[__ARM_mve_type_uint32_t_ptr][__ARM_mve_type_uint32x4_t]:
__arm_vstrwq_scatter_offset_u32 (__ARM_mve_coerce(__p0, uint32_t *),
p1, __ARM_mve_coerce(__p2, uint32x4_t)), \
+  int (*)[__ARM_mve_type_float32_t_ptr][__ARM_mve_type_float32x4_t]:
__arm_vstrwq_scatter_offset_f32 (__ARM_mve_coerce(__p0, float32_t *),
p1, __ARM_mve_coerce(__p2, float32x4_t)));})

-#define __arm_vstrwq_scatter_offset_p(p0,p1,p2,p3) ({ __typeof(p1) __p1
= (p1); \
+#define __arm_vstrwq_scatter_offset_p(p0,p1,p2,p3) ({ __typeof(p0)
__p0 = (p0); \
     __typeof(p2) __p2 = (p2); \
-  _Generic( (int (*)[__ARM_mve_typeid(p0)][__ARM_mve_typeid(__p2)])0,
\
-  int (*)[__ARM_mve_type_int32_t_ptr][__ARM_mve_type_int32x4_t]:
__arm_vstrwq_scatter_offset_p_s32 (__ARM_mve_coerce(p0, int32_t *),
__p1, __ARM_mve_coerce(__p2, int32x4_t), p3), \
-  int (*)[__ARM_mve_type_uint32_t_ptr][__ARM_mve_type_uint32x4_t]:
__arm_vstrwq_scatter_offset_p_u32 (__ARM_mve_coerce(p0, uint32_t *),
__p1, __ARM_mve_coerce(__p2, uint32x4_t), p3), \
-  int (*)[__ARM_mve_type_float32_t_ptr][__ARM_mve_type_float32x4_t]:
__arm_vstrwq_scatter_offset_p_f32 (__ARM_mve_coerce(p0, float32_t *),
__p1, __ARM_mve_coerce(__p2, float32x4_t), p3));})
-
-#define __arm_vstrwq_scatter_offset_p(p0,p1,p2,p3) ({ __typeof(p1) __p1
= (p1); \
-  __typeof(p2) __p2 = (p2); \
-  _Generic( (int (*)[__ARM_mve_typeid(p0)][__ARM_mve_typeid(__p2)])0,
\
-  int (*)[__ARM_mve_type_int32_t_ptr][__ARM_mve_type_int32x4_t]:
__arm_vstrwq_scatter_offset_p_s32 (__ARM_mve_coerce(p0, int32_t *),
__p1, __ARM_mve_coerce(__p2, int32x4_t), p3), \
-  int (*)[__ARM_mve_type_uint32_t_ptr][__ARM_mve_type_uint32x4_t]:
__arm_vstrwq_scatter_offset_p_u32 (__ARM_mve_coerce(p0, uint32_t *),
__p1, __ARM_mve_coerce(__p2, uint32x4_t), p3), \
-  int (*)[__ARM_mve_type_float32_t_ptr][__ARM_mve_type_float32x4_t]:
__arm_vstrwq_scatter_offset_p_f32 (__ARM_mve_coerce(p0, float32_t *),
__p1, __ARM_mve_coerce(__p2, float32x4_t), p3));})
-
-#define __arm_vstrwq_scatter_offset(p0,p1,p2) ({ __typeof(p1) __p1 =
(p1); \
-  __typeof(p2) __p2 = (p2); \
-  _Generic( (int (*)[__ARM_mve_typeid(p0)][__ARM_mve_typeid(__p2)])0,
\
-  int (*)[__ARM_mve_type_int32_t_ptr][__ARM_mve_type_int32x4_t]:
__arm_vstrwq_scatter_offset_s32 (__ARM_mve_coerce(p0, int32_t *), __p1,
__ARM_mve_coerce(__p2, int32x4_t)), \
-  int (*)[__ARM_mve_type_uint32_t_ptr][__ARM_mve_type_uint32x4_t]:
__arm_vstrwq_scatter_offset_u32 (__ARM_mve_coerce(p0, uint32_t *),
__p1, __ARM_mve_coerce(__p2, uint32x4_t)), \
-  int (*)[__ARM_mve_type_float32_t_ptr][__ARM_mve_type_float32x4_t]:
__arm_vstrwq_scatter_offset_f32 (__ARM_mve_coerce(p0, float32_t *),
__p1, __ARM_mve_coerce(__p2, float32x4_t)));})
+  _Generic( (int
(*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p2)])0, \
+  int (*)[__ARM_mve_type_int32_t_ptr][__ARM_mve_type_int32x4_t]:
__arm_vstrwq_scatter_offset_p_s32 (__ARM_mve_coerce(__p0, int32_t *),
p1, __ARM_mve_coerce(__p2, int32x4_t), p3), \
+  int (*)[__ARM_mve_type_uint32_t_ptr][__ARM_mve_type_uint32x4_t]:
__arm_vstrwq_scatter_offset_p_u32 (__ARM_mve_coerce(__p0, uint32_t *),
p1, __ARM_mve_coerce(__p2, uint32x4_t), p3), \
+  int (*)[__ARM_mve_type_float32_t_ptr][__ARM_mve_type_float32x4_t]:
__arm_vstrwq_scatter_offset_p_f32 (__ARM_mve_coerce(__p0, float32_t *),
p1, __ARM_mve_coerce(__p2, float32x4_t), p3));})

   #define __arm_vstrwq_scatter_shifted_offset(p0,p1,p2) ({ __typeof(p1)
__p1 = (p1); \
     __typeof(p2) __p2 = (p2); \
@@ -38422,6 +38408,12 @@ extern void *__ARM_undef;
   #define __arm_vcmpneq(p0,p1) ({ __typeof(p0) __p0 = (p0); \
     __typeof(p1) __p1 = (p1); \
     _Generic( (int
(*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)])0, \
+  int (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int_n]:
__arm_vcmpneq_n_s8 (__ARM_mve_coerce(__p0, int8x16_t),
__ARM_mve_coerce(__p1, int8_t)), \
+  int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int_n]:
__arm_vcmpneq_n_s16 (__ARM_mve_coerce(__p0, int16x8_t),
__ARM_mve_coerce(__p1, int16_t)), \
+  int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int_n]:
__arm_vcmpneq_n_s32 (__ARM_mve_coerce(__p0, int32x4_t),
__ARM_mve_coerce(__p1, int32_t)), \
+  int (*)[__ARM_mve_type_uint8x16_t][__ARM_mve_type_int_n]:
__arm_vcmpneq_n_u8 (__ARM_mve_coerce(__p0, uint8x16_t),
__ARM_mve_coerce(__p1, uint8_t)), \
+  int (*)[__ARM_mve_type_uint16x8_t][__ARM_mve_type_int_n]:
__arm_vcmpneq_n_u16 (__ARM_mve_coerce(__p0, uint16x8_t),
__ARM_mve_coerce(__p1, uint16_t)), \
+  int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_int_n]:
__arm_vcmpneq_n_u32 (__ARM_mve_coerce(__p0, uint32x4_t),
__ARM_mve_coerce(__p1, uint32_t)), \
     int (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int8x16_t]:
__arm_vcmpneq_s8 (__ARM_mve_coerce(__p0, int8x16_t),
__ARM_mve_coerce(__p1, int8x16_t)), \
     int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int16x8_t]:
__arm_vcmpneq_s16 (__ARM_mve_coerce(__p0, int16x8_t),
__ARM_mve_coerce(__p1, int16x8_t)), \
     int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t]:
__arm_vcmpneq_s32 (__ARM_mve_coerce(__p0, int32x4_t),
__ARM_mve_coerce(__p1, int32x4_t)), \
@@ -38871,23 +38863,6 @@ extern void *__ARM_undef;
     int (*)[__ARM_mve_type_uint16x8_t][__ARM_mve_type_int_n]:
__arm_vcmpeqq_n_u16 (__ARM_mve_coerce(__p0, uint16x8_t),
__ARM_mve_coerce(__p1, uint16_t)), \
     int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_int_n]:
__arm_vcmpeqq_n_u32 (__ARM_mve_coerce(__p0, uint32x4_t),
__ARM_mve_coerce(__p1, uint32_t)));})

-#define __arm_vcmpneq(p0,p1) ({ __typeof(p0) __p0 = (p0); \
-  __typeof(p1) __p1 = (p1); \
-  _Generic( (int
(*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)])0, \
-  int (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int8x16_t]:
__arm_vcmpneq_s8 (__ARM_mve_coerce(__p0, int8x16_t),
__ARM_mve_coerce(__p1, int8x16_t)), \
-  int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int16x8_t]:
__arm_vcmpneq_s16 (__ARM_mve_coerce(__p0, int16x8_t),
__ARM_mve_coerce(__p1, int16x8_t)), \
-  int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t]:
__arm_vcmpneq_s32 (__ARM_mve_coerce(__p0, int32x4_t),
__ARM_mve_coerce(__p1, int32x4_t)), \
-  int (*)[__ARM_mve_type_uint8x16_t][__ARM_mve_type_uint8x16_t]:
__arm_vcmpneq_u8 (__ARM_mve_coerce(__p0, uint8x16_t),
__ARM_mve_coerce(__p1, uint8x16_t)), \
-  int (*)[__ARM_mve_type_uint16x8_t][__ARM_mve_type_uint16x8_t]:
__arm_vcmpneq_u16 (__ARM_mve_coerce(__p0, uint16x8_t),
__ARM_mve_coerce(__p1, uint16x8_t)), \
-  int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_uint32x4_t]:
__arm_vcmpneq_u32 (__ARM_mve_coerce(__p0, uint32x4_t),
__ARM_mve_coerce(__p1, uint32x4_t)), \
-  int (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int_n]:
__arm_vcmpneq_n_s8 (__ARM_mve_coerce(__p0, int8x16_t),
__ARM_mve_coerce(__p1, int8_t)), \
-  int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int_n]:
__arm_vcmpneq_n_s16 (__ARM_mve_coerce(__p0, int16x8_t),
__ARM_mve_coerce(__p1, int16_t)), \
-  int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int_n]:
__arm_vcmpneq_n_s32 (__ARM_mve_coerce(__p0, int32x4_t),
__ARM_mve_coerce(__p1, int32_t)), \
-  int (*)[__ARM_mve_type_uint8x16_t][__ARM_mve_type_int_n]:
__arm_vcmpneq_n_u8 (__ARM_mve_coerce(__p0, uint8x16_t),
__ARM_mve_coerce(__p1, uint8_t)), \
-  int (*)[__ARM_mve_type_uint16x8_t][__ARM_mve_type_int_n]:
__arm_vcmpneq_n_u16 (__ARM_mve_coerce(__p0, uint16x8_t),
__ARM_mve_coerce(__p1, uint16_t)), \
-  int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_int_n]:
__arm_vcmpneq_n_u32 (__ARM_mve_coerce(__p0, uint32x4_t),
__ARM_mve_coerce(__p1, uint32_t)));})
-
-
   #define __arm_vqmovntq(p0,p1) ({ __typeof(p0) __p0 = (p0); \
     __typeof(p1) __p1 = (p1); \
     _Generic( (int
(*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)])0, \
@@ -39036,22 +39011,6 @@ extern void *__ARM_undef;
     int (*)[__ARM_mve_type_uint16x8_t][__ARM_mve_type_uint16x8_t]:
__arm_vcmpneq_m_u16 (__ARM_mve_coerce(__p0, uint16x8_t),
__ARM_mve_coerce(__p1, uint16x8_t), p2), \
     int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_uint32x4_t]:
__arm_vcmpneq_m_u32 (__ARM_mve_coerce(__p0, uint32x4_t),
__ARM_mve_coerce(__p1, uint32x4_t), p2));})

-#define __arm_vcmpneq(p0,p1) ({ __typeof(p0) __p0 = (p0); \
-  __typeof(p1) __p1 = (p1); \
-  _Generic( (int
(*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)])0, \
-  int (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int8x16_t]:
__arm_vcmpneq_s8 (__ARM_mve_coerce(__p0, int8x16_t),
__ARM_mve_coerce(__p1, int8x16_t)), \
-  int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int16x8_t]:
__arm_vcmpneq_s16 (__ARM_mve_coerce(__p0, int16x8_t),
__ARM_mve_coerce(__p1, int16x8_t)), \
-  int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t]:
__arm_vcmpneq_s32 (__ARM_mve_coerce(__p0, int32x4_t),
__ARM_mve_coerce(__p1, int32x4_t)), \
-  int (*)[__ARM_mve_type_uint8x16_t][__ARM_mve_type_uint8x16_t]:
__arm_vcmpneq_u8 (__ARM_mve_coerce(__p0, uint8x16_t),
__ARM_mve_coerce(__p1, uint8x16_t)), \
-  int (*)[__ARM_mve_type_uint16x8_t][__ARM_mve_type_uint16x8_t]:
__arm_vcmpneq_u16 (__ARM_mve_coerce(__p0, uint16x8_t),
__ARM_mve_coerce(__p1, uint16x8_t)), \
-  int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_uint32x4_t]:
__arm_vcmpneq_u32 (__ARM_mve_coerce(__p0, uint32x4_t),
__ARM_mve_coerce(__p1, uint32x4_t)), \
-  int (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int_n]:
__arm_vcmpneq_n_s8 (__ARM_mve_coerce(__p0, int8x16_t),
__ARM_mve_coerce(__p1, int8_t)), \
-  int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int_n]:
__arm_vcmpneq_n_s16 (__ARM_mve_coerce(__p0, int16x8_t),
__ARM_mve_coerce(__p1, int16_t)), \
-  int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int_n]:
__arm_vcmpneq_n_s32 (__ARM_mve_coerce(__p0, int32x4_t),
__ARM_mve_coerce(__p1, int32_t)), \
-  int (*)[__ARM_mve_type_uint8x16_t][__ARM_mve_type_int_n]:
__arm_vcmpneq_n_u8 (__ARM_mve_coerce(__p0, uint8x16_t),
__ARM_mve_coerce(__p1, uint8_t)), \
-  int (*)[__ARM_mve_type_uint16x8_t][__ARM_mve_type_int_n]:
__arm_vcmpneq_n_u16 (__ARM_mve_coerce(__p0, uint16x8_t),
__ARM_mve_coerce(__p1, uint16_t)), \
-  int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_int_n]:
__arm_vcmpneq_n_u32 (__ARM_mve_coerce(__p0, uint32x4_t),
__ARM_mve_coerce(__p1, uint32_t)));})
-
   #define __arm_vshlcq(p0,p1,p2) ({ __typeof(p0) __p0 = (p0); \
     _Generic( (int (*)[__ARM_mve_typeid(__p0)])0, \
     int (*)[__ARM_mve_type_int8x16_t]: __arm_vshlcq_s8
(__ARM_mve_coerce(__p0, int8x16_t), p1, p2), \
@@ -39367,52 +39326,6 @@ extern void *__ARM_undef;
     int (*)[__ARM_mve_type_uint16x8_t][__ARM_mve_type_int16x8_t]:
__arm_vminaq_m_s16 (__ARM_mve_coerce(__p0, uint16x8_t),
__ARM_mve_coerce(__p1, int16x8_t), p2), \
     int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_int32x4_t]:
__arm_vminaq_m_s32 (__ARM_mve_coerce(__p0, uint32x4_t),
__ARM_mve_coerce(__p1, int32x4_t), p2));})

-#define __arm_vrmlaldavhaq(p0,p1,p2) ({ __typeof(p0) __p0 = (p0); \
-  __typeof(p1) __p1 = (p1); \
-  _Generic( (int
(*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)])0, \
-  int (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int8x16_t]:
__arm_vcmpltq_m_s8 (__ARM_mve_coerce(__p0, int8x16_t),
__ARM_mve_coerce(__p1, int8x16_t), p2), \
-  int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int16x8_t]:
__arm_vcmpltq_m_s16 (__ARM_mve_coerce(__p0, int16x8_t),
__ARM_mve_coerce(__p1, int16x8_t), p2), \
-  int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t]:
__arm_vcmpltq_m_s32 (__ARM_mve_coerce(__p0, int32x4_t),
__ARM_mve_coerce(__p1, int32x4_t), p2), \
-  int (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int_n]:
__arm_vcmpltq_m_n_s8 (__ARM_mve_coerce(__p0, int8x16_t),
__ARM_mve_coerce(__p1, int8_t), p2), \
-  int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int_n]:
__arm_vcmpltq_m_n_s16 (__ARM_mve_coerce(__p0, int16x8_t),
__ARM_mve_coerce(__p1, int16_t), p2), \
-  int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int_n]:
__arm_vcmpltq_m_n_s32 (__ARM_mve_coerce(__p0, int32x4_t),
__ARM_mve_coerce(__p1, int32_t), p2));})
-
-#define __arm_vmlsdavxq_p(p0,p1,p2) ({ __typeof(p0) __p0 = (p0); \
-  __typeof(p1) __p1 = (p1); \
-  _Generic( (int
(*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)])0, \
-  int (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int8x16_t]:
__arm_vcmpleq_m_s8 (__ARM_mve_coerce(__p0, int8x16_t),
__ARM_mve_coerce(__p1, int8x16_t), p2), \
-  int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int16x8_t]:
__arm_vcmpleq_m_s16 (__ARM_mve_coerce(__p0, int16x8_t),
__ARM_mve_coerce(__p1, int16x8_t), p2), \
-  int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t]:
__arm_vcmpleq_m_s32 (__ARM_mve_coerce(__p0, int32x4_t),
__ARM_mve_coerce(__p1, int32x4_t), p2), \
-  int (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int_n]:
__arm_vcmpleq_m_n_s8 (__ARM_mve_coerce(__p0, int8x16_t),
__ARM_mve_coerce(__p1, int8_t), p2), \
-  int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int_n]:
__arm_vcmpleq_m_n_s16 (__ARM_mve_coerce(__p0, int16x8_t),
__ARM_mve_coerce(__p1, int16_t), p2), \
-  int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int_n]:
__arm_vcmpleq_m_n_s32 (__ARM_mve_coerce(__p0, int32x4_t),
__ARM_mve_coerce(__p1, int32_t), p2));})
-
-#define __arm_vmlsdavq_p(p0,p1,p2) ({ __typeof(p0) __p0 = (p0); \
-  __typeof(p1) __p1 = (p1); \
-  _Generic( (int
(*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)])0, \
-  int (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int8x16_t]:
__arm_vcmpgtq_m_s8 (__ARM_mve_coerce(__p0, int8x16_t),
__ARM_mve_coerce(__p1, int8x16_t), p2), \
-  int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int16x8_t]:
__arm_vcmpgtq_m_s16 (__ARM_mve_coerce(__p0, int16x8_t),
__ARM_mve_coerce(__p1, int16x8_t), p2), \
-  int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t]:
__arm_vcmpgtq_m_s32 (__ARM_mve_coerce(__p0, int32x4_t),
__ARM_mve_coerce(__p1, int32x4_t), p2), \
-  int (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int_n]:
__arm_vcmpgtq_m_n_s8 (__ARM_mve_coerce(__p0, int8x16_t),
__ARM_mve_coerce(__p1, int8_t), p2), \
-  int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int_n]:
__arm_vcmpgtq_m_n_s16 (__ARM_mve_coerce(__p0, int16x8_t),
__ARM_mve_coerce(__p1, int16_t), p2), \
-  int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int_n]:
__arm_vcmpgtq_m_n_s32 (__ARM_mve_coerce(__p0, int32x4_t),
__ARM_mve_coerce(__p1, int32_t), p2));})
-
-#define __arm_vmlsdavaxq(p0,p1,p2) ({ __typeof(p0) __p0 = (p0); \
-  __typeof(p1) __p1 = (p1); \
-  _Generic( (int
(*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)])0, \
-  int (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int16x8_t]:
__arm_vshrntq_n_s16 (__ARM_mve_coerce(__p0, int8x16_t),
__ARM_mve_coerce(__p1, int16x8_t), p2), \
-  int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int32x4_t]:
__arm_vshrntq_n_s32 (__ARM_mve_coerce(__p0, int16x8_t),
__ARM_mve_coerce(__p1, int32x4_t), p2), \
-  int (*)[__ARM_mve_type_uint8x16_t][__ARM_mve_type_uint16x8_t]:
__arm_vshrntq_n_u16 (__ARM_mve_coerce(__p0, uint8x16_t),
__ARM_mve_coerce(__p1, uint16x8_t), p2), \
-  int (*)[__ARM_mve_type_uint16x8_t][__ARM_mve_type_uint32x4_t]:
__arm_vshrntq_n_u32 (__ARM_mve_coerce(__p0, uint16x8_t),
__ARM_mve_coerce(__p1, uint32x4_t), p2));})
-
-#define __arm_vmlsdavaq(p0,p1,p2) ({ __typeof(p0) __p0 = (p0); \
-  __typeof(p1) __p1 = (p1); \
-  _Generic( (int
(*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)])0, \
-  int (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int16x8_t]:
__arm_vrshrntq_n_s16 (__ARM_mve_coerce(__p0, int8x16_t),
__ARM_mve_coerce(__p1, int16x8_t), p2), \
-  int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int32x4_t]:
__arm_vrshrntq_n_s32 (__ARM_mve_coerce(__p0, int16x8_t),
__ARM_mve_coerce(__p1, int32x4_t), p2), \
-  int (*)[__ARM_mve_type_uint8x16_t][__ARM_mve_type_uint16x8_t]:
__arm_vrshrntq_n_u16 (__ARM_mve_coerce(__p0, uint8x16_t),
__ARM_mve_coerce(__p1, uint16x8_t), p2), \
-  int (*)[__ARM_mve_type_uint16x8_t][__ARM_mve_type_uint32x4_t]:
__arm_vrshrntq_n_u32 (__ARM_mve_coerce(__p0, uint16x8_t),
__ARM_mve_coerce(__p1, uint32x4_t), p2));})
-
   #define __arm_vmovlbq_m(p0,p1,p2) ({ __typeof(p0) __p0 = (p0); \
     __typeof(p1) __p1 = (p1); \
     _Generic( (int
(*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)])0, \
@@ -39711,26 +39624,6 @@ extern void *__ARM_undef;
     int
(*)[__ARM_mve_type_uint16x8_t][__ARM_mve_type_uint16x8_t][__ARM_m
ve_type_uint16x8_t]: __arm_vmulq_m_u16 (__ARM_mve_coerce(__p0,
uint16x8_t), __ARM_mve_coerce(__p1, uint16x8_t),
__ARM_mve_coerce(__p2, uint16x8_t), p3), \
     int
(*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_uint32x4_t][__ARM_m
ve_type_uint32x4_t]: __arm_vmulq_m_u32 (__ARM_mve_coerce(__p0,
uint32x4_t), __ARM_mve_coerce(__p1, uint32x4_t),
__ARM_mve_coerce(__p2, uint32x4_t), p3));})

-#define __arm_vstrbq(p0,p1) ({ __typeof(p1) __p1 = (p1); \
-  _Generic( (int (*)[__ARM_mve_typeid(p0)][__ARM_mve_typeid(__p1)])0,
\
-  int (*)[__ARM_mve_type_int8_t_ptr][__ARM_mve_type_int8x16_t]:
__arm_vstrbq_s8 (__ARM_mve_coerce(p0, int8_t *),
__ARM_mve_coerce(__p1, int8x16_t)), \
-  int (*)[__ARM_mve_type_int8_t_ptr][__ARM_mve_type_int16x8_t]:
__arm_vstrbq_s16 (__ARM_mve_coerce(p0, int8_t *),
__ARM_mve_coerce(__p1, int16x8_t)), \
-  int (*)[__ARM_mve_type_int8_t_ptr][__ARM_mve_type_int32x4_t]:
__arm_vstrbq_s32 (__ARM_mve_coerce(p0, int8_t *),
__ARM_mve_coerce(__p1, int32x4_t)), \
-  int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint8x16_t]:
__arm_vstrbq_u8 (__ARM_mve_coerce(p0, uint8_t *),
__ARM_mve_coerce(__p1, uint8x16_t)), \
-  int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint16x8_t]:
__arm_vstrbq_u16 (__ARM_mve_coerce(p0, uint8_t *),
__ARM_mve_coerce(__p1, uint16x8_t)), \
-  int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint32x4_t]:
__arm_vstrbq_u32 (__ARM_mve_coerce(p0, uint8_t *),
__ARM_mve_coerce(__p1, uint32x4_t)));})
-
-#define __arm_vstrbq_scatter_offset(p0,p1,p2) ({ __typeof(p0) __p0 =
(p0); \
-  __typeof(p1) __p1 = (p1); \
-  __typeof(p2) __p2 = (p2); \
-  _Generic( (int
(*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)][__ARM_mve_typ
eid(__p2)])0, \
-  int
(*)[__ARM_mve_type_int8_t_ptr][__ARM_mve_type_uint8x16_t][__ARM_m
ve_type_int8x16_t]: __arm_vstrbq_scatter_offset_s8
(__ARM_mve_coerce(__p0, int8_t *), __ARM_mve_coerce(__p1, uint8x16_t),
__ARM_mve_coerce(__p2, int8x16_t)), \
-  int
(*)[__ARM_mve_type_int8_t_ptr][__ARM_mve_type_uint16x8_t][__ARM_m
ve_type_int16x8_t]: __arm_vstrbq_scatter_offset_s16
(__ARM_mve_coerce(__p0, int8_t *), __ARM_mve_coerce(__p1, uint16x8_t),
__ARM_mve_coerce(__p2, int16x8_t)), \
-  int
(*)[__ARM_mve_type_int8_t_ptr][__ARM_mve_type_uint32x4_t][__ARM_m
ve_type_int32x4_t]: __arm_vstrbq_scatter_offset_s32
(__ARM_mve_coerce(__p0, int8_t *), __ARM_mve_coerce(__p1, uint32x4_t),
__ARM_mve_coerce(__p2, int32x4_t)), \
-  int
(*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint8x16_t][__ARM_
mve_type_uint8x16_t]: __arm_vstrbq_scatter_offset_u8
(__ARM_mve_coerce(__p0, uint8_t *), __ARM_mve_coerce(__p1,
uint8x16_t), __ARM_mve_coerce(__p2, uint8x16_t)), \
-  int
(*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint16x8_t][__ARM_
mve_type_uint16x8_t]: __arm_vstrbq_scatter_offset_u16
(__ARM_mve_coerce(__p0, uint8_t *), __ARM_mve_coerce(__p1,
uint16x8_t), __ARM_mve_coerce(__p2, uint16x8_t)), \
-  int
(*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint32x4_t][__ARM_
mve_type_uint32x4_t]: __arm_vstrbq_scatter_offset_u32
(__ARM_mve_coerce(__p0, uint8_t *), __ARM_mve_coerce(__p1,
uint32x4_t), __ARM_mve_coerce(__p2, uint32x4_t)));})
-
   #define __arm_vstrwq_scatter_base(p0,p1,p2) ({ __typeof(p2) __p2 = (p2);
\
     _Generic( (int (*)[__ARM_mve_typeid(__p2)])0, \
     int (*)[__ARM_mve_type_int32x4_t]:
__arm_vstrwq_scatter_base_s32(p0, p1, __ARM_mve_coerce(__p2,
int32x4_t)), \
@@ -39745,27 +39638,6 @@ extern void *__ARM_undef;
     int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint16x8_t]:
__arm_vldrbq_gather_offset_u16 (__ARM_mve_coerce(p0, uint8_t const *),
__ARM_mve_coerce(__p1, uint16x8_t)), \
     int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint32x4_t]:
__arm_vldrbq_gather_offset_u32 (__ARM_mve_coerce(p0, uint8_t const *),
__ARM_mve_coerce(__p1, uint32x4_t)));})

-#define __arm_vstrbq_p(p0,p1,p2) ({ __typeof(p0) __p0 = (p0); \
-  __typeof(p1) __p1 = (p1); \
-  _Generic( (int
(*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)])0, \
-  int (*)[__ARM_mve_type_int8_t_ptr][__ARM_mve_type_int8x16_t]:
__arm_vstrbq_p_s8 (__ARM_mve_coerce(__p0, int8_t *),
__ARM_mve_coerce(__p1, int8x16_t), p2), \
-  int (*)[__ARM_mve_type_int8_t_ptr][__ARM_mve_type_int16x8_t]:
__arm_vstrbq_p_s16 (__ARM_mve_coerce(__p0, int8_t *),
__ARM_mve_coerce(__p1, int16x8_t), p2), \
-  int (*)[__ARM_mve_type_int8_t_ptr][__ARM_mve_type_int32x4_t]:
__arm_vstrbq_p_s32 (__ARM_mve_coerce(__p0, int8_t *),
__ARM_mve_coerce(__p1, int32x4_t), p2), \
-  int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint8x16_t]:
__arm_vstrbq_p_u8 (__ARM_mve_coerce(__p0, uint8_t *),
__ARM_mve_coerce(__p1, uint8x16_t), p2), \
-  int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint16x8_t]:
__arm_vstrbq_p_u16 (__ARM_mve_coerce(__p0, uint8_t *),
__ARM_mve_coerce(__p1, uint16x8_t), p2), \
-  int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint32x4_t]:
__arm_vstrbq_p_u32 (__ARM_mve_coerce(__p0, uint8_t *),
__ARM_mve_coerce(__p1, uint32x4_t), p2));})
-
-#define __arm_vstrbq_scatter_offset_p(p0,p1,p2,p3) ({ __typeof(p0) __p0
= (p0); \
-  __typeof(p1) __p1 = (p1); \
-  __typeof(p2) __p2 = (p2); \
-  _Generic( (int
(*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)][__ARM_mve_typ
eid(__p2)])0, \
-  int
(*)[__ARM_mve_type_int8_t_ptr][__ARM_mve_type_uint8x16_t][__ARM_m
ve_type_int8x16_t]: __arm_vstrbq_scatter_offset_p_s8
(__ARM_mve_coerce(__p0, int8_t *), __ARM_mve_coerce(__p1, uint8x16_t),
__ARM_mve_coerce(__p2, int8x16_t), p3), \
-  int
(*)[__ARM_mve_type_int8_t_ptr][__ARM_mve_type_uint16x8_t][__ARM_m
ve_type_int16x8_t]: __arm_vstrbq_scatter_offset_p_s16
(__ARM_mve_coerce(__p0, int8_t *), __ARM_mve_coerce(__p1, uint16x8_t),
__ARM_mve_coerce(__p2, int16x8_t), p3), \
-  int
(*)[__ARM_mve_type_int8_t_ptr][__ARM_mve_type_uint32x4_t][__ARM_m
ve_type_int32x4_t]: __arm_vstrbq_scatter_offset_p_s32
(__ARM_mve_coerce(__p0, int8_t *), __ARM_mve_coerce(__p1, uint32x4_t),
__ARM_mve_coerce(__p2, int32x4_t), p3), \
-  int
(*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint8x16_t][__ARM_
mve_type_uint8x16_t]: __arm_vstrbq_scatter_offset_p_u8
(__ARM_mve_coerce(__p0, uint8_t *), __ARM_mve_coerce(__p1,
uint8x16_t), __ARM_mve_coerce(__p2, uint8x16_t), p3), \
-  int
(*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint16x8_t][__ARM_
mve_type_uint16x8_t]: __arm_vstrbq_scatter_offset_p_u16
(__ARM_mve_coerce(__p0, uint8_t *), __ARM_mve_coerce(__p1,
uint16x8_t), __ARM_mve_coerce(__p2, uint16x8_t), p3), \
-  int
(*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint32x4_t][__ARM_
mve_type_uint32x4_t]: __arm_vstrbq_scatter_offset_p_u32
(__ARM_mve_coerce(__p0, uint8_t *), __ARM_mve_coerce(__p1,
uint32x4_t), __ARM_mve_coerce(__p2, uint32x4_t), p3));})
-
   #define __arm_vstrwq_scatter_base_p(p0,p1,p2,p3) ({ __typeof(p2) __p2
= (p2); \
     _Generic( (int (*)[__ARM_mve_typeid(__p2)])0, \
     int (*)[__ARM_mve_type_int32x4_t]: __arm_vstrwq_scatter_base_p_s32
(p0, p1, __ARM_mve_coerce(__p2, int32x4_t), p3), \
@@ -39921,34 +39793,6 @@ extern void *__ARM_undef;
     int (*)[__ARM_mve_type_int64x2_t]: __arm_vstrdq_scatter_base_s64
(p0, p1, __ARM_mve_coerce(__p2, int64x2_t)), \
     int (*)[__ARM_mve_type_uint64x2_t]: __arm_vstrdq_scatter_base_u64
(p0, p1, __ARM_mve_coerce(__p2, uint64x2_t)));})

-#define __arm_vstrdq_scatter_offset_p(p0,p1,p2,p3) ({ __typeof(p0) __p0
= (p0); \
-  __typeof(p1) __p1 = (p1); \
-  __typeof(p2) __p2 = (p2); \
-  _Generic( (int
(*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p2)])0, \
-  int (*)[__ARM_mve_type_int64_t_ptr][__ARM_mve_type_int64x2_t]:
__arm_vstrdq_scatter_offset_p_s64 (__ARM_mve_coerce(__p0, int64_t *),
__p1, __ARM_mve_coerce(__p2, int64x2_t), p3), \
-  int (*)[__ARM_mve_type_uint64_t_ptr][__ARM_mve_type_uint64x2_t]:
__arm_vstrdq_scatter_offset_p_u64 (__ARM_mve_coerce(__p0, uint64_t *),
__p1, __ARM_mve_coerce(__p2, uint64x2_t), p3));})
-
-#define __arm_vstrdq_scatter_offset(p0,p1,p2) ({ __typeof(p0) __p0 =
(p0); \
-  __typeof(p1) __p1 = (p1); \
-  __typeof(p2) __p2 = (p2); \
-  _Generic( (int
(*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p2)])0, \
-  int (*)[__ARM_mve_type_int64_t_ptr][__ARM_mve_type_int64x2_t]:
__arm_vstrdq_scatter_offset_s64 (__ARM_mve_coerce(__p0, int64_t *),
__p1, __ARM_mve_coerce(__p2, int64x2_t)), \
-  int (*)[__ARM_mve_type_uint64_t_ptr][__ARM_mve_type_uint64x2_t]:
__arm_vstrdq_scatter_offset_u64 (__ARM_mve_coerce(__p0, uint64_t *),
__p1, __ARM_mve_coerce(__p2, uint64x2_t)));})
-
-#define __arm_vstrdq_scatter_shifted_offset_p(p0,p1,p2,p3)
({ __typeof(p0) __p0 = (p0); \
-  __typeof(p1) __p1 = (p1); \
-  __typeof(p2) __p2 = (p2); \
-  _Generic( (int
(*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p2)])0, \
-  int (*)[__ARM_mve_type_int64_t_ptr][__ARM_mve_type_int64x2_t]:
__arm_vstrdq_scatter_shifted_offset_p_s64 (__ARM_mve_coerce(__p0,
int64_t *), __p1, __ARM_mve_coerce(__p2, int64x2_t), p3), \
-  int (*)[__ARM_mve_type_uint64_t_ptr][__ARM_mve_type_uint64x2_t]:
__arm_vstrdq_scatter_shifted_offset_p_u64 (__ARM_mve_coerce(__p0,
uint64_t *), __p1, __ARM_mve_coerce(__p2, uint64x2_t), p3));})
-
-#define __arm_vstrdq_scatter_shifted_offset(p0,p1,p2) ({ __typeof(p0)
__p0 = (p0); \
-  __typeof(p1) __p1 = (p1); \
-  __typeof(p2) __p2 = (p2); \
-  _Generic( (int
(*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p2)])0, \
-  int (*)[__ARM_mve_type_int64_t_ptr][__ARM_mve_type_int64x2_t]:
__arm_vstrdq_scatter_shifted_offset_s64 (__ARM_mve_coerce(__p0,
int64_t *), __p1, __ARM_mve_coerce(__p2, int64x2_t)), \
-  int (*)[__ARM_mve_type_uint64_t_ptr][__ARM_mve_type_uint64x2_t]:
__arm_vstrdq_scatter_shifted_offset_u64 (__ARM_mve_coerce(__p0,
uint64_t *), __p1, __ARM_mve_coerce(__p2, uint64x2_t)));})
-
   #define __arm_vstrhq_scatter_offset(p0,p1,p2) ({ __typeof(p1) __p1 =
(p1); \
     __typeof(p2) __p2 = (p2); \
     _Generic( (int
(*)[__ARM_mve_typeid(p0)][__ARM_mve_typeid(__p1)][__ARM_mve_typei
d(__p2)])0, \
@@ -39981,29 +39825,17 @@ extern void *__ARM_undef;
     int (*)[__ARM_mve_type_uint16_t_ptr][__ARM_mve_type_uint16x8_t][__ARM_mve_type_uint16x8_t]: __arm_vstrhq_scatter_shifted_offset_p_u16 (__ARM_mve_coerce(p0, uint16_t *), __ARM_mve_coerce(__p1, uint16x8_t), __ARM_mve_coerce(__p2, uint16x8_t), p3), \
     int (*)[__ARM_mve_type_uint16_t_ptr][__ARM_mve_type_uint32x4_t][__ARM_mve_type_uint32x4_t]: __arm_vstrhq_scatter_shifted_offset_p_u32 (__ARM_mve_coerce(p0, uint16_t *), __ARM_mve_coerce(__p1, uint32x4_t), __ARM_mve_coerce(__p2, uint32x4_t), p3));})

-#define __arm_vstrwq_scatter_offset(p0,p1,p2) ({ __typeof(p1) __p1 =
(p1); \
+#define __arm_vstrwq_scatter_offset(p0,p1,p2) ({ __typeof(p0) __p0 =
(p0); \
     __typeof(p2) __p2 = (p2); \
-  _Generic( (int (*)[__ARM_mve_typeid(p0)][__ARM_mve_typeid(__p2)])0,
\
-  int (*)[__ARM_mve_type_int32_t_ptr][__ARM_mve_type_int32x4_t]:
__arm_vstrwq_scatter_offset_s32 (__ARM_mve_coerce(p0, int32_t *), __p1,
__ARM_mve_coerce(__p2, int32x4_t)), \
-  int (*)[__ARM_mve_type_uint32_t_ptr][__ARM_mve_type_uint32x4_t]:
__arm_vstrwq_scatter_offset_u32 (__ARM_mve_coerce(p0, uint32_t *),
__p1, __ARM_mve_coerce(__p2, uint32x4_t)));})
+  _Generic( (int
(*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p2)])0, \
+  int (*)[__ARM_mve_type_int32_t_ptr][__ARM_mve_type_int32x4_t]:
__arm_vstrwq_scatter_offset_s32 (__ARM_mve_coerce(__p0, int32_t *), p1,
__ARM_mve_coerce(__p2, int32x4_t)), \
+  int (*)[__ARM_mve_type_uint32_t_ptr][__ARM_mve_type_uint32x4_t]:
__arm_vstrwq_scatter_offset_u32 (__ARM_mve_coerce(__p0, uint32_t *),
p1, __ARM_mve_coerce(__p2, uint32x4_t)));})

-#define __arm_vstrwq_scatter_offset_p(p0,p1,p2,p3) ({ __typeof(p1) __p1
= (p1); \
+#define __arm_vstrwq_scatter_offset_p(p0,p1,p2,p3) ({ __typeof(p0)
__p0 = (p0); \
     __typeof(p2) __p2 = (p2); \
-  _Generic( (int (*)[__ARM_mve_typeid(p0)][__ARM_mve_typeid(__p2)])0,
\
-  int (*)[__ARM_mve_type_int32_t_ptr][__ARM_mve_type_int32x4_t]:
__arm_vstrwq_scatter_offset_p_s32 (__ARM_mve_coerce(p0, int32_t *),
__p1, __ARM_mve_coerce(__p2, int32x4_t), p3), \
-  int (*)[__ARM_mve_type_uint32_t_ptr][__ARM_mve_type_uint32x4_t]:
__arm_vstrwq_scatter_offset_p_u32 (__ARM_mve_coerce(p0, uint32_t *),
__p1, __ARM_mve_coerce(__p2, uint32x4_t)));})
-
-#define __arm_vstrwq_scatter_offset_p(p0,p1,p2,p3) ({ __typeof(p1) __p1
= (p1); \
-  __typeof(p2) __p2 = (p2); \
-  _Generic( (int (*)[__ARM_mve_typeid(p0)][__ARM_mve_typeid(__p2)])0,
\
-  int (*)[__ARM_mve_type_int32_t_ptr][__ARM_mve_type_int32x4_t]:
__arm_vstrwq_scatter_offset_p_s32 (__ARM_mve_coerce(p0, int32_t *),
__p1, __ARM_mve_coerce(__p2, int32x4_t), p3), \
-  int (*)[__ARM_mve_type_uint32_t_ptr][__ARM_mve_type_uint32x4_t]:
__arm_vstrwq_scatter_offset_p_u32 (__ARM_mve_coerce(p0, uint32_t *),
__p1, __ARM_mve_coerce(__p2, uint32x4_t), p3));})
-
-#define __arm_vstrwq_scatter_offset(p0,p1,p2) ({ __typeof(p1) __p1 =
(p1); \
-  __typeof(p2) __p2 = (p2); \
-  _Generic( (int (*)[__ARM_mve_typeid(p0)][__ARM_mve_typeid(__p2)])0,
\
-  int (*)[__ARM_mve_type_int32_t_ptr][__ARM_mve_type_int32x4_t]:
__arm_vstrwq_scatter_offset_s32 (__ARM_mve_coerce(p0, int32_t *), __p1,
__ARM_mve_coerce(__p2, int32x4_t)), \
-  int (*)[__ARM_mve_type_uint32_t_ptr][__ARM_mve_type_uint32x4_t]:
__arm_vstrwq_scatter_offset_u32 (__ARM_mve_coerce(p0, uint32_t *),
__p1, __ARM_mve_coerce(__p2, uint32x4_t)));})
+  _Generic( (int
(*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p2)])0, \
+  int (*)[__ARM_mve_type_int32_t_ptr][__ARM_mve_type_int32x4_t]:
__arm_vstrwq_scatter_offset_p_s32 (__ARM_mve_coerce(__p0, int32_t *),
p1, __ARM_mve_coerce(__p2, int32x4_t), p3), \
+  int (*)[__ARM_mve_type_uint32_t_ptr][__ARM_mve_type_uint32x4_t]:
__arm_vstrwq_scatter_offset_p_u32 (__ARM_mve_coerce(__p0, uint32_t *),
p1, __ARM_mve_coerce(__p2, uint32x4_t), p3));})

   #define __arm_vstrwq_scatter_shifted_offset(p0,p1,p2) ({ __typeof(p1)
__p1 = (p1); \
     __typeof(p2) __p2 = (p2); \
@@ -40160,32 +39992,6 @@ extern void *__ARM_undef;
     int (*)[__ARM_mve_type_uint16x8_t][__ARM_mve_type_uint16x8_t]:
__arm_veorq_x_u16 (__ARM_mve_coerce(__p1, uint16x8_t),
__ARM_mve_coerce(__p2, uint16x8_t), p3), \
     int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_uint32x4_t]:
__arm_veorq_x_u32 (__ARM_mve_coerce(__p1, uint32x4_t),
__ARM_mve_coerce(__p2, uint32x4_t), p3));})

-#define __arm_vmaxq_x(p1,p2,p3) ({ __typeof(p1) __p1 = (p1); \
-  __typeof(p2) __p2 = (p2); \
-  _Generic( (int
(*)[__ARM_mve_typeid(__p1)][__ARM_mve_typeid(__p2)])0, \
-  int (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int8x16_t]:
__arm_vmulq_x_s8 (__ARM_mve_coerce(__p1, int8x16_t),
__ARM_mve_coerce(__p2, int8x16_t), p3), \
-  int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int16x8_t]:
__arm_vmulq_x_s16 (__ARM_mve_coerce(__p1, int16x8_t),
__ARM_mve_coerce(__p2, int16x8_t), p3), \
-  int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t]:
__arm_vmulq_x_s32 (__ARM_mve_coerce(__p1, int32x4_t),
__ARM_mve_coerce(__p2, int32x4_t), p3), \
-  int (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int_n]:
__arm_vmulq_x_n_s8 (__ARM_mve_coerce(__p1, int8x16_t),
__ARM_mve_coerce(__p2, int8_t), p3), \
-  int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int_n]:
__arm_vmulq_x_n_s16 (__ARM_mve_coerce(__p1, int16x8_t),
__ARM_mve_coerce(__p2, int16_t), p3), \
-  int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int_n]:
__arm_vmulq_x_n_s32 (__ARM_mve_coerce(__p1, int32x4_t),
__ARM_mve_coerce(__p2, int32_t), p3), \
-  int (*)[__ARM_mve_type_uint8x16_t][__ARM_mve_type_uint8x16_t]:
__arm_vmulq_x_u8 (__ARM_mve_coerce(__p1, uint8x16_t),
__ARM_mve_coerce(__p2, uint8x16_t), p3), \
-  int (*)[__ARM_mve_type_uint16x8_t][__ARM_mve_type_uint16x8_t]:
__arm_vmulq_x_u16 (__ARM_mve_coerce(__p1, uint16x8_t),
__ARM_mve_coerce(__p2, uint16x8_t), p3), \
-  int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_uint32x4_t]:
__arm_vmulq_x_u32 (__ARM_mve_coerce(__p1, uint32x4_t),
__ARM_mve_coerce(__p2, uint32x4_t), p3), \
-  int (*)[__ARM_mve_type_uint8x16_t][__ARM_mve_type_int_n]:
__arm_vmulq_x_n_u8 (__ARM_mve_coerce(__p1, uint8x16_t),
__ARM_mve_coerce(__p2, uint8_t), p3), \
-  int (*)[__ARM_mve_type_uint16x8_t][__ARM_mve_type_int_n]:
__arm_vmulq_x_n_u16 (__ARM_mve_coerce(__p1, uint16x8_t),
__ARM_mve_coerce(__p2, uint16_t), p3), \
-  int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_int_n]:
__arm_vmulq_x_n_u32 (__ARM_mve_coerce(__p1, uint32x4_t),
__ARM_mve_coerce(__p2, uint32_t), p3));})
-
-#define __arm_vminq_x(p1,p2,p3) ({ __typeof(p1) __p1 = (p1); \
-  __typeof(p2) __p2 = (p2); \
-  _Generic( (int
(*)[__ARM_mve_typeid(__p1)][__ARM_mve_typeid(__p2)])0, \
-  int (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int8x16_t]:
__arm_vminq_x_s8 (__ARM_mve_coerce(__p1, int8x16_t),
__ARM_mve_coerce(__p2, int8x16_t), p3), \
-  int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int16x8_t]:
__arm_vminq_x_s16 (__ARM_mve_coerce(__p1, int16x8_t),
__ARM_mve_coerce(__p2, int16x8_t), p3), \
-  int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t]:
__arm_vminq_x_s32 (__ARM_mve_coerce(__p1, int32x4_t),
__ARM_mve_coerce(__p2, int32x4_t), p3), \
-  int (*)[__ARM_mve_type_uint8x16_t][__ARM_mve_type_uint8x16_t]:
__arm_vminq_x_u8 (__ARM_mve_coerce(__p1, uint8x16_t),
__ARM_mve_coerce(__p2, uint8x16_t), p3), \
-  int (*)[__ARM_mve_type_uint16x8_t][__ARM_mve_type_uint16x8_t]:
__arm_vminq_x_u16 (__ARM_mve_coerce(__p1, uint16x8_t),
__ARM_mve_coerce(__p2, uint16x8_t), p3), \
-  int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_uint32x4_t]:
__arm_vminq_x_u32 (__ARM_mve_coerce(__p1, uint32x4_t),
__ARM_mve_coerce(__p2, uint32x4_t), p3));})
-
   #define __arm_vmovlbq_x(p1,p2) ({ __typeof(p1) __p1 = (p1); \
     _Generic( (int (*)[__ARM_mve_typeid(__p1)])0, \
     int (*)[__ARM_mve_type_int8x16_t]: __arm_vmovlbq_x_s8
(__ARM_mve_coerce(__p1, int8x16_t), p2), \
@@ -41013,13 +40819,6 @@ extern void *__ARM_undef;
     int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int16x8_t]:
__arm_vmlsldavaxq_p_s16(__p0, __ARM_mve_coerce(__p1, int16x8_t),
__ARM_mve_coerce(__p2, int16x8_t), p3), \
     int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t]:
__arm_vmlsldavaxq_p_s32(__p0, __ARM_mve_coerce(__p1, int32x4_t),
__ARM_mve_coerce(__p2, int32x4_t), p3));})

-#define __arm_vrmlaldavhaq_p(p0,p1,p2,p3) ({ __typeof(p0) __p0 = (p0); \
-  __typeof(p1) __p1 = (p1); \
-  __typeof(p2) __p2 = (p2); \
-  _Generic( (int (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)][__ARM_mve_typeid(__p2)])0, \
-  int (*)[__ARM_mve_type_int_n][__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t]: __arm_vrmlaldavhaq_p_s32 (__ARM_mve_coerce(__p0, int64_t), __ARM_mve_coerce(__p1, int32x4_t), __ARM_mve_coerce(__p2, int32x4_t), p3), \
-  int (*)[__ARM_mve_type_int_n][__ARM_mve_type_uint32x4_t][__ARM_mve_type_uint32x4_t]: __arm_vrmlaldavhaq_p_u32 (__ARM_mve_coerce(__p0, uint64_t), __ARM_mve_coerce(__p1, uint32x4_t), __ARM_mve_coerce(__p2, uint32x4_t), p3));})
-
   #define __arm_vrmlaldavhaxq_p(p0,p1,p2,p3)
__arm_vrmlaldavhaxq_p_s32(p0,p1,p2,p3)

   #define __arm_vrmlsldavhaq_p(p0,p1,p2,p3)
__arm_vrmlsldavhaq_p_s32(p0,p1,p2,p3)
@@ -41343,21 +41142,47 @@ extern void *__ARM_undef;
     int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int16x8_t][__ARM_mve_type_int16x8_t]: __arm_vqrdmladhxq_m_s16 (__ARM_mve_coerce(__p0, int16x8_t), __ARM_mve_coerce(__p1, int16x8_t), __ARM_mve_coerce(__p2, int16x8_t), p3), \
     int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t]: __arm_vqrdmladhxq_m_s32 (__ARM_mve_coerce(__p0, int32x4_t), __ARM_mve_coerce(__p1, int32x4_t), __ARM_mve_coerce(__p2, int32x4_t), p3));})

-#define __arm_vmlsdavaxq_p(p0,p1,p2,p3) ({ __typeof(p0) __p0 = (p0); \
-  __typeof(p1) __p1 = (p1); \
+#define __arm_vmlsdavaxq_p(p0,p1,p2,p3) ({ __typeof(p1) __p1 = (p1); \
     __typeof(p2) __p2 = (p2); \
     _Generic( (int
(*)[__ARM_mve_typeid(__p1)][__ARM_mve_typeid(__p2)])0, \
-  int (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int8x16_t]:
__arm_vmlsdavaxq_p_s8 (__p0, __ARM_mve_coerce(__p1, int8x16_t),
__ARM_mve_coerce(__p2, int8x16_t), p3), \
-  int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int16x8_t]:
__arm_vmlsdavaxq_p_s16 (__p0, __ARM_mve_coerce(__p1, int16x8_t),
__ARM_mve_coerce(__p2, int16x8_t), p3), \
-  int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t]:
__arm_vmlsdavaxq_p_s32 (__p0, __ARM_mve_coerce(__p1, int32x4_t),
__ARM_mve_coerce(__p2, int32x4_t), p3));})
+  int (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int8x16_t]:
__arm_vmlsdavaxq_p_s8 (p0, __ARM_mve_coerce(__p1, int8x16_t),
__ARM_mve_coerce(__p2, int8x16_t), p3), \
+  int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int16x8_t]:
__arm_vmlsdavaxq_p_s16 (p0, __ARM_mve_coerce(__p1, int16x8_t),
__ARM_mve_coerce(__p2, int16x8_t), p3), \
+  int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t]:
__arm_vmlsdavaxq_p_s32 (p0, __ARM_mve_coerce(__p1, int32x4_t),
__ARM_mve_coerce(__p2, int32x4_t), p3));})
+
+#define __arm_vmlsdavaq(p0,p1,p2) ({  __typeof(p1) __p1 = (p1); \
+  __typeof(p2) __p2 = (p2); \
+  _Generic( (int
(*)[__ARM_mve_typeid(__p1)][__ARM_mve_typeid(__p2)])0, \
+  int (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int8x16_t]:
__arm_vmlsdavaq_s8(p0, __ARM_mve_coerce(__p1, int8x16_t),
__ARM_mve_coerce(__p2, int8x16_t)), \
+  int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int16x8_t]:
__arm_vmlsdavaq_s16(p0, __ARM_mve_coerce(__p1, int16x8_t),
__ARM_mve_coerce(__p2, int16x8_t)), \
+  int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t]:
__arm_vmlsdavaq_s32(p0, __ARM_mve_coerce(__p1, int32x4_t),
__ARM_mve_coerce(__p2, int32x4_t)));})
+
+#define __arm_vmlsdavaxq(p0,p1,p2) ({ __typeof(p2) __p2 = (p2); \
+  __typeof(p1) __p1 = (p1); \
+  _Generic( (int
(*)[__ARM_mve_typeid(__p1)][__ARM_mve_typeid(__p2)])0, \
+  int (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int8x16_t]:
__arm_vmlsdavaxq_s8(p0, __ARM_mve_coerce(__p1, int8x16_t),
__ARM_mve_coerce(__p2, int8x16_t)), \
+  int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int16x8_t]:
__arm_vmlsdavaxq_s16(p0, __ARM_mve_coerce(__p1, int16x8_t),
__ARM_mve_coerce(__p2, int16x8_t)), \
+  int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t]:
__arm_vmlsdavaxq_s32(p0, __ARM_mve_coerce(__p1, int32x4_t),
__ARM_mve_coerce(__p2, int32x4_t)));})

-#define __arm_vmlsdavaq_p(p0,p1,p2,p3) ({ __typeof(p0) __p0 = (p0); \
+#define __arm_vmlsdavq_p(p0,p1,p2) ({ __typeof(p0) __p0 = (p0); \
     __typeof(p1) __p1 = (p1); \
+  _Generic( (int
(*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)])0, \
+  int (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int8x16_t]:
__arm_vmlsdavq_p_s8 (__ARM_mve_coerce(__p0, int8x16_t),
__ARM_mve_coerce(__p1, int8x16_t), p2), \
+  int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int16x8_t]:
__arm_vmlsdavq_p_s16 (__ARM_mve_coerce(__p0, int16x8_t),
__ARM_mve_coerce(__p1, int16x8_t), p2), \
+  int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t]:
__arm_vmlsdavq_p_s32 (__ARM_mve_coerce(__p0, int32x4_t),
__ARM_mve_coerce(__p1, int32x4_t), p2));})
+
+#define __arm_vmlsdavxq_p(p0,p1,p2) ({ __typeof(p0) __p0 = (p0); \
+  __typeof(p1) __p1 = (p1); \
+  _Generic( (int
(*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)])0, \
+  int (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int8x16_t]:
__arm_vmlsdavxq_p_s8 (__ARM_mve_coerce(__p0, int8x16_t),
__ARM_mve_coerce(__p1, int8x16_t), p2), \
+  int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int16x8_t]:
__arm_vmlsdavxq_p_s16 (__ARM_mve_coerce(__p0, int16x8_t),
__ARM_mve_coerce(__p1, int16x8_t), p2), \
+  int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t]:
__arm_vmlsdavxq_p_s32 (__ARM_mve_coerce(__p0, int32x4_t),
__ARM_mve_coerce(__p1, int32x4_t), p2));})
+
+#define __arm_vmlsdavaq_p(p0,p1,p2,p3) ({ __typeof(p1) __p1 = (p1); \
     __typeof(p2) __p2 = (p2); \
     _Generic( (int
(*)[__ARM_mve_typeid(__p1)][__ARM_mve_typeid(__p2)])0, \
-  int (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int8x16_t]:
__arm_vmlsdavaq_p_s8(__p0, __ARM_mve_coerce(__p1, int8x16_t),
__ARM_mve_coerce(__p2, int8x16_t), p3), \
-  int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int16x8_t]:
__arm_vmlsdavaq_p_s16(__p0, __ARM_mve_coerce(__p1, int16x8_t),
__ARM_mve_coerce(__p2, int16x8_t), p3), \
-  int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t]:
__arm_vmlsdavaq_p_s32(__p0, __ARM_mve_coerce(__p1, int32x4_t),
__ARM_mve_coerce(__p2, int32x4_t), p3));})
+  int (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int8x16_t]:
__arm_vmlsdavaq_p_s8(p0, __ARM_mve_coerce(__p1, int8x16_t),
__ARM_mve_coerce(__p2, int8x16_t), p3), \
+  int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int16x8_t]:
__arm_vmlsdavaq_p_s16(p0, __ARM_mve_coerce(__p1, int16x8_t),
__ARM_mve_coerce(__p2, int16x8_t), p3), \
+  int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t]:
__arm_vmlsdavaq_p_s32(p0, __ARM_mve_coerce(__p1, int32x4_t),
__ARM_mve_coerce(__p2, int32x4_t), p3));})

   #define __arm_vmladavaxq_p(p0,p1,p2,p3) ({ __typeof(p0) __p0 = (p0); \
     __typeof(p1) __p1 = (p1); \
@@ -41445,8 +41270,8 @@ extern void *__ARM_undef;

   #define __arm_viwdupq_u16(p0,p1,p2) ({ __typeof(p0) __p0 = (p0); \
     _Generic( (int (*)[__ARM_mve_typeid(__p0)])0, \
-  int (*)[__ARM_mve_type_int_n]: __arm_viwdupq_n_u16
(__ARM_mve_coerce(__p0, uint32_t), p1, p2), \
-  int (*)[__ARM_mve_type_uint32_t_ptr]: __arm_viwdupq_wb_u16
(__ARM_mve_coerce(__p0, uint32_t *), p1, p2));})
+  int (*)[__ARM_mve_type_int_n]: __arm_viwdupq_n_u16
(__ARM_mve_coerce(__p0, uint32_t), p1, (const int) p2), \
+  int (*)[__ARM_mve_type_uint32_t_ptr]: __arm_viwdupq_wb_u16
(__ARM_mve_coerce(__p0, uint32_t *), p1, (const int) p2));})

   #define __arm_viwdupq_u32(p0,p1,p2) ({ __typeof(p0) __p0 = (p0); \
     _Generic( (int (*)[__ARM_mve_typeid(__p0)])0, \
@@ -41628,16 +41453,6 @@ extern void *__ARM_undef;
     int (*)[__ARM_mve_type_int_n][__ARM_mve_type_int16x8_t]:
__arm_vmaxavq_p_s16 (__p0, __ARM_mve_coerce(__p1, int16x8_t), p2), \
     int (*)[__ARM_mve_type_int_n][__ARM_mve_type_int32x4_t]:
__arm_vmaxavq_p_s32 (__p0, __ARM_mve_coerce(__p1, int32x4_t), p2));})

-#define __arm_vmaxq_x(p1,p2,p3) ({ __typeof(p1) __p1 = (p1); \
-  __typeof(p2) __p2 = (p2); \
-  _Generic( (int
(*)[__ARM_mve_typeid(__p1)][__ARM_mve_typeid(__p2)])0, \
-  int (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int8x16_t]:
__arm_vmaxq_x_s8 (__ARM_mve_coerce(__p1, int8x16_t),
__ARM_mve_coerce(__p2, int8x16_t), p3), \
-  int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int16x8_t]:
__arm_vmaxq_x_s16 (__ARM_mve_coerce(__p1, int16x8_t),
__ARM_mve_coerce(__p2, int16x8_t), p3), \
-  int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t]:
__arm_vmaxq_x_s32 (__ARM_mve_coerce(__p1, int32x4_t),
__ARM_mve_coerce(__p2, int32x4_t), p3), \
-  int (*)[__ARM_mve_type_uint8x16_t][__ARM_mve_type_uint8x16_t]:
__arm_vmaxq_x_u8( __ARM_mve_coerce(__p1, uint8x16_t),
__ARM_mve_coerce(__p2, uint8x16_t), p3), \
-  int (*)[__ARM_mve_type_uint16x8_t][__ARM_mve_type_uint16x8_t]:
__arm_vmaxq_x_u16( __ARM_mve_coerce(__p1, uint16x8_t),
__ARM_mve_coerce(__p2, uint16x8_t), p3), \
-  int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_uint32x4_t]:
__arm_vmaxq_x_u32( __ARM_mve_coerce(__p1, uint32x4_t),
__ARM_mve_coerce(__p2, uint32x4_t), p3));})
-
   #define __arm_vmaxvq(p0,p1) ({ __typeof(p0) __p0 = (p0); \
     __typeof(p1) __p1 = (p1); \
     _Generic( (int
(*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)])0, \
@@ -41672,6 +41487,16 @@ extern void *__ARM_undef;
     int (*)[__ARM_mve_type_int_n][__ARM_mve_type_int16x8_t]:
__arm_vminavq_p_s16 (__p0, __ARM_mve_coerce(__p1, int16x8_t), p2), \
     int (*)[__ARM_mve_type_int_n][__ARM_mve_type_int32x4_t]:
__arm_vminavq_p_s32 (__p0, __ARM_mve_coerce(__p1, int32x4_t), p2));})

+#define __arm_vmaxq_x(p1,p2,p3) ({ __typeof(p1) __p1 = (p1); \
+  __typeof(p2) __p2 = (p2); \
+  _Generic( (int
(*)[__ARM_mve_typeid(__p1)][__ARM_mve_typeid(__p2)])0, \
+  int (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int8x16_t]:
__arm_vmaxq_x_s8 (__ARM_mve_coerce(__p1, int8x16_t),
__ARM_mve_coerce(__p2, int8x16_t), p3), \
+  int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int16x8_t]:
__arm_vmaxq_x_s16 (__ARM_mve_coerce(__p1, int16x8_t),
__ARM_mve_coerce(__p2, int16x8_t), p3), \
+  int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t]:
__arm_vmaxq_x_s32 (__ARM_mve_coerce(__p1, int32x4_t),
__ARM_mve_coerce(__p2, int32x4_t), p3), \
+  int (*)[__ARM_mve_type_uint8x16_t][__ARM_mve_type_uint8x16_t]:
__arm_vmaxq_x_u8 (__ARM_mve_coerce(__p1, uint8x16_t),
__ARM_mve_coerce(__p2, uint8x16_t), p3), \
+  int (*)[__ARM_mve_type_uint16x8_t][__ARM_mve_type_uint16x8_t]:
__arm_vmaxq_x_u16 (__ARM_mve_coerce(__p1, uint16x8_t),
__ARM_mve_coerce(__p2, uint16x8_t), p3), \
+  int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_uint32x4_t]:
__arm_vmaxq_x_u32 (__ARM_mve_coerce(__p1, uint32x4_t),
__ARM_mve_coerce(__p2, uint32x4_t), p3));})
+
   #define __arm_vminq_x(p1,p2,p3) ({ __typeof(p1) __p1 = (p1); \
     __typeof(p2) __p2 = (p2); \
     _Generic( (int
(*)[__ARM_mve_typeid(__p1)][__ARM_mve_typeid(__p2)])0, \
@@ -41810,22 +41635,6 @@ extern void *__ARM_undef;
     int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int16x8_t]:
__arm_vmlaldavxq_p_s16 (__ARM_mve_coerce(__p0, int16x8_t),
__ARM_mve_coerce(__p1, int16x8_t), p2), \
     int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t]:
__arm_vmlaldavxq_p_s32 (__ARM_mve_coerce(__p0, int32x4_t),
__ARM_mve_coerce(__p1, int32x4_t), p2));})

-#define __arm_vmlsdavaq(p0,p1,p2) ({ __typeof(p0) __p0 = (p0); \
-  __typeof(p1) __p1 = (p1); \
-  __typeof(p2) __p2 = (p2); \
-  _Generic( (int
(*)[__ARM_mve_typeid(__p1)][__ARM_mve_typeid(__p2)])0, \
-  int (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int8x16_t]:
__arm_vmlsdavaq_s8(__p0, __ARM_mve_coerce(__p1, int8x16_t),
__ARM_mve_coerce(__p2, int8x16_t)), \
-  int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int16x8_t]:
__arm_vmlsdavaq_s16(__p0, __ARM_mve_coerce(__p1, int16x8_t),
__ARM_mve_coerce(__p2, int16x8_t)), \
-  int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t]:
__arm_vmlsdavaq_s32(__p0, __ARM_mve_coerce(__p1, int32x4_t),
__ARM_mve_coerce(__p2, int32x4_t)));})
-
-#define __arm_vmlsdavaxq(p0,p1,p2) ({ __typeof(p0) __p0 = (p0); \
-  __typeof(p1) __p1 = (p1); \
-  __typeof(p2) __p2 = (p2); \
-  _Generic( (int
(*)[__ARM_mve_typeid(__p1)][__ARM_mve_typeid(__p2)])0, \
-  int (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int8x16_t]:
__arm_vmlsdavaxq_s8(__p0, __ARM_mve_coerce(__p1, int8x16_t),
__ARM_mve_coerce(__p2, int8x16_t)), \
-  int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int16x8_t]:
__arm_vmlsdavaxq_s16(__p0, __ARM_mve_coerce(__p1, int16x8_t),
__ARM_mve_coerce(__p2, int16x8_t)), \
-  int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t]:
__arm_vmlsdavaxq_s32(__p0, __ARM_mve_coerce(__p1, int32x4_t),
__ARM_mve_coerce(__p2, int32x4_t)));})
-
   #define __arm_vmlsdavq(p0,p1) ({ __typeof(p0) __p0 = (p0); \
     __typeof(p1) __p1 = (p1); \
     _Generic( (int
(*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)])0, \
@@ -41833,13 +41642,6 @@ extern void *__ARM_undef;
     int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int16x8_t]:
__arm_vmlsdavq_s16 (__ARM_mve_coerce(__p0, int16x8_t),
__ARM_mve_coerce(__p1, int16x8_t)), \
     int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t]:
__arm_vmlsdavq_s32 (__ARM_mve_coerce(__p0, int32x4_t),
__ARM_mve_coerce(__p1, int32x4_t)));})

-#define __arm_vmlsdavq_p(p0,p1,p2) ({ __typeof(p0) __p0 = (p0); \
-  __typeof(p1) __p1 = (p1); \
-  _Generic( (int
(*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)])0, \
-  int (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int8x16_t]:
__arm_vmlsdavq_p_s8 (__ARM_mve_coerce(__p0, int8x16_t),
__ARM_mve_coerce(__p1, int8x16_t), p2), \
-  int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int16x8_t]:
__arm_vmlsdavq_p_s16 (__ARM_mve_coerce(__p0, int16x8_t),
__ARM_mve_coerce(__p1, int16x8_t), p2), \
-  int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t]:
__arm_vmlsdavq_p_s32 (__ARM_mve_coerce(__p0, int32x4_t),
__ARM_mve_coerce(__p1, int32x4_t), p2));})
-
   #define __arm_vmlsdavxq(p0,p1) ({ __typeof(p0) __p0 = (p0); \
     __typeof(p1) __p1 = (p1); \
     _Generic( (int
(*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)])0, \
@@ -41847,13 +41649,6 @@ extern void *__ARM_undef;
     int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int16x8_t]:
__arm_vmlsdavxq_s16 (__ARM_mve_coerce(__p0, int16x8_t),
__ARM_mve_coerce(__p1, int16x8_t)), \
     int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t]:
__arm_vmlsdavxq_s32 (__ARM_mve_coerce(__p0, int32x4_t),
__ARM_mve_coerce(__p1, int32x4_t)));})

-#define __arm_vmlsdavxq_p(p0,p1,p2) ({ __typeof(p0) __p0 = (p0); \
-  __typeof(p1) __p1 = (p1); \
-  _Generic( (int
(*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)])0, \
-  int (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int8x16_t]:
__arm_vmlsdavxq_p_s8 (__ARM_mve_coerce(__p0, int8x16_t),
__ARM_mve_coerce(__p1, int8x16_t), p2), \
-  int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int16x8_t]:
__arm_vmlsdavxq_p_s16 (__ARM_mve_coerce(__p0, int16x8_t),
__ARM_mve_coerce(__p1, int16x8_t), p2), \
-  int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t]:
__arm_vmlsdavxq_p_s32 (__ARM_mve_coerce(__p0, int32x4_t),
__ARM_mve_coerce(__p1, int32x4_t), p2));})
-
   #define __arm_vmlsldavaq(p0,p1,p2) ({ __typeof(p0) __p0 = (p0); \
     __typeof(p1) __p1 = (p1); \
     __typeof(p2) __p2 = (p2); \
@@ -41948,13 +41743,6 @@ extern void *__ARM_undef;
     int (*)[__ARM_mve_type_uint8x16_t][__ARM_mve_type_uint8x16_t]:
__arm_vmulltq_poly_x_p8 (__ARM_mve_coerce(__p1, uint8x16_t),
__ARM_mve_coerce(__p2, uint8x16_t), p3), \
     int (*)[__ARM_mve_type_uint16x8_t][__ARM_mve_type_uint16x8_t]:
__arm_vmulltq_poly_x_p16 (__ARM_mve_coerce(__p1, uint16x8_t),
__ARM_mve_coerce(__p2, uint16x8_t), p3));})

-#define __arm_vrmlaldavhaq(p0,p1,p2) ({ __typeof(p0) __p0 = (p0); \
-  __typeof(p1) __p1 = (p1); \
-  __typeof(p2) __p2 = (p2); \
-  _Generic( (int (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)][__ARM_mve_typeid(__p2)])0, \
-  int (*)[__ARM_mve_type_int_n][__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t]: __arm_vrmlaldavhaq_s32 (__ARM_mve_coerce(__p0, int64_t), __ARM_mve_coerce(__p1, int32x4_t), __ARM_mve_coerce(__p2, int32x4_t)), \
-  int (*)[__ARM_mve_type_int_n][__ARM_mve_type_uint32x4_t][__ARM_mve_type_uint32x4_t]: __arm_vrmlaldavhaq_u32 (__ARM_mve_coerce(__p0, uint64_t), __ARM_mve_coerce(__p1, uint32x4_t), __ARM_mve_coerce(__p2, uint32x4_t)));})
-
   #define __arm_vrmlaldavhaxq(p0,p1,p2)
__arm_vrmlaldavhaxq_s32(p0,p1,p2)

   #define __arm_vrmlaldavhq(p0,p1) ({ __typeof(p0) __p0 = (p0); \
@@ -41994,35 +41782,15 @@ extern void *__ARM_undef;
     int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint16x8_t]:
__arm_vstrbq_u16 (__ARM_mve_coerce(p0, uint8_t *),
__ARM_mve_coerce(__p1, uint16x8_t)), \
     int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint32x4_t]:
__arm_vstrbq_u32 (__ARM_mve_coerce(p0, uint8_t *),
__ARM_mve_coerce(__p1, uint32x4_t)));})

-#define __arm_vstrbq_p(p0,p1,p2) ({ __typeof(p1) __p1 = (p1); \
-  _Generic( (int (*)[__ARM_mve_typeid(p0)][__ARM_mve_typeid(__p1)])0,
\
-  int (*)[__ARM_mve_type_int8_t_ptr][__ARM_mve_type_int8x16_t]:
__arm_vstrbq_p_s8 (__ARM_mve_coerce(p0, int8_t *),
__ARM_mve_coerce(__p1, int8x16_t), p2), \
-  int (*)[__ARM_mve_type_int8_t_ptr][__ARM_mve_type_int16x8_t]:
__arm_vstrbq_p_s16 (__ARM_mve_coerce(p0, int8_t *),
__ARM_mve_coerce(__p1, int16x8_t), p2), \
-  int (*)[__ARM_mve_type_int8_t_ptr][__ARM_mve_type_int32x4_t]:
__arm_vstrbq_p_s32 (__ARM_mve_coerce(p0, int8_t *),
__ARM_mve_coerce(__p1, int32x4_t), p2), \
-  int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint8x16_t]:
__arm_vstrbq_p_u8 (__ARM_mve_coerce(p0, uint8_t *),
__ARM_mve_coerce(__p1, uint8x16_t), p2), \
-  int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint16x8_t]:
__arm_vstrbq_p_u16 (__ARM_mve_coerce(p0, uint8_t *),
__ARM_mve_coerce(__p1, uint16x8_t), p2), \
-  int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint32x4_t]:
__arm_vstrbq_p_u32 (__ARM_mve_coerce(p0, uint8_t *),
__ARM_mve_coerce(__p1, uint32x4_t), p2));})
-
-#define __arm_vstrbq_scatter_offset(p0,p1,p2) ({ __typeof(p1) __p1 = (p1); \
-  __typeof(p2) __p2 = (p2); \
-  _Generic( (int (*)[__ARM_mve_typeid(p0)][__ARM_mve_typeid(__p1)][__ARM_mve_typeid(__p2)])0, \
-  int (*)[__ARM_mve_type_int8_t_ptr][__ARM_mve_type_uint8x16_t][__ARM_mve_type_int8x16_t]: __arm_vstrbq_scatter_offset_s8 (__ARM_mve_coerce(p0, int8_t *), __ARM_mve_coerce(__p1, uint8x16_t), __ARM_mve_coerce(__p2, int8x16_t)), \
-  int (*)[__ARM_mve_type_int8_t_ptr][__ARM_mve_type_uint16x8_t][__ARM_mve_type_int16x8_t]: __arm_vstrbq_scatter_offset_s16 (__ARM_mve_coerce(p0, int8_t *), __ARM_mve_coerce(__p1, uint16x8_t), __ARM_mve_coerce(__p2, int16x8_t)), \
-  int (*)[__ARM_mve_type_int8_t_ptr][__ARM_mve_type_uint32x4_t][__ARM_mve_type_int32x4_t]: __arm_vstrbq_scatter_offset_s32 (__ARM_mve_coerce(p0, int8_t *), __ARM_mve_coerce(__p1, uint32x4_t), __ARM_mve_coerce(__p2, int32x4_t)), \
-  int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint8x16_t][__ARM_mve_type_uint8x16_t]: __arm_vstrbq_scatter_offset_u8 (__ARM_mve_coerce(p0, uint8_t *), __ARM_mve_coerce(__p1, uint8x16_t), __ARM_mve_coerce(__p2, uint8x16_t)), \
-  int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint16x8_t][__ARM_mve_type_uint16x8_t]: __arm_vstrbq_scatter_offset_u16 (__ARM_mve_coerce(p0, uint8_t *), __ARM_mve_coerce(__p1, uint16x8_t), __ARM_mve_coerce(__p2, uint16x8_t)), \
-  int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint32x4_t][__ARM_mve_type_uint32x4_t]: __arm_vstrbq_scatter_offset_u32 (__ARM_mve_coerce(p0, uint8_t *), __ARM_mve_coerce(__p1, uint32x4_t), __ARM_mve_coerce(__p2, uint32x4_t)));})
-
-
-#define __arm_vstrbq_scatter_offset_p(p0,p1,p2,p3) ({__typeof(p1) __p1 = (p1); \
-  __typeof(p2) __p2 = (p2); \
-  _Generic( (int (*)[__ARM_mve_typeid(p0)][__ARM_mve_typeid(__p1)][__ARM_mve_typeid(__p2)])0, \
-  int (*)[__ARM_mve_type_int8_t_ptr][__ARM_mve_type_uint8x16_t][__ARM_mve_type_int8x16_t]: __arm_vstrbq_scatter_offset_p_s8 (__ARM_mve_coerce(p0, int8_t *), __ARM_mve_coerce(__p1, uint8x16_t), __ARM_mve_coerce(__p2, int8x16_t), p3), \
-  int (*)[__ARM_mve_type_int8_t_ptr][__ARM_mve_type_uint16x8_t][__ARM_mve_type_int16x8_t]: __arm_vstrbq_scatter_offset_p_s16 (__ARM_mve_coerce(p0, int8_t *), __ARM_mve_coerce(__p1, uint16x8_t), __ARM_mve_coerce(__p2, int16x8_t), p3), \
-  int (*)[__ARM_mve_type_int8_t_ptr][__ARM_mve_type_uint32x4_t][__ARM_mve_type_int32x4_t]: __arm_vstrbq_scatter_offset_p_s32 (__ARM_mve_coerce(p0, int8_t *), __ARM_mve_coerce(__p1, uint32x4_t), __ARM_mve_coerce(__p2, int32x4_t), p3), \
-  int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint8x16_t][__ARM_mve_type_uint8x16_t]: __arm_vstrbq_scatter_offset_p_u8 (__ARM_mve_coerce(p0, uint8_t *), __ARM_mve_coerce(__p1, uint8x16_t), __ARM_mve_coerce(__p2, uint8x16_t), p3), \
-  int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint16x8_t][__ARM_mve_type_uint16x8_t]: __arm_vstrbq_scatter_offset_p_u16 (__ARM_mve_coerce(p0, uint8_t *), __ARM_mve_coerce(__p1, uint16x8_t), __ARM_mve_coerce(__p2, uint16x8_t), p3), \
-  int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint32x4_t][__ARM_mve_type_uint32x4_t]: __arm_vstrbq_scatter_offset_p_u32 (__ARM_mve_coerce(p0, uint8_t *), __ARM_mve_coerce(__p1, uint32x4_t), __ARM_mve_coerce(__p2, uint32x4_t), p3));})
+#define __arm_vstrbq_p(p0,p1,p2) ({ __typeof(p0) __p0 = (p0); \
+  __typeof(p1) __p1 = (p1); \
+  _Generic( (int
(*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)])0, \
+  int (*)[__ARM_mve_type_int8_t_ptr][__ARM_mve_type_int8x16_t]:
__arm_vstrbq_p_s8 (__ARM_mve_coerce(__p0, int8_t *),
__ARM_mve_coerce(__p1, int8x16_t), p2), \
+  int (*)[__ARM_mve_type_int8_t_ptr][__ARM_mve_type_int16x8_t]:
__arm_vstrbq_p_s16 (__ARM_mve_coerce(__p0, int8_t *),
__ARM_mve_coerce(__p1, int16x8_t), p2), \
+  int (*)[__ARM_mve_type_int8_t_ptr][__ARM_mve_type_int32x4_t]:
__arm_vstrbq_p_s32 (__ARM_mve_coerce(__p0, int8_t *),
__ARM_mve_coerce(__p1, int32x4_t), p2), \
+  int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint8x16_t]:
__arm_vstrbq_p_u8 (__ARM_mve_coerce(__p0, uint8_t *),
__ARM_mve_coerce(__p1, uint8x16_t), p2), \
+  int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint16x8_t]:
__arm_vstrbq_p_u16 (__ARM_mve_coerce(__p0, uint8_t *),
__ARM_mve_coerce(__p1, uint16x8_t), p2), \
+  int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint32x4_t]:
__arm_vstrbq_p_u32 (__ARM_mve_coerce(__p0, uint8_t *),
__ARM_mve_coerce(__p1, uint32x4_t), p2));})

   #define __arm_vstrdq_scatter_base(p0,p1,p2) ({ __typeof(p2) __p2 = (p2);
\
     _Generic( (int (*)[__ARM_mve_typeid(__p2)])0, \
@@ -42034,29 +41802,65 @@ extern void *__ARM_undef;
     int (*)[__ARM_mve_type_int64x2_t]: __arm_vstrdq_scatter_base_p_s64
(p0, p1, __ARM_mve_coerce(__p2, int64x2_t), p3), \
     int (*)[__ARM_mve_type_uint64x2_t]:
__arm_vstrdq_scatter_base_p_u64 (p0, p1, __ARM_mve_coerce(__p2,
uint64x2_t), p3));})

-#define __arm_vstrdq_scatter_offset(p0,p1,p2) ({ __typeof(p1) __p1 = (p1); \
+#define __arm_vrmlaldavhaq(p0,p1,p2) ({ __typeof(p0) __p0 = (p0); \
+  __typeof(p1) __p1 = (p1); \
     __typeof(p2) __p2 = (p2); \
-  _Generic( (int (*)[__ARM_mve_typeid(p0)][__ARM_mve_typeid(__p2)])0, \
-  int (*)[__ARM_mve_type_int64_t_ptr][__ARM_mve_type_int64x2_t]: __arm_vstrdq_scatter_offset_s64 (__ARM_mve_coerce(p0, int64_t *), __p1, __ARM_mve_coerce(__p2, int64x2_t)), \
-  int (*)[__ARM_mve_type_uint64_t_ptr][__ARM_mve_type_uint64x2_t]: __arm_vstrdq_scatter_offset_u64 (__ARM_mve_coerce(p0, uint64_t *), __p1, __ARM_mve_coerce(__p2, uint64x2_t)));})
+  _Generic( (int (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)][__ARM_mve_typeid(__p2)])0, \
+  int (*)[__ARM_mve_type_int_n][__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t]: __arm_vrmlaldavhaq_s32 (__ARM_mve_coerce(__p0, int64_t), __ARM_mve_coerce(__p1, int32x4_t), __ARM_mve_coerce(__p2, int32x4_t)), \
+  int (*)[__ARM_mve_type_int_n][__ARM_mve_type_uint32x4_t][__ARM_mve_type_uint32x4_t]: __arm_vrmlaldavhaq_u32 (__ARM_mve_coerce(__p0, uint64_t), __ARM_mve_coerce(__p1, uint32x4_t), __ARM_mve_coerce(__p2, uint32x4_t)));})

-#define __arm_vstrdq_scatter_offset_p(p0,p1,p2,p3) ({ __typeof(p1) __p1 = (p1); \
+#define __arm_vrmlaldavhaq_p(p0,p1,p2,p3) ({ __typeof(p0) __p0 = (p0); \
+  __typeof(p1) __p1 = (p1); \
     __typeof(p2) __p2 = (p2); \
-  _Generic( (int (*)[__ARM_mve_typeid(p0)][__ARM_mve_typeid(__p2)])0, \
-  int (*)[__ARM_mve_type_int64_t_ptr][__ARM_mve_type_int64x2_t]: __arm_vstrdq_scatter_offset_p_s64 (__ARM_mve_coerce(p0, int64_t *), __p1, __ARM_mve_coerce(__p2, int64x2_t), p3), \
-  int (*)[__ARM_mve_type_uint64_t_ptr][__ARM_mve_type_uint64x2_t]: __arm_vstrdq_scatter_offset_p_u64 (__ARM_mve_coerce(p0, uint64_t *), __p1, __ARM_mve_coerce(__p2, uint64x2_t), p3));})
+  _Generic( (int (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)][__ARM_mve_typeid(__p2)])0, \
+  int (*)[__ARM_mve_type_int_n][__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t]: __arm_vrmlaldavhaq_p_s32 (__ARM_mve_coerce(__p0, int64_t), __ARM_mve_coerce(__p1, int32x4_t), __ARM_mve_coerce(__p2, int32x4_t), p3), \
+  int (*)[__ARM_mve_type_int_n][__ARM_mve_type_uint32x4_t][__ARM_mve_type_uint32x4_t]: __arm_vrmlaldavhaq_p_u32 (__ARM_mve_coerce(__p0, uint64_t), __ARM_mve_coerce(__p1, uint32x4_t), __ARM_mve_coerce(__p2, uint32x4_t), p3));})

-#define __arm_vstrdq_scatter_shifted_offset(p0,p1,p2) ({ __typeof(p1) __p1 = (p1); \
+#define __arm_vstrbq_scatter_offset(p0,p1,p2) ({ __typeof(p0) __p0 = (p0); \
+  __typeof(p1) __p1 = (p1); \
     __typeof(p2) __p2 = (p2); \
-  _Generic( (int (*)[__ARM_mve_typeid(p0)][__ARM_mve_typeid(__p2)])0, \
-  int (*)[__ARM_mve_type_int64_t_ptr][__ARM_mve_type_int64x2_t]: __arm_vstrdq_scatter_shifted_offset_s64 (__ARM_mve_coerce(p0, int64_t *), __p1, __ARM_mve_coerce(__p2, int64x2_t)), \
-  int (*)[__ARM_mve_type_uint64_t_ptr][__ARM_mve_type_uint64x2_t]: __arm_vstrdq_scatter_shifted_offset_u64 (__ARM_mve_coerce(p0, uint64_t *), __p1, __ARM_mve_coerce(__p2, uint64x2_t)));})
+  _Generic( (int (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)][__ARM_mve_typeid(__p2)])0, \
+  int (*)[__ARM_mve_type_int8_t_ptr][__ARM_mve_type_uint8x16_t][__ARM_mve_type_int8x16_t]: __arm_vstrbq_scatter_offset_s8 (__ARM_mve_coerce(__p0, int8_t *), __ARM_mve_coerce(__p1, uint8x16_t), __ARM_mve_coerce(__p2, int8x16_t)), \
+  int (*)[__ARM_mve_type_int8_t_ptr][__ARM_mve_type_uint16x8_t][__ARM_mve_type_int16x8_t]: __arm_vstrbq_scatter_offset_s16 (__ARM_mve_coerce(__p0, int8_t *), __ARM_mve_coerce(__p1, uint16x8_t), __ARM_mve_coerce(__p2, int16x8_t)), \
+  int (*)[__ARM_mve_type_int8_t_ptr][__ARM_mve_type_uint32x4_t][__ARM_mve_type_int32x4_t]: __arm_vstrbq_scatter_offset_s32 (__ARM_mve_coerce(__p0, int8_t *), __ARM_mve_coerce(__p1, uint32x4_t), __ARM_mve_coerce(__p2, int32x4_t)), \
+  int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint8x16_t][__ARM_mve_type_uint8x16_t]: __arm_vstrbq_scatter_offset_u8 (__ARM_mve_coerce(__p0, uint8_t *), __ARM_mve_coerce(__p1, uint8x16_t), __ARM_mve_coerce(__p2, uint8x16_t)), \
+  int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint16x8_t][__ARM_mve_type_uint16x8_t]: __arm_vstrbq_scatter_offset_u16 (__ARM_mve_coerce(__p0, uint8_t *), __ARM_mve_coerce(__p1, uint16x8_t), __ARM_mve_coerce(__p2, uint16x8_t)), \
+  int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint32x4_t][__ARM_mve_type_uint32x4_t]: __arm_vstrbq_scatter_offset_u32 (__ARM_mve_coerce(__p0, uint8_t *), __ARM_mve_coerce(__p1, uint32x4_t), __ARM_mve_coerce(__p2, uint32x4_t)));})

-#define __arm_vstrdq_scatter_shifted_offset_p(p0,p1,p2,p3) ({ __typeof(p1) __p1 = (p1); \
+#define __arm_vstrbq_scatter_offset_p(p0,p1,p2,p3) ({ __typeof(p0) __p0 = (p0); \
+  __typeof(p1) __p1 = (p1); \
     __typeof(p2) __p2 = (p2); \
-  _Generic( (int (*)[__ARM_mve_typeid(p0)][__ARM_mve_typeid(__p2)])0, \
-  int (*)[__ARM_mve_type_int64_t_ptr][__ARM_mve_type_int64x2_t]: __arm_vstrdq_scatter_shifted_offset_p_s64 (__ARM_mve_coerce(p0, int64_t *), __p1, __ARM_mve_coerce(__p2, int64x2_t), p3), \
-  int (*)[__ARM_mve_type_uint64_t_ptr][__ARM_mve_type_uint64x2_t]: __arm_vstrdq_scatter_shifted_offset_p_u64 (__ARM_mve_coerce(p0, uint64_t *), __p1, __ARM_mve_coerce(__p2, uint64x2_t), p3));})
+  _Generic( (int (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)][__ARM_mve_typeid(__p2)])0, \
+  int (*)[__ARM_mve_type_int8_t_ptr][__ARM_mve_type_uint8x16_t][__ARM_mve_type_int8x16_t]: __arm_vstrbq_scatter_offset_p_s8 (__ARM_mve_coerce(__p0, int8_t *), __ARM_mve_coerce(__p1, uint8x16_t), __ARM_mve_coerce(__p2, int8x16_t), p3), \
+  int (*)[__ARM_mve_type_int8_t_ptr][__ARM_mve_type_uint16x8_t][__ARM_mve_type_int16x8_t]: __arm_vstrbq_scatter_offset_p_s16 (__ARM_mve_coerce(__p0, int8_t *), __ARM_mve_coerce(__p1, uint16x8_t), __ARM_mve_coerce(__p2, int16x8_t), p3), \
+  int (*)[__ARM_mve_type_int8_t_ptr][__ARM_mve_type_uint32x4_t][__ARM_mve_type_int32x4_t]: __arm_vstrbq_scatter_offset_p_s32 (__ARM_mve_coerce(__p0, int8_t *), __ARM_mve_coerce(__p1, uint32x4_t), __ARM_mve_coerce(__p2, int32x4_t), p3), \
+  int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint8x16_t][__ARM_mve_type_uint8x16_t]: __arm_vstrbq_scatter_offset_p_u8 (__ARM_mve_coerce(__p0, uint8_t *), __ARM_mve_coerce(__p1, uint8x16_t), __ARM_mve_coerce(__p2, uint8x16_t), p3), \
+  int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint16x8_t][__ARM_mve_type_uint16x8_t]: __arm_vstrbq_scatter_offset_p_u16 (__ARM_mve_coerce(__p0, uint8_t *), __ARM_mve_coerce(__p1, uint16x8_t), __ARM_mve_coerce(__p2, uint16x8_t), p3), \
+  int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint32x4_t][__ARM_mve_type_uint32x4_t]: __arm_vstrbq_scatter_offset_p_u32 (__ARM_mve_coerce(__p0, uint8_t *), __ARM_mve_coerce(__p1, uint32x4_t), __ARM_mve_coerce(__p2, uint32x4_t), p3));})
+
+#define __arm_vstrdq_scatter_offset_p(p0,p1,p2,p3) ({ __typeof(p0) __p0
= (p0); \
+  __typeof(p2) __p2 = (p2); \
+  _Generic( (int
(*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p2)])0, \
+  int (*)[__ARM_mve_type_int64_t_ptr][__ARM_mve_type_int64x2_t]:
__arm_vstrdq_scatter_offset_p_s64 (__ARM_mve_coerce(__p0, int64_t *),
p1, __ARM_mve_coerce(__p2, int64x2_t), p3), \
+  int (*)[__ARM_mve_type_uint64_t_ptr][__ARM_mve_type_uint64x2_t]:
__arm_vstrdq_scatter_offset_p_u64 (__ARM_mve_coerce(__p0, uint64_t *),
p1, __ARM_mve_coerce(__p2, uint64x2_t), p3));})
+
+#define __arm_vstrdq_scatter_offset(p0,p1,p2) ({ __typeof(p0) __p0 =
(p0); \
+  __typeof(p2) __p2 = (p2); \
+  _Generic( (int
(*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p2)])0, \
+  int (*)[__ARM_mve_type_int64_t_ptr][__ARM_mve_type_int64x2_t]:
__arm_vstrdq_scatter_offset_s64 (__ARM_mve_coerce(__p0, int64_t *), p1,
__ARM_mve_coerce(__p2, int64x2_t)), \
+  int (*)[__ARM_mve_type_uint64_t_ptr][__ARM_mve_type_uint64x2_t]:
__arm_vstrdq_scatter_offset_u64 (__ARM_mve_coerce(__p0, uint64_t *), p1,
__ARM_mve_coerce(__p2, uint64x2_t)));})
+
+#define __arm_vstrdq_scatter_shifted_offset_p(p0,p1,p2,p3)
({ __typeof(p0) __p0 = (p0); \
+  __typeof(p2) __p2 = (p2); \
+  _Generic( (int
(*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p2)])0, \
+  int (*)[__ARM_mve_type_int64_t_ptr][__ARM_mve_type_int64x2_t]:
__arm_vstrdq_scatter_shifted_offset_p_s64 (__ARM_mve_coerce(__p0,
int64_t *), p1, __ARM_mve_coerce(__p2, int64x2_t), p3), \
+  int (*)[__ARM_mve_type_uint64_t_ptr][__ARM_mve_type_uint64x2_t]:
__arm_vstrdq_scatter_shifted_offset_p_u64 (__ARM_mve_coerce(__p0,
uint64_t *), p1, __ARM_mve_coerce(__p2, uint64x2_t), p3));})
+
+#define __arm_vstrdq_scatter_shifted_offset(p0,p1,p2) ({ __typeof(p0)
__p0 = (p0); \
+  __typeof(p2) __p2 = (p2); \
+  _Generic( (int
(*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p2)])0, \
+  int (*)[__ARM_mve_type_int64_t_ptr][__ARM_mve_type_int64x2_t]:
__arm_vstrdq_scatter_shifted_offset_s64 (__ARM_mve_coerce(__p0,
int64_t *), p1, __ARM_mve_coerce(__p2, int64x2_t)), \
+  int (*)[__ARM_mve_type_uint64_t_ptr][__ARM_mve_type_uint64x2_t]:
__arm_vstrdq_scatter_shifted_offset_u64 (__ARM_mve_coerce(__p0,
uint64_t *), p1, __ARM_mve_coerce(__p2, uint64x2_t)));})

   #endif /* __cplusplus  */
   #endif /* __ARM_FEATURE_MVE  */

Reply via email to