Jonathan Wright <jonathan.wri...@arm.com> writes:
> Hi,
>
> This patch removes the macros for the vld3[q]_lane Neon intrinsics,
> defining each intrinsic explicitly instead. This is a preparatory
> step before adding new modes for structures of Advanced SIMD vectors.
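>
> For reference, a minimal usage sketch of one of the affected
> intrinsics (the function name, buffer and lane index below are
> illustrative only, not taken from the patch):
>
>   #include <arm_neon.h>
>
>   /* Load three consecutive uint8_t values from buf into lane 2 of
>      each of the three vectors in acc; all other lanes are
>      preserved.  */
>   uint8x8x3_t
>   load_lane2 (const uint8_t *buf, uint8x8x3_t acc)
>   {
>     return vld3_lane_u8 (buf, acc, 2);
>   }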
>
> Regression tested and bootstrapped on aarch64-none-linux-gnu - no
> issues.
>
> Ok for master?

OK, thanks.

Richard

> Thanks,
> Jonathan
>
> ---
>
> gcc/ChangeLog:
>
> 2021-08-16  Jonathan Wright  <jonathan.wri...@arm.com>
>
>         * config/aarch64/arm_neon.h (__LD3_LANE_FUNC): Delete.
>         (__LD3Q_LANE_FUNC): Delete.
>         (vld3_lane_u8): Define without macro.
>         (vld3_lane_u16): Likewise.
>         (vld3_lane_u32): Likewise.
>         (vld3_lane_u64): Likewise.
>         (vld3_lane_s8): Likewise.
>         (vld3_lane_s16): Likewise.
>         (vld3_lane_s32): Likewise.
>         (vld3_lane_s64): Likewise.
>         (vld3_lane_f16): Likewise.
>         (vld3_lane_f32): Likewise.
>         (vld3_lane_f64): Likewise.
>         (vld3_lane_p8): Likewise.
>         (vld3_lane_p16): Likewise.
>         (vld3_lane_p64): Likewise.
>         (vld3q_lane_u8): Likewise.
>         (vld3q_lane_u16): Likewise.
>         (vld3q_lane_u32): Likewise.
>         (vld3q_lane_u64): Likewise.
>         (vld3q_lane_s8): Likewise.
>         (vld3q_lane_s16): Likewise.
>         (vld3q_lane_s32): Likewise.
>         (vld3q_lane_s64): Likewise.
>         (vld3q_lane_f16): Likewise.
>         (vld3q_lane_f32): Likewise.
>         (vld3q_lane_f64): Likewise.
>         (vld3q_lane_p8): Likewise.
>         (vld3q_lane_p16): Likewise.
>         (vld3q_lane_p64): Likewise.
>         (vld3_lane_bf16): Likewise.
>         (vld3q_lane_bf16): Likewise.
>
> diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
> index 91c072fe4572ff0012aced11e0f609168e4afc10..29b62988a91909a928e02fd6891803e936a1c6a9 100644
> --- a/gcc/config/aarch64/arm_neon.h
> +++ b/gcc/config/aarch64/arm_neon.h
> @@ -20334,100 +20334,525 @@ vld2q_lane_p64 (const poly64_t * __ptr, poly64x2x2_t __b, const int __c)
>  
>  /* vld3_lane */
>  
> -#define __LD3_LANE_FUNC(intype, vectype, largetype, ptrtype, mode,      \
> -                      qmode, ptrmode, funcsuffix, signedtype)           \
> -__extension__ extern __inline intype \
> -__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) \
> -vld3_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c)  \
> -{                                                                       \
> -  __builtin_aarch64_simd_ci __o;                                        \
> -  largetype __temp;                                                     \
> -  __temp.val[0] =                                                       \
> -    vcombine_##funcsuffix (__b.val[0], vcreate_##funcsuffix (0));       \
> -  __temp.val[1] =                                                       \
> -    vcombine_##funcsuffix (__b.val[1], vcreate_##funcsuffix (0));       \
> -  __temp.val[2] =                                                       \
> -    vcombine_##funcsuffix (__b.val[2], vcreate_##funcsuffix (0));       \
> -  __o = __builtin_aarch64_set_qregci##qmode (__o,                       \
> -                                         (signedtype) __temp.val[0],    \
> -                                         0);                            \
> -  __o = __builtin_aarch64_set_qregci##qmode (__o,                       \
> -                                         (signedtype) __temp.val[1],    \
> -                                         1);                            \
> -  __o = __builtin_aarch64_set_qregci##qmode (__o,                       \
> -                                         (signedtype) __temp.val[2],    \
> -                                         2);                            \
> -  __o = __builtin_aarch64_ld3_lane##mode (                              \
> -       (__builtin_aarch64_simd_##ptrmode *) __ptr, __o, __c);           \
> -  __b.val[0] = (vectype) __builtin_aarch64_get_dregcidi (__o, 0);       \
> -  __b.val[1] = (vectype) __builtin_aarch64_get_dregcidi (__o, 1);       \
> -  __b.val[2] = (vectype) __builtin_aarch64_get_dregcidi (__o, 2);       \
> -  return __b;                                                          \
> +__extension__ extern __inline uint8x8x3_t
> +__attribute__ ((__always_inline__, __gnu_inline__,__artificial__))
> +vld3_lane_u8 (const uint8_t * __ptr, uint8x8x3_t __b, const int __c)
> +{
> +  __builtin_aarch64_simd_ci __o;
> +  uint8x16x3_t __temp;
> +  __temp.val[0] = vcombine_u8 (__b.val[0], vcreate_u8 (0));
> +  __temp.val[1] = vcombine_u8 (__b.val[1], vcreate_u8 (0));
> +  __temp.val[2] = vcombine_u8 (__b.val[2], vcreate_u8 (0));
> +  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) __temp.val[0], 0);
> +  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) __temp.val[1], 1);
> +  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) __temp.val[2], 2);
> +  __o = __builtin_aarch64_ld3_lanev8qi (
> +       (__builtin_aarch64_simd_qi *) __ptr, __o, __c);
> +  __b.val[0] = (uint8x8_t) __builtin_aarch64_get_dregcidi (__o, 0);
> +  __b.val[1] = (uint8x8_t) __builtin_aarch64_get_dregcidi (__o, 1);
> +  __b.val[2] = (uint8x8_t) __builtin_aarch64_get_dregcidi (__o, 2);
> +  return __b;
>  }
>  
> -__LD3_LANE_FUNC (float16x4x3_t, float16x4_t, float16x8x3_t, float16_t, v4hf,
> -              v8hf, hf, f16, float16x8_t)
> -__LD3_LANE_FUNC (float32x2x3_t, float32x2_t, float32x4x3_t, float32_t, v2sf, v4sf,
> -              sf, f32, float32x4_t)
> -__LD3_LANE_FUNC (float64x1x3_t, float64x1_t, float64x2x3_t, float64_t, df, v2df,
> -              df, f64, float64x2_t)
> -__LD3_LANE_FUNC (poly8x8x3_t, poly8x8_t, poly8x16x3_t, poly8_t, v8qi, v16qi, qi, p8,
> -              int8x16_t)
> -__LD3_LANE_FUNC (poly16x4x3_t, poly16x4_t, poly16x8x3_t, poly16_t, v4hi, v8hi, hi,
> -              p16, int16x8_t)
> -__LD3_LANE_FUNC (poly64x1x3_t, poly64x1_t, poly64x2x3_t, poly64_t, di,
> -              v2di_ssps, di, p64, poly64x2_t)
> -__LD3_LANE_FUNC (int8x8x3_t, int8x8_t, int8x16x3_t, int8_t, v8qi, v16qi, qi, s8,
> -              int8x16_t)
> -__LD3_LANE_FUNC (int16x4x3_t, int16x4_t, int16x8x3_t, int16_t, v4hi, v8hi, hi, s16,
> -              int16x8_t)
> -__LD3_LANE_FUNC (int32x2x3_t, int32x2_t, int32x4x3_t, int32_t, v2si, v4si, si, s32,
> -              int32x4_t)
> -__LD3_LANE_FUNC (int64x1x3_t, int64x1_t, int64x2x3_t, int64_t, di, v2di, di, s64,
> -              int64x2_t)
> -__LD3_LANE_FUNC (uint8x8x3_t, uint8x8_t, uint8x16x3_t, uint8_t, v8qi, v16qi, qi, u8,
> -              int8x16_t)
> -__LD3_LANE_FUNC (uint16x4x3_t, uint16x4_t, uint16x8x3_t, uint16_t, v4hi, v8hi, hi,
> -              u16, int16x8_t)
> -__LD3_LANE_FUNC (uint32x2x3_t, uint32x2_t, uint32x4x3_t, uint32_t, v2si, v4si, si,
> -              u32, int32x4_t)
> -__LD3_LANE_FUNC (uint64x1x3_t, uint64x1_t, uint64x2x3_t, uint64_t, di, v2di, di,
> -              u64, int64x2_t)
> +__extension__ extern __inline uint16x4x3_t
> +__attribute__ ((__always_inline__, __gnu_inline__,__artificial__))
> +vld3_lane_u16 (const uint16_t * __ptr, uint16x4x3_t __b, const int __c)
> +{
> +  __builtin_aarch64_simd_ci __o;
> +  uint16x8x3_t __temp;
> +  __temp.val[0] = vcombine_u16 (__b.val[0], vcreate_u16 (0));
> +  __temp.val[1] = vcombine_u16 (__b.val[1], vcreate_u16 (0));
> +  __temp.val[2] = vcombine_u16 (__b.val[2], vcreate_u16 (0));
> +  __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) __temp.val[0], 0);
> +  __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) __temp.val[1], 1);
> +  __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) __temp.val[2], 2);
> +  __o = __builtin_aarch64_ld3_lanev4hi (
> +       (__builtin_aarch64_simd_hi *) __ptr, __o, __c);
> +  __b.val[0] = (uint16x4_t) __builtin_aarch64_get_dregcidi (__o, 0);
> +  __b.val[1] = (uint16x4_t) __builtin_aarch64_get_dregcidi (__o, 1);
> +  __b.val[2] = (uint16x4_t) __builtin_aarch64_get_dregcidi (__o, 2);
> +  return __b;
> +}
> +
> +__extension__ extern __inline uint32x2x3_t
> +__attribute__ ((__always_inline__, __gnu_inline__,__artificial__))
> +vld3_lane_u32 (const uint32_t * __ptr, uint32x2x3_t __b, const int __c)
> +{
> +  __builtin_aarch64_simd_ci __o;
> +  uint32x4x3_t __temp;
> +  __temp.val[0] = vcombine_u32 (__b.val[0], vcreate_u32 (0));
> +  __temp.val[1] = vcombine_u32 (__b.val[1], vcreate_u32 (0));
> +  __temp.val[2] = vcombine_u32 (__b.val[2], vcreate_u32 (0));
> +  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __temp.val[0], 0);
> +  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __temp.val[1], 1);
> +  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __temp.val[2], 2);
> +  __o = __builtin_aarch64_ld3_lanev2si (
> +       (__builtin_aarch64_simd_si *) __ptr, __o, __c);
> +  __b.val[0] = (uint32x2_t) __builtin_aarch64_get_dregcidi (__o, 0);
> +  __b.val[1] = (uint32x2_t) __builtin_aarch64_get_dregcidi (__o, 1);
> +  __b.val[2] = (uint32x2_t) __builtin_aarch64_get_dregcidi (__o, 2);
> +  return __b;
> +}
> +
> +__extension__ extern __inline uint64x1x3_t
> +__attribute__ ((__always_inline__, __gnu_inline__,__artificial__))
> +vld3_lane_u64 (const uint64_t * __ptr, uint64x1x3_t __b, const int __c)
> +{
> +  __builtin_aarch64_simd_ci __o;
> +  uint64x2x3_t __temp;
> +  __temp.val[0] = vcombine_u64 (__b.val[0], vcreate_u64 (0));
> +  __temp.val[1] = vcombine_u64 (__b.val[1], vcreate_u64 (0));
> +  __temp.val[2] = vcombine_u64 (__b.val[2], vcreate_u64 (0));
> +  __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) __temp.val[0], 0);
> +  __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) __temp.val[1], 1);
> +  __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) __temp.val[2], 2);
> +  __o = __builtin_aarch64_ld3_lanedi (
> +       (__builtin_aarch64_simd_di *) __ptr, __o, __c);
> +  __b.val[0] = (uint64x1_t) __builtin_aarch64_get_dregcidi (__o, 0);
> +  __b.val[1] = (uint64x1_t) __builtin_aarch64_get_dregcidi (__o, 1);
> +  __b.val[2] = (uint64x1_t) __builtin_aarch64_get_dregcidi (__o, 2);
> +  return __b;
> +}
> +
> +__extension__ extern __inline int8x8x3_t
> +__attribute__ ((__always_inline__, __gnu_inline__,__artificial__))
> +vld3_lane_s8 (const int8_t * __ptr, int8x8x3_t __b, const int __c)
> +{
> +  __builtin_aarch64_simd_ci __o;
> +  int8x16x3_t __temp;
> +  __temp.val[0] = vcombine_s8 (__b.val[0], vcreate_s8 (0));
> +  __temp.val[1] = vcombine_s8 (__b.val[1], vcreate_s8 (0));
> +  __temp.val[2] = vcombine_s8 (__b.val[2], vcreate_s8 (0));
> +  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) __temp.val[0], 0);
> +  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) __temp.val[1], 1);
> +  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) __temp.val[2], 2);
> +  __o = __builtin_aarch64_ld3_lanev8qi (
> +       (__builtin_aarch64_simd_qi *) __ptr, __o, __c);
> +  __b.val[0] = (int8x8_t) __builtin_aarch64_get_dregcidi (__o, 0);
> +  __b.val[1] = (int8x8_t) __builtin_aarch64_get_dregcidi (__o, 1);
> +  __b.val[2] = (int8x8_t) __builtin_aarch64_get_dregcidi (__o, 2);
> +  return __b;
> +}
> +
> +__extension__ extern __inline int16x4x3_t
> +__attribute__ ((__always_inline__, __gnu_inline__,__artificial__))
> +vld3_lane_s16 (const int16_t * __ptr, int16x4x3_t __b, const int __c)
> +{
> +  __builtin_aarch64_simd_ci __o;
> +  int16x8x3_t __temp;
> +  __temp.val[0] = vcombine_s16 (__b.val[0], vcreate_s16 (0));
> +  __temp.val[1] = vcombine_s16 (__b.val[1], vcreate_s16 (0));
> +  __temp.val[2] = vcombine_s16 (__b.val[2], vcreate_s16 (0));
> +  __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) __temp.val[0], 0);
> +  __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) __temp.val[1], 1);
> +  __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) __temp.val[2], 2);
> +  __o = __builtin_aarch64_ld3_lanev4hi (
> +       (__builtin_aarch64_simd_hi *) __ptr, __o, __c);
> +  __b.val[0] = (int16x4_t) __builtin_aarch64_get_dregcidi (__o, 0);
> +  __b.val[1] = (int16x4_t) __builtin_aarch64_get_dregcidi (__o, 1);
> +  __b.val[2] = (int16x4_t) __builtin_aarch64_get_dregcidi (__o, 2);
> +  return __b;
> +}
> +
> +__extension__ extern __inline int32x2x3_t
> +__attribute__ ((__always_inline__, __gnu_inline__,__artificial__))
> +vld3_lane_s32 (const int32_t * __ptr, int32x2x3_t __b, const int __c)
> +{
> +  __builtin_aarch64_simd_ci __o;
> +  int32x4x3_t __temp;
> +  __temp.val[0] = vcombine_s32 (__b.val[0], vcreate_s32 (0));
> +  __temp.val[1] = vcombine_s32 (__b.val[1], vcreate_s32 (0));
> +  __temp.val[2] = vcombine_s32 (__b.val[2], vcreate_s32 (0));
> +  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __temp.val[0], 0);
> +  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __temp.val[1], 1);
> +  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __temp.val[2], 2);
> +  __o = __builtin_aarch64_ld3_lanev2si (
> +       (__builtin_aarch64_simd_si *) __ptr, __o, __c);
> +  __b.val[0] = (int32x2_t) __builtin_aarch64_get_dregcidi (__o, 0);
> +  __b.val[1] = (int32x2_t) __builtin_aarch64_get_dregcidi (__o, 1);
> +  __b.val[2] = (int32x2_t) __builtin_aarch64_get_dregcidi (__o, 2);
> +  return __b;
> +}
> +
> +__extension__ extern __inline int64x1x3_t
> +__attribute__ ((__always_inline__, __gnu_inline__,__artificial__))
> +vld3_lane_s64 (const int64_t * __ptr, int64x1x3_t __b, const int __c)
> +{
> +  __builtin_aarch64_simd_ci __o;
> +  int64x2x3_t __temp;
> +  __temp.val[0] = vcombine_s64 (__b.val[0], vcreate_s64 (0));
> +  __temp.val[1] = vcombine_s64 (__b.val[1], vcreate_s64 (0));
> +  __temp.val[2] = vcombine_s64 (__b.val[2], vcreate_s64 (0));
> +  __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) __temp.val[0], 0);
> +  __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) __temp.val[1], 1);
> +  __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) __temp.val[2], 2);
> +  __o = __builtin_aarch64_ld3_lanedi (
> +       (__builtin_aarch64_simd_di *) __ptr, __o, __c);
> +  __b.val[0] = (int64x1_t) __builtin_aarch64_get_dregcidi (__o, 0);
> +  __b.val[1] = (int64x1_t) __builtin_aarch64_get_dregcidi (__o, 1);
> +  __b.val[2] = (int64x1_t) __builtin_aarch64_get_dregcidi (__o, 2);
> +  return __b;
> +}
> +
> +__extension__ extern __inline float16x4x3_t
> +__attribute__ ((__always_inline__, __gnu_inline__,__artificial__))
> +vld3_lane_f16 (const float16_t * __ptr, float16x4x3_t __b, const int __c)
> +{
> +  __builtin_aarch64_simd_ci __o;
> +  float16x8x3_t __temp;
> +  __temp.val[0] = vcombine_f16 (__b.val[0], vcreate_f16 (0));
> +  __temp.val[1] = vcombine_f16 (__b.val[1], vcreate_f16 (0));
> +  __temp.val[2] = vcombine_f16 (__b.val[2], vcreate_f16 (0));
> +  __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) __temp.val[0], 0);
> +  __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) __temp.val[1], 1);
> +  __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) __temp.val[2], 2);
> +  __o = __builtin_aarch64_ld3_lanev4hf (
> +       (__builtin_aarch64_simd_hf *) __ptr, __o, __c);
> +  __b.val[0] = (float16x4_t) __builtin_aarch64_get_dregcidi (__o, 0);
> +  __b.val[1] = (float16x4_t) __builtin_aarch64_get_dregcidi (__o, 1);
> +  __b.val[2] = (float16x4_t) __builtin_aarch64_get_dregcidi (__o, 2);
> +  return __b;
> +}
> +
> +__extension__ extern __inline float32x2x3_t
> +__attribute__ ((__always_inline__, __gnu_inline__,__artificial__))
> +vld3_lane_f32 (const float32_t * __ptr, float32x2x3_t __b, const int __c)
> +{
> +  __builtin_aarch64_simd_ci __o;
> +  float32x4x3_t __temp;
> +  __temp.val[0] = vcombine_f32 (__b.val[0], vcreate_f32 (0));
> +  __temp.val[1] = vcombine_f32 (__b.val[1], vcreate_f32 (0));
> +  __temp.val[2] = vcombine_f32 (__b.val[2], vcreate_f32 (0));
> +  __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) __temp.val[0], 0);
> +  __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) __temp.val[1], 1);
> +  __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) __temp.val[2], 2);
> +  __o = __builtin_aarch64_ld3_lanev2sf (
> +       (__builtin_aarch64_simd_sf *) __ptr, __o, __c);
> +  __b.val[0] = (float32x2_t) __builtin_aarch64_get_dregcidi (__o, 0);
> +  __b.val[1] = (float32x2_t) __builtin_aarch64_get_dregcidi (__o, 1);
> +  __b.val[2] = (float32x2_t) __builtin_aarch64_get_dregcidi (__o, 2);
> +  return __b;
> +}
> +
> +__extension__ extern __inline float64x1x3_t
> +__attribute__ ((__always_inline__, __gnu_inline__,__artificial__))
> +vld3_lane_f64 (const float64_t * __ptr, float64x1x3_t __b, const int __c)
> +{
> +  __builtin_aarch64_simd_ci __o;
> +  float64x2x3_t __temp;
> +  __temp.val[0] = vcombine_f64 (__b.val[0], vcreate_f64 (0));
> +  __temp.val[1] = vcombine_f64 (__b.val[1], vcreate_f64 (0));
> +  __temp.val[2] = vcombine_f64 (__b.val[2], vcreate_f64 (0));
> +  __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) __temp.val[0], 0);
> +  __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) __temp.val[1], 1);
> +  __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) __temp.val[2], 2);
> +  __o = __builtin_aarch64_ld3_lanedi (
> +       (__builtin_aarch64_simd_di *) __ptr, __o, __c);
> +  __b.val[0] = (float64x1_t) __builtin_aarch64_get_dregcidi (__o, 0);
> +  __b.val[1] = (float64x1_t) __builtin_aarch64_get_dregcidi (__o, 1);
> +  __b.val[2] = (float64x1_t) __builtin_aarch64_get_dregcidi (__o, 2);
> +  return __b;
> +}
> +
> +__extension__ extern __inline poly8x8x3_t
> +__attribute__ ((__always_inline__, __gnu_inline__,__artificial__))
> +vld3_lane_p8 (const poly8_t * __ptr, poly8x8x3_t __b, const int __c)
> +{
> +  __builtin_aarch64_simd_ci __o;
> +  poly8x16x3_t __temp;
> +  __temp.val[0] = vcombine_p8 (__b.val[0], vcreate_p8 (0));
> +  __temp.val[1] = vcombine_p8 (__b.val[1], vcreate_p8 (0));
> +  __temp.val[2] = vcombine_p8 (__b.val[2], vcreate_p8 (0));
> +  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) __temp.val[0], 0);
> +  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) __temp.val[1], 1);
> +  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) __temp.val[2], 2);
> +  __o = __builtin_aarch64_ld3_lanev8qi (
> +       (__builtin_aarch64_simd_qi *) __ptr, __o, __c);
> +  __b.val[0] = (poly8x8_t) __builtin_aarch64_get_dregcidi (__o, 0);
> +  __b.val[1] = (poly8x8_t) __builtin_aarch64_get_dregcidi (__o, 1);
> +  __b.val[2] = (poly8x8_t) __builtin_aarch64_get_dregcidi (__o, 2);
> +  return __b;
> +}
> +
> +__extension__ extern __inline poly16x4x3_t
> +__attribute__ ((__always_inline__, __gnu_inline__,__artificial__))
> +vld3_lane_p16 (const poly16_t * __ptr, poly16x4x3_t __b, const int __c)
> +{
> +  __builtin_aarch64_simd_ci __o;
> +  poly16x8x3_t __temp;
> +  __temp.val[0] = vcombine_p16 (__b.val[0], vcreate_p16 (0));
> +  __temp.val[1] = vcombine_p16 (__b.val[1], vcreate_p16 (0));
> +  __temp.val[2] = vcombine_p16 (__b.val[2], vcreate_p16 (0));
> +  __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) __temp.val[0], 0);
> +  __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) __temp.val[1], 1);
> +  __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) __temp.val[2], 2);
> +  __o = __builtin_aarch64_ld3_lanev4hi (
> +       (__builtin_aarch64_simd_hi *) __ptr, __o, __c);
> +  __b.val[0] = (poly16x4_t) __builtin_aarch64_get_dregcidi (__o, 0);
> +  __b.val[1] = (poly16x4_t) __builtin_aarch64_get_dregcidi (__o, 1);
> +  __b.val[2] = (poly16x4_t) __builtin_aarch64_get_dregcidi (__o, 2);
> +  return __b;
> +}
> +
> +__extension__ extern __inline poly64x1x3_t
> +__attribute__ ((__always_inline__, __gnu_inline__,__artificial__))
> +vld3_lane_p64 (const poly64_t * __ptr, poly64x1x3_t __b, const int __c)
> +{
> +  __builtin_aarch64_simd_ci __o;
> +  poly64x2x3_t __temp;
> +  __temp.val[0] = vcombine_p64 (__b.val[0], vcreate_p64 (0));
> +  __temp.val[1] = vcombine_p64 (__b.val[1], vcreate_p64 (0));
> +  __temp.val[2] = vcombine_p64 (__b.val[2], vcreate_p64 (0));
> +  __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) __temp.val[0], 0);
> +  __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) __temp.val[1], 1);
> +  __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) __temp.val[2], 2);
> +  __o = __builtin_aarch64_ld3_lanedi (
> +       (__builtin_aarch64_simd_di *) __ptr, __o, __c);
> +  __b.val[0] = (poly64x1_t) __builtin_aarch64_get_dregcidi (__o, 0);
> +  __b.val[1] = (poly64x1_t) __builtin_aarch64_get_dregcidi (__o, 1);
> +  __b.val[2] = (poly64x1_t) __builtin_aarch64_get_dregcidi (__o, 2);
> +  return __b;
> +}
>  
>  /* vld3q_lane */
>  
> -#define __LD3Q_LANE_FUNC(intype, vtype, ptrtype, mode, ptrmode, funcsuffix) \
> -__extension__ extern __inline intype \
> -__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) \
> -vld3q_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c) \
> -{                                                                       \
> -  __builtin_aarch64_simd_ci __o;                                        \
> -  intype ret;                                                           \
> -  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __b.val[0], 0); \
> -  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __b.val[1], 1); \
> -  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __b.val[2], 2); \
> -  __o = __builtin_aarch64_ld3_lane##mode (                              \
> -     (__builtin_aarch64_simd_##ptrmode *) __ptr, __o, __c);             \
> -  ret.val[0] = (vtype) __builtin_aarch64_get_qregciv4si (__o, 0);       \
> -  ret.val[1] = (vtype) __builtin_aarch64_get_qregciv4si (__o, 1);       \
> -  ret.val[2] = (vtype) __builtin_aarch64_get_qregciv4si (__o, 2);       \
> -  return ret;                                                           \
> +__extension__ extern __inline uint8x16x3_t
> +__attribute__ ((__always_inline__, __gnu_inline__,__artificial__))
> +vld3q_lane_u8 (const uint8_t * __ptr, uint8x16x3_t __b, const int __c)
> +{
> +  __builtin_aarch64_simd_ci __o;
> +  uint8x16x3_t ret;
> +  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __b.val[0], 0);
> +  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __b.val[1], 1);
> +  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __b.val[2], 2);
> +  __o = __builtin_aarch64_ld3_lanev16qi (
> +     (__builtin_aarch64_simd_qi *) __ptr, __o, __c);
> +  ret.val[0] = (uint8x16_t) __builtin_aarch64_get_qregciv4si (__o, 0);
> +  ret.val[1] = (uint8x16_t) __builtin_aarch64_get_qregciv4si (__o, 1);
> +  ret.val[2] = (uint8x16_t) __builtin_aarch64_get_qregciv4si (__o, 2);
> +  return ret;
> +}
> +
> +__extension__ extern __inline uint16x8x3_t
> +__attribute__ ((__always_inline__, __gnu_inline__,__artificial__))
> +vld3q_lane_u16 (const uint16_t * __ptr, uint16x8x3_t __b, const int __c)
> +{
> +  __builtin_aarch64_simd_ci __o;
> +  uint16x8x3_t ret;
> +  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __b.val[0], 0);
> +  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __b.val[1], 1);
> +  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __b.val[2], 2);
> +  __o = __builtin_aarch64_ld3_lanev8hi (
> +     (__builtin_aarch64_simd_hi *) __ptr, __o, __c);
> +  ret.val[0] = (uint16x8_t) __builtin_aarch64_get_qregciv4si (__o, 0);
> +  ret.val[1] = (uint16x8_t) __builtin_aarch64_get_qregciv4si (__o, 1);
> +  ret.val[2] = (uint16x8_t) __builtin_aarch64_get_qregciv4si (__o, 2);
> +  return ret;
> +}
> +
> +__extension__ extern __inline uint32x4x3_t
> +__attribute__ ((__always_inline__, __gnu_inline__,__artificial__))
> +vld3q_lane_u32 (const uint32_t * __ptr, uint32x4x3_t __b, const int __c)
> +{
> +  __builtin_aarch64_simd_ci __o;
> +  uint32x4x3_t ret;
> +  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __b.val[0], 0);
> +  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __b.val[1], 1);
> +  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __b.val[2], 2);
> +  __o = __builtin_aarch64_ld3_lanev4si (
> +     (__builtin_aarch64_simd_si *) __ptr, __o, __c);
> +  ret.val[0] = (uint32x4_t) __builtin_aarch64_get_qregciv4si (__o, 0);
> +  ret.val[1] = (uint32x4_t) __builtin_aarch64_get_qregciv4si (__o, 1);
> +  ret.val[2] = (uint32x4_t) __builtin_aarch64_get_qregciv4si (__o, 2);
> +  return ret;
> +}
> +
> +__extension__ extern __inline uint64x2x3_t
> +__attribute__ ((__always_inline__, __gnu_inline__,__artificial__))
> +vld3q_lane_u64 (const uint64_t * __ptr, uint64x2x3_t __b, const int __c)
> +{
> +  __builtin_aarch64_simd_ci __o;
> +  uint64x2x3_t ret;
> +  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __b.val[0], 0);
> +  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __b.val[1], 1);
> +  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __b.val[2], 2);
> +  __o = __builtin_aarch64_ld3_lanev2di (
> +     (__builtin_aarch64_simd_di *) __ptr, __o, __c);
> +  ret.val[0] = (uint64x2_t) __builtin_aarch64_get_qregciv4si (__o, 0);
> +  ret.val[1] = (uint64x2_t) __builtin_aarch64_get_qregciv4si (__o, 1);
> +  ret.val[2] = (uint64x2_t) __builtin_aarch64_get_qregciv4si (__o, 2);
> +  return ret;
> +}
> +
> +__extension__ extern __inline int8x16x3_t
> +__attribute__ ((__always_inline__, __gnu_inline__,__artificial__))
> +vld3q_lane_s8 (const int8_t * __ptr, int8x16x3_t __b, const int __c)
> +{
> +  __builtin_aarch64_simd_ci __o;
> +  int8x16x3_t ret;
> +  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __b.val[0], 0);
> +  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __b.val[1], 1);
> +  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __b.val[2], 2);
> +  __o = __builtin_aarch64_ld3_lanev16qi (
> +     (__builtin_aarch64_simd_qi *) __ptr, __o, __c);
> +  ret.val[0] = (int8x16_t) __builtin_aarch64_get_qregciv4si (__o, 0);
> +  ret.val[1] = (int8x16_t) __builtin_aarch64_get_qregciv4si (__o, 1);
> +  ret.val[2] = (int8x16_t) __builtin_aarch64_get_qregciv4si (__o, 2);
> +  return ret;
> +}
> +
> +__extension__ extern __inline int16x8x3_t
> +__attribute__ ((__always_inline__, __gnu_inline__,__artificial__))
> +vld3q_lane_s16 (const int16_t * __ptr, int16x8x3_t __b, const int __c)
> +{
> +  __builtin_aarch64_simd_ci __o;
> +  int16x8x3_t ret;
> +  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __b.val[0], 0);
> +  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __b.val[1], 1);
> +  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __b.val[2], 2);
> +  __o = __builtin_aarch64_ld3_lanev8hi (
> +     (__builtin_aarch64_simd_hi *) __ptr, __o, __c);
> +  ret.val[0] = (int16x8_t) __builtin_aarch64_get_qregciv4si (__o, 0);
> +  ret.val[1] = (int16x8_t) __builtin_aarch64_get_qregciv4si (__o, 1);
> +  ret.val[2] = (int16x8_t) __builtin_aarch64_get_qregciv4si (__o, 2);
> +  return ret;
> +}
> +
> +__extension__ extern __inline int32x4x3_t
> +__attribute__ ((__always_inline__, __gnu_inline__,__artificial__))
> +vld3q_lane_s32 (const int32_t * __ptr, int32x4x3_t __b, const int __c)
> +{
> +  __builtin_aarch64_simd_ci __o;
> +  int32x4x3_t ret;
> +  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __b.val[0], 0);
> +  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __b.val[1], 1);
> +  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __b.val[2], 2);
> +  __o = __builtin_aarch64_ld3_lanev4si (
> +     (__builtin_aarch64_simd_si *) __ptr, __o, __c);
> +  ret.val[0] = (int32x4_t) __builtin_aarch64_get_qregciv4si (__o, 0);
> +  ret.val[1] = (int32x4_t) __builtin_aarch64_get_qregciv4si (__o, 1);
> +  ret.val[2] = (int32x4_t) __builtin_aarch64_get_qregciv4si (__o, 2);
> +  return ret;
> +}
> +
> +__extension__ extern __inline int64x2x3_t
> +__attribute__ ((__always_inline__, __gnu_inline__,__artificial__))
> +vld3q_lane_s64 (const int64_t * __ptr, int64x2x3_t __b, const int __c)
> +{
> +  __builtin_aarch64_simd_ci __o;
> +  int64x2x3_t ret;
> +  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __b.val[0], 0);
> +  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __b.val[1], 1);
> +  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __b.val[2], 2);
> +  __o = __builtin_aarch64_ld3_lanev2di (
> +     (__builtin_aarch64_simd_di *) __ptr, __o, __c);
> +  ret.val[0] = (int64x2_t) __builtin_aarch64_get_qregciv4si (__o, 0);
> +  ret.val[1] = (int64x2_t) __builtin_aarch64_get_qregciv4si (__o, 1);
> +  ret.val[2] = (int64x2_t) __builtin_aarch64_get_qregciv4si (__o, 2);
> +  return ret;
>  }
>  
> -__LD3Q_LANE_FUNC (float16x8x3_t, float16x8_t, float16_t, v8hf, hf, f16)
> -__LD3Q_LANE_FUNC (float32x4x3_t, float32x4_t, float32_t, v4sf, sf, f32)
> -__LD3Q_LANE_FUNC (float64x2x3_t, float64x2_t, float64_t, v2df, df, f64)
> -__LD3Q_LANE_FUNC (poly8x16x3_t, poly8x16_t, poly8_t, v16qi, qi, p8)
> -__LD3Q_LANE_FUNC (poly16x8x3_t, poly16x8_t, poly16_t, v8hi, hi, p16)
> -__LD3Q_LANE_FUNC (poly64x2x3_t, poly64x2_t, poly64_t, v2di, di, p64)
> -__LD3Q_LANE_FUNC (int8x16x3_t, int8x16_t, int8_t, v16qi, qi, s8)
> -__LD3Q_LANE_FUNC (int16x8x3_t, int16x8_t, int16_t, v8hi, hi, s16)
> -__LD3Q_LANE_FUNC (int32x4x3_t, int32x4_t, int32_t, v4si, si, s32)
> -__LD3Q_LANE_FUNC (int64x2x3_t, int64x2_t, int64_t, v2di, di, s64)
> -__LD3Q_LANE_FUNC (uint8x16x3_t, uint8x16_t, uint8_t, v16qi, qi, u8)
> -__LD3Q_LANE_FUNC (uint16x8x3_t, uint16x8_t, uint16_t, v8hi, hi, u16)
> -__LD3Q_LANE_FUNC (uint32x4x3_t, uint32x4_t, uint32_t, v4si, si, u32)
> -__LD3Q_LANE_FUNC (uint64x2x3_t, uint64x2_t, uint64_t, v2di, di, u64)
> +__extension__ extern __inline float16x8x3_t
> +__attribute__ ((__always_inline__, __gnu_inline__,__artificial__))
> +vld3q_lane_f16 (const float16_t * __ptr, float16x8x3_t __b, const int __c)
> +{
> +  __builtin_aarch64_simd_ci __o;
> +  float16x8x3_t ret;
> +  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __b.val[0], 0);
> +  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __b.val[1], 1);
> +  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __b.val[2], 2);
> +  __o = __builtin_aarch64_ld3_lanev8hf (
> +     (__builtin_aarch64_simd_hf *) __ptr, __o, __c);
> +  ret.val[0] = (float16x8_t) __builtin_aarch64_get_qregciv4si (__o, 0);
> +  ret.val[1] = (float16x8_t) __builtin_aarch64_get_qregciv4si (__o, 1);
> +  ret.val[2] = (float16x8_t) __builtin_aarch64_get_qregciv4si (__o, 2);
> +  return ret;
> +}
> +
> +__extension__ extern __inline float32x4x3_t
> +__attribute__ ((__always_inline__, __gnu_inline__,__artificial__))
> +vld3q_lane_f32 (const float32_t * __ptr, float32x4x3_t __b, const int __c)
> +{
> +  __builtin_aarch64_simd_ci __o;
> +  float32x4x3_t ret;
> +  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __b.val[0], 0);
> +  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __b.val[1], 1);
> +  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __b.val[2], 2);
> +  __o = __builtin_aarch64_ld3_lanev4sf (
> +     (__builtin_aarch64_simd_sf *) __ptr, __o, __c);
> +  ret.val[0] = (float32x4_t) __builtin_aarch64_get_qregciv4si (__o, 0);
> +  ret.val[1] = (float32x4_t) __builtin_aarch64_get_qregciv4si (__o, 1);
> +  ret.val[2] = (float32x4_t) __builtin_aarch64_get_qregciv4si (__o, 2);
> +  return ret;
> +}
> +
> +__extension__ extern __inline float64x2x3_t
> +__attribute__ ((__always_inline__, __gnu_inline__,__artificial__))
> +vld3q_lane_f64 (const float64_t * __ptr, float64x2x3_t __b, const int __c)
> +{
> +  __builtin_aarch64_simd_ci __o;
> +  float64x2x3_t ret;
> +  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __b.val[0], 0);
> +  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __b.val[1], 1);
> +  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __b.val[2], 2);
> +  __o = __builtin_aarch64_ld3_lanev2df (
> +     (__builtin_aarch64_simd_df *) __ptr, __o, __c);
> +  ret.val[0] = (float64x2_t) __builtin_aarch64_get_qregciv4si (__o, 0);
> +  ret.val[1] = (float64x2_t) __builtin_aarch64_get_qregciv4si (__o, 1);
> +  ret.val[2] = (float64x2_t) __builtin_aarch64_get_qregciv4si (__o, 2);
> +  return ret;
> +}
> +
> +__extension__ extern __inline poly8x16x3_t
> +__attribute__ ((__always_inline__, __gnu_inline__,__artificial__))
> +vld3q_lane_p8 (const poly8_t * __ptr, poly8x16x3_t __b, const int __c)
> +{
> +  __builtin_aarch64_simd_ci __o;
> +  poly8x16x3_t ret;
> +  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __b.val[0], 0);
> +  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __b.val[1], 1);
> +  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __b.val[2], 2);
> +  __o = __builtin_aarch64_ld3_lanev16qi (
> +     (__builtin_aarch64_simd_qi *) __ptr, __o, __c);
> +  ret.val[0] = (poly8x16_t) __builtin_aarch64_get_qregciv4si (__o, 0);
> +  ret.val[1] = (poly8x16_t) __builtin_aarch64_get_qregciv4si (__o, 1);
> +  ret.val[2] = (poly8x16_t) __builtin_aarch64_get_qregciv4si (__o, 2);
> +  return ret;
> +}
> +
> +__extension__ extern __inline poly16x8x3_t
> +__attribute__ ((__always_inline__, __gnu_inline__,__artificial__))
> +vld3q_lane_p16 (const poly16_t * __ptr, poly16x8x3_t __b, const int __c)
> +{
> +  __builtin_aarch64_simd_ci __o;
> +  poly16x8x3_t ret;
> +  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __b.val[0], 0);
> +  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __b.val[1], 1);
> +  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __b.val[2], 2);
> +  __o = __builtin_aarch64_ld3_lanev8hi (
> +     (__builtin_aarch64_simd_hi *) __ptr, __o, __c);
> +  ret.val[0] = (poly16x8_t) __builtin_aarch64_get_qregciv4si (__o, 0);
> +  ret.val[1] = (poly16x8_t) __builtin_aarch64_get_qregciv4si (__o, 1);
> +  ret.val[2] = (poly16x8_t) __builtin_aarch64_get_qregciv4si (__o, 2);
> +  return ret;
> +}
> +
> +__extension__ extern __inline poly64x2x3_t
> +__attribute__ ((__always_inline__, __gnu_inline__,__artificial__))
> +vld3q_lane_p64 (const poly64_t * __ptr, poly64x2x3_t __b, const int __c)
> +{
> +  __builtin_aarch64_simd_ci __o;
> +  poly64x2x3_t ret;
> +  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __b.val[0], 0);
> +  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __b.val[1], 1);
> +  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __b.val[2], 2);
> +  __o = __builtin_aarch64_ld3_lanev2di (
> +     (__builtin_aarch64_simd_di *) __ptr, __o, __c);
> +  ret.val[0] = (poly64x2_t) __builtin_aarch64_get_qregciv4si (__o, 0);
> +  ret.val[1] = (poly64x2_t) __builtin_aarch64_get_qregciv4si (__o, 1);
> +  ret.val[2] = (poly64x2_t) __builtin_aarch64_get_qregciv4si (__o, 2);
> +  return ret;
> +}
>  
>  /* vld4_lane */
>  
> @@ -34979,9 +35404,43 @@ vld2q_lane_bf16 (const bfloat16_t * __ptr, bfloat16x8x2_t __b, const int __c)
>    return ret;
>  }
>  
> -__LD3_LANE_FUNC (bfloat16x4x3_t, bfloat16x4_t, bfloat16x8x3_t, bfloat16_t, v4bf,
> -              v8bf, bf, bf16, bfloat16x8_t)
> -__LD3Q_LANE_FUNC (bfloat16x8x3_t, bfloat16x8_t, bfloat16_t, v8bf, bf, bf16)
> +__extension__ extern __inline bfloat16x4x3_t
> +__attribute__ ((__always_inline__, __gnu_inline__,__artificial__))
> +vld3_lane_bf16 (const bfloat16_t * __ptr, bfloat16x4x3_t __b, const int __c)
> +{
> +  __builtin_aarch64_simd_ci __o;
> +  bfloat16x8x3_t __temp;
> +  __temp.val[0] = vcombine_bf16 (__b.val[0], vcreate_bf16 (0));
> +  __temp.val[1] = vcombine_bf16 (__b.val[1], vcreate_bf16 (0));
> +  __temp.val[2] = vcombine_bf16 (__b.val[2], vcreate_bf16 (0));
> +  __o = __builtin_aarch64_set_qregciv8bf (__o, (bfloat16x8_t) __temp.val[0], 0);
> +  __o = __builtin_aarch64_set_qregciv8bf (__o, (bfloat16x8_t) __temp.val[1], 1);
> +  __o = __builtin_aarch64_set_qregciv8bf (__o, (bfloat16x8_t) __temp.val[2], 2);
> +  __o = __builtin_aarch64_ld3_lanev4bf (
> +       (__builtin_aarch64_simd_bf *) __ptr, __o, __c);
> +  __b.val[0] = (bfloat16x4_t) __builtin_aarch64_get_dregcidi (__o, 0);
> +  __b.val[1] = (bfloat16x4_t) __builtin_aarch64_get_dregcidi (__o, 1);
> +  __b.val[2] = (bfloat16x4_t) __builtin_aarch64_get_dregcidi (__o, 2);
> +  return __b;
> +}
> +
> +__extension__ extern __inline bfloat16x8x3_t
> +__attribute__ ((__always_inline__, __gnu_inline__,__artificial__))
> +vld3q_lane_bf16 (const bfloat16_t * __ptr, bfloat16x8x3_t __b, const int __c)
> +{
> +  __builtin_aarch64_simd_ci __o;
> +  bfloat16x8x3_t ret;
> +  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __b.val[0], 0);
> +  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __b.val[1], 1);
> +  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __b.val[2], 2);
> +  __o = __builtin_aarch64_ld3_lanev8bf (
> +     (__builtin_aarch64_simd_bf *) __ptr, __o, __c);
> +  ret.val[0] = (bfloat16x8_t) __builtin_aarch64_get_qregciv4si (__o, 0);
> +  ret.val[1] = (bfloat16x8_t) __builtin_aarch64_get_qregciv4si (__o, 1);
> +  ret.val[2] = (bfloat16x8_t) __builtin_aarch64_get_qregciv4si (__o, 2);
> +  return ret;
> +}
> +
>  __LD4_LANE_FUNC (bfloat16x4x4_t, bfloat16x4_t, bfloat16x8x4_t, bfloat16_t, v4bf,
>                v8bf, bf, bf16, bfloat16x8_t)
>  __LD4Q_LANE_FUNC (bfloat16x8x4_t, bfloat16x8_t, bfloat16_t, v8bf, bf, bf16)
> @@ -35280,8 +35739,6 @@ vaddq_p128 (poly128_t __a, poly128_t __b)
>  #undef __aarch64_vdupq_laneq_u32
>  #undef __aarch64_vdupq_laneq_u64
>  
> -#undef __LD3_LANE_FUNC
> -#undef __LD3Q_LANE_FUNC
>  #undef __LD4_LANE_FUNC
>  #undef __LD4Q_LANE_FUNC
>  
