Hi all, Second version of the patch here implementing the bfloat16_t neon related load intrinsics: vld2_lane_bf16, vld2q_lane_bf16, vld3_lane_bf16, vld3q_lane_bf16 vld4_lane_bf16, vld4q_lane_bf16.
This better narrows testcases so they do not cause regressions for the arm backend where these intrinsics are not yet present. Please see refer to: ACLE <https://developer.arm.com/docs/101028/latest> ISA <https://developer.arm.com/docs/ddi0596/latest> Okay for trunk? Thanks! Andrea
>From 08bd8d745bc46ca4b9dd24906dea2743dda66cc5 Mon Sep 17 00:00:00 2001 From: Andrea Corallo <andrea.cora...@arm.com> Date: Thu, 15 Oct 2020 10:16:18 +0200 Subject: [PATCH] aarch64: Add bfloat16 vldN_lane_bf16 + vldNq_lane_bf16 intrisics gcc/ChangeLog 2020-10-15 Andrea Corallo <andrea.cora...@arm.com> * config/aarch64/arm_neon.h (__LDX_LANE_FUNC): Move to the bottom of the file so we can use these also for defining the bf16 related intrinsics. (vld2_lane_bf16, vld2q_lane_bf16, vld3_lane_bf16, vld3q_lane_bf16) (vld4_lane_bf16, vld4q_lane_bf16): Add new intrinsics. gcc/testsuite/ChangeLog 2020-10-15 Andrea Corallo <andrea.cora...@arm.com> * gcc.target/aarch64/advsimd-intrinsics/bf16_vldN_lane_1.c: New testcase. * gcc.target/aarch64/advsimd-intrinsics/bf16_vldN_lane_2.c: Likewise. * gcc.target/aarch64/advsimd-intrinsics/vld2_lane_bf16_indices_1.c: Likewise. * gcc.target/aarch64/advsimd-intrinsics/vld2q_lane_bf16_indices_1.c: Likewise. * gcc.target/aarch64/advsimd-intrinsics/vld3_lane_bf16_indices_1.c: Likewise. * gcc.target/aarch64/advsimd-intrinsics/vld3q_lane_bf16_indices_1.c: Likewise. * gcc.target/aarch64/advsimd-intrinsics/vld4_lane_bf16_indices_1.c: Likewise. * gcc.target/aarch64/advsimd-intrinsics/vld4q_lane_bf16_indices_1.c: Likewise. --- gcc/config/aarch64/arm_neon.h | 792 +++++++++--------- .../advsimd-intrinsics/bf16_vldN_lane_1.c | 74 ++ .../advsimd-intrinsics/bf16_vldN_lane_2.c | 52 ++ .../vld2_lane_bf16_indices_1.c | 17 + .../vld2q_lane_bf16_indices_1.c | 17 + .../vld3_lane_bf16_indices_1.c | 17 + .../vld3q_lane_bf16_indices_1.c | 17 + .../vld4_lane_bf16_indices_1.c | 17 + .../vld4q_lane_bf16_indices_1.c | 17 + 9 files changed, 629 insertions(+), 391 deletions(-) create mode 100644 gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bf16_vldN_lane_1.c create mode 100644 gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bf16_vldN_lane_2.c create mode 100644 gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld2_lane_bf16_indices_1.c create mode 100644 gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld2q_lane_bf16_indices_1.c create mode 100644 gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld3_lane_bf16_indices_1.c create mode 100644 gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld3q_lane_bf16_indices_1.c create mode 100644 gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld4_lane_bf16_indices_1.c create mode 100644 gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld4q_lane_bf16_indices_1.c diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h index d943f63a274..2bb20e15069 100644 --- a/gcc/config/aarch64/arm_neon.h +++ b/gcc/config/aarch64/arm_neon.h @@ -20792,311 +20792,6 @@ vld4q_dup_p64 (const poly64_t * __a) return ret; } -/* vld2_lane */ - -#define __LD2_LANE_FUNC(intype, vectype, largetype, ptrtype, mode, \ - qmode, ptrmode, funcsuffix, signedtype) \ -__extension__ extern __inline intype \ -__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) \ -vld2_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c) \ -{ \ - __builtin_aarch64_simd_oi __o; \ - largetype __temp; \ - __temp.val[0] = \ - vcombine_##funcsuffix (__b.val[0], vcreate_##funcsuffix (0)); \ - __temp.val[1] = \ - vcombine_##funcsuffix (__b.val[1], vcreate_##funcsuffix (0)); \ - __o = __builtin_aarch64_set_qregoi##qmode (__o, \ - (signedtype) __temp.val[0], \ - 0); \ - __o = __builtin_aarch64_set_qregoi##qmode (__o, \ - (signedtype) __temp.val[1], \ - 1); \ - __o = __builtin_aarch64_ld2_lane##mode ( \ - (__builtin_aarch64_simd_##ptrmode *) __ptr, __o, __c); \ - __b.val[0] = (vectype) __builtin_aarch64_get_dregoidi (__o, 0); \ - __b.val[1] = (vectype) __builtin_aarch64_get_dregoidi (__o, 1); \ - return __b; \ -} - -__LD2_LANE_FUNC (float16x4x2_t, float16x4_t, float16x8x2_t, float16_t, v4hf, - v8hf, hf, f16, float16x8_t) -__LD2_LANE_FUNC (float32x2x2_t, float32x2_t, float32x4x2_t, float32_t, v2sf, v4sf, - sf, f32, float32x4_t) -__LD2_LANE_FUNC (float64x1x2_t, float64x1_t, float64x2x2_t, float64_t, df, v2df, - df, f64, float64x2_t) -__LD2_LANE_FUNC (poly8x8x2_t, poly8x8_t, poly8x16x2_t, poly8_t, v8qi, v16qi, qi, p8, - int8x16_t) -__LD2_LANE_FUNC (poly16x4x2_t, poly16x4_t, poly16x8x2_t, poly16_t, v4hi, v8hi, hi, - p16, int16x8_t) -__LD2_LANE_FUNC (poly64x1x2_t, poly64x1_t, poly64x2x2_t, poly64_t, di, - v2di_ssps, di, p64, poly64x2_t) -__LD2_LANE_FUNC (int8x8x2_t, int8x8_t, int8x16x2_t, int8_t, v8qi, v16qi, qi, s8, - int8x16_t) -__LD2_LANE_FUNC (int16x4x2_t, int16x4_t, int16x8x2_t, int16_t, v4hi, v8hi, hi, s16, - int16x8_t) -__LD2_LANE_FUNC (int32x2x2_t, int32x2_t, int32x4x2_t, int32_t, v2si, v4si, si, s32, - int32x4_t) -__LD2_LANE_FUNC (int64x1x2_t, int64x1_t, int64x2x2_t, int64_t, di, v2di, di, s64, - int64x2_t) -__LD2_LANE_FUNC (uint8x8x2_t, uint8x8_t, uint8x16x2_t, uint8_t, v8qi, v16qi, qi, u8, - int8x16_t) -__LD2_LANE_FUNC (uint16x4x2_t, uint16x4_t, uint16x8x2_t, uint16_t, v4hi, v8hi, hi, - u16, int16x8_t) -__LD2_LANE_FUNC (uint32x2x2_t, uint32x2_t, uint32x4x2_t, uint32_t, v2si, v4si, si, - u32, int32x4_t) -__LD2_LANE_FUNC (uint64x1x2_t, uint64x1_t, uint64x2x2_t, uint64_t, di, v2di, di, - u64, int64x2_t) - -#undef __LD2_LANE_FUNC - -/* vld2q_lane */ - -#define __LD2_LANE_FUNC(intype, vtype, ptrtype, mode, ptrmode, funcsuffix) \ -__extension__ extern __inline intype \ -__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) \ -vld2q_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c) \ -{ \ - __builtin_aarch64_simd_oi __o; \ - intype ret; \ - __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) __b.val[0], 0); \ - __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) __b.val[1], 1); \ - __o = __builtin_aarch64_ld2_lane##mode ( \ - (__builtin_aarch64_simd_##ptrmode *) __ptr, __o, __c); \ - ret.val[0] = (vtype) __builtin_aarch64_get_qregoiv4si (__o, 0); \ - ret.val[1] = (vtype) __builtin_aarch64_get_qregoiv4si (__o, 1); \ - return ret; \ -} - -__LD2_LANE_FUNC (float16x8x2_t, float16x8_t, float16_t, v8hf, hf, f16) -__LD2_LANE_FUNC (float32x4x2_t, float32x4_t, float32_t, v4sf, sf, f32) -__LD2_LANE_FUNC (float64x2x2_t, float64x2_t, float64_t, v2df, df, f64) -__LD2_LANE_FUNC (poly8x16x2_t, poly8x16_t, poly8_t, v16qi, qi, p8) -__LD2_LANE_FUNC (poly16x8x2_t, poly16x8_t, poly16_t, v8hi, hi, p16) -__LD2_LANE_FUNC (poly64x2x2_t, poly64x2_t, poly64_t, v2di, di, p64) -__LD2_LANE_FUNC (int8x16x2_t, int8x16_t, int8_t, v16qi, qi, s8) -__LD2_LANE_FUNC (int16x8x2_t, int16x8_t, int16_t, v8hi, hi, s16) -__LD2_LANE_FUNC (int32x4x2_t, int32x4_t, int32_t, v4si, si, s32) -__LD2_LANE_FUNC (int64x2x2_t, int64x2_t, int64_t, v2di, di, s64) -__LD2_LANE_FUNC (uint8x16x2_t, uint8x16_t, uint8_t, v16qi, qi, u8) -__LD2_LANE_FUNC (uint16x8x2_t, uint16x8_t, uint16_t, v8hi, hi, u16) -__LD2_LANE_FUNC (uint32x4x2_t, uint32x4_t, uint32_t, v4si, si, u32) -__LD2_LANE_FUNC (uint64x2x2_t, uint64x2_t, uint64_t, v2di, di, u64) - -#undef __LD2_LANE_FUNC - -/* vld3_lane */ - -#define __LD3_LANE_FUNC(intype, vectype, largetype, ptrtype, mode, \ - qmode, ptrmode, funcsuffix, signedtype) \ -__extension__ extern __inline intype \ -__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) \ -vld3_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c) \ -{ \ - __builtin_aarch64_simd_ci __o; \ - largetype __temp; \ - __temp.val[0] = \ - vcombine_##funcsuffix (__b.val[0], vcreate_##funcsuffix (0)); \ - __temp.val[1] = \ - vcombine_##funcsuffix (__b.val[1], vcreate_##funcsuffix (0)); \ - __temp.val[2] = \ - vcombine_##funcsuffix (__b.val[2], vcreate_##funcsuffix (0)); \ - __o = __builtin_aarch64_set_qregci##qmode (__o, \ - (signedtype) __temp.val[0], \ - 0); \ - __o = __builtin_aarch64_set_qregci##qmode (__o, \ - (signedtype) __temp.val[1], \ - 1); \ - __o = __builtin_aarch64_set_qregci##qmode (__o, \ - (signedtype) __temp.val[2], \ - 2); \ - __o = __builtin_aarch64_ld3_lane##mode ( \ - (__builtin_aarch64_simd_##ptrmode *) __ptr, __o, __c); \ - __b.val[0] = (vectype) __builtin_aarch64_get_dregcidi (__o, 0); \ - __b.val[1] = (vectype) __builtin_aarch64_get_dregcidi (__o, 1); \ - __b.val[2] = (vectype) __builtin_aarch64_get_dregcidi (__o, 2); \ - return __b; \ -} - -__LD3_LANE_FUNC (float16x4x3_t, float16x4_t, float16x8x3_t, float16_t, v4hf, - v8hf, hf, f16, float16x8_t) -__LD3_LANE_FUNC (float32x2x3_t, float32x2_t, float32x4x3_t, float32_t, v2sf, v4sf, - sf, f32, float32x4_t) -__LD3_LANE_FUNC (float64x1x3_t, float64x1_t, float64x2x3_t, float64_t, df, v2df, - df, f64, float64x2_t) -__LD3_LANE_FUNC (poly8x8x3_t, poly8x8_t, poly8x16x3_t, poly8_t, v8qi, v16qi, qi, p8, - int8x16_t) -__LD3_LANE_FUNC (poly16x4x3_t, poly16x4_t, poly16x8x3_t, poly16_t, v4hi, v8hi, hi, - p16, int16x8_t) -__LD3_LANE_FUNC (poly64x1x3_t, poly64x1_t, poly64x2x3_t, poly64_t, di, - v2di_ssps, di, p64, poly64x2_t) -__LD3_LANE_FUNC (int8x8x3_t, int8x8_t, int8x16x3_t, int8_t, v8qi, v16qi, qi, s8, - int8x16_t) -__LD3_LANE_FUNC (int16x4x3_t, int16x4_t, int16x8x3_t, int16_t, v4hi, v8hi, hi, s16, - int16x8_t) -__LD3_LANE_FUNC (int32x2x3_t, int32x2_t, int32x4x3_t, int32_t, v2si, v4si, si, s32, - int32x4_t) -__LD3_LANE_FUNC (int64x1x3_t, int64x1_t, int64x2x3_t, int64_t, di, v2di, di, s64, - int64x2_t) -__LD3_LANE_FUNC (uint8x8x3_t, uint8x8_t, uint8x16x3_t, uint8_t, v8qi, v16qi, qi, u8, - int8x16_t) -__LD3_LANE_FUNC (uint16x4x3_t, uint16x4_t, uint16x8x3_t, uint16_t, v4hi, v8hi, hi, - u16, int16x8_t) -__LD3_LANE_FUNC (uint32x2x3_t, uint32x2_t, uint32x4x3_t, uint32_t, v2si, v4si, si, - u32, int32x4_t) -__LD3_LANE_FUNC (uint64x1x3_t, uint64x1_t, uint64x2x3_t, uint64_t, di, v2di, di, - u64, int64x2_t) - -#undef __LD3_LANE_FUNC - -/* vld3q_lane */ - -#define __LD3_LANE_FUNC(intype, vtype, ptrtype, mode, ptrmode, funcsuffix) \ -__extension__ extern __inline intype \ -__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) \ -vld3q_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c) \ -{ \ - __builtin_aarch64_simd_ci __o; \ - intype ret; \ - __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __b.val[0], 0); \ - __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __b.val[1], 1); \ - __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __b.val[2], 2); \ - __o = __builtin_aarch64_ld3_lane##mode ( \ - (__builtin_aarch64_simd_##ptrmode *) __ptr, __o, __c); \ - ret.val[0] = (vtype) __builtin_aarch64_get_qregciv4si (__o, 0); \ - ret.val[1] = (vtype) __builtin_aarch64_get_qregciv4si (__o, 1); \ - ret.val[2] = (vtype) __builtin_aarch64_get_qregciv4si (__o, 2); \ - return ret; \ -} - -__LD3_LANE_FUNC (float16x8x3_t, float16x8_t, float16_t, v8hf, hf, f16) -__LD3_LANE_FUNC (float32x4x3_t, float32x4_t, float32_t, v4sf, sf, f32) -__LD3_LANE_FUNC (float64x2x3_t, float64x2_t, float64_t, v2df, df, f64) -__LD3_LANE_FUNC (poly8x16x3_t, poly8x16_t, poly8_t, v16qi, qi, p8) -__LD3_LANE_FUNC (poly16x8x3_t, poly16x8_t, poly16_t, v8hi, hi, p16) -__LD3_LANE_FUNC (poly64x2x3_t, poly64x2_t, poly64_t, v2di, di, p64) -__LD3_LANE_FUNC (int8x16x3_t, int8x16_t, int8_t, v16qi, qi, s8) -__LD3_LANE_FUNC (int16x8x3_t, int16x8_t, int16_t, v8hi, hi, s16) -__LD3_LANE_FUNC (int32x4x3_t, int32x4_t, int32_t, v4si, si, s32) -__LD3_LANE_FUNC (int64x2x3_t, int64x2_t, int64_t, v2di, di, s64) -__LD3_LANE_FUNC (uint8x16x3_t, uint8x16_t, uint8_t, v16qi, qi, u8) -__LD3_LANE_FUNC (uint16x8x3_t, uint16x8_t, uint16_t, v8hi, hi, u16) -__LD3_LANE_FUNC (uint32x4x3_t, uint32x4_t, uint32_t, v4si, si, u32) -__LD3_LANE_FUNC (uint64x2x3_t, uint64x2_t, uint64_t, v2di, di, u64) - -#undef __LD3_LANE_FUNC - -/* vld4_lane */ - -#define __LD4_LANE_FUNC(intype, vectype, largetype, ptrtype, mode, \ - qmode, ptrmode, funcsuffix, signedtype) \ -__extension__ extern __inline intype \ -__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) \ -vld4_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c) \ -{ \ - __builtin_aarch64_simd_xi __o; \ - largetype __temp; \ - __temp.val[0] = \ - vcombine_##funcsuffix (__b.val[0], vcreate_##funcsuffix (0)); \ - __temp.val[1] = \ - vcombine_##funcsuffix (__b.val[1], vcreate_##funcsuffix (0)); \ - __temp.val[2] = \ - vcombine_##funcsuffix (__b.val[2], vcreate_##funcsuffix (0)); \ - __temp.val[3] = \ - vcombine_##funcsuffix (__b.val[3], vcreate_##funcsuffix (0)); \ - __o = __builtin_aarch64_set_qregxi##qmode (__o, \ - (signedtype) __temp.val[0], \ - 0); \ - __o = __builtin_aarch64_set_qregxi##qmode (__o, \ - (signedtype) __temp.val[1], \ - 1); \ - __o = __builtin_aarch64_set_qregxi##qmode (__o, \ - (signedtype) __temp.val[2], \ - 2); \ - __o = __builtin_aarch64_set_qregxi##qmode (__o, \ - (signedtype) __temp.val[3], \ - 3); \ - __o = __builtin_aarch64_ld4_lane##mode ( \ - (__builtin_aarch64_simd_##ptrmode *) __ptr, __o, __c); \ - __b.val[0] = (vectype) __builtin_aarch64_get_dregxidi (__o, 0); \ - __b.val[1] = (vectype) __builtin_aarch64_get_dregxidi (__o, 1); \ - __b.val[2] = (vectype) __builtin_aarch64_get_dregxidi (__o, 2); \ - __b.val[3] = (vectype) __builtin_aarch64_get_dregxidi (__o, 3); \ - return __b; \ -} - -/* vld4q_lane */ - -__LD4_LANE_FUNC (float16x4x4_t, float16x4_t, float16x8x4_t, float16_t, v4hf, - v8hf, hf, f16, float16x8_t) -__LD4_LANE_FUNC (float32x2x4_t, float32x2_t, float32x4x4_t, float32_t, v2sf, v4sf, - sf, f32, float32x4_t) -__LD4_LANE_FUNC (float64x1x4_t, float64x1_t, float64x2x4_t, float64_t, df, v2df, - df, f64, float64x2_t) -__LD4_LANE_FUNC (poly8x8x4_t, poly8x8_t, poly8x16x4_t, poly8_t, v8qi, v16qi, qi, p8, - int8x16_t) -__LD4_LANE_FUNC (poly16x4x4_t, poly16x4_t, poly16x8x4_t, poly16_t, v4hi, v8hi, hi, - p16, int16x8_t) -__LD4_LANE_FUNC (poly64x1x4_t, poly64x1_t, poly64x2x4_t, poly64_t, di, - v2di_ssps, di, p64, poly64x2_t) -__LD4_LANE_FUNC (int8x8x4_t, int8x8_t, int8x16x4_t, int8_t, v8qi, v16qi, qi, s8, - int8x16_t) -__LD4_LANE_FUNC (int16x4x4_t, int16x4_t, int16x8x4_t, int16_t, v4hi, v8hi, hi, s16, - int16x8_t) -__LD4_LANE_FUNC (int32x2x4_t, int32x2_t, int32x4x4_t, int32_t, v2si, v4si, si, s32, - int32x4_t) -__LD4_LANE_FUNC (int64x1x4_t, int64x1_t, int64x2x4_t, int64_t, di, v2di, di, s64, - int64x2_t) -__LD4_LANE_FUNC (uint8x8x4_t, uint8x8_t, uint8x16x4_t, uint8_t, v8qi, v16qi, qi, u8, - int8x16_t) -__LD4_LANE_FUNC (uint16x4x4_t, uint16x4_t, uint16x8x4_t, uint16_t, v4hi, v8hi, hi, - u16, int16x8_t) -__LD4_LANE_FUNC (uint32x2x4_t, uint32x2_t, uint32x4x4_t, uint32_t, v2si, v4si, si, - u32, int32x4_t) -__LD4_LANE_FUNC (uint64x1x4_t, uint64x1_t, uint64x2x4_t, uint64_t, di, v2di, di, - u64, int64x2_t) - -#undef __LD4_LANE_FUNC - -/* vld4q_lane */ - -#define __LD4_LANE_FUNC(intype, vtype, ptrtype, mode, ptrmode, funcsuffix) \ -__extension__ extern __inline intype \ -__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) \ -vld4q_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c) \ -{ \ - __builtin_aarch64_simd_xi __o; \ - intype ret; \ - __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[0], 0); \ - __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[1], 1); \ - __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[2], 2); \ - __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[3], 3); \ - __o = __builtin_aarch64_ld4_lane##mode ( \ - (__builtin_aarch64_simd_##ptrmode *) __ptr, __o, __c); \ - ret.val[0] = (vtype) __builtin_aarch64_get_qregxiv4si (__o, 0); \ - ret.val[1] = (vtype) __builtin_aarch64_get_qregxiv4si (__o, 1); \ - ret.val[2] = (vtype) __builtin_aarch64_get_qregxiv4si (__o, 2); \ - ret.val[3] = (vtype) __builtin_aarch64_get_qregxiv4si (__o, 3); \ - return ret; \ -} - -__LD4_LANE_FUNC (float16x8x4_t, float16x8_t, float16_t, v8hf, hf, f16) -__LD4_LANE_FUNC (float32x4x4_t, float32x4_t, float32_t, v4sf, sf, f32) -__LD4_LANE_FUNC (float64x2x4_t, float64x2_t, float64_t, v2df, df, f64) -__LD4_LANE_FUNC (poly8x16x4_t, poly8x16_t, poly8_t, v16qi, qi, p8) -__LD4_LANE_FUNC (poly16x8x4_t, poly16x8_t, poly16_t, v8hi, hi, p16) -__LD4_LANE_FUNC (poly64x2x4_t, poly64x2_t, poly64_t, v2di, di, p64) -__LD4_LANE_FUNC (int8x16x4_t, int8x16_t, int8_t, v16qi, qi, s8) -__LD4_LANE_FUNC (int16x8x4_t, int16x8_t, int16_t, v8hi, hi, s16) -__LD4_LANE_FUNC (int32x4x4_t, int32x4_t, int32_t, v4si, si, s32) -__LD4_LANE_FUNC (int64x2x4_t, int64x2_t, int64_t, v2di, di, s64) -__LD4_LANE_FUNC (uint8x16x4_t, uint8x16_t, uint8_t, v16qi, qi, u8) -__LD4_LANE_FUNC (uint16x8x4_t, uint16x8_t, uint16_t, v8hi, hi, u16) -__LD4_LANE_FUNC (uint32x4x4_t, uint32x4_t, uint32_t, v4si, si, u32) -__LD4_LANE_FUNC (uint64x2x4_t, uint64x2_t, uint64_t, v2di, di, u64) - -#undef __LD4_LANE_FUNC - /* vmax */ __extension__ extern __inline float32x2_t @@ -35768,110 +35463,425 @@ vusdotq_laneq_s32 (int32x4_t __r, uint8x16_t __a, int8x16_t __b, return __builtin_aarch64_usdot_laneqv16qi_ssuss (__r, __a, __b, __index); } -__extension__ extern __inline int32x2_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vsudot_lane_s32 (int32x2_t __r, int8x8_t __a, uint8x8_t __b, const int __index) -{ - return __builtin_aarch64_sudot_lanev8qi_sssus (__r, __a, __b, __index); -} +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vsudot_lane_s32 (int32x2_t __r, int8x8_t __a, uint8x8_t __b, const int __index) +{ + return __builtin_aarch64_sudot_lanev8qi_sssus (__r, __a, __b, __index); +} + +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vsudot_laneq_s32 (int32x2_t __r, int8x8_t __a, uint8x16_t __b, + const int __index) +{ + return __builtin_aarch64_sudot_laneqv8qi_sssus (__r, __a, __b, __index); +} + +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vsudotq_lane_s32 (int32x4_t __r, int8x16_t __a, uint8x8_t __b, + const int __index) +{ + return __builtin_aarch64_sudot_lanev16qi_sssus (__r, __a, __b, __index); +} + +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vsudotq_laneq_s32 (int32x4_t __r, int8x16_t __a, uint8x16_t __b, + const int __index) +{ + return __builtin_aarch64_sudot_laneqv16qi_sssus (__r, __a, __b, __index); +} + +/* Matrix Multiply-Accumulate. */ + +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmmlaq_s32 (int32x4_t __r, int8x16_t __a, int8x16_t __b) +{ + return __builtin_aarch64_simd_smmlav16qi (__r, __a, __b); +} + +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmmlaq_u32 (uint32x4_t __r, uint8x16_t __a, uint8x16_t __b) +{ + return __builtin_aarch64_simd_ummlav16qi_uuuu (__r, __a, __b); +} + +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vusmmlaq_s32 (int32x4_t __r, uint8x16_t __a, int8x16_t __b) +{ + return __builtin_aarch64_simd_usmmlav16qi_ssus (__r, __a, __b); +} + +#pragma GCC pop_options + +__extension__ extern __inline poly8x8_t +__attribute ((__always_inline__, __gnu_inline__, __artificial__)) +vadd_p8 (poly8x8_t __a, poly8x8_t __b) +{ + return __a ^ __b; +} + +__extension__ extern __inline poly16x4_t +__attribute ((__always_inline__, __gnu_inline__, __artificial__)) +vadd_p16 (poly16x4_t __a, poly16x4_t __b) +{ + return __a ^ __b; +} + +__extension__ extern __inline poly64x1_t +__attribute ((__always_inline__, __gnu_inline__, __artificial__)) +vadd_p64 (poly64x1_t __a, poly64x1_t __b) +{ + return __a ^ __b; +} + +__extension__ extern __inline poly8x16_t +__attribute ((__always_inline__, __gnu_inline__, __artificial__)) +vaddq_p8 (poly8x16_t __a, poly8x16_t __b) +{ + return __a ^ __b; +} + +__extension__ extern __inline poly16x8_t +__attribute ((__always_inline__, __gnu_inline__, __artificial__)) +vaddq_p16 (poly16x8_t __a, poly16x8_t __b) +{ + return __a ^__b; +} + +__extension__ extern __inline poly64x2_t +__attribute ((__always_inline__, __gnu_inline__, __artificial__)) +vaddq_p64 (poly64x2_t __a, poly64x2_t __b) +{ + return __a ^ __b; +} + +__extension__ extern __inline poly128_t +__attribute ((__always_inline__, __gnu_inline__, __artificial__)) +vaddq_p128 (poly128_t __a, poly128_t __b) +{ + return __a ^ __b; +} + +/* vld2_lane */ + +#define __LD2_LANE_FUNC(intype, vectype, largetype, ptrtype, mode, \ + qmode, ptrmode, funcsuffix, signedtype) \ +__extension__ extern __inline intype \ +__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) \ +vld2_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c) \ +{ \ + __builtin_aarch64_simd_oi __o; \ + largetype __temp; \ + __temp.val[0] = \ + vcombine_##funcsuffix (__b.val[0], vcreate_##funcsuffix (0)); \ + __temp.val[1] = \ + vcombine_##funcsuffix (__b.val[1], vcreate_##funcsuffix (0)); \ + __o = __builtin_aarch64_set_qregoi##qmode (__o, \ + (signedtype) __temp.val[0], \ + 0); \ + __o = __builtin_aarch64_set_qregoi##qmode (__o, \ + (signedtype) __temp.val[1], \ + 1); \ + __o = __builtin_aarch64_ld2_lane##mode ( \ + (__builtin_aarch64_simd_##ptrmode *) __ptr, __o, __c); \ + __b.val[0] = (vectype) __builtin_aarch64_get_dregoidi (__o, 0); \ + __b.val[1] = (vectype) __builtin_aarch64_get_dregoidi (__o, 1); \ + return __b; \ +} + +__LD2_LANE_FUNC (float16x4x2_t, float16x4_t, float16x8x2_t, float16_t, v4hf, + v8hf, hf, f16, float16x8_t) +__LD2_LANE_FUNC (float32x2x2_t, float32x2_t, float32x4x2_t, float32_t, v2sf, v4sf, + sf, f32, float32x4_t) +__LD2_LANE_FUNC (float64x1x2_t, float64x1_t, float64x2x2_t, float64_t, df, v2df, + df, f64, float64x2_t) +__LD2_LANE_FUNC (poly8x8x2_t, poly8x8_t, poly8x16x2_t, poly8_t, v8qi, v16qi, qi, p8, + int8x16_t) +__LD2_LANE_FUNC (poly16x4x2_t, poly16x4_t, poly16x8x2_t, poly16_t, v4hi, v8hi, hi, + p16, int16x8_t) +__LD2_LANE_FUNC (poly64x1x2_t, poly64x1_t, poly64x2x2_t, poly64_t, di, + v2di_ssps, di, p64, poly64x2_t) +__LD2_LANE_FUNC (int8x8x2_t, int8x8_t, int8x16x2_t, int8_t, v8qi, v16qi, qi, s8, + int8x16_t) +__LD2_LANE_FUNC (int16x4x2_t, int16x4_t, int16x8x2_t, int16_t, v4hi, v8hi, hi, s16, + int16x8_t) +__LD2_LANE_FUNC (int32x2x2_t, int32x2_t, int32x4x2_t, int32_t, v2si, v4si, si, s32, + int32x4_t) +__LD2_LANE_FUNC (int64x1x2_t, int64x1_t, int64x2x2_t, int64_t, di, v2di, di, s64, + int64x2_t) +__LD2_LANE_FUNC (uint8x8x2_t, uint8x8_t, uint8x16x2_t, uint8_t, v8qi, v16qi, qi, u8, + int8x16_t) +__LD2_LANE_FUNC (uint16x4x2_t, uint16x4_t, uint16x8x2_t, uint16_t, v4hi, v8hi, hi, + u16, int16x8_t) +__LD2_LANE_FUNC (uint32x2x2_t, uint32x2_t, uint32x4x2_t, uint32_t, v2si, v4si, si, + u32, int32x4_t) +__LD2_LANE_FUNC (uint64x1x2_t, uint64x1_t, uint64x2x2_t, uint64_t, di, v2di, di, + u64, int64x2_t) +__LD2_LANE_FUNC (bfloat16x4x2_t, bfloat16x4_t, bfloat16x8x2_t, bfloat16_t, v4bf, + v8bf, bf, bf16, bfloat16x8_t) + +#undef __LD2_LANE_FUNC + +/* vld2q_lane */ + +#define __LD2_LANE_FUNC(intype, vtype, ptrtype, mode, ptrmode, funcsuffix) \ +__extension__ extern __inline intype \ +__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) \ +vld2q_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c) \ +{ \ + __builtin_aarch64_simd_oi __o; \ + intype ret; \ + __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) __b.val[0], 0); \ + __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) __b.val[1], 1); \ + __o = __builtin_aarch64_ld2_lane##mode ( \ + (__builtin_aarch64_simd_##ptrmode *) __ptr, __o, __c); \ + ret.val[0] = (vtype) __builtin_aarch64_get_qregoiv4si (__o, 0); \ + ret.val[1] = (vtype) __builtin_aarch64_get_qregoiv4si (__o, 1); \ + return ret; \ +} + +__LD2_LANE_FUNC (float16x8x2_t, float16x8_t, float16_t, v8hf, hf, f16) +__LD2_LANE_FUNC (float32x4x2_t, float32x4_t, float32_t, v4sf, sf, f32) +__LD2_LANE_FUNC (float64x2x2_t, float64x2_t, float64_t, v2df, df, f64) +__LD2_LANE_FUNC (poly8x16x2_t, poly8x16_t, poly8_t, v16qi, qi, p8) +__LD2_LANE_FUNC (poly16x8x2_t, poly16x8_t, poly16_t, v8hi, hi, p16) +__LD2_LANE_FUNC (poly64x2x2_t, poly64x2_t, poly64_t, v2di, di, p64) +__LD2_LANE_FUNC (int8x16x2_t, int8x16_t, int8_t, v16qi, qi, s8) +__LD2_LANE_FUNC (int16x8x2_t, int16x8_t, int16_t, v8hi, hi, s16) +__LD2_LANE_FUNC (int32x4x2_t, int32x4_t, int32_t, v4si, si, s32) +__LD2_LANE_FUNC (int64x2x2_t, int64x2_t, int64_t, v2di, di, s64) +__LD2_LANE_FUNC (uint8x16x2_t, uint8x16_t, uint8_t, v16qi, qi, u8) +__LD2_LANE_FUNC (uint16x8x2_t, uint16x8_t, uint16_t, v8hi, hi, u16) +__LD2_LANE_FUNC (uint32x4x2_t, uint32x4_t, uint32_t, v4si, si, u32) +__LD2_LANE_FUNC (uint64x2x2_t, uint64x2_t, uint64_t, v2di, di, u64) +__LD2_LANE_FUNC (bfloat16x8x2_t, bfloat16x8_t, bfloat16_t, v8bf, bf, bf16) + +#undef __LD2_LANE_FUNC + +/* vld3_lane */ + +#define __LD3_LANE_FUNC(intype, vectype, largetype, ptrtype, mode, \ + qmode, ptrmode, funcsuffix, signedtype) \ +__extension__ extern __inline intype \ +__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) \ +vld3_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c) \ +{ \ + __builtin_aarch64_simd_ci __o; \ + largetype __temp; \ + __temp.val[0] = \ + vcombine_##funcsuffix (__b.val[0], vcreate_##funcsuffix (0)); \ + __temp.val[1] = \ + vcombine_##funcsuffix (__b.val[1], vcreate_##funcsuffix (0)); \ + __temp.val[2] = \ + vcombine_##funcsuffix (__b.val[2], vcreate_##funcsuffix (0)); \ + __o = __builtin_aarch64_set_qregci##qmode (__o, \ + (signedtype) __temp.val[0], \ + 0); \ + __o = __builtin_aarch64_set_qregci##qmode (__o, \ + (signedtype) __temp.val[1], \ + 1); \ + __o = __builtin_aarch64_set_qregci##qmode (__o, \ + (signedtype) __temp.val[2], \ + 2); \ + __o = __builtin_aarch64_ld3_lane##mode ( \ + (__builtin_aarch64_simd_##ptrmode *) __ptr, __o, __c); \ + __b.val[0] = (vectype) __builtin_aarch64_get_dregcidi (__o, 0); \ + __b.val[1] = (vectype) __builtin_aarch64_get_dregcidi (__o, 1); \ + __b.val[2] = (vectype) __builtin_aarch64_get_dregcidi (__o, 2); \ + return __b; \ +} + +__LD3_LANE_FUNC (float16x4x3_t, float16x4_t, float16x8x3_t, float16_t, v4hf, + v8hf, hf, f16, float16x8_t) +__LD3_LANE_FUNC (float32x2x3_t, float32x2_t, float32x4x3_t, float32_t, v2sf, v4sf, + sf, f32, float32x4_t) +__LD3_LANE_FUNC (float64x1x3_t, float64x1_t, float64x2x3_t, float64_t, df, v2df, + df, f64, float64x2_t) +__LD3_LANE_FUNC (poly8x8x3_t, poly8x8_t, poly8x16x3_t, poly8_t, v8qi, v16qi, qi, p8, + int8x16_t) +__LD3_LANE_FUNC (poly16x4x3_t, poly16x4_t, poly16x8x3_t, poly16_t, v4hi, v8hi, hi, + p16, int16x8_t) +__LD3_LANE_FUNC (poly64x1x3_t, poly64x1_t, poly64x2x3_t, poly64_t, di, + v2di_ssps, di, p64, poly64x2_t) +__LD3_LANE_FUNC (int8x8x3_t, int8x8_t, int8x16x3_t, int8_t, v8qi, v16qi, qi, s8, + int8x16_t) +__LD3_LANE_FUNC (int16x4x3_t, int16x4_t, int16x8x3_t, int16_t, v4hi, v8hi, hi, s16, + int16x8_t) +__LD3_LANE_FUNC (int32x2x3_t, int32x2_t, int32x4x3_t, int32_t, v2si, v4si, si, s32, + int32x4_t) +__LD3_LANE_FUNC (int64x1x3_t, int64x1_t, int64x2x3_t, int64_t, di, v2di, di, s64, + int64x2_t) +__LD3_LANE_FUNC (uint8x8x3_t, uint8x8_t, uint8x16x3_t, uint8_t, v8qi, v16qi, qi, u8, + int8x16_t) +__LD3_LANE_FUNC (uint16x4x3_t, uint16x4_t, uint16x8x3_t, uint16_t, v4hi, v8hi, hi, + u16, int16x8_t) +__LD3_LANE_FUNC (uint32x2x3_t, uint32x2_t, uint32x4x3_t, uint32_t, v2si, v4si, si, + u32, int32x4_t) +__LD3_LANE_FUNC (uint64x1x3_t, uint64x1_t, uint64x2x3_t, uint64_t, di, v2di, di, + u64, int64x2_t) +__LD3_LANE_FUNC (bfloat16x4x3_t, bfloat16x4_t, bfloat16x8x3_t, bfloat16_t, v4bf, + v8bf, bf, bf16, bfloat16x8_t) -__extension__ extern __inline int32x2_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vsudot_laneq_s32 (int32x2_t __r, int8x8_t __a, uint8x16_t __b, - const int __index) -{ - return __builtin_aarch64_sudot_laneqv8qi_sssus (__r, __a, __b, __index); -} +#undef __LD3_LANE_FUNC -__extension__ extern __inline int32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vsudotq_lane_s32 (int32x4_t __r, int8x16_t __a, uint8x8_t __b, - const int __index) -{ - return __builtin_aarch64_sudot_lanev16qi_sssus (__r, __a, __b, __index); -} +/* vld3q_lane */ -__extension__ extern __inline int32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vsudotq_laneq_s32 (int32x4_t __r, int8x16_t __a, uint8x16_t __b, - const int __index) -{ - return __builtin_aarch64_sudot_laneqv16qi_sssus (__r, __a, __b, __index); +#define __LD3_LANE_FUNC(intype, vtype, ptrtype, mode, ptrmode, funcsuffix) \ +__extension__ extern __inline intype \ +__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) \ +vld3q_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c) \ +{ \ + __builtin_aarch64_simd_ci __o; \ + intype ret; \ + __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __b.val[0], 0); \ + __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __b.val[1], 1); \ + __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __b.val[2], 2); \ + __o = __builtin_aarch64_ld3_lane##mode ( \ + (__builtin_aarch64_simd_##ptrmode *) __ptr, __o, __c); \ + ret.val[0] = (vtype) __builtin_aarch64_get_qregciv4si (__o, 0); \ + ret.val[1] = (vtype) __builtin_aarch64_get_qregciv4si (__o, 1); \ + ret.val[2] = (vtype) __builtin_aarch64_get_qregciv4si (__o, 2); \ + return ret; \ } -/* Matrix Multiply-Accumulate. */ +__LD3_LANE_FUNC (float16x8x3_t, float16x8_t, float16_t, v8hf, hf, f16) +__LD3_LANE_FUNC (float32x4x3_t, float32x4_t, float32_t, v4sf, sf, f32) +__LD3_LANE_FUNC (float64x2x3_t, float64x2_t, float64_t, v2df, df, f64) +__LD3_LANE_FUNC (poly8x16x3_t, poly8x16_t, poly8_t, v16qi, qi, p8) +__LD3_LANE_FUNC (poly16x8x3_t, poly16x8_t, poly16_t, v8hi, hi, p16) +__LD3_LANE_FUNC (poly64x2x3_t, poly64x2_t, poly64_t, v2di, di, p64) +__LD3_LANE_FUNC (int8x16x3_t, int8x16_t, int8_t, v16qi, qi, s8) +__LD3_LANE_FUNC (int16x8x3_t, int16x8_t, int16_t, v8hi, hi, s16) +__LD3_LANE_FUNC (int32x4x3_t, int32x4_t, int32_t, v4si, si, s32) +__LD3_LANE_FUNC (int64x2x3_t, int64x2_t, int64_t, v2di, di, s64) +__LD3_LANE_FUNC (uint8x16x3_t, uint8x16_t, uint8_t, v16qi, qi, u8) +__LD3_LANE_FUNC (uint16x8x3_t, uint16x8_t, uint16_t, v8hi, hi, u16) +__LD3_LANE_FUNC (uint32x4x3_t, uint32x4_t, uint32_t, v4si, si, u32) +__LD3_LANE_FUNC (uint64x2x3_t, uint64x2_t, uint64_t, v2di, di, u64) +__LD3_LANE_FUNC (bfloat16x8x3_t, bfloat16x8_t, bfloat16_t, v8bf, bf, bf16) -__extension__ extern __inline int32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmmlaq_s32 (int32x4_t __r, int8x16_t __a, int8x16_t __b) -{ - return __builtin_aarch64_simd_smmlav16qi (__r, __a, __b); -} +#undef __LD3_LANE_FUNC -__extension__ extern __inline uint32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmmlaq_u32 (uint32x4_t __r, uint8x16_t __a, uint8x16_t __b) -{ - return __builtin_aarch64_simd_ummlav16qi_uuuu (__r, __a, __b); -} +/* vld4_lane */ -__extension__ extern __inline int32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vusmmlaq_s32 (int32x4_t __r, uint8x16_t __a, int8x16_t __b) -{ - return __builtin_aarch64_simd_usmmlav16qi_ssus (__r, __a, __b); +#define __LD4_LANE_FUNC(intype, vectype, largetype, ptrtype, mode, \ + qmode, ptrmode, funcsuffix, signedtype) \ +__extension__ extern __inline intype \ +__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) \ +vld4_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c) \ +{ \ + __builtin_aarch64_simd_xi __o; \ + largetype __temp; \ + __temp.val[0] = \ + vcombine_##funcsuffix (__b.val[0], vcreate_##funcsuffix (0)); \ + __temp.val[1] = \ + vcombine_##funcsuffix (__b.val[1], vcreate_##funcsuffix (0)); \ + __temp.val[2] = \ + vcombine_##funcsuffix (__b.val[2], vcreate_##funcsuffix (0)); \ + __temp.val[3] = \ + vcombine_##funcsuffix (__b.val[3], vcreate_##funcsuffix (0)); \ + __o = __builtin_aarch64_set_qregxi##qmode (__o, \ + (signedtype) __temp.val[0], \ + 0); \ + __o = __builtin_aarch64_set_qregxi##qmode (__o, \ + (signedtype) __temp.val[1], \ + 1); \ + __o = __builtin_aarch64_set_qregxi##qmode (__o, \ + (signedtype) __temp.val[2], \ + 2); \ + __o = __builtin_aarch64_set_qregxi##qmode (__o, \ + (signedtype) __temp.val[3], \ + 3); \ + __o = __builtin_aarch64_ld4_lane##mode ( \ + (__builtin_aarch64_simd_##ptrmode *) __ptr, __o, __c); \ + __b.val[0] = (vectype) __builtin_aarch64_get_dregxidi (__o, 0); \ + __b.val[1] = (vectype) __builtin_aarch64_get_dregxidi (__o, 1); \ + __b.val[2] = (vectype) __builtin_aarch64_get_dregxidi (__o, 2); \ + __b.val[3] = (vectype) __builtin_aarch64_get_dregxidi (__o, 3); \ + return __b; \ } -#pragma GCC pop_options +/* vld4q_lane */ -__extension__ extern __inline poly8x8_t -__attribute ((__always_inline__, __gnu_inline__, __artificial__)) -vadd_p8 (poly8x8_t __a, poly8x8_t __b) -{ - return __a ^ __b; -} +__LD4_LANE_FUNC (float16x4x4_t, float16x4_t, float16x8x4_t, float16_t, v4hf, + v8hf, hf, f16, float16x8_t) +__LD4_LANE_FUNC (float32x2x4_t, float32x2_t, float32x4x4_t, float32_t, v2sf, v4sf, + sf, f32, float32x4_t) +__LD4_LANE_FUNC (float64x1x4_t, float64x1_t, float64x2x4_t, float64_t, df, v2df, + df, f64, float64x2_t) +__LD4_LANE_FUNC (poly8x8x4_t, poly8x8_t, poly8x16x4_t, poly8_t, v8qi, v16qi, qi, p8, + int8x16_t) +__LD4_LANE_FUNC (poly16x4x4_t, poly16x4_t, poly16x8x4_t, poly16_t, v4hi, v8hi, hi, + p16, int16x8_t) +__LD4_LANE_FUNC (poly64x1x4_t, poly64x1_t, poly64x2x4_t, poly64_t, di, + v2di_ssps, di, p64, poly64x2_t) +__LD4_LANE_FUNC (int8x8x4_t, int8x8_t, int8x16x4_t, int8_t, v8qi, v16qi, qi, s8, + int8x16_t) +__LD4_LANE_FUNC (int16x4x4_t, int16x4_t, int16x8x4_t, int16_t, v4hi, v8hi, hi, s16, + int16x8_t) +__LD4_LANE_FUNC (int32x2x4_t, int32x2_t, int32x4x4_t, int32_t, v2si, v4si, si, s32, + int32x4_t) +__LD4_LANE_FUNC (int64x1x4_t, int64x1_t, int64x2x4_t, int64_t, di, v2di, di, s64, + int64x2_t) +__LD4_LANE_FUNC (uint8x8x4_t, uint8x8_t, uint8x16x4_t, uint8_t, v8qi, v16qi, qi, u8, + int8x16_t) +__LD4_LANE_FUNC (uint16x4x4_t, uint16x4_t, uint16x8x4_t, uint16_t, v4hi, v8hi, hi, + u16, int16x8_t) +__LD4_LANE_FUNC (uint32x2x4_t, uint32x2_t, uint32x4x4_t, uint32_t, v2si, v4si, si, + u32, int32x4_t) +__LD4_LANE_FUNC (uint64x1x4_t, uint64x1_t, uint64x2x4_t, uint64_t, di, v2di, di, + u64, int64x2_t) +__LD4_LANE_FUNC (bfloat16x4x4_t, bfloat16x4_t, bfloat16x8x4_t, bfloat16_t, v4bf, + v8bf, bf, bf16, bfloat16x8_t) -__extension__ extern __inline poly16x4_t -__attribute ((__always_inline__, __gnu_inline__, __artificial__)) -vadd_p16 (poly16x4_t __a, poly16x4_t __b) -{ - return __a ^ __b; -} -__extension__ extern __inline poly64x1_t -__attribute ((__always_inline__, __gnu_inline__, __artificial__)) -vadd_p64 (poly64x1_t __a, poly64x1_t __b) -{ - return __a ^ __b; -} +#undef __LD4_LANE_FUNC -__extension__ extern __inline poly8x16_t -__attribute ((__always_inline__, __gnu_inline__, __artificial__)) -vaddq_p8 (poly8x16_t __a, poly8x16_t __b) -{ - return __a ^ __b; -} +/* vld4q_lane */ -__extension__ extern __inline poly16x8_t -__attribute ((__always_inline__, __gnu_inline__, __artificial__)) -vaddq_p16 (poly16x8_t __a, poly16x8_t __b) -{ - return __a ^__b; +#define __LD4_LANE_FUNC(intype, vtype, ptrtype, mode, ptrmode, funcsuffix) \ +__extension__ extern __inline intype \ +__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) \ +vld4q_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c) \ +{ \ + __builtin_aarch64_simd_xi __o; \ + intype ret; \ + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[0], 0); \ + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[1], 1); \ + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[2], 2); \ + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[3], 3); \ + __o = __builtin_aarch64_ld4_lane##mode ( \ + (__builtin_aarch64_simd_##ptrmode *) __ptr, __o, __c); \ + ret.val[0] = (vtype) __builtin_aarch64_get_qregxiv4si (__o, 0); \ + ret.val[1] = (vtype) __builtin_aarch64_get_qregxiv4si (__o, 1); \ + ret.val[2] = (vtype) __builtin_aarch64_get_qregxiv4si (__o, 2); \ + ret.val[3] = (vtype) __builtin_aarch64_get_qregxiv4si (__o, 3); \ + return ret; \ } -__extension__ extern __inline poly64x2_t -__attribute ((__always_inline__, __gnu_inline__, __artificial__)) -vaddq_p64 (poly64x2_t __a, poly64x2_t __b) -{ - return __a ^ __b; -} +__LD4_LANE_FUNC (float16x8x4_t, float16x8_t, float16_t, v8hf, hf, f16) +__LD4_LANE_FUNC (float32x4x4_t, float32x4_t, float32_t, v4sf, sf, f32) +__LD4_LANE_FUNC (float64x2x4_t, float64x2_t, float64_t, v2df, df, f64) +__LD4_LANE_FUNC (poly8x16x4_t, poly8x16_t, poly8_t, v16qi, qi, p8) +__LD4_LANE_FUNC (poly16x8x4_t, poly16x8_t, poly16_t, v8hi, hi, p16) +__LD4_LANE_FUNC (poly64x2x4_t, poly64x2_t, poly64_t, v2di, di, p64) +__LD4_LANE_FUNC (int8x16x4_t, int8x16_t, int8_t, v16qi, qi, s8) +__LD4_LANE_FUNC (int16x8x4_t, int16x8_t, int16_t, v8hi, hi, s16) +__LD4_LANE_FUNC (int32x4x4_t, int32x4_t, int32_t, v4si, si, s32) +__LD4_LANE_FUNC (int64x2x4_t, int64x2_t, int64_t, v2di, di, s64) +__LD4_LANE_FUNC (uint8x16x4_t, uint8x16_t, uint8_t, v16qi, qi, u8) +__LD4_LANE_FUNC (uint16x8x4_t, uint16x8_t, uint16_t, v8hi, hi, u16) +__LD4_LANE_FUNC (uint32x4x4_t, uint32x4_t, uint32_t, v4si, si, u32) +__LD4_LANE_FUNC (uint64x2x4_t, uint64x2_t, uint64_t, v2di, di, u64) +__LD4_LANE_FUNC (bfloat16x8x4_t, bfloat16x8_t, bfloat16_t, v8bf, bf, bf16) -__extension__ extern __inline poly128_t -__attribute ((__always_inline__, __gnu_inline__, __artificial__)) -vaddq_p128 (poly128_t __a, poly128_t __b) -{ - return __a ^ __b; -} +#undef __LD4_LANE_FUNC #undef __aarch64_vget_lane_any diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bf16_vldN_lane_1.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bf16_vldN_lane_1.c new file mode 100644 index 00000000000..a83ed3e45da --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bf16_vldN_lane_1.c @@ -0,0 +1,74 @@ +/* { dg-do run { target { aarch64*-*-* } } } */ +/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */ +/* { dg-add-options arm_v8_2a_bf16_neon } */ + +#include <arm_neon.h> + +extern void abort (void); + +typedef union +{ + bfloat16_t bf16; + uint16_t u16; +} bfloat16_u_t; + +#define VARIANTS(VARIANT, STRUCT) \ +VARIANT (bfloat16, , 4, _bf16, 3, STRUCT) \ +VARIANT (bfloat16, q, 8, _bf16, 7, STRUCT) + +#define TESTMETH(BASE, Q, ELTS, SUFFIX, LANE, STRUCT) \ + int \ + test_vld##STRUCT##Q##_lane##SUFFIX (const bfloat16_u_t *data, \ + const bfloat16_u_t *overwrite) \ + { \ + BASE##x##ELTS##x##STRUCT##_t vectors; \ + bfloat16_u_t temp[ELTS]; \ + int i,j; \ + for (i = 0; i < STRUCT; i++, data += ELTS) \ + vectors.val[i] = vld1##Q##SUFFIX ((bfloat16_t *)data); \ + vectors = vld##STRUCT##Q##_lane##SUFFIX ((bfloat16_t *) overwrite, \ + vectors, LANE); \ + while (--i >= 0) \ + { \ + vst1##Q##SUFFIX ((bfloat16_t *)temp, vectors.val[i]); \ + data -= ELTS; /* Point at value loaded before vldN_lane. */ \ + for (j = 0; j < ELTS; j++) \ + if (temp[j].u16 != (j == LANE ? overwrite[i].u16 : data[j].u16)) \ + return 1; \ + } \ + return 0; \ + } + +/* Tests of vld2_lane and vld2q_lane. */ +VARIANTS (TESTMETH, 2) +/* Tests of vld3_lane and vld3q_lane. */ +VARIANTS (TESTMETH, 3) +/* Tests of vld4_lane and vld4q_lane. */ +VARIANTS (TESTMETH, 4) + +#define CHECK(BASE, Q, ELTS, SUFFIX, LANE, STRUCT) \ + if (test_vld##STRUCT##Q##_lane##SUFFIX ((const bfloat16_u_t *)orig_data, \ + BASE##_data) != 0) \ + abort (); + +int +main (int argc, char **argv) +{ + /* Original data for all vector formats. */ + uint64_t orig_data[8] = {0x1234567890abcdefULL, 0x13579bdf02468aceULL, + 0x012389ab4567cdefULL, 0xdeeddadacafe0431ULL, + 0x1032547698badcfeULL, 0xbadbadbadbad0badULL, + 0x0102030405060708ULL, 0x0f0e0d0c0b0a0908ULL}; + + /* Data with which vldN_lane will overwrite some of previous. */ + bfloat16_u_t bfloat16_data[4]; + bfloat16_data[0].u16 = 0xABAB; + bfloat16_data[1].u16 = 0x0; + bfloat16_data[2].u16 = 0xCAFE; + bfloat16_data[3].u16 = 0x1234; + + VARIANTS (CHECK, 2); + VARIANTS (CHECK, 3); + VARIANTS (CHECK, 4); + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bf16_vldN_lane_2.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bf16_vldN_lane_2.c new file mode 100644 index 00000000000..670cf0ba75a --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bf16_vldN_lane_2.c @@ -0,0 +1,52 @@ +/* { dg-do assemble { target { aarch64*-*-* } } } */ +/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */ +/* { dg-add-options arm_v8_2a_bf16_neon } */ +/* { dg-additional-options "-O2 --save-temps" } */ + +#include <arm_neon.h> + +bfloat16x4x2_t +test_vld2_lane_bf16 (const bfloat16_t *ptr, bfloat16x4x2_t b) +{ + return vld2_lane_bf16 (ptr, b, 2); +} + +bfloat16x8x2_t +test_vld2q_lane_bf16 (const bfloat16_t *ptr, bfloat16x8x2_t b) +{ + return vld2q_lane_bf16 (ptr, b, 2); +} + +/* { dg-final { scan-assembler-times "ld2\\t{v2.h - v3.h}\\\[2\\\], \\\[x0\\\]" 2 } } */ + +bfloat16x4x3_t +test_vld3_lane_bf16 (const bfloat16_t *ptr, bfloat16x4x3_t b) +{ + return vld3_lane_bf16 (ptr, b, 2); +} + +/* { dg-final { scan-assembler-times "ld3\t{v4.h - v6.h}\\\[2\\\], \\\[x0\\\]" 1 } } */ + +bfloat16x8x3_t +test_vld3q_lane_bf16 (const bfloat16_t *ptr, bfloat16x8x3_t b) +{ + return vld3q_lane_bf16 (ptr, b, 2); +} + +/* { dg-final { scan-assembler-times "ld3\t{v1.h - v3.h}\\\[2\\\], \\\[x0\\\]" 1 } } */ + +bfloat16x4x4_t +test_vld4_lane_bf16 (const bfloat16_t *ptr, bfloat16x4x4_t b) +{ + return vld4_lane_bf16 (ptr, b, 2); +} + +/* { dg-final { scan-assembler-times "ld4\t{v4.h - v7.h}\\\[2\\\], \\\[x0\\\]" 1 } } */ + +bfloat16x8x4_t +test_vld4q_lane_bf16 (const bfloat16_t *ptr, bfloat16x8x4_t b) +{ + return vld4q_lane_bf16 (ptr, b, 2); +} + +/* { dg-final { scan-assembler-times "ld4\t{v0.h - v3.h}\\\[2\\\], \\\[x0\\\]" 1 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld2_lane_bf16_indices_1.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld2_lane_bf16_indices_1.c new file mode 100644 index 00000000000..d3727970f9c --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld2_lane_bf16_indices_1.c @@ -0,0 +1,17 @@ +#include <arm_neon.h> + +/* { dg-do compile { target { aarch64*-*-* } } } */ +/* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */ +/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok { target { arm*-*-* } } } */ +/* { dg-add-options arm_v8_2a_bf16_neon } */ + +bfloat16x4x2_t +f_vld2_lane_bf16 (bfloat16_t * p, bfloat16x4x2_t v) +{ + bfloat16x4x2_t res; + /* { dg-error "lane 4 out of range 0 - 3" "" { target *-*-* } 0 } */ + res = vld2_lane_bf16 (p, v, 4); + /* { dg-error "lane -1 out of range 0 - 3" "" { target *-*-* } 0 } */ + res = vld2_lane_bf16 (p, v, -1); + return res; +} diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld2q_lane_bf16_indices_1.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld2q_lane_bf16_indices_1.c new file mode 100644 index 00000000000..a74e24acfe5 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld2q_lane_bf16_indices_1.c @@ -0,0 +1,17 @@ +#include <arm_neon.h> + +/* { dg-do compile { target { aarch64*-*-* } } } */ +/* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */ +/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok { target { arm*-*-* } } } */ +/* { dg-add-options arm_v8_2a_bf16_neon } */ + +bfloat16x8x2_t +f_vld2q_lane_bf16 (bfloat16_t * p, bfloat16x8x2_t v) +{ + bfloat16x8x2_t res; + /* { dg-error "lane 8 out of range 0 - 7" "" { target *-*-* } 0 } */ + res = vld2q_lane_bf16 (p, v, 8); + /* { dg-error "lane -1 out of range 0 - 7" "" { target *-*-* } 0 } */ + res = vld2q_lane_bf16 (p, v, -1); + return res; +} diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld3_lane_bf16_indices_1.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld3_lane_bf16_indices_1.c new file mode 100644 index 00000000000..be87a9a4e00 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld3_lane_bf16_indices_1.c @@ -0,0 +1,17 @@ +#include <arm_neon.h> + +/* { dg-do compile { target { aarch64*-*-* } } } */ +/* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */ +/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok { target { arm*-*-* } } } */ +/* { dg-add-options arm_v8_2a_bf16_neon } */ + +bfloat16x4x3_t +f_vld3_lane_bf16 (bfloat16_t * p, bfloat16x4x3_t v) +{ + bfloat16x4x3_t res; + /* { dg-error "lane 4 out of range 0 - 3" "" { target *-*-* } 0 } */ + res = vld3_lane_bf16 (p, v, 4); + /* { dg-error "lane -1 out of range 0 - 3" "" { target *-*-* } 0 } */ + res = vld3_lane_bf16 (p, v, -1); + return res; +} diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld3q_lane_bf16_indices_1.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld3q_lane_bf16_indices_1.c new file mode 100644 index 00000000000..7a8171cc5d0 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld3q_lane_bf16_indices_1.c @@ -0,0 +1,17 @@ +#include <arm_neon.h> + +/* { dg-do compile { target { aarch64*-*-* } } } */ +/* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */ +/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok { target { arm*-*-* } } } */ +/* { dg-add-options arm_v8_2a_bf16_neon } */ + +bfloat16x8x3_t +f_vld3q_lane_bf16 (bfloat16_t * p, bfloat16x8x3_t v) +{ + bfloat16x8x3_t res; + /* { dg-error "lane 8 out of range 0 - 7" "" { target *-*-* } 0 } */ + res = vld3q_lane_bf16 (p, v, 8); + /* { dg-error "lane -1 out of range 0 - 7" "" { target *-*-* } 0 } */ + res = vld3q_lane_bf16 (p, v, -1); + return res; +} diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld4_lane_bf16_indices_1.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld4_lane_bf16_indices_1.c new file mode 100644 index 00000000000..e9d4c4c26b1 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld4_lane_bf16_indices_1.c @@ -0,0 +1,17 @@ +#include <arm_neon.h> + +/* { dg-do compile { target { aarch64*-*-* } } } */ +/* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */ +/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok { target { arm*-*-* } } } */ +/* { dg-add-options arm_v8_2a_bf16_neon } */ + +bfloat16x4x4_t +f_vld4_lane_bf16 (bfloat16_t * p, bfloat16x4x4_t v) +{ + bfloat16x4x4_t res; + /* { dg-error "lane 4 out of range 0 - 3" "" { target *-*-* } 0 } */ + res = vld4_lane_bf16 (p, v, 4); + /* { dg-error "lane -1 out of range 0 - 3" "" { target *-*-* } 0 } */ + res = vld4_lane_bf16 (p, v, -1); + return res; +} diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld4q_lane_bf16_indices_1.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld4q_lane_bf16_indices_1.c new file mode 100644 index 00000000000..76222cc43c0 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld4q_lane_bf16_indices_1.c @@ -0,0 +1,17 @@ +#include <arm_neon.h> + +/* { dg-do compile { target { aarch64*-*-* } } } */ +/* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */ +/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok { target { arm*-*-* } } } */ +/* { dg-add-options arm_v8_2a_bf16_neon } */ + +bfloat16x8x4_t +f_vld4q_lane_bf16 (bfloat16_t * p, bfloat16x8x4_t v) +{ + bfloat16x8x4_t res; + /* { dg-error "lane 8 out of range 0 - 7" "" { target *-*-* } 0 } */ + res = vld4q_lane_bf16 (p, v, 8); + /* { dg-error "lane -1 out of range 0 - 7" "" { target *-*-* } 0 } */ + res = vld4q_lane_bf16 (p, v, -1); + return res; +} -- 2.20.1