Hi,

On 02/17/2020 05:53 PM, Mihail Ionescu wrote:
Hi,

This patch adds support for the bf16 duplicate and reinterpret intrinsics.
ACLE documents are at https://developer.arm.com/docs/101028/latest
ISA documents are at https://developer.arm.com/docs/ddi0596/latest
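
As a quick reference (not part of the patch), a minimal sketch of how the
new duplicate and reinterpret intrinsics compose; it assumes a compiler
with this patch applied and "arch=armv8.2-a+bf16", and the function name
is made up for illustration:

#include <arm_neon.h>

/* Broadcast one bfloat16 scalar into all eight lanes, then view the
   vector's raw bits as unsigned 16-bit lanes; the reinterpret is a
   type-level cast only and emits no instruction.  */
uint16x8_t
broadcast_bits (bfloat16_t x)
{
  bfloat16x8_t v = vdupq_n_bf16 (x);
  return vreinterpretq_u16_bf16 (v);
}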

Regression tested on aarch64-none-linux-gnu.


Is it ok for trunk?


gcc/ChangeLog:

2020-02-17  Mihail Ionescu  <mihail.ione...@arm.com>

        * config/aarch64/iterators.md (VDQF_F16): Add V4BF and V8BF.
        (VALL_F16): Likewise.
        (VALLDI_F16): Likewise.
        (Vtype): Likewise.
        (Vetype): Likewise.
        (vswap_width_name): Likewise.
        (VSWAP_WIDTH): Likewise.
        (Vel): Likewise.
        (VEL): Likewise.
        (q): Likewise.
        * config/aarch64/aarch64-simd.md
        (vec_init<mode><Vel>): Add vector init pattern for bf16.
        (aarch64_simd_dup): Change pattern iterator to include bf16.
        (aarch64_dup_lane): Likewise.
        (aarch64_get_lane): Likewise.
        (vec_extract): Likewise.
        * config/aarch64/arm_neon.h
        (vset_lane_bf16, vsetq_lane_bf16): New.
        (vget_lane_bf16, vgetq_lane_bf16): New.
        (vcreate_bf16): New.
        (vdup_n_bf16, vdupq_n_bf16): New.
        (vdup_lane_bf16, vdup_laneq_bf16): New.
        (vdupq_lane_bf16, vdupq_laneq_bf16): New.
        (vduph_lane_bf16, vduph_laneq_bf16): New.
        (vreinterpret_bf16_u8, vreinterpretq_bf16_u8): New.
        (vreinterpret_bf16_u16, vreinterpretq_bf16_u16): New.
        (vreinterpret_bf16_u32, vreinterpretq_bf16_u32): New.
        (vreinterpret_bf16_u64, vreinterpretq_bf16_u64): New.
        (vreinterpret_bf16_s8, vreinterpretq_bf16_s8): New.
        (vreinterpret_bf16_s16, vreinterpretq_bf16_s16): New.
        (vreinterpret_bf16_s32, vreinterpretq_bf16_s32): New.
        (vreinterpret_bf16_s64, vreinterpretq_bf16_s64): New.
        (vreinterpret_bf16_p8, vreinterpretq_bf16_p8): New.
        (vreinterpret_bf16_p16, vreinterpretq_bf16_p16): New.
        (vreinterpret_bf16_p64, vreinterpretq_bf16_p64): New.
        (vreinterpret_bf16_f16, vreinterpretq_bf16_f16): New.
        (vreinterpret_bf16_f32, vreinterpretq_bf16_f32): New.
        (vreinterpret_bf16_f64, vreinterpretq_bf16_f64): New.
        (vreinterpretq_bf16_p128): New.
        (vreinterpret_s8_bf16, vreinterpretq_s8_bf16): New.
        (vreinterpret_s16_bf16, vreinterpretq_s16_bf16): New.
        (vreinterpret_s32_bf16, vreinterpretq_s32_bf16): New.
        (vreinterpret_s64_bf16, vreinterpretq_s64_bf16): New.
        (vreinterpret_u8_bf16, vreinterpretq_u8_bf16): New.
        (vreinterpret_u16_bf16, vreinterpretq_u16_bf16): New.
        (vreinterpret_u32_bf16, vreinterpretq_u32_bf16): New.
        (vreinterpret_u64_bf16, vreinterpretq_u64_bf16): New.
        (vreinterpret_p8_bf16, vreinterpretq_p8_bf16): New.
        (vreinterpret_p16_bf16, vreinterpretq_p16_bf16): New.
        (vreinterpret_p64_bf16, vreinterpretq_p64_bf16): New.
        (vreinterpret_f32_bf16, vreinterpretq_f32_bf16): New.
        (vreinterpret_f64_bf16, vreinterpretq_f64_bf16): New.
        (vreinterpret_f16_bf16, vreinterpretq_f16_bf16): New.
        (vreinterpretq_p128_bf16): New.


gcc/testsuite/ChangeLog:

2020-02-17  Mihail Ionescu  <mihail.ione...@arm.com>

        * gcc.target/aarch64/advsimd-intrinsics/bf16_dup.c: New test.
        * gcc.target/aarch64/advsimd-intrinsics/bf16_reinterpret.c: New test.

Regards,
Mihail


###############     Attachment also inlined for ease of reply    ###############


diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index 7f05c3f9eca844b0e7b824a191223a4906c825b1..3cc3ace83fabf25d8e2e6e70382d335afd974290 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -34614,6 +34614,507 @@ vrnd64xq_f64 (float64x2_t __a)
  #pragma GCC push_options
  #pragma GCC target ("arch=armv8.2-a+bf16")
+__extension__ extern __inline bfloat16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vset_lane_bf16 (bfloat16_t __elem, bfloat16x4_t __vec, const int __index)
+{
+  return __aarch64_vset_lane_any (__elem, __vec, __index);
+}
+
+__extension__ extern __inline bfloat16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vsetq_lane_bf16 (bfloat16_t __elem, bfloat16x8_t __vec, const int __index)
+{
+  return __aarch64_vset_lane_any (__elem, __vec, __index);
+}
+
+__extension__ extern __inline bfloat16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vget_lane_bf16 (bfloat16x4_t __a, const int __b)
+{
+  return __aarch64_vget_lane_any (__a, __b);
+}
+
+__extension__ extern __inline bfloat16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vgetq_lane_bf16 (bfloat16x8_t __a, const int __b)
+{
+  return __aarch64_vget_lane_any (__a, __b);
+}
+
+__extension__ extern __inline bfloat16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcreate_bf16 (uint64_t __a)
+{
+  return (bfloat16x4_t) __a;
+}
+
+/* vdup */
+
+__extension__ extern __inline bfloat16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vdup_n_bf16 (bfloat16_t __a)
+{
+  return (bfloat16x4_t) {__a, __a, __a, __a};
+}
+
+__extension__ extern __inline bfloat16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vdupq_n_bf16 (bfloat16_t __a)
+{
+  return (bfloat16x8_t) {__a, __a, __a, __a, __a, __a, __a, __a};
+}
+
+__extension__ extern __inline bfloat16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vdup_lane_bf16 (bfloat16x4_t __a, const int __b)
+{
+  return vdup_n_bf16 (__aarch64_vget_lane_any (__a, __b));
+}
+
+__extension__ extern __inline bfloat16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vdup_laneq_bf16 (bfloat16x8_t __a, const int __b)
+{
+  return vdup_n_bf16 (__aarch64_vget_lane_any (__a, __b));
+}
+
+__extension__ extern __inline bfloat16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vdupq_lane_bf16 (bfloat16x4_t __a, const int __b)
+{
+  return vdupq_n_bf16 (__aarch64_vget_lane_any (__a, __b));
+}
+
+__extension__ extern __inline bfloat16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vdupq_laneq_bf16 (bfloat16x8_t __a, const int __b)
+{
+  return vdupq_n_bf16 (__aarch64_vget_lane_any (__a, __b));
+}
+
+__extension__ extern __inline bfloat16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vduph_lane_bf16 (bfloat16x4_t __a, const int __b)
+{
+  return __aarch64_vget_lane_any (__a, __b);
+}
+
+__extension__ extern __inline bfloat16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vduph_laneq_bf16 (bfloat16x8_t __a, const int __b)
+{
+  return __aarch64_vget_lane_any (__a, __b);
+}
+
+/* vreinterpret */
+
+__extension__ extern __inline bfloat16x4_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_bf16_u8 (uint8x8_t __a)
+{
+  return (bfloat16x4_t)__a;
+}
+
+__extension__ extern __inline bfloat16x4_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_bf16_u16 (uint16x4_t __a)
+{
+  return (bfloat16x4_t)__a;
+}
+
+__extension__ extern __inline bfloat16x4_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_bf16_u32 (uint32x2_t __a)
+{
+  return (bfloat16x4_t)__a;
+}
+
+__extension__ extern __inline bfloat16x4_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_bf16_u64 (uint64x1_t __a)
+{
+  return (bfloat16x4_t)__a;
+}
+
+__extension__ extern __inline bfloat16x4_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_bf16_s8 (int8x8_t __a)
+{
+  return (bfloat16x4_t)__a;
+}
+
+__extension__ extern __inline bfloat16x4_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_bf16_s16 (int16x4_t __a)
+{
+  return (bfloat16x4_t)__a;
+}
+
+__extension__ extern __inline bfloat16x4_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_bf16_s32 (int32x2_t __a)
+{
+  return (bfloat16x4_t)__a;
+}
+
+__extension__ extern __inline bfloat16x4_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_bf16_s64 (int64x1_t __a)
+{
+  return (bfloat16x4_t)__a;
+}
+
+__extension__ extern __inline bfloat16x4_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_bf16_p8 (poly8x8_t __a)
+{
+  return (bfloat16x4_t)__a;
+}
+
+__extension__ extern __inline bfloat16x4_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_bf16_p16 (poly16x4_t __a)
+{
+  return (bfloat16x4_t)__a;
+}
+
+__extension__ extern __inline bfloat16x4_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_bf16_p64 (poly64x1_t __a)
+{
+  return (bfloat16x4_t)__a;
+}
+
+__extension__ extern __inline bfloat16x4_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_bf16_f16 (float16x4_t __a)
+{
+  return (bfloat16x4_t)__a;
+}
+
+__extension__ extern __inline bfloat16x4_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_bf16_f32 (float32x2_t __a)
+{
+  return (bfloat16x4_t)__a;
+}
+
+__extension__ extern __inline bfloat16x4_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_bf16_f64 (float64x1_t __a)
+{
+  return (bfloat16x4_t)__a;
+}
+
+__extension__ extern __inline bfloat16x8_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_bf16_u8 (uint8x16_t __a)
+{
+  return (bfloat16x8_t)__a;
+}
+
+__extension__ extern __inline bfloat16x8_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_bf16_u16 (uint16x8_t __a)
+{
+  return (bfloat16x8_t)__a;
+}
+
+__extension__ extern __inline bfloat16x8_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_bf16_u32 (uint32x4_t __a)
+{
+  return (bfloat16x8_t)__a;
+}
+
+__extension__ extern __inline bfloat16x8_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_bf16_u64 (uint64x2_t __a)
+{
+  return (bfloat16x8_t)__a;
+}
+
+__extension__ extern __inline bfloat16x8_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_bf16_s8 (int8x16_t __a)
+{
+  return (bfloat16x8_t)__a;
+}
+
+__extension__ extern __inline bfloat16x8_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_bf16_s16 (int16x8_t __a)
+{
+  return (bfloat16x8_t)__a;
+}
+
+__extension__ extern __inline bfloat16x8_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_bf16_s32 (int32x4_t __a)
+{
+  return (bfloat16x8_t)__a;
+}
+
+__extension__ extern __inline bfloat16x8_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_bf16_s64 (int64x2_t __a)
+{
+  return (bfloat16x8_t)__a;
+}
+
+__extension__ extern __inline bfloat16x8_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_bf16_p8 (poly8x16_t __a)
+{
+  return (bfloat16x8_t)__a;
+}
+
+__extension__ extern __inline bfloat16x8_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_bf16_p16 (poly16x8_t __a)
+{
+  return (bfloat16x8_t)__a;
+}
+
+__extension__ extern __inline bfloat16x8_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_bf16_p64 (poly64x2_t __a)
+{
+  return (bfloat16x8_t)__a;
+}
+
+__extension__ extern __inline bfloat16x8_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_bf16_p128 (poly128_t __a)
+{
+  return (bfloat16x8_t)__a;
+}
+
+__extension__ extern __inline bfloat16x8_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_bf16_f16 (float16x8_t __a)
+{
+  return (bfloat16x8_t)__a;
+}
+
+__extension__ extern __inline bfloat16x8_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_bf16_f32 (float32x4_t __a)
+{
+  return (bfloat16x8_t)__a;
+}
+
+__extension__ extern __inline bfloat16x8_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_bf16_f64 (float64x2_t __a)
+{
+  return (bfloat16x8_t)__a;
+}
+
+__extension__ extern __inline int8x8_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_s8_bf16 (bfloat16x4_t __a)
+{
+  return (int8x8_t)__a;
+}
+
+__extension__ extern __inline int16x4_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_s16_bf16 (bfloat16x4_t __a)
+{
+  return (int16x4_t)__a;
+}
+
+__extension__ extern __inline int32x2_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_s32_bf16 (bfloat16x4_t __a)
+{
+  return (int32x2_t)__a;
+}
+
+__extension__ extern __inline int64x1_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_s64_bf16 (bfloat16x4_t __a)
+{
+  return (int64x1_t)__a;
+}
+
+__extension__ extern __inline uint8x8_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_u8_bf16 (bfloat16x4_t __a)
+{
+  return (uint8x8_t)__a;
+}
+
+__extension__ extern __inline uint16x4_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_u16_bf16 (bfloat16x4_t __a)
+{
+  return (uint16x4_t)__a;
+}
+
+__extension__ extern __inline uint32x2_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_u32_bf16 (bfloat16x4_t __a)
+{
+  return (uint32x2_t)__a;
+}
+
+__extension__ extern __inline uint64x1_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_u64_bf16 (bfloat16x4_t __a)
+{
+  return (uint64x1_t)__a;
+}
+
+__extension__ extern __inline float16x4_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_f16_bf16 (bfloat16x4_t __a)
+{
+  return (float16x4_t)__a;
+}
+
+__extension__ extern __inline float32x2_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_f32_bf16 (bfloat16x4_t __a)
+{
+  return (float32x2_t)__a;
+}
+
+__extension__ extern __inline float64x1_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_f64_bf16 (bfloat16x4_t __a)
+{
+  return (float64x1_t)__a;
+}
+
+__extension__ extern __inline poly8x8_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_p8_bf16 (bfloat16x4_t __a)
+{
+  return (poly8x8_t)__a;
+}
+
+__extension__ extern __inline poly16x4_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_p16_bf16 (bfloat16x4_t __a)
+{
+  return (poly16x4_t)__a;
+}
+
+__extension__ extern __inline poly64x1_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_p64_bf16 (bfloat16x4_t __a)
+{
+  return (poly64x1_t)__a;
+}
+
+__extension__ extern __inline int8x16_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_s8_bf16 (bfloat16x8_t __a)
+{
+  return (int8x16_t)__a;
+}
+
+__extension__ extern __inline int16x8_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_s16_bf16 (bfloat16x8_t __a)
+{
+  return (int16x8_t)__a;
+}
+
+__extension__ extern __inline int32x4_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_s32_bf16 (bfloat16x8_t __a)
+{
+  return (int32x4_t)__a;
+}
+
+__extension__ extern __inline int64x2_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_s64_bf16 (bfloat16x8_t __a)
+{
+  return (int64x2_t)__a;
+}
+
+__extension__ extern __inline uint8x16_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_u8_bf16 (bfloat16x8_t __a)
+{
+  return (uint8x16_t)__a;
+}
+
+__extension__ extern __inline uint16x8_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_u16_bf16 (bfloat16x8_t __a)
+{
+  return (uint16x8_t)__a;
+}
+
+__extension__ extern __inline uint32x4_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_u32_bf16 (bfloat16x8_t __a)
+{
+  return (uint32x4_t)__a;
+}
+
+__extension__ extern __inline uint64x2_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_u64_bf16 (bfloat16x8_t __a)
+{
+  return (uint64x2_t)__a;
+}
+
+__extension__ extern __inline float16x8_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_f16_bf16 (bfloat16x8_t __a)
+{
+  return (float16x8_t)__a;
+}
+
+__extension__ extern __inline float32x4_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_f32_bf16 (bfloat16x8_t __a)
+{
+  return (float32x4_t)__a;
+}
+
+__extension__ extern __inline float64x2_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_f64_bf16 (bfloat16x8_t __a)
+{
+  return (float64x2_t)__a;
+}
+
+__extension__ extern __inline poly8x16_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_p8_bf16 (bfloat16x8_t __a)
+{
+  return (poly8x16_t)__a;
+}
+
+__extension__ extern __inline poly16x8_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_p16_bf16 (bfloat16x8_t __a)
+{
+  return (poly16x8_t)__a;
+}
+
+__extension__ extern __inline poly64x2_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_p64_bf16 (bfloat16x8_t __a)
+{
+  return (poly64x2_t)__a;
+}
+
+__extension__ extern __inline poly128_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_p128_bf16 (bfloat16x8_t __a)
+{
+  return (poly128_t)__a;
+}
+
  __extension__ extern __inline float32x2_t
  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
  vbfdot_f32 (float32x2_t __r, bfloat16x4_t __a, bfloat16x4_t __b)
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index fc7856e58c9ab65acce3fc43b18356c8c3ff6aae..76c8d9abbe79355078799a4ca227b2a352636ef4 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -136,7 +136,8 @@
 (define_mode_iterator PTR [(SI "ptr_mode == SImode") (DI "ptr_mode == DImode")])
;; Advanced SIMD Float modes suitable for moving, loading and storing.
-(define_mode_iterator VDQF_F16 [V4HF V8HF V2SF V4SF V2DF])
+(define_mode_iterator VDQF_F16  [V4HF V8HF V2SF V4SF V2DF
+                                V4BF V8BF])
;; Advanced SIMD Float modes.
  (define_mode_iterator VDQF [V2SF V4SF V2DF])
@@ -177,7 +178,7 @@
;; All Advanced SIMD modes suitable for moving, loading, and storing.
  (define_mode_iterator VALL_F16 [V8QI V16QI V4HI V8HI V2SI V4SI V2DI
-                               V4HF V8HF V2SF V4SF V2DF])
+                               V4HF V8HF V4BF V8BF V2SF V4SF V2DF])
;; All Advanced SIMD modes suitable for moving, loading, and storing,
  ;; including special Bfloat vector types.
@@ -193,7 +194,7 @@
;; All Advanced SIMD modes and DI.
  (define_mode_iterator VALLDI_F16 [V8QI V16QI V4HI V8HI V2SI V4SI V2DI
-                                 V4HF V8HF V2SF V4SF V2DF DI])
+                                 V4HF V8HF V4BF V8BF V2SF V4SF V2DF DI])
;; All Advanced SIMD modes, plus DI and DF.
  (define_mode_iterator VALLDIF [V8QI V16QI V4HI V8HI V2SI V4SI
@@ -821,6 +822,7 @@
(define_mode_attr Vtype [(V8QI "8b") (V16QI "16b")
                         (V4HI "4h") (V8HI  "8h")
+                        (V4BF "4h") (V8BF  "8h")
                           (V2SI "2s") (V4SI  "4s")
                           (DI   "1d") (DF    "1d")
                           (V2DI "2d") (V2SF "2s")
@@ -863,6 +865,7 @@
                          (VNx4SF "s") (VNx2SF "s")
                          (VNx2DI "d")
                          (VNx2DF "d")
+                         (BF "h") (V4BF "h") (V8BF "h")
                          (HF "h")
                          (SF "s") (DF "d")
                          (QI "b") (HI "h")
@@ -928,6 +931,7 @@
                       (DF   "DF") (V2DF  "DF")
                       (SI   "SI") (HI    "HI")
                       (QI   "QI")
+                      (V4BF "BF") (V8BF "BF")
                       (VNx16QI "QI") (VNx8QI "QI") (VNx4QI "QI") (VNx2QI "QI")
                       (VNx8HI "HI") (VNx4HI "HI") (VNx2HI "HI")
                       (VNx8HF "HF") (VNx4HF "HF") (VNx2HF "HF")
@@ -946,6 +950,7 @@
                       (V2DF "df") (DF   "df")
                       (SI   "si") (HI   "hi")
                       (QI   "qi")
+                      (V4BF "bf") (V8BF "bf")
                       (VNx16QI "qi") (VNx8QI "qi") (VNx4QI "qi") (VNx2QI "qi")
                       (VNx8HI "hi") (VNx4HI "hi") (VNx2HI "hi")
                       (VNx8HF "hf") (VNx4HF "hf") (VNx2HF "hf")
@@ -1249,6 +1254,7 @@
(define_mode_attr VSWAP_WIDTH [(V8QI "V16QI") (V16QI "V8QI")
                                (V4HI "V8HI") (V8HI  "V4HI")
+                               (V8BF "V4BF") (V4BF  "V8BF")
                                (V2SI "V4SI") (V4SI  "V2SI")
                                (DI   "V2DI") (V2DI  "DI")
                                (V2SF "V4SF") (V4SF  "V2SF")
@@ -1261,6 +1267,7 @@
                                    (DI   "to_128") (V2DI  "to_64")
                                    (V4HF "to_128") (V8HF  "to_64")
                                    (V2SF "to_128") (V4SF  "to_64")
+                                   (V4BF "to_128") (V8BF  "to_64")
                                    (DF   "to_128") (V2DF  "to_64")])
;; For certain vector-by-element multiplication instructions we must
@@ -1294,6 +1301,7 @@
  ;; Defined to '_q' for 128-bit types.
  (define_mode_attr q [(V8QI "") (V16QI "_q")
                     (V4HI "") (V8HI  "_q")
+                    (V4BF "") (V8BF  "_q")
                     (V2SI "") (V4SI  "_q")
                     (DI   "") (V2DI  "_q")
                     (V4HF "") (V8HF "_q")
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bf16_dup.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bf16_dup.c
new file mode 100644
index 0000000000000000000000000000000000000000..6f9eb3a9da6b1395a60b63c0a8d21aba366dd12d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bf16_dup.c
@@ -0,0 +1,85 @@
+/* { dg-do assemble { target { aarch64*-*-* } } } */
+/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
+/* { dg-options "-O2" } */
+/* { dg-add-options arm_v8_2a_bf16_neon }  */
+/* { dg-additional-options "-save-temps" } */
+
+#include <arm_neon.h>
+
+float32x2_t test_vcreate (float32x2_t r, uint64_t a, uint64_t b)
+{
+  bfloat16x4_t _a = vcreate_bf16(a);
+  bfloat16x4_t _b = vcreate_bf16(b);
+
+  return vbfdot_f32 (r, _a, _b);
+}
+/* { dg-final { scan-assembler {bfdot\tv[0-9]+.2s, v[0-9]+.4h, v[0-9]+.4h} } } */
+
+bfloat16x4_t test_vset_lane_bf16 (bfloat16_t a, bfloat16x4_t b)
+{
+  return vset_lane_bf16 (a, b, 3);
+}
+
+bfloat16x8_t test_vsetq_lane_bf16 (bfloat16_t a, bfloat16x8_t b)
+{
+  return vsetq_lane_bf16 (a, b, 7);
+}
+/* { dg-final { scan-assembler-times "ins\\t" 2 } } */
+
+bfloat16x4_t vdup_test (bfloat16_t a)
+{
+  return vdup_n_bf16 (a);
+}
+/* { dg-final { scan-assembler "dup\\tv\[0-9\]+\.4h, v\[0-9\]+.h\\\[0\\\]" } } 
*/
+
+bfloat16x8_t vdupq_test (bfloat16_t a)
+{
+  return vdupq_n_bf16 (a);
+}
+
+bfloat16x8_t test_vdupq_lane_bf16 (bfloat16x4_t a)
+{
+  return vdupq_lane_bf16 (a, 1);
+}
+/* { dg-final { scan-assembler-times "dup\\tv\[0-9\]+\.8h, 
v\[0-9\]+.h\\\[0\\\]" 2 } } */
+
+bfloat16_t test_vget_lane_bf16 (bfloat16x4_t a)
+{
+  return vget_lane_bf16 (a, 1);
+}
+/* { dg-final { scan-assembler-times "dup\\th\[0-9\]+, v\[0-9\]+\.h\\\[1\\\]" 
2 } } */
+
+bfloat16x4_t test_vdup_lane_bf16 (bfloat16x4_t a)
+{
+  return vdup_lane_bf16 (a, 1);
+}
+/* { dg-final { scan-assembler "dup\\tv\[0-9\]+\.4h, v\[0-9\]+\.h\\\[1\\\]" } 
} */
+
+bfloat16x4_t test_vdup_laneq_bf16 (bfloat16x8_t a)
+{
+  return vdup_laneq_bf16 (a, 7);
+}
+/* { dg-final { scan-assembler "tbl\\tv\[0-9\]+\.16b, {v\[0-9\]+\.16b}, 
v\[0-9\]+\.16b" } } */
+
+bfloat16x8_t test_vdupq_laneq_bf16 (bfloat16x8_t a)
+{
+  return vdupq_laneq_bf16 (a, 5);
+}
+/* { dg-final { scan-assembler "dup\\tv\[0-9\]+\.8h, v\[0-9\]+\.h\\\[5\\\]" } 
} */
+
+bfloat16_t test_vduph_lane_bf16 (bfloat16x4_t a)
+{
+  return vduph_lane_bf16 (a, 3);
+}
+/* { dg-final { scan-assembler "dup\\th\[0-9\]+, v\[0-9\]+\.h\\\[3\\\]" } } */
+
+bfloat16_t test_vgetq_lane_bf16 (bfloat16x8_t a)
+{
+  return vgetq_lane_bf16 (a, 7);
+}
+
+bfloat16_t test_vduph_laneq_bf16 (bfloat16x8_t a)
+{
+  return vduph_laneq_bf16 (a, 7);
+}
+/* { dg-final { scan-assembler-times "dup\\th\[0-9\]+, v\[0-9\]+\.h\\\[7\\\]" 
2 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bf16_reinterpret.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bf16_reinterpret.c
new file mode 100644
index 0000000000000000000000000000000000000000..f5adf40c648e16c649ef5d68accb291b822f2936
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bf16_reinterpret.c
@@ -0,0 +1,466 @@
+/* { dg-do assemble { target { aarch64*-*-* } } } */
+/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
+/* { dg-add-options arm_v8_2a_bf16_neon }  */
+/* { dg-additional-options "-save-temps" } */
+
+#include <arm_neon.h>
+
+float32x2_t
+test_vbfdot_f32_s8 (float32x2_t r, int8x8_t a, int8x8_t b)
+{
+  bfloat16x4_t _a = vreinterpret_bf16_s8(a);
+  bfloat16x4_t _b = vreinterpret_bf16_s8(b);
+
+  return vbfdot_f32 (r, _a, _b);
+}
+
+float32x2_t
+test_vbfdot_f32_s16 (float32x2_t r, int16x4_t a, int16x4_t b)
+{
+  bfloat16x4_t _a = vreinterpret_bf16_s16(a);
+  bfloat16x4_t _b = vreinterpret_bf16_s16(b);
+
+  return vbfdot_f32 (r, _a, _b);
+}
+
+float32x2_t
+test_vbfdot_f32_s32 (float32x2_t r, int32x2_t a, int32x2_t b)
+{
+  bfloat16x4_t _a = vreinterpret_bf16_s32(a);
+  bfloat16x4_t _b = vreinterpret_bf16_s32(b);
+
+  return vbfdot_f32 (r, _a, _b);
+}
+
+float32x2_t
+test_vbfdot_f32_s64 (float32x2_t r, int64x1_t a, int64x1_t b)
+{
+  bfloat16x4_t _a = vreinterpret_bf16_s64(a);
+  bfloat16x4_t _b = vreinterpret_bf16_s64(b);
+
+  return vbfdot_f32 (r, _a, _b);
+}
+
+float32x2_t
+test_vbfdot_f32_u8 (float32x2_t r, uint8x8_t a, uint8x8_t b)
+{
+  bfloat16x4_t _a = vreinterpret_bf16_u8(a);
+  bfloat16x4_t _b = vreinterpret_bf16_u8(b);
+
+  return vbfdot_f32 (r, _a, _b);
+}
+
+float32x2_t
+test_vbfdot_f32_u16 (float32x2_t r, uint16x4_t a, uint16x4_t b)
+{
+  bfloat16x4_t _a = vreinterpret_bf16_u16(a);
+  bfloat16x4_t _b = vreinterpret_bf16_u16(b);
+
+  return vbfdot_f32 (r, _a, _b);
+}
+
+float32x2_t
+test_vbfdot_f32_u32 (float32x2_t r, uint32x2_t a, uint32x2_t b)
+{
+  bfloat16x4_t _a = vreinterpret_bf16_u32(a);
+  bfloat16x4_t _b = vreinterpret_bf16_u32(b);
+
+  return vbfdot_f32 (r, _a, _b);
+}
+
+float32x2_t
+test_vbfdot_f32_u64 (float32x2_t r, uint64x1_t a, uint64x1_t b)
+{
+  bfloat16x4_t _a = vreinterpret_bf16_u64(a);
+  bfloat16x4_t _b = vreinterpret_bf16_u64(b);
+
+  return vbfdot_f32 (r, _a, _b);
+}
+
+float32x2_t
+test_vbfdot_f32_p8 (float32x2_t r, poly8x8_t a, poly8x8_t b)
+{
+  bfloat16x4_t _a = vreinterpret_bf16_p8(a);
+  bfloat16x4_t _b = vreinterpret_bf16_p8(b);
+
+  return vbfdot_f32 (r, _a, _b);
+}
+
+float32x2_t
+test_vbfdot_f32_p16 (float32x2_t r, poly16x4_t a, poly16x4_t b)
+{
+  bfloat16x4_t _a = vreinterpret_bf16_p16(a);
+  bfloat16x4_t _b = vreinterpret_bf16_p16(b);
+
+  return vbfdot_f32 (r, _a, _b);
+}
+
+float32x2_t
+test_vbfdot_f32_p64 (float32x2_t r, poly64x1_t a, poly64x1_t b)
+{
+  bfloat16x4_t _a = vreinterpret_bf16_p64(a);
+  bfloat16x4_t _b = vreinterpret_bf16_p64(b);
+
+  return vbfdot_f32 (r, _a, _b);
+}
+
+float32x2_t
+test_vbfdot_f32_f16 (float32x2_t r, float16x4_t a, float16x4_t b)
+{
+  bfloat16x4_t _a = vreinterpret_bf16_f16(a);
+  bfloat16x4_t _b = vreinterpret_bf16_f16(b);
+
+  return vbfdot_f32 (r, _a, _b);
+}
+
+float32x2_t
+test_vbfdot_f32_f32 (float32x2_t r, float32x2_t a, float32x2_t b)
+{
+  bfloat16x4_t _a = vreinterpret_bf16_f32(a);
+  bfloat16x4_t _b = vreinterpret_bf16_f32(b);
+
+  return vbfdot_f32 (r, _a, _b);
+}
+
+float32x2_t
+test_vbfdot_f32_f64 (float32x2_t r, float64x1_t a, float64x1_t b)
+{
+  bfloat16x4_t _a = vreinterpret_bf16_f64(a);
+  bfloat16x4_t _b = vreinterpret_bf16_f64(b);
+
+  return vbfdot_f32 (r, _a, _b);
+}
+
+float32x4_t
+test_vbfdotq_f32_s8 (float32x4_t r, int8x16_t a, int8x16_t b)
+{
+  bfloat16x8_t _a = vreinterpretq_bf16_s8(a);
+  bfloat16x8_t _b = vreinterpretq_bf16_s8(b);
+
+  return vbfdotq_f32 (r, _a, _b);
+}
+
+float32x4_t
+test_vbfdotq_f32_s16 (float32x4_t r, int16x8_t a, int16x8_t b)
+{
+  bfloat16x8_t _a = vreinterpretq_bf16_s16(a);
+  bfloat16x8_t _b = vreinterpretq_bf16_s16(b);
+
+  return vbfdotq_f32 (r, _a, _b);
+}
+
+float32x4_t
+test_vbfdotq_f32_s32 (float32x4_t r, int32x4_t a, int32x4_t b)
+{
+  bfloat16x8_t _a = vreinterpretq_bf16_s32(a);
+  bfloat16x8_t _b = vreinterpretq_bf16_s32(b);
+
+  return vbfdotq_f32 (r, _a, _b);
+}
+
+float32x4_t
+test_vbfdotq_f32_s64 (float32x4_t r, int64x2_t a, int64x2_t b)
+{
+  bfloat16x8_t _a = vreinterpretq_bf16_s64(a);
+  bfloat16x8_t _b = vreinterpretq_bf16_s64(b);
+
+  return vbfdotq_f32 (r, _a, _b);
+}
+
+float32x4_t
+test_vbfdotq_f32_u8 (float32x4_t r, uint8x16_t a, uint8x16_t b)
+{
+  bfloat16x8_t _a = vreinterpretq_bf16_u8(a);
+  bfloat16x8_t _b = vreinterpretq_bf16_u8(b);
+
+  return vbfdotq_f32 (r, _a, _b);
+}
+
+float32x4_t
+test_vbfdotq_f32_u16 (float32x4_t r, uint16x8_t a, uint16x8_t b)
+{
+  bfloat16x8_t _a = vreinterpretq_bf16_u16(a);
+  bfloat16x8_t _b = vreinterpretq_bf16_u16(b);
+
+  return vbfdotq_f32 (r, _a, _b);
+}
+
+float32x4_t
+test_vbfdotq_f32_u32 (float32x4_t r, uint32x4_t a, uint32x4_t b)
+{
+  bfloat16x8_t _a = vreinterpretq_bf16_u32(a);
+  bfloat16x8_t _b = vreinterpretq_bf16_u32(b);
+
+  return vbfdotq_f32 (r, _a, _b);
+}
+
+float32x4_t
+test_vbfdotq_f32_u64 (float32x4_t r, uint64x2_t a, uint64x2_t b)
+{
+  bfloat16x8_t _a = vreinterpretq_bf16_u64(a);
+  bfloat16x8_t _b = vreinterpretq_bf16_u64(b);
+
+  return vbfdotq_f32 (r, _a, _b);
+}
+
+float32x4_t
+test_vbfdotq_f32_p8 (float32x4_t r, poly8x16_t a, poly8x16_t b)
+{
+  bfloat16x8_t _a = vreinterpretq_bf16_p8(a);
+  bfloat16x8_t _b = vreinterpretq_bf16_p8(b);
+
+  return vbfdotq_f32 (r, _a, _b);
+}
+
+float32x4_t
+test_vbfdotq_f32_p16 (float32x4_t r, poly16x8_t a, poly16x8_t b)
+{
+  bfloat16x8_t _a = vreinterpretq_bf16_p16(a);
+  bfloat16x8_t _b = vreinterpretq_bf16_p16(b);
+
+  return vbfdotq_f32 (r, _a, _b);
+}
+
+float32x4_t
+test_vbfdotq_f32_p64 (float32x4_t r, poly64x2_t a, poly64x2_t b)
+{
+  bfloat16x8_t _a = vreinterpretq_bf16_p64(a);
+  bfloat16x8_t _b = vreinterpretq_bf16_p64(b);
+
+  return vbfdotq_f32 (r, _a, _b);
+}
+
+float32x4_t
+test_vbfdotq_f32_p128 (float32x4_t r, poly128_t a, poly128_t b)
+{
+  bfloat16x8_t _a = vreinterpretq_bf16_p128(a);
+  bfloat16x8_t _b = vreinterpretq_bf16_p128(b);
+
+  return vbfdotq_f32 (r, _a, _b);
+}
+
+float32x4_t
+test_vbfdotq_f32_f16 (float32x4_t r, float16x8_t a, float16x8_t b)
+{
+  bfloat16x8_t _a = vreinterpretq_bf16_f16(a);
+  bfloat16x8_t _b = vreinterpretq_bf16_f16(b);
+
+  return vbfdotq_f32 (r, _a, _b);
+}
+
+float32x4_t
+test_vbfdotq_f32_f32 (float32x4_t r, float32x4_t a, float32x4_t b)
+{
+  bfloat16x8_t _a = vreinterpretq_bf16_f32(a);
+  bfloat16x8_t _b = vreinterpretq_bf16_f32(b);
+
+  return vbfdotq_f32 (r, _a, _b);
+}
+
+float32x4_t
+test_vbfdotq_f32_f64 (float32x4_t r, float64x2_t a, float64x2_t b)
+{
+  bfloat16x8_t _a = vreinterpretq_bf16_f64(a);
+  bfloat16x8_t _b = vreinterpretq_bf16_f64(b);
+
+  return vbfdotq_f32 (r, _a, _b);
+}
+
+/* { dg-final { scan-assembler-times {bfdot\tv[0-9]+.2s, v[0-9]+.4h, v[0-9]+.4h} 14 } } */
+/* { dg-final { scan-assembler-times {bfdot\tv[0-9]+.4s, v[0-9]+.8h, v[0-9]+.8h} 15 } } */
+
+int8x8_t test_vreinterpret_s8_bf16 (bfloat16x4_t a, int8x8_t b)
+{
+  int8x8_t _a = vreinterpret_s8_bf16 (a);
+  return vadd_s8 (_a, b);
+}
+
+int16x4_t test_vreinterpret_s16_bf16 (bfloat16x4_t a, int16x4_t b)
+{
+  int16x4_t _a = vreinterpret_s16_bf16 (a);
+  return vadd_s16 (_a, b);
+}
+
+int32x2_t test_vreinterpret_s32_bf16 (bfloat16x4_t a, int32x2_t b)
+{
+  int32x2_t _a = vreinterpret_s32_bf16 (a);
+  return vadd_s32 (_a, b);
+}
+
+int64x1_t test_vreinterpret_s64_bf16 (bfloat16x4_t a, int64x1_t b)
+{
+  int64x1_t _a = vreinterpret_s64_bf16 (a);
+  return vrshl_s64 (_a, b);
+}
+
+uint8x8_t test_vreinterpret_u8_bf16 (bfloat16x4_t a, uint8x8_t b)
+{
+  uint8x8_t _a = vreinterpret_u8_bf16 (a);
+  return vadd_u8 (_a, b);
+}
+
+uint16x4_t test_vreinterpret_u16_bf16 (bfloat16x4_t a, uint16x4_t b)
+{
+  uint16x4_t _a = vreinterpret_u16_bf16 (a);
+  return vadd_u16 (_a, b);
+}
+
+uint32x2_t test_vreinterpret_u32_bf16 (bfloat16x4_t a, uint32x2_t b)
+{
+  uint32x2_t _a = vreinterpret_u32_bf16 (a);
+  return vadd_u32 (_a, b);
+}
+
+uint64x1_t test_vreinterpret_u64_bf16 (bfloat16x4_t a, int64x1_t b)
+{
+  uint64x1_t _a = vreinterpret_u64_bf16 (a);
+  return vrshl_u64 (_a, b);
+}
+
+poly8x8_t test_vreinterpret_p8_bf16 (bfloat16x4_t a, poly8x8_t b)
+{
+  poly8x8_t _a = vreinterpret_p8_bf16 (a);
+  return vzip1_p8 (_a, b);
+}
+
+poly16x4_t test_vreinterpret_p16_bf16 (bfloat16x4_t a, poly16x4_t b)
+{
+  poly16x4_t _a = vreinterpret_p16_bf16 (a);
+  return vzip1_p16 (_a, b);
+}
+
+poly64x1_t test_vreinterpret_p64_bf16 (bfloat16x4_t a, poly64x1_t b)
+{
+  poly64x1_t _a = vreinterpret_p64_bf16 (a);
+  return vsli_n_p64 (_a, b, 3);
+}
+
+float32x2_t test_vreinterpret_f32_bf16 (bfloat16x4_t a, float32x2_t b)
+{
+  float32x2_t _a = vreinterpret_f32_bf16 (a);
+  return vsub_f32 (_a, b);
+}
+
+float64x1_t test_vreinterpret_f64_bf16 (bfloat16x4_t a, float64x1_t b)
+{
+  float64x1_t _a = vreinterpret_f64_bf16 (a);
+  return vsub_f64 (_a, b);
+}
+
+int8x16_t test_vreinterpretq_s8_bf16 (bfloat16x8_t a, int8x16_t b)
+{
+  int8x16_t _a = vreinterpretq_s8_bf16 (a);
+  return vaddq_s8 (_a, b);
+}
+
+int16x8_t test_vreinterpretq_s16_bf16 (bfloat16x8_t a, int16x8_t b)
+{
+  int16x8_t _a = vreinterpretq_s16_bf16 (a);
+  return vaddq_s16 (_a, b);
+}
+
+int32x4_t test_vreinterpretq_s32_bf16 (bfloat16x8_t a, int32x4_t b)
+{
+  int32x4_t _a = vreinterpretq_s32_bf16 (a);
+  return vaddq_s32 (_a, b);
+}
+
+int64x2_t test_vreinterpretq_s64_bf16 (bfloat16x8_t a, int64x2_t b)
+{
+  int64x2_t _a = vreinterpretq_s64_bf16 (a);
+  return vaddq_s64 (_a, b);
+}
+
+uint8x16_t test_vreinterpretq_u8_bf16 (bfloat16x8_t a, uint8x16_t b)
+{
+  uint8x16_t _a = vreinterpretq_u8_bf16 (a);
+  return vaddq_u8 (_a, b);
+}
+
+uint16x8_t test_vreinterpretq_u16_bf16 (bfloat16x8_t a, uint16x8_t b)
+{
+  uint16x8_t _a = vreinterpretq_u16_bf16 (a);
+  return vaddq_u16 (_a, b);
+}
+
+uint32x4_t test_vreinterpretq_u32_bf16 (bfloat16x8_t a, uint32x4_t b)
+{
+  uint32x4_t _a = vreinterpretq_u32_bf16 (a);
+  return vaddq_u32 (_a, b);
+}
+
+uint64x2_t test_vreinterpretq_u64_bf16 (bfloat16x8_t a, uint64x2_t b)
+{
+  uint64x2_t _a = vreinterpretq_u64_bf16 (a);
+  return vaddq_u64 (_a, b);
+}
+
+poly8x16_t test_vreinterpretq_p8_bf16 (bfloat16x8_t a, poly8x16_t b)
+{
+  poly8x16_t _a = vreinterpretq_p8_bf16 (a);
+  return vzip1q_p8 (_a, b);
+}
+
+poly16x8_t test_vreinterpretq_p16_bf16 (bfloat16x8_t a, poly16x8_t b)
+{
+  poly16x8_t _a = vreinterpretq_p16_bf16 (a);
+  return vzip1q_p16 (_a, b);
+}
+
+poly64x2_t test_vreinterpretq_p64_bf16 (bfloat16x8_t a, poly64x2_t b)
+{
+  poly64x2_t _a = vreinterpretq_p64_bf16 (a);
+  return vsliq_n_p64 (_a, b, 3);
+}
+
+poly128_t test_vreinterpretq_p128_bf16 (bfloat16x8_t a, poly16x8_t b)
+{
+  poly128_t _a = vreinterpretq_p128_bf16 (a);
+  return _a;
+}
+
+float32x4_t test_vreinterpretq_f32_bf16 (bfloat16x8_t a, float32x4_t b)
+{
+  float32x4_t _a = vreinterpretq_f32_bf16 (a);
+  return vsubq_f32 (_a, b);
+}
+
+float64x2_t test_vreinterpretq_f64_bf16 (bfloat16x8_t a, float64x2_t b)
+{
+  float64x2_t _a = vreinterpretq_f64_bf16 (a);
+  return vsubq_f64 (_a, b);
+}
+
+float16x4_t test_vreinterpret_f16_bf16 (bfloat16x4_t a)
+{
+  return vreinterpret_f16_bf16 (a);
+}
+
+float16x8_t test_vreinterpretq_f16_bf16 (bfloat16x8_t a)
+{
+  return vreinterpretq_f16_bf16 (a);
+}
+
+/* { dg-final { scan-assembler-times {add\tv[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s} 2 } } */
+/* { dg-final { scan-assembler-times {add\tv[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h} 2 } } */
+/* { dg-final { scan-assembler-times {add\tv[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b} 2 } } */
+
+/* { dg-final { scan-assembler-times {add\tv[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s} 2 } } */
+/* { dg-final { scan-assembler-times {add\tv[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h} 2 } } */
+/* { dg-final { scan-assembler-times {add\tv[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b} 2 } } */
+
+/* { dg-final { scan-assembler {fsub\tv[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s} } } */
+/* { dg-final { scan-assembler {fsub\tv[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s} } } */
+/* { dg-final { scan-assembler {fsub\tv[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d} } } */
+/* { dg-final { scan-assembler {fsub\td[0-9]+, d[0-9]+, d[0-9]+} } } */
+
+/* { dg-final { scan-assembler {zip1\tv[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b} } } */
+/* { dg-final { scan-assembler {zip1\tv[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b} } } */
+/* { dg-final { scan-assembler {zip1\tv[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h} } } */
+/* { dg-final { scan-assembler {zip1\tv[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h} } } */
+
+/* { dg-final { scan-assembler {sli\tv[0-9]+.2d, v[0-9]+.2d, 3} } } */
+/* { dg-final { scan-assembler {sli\td[0-9]+, d[0-9]+, 3} } } */
+
+/* { dg-final { scan-assembler {urshl\td[0-9]+, d[0-9]+, d[0-9]+} } } */
+/* { dg-final { scan-assembler {srshl\td[0-9]+, d[0-9]+, d[0-9]+} } } */


I found a small issue when rebasing onto the latest trunk:
vdup_laneq_bf16 was checked against the wrong instruction. It generates
a dup, not a tbl. I've updated the patch with the fix; a short
illustration follows.
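
To illustrate (a hedged sketch, not part of the patch; the function name
is made up), the case in question broadcasts a lane of a 128-bit source
into a 64-bit vector:

bfloat16x4_t
narrow_dup (bfloat16x8_t a)
{
  /* Expected to assemble to a lane-indexed "dup", not a "tbl".  */
  return vdup_laneq_bf16 (a, 7);
}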

Is it ok for trunk?


Regards,
Mihail
diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index 7f05c3f9eca844b0e7b824a191223a4906c825b1..3cc3ace83fabf25d8e2e6e70382d335afd974290 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -34614,6 +34614,507 @@ vrnd64xq_f64 (float64x2_t __a)
 #pragma GCC push_options
 #pragma GCC target ("arch=armv8.2-a+bf16")
 
+__extension__ extern __inline bfloat16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vset_lane_bf16 (bfloat16_t __elem, bfloat16x4_t __vec, const int __index)
+{
+  return __aarch64_vset_lane_any (__elem, __vec, __index);
+}
+
+__extension__ extern __inline bfloat16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vsetq_lane_bf16 (bfloat16_t __elem, bfloat16x8_t __vec, const int __index)
+{
+  return __aarch64_vset_lane_any (__elem, __vec, __index);
+}
+
+__extension__ extern __inline bfloat16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vget_lane_bf16 (bfloat16x4_t __a, const int __b)
+{
+  return __aarch64_vget_lane_any (__a, __b);
+}
+
+__extension__ extern __inline bfloat16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vgetq_lane_bf16 (bfloat16x8_t __a, const int __b)
+{
+  return __aarch64_vget_lane_any (__a, __b);
+}
+
+__extension__ extern __inline bfloat16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcreate_bf16 (uint64_t __a)
+{
+  return (bfloat16x4_t) __a;
+}
+
+/* vdup */
+
+__extension__ extern __inline bfloat16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vdup_n_bf16 (bfloat16_t __a)
+{
+  return (bfloat16x4_t) {__a, __a, __a, __a};
+}
+
+__extension__ extern __inline bfloat16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vdupq_n_bf16 (bfloat16_t __a)
+{
+  return (bfloat16x8_t) {__a, __a, __a, __a, __a, __a, __a, __a};
+}
+
+__extension__ extern __inline bfloat16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vdup_lane_bf16 (bfloat16x4_t __a, const int __b)
+{
+  return vdup_n_bf16 (__aarch64_vget_lane_any (__a, __b));
+}
+
+__extension__ extern __inline bfloat16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vdup_laneq_bf16 (bfloat16x8_t __a, const int __b)
+{
+  return vdup_n_bf16 (__aarch64_vget_lane_any (__a, __b));
+}
+
+__extension__ extern __inline bfloat16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vdupq_lane_bf16 (bfloat16x4_t __a, const int __b)
+{
+  return vdupq_n_bf16 (__aarch64_vget_lane_any (__a, __b));
+}
+
+__extension__ extern __inline bfloat16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vdupq_laneq_bf16 (bfloat16x8_t __a, const int __b)
+{
+  return vdupq_n_bf16 (__aarch64_vget_lane_any (__a, __b));
+}
+
+__extension__ extern __inline bfloat16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vduph_lane_bf16 (bfloat16x4_t __a, const int __b)
+{
+  return __aarch64_vget_lane_any (__a, __b);
+}
+
+__extension__ extern __inline bfloat16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vduph_laneq_bf16 (bfloat16x8_t __a, const int __b)
+{
+  return __aarch64_vget_lane_any (__a, __b);
+}
+
+/* vreinterpret */
+
+__extension__ extern __inline bfloat16x4_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_bf16_u8 (uint8x8_t __a)
+{
+  return (bfloat16x4_t)__a;
+}
+
+__extension__ extern __inline bfloat16x4_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_bf16_u16 (uint16x4_t __a)
+{
+  return (bfloat16x4_t)__a;
+}
+
+__extension__ extern __inline bfloat16x4_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_bf16_u32 (uint32x2_t __a)
+{
+  return (bfloat16x4_t)__a;
+}
+
+__extension__ extern __inline bfloat16x4_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_bf16_u64 (uint64x1_t __a)
+{
+  return (bfloat16x4_t)__a;
+}
+
+__extension__ extern __inline bfloat16x4_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_bf16_s8 (int8x8_t __a)
+{
+  return (bfloat16x4_t)__a;
+}
+
+__extension__ extern __inline bfloat16x4_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_bf16_s16 (int16x4_t __a)
+{
+  return (bfloat16x4_t)__a;
+}
+
+__extension__ extern __inline bfloat16x4_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_bf16_s32 (int32x2_t __a)
+{
+  return (bfloat16x4_t)__a;
+}
+
+__extension__ extern __inline bfloat16x4_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_bf16_s64 (int64x1_t __a)
+{
+  return (bfloat16x4_t)__a;
+}
+
+__extension__ extern __inline bfloat16x4_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_bf16_p8 (poly8x8_t __a)
+{
+  return (bfloat16x4_t)__a;
+}
+
+__extension__ extern __inline bfloat16x4_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_bf16_p16 (poly16x4_t __a)
+{
+  return (bfloat16x4_t)__a;
+}
+
+__extension__ extern __inline bfloat16x4_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_bf16_p64 (poly64x1_t __a)
+{
+  return (bfloat16x4_t)__a;
+}
+
+__extension__ extern __inline bfloat16x4_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_bf16_f16 (float16x4_t __a)
+{
+  return (bfloat16x4_t)__a;
+}
+
+__extension__ extern __inline bfloat16x4_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_bf16_f32 (float32x2_t __a)
+{
+  return (bfloat16x4_t)__a;
+}
+
+__extension__ extern __inline bfloat16x4_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_bf16_f64 (float64x1_t __a)
+{
+  return (bfloat16x4_t)__a;
+}
+
+__extension__ extern __inline bfloat16x8_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_bf16_u8 (uint8x16_t __a)
+{
+  return (bfloat16x8_t)__a;
+}
+
+__extension__ extern __inline bfloat16x8_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_bf16_u16 (uint16x8_t __a)
+{
+  return (bfloat16x8_t)__a;
+}
+
+__extension__ extern __inline bfloat16x8_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_bf16_u32 (uint32x4_t __a)
+{
+  return (bfloat16x8_t)__a;
+}
+
+__extension__ extern __inline bfloat16x8_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_bf16_u64 (uint64x2_t __a)
+{
+  return (bfloat16x8_t)__a;
+}
+
+__extension__ extern __inline bfloat16x8_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_bf16_s8 (int8x16_t __a)
+{
+  return (bfloat16x8_t)__a;
+}
+
+__extension__ extern __inline bfloat16x8_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_bf16_s16 (int16x8_t __a)
+{
+  return (bfloat16x8_t)__a;
+}
+
+__extension__ extern __inline bfloat16x8_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_bf16_s32 (int32x4_t __a)
+{
+  return (bfloat16x8_t)__a;
+}
+
+__extension__ extern __inline bfloat16x8_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_bf16_s64 (int64x2_t __a)
+{
+  return (bfloat16x8_t)__a;
+}
+
+__extension__ extern __inline bfloat16x8_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_bf16_p8 (poly8x16_t __a)
+{
+  return (bfloat16x8_t)__a;
+}
+
+__extension__ extern __inline bfloat16x8_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_bf16_p16 (poly16x8_t __a)
+{
+  return (bfloat16x8_t)__a;
+}
+
+__extension__ extern __inline bfloat16x8_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_bf16_p64 (poly64x2_t __a)
+{
+  return (bfloat16x8_t)__a;
+}
+
+__extension__ extern __inline bfloat16x8_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_bf16_p128 (poly128_t __a)
+{
+  return (bfloat16x8_t)__a;
+}
+
+__extension__ extern __inline bfloat16x8_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_bf16_f16 (float16x8_t __a)
+{
+  return (bfloat16x8_t)__a;
+}
+
+__extension__ extern __inline bfloat16x8_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_bf16_f32 (float32x4_t __a)
+{
+  return (bfloat16x8_t)__a;
+}
+
+__extension__ extern __inline bfloat16x8_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_bf16_f64 (float64x2_t __a)
+{
+  return (bfloat16x8_t)__a;
+}
+
+__extension__ extern __inline int8x8_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_s8_bf16 (bfloat16x4_t __a)
+{
+  return (int8x8_t)__a;
+}
+
+__extension__ extern __inline int16x4_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_s16_bf16 (bfloat16x4_t __a)
+{
+  return (int16x4_t)__a;
+}
+
+__extension__ extern __inline int32x2_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_s32_bf16 (bfloat16x4_t __a)
+{
+  return (int32x2_t)__a;
+}
+
+__extension__ extern __inline int64x1_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_s64_bf16 (bfloat16x4_t __a)
+{
+  return (int64x1_t)__a;
+}
+
+__extension__ extern __inline uint8x8_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_u8_bf16 (bfloat16x4_t __a)
+{
+  return (uint8x8_t)__a;
+}
+
+__extension__ extern __inline uint16x4_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_u16_bf16 (bfloat16x4_t __a)
+{
+  return (uint16x4_t)__a;
+}
+
+__extension__ extern __inline uint32x2_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_u32_bf16 (bfloat16x4_t __a)
+{
+  return (uint32x2_t)__a;
+}
+
+__extension__ extern __inline uint64x1_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_u64_bf16 (bfloat16x4_t __a)
+{
+  return (uint64x1_t)__a;
+}
+
+__extension__ extern __inline float16x4_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_f16_bf16 (bfloat16x4_t __a)
+{
+  return (float16x4_t)__a;
+}
+
+__extension__ extern __inline float32x2_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_f32_bf16 (bfloat16x4_t __a)
+{
+  return (float32x2_t)__a;
+}
+
+__extension__ extern __inline float64x1_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_f64_bf16 (bfloat16x4_t __a)
+{
+  return (float64x1_t)__a;
+}
+
+__extension__ extern __inline poly8x8_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_p8_bf16 (bfloat16x4_t __a)
+{
+  return (poly8x8_t)__a;
+}
+
+__extension__ extern __inline poly16x4_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_p16_bf16 (bfloat16x4_t __a)
+{
+  return (poly16x4_t)__a;
+}
+
+__extension__ extern __inline poly64x1_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_p64_bf16 (bfloat16x4_t __a)
+{
+  return (poly64x1_t)__a;
+}
+
+__extension__ extern __inline int8x16_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_s8_bf16 (bfloat16x8_t __a)
+{
+  return (int8x16_t)__a;
+}
+
+__extension__ extern __inline int16x8_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_s16_bf16 (bfloat16x8_t __a)
+{
+  return (int16x8_t)__a;
+}
+
+__extension__ extern __inline int32x4_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_s32_bf16 (bfloat16x8_t __a)
+{
+  return (int32x4_t)__a;
+}
+
+__extension__ extern __inline int64x2_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_s64_bf16 (bfloat16x8_t __a)
+{
+  return (int64x2_t)__a;
+}
+
+__extension__ extern __inline uint8x16_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_u8_bf16 (bfloat16x8_t __a)
+{
+  return (uint8x16_t)__a;
+}
+
+__extension__ extern __inline uint16x8_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_u16_bf16 (bfloat16x8_t __a)
+{
+  return (uint16x8_t)__a;
+}
+
+__extension__ extern __inline uint32x4_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_u32_bf16 (bfloat16x8_t __a)
+{
+  return (uint32x4_t)__a;
+}
+
+__extension__ extern __inline uint64x2_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_u64_bf16 (bfloat16x8_t __a)
+{
+  return (uint64x2_t)__a;
+}
+
+__extension__ extern __inline float16x8_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_f16_bf16 (bfloat16x8_t __a)
+{
+  return (float16x8_t)__a;
+}
+
+__extension__ extern __inline float32x4_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_f32_bf16 (bfloat16x8_t __a)
+{
+  return (float32x4_t)__a;
+}
+
+__extension__ extern __inline float64x2_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_f64_bf16 (bfloat16x8_t __a)
+{
+  return (float64x2_t)__a;
+}
+
+__extension__ extern __inline poly8x16_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_p8_bf16 (bfloat16x8_t __a)
+{
+  return (poly8x16_t)__a;
+}
+
+__extension__ extern __inline poly16x8_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_p16_bf16 (bfloat16x8_t __a)
+{
+  return (poly16x8_t)__a;
+}
+
+__extension__ extern __inline poly64x2_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_p64_bf16 (bfloat16x8_t __a)
+{
+  return (poly64x2_t)__a;
+}
+
+__extension__ extern __inline poly128_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_p128_bf16 (bfloat16x8_t __a)
+{
+  return (poly128_t)__a;
+}
+
 __extension__ extern __inline float32x2_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vbfdot_f32 (float32x2_t __r, bfloat16x4_t __a, bfloat16x4_t __b)
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index fc7856e58c9ab65acce3fc43b18356c8c3ff6aae..76c8d9abbe79355078799a4ca227b2a352636ef4 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -136,7 +136,8 @@
 (define_mode_iterator PTR [(SI "ptr_mode == SImode") (DI "ptr_mode == DImode")])
 
 ;; Advanced SIMD Float modes suitable for moving, loading and storing.
-(define_mode_iterator VDQF_F16 [V4HF V8HF V2SF V4SF V2DF])
+(define_mode_iterator VDQF_F16  [V4HF V8HF V2SF V4SF V2DF
+                                V4BF V8BF])
 
 ;; Advanced SIMD Float modes.
 (define_mode_iterator VDQF [V2SF V4SF V2DF])
@@ -177,7 +178,7 @@
 
 ;; All Advanced SIMD modes suitable for moving, loading, and storing.
 (define_mode_iterator VALL_F16 [V8QI V16QI V4HI V8HI V2SI V4SI V2DI
-                               V4HF V8HF V2SF V4SF V2DF])
+                               V4HF V8HF V4BF V8BF V2SF V4SF V2DF])
 
 ;; All Advanced SIMD modes suitable for moving, loading, and storing,
 ;; including special Bfloat vector types.
@@ -193,7 +194,7 @@
 
 ;; All Advanced SIMD modes and DI.
 (define_mode_iterator VALLDI_F16 [V8QI V16QI V4HI V8HI V2SI V4SI V2DI
-                                 V4HF V8HF V2SF V4SF V2DF DI])
+                                 V4HF V8HF V4BF V8BF V2SF V4SF V2DF DI])
 
 ;; All Advanced SIMD modes, plus DI and DF.
 (define_mode_iterator VALLDIF [V8QI V16QI V4HI V8HI V2SI V4SI
@@ -821,6 +822,7 @@
 
 (define_mode_attr Vtype [(V8QI "8b") (V16QI "16b")
                         (V4HI "4h") (V8HI  "8h")
+                        (V4BF "4h") (V8BF  "8h")
                          (V2SI "2s") (V4SI  "4s")
                          (DI   "1d") (DF    "1d")
                          (V2DI "2d") (V2SF "2s")
@@ -863,6 +865,7 @@
                          (VNx4SF "s") (VNx2SF "s")
                          (VNx2DI "d")
                          (VNx2DF "d")
+                         (BF "h") (V4BF "h") (V8BF "h")
                          (HF "h")
                          (SF "s") (DF "d")
                          (QI "b") (HI "h")
@@ -928,6 +931,7 @@
                       (DF   "DF") (V2DF  "DF")
                       (SI   "SI") (HI    "HI")
                       (QI   "QI")
+                      (V4BF "BF") (V8BF "BF")
                       (VNx16QI "QI") (VNx8QI "QI") (VNx4QI "QI") (VNx2QI "QI")
                       (VNx8HI "HI") (VNx4HI "HI") (VNx2HI "HI")
                       (VNx8HF "HF") (VNx4HF "HF") (VNx2HF "HF")
@@ -946,6 +950,7 @@
                       (V2DF "df") (DF   "df")
                       (SI   "si") (HI   "hi")
                       (QI   "qi")
+                      (V4BF "bf") (V8BF "bf")
                       (VNx16QI "qi") (VNx8QI "qi") (VNx4QI "qi") (VNx2QI "qi")
                       (VNx8HI "hi") (VNx4HI "hi") (VNx2HI "hi")
                       (VNx8HF "hf") (VNx4HF "hf") (VNx2HF "hf")
@@ -1249,6 +1254,7 @@
 
 (define_mode_attr VSWAP_WIDTH [(V8QI "V16QI") (V16QI "V8QI")
                                (V4HI "V8HI") (V8HI  "V4HI")
+                               (V8BF "V4BF") (V4BF  "V8BF")
                                (V2SI "V4SI") (V4SI  "V2SI")
                                (DI   "V2DI") (V2DI  "DI")
                                (V2SF "V4SF") (V4SF  "V2SF")
@@ -1261,6 +1267,7 @@
                                    (DI   "to_128") (V2DI  "to_64")
                                    (V4HF "to_128") (V8HF  "to_64")
                                    (V2SF "to_128") (V4SF  "to_64")
+                                   (V4BF "to_128") (V8BF  "to_64")
                                    (DF   "to_128") (V2DF  "to_64")])
 
 ;; For certain vector-by-element multiplication instructions we must
@@ -1294,6 +1301,7 @@
 ;; Defined to '_q' for 128-bit types.
 (define_mode_attr q [(V8QI "") (V16QI "_q")
                     (V4HI "") (V8HI  "_q")
+                    (V4BF "") (V8BF  "_q")
                     (V2SI "") (V4SI  "_q")
                     (DI   "") (V2DI  "_q")
                     (V4HF "") (V8HF "_q")
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bf16_dup.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bf16_dup.c
new file mode 100644
index 0000000000000000000000000000000000000000..c42c7acbbe92d348e9cd18d3a03a38895af83632
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bf16_dup.c
@@ -0,0 +1,85 @@
+/* { dg-do assemble { target { aarch64*-*-* } } } */
+/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
+/* { dg-options "-O2" } */
+/* { dg-add-options arm_v8_2a_bf16_neon }  */
+/* { dg-additional-options "-save-temps" } */
+
+#include <arm_neon.h>
+
+float32x2_t test_vcreate (float32x2_t r, uint64_t a, uint64_t b)
+{
+  bfloat16x4_t _a = vcreate_bf16(a);
+  bfloat16x4_t _b = vcreate_bf16(b);
+
+  return vbfdot_f32 (r, _a, _b);
+}
+/* { dg-final { scan-assembler {bfdot\tv[0-9]+.2s, v[0-9]+.4h, v[0-9]+.4h} } } */
+
+bfloat16x4_t test_vset_lane_bf16 (bfloat16_t a, bfloat16x4_t b)
+{
+  return vset_lane_bf16 (a, b, 3);
+}
+
+bfloat16x8_t test_vsetq_lane_bf16 (bfloat16_t a, bfloat16x8_t b)
+{
+  return vsetq_lane_bf16 (a, b, 7);
+}
+/* { dg-final { scan-assembler-times "ins\\t" 2 } } */
+
+bfloat16x4_t vdup_test (bfloat16_t a)
+{
+  return vdup_n_bf16 (a);
+}
+/* { dg-final { scan-assembler "dup\\tv\[0-9\]+\.4h, v\[0-9\]+.h\\\[0\\\]" } } 
*/
+
+bfloat16x8_t vdupq_test (bfloat16_t a)
+{
+  return vdupq_n_bf16 (a);
+}
+
+bfloat16x8_t test_vdupq_lane_bf16 (bfloat16x4_t a)
+{
+  return vdupq_lane_bf16 (a, 1);
+}
+/* { dg-final { scan-assembler-times "dup\\tv\[0-9\]+\.8h, 
v\[0-9\]+.h\\\[0\\\]" 2 } } */
+
+bfloat16_t test_vget_lane_bf16 (bfloat16x4_t a)
+{
+  return vget_lane_bf16 (a, 1);
+}
+/* { dg-final { scan-assembler-times "dup\\th\[0-9\]+, v\[0-9\]+\.h\\\[1\\\]" 
2 } } */
+
+bfloat16x4_t test_vdup_lane_bf16 (bfloat16x4_t a)
+{
+  return vdup_lane_bf16 (a, 1);
+}
+/* { dg-final { scan-assembler "dup\\tv\[0-9\]+\.4h, v\[0-9\]+\.h\\\[1\\\]" } 
} */
+
+bfloat16x4_t test_vdup_laneq_bf16 (bfloat16x8_t a)
+{
+  return vdup_laneq_bf16 (a, 7);
+}
+/* { dg-final { scan-assembler "dup\\tv\[0-9\]+\.8h, v\[0-9\]+\.h\\\[7\\\]" } 
} */
+
+bfloat16x8_t test_vdupq_laneq_bf16 (bfloat16x8_t a)
+{
+  return vdupq_laneq_bf16 (a, 5);
+}
+/* { dg-final { scan-assembler "dup\\tv\[0-9\]+\.8h, v\[0-9\]+\.h\\\[5\\\]" } 
} */
+
+bfloat16_t test_vduph_lane_bf16 (bfloat16x4_t a)
+{
+  return vduph_lane_bf16 (a, 3);
+}
+/* { dg-final { scan-assembler "dup\\th\[0-9\]+, v\[0-9\]+\.h\\\[3\\\]" } } */
+
+bfloat16_t test_vgetq_lane_bf16 (bfloat16x8_t a)
+{
+  return vgetq_lane_bf16 (a, 7);
+}
+
+bfloat16_t test_vduph_laneq_bf16 (bfloat16x8_t a)
+{
+  return vduph_laneq_bf16 (a, 7);
+}
+/* { dg-final { scan-assembler-times "dup\\th\[0-9\]+, v\[0-9\]+\.h\\\[7\\\]" 
2 } } */
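
For illustration only (not part of the patch): the new lane intrinsics compose in the usual Advanced SIMD way. A hedged sketch, assuming ACLE semantics for vget_lane_bf16/vset_lane_bf16; swap_low_lanes is a hypothetical helper, not defined anywhere in the patch:

#include <arm_neon.h>

/* Hypothetical helper: swap lanes 0 and 1 of a bfloat16x4_t using only
   the lane get/set intrinsics added by this patch.  */
static bfloat16x4_t
swap_low_lanes (bfloat16x4_t v)
{
  bfloat16_t e0 = vget_lane_bf16 (v, 0);
  bfloat16_t e1 = vget_lane_bf16 (v, 1);
  v = vset_lane_bf16 (e1, v, 0);
  return vset_lane_bf16 (e0, v, 1);
}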
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bf16_reinterpret.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bf16_reinterpret.c
new file mode 100644
index 0000000000000000000000000000000000000000..f5adf40c648e16c649ef5d68accb291b822f2936
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bf16_reinterpret.c
@@ -0,0 +1,466 @@
+/* { dg-do assemble { target { aarch64*-*-* } } } */
+/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
+/* { dg-add-options arm_v8_2a_bf16_neon }  */
+/* { dg-additional-options "-save-temps" } */
+
+#include <arm_neon.h>
+
+float32x2_t
+test_vbfdot_f32_s8 (float32x2_t r, int8x8_t a, int8x8_t b)
+{
+  bfloat16x4_t _a = vreinterpret_bf16_s8(a);
+  bfloat16x4_t _b = vreinterpret_bf16_s8(b);
+
+  return vbfdot_f32 (r, _a, _b);
+}
+
+float32x2_t
+test_vbfdot_f32_s16 (float32x2_t r, int16x4_t a, int16x4_t b)
+{
+  bfloat16x4_t _a = vreinterpret_bf16_s16(a);
+  bfloat16x4_t _b = vreinterpret_bf16_s16(b);
+
+  return vbfdot_f32 (r, _a, _b);
+}
+
+float32x2_t
+test_vbfdot_f32_s32 (float32x2_t r, int32x2_t a, int32x2_t b)
+{
+  bfloat16x4_t _a = vreinterpret_bf16_s32(a);
+  bfloat16x4_t _b = vreinterpret_bf16_s32(b);
+
+  return vbfdot_f32 (r, _a, _b);
+}
+
+float32x2_t
+test_vbfdot_f32_s64 (float32x2_t r, int64x1_t a, int64x1_t b)
+{
+  bfloat16x4_t _a = vreinterpret_bf16_s64(a);
+  bfloat16x4_t _b = vreinterpret_bf16_s64(b);
+
+  return vbfdot_f32 (r, _a, _b);
+}
+
+float32x2_t
+test_vbfdot_f32_u8 (float32x2_t r, uint8x8_t a, uint8x8_t b)
+{
+  bfloat16x4_t _a = vreinterpret_bf16_u8(a);
+  bfloat16x4_t _b = vreinterpret_bf16_u8(b);
+
+  return vbfdot_f32 (r, _a, _b);
+}
+
+float32x2_t
+test_vbfdot_f32_u16 (float32x2_t r, uint16x4_t a, uint16x4_t b)
+{
+  bfloat16x4_t _a = vreinterpret_bf16_u16(a);
+  bfloat16x4_t _b = vreinterpret_bf16_u16(b);
+
+  return vbfdot_f32 (r, _a, _b);
+}
+
+float32x2_t
+test_vbfdot_f32_u32 (float32x2_t r, uint32x2_t a, uint32x2_t b)
+{
+  bfloat16x4_t _a = vreinterpret_bf16_u32(a);
+  bfloat16x4_t _b = vreinterpret_bf16_u32(b);
+
+  return vbfdot_f32 (r, _a, _b);
+}
+
+float32x2_t
+test_vbfdot_f32_u64 (float32x2_t r, uint64x1_t a, uint64x1_t b)
+{
+  bfloat16x4_t _a = vreinterpret_bf16_u64(a);
+  bfloat16x4_t _b = vreinterpret_bf16_u64(b);
+
+  return vbfdot_f32 (r, _a, _b);
+}
+
+float32x2_t
+test_vbfdot_f32_p8 (float32x2_t r, poly8x8_t a, poly8x8_t b)
+{
+  bfloat16x4_t _a = vreinterpret_bf16_p8(a);
+  bfloat16x4_t _b = vreinterpret_bf16_p8(b);
+
+  return vbfdot_f32 (r, _a, _b);
+}
+
+float32x2_t
+test_vbfdot_f32_p16 (float32x2_t r, poly16x4_t a, poly16x4_t b)
+{
+  bfloat16x4_t _a = vreinterpret_bf16_p16(a);
+  bfloat16x4_t _b = vreinterpret_bf16_p16(b);
+
+  return vbfdot_f32 (r, _a, _b);
+}
+
+float32x2_t
+test_vbfdot_f32_p64 (float32x2_t r, poly64x1_t a, poly64x1_t b)
+{
+  bfloat16x4_t _a = vreinterpret_bf16_p64(a);
+  bfloat16x4_t _b = vreinterpret_bf16_p64(b);
+
+  return vbfdot_f32 (r, _a, _b);
+}
+
+float32x2_t
+test_vbfdot_f32_f16 (float32x2_t r, float16x4_t a, float16x4_t b)
+{
+  bfloat16x4_t _a = vreinterpret_bf16_f16(a);
+  bfloat16x4_t _b = vreinterpret_bf16_f16(b);
+
+  return vbfdot_f32 (r, _a, _b);
+}
+
+float32x2_t
+test_vbfdot_f32_f32 (float32x2_t r, float32x2_t a, float32x2_t b)
+{
+  bfloat16x4_t _a = vreinterpret_bf16_f32(a);
+  bfloat16x4_t _b = vreinterpret_bf16_f32(b);
+
+  return vbfdot_f32 (r, _a, _b);
+}
+
+float32x2_t
+test_vbfdot_f32_f64 (float32x2_t r, float64x1_t a, float64x1_t b)
+{
+  bfloat16x4_t _a = vreinterpret_bf16_f64(a);
+  bfloat16x4_t _b = vreinterpret_bf16_f64(b);
+
+  return vbfdot_f32 (r, _a, _b);
+}
+
+float32x4_t
+test_vbfdotq_f32_s8 (float32x4_t r, int8x16_t a, int8x16_t b)
+{
+  bfloat16x8_t _a = vreinterpretq_bf16_s8(a);
+  bfloat16x8_t _b = vreinterpretq_bf16_s8(b);
+
+  return vbfdotq_f32 (r, _a, _b);
+}
+
+float32x4_t
+test_vbfdotq_f32_s16 (float32x4_t r, int16x8_t a, int16x8_t b)
+{
+  bfloat16x8_t _a = vreinterpretq_bf16_s16(a);
+  bfloat16x8_t _b = vreinterpretq_bf16_s16(b);
+
+  return vbfdotq_f32 (r, _a, _b);
+}
+
+float32x4_t
+test_vbfdotq_f32_s32 (float32x4_t r, int32x4_t a, int32x4_t b)
+{
+  bfloat16x8_t _a = vreinterpretq_bf16_s32(a);
+  bfloat16x8_t _b = vreinterpretq_bf16_s32(b);
+
+  return vbfdotq_f32 (r, _a, _b);
+}
+
+float32x4_t
+test_vbfdotq_f32_s64 (float32x4_t r, int64x2_t a, int64x2_t b)
+{
+  bfloat16x8_t _a = vreinterpretq_bf16_s64(a);
+  bfloat16x8_t _b = vreinterpretq_bf16_s64(b);
+
+  return vbfdotq_f32 (r, _a, _b);
+}
+
+float32x4_t
+test_vbfdotq_f32_u8 (float32x4_t r, uint8x16_t a, uint8x16_t b)
+{
+  bfloat16x8_t _a = vreinterpretq_bf16_u8(a);
+  bfloat16x8_t _b = vreinterpretq_bf16_u8(b);
+
+  return vbfdotq_f32 (r, _a, _b);
+}
+
+float32x4_t
+test_vbfdotq_f32_u16 (float32x4_t r, uint16x8_t a, uint16x8_t b)
+{
+  bfloat16x8_t _a = vreinterpretq_bf16_u16(a);
+  bfloat16x8_t _b = vreinterpretq_bf16_u16(b);
+
+  return vbfdotq_f32 (r, _a, _b);
+}
+
+float32x4_t
+test_vbfdotq_f32_u32 (float32x4_t r, uint32x4_t a, uint32x4_t b)
+{
+  bfloat16x8_t _a = vreinterpretq_bf16_u32(a);
+  bfloat16x8_t _b = vreinterpretq_bf16_u32(b);
+
+  return vbfdotq_f32 (r, _a, _b);
+}
+
+float32x4_t
+test_vbfdotq_f32_u64 (float32x4_t r, uint64x2_t a, uint64x2_t b)
+{
+  bfloat16x8_t _a = vreinterpretq_bf16_u64(a);
+  bfloat16x8_t _b = vreinterpretq_bf16_u64(b);
+
+  return vbfdotq_f32 (r, _a, _b);
+}
+
+float32x4_t
+test_vbfdotq_f32_p8 (float32x4_t r, poly8x16_t a, poly8x16_t b)
+{
+  bfloat16x8_t _a = vreinterpretq_bf16_p8(a);
+  bfloat16x8_t _b = vreinterpretq_bf16_p8(b);
+
+  return vbfdotq_f32 (r, _a, _b);
+}
+
+float32x4_t
+test_vbfdotq_f32_p16 (float32x4_t r, poly16x8_t a, poly16x8_t b)
+{
+  bfloat16x8_t _a = vreinterpretq_bf16_p16(a);
+  bfloat16x8_t _b = vreinterpretq_bf16_p16(b);
+
+  return vbfdotq_f32 (r, _a, _b);
+}
+
+float32x4_t
+test_vbfdotq_f32_p64 (float32x4_t r, poly64x2_t a, poly64x2_t b)
+{
+  bfloat16x8_t _a = vreinterpretq_bf16_p64(a);
+  bfloat16x8_t _b = vreinterpretq_bf16_p64(b);
+
+  return vbfdotq_f32 (r, _a, _b);
+}
+
+float32x4_t
+test_vbfdotq_f32_p128 (float32x4_t r, poly128_t a, poly128_t b)
+{
+  bfloat16x8_t _a = vreinterpretq_bf16_p128(a);
+  bfloat16x8_t _b = vreinterpretq_bf16_p128(b);
+
+  return vbfdotq_f32 (r, _a, _b);
+}
+
+float32x4_t
+test_vbfdotq_f32_f16 (float32x4_t r, float16x8_t a, float16x8_t b)
+{
+  bfloat16x8_t _a = vreinterpretq_bf16_f16(a);
+  bfloat16x8_t _b = vreinterpretq_bf16_f16(b);
+
+  return vbfdotq_f32 (r, _a, _b);
+}
+
+float32x4_t
+test_vbfdotq_f32_f32 (float32x4_t r, float32x4_t a, float32x4_t b)
+{
+  bfloat16x8_t _a = vreinterpretq_bf16_f32(a);
+  bfloat16x8_t _b = vreinterpretq_bf16_f32(b);
+
+  return vbfdotq_f32 (r, _a, _b);
+}
+
+float32x4_t
+test_vbfdotq_f32_f64 (float32x4_t r, float64x2_t a, float64x2_t b)
+{
+  bfloat16x8_t _a = vreinterpretq_bf16_f64(a);
+  bfloat16x8_t _b = vreinterpretq_bf16_f64(b);
+
+  return vbfdotq_f32 (r, _a, _b);
+}
+
+/* { dg-final { scan-assembler-times {bfdot\tv[0-9]+.2s, v[0-9]+.4h, v[0-9]+.4h} 14 } } */
+/* { dg-final { scan-assembler-times {bfdot\tv[0-9]+.4s, v[0-9]+.8h, v[0-9]+.8h} 15 } } */
+
+int8x8_t test_vreinterpret_s8_bf16 (bfloat16x4_t a, int8x8_t b)
+{
+  int8x8_t _a = vreinterpret_s8_bf16 (a);
+  return vadd_s8 (_a, b);
+}
+
+int16x4_t test_vreinterpret_s16_bf16 (bfloat16x4_t a, int16x4_t b)
+{
+  int16x4_t _a = vreinterpret_s16_bf16 (a);
+  return vadd_s16 (_a, b);
+}
+
+int32x2_t test_vreinterpret_s32_bf16 (bfloat16x4_t a, int32x2_t b)
+{
+  int32x2_t _a = vreinterpret_s32_bf16 (a);
+  return vadd_s32 (_a, b);
+}
+
+int64x1_t test_vreinterpret_s64_bf16 (bfloat16x4_t a, int64x1_t b)
+{
+  int64x1_t _a = vreinterpret_s64_bf16 (a);
+  return vrshl_s64 (_a, b);
+}
+
+uint8x8_t test_vreinterpret_u8_bf16 (bfloat16x4_t a, uint8x8_t b)
+{
+  uint8x8_t _a = vreinterpret_u8_bf16 (a);
+  return vadd_u8 (_a, b);
+}
+
+uint16x4_t test_vreinterpret_u16_bf16 (bfloat16x4_t a, uint16x4_t b)
+{
+  uint16x4_t _a = vreinterpret_u16_bf16 (a);
+  return vadd_u16 (_a, b);
+}
+
+uint32x2_t test_vreinterpret_u32_bf16 (bfloat16x4_t a, uint32x2_t b)
+{
+  uint32x2_t _a = vreinterpret_u32_bf16 (a);
+  return vadd_u32 (_a, b);
+}
+
+uint64x1_t test_vreinterpret_u64_bf16 (bfloat16x4_t a, int64x1_t b)
+{
+  uint64x1_t _a = vreinterpret_u64_bf16 (a);
+  return vrshl_u64 (_a, b);
+}
+
+poly8x8_t test_vreinterpret_p8_bf16 (bfloat16x4_t a, poly8x8_t b)
+{
+  poly8x8_t _a = vreinterpret_p8_bf16 (a);
+  return vzip1_p8 (_a, b);
+}
+
+poly16x4_t test_vreinterpret_p16_bf16 (bfloat16x4_t a, poly16x4_t b)
+{
+  poly16x4_t _a = vreinterpret_p16_bf16 (a);
+  return vzip1_p16 (_a, b);
+}
+
+poly64x1_t test_vreinterpret_p64_bf16 (bfloat16x4_t a, poly64x1_t b)
+{
+  poly64x1_t _a = vreinterpret_p64_bf16 (a);
+  return vsli_n_p64 (_a, b, 3);
+}
+
+float32x2_t test_vreinterpret_f32_bf16 (bfloat16x4_t a, float32x2_t b)
+{
+  float32x2_t _a = vreinterpret_f32_bf16 (a);
+  return vsub_f32 (_a, b);
+}
+
+float64x1_t test_vreinterpret_f64_bf16 (bfloat16x4_t a, float64x1_t b)
+{
+  float64x1_t _a = vreinterpret_f64_bf16 (a);
+  return vsub_f64 (_a, b);
+}
+
+int8x16_t test_vreinterpretq_s8_bf16 (bfloat16x8_t a, int8x16_t b)
+{
+  int8x16_t _a = vreinterpretq_s8_bf16 (a);
+  return vaddq_s8 (_a, b);
+}
+
+int16x8_t test_vreinterpretq_s16_bf16 (bfloat16x8_t a, int16x8_t b)
+{
+  int16x8_t _a = vreinterpretq_s16_bf16 (a);
+  return vaddq_s16 (_a, b);
+}
+
+int32x4_t test_vreinterpretq_s32_bf16 (bfloat16x8_t a, int32x4_t b)
+{
+  int32x4_t _a = vreinterpretq_s32_bf16 (a);
+  return vaddq_s32 (_a, b);
+}
+
+int64x2_t test_vreinterpretq_s64_bf16 (bfloat16x8_t a, int64x2_t b)
+{
+  int64x2_t _a = vreinterpretq_s64_bf16 (a);
+  return vaddq_s64 (_a, b);
+}
+
+uint8x16_t test_vreinterpretq_u8_bf16 (bfloat16x8_t a, uint8x16_t b)
+{
+  uint8x16_t _a = vreinterpretq_u8_bf16 (a);
+  return vaddq_u8 (_a, b);
+}
+
+uint16x8_t test_vreinterpretq_u16_bf16 (bfloat16x8_t a, uint16x8_t b)
+{
+  uint16x8_t _a = vreinterpretq_u16_bf16 (a);
+  return vaddq_u16 (_a, b);
+}
+
+uint32x4_t test_vreinterpretq_u32_bf16 (bfloat16x8_t a, uint32x4_t b)
+{
+  uint32x4_t _a = vreinterpretq_u32_bf16 (a);
+  return vaddq_u32 (_a, b);
+}
+
+uint64x2_t test_vreinterpretq_u64_bf16 (bfloat16x8_t a, uint64x2_t b)
+{
+  uint64x2_t _a = vreinterpretq_u64_bf16 (a);
+  return vaddq_u64 (_a, b);
+}
+
+poly8x16_t test_vreinterpretq_p8_bf16 (bfloat16x8_t a, poly8x16_t b)
+{
+  poly8x16_t _a = vreinterpretq_p8_bf16 (a);
+  return vzip1q_p8 (_a, b);
+}
+
+poly16x8_t test_vreinterpretq_p16_bf16 (bfloat16x8_t a, poly16x8_t b)
+{
+  poly16x8_t _a = vreinterpretq_p16_bf16 (a);
+  return vzip1q_p16 (_a, b);
+}
+
+poly64x2_t test_vreinterpretq_p64_bf16 (bfloat16x8_t a, poly64x2_t b)
+{
+  poly64x2_t _a = vreinterpretq_p64_bf16 (a);
+  return vsliq_n_p64 (_a, b, 3);
+}
+
+poly128_t test_vreinterpretq_p128_bf16 (bfloat16x8_t a, poly16x8_t b)
+{
+  poly128_t _a = vreinterpretq_p128_bf16 (a);
+  return _a;
+}
+
+float32x4_t test_vreinterpretq_f32_bf16 (bfloat16x8_t a, float32x4_t b)
+{
+  float32x4_t _a = vreinterpretq_f32_bf16 (a);
+  return vsubq_f32 (_a, b);
+}
+
+float64x2_t test_vreinterpretq_f64_bf16 (bfloat16x8_t a, float64x2_t b)
+{
+  float64x2_t _a = vreinterpretq_f64_bf16 (a);
+  return vsubq_f64 (_a, b);
+}
+
+float16x4_t test_vreinterpret_f16_bf16 (bfloat16x4_t a)
+{
+  return vreinterpret_f16_bf16 (a);
+}
+
+float16x8_t test_vreinterpretq_f16_bf16 (bfloat16x8_t a)
+{
+  return vreinterpretq_f16_bf16 (a);
+}
+
+/* { dg-final { scan-assembler-times {add\tv[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s} 2 } } */
+/* { dg-final { scan-assembler-times {add\tv[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h} 2 } } */
+/* { dg-final { scan-assembler-times {add\tv[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b} 2 } } */
+
+/* { dg-final { scan-assembler-times {add\tv[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s} 2 } } */
+/* { dg-final { scan-assembler-times {add\tv[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h} 2 } } */
+/* { dg-final { scan-assembler-times {add\tv[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b} 2 } } */
+
+/* { dg-final { scan-assembler {fsub\tv[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s} } } */
+/* { dg-final { scan-assembler {fsub\tv[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s} } } */
+/* { dg-final { scan-assembler {fsub\tv[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d} } } */
+/* { dg-final { scan-assembler {fsub\td[0-9]+, d[0-9]+, d[0-9]+} } } */
+
+/* { dg-final { scan-assembler {zip1\tv[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b} } } */
+/* { dg-final { scan-assembler {zip1\tv[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b} } } */
+/* { dg-final { scan-assembler {zip1\tv[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h} } } */
+/* { dg-final { scan-assembler {zip1\tv[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h} } } */
+
+/* { dg-final { scan-assembler {sli\tv[0-9]+.2d, v[0-9]+.2d, 3} } } */
+/* { dg-final { scan-assembler {sli\td[0-9]+, d[0-9]+, 3} } } */
+
+/* { dg-final { scan-assembler {urshl\td[0-9]+, d[0-9]+, d[0-9]+} } } */
+/* { dg-final { scan-assembler {srshl\td[0-9]+, d[0-9]+, d[0-9]+} } } */
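
For illustration only (not part of the patch): the reinterpret intrinsics are pure bit-pattern casts, which is why the tests above scan only for the arithmetic that consumes their results. A minimal sketch, assuming ACLE semantics; roundtrip_u16 is a hypothetical example, and at -O2 it should reduce to a plain return with no data-processing instruction:

#include <arm_neon.h>

/* Hypothetical example: a u16 -> bf16 -> u16 round trip preserves the
   raw bits, so no instruction should be emitted for either cast.  */
uint16x4_t
roundtrip_u16 (uint16x4_t bits)
{
  bfloat16x4_t as_bf = vreinterpret_bf16_u16 (bits);
  return vreinterpret_u16_bf16 (as_bf);
}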
