Hi,
This is the latest version of the patch. I am forcing -mfloat-abi=hard
because the register allocator behaves differently depending on the
float-abi used.
Thanks,
Delia
On 3/4/20 5:20 PM, Kyrill Tkachov wrote:
Hi Delia,
On 3/3/20 5:23 PM, Delia Burduv wrote:
Hi,
I noticed that the patch doesn't apply cleanly. I fixed it and this is
the latest version.
Thanks,
Delia
On 3/3/20 4:23 PM, Delia Burduv wrote:
Sorry, I forgot the attachment.
On 3/3/20 4:20 PM, Delia Burduv wrote:
Hi,
I made a mistake in the previous patch. This is the latest version.
Please let me know if it is ok.
Thanks,
Delia
On 2/21/20 3:18 PM, Delia Burduv wrote:
Hi Kyrill,
The arm_bf16.h is only used for scalar operations. That is how the
aarch64 versions are implemented too.
Thanks,
Delia
On 2/21/20 2:06 PM, Kyrill Tkachov wrote:
Hi Delia,
On 2/19/20 5:25 PM, Delia Burduv wrote:
Hi,
Here is the latest version of the patch. It just has some minor
formatting changes that were brought up by Richard Sandiford in the
AArch64 patches
Thanks,
Delia
On 1/22/20 5:29 PM, Delia Burduv wrote:
> Ping.
>
> I will change the tests to use the exact input and output
registers as
> Richard Sandiford suggested for the AArch64 patches.
>
> On 12/20/19 6:46 PM, Delia Burduv wrote:
>> This patch adds the ARMv8.6 ACLE BFloat16 store intrinsics
>> vst<n>{q}_bf16 as part of the BFloat16 extension.
>>
(https://developer.arm.com/architectures/instruction-sets/simd-isas/neon/intrinsics)
>>
>> The intrinsics are declared in arm_neon.h .
>> A new test is added to check assembler output.
>>
>> This patch depends on the Arm back-end patche.
>> (https://gcc.gnu.org/ml/gcc-patches/2019-12/msg01448.html)
>>
>> Tested for regression on arm-none-eabi and armeb-none-eabi. I
don't
>> have commit rights, so if this is ok can someone please commit
it for me?
>>
>> gcc/ChangeLog:
>>
>> 2019-11-14 Delia Burduv <delia.bur...@arm.com>
>>
>> * config/arm/arm_neon.h (bfloat16_t): New typedef.
>> (bfloat16x4x2_t): New typedef.
>> (bfloat16x8x2_t): New typedef.
>> (bfloat16x4x3_t): New typedef.
>> (bfloat16x8x3_t): New typedef.
>> (bfloat16x4x4_t): New typedef.
>> (bfloat16x8x4_t): New typedef.
>> (vst2_bf16): New.
>> (vst2q_bf16): New.
>> (vst3_bf16): New.
>> (vst3q_bf16): New.
>> (vst4_bf16): New.
>> (vst4q_bf16): New.
>> * config/arm/arm-builtins.c (E_V2BFmode): New mode.
>> (VAR13): New.
>> (arm_simd_types[Bfloat16x2_t]):New type.
>> * config/arm/arm-modes.def (V2BF): New mode.
>> * config/arm/arm-simd-builtin-types.def
>> (Bfloat16x2_t): New entry.
>> * config/arm/arm_neon_builtins.def
>> (vst2): Changed to VAR13 and added v4bf, v8bf
>> (vst3): Changed to VAR13 and added v4bf, v8bf
>> (vst4): Changed to VAR13 and added v4bf, v8bf
>> * config/arm/iterators.md (VDXBF): New iterator.
>> (VQ2BF): New iterator.
>> (V_elem): Added V4BF, V8BF.
>> (V_sz_elem): Added V4BF, V8BF.
>> (V_mode_nunits): Added V4BF, V8BF.
>> (q): Added V4BF, V8BF.
>> *config/arm/neon.md (vst2): Used new iterators.
>> (vst3): Used new iterators.
>> (vst3qa): Used new iterators.
>> (vst3qb): Used new iterators.
>> (vst4): Used new iterators.
>> (vst4qa): Used new iterators.
>> (vst4qb): Used new iterators.
>>
>>
>> gcc/testsuite/ChangeLog:
>>
>> 2019-11-14 Delia Burduv <delia.bur...@arm.com>
>>
>> * gcc.target/arm/simd/bf16_vstn_1.c: New test.
One thing I just noticed in this and the other arm bfloat16
patches...
diff --git a/gcc/config/arm/arm_neon.h b/gcc/config/arm/arm_neon.h
index
3c78f435009ab027f92693d00ab5b40960d5419d..fd81c18948db3a7f6e8e863d32511f75bf950e6a
100644
--- a/gcc/config/arm/arm_neon.h
+++ b/gcc/config/arm/arm_neon.h
@@ -18742,6 +18742,89 @@ vcmlaq_rot270_laneq_f32 (float32x4_t __r,
float32x4_t __a, float32x4_t __b,
return __builtin_neon_vcmla_lane270v4sf (__r, __a, __b, __index);
}
+#pragma GCC push_options
+#pragma GCC target ("arch=armv8.2-a+bf16")
+
+typedef struct bfloat16x4x2_t
+{
+ bfloat16x4_t val[2];
+} bfloat16x4x2_t;
These should be in a new arm_bf16.h file that gets included in the
main arm_neon.h file, right?
I believe the aarch64 versions are implemented that way.
Otherwise the patch looks good to me.
Thanks!
Kyrill
+
+typedef struct bfloat16x8x2_t
+{
+ bfloat16x8_t val[2];
+} bfloat16x8x2_t;
+
diff --git a/gcc/testsuite/gcc.target/arm/simd/bf16_vstn_1.c
b/gcc/testsuite/gcc.target/arm/simd/bf16_vstn_1.c
new file mode 100644
index
0000000000000000000000000000000000000000..b52ecfb959776fd04c7c33908cb7f8898ec3fe0b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/simd/bf16_vstn_1.c
@@ -0,0 +1,84 @@
+/* { dg-do assemble } */
+/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
+/* { dg-add-options arm_v8_2a_bf16_neon } */
+/* { dg-additional-options "-save-temps" } */
+/* { dg-final { check-function-bodies "**" "" {-O[^0]} } } */
+
I don't see the check-function-bodies checks being performed in my
testing. Changing the directives order to:
/* { dg-do assemble } */
/* { dg-options "-save-temps" } */
/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
/* { dg-add-options arm_v8_2a_bf16_neon } */
/* { dg-final { check-function-bodies "**" "" } } */
makes them run but they fail, I think because this test also needs an
-O2 option, same as the load intrinsics patch. Can you please adjust the
order of the dg-* directives in the test and the function body scan
tests to match the codegen?
With this, it will be ready to go :)
Thanks,
Kyrill
+#include "arm_neon.h"
+
+/*
+**test_vst2_bf16:
+** ...
+** vst2.16 {d16-d17}, \[r0\]
+** ...
+*/
+void
+test_vst2_bf16 (bfloat16_t *ptr, bfloat16x4x2_t val)
+{
+ vst2_bf16 (ptr, val);
+}
+
diff --git a/gcc/config/arm/arm-builtins.c b/gcc/config/arm/arm-builtins.c
index 4d31405cf6e09e3a61faa3e8142940bbdb23c60a..e0561c58fb3367876ce0164880df76f7331ec4e8 100644
--- a/gcc/config/arm/arm-builtins.c
+++ b/gcc/config/arm/arm-builtins.c
@@ -342,6 +342,7 @@ arm_set_sat_qualifiers[SIMD_MAX_BUILTIN_ARGS]
#define v4bf_UP E_V4BFmode
#define v2si_UP E_V2SImode
#define v2sf_UP E_V2SFmode
+#define v2bf_UP E_V2BFmode
#define di_UP E_DImode
#define v16qi_UP E_V16QImode
#define v8hi_UP E_V8HImode
@@ -405,6 +406,9 @@ typedef struct {
#define VAR12(T, N, A, B, C, D, E, F, G, H, I, J, K, L) \
VAR11 (T, N, A, B, C, D, E, F, G, H, I, J, K) \
VAR1 (T, N, L)
+#define VAR13(T, N, A, B, C, D, E, F, G, H, I, J, K, L, M) \
+ VAR12 (T, N, A, B, C, D, E, F, G, H, I, J, K, L) \
+ VAR1 (T, N, M)
/* The builtin data can be found in arm_neon_builtins.def, arm_vfp_builtins.def
and arm_acle_builtins.def. The entries in arm_neon_builtins.def require
@@ -1037,6 +1041,7 @@ arm_init_simd_builtin_types (void)
arm_simd_types[Float32x4_t].eltype = float_type_node;
/* Init Bfloat vector types with underlying __bf16 scalar type. */
+ arm_simd_types[Bfloat16x2_t].eltype = arm_bf16_type_node;
arm_simd_types[Bfloat16x4_t].eltype = arm_bf16_type_node;
arm_simd_types[Bfloat16x8_t].eltype = arm_bf16_type_node;
diff --git a/gcc/config/arm/arm-modes.def b/gcc/config/arm/arm-modes.def
index ea92ef35723f979c8bb1f6bfb4fbeb6cd1e4b6e9..6e48223b63d98fcbe38960700dd0949d74629f7f 100644
--- a/gcc/config/arm/arm-modes.def
+++ b/gcc/config/arm/arm-modes.def
@@ -80,6 +80,7 @@ VECTOR_MODE (FLOAT, HF, 2); /* V2HF */
FLOAT_MODE (BF, 2, 0);
ADJUST_FLOAT_FORMAT (BF, &arm_bfloat_half_format);
+VECTOR_MODE (FLOAT, BF, 2); /* V2BF. */
VECTOR_MODE (FLOAT, BF, 4); /* V4BF. */
VECTOR_MODE (FLOAT, BF, 8); /* V8BF. */
diff --git a/gcc/config/arm/arm-simd-builtin-types.def b/gcc/config/arm/arm-simd-builtin-types.def
index ea3c9f97b71f03ac28d83266bcdaddcd0d42678b..e35bb765cdf60b127f844877ca938dfb674ec16a 100644
--- a/gcc/config/arm/arm-simd-builtin-types.def
+++ b/gcc/config/arm/arm-simd-builtin-types.def
@@ -48,5 +48,6 @@
ENTRY (Float16x8_t, V8HF, none, 128, float16, 19)
ENTRY (Float32x4_t, V4SF, none, 128, float32, 19)
+ ENTRY (Bfloat16x2_t, V2BF, none, 32, bfloat16, 20)
ENTRY (Bfloat16x4_t, V4BF, none, 64, bfloat16, 20)
ENTRY (Bfloat16x8_t, V8BF, none, 128, bfloat16, 20)
diff --git a/gcc/config/arm/arm_neon.h b/gcc/config/arm/arm_neon.h
index a66961d0c513323844dd069b05cdfccc3e432cfc..425a2a49b69d7e3070059dd0a79ae3d306400f4b 100644
--- a/gcc/config/arm/arm_neon.h
+++ b/gcc/config/arm/arm_neon.h
@@ -19382,6 +19382,36 @@ vbfdotq_lane_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x4_t __b,
#pragma GCC push_options
#pragma GCC target ("arch=armv8.2-a+bf16")
+typedef struct bfloat16x4x2_t
+{
+ bfloat16x4_t val[2];
+} bfloat16x4x2_t;
+
+typedef struct bfloat16x8x2_t
+{
+ bfloat16x8_t val[2];
+} bfloat16x8x2_t;
+
+typedef struct bfloat16x4x3_t
+{
+ bfloat16x4_t val[3];
+} bfloat16x4x3_t;
+
+typedef struct bfloat16x8x3_t
+{
+ bfloat16x8_t val[3];
+} bfloat16x8x3_t;
+
+typedef struct bfloat16x4x4_t
+{
+ bfloat16x4_t val[4];
+} bfloat16x4x4_t;
+
+typedef struct bfloat16x8x4_t
+{
+ bfloat16x8_t val[4];
+} bfloat16x8x4_t;
+
__extension__ extern __inline float32x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vcvt_f32_bf16 (bfloat16x4_t __a)
@@ -19426,6 +19456,54 @@ vcvtq_high_bf16_f32 (bfloat16x8_t inactive, float32x4_t __a)
return __builtin_neon_vbfcvtv4sf_highv8bf (inactive, __a);
}
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst2_bf16 (bfloat16_t * __ptr, bfloat16x4x2_t __val)
+{
+ union { bfloat16x4x2_t __i; __builtin_neon_ti __o; } __bu = { __val };
+ return __builtin_neon_vst2v4bf (__ptr, __bu.__o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst2q_bf16 (bfloat16_t * __ptr, bfloat16x8x2_t __val)
+{
+ union { bfloat16x8x2_t __i; __builtin_neon_oi __o; } __bu = { __val };
+ return __builtin_neon_vst2v8bf (__ptr, __bu.__o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst3_bf16 (bfloat16_t * __ptr, bfloat16x4x3_t __val)
+{
+ union { bfloat16x4x3_t __i; __builtin_neon_ei __o; } __bu = { __val };
+ return __builtin_neon_vst3v4bf (__ptr, __bu.__o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst3q_bf16 (bfloat16_t * __ptr, bfloat16x8x3_t __val)
+{
+ union { bfloat16x8x3_t __i; __builtin_neon_ci __o; } __bu = { __val };
+ return __builtin_neon_vst3v8bf (__ptr, __bu.__o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst4_bf16 (bfloat16_t * __ptr, bfloat16x4x4_t __val)
+{
+ union { bfloat16x4x4_t __i; __builtin_neon_oi __o; } __bu = { __val };
+ return __builtin_neon_vst4v4bf (__ptr, __bu.__o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst4q_bf16 (bfloat16_t * __ptr, bfloat16x8x4_t __val)
+{
+ union { bfloat16x8x4_t __i; __builtin_neon_xi __o; } __bu = { __val };
+ return __builtin_neon_vst4v8bf (__ptr, __bu.__o);
+}
+
#pragma GCC pop_options
#ifdef __cplusplus
diff --git a/gcc/config/arm/arm_neon_builtins.def b/gcc/config/arm/arm_neon_builtins.def
index 48c06c43a1744da7e143f6070ac945e8dd7225b6..d85a2d4b1fcf9e851f215dfdd4b305e59ded651c 100644
--- a/gcc/config/arm/arm_neon_builtins.def
+++ b/gcc/config/arm/arm_neon_builtins.def
@@ -325,8 +325,8 @@ VAR11 (LOAD1, vld2,
VAR9 (LOAD1LANE, vld2_lane,
v8qi, v4hi, v4hf, v2si, v2sf, v8hi, v8hf, v4si, v4sf)
VAR6 (LOAD1, vld2_dup, v8qi, v4hi, v4hf, v2si, v2sf, di)
-VAR11 (STORE1, vst2,
- v8qi, v4hi, v4hf, v2si, v2sf, di, v16qi, v8hi, v8hf, v4si, v4sf)
+VAR13 (STORE1, vst2,
+ v8qi, v4hi, v4hf, v4bf, v2si, v2sf, di, v16qi, v8hi, v8hf, v8bf, v4si, v4sf)
VAR9 (STORE1LANE, vst2_lane,
v8qi, v4hi, v4hf, v2si, v2sf, v8hi, v8hf, v4si, v4sf)
VAR11 (LOAD1, vld3,
@@ -334,8 +334,8 @@ VAR11 (LOAD1, vld3,
VAR9 (LOAD1LANE, vld3_lane,
v8qi, v4hi, v4hf, v2si, v2sf, v8hi, v8hf, v4si, v4sf)
VAR6 (LOAD1, vld3_dup, v8qi, v4hi, v4hf, v2si, v2sf, di)
-VAR11 (STORE1, vst3,
- v8qi, v4hi, v4hf, v2si, v2sf, di, v16qi, v8hi, v8hf, v4si, v4sf)
+VAR13 (STORE1, vst3,
+ v8qi, v4hi, v4hf, v4bf, v2si, v2sf, di, v16qi, v8hi, v8hf, v8bf, v4si, v4sf)
VAR9 (STORE1LANE, vst3_lane,
v8qi, v4hi, v4hf, v2si, v2sf, v8hi, v8hf, v4si, v4sf)
VAR11 (LOAD1, vld4,
@@ -343,8 +343,8 @@ VAR11 (LOAD1, vld4,
VAR9 (LOAD1LANE, vld4_lane,
v8qi, v4hi, v4hf, v2si, v2sf, v8hi, v8hf, v4si, v4sf)
VAR6 (LOAD1, vld4_dup, v8qi, v4hi, v4hf, v2si, v2sf, di)
-VAR11 (STORE1, vst4,
- v8qi, v4hi, v4hf, v2si, v2sf, di, v16qi, v8hi, v8hf, v4si, v4sf)
+VAR13 (STORE1, vst4,
+ v8qi, v4hi, v4hf, v4bf, v2si, v2sf, di, v16qi, v8hi, v8hf, v8bf, v4si, v4sf)
VAR9 (STORE1LANE, vst4_lane,
v8qi, v4hi, v4hf, v2si, v2sf, v8hi, v8hf, v4si, v4sf)
VAR2 (TERNOP, sdot, v8qi, v16qi)
diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md
index 5f4e3d1235813ab81c176505f9a98d702359f7ec..0c03e747c3643e018f4f62dda5e832dfb1af758f 100644
--- a/gcc/config/arm/iterators.md
+++ b/gcc/config/arm/iterators.md
@@ -84,6 +84,9 @@
;; Double-width vector modes plus 64-bit elements.
(define_mode_iterator VDX [V8QI V4HI V4HF V4BF V2SI V2SF DI])
+;; Double-width vector modes plus 64-bit elements, including V4BF.
+(define_mode_iterator VDXBF [V8QI V4HI V4HF (V4BF "TARGET_BF16_SIMD") V2SI V2SF DI])
+
;; Double-width vector modes plus 64-bit elements,
;; with V4BFmode added, suitable for moves.
(define_mode_iterator VDXMOV [V8QI V4HI V4HF V4BF V2SI V2SF DI])
@@ -100,6 +103,9 @@
;; Quad-width vector modes, including V8HF.
(define_mode_iterator VQ2 [V16QI V8HI V8HF V4SI V4SF])
+;; Quad-width vector modes, including V8BF.
+(define_mode_iterator VQ2BF [V16QI V8HI V8HF (V8BF "TARGET_BF16_SIMD") V4SI V4SF])
+
;; Quad-width vector modes with 16- or 32-bit elements
(define_mode_iterator VQ_HS [V8HI V8HF V4SI V4SF])
diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
index f5286d9c4b1a309f6ebe864c86596aaceb05c05b..fcf59aee32a955b6bb3e7b98a4d880a0e631b4be 100644
--- a/gcc/config/arm/neon.md
+++ b/gcc/config/arm/neon.md
@@ -5541,7 +5541,7 @@ if (BYTES_BIG_ENDIAN)
(define_insn "neon_vst2<mode>"
[(set (match_operand:TI 0 "neon_struct_operand" "=Um")
(unspec:TI [(match_operand:TI 1 "s_register_operand" "w")
- (unspec:VDX [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+ (unspec:VDXBF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
UNSPEC_VST2))]
"TARGET_NEON"
{
@@ -5566,7 +5566,7 @@ if (BYTES_BIG_ENDIAN)
(define_insn "neon_vst2<mode>"
[(set (match_operand:OI 0 "neon_struct_operand" "=Um")
(unspec:OI [(match_operand:OI 1 "s_register_operand" "w")
- (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+ (unspec:VQ2BF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
UNSPEC_VST2))]
"TARGET_NEON"
"vst2.<V_sz_elem>\t%h1, %A0"
@@ -5810,7 +5810,7 @@ if (BYTES_BIG_ENDIAN)
(define_insn "neon_vst3<mode>"
[(set (match_operand:EI 0 "neon_struct_operand" "=Um")
(unspec:EI [(match_operand:EI 1 "s_register_operand" "w")
- (unspec:VDX [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+ (unspec:VDXBF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
UNSPEC_VST3))]
"TARGET_NEON"
{
@@ -5837,7 +5837,7 @@ if (BYTES_BIG_ENDIAN)
(define_expand "neon_vst3<mode>"
[(match_operand:CI 0 "neon_struct_operand")
(match_operand:CI 1 "s_register_operand")
- (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+ (unspec:VQ2BF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
"TARGET_NEON"
{
rtx mem;
@@ -5852,7 +5852,7 @@ if (BYTES_BIG_ENDIAN)
(define_insn "neon_vst3qa<mode>"
[(set (match_operand:EI 0 "neon_struct_operand" "=Um")
(unspec:EI [(match_operand:CI 1 "s_register_operand" "w")
- (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+ (unspec:VQ2BF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
UNSPEC_VST3A))]
"TARGET_NEON"
{
@@ -5871,7 +5871,7 @@ if (BYTES_BIG_ENDIAN)
(define_insn "neon_vst3qb<mode>"
[(set (match_operand:EI 0 "neon_struct_operand" "=Um")
(unspec:EI [(match_operand:CI 1 "s_register_operand" "w")
- (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+ (unspec:VQ2BF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
UNSPEC_VST3B))]
"TARGET_NEON"
{
@@ -6135,7 +6135,7 @@ if (BYTES_BIG_ENDIAN)
(define_insn "neon_vst4<mode>"
[(set (match_operand:OI 0 "neon_struct_operand" "=Um")
(unspec:OI [(match_operand:OI 1 "s_register_operand" "w")
- (unspec:VDX [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+ (unspec:VDXBF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
UNSPEC_VST4))]
"TARGET_NEON"
{
@@ -6163,7 +6163,7 @@ if (BYTES_BIG_ENDIAN)
(define_expand "neon_vst4<mode>"
[(match_operand:XI 0 "neon_struct_operand")
(match_operand:XI 1 "s_register_operand")
- (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+ (unspec:VQ2BF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
"TARGET_NEON"
{
rtx mem;
@@ -6178,7 +6178,7 @@ if (BYTES_BIG_ENDIAN)
(define_insn "neon_vst4qa<mode>"
[(set (match_operand:OI 0 "neon_struct_operand" "=Um")
(unspec:OI [(match_operand:XI 1 "s_register_operand" "w")
- (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+ (unspec:VQ2BF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
UNSPEC_VST4A))]
"TARGET_NEON"
{
@@ -6198,7 +6198,7 @@ if (BYTES_BIG_ENDIAN)
(define_insn "neon_vst4qb<mode>"
[(set (match_operand:OI 0 "neon_struct_operand" "=Um")
(unspec:OI [(match_operand:XI 1 "s_register_operand" "w")
- (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+ (unspec:VQ2BF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
UNSPEC_VST4B))]
"TARGET_NEON"
{
diff --git a/gcc/testsuite/gcc.target/arm/simd/bf16_vstn_1.c b/gcc/testsuite/gcc.target/arm/simd/bf16_vstn_1.c
new file mode 100644
index 0000000000000000000000000000000000000000..2657b6f7cc4f3b5b7089a962933931b16686083a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/simd/bf16_vstn_1.c
@@ -0,0 +1,84 @@
+/* { dg-do assemble } */
+/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
+/* { dg-add-options arm_v8_2a_bf16_neon } */
+/* { dg-additional-options "-save-temps -O2 -mfloat-abi=hard" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+#include "arm_neon.h"
+
+/*
+**test_vst2_bf16:
+** ...
+** vst2.16 {d0-d1}, \[r0\]
+** bx lr
+*/
+void
+test_vst2_bf16 (bfloat16_t *ptr, bfloat16x4x2_t val)
+{
+ vst2_bf16 (ptr, val);
+}
+
+/*
+**test_vst2q_bf16:
+** ...
+** vst2.16 {d0-d3}, \[r0\]
+** bx lr
+*/
+void
+test_vst2q_bf16 (bfloat16_t *ptr, bfloat16x8x2_t val)
+{
+ vst2q_bf16 (ptr, val);
+}
+
+/*
+**test_vst3_bf16:
+** ...
+** vst3.16 {d0-d2}, \[r0\]
+** bx lr
+*/
+void
+test_vst3_bf16 (bfloat16_t *ptr, bfloat16x4x3_t val)
+{
+ vst3_bf16 (ptr, val);
+}
+
+/*
+**test_vst3q_bf16:
+** ...
+** vst3.16 {d17, d19, d21}, \[r0\]
+** bx lr
+*/
+void
+test_vst3q_bf16 (bfloat16_t *ptr, bfloat16x8x3_t val)
+{
+ vst3q_bf16 (ptr, val);
+}
+
+/*
+**test_vst4_bf16:
+** ...
+** vst4.16 {d0-d3}, \[r0\]
+** bx lr
+*/
+void
+test_vst4_bf16 (bfloat16_t *ptr, bfloat16x4x4_t val)
+{
+ vst4_bf16 (ptr, val);
+}
+
+/*
+**test_vst4q_bf16:
+** ...
+** vst4.16 {d1, d3, d5, d7}, \[r0\]
+** bx lr
+*/
+void
+test_vst4q_bf16 (bfloat16_t *ptr, bfloat16x8x4_t val)
+{
+ vst4q_bf16 (ptr, val);
+}
+
+int main()
+{
+ return 0;
+}