On 12/20/19 2:36 PM, Richard Sandiford wrote:
> Stam Markianos-Wright <[email protected]> writes:
>> Hi all,
>>
>> This patch adds the ARMv8.6 Extension ACLE intrinsics for the bfloat bfdot
>> operation.
>>
>> The functions are declared in arm_neon.h with the armv8.2-a+bf16 target
>> option, as required.
>>
>> RTL patterns are defined to generate assembler.
>>
>> Tests added to verify expected assembly and perform adequate lane checks.
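>>
>> As a minimal usage sketch (the function names below are illustrative and
>> not part of the patch; it assumes the arm_neon.h declarations and bf16
>> target options described above), the plain and lane forms are called as:
>>
>>   #include <arm_neon.h>
>>
>>   /* Each 32-bit lane of the accumulator gains the dot product of a
>>      pair of bfloat16 elements from the two sources.  */
>>   float32x2_t
>>   bfdot_example (float32x2_t acc, bfloat16x4_t a, bfloat16x4_t b)
>>   {
>>     return vbfdot_f32 (acc, a, b);
>>   }
>>
>>   /* The lane form reuses one bfloat16 pair of the second source,
>>      selected by the constant index (valid lanes 0-1 for a 64-bit
>>      selector), against every pair of the first source.  */
>>   float32x4_t
>>   bfdot_lane_example (float32x4_t acc, bfloat16x8_t a, bfloat16x4_t b)
>>   {
>>     return vbfdotq_lane_f32 (acc, a, b, 1);
>>   }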
>>
>> This patch depends on:
>>
>> https://gcc.gnu.org/ml/gcc-patches/2019-12/msg00857.html
>>
>> for testsuite effective_target update and on:
>>
>> https://gcc.gnu.org/ml/gcc-patches/2019-12/msg01323.html
>> https://gcc.gnu.org/ml/gcc-patches/2019-12/msg01324.html
>>
>> for back-end Bfloat enablement.
>>
>> Cheers,
>> Stam
>>
>>
>> gcc/ChangeLog:
>>
>> 2019-11-04 Stam Markianos-Wright <[email protected]>
>>
>> * config/aarch64/aarch64-simd-builtins.def (aarch64_bfdot,
>> aarch64_bfdot_lane, aarch64_bfdot_laneq): New.
>> * config/aarch64/aarch64-simd.md
>> (aarch64_bfdot, aarch64_bfdot_lane): New.
>> * config/aarch64/arm_neon.h (vbfdot_f32, vbfdotq_f32, vbfdot_lane_f32,
>> vbfdotq_lane_f32, vbfdot_laneq_f32, vbfdotq_laneq_f32): New.
>> * config/aarch64/iterators.md (UNSPEC_BFDOT, VBF, isquadop, Vbfdottype,
>> VBFMLA_W): New.
>
> Changelog nit: the continuation lines should be indented by a tab only.
Yes, sorry, that's my email client messing things up again! Fixed
locally and will carry over when I do the commit.
>
>> diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
>> index c4858ab7cffd786066646a5cd95a168311990b76..bdc26c190610580e57e9749804b7729ee4e34793 100644
>> --- a/gcc/config/aarch64/aarch64-simd.md
>> +++ b/gcc/config/aarch64/aarch64-simd.md
>> @@ -7027,3 +7027,37 @@
>> "xtn\t%0.<Vntype>, %1.<Vtype>"
>> [(set_attr "type" "neon_shift_imm_narrow_q")]
>> )
>> +
>> +(define_insn "aarch64_bfdot<mode>"
>> + [(set (match_operand:VDQSF 0 "register_operand" "=w")
>> + (plus:VDQSF (match_operand:VDQSF 1 "register_operand" "0")
>> + (unspec:VDQSF [(match_operand:<VBFMLA_W> 2
>> + "register_operand" "w")
>> + (match_operand:<VBFMLA_W> 3
>> + "register_operand" "w")]
>> + UNSPEC_BFDOT)))]
>
> The operands to the plus should be the other way around, so that
> the more complicated operand comes first,
>
Done
>> + "TARGET_BF16_SIMD"
>> + "bfdot\t%0.<Vtype>, %2.<Vbfdottype>, %3.<Vbfdottype>"
>> + [(set_attr "type" "neon_dot<q>")]
>> +)
>> +
>> +
>> +(define_insn "aarch64_bfdot_lane<VBF:isquadop><VDQSF:mode>"
>> + [(set (match_operand:VDQSF 0 "register_operand" "=w")
>> + (plus:VDQSF (match_operand:VDQSF 1 "register_operand" "0")
>> + (unspec:VDQSF [(match_operand:<VDQSF:VBFMLA_W> 2
>> + "register_operand" "w")
>> + (match_operand: VBF 3
>
> Nit: should be no space before "VBF".
Done
>
>> + "register_operand" "w")
>> + (match_operand:SI 4
>> + "const_int_operand" "n")]
>> + UNSPEC_BFDOT)))]
>> + "TARGET_BF16_SIMD"
>> +{
>> + int nunits = GET_MODE_NUNITS (<VBF:MODE>mode).to_constant ();
>> + int lane = INTVAL (operands[4]);
>> + operands[4] = gen_int_mode (ENDIAN_LANE_N (nunits / 2, lane), SImode);
>
> Should only be one space after "=".
Done
>
>> + return "bfdot\t%0.<VDQSF:Vtype>, %2.<VDQSF:Vbfdottype>, %3.2h[%4]";
>> +}
>> + [(set_attr "type" "neon_dot<VDQSF:q>")]
>> +)
>> diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
>> index 5996df0a612caff3c881fc15b0aa12b8f91a193b..0357d97cc4143c3a9c56260d9a9cc24138afc049 100644
>> --- a/gcc/config/aarch64/arm_neon.h
>> +++ b/gcc/config/aarch64/arm_neon.h
>> @@ -34612,6 +34612,57 @@ vrnd64xq_f64 (float64x2_t __a)
>>
>> #include "arm_bf16.h"
>>
>> +#pragma GCC push_options
>> +#pragma GCC target ("arch=armv8.2-a+bf16")
>> +
>> +__extension__ extern __inline float32x2_t
>> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>> +vbfdot_f32 (float32x2_t __r, bfloat16x4_t __a, bfloat16x4_t __b)
>> +{
>> + return __builtin_aarch64_bfdotv2sf (__r, __a, __b);
>> +}
>> +
>> +__extension__ extern __inline float32x4_t
>> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>> +vbfdotq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b)
>> +{
>> + return __builtin_aarch64_bfdotv4sf (__r, __a, __b);
>> +}
>> +
>> +__extension__ extern __inline float32x2_t
>> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>> +vbfdot_lane_f32 \
>> + (float32x2_t __r, bfloat16x4_t __a, bfloat16x4_t __b, const int
>> __index)
>
> Stray backslash (same comment as for the USDOT/SUDOT review
> just posted).
Done
>
>> diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-compile-1.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-compile-1.c
>> new file mode 100644
>> index 0000000000000000000000000000000000000000..62ac715c2a9c4468eb7c143464390dbf1144d6d6
>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-compile-1.c
>> @@ -0,0 +1,80 @@
>> +/* { dg-do assemble { target { aarch64*-*-* } } } */
>> +/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
>> +/* { dg-add-options arm_v8_2a_bf16_neon } */
>> +/* { dg-additional-options "--save-temps" } */
>> +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
>> +
>> +#include <arm_neon.h>
>> +
>> +/*
>> +**ufoo:
>> +** ...
>> +** bfdot\tv[0-9]+.2s, v[0-9]+.4h, v[0-9]+.4h
>> +** ...
>> +** ret
>> +*/
>> +float32x2_t ufoo(float32x2_t r, bfloat16x4_t x, bfloat16x4_t y)
>> +{
>> + return vbfdot_f32 (r, x, y);
>> +}
>
> Same comments as for SUDOT and USDOT here too.
Same changes as US/SUDOT.
Thank you!
Stam
>
> Thanks,
> Richard
>
diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
index f4ca35a59704c761fe2ac2b6d401fff7c8aba80d..6c5b61c37bcb340f963861723c6e365e32f6ca95 100644
--- a/gcc/config/aarch64/aarch64-simd-builtins.def
+++ b/gcc/config/aarch64/aarch64-simd-builtins.def
@@ -682,3 +682,8 @@
BUILTIN_VSFDF (UNOP, frint32x, 0)
BUILTIN_VSFDF (UNOP, frint64z, 0)
BUILTIN_VSFDF (UNOP, frint64x, 0)
+
+ /* Implemented by aarch64_bfdot{_lane}{q}<mode>. */
+ VAR2 (TERNOP, bfdot, 0, v2sf, v4sf)
+ VAR2 (QUADOP_LANE_PAIR, bfdot_lane, 0, v2sf, v4sf)
+ VAR2 (QUADOP_LANE_PAIR, bfdot_laneq, 0, v2sf, v4sf)
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index adfda96f077075ad53d4bea2919c4d3b326e49f5..7587bc46ba1c80389ea49fa83a0e6f8a489711e9 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -7028,3 +7028,36 @@
"xtn\t%0.<Vntype>, %1.<Vtype>"
[(set_attr "type" "neon_shift_imm_narrow_q")]
)
+
+(define_insn "aarch64_bfdot<mode>"
+ [(set (match_operand:VDQSF 0 "register_operand" "=w")
+ (plus:VDQSF
+ (unspec:VDQSF
+ [(match_operand:<VBFMLA_W> 2 "register_operand" "w")
+ (match_operand:<VBFMLA_W> 3 "register_operand" "w")]
+ UNSPEC_BFDOT)
+ (match_operand:VDQSF 1 "register_operand" "0")))]
+ "TARGET_BF16_SIMD"
+ "bfdot\t%0.<Vtype>, %2.<Vbfdottype>, %3.<Vbfdottype>"
+ [(set_attr "type" "neon_dot<q>")]
+)
+
+
+(define_insn "aarch64_bfdot_lane<VBF:isquadop><VDQSF:mode>"
+ [(set (match_operand:VDQSF 0 "register_operand" "=w")
+ (plus:VDQSF
+ (unspec:VDQSF
+ [(match_operand:<VDQSF:VBFMLA_W> 2 "register_operand" "w")
+ (match_operand:VBF 3 "register_operand" "w")
+ (match_operand:SI 4 "const_int_operand" "n")]
+ UNSPEC_BFDOT)
+ (match_operand:VDQSF 1 "register_operand" "0")))]
+ "TARGET_BF16_SIMD"
+{
+ int nunits = GET_MODE_NUNITS (<VBF:MODE>mode).to_constant ();
+ int lane = INTVAL (operands[4]);
+ operands[4] = gen_int_mode (ENDIAN_LANE_N (nunits / 2, lane), SImode);
+ return "bfdot\t%0.<VDQSF:Vtype>, %2.<VDQSF:Vbfdottype>, %3.2h[%4]";
+}
+ [(set_attr "type" "neon_dot<VDQSF:q>")]
+)
diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index ee4bb76bcd4f52bdf99ba9b24fc5749ba555a73b..c304c2c4597550882377d1dfce03fff92e8ebde3 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -34611,6 +34611,57 @@ vrnd64xq_f64 (float64x2_t __a)
#include "arm_bf16.h"
+#pragma GCC push_options
+#pragma GCC target ("arch=armv8.2-a+bf16")
+
+__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vbfdot_f32 (float32x2_t __r, bfloat16x4_t __a, bfloat16x4_t __b)
+{
+ return __builtin_aarch64_bfdotv2sf (__r, __a, __b);
+}
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vbfdotq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b)
+{
+ return __builtin_aarch64_bfdotv4sf (__r, __a, __b);
+}
+
+__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vbfdot_lane_f32 (float32x2_t __r, bfloat16x4_t __a, bfloat16x4_t __b,
+ const int __index)
+{
+ return __builtin_aarch64_bfdot_lanev2sf (__r, __a, __b, __index);
+}
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vbfdotq_lane_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x4_t __b,
+ const int __index)
+{
+ return __builtin_aarch64_bfdot_lanev4sf (__r, __a, __b, __index);
+}
+
+__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vbfdot_laneq_f32 (float32x2_t __r, bfloat16x4_t __a, bfloat16x8_t __b,
+ const int __index)
+{
+ return __builtin_aarch64_bfdot_laneqv2sf (__r, __a, __b, __index);
+}
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vbfdotq_laneq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b,
+ const int __index)
+{
+ return __builtin_aarch64_bfdot_laneqv4sf (__r, __a, __b, __index);
+}
+
+#pragma GCC pop_options
+
#undef __aarch64_vget_lane_any
#undef __aarch64_vdup_lane_any
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index 04262645a019087b600ff47667c13381dab10d66..2277abcaf7f10a256ddbadb1d4be40ba42f0ac67 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -119,6 +119,9 @@
;; Quad vector with only 2 element modes.
(define_mode_iterator VQ_2E [V2DI V2DF])
+;; BFmode vector modes.
+(define_mode_iterator VBF [V4BF V8BF])
+
;; This mode iterator allows :P to be used for patterns that operate on
;; addresses in different modes. In LP64, only DI will match, while in
;; ILP32, either can match.
@@ -671,6 +674,7 @@
UNSPEC_UMULHS ; Used in aarch64-sve2.md.
UNSPEC_UMULHRS ; Used in aarch64-sve2.md.
UNSPEC_ASRD ; Used in aarch64-sve.md.
+ UNSPEC_BFDOT ; Used in aarch64-simd.md.
])
;; ------------------------------------------------------------------
@@ -727,6 +731,8 @@
(define_mode_attr FCVT_CHANGE_MODE [(SI "DF") (DI "SF")])
+(define_mode_attr isquadop [(V4BF "") (V8BF "q")])
+
;; For scalar usage of vector/FP registers
(define_mode_attr v [(QI "b") (HI "h") (SI "s") (DI "d")
(HF "h") (SF "s") (DF "d")
@@ -1308,6 +1314,9 @@
;; Register suffix for DOTPROD input types from the return type.
(define_mode_attr Vdottype [(V2SI "8b") (V4SI "16b")])
+;; Register suffix for BFDOT input types from the return type.
+(define_mode_attr Vbfdottype [(V2SF "4h") (V4SF "8h")])
+
;; Sum of lengths of instructions needed to move vector registers of a mode.
(define_mode_attr insn_count [(OI "8") (CI "12") (XI "16")])
@@ -1318,6 +1327,9 @@
;; Width of 2nd and 3rd arguments to fp16 vector multiply add/sub
(define_mode_attr VFMLA_W [(V2SF "V4HF") (V4SF "V8HF")])
+;; Width of 2nd and 3rd arguments to bf16 vector multiply add/sub
+(define_mode_attr VBFMLA_W [(V2SF "V4BF") (V4SF "V8BF")])
+
(define_mode_attr VFMLA_SEL_W [(V2SF "V2HF") (V4SF "V4HF")])
(define_mode_attr f16quad [(V2SF "") (V4SF "q")])
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-compile-1.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-compile-1.c
new file mode 100644
index 0000000000000000000000000000000000000000..c575dcd3901172a52fa9403c9179d58eea44eb72
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-compile-1.c
@@ -0,0 +1,91 @@
+/* { dg-do assemble { target { aarch64*-*-* } } } */
+/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
+/* { dg-add-options arm_v8_2a_bf16_neon } */
+/* { dg-additional-options "-O -save-temps" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+/* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */
+
+#include <arm_neon.h>
+
+/*
+**ufoo:
+** bfdot v0.2s, (v1.4h, v2.4h|v2.4h, v1.4h)
+** ret
+*/
+float32x2_t ufoo(float32x2_t r, bfloat16x4_t x, bfloat16x4_t y)
+{
+ return vbfdot_f32 (r, x, y);
+}
+
+/*
+**ufooq:
+** bfdot v0.4s, (v1.8h, v2.8h|v2.8h, v1.8h)
+** ret
+*/
+float32x4_t ufooq(float32x4_t r, bfloat16x8_t x, bfloat16x8_t y)
+{
+ return vbfdotq_f32 (r, x, y);
+}
+
+/*
+**ufoo_lane:
+** bfdot v0.2s, (v1.4h, v2.2h\[0\]|v2.4h, v1.2h\[0\])
+** ret
+*/
+float32x2_t ufoo_lane(float32x2_t r, bfloat16x4_t x, bfloat16x4_t y)
+{
+ return vbfdot_lane_f32 (r, x, y, 0);
+}
+
+/*
+**ufooq_laneq:
+** bfdot v0.4s, (v1.8h, v2.2h\[2\]|v2.8h, v1.2h\[2\])
+** ret
+*/
+float32x4_t ufooq_laneq(float32x4_t r, bfloat16x8_t x, bfloat16x8_t y)
+{
+ return vbfdotq_laneq_f32 (r, x, y, 2);
+}
+
+/*
+**ufoo_laneq:
+** bfdot v0.2s, (v1.4h, v2.2h\[3\]|v2.4h, v1.2h\[3\])
+** ret
+*/
+float32x2_t ufoo_laneq(float32x2_t r, bfloat16x4_t x, bfloat16x8_t y)
+{
+ return vbfdot_laneq_f32 (r, x, y, 3);
+}
+
+/*
+**ufooq_lane:
+** bfdot v0.4s, (v1.8h, v2.2h\[1\]|v2.8h, v1.2h\[1\])
+** ret
+*/
+float32x4_t ufooq_lane(float32x4_t r, bfloat16x8_t x, bfloat16x4_t y)
+{
+ return vbfdotq_lane_f32 (r, x, y, 1);
+}
+
+/*
+**ufoo_untied:
+** mov v0.8b, v1.8b
+** bfdot v0.2s, (v2.4h, v3.4h|v3.4h, v2.4h)
+** ret
+*/
+float32x2_t ufoo_untied(float32x4_t unused, float32x2_t r, bfloat16x4_t x, bfloat16x4_t y)
+{
+ return vbfdot_f32 (r, x, y);
+}
+
+/*
+**ufooq_lane_untied:
+** mov v0.16b, v1.16b
+** bfdot v0.4s, (v2.8h, v3.2h\[1\]|v3.8h, v2.2h\[1\])
+** ret
+*/
+float32x4_t ufooq_lane_untied(float32x4_t unused, float32x4_t r, bfloat16x8_t x, bfloat16x4_t y)
+{
+ return vbfdotq_lane_f32 (r, x, y, 1);
+}
+
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-compile-2.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-compile-2.c
new file mode 100644
index 0000000000000000000000000000000000000000..a4da60a0a721c6ea819e28cb8f178c317eb54de1
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-compile-2.c
@@ -0,0 +1,91 @@
+/* { dg-do assemble { target { aarch64*-*-* } } } */
+/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
+/* { dg-add-options arm_v8_2a_bf16_neon } */
+/* { dg-additional-options "-O -mbig-endian --save-temps" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+/* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */
+
+#include <arm_neon.h>
+
+/*
+**ufoo:
+** bfdot v0.2s, (v1.4h, v2.4h|v2.4h, v1.4h)
+** ret
+*/
+float32x2_t ufoo(float32x2_t r, bfloat16x4_t x, bfloat16x4_t y)
+{
+ return vbfdot_f32 (r, x, y);
+}
+
+/*
+**ufooq:
+** bfdot v0.4s, (v1.8h, v2.8h|v2.8h, v1.8h)
+** ret
+*/
+float32x4_t ufooq(float32x4_t r, bfloat16x8_t x, bfloat16x8_t y)
+{
+ return vbfdotq_f32 (r, x, y);
+}
+
+/*
+**ufoo_lane:
+** bfdot v0.2s, (v1.4h, v2.2h\[0\]|v2.4h, v1.2h\[0\])
+** ret
+*/
+float32x2_t ufoo_lane(float32x2_t r, bfloat16x4_t x, bfloat16x4_t y)
+{
+ return vbfdot_lane_f32 (r, x, y, 0);
+}
+
+/*
+**ufooq_laneq:
+** bfdot v0.4s, (v1.8h, v2.2h\[2\]|v2.8h, v1.2h\[2\])
+** ret
+*/
+float32x4_t ufooq_laneq(float32x4_t r, bfloat16x8_t x, bfloat16x8_t y)
+{
+ return vbfdotq_laneq_f32 (r, x, y, 2);
+}
+
+/*
+**ufoo_laneq:
+** bfdot v0.2s, (v1.4h, v2.2h\[3\]|v2.4h, v1.2h\[3\])
+** ret
+*/
+float32x2_t ufoo_laneq(float32x2_t r, bfloat16x4_t x, bfloat16x8_t y)
+{
+ return vbfdot_laneq_f32 (r, x, y, 3);
+}
+
+/*
+**ufooq_lane:
+** bfdot v0.4s, (v1.8h, v2.2h\[1\]|v2.8h, v1.2h\[1\])
+** ret
+*/
+float32x4_t ufooq_lane(float32x4_t r, bfloat16x8_t x, bfloat16x4_t y)
+{
+ return vbfdotq_lane_f32 (r, x, y, 1);
+}
+
+/*
+**ufoo_untied:
+** mov v0.8b, v1.8b
+** bfdot v0.2s, (v2.4h, v3.4h|v3.4h, v2.4h)
+** ret
+*/
+float32x2_t ufoo_untied(float32x4_t unused, float32x2_t r, bfloat16x4_t x, bfloat16x4_t y)
+{
+ return vbfdot_f32 (r, x, y);
+}
+
+/*
+**ufooq_lane_untied:
+** mov v0.16b, v1.16b
+** bfdot v0.4s, (v2.8h, v3.2h\[1\]|v3.8h, v2.2h\[1\])
+** ret
+*/
+float32x4_t ufooq_lane_untied(float32x4_t unused, float32x4_t r, bfloat16x8_t x, bfloat16x4_t y)
+{
+ return vbfdotq_lane_f32 (r, x, y, 1);
+}
+
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-compile-3.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-compile-3.c
new file mode 100644
index 0000000000000000000000000000000000000000..607126203b00213d94471a1adefe16f265104af8
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-compile-3.c
@@ -0,0 +1,28 @@
+/* { dg-do assemble { target { aarch64*-*-* } } } */
+/* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */
+/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
+/* { dg-add-options arm_v8_2a_bf16_neon } */
+/* { dg-additional-options "--save-temps" } */
+
+#include <arm_neon.h>
+
+float32x2_t ufoo_lane(float32x2_t r, bfloat16x4_t x, bfloat16x4_t y)
+{
+ return vbfdot_lane_f32 (r, x, y, -1); /* { dg-error {lane -1 out of range 0 - 1} "" { target *-*-* } 0 } */
+}
+
+float32x4_t ufooq_laneq(float32x4_t r, bfloat16x8_t x, bfloat16x8_t y)
+{
+ return vbfdotq_laneq_f32 (r, x, y, -1); /* { dg-error {lane -1 out of range 0 - 3} "" { target *-*-* } 0 } */
+}
+
+float32x2_t ufoo_laneq(float32x2_t r, bfloat16x4_t x, bfloat16x8_t y)
+{
+ return vbfdot_laneq_f32 (r, x, y, 4); /* { dg-error {lane 4 out of range 0 - 3} "" { target *-*-* } 0 } */
+}
+
+float32x4_t ufooq_lane(float32x4_t r, bfloat16x8_t x, bfloat16x4_t y)
+{
+ return vbfdotq_lane_f32 (r, x, y, 2); /* { dg-error {lane 2 out of range 0 - 1} "" { target *-*-* } 0 } */
+}
+