Re: [GCC][PATCH][AArch64]Add ACLE intrinsics for bfdot for ARMv8.6 Extension

Richard Sandiford Thu, 09 Jan 2020 07:54:26 -0800

Please update the names of the testsuite files to match the ones
in the bfloat16_t patch.  (Same for the usdot/sudot patch -- sorry
for forgetting there.)


OK with that change, thanks.

Richard

Stam Markianos-Wright <stam.markianos-wri...@arm.com> writes:
> On 12/30/19 10:29 AM, Richard Sandiford wrote:
>> Stam Markianos-Wright <stam.markianos-wri...@arm.com> writes:
>>> diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
>>> index adfda96f077075ad53d4bea2919c4d3b326e49f5..7587bc46ba1c80389ea49fa83a0e6f8a489711e9 100644
>>> --- a/gcc/config/aarch64/aarch64-simd.md
>>> +++ b/gcc/config/aarch64/aarch64-simd.md
>>> @@ -7028,3 +7028,36 @@
>>>     "xtn\t%0.<Vntype>, %1.<Vtype>"
>>>     [(set_attr "type" "neon_shift_imm_narrow_q")]
>>>   )
>>> +
>>> +(define_insn "aarch64_bfdot<mode>"
>>> +  [(set (match_operand:VDQSF 0 "register_operand" "=w")
>>> +   (plus:VDQSF
>>> +     (unspec:VDQSF
>>> +      [(match_operand:<VBFMLA_W> 2 "register_operand" "w")
>>> +       (match_operand:<VBFMLA_W> 3 "register_operand" "w")]
>>> +       UNSPEC_BFDOT)
>>> +     (match_operand:VDQSF 1 "register_operand" "0")))]
>>> +  "TARGET_BF16_SIMD"
>>> +  "bfdot\t%0.<Vtype>, %2.<Vbfdottype>, %3.<Vbfdottype>"
>>> +  [(set_attr "type" "neon_dot<q>")]
>>> +)
>>> +
>>> +
>>> +(define_insn "aarch64_bfdot_lane<VBF:isquadop><VDQSF:mode>"
>> 
>> Too many blank lines.
>
> Fixed, sorry I hadn't noticed!
>
>> 
>>> +  [(set (match_operand:VDQSF 0 "register_operand" "=w")
>>> +   (plus:VDQSF
>>> +     (unspec:VDQSF
>>> +      [(match_operand:<VDQSF:VBFMLA_W> 2 "register_operand" "w")
>>> +       (match_operand:VBF 3 "register_operand" "w")
>>> +       (match_operand:SI 4 "const_int_operand" "n")]
>>> +       UNSPEC_BFDOT)
>>> +     (match_operand:VDQSF 1 "register_operand" "0")))]
>>> +  "TARGET_BF16_SIMD"
>>> +{
>>> +  int nunits = GET_MODE_NUNITS (<VBF:MODE>mode).to_constant ();
>>> +  int lane = INTVAL (operands[4]);
>>> +  operands[4] = gen_int_mode (ENDIAN_LANE_N (nunits / 2, lane), SImode);
>>> +  return "bfdot\t%0.<VDQSF:Vtype>, %2.<VDQSF:Vbfdottype>, %3.2h[%4]";
>>> +}
>>> +  [(set_attr "type" "neon_dot<VDQSF:q>")]
>>> +)
>>> [...]
>>> diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-compile-1.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-compile-1.c
>>> new file mode 100644
>>> index 0000000000000000000000000000000000000000..c575dcd3901172a52fa9403c9179d58eea44eb72
>>> --- /dev/null
>>> +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-compile-1.c
>>> @@ -0,0 +1,91 @@
>>> +/* { dg-do assemble { target { aarch64*-*-* } } } */
>>> +/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
>>> +/* { dg-add-options arm_v8_2a_bf16_neon }  */
>>> +/* { dg-additional-options "-O -save-temps" } */
>>> +/* { dg-final { check-function-bodies "**" "" } } */
>>> +/* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */
>> 
>> Same comment as for USDOT/SUDOT regarding the dg- markup.
>
> Done!
>> 
>>> +
>>> +#include <arm_neon.h>
>>> +
>>> +/*
>>> +**ufoo:
>>> +** bfdot   v0.2s, (v1.4h, v2.4h|v2.4h, v1.4h)
>>> +** ret
>>> +*/
>>> +float32x2_t ufoo(float32x2_t r, bfloat16x4_t x, bfloat16x4_t y)
>>> +{
>>> +  return vbfdot_f32 (r, x, y);
>>> +}
>>> +
>>> +/*
>>> +**ufooq:
>>> +** bfdot   v0.4s, (v1.8h, v2.8h|v2.8h, v1.8h)
>>> +** ret
>>> +*/
>>> +float32x4_t ufooq(float32x4_t r, bfloat16x8_t x, bfloat16x8_t y)
>>> +{
>>> +  return vbfdotq_f32 (r, x, y);
>>> +}
>> 
>> The (...|...)s here are correct.
> Yep.
>> 
>>> +
>>> +/*
>>> +**ufoo_lane:
>>> +** bfdot   v0.2s, (v1.4h, v2.2h\[0\]|v2.4h, v1.2h\[0\])
>>> +** ret
>>> +*/
>>> +float32x2_t ufoo_lane(float32x2_t r, bfloat16x4_t x, bfloat16x4_t y)
>>> +{
>>> +  return vbfdot_lane_f32 (r, x, y, 0);
>>> +}
>>> +
>>> +/*
>>> +**ufooq_laneq:
>>> +** bfdot   v0.4s, (v1.8h, v2.2h\[2\]|v2.8h, v1.2h\[2\])
>>> +** ret
>>> +*/
>>> +float32x4_t ufooq_laneq(float32x4_t r, bfloat16x8_t x, bfloat16x8_t y)
>>> +{
>>> +  return vbfdotq_laneq_f32 (r, x, y, 2);
>>> +}
>>> +
>>> +/*
>>> +**ufoo_laneq:
>>> +** bfdot   v0.2s, (v1.4h, v2.2h\[3\]|v2.4h, v1.2h\[3\])
>>> +** ret
>>> +*/
>>> +float32x2_t ufoo_laneq(float32x2_t r, bfloat16x4_t x, bfloat16x8_t y)
>>> +{
>>> +  return vbfdot_laneq_f32 (r, x, y, 3);
>>> +}
>>> +
>>> +/*
>>> +**ufooq_lane:
>>> +** bfdot   v0.4s, (v1.8h, v2.2h\[1\]|v2.8h, v1.2h\[1\])
>>> +** ret
>>> +*/
>>> +float32x4_t ufooq_lane(float32x4_t r, bfloat16x8_t x, bfloat16x4_t y)
>>> +{
>>> +  return vbfdotq_lane_f32 (r, x, y, 1);
>>> +}
>> 
>> But these aren't, since the operands must be in the order given.
> Yep.
>> 
>>> +
>>> +/*
>>> +**ufoo_untied:
>>> +** mov     v0.8b, v1.8b
>>> +** bfdot   v0.2s, (v2.4h, v3.4h|v3.4h, v2.4h)
>>> +** ret
>>> +*/
>>> +float32x2_t ufoo_untied(float32x4_t unused, float32x2_t r, bfloat16x4_t x, bfloat16x4_t y)
>>> +{
>>> +  return vbfdot_f32 (r, x, y);
>>> +}
>> 
>> Similarly, OK here.
> Yep.
>> 
>>> +
>>> +/*
>>> +**ufooq_lane_untied:
>>> +** mov     v0.16b, v1.16b
>>> +** bfdot   v0.4s, (v2.8h, v3.2h\[1\]|v3.8h, v2.2h\[1\])
>>> +** ret
>>> +*/
>>> +float32x4_t ufooq_lane_untied(float32x4_t unused, float32x4_t r, bfloat16x8_t x, bfloat16x4_t y)
>>> +{
>>> +  return vbfdotq_lane_f32 (r, x, y, 1);
>>> +}
>> 
>> ...but not here.
> Yep.
>> 
>> Same comments for the big-endian test.
> Done.
>
> Thank you so much for the in depth review comments!
>
> Cheers,
> Stam
>> 
>> Thanks,
>> Richard
>> 
>
> diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
> index 57fc5933b43bfc0da132342c681b8a2c14549c9c..41ccda8a5d77b8ec3cfd984f3c5fc02369e7199f 100644
> --- a/gcc/config/aarch64/aarch64-simd-builtins.def
> +++ b/gcc/config/aarch64/aarch64-simd-builtins.def
> @@ -682,3 +682,8 @@
>    BUILTIN_VSFDF (UNOP, frint32x, 0)
>    BUILTIN_VSFDF (UNOP, frint64z, 0)
>    BUILTIN_VSFDF (UNOP, frint64x, 0)
> +
> +  /* Implemented by aarch64_bfdot{_lane}{q}<mode>.  */
> +  VAR2 (TERNOP, bfdot, 0, v2sf, v4sf)
> +  VAR2 (QUADOP_LANE_PAIR, bfdot_lane, 0, v2sf, v4sf)
> +  VAR2 (QUADOP_LANE_PAIR, bfdot_laneq, 0, v2sf, v4sf)
> diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
> index cea9592695ac8bd2f4e625f8b769ddaf716e9091..a95489dc17ac38be8e85457ad1804387f1772dc3 100644
> --- a/gcc/config/aarch64/aarch64-simd.md
> +++ b/gcc/config/aarch64/aarch64-simd.md
> @@ -7025,3 +7025,35 @@
>    "xtn\t%0.<Vntype>, %1.<Vtype>"
>    [(set_attr "type" "neon_shift_imm_narrow_q")]
>  )
> +
> +(define_insn "aarch64_bfdot<mode>"
> +  [(set (match_operand:VDQSF 0 "register_operand" "=w")
> +     (plus:VDQSF
> +       (unspec:VDQSF
> +        [(match_operand:<VBFMLA_W> 2 "register_operand" "w")
> +         (match_operand:<VBFMLA_W> 3 "register_operand" "w")]
> +         UNSPEC_BFDOT)
> +       (match_operand:VDQSF 1 "register_operand" "0")))]
> +  "TARGET_BF16_SIMD"
> +  "bfdot\t%0.<Vtype>, %2.<Vbfdottype>, %3.<Vbfdottype>"
> +  [(set_attr "type" "neon_dot<q>")]
> +)
> +
> +(define_insn "aarch64_bfdot_lane<VBF:isquadop><VDQSF:mode>"
> +  [(set (match_operand:VDQSF 0 "register_operand" "=w")
> +     (plus:VDQSF
> +       (unspec:VDQSF
> +        [(match_operand:<VDQSF:VBFMLA_W> 2 "register_operand" "w")
> +         (match_operand:VBF 3 "register_operand" "w")
> +         (match_operand:SI 4 "const_int_operand" "n")]
> +         UNSPEC_BFDOT)
> +       (match_operand:VDQSF 1 "register_operand" "0")))]
> +  "TARGET_BF16_SIMD"
> +{
> +  int nunits = GET_MODE_NUNITS (<VBF:MODE>mode).to_constant ();
> +  int lane = INTVAL (operands[4]);
> +  operands[4] = gen_int_mode (ENDIAN_LANE_N (nunits / 2, lane), SImode);
> +  return "bfdot\t%0.<VDQSF:Vtype>, %2.<VDQSF:Vbfdottype>, %3.2h[%4]";
> +}
> +  [(set_attr "type" "neon_dot<VDQSF:q>")]
> +)
> diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
> index eaba156e26cf35b07b96972fe2741a9c00d6caa9..1a8b27956d4ca25e0ed6f3c38030b3eba0546c4f 100644
> --- a/gcc/config/aarch64/arm_neon.h
> +++ b/gcc/config/aarch64/arm_neon.h
> @@ -34611,6 +34611,57 @@ vrnd64xq_f64 (float64x2_t __a)
>  
>  #include "arm_bf16.h"
>  
> +#pragma GCC push_options
> +#pragma GCC target ("arch=armv8.2-a+bf16")
> +
> +__extension__ extern __inline float32x2_t
> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> +vbfdot_f32 (float32x2_t __r, bfloat16x4_t __a, bfloat16x4_t __b)
> +{
> +  return __builtin_aarch64_bfdotv2sf (__r, __a, __b);
> +}
> +
> +__extension__ extern __inline float32x4_t
> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> +vbfdotq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b)
> +{
> +  return __builtin_aarch64_bfdotv4sf (__r, __a, __b);
> +}
> +
> +__extension__ extern __inline float32x2_t
> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> +vbfdot_lane_f32 (float32x2_t __r, bfloat16x4_t __a, bfloat16x4_t __b,
> +              const int __index)
> +{
> +  return __builtin_aarch64_bfdot_lanev2sf (__r, __a, __b, __index);
> +}
> +
> +__extension__ extern __inline float32x4_t
> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> +vbfdotq_lane_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x4_t __b,
> +               const int __index)
> +{
> +  return __builtin_aarch64_bfdot_lanev4sf (__r, __a, __b, __index);
> +}
> +
> +__extension__ extern __inline float32x2_t
> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> +vbfdot_laneq_f32 (float32x2_t __r, bfloat16x4_t __a, bfloat16x8_t __b,
> +               const int __index)
> +{
> +  return __builtin_aarch64_bfdot_laneqv2sf (__r, __a, __b, __index);
> +}
> +
> +__extension__ extern __inline float32x4_t
> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> +vbfdotq_laneq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b,
> +                const int __index)
> +{
> +  return __builtin_aarch64_bfdot_laneqv4sf (__r, __a, __b, __index);
> +}
> +
> +#pragma GCC pop_options
> +
>  #undef __aarch64_vget_lane_any
>  
>  #undef __aarch64_vdup_lane_any
> diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
> index 2d566ca1a5fad18b701f1954cff967342085874a..091d3a2fb6926f614d354052961d0913d41f71e9 100644
> --- a/gcc/config/aarch64/iterators.md
> +++ b/gcc/config/aarch64/iterators.md
> @@ -122,6 +122,9 @@
>  ;; Quad vector with only 2 element modes.
>  (define_mode_iterator VQ_2E [V2DI V2DF])
>  
> +;; BFmode vector modes.
> +(define_mode_iterator VBF [V4BF V8BF])
> +
>  ;; This mode iterator allows :P to be used for patterns that operate on
>  ;; addresses in different modes.  In LP64, only DI will match, while in
>  ;; ILP32, either can match.
> @@ -671,6 +674,7 @@
>      UNSPEC_UMULHS    ; Used in aarch64-sve2.md.
>      UNSPEC_UMULHRS   ; Used in aarch64-sve2.md.
>      UNSPEC_ASRD              ; Used in aarch64-sve.md.
> +    UNSPEC_BFDOT     ; Used in aarch64-simd.md.
>  ])
>  
>  ;; ------------------------------------------------------------------
> @@ -727,6 +731,8 @@
>  
>  (define_mode_attr FCVT_CHANGE_MODE [(SI "DF") (DI "SF")])
>  
> +(define_mode_attr isquadop [(V4BF "") (V8BF "q")])
> +
>  ;; For scalar usage of vector/FP registers
>  (define_mode_attr v [(QI "b") (HI "h") (SI "s") (DI "d")
>                   (HF  "h") (SF "s") (DF "d")
> @@ -1310,6 +1316,9 @@
>  ;; Register suffix for DOTPROD input types from the return type.
>  (define_mode_attr Vdottype [(V2SI "8b") (V4SI "16b")])
>  
> +;; Register suffix for BFDOT input types from the return type.
> +(define_mode_attr Vbfdottype [(V2SF "4h") (V4SF "8h")])
> +
>  ;; Sum of lengths of instructions needed to move vector registers of a mode.
>  (define_mode_attr insn_count [(OI "8") (CI "12") (XI "16")])
>  
> @@ -1320,6 +1329,9 @@
>  ;; Width of 2nd and 3rd arguments to fp16 vector multiply add/sub
>  (define_mode_attr VFMLA_W [(V2SF "V4HF") (V4SF "V8HF")])
>  
> +;; Width of 2nd and 3rd arguments to bf16 vector multiply add/sub
> +(define_mode_attr VBFMLA_W [(V2SF "V4BF") (V4SF "V8BF")])
> +
>  (define_mode_attr VFMLA_SEL_W [(V2SF "V2HF") (V4SF "V4HF")])
>  
>  (define_mode_attr f16quad [(V2SF "") (V4SF "q")])
> diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-compile-1.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-compile-1.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..ad51507731bbb165de64e583ebfbf8047b4eb781
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-compile-1.c
> @@ -0,0 +1,91 @@
> +/* { dg-do assemble { target { aarch64*-*-* } } } */
> +/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
> +/* { dg-add-options arm_v8_2a_bf16_neon }  */
> +/* { dg-additional-options "-save-temps" } */
> +/* { dg-final { check-function-bodies "**" "" {-O[^0]} } } */
> +/* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */
> +
> +#include <arm_neon.h>
> +
> +/*
> +**ufoo:
> +**   bfdot   v0.2s, (v1.4h, v2.4h|v2.4h, v1.4h)
> +**   ret
> +*/
> +float32x2_t ufoo(float32x2_t r, bfloat16x4_t x, bfloat16x4_t y)
> +{
> +  return vbfdot_f32 (r, x, y);
> +}
> +
> +/*
> +**ufooq:
> +**   bfdot   v0.4s, (v1.8h, v2.8h|v2.8h, v1.8h)
> +**   ret
> +*/
> +float32x4_t ufooq(float32x4_t r, bfloat16x8_t x, bfloat16x8_t y)
> +{
> +  return vbfdotq_f32 (r, x, y);
> +}
> +
> +/*
> +**ufoo_lane:
> +**   bfdot   v0.2s, v1.4h, v2.2h\[0\]
> +**   ret
> +*/
> +float32x2_t ufoo_lane(float32x2_t r, bfloat16x4_t x, bfloat16x4_t y)
> +{
> +  return vbfdot_lane_f32 (r, x, y, 0);
> +}
> +
> +/*
> +**ufooq_laneq:
> +**   bfdot   v0.4s, v1.8h, v2.2h\[2\]
> +**   ret
> +*/
> +float32x4_t ufooq_laneq(float32x4_t r, bfloat16x8_t x, bfloat16x8_t y)
> +{
> +  return vbfdotq_laneq_f32 (r, x, y, 2);
> +}
> +
> +/*
> +**ufoo_laneq:
> +**   bfdot   v0.2s, v1.4h, v2.2h\[3\]
> +**   ret
> +*/
> +float32x2_t ufoo_laneq(float32x2_t r, bfloat16x4_t x, bfloat16x8_t y)
> +{
> +  return vbfdot_laneq_f32 (r, x, y, 3);
> +}
> +
> +/*
> +**ufooq_lane:
> +**   bfdot   v0.4s, v1.8h, v2.2h\[1\]
> +**   ret
> +*/
> +float32x4_t ufooq_lane(float32x4_t r, bfloat16x8_t x, bfloat16x4_t y)
> +{
> +  return vbfdotq_lane_f32 (r, x, y, 1);
> +}
> +
> +/*
> +**ufoo_untied:
> +**   mov     v0.8b, v1.8b
> +**   bfdot   v0.2s, (v2.4h, v3.4h|v3.4h, v2.4h)
> +**   ret
> +*/
> +float32x2_t ufoo_untied(float32x4_t unused, float32x2_t r, bfloat16x4_t x, bfloat16x4_t y)
> +{
> +  return vbfdot_f32 (r, x, y);
> +}
> +
> +/*
> +**ufooq_lane_untied:
> +**   mov     v0.16b, v1.16b
> +**   bfdot   v0.4s, v2.8h, v3.2h\[1\]
> +**   ret
> +*/
> +float32x4_t ufooq_lane_untied(float32x4_t unused, float32x4_t r, bfloat16x8_t x, bfloat16x4_t y)
> +{
> +  return vbfdotq_lane_f32 (r, x, y, 1);
> +}
> +
> diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-compile-2.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-compile-2.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..58bdee5ac9df602b7569724200b3c9ab7c72bb28
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-compile-2.c
> @@ -0,0 +1,91 @@
> +/* { dg-do assemble { target { aarch64*-*-* } } } */
> +/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
> +/* { dg-add-options arm_v8_2a_bf16_neon }  */
> +/* { dg-additional-options "-mbig-endian --save-temps" } */
> +/* { dg-final { check-function-bodies "**" "" {-O[^0]} } } */
> +/* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */
> +
> +#include <arm_neon.h>
> +
> +/*
> +**ufoo:
> +**   bfdot   v0.2s, (v1.4h, v2.4h|v2.4h, v1.4h)
> +**   ret
> +*/
> +float32x2_t ufoo(float32x2_t r, bfloat16x4_t x, bfloat16x4_t y)
> +{
> +  return vbfdot_f32 (r, x, y);
> +}
> +
> +/*
> +**ufooq:
> +**   bfdot   v0.4s, (v1.8h, v2.8h|v2.8h, v1.8h)
> +**   ret
> +*/
> +float32x4_t ufooq(float32x4_t r, bfloat16x8_t x, bfloat16x8_t y)
> +{
> +  return vbfdotq_f32 (r, x, y);
> +}
> +
> +/*
> +**ufoo_lane:
> +**   bfdot   v0.2s, v1.4h, v2.2h\[0\]
> +**   ret
> +*/
> +float32x2_t ufoo_lane(float32x2_t r, bfloat16x4_t x, bfloat16x4_t y)
> +{
> +  return vbfdot_lane_f32 (r, x, y, 0);
> +}
> +
> +/*
> +**ufooq_laneq:
> +**   bfdot   v0.4s, v1.8h, v2.2h\[2\]
> +**   ret
> +*/
> +float32x4_t ufooq_laneq(float32x4_t r, bfloat16x8_t x, bfloat16x8_t y)
> +{
> +  return vbfdotq_laneq_f32 (r, x, y, 2);
> +}
> +
> +/*
> +**ufoo_laneq:
> +**   bfdot   v0.2s, v1.4h, v2.2h\[3\]
> +**   ret
> +*/
> +float32x2_t ufoo_laneq(float32x2_t r, bfloat16x4_t x, bfloat16x8_t y)
> +{
> +  return vbfdot_laneq_f32 (r, x, y, 3);
> +}
> +
> +/*
> +**ufooq_lane:
> +**   bfdot   v0.4s, v1.8h, v2.2h\[1\]
> +**   ret
> +*/
> +float32x4_t ufooq_lane(float32x4_t r, bfloat16x8_t x, bfloat16x4_t y)
> +{
> +  return vbfdotq_lane_f32 (r, x, y, 1);
> +}
> +
> +/*
> +**ufoo_untied:
> +**   mov     v0.8b, v1.8b
> +**   bfdot   v0.2s, (v2.4h, v3.4h|v3.4h, v2.4h)
> +**   ret
> +*/
> +float32x2_t ufoo_untied(float32x4_t unused, float32x2_t r, bfloat16x4_t x, bfloat16x4_t y)
> +{
> +  return vbfdot_f32 (r, x, y);
> +}
> +
> +/*
> +**ufooq_lane_untied:
> +**   mov     v0.16b, v1.16b
> +**   bfdot   v0.4s, v2.8h, v3.2h\[1\]
> +**   ret
> +*/
> +float32x4_t ufooq_lane_untied(float32x4_t unused, float32x4_t r, bfloat16x8_t x, bfloat16x4_t y)
> +{
> +  return vbfdotq_lane_f32 (r, x, y, 1);
> +}
> +
> diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-compile-3.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-compile-3.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..607126203b00213d94471a1adefe16f265104af8
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-compile-3.c
> @@ -0,0 +1,28 @@
> +/* { dg-do assemble { target { aarch64*-*-* } } } */
> +/* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */
> +/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
> +/* { dg-add-options arm_v8_2a_bf16_neon }  */
> +/* { dg-additional-options "--save-temps" } */
> +
> +#include <arm_neon.h>
> +
> +float32x2_t ufoo_lane(float32x2_t r, bfloat16x4_t x, bfloat16x4_t y)
> +{
> +  return vbfdot_lane_f32 (r, x, y, -1); /* { dg-error {lane -1 out of range 0 - 1} "" { target *-*-* } 0 } */
> +}
> +
> +float32x4_t ufooq_laneq(float32x4_t r, bfloat16x8_t x, bfloat16x8_t y)
> +{
> +  return vbfdotq_laneq_f32 (r, x, y, -1); /* { dg-error {lane -1 out of range 0 - 3} "" { target *-*-* } 0 } */
> +}
> +
> +float32x2_t ufoo_laneq(float32x2_t r, bfloat16x4_t x, bfloat16x8_t y)
> +{
> +  return vbfdot_laneq_f32 (r, x, y, 4); /* { dg-error {lane 4 out of range 0 - 3} "" { target *-*-* } 0 } */
> +}
> +
> +float32x4_t ufooq_lane(float32x4_t r, bfloat16x8_t x, bfloat16x4_t y)
> +{
> +  return vbfdotq_lane_f32 (r, x, y, 2); /* { dg-error {lane 2 out of range 0 - 1} "" { target *-*-* } 0 } */
> +}
> +

Re: [GCC][PATCH][AArch64]Add ACLE intrinsics for bfdot for ARMv8.6 Extension

Reply via email to