Stam Markianos-Wright <stam.markianos-wri...@arm.com> writes: > Hi all, > > This patch adds the ARMv8.6 Extension ACLE intrinsics for the bfloat bfdot > operation. > > The functions are declared in arm_neon.h with the armv8.2-a+bf16 target > option > as required. > > RTL patterns are defined to generate assembler. > > Tests added to verify expected assembly and perform adequate lane checks. > > This patch depends on: > > https://gcc.gnu.org/ml/gcc-patches/2019-12/msg00857.html > > for testuite effective_target update and on: > > https://gcc.gnu.org/ml/gcc-patches/2019-12/msg01323.html > https://gcc.gnu.org/ml/gcc-patches/2019-12/msg01324.html > > for back-end Bfloat enablement. > > Cheers, > Stam > > > gcc/ChangeLog: > > 2019-11-04 Stam Markianos-Wright <stam.markianos-wri...@arm.com> > > * config/aarch64/aarch64-simd-builtins.def (aarch64_bfdot, > aarch64_bfdot_lane, aarch64_bfdot_laneq): New. > * config/aarch64/aarch64-simd.md > (aarch64_bfdot, aarch64_bfdot_lane): New. > * config/aarch64/arm_neon.h (vbfdot_f32, vbfdotq_f32, vbfdot_lane_f32, > vbfdotq_lane_f32, vbfdot_laneq_f32, vbfdotq_laneq_f32): New. > * config/aarch64/iterators.md (UNSPEC_BFDOT, VBF, isquadop, Vbfdottype, > VBFMLA_W): New.
Changelog nit: the continuation lines should be indened by a tab only. > diff --git a/gcc/config/aarch64/aarch64-simd.md > b/gcc/config/aarch64/aarch64-simd.md > index > c4858ab7cffd786066646a5cd95a168311990b76..bdc26c190610580e57e9749804b7729ee4e34793 > 100644 > --- a/gcc/config/aarch64/aarch64-simd.md > +++ b/gcc/config/aarch64/aarch64-simd.md > @@ -7027,3 +7027,37 @@ > "xtn\t%0.<Vntype>, %1.<Vtype>" > [(set_attr "type" "neon_shift_imm_narrow_q")] > ) > + > +(define_insn "aarch64_bfdot<mode>" > + [(set (match_operand:VDQSF 0 "register_operand" "=w") > + (plus:VDQSF (match_operand:VDQSF 1 "register_operand" "0") > + (unspec:VDQSF [(match_operand:<VBFMLA_W> 2 > + "register_operand" "w") > + (match_operand:<VBFMLA_W> 3 > + "register_operand" "w")] > + UNSPEC_BFDOT)))] The operands to the plus should be the other way around, so that the more complicated operand comes first, > + "TARGET_BF16_SIMD" > + "bfdot\t%0.<Vtype>, %2.<Vbfdottype>, %3.<Vbfdottype>" > + [(set_attr "type" "neon_dot<q>")] > +) > + > + > +(define_insn "aarch64_bfdot_lane<VBF:isquadop><VDQSF:mode>" > + [(set (match_operand:VDQSF 0 "register_operand" "=w") > + (plus:VDQSF (match_operand:VDQSF 1 "register_operand" "0") > + (unspec:VDQSF [(match_operand:<VDQSF:VBFMLA_W> 2 > + "register_operand" "w") > + (match_operand: VBF 3 Nit: should be no space before "VBF". > + "register_operand" "w") > + (match_operand:SI 4 > + "const_int_operand" "n")] > + UNSPEC_BFDOT)))] > + "TARGET_BF16_SIMD" > +{ > + int nunits = GET_MODE_NUNITS (<VBF:MODE>mode).to_constant (); > + int lane = INTVAL (operands[4]); > + operands[4] = gen_int_mode (ENDIAN_LANE_N (nunits / 2, lane), SImode); Should only be one space after "=". > + return "bfdot\t%0.<VDQSF:Vtype>, %2.<VDQSF:Vbfdottype>, %3.2h[%4]"; > +} > + [(set_attr "type" "neon_dot<VDQSF:q>")] > +) > diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h > index > 5996df0a612caff3c881fc15b0aa12b8f91a193b..0357d97cc4143c3a9c56260d9a9cc24138afc049 > 100644 > --- a/gcc/config/aarch64/arm_neon.h > +++ b/gcc/config/aarch64/arm_neon.h > @@ -34612,6 +34612,57 @@ vrnd64xq_f64 (float64x2_t __a) > > #include "arm_bf16.h" > > +#pragma GCC push_options > +#pragma GCC target ("arch=armv8.2-a+bf16") > + > +__extension__ extern __inline float32x2_t > +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) > +vbfdot_f32 (float32x2_t __r, bfloat16x4_t __a, bfloat16x4_t __b) > +{ > + return __builtin_aarch64_bfdotv2sf (__r, __a, __b); > +} > + > +__extension__ extern __inline float32x4_t > +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) > +vbfdotq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b) > +{ > + return __builtin_aarch64_bfdotv4sf (__r, __a, __b); > +} > + > +__extension__ extern __inline float32x2_t > +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) > +vbfdot_lane_f32 \ > + (float32x2_t __r, bfloat16x4_t __a, bfloat16x4_t __b, const int > __index) Stray backslash (same comment as for the USDOT/SUDOT review just posted). > diff --git > a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-compile-1.c > b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-compile-1.c > new file mode 100644 > index > 0000000000000000000000000000000000000000..62ac715c2a9c4468eb7c143464390dbf1144d6d6 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-compile-1.c > @@ -0,0 +1,80 @@ > +/* { dg-do assemble { target { aarch64*-*-* } } } */ > +/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */ > +/* { dg-add-options arm_v8_2a_bf16_neon } */ > +/* { dg-additional-options "--save-temps" } */ > +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ > + > +#include <arm_neon.h> > + > +/* > +**ufoo: > +** ... > +** bfdot\tv[0-9]+.2s, v[0-9]+.4h, v[0-9]+.4h > +** ... > +** ret > +*/ > +float32x2_t ufoo(float32x2_t r, bfloat16x4_t x, bfloat16x4_t y) > +{ > + return vbfdot_f32 (r, x, y); > +} Same comments as for SUDOT and USDOT here too. Thanks, Richard