This patch adds the ARMv8.6 ACLE intrinsics for vmmla, vfmab and vfmat as part of the BFloat16 extension. (https://developer.arm.com/docs/101028/latest.) The intrinsics are declared in arm_neon.h and the RTL patterns are defined in neon.md. Two new tests are added to check assembler output and lane indices.
This patch depends on the Arm back-end patche. (https://gcc.gnu.org/ml/gcc-patches/2019-12/msg01448.html) Tested for regression on arm-none-eabi and armeb-none-eabi. I don't have commit rights, so if this is ok can someone please commit it for me? gcc/ChangeLog: 2019-11-12 Delia Burduv <delia.bur...@arm.com> * config/arm/arm_neon.h (vbfmmlaq_f32): New. (vbfmlalbq_f32): New. (vbfmlaltq_f32): New. (vbfmlalbq_lane_f32): New. (vbfmlaltq_lane_f32): New. (vbfmlalbq_laneq_f32): New. (vbfmlaltq_laneq_f32): New. * config/arm/arm_neon_builtins.def (vbfmmla): New. (vbfmab): New. (vbfmat): New. (vbfmab_lane): New. (vbfmat_lane): New. (vbfmab_laneq): New. (vbfmat_laneq): New. * config/arm/iterators.md (BF_MA): New int iterator. (bt): New int attribute. (VQXBF): Copy of VQX with V8BF. (V_HALF): Added V8BF. * config/arm/neon.md (neon_vbfmmlav8hi): New insn. (neon_vbfma<bt>v8hi): New insn. (neon_vbfma<bt>_lanev8hi): New insn. (neon_vbfma<bt>_laneqv8hi): New expand. (neon_vget_high<mode>): Changed iterator to VQXBF. * config/arm/unspecs.md (UNSPEC_BFMMLA): New UNSPEC. (UNSPEC_BFMAB): New UNSPEC. (UNSPEC_BFMAT): New UNSPEC. 2019-11-12 Delia Burduv <delia.bur...@arm.com> * gcc.target/arm/simd/bf16_ma_1.c: New test. * gcc.target/arm/simd/bf16_ma_2.c: New test. * gcc.target/arm/simd/bf16_mmla_1.c: New test.
diff --git a/gcc/config/arm/arm_neon.h b/gcc/config/arm/arm_neon.h index 71e7568e4315a9354062dee5442ca4af9d9660a9..097d7bb30ad0109ca2f41885206b1cfb2ce962dc 100644 --- a/gcc/config/arm/arm_neon.h +++ b/gcc/config/arm/arm_neon.h @@ -91,6 +91,60 @@ typedef float float32_t; #ifdef __ARM_FEATURE_BF16_VECTOR_ARITHMETIC typedef __simd128_bfloat16_t bfloat16x8_t; typedef __simd64_bfloat16_t bfloat16x4_t; + +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vbfmmlaq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b) +{ + return __builtin_neon_vbfmmlav8bf (__r, __a, __b); +} + +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vbfmlalbq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b) +{ + return __builtin_neon_vbfmabv8bf (__r, __a, __b); +} + +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vbfmlaltq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b) +{ + return __builtin_neon_vbfmatv8bf (__r, __a, __b); +} + +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vbfmlalbq_lane_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x4_t __b, + const int __index) +{ + return __builtin_neon_vbfmab_lanev8bf (__r, __a, __b, __index); +} + +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vbfmlaltq_lane_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x4_t __b, + const int __index) +{ + return __builtin_neon_vbfmat_lanev8bf (__r, __a, __b, __index); +} + +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vbfmlalbq_laneq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b, + const int __index) +{ + return __builtin_neon_vbfmab_laneqv8bf (__r, __a, __b, __index); +} + +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vbfmlaltq_laneq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b, + const int __index) +{ + return __builtin_neon_vbfmat_laneqv8bf (__r, __a, __b, __index); +} + #endif #pragma GCC pop_options #pragma GCC pop_options diff --git a/gcc/config/arm/arm_neon_builtins.def b/gcc/config/arm/arm_neon_builtins.def index bcccf93f7fa2750e9006e5856efecbec0fb331b9..169781fa9a07930eb755165019427be055dc36ef 100644 --- a/gcc/config/arm/arm_neon_builtins.def +++ b/gcc/config/arm/arm_neon_builtins.def @@ -373,3 +373,12 @@ VAR2 (MAC_LANE_PAIR, vcmlaq_lane0, v4sf, v8hf) VAR2 (MAC_LANE_PAIR, vcmlaq_lane90, v4sf, v8hf) VAR2 (MAC_LANE_PAIR, vcmlaq_lane180, v4sf, v8hf) VAR2 (MAC_LANE_PAIR, vcmlaq_lane270, v4sf, v8hf) + +VAR1 (TERNOP, vbfmmla, v8bf) + +VAR1 (TERNOP, vbfmab, v8bf) +VAR1 (TERNOP, vbfmat, v8bf) +VAR1 (MAC_LANE, vbfmab_lane, v8bf) +VAR1 (MAC_LANE, vbfmat_lane, v8bf) +VAR1 (MAC_LANE, vbfmab_laneq, v8bf) +VAR1 (MAC_LANE, vbfmat_laneq, v8bf) diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md index 439021fa0733ac31706287c4f98d62b080afc3a1..b31f54ffe8957d3dad0a7e3d3fedc48911e7b2c4 100644 --- a/gcc/config/arm/iterators.md +++ b/gcc/config/arm/iterators.md @@ -108,6 +108,9 @@ ;; Quad-width vector modes plus 64-bit elements. (define_mode_iterator VQX [V16QI V8HI V8HF V4SI V4SF V2DI]) +;; Quad-width vector modes plus 64-bit elements and V8BF. +(define_mode_iterator VQXBF [V16QI V8HI V8HF (V8BF "TARGET_BF16_SIMD") V4SI V4SF V2DI]) + ;; Quad-width vector modes without floating-point elements. (define_mode_iterator VQI [V16QI V8HI V4SI]) @@ -488,6 +491,8 @@ (define_int_iterator VCADD [UNSPEC_VCADD90 UNSPEC_VCADD270]) (define_int_iterator VCMLA [UNSPEC_VCMLA UNSPEC_VCMLA90 UNSPEC_VCMLA180 UNSPEC_VCMLA270]) +(define_int_iterator BF_MA [UNSPEC_BFMAB UNSPEC_BFMAT]) + ;;---------------------------------------------------------------------------- ;; Mode attributes ;;---------------------------------------------------------------------------- @@ -612,7 +617,8 @@ (define_mode_attr V_HALF [(V16QI "V8QI") (V8HI "V4HI") (V8HF "V4HF") (V4SI "V2SI") (V4SF "V2SF") (V2DF "DF") - (V2DI "DI") (V4HF "HF")]) + (V2DI "DI") (V4HF "HF") + (V8BF "V4BF")]) ;; Same, but lower-case. (define_mode_attr V_half [(V16QI "v8qi") (V8HI "v4hi") @@ -1174,4 +1180,7 @@ (define_int_attr opsuffix [(UNSPEC_DOT_S "s8") (UNSPEC_DOT_U "u8")]) +;; An iterator for VFMA<bt> +(define_int_attr bt [(UNSPEC_BFMAB "b") (UNSPEC_BFMAT "t")]) + (define_int_attr smlaw_op [(UNSPEC_SMLAWB "smlawb") (UNSPEC_SMLAWT "smlawt")]) diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md index b724aab65f720bf0e48bb828f0874426effd235c..42763de178a96422f9df7f4500e4328adfa81d27 100644 --- a/gcc/config/arm/neon.md +++ b/gcc/config/arm/neon.md @@ -3879,7 +3879,7 @@ if (BYTES_BIG_ENDIAN) (define_expand "neon_vget_high<mode>" [(match_operand:<V_HALF> 0 "s_register_operand") - (match_operand:VQX 1 "s_register_operand")] + (match_operand:VQXBF 1 "s_register_operand")] "TARGET_NEON" { emit_move_insn (operands[0], @@ -6556,3 +6556,62 @@ if (BYTES_BIG_ENDIAN) "vabd.<V_if_elem> %<V_reg>0, %<V_reg>1, %<V_reg>2" [(set_attr "type" "neon_fp_abd_s<q>")] ) + +(define_insn "neon_vbfmmlav8bf" + [(set (match_operand:V4SF 0 "register_operand" "=w") + (plus:V4SF (match_operand:V4SF 1 "register_operand" "0") + (unspec:V4SF [(match_operand:V8BF 2 "register_operand" "w") + (match_operand:V8BF 3 "register_operand" "w")] + UNSPEC_BFMMLA)))] + "TARGET_BF16_SIMD" + "vmmla.bf16\\t%q0, %q2, %q3" + [(set_attr "type" "neon_mla_s_q")] +) + +(define_insn "neon_vbfma<bt>v8bf" + [(set (match_operand:V4SF 0 "register_operand" "=w") + (plus: V4SF (match_operand:V4SF 1 "register_operand" "0") + (unspec:V4SF [(match_operand:V8BF 2 "register_operand" "w") + (match_operand:V8BF 3 "register_operand" "w")] + BF_MA)))] + "TARGET_BF16_SIMD" + "vfma<bt>.bf16\\t%q0, %q2, %q3" + [(set_attr "type" "neon_fp_mla_s")] +) + +(define_insn "neon_vbfma<bt>_lanev8bf" + [(set (match_operand:V4SF 0 "register_operand" "=w") + (plus: V4SF (match_operand:V4SF 1 "register_operand" "0") + (unspec:V4SF [(match_operand:V8BF 2 "register_operand" "w") + (match_operand:V4BF 3 "register_operand" "x") + (match_operand:SI 4 "const_int_operand" "n")] + BF_MA)))] + "TARGET_BF16_SIMD" + "vfma<bt>.bf16\\t%q0, %q2, %P3[%c4]" + [(set_attr "type" "neon_fp_mla_s")] +) + +(define_expand "neon_vbfma<bt>_laneqv8bf" + [(set (match_operand:V4SF 0 "register_operand" "=w") + (plus: V4SF (match_operand:V4SF 1 "register_operand" "0") + (unspec:V4SF [(match_operand:V8BF 2 "register_operand" "w") + (match_operand:V8BF 3 "register_operand" "x") + (match_operand:SI 4 "const_int_operand" "n")] + BF_MA)))] + "TARGET_BF16_SIMD" + { + int lane = INTVAL (operands[4]); + gcc_assert (lane >=0 && lane <=7); + if (lane < 4) + emit_insn (gen_neon_vbfma<bt>_lanev8bf (operands[0], operands[1], operands[2], operands[3], operands[4])); + else + { + rtx op_highpart = gen_reg_rtx (V4BFmode); + emit_insn (gen_neon_vget_highv8bf (op_highpart, operands[3])); + operands[4] = GEN_INT (lane - 4); + emit_insn (gen_neon_vbfma<bt>_lanev8bf (operands[0], operands[1], operands[2], op_highpart, operands[4])); + } + DONE; + } + [(set_attr "type" "neon_fp_mla_s")] +) diff --git a/gcc/config/arm/unspecs.md b/gcc/config/arm/unspecs.md index b4196b0e5cd939c3ee5e3f9bd19622fcc963adae..f452082b4bdb3a22a8e3b62113bb7f9470279e93 100644 --- a/gcc/config/arm/unspecs.md +++ b/gcc/config/arm/unspecs.md @@ -493,4 +493,7 @@ UNSPEC_VCMLA90 UNSPEC_VCMLA180 UNSPEC_VCMLA270 + UNSPEC_BFMMLA + UNSPEC_BFMAB + UNSPEC_BFMAT ]) diff --git a/gcc/testsuite/gcc.target/arm/simd/bf16_ma_1.c b/gcc/testsuite/gcc.target/arm/simd/bf16_ma_1.c new file mode 100644 index 0000000000000000000000000000000000000000..ead3e9d569f45f5507985e5d7cb12e0541349dd1 --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/simd/bf16_ma_1.c @@ -0,0 +1,84 @@ +/* { dg-do assemble } */ +/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */ +/* { dg-add-options arm_v8_2a_bf16_neon } */ +/* { dg-additional-options "-save-temps" } */ +/* { dg-final { check-function-bodies "**" "" } } */ + +#include "arm_neon.h" + +/* +**test_vbfmlalbq_f32: +** ... +** vfmab.bf16\tq[0-9]+, q[0-9]+, q[0-9]+ +** ... +*/ +float32x4_t +test_vbfmlalbq_f32 (float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) +{ + return vbfmlalbq_f32 (r, a, b); +} + +/* +**test_vbfmlaltq_f32: +** ... +** vfmat.bf16\tq[0-9]+, q[0-9]+, q[0-9]+ +** ... +*/ +float32x4_t +test_vbfmlaltq_f32 (float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) +{ + return vbfmlaltq_f32 (r, a, b); +} + +/* +**test_vbfmlalbq_lane_f32: +** ... +** vfmab.bf16\tq[0-9]+, q[0-9]+, d[0-9]+\[0\] +** ... +*/ +float32x4_t +test_vbfmlalbq_lane_f32 (float32x4_t r, bfloat16x8_t a, bfloat16x4_t b) +{ + return vbfmlalbq_lane_f32 (r, a, b, 0); +} + +/* +**test_vbfmlaltq_lane_f32: +** ... +** vfmat.bf16\tq[0-9]+, q[0-9]+, d[0-9]+\[2\] +** ... +*/ +float32x4_t +test_vbfmlaltq_lane_f32 (float32x4_t r, bfloat16x8_t a, bfloat16x4_t b) +{ + return vbfmlaltq_lane_f32 (r, a, b, 2); +} + +/* +**test_vbfmlalbq_laneq_f32: +** ... +** vfmab.bf16\tq[0-9]+, q[0-9]+, d[0-9]+\[1\] +** ... +*/ +float32x4_t +test_vbfmlalbq_laneq_f32 (float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) +{ + return vbfmlalbq_laneq_f32 (r, a, b, 5); +} + +/* +**test_vbfmlaltq_laneq_f32: +** ... +** vfmat.bf16\tq[0-9]+, q[0-9]+, d[0-9]+\[3\] +** ... +*/ +float32x4_t +test_vbfmlaltq_laneq_f32 (float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) +{ + return vbfmlaltq_laneq_f32 (r, a, b, 7); +} + +int main() +{ + return 0; +} diff --git a/gcc/testsuite/gcc.target/arm/simd/bf16_ma_2.c b/gcc/testsuite/gcc.target/arm/simd/bf16_ma_2.c new file mode 100644 index 0000000000000000000000000000000000000000..226ed7e1d8e4747d73b0518c809aaf0e3c5bc78d --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/simd/bf16_ma_2.c @@ -0,0 +1,31 @@ +/* { dg-do compile { target { arm*-*-* } } } */ +/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */ +/* { dg-add-options arm_v8_2a_bf16_neon } */ + +#include "arm_neon.h" + +/* Test lane index limits for vbfmlalbq_lane_f32 */ +float32x4_t +test_vbfmlalbq_lane_f32_low (float32x4_t r, bfloat16x8_t a, bfloat16x4_t b) +{ + return __builtin_neon_vbfmab_lanev8bf (r, a, b, -1); /* { dg-error {lane -1 out of range 0 - 3} } */ +} + +float32x4_t +test_vbfmlalbq_lane_f32_high (float32x4_t r, bfloat16x8_t a, bfloat16x4_t b) +{ + return __builtin_neon_vbfmab_lanev8bf (r, a, b, 4); /* { dg-error {lane 4 out of range 0 - 3} } */ +} + +/* Test lane index limits for vbfmlaltq_lane_f32 */ +float32x4_t +test_vbfmlaltq_lane_f32_low (float32x4_t r, bfloat16x8_t a, bfloat16x4_t b) +{ + return __builtin_neon_vbfmat_lanev8bf (r, a, b, -1); /* { dg-error {lane -1 out of range 0 - 3} } */ +} + +float32x4_t +test_vbfmlaltq_lane_f32_high (float32x4_t r, bfloat16x8_t a, bfloat16x4_t b) +{ + return __builtin_neon_vbfmat_lanev8bf (r, a, b, 4); /* { dg-error {lane 4 out of range 0 - 3} } */ +} diff --git a/gcc/testsuite/gcc.target/arm/simd/bf16_mmla_1.c b/gcc/testsuite/gcc.target/arm/simd/bf16_mmla_1.c new file mode 100644 index 0000000000000000000000000000000000000000..0c7422b78c385850eaa53492af0da8826e8b3b4a --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/simd/bf16_mmla_1.c @@ -0,0 +1,24 @@ +/* { dg-do assemble } */ +/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */ +/* { dg-add-options arm_v8_2a_bf16_neon } */ +/* { dg-additional-options "-save-temps" } */ +/* { dg-final { check-function-bodies "**" "" } } */ + +#include "arm_neon.h" + +/* +**test_vbfmmlaq_f32: +** ... +** vmmla.bf16\tq[0-9]+, q[0-9]+, q[0-9]+ +** ... +*/ +float32x4_t +test_vbfmmlaq_f32 (float32x4_t r, bfloat16x8_t x, bfloat16x8_t y) +{ + return vbfmmlaq_f32 (r, x, y); +} + +int main() +{ + return 0; +}