Here is the updated patch. The changes are minor, so let me know if there is anything else to fix or if it can be committed.
Thank you, Delia On 1/30/20 2:55 PM, Kyrill Tkachov wrote: > Hi Delia, > > > On 1/28/20 4:44 PM, Delia Burduv wrote: >> Ping. >> ------------------------------------------------------------------------ >> *From:* Delia Burduv <delia.bur...@arm.com> >> *Sent:* 22 January 2020 17:26 >> *To:* gcc-patches@gcc.gnu.org <gcc-patches@gcc.gnu.org> >> *Cc:* ni...@redhat.com <ni...@redhat.com>; Richard Earnshaw >> <richard.earns...@arm.com>; Ramana Radhakrishnan >> <ramana.radhakrish...@arm.com>; Kyrylo Tkachov <kyrylo.tkac...@arm.com> >> *Subject:* Re: [GCC][PATCH][AArch32] ACLE intrinsics bfloat16 vmmla >> and vfma<b/t> for AArch32 AdvSIMD >> Ping. >> >> I have read Richard Sandiford's comments on the AArch64 patches and I >> will apply what is relevant to this patch as well. Particularly, I will >> change the tests to use the exact input and output registers and I will >> change the types of the rtl patterns. > > > Please send the updated patches so that someone can commit them for you > once they're reviewed. > > Thanks, > > Kyrill > > >> >> On 12/20/19 6:44 PM, Delia Burduv wrote: >> > This patch adds the ARMv8.6 ACLE intrinsics for vmmla, vfmab and vfmat >> > as part of the BFloat16 extension. >> > (https://developer.arm.com/docs/101028/latest.) >> > The intrinsics are declared in arm_neon.h and the RTL patterns are >> > defined in neon.md. >> > Two new tests are added to check assembler output and lane indices. >> > >> > This patch depends on the Arm back-end patch. >> > (https://gcc.gnu.org/ml/gcc-patches/2019-12/msg01448.html) >> > >> > Tested for regression on arm-none-eabi and armeb-none-eabi. I don't >> have >> > commit rights, so if this is ok can someone please commit it for me? >> > >> > gcc/ChangeLog: >> > >> > 2019-11-12 Delia Burduv <delia.bur...@arm.com> >> > >> > * config/arm/arm_neon.h (vbfmmlaq_f32): New. >> > (vbfmlalbq_f32): New. >> > (vbfmlaltq_f32): New. >> > (vbfmlalbq_lane_f32): New. >> > (vbfmlaltq_lane_f32): New. >> > (vbfmlalbq_laneq_f32): New. 
>> > (vbfmlaltq_laneq_f32): New. >> > * config/arm/arm_neon_builtins.def (vbfmmla): New. >> > (vbfmab): New. >> > (vbfmat): New. >> > (vbfmab_lane): New. >> > (vbfmat_lane): New. >> > (vbfmab_laneq): New. >> > (vbfmat_laneq): New. >> > * config/arm/iterators.md (BF_MA): New int iterator. >> > (bt): New int attribute. >> > (VQXBF): Copy of VQX with V8BF. >> > (V_HALF): Added V8BF. >> > * config/arm/neon.md (neon_vbfmmlav8bf): New insn. >> > (neon_vbfma<bt>v8bf): New insn. >> > (neon_vbfma<bt>_lanev8bf): New insn. >> > (neon_vbfma<bt>_laneqv8bf): New expand. >> > (neon_vget_high<mode>): Changed iterator to VQXBF. >> > * config/arm/unspecs.md (UNSPEC_BFMMLA): New UNSPEC. >> > (UNSPEC_BFMAB): New UNSPEC. >> > (UNSPEC_BFMAT): New UNSPEC. >> > >> > gcc/testsuite/ChangeLog: >> > >> > 2019-11-12 Delia Burduv <delia.bur...@arm.com> >> > >> > * gcc.target/arm/simd/bf16_ma_1.c: New test. >> > * gcc.target/arm/simd/bf16_ma_2.c: New test. >> > * gcc.target/arm/simd/bf16_mmla_1.c: New test.
diff --git a/gcc/config/arm/arm_neon.h b/gcc/config/arm/arm_neon.h
index 3c78f435009ab027f92693d00ab5b40960d5419d..81f8008ea6a5fb11eb09f6685ba24bb0c54fb248 100644
--- a/gcc/config/arm/arm_neon.h
+++ b/gcc/config/arm/arm_neon.h
@@ -18742,6 +18742,66 @@ vcmlaq_rot270_laneq_f32 (float32x4_t __r, float32x4_t __a, float32x4_t __b,
   return __builtin_neon_vcmla_lane270v4sf (__r, __a, __b, __index);
 }
 
+#pragma GCC push_options
+#pragma GCC target ("arch=armv8.2-a+bf16")
+
+/* ARMv8.6 ACLE BFloat16 intrinsics for vmmla, vfmab and vfmat.  */
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vbfmmlaq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b)
+{
+  return __builtin_neon_vbfmmlav8bf (__r, __a, __b);
+}
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vbfmlalbq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b)
+{
+  return __builtin_neon_vbfmabv8bf (__r, __a, __b);
+}
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vbfmlaltq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b)
+{
+  return __builtin_neon_vbfmatv8bf (__r, __a, __b);
+}
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vbfmlalbq_lane_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x4_t __b,
+		    const int __index)
+{
+  return __builtin_neon_vbfmab_lanev8bf (__r, __a, __b, __index);
+}
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vbfmlaltq_lane_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x4_t __b,
+		    const int __index)
+{
+  return __builtin_neon_vbfmat_lanev8bf (__r, __a, __b, __index);
+}
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vbfmlalbq_laneq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b,
+		     const int __index)
+{
+  return __builtin_neon_vbfmab_laneqv8bf (__r, __a, __b, __index);
+}
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vbfmlaltq_laneq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b,
+		     const int __index)
+{
+  return __builtin_neon_vbfmat_laneqv8bf (__r, __a, __b, __index);
+}
+
+#pragma GCC pop_options
+
 #pragma GCC pop_options
 
 #endif
diff --git a/gcc/config/arm/arm_neon_builtins.def b/gcc/config/arm/arm_neon_builtins.def
index e9ff4e501cbb5d16b9211f5bc96db376ddf21afc..cc06783daf393f7166fd922f86b3db79c02ba188 100644
--- a/gcc/config/arm/arm_neon_builtins.def
+++ b/gcc/config/arm/arm_neon_builtins.def
@@ -373,3 +373,12 @@ VAR2 (MAC_LANE_PAIR, vcmlaq_lane0, v4sf, v8hf)
 VAR2 (MAC_LANE_PAIR, vcmlaq_lane90, v4sf, v8hf)
 VAR2 (MAC_LANE_PAIR, vcmlaq_lane180, v4sf, v8hf)
 VAR2 (MAC_LANE_PAIR, vcmlaq_lane270, v4sf, v8hf)
+
+VAR1 (TERNOP, vbfmmla, v8bf)
+
+VAR1 (TERNOP, vbfmab, v8bf)
+VAR1 (TERNOP, vbfmat, v8bf)
+VAR1 (MAC_LANE, vbfmab_lane, v8bf)
+VAR1 (MAC_LANE, vbfmat_lane, v8bf)
+VAR1 (MAC_LANE, vbfmab_laneq, v8bf)
+VAR1 (MAC_LANE, vbfmat_laneq, v8bf)
diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md
index 33e29509f00a89fa23d0546687c0e4643f0b32d2..72b8ce0bb26dcd520603b907b4f86a74d0164332 100644
--- a/gcc/config/arm/iterators.md
+++ b/gcc/config/arm/iterators.md
@@ -106,6 +106,9 @@
 ;; Quad-width vector modes plus 64-bit elements.
 (define_mode_iterator VQX [V16QI V8HI V8HF V4SI V4SF V2DI])
 
+;; Quad-width vector modes plus 64-bit elements and V8BF.
+(define_mode_iterator VQXBF [V16QI V8HI V8HF (V8BF "TARGET_BF16_SIMD") V4SI V4SF V2DI])
+
 ;; Quad-width vector modes without floating-point elements.
 (define_mode_iterator VQI [V16QI V8HI V4SI])
 
@@ -485,6 +488,8 @@
 (define_int_iterator VCADD [UNSPEC_VCADD90 UNSPEC_VCADD270])
 (define_int_iterator VCMLA [UNSPEC_VCMLA UNSPEC_VCMLA90 UNSPEC_VCMLA180 UNSPEC_VCMLA270])
 
+(define_int_iterator BF_MA [UNSPEC_BFMAB UNSPEC_BFMAT])
+
 ;;----------------------------------------------------------------------------
 ;; Mode attributes
 ;;----------------------------------------------------------------------------
@@ -609,7 +614,8 @@
 (define_mode_attr V_HALF [(V16QI "V8QI") (V8HI "V4HI")
 			  (V8HF "V4HF") (V4SI "V2SI")
 			  (V4SF "V2SF") (V2DF "DF")
-			  (V2DI "DI") (V4HF "HF")])
+			  (V2DI "DI") (V4HF "HF")
+			  (V8BF "V4BF")])
 
 ;; Same, but lower-case.
 (define_mode_attr V_half [(V16QI "v8qi") (V8HI "v4hi")
@@ -1171,4 +1177,7 @@
 (define_int_attr opsuffix [(UNSPEC_DOT_S "s8")
 			   (UNSPEC_DOT_U "u8")])
 
+;; Maps each BF_MA unspec to the b/t suffix used in VFMA<bt>.
+(define_int_attr bt [(UNSPEC_BFMAB "b") (UNSPEC_BFMAT "t")])
+
 (define_int_attr smlaw_op [(UNSPEC_SMLAWB "smlawb") (UNSPEC_SMLAWT "smlawt")])
diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
index 6087ca6f2badde6a492bb515a2cb5846f3d4ad8e..4e0d0b5c317a81839de9dee581c5e351d3193dfa 100644
--- a/gcc/config/arm/neon.md
+++ b/gcc/config/arm/neon.md
@@ -3875,7 +3875,7 @@ if (BYTES_BIG_ENDIAN)
 
 (define_expand "neon_vget_high<mode>"
   [(match_operand:<V_HALF> 0 "s_register_operand")
-   (match_operand:VQX 1 "s_register_operand")]
+   (match_operand:VQXBF 1 "s_register_operand")]
   "TARGET_NEON"
 {
   emit_move_insn (operands[0],
@@ -6552,3 +6552,73 @@ if (BYTES_BIG_ENDIAN)
   "vabd.<V_if_elem> %<V_reg>0, %<V_reg>1, %<V_reg>2"
   [(set_attr "type" "neon_fp_abd_s<q>")]
 )
+
+;; VMMLA.BF16, accumulating into operand 1 (tied to the output).
+(define_insn "neon_vbfmmlav8bf"
+  [(set (match_operand:V4SF 0 "register_operand" "=w")
+	(plus:V4SF (match_operand:V4SF 1 "register_operand" "0")
+		   (unspec:V4SF [(match_operand:V8BF 2 "register_operand" "w")
+				 (match_operand:V8BF 3 "register_operand" "w")]
+		    UNSPEC_BFMMLA)))]
+  "TARGET_BF16_SIMD"
+  "vmmla.bf16\\t%q0, %q2, %q3"
+  [(set_attr "type" "neon_fp_mla_s_q")]
+)
+
+;; VFMAB.BF16 / VFMAT.BF16; <bt> follows the BF_MA unspec.
+(define_insn "neon_vbfma<bt>v8bf"
+  [(set (match_operand:V4SF 0 "register_operand" "=w")
+	(plus:V4SF (match_operand:V4SF 1 "register_operand" "0")
+		   (unspec:V4SF [(match_operand:V8BF 2 "register_operand" "w")
+				 (match_operand:V8BF 3 "register_operand" "w")]
+		    BF_MA)))]
+  "TARGET_BF16_SIMD"
+  "vfma<bt>.bf16\\t%q0, %q2, %q3"
+  [(set_attr "type" "neon_fp_mla_s_q")]
+)
+
+;; As above, with operand 3 a 64-bit vector indexed by operand 4.
+(define_insn "neon_vbfma<bt>_lanev8bf"
+  [(set (match_operand:V4SF 0 "register_operand" "=w")
+	(plus:V4SF (match_operand:V4SF 1 "register_operand" "0")
+		   (unspec:V4SF [(match_operand:V8BF 2 "register_operand" "w")
+				 (match_operand:V4BF 3 "register_operand" "x")
+				 (match_operand:SI 4 "const_int_operand" "n")]
+		    BF_MA)))]
+  "TARGET_BF16_SIMD"
+  "vfma<bt>.bf16\\t%q0, %q2, %P3[%c4]"
+  [(set_attr "type" "neon_fp_mla_s_scalar_q")]
+)
+
+;; Lane form taking a 128-bit operand 3: select the 64-bit half that holds
+;; the requested lane and defer to the 64-bit lane pattern above.
+(define_expand "neon_vbfma<bt>_laneqv8bf"
+  [(set (match_operand:V4SF 0 "register_operand" "=w")
+	(plus:V4SF (match_operand:V4SF 1 "register_operand" "0")
+		   (unspec:V4SF [(match_operand:V8BF 2 "register_operand" "w")
+				 (match_operand:V8BF 3 "register_operand" "x")
+				 (match_operand:SI 4 "const_int_operand" "n")]
+		    BF_MA)))]
+  "TARGET_BF16_SIMD"
+  {
+    int lane = INTVAL (operands[4]);
+    gcc_assert (lane >= 0 && lane <= 7);
+    rtx op_half;
+    /* The lane pattern takes a V4BF operand 3, so the V8BF register must
+       not be passed through unchanged: use its low half for lanes 0-3 and
+       its high half, with the lane rebased, for lanes 4-7.  */
+    if (lane < 4)
+      op_half = simplify_gen_subreg (V4BFmode, operands[3], V8BFmode, 0);
+    else
+      {
+	op_half = gen_reg_rtx (V4BFmode);
+	emit_insn (gen_neon_vget_highv8bf (op_half, operands[3]));
+	lane -= 4;
+      }
+    emit_insn (gen_neon_vbfma<bt>_lanev8bf (operands[0], operands[1],
+					    operands[2], op_half,
+					    GEN_INT (lane)));
+    DONE;
+  }
+  [(set_attr "type" "neon_fp_mla_s_scalar_q")]
+)
diff --git a/gcc/config/arm/unspecs.md b/gcc/config/arm/unspecs.md
index 8f4a705f43efdb6baf03b39cee589cf728620687..97f08abec0a089b5cd95840da12ae22f7c960b28 100644
--- a/gcc/config/arm/unspecs.md
+++ b/gcc/config/arm/unspecs.md
@@ -501,4 +501,7 @@
   UNSPEC_VCMLA90
   UNSPEC_VCMLA180
   UNSPEC_VCMLA270
+  UNSPEC_BFMMLA
+  UNSPEC_BFMAB
+  UNSPEC_BFMAT
 ])
diff --git a/gcc/testsuite/gcc.target/arm/simd/bf16_ma_1.c b/gcc/testsuite/gcc.target/arm/simd/bf16_ma_1.c
new file mode 100644
index 0000000000000000000000000000000000000000..855e86b91fc69904f488dea1b277de6cc4ecba7e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/simd/bf16_ma_1.c
@@ -0,0 +1,79 @@
+/* { dg-do assemble } */
+/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
+/* { dg-add-options arm_v8_2a_bf16_neon } */
+/* { dg-additional-options "-save-temps" } */
+/* { dg-final { check-function-bodies "**" "" {-O[^0]} } } */
+
+#include "arm_neon.h"
+
+/*
+**test_vbfmlalbq_f32:
+**	...
+**	vfmab.bf16\tq0, q1, q2
+**	bx\tlr
+*/
+float32x4_t
+test_vbfmlalbq_f32 (float32x4_t r, bfloat16x8_t a, bfloat16x8_t b)
+{
+  return vbfmlalbq_f32 (r, a, b);
+}
+
+/*
+**test_vbfmlaltq_f32:
+**	...
+**	vfmat.bf16\tq0, q1, q2
+**	bx\tlr
+*/
+float32x4_t
+test_vbfmlaltq_f32 (float32x4_t r, bfloat16x8_t a, bfloat16x8_t b)
+{
+  return vbfmlaltq_f32 (r, a, b);
+}
+
+/*
+**test_vbfmlalbq_lane_f32:
+**	...
+**	vfmab.bf16\tq0, q1, d4[0]
+**	bx\tlr
+*/
+float32x4_t
+test_vbfmlalbq_lane_f32 (float32x4_t r, bfloat16x8_t a, bfloat16x4_t b)
+{
+  return vbfmlalbq_lane_f32 (r, a, b, 0);
+}
+
+/*
+**test_vbfmlaltq_lane_f32:
+**	...
+**	vfmat.bf16\tq0, q1, d4[2]
+**	bx\tlr
+*/
+float32x4_t
+test_vbfmlaltq_lane_f32 (float32x4_t r, bfloat16x8_t a, bfloat16x4_t b)
+{
+  return vbfmlaltq_lane_f32 (r, a, b, 2);
+}
+
+/*
+**test_vbfmlalbq_laneq_f32:
+**	...
+**	vfmab.bf16\tq0, q1, d5[1]
+**	bx\tlr
+*/
+float32x4_t
+test_vbfmlalbq_laneq_f32 (float32x4_t r, bfloat16x8_t a, bfloat16x8_t b)
+{
+  return vbfmlalbq_laneq_f32 (r, a, b, 5);
+}
+
+/*
+**test_vbfmlaltq_laneq_f32:
+**	...
+**	vfmat.bf16\tq0, q1, d5[3]
+**	bx\tlr
+*/
+float32x4_t
+test_vbfmlaltq_laneq_f32 (float32x4_t r, bfloat16x8_t a, bfloat16x8_t b)
+{
+  return vbfmlaltq_laneq_f32 (r, a, b, 7);
+}
diff --git a/gcc/testsuite/gcc.target/arm/simd/bf16_ma_2.c b/gcc/testsuite/gcc.target/arm/simd/bf16_ma_2.c
new file mode 100644
index 0000000000000000000000000000000000000000..226ed7e1d8e4747d73b0518c809aaf0e3c5bc78d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/simd/bf16_ma_2.c
@@ -0,0 +1,31 @@
+/* { dg-do compile { target { arm*-*-* } } } */
+/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
+/* { dg-add-options arm_v8_2a_bf16_neon } */
+
+#include "arm_neon.h"
+
+/* Test lane index limits for vbfmlalbq_lane_f32  */
+float32x4_t
+test_vbfmlalbq_lane_f32_low (float32x4_t r, bfloat16x8_t a, bfloat16x4_t b)
+{
+  return __builtin_neon_vbfmab_lanev8bf (r, a, b, -1); /* { dg-error {lane -1 out of range 0 - 3} } */
+}
+
+float32x4_t
+test_vbfmlalbq_lane_f32_high (float32x4_t r, bfloat16x8_t a, bfloat16x4_t b)
+{
+  return __builtin_neon_vbfmab_lanev8bf (r, a, b, 4); /* { dg-error {lane 4 out of range 0 - 3} } */
+}
+
+/* Test lane index limits for vbfmlaltq_lane_f32  */
+float32x4_t
+test_vbfmlaltq_lane_f32_low (float32x4_t r, bfloat16x8_t a, bfloat16x4_t b)
+{
+  return __builtin_neon_vbfmat_lanev8bf (r, a, b, -1); /* { dg-error {lane -1 out of range 0 - 3} } */
+}
+
+float32x4_t
+test_vbfmlaltq_lane_f32_high (float32x4_t r, bfloat16x8_t a, bfloat16x4_t b)
+{
+  return __builtin_neon_vbfmat_lanev8bf (r, a, b, 4); /* { dg-error {lane 4 out of range 0 - 3} } */
+}
diff --git a/gcc/testsuite/gcc.target/arm/simd/bf16_mmla_1.c b/gcc/testsuite/gcc.target/arm/simd/bf16_mmla_1.c
new file mode 100644
index 0000000000000000000000000000000000000000..9370e4d945a353ac7329929d27920b3a0aa08281
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/simd/bf16_mmla_1.c
@@ -0,0 +1,19 @@
+/* { dg-do assemble } */
+/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
+/* { dg-add-options arm_v8_2a_bf16_neon } */
+/* { dg-additional-options "-save-temps" } */
+/* { dg-final { check-function-bodies "**" "" {-O[^0]} } } */
+
+#include <arm_neon.h>
+
+/*
+**test_vbfmmlaq_f32:
+**	...
+**	vmmla.bf16\tq0, q1, q2
+**	bx\tlr
+*/
+float32x4_t
+test_vbfmmlaq_f32 (float32x4_t r, bfloat16x8_t x, bfloat16x8_t y)
+{
+  return vbfmmlaq_f32 (r, x, y);
+}