Re: [GCC][PATCH][AArch32] ACLE intrinsics bfloat16 vmmla and vfma for AArch32 AdvSIMD

Delia Burduv Fri, 31 Jan 2020 07:24:37 -0800

Here is the updated patch. The changes are minor, so let me know if there is anything else to fix or if it can be committed.
Thank you, Delia On 1/30/20 2:55 PM, Kyrill Tkachov wrote: > Hi Delia, > > > On 1/28/20 4:44 PM, Delia Burduv wrote: >> Ping. >> ------------------------------------------------------------------------ >> *From:* Delia Burduv <[email protected]> >> *Sent:* 22 January 2020 17:26 >> *To:* [email protected] <[email protected]> >> *Cc:* [email protected] <[email protected]>; Richard Earnshaw >> <[email protected]>; Ramana Radhakrishnan >> <[email protected]>; Kyrylo Tkachov <[email protected]> >> *Subject:* Re: [GCC][PATCH][AArch32] ACLE intrinsics bfloat16 vmmla >> and vfma<b/t> for AArch32 AdvSIMD >> Ping. >> >> I have read Richard Sandiford's comments on the AArch64 patches and I >> will apply what is relevant to this patch as well. Particularly, I will >> change the tests to use the exact input and output registers and I will >> change the types of the rtl patterns. > > > Please send the updated patches so that someone can commit them for you > once they're reviewed. > > Thanks, > > Kyrill > > >> >> On 12/20/19 6:44 PM, Delia Burduv wrote: >> > This patch adds the ARMv8.6 ACLE intrinsics for vmmla, vfmab and vfmat >> > as part of the BFloat16 extension. >> > (https://developer.arm.com/docs/101028/latest.) >> > The intrinsics are declared in arm_neon.h and the RTL patterns are >> > defined in neon.md. >> > Two new tests are added to check assembler output and lane indices. >> > >> > This patch depends on the Arm back-end patch. >> > (https://gcc.gnu.org/ml/gcc-patches/2019-12/msg01448.html) >> > >> > Tested for regression on arm-none-eabi and armeb-none-eabi. I don't >> have >> > commit rights, so if this is ok can someone please commit it for me? >> > >> > gcc/ChangeLog: >> > >> > 2019-11-12 Delia Burduv <[email protected]> >> > >> > * config/arm/arm_neon.h (vbfmmlaq_f32): New. >> > (vbfmlalbq_f32): New. >> > (vbfmlaltq_f32): New. >> > (vbfmlalbq_lane_f32): New. >> > (vbfmlaltq_lane_f32): New. >> > (vbfmlalbq_laneq_f32): New. >> > (vbfmlaltq_laneq_f32): New. 
>> > * config/arm/arm_neon_builtins.def (vbfmmla): New. >> > (vbfmab): New. >> > (vbfmat): New. >> > (vbfmab_lane): New. >> > (vbfmat_lane): New. >> > (vbfmab_laneq): New. >> > (vbfmat_laneq): New. >> > * config/arm/iterators.md (BF_MA): New int iterator. >> > (bt): New int attribute. >> > (VQXBF): Copy of VQX with V8BF. >> > (V_HALF): Added V8BF. >> > * config/arm/neon.md (neon_vbfmmlav8hi): New insn. >> > (neon_vbfma<bt>v8hi): New insn. >> > (neon_vbfma<bt>_lanev8hi): New insn. >> > (neon_vbfma<bt>_laneqv8hi): New expand. >> > (neon_vget_high<mode>): Changed iterator to VQXBF. >> > * config/arm/unspecs.md (UNSPEC_BFMMLA): New UNSPEC. >> > (UNSPEC_BFMAB): New UNSPEC. >> > (UNSPEC_BFMAT): New UNSPEC. >> > >> > 2019-11-12 Delia Burduv <[email protected]> >> > >> > * gcc.target/arm/simd/bf16_ma_1.c: New test. >> > * gcc.target/arm/simd/bf16_ma_2.c: New test. >> > * gcc.target/arm/simd/bf16_mmla_1.c: New test.
diff --git a/gcc/config/arm/arm_neon.h b/gcc/config/arm/arm_neon.h index 3c78f435009ab027f92693d00ab5b40960d5419d..81f8008ea6a5fb11eb09f6685ba24bb0c54fb248 100644 --- a/gcc/config/arm/arm_neon.h +++ b/gcc/config/arm/arm_neon.h @@ -18742,6 +18742,64 @@ vcmlaq_rot270_laneq_f32 (float32x4_t __r, float32x4_t __a, float32x4_t __b, return __builtin_neon_vcmla_lane270v4sf (__r, __a, __b, __index); } +#pragma GCC push_options +#pragma GCC target ("arch=armv8.2-a+bf16") + +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vbfmmlaq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b) +{ + return __builtin_neon_vbfmmlav8bf (__r, __a, __b); +} + +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vbfmlalbq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b) +{ + return __builtin_neon_vbfmabv8bf (__r, __a, __b); +} + +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vbfmlaltq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b) +{ + return __builtin_neon_vbfmatv8bf (__r, __a, __b); +} + +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vbfmlalbq_lane_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x4_t __b, + const int __index) +{ + return __builtin_neon_vbfmab_lanev8bf (__r, __a, __b, __index); +} + +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vbfmlaltq_lane_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x4_t __b, + const int __index) +{ + return __builtin_neon_vbfmat_lanev8bf (__r, __a, __b, __index); +} + +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vbfmlalbq_laneq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b, + const int __index) +{ + return 
__builtin_neon_vbfmab_laneqv8bf (__r, __a, __b, __index); +} + +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vbfmlaltq_laneq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b, + const int __index) +{ + return __builtin_neon_vbfmat_laneqv8bf (__r, __a, __b, __index); +} + +#pragma GCC pop_options + #pragma GCC pop_options #endif diff --git a/gcc/config/arm/arm_neon_builtins.def b/gcc/config/arm/arm_neon_builtins.def index e9ff4e501cbb5d16b9211f5bc96db376ddf21afc..cc06783daf393f7166fd922f86b3db79c02ba188 100644 --- a/gcc/config/arm/arm_neon_builtins.def +++ b/gcc/config/arm/arm_neon_builtins.def @@ -373,3 +373,12 @@ VAR2 (MAC_LANE_PAIR, vcmlaq_lane0, v4sf, v8hf) VAR2 (MAC_LANE_PAIR, vcmlaq_lane90, v4sf, v8hf) VAR2 (MAC_LANE_PAIR, vcmlaq_lane180, v4sf, v8hf) VAR2 (MAC_LANE_PAIR, vcmlaq_lane270, v4sf, v8hf) + +VAR1 (TERNOP, vbfmmla, v8bf) + +VAR1 (TERNOP, vbfmab, v8bf) +VAR1 (TERNOP, vbfmat, v8bf) +VAR1 (MAC_LANE, vbfmab_lane, v8bf) +VAR1 (MAC_LANE, vbfmat_lane, v8bf) +VAR1 (MAC_LANE, vbfmab_laneq, v8bf) +VAR1 (MAC_LANE, vbfmat_laneq, v8bf) diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md index 33e29509f00a89fa23d0546687c0e4643f0b32d2..72b8ce0bb26dcd520603b907b4f86a74d0164332 100644 --- a/gcc/config/arm/iterators.md +++ b/gcc/config/arm/iterators.md @@ -106,6 +106,9 @@ ;; Quad-width vector modes plus 64-bit elements. (define_mode_iterator VQX [V16QI V8HI V8HF V4SI V4SF V2DI]) +;; Quad-width vector modes plus 64-bit elements and V8BF. +(define_mode_iterator VQXBF [V16QI V8HI V8HF (V8BF "TARGET_BF16_SIMD") V4SI V4SF V2DI]) + ;; Quad-width vector modes without floating-point elements. 
(define_mode_iterator VQI [V16QI V8HI V4SI]) @@ -485,6 +488,8 @@ (define_int_iterator VCADD [UNSPEC_VCADD90 UNSPEC_VCADD270]) (define_int_iterator VCMLA [UNSPEC_VCMLA UNSPEC_VCMLA90 UNSPEC_VCMLA180 UNSPEC_VCMLA270]) +(define_int_iterator BF_MA [UNSPEC_BFMAB UNSPEC_BFMAT]) + ;;---------------------------------------------------------------------------- ;; Mode attributes ;;---------------------------------------------------------------------------- @@ -609,7 +614,8 @@ (define_mode_attr V_HALF [(V16QI "V8QI") (V8HI "V4HI") (V8HF "V4HF") (V4SI "V2SI") (V4SF "V2SF") (V2DF "DF") - (V2DI "DI") (V4HF "HF")]) + (V2DI "DI") (V4HF "HF") + (V8BF "V4BF")]) ;; Same, but lower-case. (define_mode_attr V_half [(V16QI "v8qi") (V8HI "v4hi") @@ -1171,4 +1177,7 @@ (define_int_attr opsuffix [(UNSPEC_DOT_S "s8") (UNSPEC_DOT_U "u8")]) +;; An iterator for VFMA<bt> +(define_int_attr bt [(UNSPEC_BFMAB "b") (UNSPEC_BFMAT "t")]) + (define_int_attr smlaw_op [(UNSPEC_SMLAWB "smlawb") (UNSPEC_SMLAWT "smlawt")]) diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md index 6087ca6f2badde6a492bb515a2cb5846f3d4ad8e..4e0d0b5c317a81839de9dee581c5e351d3193dfa 100644 --- a/gcc/config/arm/neon.md +++ b/gcc/config/arm/neon.md @@ -3875,7 +3875,7 @@ if (BYTES_BIG_ENDIAN) (define_expand "neon_vget_high<mode>" [(match_operand:<V_HALF> 0 "s_register_operand") - (match_operand:VQX 1 "s_register_operand")] + (match_operand:VQXBF 1 "s_register_operand")] "TARGET_NEON" { emit_move_insn (operands[0], @@ -6552,3 +6552,64 @@ if (BYTES_BIG_ENDIAN) "vabd.<V_if_elem> %<V_reg>0, %<V_reg>1, %<V_reg>2" [(set_attr "type" "neon_fp_abd_s<q>")] ) + +(define_insn "neon_vbfmmlav8bf" + [(set (match_operand:V4SF 0 "register_operand" "=w") + (plus:V4SF (match_operand:V4SF 1 "register_operand" "0") + (unspec:V4SF [(match_operand:V8BF 2 "register_operand" "w") + (match_operand:V8BF 3 "register_operand" "w")] + UNSPEC_BFMMLA)))] + "TARGET_BF16_SIMD" + "vmmla.bf16\\t%q0, %q2, %q3" + [(set_attr "type" "neon_fp_mla_s_q")] +) + 
+(define_insn "neon_vbfma<bt>v8bf" + [(set (match_operand:V4SF 0 "register_operand" "=w") + (plus: V4SF (match_operand:V4SF 1 "register_operand" "0") + (unspec:V4SF [(match_operand:V8BF 2 "register_operand" "w") + (match_operand:V8BF 3 "register_operand" "w")] + BF_MA)))] + "TARGET_BF16_SIMD" + "vfma<bt>.bf16\\t%q0, %q2, %q3" + [(set_attr "type" "neon_fp_mla_s_q")] +) + +(define_insn "neon_vbfma<bt>_lanev8bf" + [(set (match_operand:V4SF 0 "register_operand" "=w") + (plus: V4SF (match_operand:V4SF 1 "register_operand" "0") + (unspec:V4SF [(match_operand:V8BF 2 "register_operand" "w") + (match_operand:V4BF 3 "register_operand" "x") + (match_operand:SI 4 "const_int_operand" "n")] + BF_MA)))] + "TARGET_BF16_SIMD" + "vfma<bt>.bf16\\t%q0, %q2, %P3[%c4]" + [(set_attr "type" "neon_fp_mla_s_scalar_q")] +) + +(define_expand "neon_vbfma<bt>_laneqv8bf" + [(set (match_operand:V4SF 0 "register_operand" "=w") + (plus: V4SF (match_operand:V4SF 1 "register_operand" "0") + (unspec:V4SF [(match_operand:V8BF 2 "register_operand" "w") + (match_operand:V8BF 3 "register_operand" "x") + (match_operand:SI 4 "const_int_operand" "n")] + BF_MA)))] + "TARGET_BF16_SIMD" + { + int lane = INTVAL (operands[4]); + gcc_assert (lane >=0 && lane <=7); + if (lane < 4) + { + emit_insn (gen_neon_vbfma<bt>_lanev8bf (operands[0], operands[1], operands[2], operands[3], operands[4])); + } + else + { + rtx op_highpart = gen_reg_rtx (V4BFmode); + emit_insn (gen_neon_vget_highv8bf (op_highpart, operands[3])); + operands[4] = GEN_INT (lane - 4); + emit_insn (gen_neon_vbfma<bt>_lanev8bf (operands[0], operands[1], operands[2], op_highpart, operands[4])); + } + DONE; + } + [(set_attr "type" "neon_fp_mla_s_scalar_q")] +) diff --git a/gcc/config/arm/unspecs.md b/gcc/config/arm/unspecs.md index 8f4a705f43efdb6baf03b39cee589cf728620687..97f08abec0a089b5cd95840da12ae22f7c960b28 100644 --- a/gcc/config/arm/unspecs.md +++ b/gcc/config/arm/unspecs.md @@ -501,4 +501,7 @@ UNSPEC_VCMLA90 UNSPEC_VCMLA180 UNSPEC_VCMLA270 + 
UNSPEC_BFMMLA + UNSPEC_BFMAB + UNSPEC_BFMAT ]) diff --git a/gcc/testsuite/gcc.target/arm/simd/bf16_ma_1.c b/gcc/testsuite/gcc.target/arm/simd/bf16_ma_1.c new file mode 100644 index 0000000000000000000000000000000000000000..855e86b91fc69904f488dea1b277de6cc4ecba7e --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/simd/bf16_ma_1.c @@ -0,0 +1,79 @@ +/* { dg-do assemble } */ +/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */ +/* { dg-add-options arm_v8_2a_bf16_neon } */ +/* { dg-additional-options "-save-temps" } */ +/* { dg-final { check-function-bodies "**" "" {-O[^0]} } } */ + +#include "arm_neon.h" + +/* +**test_vbfmlalbq_f32: +** ... +** vfmab.bf16\tq0, q1, q2 +** bx\tlr +*/ +float32x4_t +test_vbfmlalbq_f32 (float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) +{ + return vbfmlalbq_f32 (r, a, b); +} + +/* +**test_vbfmlaltq_f32: +** ... +** vfmat.bf16\tq0, q1, q2 +** bx\tlr +*/ +float32x4_t +test_vbfmlaltq_f32 (float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) +{ + return vbfmlaltq_f32 (r, a, b); +} + +/* +**test_vbfmlalbq_lane_f32: +** ... +** vfmab.bf16\tq0, q1, d4[0] +** bx\tlr +*/ +float32x4_t +test_vbfmlalbq_lane_f32 (float32x4_t r, bfloat16x8_t a, bfloat16x4_t b) +{ + return vbfmlalbq_lane_f32 (r, a, b, 0); +} + +/* +**test_vbfmlaltq_lane_f32: +** ... +** vfmat.bf16\tq0, q1, d4[2] +** bx\tlr +*/ +float32x4_t +test_vbfmlaltq_lane_f32 (float32x4_t r, bfloat16x8_t a, bfloat16x4_t b) +{ + return vbfmlaltq_lane_f32 (r, a, b, 2); +} + +/* +**test_vbfmlalbq_laneq_f32: +** ... +** vfmab.bf16\tq0, q1, d5[1] +** bx\tlr +*/ +float32x4_t +test_vbfmlalbq_laneq_f32 (float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) +{ + return vbfmlalbq_laneq_f32 (r, a, b, 5); +} + +/* +**test_vbfmlaltq_laneq_f32: +** ... 
+** vfmat.bf16\tq0, q1, d5[3] +** bx\tlr +*/ +float32x4_t +test_vbfmlaltq_laneq_f32 (float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) +{ + return vbfmlaltq_laneq_f32 (r, a, b, 7); +} diff --git a/gcc/testsuite/gcc.target/arm/simd/bf16_ma_2.c b/gcc/testsuite/gcc.target/arm/simd/bf16_ma_2.c new file mode 100644 index 0000000000000000000000000000000000000000..226ed7e1d8e4747d73b0518c809aaf0e3c5bc78d --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/simd/bf16_ma_2.c @@ -0,0 +1,31 @@ +/* { dg-do compile { target { arm*-*-* } } } */ +/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */ +/* { dg-add-options arm_v8_2a_bf16_neon } */ + +#include "arm_neon.h" + +/* Test lane index limits for vbfmlalbq_lane_f32 */ +float32x4_t +test_vbfmlalbq_lane_f32_low (float32x4_t r, bfloat16x8_t a, bfloat16x4_t b) +{ + return __builtin_neon_vbfmab_lanev8bf (r, a, b, -1); /* { dg-error {lane -1 out of range 0 - 3} } */ +} + +float32x4_t +test_vbfmlalbq_lane_f32_high (float32x4_t r, bfloat16x8_t a, bfloat16x4_t b) +{ + return __builtin_neon_vbfmab_lanev8bf (r, a, b, 4); /* { dg-error {lane 4 out of range 0 - 3} } */ +} + +/* Test lane index limits for vbfmlaltq_lane_f32 */ +float32x4_t +test_vbfmlaltq_lane_f32_low (float32x4_t r, bfloat16x8_t a, bfloat16x4_t b) +{ + return __builtin_neon_vbfmat_lanev8bf (r, a, b, -1); /* { dg-error {lane -1 out of range 0 - 3} } */ +} + +float32x4_t +test_vbfmlaltq_lane_f32_high (float32x4_t r, bfloat16x8_t a, bfloat16x4_t b) +{ + return __builtin_neon_vbfmat_lanev8bf (r, a, b, 4); /* { dg-error {lane 4 out of range 0 - 3} } */ +} diff --git a/gcc/testsuite/gcc.target/arm/simd/bf16_mmla_1.c b/gcc/testsuite/gcc.target/arm/simd/bf16_mmla_1.c new file mode 100644 index 0000000000000000000000000000000000000000..9370e4d945a353ac7329929d27920b3a0aa08281 --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/simd/bf16_mmla_1.c @@ -0,0 +1,18 @@ +/* { dg-do assemble } */ +/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */ +/* { dg-add-options 
arm_v8_2a_bf16_neon } */ +/* { dg-additional-options "-save-temps" } */ +/* { dg-final { check-function-bodies "**" "" {-O[^0]} } } */ + +#include <arm_neon.h> + +/*test_vbfmmlaq_f32: +** ... +** vmmla.bf16\tq0, q1, q2 +** bx\tlr +*/ +float32x4_t +test_vbfmmlaq_f32 (float32x4_t r, bfloat16x8_t x, bfloat16x8_t y) +{ + return vbfmmlaq_f32 (r, x, y); +}

Previous message

View by thread

View by date

Next message

Re: [GCC][PATCH][AArch32] ACLE intrinsics bfloat16 vmmla an... Delia Burduv

Re: [GCC][PATCH][AArch32] ACLE intrinsics bfloat16 vmm... Delia Burduv

Re: [GCC][PATCH][AArch32] ACLE intrinsics bfloat16... Kyrill Tkachov

Re: [GCC][PATCH][AArch32] ACLE intrinsics bflo... Delia Burduv

Re: [GCC][PATCH][AArch32] ACLE intrinsics ... Delia Burduv

Re: [GCC][PATCH][AArch32] ACLE intrin... Kyrill Tkachov

Re: [GCC][PATCH][AArch32] ACLE in... Delia Burduv

Re: [GCC][PATCH][AArch32] ACL... Kyrill Tkachov

Re: [GCC][PATCH][AArch32] ACL... Kyrill Tkachov

Reply via email to