Hi Richard,

On 23/01/2020 15:28, Richard Sandiford wrote:
> Dennis Zhang <dennis.zh...@arm.com> writes:
>> Hi all,
>> On 16/12/2019 13:53, Dennis Zhang wrote:
>>> Hi all,
>>>
>>> This patch is part of a series adding support for Armv8.6-A features.
>>> It depends on the Armv8.6-A effective target checking patch,
>>> https://gcc.gnu.org/ml/gcc-patches/2019-12/msg00857.html.
>>>
>>> This patch adds intrinsics for matrix multiply-accumulate operations
>>> including vmmlaq_s32, vmmlaq_u32, and vusmmlaq_s32.
>>>
>>> ACLE documents are at https://developer.arm.com/docs/101028/latest
>>> ISA documents are at https://developer.arm.com/docs/ddi0596/latest
>>>
>>> Regtested & bootstrapped for aarch64-none-linux-gnu.
>>>
>>> Is it OK for trunk please?
>>>
>>
>> This patch is rebased to the trunk top.
>> There is no dependence on any other patches now.
>> Regtested again.
>>
>> Is it OK for trunk please?
>>
>> Cheers
>> Dennis
>>
>> gcc/ChangeLog:
>>
>> 2020-01-23  Dennis Zhang  <dennis.zh...@arm.com>
>>
>> 	* config/aarch64/aarch64-builtins.c (TYPES_TERNOP_SSUS): New macro.
>> 	* config/aarch64/aarch64-simd-builtins.def (simd_smmla): New.
>> 	(simd_ummla, simd_usmmla): New.
>> 	* config/aarch64/aarch64-simd.md (aarch64_simd_<sur>mmlav16qi): New.
>> 	* config/aarch64/arm_neon.h (vmmlaq_s32, vmmlaq_u32): New.
>> 	(vusmmlaq_s32): New.
>> 	* config/aarch64/iterators.md (unspec): Add UNSPEC_SMATMUL,
>> 	UNSPEC_UMATMUL, and UNSPEC_USMATMUL.
>> 	(sur): Likewise.
>> 	(MATMUL): New iterator.
>>
>> gcc/testsuite/ChangeLog:
>>
>> 2020-01-23  Dennis Zhang  <dennis.zh...@arm.com>
>>
>> 	* gcc.target/aarch64/advsimd-intrinsics/vmmla.c: New test.
>>
>> diff --git a/gcc/config/aarch64/aarch64-builtins.c
>> b/gcc/config/aarch64/aarch64-builtins.c
>> index f0e0461b7f0..033a6d4e92f 100644
>> --- a/gcc/config/aarch64/aarch64-builtins.c
>> +++ b/gcc/config/aarch64/aarch64-builtins.c
>> @@ -176,6 +176,10 @@
>> aarch64_types_ternopu_imm_qualifiers[SIMD_MAX_BUILTIN_ARGS]
>>    = { qualifier_unsigned, qualifier_unsigned,
>>        qualifier_unsigned, qualifier_immediate };
>>  #define TYPES_TERNOPUI (aarch64_types_ternopu_imm_qualifiers)
>> +static enum aarch64_type_qualifiers
>> +aarch64_types_ternop_ssus_qualifiers[SIMD_MAX_BUILTIN_ARGS]
>> +  = { qualifier_none, qualifier_none, qualifier_unsigned, qualifier_none };
>> +#define TYPES_TERNOP_SSUS (aarch64_types_ternop_ssus_qualifiers)
>>
>>
>>  static enum aarch64_type_qualifiers
>> diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def
>> b/gcc/config/aarch64/aarch64-simd-builtins.def
>> index 57fc5933b43..06025b110cc 100644
>> --- a/gcc/config/aarch64/aarch64-simd-builtins.def
>> +++ b/gcc/config/aarch64/aarch64-simd-builtins.def
>> @@ -682,3 +682,8 @@
>>    BUILTIN_VSFDF (UNOP, frint32x, 0)
>>    BUILTIN_VSFDF (UNOP, frint64z, 0)
>>    BUILTIN_VSFDF (UNOP, frint64x, 0)
>> +
>> +  /* Implemented by aarch64_simd_<sur>mmlav16qi.  */
>> +  VAR1 (TERNOP, simd_smmla, 0, v16qi)
>> +  VAR1 (TERNOPU, simd_ummla, 0, v16qi)
>> +  VAR1 (TERNOP_SSUS, simd_usmmla, 0, v16qi)
>> \ No newline at end of file
>> diff --git a/gcc/config/aarch64/aarch64-simd.md
>> b/gcc/config/aarch64/aarch64-simd.md
>> index 2989096b170..409ec28d293 100644
>> --- a/gcc/config/aarch64/aarch64-simd.md
>> +++ b/gcc/config/aarch64/aarch64-simd.md
>> @@ -7025,3 +7025,15 @@
>>    "xtn\t%0.<Vntype>, %1.<Vtype>"
>>    [(set_attr "type" "neon_shift_imm_narrow_q")]
>>  )
>> +
>> +;; 8-bit integer matrix multiply-accumulate
>> +(define_insn "aarch64_simd_<sur>mmlav16qi"
>> +  [(set (match_operand:V4SI 0 "register_operand" "=w")
>> +	(plus:V4SI (match_operand:V4SI 1 "register_operand" "0")
>> +	    (unspec:V4SI [(match_operand:V16QI 2 "register_operand" "w")
>> +			  (match_operand:V16QI 3 "register_operand" "w")]
>> +	     MATMUL)))]
>> +  "TARGET_I8MM"
>> +  "<sur>mmla\\t%0.4s, %2.16b, %3.16b"
>> +  [(set_attr "type" "neon_mla_s_q")]
>> +)
>> \ No newline at end of file
>
> (Would be good to add the newline)
>
> The canonical rtl order for commutative operations like plus is
> to put the most complicated expression first (roughly speaking --
> the rules are a bit more precise than that).  So this should be:
>
>   [(set (match_operand:V4SI 0 "register_operand" "=w")
> 	(plus:V4SI (unspec:V4SI [(match_operand:V16QI 2 "register_operand" "w")
> 				 (match_operand:V16QI 3 "register_operand" "w")]
> 				MATMUL)
> 		   (match_operand:V4SI 1 "register_operand" "0")))]
>
>> diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
>> index eaba156e26c..918000d98dc 100644
>> --- a/gcc/config/aarch64/arm_neon.h
>> +++ b/gcc/config/aarch64/arm_neon.h
>> @@ -34609,6 +34609,36 @@ vrnd64xq_f64 (float64x2_t __a)
>>
>>  #pragma GCC pop_options
>>
>> +/* AdvSIMD 8-bit Integer Matrix Multiply (I8MM) intrinsics.  */
>> +
>> +#pragma GCC push_options
>> +#pragma GCC target ("arch=armv8.2-a+i8mm")
>> +
>> +/* Matrix Multiply-Accumulate.  */
>> +
>> +__extension__ extern __inline int32x4_t
>> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>> +vmmlaq_s32 (int32x4_t __r, int8x16_t __a, int8x16_t __b)
>> +{
>> +  return __builtin_aarch64_simd_smmlav16qi (__r, __a, __b);
>> +}
>> +
>> +__extension__ extern __inline uint32x4_t
>> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>> +vmmlaq_u32 (uint32x4_t __r, uint8x16_t __a, uint8x16_t __b)
>> +{
>> +  return __builtin_aarch64_simd_ummlav16qi_uuuu (__r, __a, __b);
>> +}
>> +
>> +__extension__ extern __inline int32x4_t
>> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>> +vusmmlaq_s32 (int32x4_t __r, uint8x16_t __a, int8x16_t __b)
>> +{
>> +  return __builtin_aarch64_simd_usmmlav16qi_ssus (__r, __a, __b);
>> +}
>> +
>> +#pragma GCC pop_options
>> +
>>  #include "arm_bf16.h"
>>
>>  #undef __aarch64_vget_lane_any
>> diff --git a/gcc/config/aarch64/iterators.md
>> b/gcc/config/aarch64/iterators.md
>> index b9843b83c5f..57aca36f646 100644
>> --- a/gcc/config/aarch64/iterators.md
>> +++ b/gcc/config/aarch64/iterators.md
>> @@ -581,6 +581,9 @@
>>      UNSPEC_FMLSL	; Used in aarch64-simd.md.
>>      UNSPEC_FMLAL2	; Used in aarch64-simd.md.
>>      UNSPEC_FMLSL2	; Used in aarch64-simd.md.
>> +    UNSPEC_SMATMUL	; Used in aarch64-simd.md.
>> +    UNSPEC_UMATMUL	; Used in aarch64-simd.md.
>> +    UNSPEC_USMATMUL	; Used in aarch64-simd.md.
>>      UNSPEC_ADR		; Used in aarch64-sve.md.
>>      UNSPEC_SEL		; Used in aarch64-sve.md.
>>      UNSPEC_BRKA	; Used in aarch64-sve.md.
>> @@ -2531,6 +2534,8 @@
>>
>>  (define_int_iterator SVE_PITER [UNSPEC_PFIRST UNSPEC_PNEXT])
>>
>> +(define_int_iterator MATMUL [UNSPEC_SMATMUL UNSPEC_UMATMUL UNSPEC_USMATMUL])
>> +
>>  ;; Iterators for atomic operations.
>>
>>  (define_int_iterator ATOMIC_LDOP
>> @@ -2738,6 +2743,8 @@
>>  		    (UNSPEC_URSHL "ur") (UNSPEC_SRSHL "sr")
>>  		    (UNSPEC_UQRSHL "u") (UNSPEC_SQRSHL "s")
>>  		    (UNSPEC_SDOT "s") (UNSPEC_UDOT "u")
>> +		    (UNSPEC_SMATMUL "s") (UNSPEC_UMATMUL "u")
>> +		    (UNSPEC_USMATMUL "us")
>>  ])
>>
>>  (define_int_attr r [(UNSPEC_SQDMULH "") (UNSPEC_SQRDMULH "r")
>> diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmmla.c
>> b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmmla.c
>> new file mode 100644
>> index 00000000000..348b2f51779
>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmmla.c
>> @@ -0,0 +1,37 @@
>> +/* { dg-do assemble } */
>
> I assume this should be dg-run, otherwise there's no point in having
> the main function and comparison.  The dg-run would need to be
> conditional on whether the target supports i8mm.
>
> Alternatively, we could keep it simple and stick to an assembler
> test, in which case I think we should have one function per call,
> with no main.
>
>> +/* { dg-require-effective-target arm_v8_2a_i8mm_ok } */
>> +/* { dg-options "-save-temps -O2" } */
>> +/* { dg-additional-options "-march=armv8.2-a+i8mm" } */
>> +
>> +#include "arm_neon.h"
>> +
>> +extern void abort();
>> +
>> +#define VAR4(v) {v, v, v, v}
>> +#define VAR16(v) {v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v}
>> +#define TEST(t, f, r, a, b, ...) { \
>> +  t##32x4_t f##_ref = { __VA_ARGS__ }; \
>> +  t##32x4_t f##_out = f (r, a, b); \
>> +  for (int i = 0; i < 4; i++) \
>> +    if (f##_out[i] != f##_ref[i]) \
>> +      abort(); \
>> +}
>> +
>> +int
>> +main()
>> +{
>> +  int32x4_t s32 = VAR4(-1);
>> +  uint32x4_t u32 = VAR4(1);
>> +  int8x16_t s8 = VAR16(-1);
>> +  uint8x16_t u8 = VAR16(1);
>> +
>> +  TEST(int, vmmlaq_s32, s32, s8, s8, 7, 7, 7, 7);
>> +  TEST(uint, vmmlaq_u32, u32, u8, u8, 9, 9, 9, 9);
>> +  TEST(int, vusmmlaq_s32, s32, u8, s8, -9, -9, -9, -9);
>> +
>> +  return 0;
>> +}
>> +
>> +/* { dg-final { scan-assembler {smmla\tv[0-9]+.4s, v[0-9]+.16b, v[0-9]+.16b} } } */
>> +/* { dg-final { scan-assembler {ummla\tv[0-9]+.4s, v[0-9]+.16b, v[0-9]+.16b} } } */
>> +/* { dg-final { scan-assembler {usmmla\tv[0-9]+.4s, v[0-9]+.16b, v[0-9]+.16b} } } */
>> \ No newline at end of file
>
> This is going to look like inventing a new rule, sorry, since nothing
> else in the directory does this yet, but: IMO it's better to put a
> \t at the beginning of each scan-assembler.  As it stands the
> usmmla instruction would satisfy the first scan-assembler too,
> so we wouldn't pick up cases in which smmla failed to be generated.
>
> Thanks,
> Richard
>
Thanks a lot for the review. The patch is updated as suggested:
1. Fixed the RTL format to use the canonical operand order for plus.
2. Made it an assembler-only test, with one function per intrinsic; each
   scan-assembler pattern now starts with '\t' so that one instruction
   cannot satisfy another instruction's pattern.

Could you please help to check if it's OK for trunk?

Cheers
Dennis

gcc/ChangeLog:

2020-01-23  Dennis Zhang  <dennis.zh...@arm.com>

	* config/aarch64/aarch64-builtins.c (TYPES_TERNOP_SSUS): New macro.
	* config/aarch64/aarch64-simd-builtins.def (simd_smmla): New.
	(simd_ummla, simd_usmmla): New.
	* config/aarch64/aarch64-simd.md (aarch64_simd_<sur>mmlav16qi): New.
	* config/aarch64/arm_neon.h (vmmlaq_s32, vmmlaq_u32): New.
	(vusmmlaq_s32): New.
	* config/aarch64/iterators.md (unspec): Add UNSPEC_SMATMUL,
	UNSPEC_UMATMUL, and UNSPEC_USMATMUL.
	(sur): Likewise.
	(MATMUL): New iterator.

gcc/testsuite/ChangeLog:

2020-01-23  Dennis Zhang  <dennis.zh...@arm.com>

	* gcc.target/aarch64/simd/vmmla.c: New test.
diff --git a/gcc/config/aarch64/aarch64-builtins.c b/gcc/config/aarch64/aarch64-builtins.c
index f0e0461b7f0..033a6d4e92f 100644
--- a/gcc/config/aarch64/aarch64-builtins.c
+++ b/gcc/config/aarch64/aarch64-builtins.c
@@ -176,6 +176,10 @@ aarch64_types_ternopu_imm_qualifiers[SIMD_MAX_BUILTIN_ARGS]
   = { qualifier_unsigned, qualifier_unsigned,
       qualifier_unsigned, qualifier_immediate };
 #define TYPES_TERNOPUI (aarch64_types_ternopu_imm_qualifiers)
+static enum aarch64_type_qualifiers
+aarch64_types_ternop_ssus_qualifiers[SIMD_MAX_BUILTIN_ARGS]
+  = { qualifier_none, qualifier_none, qualifier_unsigned, qualifier_none };
+#define TYPES_TERNOP_SSUS (aarch64_types_ternop_ssus_qualifiers)
 
 
 static enum aarch64_type_qualifiers
diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
index 57fc5933b43..885c2540514 100644
--- a/gcc/config/aarch64/aarch64-simd-builtins.def
+++ b/gcc/config/aarch64/aarch64-simd-builtins.def
@@ -682,3 +682,8 @@
   BUILTIN_VSFDF (UNOP, frint32x, 0)
   BUILTIN_VSFDF (UNOP, frint64z, 0)
   BUILTIN_VSFDF (UNOP, frint64x, 0)
+
+  /* Implemented by aarch64_simd_<sur>mmlav16qi.  */
+  VAR1 (TERNOP, simd_smmla, 0, v16qi)
+  VAR1 (TERNOPU, simd_ummla, 0, v16qi)
+  VAR1 (TERNOP_SSUS, simd_usmmla, 0, v16qi)
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 2989096b170..b7659068b7d 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -7025,3 +7025,15 @@
   "xtn\t%0.<Vntype>, %1.<Vtype>"
   [(set_attr "type" "neon_shift_imm_narrow_q")]
 )
+
+;; 8-bit integer matrix multiply-accumulate
+(define_insn "aarch64_simd_<sur>mmlav16qi"
+  [(set (match_operand:V4SI 0 "register_operand" "=w")
+	(plus:V4SI
+	 (unspec:V4SI [(match_operand:V16QI 2 "register_operand" "w")
+		       (match_operand:V16QI 3 "register_operand" "w")] MATMUL)
+	 (match_operand:V4SI 1 "register_operand" "0")))]
+  "TARGET_I8MM"
+  "<sur>mmla\\t%0.4s, %2.16b, %3.16b"
+  [(set_attr "type" "neon_mla_s_q")]
+)
diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index eaba156e26c..918000d98dc 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -34609,6 +34609,36 @@ vrnd64xq_f64 (float64x2_t __a)
 
 #pragma GCC pop_options
 
+/* AdvSIMD 8-bit Integer Matrix Multiply (I8MM) intrinsics.  */
+
+#pragma GCC push_options
+#pragma GCC target ("arch=armv8.2-a+i8mm")
+
+/* Matrix Multiply-Accumulate.  */
+
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmmlaq_s32 (int32x4_t __r, int8x16_t __a, int8x16_t __b)
+{
+  return __builtin_aarch64_simd_smmlav16qi (__r, __a, __b);
+}
+
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmmlaq_u32 (uint32x4_t __r, uint8x16_t __a, uint8x16_t __b)
+{
+  return __builtin_aarch64_simd_ummlav16qi_uuuu (__r, __a, __b);
+}
+
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vusmmlaq_s32 (int32x4_t __r, uint8x16_t __a, int8x16_t __b)
+{
+  return __builtin_aarch64_simd_usmmlav16qi_ssus (__r, __a, __b);
+}
+
+#pragma GCC pop_options
+
 #include "arm_bf16.h"
 
 #undef __aarch64_vget_lane_any
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index b9843b83c5f..57aca36f646 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -581,6 +581,9 @@
     UNSPEC_FMLSL	; Used in aarch64-simd.md.
     UNSPEC_FMLAL2	; Used in aarch64-simd.md.
     UNSPEC_FMLSL2	; Used in aarch64-simd.md.
+    UNSPEC_SMATMUL	; Used in aarch64-simd.md.
+    UNSPEC_UMATMUL	; Used in aarch64-simd.md.
+    UNSPEC_USMATMUL	; Used in aarch64-simd.md.
     UNSPEC_ADR		; Used in aarch64-sve.md.
     UNSPEC_SEL		; Used in aarch64-sve.md.
     UNSPEC_BRKA	; Used in aarch64-sve.md.
@@ -2531,6 +2534,8 @@
 
 (define_int_iterator SVE_PITER [UNSPEC_PFIRST UNSPEC_PNEXT])
 
+(define_int_iterator MATMUL [UNSPEC_SMATMUL UNSPEC_UMATMUL UNSPEC_USMATMUL])
+
 ;; Iterators for atomic operations.
 
 (define_int_iterator ATOMIC_LDOP
@@ -2738,6 +2743,8 @@
 		    (UNSPEC_URSHL "ur") (UNSPEC_SRSHL "sr")
 		    (UNSPEC_UQRSHL "u") (UNSPEC_SQRSHL "s")
 		    (UNSPEC_SDOT "s") (UNSPEC_UDOT "u")
+		    (UNSPEC_SMATMUL "s") (UNSPEC_UMATMUL "u")
+		    (UNSPEC_USMATMUL "us")
 ])
 
 (define_int_attr r [(UNSPEC_SQDMULH "") (UNSPEC_SQRDMULH "r")
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/vmmla.c b/gcc/testsuite/gcc.target/aarch64/simd/vmmla.c
new file mode 100644
index 00000000000..c3d31f128c6
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/vmmla.c
@@ -0,0 +1,27 @@
+/* { dg-do assemble} */
+/* { dg-require-effective-target arm_v8_2a_i8mm_ok } */
+/* { dg-additional-options "-march=armv8.2-a+i8mm" } */
+
+#include "arm_neon.h"
+
+int32x4_t
+test_vmmlaq_s32 (int32x4_t r, int8x16_t a, int8x16_t b)
+{
+  return vmmlaq_s32 (r, a, b);
+}
+
+uint32x4_t
+test_vmmlaq_u32 (uint32x4_t r, uint8x16_t a, uint8x16_t b)
+{
+  return vmmlaq_u32 (r, a, b);
+}
+
+int32x4_t
+test_vusmmlaq_s32 (int32x4_t r, uint8x16_t a, int8x16_t b)
+{
+  return vusmmlaq_s32 (r, a, b);
+}
+
+/* { dg-final { scan-assembler-times "\tsmmla\\tv\[0-9\]\+\\.4s, v\[0-9\]\+\\.16b, v\[0-9\]\+\\.16b" 1 } } */
+/* { dg-final { scan-assembler-times "\tummla\\tv\[0-9\]\+\\.4s, v\[0-9\]\+\\.16b, v\[0-9\]\+\\.16b" 1 } } */
+/* { dg-final { scan-assembler-times "\tusmmla\\tv\[0-9\]\+\\.4s, v\[0-9\]\+\\.16b, v\[0-9\]\+\\.16b" 1 } } */
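
[Note appended for readers less familiar with these instructions: below is a
rough scalar model of what smmla/ummla/usmmla compute, based on a reading of
the ACLE and ISA documents linked above.  It is an illustrative sketch only,
not part of the patch, and the helper name mmla_model is invented for this
example.  Each 16-byte source operand is viewed as a 2x8 matrix of 8-bit
elements and the 4-lane accumulator as a 2x2 matrix of 32-bit elements; the
operation is roughly r += a * transpose(b).  The asserts reproduce the
expected values (7, 9, -9) from the original runtime test quoted earlier in
the thread.]

/* Editor's sketch: scalar model of the 2x8 by 8x2 matrix multiply-accumulate.
   Element values are held in plain int so the same model covers the signed
   (smmla), unsigned (ummla) and mixed-sign (usmmla) variants.  */
#include <assert.h>
#include <stdint.h>

static void
mmla_model (int32_t r[4], const int a[16], const int b[16])
{
  for (int i = 0; i < 2; i++)        /* row of a                       */
    for (int j = 0; j < 2; j++)      /* row of b, i.e. column of b^T   */
      for (int k = 0; k < 8; k++)    /* dot product over 8 byte lanes  */
        r[2 * i + j] += a[8 * i + k] * b[8 * j + k];
}

int
main (void)
{
  int s8[16], u8[16];
  for (int k = 0; k < 16; k++)
    {
      s8[k] = -1;  /* models the all -1 int8x16_t from the original test  */
      u8[k] = 1;   /* models the all 1 uint8x16_t from the original test  */
    }

  int32_t s32[4] = { -1, -1, -1, -1 };
  mmla_model (s32, s8, s8);          /* vmmlaq_s32: -1 + 8*(-1*-1) = 7   */
  assert (s32[0] == 7);

  int32_t u32[4] = { 1, 1, 1, 1 };
  mmla_model (u32, u8, u8);          /* vmmlaq_u32: 1 + 8*(1*1) = 9      */
  assert (u32[0] == 9);

  int32_t us32[4] = { -1, -1, -1, -1 };
  mmla_model (us32, u8, s8);         /* vusmmlaq_s32: -1 + 8*(1*-1) = -9 */
  assert (us32[0] == -9);

  return 0;
}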