Spencer Abson <spencer.ab...@arm.com> writes: > Extend the ternary op/UNSPEC_SEL combiner patterns from SVE_FULL_F/ > SVE_FULL_F_BF to SVE_F/SVE_F_BF, where the strictness value is > SVE_RELAXED_GP. > > We can only reliably test the 'merging with the third input' (addend) > and 'independent value' patterns at this stage as the canocalisation that > reorders the multiplicands based on the second SEL input would be performed > by the conditional expander. > > Another difficulty is that we can't test these fused multiply/SEL combines > without using __builtin_fma and friends. The reason for this is as follows: > > We support COND_ADD, COND_SUB, and COND_MUL optabs, so match.pd will > canonicalize patterns like ADD/SUB/MUL combined with a VEC_COND_EXPR into > these conditional forms. Later, when widening_mul tries to fold these into > conditional fused multiply operations, the transformation fails - simply > because we haven’t implemented those conditional fused multiply optabs yet. > > Hence why this patch lacks tests for BFloat16... > > gcc/ChangeLog: > > * config/aarch64/aarch64-sve.md (*cond_<optab><mode>_2_relaxed): > Extend from SVE_FULL_F to SVE_F. > (*cond_<optab><mode>_4_relaxed): Extend from SVE_FULL_F_B16B16 > to SVE_F_B16B16. > (*cond_<optab><mode>_any_relaxed): Likewise. > > gcc/testsuite/ChangeLog: > > * gcc.target/aarch64/sve/unpacked_cond_fmla_1.c: New test. > * gcc.target/aarch64/sve/unpacked_cond_fmls_1.c: Likewise. > * gcc.target/aarch64/sve/unpacked_cond_fnmla_1.c: Likewise. > * gcc.target/aarch64/sve/unpacked_cond_fnmls_1.c: Likewise.
OK, thanks. BTW, I just realised that all my comments saying "please add a token test that SEL doesn't get dropped" are probably completely bogus, since those cases will instead be handed by the follow-on patches to the conditional optabs. Is that right? Please ignore if so. :) Sorry, I've been thinking about this a patch at a time while ignoring the bigger picture. Richard > --- > gcc/config/aarch64/aarch64-sve.md | 38 ++++++++-------- > .../aarch64/sve/unpacked_cond_fmla_1.c | 43 +++++++++++++++++++ > .../aarch64/sve/unpacked_cond_fmls_1.c | 43 +++++++++++++++++++ > .../aarch64/sve/unpacked_cond_fnmla_1.c | 43 +++++++++++++++++++ > .../aarch64/sve/unpacked_cond_fnmls_1.c | 43 +++++++++++++++++++ > 5 files changed, 191 insertions(+), 19 deletions(-) > create mode 100644 > gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fmla_1.c > create mode 100644 > gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fmls_1.c > create mode 100644 > gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fnmla_1.c > create mode 100644 > gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fnmls_1.c > > diff --git a/gcc/config/aarch64/aarch64-sve.md > b/gcc/config/aarch64/aarch64-sve.md > index 8c1921ddf5c..e5443980e8b 100644 > --- a/gcc/config/aarch64/aarch64-sve.md > +++ b/gcc/config/aarch64/aarch64-sve.md > @@ -7622,15 +7622,15 @@ > ;; Predicated floating-point ternary operations, merging with the > ;; first input. > (define_insn_and_rewrite "*cond_<optab><mode>_2_relaxed" > - [(set (match_operand:SVE_FULL_F 0 "register_operand") > - (unspec:SVE_FULL_F > + [(set (match_operand:SVE_F 0 "register_operand") > + (unspec:SVE_F > [(match_operand:<VPRED> 1 "register_operand") > - (unspec:SVE_FULL_F > + (unspec:SVE_F > [(match_operand 5) > (const_int SVE_RELAXED_GP) > - (match_operand:SVE_FULL_F 2 "register_operand") > - (match_operand:SVE_FULL_F 3 "register_operand") > - (match_operand:SVE_FULL_F 4 "register_operand")] > + (match_operand:SVE_F 2 "register_operand") > + (match_operand:SVE_F 3 "register_operand") > + (match_operand:SVE_F 4 "register_operand")] > SVE_COND_FP_TERNARY) > (match_dup 2)] > UNSPEC_SEL))] > @@ -7668,15 +7668,15 @@ > ;; Predicated floating-point ternary operations, merging with the > ;; third input. > (define_insn_and_rewrite "*cond_<optab><mode>_4_relaxed" > - [(set (match_operand:SVE_FULL_F_B16B16 0 "register_operand") > - (unspec:SVE_FULL_F_B16B16 > + [(set (match_operand:SVE_F_B16B16 0 "register_operand") > + (unspec:SVE_F_B16B16 > [(match_operand:<VPRED> 1 "register_operand") > - (unspec:SVE_FULL_F_B16B16 > + (unspec:SVE_F_B16B16 > [(match_operand 5) > (const_int SVE_RELAXED_GP) > - (match_operand:SVE_FULL_F_B16B16 2 "register_operand") > - (match_operand:SVE_FULL_F_B16B16 3 "register_operand") > - (match_operand:SVE_FULL_F_B16B16 4 "register_operand")] > + (match_operand:SVE_F_B16B16 2 "register_operand") > + (match_operand:SVE_F_B16B16 3 "register_operand") > + (match_operand:SVE_F_B16B16 4 "register_operand")] > SVE_COND_FP_TERNARY) > (match_dup 4)] > UNSPEC_SEL))] > @@ -7714,17 +7714,17 @@ > ;; Predicated floating-point ternary operations, merging with an > ;; independent value. > (define_insn_and_rewrite "*cond_<optab><mode>_any_relaxed" > - [(set (match_operand:SVE_FULL_F_B16B16 0 "register_operand") > - (unspec:SVE_FULL_F_B16B16 > + [(set (match_operand:SVE_F_B16B16 0 "register_operand") > + (unspec:SVE_F_B16B16 > [(match_operand:<VPRED> 1 "register_operand") > - (unspec:SVE_FULL_F_B16B16 > + (unspec:SVE_F_B16B16 > [(match_operand 6) > (const_int SVE_RELAXED_GP) > - (match_operand:SVE_FULL_F_B16B16 2 "register_operand") > - (match_operand:SVE_FULL_F_B16B16 3 "register_operand") > - (match_operand:SVE_FULL_F_B16B16 4 "register_operand")] > + (match_operand:SVE_F_B16B16 2 "register_operand") > + (match_operand:SVE_F_B16B16 3 "register_operand") > + (match_operand:SVE_F_B16B16 4 "register_operand")] > SVE_COND_FP_TERNARY) > - (match_operand:SVE_FULL_F_B16B16 5 "aarch64_simd_reg_or_zero")] > + (match_operand:SVE_F_B16B16 5 "aarch64_simd_reg_or_zero")] > UNSPEC_SEL))] > "TARGET_SVE > && (<supports_bf16> || !<is_bf16>) > diff --git a/gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fmla_1.c > b/gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fmla_1.c > new file mode 100644 > index 00000000000..e2a5bdab1dd > --- /dev/null > +++ b/gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fmla_1.c > @@ -0,0 +1,43 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O2 -ftree-vectorize -msve-vector-bits=2048 > -fno-trapping-math" } */ > + > +#include <stdint.h> > + > +#define FMLA(SUFF) __builtin_fma##SUFF (a[i], b[i], c[i]) > +#define FMLS(SUFF) __builtin_fma##SUFF (a[i], -b[i], c[i]) > +#define FNMLA(SUFF) -FMLA (SUFF) > +#define FNMLS(SUFF) -FMLS (SUFF) > + > +#define a_i a[i] > +#define b_i b[i] > +#define c_i c[i] > + > +#define TEST_FN(FN, TYPE0, TYPE1, COUNT, MERGE) \ > + void \ > + f_##TYPE0##_##TYPE1##_##MERGE (TYPE0 *__restrict out, \ > + TYPE0 *__restrict a, \ > + TYPE0 *__restrict b, \ > + TYPE0 *__restrict c, \ > + TYPE1 *__restrict p) \ > + { \ > + for (unsigned int i = 0; i < COUNT; i++) \ > + out[i] = p[i] ? FN : MERGE; \ > + } > + > +#define TEST_ALL(FN, TYPE0, TYPE1, COUNT) \ > + TEST_FN (FN, TYPE0, TYPE1, COUNT, c_i) \ > + TEST_FN (FN, TYPE0, TYPE1, COUNT, 5) > + > +TEST_ALL (FMLA (f16), _Float16, uint64_t, 32) > + > +TEST_ALL (FMLA (f16), _Float16, uint32_t, 64) > + > +TEST_ALL (FMLA (f32), float, uint64_t, 32) > + > +/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.s, p[0-7]/m, > z[0-9]+\.s\n} 1 } } */ > +/* { dg-final { scan-assembler-times {\tfmla\tz[0-9]+\.s, p[0-7]/m, > z[0-9]+\.s, z[0-9]+\.s\n} 2 } } */ > + > +/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.h, p[0-7]/m, > z[0-9]+\.h\n} 2 } } */ > +/* { dg-final { scan-assembler-times {\tfmla\tz[0-9]+\.h, p[0-7]/m, > z[0-9]+\.h, z[0-9]+\.h\n} 4 } } */ > + > +/* { dg-final { scan-assembler-not {\tsel\t} } } */ > diff --git a/gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fmls_1.c > b/gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fmls_1.c > new file mode 100644 > index 00000000000..5a83fab8ec7 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fmls_1.c > @@ -0,0 +1,43 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O2 -ftree-vectorize -msve-vector-bits=2048 > -fno-trapping-math" } */ > + > +#include <stdint.h> > + > +#define FMLA(SUFF) __builtin_fma##SUFF (a[i], b[i], c[i]) > +#define FMLS(SUFF) __builtin_fma##SUFF (a[i], -b[i], c[i]) > +#define FNMLA(SUFF) -FMLA (SUFF) > +#define FNMLS(SUFF) -FMLS (SUFF) > + > +#define a_i a[i] > +#define b_i b[i] > +#define c_i c[i] > + > +#define TEST_FN(FN, TYPE0, TYPE1, COUNT, MERGE) \ > + void \ > + f_##TYPE0##_##TYPE1##_##MERGE (TYPE0 *__restrict out, \ > + TYPE0 *__restrict a, \ > + TYPE0 *__restrict b, \ > + TYPE0 *__restrict c, \ > + TYPE1 *__restrict p) \ > + { \ > + for (unsigned int i = 0; i < COUNT; i++) \ > + out[i] = p[i] ? FN : MERGE; \ > + } > + > +#define TEST_ALL(FN, TYPE0, TYPE1, COUNT) \ > + TEST_FN (FN, TYPE0, TYPE1, COUNT, c_i) \ > + TEST_FN (FN, TYPE0, TYPE1, COUNT, 5) > + > +TEST_ALL (FMLS (f16), _Float16, uint64_t, 32) > + > +TEST_ALL (FMLS (f16), _Float16, uint32_t, 64) > + > +TEST_ALL (FMLS (f32), float, uint64_t, 32) > + > +/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.s, p[0-7]/m, > z[0-9]+\.s\n} 1 } } */ > +/* { dg-final { scan-assembler-times {\tfmls\tz[0-9]+\.s, p[0-7]/m, > z[0-9]+\.s, z[0-9]+\.s\n} 2 } } */ > + > +/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.h, p[0-7]/m, > z[0-9]+\.h\n} 2 } } */ > +/* { dg-final { scan-assembler-times {\tfmls\tz[0-9]+\.h, p[0-7]/m, > z[0-9]+\.h, z[0-9]+\.h\n} 4 } } */ > + > +/* { dg-final { scan-assembler-not {\tsel\t} } } */ > diff --git a/gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fnmla_1.c > b/gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fnmla_1.c > new file mode 100644 > index 00000000000..5d4eff806dd > --- /dev/null > +++ b/gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fnmla_1.c > @@ -0,0 +1,43 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O2 -ftree-vectorize -msve-vector-bits=2048 > -fno-trapping-math" } */ > + > +#include <stdint.h> > + > +#define FMLA(SUFF) __builtin_fma##SUFF (a[i], b[i], c[i]) > +#define FMLS(SUFF) __builtin_fma##SUFF (a[i], -b[i], c[i]) > +#define FNMLA(SUFF) -FMLA (SUFF) > +#define FNMLS(SUFF) -FMLS (SUFF) > + > +#define a_i a[i] > +#define b_i b[i] > +#define c_i c[i] > + > +#define TEST_FN(FN, TYPE0, TYPE1, COUNT, MERGE) \ > + void \ > + f_##TYPE0##_##TYPE1##_##MERGE (TYPE0 *__restrict out, \ > + TYPE0 *__restrict a, \ > + TYPE0 *__restrict b, \ > + TYPE0 *__restrict c, \ > + TYPE1 *__restrict p) \ > + { \ > + for (unsigned int i = 0; i < COUNT; i++) \ > + out[i] = p[i] ? FN : MERGE; \ > + } > + > +#define TEST_ALL(FN, TYPE0, TYPE1, COUNT) \ > + TEST_FN (FN, TYPE0, TYPE1, COUNT, c_i) \ > + TEST_FN (FN, TYPE0, TYPE1, COUNT, 5) > + > +TEST_ALL (FNMLA (f16), _Float16, uint64_t, 32) > + > +TEST_ALL (FNMLA (f16), _Float16, uint32_t, 64) > + > +TEST_ALL (FNMLA (f32), float, uint64_t, 32) > + > +/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.s, p[0-7]/m, > z[0-9]+\.s\n} 1 } } */ > +/* { dg-final { scan-assembler-times {\tfnmla\tz[0-9]+\.s, p[0-7]/m, > z[0-9]+\.s, z[0-9]+\.s\n} 2 } } */ > + > +/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.h, p[0-7]/m, > z[0-9]+\.h\n} 2 } } */ > +/* { dg-final { scan-assembler-times {\tfnmla\tz[0-9]+\.h, p[0-7]/m, > z[0-9]+\.h, z[0-9]+\.h\n} 4 } } */ > + > +/* { dg-final { scan-assembler-not {\tsel\t} } } */ > diff --git a/gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fnmls_1.c > b/gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fnmls_1.c > new file mode 100644 > index 00000000000..5569bbabd7d > --- /dev/null > +++ b/gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fnmls_1.c > @@ -0,0 +1,43 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O2 -ftree-vectorize -msve-vector-bits=2048 > -fno-trapping-math" } */ > + > +#include <stdint.h> > + > +#define FMLA(SUFF) __builtin_fma##SUFF (a[i], b[i], c[i]) > +#define FMLS(SUFF) __builtin_fma##SUFF (a[i], -b[i], c[i]) > +#define FNMLA(SUFF) -FMLA (SUFF) > +#define FNMLS(SUFF) -FMLS (SUFF) > + > +#define a_i a[i] > +#define b_i b[i] > +#define c_i c[i] > + > +#define TEST_FN(FN, TYPE0, TYPE1, COUNT, MERGE) \ > + void \ > + f_##TYPE0##_##TYPE1##_##MERGE (TYPE0 *__restrict out, \ > + TYPE0 *__restrict a, \ > + TYPE0 *__restrict b, \ > + TYPE0 *__restrict c, \ > + TYPE1 *__restrict p) \ > + { \ > + for (unsigned int i = 0; i < COUNT; i++) \ > + out[i] = p[i] ? FN : MERGE; \ > + } > + > +#define TEST_ALL(FN, TYPE0, TYPE1, COUNT) \ > + TEST_FN (FN, TYPE0, TYPE1, COUNT, c_i) \ > + TEST_FN (FN, TYPE0, TYPE1, COUNT, 5) > + > +TEST_ALL (FNMLS (f16), _Float16, uint64_t, 32) > + > +TEST_ALL (FNMLS (f16), _Float16, uint32_t, 64) > + > +TEST_ALL (FNMLS (f32), float, uint64_t, 32) > + > +/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.s, p[0-7]/m, > z[0-9]+\.s\n} 1 } } */ > +/* { dg-final { scan-assembler-times {\tfnmls\tz[0-9]+\.s, p[0-7]/m, > z[0-9]+\.s, z[0-9]+\.s\n} 2 } } */ > + > +/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.h, p[0-7]/m, > z[0-9]+\.h\n} 2 } } */ > +/* { dg-final { scan-assembler-times {\tfnmls\tz[0-9]+\.h, p[0-7]/m, > z[0-9]+\.h, z[0-9]+\.h\n} 4 } } */ > + > +/* { dg-final { scan-assembler-not {\tsel\t} } } */