Extend the ternary op/UNSPEC_SEL combiner patterns from SVE_FULL_F/ SVE_FULL_F_BF to SVE_F/SVE_F_BF, where the strictness value is SVE_RELAXED_GP.
We can only reliably test the 'merging with the third input' (addend) and 'independent value' patterns at this stage as the canocalisation that reorders the multiplicands based on the second SEL input would be performed by the conditional expander. Another difficulty is that we can't test these fused multiply/SEL combines without using __builtin_fma and friends. The reason for this is as follows: We support COND_ADD, COND_SUB, and COND_MUL optabs, so match.pd will canonicalize patterns like ADD/SUB/MUL combined with a VEC_COND_EXPR into these conditional forms. Later, when widening_mul tries to fold these into conditional fused multiply operations, the transformation fails - simply because we haven’t implemented those conditional fused multiply optabs yet. Hence why this patch lacks tests for BFloat16... gcc/ChangeLog: * config/aarch64/aarch64-sve.md (*cond_<optab><mode>_2_relaxed): Extend from SVE_FULL_F to SVE_F. (*cond_<optab><mode>_4_relaxed): Extend from SVE_FULL_F_B16B16 to SVE_F_B16B16. (*cond_<optab><mode>_any_relaxed): Likewise. gcc/testsuite/ChangeLog: * gcc.target/aarch64/sve/unpacked_cond_fmla_1.c: New test. * gcc.target/aarch64/sve/unpacked_cond_fmls_1.c: Likewise. * gcc.target/aarch64/sve/unpacked_cond_fnmla_1.c: Likewise. * gcc.target/aarch64/sve/unpacked_cond_fnmls_1.c: Likewise. --- gcc/config/aarch64/aarch64-sve.md | 38 ++++++++-------- .../aarch64/sve/unpacked_cond_fmla_1.c | 43 +++++++++++++++++++ .../aarch64/sve/unpacked_cond_fmls_1.c | 43 +++++++++++++++++++ .../aarch64/sve/unpacked_cond_fnmla_1.c | 43 +++++++++++++++++++ .../aarch64/sve/unpacked_cond_fnmls_1.c | 43 +++++++++++++++++++ 5 files changed, 191 insertions(+), 19 deletions(-) create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fmla_1.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fmls_1.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fnmla_1.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fnmls_1.c diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md index 8c1921ddf5c..e5443980e8b 100644 --- a/gcc/config/aarch64/aarch64-sve.md +++ b/gcc/config/aarch64/aarch64-sve.md @@ -7622,15 +7622,15 @@ ;; Predicated floating-point ternary operations, merging with the ;; first input. (define_insn_and_rewrite "*cond_<optab><mode>_2_relaxed" - [(set (match_operand:SVE_FULL_F 0 "register_operand") - (unspec:SVE_FULL_F + [(set (match_operand:SVE_F 0 "register_operand") + (unspec:SVE_F [(match_operand:<VPRED> 1 "register_operand") - (unspec:SVE_FULL_F + (unspec:SVE_F [(match_operand 5) (const_int SVE_RELAXED_GP) - (match_operand:SVE_FULL_F 2 "register_operand") - (match_operand:SVE_FULL_F 3 "register_operand") - (match_operand:SVE_FULL_F 4 "register_operand")] + (match_operand:SVE_F 2 "register_operand") + (match_operand:SVE_F 3 "register_operand") + (match_operand:SVE_F 4 "register_operand")] SVE_COND_FP_TERNARY) (match_dup 2)] UNSPEC_SEL))] @@ -7668,15 +7668,15 @@ ;; Predicated floating-point ternary operations, merging with the ;; third input. (define_insn_and_rewrite "*cond_<optab><mode>_4_relaxed" - [(set (match_operand:SVE_FULL_F_B16B16 0 "register_operand") - (unspec:SVE_FULL_F_B16B16 + [(set (match_operand:SVE_F_B16B16 0 "register_operand") + (unspec:SVE_F_B16B16 [(match_operand:<VPRED> 1 "register_operand") - (unspec:SVE_FULL_F_B16B16 + (unspec:SVE_F_B16B16 [(match_operand 5) (const_int SVE_RELAXED_GP) - (match_operand:SVE_FULL_F_B16B16 2 "register_operand") - (match_operand:SVE_FULL_F_B16B16 3 "register_operand") - (match_operand:SVE_FULL_F_B16B16 4 "register_operand")] + (match_operand:SVE_F_B16B16 2 "register_operand") + (match_operand:SVE_F_B16B16 3 "register_operand") + (match_operand:SVE_F_B16B16 4 "register_operand")] SVE_COND_FP_TERNARY) (match_dup 4)] UNSPEC_SEL))] @@ -7714,17 +7714,17 @@ ;; Predicated floating-point ternary operations, merging with an ;; independent value. (define_insn_and_rewrite "*cond_<optab><mode>_any_relaxed" - [(set (match_operand:SVE_FULL_F_B16B16 0 "register_operand") - (unspec:SVE_FULL_F_B16B16 + [(set (match_operand:SVE_F_B16B16 0 "register_operand") + (unspec:SVE_F_B16B16 [(match_operand:<VPRED> 1 "register_operand") - (unspec:SVE_FULL_F_B16B16 + (unspec:SVE_F_B16B16 [(match_operand 6) (const_int SVE_RELAXED_GP) - (match_operand:SVE_FULL_F_B16B16 2 "register_operand") - (match_operand:SVE_FULL_F_B16B16 3 "register_operand") - (match_operand:SVE_FULL_F_B16B16 4 "register_operand")] + (match_operand:SVE_F_B16B16 2 "register_operand") + (match_operand:SVE_F_B16B16 3 "register_operand") + (match_operand:SVE_F_B16B16 4 "register_operand")] SVE_COND_FP_TERNARY) - (match_operand:SVE_FULL_F_B16B16 5 "aarch64_simd_reg_or_zero")] + (match_operand:SVE_F_B16B16 5 "aarch64_simd_reg_or_zero")] UNSPEC_SEL))] "TARGET_SVE && (<supports_bf16> || !<is_bf16>) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fmla_1.c b/gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fmla_1.c new file mode 100644 index 00000000000..e2a5bdab1dd --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fmla_1.c @@ -0,0 +1,43 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize -msve-vector-bits=2048 -fno-trapping-math" } */ + +#include <stdint.h> + +#define FMLA(SUFF) __builtin_fma##SUFF (a[i], b[i], c[i]) +#define FMLS(SUFF) __builtin_fma##SUFF (a[i], -b[i], c[i]) +#define FNMLA(SUFF) -FMLA (SUFF) +#define FNMLS(SUFF) -FMLS (SUFF) + +#define a_i a[i] +#define b_i b[i] +#define c_i c[i] + +#define TEST_FN(FN, TYPE0, TYPE1, COUNT, MERGE) \ + void \ + f_##TYPE0##_##TYPE1##_##MERGE (TYPE0 *__restrict out, \ + TYPE0 *__restrict a, \ + TYPE0 *__restrict b, \ + TYPE0 *__restrict c, \ + TYPE1 *__restrict p) \ + { \ + for (unsigned int i = 0; i < COUNT; i++) \ + out[i] = p[i] ? FN : MERGE; \ + } + +#define TEST_ALL(FN, TYPE0, TYPE1, COUNT) \ + TEST_FN (FN, TYPE0, TYPE1, COUNT, c_i) \ + TEST_FN (FN, TYPE0, TYPE1, COUNT, 5) + +TEST_ALL (FMLA (f16), _Float16, uint64_t, 32) + +TEST_ALL (FMLA (f16), _Float16, uint32_t, 64) + +TEST_ALL (FMLA (f32), float, uint64_t, 32) + +/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tfmla\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 2 } } */ + +/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h\n} 2 } } */ +/* { dg-final { scan-assembler-times {\tfmla\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 4 } } */ + +/* { dg-final { scan-assembler-not {\tsel\t} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fmls_1.c b/gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fmls_1.c new file mode 100644 index 00000000000..5a83fab8ec7 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fmls_1.c @@ -0,0 +1,43 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize -msve-vector-bits=2048 -fno-trapping-math" } */ + +#include <stdint.h> + +#define FMLA(SUFF) __builtin_fma##SUFF (a[i], b[i], c[i]) +#define FMLS(SUFF) __builtin_fma##SUFF (a[i], -b[i], c[i]) +#define FNMLA(SUFF) -FMLA (SUFF) +#define FNMLS(SUFF) -FMLS (SUFF) + +#define a_i a[i] +#define b_i b[i] +#define c_i c[i] + +#define TEST_FN(FN, TYPE0, TYPE1, COUNT, MERGE) \ + void \ + f_##TYPE0##_##TYPE1##_##MERGE (TYPE0 *__restrict out, \ + TYPE0 *__restrict a, \ + TYPE0 *__restrict b, \ + TYPE0 *__restrict c, \ + TYPE1 *__restrict p) \ + { \ + for (unsigned int i = 0; i < COUNT; i++) \ + out[i] = p[i] ? FN : MERGE; \ + } + +#define TEST_ALL(FN, TYPE0, TYPE1, COUNT) \ + TEST_FN (FN, TYPE0, TYPE1, COUNT, c_i) \ + TEST_FN (FN, TYPE0, TYPE1, COUNT, 5) + +TEST_ALL (FMLS (f16), _Float16, uint64_t, 32) + +TEST_ALL (FMLS (f16), _Float16, uint32_t, 64) + +TEST_ALL (FMLS (f32), float, uint64_t, 32) + +/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tfmls\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 2 } } */ + +/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h\n} 2 } } */ +/* { dg-final { scan-assembler-times {\tfmls\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 4 } } */ + +/* { dg-final { scan-assembler-not {\tsel\t} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fnmla_1.c b/gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fnmla_1.c new file mode 100644 index 00000000000..5d4eff806dd --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fnmla_1.c @@ -0,0 +1,43 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize -msve-vector-bits=2048 -fno-trapping-math" } */ + +#include <stdint.h> + +#define FMLA(SUFF) __builtin_fma##SUFF (a[i], b[i], c[i]) +#define FMLS(SUFF) __builtin_fma##SUFF (a[i], -b[i], c[i]) +#define FNMLA(SUFF) -FMLA (SUFF) +#define FNMLS(SUFF) -FMLS (SUFF) + +#define a_i a[i] +#define b_i b[i] +#define c_i c[i] + +#define TEST_FN(FN, TYPE0, TYPE1, COUNT, MERGE) \ + void \ + f_##TYPE0##_##TYPE1##_##MERGE (TYPE0 *__restrict out, \ + TYPE0 *__restrict a, \ + TYPE0 *__restrict b, \ + TYPE0 *__restrict c, \ + TYPE1 *__restrict p) \ + { \ + for (unsigned int i = 0; i < COUNT; i++) \ + out[i] = p[i] ? FN : MERGE; \ + } + +#define TEST_ALL(FN, TYPE0, TYPE1, COUNT) \ + TEST_FN (FN, TYPE0, TYPE1, COUNT, c_i) \ + TEST_FN (FN, TYPE0, TYPE1, COUNT, 5) + +TEST_ALL (FNMLA (f16), _Float16, uint64_t, 32) + +TEST_ALL (FNMLA (f16), _Float16, uint32_t, 64) + +TEST_ALL (FNMLA (f32), float, uint64_t, 32) + +/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tfnmla\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 2 } } */ + +/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h\n} 2 } } */ +/* { dg-final { scan-assembler-times {\tfnmla\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 4 } } */ + +/* { dg-final { scan-assembler-not {\tsel\t} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fnmls_1.c b/gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fnmls_1.c new file mode 100644 index 00000000000..5569bbabd7d --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fnmls_1.c @@ -0,0 +1,43 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize -msve-vector-bits=2048 -fno-trapping-math" } */ + +#include <stdint.h> + +#define FMLA(SUFF) __builtin_fma##SUFF (a[i], b[i], c[i]) +#define FMLS(SUFF) __builtin_fma##SUFF (a[i], -b[i], c[i]) +#define FNMLA(SUFF) -FMLA (SUFF) +#define FNMLS(SUFF) -FMLS (SUFF) + +#define a_i a[i] +#define b_i b[i] +#define c_i c[i] + +#define TEST_FN(FN, TYPE0, TYPE1, COUNT, MERGE) \ + void \ + f_##TYPE0##_##TYPE1##_##MERGE (TYPE0 *__restrict out, \ + TYPE0 *__restrict a, \ + TYPE0 *__restrict b, \ + TYPE0 *__restrict c, \ + TYPE1 *__restrict p) \ + { \ + for (unsigned int i = 0; i < COUNT; i++) \ + out[i] = p[i] ? FN : MERGE; \ + } + +#define TEST_ALL(FN, TYPE0, TYPE1, COUNT) \ + TEST_FN (FN, TYPE0, TYPE1, COUNT, c_i) \ + TEST_FN (FN, TYPE0, TYPE1, COUNT, 5) + +TEST_ALL (FNMLS (f16), _Float16, uint64_t, 32) + +TEST_ALL (FNMLS (f16), _Float16, uint32_t, 64) + +TEST_ALL (FNMLS (f32), float, uint64_t, 32) + +/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tfnmls\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 2 } } */ + +/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h\n} 2 } } */ +/* { dg-final { scan-assembler-times {\tfnmls\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 4 } } */ + +/* { dg-final { scan-assembler-not {\tsel\t} } } */ -- 2.34.1