[PATCH 13/14] aarch64: Relaxed SEL combiner patterns for unpacked SVE FP ternary arithmetic

Spencer Abson Mon, 02 Jun 2025 03:13:00 -0700

Extend the ternary op/UNSPEC_SEL combiner patterns from SVE_FULL_F/
SVE_FULL_F_BF to SVE_F/SVE_F_BF, where the strictness value is
SVE_RELAXED_GP.


We can only reliably test the 'merging with the third input' (addend)
and 'independent value' patterns at this stage as the canocalisation that
reorders the multiplicands based on the second SEL input would be performed
by the conditional expander.

Another difficulty is that we can't test these fused multiply/SEL combines
without using __builtin_fma and friends.  The reason for this is as follows:

We support COND_ADD, COND_SUB, and COND_MUL optabs, so match.pd will
canonicalize patterns like ADD/SUB/MUL combined with a VEC_COND_EXPR into
these conditional forms.  Later, when widening_mul tries to fold these into
conditional fused multiply operations, the transformation fails - simply
because we haven’t implemented those conditional fused multiply optabs yet.

Hence why this patch lacks tests for BFloat16...

gcc/ChangeLog:

        * config/aarch64/aarch64-sve.md (*cond_<optab><mode>_2_relaxed):
        Extend from SVE_FULL_F to SVE_F.
        (*cond_<optab><mode>_4_relaxed): Extend from SVE_FULL_F_B16B16
        to SVE_F_B16B16.
        (*cond_<optab><mode>_any_relaxed): Likewise.

gcc/testsuite/ChangeLog:

        * gcc.target/aarch64/sve/unpacked_cond_fmla_1.c: New test.
        * gcc.target/aarch64/sve/unpacked_cond_fmls_1.c: Likewise.
        * gcc.target/aarch64/sve/unpacked_cond_fnmla_1.c: Likewise.
        * gcc.target/aarch64/sve/unpacked_cond_fnmls_1.c: Likewise.
---
 gcc/config/aarch64/aarch64-sve.md             | 38 ++++++++--------
 .../aarch64/sve/unpacked_cond_fmla_1.c        | 43 +++++++++++++++++++
 .../aarch64/sve/unpacked_cond_fmls_1.c        | 43 +++++++++++++++++++
 .../aarch64/sve/unpacked_cond_fnmla_1.c       | 43 +++++++++++++++++++
 .../aarch64/sve/unpacked_cond_fnmls_1.c       | 43 +++++++++++++++++++
 5 files changed, 191 insertions(+), 19 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fmla_1.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fmls_1.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fnmla_1.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fnmls_1.c

diff --git a/gcc/config/aarch64/aarch64-sve.md 
b/gcc/config/aarch64/aarch64-sve.md
index 8c1921ddf5c..e5443980e8b 100644
--- a/gcc/config/aarch64/aarch64-sve.md
+++ b/gcc/config/aarch64/aarch64-sve.md
@@ -7622,15 +7622,15 @@
 ;; Predicated floating-point ternary operations, merging with the
 ;; first input.
 (define_insn_and_rewrite "*cond_<optab><mode>_2_relaxed"
-  [(set (match_operand:SVE_FULL_F 0 "register_operand")
-       (unspec:SVE_FULL_F
+  [(set (match_operand:SVE_F 0 "register_operand")
+       (unspec:SVE_F
          [(match_operand:<VPRED> 1 "register_operand")
-          (unspec:SVE_FULL_F
+          (unspec:SVE_F
             [(match_operand 5)
              (const_int SVE_RELAXED_GP)
-             (match_operand:SVE_FULL_F 2 "register_operand")
-             (match_operand:SVE_FULL_F 3 "register_operand")
-             (match_operand:SVE_FULL_F 4 "register_operand")]
+             (match_operand:SVE_F 2 "register_operand")
+             (match_operand:SVE_F 3 "register_operand")
+             (match_operand:SVE_F 4 "register_operand")]
             SVE_COND_FP_TERNARY)
           (match_dup 2)]
          UNSPEC_SEL))]
@@ -7668,15 +7668,15 @@
 ;; Predicated floating-point ternary operations, merging with the
 ;; third input.
 (define_insn_and_rewrite "*cond_<optab><mode>_4_relaxed"
-  [(set (match_operand:SVE_FULL_F_B16B16 0 "register_operand")
-       (unspec:SVE_FULL_F_B16B16
+  [(set (match_operand:SVE_F_B16B16 0 "register_operand")
+       (unspec:SVE_F_B16B16
          [(match_operand:<VPRED> 1 "register_operand")
-          (unspec:SVE_FULL_F_B16B16
+          (unspec:SVE_F_B16B16
             [(match_operand 5)
              (const_int SVE_RELAXED_GP)
-             (match_operand:SVE_FULL_F_B16B16 2 "register_operand")
-             (match_operand:SVE_FULL_F_B16B16 3 "register_operand")
-             (match_operand:SVE_FULL_F_B16B16 4 "register_operand")]
+             (match_operand:SVE_F_B16B16 2 "register_operand")
+             (match_operand:SVE_F_B16B16 3 "register_operand")
+             (match_operand:SVE_F_B16B16 4 "register_operand")]
             SVE_COND_FP_TERNARY)
           (match_dup 4)]
          UNSPEC_SEL))]
@@ -7714,17 +7714,17 @@
 ;; Predicated floating-point ternary operations, merging with an
 ;; independent value.
 (define_insn_and_rewrite "*cond_<optab><mode>_any_relaxed"
-  [(set (match_operand:SVE_FULL_F_B16B16 0 "register_operand")
-       (unspec:SVE_FULL_F_B16B16
+  [(set (match_operand:SVE_F_B16B16 0 "register_operand")
+       (unspec:SVE_F_B16B16
          [(match_operand:<VPRED> 1 "register_operand")
-          (unspec:SVE_FULL_F_B16B16
+          (unspec:SVE_F_B16B16
             [(match_operand 6)
              (const_int SVE_RELAXED_GP)
-             (match_operand:SVE_FULL_F_B16B16 2 "register_operand")
-             (match_operand:SVE_FULL_F_B16B16 3 "register_operand")
-             (match_operand:SVE_FULL_F_B16B16 4 "register_operand")]
+             (match_operand:SVE_F_B16B16 2 "register_operand")
+             (match_operand:SVE_F_B16B16 3 "register_operand")
+             (match_operand:SVE_F_B16B16 4 "register_operand")]
             SVE_COND_FP_TERNARY)
-          (match_operand:SVE_FULL_F_B16B16 5 "aarch64_simd_reg_or_zero")]
+          (match_operand:SVE_F_B16B16 5 "aarch64_simd_reg_or_zero")]
          UNSPEC_SEL))]
   "TARGET_SVE
    && (<supports_bf16> || !<is_bf16>)
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fmla_1.c 
b/gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fmla_1.c
new file mode 100644
index 00000000000..e2a5bdab1dd
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fmla_1.c
@@ -0,0 +1,43 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize -msve-vector-bits=2048 
-fno-trapping-math" } */
+
+#include <stdint.h>
+
+#define FMLA(SUFF)  __builtin_fma##SUFF (a[i], b[i], c[i])
+#define FMLS(SUFF)  __builtin_fma##SUFF (a[i], -b[i], c[i])
+#define FNMLA(SUFF) -FMLA (SUFF)
+#define FNMLS(SUFF) -FMLS (SUFF)
+
+#define a_i a[i]
+#define b_i b[i]
+#define c_i c[i]
+
+#define TEST_FN(FN, TYPE0, TYPE1, COUNT, MERGE)                        \
+  void                                                         \
+  f_##TYPE0##_##TYPE1##_##MERGE (TYPE0 *__restrict out,                \
+                                TYPE0 *__restrict a,           \
+                                TYPE0 *__restrict b,           \
+                                TYPE0 *__restrict c,           \
+                                TYPE1 *__restrict p)           \
+  {                                                            \
+    for (unsigned int i = 0; i < COUNT; i++)                   \
+      out[i] = p[i] ? FN : MERGE;                              \
+  }
+
+#define TEST_ALL(FN, TYPE0, TYPE1, COUNT) \
+  TEST_FN (FN, TYPE0, TYPE1, COUNT, c_i)  \
+  TEST_FN (FN, TYPE0, TYPE1, COUNT, 5)
+
+TEST_ALL (FMLA (f16), _Float16, uint64_t, 32)
+
+TEST_ALL (FMLA (f16), _Float16, uint32_t, 64)
+
+TEST_ALL (FMLA (f32), float, uint64_t, 32)
+
+/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.s, p[0-7]/m, 
z[0-9]+\.s\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmla\tz[0-9]+\.s, p[0-7]/m, 
z[0-9]+\.s, z[0-9]+\.s\n} 2 } } */
+
+/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.h, p[0-7]/m, 
z[0-9]+\.h\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tfmla\tz[0-9]+\.h, p[0-7]/m, 
z[0-9]+\.h, z[0-9]+\.h\n} 4 } } */
+
+/* { dg-final { scan-assembler-not {\tsel\t} } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fmls_1.c 
b/gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fmls_1.c
new file mode 100644
index 00000000000..5a83fab8ec7
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fmls_1.c
@@ -0,0 +1,43 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize -msve-vector-bits=2048 
-fno-trapping-math" } */
+
+#include <stdint.h>
+
+#define FMLA(SUFF)  __builtin_fma##SUFF (a[i], b[i], c[i])
+#define FMLS(SUFF)  __builtin_fma##SUFF (a[i], -b[i], c[i])
+#define FNMLA(SUFF) -FMLA (SUFF)
+#define FNMLS(SUFF) -FMLS (SUFF)
+
+#define a_i a[i]
+#define b_i b[i]
+#define c_i c[i]
+
+#define TEST_FN(FN, TYPE0, TYPE1, COUNT, MERGE)                        \
+  void                                                         \
+  f_##TYPE0##_##TYPE1##_##MERGE (TYPE0 *__restrict out,                \
+                                TYPE0 *__restrict a,           \
+                                TYPE0 *__restrict b,           \
+                                TYPE0 *__restrict c,           \
+                                TYPE1 *__restrict p)           \
+  {                                                            \
+    for (unsigned int i = 0; i < COUNT; i++)                   \
+      out[i] = p[i] ? FN : MERGE;                              \
+  }
+
+#define TEST_ALL(FN, TYPE0, TYPE1, COUNT) \
+  TEST_FN (FN, TYPE0, TYPE1, COUNT, c_i)  \
+  TEST_FN (FN, TYPE0, TYPE1, COUNT, 5)
+
+TEST_ALL (FMLS (f16), _Float16, uint64_t, 32)
+
+TEST_ALL (FMLS (f16), _Float16, uint32_t, 64)
+
+TEST_ALL (FMLS (f32), float, uint64_t, 32)
+
+/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.s, p[0-7]/m, 
z[0-9]+\.s\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmls\tz[0-9]+\.s, p[0-7]/m, 
z[0-9]+\.s, z[0-9]+\.s\n} 2 } } */
+
+/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.h, p[0-7]/m, 
z[0-9]+\.h\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tfmls\tz[0-9]+\.h, p[0-7]/m, 
z[0-9]+\.h, z[0-9]+\.h\n} 4 } } */
+
+/* { dg-final { scan-assembler-not {\tsel\t} } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fnmla_1.c 
b/gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fnmla_1.c
new file mode 100644
index 00000000000..5d4eff806dd
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fnmla_1.c
@@ -0,0 +1,43 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize -msve-vector-bits=2048 
-fno-trapping-math" } */
+
+#include <stdint.h>
+
+#define FMLA(SUFF)  __builtin_fma##SUFF (a[i], b[i], c[i])
+#define FMLS(SUFF)  __builtin_fma##SUFF (a[i], -b[i], c[i])
+#define FNMLA(SUFF) -FMLA (SUFF)
+#define FNMLS(SUFF) -FMLS (SUFF)
+
+#define a_i a[i]
+#define b_i b[i]
+#define c_i c[i]
+
+#define TEST_FN(FN, TYPE0, TYPE1, COUNT, MERGE)                        \
+  void                                                         \
+  f_##TYPE0##_##TYPE1##_##MERGE (TYPE0 *__restrict out,                \
+                                TYPE0 *__restrict a,           \
+                                TYPE0 *__restrict b,           \
+                                TYPE0 *__restrict c,           \
+                                TYPE1 *__restrict p)           \
+  {                                                            \
+    for (unsigned int i = 0; i < COUNT; i++)                   \
+      out[i] = p[i] ? FN : MERGE;                              \
+  }
+
+#define TEST_ALL(FN, TYPE0, TYPE1, COUNT) \
+  TEST_FN (FN, TYPE0, TYPE1, COUNT, c_i)  \
+  TEST_FN (FN, TYPE0, TYPE1, COUNT, 5)
+
+TEST_ALL (FNMLA (f16), _Float16, uint64_t, 32)
+
+TEST_ALL (FNMLA (f16), _Float16, uint32_t, 64)
+
+TEST_ALL (FNMLA (f32), float, uint64_t, 32)
+
+/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.s, p[0-7]/m, 
z[0-9]+\.s\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tfnmla\tz[0-9]+\.s, p[0-7]/m, 
z[0-9]+\.s, z[0-9]+\.s\n} 2 } } */
+
+/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.h, p[0-7]/m, 
z[0-9]+\.h\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tfnmla\tz[0-9]+\.h, p[0-7]/m, 
z[0-9]+\.h, z[0-9]+\.h\n} 4 } } */
+
+/* { dg-final { scan-assembler-not {\tsel\t} } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fnmls_1.c 
b/gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fnmls_1.c
new file mode 100644
index 00000000000..5569bbabd7d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fnmls_1.c
@@ -0,0 +1,43 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize -msve-vector-bits=2048 
-fno-trapping-math" } */
+
+#include <stdint.h>
+
+#define FMLA(SUFF)  __builtin_fma##SUFF (a[i], b[i], c[i])
+#define FMLS(SUFF)  __builtin_fma##SUFF (a[i], -b[i], c[i])
+#define FNMLA(SUFF) -FMLA (SUFF)
+#define FNMLS(SUFF) -FMLS (SUFF)
+
+#define a_i a[i]
+#define b_i b[i]
+#define c_i c[i]
+
+#define TEST_FN(FN, TYPE0, TYPE1, COUNT, MERGE)                        \
+  void                                                         \
+  f_##TYPE0##_##TYPE1##_##MERGE (TYPE0 *__restrict out,                \
+                                TYPE0 *__restrict a,           \
+                                TYPE0 *__restrict b,           \
+                                TYPE0 *__restrict c,           \
+                                TYPE1 *__restrict p)           \
+  {                                                            \
+    for (unsigned int i = 0; i < COUNT; i++)                   \
+      out[i] = p[i] ? FN : MERGE;                              \
+  }
+
+#define TEST_ALL(FN, TYPE0, TYPE1, COUNT) \
+  TEST_FN (FN, TYPE0, TYPE1, COUNT, c_i)  \
+  TEST_FN (FN, TYPE0, TYPE1, COUNT, 5)
+
+TEST_ALL (FNMLS (f16), _Float16, uint64_t, 32)
+
+TEST_ALL (FNMLS (f16), _Float16, uint32_t, 64)
+
+TEST_ALL (FNMLS (f32), float, uint64_t, 32)
+
+/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.s, p[0-7]/m, 
z[0-9]+\.s\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tfnmls\tz[0-9]+\.s, p[0-7]/m, 
z[0-9]+\.s, z[0-9]+\.s\n} 2 } } */
+
+/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.h, p[0-7]/m, 
z[0-9]+\.h\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tfnmls\tz[0-9]+\.h, p[0-7]/m, 
z[0-9]+\.h, z[0-9]+\.h\n} 4 } } */
+
+/* { dg-final { scan-assembler-not {\tsel\t} } } */
-- 
2.34.1

[PATCH 13/14] aarch64: Relaxed SEL combiner patterns for unpacked SVE FP ternary arithmetic

Reply via email to