This patch extends the SVE vec_cmp expander to support partial (unpacked) FP modes. Under flag_trapping_math we use an unnatural predicate mode to govern unpacked FP operations, so the expansion must now handle cases where the comparison's target predicate and its governing predicate have different modes.
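For example (this is just a reduction of the new unpacked_fcm_1.c test, so the
types, trip count and options below are simply the test's parameters), a
conditional store such as:

  #include <stdint.h>

  void
  f (uint64_t *__restrict out, float *__restrict a, float *__restrict b)
  {
    for (unsigned int i = 0; i < 32; i++)
      out[i] = a[i] > b[i] ? 3 : out[i];
  }

built with -O2 -ftree-vectorize -msve-vector-bits=2048 keeps the float data
unpacked in 64-bit containers, so the test expects the comparison to become an
FCMGT on .s elements whose governing predicate is a ptrue.d (only every other
.s element is defined).  For UNLT/UNLE/UNGT/UNGE under trapping math, the
tests likewise expect the unordered mask from FCMUO to be inverted with a
predicated EOR against that same governing predicate before the ordered
elements are compared.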
While these unnatural governing predicates enable all of the defined part of
the operation, they are not all-true: their false bits are significant to the
(trapping) behavior of the operation, so the operation itself must not be
given SVE_KNOWN_PTRUE.

gcc/ChangeLog:

	* config/aarch64/aarch64-sve.md (vec_cmp<mode><vpred>): Extend to
	handle partial FP modes.
	(@aarch64_pred_fcm<cmp_op><mode>): Likewise.
	(@aarch64_pred_fcmuo<mode>): Likewise.
	* config/aarch64/aarch64.cc (aarch64_emit_sve_fp_cond): Handle
	unnatural governing predicates.
	(aarch64_emit_sve_or_fp_conds): Likewise.
	(aarch64_emit_sve_invert_fp_cond): Likewise.
	(aarch64_expand_sve_vec_cmp_float): Likewise.

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/sve/unpacked_fcm_1.c: New test.
	* gcc.target/aarch64/sve/unpacked_fcm_2.c: Likewise.
---
 gcc/config/aarch64/aarch64-sve.md             |  16 +-
 gcc/config/aarch64/aarch64.cc                 |  47 +-
 .../gcc.target/aarch64/sve/unpacked_fcm_1.c   | 545 ++++++++++++++++++
 .../gcc.target/aarch64/sve/unpacked_fcm_2.c   |  47 ++
 4 files changed, 631 insertions(+), 24 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/unpacked_fcm_1.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/unpacked_fcm_2.c

diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md
index 7484aeeb161..6c5129bc0c6 100644
--- a/gcc/config/aarch64/aarch64-sve.md
+++ b/gcc/config/aarch64/aarch64-sve.md
@@ -8600,8 +8600,8 @@
 (define_expand "vec_cmp<mode><vpred>"
   [(set (match_operand:<VPRED> 0 "register_operand")
 	(match_operator:<VPRED> 1 "comparison_operator"
-	  [(match_operand:SVE_FULL_F 2 "register_operand")
-	   (match_operand:SVE_FULL_F 3 "aarch64_simd_reg_or_zero")]))]
+	  [(match_operand:SVE_F 2 "register_operand")
+	   (match_operand:SVE_F 3 "aarch64_simd_reg_or_zero")]))]
   "TARGET_SVE"
   {
     aarch64_expand_sve_vec_cmp_float (operands[0], GET_CODE (operands[1]),
@@ -8614,10 +8614,10 @@
 (define_insn "@aarch64_pred_fcm<cmp_op><mode>"
   [(set (match_operand:<VPRED> 0 "register_operand")
 	(unspec:<VPRED>
-	  [(match_operand:<VPRED> 1 "register_operand")
+	  [(match_operand:<VPRED> 1 "aarch64_predicate_operand")
 	   (match_operand:SI 2 "aarch64_sve_ptrue_flag")
-	   (match_operand:SVE_FULL_F 3 "register_operand")
-	   (match_operand:SVE_FULL_F 4 "aarch64_simd_reg_or_zero")]
+	   (match_operand:SVE_F 3 "register_operand")
+	   (match_operand:SVE_F 4 "aarch64_simd_reg_or_zero")]
 	  SVE_COND_FP_CMP_I0))]
   "TARGET_SVE"
   {@ [ cons: =0 , 1 , 3 , 4 ]
@@ -8630,10 +8630,10 @@
 (define_insn "@aarch64_pred_fcmuo<mode>"
   [(set (match_operand:<VPRED> 0 "register_operand" "=Upa")
 	(unspec:<VPRED>
-	  [(match_operand:<VPRED> 1 "register_operand" "Upl")
+	  [(match_operand:<VPRED> 1 "aarch64_predicate_operand" "Upl")
 	   (match_operand:SI 2 "aarch64_sve_ptrue_flag")
-	   (match_operand:SVE_FULL_F 3 "register_operand" "w")
-	   (match_operand:SVE_FULL_F 4 "register_operand" "w")]
+	   (match_operand:SVE_F 3 "register_operand" "w")
+	   (match_operand:SVE_F 4 "register_operand" "w")]
 	  UNSPEC_COND_FCMUO))]
   "TARGET_SVE"
   "fcmuo\t%0.<Vetype>, %1/z, %3.<Vetype>, %4.<Vetype>"
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index b13fce2a859..287de0f5ae4 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -27240,7 +27240,7 @@ aarch64_emit_sve_fp_cond (rtx target, rtx_code code, rtx pred,
 			  bool known_ptrue_p, rtx op0, rtx op1)
 {
   rtx flag = gen_int_mode (known_ptrue_p, SImode);
-  rtx unspec = gen_rtx_UNSPEC (GET_MODE (pred),
+  rtx unspec = gen_rtx_UNSPEC (GET_MODE (target),
 			       gen_rtvec (4, pred, flag, op0, op1),
 			       aarch64_unspec_cond_code (code));
   emit_set_insn (target, unspec);
@@ -27259,10 +27259,10 @@ static void
 aarch64_emit_sve_or_fp_conds (rtx target, rtx_code code1, rtx_code code2,
 			      rtx pred, bool known_ptrue_p, rtx op0, rtx op1)
 {
-  machine_mode pred_mode = GET_MODE (pred);
-  rtx tmp1 = gen_reg_rtx (pred_mode);
+  machine_mode target_mode = GET_MODE (target);
+  rtx tmp1 = gen_reg_rtx (target_mode);
   aarch64_emit_sve_fp_cond (tmp1, code1, pred, known_ptrue_p, op0, op1);
-  rtx tmp2 = gen_reg_rtx (pred_mode);
+  rtx tmp2 = gen_reg_rtx (target_mode);
   aarch64_emit_sve_fp_cond (tmp2, code2, pred, known_ptrue_p, op0, op1);
   aarch64_emit_binop (target, ior_optab, tmp1, tmp2);
 }
@@ -27279,8 +27279,7 @@ static void
 aarch64_emit_sve_invert_fp_cond (rtx target, rtx_code code, rtx pred,
 				 bool known_ptrue_p, rtx op0, rtx op1)
 {
-  machine_mode pred_mode = GET_MODE (pred);
-  rtx tmp = gen_reg_rtx (pred_mode);
+  rtx tmp = gen_reg_rtx (GET_MODE (target));
   aarch64_emit_sve_fp_cond (tmp, code, pred, known_ptrue_p, op0, op1);
   aarch64_emit_unop (target, one_cmpl_optab, tmp);
 }
@@ -27292,10 +27291,16 @@ aarch64_emit_sve_invert_fp_cond (rtx target, rtx_code code, rtx pred,
 void
 aarch64_expand_sve_vec_cmp_float (rtx target, rtx_code code, rtx op0, rtx op1)
 {
-  machine_mode pred_mode = GET_MODE (target);
   machine_mode data_mode = GET_MODE (op0);
+  rtx pred = aarch64_sve_fp_pred (data_mode, nullptr);
-  rtx ptrue = aarch64_ptrue_reg (pred_mode);
+  /* The governing and destination modes.  */
+  machine_mode pred_mode = GET_MODE (pred);
+  machine_mode target_mode = GET_MODE (target);
+
+  /* Also determines SVE_KNOWN_PTRUE, since an unnatural GP from
+     sve_fp_pred would disable part of the operation.  */
+  bool natural_p = pred_mode == target_mode;
 
   switch (code)
     {
     case UNORDERED:
@@ -27309,12 +27314,12 @@ aarch64_expand_sve_vec_cmp_float (rtx target, rtx_code code, rtx op0, rtx op1)
     case EQ:
     case NE:
       /* There is native support for the comparison.  */
-      aarch64_emit_sve_fp_cond (target, code, ptrue, true, op0, op1);
+      aarch64_emit_sve_fp_cond (target, code, pred, natural_p, op0, op1);
       return;
 
     case LTGT:
       /* This is a trapping operation (LT or GT).  */
-      aarch64_emit_sve_or_fp_conds (target, LT, GT, ptrue, true, op0, op1);
+      aarch64_emit_sve_or_fp_conds (target, LT, GT, pred, natural_p, op0, op1);
       return;
 
     case UNEQ:
@@ -27323,7 +27328,7 @@ aarch64_expand_sve_vec_cmp_float (rtx target, rtx_code code, rtx op0, rtx op1)
 	  /* This would trap for signaling NaNs.  */
 	  op1 = force_reg (data_mode, op1);
 	  aarch64_emit_sve_or_fp_conds (target, UNORDERED, EQ,
-					ptrue, true, op0, op1);
+					pred, natural_p, op0, op1);
 	  return;
 	}
       /* fall through */
@@ -27333,11 +27338,21 @@ aarch64_expand_sve_vec_cmp_float (rtx target, rtx_code code, rtx op0, rtx op1)
     case UNGE:
       if (flag_trapping_math)
 	{
-	  /* Work out which elements are ordered.  */
-	  rtx ordered = gen_reg_rtx (pred_mode);
 	  op1 = force_reg (data_mode, op1);
-	  aarch64_emit_sve_invert_fp_cond (ordered, UNORDERED,
-					   ptrue, true, op0, op1);
+
+	  /* Work out which elements are unordered.  */
+	  rtx uo_tmp = gen_reg_rtx (target_mode);
+	  aarch64_emit_sve_fp_cond (uo_tmp, UNORDERED, pred, natural_p,
+				    op0, op1);
+
+	  /* Invert the result.  Use PRED again to maintain the intended
+	     trapping behavior.  */
+	  if (!natural_p)
+	    uo_tmp = gen_lowpart (pred_mode, uo_tmp);
+
+	  rtx ordered = gen_reg_rtx (pred_mode);
+	  emit_insn (gen_aarch64_pred_z (XOR, pred_mode,
+					 ordered, pred, pred, uo_tmp));
 
 	  /* Test the opposite condition for the ordered elements,
 	     then invert the result.
*/ @@ -27362,7 +27377,7 @@ aarch64_expand_sve_vec_cmp_float (rtx target, rtx_code code, rtx op0, rtx op1) /* There is native support for the inverse comparison. */ code = reverse_condition_maybe_unordered (code); - aarch64_emit_sve_invert_fp_cond (target, code, ptrue, true, op0, op1); + aarch64_emit_sve_invert_fp_cond (target, code, pred, natural_p, op0, op1); } /* Return true if: diff --git a/gcc/testsuite/gcc.target/aarch64/sve/unpacked_fcm_1.c b/gcc/testsuite/gcc.target/aarch64/sve/unpacked_fcm_1.c new file mode 100644 index 00000000000..7e39b79991b --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/unpacked_fcm_1.c @@ -0,0 +1,545 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize -msve-vector-bits=2048 -fno-schedule-insns2" } */ + +#include <stdint.h> + +#define UNLT(A, B) (!__builtin_isgreaterequal (A, B)) +#define UNLE(A, B) (!__builtin_isgreater (A, B)) +#define UNGT(A, B) (!__builtin_islessequal (A, B)) +#define UNGE(A, B) (!__builtin_isless (A, B)) +#define UNEQ(A, B) (!__builtin_islessgreater (A, B)) + +#define EQ(A, B) ((A) == (B)) +#define NE(A, B) ((A) != (B)) +#define LE(A, B) ((A) <= (B)) +#define LT(A, B) ((A) < (B)) +#define GE(A, B) ((A) >= (B)) +#define GT(A, B) ((A) > (B)) +#define ORDERED(A, B) (!__builtin_isunordered (A, B)) +#define UNORDERED(A, B) (__builtin_isunordered (A, B)) + +#define b_i b[i] + +#define TEST_FCM(TYPE0, TYPE1, CMP, RHS, COUNT) \ + void \ + f_##TYPE0##_##TYPE1##_##CMP##_##RHS (TYPE0 *__restrict out, \ + TYPE1 *__restrict a, \ + TYPE1 *__restrict b) \ + { \ + for (unsigned int i = 0; i < COUNT; i++) \ + out[i] = CMP (a[i], RHS) ? 3 : out[i]; \ + } + +#define TEST_CC_REG(CMP) \ + TEST_FCM (uint64_t, float, CMP, b_i, 32) \ + TEST_FCM (uint32_t, _Float16, CMP, b_i, 64) \ + TEST_FCM (uint64_t, _Float16, CMP, b_i, 32) + +#define TEST_CC_ALL(CMP) \ + TEST_CC_REG (CMP) \ + TEST_FCM (uint64_t, float, CMP, 0, 32) \ + TEST_FCM (uint32_t, _Float16, CMP, 0, 64) \ + TEST_FCM (uint64_t, _Float16, CMP, 0, 32) + + +/* +** f_uint64_t_float_UNLT_b_i: +** ... +** ptrue (p[0-9]+)\.d, vl32 +** fcmuo (p[0-9]+)\.s, \1/z, z[0-9]+\.s, z[0-9]+\.s +** eor (p[0-9]+)\.b, \1/z, \1\.b, \2\.b +** fcmge p[0-9]+\.s, \3/z, z[0-9]+\.s, z[0-9]+\.s +** ... +*/ + +/* +** f_uint32_t__Float16_UNLT_b_i: +** ... +** ptrue (p[0-9]+)\.s, vl64 +** fcmuo (p[0-9]+)\.h, \1/z, z[0-9]+\.h, z[0-9]+\.h +** eor (p[0-9]+)\.b, \1/z, \1\.b, \2\.b +** fcmge p[0-9]+\.h, \3/z, z[0-9]+\.h, z[0-9]+\.h +** ... +*/ + +/* +** f_uint64_t__Float16_UNLT_b_i: +** ... +** ptrue (p[0-9]+)\.d, vl32 +** fcmuo (p[0-9]+)\.h, \1/z, z[0-9]+\.h, z[0-9]+\.h +** eor (p[0-9]+)\.b, \1/z, \1\.b, \2\.b +** fcmge p[0-9]+\.h, \3/z, z[0-9]+\.h, z[0-9]+\.h +** ... +*/ +TEST_CC_REG (UNLT) + +/* +** f_uint64_t_float_UNLE_b_i: +** ... +** ptrue (p[0-9]+)\.d, vl32 +** fcmuo (p[0-9]+)\.s, \1/z, z[0-9]+\.s, z[0-9]+\.s +** eor (p[0-9]+)\.b, \1/z, \1\.b, \2\.b +** fcmgt p[0-9]+\.s, \3/z, z[0-9]+\.s, z[0-9]+\.s +** ... +*/ + +/* +** f_uint32_t__Float16_UNLE_b_i: +** ... +** ptrue (p[0-9]+)\.s, vl64 +** fcmuo (p[0-9]+)\.h, \1/z, z[0-9]+\.h, z[0-9]+\.h +** eor (p[0-9]+)\.b, \1/z, \1\.b, \2\.b +** fcmgt p[0-9]+\.h, \3/z, z[0-9]+\.h, z[0-9]+\.h +** ... +*/ + +/* +** f_uint64_t__Float16_UNLE_b_i: +** ... +** ptrue (p[0-9]+)\.d, vl32 +** fcmuo (p[0-9]+)\.h, \1/z, z[0-9]+\.h, z[0-9]+\.h +** eor (p[0-9]+)\.b, \1/z, \1\.b, \2\.b +** fcmgt p[0-9]+\.h, \3/z, z[0-9]+\.h, z[0-9]+\.h +** ... +*/ +TEST_CC_REG (UNLE) + +/* +** f_uint64_t_float_UNGT_b_i: +** ... 
+** ptrue (p[0-9]+)\.d, vl32 +** fcmuo (p[0-9]+)\.s, \1/z, z[0-9]+\.s, z[0-9]+\.s +** eor (p[0-9]+)\.b, \1/z, \1\.b, \2\.b +** fcmle p[0-9]+\.s, \3/z, z[0-9]+\.s, z[0-9]+\.s +** ... +*/ + +/* +** f_uint32_t__Float16_UNGT_b_i: +** ... +** ptrue (p[0-9]+)\.s, vl64 +** fcmuo (p[0-9]+)\.h, \1/z, z[0-9]+\.h, z[0-9]+\.h +** eor (p[0-9]+)\.b, \1/z, \1\.b, \2\.b +** fcmle p[0-9]+\.h, \3/z, z[0-9]+\.h, z[0-9]+\.h +** ... +*/ + +/* +** f_uint64_t__Float16_UNGT_b_i: +** ... +** ptrue (p[0-9]+)\.d, vl32 +** fcmuo (p[0-9]+)\.h, \1/z, z[0-9]+\.h, z[0-9]+\.h +** eor (p[0-9]+)\.b, \1/z, \1\.b, \2\.b +** fcmle p[0-9]+\.h, \3/z, z[0-9]+\.h, z[0-9]+\.h +** ... +*/ +TEST_CC_REG (UNGT) + +/* +** f_uint64_t_float_UNGE_b_i: +** ... +** ptrue (p[0-9]+)\.d, vl32 +** fcmuo (p[0-9]+)\.s, \1/z, z[0-9]+\.s, z[0-9]+\.s +** eor (p[0-9]+)\.b, \1/z, \1\.b, \2\.b +** fcmlt p[0-9]+\.s, \3/z, z[0-9]+\.s, z[0-9]+\.s +** ... +*/ + +/* +** f_uint32_t__Float16_UNGE_b_i: +** ... +** ptrue (p[0-9]+)\.s, vl64 +** fcmuo (p[0-9]+)\.h, \1/z, z[0-9]+\.h, z[0-9]+\.h +** eor (p[0-9]+)\.b, \1/z, \1\.b, \2\.b +** fcmlt p[0-9]+\.h, \3/z, z[0-9]+\.h, z[0-9]+\.h +** ... +*/ + +/* +** f_uint64_t__Float16_UNGE_b_i: +** ... +** ptrue (p[0-9]+)\.d, vl32 +** fcmuo (p[0-9]+)\.h, \1/z, z[0-9]+\.h, z[0-9]+\.h +** eor (p[0-9]+)\.b, \1/z, \1\.b, \2\.b +** fcmlt p[0-9]+\.h, \3/z, z[0-9]+\.h, z[0-9]+\.h +** ... +*/ +TEST_CC_REG (UNGE) + +/* +** f_uint64_t_float_UNEQ_b_i: +** ... +** ptrue (p[0-9]+)\.d, vl32 +** fcmuo (p[0-9]+)\.s, \1/z, z[0-9]+\.s, z[0-9]+\.s +** eor (p[0-9]+)\.b, \1/z, \1\.b, \2\.b +** fcmne p[0-9]+\.s, \3/z, z[0-9]+\.s, z[0-9]+\.s +** ... +*/ + +/* +** f_uint32_t__Float16_UNEQ_b_i: +** ... +** ptrue (p[0-9]+)\.s, vl64 +** fcmuo (p[0-9]+)\.h, \1/z, z[0-9]+\.h, z[0-9]+\.h +** eor (p[0-9]+)\.b, \1/z, \1\.b, \2\.b +** fcmne p[0-9]+\.h, \3/z, z[0-9]+\.h, z[0-9]+\.h +** ... +*/ + +/* +** f_uint64_t__Float16_UNEQ_b_i: +** ... +** ptrue (p[0-9]+)\.d, vl32 +** fcmuo (p[0-9]+)\.h, \1/z, z[0-9]+\.h, z[0-9]+\.h +** eor (p[0-9]+)\.b, \1/z, \1\.b, \2\.b +** fcmne p[0-9]+\.h, \3/z, z[0-9]+\.h, z[0-9]+\.h +** ... +*/ +TEST_CC_REG (UNEQ) + +/* +** f_uint64_t_float_EQ_b_i: +** ... +** ptrue (p[0-9]+)\.d, vl32 +** fcmeq p[0-9]+\.s, \1/z, z[0-9]+\.s, z[0-9]+\.s +** ... +*/ + +/* +** f_uint32_t__Float16_EQ_b_i: +** ... +** ptrue (p[0-9]+)\.s, vl64 +** fcmeq p[0-9]+\.h, \1/z, z[0-9]+\.h, z[0-9]+\.h +** ... +*/ + +/* +** f_uint64_t__Float16_EQ_b_i: +** ... +** ptrue (p[0-9]+)\.d, vl32 +** fcmeq p[0-9]+\.h, \1/z, z[0-9]+\.h, z[0-9]+\.h +** ... +*/ + +/* +** f_uint64_t_float_EQ_0: +** ... +** ptrue (p[0-9]+)\.d, vl32 +** fcmeq p[0-9]+\.s, \1/z, z[0-9]+\.s, #0.0 +** ... +*/ + +/* +** f_uint32_t__Float16_EQ_0: +** ... +** ptrue (p[0-9]+)\.s, vl64 +** fcmeq p[0-9]+\.h, \1/z, z[0-9]+\.h, #0.0 +** ... +*/ + +/* +** f_uint64_t__Float16_EQ_0: +** ... +** ptrue (p[0-9]+)\.d, vl32 +** fcmeq p[0-9]+\.h, \1/z, z[0-9]+\.h, #0.0 +** ... +*/ +TEST_CC_ALL (EQ) + +/* +** f_uint64_t_float_NE_b_i: +** ... +** ptrue (p[0-9]+)\.d, vl32 +** fcmne p[0-9]+\.s, \1/z, z[0-9]+\.s, z[0-9]+\.s +** ... +*/ + +/* +** f_uint32_t__Float16_NE_b_i: +** ... +** ptrue (p[0-9]+)\.s, vl64 +** fcmne p[0-9]+\.h, \1/z, z[0-9]+\.h, z[0-9]+\.h +** ... +*/ + +/* +** f_uint64_t__Float16_NE_b_i: +** ... +** ptrue (p[0-9]+)\.d, vl32 +** fcmne p[0-9]+\.h, \1/z, z[0-9]+\.h, z[0-9]+\.h +** ... +*/ + +/* +** f_uint64_t_float_NE_0: +** ... +** ptrue (p[0-9]+)\.d, vl32 +** fcmne p[0-9]+\.s, \1/z, z[0-9]+\.s, #0.0 +** ... +*/ + +/* +** f_uint32_t__Float16_NE_0: +** ... 
+** ptrue (p[0-9]+)\.s, vl64 +** fcmne p[0-9]+\.h, \1/z, z[0-9]+\.h, #0.0 +** ... +*/ + +/* +** f_uint64_t__Float16_NE_0: +** ... +** ptrue (p[0-9]+)\.d, vl32 +** fcmne p[0-9]+\.h, \1/z, z[0-9]+\.h, #0.0 +** ... +*/ +TEST_CC_ALL (NE) + +/* +** f_uint64_t_float_LE_b_i: +** ... +** ptrue (p[0-9]+)\.d, vl32 +** fcmle p[0-9]+\.s, \1/z, z[0-9]+\.s, z[0-9]+\.s +** ... +*/ + +/* +** f_uint32_t__Float16_LE_b_i: +** ... +** ptrue (p[0-9]+)\.s, vl64 +** fcmle p[0-9]+\.h, \1/z, z[0-9]+\.h, z[0-9]+\.h +** ... +*/ + +/* +** f_uint64_t__Float16_LE_b_i: +** ... +** ptrue (p[0-9]+)\.d, vl32 +** fcmle p[0-9]+\.h, \1/z, z[0-9]+\.h, z[0-9]+\.h +** ... +*/ + +/* +** f_uint64_t_float_LE_0: +** ... +** ptrue (p[0-9]+)\.d, vl32 +** fcmle p[0-9]+\.s, \1/z, z[0-9]+\.s, #0.0 +** ... +*/ + +/* +** f_uint32_t__Float16_LE_0: +** ... +** ptrue (p[0-9]+)\.s, vl64 +** fcmle p[0-9]+\.h, \1/z, z[0-9]+\.h, #0.0 +** ... +*/ + +/* +** f_uint64_t__Float16_LE_0: +** ... +** ptrue (p[0-9]+)\.d, vl32 +** fcmle p[0-9]+\.h, \1/z, z[0-9]+\.h, #0.0 +** ... +*/ +TEST_CC_ALL (LE) + +/* +** f_uint64_t_float_LT_b_i: +** ... +** ptrue (p[0-9]+)\.d, vl32 +** fcmlt p[0-9]+\.s, \1/z, z[0-9]+\.s, z[0-9]+\.s +** ... +*/ + +/* +** f_uint32_t__Float16_LT_b_i: +** ... +** ptrue (p[0-9]+)\.s, vl64 +** fcmlt p[0-9]+\.h, \1/z, z[0-9]+\.h, z[0-9]+\.h +** ... +*/ + +/* +** f_uint64_t__Float16_LT_b_i: +** ... +** ptrue (p[0-9]+)\.d, vl32 +** fcmlt p[0-9]+\.h, \1/z, z[0-9]+\.h, z[0-9]+\.h +** ... +*/ + +/* +** f_uint64_t_float_LT_0: +** ... +** ptrue (p[0-9]+)\.d, vl32 +** fcmlt p[0-9]+\.s, \1/z, z[0-9]+\.s, #0.0 +** ... +*/ + +/* +** f_uint32_t__Float16_LT_0: +** ... +** ptrue (p[0-9]+)\.s, vl64 +** fcmlt p[0-9]+\.h, \1/z, z[0-9]+\.h, #0.0 +** ... +*/ + +/* +** f_uint64_t__Float16_LT_0: +** ... +** ptrue (p[0-9]+)\.d, vl32 +** fcmlt p[0-9]+\.h, \1/z, z[0-9]+\.h, #0.0 +** ... +*/ +TEST_CC_ALL (LT) + +/* +** f_uint64_t_float_GE_b_i: +** ... +** ptrue (p[0-9]+)\.d, vl32 +** fcmge p[0-9]+\.s, \1/z, z[0-9]+\.s, z[0-9]+\.s +** ... +*/ + +/* +** f_uint32_t__Float16_GE_b_i: +** ... +** ptrue (p[0-9]+)\.s, vl64 +** fcmge p[0-9]+\.h, \1/z, z[0-9]+\.h, z[0-9]+\.h +** ... +*/ + +/* +** f_uint64_t__Float16_GE_b_i: +** ... +** ptrue (p[0-9]+)\.d, vl32 +** fcmge p[0-9]+\.h, \1/z, z[0-9]+\.h, z[0-9]+\.h +** ... +*/ + +/* +** f_uint64_t_float_GE_0: +** ... +** ptrue (p[0-9]+)\.d, vl32 +** fcmge p[0-9]+\.s, \1/z, z[0-9]+\.s, #0.0 +** ... +*/ + +/* +** f_uint32_t__Float16_GE_0: +** ... +** ptrue (p[0-9]+)\.s, vl64 +** fcmge p[0-9]+\.h, \1/z, z[0-9]+\.h, #0.0 +** ... +*/ + +/* +** f_uint64_t__Float16_GE_0: +** ... +** ptrue (p[0-9]+)\.d, vl32 +** fcmge p[0-9]+\.h, \1/z, z[0-9]+\.h, #0.0 +** ... +*/ +TEST_CC_ALL (GE) + +/* +** f_uint64_t_float_GT_b_i: +** ... +** ptrue (p[0-9]+)\.d, vl32 +** fcmgt p[0-9]+\.s, \1/z, z[0-9]+\.s, z[0-9]+\.s +** ... +*/ + +/* +** f_uint32_t__Float16_GT_b_i: +** ... +** ptrue (p[0-9]+)\.s, vl64 +** fcmgt p[0-9]+\.h, \1/z, z[0-9]+\.h, z[0-9]+\.h +** ... +*/ + +/* +** f_uint64_t__Float16_GT_b_i: +** ... +** ptrue (p[0-9]+)\.d, vl32 +** fcmgt p[0-9]+\.h, \1/z, z[0-9]+\.h, z[0-9]+\.h +** ... +*/ + +/* +** f_uint64_t_float_GT_0: +** ... +** ptrue (p[0-9]+)\.d, vl32 +** fcmgt p[0-9]+\.s, \1/z, z[0-9]+\.s, #0.0 +** ... +*/ + +/* +** f_uint32_t__Float16_GT_0: +** ... +** ptrue (p[0-9]+)\.s, vl64 +** fcmgt p[0-9]+\.h, \1/z, z[0-9]+\.h, #0.0 +** ... +*/ + +/* +** f_uint64_t__Float16_GT_0: +** ... +** ptrue (p[0-9]+)\.d, vl32 +** fcmgt p[0-9]+\.h, \1/z, z[0-9]+\.h, #0.0 +** ... +*/ +TEST_CC_ALL (GT) + +/* +** f_uint64_t_float_ORDERED_b_i: +** ... 
+** ptrue (p[0-9]+)\.d, vl32 +** fcmuo p[0-9]+\.s, \1/z, z[0-9]+\.s, z[0-9]+\.s +** ... +*/ + +/* +** f_uint32_t__Float16_ORDERED_b_i: +** ... +** ptrue (p[0-9]+)\.s, vl64 +** fcmuo p[0-9]+\.h, \1/z, z[0-9]+\.h, z[0-9]+\.h +** ... +*/ + +/* +** f_uint64_t__Float16_ORDERED_b_i: +** ... +** ptrue (p[0-9]+)\.d, vl32 +** fcmuo p[0-9]+\.h, \1/z, z[0-9]+\.h, z[0-9]+\.h +** ... +*/ +TEST_CC_REG (ORDERED) + +/* +** f_uint64_t_float_UNORDERED_b_i: +** ... +** ptrue (p[0-9]+)\.d, vl32 +** fcmuo p[0-9]+\.s, \1/z, z[0-9]+\.s, z[0-9]+\.s +** ... +*/ + +/* +** f_uint32_t__Float16_UNORDERED_b_i: +** ... +** ptrue (p[0-9]+)\.s, vl64 +** fcmuo p[0-9]+\.h, \1/z, z[0-9]+\.h, z[0-9]+\.h +** ... +*/ + +/* +** f_uint64_t__Float16_UNORDERED_b_i: +** ... +** ptrue (p[0-9]+)\.d, vl32 +** fcmuo p[0-9]+\.h, \1/z, z[0-9]+\.h, z[0-9]+\.h +** ... +*/ +TEST_CC_REG (UNORDERED) + + +/* { dg-final { check-function-bodies "**" "" ""} } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/unpacked_fcm_2.c b/gcc/testsuite/gcc.target/aarch64/sve/unpacked_fcm_2.c new file mode 100644 index 00000000000..74104c775c7 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/unpacked_fcm_2.c @@ -0,0 +1,47 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize -msve-vector-bits=2048 -fno-trapping-math" } */ + +#include "unpacked_fcm_1.c" + +/* { dg-final { scan-assembler-not {\tptrue\tp[0-7]\.s} } } */ +/* { dg-final { scan-assembler-not {\tptrue\tp[0-7]\.d} } } */ +/* { dg-final { scan-assembler-times {\tptrue\tp[0-7]\.b} 57 } } */ + +/* { dg-final { scan-assembler-times {\tfcmeq\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s\n} 2 } } */ +/* { dg-final { scan-assembler-times {\tfcmeq\tp[0-9]+\.h, p[0-7]/z, z[0-9]+\.h, z[0-9]+\.h\n} 4 } } */ + +/* { dg-final { scan-assembler-times {\tfcmeq\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, #0.0\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tfcmeq\tp[0-9]+\.h, p[0-7]/z, z[0-9]+\.h, #0.0\n} 2 } } */ + +/* { dg-final { scan-assembler-times {\tfcmne\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tfcmne\tp[0-9]+\.h, p[0-7]/z, z[0-9]+\.h, z[0-9]+\.h\n} 2 } } */ + +/* { dg-final { scan-assembler-times {\tfcmne\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, #0.0\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tfcmne\tp[0-9]+\.h, p[0-7]/z, z[0-9]+\.h, #0.0\n} 2 } } */ + +/* { dg-final { scan-assembler-times {\tfcmle\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s\n} 2 } } */ +/* { dg-final { scan-assembler-times {\tfcmle\tp[0-9]+\.h, p[0-7]/z, z[0-9]+\.h, z[0-9]+\.h\n} 4 } } */ + +/* { dg-final { scan-assembler-times {\tfcmle\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, #0.0\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tfcmle\tp[0-9]+\.h, p[0-7]/z, z[0-9]+\.h, #0.0\n} 2 } } */ + +/* { dg-final { scan-assembler-times {\tfcmlt\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s\n} 2 } } */ +/* { dg-final { scan-assembler-times {\tfcmlt\tp[0-9]+\.h, p[0-7]/z, z[0-9]+\.h, z[0-9]+\.h\n} 4 } } */ + +/* { dg-final { scan-assembler-times {\tfcmlt\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, #0.0\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tfcmlt\tp[0-9]+\.h, p[0-7]/z, z[0-9]+\.h, #0.0\n} 2 } } */ + +/* { dg-final { scan-assembler-times {\tfcmge\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s\n} 2 } } */ +/* { dg-final { scan-assembler-times {\tfcmge\tp[0-9]+\.h, p[0-7]/z, z[0-9]+\.h, z[0-9]+\.h\n} 4 } } */ + +/* { dg-final { scan-assembler-times {\tfcmge\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, #0.0\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tfcmge\tp[0-9]+\.h, 
p[0-7]/z, z[0-9]+\.h, #0.0\n} 2 } } */ + +/* { dg-final { scan-assembler-times {\tfcmgt\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s\n} 2 } } */ +/* { dg-final { scan-assembler-times {\tfcmgt\tp[0-9]+\.h, p[0-7]/z, z[0-9]+\.h, z[0-9]+\.h\n} 4 } } */ + +/* { dg-final { scan-assembler-times {\tfcmgt\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, #0.0\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tfcmgt\tp[0-9]+\.h, p[0-7]/z, z[0-9]+\.h, #0.0\n} 2 } } */ + +/* { dg-final { scan-assembler-times {\tfcmuo\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s\n} 3 } } */ +/* { dg-final { scan-assembler-times {\tfcmuo\tp[0-9]+\.h, p[0-7]/z, z[0-9]+\.h, z[0-9]+\.h\n} 6 } } */ -- 2.34.1