Use ccmp to perform all TImode comparisons without branches.

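All expansion funnels through aarch64_gen_compare_reg, which
previously supported only NE for TImode.  Equality tests against
zero reduce to an ORR of the two halves, which a branch can then
consume as cbz/cbnz; the other cases leave the result in the flags
via cmp/ccmp sequences.  As a hypothetical example (function name
mine; the exact sequence depends on register allocation), a signed
comparison such as

    int lt128 (__int128 x, __int128 y) { return x < y; }

should now expand without branches into a cmp of the high halves,
a cset capturing the first term, two ccmps folding in the unsigned
comparison of the low halves, and a final cset of the result.
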
        * config/aarch64/aarch64.c (aarch64_gen_compare_reg): Expand all of
        the comparisons for TImode, not just NE.
        * config/aarch64/aarch64.md (cbranchti4, cstoreti4): New.
---
 gcc/config/aarch64/aarch64.c  | 182 +++++++++++++++++++++++++++++++---
 gcc/config/aarch64/aarch64.md |  28 ++++++
 2 files changed, 196 insertions(+), 14 deletions(-)

diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index d7899dad759..911dc1c91cd 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -2363,32 +2363,186 @@ rtx
 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
 {
   machine_mode cmp_mode = GET_MODE (x);
-  machine_mode cc_mode;
   rtx cc_reg;
 
   if (cmp_mode == TImode)
     {
-      gcc_assert (code == NE);
-
-      cc_mode = CCmode;
-      cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
-
       rtx x_lo = operand_subword (x, 0, 0, TImode);
-      rtx y_lo = operand_subword (y, 0, 0, TImode);
-      emit_set_insn (cc_reg, gen_rtx_COMPARE (cc_mode, x_lo, y_lo));
-
       rtx x_hi = operand_subword (x, 1, 0, TImode);
-      rtx y_hi = operand_subword (y, 1, 0, TImode);
-      emit_insn (gen_ccmpccdi (cc_reg, x_hi, y_hi,
-                              gen_rtx_EQ (cc_mode, cc_reg, const0_rtx),
-                              GEN_INT (aarch64_nzcv_codes[AARCH64_NE])));
+      rtx y_lo, y_hi, tmp;
+
+      if (y == const0_rtx)
+       {
+         y_lo = y_hi = y;
+         switch (code)
+           {
+           case EQ:
+           case NE:
+             /* For equality, IOR the two halves together.  If this gets
+                used for a branch, we expect this to fold to cbz/cbnz;
+                otherwise it's no larger than cmp+ccmp below.  The
+                compare-and-swap splitter runs post-reload, when we can
+                no longer create pseudos, so fall through to cmp+ccmp
+                in that case.  */
+             if (!can_create_pseudo_p ())
+               break;
+             tmp = gen_reg_rtx (DImode);
+             emit_insn (gen_iordi3 (tmp, x_hi, x_lo));
+             emit_insn (gen_cmpdi (tmp, const0_rtx));
+             cc_reg = gen_rtx_REG (CCmode, CC_REGNUM);
+             goto done;
+
+           case LT:
+           case GE:
+             /* Check only the sign bit.  Expose the AND of the sign
+                bit explicitly, lest something later try to use the
+                COMPARE in a way that does not correspond.  This
+                emits a single TST instruction.  */
+             cc_reg = gen_rtx_REG (CC_NZmode, CC_REGNUM);
+             tmp = gen_rtx_AND (DImode, x_hi, GEN_INT (INT64_MIN));
+             tmp = gen_rtx_COMPARE (CC_NZmode, tmp, const0_rtx);
+             emit_set_insn (cc_reg, tmp);
+             code = (code == LT ? NE : EQ);
+             goto done;
+
+           case LE:
+           case GT:
+             /* For GT, (x_hi >= 0) && ((x_hi | x_lo) != 0),
+                and of course the inverse for LE.  */
+             emit_insn (gen_cmpdi (x_hi, const0_rtx));
+
+             tmp = gen_reg_rtx (DImode);
+             emit_insn (gen_iordi3 (tmp, x_hi, x_lo));
+
+             /* Combine the two terms:
+                (GE ? (compare tmp 0) : EQ),
+                so that the whole term is true for NE, false for EQ.  */
+             cc_reg = gen_rtx_REG (CCmode, CC_REGNUM);
+             emit_insn (gen_ccmpccdi
+                        (cc_reg, tmp, const0_rtx,
+                         gen_rtx_GE (VOIDmode, cc_reg, const0_rtx),
+                         GEN_INT (aarch64_nzcv_codes[AARCH64_EQ])));
+
+             /* The result is entirely within the Z bit.  */
+             code = (code == GT ? NE : EQ);
+             goto done;
+
+           default:
+             break;
+           }
+       }
+      else
+       {
+         y_lo = operand_subword (y, 0, 0, TImode);
+         y_hi = operand_subword (y, 1, 0, TImode);
+       }
+
+      cc_reg = gen_rtx_REG (CCmode, CC_REGNUM);
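+      /* Generic case: compare both halves with cmp followed by ccmp,
+        leaving the result in the flags.  */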
+      switch (code)
+       {
+       case EQ:
+       case NE:
+         /* For EQ, (x_lo == y_lo) && (x_hi == y_hi).  */
+         emit_insn (gen_cmpdi (x_lo, y_lo));
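+         /* The ccmp re-compares the high halves only if the low halves
+            were equal; otherwise it forces the flags to the NE value,
+            so the final EQ/NE test reflects the full comparison.  */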
+         emit_insn (gen_ccmpccdi (cc_reg, x_hi, y_hi,
+                                  gen_rtx_EQ (VOIDmode, cc_reg, const0_rtx),
+                                  GEN_INT (aarch64_nzcv_codes[AARCH64_NE])));
+         break;
+
+       case LEU:
+       case GTU:
+         std::swap (x_lo, y_lo);
+         std::swap (x_hi, y_hi);
+         code = swap_condition (code);
+         /* fall through */
+
+       case LTU:
+       case GEU:
+         /* For LTU, (x - y), as double-word arithmetic.  */
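+         /* This is cmp on the low halves followed by a compare with
+            carry-in on the high halves (effectively an SBCS that
+            discards the difference); AArch64 inverts the borrow, so
+            the C flag ends up set for GEU and clear for LTU.  */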
+         emit_insn (gen_cmpdi (x_lo, y_lo));
+         /* The ucmp*_carryinC pattern uses zero_extend, and so cannot
+            take the constant 0 we allow elsewhere.  Force to reg now
+            and allow combine to eliminate via simplification.  */
+         x_hi = force_reg (DImode, x_hi);
+         y_hi = force_reg (DImode, y_hi);
+         emit_insn (gen_ucmpdi3_carryinC (x_hi, y_hi));
+         /* The result is entirely within the C bit.  */
+         break;
+
+       case LE:
+       case GT:
+         /* For LE,
+              !((x_hi > y_hi) || (x_hi == y_hi && x_lo >u y_lo))
+           -> !(x_hi > y_hi) && !(x_hi == y_hi && x_lo >u y_lo)
+           -> (x_hi <= y_hi) && !(x_hi == y_hi && x_lo >u y_lo)
+            and of course the inverse for GT.  Note that the low
+            halves compare unsigned.  */
+
+         /* Compute the first term (x_hi <= y_hi) and save it in tmp.  */
+         tmp = gen_reg_rtx (SImode);
+         emit_insn (gen_cmpdi (x_hi, y_hi));
+         emit_set_insn (tmp, gen_rtx_LE (SImode, cc_reg, const0_rtx));
+
+         /* Compute the second term (x_hi == y_hi && x_lo >u y_lo):
+            (EQ ? (compare x_lo y_lo) : LS),
+            so that the whole term is true for HI, false for LS.  */
+         emit_insn (gen_ccmpccdi (cc_reg, x_lo, y_lo,
+                                  gen_rtx_EQ (VOIDmode, cc_reg, const0_rtx),
+                                  GEN_INT (aarch64_nzcv_codes[AARCH64_LS])));
+
+         /* Combine the two terms.  Since we want !(second_term):
+            (LS ? (compare tmp 0) : EQ),
+            so that the whole result is true for NE, false for EQ.  */
+         emit_insn (gen_ccmpccsi (cc_reg, tmp, const0_rtx,
+                                  gen_rtx_LEU (VOIDmode, cc_reg, const0_rtx),
+                                  GEN_INT (aarch64_nzcv_codes[AARCH64_EQ])));
+
+         /* The result is entirely within the Z bit.  */
+         code = (code == LE ? NE : EQ);
+         break;
+
+       case LT:
+       case GE:
+         /* For GE,
+              !((x_hi < y_hi) || (x_hi == y_hi && x_lo <u y_lo))
+           -> !(x_hi < y_hi) && !(x_hi == y_hi && x_lo <u y_lo)
+           -> (x_hi >= y_hi) && !(x_hi == y_hi && x_lo <u y_lo)
+            and of course the inverse for LT.  Note that the low
+            halves compare unsigned.  */
+
+         /* Compute the first term (x_hi >= y_hi) and save it in tmp.  */
+         tmp = gen_reg_rtx (SImode);
+         emit_insn (gen_cmpdi (x_hi, y_hi));
+         emit_set_insn (tmp, gen_rtx_GE (SImode, cc_reg, const0_rtx));
+
+         /* Compute the second term (x_hi == y_hi && x_lo <u y_lo):
+            (EQ ? (compare x_lo y_lo) : CS),
+            so that the whole term is true for CC, false for CS.  */
+         emit_insn (gen_ccmpccdi (cc_reg, x_lo, y_lo,
+                                  gen_rtx_EQ (VOIDmode, cc_reg, const0_rtx),
+                                  GEN_INT (aarch64_nzcv_codes[AARCH64_CS])));
+
+         /* Combine the two terms.  Since we want !(second_term):
+            (CS ? (compare tmp 0) : EQ),
+            so that the whole result is true for NE, false for EQ.  */
+         emit_insn (gen_ccmpccsi (cc_reg, tmp, const0_rtx,
+                                  gen_rtx_GEU (VOIDmode, cc_reg, const0_rtx),
+                                  GEN_INT (aarch64_nzcv_codes[AARCH64_EQ])));
+
+         /* The result is entirely within the Z bit.  */
+         code = (code == GE ? NE : EQ);
+         break;
+
+       default:
+         gcc_unreachable ();
+       }
     }
   else
     {
-      cc_mode = SELECT_CC_MODE (code, x, y);
+      machine_mode cc_mode = SELECT_CC_MODE (code, x, y);
       cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
       emit_set_insn (cc_reg, gen_rtx_COMPARE (cc_mode, x, y));
     }
+
+ done:
   return gen_rtx_fmt_ee (code, VOIDmode, cc_reg, const0_rtx);
 }
 
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index c789b641e7c..fb076b60e3c 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -471,6 +471,20 @@
   operands[2] = const0_rtx;
 })
 
+(define_expand "cbranchti4"
+  [(set (pc) (if_then_else (match_operator 0 "aarch64_comparison_operator"
+                           [(match_operand:TI 1 "register_operand")
+                            (match_operand:TI 2 "aarch64_reg_or_zero")])
+                          (label_ref (match_operand 3 "" ""))
+                          (pc)))]
+  ""
+{
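+  /* Expand the TImode comparison into a flag-setting sequence and
+     rewrite the branch to test the CC register against zero.  */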
+  operands[0] = aarch64_gen_compare_reg (GET_CODE (operands[0]), operands[1],
+                                        operands[2]);
+  operands[1] = XEXP (operands[0], 0);
+  operands[2] = const0_rtx;
+})
+
 (define_expand "cbranch<mode>4"
   [(set (pc) (if_then_else (match_operator 0 "aarch64_comparison_operator"
                            [(match_operand:GPF 1 "register_operand")
@@ -4144,6 +4158,20 @@
   operands[3] = const0_rtx;
 })
 
+(define_expand "cstoreti4"
+  [(set (match_operand:SI 0 "register_operand")
+       (match_operator:SI 1 "aarch64_comparison_operator"
+        [(match_operand:TI 2 "register_operand")
+         (match_operand:TI 3 "aarch64_reg_or_zero")]))]
+  ""
+{
+  operands[1] = aarch64_gen_compare_reg (GET_CODE (operands[1]), operands[2],
+                                        operands[3]);
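+  /* The comparison rtx returned above has VOIDmode; give it SImode
+     so that it is a valid source for the SImode destination.  */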
+  PUT_MODE (operands[1], SImode);
+  operands[2] = XEXP (operands[1], 0);
+  operands[3] = const0_rtx;
+})
+
 (define_expand "cstorecc4"
   [(set (match_operand:SI 0 "register_operand")
        (match_operator 1 "aarch64_comparison_operator_mode"
-- 
2.20.1
