Use ccmp to perform all TImode comparisons branchless.

	* config/aarch64/aarch64.c (aarch64_gen_compare_reg): Expand all of
	the comparisons for TImode, not just NE.
	* config/aarch64/aarch64.md (cbranchti4, cstoreti4): New.
---
 gcc/config/aarch64/aarch64.c  | 182 +++++++++++++++++++++++++++++++---
 gcc/config/aarch64/aarch64.md |  28 ++++++
 2 files changed, 196 insertions(+), 14 deletions(-)
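Not part of the patch, illustration only: with the new cbranchti4 and
cstoreti4 expanders, __int128 comparisons such as the ones below go through
aarch64_gen_compare_reg and should expand to a short branchless sequence
built from cmp/ccmp (plus cset for the store-flag case); per the comments in
the patch, equality against zero used for a branch is expected to fold to an
orr of the two halves plus cbz/cbnz.  The function names are made up for the
example.

    extern void g (void);

    /* Expected to go through cstoreti4: roughly cmp + ccmp + cset.  */
    int lt_s128 (__int128 x, __int128 y) { return x < y; }

    /* Expected to go through cbranchti4; for != 0, orr + cbnz.  */
    void f (__int128 x) { if (x != 0) g (); }
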
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index d7899dad759..911dc1c91cd 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -2363,32 +2363,186 @@ rtx
 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
 {
   machine_mode cmp_mode = GET_MODE (x);
-  machine_mode cc_mode;
   rtx cc_reg;
 
   if (cmp_mode == TImode)
     {
-      gcc_assert (code == NE);
-
-      cc_mode = CCmode;
-      cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
-
-      rtx x_lo = operand_subword (x, 0, 0, TImode);
-      rtx y_lo = operand_subword (y, 0, 0, TImode);
-      emit_set_insn (cc_reg, gen_rtx_COMPARE (cc_mode, x_lo, y_lo));
-
-      rtx x_hi = operand_subword (x, 1, 0, TImode);
-      rtx y_hi = operand_subword (y, 1, 0, TImode);
-      emit_insn (gen_ccmpccdi (cc_reg, x_hi, y_hi,
-                               gen_rtx_EQ (cc_mode, cc_reg, const0_rtx),
-                               GEN_INT (aarch64_nzcv_codes[AARCH64_NE])));
+      rtx x_lo = operand_subword (x, 0, 0, TImode);
+      rtx x_hi = operand_subword (x, 1, 0, TImode);
+      rtx y_lo, y_hi, tmp;
+
+      if (y == const0_rtx)
+        {
+          y_lo = y_hi = y;
+          switch (code)
+            {
+            case EQ:
+            case NE:
+              /* For equality, IOR the two halves together.  If this gets
+                 used for a branch, we expect this to fold to cbz/cbnz;
+                 otherwise it's no larger than cmp+ccmp below.  Beware of
+                 the compare-and-swap post-reload split, which cannot
+                 create pseudos: fall through to cmp+ccmp there.  */
+              if (!can_create_pseudo_p ())
+                break;
+              tmp = gen_reg_rtx (DImode);
+              emit_insn (gen_iordi3 (tmp, x_hi, x_lo));
+              emit_insn (gen_cmpdi (tmp, const0_rtx));
+              cc_reg = gen_rtx_REG (CCmode, CC_REGNUM);
+              goto done;
+
+            case LT:
+            case GE:
+              /* Check only the sign bit.  Use CC_NZmode to expose this
+                 detail, lest something later tries to use the COMPARE
+                 in a way that doesn't correspond.  This is "tst".  */
+              cc_reg = gen_rtx_REG (CC_NZmode, CC_REGNUM);
+              tmp = gen_rtx_AND (DImode, x_hi, GEN_INT (INT64_MIN));
+              tmp = gen_rtx_COMPARE (CC_NZmode, tmp, const0_rtx);
+              emit_set_insn (cc_reg, tmp);
+              code = (code == LT ? NE : EQ);
+              goto done;
+
+            case LE:
+            case GT:
+              /* For GT, (x_hi >= 0) && ((x_hi | x_lo) != 0),
+                 and of course the inverse for LE.  */
+              emit_insn (gen_cmpdi (x_hi, const0_rtx));
+
+              tmp = gen_reg_rtx (DImode);
+              emit_insn (gen_iordi3 (tmp, x_hi, x_lo));
+
+              /* Combine the two terms:
+                   (GE ? (compare tmp 0) : EQ),
+                 so that the whole term is true for NE, false for EQ.  */
+              cc_reg = gen_rtx_REG (CCmode, CC_REGNUM);
+              emit_insn (gen_ccmpccdi
+                         (cc_reg, tmp, const0_rtx,
+                          gen_rtx_GE (VOIDmode, cc_reg, const0_rtx),
+                          GEN_INT (aarch64_nzcv_codes[AARCH64_EQ])));
+
+              /* The result is entirely within the Z bit.  */
+              code = (code == GT ? NE : EQ);
+              goto done;
+
+            default:
+              break;
+            }
+        }
+      else
+        {
+          y_lo = operand_subword (y, 0, 0, TImode);
+          y_hi = operand_subword (y, 1, 0, TImode);
+        }
+
+      cc_reg = gen_rtx_REG (CCmode, CC_REGNUM);
+      switch (code)
+        {
+        case EQ:
+        case NE:
+          /* For EQ, (x_lo == y_lo) && (x_hi == y_hi).  */
+          emit_insn (gen_cmpdi (x_lo, y_lo));
+          emit_insn (gen_ccmpccdi (cc_reg, x_hi, y_hi,
+                                   gen_rtx_EQ (VOIDmode, cc_reg, const0_rtx),
+                                   GEN_INT (aarch64_nzcv_codes[AARCH64_NE])));
+          break;
+
+        case LEU:
+        case GTU:
+          std::swap (x_lo, y_lo);
+          std::swap (x_hi, y_hi);
+          code = swap_condition (code);
+          /* fall through */
+
+        case LTU:
+        case GEU:
+          /* For LTU, compute (x - y) as double-word arithmetic.  */
+          emit_insn (gen_cmpdi (x_lo, y_lo));
+          /* The ucmp*_carryinC pattern uses zero_extend, and so cannot
+             take the constant 0 we allow elsewhere.  Force to reg now
+             and allow combine to eliminate via simplification.  */
+          x_hi = force_reg (DImode, x_hi);
+          y_hi = force_reg (DImode, y_hi);
+          emit_insn (gen_ucmpdi3_carryinC (x_hi, y_hi));
+          /* The result is entirely within the C bit.  */
+          break;
+
+        case LE:
+        case GT:
+          /*
+           * For LE,
+           *   !((x_hi > y_hi) || (x_hi == y_hi && x_lo > y_lo))
+           *   -> !(x_hi > y_hi) && !(x_hi == y_hi && x_lo > y_lo)
+           *   -> (x_hi <= y_hi) && !(x_hi == y_hi && x_lo > y_lo)
+           * and of course the inverse for GT.
+           */
+
+          /* Compute the first term (x_hi <= y_hi) and save it in tmp.  */
+          tmp = gen_reg_rtx (SImode);
+          emit_insn (gen_cmpdi (x_hi, y_hi));
+          emit_set_insn (tmp, gen_rtx_LE (SImode, cc_reg, const0_rtx));
+
+          /* Compute the second term (x_hi == y_hi && x_lo > y_lo):
+               (EQ ? (compare x_lo y_lo) : LE),
+             so that the whole term is true for GT, false for LE.  */
+          emit_insn (gen_ccmpccdi (cc_reg, x_lo, y_lo,
+                                   gen_rtx_EQ (VOIDmode, cc_reg, const0_rtx),
+                                   GEN_INT (aarch64_nzcv_codes[AARCH64_LE])));
+
+          /* Combine the two terms.  Since we want !(second_term):
+               (LE ? (compare tmp 0) : EQ),
+             so that the whole term is true for NE, false for EQ.  */
+          emit_insn (gen_ccmpccsi (cc_reg, tmp, const0_rtx,
+                                   gen_rtx_LE (VOIDmode, cc_reg, const0_rtx),
+                                   GEN_INT (aarch64_nzcv_codes[AARCH64_EQ])));
+
+          /* The result is entirely within the Z bit.  */
+          code = (code == LE ? NE : EQ);
+          break;
+
+        case LT:
+        case GE:
+          /*
+           * For GE,
+           *   !((x_hi < y_hi) || (x_hi == y_hi && x_lo < y_lo))
+           *   -> !(x_hi < y_hi) && !(x_hi == y_hi && x_lo < y_lo)
+           *   -> (x_hi >= y_hi) && !(x_hi == y_hi && x_lo < y_lo)
+           * and of course the inverse for LT.
+           */
+
+          /* Compute the first term (x_hi >= y_hi) and save it in tmp.  */
+          tmp = gen_reg_rtx (SImode);
+          emit_insn (gen_cmpdi (x_hi, y_hi));
+          emit_set_insn (tmp, gen_rtx_GE (SImode, cc_reg, const0_rtx));
+
+          /* Compute the second term (x_hi == y_hi && x_lo < y_lo):
+               (EQ ? (compare x_lo y_lo) : GE),
+             so that the whole term is true for LT, false for GE.  */
+          emit_insn (gen_ccmpccdi (cc_reg, x_lo, y_lo,
+                                   gen_rtx_EQ (VOIDmode, cc_reg, const0_rtx),
+                                   GEN_INT (aarch64_nzcv_codes[AARCH64_GE])));
+
+          /* Combine the two terms.  Since we want !(second_term):
+               (GE ? (compare tmp 0) : EQ),
+             so that the whole term is true for NE, false for EQ.  */
+          emit_insn (gen_ccmpccsi (cc_reg, tmp, const0_rtx,
+                                   gen_rtx_GE (VOIDmode, cc_reg, const0_rtx),
+                                   GEN_INT (aarch64_nzcv_codes[AARCH64_EQ])));
+
+          /* The result is entirely within the Z bit.  */
+          code = (code == GE ? NE : EQ);
+          break;
+
+        default:
+          gcc_unreachable ();
+        }
     }
   else
     {
-      cc_mode = SELECT_CC_MODE (code, x, y);
+      machine_mode cc_mode = SELECT_CC_MODE (code, x, y);
       cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
       emit_set_insn (cc_reg, gen_rtx_COMPARE (cc_mode, x, y));
     }
+
+ done:
   return gen_rtx_fmt_ee (code, VOIDmode, cc_reg, const0_rtx);
 }
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index c789b641e7c..fb076b60e3c 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -471,6 +471,20 @@
   operands[2] = const0_rtx;
 })
 
+(define_expand "cbranchti4"
+  [(set (pc) (if_then_else (match_operator 0 "aarch64_comparison_operator"
+                            [(match_operand:TI 1 "register_operand")
+                             (match_operand:TI 2 "aarch64_reg_or_zero")])
+                           (label_ref (match_operand 3 "" ""))
+                           (pc)))]
+  ""
+{
+  operands[0] = aarch64_gen_compare_reg (GET_CODE (operands[0]), operands[1],
+                                         operands[2]);
+  operands[1] = XEXP (operands[0], 0);
+  operands[2] = const0_rtx;
+})
+
 (define_expand "cbranch<mode>4"
   [(set (pc) (if_then_else (match_operator 0 "aarch64_comparison_operator"
                             [(match_operand:GPF 1 "register_operand")
@@ -4144,6 +4158,20 @@
   operands[3] = const0_rtx;
 })
 
+(define_expand "cstoreti4"
+  [(set (match_operand:SI 0 "register_operand")
+        (match_operator:SI 1 "aarch64_comparison_operator"
+         [(match_operand:TI 2 "register_operand")
+          (match_operand:TI 3 "aarch64_reg_or_zero")]))]
+  ""
+{
+  operands[1] = aarch64_gen_compare_reg (GET_CODE (operands[1]), operands[2],
+                                         operands[3]);
+  PUT_MODE (operands[1], SImode);
+  operands[2] = XEXP (operands[1], 0);
+  operands[3] = const0_rtx;
+})
+
 (define_expand "cstorecc4"
   [(set (match_operand:SI 0 "register_operand")
         (match_operator 1 "aarch64_comparison_operator_mode"
--
2.20.1