On Wed, Sep 12, 2018 at 03:02:41PM +0200, Richard Biener wrote:
> So I guess your patch is OK (with the small followup you posted).

Thanks, here is what I've committed after another bootstrap/regtest:

2018-09-12  Jakub Jelinek  <ja...@redhat.com>

	PR middle-end/82853
	* expr.h (maybe_optimize_mod_cmp): Declare.
	* expr.c (mod_inv): New function.
	(maybe_optimize_mod_cmp): New function.
	(do_store_flag): Use it.
	* cfgexpand.c (expand_gimple_cond): Likewise.

	* gcc.target/i386/pr82853-1.c: New test.
	* gcc.target/i386/pr82853-2.c: New test.

--- gcc/expr.h.jj	2018-09-11 18:12:17.716325233 +0200
+++ gcc/expr.h	2018-09-12 15:20:28.329957642 +0200
@@ -290,6 +290,8 @@ expand_normal (tree exp)
    a string constant.  */
 extern tree string_constant (tree, tree *, tree *, tree *);
 
+extern enum tree_code maybe_optimize_mod_cmp (enum tree_code, tree *, tree *);
+
 /* Two different ways of generating switch statements.  */
 extern int try_casesi (tree, tree, tree, tree, rtx, rtx, rtx, profile_probability);
 extern int try_tablejump (tree, tree, tree, tree, rtx, rtx, profile_probability);
--- gcc/expr.c.jj	2018-09-11 18:12:17.665326073 +0200
+++ gcc/expr.c	2018-09-12 15:20:37.448807135 +0200
@@ -11491,6 +11491,243 @@ string_constant (tree arg, tree *ptr_off
   return init;
 }
 
+/* Compute the modular multiplicative inverse of A modulo M
+   using extended Euclid's algorithm.  Assumes A and M are coprime.  */
+static wide_int
+mod_inv (const wide_int &a, const wide_int &b)
+{
+  /* Verify the assumption.  */
+  gcc_checking_assert (wi::eq_p (wi::gcd (a, b), 1));
+
+  unsigned int p = a.get_precision () + 1;
+  gcc_checking_assert (b.get_precision () + 1 == p);
+  wide_int c = wide_int::from (a, p, UNSIGNED);
+  wide_int d = wide_int::from (b, p, UNSIGNED);
+  wide_int x0 = wide_int::from (0, p, UNSIGNED);
+  wide_int x1 = wide_int::from (1, p, UNSIGNED);
+
+  if (wi::eq_p (b, 1))
+    return wide_int::from (1, p, UNSIGNED);
+
+  while (wi::gt_p (c, 1, UNSIGNED))
+    {
+      wide_int t = d;
+      wide_int q = wi::divmod_trunc (c, d, UNSIGNED, &d);
+      c = t;
+      wide_int s = x0;
+      x0 = wi::sub (x1, wi::mul (q, x0));
+      x1 = s;
+    }
+  if (wi::lt_p (x1, 0, SIGNED))
+    x1 += d;
+  return x1;
+}
+
+/* Attempt to optimize unsigned (X % C1) == C2 (or (X % C1) != C2).
+   If C1 is odd to:
+   (X - C2) * C3 <= C4 (or >), where
+   C3 is modular multiplicative inverse of C1 and 1<<prec and
+   C4 is ((1<<prec) - 1) / C1 or ((1<<prec) - 1) / C1 - 1 (the latter
+   if C2 > ((1<<prec) - 1) % C1).
+   If C1 is even, S = ctz (C1) and C2 is 0, use
+   ((X * C3) r>> S) <= C4, where C3 is modular multiplicative
+   inverse of C1>>S and 1<<prec and C4 is (((1<<prec) - 1) / (C1>>S)) >> S.
+
+   For signed (X % C1) == 0 if C1 is odd to (all operations in it
+   unsigned):
+   (X * C3) + C4 <= 2 * C4, where
+   C3 is modular multiplicative inverse of (unsigned) C1 and 1<<prec and
+   C4 is ((1<<(prec - 1) - 1) / C1).
+   If C1 is even, S = ctz(C1), use
+   ((X * C3) + C4) r>> S <= (C4 >> (S - 1))
+   where C3 is modular multiplicative inverse of (unsigned)(C1>>S) and 1<<prec
+   and C4 is ((1<<(prec - 1) - 1) / (C1>>S)) & (-1<<S).
+
+   See the Hacker's Delight book, section 10-17.  */
+enum tree_code
+maybe_optimize_mod_cmp (enum tree_code code, tree *arg0, tree *arg1)
+{
+  gcc_checking_assert (code == EQ_EXPR || code == NE_EXPR);
+  gcc_checking_assert (TREE_CODE (*arg1) == INTEGER_CST);
+
+  if (optimize < 2)
+    return code;
+
+  gimple *stmt = get_def_for_expr (*arg0, TRUNC_MOD_EXPR);
+  if (stmt == NULL)
+    return code;
+
+  tree treeop0 = gimple_assign_rhs1 (stmt);
+  tree treeop1 = gimple_assign_rhs2 (stmt);
+  if (TREE_CODE (treeop0) != SSA_NAME
+      || TREE_CODE (treeop1) != INTEGER_CST
+      /* x % pow2 is handled right already.  */
+      || integer_pow2p (treeop1)
+      /* Don't optimize the undefined behavior case x % 0;
+	 x % 1 should have been optimized into zero, punt if
+	 it makes it here for whatever reason;
+	 x % -c should have been optimized into x % c.  */
+      || compare_tree_int (treeop1, 2) <= 0
+      /* Likewise x % c == d where d >= c should be always false.  */
+      || tree_int_cst_le (treeop1, *arg1))
+    return code;
+
+  tree type = TREE_TYPE (*arg0);
+  scalar_int_mode mode;
+  if (!is_a <scalar_int_mode> (TYPE_MODE (type), &mode))
+    return code;
+  if (GET_MODE_BITSIZE (mode) != TYPE_PRECISION (type)
+      || TYPE_PRECISION (type) <= 1)
+    return code;
+
+  signop sgn = UNSIGNED;
+  /* If both operands are known to have the sign bit clear, handle
+     even the signed modulo case as unsigned.  treeop1 is always
+     positive >= 2, checked above.  */
+  if (!TYPE_UNSIGNED (type) && get_range_pos_neg (treeop0) != 1)
+    sgn = SIGNED;
+
+  if (!TYPE_UNSIGNED (type))
+    {
+      if (tree_int_cst_sgn (*arg1) == -1)
+	return code;
+      type = unsigned_type_for (type);
+      if (!type || TYPE_MODE (type) != TYPE_MODE (TREE_TYPE (*arg0)))
+	return code;
+    }
+
+  int prec = TYPE_PRECISION (type);
+  wide_int w = wi::to_wide (treeop1);
+  int shift = wi::ctz (w);
+  /* Unsigned (X % C1) == C2 is equivalent to (X - C2) % C1 == 0 if
+     C2 <= -1U % C1, because for any Z >= 0U - C2 in that case (Z % C1) != 0.
+     If C1 is odd, we can handle all cases by subtracting
+     C4 below.  We could handle even the even C1 and C2 > -1U % C1 cases
+     e.g. by testing for overflow on the subtraction, punt on that for now
+     though.  */
+  if ((sgn == SIGNED || shift) && !integer_zerop (*arg1))
+    {
+      if (sgn == SIGNED)
+	return code;
+      wide_int x = wi::umod_trunc (wi::mask (prec, false, prec), w);
+      if (wi::gtu_p (wi::to_wide (*arg1), x))
+	return code;
+    }
+
+  imm_use_iterator imm_iter;
+  use_operand_p use_p;
+  FOR_EACH_IMM_USE_FAST (use_p, imm_iter, treeop0)
+    {
+      gimple *use_stmt = USE_STMT (use_p);
+      /* Punt if treeop0 is used in the same bb in a division
+	 or another modulo with the same divisor.  We should expect
+	 the division and modulo combined together.  */
+      if (use_stmt == stmt
+	  || gimple_bb (use_stmt) != gimple_bb (stmt))
+	continue;
+      if (!is_gimple_assign (use_stmt)
+	  || (gimple_assign_rhs_code (use_stmt) != TRUNC_DIV_EXPR
+	      && gimple_assign_rhs_code (use_stmt) != TRUNC_MOD_EXPR))
+	continue;
+      if (gimple_assign_rhs1 (use_stmt) != treeop0
+	  || !operand_equal_p (gimple_assign_rhs2 (use_stmt), treeop1, 0))
+	continue;
+      return code;
+    }
+
+  w = wi::lrshift (w, shift);
+  wide_int a = wide_int::from (w, prec + 1, UNSIGNED);
+  wide_int b = wi::shifted_mask (prec, 1, false, prec + 1);
+  wide_int m = wide_int::from (mod_inv (a, b), prec, UNSIGNED);
+  tree c3 = wide_int_to_tree (type, m);
+  tree c5 = NULL_TREE;
+  wide_int d, e;
+  if (sgn == UNSIGNED)
+    {
+      d = wi::divmod_trunc (wi::mask (prec, false, prec), w, UNSIGNED, &e);
+      /* Use <= floor ((1<<prec) - 1) / C1 only if C2 <= ((1<<prec) - 1) % C1,
+	 otherwise use < or subtract one from C4.  E.g. for
+	 x % 3U == 0 we transform this into x * 0xaaaaaaab <= 0x55555555, but
+	 x % 3U == 1 already needs to be
+	 (x - 1) * 0xaaaaaaabU <= 0x55555554.  */
+      if (!shift && wi::gtu_p (wi::to_wide (*arg1), e))
+	d -= 1;
+      if (shift)
+	d = wi::lrshift (d, shift);
+    }
+  else
+    {
+      e = wi::udiv_trunc (wi::mask (prec - 1, false, prec), w);
+      if (!shift)
+	d = wi::lshift (e, 1);
+      else
+	{
+	  e = wi::bit_and (e, wi::mask (shift, true, prec));
+	  d = wi::lrshift (e, shift - 1);
+	}
+      c5 = wide_int_to_tree (type, e);
+    }
+  tree c4 = wide_int_to_tree (type, d);
+
+  rtx op0 = expand_normal (treeop0);
+  treeop0 = make_tree (TREE_TYPE (treeop0), op0);
+
+  bool speed_p = optimize_insn_for_speed_p ();
+
+  do_pending_stack_adjust ();
+
+  location_t loc = gimple_location (stmt);
+  struct separate_ops ops;
+  ops.code = TRUNC_MOD_EXPR;
+  ops.location = loc;
+  ops.type = TREE_TYPE (treeop0);
+  ops.op0 = treeop0;
+  ops.op1 = treeop1;
+  ops.op2 = NULL_TREE;
+  start_sequence ();
+  rtx mor = expand_expr_real_2 (&ops, NULL_RTX, TYPE_MODE (ops.type),
+				EXPAND_NORMAL);
+  rtx_insn *moinsns = get_insns ();
+  end_sequence ();
+
+  unsigned mocost = seq_cost (moinsns, speed_p);
+  mocost += rtx_cost (mor, mode, EQ, 0, speed_p);
+  mocost += rtx_cost (expand_normal (*arg1), mode, EQ, 1, speed_p);
+
+  tree t = fold_convert_loc (loc, type, treeop0);
+  if (!integer_zerop (*arg1))
+    t = fold_build2_loc (loc, MINUS_EXPR, type, t, fold_convert (type, *arg1));
+  t = fold_build2_loc (loc, MULT_EXPR, type, t, c3);
+  if (sgn == SIGNED)
+    t = fold_build2_loc (loc, PLUS_EXPR, type, t, c5);
+  if (shift)
+    {
+      tree s = build_int_cst (NULL_TREE, shift);
+      t = fold_build2_loc (loc, RROTATE_EXPR, type, t, s);
+    }
+
+  start_sequence ();
+  rtx mur = expand_normal (t);
+  rtx_insn *muinsns = get_insns ();
+  end_sequence ();
+
+  unsigned mucost = seq_cost (muinsns, speed_p);
+  mucost += rtx_cost (mur, mode, LE, 0, speed_p);
+  mucost += rtx_cost (expand_normal (c4), mode, LE, 1, speed_p);
+
+  if (mocost <= mucost)
+    {
+      emit_insn (moinsns);
+      *arg0 = make_tree (TREE_TYPE (*arg0), mor);
+      return code;
+    }
+
+  emit_insn (muinsns);
+  *arg0 = make_tree (type, mur);
+  *arg1 = c4;
+  return code == EQ_EXPR ? LE_EXPR : GT_EXPR;
+}
+
 /* Generate code to calculate OPS, and exploded expression
    using a store-flag instruction and return an rtx for the result.
    OPS reflects a comparison.
@@ -11548,7 +11785,7 @@ do_store_flag (sepops ops, rtx target, m
 
   STRIP_NOPS (arg0);
   STRIP_NOPS (arg1);
-  
+
   /* For vector typed comparisons emit code to generate the desired
      all-ones or all-zeros mask.  Conveniently use the VEC_COND_EXPR
      expander for this.  */
@@ -11567,6 +11804,24 @@ do_store_flag (sepops ops, rtx target, m
 	}
     }
 
+  /* Optimize (x % C1) == C2 or (x % C1) != C2 if it is beneficial
+     into (x - C2) * C3 < C4.  */
+  if ((ops->code == EQ_EXPR || ops->code == NE_EXPR)
+      && TREE_CODE (arg0) == SSA_NAME
+      && TREE_CODE (arg1) == INTEGER_CST)
+    {
+      enum tree_code code = maybe_optimize_mod_cmp (ops->code, &arg0, &arg1);
+      if (code != ops->code)
+	{
+	  struct separate_ops nops = *ops;
+	  nops.code = ops->code = code;
+	  nops.op0 = arg0;
+	  nops.op1 = arg1;
+	  nops.type = TREE_TYPE (arg0);
+	  return do_store_flag (&nops, target, mode);
+	}
+    }
+
   /* Get the rtx comparison code to use.  We know that EXP is a comparison
      operation of some type.  Some comparisons against 1 and -1 can be
      converted to comparisons with zero.  Do so here so that the tests
--- gcc/cfgexpand.c.jj	2018-09-11 18:12:17.461329440 +0200
+++ gcc/cfgexpand.c	2018-09-12 15:20:28.333957576 +0200
@@ -2477,6 +2477,13 @@ expand_gimple_cond (basic_block bb, gcon
 	}
     }
 
+  /* Optimize (x % C1) == C2 or (x % C1) != C2 if it is beneficial
+     into (x - C2) * C3 < C4.  */
+  if ((code == EQ_EXPR || code == NE_EXPR)
+      && TREE_CODE (op0) == SSA_NAME
+      && TREE_CODE (op1) == INTEGER_CST)
+    code = maybe_optimize_mod_cmp (code, &op0, &op1);
+
   last2 = last = get_last_insn ();
 
   extract_true_false_edges_from_block (bb, &true_edge, &false_edge);
--- gcc/testsuite/gcc.target/i386/pr82853-1.c.jj	2018-09-12 15:20:28.334957560 +0200
+++ gcc/testsuite/gcc.target/i386/pr82853-1.c	2018-09-12 15:20:28.334957560 +0200
@@ -0,0 +1,15 @@
+/* PR middle-end/82853 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -mno-bmi2 -mtune=generic" } */
+/* { dg-final { scan-assembler-times "mul\[lq]\t" 7 } } */
+/* { dg-final { scan-assembler-not "div\[lq]\t" } } */
+/* { dg-final { scan-assembler-not "lea\[lq]\t\[^\n\r]*,\[^\n\r]*,2\\)" } } */
+
+unsigned f1 (unsigned x) { return (x % 679U) == 0; }
+unsigned f2 (unsigned x) { return (x % 1738U) == 0; }
+void bar (void);
+void f3 (unsigned x) { if (x % 3 == 0) bar (); }
+void f4 (unsigned x) { if (x % 3 == 1) bar (); }
+void f5 (unsigned x) { if (x % 3 == 2) bar (); }
+int f6 (int x) { return x % 3 == 0; }
+int f7 (int x) { return x % 6 == 0; }
--- gcc/testsuite/gcc.target/i386/pr82853-2.c.jj	2018-09-12 15:20:28.334957560 +0200
+++ gcc/testsuite/gcc.target/i386/pr82853-2.c	2018-09-12 15:20:28.334957560 +0200
@@ -0,0 +1,7 @@
+/* PR middle-end/82853 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -mno-bmi2 -mtune=generic" } */
+/* { dg-final { scan-assembler-times "mul\[lq]\t" 2 } } */
+/* { dg-final { scan-assembler-not "div\[lq]\t" } } */
+
+unsigned f1 (unsigned x, unsigned *y) { *y = x / 679U; return (x % 679U) == 0; }

	Jakub
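Not part of the patch, but for anyone who wants to convince themselves of the odd-divisor identity quoted in the comment above (x % 3U == 0 being equivalent to x * 0xaaaaaaab <= 0x55555555 for a 32-bit unsigned x, where 0xaaaaaaab is the multiplicative inverse of 3 modulo 1<<32 and 0x55555555 is 0xffffffffU / 3), a standalone brute-force check over all 2^32 values is easy to write and only takes a few seconds to run:

/* Sanity check of the x % 3U == 0 example from the patch comment:
   verify that x % 3U == 0 holds exactly when x * 0xaaaaaaabU <= 0x55555555U
   for every 32-bit unsigned value of x.  */
#include <stdio.h>
#include <stdint.h>

int
main (void)
{
  uint32_t x = 0;
  do
    {
      int slow = (x % 3U) == 0;
      /* The multiply wraps modulo 1<<32, which is exactly what the
	 transform relies on.  */
      int fast = (uint32_t) (x * 0xaaaaaaabU) <= 0x55555555U;
      if (slow != fast)
	{
	  printf ("mismatch at 0x%08x\n", (unsigned) x);
	  return 1;
	}
    }
  while (x++ != 0xffffffffU);
  printf ("all 2^32 values agree\n");
  return 0;
}

Multiplying by the inverse of an odd C1 permutes the 2^32 residues and maps the multiples of C1 exactly onto 0 .. 0xffffffffU / C1, which is why a single unsigned comparison suffices; the C2 != 0, even C1 and signed variants handled by the patch adjust this with the subtraction, rotate and addition described in the function comment.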