Hi!

The following patch improves expansion of x % c1 == c2 and x % c1 != c2
checks.

As described in Hacker's Delight, sections 10-16 and 10-17, we can generate
better code for a modulo by a constant if the result is only used in an
equality comparison against some other constant (0 for all divisors, other
constants only in certain cases, at least so far).  Right now we usually
expand such a modulo as a highpart multiplication (to compute the quotient),
followed by a normal multiplication by the quotient and a subtraction (plus
the comparison).  As the comment in the code tries to explain, if c1 is odd
and the modulo is unsigned, we can instead expand the check as
(x - c2) * c3 <= c4, where c3 is the modular multiplicative inverse of c1 in
the corresponding unsigned type and c4 is either -1U / c1 or one less than
that, depending on the value of c2.  If c1 is even, the patch uses
r>> ctz (c1), but then supports only a subset of c2 values (only
c2 <= -1U % c1, i.e. cases where none of the c2 highest values of the
unsigned type is divisible by c1).  The patch also handles signed modulo,
again for both odd and even c1, but only for c2 == 0.

The optimization is performed during expansion using TER; the patch compares
the cost of emitting the normal modulo plus the cost of the c2 constant
against the cost of emitting the new optimized sequence plus the cost of the
c4 constant.  The patch doesn't try to optimize if x is used in a division
or another modulo by the same constant in the same basic block, on the
assumption that such a pair is likely optimized into a combined
division + modulo operation, or that at least the highpart multiply is
reused between the two.
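
For illustration, here is a small self-contained sketch (not part of the
patch) of what the unsigned transformations amount to at the source level,
assuming 32-bit types; the helper names are made up for the example,
0xaaaaaaab is the modular multiplicative inverse of 3 modulo 1<<32 and
0x55555555 is -1U / 3:

#include <stdint.h>
#include <assert.h>

/* x % 3 == 0 (odd c1, c2 == 0): x * c3 <= c4 with c4 = -1U / 3.  */
static int mod3_eq_0 (uint32_t x) { return x * 0xaaaaaaabU <= 0x55555555U; }

/* x % 3 == 1 (odd c1, c2 == 1): subtract c2 first; c4 is one smaller,
   because c2 > -1U % 3 (0U - 1 wraps to a multiple of 3).  */
static int mod3_eq_1 (uint32_t x) { return (x - 1) * 0xaaaaaaabU <= 0x55555554U; }

/* x % 6 == 0 (even c1, c2 == 0): s = ctz (6) = 1, c3 = inverse of (6 >> s),
   rotate the product right by s and compare against (-1U / 3) >> s.  */
static int
mod6_eq_0 (uint32_t x)
{
  uint32_t t = x * 0xaaaaaaabU;
  t = (t >> 1) | (t << 31);	/* rotate right by s == 1.  */
  return t <= 0x2aaaaaaaU;
}

int
main (void)
{
  /* Sample the 32-bit range; the step 0x10001 is coprime with 2 and 3.  */
  for (uint64_t i = 0; i <= 0xffffffffULL; i += 0x10001ULL)
    {
      uint32_t x = (uint32_t) i;
      assert (mod3_eq_0 (x) == (x % 3 == 0));
      assert (mod3_eq_1 (x) == (x % 3 == 1));
      assert (mod6_eq_0 (x) == (x % 6 == 0));
    }
  return 0;
}

In the even case the rotate folds the two conditions (the low s bits of the
product are zero and the quotient is small enough) into a single unsigned
comparison.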

Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?

During those two bootstraps + regtests this optimization triggered 1239
times (counting the cases where the cost of the optimized sequence was
smaller than that of the original one), in particular 545 times for unsigned
modulo with odd c1, 51 times for signed modulo with odd c1, 454 times for
unsigned modulo with even c1 and 189 times for signed modulo with even c1.

The PR contains somewhat bigger test coverage, but I'm not sure whether it
isn't too big to be included in the testsuite.  On a fast box each of the 6
tests takes 2-4 seconds to compile and the runtime of each test is 2-18
seconds (2-6 seconds if the 128-bit tests are skipped); the 32-bit and
64-bit tests are 36-64KiB long, the 128-bit ones 104-184KiB (plus they need
random ()).

2018-09-04  Jakub Jelinek  <ja...@redhat.com>

	PR middle-end/82853
	* expr.h (maybe_optimize_mod_cmp): Declare.
	* expr.c (mod_inv): New function.
	(maybe_optimize_mod_cmp): New function.
	(do_store_flag): Use it.
	* cfgexpand.c (expand_gimple_cond): Likewise.

	* gcc.target/i386/pr82853-1.c: New test.
	* gcc.target/i386/pr82853-2.c: New test.

--- gcc/expr.h.jj	2018-08-29 23:36:15.806122967 +0200
+++ gcc/expr.h	2018-09-04 09:38:35.215881588 +0200
@@ -290,6 +290,8 @@ expand_normal (tree exp)
    a string constant.  */
 extern tree string_constant (tree, tree *, tree *, tree *);
 
+extern enum tree_code maybe_optimize_mod_cmp (enum tree_code, tree *, tree *);
+
 /* Two different ways of generating switch statements.  */
 extern int try_casesi (tree, tree, tree, tree, rtx, rtx, rtx, profile_probability);
 extern int try_tablejump (tree, tree, tree, tree, rtx, rtx, profile_probability);
--- gcc/expr.c.jj	2018-08-29 23:36:15.806122967 +0200
+++ gcc/expr.c	2018-09-04 12:31:37.538106639 +0200
@@ -11491,6 +11491,241 @@ string_constant (tree arg, tree *ptr_off
   return init;
 }
 
+/* Compute the modular multiplicative inverse of A modulo M
+   using extended Euclid's algorithm.  Assumes A and M are coprime.  */
+static wide_int
+mod_inv (const wide_int &a, const wide_int &b)
+{
+  /* Verify the assumption.  */
+  gcc_checking_assert (wi::eq_p (wi::gcd (a, b), 1));
+
+  unsigned int p = a.get_precision () + 1;
+  gcc_checking_assert (b.get_precision () + 1 == p);
+  wide_int c = wide_int::from (a, p, UNSIGNED);
+  wide_int d = wide_int::from (b, p, UNSIGNED);
+  wide_int x0 = wide_int::from (0, p, UNSIGNED);
+  wide_int x1 = wide_int::from (1, p, UNSIGNED);
+
+  if (wi::eq_p (b, 1))
+    return wide_int::from (1, p, UNSIGNED);
+
+  while (wi::gt_p (c, 1, UNSIGNED))
+    {
+      wide_int t = d;
+      wide_int q = wi::divmod_trunc (c, d, UNSIGNED, &d);
+      c = t;
+      wide_int s = x0;
+      x0 = wi::sub (x1, wi::mul (q, x0));
+      x1 = s;
+    }
+  if (wi::lt_p (x1, 0, SIGNED))
+    x1 += d;
+  return x1;
+}
+
+/* Attempt to optimize unsigned (X % C1) == C2 (or (X % C1) != C2).
+   If C1 is odd to:
+   (X - C2) * C3 <= C4 (or >), where
+   C3 is modular multiplicative inverse of C1 and 1<<prec and
+   C4 is ((1<<prec) - 1) / C1 or ((1<<prec) - 1) / C1 - 1 (the latter
+   if C2 > ((1<<prec) - 1) % C1).
+   If C1 is even, S = ctz (C1) and C2 is 0, use
+   ((X * C3) r>> S) <= C4, where C3 is modular multiplicative
+   inverse of C1>>S and 1<<prec and C4 is (((1<<prec) - 1) / (C1>>S)) >> S.
+
+   For signed (X % C1) == 0 if C1 is odd to (all operations in it
+   unsigned):
+   (X * C3) + C4 <= 2 * C4, where
+   C3 is modular multiplicative inverse of (unsigned) C1 and 1<<prec and
+   C4 is ((1<<(prec - 1) - 1) / C1).
+   If C1 is even, S = ctz (C1), use
+   ((X * C3) + C4) r>> S <= (C4 >> (S - 1))
+   where C3 is modular multiplicative inverse of (unsigned)(C1>>S) and 1<<prec
+   and C4 is ((1<<(prec - 1) - 1) / (C1>>S)) & (-1<<S).
+
+   See the Hacker's Delight book, section 10-17.  */
+enum tree_code
+maybe_optimize_mod_cmp (enum tree_code code, tree *arg0, tree *arg1)
+{
+  gcc_checking_assert (code == EQ_EXPR || code == NE_EXPR);
+  gcc_checking_assert (TREE_CODE (*arg1) == INTEGER_CST);
+
+  if (optimize < 2)
+    return code;
+
+  gimple *stmt = get_def_for_expr (*arg0, TRUNC_MOD_EXPR);
+  if (stmt == NULL)
+    return code;
+
+  tree treeop0 = gimple_assign_rhs1 (stmt);
+  tree treeop1 = gimple_assign_rhs2 (stmt);
+  if (TREE_CODE (treeop0) != SSA_NAME
+      || TREE_CODE (treeop1) != INTEGER_CST
+      /* x % pow2 is handled right already.  */
+      || integer_pow2p (treeop1)
+      /* Don't optimize the undefined behavior case x % 0;
+	 x % 1 should have been optimized into zero, punt if
+	 it makes it here for whatever reason;
+	 x % -c should have been optimized into x % c.  */
+      || compare_tree_int (treeop1, 2) <= 0
+      /* Likewise x % c == d where d >= c should be always
+	 false.  */
+      || tree_int_cst_le (treeop1, *arg1))
+    return code;
+
+  tree type = TREE_TYPE (*arg0);
+  scalar_int_mode mode;
+  if (!is_a <scalar_int_mode> (TYPE_MODE (type), &mode))
+    return code;
+  if (GET_MODE_BITSIZE (mode) != TYPE_PRECISION (type)
+      || TYPE_PRECISION (type) <= 1)
+    return code;
+
+  signop sgn = UNSIGNED;
+  /* If both operands are known to have the sign bit clear, handle
+     even the signed modulo case as unsigned.  treeop1 is always
+     positive >= 2, checked above.  */
+  if (!TYPE_UNSIGNED (type) && get_range_pos_neg (treeop0) != 1)
+    sgn = SIGNED;
+
+  if (!TYPE_UNSIGNED (type))
+    {
+      type = unsigned_type_for (type);
+      if (!type || TYPE_MODE (type) != TYPE_MODE (TREE_TYPE (*arg0)))
+	return code;
+    }
+
+  int prec = TYPE_PRECISION (type);
+  wide_int w = wi::to_wide (treeop1);
+  int shift = wi::ctz (w);
+  /* Unsigned (X % C1) == C2 is equivalent to (X - C2) % C1 == 0 if
+     C2 <= -1U % C1, because for any Z >= 0U - C2 in that case (Z % C1) != 0.
+     If C1 is odd, we can handle all cases by subtracting C4 below.
+     We could handle even the even C1 and C2 > -1U % C1 cases
+     e.g. by testing for overflow on the subtraction, punt on that for now
+     though.  */
+  if ((sgn == SIGNED || shift) && !integer_zerop (*arg1))
+    {
+      if (sgn == SIGNED)
+	return code;
+      wide_int x = wi::umod_trunc (wi::mask (prec, false, prec), w);
+      if (wi::gtu_p (wi::to_wide (*arg1), x))
+	return code;
+    }
+
+  imm_use_iterator imm_iter;
+  use_operand_p use_p;
+  FOR_EACH_IMM_USE_FAST (use_p, imm_iter, treeop0)
+    {
+      gimple *use_stmt = USE_STMT (use_p);
+      /* Punt if treeop0 is used in the same bb in a division
+	 or another modulo with the same divisor.  We should expect
+	 the division and modulo combined together.  */
+      if (use_stmt == stmt
+	  || gimple_bb (use_stmt) != gimple_bb (stmt))
+	continue;
+      if (!is_gimple_assign (use_stmt)
+	  || (gimple_assign_rhs_code (use_stmt) != TRUNC_DIV_EXPR
+	      && gimple_assign_rhs_code (use_stmt) != TRUNC_MOD_EXPR))
+	continue;
+      if (gimple_assign_rhs1 (use_stmt) != treeop0
+	  || !operand_equal_p (gimple_assign_rhs2 (use_stmt), treeop1, 0))
+	continue;
+      return code;
+    }
+
+  w = wi::lrshift (w, shift);
+  wide_int a = wide_int::from (w, prec + 1, UNSIGNED);
+  wide_int b = wi::shifted_mask (prec, 1, false, prec + 1);
+  wide_int m = wide_int::from (mod_inv (a, b), prec, UNSIGNED);
+  tree c3 = wide_int_to_tree (type, m);
+  tree c5 = NULL_TREE;
+  wide_int d, e;
+  if (sgn == UNSIGNED)
+    {
+      d = wi::divmod_trunc (wi::mask (prec, false, prec), w, UNSIGNED, &e);
+      /* Use <= floor ((1<<prec) - 1) / C1 only if C2 <= ((1<<prec) - 1) % C1,
+	 otherwise use < or subtract one from C4.  E.g. for
+	 x % 3U == 0 we transform this into x * 0xaaaaaaab <= 0x55555555, but
+	 x % 3U == 1 already needs to be
+	 (x - 1) * 0xaaaaaaabU <= 0x55555554.  */
+      if (!shift && wi::gtu_p (wi::to_wide (*arg1), e))
+	d -= 1;
+      if (shift)
+	d = wi::lrshift (d, shift);
+    }
+  else
+    {
+      e = wi::udiv_trunc (wi::mask (prec - 1, false, prec), w);
+      if (!shift)
+	d = wi::lshift (e, 1);
+      else
+	{
+	  e = wi::bit_and (e, wi::mask (shift, true, prec));
+	  d = wi::lrshift (e, shift - 1);
+	}
+      c5 = wide_int_to_tree (type, e);
+    }
+  tree c4 = wide_int_to_tree (type, d);
+
+  rtx op0 = expand_normal (treeop0);
+  treeop0 = make_tree (TREE_TYPE (treeop0), op0);
+
+  bool speed_p = optimize_insn_for_speed_p ();
+
+  do_pending_stack_adjust ();
+
+  location_t loc = gimple_location (stmt);
+  struct separate_ops ops;
+  ops.code = TRUNC_MOD_EXPR;
+  ops.location = loc;
+  ops.type = TREE_TYPE (treeop0);
+  ops.op0 = treeop0;
+  ops.op1 = treeop1;
+  ops.op2 = NULL_TREE;
+  start_sequence ();
+  rtx mor = expand_expr_real_2 (&ops, NULL_RTX, TYPE_MODE (ops.type),
+				EXPAND_NORMAL);
+  rtx_insn *moinsns = get_insns ();
+  end_sequence ();
+
+  unsigned mocost = seq_cost (moinsns, speed_p);
+  mocost += rtx_cost (mor, mode, EQ, 0, speed_p);
+  mocost += rtx_cost (expand_normal (*arg1), mode, EQ, 1, speed_p);
+
+  tree t = fold_convert_loc (loc, type, treeop0);
+  if (!integer_zerop (*arg1))
+    t = fold_build2_loc (loc, MINUS_EXPR, type, t, fold_convert (type, *arg1));
+  t = fold_build2_loc (loc, MULT_EXPR, type, t, c3);
+  if (sgn == SIGNED)
+    t = fold_build2_loc (loc, PLUS_EXPR, type, t, c5);
+  if (shift)
+    {
+      tree s = build_int_cst (NULL_TREE, shift);
+      t = fold_build2_loc (loc, RROTATE_EXPR, type, t, s);
+    }
+
+  start_sequence ();
+  rtx mur = expand_normal (t);
+  rtx_insn *muinsns = get_insns ();
+  end_sequence ();
+
+  unsigned mucost = seq_cost (muinsns, speed_p);
+  mucost += rtx_cost (mur, mode, LE, 0, speed_p);
+  mucost += rtx_cost (expand_normal (c4), mode, LE, 1, speed_p);
+
+  if (mocost <= mucost)
+    {
+      *arg0 = build2_loc (loc, TRUNC_MOD_EXPR, TREE_TYPE (treeop0),
+			  treeop0, treeop1);
+      return code;
+    }
+
+  *arg0 = t;
+  *arg1 = c4;
+  return code == EQ_EXPR ? LE_EXPR : GT_EXPR;
+}
+
 /* Generate code to calculate OPS, and exploded expression
    using a store-flag instruction and return an rtx for the result.
    OPS reflects a comparison.
@@ -11548,7 +11783,7 @@ do_store_flag (sepops ops, rtx target, m
 
   STRIP_NOPS (arg0);
   STRIP_NOPS (arg1);
-  
+
   /* For vector typed comparisons emit code to generate the desired
      all-ones or all-zeros mask.  Conveniently use the VEC_COND_EXPR
      expander for this.  */
@@ -11567,6 +11802,24 @@ do_store_flag (sepops ops, rtx target, m
 	}
     }
 
+  /* Optimize (x % C1) == C2 or (x % C1) != C2 if it is beneficial
+     into (x - C2) * C3 < C4.  */
+  if ((ops->code == EQ_EXPR || ops->code == NE_EXPR)
+      && TREE_CODE (arg0) == SSA_NAME
+      && TREE_CODE (arg1) == INTEGER_CST)
+    {
+      enum tree_code code = maybe_optimize_mod_cmp (ops->code, &arg0, &arg1);
+      if (code != ops->code)
+	{
+	  struct separate_ops nops = *ops;
+	  nops.code = ops->code = code;
+	  nops.op0 = arg0;
+	  nops.op1 = arg1;
+	  nops.type = TREE_TYPE (arg0);
+	  return do_store_flag (&nops, target, mode);
+	}
+    }
+
   /* Get the rtx comparison code to use.  We know that EXP is a comparison
      operation of some type.  Some comparisons against 1 and -1 can be
      converted to comparisons with zero.  Do so here so that the tests
--- gcc/cfgexpand.c.jj	2018-08-27 17:50:47.223432391 +0200
+++ gcc/cfgexpand.c	2018-09-04 09:39:02.088434814 +0200
@@ -2477,6 +2477,13 @@ expand_gimple_cond (basic_block bb, gcon
 	}
     }
 
+  /* Optimize (x % C1) == C2 or (x % C1) != C2 if it is beneficial
+     into (x - C2) * C3 < C4.  */
+  if ((code == EQ_EXPR || code == NE_EXPR)
+      && TREE_CODE (op0) == SSA_NAME
+      && TREE_CODE (op1) == INTEGER_CST)
+    code = maybe_optimize_mod_cmp (code, &op0, &op1);
+
   last2 = last = get_last_insn ();
 
   extract_true_false_edges_from_block (bb, &true_edge, &false_edge);
--- gcc/testsuite/gcc.target/i386/pr82853-1.c.jj	2018-09-04 14:18:02.604719752 +0200
+++ gcc/testsuite/gcc.target/i386/pr82853-1.c	2018-09-04 14:19:32.964228535 +0200
@@ -0,0 +1,15 @@
+/* PR middle-end/82853 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -mno-bmi2 -mtune=generic" } */
+/* { dg-final { scan-assembler-times "mul\[lq]\t" 7 } } */
+/* { dg-final { scan-assembler-not "div\[lq]\t" } } */
+/* { dg-final { scan-assembler-not "lea\[lq]\t\[^\n\r]*,\[^\n\r]*,2\\)" } } */
+
+unsigned f1 (unsigned x) { return (x % 679U) == 0; }
+unsigned f2 (unsigned x) { return (x % 1738U) == 0; }
+void bar (void);
+void f3 (unsigned x) { if (x % 3 == 0) bar (); }
+void f4 (unsigned x) { if (x % 3 == 1) bar (); }
+void f5 (unsigned x) { if (x % 3 == 2) bar (); }
+int f6 (int x) { return x % 3 == 0; }
+int f7 (int x) { return x % 6 == 0; }
--- gcc/testsuite/gcc.target/i386/pr82853-2.c.jj	2018-09-04 14:20:04.981700147 +0200
+++ gcc/testsuite/gcc.target/i386/pr82853-2.c	2018-09-04 14:20:22.151416254 +0200
@@ -0,0 +1,7 @@
+/* PR middle-end/82853 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -mno-bmi2 -mtune=generic" } */
+/* { dg-final { scan-assembler-times "mul\[lq]\t" 2 } } */
+/* { dg-final { scan-assembler-not "div\[lq]\t" } } */
+
+unsigned f1 (unsigned x, unsigned *y) { *y = x / 679U; return (x % 679U) == 0; }

	Jakub