Hi!

For targets that don't have a {,u}mulv<mode>4 insn we try 3 different expansions of the basic signed * signed -> signed or unsigned * unsigned -> unsigned overflow computation.  The first one is used if GET_MODE_2XWIDER_MODE (mode).exists (&wmode) && targetm.scalar_mode_supported_p (wmode); we emit a WIDEN_MULT_EXPR and then extract the hipart from it (used for the overflow test in both the unsigned and signed cases) and the lowpart (the result of the multiplication, whose MSB is also needed for the signed overflow test).  This case is meant for smaller modes, e.g. subword, where it is generally pretty efficient.  Unfortunately, on some targets, e.g. 64-bit mips, where there is no widening mult optab, it can be expanded as a libcall on the full wmode operands, which is slow and causes problems e.g. for freestanding environments like the Linux kernel that don't bother to link in libgcc.a or a replacement thereof.  Then there is another case, usually pretty large, with usually two but sometimes one multiplication and various conditionals, shifts, etc., meant primarily for the widest supported mode.  And the last fallback just does the multiplication and never computes overflow; hopefully it is never used, at least on sane targets.
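For reference, a minimal example (illustrative only, not the PR testcase) of the kind of source that reaches this code; __builtin_mul_overflow is lowered to the IFN_MUL_OVERFLOW internal function whose expansion is described above:

  /* Returns nonzero iff a * b overflows; *res gets the truncated product.
     One of the three expansions described above is used when the target
     has no mulv<mode>4 pattern.  */
  int
  mul (long a, long b, long *res)
  {
    return __builtin_mul_overflow (a, b, res);
  }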
This patch checks whether the WIDEN_MULT_EXPR would be emitted as a libcall and in that case tries the other possibilities first, only falling back to the WIDEN_MULT_EXPR with a libcall if we'd otherwise use the last fallback without overflow computation.  In addition, it adds support for targets where MULT_HIGHPART_EXPR is supported for the mode, by doing pretty much what the WIDEN_MULT_EXPR case does, except that instead of one multiplication computing both lowpart and highpart and then shifts to split them, we use one multiplication to compute the lowpart and one MULT_HIGHPART_EXPR to compute the highpart.  In theory this method doesn't have to always be faster than the one with hmode, because the MULT_HIGHPART_EXPR case does 2 multiplications plus one comparison while the hmode code sometimes does just one multiplication, but it is significantly shorter, with fewer conditionals/branches, so I think it should generally be a win (if that turns out not to be the case on some target, we could restrict it to -Os only or whatever).  And lastly, the MULT_HIGHPART_EXPR case can actually be the optimal code if we are only checking for the overflow and don't actually need the multiplication value, i.e. it is an unsigned multiply and there is no code using res afterwards; in that case the low part multiply can be DCEd and only the highpart multiply + comparison will remain.  So, the patch adds a check for a single IMAGPART_EXPR use and other conditions and uses the MULT_HIGHPART_EXPR code in preference to the WIDEN_MULT_EXPR code in that case (an illustrative example of this pattern follows after the patch).

Bootstrapped/regtested on x86_64-linux and i686-linux, tested on the testcase with a cross to mips, ok for trunk?

2017-11-14  Jakub Jelinek  <ja...@redhat.com>

	PR target/82981
	* internal-fn.c: Include gimple-ssa.h, tree-phinodes.h and
	ssa-iterators.h.
	(can_widen_mult_without_libcall): New function.
	(expand_mul_overflow): If only checking unsigned mul overflow,
	not result, and can do efficiently MULT_HIGHPART_EXPR, emit that.
	Don't use WIDEN_MULT_EXPR if it would involve a libcall, unless
	no other way works.  Add MULT_HIGHPART_EXPR + MULT_EXPR support.
	(expand_DIVMOD): Formatting fix.
	* expmed.h (expand_mult): Add NO_LIBCALL argument.
	* expmed.c (expand_mult): Likewise.  Use OPTAB_WIDEN rather than
	OPTAB_LIB_WIDEN if NO_LIBCALL is true, and allow it to fail.

--- gcc/internal-fn.c.jj	2017-10-23 10:13:08.000000000 +0200
+++ gcc/internal-fn.c	2017-11-14 16:48:25.414403348 +0100
@@ -46,6 +46,9 @@ along with GCC; see the file COPYING3.
 #include "recog.h"
 #include "builtins.h"
 #include "optabs-tree.h"
+#include "gimple-ssa.h"
+#include "tree-phinodes.h"
+#include "ssa-iterators.h"
 
 /* The names of each internal function, indexed by function number.  */
 const char *const internal_fn_name_array[] = {
@@ -1172,6 +1175,35 @@ expand_neg_overflow (location_t loc, tre
     }
 }
 
+/* Return true if UNS WIDEN_MULT_EXPR with result mode WMODE and operand
+   mode MODE can be expanded without using a libcall.  */
+
+static bool
+can_widen_mult_without_libcall (scalar_int_mode wmode, scalar_int_mode mode,
+                                rtx op0, rtx op1, bool uns)
+{
+  if (find_widening_optab_handler (umul_widen_optab, wmode, mode)
+      != CODE_FOR_nothing)
+    return true;
+
+  if (find_widening_optab_handler (smul_widen_optab, wmode, mode)
+      != CODE_FOR_nothing)
+    return true;
+
+  rtx_insn *last = get_last_insn ();
+  if (CONSTANT_P (op0))
+    op0 = convert_modes (wmode, mode, op0, uns);
+  else
+    op0 = gen_raw_REG (wmode, LAST_VIRTUAL_REGISTER + 1);
+  if (CONSTANT_P (op1))
+    op1 = convert_modes (wmode, mode, op1, uns);
+  else
+    op1 = gen_raw_REG (wmode, LAST_VIRTUAL_REGISTER + 2);
+  rtx ret = expand_mult (wmode, op0, op1, NULL_RTX, uns, true);
+  delete_insns_since (last);
+  return ret != NULL_RTX;
+}
+
 /* Add mul overflow checking to the statement STMT.  */
 
 static void
@@ -1465,9 +1497,29 @@ expand_mul_overflow (location_t loc, tre
       ops.op1 = make_tree (type, op1);
       ops.op2 = NULL_TREE;
       ops.location = loc;
+
+      /* Optimize unsigned overflow check where we don't use the
+         multiplication result, just whether overflow happened.
+         If we can do MULT_HIGHPART_EXPR, that followed by
+         comparison of the result against zero is cheapest.
+         We'll still compute res, but it should be DCEd later.  */
+      use_operand_p use;
+      gimple *use_stmt;
+      if (!is_ubsan
+          && lhs
+          && uns
+          && !(uns0_p && uns1_p && !unsr_p)
+          && can_mult_highpart_p (mode, uns) == 1
+          && single_imm_use (lhs, &use, &use_stmt)
+          && is_gimple_assign (use_stmt)
+          && gimple_assign_rhs_code (use_stmt) == IMAGPART_EXPR)
+        goto highpart;
+
       if (GET_MODE_2XWIDER_MODE (mode).exists (&wmode)
-          && targetm.scalar_mode_supported_p (wmode))
+          && targetm.scalar_mode_supported_p (wmode)
+          && can_widen_mult_without_libcall (wmode, mode, op0, op1, uns))
         {
+        twoxwider:
           ops.code = WIDEN_MULT_EXPR;
           ops.type = build_nonstandard_integer_type (GET_MODE_PRECISION (wmode),
                                                      uns);
@@ -1495,6 +1547,35 @@ expand_mul_overflow (location_t loc, tre
                                        profile_probability::very_likely ());
             }
         }
+      else if (can_mult_highpart_p (mode, uns) == 1)
+        {
+        highpart:
+          ops.code = MULT_HIGHPART_EXPR;
+          ops.type = type;
+
+          rtx hipart = expand_expr_real_2 (&ops, NULL_RTX, mode,
+                                           EXPAND_NORMAL);
+          ops.code = MULT_EXPR;
+          res = expand_expr_real_2 (&ops, NULL_RTX, mode, EXPAND_NORMAL);
+          if (uns)
+            /* For the unsigned multiplication, there was overflow if
+               HIPART is non-zero.  */
+            do_compare_rtx_and_jump (hipart, const0_rtx, EQ, true, mode,
+                                     NULL_RTX, NULL, done_label,
+                                     profile_probability::very_likely ());
+          else
+            {
+              rtx signbit = expand_shift (RSHIFT_EXPR, mode, res, prec - 1,
+                                          NULL_RTX, 0);
+              /* RES is low half of the double width result, HIPART
+                 the high half.  There was overflow if
+                 HIPART is different from RES < 0 ? -1 : 0.  */
+              do_compare_rtx_and_jump (signbit, hipart, EQ, true, mode,
                                       NULL_RTX, NULL, done_label,
+                                       profile_probability::very_likely ());
+            }
+
+        }
       else if (int_mode_for_size (prec / 2, 1).exists (&hmode)
                && 2 * GET_MODE_PRECISION (hmode) == prec)
         {
@@ -1800,6 +1881,11 @@ expand_mul_overflow (location_t loc, tre
           tem = expand_expr_real_2 (&ops, NULL_RTX, mode, EXPAND_NORMAL);
           emit_move_insn (res, tem);
         }
+      else if (GET_MODE_2XWIDER_MODE (mode).exists (&wmode)
+               && targetm.scalar_mode_supported_p (wmode))
+        /* Even emitting a libcall is better than not detecting overflow
+           at all.  */
+        goto twoxwider;
       else
         {
           gcc_assert (!is_ubsan);
@@ -2588,7 +2674,7 @@ expand_DIVMOD (internal_fn, gcall *call_
       expand_expr (build2 (COMPLEX_EXPR, TREE_TYPE (lhs),
                            make_tree (TREE_TYPE (arg0), quotient),
                            make_tree (TREE_TYPE (arg1), remainder)),
-                  target, VOIDmode, EXPAND_NORMAL);
+                   target, VOIDmode, EXPAND_NORMAL);
 }
 
 /* Expand a call to FN using the operands in STMT.  FN has a single
--- gcc/expmed.h.jj	2017-09-01 09:26:37.000000000 +0200
+++ gcc/expmed.h	2017-11-14 12:32:11.344054003 +0100
@@ -727,7 +727,7 @@ extern rtx extract_bit_field (rtx, unsig
                              unsigned HOST_WIDE_INT, int, rtx,
                              machine_mode, machine_mode, bool, rtx *);
 extern rtx extract_low_bits (machine_mode, machine_mode, rtx);
-extern rtx expand_mult (machine_mode, rtx, rtx, rtx, int);
+extern rtx expand_mult (machine_mode, rtx, rtx, rtx, int, bool = false);
 extern rtx expand_mult_highpart_adjust (scalar_int_mode, rtx, rtx, rtx,
                                         rtx, int);
--- gcc/expmed.c.jj	2017-11-09 15:38:48.000000000 +0100
+++ gcc/expmed.c	2017-11-14 12:50:24.948946475 +0100
@@ -3284,7 +3284,7 @@ expand_mult_const (machine_mode mode, rt
 
 rtx
 expand_mult (machine_mode mode, rtx op0, rtx op1, rtx target,
-             int unsignedp)
+             int unsignedp, bool no_libcall)
 {
   enum mult_variant variant;
   struct algorithm algorithm;
@@ -3420,14 +3420,16 @@ expand_mult (machine_mode mode, rtx op0,
     {
      op0 = force_reg (GET_MODE (op0), op0);
      return expand_binop (mode, add_optab, op0, op0,
-                          target, unsignedp, OPTAB_LIB_WIDEN);
+                          target, unsignedp,
+                          no_libcall ? OPTAB_WIDEN : OPTAB_LIB_WIDEN);
     }
 
   /* This used to use umul_optab if unsigned, but for non-widening multiply
      there is no difference between signed and unsigned.  */
   op0 = expand_binop (mode, do_trapv ? smulv_optab : smul_optab,
-                      op0, op1, target, unsignedp, OPTAB_LIB_WIDEN);
-  gcc_assert (op0);
+                      op0, op1, target, unsignedp,
+                      no_libcall ? OPTAB_WIDEN : OPTAB_LIB_WIDEN);
+  gcc_assert (op0 || no_libcall);
   return op0;
 }

	Jakub
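P.S.  Purely for illustration (not part of the patch nor its testsuite coverage), the overflow-flag-only pattern mentioned above, where only the IMAGPART_EXPR of the IFN_MUL_OVERFLOW result is used; with the patch the low-part multiply should be DCEd, leaving just the MULT_HIGHPART_EXPR and a comparison against zero:

  int
  overflows_p (unsigned long a, unsigned long b)
  {
    unsigned long res;
    /* Only the overflow flag is used; res itself is dead.  */
    return __builtin_mul_overflow (a, b, &res);
  }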