Hi!

For targets that don't have a {,u}mulv<mode>4 insn we try 3 different expansions of the basic signed * signed -> signed or unsigned * unsigned -> unsigned overflow computation.  The first one is used if GET_MODE_2XWIDER_MODE (mode).exists (&wmode) && targetm.scalar_mode_supported_p (wmode); we emit a WIDEN_MULT_EXPR and then extract the hipart from it (used for the overflow test in both the unsigned and signed cases) and the lowpart (the result of the multiplication, whose MSB is also needed for the signed overflow test).  This case is meant for smaller modes, e.g. subword, where it is generally pretty efficient.  Unfortunately, on some targets, e.g. 64-bit mips, where there is no widening mult optab, it can be expanded as a libcall on the full wmode operands, which is slow and causes problems e.g. for freestanding environments like the Linux kernel that don't bother to link in libgcc.a or a replacement thereof.  Then there is another case, usually pretty large, with usually two but sometimes one multiplication and various conditionals, shifts, etc., meant primarily for the widest supported mode.  And the last fallback just does the multiplication and never computes overflow; hopefully it is never used, at least on sane targets.
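For reference, a minimal example (illustrative only, not the PR testcase) of the kind of source that reaches this code; __builtin_mul_overflow is lowered to the IFN_MUL_OVERFLOW internal function whose expansion is described above:

  /* Returns nonzero iff a * b overflows; *res gets the truncated product.
     One of the three expansions described above is used when the target
     has no mulv<mode>4 pattern.  */
  int
  mul (long a, long b, long *res)
  {
    return __builtin_mul_overflow (a, b, res);
  }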
This patch checks whether the WIDEN_MULT_EXPR would be emitted as a libcall and in that case tries the other possibilities first, only falling back to the WIDEN_MULT_EXPR with a libcall if we'd otherwise use the last fallback without overflow computation.  In addition, it adds support for targets where MULT_HIGHPART_EXPR is supported for the mode, by doing pretty much what the WIDEN_MULT_EXPR case does, except that instead of one multiplication computing both lowpart and highpart and then shifts to split them, we use one multiplication to compute the lowpart and one MULT_HIGHPART_EXPR to compute the highpart.  In theory this method doesn't have to always be faster than the one with hmode, because the MULT_HIGHPART_EXPR case does 2 multiplications plus one comparison while the hmode code sometimes does just one multiplication, but it is significantly shorter, with fewer conditionals/branches, so I think it should generally be a win (if that turns out not to be the case on some target, we could restrict it to -Os only or whatever).  And lastly, the MULT_HIGHPART_EXPR case can actually be the optimal code if we are only checking for the overflow and don't actually need the multiplication value, i.e. it is an unsigned multiply and there is no code using res afterwards; in that case the low part multiply can be DCEd and only the highpart multiply + comparison will remain.  So, the patch adds a check for a single IMAGPART_EXPR use and other conditions and uses the MULT_HIGHPART_EXPR code in preference to the WIDEN_MULT_EXPR code in that case (an illustrative example of this pattern follows after the patch).

Bootstrapped/regtested on x86_64-linux and i686-linux, tested on the testcase with a cross to mips, ok for trunk?

2017-11-14  Jakub Jelinek  <ja...@redhat.com>

	PR target/82981
	* internal-fn.c: Include gimple-ssa.h, tree-phinodes.h and
	ssa-iterators.h.
	(can_widen_mult_without_libcall): New function.
	(expand_mul_overflow): If only checking unsigned mul overflow,
	not result, and can do efficiently MULT_HIGHPART_EXPR, emit that.
	Don't use WIDEN_MULT_EXPR if it would involve a libcall, unless
	no other way works.  Add MULT_HIGHPART_EXPR + MULT_EXPR support.
	(expand_DIVMOD): Formatting fix.
	* expmed.h (expand_mult): Add NO_LIBCALL argument.
	* expmed.c (expand_mult): Likewise.  Use OPTAB_WIDEN rather than
	OPTAB_LIB_WIDEN if NO_LIBCALL is true, and allow it to fail.

--- gcc/internal-fn.c.jj	2017-10-23 10:13:08.000000000 +0200
+++ gcc/internal-fn.c	2017-11-14 16:48:25.414403348 +0100
@@ -46,6 +46,9 @@ along with GCC; see the file COPYING3.
 #include "recog.h"
 #include "builtins.h"
 #include "optabs-tree.h"
+#include "gimple-ssa.h"
+#include "tree-phinodes.h"
+#include "ssa-iterators.h"
 
 /* The names of each internal function, indexed by function number.  */
 const char *const internal_fn_name_array[] = {
@@ -1172,6 +1175,35 @@ expand_neg_overflow (location_t loc, tre
     }
 }
 
+/* Return true if UNS WIDEN_MULT_EXPR with result mode WMODE and operand
+   mode MODE can be expanded without using a libcall.  */
+
+static bool
+can_widen_mult_without_libcall (scalar_int_mode wmode, scalar_int_mode mode,
+                                rtx op0, rtx op1, bool uns)
+{
+  if (find_widening_optab_handler (umul_widen_optab, wmode, mode)
+      != CODE_FOR_nothing)
+    return true;
+
+  if (find_widening_optab_handler (smul_widen_optab, wmode, mode)
+      != CODE_FOR_nothing)
+    return true;
+
+  rtx_insn *last = get_last_insn ();
+  if (CONSTANT_P (op0))
+    op0 = convert_modes (wmode, mode, op0, uns);
+  else
+    op0 = gen_raw_REG (wmode, LAST_VIRTUAL_REGISTER + 1);
+  if (CONSTANT_P (op1))
+    op1 = convert_modes (wmode, mode, op1, uns);
+  else
+    op1 = gen_raw_REG (wmode, LAST_VIRTUAL_REGISTER + 2);
+  rtx ret = expand_mult (wmode, op0, op1, NULL_RTX, uns, true);
+  delete_insns_since (last);
+  return ret != NULL_RTX;
+}
+
 /* Add mul overflow checking to the statement STMT.  */
 
 static void
@@ -1465,9 +1497,29 @@ expand_mul_overflow (location_t loc, tre
       ops.op1 = make_tree (type, op1);
       ops.op2 = NULL_TREE;
       ops.location = loc;
+
+      /* Optimize unsigned overflow check where we don't use the
+         multiplication result, just whether overflow happened.
+         If we can do MULT_HIGHPART_EXPR, that followed by
+         comparison of the result against zero is cheapest.
+         We'll still compute res, but it should be DCEd later.  */
+      use_operand_p use;
+      gimple *use_stmt;
+      if (!is_ubsan
+          && lhs
+          && uns
+          && !(uns0_p && uns1_p && !unsr_p)
+          && can_mult_highpart_p (mode, uns) == 1
+          && single_imm_use (lhs, &use, &use_stmt)
+          && is_gimple_assign (use_stmt)
+          && gimple_assign_rhs_code (use_stmt) == IMAGPART_EXPR)
+        goto highpart;
+
       if (GET_MODE_2XWIDER_MODE (mode).exists (&wmode)
-          && targetm.scalar_mode_supported_p (wmode))
+          && targetm.scalar_mode_supported_p (wmode)
+          && can_widen_mult_without_libcall (wmode, mode, op0, op1, uns))
         {
+        twoxwider:
           ops.code = WIDEN_MULT_EXPR;
           ops.type = build_nonstandard_integer_type (GET_MODE_PRECISION (wmode),
                                                      uns);
@@ -1495,6 +1547,35 @@ expand_mul_overflow (location_t loc, tre
                                        profile_probability::very_likely ());
             }
         }
+      else if (can_mult_highpart_p (mode, uns) == 1)
+        {
+        highpart:
+          ops.code = MULT_HIGHPART_EXPR;
+          ops.type = type;
+
+          rtx hipart = expand_expr_real_2 (&ops, NULL_RTX, mode,
+                                           EXPAND_NORMAL);
+          ops.code = MULT_EXPR;
+          res = expand_expr_real_2 (&ops, NULL_RTX, mode, EXPAND_NORMAL);
+          if (uns)
+            /* For the unsigned multiplication, there was overflow if
+               HIPART is non-zero.  */
+            do_compare_rtx_and_jump (hipart, const0_rtx, EQ, true, mode,
+                                     NULL_RTX, NULL, done_label,
+                                     profile_probability::very_likely ());
+          else
+            {
+              rtx signbit = expand_shift (RSHIFT_EXPR, mode, res, prec - 1,
+                                          NULL_RTX, 0);
+              /* RES is low half of the double width result, HIPART
+                 the high half.  There was overflow if
+                 HIPART is different from RES < 0 ? -1 : 0.  */
+              do_compare_rtx_and_jump (signbit, hipart, EQ, true, mode,
                                       NULL_RTX, NULL, done_label,
+                                       profile_probability::very_likely ());
+            }
+
+        }
       else if (int_mode_for_size (prec / 2, 1).exists (&hmode)
                && 2 * GET_MODE_PRECISION (hmode) == prec)
         {
@@ -1800,6 +1881,11 @@ expand_mul_overflow (location_t loc, tre
           tem = expand_expr_real_2 (&ops, NULL_RTX, mode, EXPAND_NORMAL);
           emit_move_insn (res, tem);
         }
+      else if (GET_MODE_2XWIDER_MODE (mode).exists (&wmode)
+               && targetm.scalar_mode_supported_p (wmode))
+        /* Even emitting a libcall is better than not detecting overflow
+           at all.  */
+        goto twoxwider;
       else
         {
           gcc_assert (!is_ubsan);
@@ -2588,7 +2674,7 @@ expand_DIVMOD (internal_fn, gcall *call_
       expand_expr (build2 (COMPLEX_EXPR, TREE_TYPE (lhs),
                            make_tree (TREE_TYPE (arg0), quotient),
                            make_tree (TREE_TYPE (arg1), remainder)),
-                  target, VOIDmode, EXPAND_NORMAL);
+                   target, VOIDmode, EXPAND_NORMAL);
 }
 
 /* Expand a call to FN using the operands in STMT.  FN has a single
--- gcc/expmed.h.jj	2017-09-01 09:26:37.000000000 +0200
+++ gcc/expmed.h	2017-11-14 12:32:11.344054003 +0100
@@ -727,7 +727,7 @@ extern rtx extract_bit_field (rtx, unsig
                              unsigned HOST_WIDE_INT, int, rtx,
                              machine_mode, machine_mode, bool, rtx *);
 extern rtx extract_low_bits (machine_mode, machine_mode, rtx);
-extern rtx expand_mult (machine_mode, rtx, rtx, rtx, int);
+extern rtx expand_mult (machine_mode, rtx, rtx, rtx, int, bool = false);
 extern rtx expand_mult_highpart_adjust (scalar_int_mode, rtx, rtx, rtx,
                                         rtx, int);
--- gcc/expmed.c.jj	2017-11-09 15:38:48.000000000 +0100
+++ gcc/expmed.c	2017-11-14 12:50:24.948946475 +0100
@@ -3284,7 +3284,7 @@ expand_mult_const (machine_mode mode, rt
 
 rtx
 expand_mult (machine_mode mode, rtx op0, rtx op1, rtx target,
-             int unsignedp)
+             int unsignedp, bool no_libcall)
 {
   enum mult_variant variant;
   struct algorithm algorithm;
@@ -3420,14 +3420,16 @@ expand_mult (machine_mode mode, rtx op0,
     {
      op0 = force_reg (GET_MODE (op0), op0);
      return expand_binop (mode, add_optab, op0, op0,
-                          target, unsignedp, OPTAB_LIB_WIDEN);
+                          target, unsignedp,
+                          no_libcall ? OPTAB_WIDEN : OPTAB_LIB_WIDEN);
     }
 
   /* This used to use umul_optab if unsigned, but for non-widening multiply
      there is no difference between signed and unsigned.  */
   op0 = expand_binop (mode, do_trapv ? smulv_optab : smul_optab,
-                      op0, op1, target, unsignedp, OPTAB_LIB_WIDEN);
-  gcc_assert (op0);
+                      op0, op1, target, unsignedp,
+                      no_libcall ? OPTAB_WIDEN : OPTAB_LIB_WIDEN);
+  gcc_assert (op0 || no_libcall);
   return op0;
 }

	Jakub
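P.S.  Purely for illustration (not part of the patch nor its testsuite coverage), the overflow-flag-only pattern mentioned above, where only the IMAGPART_EXPR of the IFN_MUL_OVERFLOW result is used; with the patch the low-part multiply should be DCEd, leaving just the MULT_HIGHPART_EXPR and a comparison against zero:

  int
  overflows_p (unsigned long a, unsigned long b)
  {
    unsigned long res;
    /* Only the overflow flag is used; res itself is dead.  */
    return __builtin_mul_overflow (a, b, &res);
  }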