On Wed, 29 Nov 2017, Jakub Jelinek wrote:

> Hi!
> 
> Even if the target has an umulv4_optab pattern (like i?86 mul[lq]; jo),
> if one argument is a constant power of two, using two shifts, or for
> multiplication by 2 just a shl with js on the previous value, is faster
> (though for -Os, mul[lq]; jo wins).
> 
> Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?

So this is because we don't go through the standard multiplication
expander, right?  But for icode == CODE_FOR_nothing we probably
should do this and then emit the compare-and-jump?

And for constant multipliers in general as well?

As you do it, you assume that the target's optab is less efficient
than two shifts and a compare, but how do you know?  Shouldn't we
do regular mult expansion and cost the resulting sequence against
the one from the umulv expander?

That said, the patch looks x86-specific but is done in generic code...
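For reference, a minimal C sketch of the transformation being discussed,
for one concrete case (the constant 8, the shift count 3 and the helper
name below are purely illustrative, not taken from the patch): the left
shift gives the low part of the product, and the multiplication overflows
iff the bits shifted out are nonzero.

  /* Illustrative sketch only: unsigned x * 8 with overflow check,
     assuming 8 == 1 << 3 and 3 < precision of unsigned.  */
  static int
  mul8_overflow (unsigned x, unsigned *res)
  {
    unsigned prec = sizeof (unsigned) * __CHAR_BIT__;
    *res = x << 3;                   /* low part of the product */
    return (x >> (prec - 3)) != 0;   /* shifted-out bits nonzero => overflow */
  }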

Thanks,
Richard.

> 2017-11-29  Jakub Jelinek  <ja...@redhat.com>
> 
>       PR target/83210
>       * internal-fn.c (expand_mul_overflow): Optimize unsigned
>       multiplication by power of 2 constant into two shifts + comparison.
> 
>       * gcc.target/i386/pr83210.c: New test.
> 
> --- gcc/internal-fn.c.jj      2017-11-22 21:37:46.000000000 +0100
> +++ gcc/internal-fn.c 2017-11-29 17:44:17.699009169 +0100
> @@ -1462,6 +1462,39 @@ expand_mul_overflow (location_t loc, tre
>    type = build_nonstandard_integer_type (GET_MODE_PRECISION (mode), uns);
>    sign = uns ? UNSIGNED : SIGNED;
>    icode = optab_handler (uns ? umulv4_optab : mulv4_optab, mode);
> +  if (uns
> +      && (integer_pow2p (arg0) || integer_pow2p (arg1))
> +      && (optimize_insn_for_speed_p () || icode == CODE_FOR_nothing))
> +    {
> +      /* Optimize unsigned multiplication by power of 2 constant
> +      using 2 shifts, one for result, one to extract the shifted
> +      out bits to see if they are all zero.
> +      Don't do this if optimizing for size and we have umulv4_optab,
> +      in that case assume multiplication will be shorter.  */
> +      rtx opn0 = op0;
> +      rtx opn1 = op1;
> +      tree argn0 = arg0;
> +      tree argn1 = arg1;
> +      if (integer_pow2p (arg0))
> +     {
> +       std::swap (opn0, opn1);
> +       std::swap (argn0, argn1);
> +     }
> +      int cnt = tree_log2 (argn1);
> +      if (cnt >= 0 && cnt < GET_MODE_PRECISION (mode))
> +     {
> +       rtx upper = const0_rtx;
> +       res = expand_shift (LSHIFT_EXPR, mode, opn0, cnt, NULL_RTX, uns);
> +       if (cnt != 0)
> +         upper = expand_shift (RSHIFT_EXPR, mode, opn0,
> +                               GET_MODE_PRECISION (mode) - cnt,
> +                               NULL_RTX, uns);
> +       do_compare_rtx_and_jump (upper, const0_rtx, EQ, true, mode,
> +                                NULL_RTX, NULL, done_label,
> +                                profile_probability::very_likely ());
> +       goto do_error_label;
> +     }
> +    }
>    if (icode != CODE_FOR_nothing)
>      {
>        struct expand_operand ops[4];
> --- gcc/testsuite/gcc.target/i386/pr83210.c.jj        2017-11-29 17:58:00.102881065 +0100
> +++ gcc/testsuite/gcc.target/i386/pr83210.c   2017-11-29 17:57:36.000000000 +0100
> @@ -0,0 +1,53 @@
> +/* PR target/83210 */
> +/* { dg-do compile } */
> +/* { dg-options "-O2" } */
> +/* { dg-final { scan-assembler-not {\mmul[lq]\M} } } */
> +
> +void bar (void);
> +
> +unsigned
> +f1 (unsigned int x)
> +{
> +  unsigned res;
> +  if (__builtin_mul_overflow (x, 2, &res))
> +    bar ();
> +  return res;
> +}
> +
> +unsigned long
> +f2 (unsigned long x)
> +{
> +  unsigned long res;
> +  if (__builtin_mul_overflow (16, x, &res))
> +    bar ();
> +  return res;
> +}
> +
> +unsigned long long
> +f3 (unsigned long long x)
> +{
> +  unsigned long long res;
> +  if (__builtin_mul_overflow (x, (1ULL << (__SIZEOF_LONG_LONG__ * __CHAR_BIT__ - 1)), &res))
> +    bar ();
> +  return res;
> +}
> +
> +#ifdef __SIZEOF_INT128__
> +unsigned __int128
> +f4 (unsigned __int128 x)
> +{
> +  unsigned __int128 res;
> +  if (__builtin_mul_overflow (x, (((unsigned __int128) 1) << (__SIZEOF_INT128__ * __CHAR_BIT__ / 2)), &res))
> +    bar ();
> +  return res;
> +}
> +
> +unsigned __int128
> +f5 (unsigned __int128 x)
> +{
> +  unsigned __int128 res;
> +  if (__builtin_mul_overflow (x, (((unsigned __int128) 1) << (__SIZEOF_INT128__ * __CHAR_BIT__ / 2 + 3)), &res))
> +    bar ();
> +  return res;
> +}
> +#endif
> 
>       Jakub
> 
> 

-- 
Richard Biener <rguent...@suse.de>
SUSE LINUX GmbH, GF: Felix Imendoerffer, Jane Smithard, Graham Norton, HRB 21284 (AG Nuernberg)
