On Wed, 8 Jan 2025, Jakub Jelinek wrote:

> Hi!
>
> As mentioned in the PR, the a r<< (bitsize-b) to a r>> b and similar
> match.pd optimization which was introduced in GCC 15 can introduce
> UB which wasn't there before; in particular, if b is equal at runtime
> to bitsize, then a r<< 0 is turned into a r>> bitsize.
>
> The following patch fixes it by optimizing it early only if VRP
> tells us the count isn't equal to the bitsize, and late into
> a r>> (b & (bitsize - 1)) if bitsize is a power of two and the
> subtraction has a single use; on various targets the masking then goes
> away because their rotate instructions do the masking already.  The
> latter can't be done too early, though, because then the
> expr_not_equal_to case is basically useless and we introduce the
> masking always and can't find out anymore that there was originally no
> masking.  Even a cfun->after_inlining check would be too early; there
> is a forwprop before vrp, so the patch introduces a new PROP for vrp2
> finish, after which there is another forwprop.  Or do you have ideas
> about what other condition could be used for late matching only?
>
> Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?
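To make the count issue concrete, here is a small self-contained
illustration (my sketch only, not part of the patch; the helper names are
made up, and the counts are fully masked so that every b, including
b == bitsize, is well defined):

/* Sketch: rotate helpers with fully masked counts; rotl32/rotr32 are
   illustrative names, not taken from the patch or the testcase.  */
#include <stdio.h>

static inline unsigned
rotl32 (unsigned x, unsigned t)
{
  return (x << (t & 31)) | (x >> (-t & 31));
}

static inline unsigned
rotr32 (unsigned x, unsigned t)
{
  return (x >> (t & 31)) | (x << (-t & 31));
}

int
main (void)
{
  unsigned a = 0x12345678u;
  /* a r<< (32 - b) agrees with a r>> (b & 31) for every b, including
     b == 32, whereas a r>> b with b == 32 would be an out-of-range
     rotate count.  */
  for (unsigned b = 0; b <= 32; b++)
    if (rotl32 (a, 32 - b) != rotr32 (a, b & 31))
      printf ("mismatch at b = %u\n", b);
  return 0;
}

That identity is what makes the late a r>> (b & (bitsize - 1)) form a safe
canonicalization where a plain a r>> b is not.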
Maybe instead add PROP_last_full_fold and set that at the start of the
last forwprop?  (A rough sketch of what that could look like is at the
end of this mail, below the quoted patch.)  I think in the end we may
want to have a special pre-expand (at ISEL time?) folding and have extra
patterns enabled there (and some disabled eventually), but I don't think
we want to introduce that now.

With the rest I agree.

Thanks,
Richard.

> 2025-01-08  Jakub Jelinek  <ja...@redhat.com>
> 	    Andrew Pinski  <quic_apin...@quicinc.com>
>
> 	PR tree-optimization/117927
> 	* tree-pass.h (PROP_vrp): Define.
> 	* tree-vrp.cc (pass_vrp::execute): Set PROP_vrp in curr_properties
> 	at the end if final_p.
> 	* match.pd (a rrotate (32-b) -> a lrotate b): Only optimize either
> 	if @2 is known not to be equal to prec or if after vrp2 the
> 	subtraction has a single use and prec is a power of two; in that
> 	case transform it into orotate by the masked count.
>
> 	* gcc.dg/tree-ssa/pr117927.c: New test.
>
> --- gcc/tree-pass.h.jj	2025-01-07 20:10:42.617965365 +0100
> +++ gcc/tree-pass.h	2025-01-07 21:34:13.859794503 +0100
> @@ -230,6 +230,7 @@ protected:
>  #define PROP_assumptions_done	(1 << 19)	/* Assume function kept
>  						   around.  */
>  #define PROP_gimple_lbitint	(1 << 20)	/* lowered large _BitInt */
> +#define PROP_vrp		(1 << 21)	/* vrp2 pass finished.  */
>  
>  #define PROP_gimple \
>    (PROP_gimple_any | PROP_gimple_lcf | PROP_gimple_leh | PROP_gimple_lomp)
> --- gcc/tree-vrp.cc.jj	2025-01-07 20:10:42.617965365 +0100
> +++ gcc/tree-vrp.cc	2025-01-07 21:35:24.308814807 +0100
> @@ -1081,8 +1081,8 @@ private:
>    gimple *m_last_bb_stmt;
>  };
>  
> -/* Main entry point for a VRP pass using just ranger.  This can be called
> -   from anywhere to perform a VRP pass, including from EVRP.  */
> +/* Main entry point for a VRP pass using just ranger.  This can be called
> +   from anywhere to perform a VRP pass, including from EVRP.  */
>  
>  unsigned int
>  execute_ranger_vrp (struct function *fun, bool final_p)
> @@ -1334,6 +1334,7 @@ public:
>    {
>      // Check for fast vrp.
>      bool use_fvrp = (&data == &pass_data_fast_vrp);
> +    unsigned int todo;
>      if (!use_fvrp && last_basic_block_for_fn (fun) > param_vrp_block_limit)
>        {
> 	  use_fvrp = true;
> @@ -1344,8 +1345,12 @@
> 		   param_vrp_block_limit);
>        }
>      if (use_fvrp)
> -      return execute_fast_vrp (fun, final_p);
> -    return execute_ranger_vrp (fun, final_p);
> +      todo = execute_fast_vrp (fun, final_p);
> +    else
> +      todo = execute_ranger_vrp (fun, final_p);
> +    if (final_p)
> +      fun->curr_properties |= PROP_vrp;
> +    return todo;
>    }
>  
>  private:
> --- gcc/match.pd.jj	2025-01-07 20:10:42.615965392 +0100
> +++ gcc/match.pd	2025-01-07 21:36:27.991929199 +0100
> @@ -4967,14 +4967,31 @@ (define_operator_list SYNC_FETCH_AND_AND
>  			      build_int_cst (TREE_TYPE (@1),
>  					     element_precision (type)), @1); }))
>  
> -/* a rrotate (32-b) -> a lrotate b */
> -/* a lrotate (32-b) -> a rrotate b */
> +/* a rrotate (bitsize-b) -> a lrotate b */
> +/* a lrotate (bitsize-b) -> a rrotate b */
> +/* Only do the above when it is known that b != bitsize.
> +   Otherwise canonicalize to a orotate (b & mask) when the subtraction
> +   has single use and prec is a power of two.  */
>  (for rotate (lrotate rrotate)
>       orotate (rrotate lrotate)
>   (simplify
> -  (rotate @0 (minus INTEGER_CST@1 @2))
> -  (if (element_precision (TREE_TYPE (@0)) == wi::to_wide (@1))
> -   (orotate @0 @2))))
> +  (rotate @0 (minus@3 INTEGER_CST@1 @2))
> +  (with { auto prec = element_precision (TREE_TYPE (@0)); }
> +   (if (prec == wi::to_wide (@1))
> +    (switch
> +     (if (expr_not_equal_to (@2, wi::uhwi (prec, TYPE_PRECISION (TREE_TYPE (@2)))))
> +      (orotate @0 @2))
> +     (if (single_use (@3)
> +	  && pow2p_hwi (prec)
> +	  /* Defer this case until VRP2 could be run and expr_not_equal_to
> +	     had a chance to match.  Otherwise we'd do pretty much always
> +	     just the second case.  */
> +	  && cfun
> +	  && ((cfun->curr_properties & PROP_vrp) != 0
> +	      || !flag_tree_vrp
> +	      || optimize_debug))
> +      (orotate @0
> +       (bit_and @2 { build_int_cst (TREE_TYPE (@2), prec - 1); }))))))))
>  
>  /* Turn (a OP c1) OP c2 into a OP (c1+c2).  */
>  (for op (lrotate rrotate rshift lshift)
> --- gcc/testsuite/gcc.dg/tree-ssa/pr117927.c.jj	2025-01-07 20:48:59.827825909 +0100
> +++ gcc/testsuite/gcc.dg/tree-ssa/pr117927.c	2025-01-07 20:48:59.827825909 +0100
> @@ -0,0 +1,97 @@
> +/* PR tree-optimization/117927 */
> +/* { dg-do compile { target { int32 && longlong64 } } } */
> +/* { dg-options "-O2 -fdump-tree-optimized" } */
> +/* { dg-final { scan-tree-dump-times " r<< " 4 "optimized" } } */
> +/* { dg-final { scan-tree-dump-times " r>> " 4 "optimized" } } */
> +/* { dg-final { scan-tree-dump-times " & 31;" 2 "optimized" } } */
> +/* { dg-final { scan-tree-dump-times " & 63;" 2 "optimized" } } */
> +/* { dg-final { scan-tree-dump-not "32 - " "optimized" } } */
> +/* { dg-final { scan-tree-dump-not "64 - " "optimized" } } */
> +
> +static inline
> +unsigned lrotate32 (unsigned x, int t)
> +{
> +  unsigned tl = x << t;
> +  unsigned th = x >> (-t & 31);
> +  return tl | th;
> +}
> +
> +static inline
> +unsigned rrotate32 (unsigned x, int t)
> +{
> +  unsigned tl = x >> t;
> +  unsigned th = x << (-t & 31);
> +  return tl | th;
> +}
> +
> +static inline
> +unsigned long long lrotate64 (unsigned long long x, int t)
> +{
> +  unsigned long long tl = x << t;
> +  unsigned long long th = x >> (-t & 63);
> +  return tl | th;
> +}
> +
> +static inline
> +unsigned long long rrotate64 (unsigned long long x, int t)
> +{
> +  unsigned long long tl = x >> t;
> +  unsigned long long th = x << (-t & 63);
> +  return tl | th;
> +}
> +
> +unsigned
> +f1 (unsigned x, int t)
> +{
> +  return lrotate32 (x, 32 - t);
> +}
> +
> +unsigned long long
> +f2 (unsigned long long x, int t)
> +{
> +  return lrotate64 (x, 64 - t);
> +}
> +
> +unsigned
> +f3 (unsigned x, int t)
> +{
> +  if (t == 32)
> +    __builtin_unreachable ();
> +  return lrotate32 (x, 32 - t);
> +}
> +
> +unsigned long long
> +f4 (unsigned long long x, int t)
> +{
> +  if (t == 64)
> +    __builtin_unreachable ();
> +  return lrotate64 (x, 64 - t);
> +}
> +
> +unsigned
> +f5 (unsigned x, int t)
> +{
> +  return rrotate32 (x, 32 - t);
> +}
> +
> +unsigned long long
> +f6 (unsigned long long x, int t)
> +{
> +  return rrotate64 (x, 64 - t);
> +}
> +
> +unsigned
> +f7 (unsigned x, int t)
> +{
> +  if (t == 32)
> +    __builtin_unreachable ();
> +  return rrotate32 (x, 32 - t);
> +}
> +
> +unsigned long long
> +f8 (unsigned long long x, int t)
> +{
> +  if (t == 64)
> +    __builtin_unreachable ();
> +  return rrotate64 (x, 64 - t);
> +}
>
> 	Jakub
>
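To illustrate the PROP_last_full_fold idea: the following is only a rough
sketch, modeled on the PROP_vrp hunks in the quoted patch; the last_p
flag and the exact pass_forwprop plumbing are assumptions of mine, not
something the patch contains.

/* tree-pass.h: a property bit used instead of the patch's PROP_vrp
   (hence the same bit number).  */
#define PROP_last_full_fold	(1 << 21)  /* last forwprop has started.  */

/* tree-ssa-forwprop.cc: assuming the pass knows (e.g. through a
   hypothetical last_p member set via set_pass_param from passes.def)
   that it is the last forwprop instance, set the property at the start
   of that run so match.pd can key "late only" patterns off it.  */
unsigned int
pass_forwprop::execute (function *fun)
{
  if (last_p)
    fun->curr_properties |= PROP_last_full_fold;
  /* ... existing forward propagation and folding ...  */
  return 0;  /* placeholder for the usual TODO flags */
}

The match.pd guard would then test (cfun->curr_properties
& PROP_last_full_fold) rather than PROP_vrp.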
-- 
Richard Biener <rguent...@suse.de>
SUSE Software Solutions Germany GmbH,
Frankenstrasse 146, 90461 Nuernberg, Germany;
GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG Nuernberg)