On Wed, 8 Jan 2025, Jakub Jelinek wrote:

> Hi!
>
> As mentioned in the PR, the a r<< (bitsize-b) to a r>> b and similar
> match.pd optimization which was introduced in GCC 15 can introduce
> UB which wasn't there before; in particular, if b is equal at runtime
> to bitsize, then a r<< 0 is turned into a r>> bitsize.
>
> The following patch fixes it by optimizing it early only if VRP
> tells us the count isn't equal to the bitsize, and late into
> a r>> (b & (bitsize - 1)) if bitsize is a power of two and the
> subtraction has a single use; on various targets the masking then goes
> away because their rotate instructions do the masking already.  The
> latter can't be done too early, though, because then the
> expr_not_equal_to case is basically useless and we introduce the
> masking always and can't find out anymore that there was originally no
> masking.  Even a cfun->after_inlining check would be too early; there
> is a forwprop before vrp, so the patch introduces a new PROP for vrp2
> finish, after which there is another forwprop.  Or do you have ideas
> about what other condition could be used for late matching only?
>
> Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?
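To make the count issue concrete, here is a small self-contained
illustration (my sketch only, not part of the patch; the helper names are
made up, and the counts are fully masked so that every b, including
b == bitsize, is well defined):

/* Sketch: rotate helpers with fully masked counts; rotl32/rotr32 are
   illustrative names, not taken from the patch or the testcase.  */
#include <stdio.h>

static inline unsigned
rotl32 (unsigned x, unsigned t)
{
  return (x << (t & 31)) | (x >> (-t & 31));
}

static inline unsigned
rotr32 (unsigned x, unsigned t)
{
  return (x >> (t & 31)) | (x << (-t & 31));
}

int
main (void)
{
  unsigned a = 0x12345678u;
  /* a r<< (32 - b) agrees with a r>> (b & 31) for every b, including
     b == 32, whereas a r>> b with b == 32 would be an out-of-range
     rotate count.  */
  for (unsigned b = 0; b <= 32; b++)
    if (rotl32 (a, 32 - b) != rotr32 (a, b & 31))
      printf ("mismatch at b = %u\n", b);
  return 0;
}

That identity is what makes the late a r>> (b & (bitsize - 1)) form a safe
canonicalization where a plain a r>> b is not.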
Maybe instead add PROP_last_full_fold and set that at the start of the
last forwprop?  (A rough sketch of what that could look like is at the
end of this mail, below the quoted patch.)  I think in the end we may
want to have a special pre-expand (at ISEL time?) folding and have extra
patterns enabled there (and some disabled eventually), but I don't think
we want to introduce that now.

With the rest I agree.

Thanks,
Richard.

> 2025-01-08  Jakub Jelinek  <ja...@redhat.com>
> 	    Andrew Pinski  <quic_apin...@quicinc.com>
>
> 	PR tree-optimization/117927
> 	* tree-pass.h (PROP_vrp): Define.
> 	* tree-vrp.cc (pass_vrp::execute): Set PROP_vrp in curr_properties
> 	at the end if final_p.
> 	* match.pd (a rrotate (32-b) -> a lrotate b): Only optimize either
> 	if @2 is known not to be equal to prec or if after vrp2 the
> 	subtraction has a single use and prec is a power of two; in that
> 	case transform it into orotate by the masked count.
>
> 	* gcc.dg/tree-ssa/pr117927.c: New test.
>
> --- gcc/tree-pass.h.jj	2025-01-07 20:10:42.617965365 +0100
> +++ gcc/tree-pass.h	2025-01-07 21:34:13.859794503 +0100
> @@ -230,6 +230,7 @@ protected:
>  #define PROP_assumptions_done	(1 << 19)	/* Assume function kept
>  						   around.  */
>  #define PROP_gimple_lbitint	(1 << 20)	/* lowered large _BitInt */
> +#define PROP_vrp		(1 << 21)	/* vrp2 pass finished.  */
>  
>  #define PROP_gimple \
>    (PROP_gimple_any | PROP_gimple_lcf | PROP_gimple_leh | PROP_gimple_lomp)
> --- gcc/tree-vrp.cc.jj	2025-01-07 20:10:42.617965365 +0100
> +++ gcc/tree-vrp.cc	2025-01-07 21:35:24.308814807 +0100
> @@ -1081,8 +1081,8 @@ private:
>    gimple *m_last_bb_stmt;
>  };
>  
> -/* Main entry point for a VRP pass using just ranger.  This can be called
> -   from anywhere to perform a VRP pass, including from EVRP.  */
> +/* Main entry point for a VRP pass using just ranger.  This can be called
> +   from anywhere to perform a VRP pass, including from EVRP.  */
>  
>  unsigned int
>  execute_ranger_vrp (struct function *fun, bool final_p)
> @@ -1334,6 +1334,7 @@ public:
>    {
>      // Check for fast vrp.
>      bool use_fvrp = (&data == &pass_data_fast_vrp);
> +    unsigned int todo;
>      if (!use_fvrp && last_basic_block_for_fn (fun) > param_vrp_block_limit)
>        {
> 	  use_fvrp = true;
> @@ -1344,8 +1345,12 @@
> 		   param_vrp_block_limit);
>        }
>      if (use_fvrp)
> -      return execute_fast_vrp (fun, final_p);
> -    return execute_ranger_vrp (fun, final_p);
> +      todo = execute_fast_vrp (fun, final_p);
> +    else
> +      todo = execute_ranger_vrp (fun, final_p);
> +    if (final_p)
> +      fun->curr_properties |= PROP_vrp;
> +    return todo;
>    }
>  
>  private:
> --- gcc/match.pd.jj	2025-01-07 20:10:42.615965392 +0100
> +++ gcc/match.pd	2025-01-07 21:36:27.991929199 +0100
> @@ -4967,14 +4967,31 @@ (define_operator_list SYNC_FETCH_AND_AND
>  			      build_int_cst (TREE_TYPE (@1),
>  					     element_precision (type)), @1); }))
>  
> -/* a rrotate (32-b) -> a lrotate b */
> -/* a lrotate (32-b) -> a rrotate b */
> +/* a rrotate (bitsize-b) -> a lrotate b */
> +/* a lrotate (bitsize-b) -> a rrotate b */
> +/* Only do the above when it is known that b != bitsize.
> +   Otherwise canonicalize to a orotate (b & mask) when the subtraction
> +   has single use and prec is a power of two.  */
>  (for rotate (lrotate rrotate)
>       orotate (rrotate lrotate)
>   (simplify
> -  (rotate @0 (minus INTEGER_CST@1 @2))
> -  (if (element_precision (TREE_TYPE (@0)) == wi::to_wide (@1))
> -   (orotate @0 @2))))
> +  (rotate @0 (minus@3 INTEGER_CST@1 @2))
> +  (with { auto prec = element_precision (TREE_TYPE (@0)); }
> +   (if (prec == wi::to_wide (@1))
> +    (switch
> +     (if (expr_not_equal_to (@2, wi::uhwi (prec, TYPE_PRECISION (TREE_TYPE (@2)))))
> +      (orotate @0 @2))
> +     (if (single_use (@3)
> +	  && pow2p_hwi (prec)
> +	  /* Defer this case until VRP2 could be run and expr_not_equal_to
> +	     had a chance to match.  Otherwise we'd do pretty much always
> +	     just the second case.  */
> +	  && cfun
> +	  && ((cfun->curr_properties & PROP_vrp) != 0
> +	      || !flag_tree_vrp
> +	      || optimize_debug))
> +      (orotate @0
> +       (bit_and @2 { build_int_cst (TREE_TYPE (@2), prec - 1); }))))))))
>  
>  /* Turn (a OP c1) OP c2 into a OP (c1+c2).  */
>  (for op (lrotate rrotate rshift lshift)
> --- gcc/testsuite/gcc.dg/tree-ssa/pr117927.c.jj	2025-01-07 20:48:59.827825909 +0100
> +++ gcc/testsuite/gcc.dg/tree-ssa/pr117927.c	2025-01-07 20:48:59.827825909 +0100
> @@ -0,0 +1,97 @@
> +/* PR tree-optimization/117927 */
> +/* { dg-do compile { target { int32 && longlong64 } } } */
> +/* { dg-options "-O2 -fdump-tree-optimized" } */
> +/* { dg-final { scan-tree-dump-times " r<< " 4 "optimized" } } */
> +/* { dg-final { scan-tree-dump-times " r>> " 4 "optimized" } } */
> +/* { dg-final { scan-tree-dump-times " & 31;" 2 "optimized" } } */
> +/* { dg-final { scan-tree-dump-times " & 63;" 2 "optimized" } } */
> +/* { dg-final { scan-tree-dump-not "32 - " "optimized" } } */
> +/* { dg-final { scan-tree-dump-not "64 - " "optimized" } } */
> +
> +static inline
> +unsigned lrotate32 (unsigned x, int t)
> +{
> +  unsigned tl = x << t;
> +  unsigned th = x >> (-t & 31);
> +  return tl | th;
> +}
> +
> +static inline
> +unsigned rrotate32 (unsigned x, int t)
> +{
> +  unsigned tl = x >> t;
> +  unsigned th = x << (-t & 31);
> +  return tl | th;
> +}
> +
> +static inline
> +unsigned long long lrotate64 (unsigned long long x, int t)
> +{
> +  unsigned long long tl = x << t;
> +  unsigned long long th = x >> (-t & 63);
> +  return tl | th;
> +}
> +
> +static inline
> +unsigned long long rrotate64 (unsigned long long x, int t)
> +{
> +  unsigned long long tl = x >> t;
> +  unsigned long long th = x << (-t & 63);
> +  return tl | th;
> +}
> +
> +unsigned
> +f1 (unsigned x, int t)
> +{
> +  return lrotate32 (x, 32 - t);
> +}
> +
> +unsigned long long
> +f2 (unsigned long long x, int t)
> +{
> +  return lrotate64 (x, 64 - t);
> +}
> +
> +unsigned
> +f3 (unsigned x, int t)
> +{
> +  if (t == 32)
> +    __builtin_unreachable ();
> +  return lrotate32 (x, 32 - t);
> +}
> +
> +unsigned long long
> +f4 (unsigned long long x, int t)
> +{
> +  if (t == 64)
> +    __builtin_unreachable ();
> +  return lrotate64 (x, 64 - t);
> +}
> +
> +unsigned
> +f5 (unsigned x, int t)
> +{
> +  return rrotate32 (x, 32 - t);
> +}
> +
> +unsigned long long
> +f6 (unsigned long long x, int t)
> +{
> +  return rrotate64 (x, 64 - t);
> +}
> +
> +unsigned
> +f7 (unsigned x, int t)
> +{
> +  if (t == 32)
> +    __builtin_unreachable ();
> +  return rrotate32 (x, 32 - t);
> +}
> +
> +unsigned long long
> +f8 (unsigned long long x, int t)
> +{
> +  if (t == 64)
> +    __builtin_unreachable ();
> +  return rrotate64 (x, 64 - t);
> +}
>
> 	Jakub
>
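To illustrate the PROP_last_full_fold idea: the following is only a rough
sketch, modeled on the PROP_vrp hunks in the quoted patch; the last_p
flag and the exact pass_forwprop plumbing are assumptions of mine, not
something the patch contains.

/* tree-pass.h: a property bit used instead of the patch's PROP_vrp
   (hence the same bit number).  */
#define PROP_last_full_fold	(1 << 21)  /* last forwprop has started.  */

/* tree-ssa-forwprop.cc: assuming the pass knows (e.g. through a
   hypothetical last_p member set via set_pass_param from passes.def)
   that it is the last forwprop instance, set the property at the start
   of that run so match.pd can key "late only" patterns off it.  */
unsigned int
pass_forwprop::execute (function *fun)
{
  if (last_p)
    fun->curr_properties |= PROP_last_full_fold;
  /* ... existing forward propagation and folding ...  */
  return 0;  /* placeholder for the usual TODO flags */
}

The match.pd guard would then test (cfun->curr_properties
& PROP_last_full_fold) rather than PROP_vrp.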
-- 
Richard Biener <rguent...@suse.de>
SUSE Software Solutions Germany GmbH,
Frankenstrasse 146, 90461 Nuernberg, Germany;
GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG Nuernberg)