On Wed, 8 Jan 2025, Jakub Jelinek wrote:

> On Wed, Jan 08, 2025 at 10:17:59AM +0100, Richard Biener wrote:
> > > As mentioned in the PR, the a r<< (bitsize-b) to a r>> b and similar
> > > match.pd optimization which has been introduced in GCC 15 can introduce
> > > UB which wasn't there before, in particular if b is equal at runtime
> > > to bitsize, then a r<< 0 is turned into a r>> bitsize.
> > > 
> > > The following patch fixes it by optimizing it early only if VRP
> > > tells us the count isn't equal to the bitsize, and late into
> > > a r>> (b & (bitsize - 1)) if bitsize is power of two and the subtraction
> > > has single use, on various targets the masking then goes away because
> > > its rotate instructions do masking already.  The latter can't be
> > > done too early though, because then the expr_not_equal_to case is
> > > basically useless and we introduce the masking always and can't find out
> > > anymore that there was originally no masking.  Even cfun->after_inlining
> > > check would be too early, there is forwprop before vrp, so the patch
> > > introduces a new PROP for vrp2 finish, after which there is another
> > > forwprop.  Or do you have ideas about what other condition could be used
> > > for late matching only?
> > > 
> > > Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?
> > 
> > Maybe instead add PROP_last_full_fold and set that at the start of the
> > last forwprop?  I think in the end we may want to have a special
> > pre-expand (at ISEL time?) folding and have extra patterns enabled
> > there (and some disabled eventually), but I think we don't want to
> > introduce this now.
> > 
> > With the rest I agree.
> 
> So like this?  Works in quick testing, ok for trunk if it passes full
> bootstrap/regtest?

LGTM.

Thanks,
Richard.

> 2025-01-08  Jakub Jelinek  <ja...@redhat.com>
>           Andrew Pinski  <quic_apin...@quicinc.com>
> 
>       PR tree-optimization/117927
>       * tree-pass.h (PROP_last_full_fold): Define.
>       * passes.def: Add last= parameters to pass_forwprop.
>       * tree-ssa-forwprop.cc (pass_forwprop): Add last_p non-static
>       data member and initialize it in the ctor.
>       (pass_forwprop::set_pass_param): New method.
>       (pass_forwprop::execute): Set PROP_last_full_fold in curr_properties
>       at the start if last_p.
>       * match.pd (a rrotate (32-b) -> a lrotate b): Only optimize either
>       if @2 is known not to be equal to prec or if during/after last
>       forwprop the subtraction has single use and prec is power of two; in
>       that case transform it into orotate by masked count.
> 
>       * gcc.dg/tree-ssa/pr117927.c: New test.
> 
> --- gcc/tree-pass.h.jj        2025-01-08 10:05:09.634203321 +0100
> +++ gcc/tree-pass.h   2025-01-08 11:22:56.356672128 +0100
> @@ -230,6 +230,7 @@ protected:
>  #define PROP_assumptions_done        (1 << 19)       /* Assume function kept
>                                                  around.  */
>  #define PROP_gimple_lbitint  (1 << 20)       /* lowered large _BitInt */
> +#define PROP_last_full_fold  (1 << 21)       /* Start of last forwprop.  */
>  
>  #define PROP_gimple \
>    (PROP_gimple_any | PROP_gimple_lcf | PROP_gimple_leh | PROP_gimple_lomp)
> --- gcc/passes.def.jj 2025-01-02 11:23:21.976440640 +0100
> +++ gcc/passes.def    2025-01-08 11:24:47.865096546 +0100
> @@ -80,7 +80,7 @@ along with GCC; see the file COPYING3.
>         NEXT_PASS (pass_ccp, false /* nonzero_p */);
>         /* After CCP we rewrite no longer addressed locals into SSA
>            form if possible.  */
> -       NEXT_PASS (pass_forwprop);
> +       NEXT_PASS (pass_forwprop, /*last=*/false);
>            NEXT_PASS (pass_early_thread_jumps, /*first=*/true);
>         NEXT_PASS (pass_sra_early);
>         /* pass_build_ealias is a dummy pass that ensures that we
> @@ -214,7 +214,7 @@ along with GCC; see the file COPYING3.
>        NEXT_PASS (pass_complete_unrolli);
>        NEXT_PASS (pass_backprop);
>        NEXT_PASS (pass_phiprop);
> -      NEXT_PASS (pass_forwprop);
> +      NEXT_PASS (pass_forwprop, /*last=*/false);
>        /* pass_build_alias is a dummy pass that ensures that we
>        execute TODO_rebuild_alias at this point.  */
>        NEXT_PASS (pass_build_alias);
> @@ -254,7 +254,7 @@ along with GCC; see the file COPYING3.
>        NEXT_PASS (pass_isolate_erroneous_paths);
>        NEXT_PASS (pass_reassoc, true /* early_p */);
>        NEXT_PASS (pass_dce);
> -      NEXT_PASS (pass_forwprop);
> +      NEXT_PASS (pass_forwprop, /*last=*/false);
>        NEXT_PASS (pass_phiopt, false /* early_p */);
>        NEXT_PASS (pass_ccp, true /* nonzero_p */);
>        /* After CCP we rewrite no longer addressed locals into SSA
> @@ -356,7 +356,7 @@ along with GCC; see the file COPYING3.
>        NEXT_PASS (pass_dce, true /* update_address_taken_p */, true /* 
> remove_unused_locals */);
>        /* After late DCE we rewrite no longer addressed locals into SSA
>        form if possible.  */
> -      NEXT_PASS (pass_forwprop);
> +      NEXT_PASS (pass_forwprop, /*last=*/true);
>        NEXT_PASS (pass_sink_code, true /* unsplit edges */);
>        NEXT_PASS (pass_phiopt, false /* early_p */);
>        NEXT_PASS (pass_fold_builtins);
> --- gcc/tree-ssa-forwprop.cc.jj       2025-01-03 17:55:01.067219567 +0100
> +++ gcc/tree-ssa-forwprop.cc  2025-01-08 11:46:05.935096203 +0100
> @@ -4107,14 +4107,22 @@ class pass_forwprop : public gimple_opt_
>  {
>  public:
>    pass_forwprop (gcc::context *ctxt)
> -    : gimple_opt_pass (pass_data_forwprop, ctxt)
> +    : gimple_opt_pass (pass_data_forwprop, ctxt), last_p (false)
>    {}
>  
>    /* opt_pass methods: */
>    opt_pass * clone () final override { return new pass_forwprop (m_ctxt); }
> +  void set_pass_param (unsigned int n, bool param) final override
> +    {
> +      gcc_assert (n == 0);
> +      last_p = param;
> +    }
>    bool gate (function *) final override { return flag_tree_forwprop; }
>    unsigned int execute (function *) final override;
>  
> + private:
> +  /* Determines whether the pass instance should set PROP_last_full_fold.  */
> +  bool last_p;
>  }; // class pass_forwprop
>  
>  unsigned int
> @@ -4123,6 +4131,8 @@ pass_forwprop::execute (function *fun)
>    unsigned int todoflags = 0;
>  
>    cfg_changed = false;
> +  if (last_p)
> +    fun->curr_properties |= PROP_last_full_fold;
>  
>    calculate_dominance_info (CDI_DOMINATORS);
>  
> --- gcc/match.pd.jj   2025-01-08 10:05:24.515994525 +0100
> +++ gcc/match.pd      2025-01-08 11:30:24.822335431 +0100
> @@ -4950,14 +4950,32 @@ (define_operator_list SYNC_FETCH_AND_AND
>                           build_int_cst (TREE_TYPE (@1),
>                                          element_precision (type)), @1); }))
>  
> -/* a rrotate (32-b) -> a lrotate b */
> -/* a lrotate (32-b) -> a rrotate b */
> +/* a rrotate (bitsize-b) -> a lrotate b */
> +/* a lrotate (bitsize-b) -> a rrotate b */
> +/* Only do the above when it is known that b != bitsize.
> +   Otherwise canonicalize to a orotate (b & mask) when the subtraction
> +   has single use and prec is a power of two.  */
>  (for rotate (lrotate rrotate)
>       orotate (rrotate lrotate)
>   (simplify
> -  (rotate @0 (minus INTEGER_CST@1 @2))
> -   (if (element_precision (TREE_TYPE (@0)) == wi::to_wide (@1))
> -     (orotate @0 @2))))
> +  (rotate @0 (minus@3 INTEGER_CST@1 @2))
> +  (with { auto prec = element_precision (TREE_TYPE (@0)); }
> +   (if (prec == wi::to_wide (@1))
> +    (switch
> +     (if (expr_not_equal_to (@2, wi::uhwi (prec,
> +                                        TYPE_PRECISION (TREE_TYPE (@2)))))
> +      (orotate @0 @2))
> +     (if (single_use (@3)
> +       && pow2p_hwi (prec)
> +       /* Defer this case until last forwprop, so that VRP could be run and
> +          expr_not_equal_to had a chance to match.  Otherwise we'd do
> +          pretty much always just the second case.  */
> +       && cfun
> +       && ((cfun->curr_properties & PROP_last_full_fold) != 0
> +           || !flag_tree_vrp
> +           || optimize_debug))
> +      (orotate @0
> +     (bit_and @2 { build_int_cst (TREE_TYPE (@2), prec - 1); }))))))))
>  
>  /* Turn (a OP c1) OP c2 into a OP (c1+c2).  */
>  (for op (lrotate rrotate rshift lshift)
> --- gcc/testsuite/gcc.dg/tree-ssa/pr117927.c.jj       2025-01-08 
> 11:22:04.773400985 +0100
> +++ gcc/testsuite/gcc.dg/tree-ssa/pr117927.c  2025-01-08 11:22:04.773400985 
> +0100
> @@ -0,0 +1,97 @@
> +/* PR tree-optimization/117927 */
> +/* { dg-do compile { target { int32 && longlong64 } } } */
> +/* { dg-options "-O2 -fdump-tree-optimized" } */
> +/* { dg-final { scan-tree-dump-times " r<< " 4 "optimized" } } */
> +/* { dg-final { scan-tree-dump-times " r>> " 4 "optimized" } } */
> +/* { dg-final { scan-tree-dump-times " & 31;" 2 "optimized" } } */
> +/* { dg-final { scan-tree-dump-times " & 63;" 2 "optimized" } } */
> +/* { dg-final { scan-tree-dump-not "32 - " "optimized" } } */
> +/* { dg-final { scan-tree-dump-not "64 - " "optimized" } } */
> +
> +static inline
> +unsigned lrotate32 (unsigned x, int t)
> +{
> +  unsigned tl = x << t;
> +  unsigned th = x >> (-t & 31);
> +  return tl | th;
> +}
> +
> +static inline
> +unsigned rrotate32 (unsigned x, int t)
> +{
> +  unsigned tl = x >> t;
> +  unsigned th = x << (-t & 31);
> +  return tl | th;
> +}
> +
> +static inline
> +unsigned long long lrotate64 (unsigned long long x, int t)
> +{
> +  unsigned long long tl = x << t;
> +  unsigned long long th = x >> (-t & 63);
> +  return tl | th;
> +}
> +
> +static inline
> +unsigned long long rrotate64 (unsigned long long x, int t)
> +{
> +  unsigned long long tl = x >> t;
> +  unsigned long long th = x << (-t & 63);
> +  return tl | th;
> +}
> +
> +unsigned
> +f1 (unsigned x, int t)
> +{
> +  return lrotate32 (x, 32 - t);
> +}
> +
> +unsigned long long
> +f2 (unsigned long long x, int t)
> +{
> +  return lrotate64 (x, 64 - t);
> +}
> +
> +unsigned
> +f3 (unsigned x, int t)
> +{
> +  if (t == 32)
> +    __builtin_unreachable ();
> +  return lrotate32 (x, 32 - t);
> +}
> +
> +unsigned long long
> +f4 (unsigned long long x, int t)
> +{
> +  if (t == 64)
> +    __builtin_unreachable ();
> +  return lrotate64 (x, 64 - t);
> +}
> +
> +unsigned
> +f5 (unsigned x, int t)
> +{
> +  return rrotate32 (x, 32 - t);
> +}
> +
> +unsigned long long
> +f6 (unsigned long long x, int t)
> +{
> +  return rrotate64 (x, 64 - t);
> +}
> +
> +unsigned
> +f7 (unsigned x, int t)
> +{
> +  if (t == 32)
> +    __builtin_unreachable ();
> +  return rrotate32 (x, 32 - t);
> +}
> +
> +unsigned long long
> +f8 (unsigned long long x, int t)
> +{
> +  if (t == 64)
> +    __builtin_unreachable ();
> +  return rrotate64 (x, 64 - t);
> +}
> 
> 
>       Jakub
> 
> 

-- 
Richard Biener <rguent...@suse.de>
SUSE Software Solutions Germany GmbH,
Frankenstrasse 146, 90461 Nuernberg, Germany;
GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG Nuernberg)

Reply via email to