On Wed, Jan 08, 2025 at 10:17:59AM +0100, Richard Biener wrote:
> > As mentioned in the PR, the a r<< (bitsize-b) to a r>> b and similar
> > match.pd optimization which has been introduced in GCC 15 can introduce
> > UB which wasn't there before, in particular if b is equal at runtime
> > to bitsize, then a r<< 0 is turned into a r>> bitsize.
> > 
> > The following patch fixes it by optimizing it early only if VRP
> > tells us the count isn't equal to the bitsize, and late into
> > a r>> (b & (bitsize - 1)) if bitsize is power of two and the subtraction
> > has single use, on various targets the masking then goes away because
> > its rotate instructions do masking already.  The latter can't be
> > done too early though, because then the expr_not_equal_to case is
> > basically useless and we introduce the masking always and can't find out
> > anymore that there was originally no masking.  Even cfun->after_inlining
> > check would be too early, there is forwprop before vrp, so the patch
> > introduces a new PROP for vrp2 finish, after which there is another
> > forwprop.  Or do you have ideas about what other condition could be used
> > for late matching only?
> > 
> > Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?
> 
> Maybe instead add PROP_last_full_fold and set that at the start of the
> last forwprop?  I think in the end we may want to have a special
> pre-expand (at ISEL time?) folding and have extra patterns enabled
> there (and some disabled eventually), but I think we don't want to
> introduce this now.
> 
> With the rest I agree.

So like this?  Works in quick testing, ok for trunk if it passes full
bootstrap/regtest?

2025-01-08  Jakub Jelinek  <ja...@redhat.com>
            Andrew Pinski  <quic_apin...@quicinc.com>

        PR tree-optimization/117927
        * tree-pass.h (PROP_last_full_fold): Define.
        * passes.def: Add last= parameters to pass_forwprop.
        * tree-ssa-forwprop.cc (pass_forwprop): Add last_p non-static
        data member and initialize it in the ctor.
        (pass_forwprop::set_pass_param): New method.
        (pass_forwprop::execute): Set PROP_last_full_fold in curr_properties
        at the start if last_p.
        * match.pd (a rrotate (32-b) -> a lrotate b): Only optimize either
        if @2 is known not to be equal to prec or if during/after last
        forwprop the subtraction has single use and prec is power of two; in
        that case transform it into orotate by masked count.

        * gcc.dg/tree-ssa/pr117927.c: New test.

--- gcc/tree-pass.h.jj  2025-01-08 10:05:09.634203321 +0100
+++ gcc/tree-pass.h     2025-01-08 11:22:56.356672128 +0100
@@ -230,6 +230,7 @@ protected:
 #define PROP_assumptions_done  (1 << 19)       /* Assume function kept
                                                   around.  */
 #define PROP_gimple_lbitint    (1 << 20)       /* lowered large _BitInt */
+#define PROP_last_full_fold    (1 << 21)       /* Start of last forwprop.  */
 
 #define PROP_gimple \
   (PROP_gimple_any | PROP_gimple_lcf | PROP_gimple_leh | PROP_gimple_lomp)
--- gcc/passes.def.jj   2025-01-02 11:23:21.976440640 +0100
+++ gcc/passes.def      2025-01-08 11:24:47.865096546 +0100
@@ -80,7 +80,7 @@ along with GCC; see the file COPYING3.
          NEXT_PASS (pass_ccp, false /* nonzero_p */);
          /* After CCP we rewrite no longer addressed locals into SSA
             form if possible.  */
-         NEXT_PASS (pass_forwprop);
+         NEXT_PASS (pass_forwprop, /*last=*/false);
           NEXT_PASS (pass_early_thread_jumps, /*first=*/true);
          NEXT_PASS (pass_sra_early);
          /* pass_build_ealias is a dummy pass that ensures that we
@@ -214,7 +214,7 @@ along with GCC; see the file COPYING3.
       NEXT_PASS (pass_complete_unrolli);
       NEXT_PASS (pass_backprop);
       NEXT_PASS (pass_phiprop);
-      NEXT_PASS (pass_forwprop);
+      NEXT_PASS (pass_forwprop, /*last=*/false);
       /* pass_build_alias is a dummy pass that ensures that we
         execute TODO_rebuild_alias at this point.  */
       NEXT_PASS (pass_build_alias);
@@ -254,7 +254,7 @@ along with GCC; see the file COPYING3.
       NEXT_PASS (pass_isolate_erroneous_paths);
       NEXT_PASS (pass_reassoc, true /* early_p */);
       NEXT_PASS (pass_dce);
-      NEXT_PASS (pass_forwprop);
+      NEXT_PASS (pass_forwprop, /*last=*/false);
       NEXT_PASS (pass_phiopt, false /* early_p */);
       NEXT_PASS (pass_ccp, true /* nonzero_p */);
       /* After CCP we rewrite no longer addressed locals into SSA
@@ -356,7 +356,7 @@ along with GCC; see the file COPYING3.
       NEXT_PASS (pass_dce, true /* update_address_taken_p */, true /* 
remove_unused_locals */);
       /* After late DCE we rewrite no longer addressed locals into SSA
         form if possible.  */
-      NEXT_PASS (pass_forwprop);
+      NEXT_PASS (pass_forwprop, /*last=*/true);
       NEXT_PASS (pass_sink_code, true /* unsplit edges */);
       NEXT_PASS (pass_phiopt, false /* early_p */);
       NEXT_PASS (pass_fold_builtins);
--- gcc/tree-ssa-forwprop.cc.jj 2025-01-03 17:55:01.067219567 +0100
+++ gcc/tree-ssa-forwprop.cc    2025-01-08 11:46:05.935096203 +0100
@@ -4107,14 +4107,22 @@ class pass_forwprop : public gimple_opt_
 {
 public:
   pass_forwprop (gcc::context *ctxt)
-    : gimple_opt_pass (pass_data_forwprop, ctxt)
+    : gimple_opt_pass (pass_data_forwprop, ctxt), last_p (false)
   {}
 
   /* opt_pass methods: */
   opt_pass * clone () final override { return new pass_forwprop (m_ctxt); }
+  void set_pass_param (unsigned int n, bool param) final override
+    {
+      gcc_assert (n == 0);
+      last_p = param;
+    }
   bool gate (function *) final override { return flag_tree_forwprop; }
   unsigned int execute (function *) final override;
 
+ private:
+  /* Determines whether the pass instance should set PROP_last_full_fold.  */
+  bool last_p;
 }; // class pass_forwprop
 
 unsigned int
@@ -4123,6 +4131,8 @@ pass_forwprop::execute (function *fun)
   unsigned int todoflags = 0;
 
   cfg_changed = false;
+  if (last_p)
+    fun->curr_properties |= PROP_last_full_fold;
 
   calculate_dominance_info (CDI_DOMINATORS);
 
--- gcc/match.pd.jj     2025-01-08 10:05:24.515994525 +0100
+++ gcc/match.pd        2025-01-08 11:30:24.822335431 +0100
@@ -4950,14 +4950,32 @@ (define_operator_list SYNC_FETCH_AND_AND
                            build_int_cst (TREE_TYPE (@1),
                                           element_precision (type)), @1); }))
 
-/* a rrotate (32-b) -> a lrotate b */
-/* a lrotate (32-b) -> a rrotate b */
+/* a rrotate (bitsize-b) -> a lrotate b */
+/* a lrotate (bitsize-b) -> a rrotate b */
+/* Only do the above when it is known that b != bitsize.
+   Otherwise canonicalize to a orotate (b & mask) when the subtraction
+   has single use and prec is a power of two.  */
 (for rotate (lrotate rrotate)
      orotate (rrotate lrotate)
  (simplify
-  (rotate @0 (minus INTEGER_CST@1 @2))
-   (if (element_precision (TREE_TYPE (@0)) == wi::to_wide (@1))
-     (orotate @0 @2))))
+  (rotate @0 (minus@3 INTEGER_CST@1 @2))
+  (with { auto prec = element_precision (TREE_TYPE (@0)); }
+   (if (prec == wi::to_wide (@1))
+    (switch
+     (if (expr_not_equal_to (@2, wi::uhwi (prec,
+                                          TYPE_PRECISION (TREE_TYPE (@2)))))
+      (orotate @0 @2))
+     (if (single_use (@3)
+         && pow2p_hwi (prec)
+         /* Defer this case until last forwprop, so that VRP could be run and
+            expr_not_equal_to had a chance to match.  Otherwise we'd do
+            pretty much always just the second case.  */
+         && cfun
+         && ((cfun->curr_properties & PROP_last_full_fold) != 0
+             || !flag_tree_vrp
+             || optimize_debug))
+      (orotate @0
+       (bit_and @2 { build_int_cst (TREE_TYPE (@2), prec - 1); }))))))))
 
 /* Turn (a OP c1) OP c2 into a OP (c1+c2).  */
 (for op (lrotate rrotate rshift lshift)
--- gcc/testsuite/gcc.dg/tree-ssa/pr117927.c.jj 2025-01-08 11:22:04.773400985 
+0100
+++ gcc/testsuite/gcc.dg/tree-ssa/pr117927.c    2025-01-08 11:22:04.773400985 
+0100
@@ -0,0 +1,97 @@
+/* PR tree-optimization/117927 */
+/* { dg-do compile { target { int32 && longlong64 } } } */
+/* { dg-options "-O2 -fdump-tree-optimized" } */
+/* { dg-final { scan-tree-dump-times " r<< " 4 "optimized" } } */
+/* { dg-final { scan-tree-dump-times " r>> " 4 "optimized" } } */
+/* { dg-final { scan-tree-dump-times " & 31;" 2 "optimized" } } */
+/* { dg-final { scan-tree-dump-times " & 63;" 2 "optimized" } } */
+/* { dg-final { scan-tree-dump-not "32 - " "optimized" } } */
+/* { dg-final { scan-tree-dump-not "64 - " "optimized" } } */
+
+static inline
+unsigned lrotate32 (unsigned x, int t)
+{
+  unsigned tl = x << t;
+  unsigned th = x >> (-t & 31);
+  return tl | th;
+}
+
+static inline
+unsigned rrotate32 (unsigned x, int t)
+{
+  unsigned tl = x >> t;
+  unsigned th = x << (-t & 31);
+  return tl | th;
+}
+
+static inline
+unsigned long long lrotate64 (unsigned long long x, int t)
+{
+  unsigned long long tl = x << t;
+  unsigned long long th = x >> (-t & 63);
+  return tl | th;
+}
+
+static inline
+unsigned long long rrotate64 (unsigned long long x, int t)
+{
+  unsigned long long tl = x >> t;
+  unsigned long long th = x << (-t & 63);
+  return tl | th;
+}
+
+unsigned
+f1 (unsigned x, int t)
+{
+  return lrotate32 (x, 32 - t);
+}
+
+unsigned long long
+f2 (unsigned long long x, int t)
+{
+  return lrotate64 (x, 64 - t);
+}
+
+unsigned
+f3 (unsigned x, int t)
+{
+  if (t == 32)
+    __builtin_unreachable ();
+  return lrotate32 (x, 32 - t);
+}
+
+unsigned long long
+f4 (unsigned long long x, int t)
+{
+  if (t == 64)
+    __builtin_unreachable ();
+  return lrotate64 (x, 64 - t);
+}
+
+unsigned
+f5 (unsigned x, int t)
+{
+  return rrotate32 (x, 32 - t);
+}
+
+unsigned long long
+f6 (unsigned long long x, int t)
+{
+  return rrotate64 (x, 64 - t);
+}
+
+unsigned
+f7 (unsigned x, int t)
+{
+  if (t == 32)
+    __builtin_unreachable ();
+  return rrotate32 (x, 32 - t);
+}
+
+unsigned long long
+f8 (unsigned long long x, int t)
+{
+  if (t == 64)
+    __builtin_unreachable ();
+  return rrotate64 (x, 64 - t);
+}


        Jakub

Reply via email to