https://gcc.gnu.org/g:4857769045e622d96029afc2bb7a003060697f50

commit 4857769045e622d96029afc2bb7a003060697f50
Author: Jeff Law <j...@ventanamicro.com>
Date:   Fri Sep 19 06:54:01 2025 -0600

    [RISC-V] Optimize clear-lowest-set-bit sequence when ctz is nearby
    
    Daniel Barboza and I were looking at deepsjeng recently and spotted an 
oddity.
    In particular we had a sequence like this:
    
        addi a1,a0,-1
        and a1,a0,a1
    
    That idiom clears the lowest set bit in a0.  In the initial code we looked 
at
    there was a subsequent ctz dest,a0.  In other cases I could see an earlier 
ctz.
    And when I went digging further I even found the ctz between the addi/and.
    
    The key is the ctz uses the same input as the addi, but a different output. 
 So
    the ctz doesn't participate in combine with the addi/and.
    
    Tackling this in gimple isn't really feasible as the resultant code is 
going to
have a higher expression count.  The bclr idiom looks like (and (rotate (-2)
    (count)) (input)) where the count comes from a ctz.  So to make it work from 
a
    costing standpoint, we'd have to go find the ctz and use it.
    
    Tackling in gimple->rtl expansion looked potentially feasible, but sign and
    general type changes make that painful.  So while I could see the add+and 
and
    could speculatively hope there was a nearby ctz it looked less than ideal 
from
    an implementation standpoint.  It also tickled some undesirable behavior in 
the
    combiner.
    
    Tackling in combine isn't possible because the ctz doesn't feed the 
addi/and or
    vice-versa.
    
    Tackling with a peephole works sometimes, but is easily thwarted by the
    scheduler putting random code in the middle of the sequences we want to 
adjust.
    
    So in the end I just made a new RISC-V pass that runs after combine.
    
    It walks each block looking for the addi+and construct.  When found it then
    looks a few instructions in either direction for a related ctz. When found
    it'll move things around as necessary and use ctz+bclr.  It depends on DCE 
to
    eliminate the almost certainly dead addi+and insns.
    
    I haven't benchmarked this one on design, but it's definitely picking up the
    opportunities in deepsjeng and saves a few billion instructions.  It has
    survived a riscv64-elf and riscv32-elf build.  Bootstrap on the BPI is in
    flight, bootstrap on the Pioneer passes, but this code won't trigger this
    because the Pioneer doesn't have ctz or bclr instructions.
    
    Note while I would expect some of this pass to be usable on other targets, 
it
    does make some assumptions about RTL structure that hold on RISC-V.  For
    example subword arithmetic has an explicit sign extension to word mode, bit
    position is QImode, etc.
    
    Comments?  Other thoughts on how to resolve?
    
    gcc/
        * config.gcc (riscv*): Add riscv-bclr-lowest-set-bit.o to 
extra_objs.
            * config/riscv/riscv-bclr-lowest-set-bit.cc: New file.
            * config/riscv/riscv-passes.def: Add new pass after combine.
            * config/riscv/riscv-protos.h (make_pass_bclr_lowest_set_bit): Add
            prototype.
            * config/riscv/t-riscv: Add rules to build 
riscv-bclr-lowest-set-bit.o.
    
    gcc/testsuite
            * gcc.target/riscv/bclr-lowest-set-bit-1.c: New test.
    
    (cherry picked from commit 4f01f39d021435bc18d3ce3b0fa1e576c8457606)

Diff:
---
 gcc/config.gcc                                     |   2 +-
 gcc/config/riscv/riscv-bclr-lowest-set-bit.cc      | 306 +++++++++++++++++++++
 gcc/config/riscv/riscv-passes.def                  |   1 +
 gcc/config/riscv/riscv-protos.h                    |   1 +
 gcc/config/riscv/t-riscv                           |   5 +
 .../gcc.target/riscv/bclr-lowest-set-bit-1.c       | 128 +++++++++
 6 files changed, 442 insertions(+), 1 deletion(-)

diff --git a/gcc/config.gcc b/gcc/config.gcc
index 58fa404d2ebe..3b48b9851126 100644
--- a/gcc/config.gcc
+++ b/gcc/config.gcc
@@ -552,7 +552,7 @@ riscv*)
        extra_objs="riscv-builtins.o riscv-c.o riscv-sr.o 
riscv-shorten-memrefs.o riscv-selftests.o riscv-string.o"
        extra_objs="${extra_objs} riscv-v.o riscv-vsetvl.o riscv-vector-costs.o 
riscv-avlprop.o riscv-vect-permconst.o"
        extra_objs="${extra_objs} riscv-vector-builtins.o 
riscv-vector-builtins-shapes.o riscv-vector-builtins-bases.o 
sifive-vector-builtins-bases.o andes-vector-builtins-bases.o"
-       extra_objs="${extra_objs} thead.o riscv-target-attr.o riscv-zicfilp.o"
+       extra_objs="${extra_objs} thead.o riscv-target-attr.o riscv-zicfilp.o 
riscv-bclr-lowest-set-bit.o"
        d_target_objs="riscv-d.o"
        extra_headers="riscv_vector.h riscv_crypto.h riscv_bitmanip.h 
riscv_th_vector.h sifive_vector.h andes_vector.h"
        target_gtfiles="$target_gtfiles 
\$(srcdir)/config/riscv/riscv-vector-builtins.cc"
diff --git a/gcc/config/riscv/riscv-bclr-lowest-set-bit.cc 
b/gcc/config/riscv/riscv-bclr-lowest-set-bit.cc
new file mode 100644
index 000000000000..6bc4ea562415
--- /dev/null
+++ b/gcc/config/riscv/riscv-bclr-lowest-set-bit.cc
@@ -0,0 +1,306 @@
+/* Convert clear lowest bit set idiom into a more efficient
+   bclr sequence when possible.
+
+   Copyright (C) 2018-2025 Free Software Foundation, Inc.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3, or (at your option)
+any later version.
+
+GCC is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with GCC; see the file COPYING3.  If not see
+<http://www.gnu.org/licenses/>.  */
+
+#define IN_TARGET_CODE 1
+
+#include "config.h"
+#include "system.h"
+#include "coretypes.h"
+#include "tm.h"
+#include "rtl.h"
+#include "backend.h"
+#include "df.h"
+#include "tree-pass.h"
+
+/* x & (x - 1) clears the lowest set bit in x.  If we have a ctz (x) nearby,
+   then we can use a bclr with the bit position defined by the output of
+   the ctz.  */
+
+namespace {
+
+const pass_data pass_data_bclr_lowest_set_bit =
+{
+  RTL_PASS, /* type */
+  "bclr_lowest_set_bit", /* name */
+  OPTGROUP_NONE, /* optinfo_flags */
+  TV_NONE, /* tv_id */
+  0, /* properties_required */
+  0, /* properties_provided */
+  0, /* properties_destroyed */
+  0, /* todo_flags_start */
+  0, /* todo_flags_finish */
+};
+
+class pass_bclr_lowest_set_bit : public rtl_opt_pass
+{
+public:
+  pass_bclr_lowest_set_bit (gcc::context *ctxt)
+    : rtl_opt_pass (pass_data_bclr_lowest_set_bit, ctxt)
+  {}
+
+  /* opt_pass methods: */
+  virtual bool gate (function *)
+    {
+      /* This uses ctz and bclr, so we need ZBB and ZBS
+        instructions.  */
+      return TARGET_ZBB && TARGET_ZBS && optimize > 0;
+    }
+  virtual unsigned int execute (function *);
+
+private:
+}; // class pass_bclr_lowest_set_bit
+
+/* Look at real insns before START to see if any of them compute ctz (src).
+   If so, return the insn that is a ctz, otherwise return NULL.  Look no
+   further than LIMIT real statements and do not leave this basic block.  */
+
+rtx_insn *
+find_prior_ctz (rtx_insn *start, rtx src, int limit)
+{
+  rtx_insn *prev = start;
+  while (prev && limit > 0)
+    {
+      prev = prev_nonnote_nondebug_insn_bb (prev);
+      limit--;
+
+      if (prev)
+       {
+         rtx set = single_set (prev);
+         if (!set)
+           continue;
+
+         /* A ctz of a SI object on rv64 will have an
+            SUBREG argument.  */
+         if (GET_CODE (SET_SRC (set)) == CTZ
+             && (XEXP (SET_SRC (set), 0) == src
+                 || (SUBREG_P (XEXP (SET_SRC (set), 0))
+                     && SUBREG_REG (XEXP (SET_SRC (set), 0)) == src)))
+           {
+             /* We've found a CTZ, make sure neither the input nor
+                the output change between the CTZ and START.  */
+             if (reg_set_between_p (src, prev, start)
+                 || reg_set_between_p (SET_DEST (set), prev, start))
+               return NULL;
+
+             /* Everything looks good.  */
+             return prev;
+           }
+       }
+    }
+  return NULL;
+}
+
+/* Look at real insns after START to see if any of them compute ctz (src).
+   If so, return the insn that is a ctz, otherwise return NULL.  Look no
+   further than LIMIT real statements and do not leave this basic block.  */
+
+rtx_insn *
+find_later_ctz (rtx_insn *start, rtx src, int limit)
+{
+  rtx_insn *next = start;
+  while (next && limit > 0)
+    {
+      next = next_nonnote_nondebug_insn_bb (next);
+      limit--;
+
+      if (next)
+       {
+         rtx set = single_set (next);
+         if (!set)
+           continue;
+
+         /* A ctz of a SI object on rv64 will have an
+            SUBREG argument.  */
+         if (GET_CODE (SET_SRC (set)) == CTZ
+             && (XEXP (SET_SRC (set), 0) == src
+                 || (SUBREG_P (XEXP (SET_SRC (set), 0))
+                     && SUBREG_REG (XEXP (SET_SRC (set), 0)) == src)))
+           {
+             /* We've found a CTZ.  The CTZ is going to be moved, so
+                we need to verify its input doesn't change between
+                START and NEXT.  We also have to verify that its
+                destination is unused between those points.  */
+             if (reg_set_between_p (XEXP (SET_SRC (set), 0), start, next)
+                 || reg_used_between_p (SET_DEST (set), start, next))
+               return NULL;
+
+             return next;
+           }
+       }
+    }
+  return NULL;
+}
+
+/* So the basic idea here is to find x & (x - 1) idioms which clear the
+   lowest set bit.  If there is a nearby ctz (x), then we can profitably
+   use a ctz+bclr sequence instead, essentially replacing the addi+and
+   with bclr.  This should often be more efficient and save space.
+
+   We don't do this in gimple because the cost model would reject it, as
+   the optimized form appears more expensive from its costing model.
+
+   Combine won't work as there's no data dependency between the
+   ctz and the x & (x - 1) idiom.
+
+   Peepholing doesn't work consistently because we're constantly
+   inserting unrelated instructions between the two components of
+   the x & (x - 1) idiom.  We'd have to match the ctz in various
+   positions as well as deal with random insns the scheduler puts
+   in the middle of the key instructions.
+
+   So, this mini pass to optimize this scenario.  */
+
+unsigned int
+pass_bclr_lowest_set_bit::execute (function *fn)
+{
+  basic_block bb;
+
+  /* Scan all the blocks, once.  */
+  FOR_ALL_BB_FN (bb, fn)
+    {
+      rtx_insn *insn;
+      /* Scan all the insns once.  */
+      FOR_BB_INSNS (bb, insn)
+       {
+         /* Ignore as much as we can.  */
+         if (!NONDEBUG_INSN_P (insn)
+             || JUMP_P (insn)
+             || CALL_P (insn))
+           continue;
+
+         rtx dec_set = single_set (insn);
+         if (!dec_set)
+           continue;
+
+         rtx dec_src = SET_SRC (dec_set);
+         rtx dec_dest = SET_DEST (dec_set);
+
+         /* For a 32 bit object on rv64, the decrement will
+            be wrapped by a SIGN_EXTEND.  Strip it.  */
+         if (GET_CODE (dec_src) == SIGN_EXTEND)
+           dec_src = XEXP (dec_src, 0);
+
+         /* Verify it's res = x - 1, if not proceed to the next insn.  */
+         if (!dec_set
+             || !REG_P (dec_dest)
+             || GET_CODE (dec_src) != PLUS
+             || (XEXP (dec_src, 1) != CONSTM1_RTX (GET_MODE (dec_src))))
+           continue;
+
+         /* Get the value being decremented.  Note it might be
+            wrapped by a SUBREG which we strip.  */
+         dec_src = XEXP (dec_src, 0);
+         if (SUBREG_P (dec_src))
+           dec_src = SUBREG_REG (dec_src);
+
+         /* So we've found dest = src - 1;  Now look at the next
+            real insn and see if it's dest2 = (dest & src).  */
+         rtx_insn *next = next_nonnote_nondebug_insn_bb (insn);
+         if (!next)
+           continue;
+
+         rtx and_set = single_set (next);
+
+         if (!and_set)
+           continue;
+
+         rtx and_src = SET_SRC (and_set);
+         rtx and_dest = SET_DEST (and_set);
+         if (!and_set
+             || !REG_P (and_dest)
+             || GET_CODE (and_src) != AND)
+           continue;
+
+         rtx and_op0 = XEXP (and_src, 0);
+         rtx and_op1 = XEXP (and_src, 1);
+
+         if (dec_dest != and_op0 && dec_dest != and_op1)
+           continue;
+
+         if (dec_src != and_op0 && dec_src != and_op1)
+           continue;
+
+         /* We've found x & (x - 1).  Now look for a suitable ctz nearby.  */
+
+         rtx_insn *prior_ctz = find_prior_ctz (insn, dec_src, 10);
+         if (prior_ctz)
+           {
+             rtx prior_ctz_output = SET_DEST (single_set (prior_ctz));
+
+             /* Create a pattern for the variable bit clear idiom.  */
+             rtx pat = gen_rtx_ROTATE (GET_MODE (dec_dest),
+                                       GEN_INT (-2),
+                                       gen_lowpart (QImode, prior_ctz_output));
+             pat = gen_rtx_AND (GET_MODE (dec_dest), pat, dec_src);
+
+             /* Slam that pattern in as the SET_SRC of the original AND.  */
+             SET_SRC (and_set) = pat;
+             INSN_CODE (next) = -1;
+             df_insn_rescan (next);
+
+             /* Start next loop iteration.  */
+             continue;
+           }
+
+         /* Typically in cases where we can optimize, we'll find a REG->REG
+            copy into DEC_SRC immediately before INSN.  Look for it.  */
+         rtx_insn *prev = prev_nonnote_nondebug_insn_bb (insn);
+         rtx copy = NULL_RTX;
+         if (prev
+             && (copy = single_set (prev)) != NULL_RTX
+             && SET_SRC (copy) == dec_src)
+           dec_src = SET_DEST (copy);
+
+         /* We didn't find a CTZ before INSN.  So look after NEXT.
+            This case is more complex as we have to move insns around.  */
+         rtx_insn *later_ctz = find_later_ctz (next, dec_src, 10);
+         if (later_ctz)
+           {
+             /* Remove the CTZ from the stream and reemit it immediately
+                after NEXT.  XXX FIXME.  Need to prove this is safe.  */
+             df_insn_delete (later_ctz);
+             remove_insn (later_ctz);
+             SET_PREV_INSN (later_ctz) = NULL;
+             SET_NEXT_INSN (later_ctz) = NULL;
+             df_insn_rescan (emit_insn_after (PATTERN (later_ctz), next));
+
+             /* Now construct the bclr insn and add it to the stream.  */
+             rtx later_ctz_output = SET_DEST (single_set (later_ctz));
+             rtx pat = gen_rtx_ROTATE (GET_MODE (dec_dest),
+                                       GEN_INT (-2),
+                                       gen_lowpart (QImode, later_ctz_output));
+             pat = gen_rtx_AND (GET_MODE (dec_dest), pat, dec_src);
+             pat = gen_rtx_SET (and_dest, pat);
+             df_insn_rescan (emit_insn_after (pat, NEXT_INSN (next)));
+           }
+       }
+    }
+
+  return 0;
+}
+
+} // anon namespace
+
+rtl_opt_pass *
+make_pass_bclr_lowest_set_bit (gcc::context *ctxt)
+{
+  return new pass_bclr_lowest_set_bit (ctxt);
+}
diff --git a/gcc/config/riscv/riscv-passes.def 
b/gcc/config/riscv/riscv-passes.def
index bc803c4678ed..5aa41228e1fe 100644
--- a/gcc/config/riscv/riscv-passes.def
+++ b/gcc/config/riscv/riscv-passes.def
@@ -17,6 +17,7 @@
    along with GCC; see the file COPYING3.  If not see
    <http://www.gnu.org/licenses/>.  */
 
+INSERT_PASS_AFTER (pass_combine, 1, pass_bclr_lowest_set_bit);
 INSERT_PASS_AFTER (pass_rtl_store_motion, 1, pass_shorten_memrefs);
 INSERT_PASS_AFTER (pass_split_all_insns, 1, pass_avlprop);
 INSERT_PASS_BEFORE (pass_fast_rtl_dce, 1, pass_vsetvl);
diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h
index 46b256d63539..2d3fd0e07626 100644
--- a/gcc/config/riscv/riscv-protos.h
+++ b/gcc/config/riscv/riscv-protos.h
@@ -207,6 +207,7 @@ rtl_opt_pass * make_pass_avlprop (gcc::context *ctxt);
 rtl_opt_pass * make_pass_vsetvl (gcc::context *ctxt);
 rtl_opt_pass * make_pass_insert_landing_pad (gcc::context *ctxt);
 rtl_opt_pass * make_pass_vector_permconst (gcc::context *ctxt);
+rtl_opt_pass * make_pass_bclr_lowest_set_bit (gcc::context *ctxt);
 
 
 /* Routines implemented in riscv-string.c.  */
diff --git a/gcc/config/riscv/t-riscv b/gcc/config/riscv/t-riscv
index d86dc7443658..b53a2dff2cf7 100644
--- a/gcc/config/riscv/t-riscv
+++ b/gcc/config/riscv/t-riscv
@@ -131,6 +131,11 @@ riscv-d.o: $(srcdir)/config/riscv/riscv-d.cc \
        $(COMPILE) $<
        $(POSTCOMPILE)
 
+riscv-bclr-lowest-set-bit.o: 
$(srcdir)/config/riscv/riscv-bclr-lowest-set-bit.cc \
+  $(CONFIG_H) $(SYSTEM_H) coretypes.h $(TM_H) $(TARGET_H)
+       $(COMPILE) $<
+       $(POSTCOMPILE)
+
 riscv-shorten-memrefs.o: $(srcdir)/config/riscv/riscv-shorten-memrefs.cc \
   $(CONFIG_H) $(SYSTEM_H) coretypes.h $(TM_H) $(TARGET_H)
        $(COMPILE) $<
diff --git a/gcc/testsuite/gcc.target/riscv/bclr-lowest-set-bit-1.c 
b/gcc/testsuite/gcc.target/riscv/bclr-lowest-set-bit-1.c
new file mode 100644
index 000000000000..fa201cec3590
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/bclr-lowest-set-bit-1.c
@@ -0,0 +1,128 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcb -mabi=lp64d" { target { rv64} } } */
+/* { dg-options "-march=rv32gcb -mabi=ilp32" { target { rv32} } } */
+/* { dg-skip-if "" { *-*-* } { "-O0" "-Og" } } */
+
+int foo1_res1;
+int foo1_res2;
+void foo1(unsigned int x)
+{
+  unsigned int tz = __builtin_ctz (x);
+  unsigned int t = x - 1UL;
+  foo1_res1 = t & x;
+  foo1_res2 = tz;
+}
+
+int foo2_res1;
+int foo2_res2;
+void foo2(unsigned int x)
+{
+  unsigned int t = x - 1UL;
+  unsigned int tz = __builtin_ctz (x);
+  foo2_res1 = t & x;
+  foo2_res2 = tz;
+}
+
+int foo3_res1;
+int foo3_res2;
+void foo3(unsigned int x)
+{
+  unsigned int t = x - 1UL;
+  foo3_res1 = t & x;
+  unsigned int tz = __builtin_ctz (x);
+  foo3_res2 = tz;
+}
+
+unsigned int foo4_res1;
+unsigned int foo4_res2;
+void foo4(unsigned int x)
+{
+  unsigned int tz = __builtin_ctz (x);
+  unsigned int t = x - 1UL;
+  foo4_res1 = t & x;
+  foo4_res2 = tz;
+}
+
+unsigned int foo5_res1;
+unsigned int foo5_res2;
+void foo5(unsigned int x)
+{
+  unsigned int t = x - 1UL;
+  unsigned int tz = __builtin_ctz (x);
+  foo5_res1 = t & x;
+  foo5_res2 = tz;
+}
+
+unsigned int foo6_res1;
+unsigned int foo6_res2;
+void foo6(unsigned int x)
+{
+  unsigned int t = x - 1UL;
+  foo6_res1 = t & x;
+  unsigned int tz = __builtin_ctzl (x);
+  foo6_res2 = tz;
+}
+
+long foo7_res1;
+long foo7_res2;
+void foo7(unsigned long x)
+{
+  unsigned long tz = __builtin_ctzl (x);
+  unsigned long t = x - 1UL;
+  foo7_res1 = t & x;
+  foo7_res2 = tz;
+}
+
+long foo8_res1;
+long foo8_res2;
+void foo8(unsigned long x)
+{
+  unsigned long t = x - 1UL;
+  unsigned long tz = __builtin_ctzl (x);
+  foo8_res1 = t & x;
+  foo8_res2 = tz;
+}
+
+long foo9_res1;
+long foo9_res2;
+void foo9(unsigned long x)
+{
+  unsigned long t = x - 1UL;
+  foo9_res1 = t & x;
+  unsigned long tz = __builtin_ctzl (x);
+  foo9_res2 = tz;
+}
+
+unsigned long foo10_res1;
+unsigned long foo10_res2;
+void foo10(unsigned long x)
+{
+  unsigned long tz = __builtin_ctzl (x);
+  unsigned long t = x - 1UL;
+  foo10_res1 = t & x;
+  foo10_res2 = tz;
+}
+
+unsigned long foo11_res1;
+unsigned long foo11_res2;
+void foo11(unsigned long x)
+{
+  unsigned long t = x - 1UL;
+  unsigned long tz = __builtin_ctzl (x);
+  foo11_res1 = t & x;
+  foo11_res2 = tz;
+}
+
+unsigned long foo12_res1;
+unsigned long foo12_res2;
+void foo12(unsigned long x)
+{
+  unsigned long t = x - 1UL;
+  foo12_res1 = t & x;
+  unsigned long tz = __builtin_ctzl (x);
+  foo12_res2 = tz;
+}
+
+/* { dg-final { scan-assembler-not "\\sand\\s" } } */
+/* { dg-final { scan-assembler-times "\\sbclr\\s" 12 } } */
+

Reply via email to