[gcc r15-8245] [RISC-V] Fix another unreported code quality regression

Jeff Law via Gcc-cvs Mon, 17 Mar 2025 21:01:13 -0700

https://gcc.gnu.org/g:e9888795b8bafe37dc65bd638de0533b842c960a


commit r15-8245-ge9888795b8bafe37dc65bd638de0533b842c960a
Author: Jeff Law <j...@ventanamicro.com>
Date:   Mon Mar 17 21:58:03 2025 -0600

    [RISC-V] Fix another unreported code quality regression
    
    So here's the other case I was just looking at.  This is a slightly modified
    version of some code from 500.perlbench which shows another nop logical
    operation:
    
    > void frob (void);
    > typedef struct av AV;
    > typedef unsigned int U32;
    > struct av
    > {
    >   void *dummy;
    >   U32 sv_refcnt;
    >   U32 sv_flags;
    > };
    > void
    > Perl_save_ary (AV *const oav)
    > {
    >   AV *av;
    >   unsigned int x1 = oav->sv_flags;
    >   unsigned int x2 = x1 & 3221225472;
    >   if (x2 == 2147483648)
    >     frob ();
    > }
    
    https://godbolt.org/z/941vqfGE6
    
    It's not as obvious, but this is probably a regression as well.  I would 
expect
    the gcc-14 code to execute in 1c faster than the current trunk code on a
    superscalar design:
    
    gcc-14:                               trunk:
            lw      a5,12(a0)                   lw      a5,12(a0)
            li      a3,-1073741824              li      a3,-2
            li      a4,-2147483648
    
            and     a5,a5,a3                    srai    a4,a5,30
    
            beq     a5,a4,.L4                   andi    a4,a4,-1
    
                                                beq     a4,a3,.L4
    
    Essentially the "li" instrutions can execute in parallel with the lw. But 
the
    rest of the sequence has data dependencies forcing the instructions to 
execute
    serially.  Thus that extra andi extends the critical path by 1c.
    
    Removing the useless andi should make the two sequences perform the same and
    reduces the codesize.
    
    Much like the prior case we walk backwards using -fdump-rtl-all -dp to find 
the
    andi:
    
            andi    a4,a4,-1        # 26    [c=4 l=4]  *anddi3/1
    
    The UID is 26.  And just like the prior case it first shows up in the 
.split2
    dump:
    
    grep insn\ 26 j.c.*
    j.c.326r.split2:(insn 26 25 27 2 (set (reg:DI 14 a4 [144])
    j.c.327r.ree:(insn 26 25 27 2 (set (reg:DI 14 a4 [144])
    j.c.329r.pro_and_epilogue:(insn 26 25 27 2 (set (reg:DI 14 a4 [144])
    j.c.330r.dse2:(insn 26 25 27 2 (set (reg:DI 14 a4 [144])
    
    In the .split2 dump:
    
    Splitting with gen_split_77 (riscv.md:3184)
    scanning new insn with uid = 25.
    scanning new insn with uid = 26.
    scanning new insn with uid = 27.
    scanning new insn with uid = 28.
    deleting insn with uid = 12.
    deleting insn with uid = 12.
    
    So insn 12 is where we want to look.
    
    > (jump_insn 12 6 13 2 (parallel [
    >             (set (pc)
    >                 (if_then_else (ne (and:DI (reg:DI 15 a5 [orig:138 
oav_3(D)->sv_flags ] [138])
    >                             (const_int -1073741824 [0xffffffffc0000000]))
    >                         (const_int -2147483648 [0xffffffff80000000]))
    >                     (label_ref:DI 18)
    >                     (pc)))
    >             (clobber (reg:DI 14 a4 [144]))
    >             (clobber (reg:DI 13 a3 [145]))
    >         ]) "j.c":16:6 361 {*branchdi_shiftedarith_ne_shifted}
    >      (int_list:REG_BR_PROB 856416484 (nil))
    >  -> 18)
    
    So that's a conditional branch with the condition
    
    (a5 & 0xffffffffc0000000) != 0xffffffff80000000
    
    Note how those instructions have many low bits as zeros and that the 
constants
    likely require some kind of constant synthesis.  We can conceptually do an
    arithmetic right shift of a5 and both constants and get the same result, 
likely
    making the constants easier to synthesize.
    
    And that's precisely what this pattern is designed to do:
    
    > (define_insn_and_split "*branch<ANYI:mode>_shiftedarith_<optab>_shifted"
    >   [(set (pc)
    >         (if_then_else (any_eq
    >                     (and:ANYI (match_operand:ANYI 1 "register_operand" 
"r")
    >                           (match_operand 2 "shifted_const_arith_operand" 
"i"))
    >                     (match_operand 3 "shifted_const_arith_operand" "i"))
    >          (label_ref (match_operand 0 "" ""))
    >          (pc)))
    >    (clobber (match_scratch:X 4 "=&r"))
    >    (clobber (match_scratch:X 5 "=&r"))]
    >   "!SMALL_OPERAND (INTVAL (operands[2]))
    >     && !SMALL_OPERAND (INTVAL (operands[3]))
    >     && SMALL_AFTER_COMMON_TRAILING_SHIFT (INTVAL (operands[2]),
    >                                              INTVAL (operands[3]))"
    >   "#"
    >   "&& reload_completed"
    >   [(set (match_dup 4) (ashiftrt:X (match_dup 1) (match_dup 7)))
    >    (set (match_dup 4) (and:X (match_dup 4) (match_dup 8)))
    >    (set (match_dup 5) (match_dup 9))
    >    (set (pc) (if_then_else (any_eq (match_dup 4) (match_dup 5))
    >                            (label_ref (match_dup 0)) (pc)))]
    > {
    >   HOST_WIDE_INT mask1 = INTVAL (operands[2]);
    >   HOST_WIDE_INT mask2 = INTVAL (operands[3]);
    >   int trailing_shift = COMMON_TRAILING_ZEROS (mask1, mask2);
    >
    >   operands[7] = GEN_INT (trailing_shift);
    >   operands[8] = GEN_INT (mask1 >> trailing_shift);
    >   operands[9] = GEN_INT (mask2 >> trailing_shift);
    > }
    It finds the number of low bits in both that must be zero.  In this case 
it's
    30 bits.  So it shifts the register right by 30 bits.  Then constructs the 
two
    new constants, one of which is -1 after shifting. And we emit (set 
(match_dup
    4) (and (match_dup 4) (const_int -1))
    
    And since this splits after register allocation nothing eliminates the 
useless
    and dest,src,-1 and boom we have a regression.
    
    The fix this time is a bit different.  I really don't want to open code the 
new
    RTL.  So instead I create a new operand for the source of the AND statement.
    If the constant is going to be -1 then that operand has the same value as 
the
    destination operand (ie, a nop move).  Otherwise it is the appropriate AND
    expression.
    
    The nop-move will get eliminated thus resolving the regression.
    
    I suspect some of the other patterns in riscv.md are subject to similar 
issues,
    though I haven't seem them trigger, so I'm leaving them alone for now.
    
    This has been tested in my tester and it'll obviously go through the 
upstream
    CI flow before I push it to the trunk.
    
    gcc/
            * config/riscv/riscv.md (equality shifted-arith splitter): Do not
            create op AND -1 as it won't be cleaned up post-reload.
    
    gcc/testsuite
            * gcc.target/riscv/redundant-andi-2.c: New test.

Diff:
---
 gcc/config/riscv/riscv.md                         | 12 ++++++++++-
 gcc/testsuite/gcc.target/riscv/redundant-andi-2.c | 25 +++++++++++++++++++++++
 2 files changed, 36 insertions(+), 1 deletion(-)

diff --git a/gcc/config/riscv/riscv.md b/gcc/config/riscv/riscv.md
index 84bce409bc72..26a247c2b966 100644
--- a/gcc/config/riscv/riscv.md
+++ b/gcc/config/riscv/riscv.md
@@ -3198,7 +3198,7 @@
   "#"
   "&& reload_completed"
   [(set (match_dup 4) (ashiftrt:X (match_dup 1) (match_dup 7)))
-   (set (match_dup 4) (and:X (match_dup 4) (match_dup 8)))
+   (set (match_dup 4) (match_dup 10))
    (set (match_dup 5) (match_dup 9))
    (set (pc) (if_then_else (any_eq (match_dup 4) (match_dup 5))
                           (label_ref (match_dup 0)) (pc)))]
@@ -3210,6 +3210,16 @@
   operands[7] = GEN_INT (trailing_shift);
   operands[8] = GEN_INT (mask1 >> trailing_shift);
   operands[9] = GEN_INT (mask2 >> trailing_shift);
+
+  /* This splits after reload, so there's little chance to clean things
+     up.  Rather than emit a ton of RTL here, we can just make a new
+     operand for that RHS and use it.  For the case where the AND would
+     have been redundant, we can make it a NOP move, which does get
+     cleaned up.  */
+  if (operands[8] == CONSTM1_RTX (word_mode))
+    operands[10] = operands[4];
+  else
+    operands[10] = gen_rtx_AND (word_mode, operands[4], operands[8]);
 }
 [(set_attr "type" "branch")])
 
diff --git a/gcc/testsuite/gcc.target/riscv/redundant-andi-2.c 
b/gcc/testsuite/gcc.target/riscv/redundant-andi-2.c
new file mode 100644
index 000000000000..ff0789d7d0a6
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/redundant-andi-2.c
@@ -0,0 +1,25 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcb -mabi=lp64" } */
+/* { dg-skip-if "" { *-*-* } { "-O0" } } */
+
+void frob (void);
+typedef struct av AV;
+typedef unsigned int U32;
+struct av
+{
+  void *dummy;
+  U32 sv_refcnt;
+  U32 sv_flags;
+};
+void
+Perl_save_ary (AV *const oav)
+{
+  AV *av;
+  unsigned int x1 = oav->sv_flags;
+  unsigned int x2 = x1 & 3221225472;
+  if (x2 == 2147483648)
+    frob ();
+}
+
+/* { dg-final { scan-assembler-not "andi\t" } } */
+

[gcc r15-8245] [RISC-V] Fix another unreported code quality regression

Reply via email to