From: Ju-Zhe Zhong <juzhe.zh...@rivai.ai>

Hi, this patch is present for GCC 14 since I understand it's not appropriate
to land it in GCC 13.

NUM_FIXED_BLOCKS = 2, since GCC defines that each function has at least 2 blocks:
one is the entry block, the other is the exit block.
So according to this code, the function will not do the cprop optimization when
the function body consists of exactly one basic block.

I am not sure whether it's correct to fix it like this.
Can anyone tell me why the cprop optimization is forbidden when the function
only has a single block?

Let's take a look at these 2 examples of RVV intrinsics:
1.    void f1 (void * in, void *out, int64_t x, int n)
      {
      vint64m1_t v = __riscv_vle64_v_i64m1 (in + 1, 4);
      vint64m1_t v2 = __riscv_vle64_v_i64m1_tu (v, in + 2, 4);
      vint64m1_t v3 = __riscv_vadd_vx_i64m1 (v2, x, 4);
      vint64m1_t v4 = __riscv_vadd_vx_i64m1 (v3, x, 4);
      __riscv_vse64_v_i64m1 (out + 2, v4, 4);
      }

asm:
        addi    sp,sp,-16
        sw      a2,8(sp)
        sw      a3,12(sp)
        sw      a2,0(sp)
        sw      a3,4(sp)
        addi    a5,a0,1
        vsetivli        zero,4,e64,m1,ta,ma
        addi    a0,a0,2
        vle64.v v24,0(a5)
        addi    a5,sp,8
        vlse64.v        v27,0(a5),zero
        addi    a1,a1,2
        vsetvli zero,zero,e64,m1,tu,ma
        vle64.v v24,0(a0)
        vsetvli zero,zero,e64,m1,ta,ma
        vlse64.v        v25,0(sp),zero
        vadd.vv v26,v24,v27
        vadd.vv v24,v26,v25
        vse64.v v24,0(a1)
        addi    sp,sp,16
        jr      ra
You can see that here there are 2 vlse64.v instructions that broadcast the
scalar value "x".
GCC fails to eliminate the second vlse64.v instruction since GCC doesn't do the
cprop optimization (the function only has a single basic block). It can be
optimized if we apply this patch.

2. void f1 (void * in, void *out, int64_t x, int n)
{
    if (n) {
      vint64m1_t v = __riscv_vle64_v_i64m1 (in + 1, 4);
      vint64m1_t v2 = __riscv_vle64_v_i64m1_tu (v, in + 2, 4);
      vint64m1_t v3 = __riscv_vadd_vx_i64m1 (v2, x, 4);
      vint64m1_t v4 = __riscv_vadd_vx_i64m1 (v3, x, 4);
      __riscv_vse64_v_i64m1 (out + 2, v4, 4);
    }
}

asm:
  f1:
        vsetivli        zero,4,e64,m1,ta,ma
        beq     a4,zero,.L7
        addi    sp,sp,-16
        sw      a2,8(sp)
        sw      a3,12(sp)
        addi    a5,a0,1
        vle64.v v24,0(a5)
        addi    a0,a0,2
        addi    a5,sp,8
        vlse64.v        v25,0(a5),zero
        addi    a1,a1,2
        vsetvli zero,zero,e64,m1,tu,ma
        vle64.v v24,0(a0)
        vadd.vv v26,v24,v25
        vadd.vv v24,v26,v25
        vse64.v v24,0(a1)
        addi    sp,sp,16
        jr      ra
.L7:
        ret

Here, if we add an if (n) condition, the program will end up with more than one
basic block.
So GCC will do the cprop optimization, and the second vlse64.v instruction is
eliminated.

I am not sure whether this patch is correct.
Can anyone help me with that ?
Thanks.


gcc/ChangeLog:

        * cprop.cc (one_cprop_pass): Remove +1.

---
 gcc/cprop.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gcc/cprop.cc b/gcc/cprop.cc
index 6ec0bda4a24..615bc5078b6 100644
--- a/gcc/cprop.cc
+++ b/gcc/cprop.cc
@@ -1749,7 +1749,7 @@ one_cprop_pass (void)
   int changed = 0;
 
   /* Return if there's nothing to do, or it is too expensive.  */
-  if (n_basic_blocks_for_fn (cfun) <= NUM_FIXED_BLOCKS + 1
+  if (n_basic_blocks_for_fn (cfun) <= NUM_FIXED_BLOCKS
       || gcse_or_cprop_is_too_expensive (_ ("const/copy propagation 
disabled")))
     return 0;
 
-- 
2.36.1

Reply via email to