[PATCH] RISC-V: Fix failed hoist in LICM of vmv.v.x instruction

Juzhe-Zhong Wed, 18 Oct 2023 03:22:12 -0700

Confirmed that the dynamic LMUL algorithm works well and chooses LMUL = 4 for the PR:
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=111848


But it generates horrible register spilling.

The root cause is that we didn't hoist the vmv.v.x outside the loop, which
increases the register pressure of the SLP loop.

So, change the CONST_VECTOR move into a vec_duplicate splitter so that we can
gain better optimizations:

1. Better LICM.
2. More opportunities for transforming 'vv' into 'vx' in the future.

Before this patch:

f3:
        ble     a4,zero,.L8
        csrr    t0,vlenb
        slli    t1,t0,4
        csrr    a6,vlenb
        sub     sp,sp,t1
        csrr    a5,vlenb
        slli    a6,a6,3
        slli    a5,a5,2
        add     a6,a6,sp
        vsetvli a7,zero,e16,m8,ta,ma
        slli    a4,a4,3
        vid.v   v8
        addi    t6,a5,-1
        vand.vi v8,v8,-2
        neg     t5,a5
        vs8r.v  v8,0(sp)
        vadd.vi v8,v8,1
        vs8r.v  v8,0(a6)
        j       .L4
.L12:
        vsetvli a7,zero,e16,m8,ta,ma
.L4:
        csrr    t0,vlenb
        slli    t0,t0,3
        vl8re16.v       v16,0(sp)
        add     t0,t0,sp
        vmv.v.x v8,t6
        mv      t1,a4
        vand.vv v24,v16,v8
        mv      a6,a4
        vl8re16.v       v16,0(t0)
        vand.vv v8,v16,v8
        bleu    a4,a5,.L3
        mv      a6,a5
.L3:
        vsetvli zero,a6,e8,m4,ta,ma
        vle8.v  v20,0(a2)
        vle8.v  v16,0(a3)
        vsetvli a7,zero,e8,m4,ta,ma
        vrgatherei16.vv v4,v20,v24
        vadd.vv v4,v16,v4
        vsetvli zero,a6,e8,m4,ta,ma
        vse8.v  v4,0(a0)
        vle8.v  v20,0(a2)
        vsetvli a7,zero,e8,m4,ta,ma
        vrgatherei16.vv v4,v20,v8
        vadd.vv v4,v4,v16
        vsetvli zero,a6,e8,m4,ta,ma
        vse8.v  v4,0(a1)
        add     a4,a4,t5
        add     a0,a0,a5
        add     a3,a3,a5
        add     a1,a1,a5
        add     a2,a2,a5
        bgtu    t1,a5,.L12
        csrr    t0,vlenb
        slli    t1,t0,4
        add     sp,sp,t1
        jr      ra
.L8:
        ret

After this patch:

bar:
        ble     a3,zero,.L5
        csrr    a5,vlenb
        csrr    t1,vlenb
        srli    a5,a5,1
        srli    a7,t1,1
        addi    a5,a5,-1
        vsetvli a4,zero,e32,m2,ta,ma
        slli    a3,a3,1
        vmv.v.x v2,a5
        vid.v   v18
        vmv.v.x v6,a1
        vand.vi v10,v18,-2
        vand.vi v0,v18,1
        vadd.vi v16,v10,1
        vmseq.vi        v0,v0,1
        vand.vv v10,v10,v2
        vand.vv v16,v16,v2
        slli    t1,t1,1
        vsetvli zero,a4,e32,m2,ta,ma
        neg     t3,a7
        viota.m v4,v0
        vsetvli a4,zero,e32,m2,ta,mu
        vmv.v.x v8,a2
        vrgather.vv     v14,v6,v4
        vrgather.vv     v12,v8,v4
        vmv.v.i v2,0
        vrgather.vv     v14,v8,v4,v0.t
        vrgather.vv     v12,v6,v4,v0.t
.L4:
        mv      a2,a3
        mv      a5,a3
        bleu    a3,a7,.L3
        mv      a5,a7
.L3:
        vsetvli zero,a5,e32,m2,ta,ma
        vle32.v v6,0(a0)
        vsetvli a6,zero,e32,m2,ta,ma
        add     a3,a3,t3
        vrgather.vv     v4,v6,v10
        vrgather.vv     v8,v6,v16
        vsub.vv v4,v4,v12
        add     a0,a0,t1
        vsetvli zero,a5,e32,m2,tu,ma
        vadd.vv v2,v2,v4
        vmacc.vv        v2,v14,v8
        bgtu    a2,a7,.L4
        li      a5,-1
        vsetvli a6,zero,e32,m2,ta,ma
        li      a4,0
        vmv.v.i v4,0
        vmul.vx v0,v18,a5
        vadd.vi v0,v0,-1
        vand.vi v0,v0,1
        vmseq.vv        v0,v0,v4
        vand.vi v18,v18,1
        vmerge.vvm      v6,v4,v2,v0
        vmseq.vv        v18,v18,v4
        vmv.s.x v1,a4
        vmv1r.v v0,v18
        vredsum.vs      v6,v6,v1
        vmerge.vvm      v4,v4,v2,v0
        vmv.x.s a0,v6
        vredsum.vs      v4,v4,v1
        vmv.x.s a5,v4
        addw    a0,a0,a5
        ret
.L5:
        li      a0,0
        ret

Note that this patch triggers multiple FAILs:
FAIL: gcc.target/riscv/rvv/autovec/cond/cond_arith_run-3.c execution test
FAIL: gcc.target/riscv/rvv/autovec/cond/cond_arith_run-3.c execution test
FAIL: gcc.target/riscv/rvv/autovec/cond/cond_arith_run-4.c execution test
FAIL: gcc.target/riscv/rvv/autovec/cond/cond_arith_run-4.c execution test
FAIL: gcc.target/riscv/rvv/autovec/cond/cond_arith_run-8.c execution test
FAIL: gcc.target/riscv/rvv/autovec/cond/cond_arith_run-8.c execution test
FAIL: gcc.target/riscv/rvv/autovec/gather-scatter/strided_load_run-1.c 
execution test
FAIL: gcc.target/riscv/rvv/autovec/gather-scatter/strided_load_run-1.c 
execution test
FAIL: gcc.target/riscv/rvv/autovec/gather-scatter/strided_load_run-2.c 
execution test
FAIL: gcc.target/riscv/rvv/autovec/gather-scatter/strided_load_run-2.c 
execution test
FAIL: gcc.target/riscv/rvv/autovec/gather-scatter/strided_store_run-1.c 
execution test
FAIL: gcc.target/riscv/rvv/autovec/gather-scatter/strided_store_run-1.c 
execution test
FAIL: gcc.target/riscv/rvv/autovec/gather-scatter/strided_store_run-2.c 
execution test
FAIL: gcc.target/riscv/rvv/autovec/gather-scatter/strided_store_run-2.c 
execution test

They all fail because of bugs in the VSETVL pass:

10dd4:       0c707057                vsetvli zero,zero,e8,mf2,ta,ma
   10dd8:       5e06b8d7                vmv.v.i v17,13
   10ddc:       9ed030d7                vmv1r.v v1,v13
   10de0:       b21040d7                vncvt.x.x.w     v1,v1           ----> 
raises an illegal instruction exception, since there is no SEW = 8 -> SEW = 4 narrowing.
   10de4:       5e0785d7                vmv.v.v v11,v15

Confirmed that the recent VSETVL refactor patch
(https://gcc.gnu.org/pipermail/gcc-patches/2023-October/633231.html)
fixes all of them.

So this patch should be committed after the VSETVL refactor patch.

        PR target/111848

gcc/ChangeLog:

        * config/riscv/riscv-selftests.cc (run_const_vector_selftests): Adapt 
selftest.
        * config/riscv/riscv-v.cc (expand_const_vector): Change it into 
vec_duplicate splitter.

gcc/testsuite/ChangeLog:

        * gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul2-7.c: Adapt test.
        * gcc.dg/vect/costmodel/riscv/rvv/pr111848.c: New test.

---
 gcc/config/riscv/riscv-selftests.cc           | 14 ++++----
 gcc/config/riscv/riscv-v.cc                   | 27 ++++++++++++--
 .../costmodel/riscv/rvv/dynamic-lmul2-7.c     |  3 +-
 .../vect/costmodel/riscv/rvv/pr111848.c       | 35 +++++++++++++++++++
 4 files changed, 68 insertions(+), 11 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr111848.c

diff --git a/gcc/config/riscv/riscv-selftests.cc 
b/gcc/config/riscv/riscv-selftests.cc
index cdc863ee4f7..0ac17fb70a1 100644
--- a/gcc/config/riscv/riscv-selftests.cc
+++ b/gcc/config/riscv/riscv-selftests.cc
@@ -267,15 +267,14 @@ run_const_vector_selftests (void)
              rtx dup = gen_const_vec_duplicate (mode, GEN_INT (val));
              emit_move_insn (dest, dup);
              rtx_insn *insn = get_last_insn ();
-             rtx src = XEXP (SET_SRC (PATTERN (insn)), 1);
+             rtx src = SET_SRC (PATTERN (insn));
              /* 1. Should be vmv.v.i for in rang of -16 ~ 15.
                 2. Should be vmv.v.x for exceed -16 ~ 15.  */
              if (IN_RANGE (val, -16, 15))
-               ASSERT_TRUE (rtx_equal_p (src, dup));
-             else
                ASSERT_TRUE (
-                 rtx_equal_p (src,
-                              gen_rtx_VEC_DUPLICATE (mode, XEXP (src, 0))));
+                 rtx_equal_p (XEXP (SET_SRC (PATTERN (insn)), 1), dup));
+             else
+               ASSERT_TRUE (GET_CODE (src) == VEC_DUPLICATE);
              end_sequence ();
            }
        }
@@ -294,10 +293,9 @@ run_const_vector_selftests (void)
          rtx dup = gen_const_vec_duplicate (mode, ele);
          emit_move_insn (dest, dup);
          rtx_insn *insn = get_last_insn ();
-         rtx src = XEXP (SET_SRC (PATTERN (insn)), 1);
+         rtx src = SET_SRC (PATTERN (insn));
          /* Should always be vfmv.v.f.  */
-         ASSERT_TRUE (
-           rtx_equal_p (src, gen_rtx_VEC_DUPLICATE (mode, XEXP (src, 0))));
+         ASSERT_TRUE (GET_CODE (src) == VEC_DUPLICATE);
          end_sequence ();
        }
     }
diff --git a/gcc/config/riscv/riscv-v.cc b/gcc/config/riscv/riscv-v.cc
index 895c11d13fc..6116f5df504 100644
--- a/gcc/config/riscv/riscv-v.cc
+++ b/gcc/config/riscv/riscv-v.cc
@@ -1001,8 +1001,31 @@ expand_const_vector (rtx target, rtx src)
        }
       else
        {
-         rtx ops[] = {tmp, elt};
-         emit_vlmax_insn (code_for_pred_broadcast (mode), UNARY_OP, ops);
+         /* Emit vec_duplicate<mode> split pattern before RA so that
+            we could have a better optimization opportunity in LICM
+            which will hoist vmv.v.x outside the loop and in fwprop && combine
+            which will transform 'vv' into 'vx' instruction.
+
+            We don't emit the vec_duplicate<mode> split pattern during RA
+            because the split stage after RA is too late to generate an
+            RVV instruction that needs an additional register (we can't
+            allocate a new register after RA) for the VL operand of the
+            vsetvl instruction (vsetvl a5, zero).  */
+         if (lra_in_progress)
+           {
+             rtx ops[] = {tmp, elt};
+             emit_vlmax_insn (code_for_pred_broadcast (mode), UNARY_OP, ops);
+           }
+         else
+           {
+             struct expand_operand ops[2];
+             enum insn_code icode = optab_handler (vec_duplicate_optab, mode);
+             gcc_assert (icode != CODE_FOR_nothing);
+             create_output_operand (&ops[0], tmp, mode);
+             create_input_operand (&ops[1], elt, GET_MODE_INNER (mode));
+             expand_insn (icode, 2, ops);
+             tmp = ops[0].value;
+           }
        }
 
       if (tmp != target)
diff --git a/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul2-7.c 
b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul2-7.c
index 3dfc6f16a25..2a735d8c6b6 100644
--- a/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul2-7.c
+++ b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul2-7.c
@@ -18,7 +18,8 @@ bar (int *x, int a, int b, int n)
 }
 
 /* { dg-final { scan-assembler {e32,m2} } } */
-/* { dg-final { scan-assembler-times {csrr} 1 } } */
+/* { dg-final { scan-assembler-not {jr} } } */
+/* { dg-final { scan-assembler-times {ret} 2 } } */
 /* { dg-final { scan-tree-dump-times "Maximum lmul = 8" 1 "vect" } } */
 /* { dg-final { scan-tree-dump-times "Maximum lmul = 4" 1 "vect" } } */
 /* { dg-final { scan-tree-dump-times "Maximum lmul = 2" 1 "vect" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr111848.c 
b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr111848.c
new file mode 100644
index 00000000000..b203ca907fa
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr111848.c
@@ -0,0 +1,35 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv -mabi=lp64d -fdump-tree-vect-details" } */
+
+void
+f3 (uint8_t *restrict a, uint8_t *restrict b,
+   uint8_t *restrict c, uint8_t *restrict d,
+   int n)
+{
+  for (int i = 0; i < n; ++i)
+    {
+      a[i * 8] = c[i * 8] + d[i * 8];
+      a[i * 8 + 1] = c[i * 8] + d[i * 8 + 1];
+      a[i * 8 + 2] = c[i * 8 + 2] + d[i * 8 + 2];
+      a[i * 8 + 3] = c[i * 8 + 2] + d[i * 8 + 3];
+      a[i * 8 + 4] = c[i * 8 + 4] + d[i * 8 + 4];
+      a[i * 8 + 5] = c[i * 8 + 4] + d[i * 8 + 5];
+      a[i * 8 + 6] = c[i * 8 + 6] + d[i * 8 + 6];
+      a[i * 8 + 7] = c[i * 8 + 6] + d[i * 8 + 7];
+      b[i * 8] = c[i * 8 + 1] + d[i * 8];
+      b[i * 8 + 1] = c[i * 8 + 1] + d[i * 8 + 1];
+      b[i * 8 + 2] = c[i * 8 + 3] + d[i * 8 + 2];
+      b[i * 8 + 3] = c[i * 8 + 3] + d[i * 8 + 3];
+      b[i * 8 + 4] = c[i * 8 + 5] + d[i * 8 + 4];
+      b[i * 8 + 5] = c[i * 8 + 5] + d[i * 8 + 5];
+      b[i * 8 + 6] = c[i * 8 + 7] + d[i * 8 + 6];
+      b[i * 8 + 7] = c[i * 8 + 7] + d[i * 8 + 7];
+    }
+}
+
+/* { dg-final { scan-assembler {e8,m4} } } */
+/* { dg-final { scan-assembler-not {jr} } } */
+/* { dg-final { scan-assembler-times {ret} 1 } } */
+/* { dg-final { scan-tree-dump-times "Maximum lmul = 4" 1 "vect" } } */
+/* { dg-final { scan-tree-dump-not "Maximum lmul = 2" "vect" } } */
+/* { dg-final { scan-tree-dump-not "Maximum lmul = 1" "vect" } } */
-- 
2.36.3

[PATCH] RISC-V: Fix failed hoist in LICM of vmv.v.x instruction

Reply via email to