>> I'm not sure if it's profitable to replace an LMUL8 load with 127
>> vslide1down.vx ops, but we're being honest with the middle end by
>> returning the number of insns we'll be emitting when costing...
I think it's an issue with the dynamic LMUL cost model, which only cares
about SSA-based register pressure in the program. We should definitely
take more factors into account in the dynamic LMUL cost model.

juzhe.zh...@rivai.ai

From: Patrick O'Neill
Date: 2024-08-23 03:46
To: gcc-patches
CC: rdapp.gcc; juzhe.zhong; kito.cheng; jeffreyalaw; gnu-toolchain; Patrick O'Neill
Subject: [PATCH 9/9] RISC-V: Add vslide1up/down pattern to expand_const_vector

Also explicitly disallow CONST_VECTOR_DUPLICATE_P for now.
CONST_VECTOR_DUPLICATE_P was previously disallowed implicitly.

gcc/ChangeLog:

	* config/riscv/riscv-v.cc (expand_vec_series): Update comment.
	(expand_vector_init_insert_elems): Ditto.
	(expand_const_vector): Add catch-all pattern.
	* config/riscv/riscv.cc (riscv_const_insns): Add costing for
	catch-all pattern.

gcc/testsuite/ChangeLog:

	* gcc.target/riscv/rvv/autovec/materialize-1.c: New test.
	* gcc.target/riscv/rvv/autovec/materialize-2.c: New test.
	* gcc.target/riscv/rvv/autovec/materialize-3.c: New test.
	* gcc.target/riscv/rvv/autovec/materialize-4.c: New test.
	* gcc.target/riscv/rvv/autovec/materialize-5.c: New test.
	* gcc.target/riscv/rvv/autovec/materialize-6.c: New test.

Signed-off-by: Patrick O'Neill <patr...@rivosinc.com>
---
This causes 4 new regressions on glibc rv64gcv.

The first two appear to be spilling due to the increased register
pressure from materializing constants for vslide1down:
FAIL: gcc.dg/vect/costmodel/riscv/rvv/pr113112-4.c scan-assembler-not jr
FAIL: gcc.dg/vect/costmodel/riscv/rvv/pr113112-4.c scan-assembler-not sp

The other two are caused by vle32/64 loads being replaced with a splat &
vslide1down sequence:
FAIL: gcc.target/riscv/rvv/autovec/vls/init-5.c -O3 -ftree-vectorize -mrvv-vector-bits=scalable scan-assembler-times vle32\\.v 7
FAIL: gcc.target/riscv/rvv/autovec/vls/init-7.c -O3 -ftree-vectorize -mrvv-vector-bits=scalable scan-assembler-times vle64\\.v 7

I'm not sure if it's profitable to replace an LMUL8 load with 127
vslide1down.vx ops, but we're being honest with the middle end by
returning the number of insns we'll be emitting when costing...
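To make the catch-all expansion concrete: for an arbitrary 4-element
constant like the one in materialize-5.c below ({100, 12, 324, 1}), the
expansion emits roughly the following sequence (register choices are
illustrative, and the expansion may equally splat the trailing element
and build with vslide1up instead):

	vsetivli	zero,4,e32,m1,ta,ma
	li	a5,100
	vmv.v.x	v1,a5		# splat the first elt: {100,100,100,100}
	li	a5,12
	vslide1down.vx	v1,v1,a5	# {100,100,100,12}
	li	a5,324
	vslide1down.vx	v1,v1,a5	# {100,100,12,324}
	li	a5,1
	vslide1down.vx	v1,v1,a5	# {100,12,324,1}

Each non-splatted element costs a scalar materialization plus one slide,
which is where the insn counts returned to the middle end come from.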
---
 gcc/config/riscv/riscv-v.cc                   |  24 +++-
 gcc/config/riscv/riscv.cc                     | 103 +++++++++++++++++-
 .../riscv/rvv/autovec/materialize-1.c         |  13 +++
 .../riscv/rvv/autovec/materialize-2.c         |  13 +++
 .../riscv/rvv/autovec/materialize-3.c         |  13 +++
 .../riscv/rvv/autovec/materialize-4.c         |  13 +++
 .../riscv/rvv/autovec/materialize-5.c         |  13 +++
 .../riscv/rvv/autovec/materialize-6.c         |  13 +++
 8 files changed, 196 insertions(+), 9 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/materialize-1.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/materialize-2.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/materialize-3.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/materialize-4.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/materialize-5.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/materialize-6.c

diff --git a/gcc/config/riscv/riscv-v.cc b/gcc/config/riscv/riscv-v.cc
index 2b0dc57a021..6ba4b6291bb 100644
--- a/gcc/config/riscv/riscv-v.cc
+++ b/gcc/config/riscv/riscv-v.cc
@@ -1095,7 +1095,7 @@ expand_vec_series (rtx dest, rtx base, rtx step, rtx vid)
   emit_move_insn (dest, result);
 }
 
-/* Subroutine of riscv_vector_expand_vector_init.
+/* Subroutine of riscv_vector_expand_vector_init and expand_const_vector.
    Works as follows:
    (a) Initialize TARGET by broadcasting element NELTS_REQD - 1 of BUILDER.
    (b) Skip leading elements from BUILDER, which are the same as
@@ -1120,7 +1120,7 @@ expand_vector_init_insert_elems (rtx target, const rvv_builder &builder,
     }
 }
 
-/* Subroutine of expand_vec_init to handle case
+/* Subroutine of expand_vec_init and expand_const_vector to handle case
    when all trailing elements of builder are same.  This works as follows:
    (a) Use expand_insn interface to broadcast last vector element in TARGET.
@@ -1239,6 +1239,8 @@ expand_const_vector (rtx target, rtx src)
     }
   builder.finalize ();
 
+  bool emit_catch_all_pattern = false;
+
   if (CONST_VECTOR_DUPLICATE_P (src))
     {
       /* Handle the case with repeating sequence that NELTS_PER_PATTERN = 1
@@ -1546,10 +1548,24 @@
 	}
       else
 	/* TODO: We will enable more variable-length vector in the future.  */
-	gcc_unreachable ();
+	emit_catch_all_pattern = true;
     }
   else
-    gcc_unreachable ();
+    emit_catch_all_pattern = true;
+
+  if (emit_catch_all_pattern)
+    {
+      int nelts = XVECLEN (src, 0);
+
+      /* Optimize trailing same elements sequence:
+	 v = {y, y2, y3, y4, y5, x, x, x, x, x, x, x, x, x, x, x};  */
+      if (!expand_vector_init_trailing_same_elem (result, builder, nelts))
+	/* Handle common situation with vslide1down.  This function can handle
+	   any case of vec_init<mode>.  Only the cases that are not optimized
+	   above will fall through here.  This prevents us from dumping
+	   to/reading from the stack to initialize vectors.  */
+	expand_vector_init_insert_elems (result, builder, nelts);
+    }
 
   if (result != target)
     emit_move_insn (target, result);
diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
index a820cadd205..d1b8c27f4ac 100644
--- a/gcc/config/riscv/riscv.cc
+++ b/gcc/config/riscv/riscv.cc
@@ -2141,12 +2141,13 @@ riscv_const_insns (rtx x, bool allow_new_pseudos)
 	 out range of [-16, 15].
 	 - 3. const series vector.
 	 ...etc.  */
-      if (riscv_v_ext_mode_p (GET_MODE (x)))
+      machine_mode mode = GET_MODE (x);
+      if (riscv_v_ext_mode_p (mode))
 	{
 	  rtx elt;
 	  if (const_vec_duplicate_p (x, &elt))
 	    {
-	      if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_BOOL)
+	      if (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL)
 		/* Duplicate values of 0/1 can be emitted using vmv.v.i.  */
 		return 1;
 
@@ -2154,7 +2155,7 @@ riscv_const_insns (rtx x, bool allow_new_pseudos)
 		 system since the ELT constant value can not held
 		 within a single register to disable reload a DI
 		 register vec_duplicate into vmv.v.x.  */
-	      scalar_mode smode = GET_MODE_INNER (GET_MODE (x));
+	      scalar_mode smode = GET_MODE_INNER (mode);
 	      if (maybe_gt (GET_MODE_SIZE (smode), UNITS_PER_WORD)
 		  && !immediate_operand (elt, Pmode))
 		return 0;
@@ -2190,7 +2191,15 @@ riscv_const_insns (rtx x, bool allow_new_pseudos)
 	      return 1;
 	    }
 
-	  if (CONST_VECTOR_STEPPED_P (x))
+	  if (CONST_VECTOR_DUPLICATE_P (x))
+	    {
+	      /* TODO: Cost cases of CONST_VECTOR_DUPLICATE_P.
+		 We get "could not split insn" ICEs from an
+		 expand_vector_broadcast emitted during LRA when this op is
+		 costed.  For now disallow all cases.  */
+	      return 0;
+	    }
+	  else if (CONST_VECTOR_STEPPED_P (x))
 	    {
 	      /* Some cases are unhandled so we need construct a builder to
 		 detect/allow those cases to be handled by the fallthrough
@@ -2226,10 +2235,94 @@ riscv_const_insns (rtx x, bool allow_new_pseudos)
 	      /* Fallthrough to catch all pattern.  */
 	    }
 
+	  int nelts = const_vector_encoded_nelts (x);
+	  if (nelts == 0)
+	    return 0;
+
+	  /* Most arbitrary vectors can be constructed with a splat and
+	     vslide1up/down.  */
+
+	  /* The arbitrary vector's elements must be supported by the
+	     vslide1up/down operation.  */
+	  scalar_mode smode = GET_MODE_INNER (mode);
+	  if (maybe_gt (GET_MODE_SIZE (smode), UNITS_PER_WORD))
+	    /* vslide1up/down does not handle elts larger than a register.  */
+	    return 0;
+	  if (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL)
+	    /* vslide1up/down does not handle BImode elts.  */
+	    return 0;
+	  if (GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT
+	      && maybe_lt (GET_MODE_SIZE (smode), 4))
+	    /* Our vslide1up/down insn def does not handle HF.  */
+	    return 0;
+
+	  /* We already checked for a fully const vector above.  Calculate
+	     the number of leading/trailing elements covered by the splat.  */
+	  int leading_ndups = 1;
+	  rtx first_elt = CONST_VECTOR_ENCODED_ELT (x, 0);
+	  for (int i = 1; i < nelts; i++)
+	    {
+	      if (first_elt == CONST_VECTOR_ENCODED_ELT (x, i))
+		leading_ndups += 1;
+	      else
+		break;
+	    }
+	  int trailing_ndups = 1;
+	  if (leading_ndups >= nelts / 2)
+	    /* We already know leading_ndups >= trailing_ndups so just use
+	       leading ndups.  */
+	    trailing_ndups = 0;
+	  else
+	    {
+	      rtx last_elt = CONST_VECTOR_ENCODED_ELT (x, nelts - 1);
+	      for (int i = nelts - 2; i > 0; i--)
+		{
+		  if (last_elt == CONST_VECTOR_ENCODED_ELT (x, i))
+		    trailing_ndups += 1;
+		  else
+		    break;
+		}
+	    }
+
+	  /* We always splat the leading/trailing elt with the most contiguous
+	     duplicates.  */
+	  int splatted_leading = 0;
+	  int splatted_trailing = 0;
+	  int splat_materialize = 0;
+	  if (leading_ndups > trailing_ndups)
+	    {
+	      splatted_leading = leading_ndups;
+	      splat_materialize = riscv_const_insns (CONST_VECTOR_ELT (x, 0),
+						     allow_new_pseudos);
+	    }
+	  else
+	    {
+	      splatted_trailing = trailing_ndups;
+	      splat_materialize =
+		riscv_const_insns (CONST_VECTOR_ELT (x, nelts - 1),
+				   allow_new_pseudos);
+	    }
+
+	  /* Splat leading/trailing elt.  */
+	  int total_insns = splat_materialize + 1;
+
+	  /* TODO: There's room for improvement here, e.g. duplicate
+	     constants won't require another materialization.  */
+	  for (int i = splatted_leading; i < nelts - splatted_trailing; i++)
+	    {
+	      rtx elt = CONST_VECTOR_ELT (x, i);
+	      int n = riscv_const_insns (elt, allow_new_pseudos);
+	      if (n == 0)
+		n = 4; /* Memory access.  */
+	      /* Materialize + vslide1{up|down}.  */
+	      total_insns += n + 1;
+	    }
+
+	  return total_insns;
 	}
 
       /* TODO: We may support more const vector in the future.  */
-      return x == CONST0_RTX (GET_MODE (x)) ? 1 : 0;
+      return x == CONST0_RTX (mode) ? 1 : 0;
     }
 
     case CONST:
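To sanity-check the costing with a worked example (assuming the usual
cost of 1 insn for a small-immediate materialization): for the
materialize-4.c constant {1, 1, 1, 1, 1, 10, 10, 10} we get
leading_ndups = 5 >= nelts / 2 = 4, so the leading element is splatted
(1 insn to materialize + 1 for the splat) and each of the three
remaining elements costs 1 (materialize 10) + 1 (vslide1down), giving
total_insns = 2 + 3 * 2 = 8. As the TODO above notes, this slightly
overcounts since the duplicated 10s only need to be materialized once.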
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/materialize-1.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/materialize-1.c
new file mode 100644
index 00000000000..2d7e1cf1556
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/materialize-1.c
@@ -0,0 +1,13 @@
+/* Vector constants can be constructed using splat + vslide1up.  */
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv_zvl128b -mabi=lp64d -O3" } */
+
+void slide1up(int a, int *b) {
+  int c[4] = {1, 10, 10, 10};
+  for (int j = 0; j < 4; j++) {
+    b[j] = c[j];
+  }
+}
+
+/* { dg-final { scan-assembler-times {vmv.v.i} 1 } } */
+/* { dg-final { scan-assembler-times {vslide1up.vx} 1 } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/materialize-2.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/materialize-2.c
new file mode 100644
index 00000000000..22d8365814c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/materialize-2.c
@@ -0,0 +1,13 @@
+/* Vector constants can be constructed using splat + vslide1up.  */
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv_zvl256b -mabi=lp64d -O3" } */
+
+void slide1up(int a, int *b) {
+  int c[8] = {1, 1, 1, 10, 10, 10, 10, 10};
+  for (int j = 0; j < 8; j++) {
+    b[j] = c[j];
+  }
+}
+
+/* { dg-final { scan-assembler-times {vmv.v.i} 1 } } */
+/* { dg-final { scan-assembler-times {vslide1up.vx} 3 } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/materialize-3.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/materialize-3.c
new file mode 100644
index 00000000000..928e3d5fc74
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/materialize-3.c
@@ -0,0 +1,13 @@
+/* Vector constants can be constructed using splat + vslide1down.  */
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv_zvl128b -mabi=lp64d -O3" } */
+
+void slide1up(int a, int *b) {
+  int c[4] = {1, 1, 1, 10};
+  for (int j = 0; j < 4; j++) {
+    b[j] = c[j];
+  }
+}
+
+/* { dg-final { scan-assembler-times {vmv.v.i} 1 } } */
+/* { dg-final { scan-assembler-times {vslide1down.vx} 1 } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/materialize-4.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/materialize-4.c
new file mode 100644
index 00000000000..ba1b880e901
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/materialize-4.c
@@ -0,0 +1,13 @@
+/* Vector constants can be constructed using splat + vslide1down.  */
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv_zvl256b -mabi=lp64d -O3" } */
+
+void slide1up(int a, int *b) {
+  int c[8] = {1, 1, 1, 1, 1, 10, 10, 10};
+  for (int j = 0; j < 8; j++) {
+    b[j] = c[j];
+  }
+}
+
+/* { dg-final { scan-assembler-times {vmv.v.i} 1 } } */
+/* { dg-final { scan-assembler-times {vslide1down.vx} 3 } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/materialize-5.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/materialize-5.c
new file mode 100644
index 00000000000..6fd68ec4a55
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/materialize-5.c
@@ -0,0 +1,13 @@
+/* Arbitrary vector constants can be constructed using splat + vslide1{up|down}.  */
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv_zvl128b -mabi=lp64d -O3" } */
+
+void slide1down_arbitrary(int a, int *b) {
+  int c[4] = {100, 12, 324, 1};
+  for (int j = 0; j < 4; j++) {
+    b[j] = c[j];
+  }
+}
+
+/* { dg-final { scan-assembler-times {vmv.v.x} 1 } } */
+/* { dg-final { scan-assembler-times {vslide1(up|down).vx} 3 } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/materialize-6.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/materialize-6.c
new file mode 100644
index 00000000000..0de98bf26b9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/materialize-6.c
@@ -0,0 +1,13 @@
+/* Arbitrary vector constants can be constructed using splat + vslide1{up|down}.  */
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv_zvl256b -mabi=lp64d -O3" } */
+
+void slide1down_arbitrary(int a, int *b) {
+  int c[8] = {100, 12, 324, 1, 57, 16, 98, 60};
+  for (int j = 0; j < 8; j++) {
+    b[j] = c[j];
+  }
+}
+
+/* { dg-final { scan-assembler-times {vmv.v.x} 1 } } */
+/* { dg-final { scan-assembler-times {vslide1(up|down).vx} 7 } } */
-- 
2.34.1