shuffle_extract_and_slide1up_patterns (struct expand_vec_perm_d *d) I think this name is obsolete, since you have changed the codegen so that it can now use two "slides".
Could you rename this function ? juzhe.zh...@rivai.ai From: Robin Dapp Date: 2024-11-17 20:53 To: gcc-patches CC: palmer; kito.cheng; juzhe.zhong; jeffreyalaw; pan2.li; rdapp.gcc Subject: [PATCH 4/4] RISC-V: Improve slide1up pattern. From: Robin Dapp <rd...@ventanamicro.com> This patch adds a second variant to implement the extract/slide1up pattern. In order to do a permutation like <3, 4, 5, 6> from vectors <0, 1, 2, 3> and <4, 5, 6, 7> we currently extract <3> from the first vector and re-insert it into the second vector. Unless register-file crossing latency is essentially zero it should be preferable to first slide the second vector up by one, then slide down the first vector by (nunits - 1). gcc/ChangeLog: * config/riscv/riscv-protos.h (riscv_register_move_cost): Export. * config/riscv/riscv-v.cc (shuffle_extract_and_slide1up_patterns): Add slideup/slidedown variant. * config/riscv/riscv.cc (riscv_secondary_memory_needed): Remove static. gcc/testsuite/ChangeLog: * gcc.target/riscv/rvv/autovec/pr112599-2.c: Adjust test expectation. 
--- gcc/config/riscv/riscv-protos.h | 1 + gcc/config/riscv/riscv-v.cc | 44 ++++++++++++++----- gcc/config/riscv/riscv.cc | 18 +++++++- .../gcc.target/riscv/rvv/autovec/pr112599-2.c | 2 +- 4 files changed, 52 insertions(+), 13 deletions(-) diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h index 500b357f6eb..ecb4e64cdf8 100644 --- a/gcc/config/riscv/riscv-protos.h +++ b/gcc/config/riscv/riscv-protos.h @@ -139,6 +139,7 @@ extern void riscv_expand_ussub (rtx, rtx, rtx); extern void riscv_expand_sssub (rtx, rtx, rtx); extern void riscv_expand_ustrunc (rtx, rtx); extern void riscv_expand_sstrunc (rtx, rtx); +extern int riscv_register_move_cost (machine_mode, reg_class_t, reg_class_t); #ifdef RTX_CODE extern void riscv_expand_int_scc (rtx, enum rtx_code, rtx, rtx, bool *invert_ptr = 0); diff --git a/gcc/config/riscv/riscv-v.cc b/gcc/config/riscv/riscv-v.cc index 4fb032af953..76ee95d5b21 100644 --- a/gcc/config/riscv/riscv-v.cc +++ b/gcc/config/riscv/riscv-v.cc @@ -3800,17 +3800,39 @@ shuffle_extract_and_slide1up_patterns (struct expand_vec_perm_d *d) if (d->testing_p) return true; - /* Extract the last element of the first vector. */ - scalar_mode smode = GET_MODE_INNER (d->vmode); - rtx tmp = gen_reg_rtx (smode); - emit_vec_extract (tmp, d->op0, gen_int_mode (nunits - 1, Pmode)); - - /* Insert the scalar into element 0. */ - unsigned int unspec - = FLOAT_MODE_P (d->vmode) ? UNSPEC_VFSLIDE1UP : UNSPEC_VSLIDE1UP; - insn_code icode = code_for_pred_slide (unspec, d->vmode); - rtx ops[] = {d->target, d->op1, tmp}; - emit_vlmax_insn (icode, BINARY_OP, ops); + int scalar_cost = riscv_register_move_cost (d->vmode, V_REGS, GR_REGS) + + riscv_register_move_cost (d->vmode, GR_REGS, V_REGS) + 2; + int slide_cost = 2; + + if (slide_cost < scalar_cost) + { + /* This variant should always be preferable because we just need two + slides. The extract-variant also requires two slides but additionally + pays the latency for register-file crossing. 
*/ + rtx tmp = gen_reg_rtx (d->vmode); + rtx ops[] = {tmp, d->op1, gen_int_mode (1, Pmode)}; + insn_code icode = code_for_pred_slide (UNSPEC_VSLIDEUP, d->vmode); + emit_vlmax_insn (icode, BINARY_OP, ops); + + rtx ops2[] = {d->target, tmp, d->op0, gen_int_mode (nunits - 1, Pmode)}; + icode = code_for_pred_slide (UNSPEC_VSLIDEDOWN, d->vmode); + emit_nonvlmax_insn (icode, BINARY_OP_TUMA, ops2, gen_int_mode (1, Pmode)); + } + else + { + /* Extract the last element of the first vector. */ + scalar_mode smode = GET_MODE_INNER (d->vmode); + rtx tmp = gen_reg_rtx (smode); + emit_vec_extract (tmp, d->op0, gen_int_mode (nunits - 1, Pmode)); + + /* Insert the scalar into element 0. */ + unsigned int unspec + = FLOAT_MODE_P (d->vmode) ? UNSPEC_VFSLIDE1UP : UNSPEC_VSLIDE1UP; + insn_code icode = code_for_pred_slide (unspec, d->vmode); + rtx ops[] = {d->target, d->op1, tmp}; + emit_vlmax_insn (icode, BINARY_OP, ops); + } + return true; } diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc index 7694954c4c5..62b80fefedd 100644 --- a/gcc/config/riscv/riscv.cc +++ b/gcc/config/riscv/riscv.cc @@ -9464,7 +9464,7 @@ riscv_secondary_memory_needed (machine_mode mode, reg_class_t class1, /* Implement TARGET_REGISTER_MOVE_COST. */ -static int +int riscv_register_move_cost (machine_mode mode, reg_class_t from, reg_class_t to) { @@ -9472,6 +9472,22 @@ riscv_register_move_cost (machine_mode mode, (from == GR_REGS && to == FP_REGS)) return tune_param->fmv_cost; + if (from == V_REGS) + { + if (to == GR_REGS) + return get_vector_costs ()->regmove->VR2GR; + else if (to == FP_REGS) + return get_vector_costs ()->regmove->VR2FR; + } + + if (to == V_REGS) + { + if (from == GR_REGS) + return get_vector_costs ()->regmove->GR2VR; + else if (from == FP_REGS) + return get_vector_costs ()->regmove->FR2VR; + } + return riscv_secondary_memory_needed (mode, from, to) ? 
8 : 2; } diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr112599-2.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr112599-2.c index fd87565b054..79d87196bf7 100644 --- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr112599-2.c +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr112599-2.c @@ -48,4 +48,4 @@ foo(void) } /* { dg-final { scan-assembler-not {vrgather} } } */ -/* { dg-final { scan-assembler-times {vslide1up\.vx} 1 } } */ +/* { dg-final { scan-assembler {vslide} } } */ -- 2.47.0