shuffle_extract_and_slide1up_patterns (struct expand_vec_perm_d *d)

I think this name is obsolete, since you have changed the codegen so that it
can now use two "slides".

Could you rename this function?


juzhe.zh...@rivai.ai
 
From: Robin Dapp
Date: 2024-11-17 20:53
To: gcc-patches
CC: palmer; kito.cheng; juzhe.zhong; jeffreyalaw; pan2.li; rdapp.gcc
Subject: [PATCH 4/4] RISC-V: Improve slide1up pattern.
From: Robin Dapp <rd...@ventanamicro.com>
 
This patch adds a second variant to implement the extract/slide1up
pattern.  In order to do a permutation like
<3, 4, 5, 6> from vectors <0, 1, 2, 3> and <4, 5, 6, 7>
we currently extract <3> from the first vector and re-insert it into the
second vector.  Unless register-file crossing latency is essentially
zero it should be preferable to first slide the second vector up by
one, then slide down the first vector by (nunits - 1).
 
gcc/ChangeLog:
 
* config/riscv/riscv-protos.h (riscv_register_move_cost):
Export.
* config/riscv/riscv-v.cc (shuffle_extract_and_slide1up_patterns):
Add slideup/slidedown variant.
* config/riscv/riscv.cc (riscv_register_move_cost): Remove
static.  Add vector register move costs.
 
gcc/testsuite/ChangeLog:
 
* gcc.target/riscv/rvv/autovec/pr112599-2.c: Adjust test
expectation.
---
gcc/config/riscv/riscv-protos.h               |  1 +
gcc/config/riscv/riscv-v.cc                   | 44 ++++++++++++++-----
gcc/config/riscv/riscv.cc                     | 18 +++++++-
.../gcc.target/riscv/rvv/autovec/pr112599-2.c |  2 +-
4 files changed, 52 insertions(+), 13 deletions(-)
 
diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h
index 500b357f6eb..ecb4e64cdf8 100644
--- a/gcc/config/riscv/riscv-protos.h
+++ b/gcc/config/riscv/riscv-protos.h
@@ -139,6 +139,7 @@ extern void riscv_expand_ussub (rtx, rtx, rtx);
extern void riscv_expand_sssub (rtx, rtx, rtx);
extern void riscv_expand_ustrunc (rtx, rtx);
extern void riscv_expand_sstrunc (rtx, rtx);
+extern int riscv_register_move_cost (machine_mode, reg_class_t, reg_class_t);
#ifdef RTX_CODE
extern void riscv_expand_int_scc (rtx, enum rtx_code, rtx, rtx, bool 
*invert_ptr = 0);
diff --git a/gcc/config/riscv/riscv-v.cc b/gcc/config/riscv/riscv-v.cc
index 4fb032af953..76ee95d5b21 100644
--- a/gcc/config/riscv/riscv-v.cc
+++ b/gcc/config/riscv/riscv-v.cc
@@ -3800,17 +3800,39 @@ shuffle_extract_and_slide1up_patterns (struct 
expand_vec_perm_d *d)
   if (d->testing_p)
     return true;
-  /* Extract the last element of the first vector.  */
-  scalar_mode smode = GET_MODE_INNER (d->vmode);
-  rtx tmp = gen_reg_rtx (smode);
-  emit_vec_extract (tmp, d->op0, gen_int_mode (nunits - 1, Pmode));
-
-  /* Insert the scalar into element 0.  */
-  unsigned int unspec
-    = FLOAT_MODE_P (d->vmode) ? UNSPEC_VFSLIDE1UP : UNSPEC_VSLIDE1UP;
-  insn_code icode = code_for_pred_slide (unspec, d->vmode);
-  rtx ops[] = {d->target, d->op1, tmp};
-  emit_vlmax_insn (icode, BINARY_OP, ops);
+  int scalar_cost = riscv_register_move_cost (d->vmode, V_REGS, GR_REGS)
+    + riscv_register_move_cost (d->vmode, GR_REGS, V_REGS) + 2;
+  int slide_cost = 2;
+
+  if (slide_cost < scalar_cost)
+    {
+      /* This variant should always be preferable because we just need two
+ slides.  The extract-variant also requires two slides but additionally
+ pays the latency for register-file crossing.  */
+      rtx tmp = gen_reg_rtx (d->vmode);
+      rtx ops[] = {tmp, d->op1, gen_int_mode (1, Pmode)};
+      insn_code icode = code_for_pred_slide (UNSPEC_VSLIDEUP, d->vmode);
+      emit_vlmax_insn (icode, BINARY_OP, ops);
+
+      rtx ops2[] = {d->target, tmp, d->op0, gen_int_mode (nunits - 1, Pmode)};
+      icode = code_for_pred_slide (UNSPEC_VSLIDEDOWN, d->vmode);
+      emit_nonvlmax_insn (icode, BINARY_OP_TUMA, ops2, gen_int_mode (1, 
Pmode));
+    }
+  else
+    {
+      /* Extract the last element of the first vector.  */
+      scalar_mode smode = GET_MODE_INNER (d->vmode);
+      rtx tmp = gen_reg_rtx (smode);
+      emit_vec_extract (tmp, d->op0, gen_int_mode (nunits - 1, Pmode));
+
+      /* Insert the scalar into element 0.  */
+      unsigned int unspec
+ = FLOAT_MODE_P (d->vmode) ? UNSPEC_VFSLIDE1UP : UNSPEC_VSLIDE1UP;
+      insn_code icode = code_for_pred_slide (unspec, d->vmode);
+      rtx ops[] = {d->target, d->op1, tmp};
+      emit_vlmax_insn (icode, BINARY_OP, ops);
+    }
+
   return true;
}
diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
index 7694954c4c5..62b80fefedd 100644
--- a/gcc/config/riscv/riscv.cc
+++ b/gcc/config/riscv/riscv.cc
@@ -9464,7 +9464,7 @@ riscv_secondary_memory_needed (machine_mode mode, 
reg_class_t class1,
/* Implement TARGET_REGISTER_MOVE_COST.  */
-static int
+int
riscv_register_move_cost (machine_mode mode,
  reg_class_t from, reg_class_t to)
{
@@ -9472,6 +9472,22 @@ riscv_register_move_cost (machine_mode mode,
       (from == GR_REGS && to == FP_REGS))
     return tune_param->fmv_cost;
+  if (from == V_REGS)
+    {
+      if (to == GR_REGS)
+ return get_vector_costs ()->regmove->VR2GR;
+      else if (to == FP_REGS)
+ return get_vector_costs ()->regmove->VR2FR;
+    }
+
+  if (to == V_REGS)
+    {
+      if (from == GR_REGS)
+ return get_vector_costs ()->regmove->GR2VR;
+      else if (from == FP_REGS)
+ return get_vector_costs ()->regmove->FR2VR;
+    }
+
   return riscv_secondary_memory_needed (mode, from, to) ? 8 : 2;
}
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr112599-2.c 
b/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr112599-2.c
index fd87565b054..79d87196bf7 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr112599-2.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr112599-2.c
@@ -48,4 +48,4 @@ foo(void)
}
/* { dg-final { scan-assembler-not {vrgather} } } */
-/* { dg-final { scan-assembler-times {vslide1up\.vx} 1 } } */
+/* { dg-final { scan-assembler {vslide} } } */
-- 
2.47.0
 
 

Reply via email to