From: Robin Dapp <rd...@ventanamicro.com>

This patch adds shuffle_slide_patterns to expand_vec_perm_const_1.
It recognizes permutations like

  {0, 1, 4, 5}
or
  {2, 3, 6, 7}

which can be constructed by a slideup or slidedown of one vector
into the other.
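
With four elements, for example,

  {0, 1, 4, 5}

keeps OP0's first two elements and slides OP1 up by two positions
(a vslideup of OP1 into OP0), while

  {2, 3, 6, 7}

slides OP0 down by two with a vector length of two and keeps OP1's
last two elements (a vslidedown of OP0 into OP1).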

gcc/ChangeLog:

        * config/riscv/riscv-v.cc (shuffle_slide_patterns): New.
        (expand_vec_perm_const_1): Call new function.

gcc/testsuite/ChangeLog:

        * gcc.target/riscv/rvv/autovec/vls-vlmax/shuffle-slide-run1.c: New test.
        * gcc.target/riscv/rvv/autovec/vls-vlmax/shuffle-slide1.c: New test.
---
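As an illustration, here is a scalar model of the two slide variants
used below (a sketch for this cover note only, not part of the patch;
the helper names are made up):

  /* Model of vslideup vd=OP0, vs=OP1, offset=CNT at full vector
     length: the first CNT elements of OP0 stay undisturbed and OP1
     is slid up by CNT positions behind them.  */
  static void
  slideup_model (int *res, const int *op0, const int *op1,
                 int cnt, int vlen)
  {
    for (int i = 0; i < vlen; i++)
      res[i] = i < cnt ? op0[i] : op1[i - cnt];
  }

  /* Model of vslidedown vd=OP1, vs=OP0, offset=CNT with vl=LEN and
     tail undisturbed: the first LEN elements become OP0 slid down
     by CNT, the tail keeps OP1's values.  */
  static void
  slidedown_model (int *res, const int *op0, const int *op1,
                   int cnt, int len, int vlen)
  {
    for (int i = 0; i < vlen; i++)
      res[i] = i < len ? op0[i + cnt] : op1[i];
  }

With CNT = PIVOT for the slideup and CNT = VLEN - PIVOT, LEN = PIVOT
for the slidedown this reproduces the permutations recognized by
shuffle_slide_patterns.
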
 gcc/config/riscv/riscv-v.cc                   |  99 +++++++++++++
 .../autovec/vls-vlmax/shuffle-slide-run1.c    |  81 +++++++++++
 .../rvv/autovec/vls-vlmax/shuffle-slide1.c    | 137 ++++++++++++++++++
 3 files changed, 317 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/shuffle-slide-run1.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/shuffle-slide1.c

diff --git a/gcc/config/riscv/riscv-v.cc b/gcc/config/riscv/riscv-v.cc
index ee7a0128c0e..deb2bdb4247 100644
--- a/gcc/config/riscv/riscv-v.cc
+++ b/gcc/config/riscv/riscv-v.cc
@@ -3395,6 +3395,103 @@ shuffle_compress_patterns (struct expand_vec_perm_d *d)
   return true;
 }
 
+/* Recognize patterns like [4 5 6 7 12 13 14 15] where either the lower
+   or the higher parts of both vectors are combined into one.  */
+
+static bool
+shuffle_slide_patterns (struct expand_vec_perm_d *d)
+{
+  machine_mode vmode = d->vmode;
+  poly_int64 vec_len = d->perm.length ();
+
+  if (!vec_len.is_constant ())
+    return false;
+
+  int vlen = vec_len.to_constant ();
+  if (vlen < 4)
+    return false;
+
+  if (d->one_vector_p)
+    return false;
+
+  /* For a slideup OP0 can stay, for a slidedown OP1 can.
+     The former requires that the first element of the permutation
+     is the first element of OP0, the latter that the last permutation
+     element is the last element of OP1.  */
+  bool slideup = false;
+  bool slidedown = false;
+
+  /* For a slideup the permutation must start at OP0's first element.  */
+  if (known_eq (d->perm[0], 0))
+    slideup = true;
+
+  /* For a slidedown the permutation must end at OP1's last element.  */
+  if (known_eq (d->perm[vlen - 1], 2 * vlen - 1))
+    slidedown = true;
+
+  if (slideup && slidedown)
+    return false;
+
+  if (!slideup && !slidedown)
+    return false;
+
+  /* Check for a monotonic sequence with one pivot.  */
+  int pivot = -1;
+  for (int i = 0; i < vlen; i++)
+    {
+      if (pivot == -1 && known_ge (d->perm[i], vec_len))
+       pivot = i;
+      if (i > 0 && i != pivot
+         && maybe_ne (d->perm[i], d->perm[i - 1] + 1))
+       return false;
+    }
+
+  if (pivot == -1)
+    return false;
+
+  /* For a slideup OP1's part (to be slid up) must be a low part,
+     i.e. starting with its first element.  */
+  if (slideup && maybe_ne (d->perm[pivot], vlen))
+    return false;
+
+  /* For a slidedown OP0's part (to be slid down) must be a high part,
+     i.e. ending with its last element.  */
+  if (slidedown && maybe_ne (d->perm[pivot - 1], vlen - 1))
+    return false;
+
+  /* Success!  */
+  if (d->testing_p)
+    return true;
+
+  /* PIVOT is the index at which OP1's part of the permutation begins.
+     For a slideup it equals the amount to slide OP1 up by, i.e. the
+     number of OP0's elements that stay.  For a slidedown it is the
+     length of OP0's high part, while VLEN - PIVOT is the slide amount.  */
+  int slide_cnt = slideup ? pivot : vlen - pivot;
+  insn_code icode;
+  if (slideup)
+    {
+      /* No need for a vector length because we slide up until the
+        end of OP1 anyway.  */
+      rtx ops[] = {d->target, d->op0, d->op1, gen_int_mode (slide_cnt, Pmode)};
+      icode = code_for_pred_slide (UNSPEC_VSLIDEUP, vmode);
+      emit_vlmax_insn (icode, SLIDEUP_OP_MERGE, ops);
+    }
+  else
+    {
+      /* Here we need a length because we slide to the beginning of OP1
+        leaving the remaining elements undisturbed.  */
+      int len = pivot;
+      rtx ops[] = {d->target, d->op1, d->op0,
+                  gen_int_mode (slide_cnt, Pmode)};
+      icode = code_for_pred_slide (UNSPEC_VSLIDEDOWN, vmode);
+      emit_nonvlmax_insn (icode, BINARY_OP_TUMA, ops,
+                         gen_int_mode (len, Pmode));
+    }
+
+  return true;
+}
+
 /* Recognize decompress patterns:
 
    1. VEC_PERM_EXPR op0 and op1
@@ -3709,6 +3806,8 @@ expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
            return true;
          if (shuffle_consecutive_patterns (d))
            return true;
+         if (shuffle_slide_patterns (d))
+           return true;
          if (shuffle_compress_patterns (d))
            return true;
          if (shuffle_decompress_patterns (d))
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/shuffle-slide-run1.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/shuffle-slide-run1.c
new file mode 100644
index 00000000000..17e68caad21
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/shuffle-slide-run1.c
@@ -0,0 +1,81 @@
+/* { dg-do run } */
+/* { dg-require-effective-target rvv_zvl512b_ok } */
+
+#include "shuffle-slide1.c"
+
+#define comp(a, b, n) \
+  for (unsigned i = 0; i < n; ++i) \
+    if ((a)[i] != (b)[i]) \
+      __builtin_abort ();
+
+int main ()
+{
+  a4 = (v4si) { 0, 1, 2, 3 };
+  b4 = (v4si) { 4, 5, 6, 7 };
+  a8 = (v8si) { 0, 1, 2, 3, 4, 5, 6, 7 };
+  b8 = (v8si) { 8, 9, 10, 11, 12, 13, 14, 15 };
+  a16 = (v16si) { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 };
+  b16 = (v16si) { 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30,
+                  31};
+
+  foo4u ();
+  comp (res4, ((v4si) { 0, 1, 4, 5 }), 4);
+
+  foo4u1 ();
+  comp (res4, ((v4si) { 0, 1, 2, 4 }), 4);
+
+  foo4u3 ();
+  comp (res4, ((v4si) { 0, 4, 5, 6 }), 4);
+
+  foo4d ();
+  comp (res4, ((v4si) { 2, 3, 6, 7 }), 4);
+
+  foo4d1 ();
+  comp (res4, ((v4si) { 1, 2, 3, 7 }), 4);
+
+  foo4d3 ();
+  comp (res4, ((v4si) { 3, 5, 6, 7 }), 4);
+
+  foo8u ();
+  comp (res8, ((v8si) { 0, 1, 2, 3, 8, 9, 10, 11 }), 8);
+
+  foo8u1 ();
+  comp (res8, ((v8si) { 0, 1, 2, 3, 4, 5, 6, 8 }), 8);
+
+  foo8u3 ();
+  comp (res8, ((v8si) { 0, 8, 9, 10, 11, 12, 13, 14 }), 8);
+
+  foo8d ();
+  comp (res8, ((v8si) { 4, 5, 6, 7, 12, 13, 14, 15 }), 8);
+
+  foo8d1 ();
+  comp (res8, ((v8si) { 1, 2, 3, 4, 5, 6, 7, 15 }), 8);
+
+  foo8d3 ();
+  comp (res8, ((v8si) { 7, 9, 10, 11, 12, 13, 14, 15 }), 8);
+
+  foo16u ();
+  comp (res16, ((v16si) { 0, 1, 2, 3, 4, 5, 6, 7,
+                          16, 17, 18, 19, 20, 21, 22, 23 }), 16);
+
+  foo16u1 ();
+  comp (res16, ((v16si) { 0, 1, 2, 3, 4, 5, 6, 7,
+                          8, 9, 10, 11, 12, 13, 14, 16 }), 16);
+
+  foo16u3 ();
+  comp (res16, ((v16si) { 0, 16, 17, 18, 19, 20, 21, 22,
+                          23, 24, 25, 26, 27, 28, 29, 30 }), 16);
+
+  foo16d ();
+  comp (res16, ((v16si) { 8, 9, 10, 11, 12, 13, 14, 15,
+                          24, 25, 26, 27, 28, 29, 30, 31 }), 16);
+
+  foo16d1 ();
+  comp (res16, ((v16si) { 1, 2, 3, 4, 5, 6, 7, 8, 9,
+                          10, 11, 12, 13, 14, 15, 31 }), 16);
+
+  foo16d3 ();
+  comp (res16, ((v16si) { 15, 17, 18, 19, 20, 21, 22, 23,
+                          24, 25, 26, 27, 28, 29, 30, 31 }), 16);
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/shuffle-slide1.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/shuffle-slide1.c
new file mode 100644
index 00000000000..4aa954245dc
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/shuffle-slide1.c
@@ -0,0 +1,137 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=rv64gcv_zvl512b" } */
+
+typedef int v4si __attribute__((vector_size(4 * sizeof (int))));
+typedef int v8si __attribute__((vector_size(8 * sizeof (int))));
+typedef int v16si __attribute__((vector_size(16 * sizeof (int))));
+
+v4si res4, a4, b4;
+v8si res8, a8, b8;
+v16si res16, a16, b16;
+
+void __attribute__((noipa))
+foo4u (void)
+{
+  res4 = __builtin_shufflevector (a4, b4, 0, 1, 4, 5);
+}
+
+void __attribute__((noipa))
+foo4u1 (void)
+{
+  res4 = __builtin_shufflevector (a4, b4, 0, 1, 2, 4);
+}
+
+void __attribute__((noipa))
+foo4u3 (void)
+{
+  res4 = __builtin_shufflevector (a4, b4, 0, 4, 5, 6);
+}
+
+void __attribute__((noipa))
+foo4d (void)
+{
+  res4 = __builtin_shufflevector (a4, b4, 2, 3, 6, 7);
+}
+
+void __attribute__((noipa))
+foo4d1 (void)
+{
+  res4 = __builtin_shufflevector (a4, b4, 1, 2, 3, 7);
+}
+
+void __attribute__((noipa))
+foo4d3 (void)
+{
+  res4 = __builtin_shufflevector (a4, b4, 3, 5, 6, 7);
+}
+
+void __attribute__((noipa))
+foo8u (void)
+{
+  res8 = __builtin_shufflevector (a8, b8, 0, 1, 2, 3, 8, 9, 10, 11);
+}
+
+void __attribute__((noipa))
+foo8u1 (void)
+{
+  res8 = __builtin_shufflevector (a8, b8, 0, 1, 2, 3, 4, 5, 6, 8);
+}
+
+void __attribute__((noipa))
+foo8u3 (void)
+{
+  res8 = __builtin_shufflevector (a8, b8, 0, 8, 9, 10, 11, 12, 13, 14);
+}
+
+void __attribute__((noipa))
+foo8d (void)
+{
+  res8 = __builtin_shufflevector (a8, b8, 4, 5, 6, 7, 12, 13, 14, 15);
+}
+
+void __attribute__((noipa))
+foo8d1 (void)
+{
+  res8 = __builtin_shufflevector (a8, b8, 1, 2, 3, 4, 5, 6, 7, 15);
+}
+
+void __attribute__((noipa))
+foo8d3 (void)
+{
+  res8 = __builtin_shufflevector (a8, b8, 7, 9, 10, 11, 12, 13, 14, 15);
+}
+
+void __attribute__((noipa))
+foo16u (void)
+{
+  res16 = __builtin_shufflevector (a16, b16,
+                                   0, 1, 2, 3, 4, 5, 6, 7,
+                                   16, 17, 18, 19, 20, 21, 22, 23);
+}
+
+void __attribute__((noipa))
+foo16u1 (void)
+{
+  res16 = __builtin_shufflevector (a16, b16,
+                                   0, 1, 2, 3, 4, 5, 6, 7,
+                                   8, 9, 10, 11, 12, 13, 14,
+                                   16);
+}
+
+void __attribute__((noipa))
+foo16u3 (void)
+{
+  res16 = __builtin_shufflevector (a16, b16,
+                                   0,
+                                   16, 17, 18, 19, 20, 21, 22,
+                                   23, 24, 25, 26, 27, 28, 29, 30);
+}
+
+void __attribute__((noipa))
+foo16d (void)
+{
+  res16 = __builtin_shufflevector (a16, b16,
+                                   8, 9, 10, 11, 12, 13, 14, 15,
+                                   24, 25, 26, 27, 28, 29, 30, 31);
+}
+
+void __attribute__((noipa))
+foo16d1 (void)
+{
+  res16 = __builtin_shufflevector (a16, b16,
+                                   1, 2, 3, 4, 5, 6, 7, 8, 9,
+                                   10, 11, 12, 13, 14, 15,
+                                   31);
+}
+
+void __attribute__((noipa))
+foo16d3 (void)
+{
+  res16 = __builtin_shufflevector (a16, b16,
+                                   15,
+                                   17, 18, 19, 20, 21, 22, 23,
+                                   24, 25, 26, 27, 28, 29, 30, 31);
+}
+
+/* { dg-final { scan-assembler-times "vslideup" 9 } } */
+/* { dg-final { scan-assembler-times "vslidedown" 9 } } */
-- 
2.47.0
