+shuffle_evenodd_patterns (struct expand_vec_perm_d *d)

I'd prefer renaming it to shuffle_even_odd_patterns.
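
For readers following along, here is a small illustrative example (not part of the patch; the type and function names are made up) of the even-element permute the new pattern recognizes.  Per the comment in the patch, such a permute should now expand to two vcompress plus one vslideup rather than falling back to a generic permute:

/* Illustrative only: an even-element extract over two vectors,
   i.e. the selector { 0, 2, 4, 6 } from the function comment.  */
typedef int v4si __attribute__ ((vector_size (16)));

void
even_extract (v4si a, v4si b, v4si *out)
{
  /* Elements 0 and 2 of A, then elements 0 and 2 of B
     (indices 4 and 6 of the concatenated vector).  */
  *out = __builtin_shufflevector (a, b, 0, 2, 4, 6);
}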


juzhe.zh...@rivai.ai
 
From: Robin Dapp
Date: 2024-11-17 20:53
To: gcc-patches
CC: palmer; kito.cheng; juzhe.zhong; jeffreyalaw; pan2.li; rdapp.gcc
Subject: [PATCH 3/4] RISC-V: Add even/odd vec_perm_const pattern.
From: Robin Dapp <rd...@ventanamicro.com>
 
This adds handling for even/odd patterns.
 
gcc/ChangeLog:
 
* config/riscv/riscv-v.cc (shuffle_evenodd_patterns): New
function.
(expand_vec_perm_const_1): Use new function.
 
gcc/testsuite/ChangeLog:
 
* gcc.target/riscv/rvv/autovec/vls-vlmax/shuffle-evenodd-run.c: New test.
* gcc.target/riscv/rvv/autovec/vls-vlmax/shuffle-evenodd.c: New test.
---
gcc/config/riscv/riscv-v.cc                   |  66 ++++++++++
.../autovec/vls-vlmax/shuffle-evenodd-run.c   | 122 ++++++++++++++++++
.../rvv/autovec/vls-vlmax/shuffle-evenodd.c   |  68 ++++++++++
3 files changed, 256 insertions(+)
create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/shuffle-evenodd-run.c
create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/shuffle-evenodd.c
 
diff --git a/gcc/config/riscv/riscv-v.cc b/gcc/config/riscv/riscv-v.cc
index d940b961bf8..4fb032af953 100644
--- a/gcc/config/riscv/riscv-v.cc
+++ b/gcc/config/riscv/riscv-v.cc
@@ -3552,6 +3552,70 @@ shuffle_interleave_patterns (struct expand_vec_perm_d *d)
   return true;
}
+
+/* Recognize even/odd patterns like [0 2 4 6].  We use two compress
+   and one slideup.  */
+
+static bool
+shuffle_evenodd_patterns (struct expand_vec_perm_d *d)
+{
+  machine_mode vmode = d->vmode;
+  poly_int64 vec_len = d->perm.length ();
+  int n_patterns = d->perm.encoding ().npatterns ();
+
+  if (n_patterns != 1)
+    return false;
+
+  if (!vec_len.is_constant ())
+    return false;
+
+  int vlen = vec_len.to_constant ();
+  if (vlen < 4 || vlen > 64)
+    return false;
+
+  if (d->one_vector_p)
+    return false;
+
+  bool even = true;
+  if (!d->perm.series_p (0, 1, 0, 2))
+    {
+      even = false;
+      if (!d->perm.series_p (0, 1, 1, 2))
+        return false;
+    }
+
+  /* Success!  */
+  if (d->testing_p)
+    return true;
+
+  machine_mode mask_mode = get_mask_mode (vmode);
+  rvv_builder builder (mask_mode, vlen, 1);
+  int bit = even ? 0 : 1;
+  for (int i = 0; i < vlen; i++)
+    {
+      bit ^= 1;
+      if (bit)
+        builder.quick_push (CONST1_RTX (BImode));
+      else
+        builder.quick_push (CONST0_RTX (BImode));
+    }
+  rtx mask = force_reg (mask_mode, builder.build ());
+
+  insn_code icode = code_for_pred_compress (vmode);
+  rtx ops1[] = {d->target, d->op0, mask};
+  emit_vlmax_insn (icode, COMPRESS_OP, ops1);
+
+  rtx tmp2 = gen_reg_rtx (vmode);
+  rtx ops2[] = {tmp2, d->op1, mask};
+  emit_vlmax_insn (icode, COMPRESS_OP, ops2);
+
+  rtx ops[] = {d->target, d->target, tmp2, gen_int_mode (vlen / 2, Pmode)};
+  icode = code_for_pred_slide (UNSPEC_VSLIDEUP, vmode);
+  emit_vlmax_insn (icode, SLIDEUP_OP_MERGE, ops);
+
+  return true;
+}
+
/* Recognize decompress patterns:
    1. VEC_PERM_EXPR op0 and op1
@@ -3870,6 +3934,8 @@ expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
    return true;
  if (shuffle_interleave_patterns (d))
    return true;
+   if (shuffle_evenodd_patterns (d))
+     return true;
  if (shuffle_compress_patterns (d))
    return true;
  if (shuffle_decompress_patterns (d))
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/shuffle-evenodd-run.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/shuffle-evenodd-run.c
new file mode 100644
index 00000000000..c0760e5ed30
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/shuffle-evenodd-run.c
@@ -0,0 +1,122 @@
+/* { dg-do run } */
+/* { dg-require-effective-target riscv_v_ok } */
+/* { dg-add-options riscv_v } */
+/* { dg-additional-options "-O3 -mrvv-max-lmul=m8 -std=gnu99" } */
+
+#include "shuffle-evenodd.c"
+
+#define SERIES_2(x, y) (x), (x + 1)
+#define SERIES_4(x, y) SERIES_2 (x, y), SERIES_2 (x + 2, y)
+#define SERIES_8(x, y) SERIES_4 (x, y), SERIES_4 (x + 4, y)
+#define SERIES_16(x, y) SERIES_8 (x, y), SERIES_8 (x + 8, y)
+#define SERIES_32(x, y) SERIES_16 (x, y), SERIES_16 (x + 16, y)
+#define SERIES_64(x, y) SERIES_32 (x, y), SERIES_32 (x + 32, y)
+
+#define comp(a, b, n)                                                        \
+  for (unsigned i = 0; i < n; ++i)                                           \
+    if ((a)[i] != (b)[i])                                                    \
+      __builtin_abort ();
+
+#define CHECK1(TYPE, NUNITS)                                                 \
+  __attribute__ ((noipa)) void check1_##TYPE ()                              \
+  {                                                                          \
+    TYPE v0 = (TYPE){SERIES_##NUNITS (0, NUNITS)};                           \
+    TYPE v1 = (TYPE){SERIES_##NUNITS (NUNITS, NUNITS)};                      \
+    TYPE ref = (TYPE){MASKE_##NUNITS (0, NUNITS)};                           \
+    TYPE res;                                                                \
+    permute1_##TYPE (v0, v1, &res);                                          \
+    comp (res, ref, NUNITS);                                                 \
+  }
+
+#define CHECK2(TYPE, NUNITS)                                                 \
+  __attribute__ ((noipa)) void check2_##TYPE ()                              \
+  {                                                                          \
+    TYPE v0 = (TYPE){SERIES_##NUNITS (0, NUNITS)};                           \
+    TYPE v1 = (TYPE){SERIES_##NUNITS (NUNITS, NUNITS)};                      \
+    TYPE ref = (TYPE){MASKO_##NUNITS (0, NUNITS)};                           \
+    TYPE res;                                                                \
+    permute2_##TYPE (v0, v1, &res);                                          \
+    comp (res, ref, NUNITS);                                                 \
+  }
+
+#define CHECK_ALL(T)                                                         \
+  T (vnx4qi, 4)                                                              \
+  T (vnx8qi, 8)                                                              \
+  T (vnx16qi, 16)                                                            \
+  T (vnx32qi, 32)                                                            \
+  T (vnx64qi, 64)                                                            \
+  T (vnx4hi, 4)                                                              \
+  T (vnx8hi, 8)                                                              \
+  T (vnx16hi, 16)                                                            \
+  T (vnx32hi, 32)                                                            \
+  T (vnx64hi, 64)                                                            \
+  T (vnx4si, 4)                                                              \
+  T (vnx8si, 8)                                                              \
+  T (vnx16si, 16)                                                            \
+  T (vnx32si, 32)                                                            \
+  T (vnx4di, 4)                                                              \
+  T (vnx8di, 8)                                                              \
+  T (vnx16di, 16)                                                            \
+  T (vnx4sf, 4)                                                              \
+  T (vnx8sf, 8)                                                              \
+  T (vnx16sf, 16)                                                            \
+  T (vnx32sf, 32)                                                            \
+  T (vnx4df, 4)                                                              \
+  T (vnx8df, 8)                                                              \
+  T (vnx16df, 16)
+
+CHECK_ALL (CHECK1)
+CHECK_ALL (CHECK2)
+
+int
+main ()
+{
+  check1_vnx4qi ();
+  check1_vnx8qi ();
+  check1_vnx16qi ();
+  check1_vnx32qi ();
+  check1_vnx64qi ();
+  check1_vnx4hi ();
+  check1_vnx8hi ();
+  check1_vnx16hi ();
+  check1_vnx32hi ();
+  check1_vnx64hi ();
+  check1_vnx4si ();
+  check1_vnx8si ();
+  check1_vnx16si ();
+  check1_vnx32si ();
+  check1_vnx4di ();
+  check1_vnx8di ();
+  check1_vnx16di ();
+  check1_vnx4sf ();
+  check1_vnx8sf ();
+  check1_vnx16sf ();
+  check1_vnx32sf ();
+  check1_vnx4df ();
+  check1_vnx8df ();
+  check1_vnx16df ();
+  check2_vnx4qi ();
+  check2_vnx8qi ();
+  check2_vnx16qi ();
+  check2_vnx32qi ();
+  check2_vnx64qi ();
+  check2_vnx4hi ();
+  check2_vnx8hi ();
+  check2_vnx16hi ();
+  check2_vnx32hi ();
+  check2_vnx64hi ();
+  check2_vnx4si ();
+  check2_vnx8si ();
+  check2_vnx16si ();
+  check2_vnx32si ();
+  check2_vnx4di ();
+  check2_vnx8di ();
+  check2_vnx16di ();
+  check2_vnx4sf ();
+  check2_vnx8sf ();
+  check2_vnx16sf ();
+  check2_vnx32sf ();
+  check2_vnx4df ();
+  check2_vnx8df ();
+  check2_vnx16df ();
+}
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/shuffle-evenodd.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/shuffle-evenodd.c
new file mode 100644
index 00000000000..21570d7986e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/shuffle-evenodd.c
@@ -0,0 +1,68 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=rv64gcv -mrvv-max-lmul=m8" } */
+
+#include "perm.h"
+
+#define MASKE_2(x, y) (x), (x + 2)
+#define MASKE_4(x, y) MASKE_2 (x, y), MASKE_2 (x + 4, y)
+#define MASKE_8(x, y) MASKE_4 (x, y), MASKE_4 (x + 8, y)
+#define MASKE_16(x, y) MASKE_8 (x, y), MASKE_8 (x + 16, y)
+#define MASKE_32(x, y) MASKE_16 (x, y), MASKE_16 (x + 32, y)
+#define MASKE_64(x, y) MASKE_32 (x, y), MASKE_32 (x + 64, y)
+
+#define MASKO_2(x, y) (x + 1), (x + 3)
+#define MASKO_4(x, y) MASKO_2 (x, y), MASKO_2 (x + 4, y)
+#define MASKO_8(x, y) MASKO_4 (x, y), MASKO_4 (x + 8, y)
+#define MASKO_16(x, y) MASKO_8 (x, y), MASKO_8 (x + 16, y)
+#define MASKO_32(x, y) MASKO_16 (x, y), MASKO_16 (x + 32, y)
+#define MASKO_64(x, y) MASKO_32 (x, y), MASKO_32 (x + 64, y)
+
+#define PERMUTE1(TYPE, NUNITS)                                               \
+  __attribute__ ((noipa)) void permute1_##TYPE (TYPE values1, TYPE values2,  \
+                                                TYPE *out)                   \
+  {                                                                          \
+    TYPE v = __builtin_shufflevector (values1, values2,                      \
+                                      MASKE_##NUNITS (0, NUNITS));           \
+    *(TYPE *) out = v;                                                       \
+  }
+
+#define PERMUTE2(TYPE, NUNITS)                                               \
+  __attribute__ ((noipa)) void permute2_##TYPE (TYPE values1, TYPE values2,  \
+                                                TYPE *out)                   \
+  {                                                                          \
+    TYPE v = __builtin_shufflevector (values1, values2,                      \
+                                      MASKO_##NUNITS (0, NUNITS));           \
+    *(TYPE *) out = v;                                                       \
+  }
+
+#define TEST_ALL(T)                                                          \
+  T (vnx4qi, 4)                                                              \
+  T (vnx8qi, 8)                                                              \
+  T (vnx16qi, 16)                                                            \
+  T (vnx32qi, 32)                                                            \
+  T (vnx64qi, 64)                                                            \
+  T (vnx4hi, 4)                                                              \
+  T (vnx8hi, 8)                                                              \
+  T (vnx16hi, 16)                                                            \
+  T (vnx32hi, 32)                                                            \
+  T (vnx64hi, 64)                                                            \
+  T (vnx4si, 4)                                                              \
+  T (vnx8si, 8)                                                              \
+  T (vnx16si, 16)                                                            \
+  T (vnx32si, 32)                                                            \
+  T (vnx4di, 4)                                                              \
+  T (vnx8di, 8)                                                              \
+  T (vnx16di, 16)                                                            \
+  T (vnx4sf, 4)                                                              \
+  T (vnx8sf, 8)                                                              \
+  T (vnx16sf, 16)                                                            \
+  T (vnx32sf, 32)                                                            \
+  T (vnx4df, 4)                                                              \
+  T (vnx8df, 8)                                                              \
+  T (vnx16df, 16)
+
+TEST_ALL (PERMUTE1)
+TEST_ALL (PERMUTE2)
+
+/* { dg-final { scan-assembler-times "vslideup" 48 } } */
+/* { dg-final { scan-assembler-times "vcompress" 96 } } */
-- 
2.47.0
 
 
