From: Robin Dapp <rd...@ventanamicro.com>

This patch adds shuffle_slide_patterns to expand_vec_perm_const.  It
recognizes permutations like {0, 1, 4, 5} or {2, 3, 6, 7}, which can be
constructed by a slideup or slidedown of one of the vectors into the
other one.
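
As an aside for reviewers (not part of the patch), the recognition logic
boils down to the standalone C sketch below.  The name is_slide_perm and
the plain-int interface are made up for illustration; the in-tree code
operates on expand_vec_perm_d with poly_int64 indices and additionally
rejects single-operand permutations.

  #include <stdbool.h>

  /* Return true if PERM (VLEN indices into the concatenation of OP0 and
     OP1, i.e. values in [0, 2 * VLEN)) is a slide pattern.  On success
     set *UP to true for a slideup and false for a slidedown, and
     *SLIDE_CNT to the slide amount.  */

  bool
  is_slide_perm (const int *perm, int vlen, bool *up, int *slide_cnt)
  {
    if (vlen < 4)
      return false;

    /* A slideup keeps OP0's first element in place, a slidedown keeps
       OP1's last element; exactly one of the two must hold.  */
    bool slideup = perm[0] == 0;
    bool slidedown = perm[vlen - 1] == 2 * vlen - 1;
    if (slideup == slidedown)
      return false;

    /* The indices must increase by one everywhere except at a single
       pivot where they switch from OP0's to OP1's elements.  */
    int pivot = -1;
    for (int i = 0; i < vlen; i++)
      {
        if (pivot == -1 && perm[i] >= vlen)
          pivot = i;
        if (i > 0 && i != pivot && perm[i] != perm[i - 1] + 1)
          return false;
      }
    if (pivot == -1)
      return false;

    /* For a slideup OP1 must contribute a low part starting at its
       first element.  */
    if (slideup && perm[pivot] != vlen)
      return false;

    /* For a slidedown OP0 must contribute a high part ending at its
       last element; a permutation selecting only OP1 (pivot == 0) is
       not a slide between two vectors.  */
    if (slidedown && (pivot == 0 || perm[pivot - 1] != vlen - 1))
      return false;

    *up = slideup;
    *slide_cnt = slideup ? pivot : vlen - pivot;
    return true;
  }

For {0, 1, 4, 5} this reports a slideup with slide_cnt == 2; for
{3, 5, 6, 7} a slidedown with slide_cnt == 3 (matching foo4u and foo4d3
in the new tests).
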
gcc/ChangeLog:

	* config/riscv/riscv-v.cc (shuffle_slide_patterns): New.
	(expand_vec_perm_const_1): Call new function.

gcc/testsuite/ChangeLog:

	* gcc.target/riscv/rvv/autovec/vls-vlmax/shuffle-slide-run1.c: New test.
	* gcc.target/riscv/rvv/autovec/vls-vlmax/shuffle-slide1.c: New test.
---
 gcc/config/riscv/riscv-v.cc                   |  99 +++++++++++++
 .../autovec/vls-vlmax/shuffle-slide-run1.c    |  81 +++++++++++
 .../rvv/autovec/vls-vlmax/shuffle-slide1.c    | 137 ++++++++++++++++++
 3 files changed, 317 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/shuffle-slide-run1.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/shuffle-slide1.c

diff --git a/gcc/config/riscv/riscv-v.cc b/gcc/config/riscv/riscv-v.cc
index ee7a0128c0e..deb2bdb4247 100644
--- a/gcc/config/riscv/riscv-v.cc
+++ b/gcc/config/riscv/riscv-v.cc
@@ -3395,6 +3395,103 @@ shuffle_compress_patterns (struct expand_vec_perm_d *d)
   return true;
 }
 
+/* Recognize patterns like [4 5 6 7 12 13 14 15] where either the lower
+   or the higher parts of both vectors are combined into one.  */
+
+static bool
+shuffle_slide_patterns (struct expand_vec_perm_d *d)
+{
+  machine_mode vmode = d->vmode;
+  poly_int64 vec_len = d->perm.length ();
+
+  if (!vec_len.is_constant ())
+    return false;
+
+  int vlen = vec_len.to_constant ();
+  if (vlen < 4)
+    return false;
+
+  if (d->one_vector_p)
+    return false;
+
+  /* For a slideup OP0 can stay, for a slidedown OP1 can.
+     The former requires that the first element of the permutation
+     is the first element of OP0, the latter that the last permutation
+     element is the last element of OP1.  */
+  bool slideup = false;
+  bool slidedown = false;
+
+  /* For a slideup the permutation must start at OP0's first element.  */
+  if (known_eq (d->perm[0], 0))
+    slideup = true;
+
+  /* For a slidedown the permutation must end at OP1's last element.  */
+  if (known_eq (d->perm[vlen - 1], 2 * vlen - 1))
+    slidedown = true;
+
+  if (slideup && slidedown)
+    return false;
+
+  if (!slideup && !slidedown)
+    return false;
+
+  /* Check for a monotonic sequence with one pivot.  */
+  int pivot = -1;
+  for (int i = 0; i < vlen; i++)
+    {
+      if (pivot == -1 && known_ge (d->perm[i], vec_len))
+	pivot = i;
+      if (i > 0 && i != pivot
+	  && maybe_ne (d->perm[i], d->perm[i - 1] + 1))
+	return false;
+    }
+
+  if (pivot == -1)
+    return false;
+
+  /* For a slideup OP1's part (to be slid up) must be a low part,
+     i.e. starting with its first element.  */
+  if (slideup && maybe_ne (d->perm[pivot], vlen))
+    return false;
+
+  /* For a slidedown OP0's part (to be slid down) must be a high part,
+     i.e. ending with its last element.  */
+  if (slidedown && maybe_ne (d->perm[pivot - 1], vlen - 1))
+    return false;
+
+  /* Success!  */
+  if (d->testing_p)
+    return true;
+
+  /* PIVOT is the index of the first permutation element selecting OP1,
+     i.e. the number of result elements taken from OP0.  For a slideup
+     it is the amount OP1 is slid up by.  For a slidedown VLEN - PIVOT
+     is the amount OP0 is slid down by and PIVOT the vector length used.  */
+  int slide_cnt = slideup ? pivot : vlen - pivot;
+  insn_code icode;
+  if (slideup)
+    {
+      /* No need for a vector length because we slide up until the
+	 end of OP1 anyway.  */
+      rtx ops[] = {d->target, d->op0, d->op1, gen_int_mode (slide_cnt, Pmode)};
+      icode = code_for_pred_slide (UNSPEC_VSLIDEUP, vmode);
+      emit_vlmax_insn (icode, SLIDEUP_OP_MERGE, ops);
+    }
+  else
+    {
+      /* Here we need a length because we slide to the beginning of OP1
+	 leaving the remaining elements undisturbed.  */
+      int len = pivot;
+      rtx ops[] = {d->target, d->op1, d->op0,
+		   gen_int_mode (slide_cnt, Pmode)};
+      icode = code_for_pred_slide (UNSPEC_VSLIDEDOWN, vmode);
+      emit_nonvlmax_insn (icode, BINARY_OP_TUMA, ops,
+			  gen_int_mode (len, Pmode));
+    }
+
+  return true;
+}
+
 /* Recognize decompress patterns:
 
    1. VEC_PERM_EXPR op0 and op1
@@ -3709,6 +3806,8 @@ expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
     return true;
   if (shuffle_consecutive_patterns (d))
     return true;
+  if (shuffle_slide_patterns (d))
+    return true;
   if (shuffle_compress_patterns (d))
     return true;
   if (shuffle_decompress_patterns (d))
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/shuffle-slide-run1.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/shuffle-slide-run1.c
new file mode 100644
index 00000000000..17e68caad21
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/shuffle-slide-run1.c
@@ -0,0 +1,81 @@
+/* { dg-do run } */
+/* { dg-require-effective-target rvv_zvl512b_ok } */
+
+#include "shuffle-slide1.c"
+
+#define comp(a, b, n)			\
+  for (unsigned i = 0; i < n; ++i)	\
+    if ((a)[i] != (b)[i])		\
+      __builtin_abort ();
+
+int main()
+{
+  a4 = (v4si) { 0, 1, 2, 3 };
+  b4 = (v4si) { 4, 5, 6, 7 };
+  a8 = (v8si) { 0, 1, 2, 3, 4, 5, 6, 7 };
+  b8 = (v8si) { 8, 9, 10, 11, 12, 13, 14, 15 };
+  a16 = (v16si) { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 };
+  b16 = (v16si) { 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30,
+		  31};
+
+  foo4u ();
+  comp (res4, ((v4si) { 0, 1, 4, 5 }), 4);
+
+  foo4u1 ();
+  comp (res4, ((v4si) { 0, 1, 2, 4 }), 4);
+
+  foo4u3 ();
+  comp (res4, ((v4si) { 0, 4, 5, 6}), 4);
+
+  foo4d ();
+  comp (res4, ((v4si) { 2, 3, 6, 7 }), 4);
+
+  foo4d1 ();
+  comp (res4, ((v4si) { 1, 2, 3, 7 }), 4);
+
+  foo4d3 ();
+  comp (res4, ((v4si) { 3, 5, 6, 7 }), 4);
+
+  foo8u ();
+  comp (res8, ((v8si) { 0, 1, 2, 3, 8, 9, 10, 11 }), 8);
+
+  foo8u1 ();
+  comp (res8, ((v8si) { 0, 1, 2, 3, 4, 5, 6, 8 }), 8);
+
+  foo8u3 ();
+  comp (res8, ((v8si) { 0, 8, 9, 10, 11, 12, 13, 14 }), 8);
+
+  foo8d ();
+  comp (res8, ((v8si) { 4, 5, 6, 7, 12, 13, 14, 15 }), 8);
+
+  foo8d1 ();
+  comp (res8, ((v8si) { 1, 2, 3, 4, 5, 6, 7, 15 }), 8);
+
+  foo8d3 ();
+  comp (res8, ((v8si) { 7, 9, 10, 11, 12, 13, 14, 15 }), 8);
+
+  foo16u ();
+  comp (res16, ((v16si) { 0, 1, 2, 3, 4, 5, 6, 7,
+			  16, 17, 18, 19, 20, 21, 22, 23 }), 16);
+
+  foo16u1 ();
+  comp (res16, ((v16si) { 0, 1, 2, 3, 4, 5, 6, 7,
+			  8, 9, 10, 11, 12, 13, 14, 16 } ), 16);
+
+  foo16u3 ();
+  comp (res16, ((v16si) { 0, 16, 17, 18, 19, 20, 21, 22,
+			  23, 24, 25, 26, 27, 28, 29, 30 }), 16);
+
+  foo16d ();
+  comp (res16, ((v16si) { 8, 9, 10, 11, 12, 13, 14, 15,
+			  24, 25, 26, 27, 28, 29, 30, 31 }), 16);
+
+  foo16d1 ();
+  comp (res16, ((v16si) { 1, 2, 3, 4, 5, 6, 7, 8, 9,
+			  10, 11, 12, 13, 14, 15, 31 }), 16);
+
+  foo16d3 ();
+  comp (res16, ((v16si) { 15, 17, 18, 19, 20, 21, 22, 23,
+			  24, 25, 26, 27, 28, 29, 30, 31 }), 16);
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/shuffle-slide1.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/shuffle-slide1.c
new file mode 100644
index 00000000000..4aa954245dc
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/shuffle-slide1.c
@@ -0,0 +1,137 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=rv64gcv_zvl512b" } */
+
+typedef int v4si __attribute__((vector_size(4 * sizeof (int))));
+typedef int v8si __attribute__((vector_size(8 * sizeof (int))));
+typedef int v16si __attribute__((vector_size(16 * sizeof (int))));
+
+v4si res4, a4, b4;
+v8si res8, a8, b8;
+v16si res16, a16, b16;
+
+void __attribute__((noipa))
+foo4u (void)
+{
+  res4 = __builtin_shufflevector (a4, b4, 0, 1, 4, 5);
+}
+
+void __attribute__((noipa))
+foo4u1 (void)
+{
+  res4 = __builtin_shufflevector (a4, b4, 0, 1, 2, 4);
+}
+
+void __attribute__((noipa))
+foo4u3 (void)
+{
+  res4 = __builtin_shufflevector (a4, b4, 0, 4, 5, 6);
+}
+
+void __attribute__((noipa))
+foo4d (void)
+{
+  res4 = __builtin_shufflevector (a4, b4, 2, 3, 6, 7);
+}
+
+void __attribute__((noipa))
+foo4d1 (void)
+{
+  res4 = __builtin_shufflevector (a4, b4, 1, 2, 3, 7);
+}
+
+void __attribute__((noipa))
+foo4d3 (void)
+{
+  res4 = __builtin_shufflevector (a4, b4, 3, 5, 6, 7);
+}
+
+void __attribute__((noipa))
+foo8u (void)
+{
+  res8 = __builtin_shufflevector (a8, b8, 0, 1, 2, 3, 8, 9, 10, 11);
+}
+
+void __attribute__((noipa))
+foo8u1 (void)
+{
+  res8 = __builtin_shufflevector (a8, b8, 0, 1, 2, 3, 4, 5, 6, 8);
+}
+
+void __attribute__((noipa))
+foo8u3 (void)
+{
+  res8 = __builtin_shufflevector (a8, b8, 0, 8, 9, 10, 11, 12, 13, 14);
+}
+
+void __attribute__((noipa))
+foo8d (void)
+{
+  res8 = __builtin_shufflevector (a8, b8, 4, 5, 6, 7, 12, 13, 14, 15);
+}
+
+void __attribute__((noipa))
+foo8d1 (void)
+{
+  res8 = __builtin_shufflevector (a8, b8, 1, 2, 3, 4, 5, 6, 7, 15);
+}
+
+void __attribute__((noipa))
+foo8d3 (void)
+{
+  res8 = __builtin_shufflevector (a8, b8, 7, 9, 10, 11, 12, 13, 14, 15);
+}
+
+void __attribute__((noipa))
+foo16u (void)
+{
+  res16 = __builtin_shufflevector (a16, b16,
+				   0, 1, 2, 3, 4, 5, 6, 7,
+				   16, 17, 18, 19, 20, 21, 22, 23);
+}
+
+void __attribute__((noipa))
+foo16u1 (void)
+{
+  res16 = __builtin_shufflevector (a16, b16,
+				   0, 1, 2, 3, 4, 5, 6, 7,
+				   8, 9, 10, 11, 12, 13, 14,
+				   16);
+}
+
+void __attribute__((noipa))
+foo16u3 (void)
+{
+  res16 = __builtin_shufflevector (a16, b16,
+				   0,
+				   16, 17, 18, 19, 20, 21, 22,
+				   23, 24, 25, 26, 27, 28, 29, 30);
+}
+
+void __attribute__((noipa))
+foo16d (void)
+{
+  res16 = __builtin_shufflevector (a16, b16,
+				   8, 9, 10, 11, 12, 13, 14, 15,
+				   24, 25, 26, 27, 28, 29, 30, 31);
+}
+
+void __attribute__((noipa))
+foo16d1 (void)
+{
+  res16 = __builtin_shufflevector (a16, b16,
+				   1, 2, 3, 4, 5, 6, 7, 8, 9,
+				   10, 11, 12, 13, 14, 15,
+				   31);
+}
+
+void __attribute__((noipa))
+foo16d3 (void)
+{
+  res16 = __builtin_shufflevector (a16, b16,
+				   15,
+				   17, 18, 19, 20, 21, 22, 23,
+				   24, 25, 26, 27, 28, 29, 30, 31);
+}
+
+/* { dg-final { scan-assembler-times "vslideup" 9 } } */
+/* { dg-final { scan-assembler-times "vslidedown" 9 } } */
-- 
2.47.0