x86 has shufps, which selects two elements from the first operand into the lower 64 bits of the result and two elements from the second operand into the upper 64 bits. For __builtin_shufflevector (op0, op1, 1, 4, 3, 6), the permutation is currently vec-lowered because can_vec_perm_const_p returns false on an SSE2 target. This patch adds a new function to support 2-operand v4si/v4sf vector shuffles with arbitrary indices on SSE2.
Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}. Ok for trunk? 2022-09-23 Hongtao Liu <hongtao....@intel.com> Liwei Xu <liwei...@intel.com> gcc/ChangeLog: PR target/53346 * config/i386/i386-expand.cc (expand_vec_perm_shufps_shufps): New function. (ix86_expand_vec_perm_const_1): Insert expand_vec_perm_shufps_shufps at the end of 2-instruction expand sequence. gcc/testsuite/ChangeLog: * gcc.target/i386/pr53346-1.c: New test. * gcc.target/i386/pr53346-2.c: New test. --- gcc/config/i386/i386-expand.cc | 117 ++++++++++++++++++++++ gcc/testsuite/gcc.target/i386/pr53346-1.c | 70 +++++++++++++ gcc/testsuite/gcc.target/i386/pr53346-2.c | 59 +++++++++++ gcc/testsuite/gcc.target/i386/pr53346-3.c | 69 +++++++++++++ gcc/testsuite/gcc.target/i386/pr53346-4.c | 59 +++++++++++ 5 files changed, 374 insertions(+) create mode 100644 gcc/testsuite/gcc.target/i386/pr53346-1.c create mode 100644 gcc/testsuite/gcc.target/i386/pr53346-2.c create mode 100644 gcc/testsuite/gcc.target/i386/pr53346-3.c create mode 100644 gcc/testsuite/gcc.target/i386/pr53346-4.c diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc index 5334363e235..43c58111a62 100644 --- a/gcc/config/i386/i386-expand.cc +++ b/gcc/config/i386/i386-expand.cc @@ -19604,6 +19604,120 @@ expand_vec_perm_1 (struct expand_vec_perm_d *d) return false; } +/* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D + in terms of a pair of shufps+ shufps/pshufd instructions. */ +static bool +expand_vec_perm_shufps_shufps (struct expand_vec_perm_d *d) +{ + unsigned char perm1[4]; + machine_mode vmode = d->vmode; + bool ok; + unsigned i, j, k, count = 0; + + if (d->one_operand_p + || (vmode != V4SImode && vmode != V4SFmode)) + return false; + + if (d->testing_p) + return true; + + for (i = 0; i < 4; ++i) + count += d->perm[i] > 3 ? 1 : 0; + + gcc_assert(count & 3); + + rtx tmp = gen_reg_rtx (vmode); + /* 2 from op0 and 2 from op1. 
*/ + if (count == 2) + { + unsigned char perm2[4]; + for (i = 0, j = 0, k = 2; i < 4; ++i) + if (d->perm[i] & 4) + { + perm1[k++] = d->perm[i]; + perm2[i] = k - 1; + } + else + { + perm1[j++] = d->perm[i]; + perm2[i] = j - 1; + } + + /* shufps. */ + ok = expand_vselect_vconcat(tmp, d->op0, d->op1, + perm1, d->nelt, false); + gcc_assert (ok); + if (vmode == V4SImode && TARGET_SSE2) + /* pshufd. */ + ok = expand_vselect (d->target, tmp, + perm2, d->nelt, false); + else + { + /* shufps. */ + perm2[2] += 4; + perm2[3] += 4; + ok = expand_vselect_vconcat (d->target, tmp, tmp, + perm2, d->nelt, false); + } + gcc_assert (ok); + } + /* 3 from one op and 1 from another. */ + else + { + unsigned pair_idx = 8, lone_idx = 8, shift; + + /* Find the lone index. */ + for (i = 0; i < 4; ++i) + if ((d->perm[i] > 3 && count == 1) + || (d->perm[i] < 4 && count == 3)) + lone_idx = i; + + /* When lone_idx is not 0, it must from second op(count == 1). */ + gcc_assert ((lone_idx == 0 && count == 3) + || (lone_idx != 0 && count == 1)); + + /* Find the pair index that sits in the same half as the lone index. */ + shift = lone_idx & 2; + pair_idx = 1 - lone_idx + 2 * shift; + + /* First permutate lone index and pair index into the same vector as + [ lone, lone, pair, pair ]. */ + perm1[1] = perm1[0] + = (count == 3) ? d->perm[lone_idx] : d->perm[lone_idx] - 4; + perm1[3] = perm1[2] + = (count == 3) ? d->perm[pair_idx] : d->perm[pair_idx] + 4; + + /* Alway put the vector contains lone indx at the first. */ + if (count == 1) + std::swap (d->op0, d->op1); + + /* shufps. */ + ok = expand_vselect_vconcat(tmp, d->op0, d->op1, + perm1, d->nelt, false); + gcc_assert (ok); + + /* Refine lone and pair index to original order. */ + perm1[shift] = lone_idx << 1; + perm1[shift + 1] = pair_idx << 1; + + /* Select the remaining 2 elements in another vector. */ + for (i = 2 - shift; i < 4 - shift; ++i) + perm1[i] = (lone_idx == 1) ? (d->perm[i] + 4) : d->perm[i]; + + /* Adjust to original selector. 
*/ + if (lone_idx > 1) + std::swap (tmp, d->op1); + + /* shufps. */ + ok = expand_vselect_vconcat(d->target, tmp, d->op1, + perm1, d->nelt, false); + + gcc_assert (ok); + } + + return true; +} + /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D in terms of a pair of pshuflw + pshufhw instructions. */ @@ -22152,6 +22266,9 @@ ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d) if (expand_vec_perm_2perm_pblendv (d, true)) return true; + if (expand_vec_perm_shufps_shufps (d)) + return true; + /* Try sequences of three instructions. */ if (expand_vec_perm_even_odd_pack (d)) diff --git a/gcc/testsuite/gcc.target/i386/pr53346-1.c b/gcc/testsuite/gcc.target/i386/pr53346-1.c new file mode 100644 index 00000000000..6d230da632c --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr53346-1.c @@ -0,0 +1,70 @@ +/* { dg-do compile } */ +/* { dg-options "-msse2 -O2 -mno-sse3" } */ +/* { dg-final { scan-assembler-times "shufps" 15 } } */ +/* { dg-final { scan-assembler-times "pshufd" 2 } } */ + +typedef int v4si __attribute__((vector_size(16))); + +v4si +__attribute__((noipa)) +foo (v4si a, v4si b) +{ + return __builtin_shufflevector (a, b, 1, 2, 5, 3); +} + +v4si +__attribute__((noipa)) +foo1 (v4si a, v4si b) +{ + return __builtin_shufflevector (a, b, 1, 5, 2, 3); +} + +v4si +__attribute__((noipa)) +foo2 (v4si a, v4si b) +{ + return __builtin_shufflevector (a, b, 1, 2, 3, 5); +} + +v4si +__attribute__((noipa)) +foo3 (v4si a, v4si b) +{ + return __builtin_shufflevector (a, b, 1, 4, 5, 6); +} + +v4si +__attribute__((noipa)) +foo4 (v4si a, v4si b) +{ + return __builtin_shufflevector (a, b, 3, 6, 7, 5); +} + +v4si +__attribute__((noipa)) +foo5 (v4si a, v4si b) +{ + return __builtin_shufflevector (a, b, 2, 4, 7, 6); +} + +v4si +__attribute__((noipa)) +foo6 (v4si a, v4si b) +{ + return __builtin_shufflevector (a, b, 2, 4, 3, 6); +} + +v4si +__attribute__((noipa)) +foo7 (v4si a, v4si b) +{ + return __builtin_shufflevector (a, b, 2, 3, 4, 6); +} + +v4si 
+__attribute__((noipa)) +foo8 (v4si a, v4si b) +{ + return __builtin_shufflevector (a, b, 2, 4, 6, 3); +} + diff --git a/gcc/testsuite/gcc.target/i386/pr53346-2.c b/gcc/testsuite/gcc.target/i386/pr53346-2.c new file mode 100644 index 00000000000..0c6c7b35e01 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr53346-2.c @@ -0,0 +1,59 @@ +/* { dg-do run } */ +/* { dg-options "-O2 -msse2" } */ +/* { dg-require-effective-target sse2 } */ + +#include "sse2-check.h" +#include "pr53346-1.c" + +static void +sse2_test () +{ + v4si a = __extension__(v4si) { 0, 1, 2, 3 }; + v4si b = __extension__(v4si) { 4, 5, 6, 7 }; + v4si exp = __extension__(v4si) { 1, 2, 5, 3 }; + v4si dest; + dest = foo (a, b); + if (__builtin_memcmp (&dest, &exp, 16)) + __builtin_abort (); + + exp = __extension__ (v4si) { 1, 5, 2, 3 }; + dest = foo1 (a, b); + if (__builtin_memcmp (&dest, &exp, 16)) + __builtin_abort (); + + exp = __extension__ (v4si) { 1, 2, 3, 5 }; + dest = foo2 (a, b); + if (__builtin_memcmp (&dest, &exp, 16)) + __builtin_abort (); + + exp = __extension__ (v4si) { 1, 4, 5, 6 }; + dest = foo3 (a, b); + if (__builtin_memcmp (&dest, &exp, 16)) + __builtin_abort (); + + exp = __extension__ (v4si) { 3, 6, 7, 5 }; + dest = foo4 (a, b); + if (__builtin_memcmp (&dest, &exp, 16)) + __builtin_abort (); + + exp = __extension__ (v4si) { 2, 4, 7, 6 }; + dest = foo5 (a, b); + if (__builtin_memcmp (&dest, &exp, 16)) + __builtin_abort (); + + exp = __extension__ (v4si) { 2, 4, 3, 6 }; + dest = foo6 (a, b); + if (__builtin_memcmp (&dest, &exp, 16)) + __builtin_abort (); + + exp = __extension__ (v4si) { 2, 3, 4, 6 }; + dest = foo7 (a, b); + if (__builtin_memcmp (&dest, &exp, 16)) + __builtin_abort (); + + exp = __extension__ (v4si) { 2, 4, 6, 3 }; + dest = foo8 (a, b); + if (__builtin_memcmp (&dest, &exp, 16)) + __builtin_abort (); + +} diff --git a/gcc/testsuite/gcc.target/i386/pr53346-3.c b/gcc/testsuite/gcc.target/i386/pr53346-3.c new file mode 100644 index 00000000000..0b204f6f210 --- /dev/null +++ 
b/gcc/testsuite/gcc.target/i386/pr53346-3.c @@ -0,0 +1,69 @@ +/* { dg-do compile } */ +/* { dg-options "-msse2 -O2 -mno-sse3" } */ +/* { dg-final { scan-assembler-times "shufps" 17 } } */ + +typedef float v4sf __attribute__((vector_size(16))); + +v4sf +__attribute__((noipa)) +foo (v4sf a, v4sf b) +{ + return __builtin_shufflevector (a, b, 1, 2, 5, 3); +} + +v4sf +__attribute__((noipa)) +foo1 (v4sf a, v4sf b) +{ + return __builtin_shufflevector (a, b, 1, 5, 2, 3); +} + +v4sf +__attribute__((noipa)) +foo2 (v4sf a, v4sf b) +{ + return __builtin_shufflevector (a, b, 1, 2, 3, 5); +} + +v4sf +__attribute__((noipa)) +foo3 (v4sf a, v4sf b) +{ + return __builtin_shufflevector (a, b, 1, 4, 5, 6); +} + +v4sf +__attribute__((noipa)) +foo4 (v4sf a, v4sf b) +{ + return __builtin_shufflevector (a, b, 3, 6, 7, 5); +} + +v4sf +__attribute__((noipa)) +foo5 (v4sf a, v4sf b) +{ + return __builtin_shufflevector (a, b, 2, 4, 7, 6); +} + +v4sf +__attribute__((noipa)) +foo6 (v4sf a, v4sf b) +{ + return __builtin_shufflevector (a, b, 2, 4, 3, 6); +} + +v4sf +__attribute__((noipa)) +foo7 (v4sf a, v4sf b) +{ + return __builtin_shufflevector (a, b, 2, 3, 4, 6); +} + +v4sf +__attribute__((noipa)) +foo8 (v4sf a, v4sf b) +{ + return __builtin_shufflevector (a, b, 2, 4, 6, 3); +} + diff --git a/gcc/testsuite/gcc.target/i386/pr53346-4.c b/gcc/testsuite/gcc.target/i386/pr53346-4.c new file mode 100644 index 00000000000..9e4e45bd584 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr53346-4.c @@ -0,0 +1,59 @@ +/* { dg-do run } */ +/* { dg-options "-O2 -msse2" } */ +/* { dg-require-effective-target sse2 } */ + +#include "sse2-check.h" +#include "pr53346-3.c" + +static void +sse2_test () +{ + v4sf a = __extension__(v4sf) { 0, 1, 2, 3 }; + v4sf b = __extension__(v4sf) { 4, 5, 6, 7 }; + v4sf exp = __extension__(v4sf) { 1, 2, 5, 3 }; + v4sf dest; + dest = foo (a, b); + if (__builtin_memcmp (&dest, &exp, 16)) + __builtin_abort (); + + exp = __extension__ (v4sf) { 1, 5, 2, 3 }; + dest = foo1 (a, b); + if 
(__builtin_memcmp (&dest, &exp, 16)) + __builtin_abort (); + + exp = __extension__ (v4sf) { 1, 2, 3, 5 }; + dest = foo2 (a, b); + if (__builtin_memcmp (&dest, &exp, 16)) + __builtin_abort (); + + exp = __extension__ (v4sf) { 1, 4, 5, 6 }; + dest = foo3 (a, b); + if (__builtin_memcmp (&dest, &exp, 16)) + __builtin_abort (); + + exp = __extension__ (v4sf) { 3, 6, 7, 5 }; + dest = foo4 (a, b); + if (__builtin_memcmp (&dest, &exp, 16)) + __builtin_abort (); + + exp = __extension__ (v4sf) { 2, 4, 7, 6 }; + dest = foo5 (a, b); + if (__builtin_memcmp (&dest, &exp, 16)) + __builtin_abort (); + + exp = __extension__ (v4sf) { 2, 4, 3, 6 }; + dest = foo6 (a, b); + if (__builtin_memcmp (&dest, &exp, 16)) + __builtin_abort (); + + exp = __extension__ (v4sf) { 2, 3, 4, 6 }; + dest = foo7 (a, b); + if (__builtin_memcmp (&dest, &exp, 16)) + __builtin_abort (); + + exp = __extension__ (v4sf) { 2, 4, 6, 3 }; + dest = foo8 (a, b); + if (__builtin_memcmp (&dest, &exp, 16)) + __builtin_abort (); + +} -- 2.27.0