Hi! As discussed in the PR, for some permutations we can get better code if we try to expand them as if they were permutations in a mode with the same vector size but a wider vector element. The first attempts to do this always had mixed results: lots of improvements, but also lots of pessimizations. This one, at least on gcc.dg/vshuf* with {-msse2,-msse4,-mavx,-mavx2,-mavx512f,-mavx512bw}, shows only improvements. It first tries the original permutation as a single insn; if that doesn't work, it tries the wider one as a single insn; and then, as a complete fallback when we don't have any expansion whatsoever, it tries the wider one too.
Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk? 2015-12-03 Jakub Jelinek <ja...@redhat.com> PR target/68655 * config/i386/i386.c (canonicalize_vector_int_perm): New function. (expand_vec_perm_1): Use it and recurse if everything else failed. Use nd.perm instead of perm2. (expand_vec_perm_even_odd_1): If testing_p, use gen_raw_REG instead of gen_lowpart for the target. (ix86_expand_vec_perm_const_1): Use canonicalize_vector_int_perm and recurse if everything else failed. * gcc.dg/torture/vshuf-4.inc (TESTS): Add one extra test. * gcc.dg/torture/vshuf-8.inc (TESTS): Add two extra tests. --- gcc/config/i386/i386.c.jj 2015-12-02 20:27:00.000000000 +0100 +++ gcc/config/i386/i386.c 2015-12-03 15:03:13.415764986 +0100 @@ -49365,6 +49365,57 @@ expand_vec_perm_pshufb (struct expand_ve return true; } +/* For V*[QHS]Imode permutations, check if the same permutation + can't be performed in a 2x, 4x or 8x wider inner mode. */ + +static bool +canonicalize_vector_int_perm (const struct expand_vec_perm_d *d, + struct expand_vec_perm_d *nd) +{ + int i; + enum machine_mode mode = VOIDmode; + + switch (d->vmode) + { + case V16QImode: mode = V8HImode; break; + case V32QImode: mode = V16HImode; break; + case V64QImode: mode = V32HImode; break; + case V8HImode: mode = V4SImode; break; + case V16HImode: mode = V8SImode; break; + case V32HImode: mode = V16SImode; break; + case V4SImode: mode = V2DImode; break; + case V8SImode: mode = V4DImode; break; + case V16SImode: mode = V8DImode; break; + default: return false; + } + for (i = 0; i < d->nelt; i += 2) + if ((d->perm[i] & 1) || d->perm[i + 1] != d->perm[i] + 1) + return false; + nd->vmode = mode; + nd->nelt = d->nelt / 2; + for (i = 0; i < nd->nelt; i++) + nd->perm[i] = d->perm[2 * i] / 2; + if (GET_MODE_INNER (mode) != DImode) + canonicalize_vector_int_perm (nd, nd); + if (nd != d) + { + nd->one_operand_p = d->one_operand_p; + nd->testing_p = d->testing_p; + if (d->op0 == d->op1) + nd->op0 = nd->op1 = gen_lowpart 
(nd->vmode, d->op0); + else + { + nd->op0 = gen_lowpart (nd->vmode, d->op0); + nd->op1 = gen_lowpart (nd->vmode, d->op1); + } + if (d->testing_p) + nd->target = gen_raw_REG (nd->vmode, LAST_VIRTUAL_REGISTER + 1); + else + nd->target = gen_reg_rtx (nd->vmode); + } + return true; +} + /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to instantiate D in a single instruction. */ @@ -49372,7 +49423,7 @@ static bool expand_vec_perm_1 (struct expand_vec_perm_d *d) { unsigned i, nelt = d->nelt; - unsigned char perm2[MAX_VECT_LEN]; + struct expand_vec_perm_d nd; /* Check plain VEC_SELECT first, because AVX has instructions that could match both SEL and SEL+CONCAT, but the plain SEL will allow a memory @@ -49385,10 +49436,10 @@ expand_vec_perm_1 (struct expand_vec_per for (i = 0; i < nelt; i++) { - perm2[i] = d->perm[i] & mask; - if (perm2[i] != i) + nd.perm[i] = d->perm[i] & mask; + if (nd.perm[i] != i) identity_perm = false; - if (perm2[i]) + if (nd.perm[i]) broadcast_perm = false; } @@ -49457,7 +49508,7 @@ expand_vec_perm_1 (struct expand_vec_per } } - if (expand_vselect (d->target, d->op0, perm2, nelt, d->testing_p)) + if (expand_vselect (d->target, d->op0, nd.perm, nelt, d->testing_p)) return true; /* There are plenty of patterns in sse.md that are written for @@ -49468,10 +49519,10 @@ expand_vec_perm_1 (struct expand_vec_per every other permutation operand. 
*/ for (i = 0; i < nelt; i += 2) { - perm2[i] = d->perm[i] & mask; - perm2[i + 1] = (d->perm[i + 1] & mask) + nelt; + nd.perm[i] = d->perm[i] & mask; + nd.perm[i + 1] = (d->perm[i + 1] & mask) + nelt; } - if (expand_vselect_vconcat (d->target, d->op0, d->op0, perm2, nelt, + if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt, d->testing_p)) return true; @@ -49480,13 +49531,13 @@ expand_vec_perm_1 (struct expand_vec_per { for (i = 0; i < nelt; i += 4) { - perm2[i + 0] = d->perm[i + 0] & mask; - perm2[i + 1] = d->perm[i + 1] & mask; - perm2[i + 2] = (d->perm[i + 2] & mask) + nelt; - perm2[i + 3] = (d->perm[i + 3] & mask) + nelt; + nd.perm[i + 0] = d->perm[i + 0] & mask; + nd.perm[i + 1] = d->perm[i + 1] & mask; + nd.perm[i + 2] = (d->perm[i + 2] & mask) + nelt; + nd.perm[i + 3] = (d->perm[i + 3] & mask) + nelt; } - if (expand_vselect_vconcat (d->target, d->op0, d->op0, perm2, nelt, + if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt, d->testing_p)) return true; } @@ -49507,10 +49558,10 @@ expand_vec_perm_1 (struct expand_vec_per e -= nelt; else e += nelt; - perm2[i] = e; + nd.perm[i] = e; } - if (expand_vselect_vconcat (d->target, d->op1, d->op0, perm2, nelt, + if (expand_vselect_vconcat (d->target, d->op1, d->op0, nd.perm, nelt, d->testing_p)) return true; } @@ -49536,6 +49587,14 @@ expand_vec_perm_1 (struct expand_vec_per if (ix86_expand_vec_perm_vpermi2 (NULL_RTX, NULL_RTX, NULL_RTX, NULL_RTX, d)) return true; + /* See if we can get the same permutation in different vector integer + mode. 
*/ + if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd)) + { + if (!d->testing_p) + emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target)); + return true; + } return false; } @@ -50968,7 +51027,7 @@ expand_vec_perm_even_odd_1 (struct expan struct expand_vec_perm_d d_copy = *d; d_copy.vmode = V4DFmode; if (d->testing_p) - d_copy.target = gen_lowpart (V4DFmode, d->target); + d_copy.target = gen_raw_REG (V4DFmode, LAST_VIRTUAL_REGISTER + 1); else d_copy.target = gen_reg_rtx (V4DFmode); d_copy.op0 = gen_lowpart (V4DFmode, d->op0); @@ -51007,7 +51066,7 @@ expand_vec_perm_even_odd_1 (struct expan struct expand_vec_perm_d d_copy = *d; d_copy.vmode = V8SFmode; if (d->testing_p) - d_copy.target = gen_lowpart (V8SFmode, d->target); + d_copy.target = gen_raw_REG (V8SFmode, LAST_VIRTUAL_REGISTER + 1); else d_copy.target = gen_reg_rtx (V8SFmode); d_copy.op0 = gen_lowpart (V8SFmode, d->op0); @@ -51451,6 +51510,16 @@ ix86_expand_vec_perm_const_1 (struct exp if (expand_vec_perm_vpshufb4_vpermq2 (d)) return true; + /* See if we can get the same permutation in different vector integer + mode. 
*/ + struct expand_vec_perm_d nd; + if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd)) + { + if (!d->testing_p) + emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target)); + return true; + } + return false; } --- gcc/testsuite/gcc.dg/torture/vshuf-4.inc.jj 2014-10-01 22:39:47.000000000 +0200 +++ gcc/testsuite/gcc.dg/torture/vshuf-4.inc 2015-12-03 15:44:29.252181928 +0100 @@ -24,7 +24,8 @@ T (20, 0, 4, 1, 5) \ T (21, 2, 6, 3, 7) \ T (22, 1, 2, 3, 0) \ T (23, 2, 1, 0, 3) \ -T (24, 2, 5, 6, 3) +T (24, 2, 5, 6, 3) \ +T (25, 0, 1, 4, 5) #define EXPTESTS \ T (116, 1, 2, 4, 3) \ T (117, 7, 3, 3, 0) \ --- gcc/testsuite/gcc.dg/torture/vshuf-8.inc.jj 2012-03-20 08:51:25.000000000 +0100 +++ gcc/testsuite/gcc.dg/torture/vshuf-8.inc 2015-12-03 15:45:13.280567206 +0100 @@ -23,7 +23,9 @@ T (19, 7, 6, 5, 4, 3, 2, 1, 0) \ T (20, 0, 8, 1, 9, 2, 10, 3, 11) \ T (21, 4, 12, 5, 13, 6, 14, 7, 15) \ T (22, 1, 2, 3, 4, 5, 6, 7, 0) \ -T (23, 6, 5, 4, 3, 2, 1, 0, 7) +T (23, 6, 5, 4, 3, 2, 1, 0, 7) \ +T (24, 0, 1, 2, 3, 8, 9, 10, 11) \ +T (25, 0, 1, 2, 3, 12, 13, 14, 15) #define EXPTESTS \ T (116, 9, 3, 9, 4, 7, 0, 0, 6) \ T (117, 4, 14, 12, 8, 9, 6, 0, 10) \ Jakub