On Thu, Dec 3, 2015 at 9:52 PM, Jakub Jelinek <ja...@redhat.com> wrote: > Hi! > > As discussed in the PR, for some permutations we can get better code > if we try to expand it as if it was a permutation in a mode with the > same vector size, but wider vector elements. Earlier attempts to do this > always had mixed results, lots of improvements, lots of pessimizations; > this one at least on gcc.dg/vshuf* > {-msse2,-msse4,-mavx,-mavx2,-mavx512f,-mavx512bw} shows only > improvements - it tries the original permutation for single insn, > if that doesn't work tries the wider one single insn, and then > as complete fallback, if we don't have any expansion whatsoever, tries > the wider one too. > > Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk? > > 2015-12-03 Jakub Jelinek <ja...@redhat.com> > > PR target/68655 > * config/i386/i386.c (canonicalize_vector_int_perm): New function. > (expand_vec_perm_1): Use it and recurse if everything else > failed. Use nd.perm instead of perm2. > (expand_vec_perm_even_odd_1): If testing_p, use gen_raw_REG > instead of gen_lowpart for the target. > (ix86_expand_vec_perm_const_1): Use canonicalize_vector_int_perm > and recurse if everything else failed. > > * gcc.dg/torture/vshuf-4.inc (TESTS): Add one extra test. > * gcc.dg/torture/vshuf-8.inc (TESTS): Add two extra tests.
OK for mainline. Thanks, Uros. > --- gcc/config/i386/i386.c.jj 2015-12-02 20:27:00.000000000 +0100 > +++ gcc/config/i386/i386.c 2015-12-03 15:03:13.415764986 +0100 > @@ -49365,6 +49365,57 @@ expand_vec_perm_pshufb (struct expand_ve > return true; > } > > +/* For V*[QHS]Imode permutations, check if the same permutation > + can't be performed in a 2x, 4x or 8x wider inner mode. */ > + > +static bool > +canonicalize_vector_int_perm (const struct expand_vec_perm_d *d, > + struct expand_vec_perm_d *nd) > +{ > + int i; > + enum machine_mode mode = VOIDmode; > + > + switch (d->vmode) > + { > + case V16QImode: mode = V8HImode; break; > + case V32QImode: mode = V16HImode; break; > + case V64QImode: mode = V32HImode; break; > + case V8HImode: mode = V4SImode; break; > + case V16HImode: mode = V8SImode; break; > + case V32HImode: mode = V16SImode; break; > + case V4SImode: mode = V2DImode; break; > + case V8SImode: mode = V4DImode; break; > + case V16SImode: mode = V8DImode; break; > + default: return false; > + } > + for (i = 0; i < d->nelt; i += 2) > + if ((d->perm[i] & 1) || d->perm[i + 1] != d->perm[i] + 1) > + return false; > + nd->vmode = mode; > + nd->nelt = d->nelt / 2; > + for (i = 0; i < nd->nelt; i++) > + nd->perm[i] = d->perm[2 * i] / 2; > + if (GET_MODE_INNER (mode) != DImode) > + canonicalize_vector_int_perm (nd, nd); > + if (nd != d) > + { > + nd->one_operand_p = d->one_operand_p; > + nd->testing_p = d->testing_p; > + if (d->op0 == d->op1) > + nd->op0 = nd->op1 = gen_lowpart (nd->vmode, d->op0); > + else > + { > + nd->op0 = gen_lowpart (nd->vmode, d->op0); > + nd->op1 = gen_lowpart (nd->vmode, d->op1); > + } > + if (d->testing_p) > + nd->target = gen_raw_REG (nd->vmode, LAST_VIRTUAL_REGISTER + 1); > + else > + nd->target = gen_reg_rtx (nd->vmode); > + } > + return true; > +} > + > /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to instantiate D > in a single instruction. 
*/ > > @@ -49372,7 +49423,7 @@ static bool > expand_vec_perm_1 (struct expand_vec_perm_d *d) > { > unsigned i, nelt = d->nelt; > - unsigned char perm2[MAX_VECT_LEN]; > + struct expand_vec_perm_d nd; > > /* Check plain VEC_SELECT first, because AVX has instructions that could > match both SEL and SEL+CONCAT, but the plain SEL will allow a memory > @@ -49385,10 +49436,10 @@ expand_vec_perm_1 (struct expand_vec_per > > for (i = 0; i < nelt; i++) > { > - perm2[i] = d->perm[i] & mask; > - if (perm2[i] != i) > + nd.perm[i] = d->perm[i] & mask; > + if (nd.perm[i] != i) > identity_perm = false; > - if (perm2[i]) > + if (nd.perm[i]) > broadcast_perm = false; > } > > @@ -49457,7 +49508,7 @@ expand_vec_perm_1 (struct expand_vec_per > } > } > > - if (expand_vselect (d->target, d->op0, perm2, nelt, d->testing_p)) > + if (expand_vselect (d->target, d->op0, nd.perm, nelt, d->testing_p)) > return true; > > /* There are plenty of patterns in sse.md that are written for > @@ -49468,10 +49519,10 @@ expand_vec_perm_1 (struct expand_vec_per > every other permutation operand. 
*/ > for (i = 0; i < nelt; i += 2) > { > - perm2[i] = d->perm[i] & mask; > - perm2[i + 1] = (d->perm[i + 1] & mask) + nelt; > + nd.perm[i] = d->perm[i] & mask; > + nd.perm[i + 1] = (d->perm[i + 1] & mask) + nelt; > } > - if (expand_vselect_vconcat (d->target, d->op0, d->op0, perm2, nelt, > + if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt, > d->testing_p)) > return true; > > @@ -49480,13 +49531,13 @@ expand_vec_perm_1 (struct expand_vec_per > { > for (i = 0; i < nelt; i += 4) > { > - perm2[i + 0] = d->perm[i + 0] & mask; > - perm2[i + 1] = d->perm[i + 1] & mask; > - perm2[i + 2] = (d->perm[i + 2] & mask) + nelt; > - perm2[i + 3] = (d->perm[i + 3] & mask) + nelt; > + nd.perm[i + 0] = d->perm[i + 0] & mask; > + nd.perm[i + 1] = d->perm[i + 1] & mask; > + nd.perm[i + 2] = (d->perm[i + 2] & mask) + nelt; > + nd.perm[i + 3] = (d->perm[i + 3] & mask) + nelt; > } > > - if (expand_vselect_vconcat (d->target, d->op0, d->op0, perm2, nelt, > + if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, > nelt, > d->testing_p)) > return true; > } > @@ -49507,10 +49558,10 @@ expand_vec_perm_1 (struct expand_vec_per > e -= nelt; > else > e += nelt; > - perm2[i] = e; > + nd.perm[i] = e; > } > > - if (expand_vselect_vconcat (d->target, d->op1, d->op0, perm2, nelt, > + if (expand_vselect_vconcat (d->target, d->op1, d->op0, nd.perm, nelt, > d->testing_p)) > return true; > } > @@ -49536,6 +49587,14 @@ expand_vec_perm_1 (struct expand_vec_per > if (ix86_expand_vec_perm_vpermi2 (NULL_RTX, NULL_RTX, NULL_RTX, NULL_RTX, > d)) > return true; > > + /* See if we can get the same permutation in different vector integer > + mode. 
*/ > + if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd)) > + { > + if (!d->testing_p) > + emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target)); > + return true; > + } > return false; > } > > @@ -50968,7 +51027,7 @@ expand_vec_perm_even_odd_1 (struct expan > struct expand_vec_perm_d d_copy = *d; > d_copy.vmode = V4DFmode; > if (d->testing_p) > - d_copy.target = gen_lowpart (V4DFmode, d->target); > + d_copy.target = gen_raw_REG (V4DFmode, LAST_VIRTUAL_REGISTER + 1); > else > d_copy.target = gen_reg_rtx (V4DFmode); > d_copy.op0 = gen_lowpart (V4DFmode, d->op0); > @@ -51007,7 +51066,7 @@ expand_vec_perm_even_odd_1 (struct expan > struct expand_vec_perm_d d_copy = *d; > d_copy.vmode = V8SFmode; > if (d->testing_p) > - d_copy.target = gen_lowpart (V8SFmode, d->target); > + d_copy.target = gen_raw_REG (V8SFmode, LAST_VIRTUAL_REGISTER + 1); > else > d_copy.target = gen_reg_rtx (V8SFmode); > d_copy.op0 = gen_lowpart (V8SFmode, d->op0); > @@ -51451,6 +51510,16 @@ ix86_expand_vec_perm_const_1 (struct exp > if (expand_vec_perm_vpshufb4_vpermq2 (d)) > return true; > > + /* See if we can get the same permutation in different vector integer > + mode. 
*/ > + struct expand_vec_perm_d nd; > + if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd)) > + { > + if (!d->testing_p) > + emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target)); > + return true; > + } > + > return false; > } > > --- gcc/testsuite/gcc.dg/torture/vshuf-4.inc.jj 2014-10-01 22:39:47.000000000 > +0200 > +++ gcc/testsuite/gcc.dg/torture/vshuf-4.inc 2015-12-03 15:44:29.252181928 > +0100 > @@ -24,7 +24,8 @@ T (20, 0, 4, 1, 5) \ > T (21, 2, 6, 3, 7) \ > T (22, 1, 2, 3, 0) \ > T (23, 2, 1, 0, 3) \ > -T (24, 2, 5, 6, 3) > +T (24, 2, 5, 6, 3) \ > +T (25, 0, 1, 4, 5) > #define EXPTESTS \ > T (116, 1, 2, 4, 3) \ > T (117, 7, 3, 3, 0) \ > --- gcc/testsuite/gcc.dg/torture/vshuf-8.inc.jj 2012-03-20 08:51:25.000000000 > +0100 > +++ gcc/testsuite/gcc.dg/torture/vshuf-8.inc 2015-12-03 15:45:13.280567206 > +0100 > @@ -23,7 +23,9 @@ T (19, 7, 6, 5, 4, 3, 2, 1, 0) \ > T (20, 0, 8, 1, 9, 2, 10, 3, 11) \ > T (21, 4, 12, 5, 13, 6, 14, 7, 15) \ > T (22, 1, 2, 3, 4, 5, 6, 7, 0) \ > -T (23, 6, 5, 4, 3, 2, 1, 0, 7) > +T (23, 6, 5, 4, 3, 2, 1, 0, 7) \ > +T (24, 0, 1, 2, 3, 8, 9, 10, 11) \ > +T (25, 0, 1, 2, 3, 12, 13, 14, 15) > #define EXPTESTS \ > T (116, 9, 3, 9, 4, 7, 0, 0, 6) \ > T (117, 4, 14, 12, 8, 9, 6, 0, 10) \ > > Jakub