On Thu, Dec 3, 2015 at 9:52 PM, Jakub Jelinek <ja...@redhat.com> wrote:
> Hi!
>
> As discussed in the PR, for some permutations we can get better code
> if we try to expand them as if they were permutations in a mode with the
> same vector size, but a wider vector element.  The first attempt to do this
> always had mixed results: lots of improvements, lots of pessimizations.
> This one, at least on gcc.dg/vshuf*
> {-msse2,-msse4,-mavx,-mavx2,-mavx512f,-mavx512bw}, shows only
> improvements - it tries the original permutation as a single insn,
> if that doesn't work it tries the wider one as a single insn, and then
> as a complete fallback, if we don't have any expansion whatsoever, it tries
> the wider one too.
>
> Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?
>
> 2015-12-03  Jakub Jelinek  <ja...@redhat.com>
>
>         PR target/68655
>         * config/i386/i386.c (canonicalize_vector_int_perm): New function.
>         (expand_vec_perm_1): Use it and recurse if everything else
>         failed.  Use nd.perm instead of perm2.
>         (expand_vec_perm_even_odd_1): If testing_p, use gen_raw_REG
>         instead of gen_lowpart for the target.
>         (ix86_expand_vec_perm_const_1): Use canonicalize_vector_int_perm
>         and recurse if everything else failed.
>
>         * gcc.dg/torture/vshuf-4.inc (TESTS): Add one extra test.
>         * gcc.dg/torture/vshuf-8.inc (TESTS): Add two extra tests.

OK for mainline.

Thanks,
Uros.

> --- gcc/config/i386/i386.c.jj   2015-12-02 20:27:00.000000000 +0100
> +++ gcc/config/i386/i386.c      2015-12-03 15:03:13.415764986 +0100
> @@ -49365,6 +49365,57 @@ expand_vec_perm_pshufb (struct expand_ve
>    return true;
>  }
>
> +/* For V*[QHS]Imode permutations, check whether the same permutation
> +   can be performed in a 2x, 4x or 8x wider inner mode.  */
> +
> +static bool
> +canonicalize_vector_int_perm (const struct expand_vec_perm_d *d,
> +                             struct expand_vec_perm_d *nd)
> +{
> +  int i;
> +  enum machine_mode mode = VOIDmode;
> +
> +  switch (d->vmode)
> +    {
> +    case V16QImode: mode = V8HImode; break;
> +    case V32QImode: mode = V16HImode; break;
> +    case V64QImode: mode = V32HImode; break;
> +    case V8HImode: mode = V4SImode; break;
> +    case V16HImode: mode = V8SImode; break;
> +    case V32HImode: mode = V16SImode; break;
> +    case V4SImode: mode = V2DImode; break;
> +    case V8SImode: mode = V4DImode; break;
> +    case V16SImode: mode = V8DImode; break;
> +    default: return false;
> +    }
> +  for (i = 0; i < d->nelt; i += 2)
> +    if ((d->perm[i] & 1) || d->perm[i + 1] != d->perm[i] + 1)
> +      return false;
> +  nd->vmode = mode;
> +  nd->nelt = d->nelt / 2;
> +  for (i = 0; i < nd->nelt; i++)
> +    nd->perm[i] = d->perm[2 * i] / 2;
> +  if (GET_MODE_INNER (mode) != DImode)
> +    canonicalize_vector_int_perm (nd, nd);
> +  if (nd != d)
> +    {
> +      nd->one_operand_p = d->one_operand_p;
> +      nd->testing_p = d->testing_p;
> +      if (d->op0 == d->op1)
> +       nd->op0 = nd->op1 = gen_lowpart (nd->vmode, d->op0);
> +      else
> +       {
> +         nd->op0 = gen_lowpart (nd->vmode, d->op0);
> +         nd->op1 = gen_lowpart (nd->vmode, d->op1);
> +       }
> +      if (d->testing_p)
> +       nd->target = gen_raw_REG (nd->vmode, LAST_VIRTUAL_REGISTER + 1);
> +      else
> +       nd->target = gen_reg_rtx (nd->vmode);
> +    }
> +  return true;
> +}
> +
>  /* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to instantiate D
>     in a single instruction.  */
>
> @@ -49372,7 +49423,7 @@ static bool
>  expand_vec_perm_1 (struct expand_vec_perm_d *d)
>  {
>    unsigned i, nelt = d->nelt;
> -  unsigned char perm2[MAX_VECT_LEN];
> +  struct expand_vec_perm_d nd;
>
>    /* Check plain VEC_SELECT first, because AVX has instructions that could
>       match both SEL and SEL+CONCAT, but the plain SEL will allow a memory
> @@ -49385,10 +49436,10 @@ expand_vec_perm_1 (struct expand_vec_per
>
>        for (i = 0; i < nelt; i++)
>         {
> -         perm2[i] = d->perm[i] & mask;
> -         if (perm2[i] != i)
> +         nd.perm[i] = d->perm[i] & mask;
> +         if (nd.perm[i] != i)
>             identity_perm = false;
> -         if (perm2[i])
> +         if (nd.perm[i])
>             broadcast_perm = false;
>         }
>
> @@ -49457,7 +49508,7 @@ expand_vec_perm_1 (struct expand_vec_per
>             }
>         }
>
> -      if (expand_vselect (d->target, d->op0, perm2, nelt, d->testing_p))
> +      if (expand_vselect (d->target, d->op0, nd.perm, nelt, d->testing_p))
>         return true;
>
>        /* There are plenty of patterns in sse.md that are written for
> @@ -49468,10 +49519,10 @@ expand_vec_perm_1 (struct expand_vec_per
>          every other permutation operand.  */
>        for (i = 0; i < nelt; i += 2)
>         {
> -         perm2[i] = d->perm[i] & mask;
> -         perm2[i + 1] = (d->perm[i + 1] & mask) + nelt;
> +         nd.perm[i] = d->perm[i] & mask;
> +         nd.perm[i + 1] = (d->perm[i + 1] & mask) + nelt;
>         }
> -      if (expand_vselect_vconcat (d->target, d->op0, d->op0, perm2, nelt,
> +      if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt,
>                                   d->testing_p))
>         return true;
>
> @@ -49480,13 +49531,13 @@ expand_vec_perm_1 (struct expand_vec_per
>         {
>           for (i = 0; i < nelt; i += 4)
>             {
> -             perm2[i + 0] = d->perm[i + 0] & mask;
> -             perm2[i + 1] = d->perm[i + 1] & mask;
> -             perm2[i + 2] = (d->perm[i + 2] & mask) + nelt;
> -             perm2[i + 3] = (d->perm[i + 3] & mask) + nelt;
> +             nd.perm[i + 0] = d->perm[i + 0] & mask;
> +             nd.perm[i + 1] = d->perm[i + 1] & mask;
> +             nd.perm[i + 2] = (d->perm[i + 2] & mask) + nelt;
> +             nd.perm[i + 3] = (d->perm[i + 3] & mask) + nelt;
>             }
>
> -         if (expand_vselect_vconcat (d->target, d->op0, d->op0, perm2, nelt,
> +         if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt,
>                                       d->testing_p))
>             return true;
>         }
> @@ -49507,10 +49558,10 @@ expand_vec_perm_1 (struct expand_vec_per
>             e -= nelt;
>           else
>             e += nelt;
> -         perm2[i] = e;
> +         nd.perm[i] = e;
>         }
>
> -      if (expand_vselect_vconcat (d->target, d->op1, d->op0, perm2, nelt,
> +      if (expand_vselect_vconcat (d->target, d->op1, d->op0, nd.perm, nelt,
>                                   d->testing_p))
>         return true;
>      }
> @@ -49536,6 +49587,14 @@ expand_vec_perm_1 (struct expand_vec_per
>    if (ix86_expand_vec_perm_vpermi2 (NULL_RTX, NULL_RTX, NULL_RTX, NULL_RTX, d))
>      return true;
>
> +  /* See if we can get the same permutation in different vector integer
> +     mode.  */
> +  if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
> +    {
> +      if (!d->testing_p)
> +       emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
> +      return true;
> +    }
>    return false;
>  }
>
> @@ -50968,7 +51027,7 @@ expand_vec_perm_even_odd_1 (struct expan
>           struct expand_vec_perm_d d_copy = *d;
>           d_copy.vmode = V4DFmode;
>           if (d->testing_p)
> -           d_copy.target = gen_lowpart (V4DFmode, d->target);
> +           d_copy.target = gen_raw_REG (V4DFmode, LAST_VIRTUAL_REGISTER + 1);
>           else
>             d_copy.target = gen_reg_rtx (V4DFmode);
>           d_copy.op0 = gen_lowpart (V4DFmode, d->op0);
> @@ -51007,7 +51066,7 @@ expand_vec_perm_even_odd_1 (struct expan
>           struct expand_vec_perm_d d_copy = *d;
>           d_copy.vmode = V8SFmode;
>           if (d->testing_p)
> -           d_copy.target = gen_lowpart (V8SFmode, d->target);
> +           d_copy.target = gen_raw_REG (V8SFmode, LAST_VIRTUAL_REGISTER + 1);
>           else
>             d_copy.target = gen_reg_rtx (V8SFmode);
>           d_copy.op0 = gen_lowpart (V8SFmode, d->op0);
> @@ -51451,6 +51510,16 @@ ix86_expand_vec_perm_const_1 (struct exp
>    if (expand_vec_perm_vpshufb4_vpermq2 (d))
>      return true;
>
> +  /* See if we can get the same permutation in different vector integer
> +     mode.  */
> +  struct expand_vec_perm_d nd;
> +  if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
> +    {
> +      if (!d->testing_p)
> +       emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
> +      return true;
> +    }
> +
>    return false;
>  }
>
> --- gcc/testsuite/gcc.dg/torture/vshuf-4.inc.jj 2014-10-01 22:39:47.000000000 +0200
> +++ gcc/testsuite/gcc.dg/torture/vshuf-4.inc    2015-12-03 15:44:29.252181928 +0100
> @@ -24,7 +24,8 @@ T (20,        0, 4, 1, 5) \
>  T (21, 2, 6, 3, 7) \
>  T (22, 1, 2, 3, 0) \
>  T (23, 2, 1, 0, 3) \
> -T (24, 2, 5, 6, 3)
> +T (24, 2, 5, 6, 3) \
> +T (25, 0, 1, 4, 5)
>  #define EXPTESTS \
>  T (116,        1, 2, 4, 3) \
>  T (117,        7, 3, 3, 0) \
> --- gcc/testsuite/gcc.dg/torture/vshuf-8.inc.jj 2012-03-20 08:51:25.000000000 +0100
> +++ gcc/testsuite/gcc.dg/torture/vshuf-8.inc    2015-12-03 15:45:13.280567206 +0100
> @@ -23,7 +23,9 @@ T (19,        7, 6, 5, 4, 3, 2, 1, 0) \
>  T (20, 0, 8, 1, 9, 2, 10, 3, 11) \
>  T (21, 4, 12, 5, 13, 6, 14, 7, 15) \
>  T (22, 1, 2, 3, 4, 5, 6, 7, 0) \
> -T (23, 6, 5, 4, 3, 2, 1, 0, 7)
> +T (23, 6, 5, 4, 3, 2, 1, 0, 7) \
> +T (24, 0, 1, 2, 3, 8, 9, 10, 11) \
> +T (25, 0, 1, 2, 3, 12, 13, 14, 15)
>  #define EXPTESTS \
>  T (116,        9, 3, 9, 4, 7, 0, 0, 6) \
>  T (117,        4, 14, 12, 8, 9, 6, 0, 10) \
>
>         Jakub

Reply via email to