Hi!

As discussed in the PR, for some permutations we can get better code
if we try to expand them as if they were permutations in a mode with the
same vector size, but a wider vector element.  The first attempt to do this
always had mixed results, lots of improvements, lots of pessimizations;
this one, at least on gcc.dg/vshuf*
{-msse2,-msse4,-mavx,-mavx2,-mavx512f,-mavx512bw}, shows only
improvements - it tries the original permutation as a single insn,
if that doesn't work it tries the wider one as a single insn, and then
as a complete fallback, if we don't have any expansion whatsoever, tries
the wider one too.

Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?

2015-12-03  Jakub Jelinek  <ja...@redhat.com>

        PR target/68655
        * config/i386/i386.c (canonicalize_vector_int_perm): New function.
        (expand_vec_perm_1): Use it and recurse if everything else
        failed.  Use nd.perm instead of perm2.
        (expand_vec_perm_even_odd_1): If testing_p, use gen_raw_REG
        instead of gen_lowpart for the target.
        (ix86_expand_vec_perm_const_1): Use canonicalize_vector_int_perm
        and recurse if everything else failed.

        * gcc.dg/torture/vshuf-4.inc (TESTS): Add one extra test.
        * gcc.dg/torture/vshuf-8.inc (TESTS): Add two extra tests.

--- gcc/config/i386/i386.c.jj   2015-12-02 20:27:00.000000000 +0100
+++ gcc/config/i386/i386.c      2015-12-03 15:03:13.415764986 +0100
@@ -49365,6 +49365,57 @@ expand_vec_perm_pshufb (struct expand_ve
   return true;
 }
 
+/* For V*[QHS]Imode permutations, check if the same permutation
+   can't be performed in a 2x, 4x or 8x wider inner mode.  */
+
+static bool
+canonicalize_vector_int_perm (const struct expand_vec_perm_d *d,
+                             struct expand_vec_perm_d *nd)
+{
+  int i;
+  enum machine_mode mode = VOIDmode;
+
+  switch (d->vmode)
+    {
+    case V16QImode: mode = V8HImode; break;
+    case V32QImode: mode = V16HImode; break;
+    case V64QImode: mode = V32HImode; break;
+    case V8HImode: mode = V4SImode; break;
+    case V16HImode: mode = V8SImode; break;
+    case V32HImode: mode = V16SImode; break;
+    case V4SImode: mode = V2DImode; break;
+    case V8SImode: mode = V4DImode; break;
+    case V16SImode: mode = V8DImode; break;
+    default: return false;
+    }
+  for (i = 0; i < d->nelt; i += 2)
+    if ((d->perm[i] & 1) || d->perm[i + 1] != d->perm[i] + 1)
+      return false;
+  nd->vmode = mode;
+  nd->nelt = d->nelt / 2;
+  for (i = 0; i < nd->nelt; i++)
+    nd->perm[i] = d->perm[2 * i] / 2;
+  if (GET_MODE_INNER (mode) != DImode)
+    canonicalize_vector_int_perm (nd, nd);
+  if (nd != d)
+    {
+      nd->one_operand_p = d->one_operand_p;
+      nd->testing_p = d->testing_p;
+      if (d->op0 == d->op1)
+       nd->op0 = nd->op1 = gen_lowpart (nd->vmode, d->op0);
+      else
+       {
+         nd->op0 = gen_lowpart (nd->vmode, d->op0);
+         nd->op1 = gen_lowpart (nd->vmode, d->op1);
+       }
+      if (d->testing_p)
+       nd->target = gen_raw_REG (nd->vmode, LAST_VIRTUAL_REGISTER + 1);
+      else
+       nd->target = gen_reg_rtx (nd->vmode);
+    }
+  return true;
+}
+
 /* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to instantiate D
    in a single instruction.  */
 
@@ -49372,7 +49423,7 @@ static bool
 expand_vec_perm_1 (struct expand_vec_perm_d *d)
 {
   unsigned i, nelt = d->nelt;
-  unsigned char perm2[MAX_VECT_LEN];
+  struct expand_vec_perm_d nd;
 
   /* Check plain VEC_SELECT first, because AVX has instructions that could
      match both SEL and SEL+CONCAT, but the plain SEL will allow a memory
@@ -49385,10 +49436,10 @@ expand_vec_perm_1 (struct expand_vec_per
 
       for (i = 0; i < nelt; i++)
        {
-         perm2[i] = d->perm[i] & mask;
-         if (perm2[i] != i)
+         nd.perm[i] = d->perm[i] & mask;
+         if (nd.perm[i] != i)
            identity_perm = false;
-         if (perm2[i])
+         if (nd.perm[i])
            broadcast_perm = false;
        }
 
@@ -49457,7 +49508,7 @@ expand_vec_perm_1 (struct expand_vec_per
            }
        }
 
-      if (expand_vselect (d->target, d->op0, perm2, nelt, d->testing_p))
+      if (expand_vselect (d->target, d->op0, nd.perm, nelt, d->testing_p))
        return true;
 
       /* There are plenty of patterns in sse.md that are written for
@@ -49468,10 +49519,10 @@ expand_vec_perm_1 (struct expand_vec_per
         every other permutation operand.  */
       for (i = 0; i < nelt; i += 2)
        {
-         perm2[i] = d->perm[i] & mask;
-         perm2[i + 1] = (d->perm[i + 1] & mask) + nelt;
+         nd.perm[i] = d->perm[i] & mask;
+         nd.perm[i + 1] = (d->perm[i + 1] & mask) + nelt;
        }
-      if (expand_vselect_vconcat (d->target, d->op0, d->op0, perm2, nelt,
+      if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt,
                                  d->testing_p))
        return true;
 
@@ -49480,13 +49531,13 @@ expand_vec_perm_1 (struct expand_vec_per
        {
          for (i = 0; i < nelt; i += 4)
            {
-             perm2[i + 0] = d->perm[i + 0] & mask;
-             perm2[i + 1] = d->perm[i + 1] & mask;
-             perm2[i + 2] = (d->perm[i + 2] & mask) + nelt;
-             perm2[i + 3] = (d->perm[i + 3] & mask) + nelt;
+             nd.perm[i + 0] = d->perm[i + 0] & mask;
+             nd.perm[i + 1] = d->perm[i + 1] & mask;
+             nd.perm[i + 2] = (d->perm[i + 2] & mask) + nelt;
+             nd.perm[i + 3] = (d->perm[i + 3] & mask) + nelt;
            }
 
-         if (expand_vselect_vconcat (d->target, d->op0, d->op0, perm2, nelt,
+         if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt,
                                      d->testing_p))
            return true;
        }
@@ -49507,10 +49558,10 @@ expand_vec_perm_1 (struct expand_vec_per
            e -= nelt;
          else
            e += nelt;
-         perm2[i] = e;
+         nd.perm[i] = e;
        }
 
-      if (expand_vselect_vconcat (d->target, d->op1, d->op0, perm2, nelt,
+      if (expand_vselect_vconcat (d->target, d->op1, d->op0, nd.perm, nelt,
                                  d->testing_p))
        return true;
     }
@@ -49536,6 +49587,14 @@ expand_vec_perm_1 (struct expand_vec_per
   if (ix86_expand_vec_perm_vpermi2 (NULL_RTX, NULL_RTX, NULL_RTX, NULL_RTX, d))
     return true;
 
+  /* See if we can get the same permutation in different vector integer
+     mode.  */
+  if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
+    {
+      if (!d->testing_p)
+       emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
+      return true;
+    }
   return false;
 }
 
@@ -50968,7 +51027,7 @@ expand_vec_perm_even_odd_1 (struct expan
          struct expand_vec_perm_d d_copy = *d;
          d_copy.vmode = V4DFmode;
          if (d->testing_p)
-           d_copy.target = gen_lowpart (V4DFmode, d->target);
+           d_copy.target = gen_raw_REG (V4DFmode, LAST_VIRTUAL_REGISTER + 1);
          else
            d_copy.target = gen_reg_rtx (V4DFmode);
          d_copy.op0 = gen_lowpart (V4DFmode, d->op0);
@@ -51007,7 +51066,7 @@ expand_vec_perm_even_odd_1 (struct expan
          struct expand_vec_perm_d d_copy = *d;
          d_copy.vmode = V8SFmode;
          if (d->testing_p)
-           d_copy.target = gen_lowpart (V8SFmode, d->target);
+           d_copy.target = gen_raw_REG (V8SFmode, LAST_VIRTUAL_REGISTER + 1);
          else
            d_copy.target = gen_reg_rtx (V8SFmode);
          d_copy.op0 = gen_lowpart (V8SFmode, d->op0);
@@ -51451,6 +51510,16 @@ ix86_expand_vec_perm_const_1 (struct exp
   if (expand_vec_perm_vpshufb4_vpermq2 (d))
     return true;
 
+  /* See if we can get the same permutation in different vector integer
+     mode.  */
+  struct expand_vec_perm_d nd;
+  if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
+    {
+      if (!d->testing_p)
+       emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
+      return true;
+    }
+
   return false;
 }
 
--- gcc/testsuite/gcc.dg/torture/vshuf-4.inc.jj 2014-10-01 22:39:47.000000000 +0200
+++ gcc/testsuite/gcc.dg/torture/vshuf-4.inc    2015-12-03 15:44:29.252181928 +0100
@@ -24,7 +24,8 @@ T (20,        0, 4, 1, 5) \
 T (21, 2, 6, 3, 7) \
 T (22, 1, 2, 3, 0) \
 T (23, 2, 1, 0, 3) \
-T (24, 2, 5, 6, 3)
+T (24, 2, 5, 6, 3) \
+T (25, 0, 1, 4, 5)
 #define EXPTESTS \
 T (116,        1, 2, 4, 3) \
 T (117,        7, 3, 3, 0) \
--- gcc/testsuite/gcc.dg/torture/vshuf-8.inc.jj 2012-03-20 08:51:25.000000000 +0100
+++ gcc/testsuite/gcc.dg/torture/vshuf-8.inc    2015-12-03 15:45:13.280567206 +0100
@@ -23,7 +23,9 @@ T (19,        7, 6, 5, 4, 3, 2, 1, 0) \
 T (20, 0, 8, 1, 9, 2, 10, 3, 11) \
 T (21, 4, 12, 5, 13, 6, 14, 7, 15) \
 T (22, 1, 2, 3, 4, 5, 6, 7, 0) \
-T (23, 6, 5, 4, 3, 2, 1, 0, 7)
+T (23, 6, 5, 4, 3, 2, 1, 0, 7) \
+T (24, 0, 1, 2, 3, 8, 9, 10, 11) \
+T (25, 0, 1, 2, 3, 12, 13, 14, 15)
 #define EXPTESTS \
 T (116,        9, 3, 9, 4, 7, 0, 0, 6) \
 T (117,        4, 14, 12, 8, 9, 6, 0, 10) \

        Jakub

Reply via email to