Hi!

This patch adds two new permutations to the vshuf* tests (VEC_PACK_TRUNC
style) and fixes problems related to them.  The first two i386.c changes fix
issues with AVX2.  The rest of the patch deals with the following problem:
if the shuffle mask suggests two operands, testing whether the permutation
is supported by the hw is done with d.op0 != d.op1, but when we then find
out that both arguments are the same, the expansion is performed with
d.op0 == d.op1 and an adjusted mask.  Some routines handle just equality
and some just non-equality, so it is possible that the check succeeds even
though the expansion with d.op0 == d.op1 using the adjusted mask fails
(which results in ICEs).  In that case this patch retries the expansion
using d.op0 != d.op1.

Bootstrapped/regtested on x86_64-linux and i686-linux, additionally tested
with -mavx2 on sde.

2011-10-19  Jakub Jelinek  <ja...@redhat.com>

        * config/i386/i386.c (expand_vec_perm_vpshufb2_vpermq_even_odd): Use
        d->op1 instead of d->op0 for the second vpshufb.
        (expand_vec_perm_even_odd_1): For V8SImode fix vpshufd immediates.
        (ix86_expand_vec_perm_const): If mask indicates two operands are
        needed, but both are the same and expanding them as d.op0 == d.op1
        failed, retry with d.op0 != d.op1.
        (ix86_expand_vec_perm_builtin): Likewise.  Handle sorry printing
        also for d.nelt == 32.

        * gcc.dg/torture/vshuf-32.inc: Add interleave permutations.
        * gcc.dg/torture/vshuf-16.inc: Likewise.
        * gcc.dg/torture/vshuf-8.inc: Likewise.
        * gcc.dg/torture/vshuf-4.inc: Likewise.

--- gcc/config/i386/i386.c.jj   2011-10-18 23:52:02.000000000 +0200
+++ gcc/config/i386/i386.c      2011-10-19 19:02:57.000000000 +0200
@@ -35992,7 +35992,7 @@ expand_vec_perm_vpshufb2_vpermq_even_odd
   vperm = force_reg (V32QImode, vperm);
 
   h = gen_reg_rtx (V32QImode);
-  op = gen_lowpart (V32QImode, d->op0);
+  op = gen_lowpart (V32QImode, d->op1);
   emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
 
   ior = gen_reg_rtx (V32QImode);
@@ -36154,9 +36154,9 @@ expand_vec_perm_even_odd_1 (struct expan
       /* Swap the 2nd and 3rd position in each lane into
         { 0 2 1 3 8 a 9 b } and { 4 6 5 7 c e d f }.  */
       emit_insn (gen_avx2_pshufdv3 (t1, t1,
-                                   GEN_INT (2 * 2 + 1 * 16 + 3 * 64)));
+                                   GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
       emit_insn (gen_avx2_pshufdv3 (t2, t2,
-                                   GEN_INT (2 * 2 + 1 * 16 + 3 * 64)));
+                                   GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
 
       /* Now an vpunpck[lh]qdq will produce
         { 0 2 4 6 8 a c e } resp. { 1 3 5 7 9 b d f }.  */
@@ -36498,6 +36498,7 @@ ix86_expand_vec_perm_builtin (tree exp)
 {
   struct expand_vec_perm_d d;
   tree arg0, arg1, arg2;
+  bool maybe_retry = false;
 
   arg0 = CALL_EXPR_ARG (exp, 0);
   arg1 = CALL_EXPR_ARG (exp, 1);
@@ -36543,6 +36544,7 @@ ix86_expand_vec_perm_builtin (tree exp)
        for (i = 0; i < nelt; ++i)
          if (d.perm[i] >= nelt)
            d.perm[i] -= nelt;
+       maybe_retry = true;
       }
       /* FALLTHRU */
 
@@ -36563,6 +36565,28 @@ ix86_expand_vec_perm_builtin (tree exp)
   if (ix86_expand_vec_perm_builtin_1 (&d))
     return d.target;
 
+  /* If the mask says both arguments are needed, but they are the same,
+     the above tried to expand with d.op0 == d.op1.  If that didn't work,
+     retry with d.op0 != d.op1 as that is what testing has been done with.  */
+  if (maybe_retry)
+    {
+      rtx seq;
+      bool ok;
+
+      extract_vec_perm_cst (&d, arg2);
+      d.op1 = gen_reg_rtx (d.vmode);
+      start_sequence ();
+      ok = ix86_expand_vec_perm_builtin_1 (&d);
+      seq = get_insns ();
+      end_sequence ();
+      if (ok)
+       {
+         emit_move_insn (d.op1, d.op0);
+         emit_insn (seq);
+         return d.target;
+       }
+    }
+
   /* For compiler generated permutations, we should never got here, because
      the compiler should also be checking the ok hook.  But since this is a
      builtin the user has access too, so don't abort.  */
@@ -36588,6 +36612,19 @@ ix86_expand_vec_perm_builtin (tree exp)
             d.perm[8], d.perm[9], d.perm[10], d.perm[11],
             d.perm[12], d.perm[13], d.perm[14], d.perm[15]);
       break;
+    case 32:
+      sorry ("vector permutation "
+            "(%d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d "
+            "%d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d)",
+            d.perm[0], d.perm[1], d.perm[2], d.perm[3],
+            d.perm[4], d.perm[5], d.perm[6], d.perm[7],
+            d.perm[8], d.perm[9], d.perm[10], d.perm[11],
+            d.perm[12], d.perm[13], d.perm[14], d.perm[15],
+            d.perm[16], d.perm[17], d.perm[18], d.perm[19],
+            d.perm[20], d.perm[21], d.perm[22], d.perm[23],
+            d.perm[24], d.perm[25], d.perm[26], d.perm[27],
+            d.perm[28], d.perm[29], d.perm[30], d.perm[31]);
+      break;
     default:
       gcc_unreachable ();
     }
@@ -36599,6 +36636,7 @@ bool
 ix86_expand_vec_perm_const (rtx operands[4])
 {
   struct expand_vec_perm_d d;
+  unsigned char perm[MAX_VECT_LEN];
   int i, nelt, which;
   rtx sel;
 
@@ -36614,6 +36652,7 @@ ix86_expand_vec_perm_const (rtx operands
 
   gcc_assert (GET_CODE (sel) == CONST_VECTOR);
   gcc_assert (XVECLEN (sel, 0) == nelt);
+  gcc_checking_assert (sizeof (d.perm) == sizeof (perm));
 
   for (i = which = 0; i < nelt; ++i)
     {
@@ -36622,6 +36661,7 @@ ix86_expand_vec_perm_const (rtx operands
 
       which |= (ei < nelt ? 1 : 2);
       d.perm[i] = ei;
+      perm[i] = ei;
     }
 
   switch (which)
@@ -36653,7 +36693,32 @@ ix86_expand_vec_perm_const (rtx operands
       break;
     }
 
-  return ix86_expand_vec_perm_builtin_1 (&d);
+  if (ix86_expand_vec_perm_builtin_1 (&d))
+    return true;
+
+  /* If the mask says both arguments are needed, but they are the same,
+     the above tried to expand with d.op0 == d.op1.  If that didn't work,
+     retry with d.op0 != d.op1 as that is what testing has been done with.  */
+  if (which == 3 && d.op0 == d.op1)
+    {
+      rtx seq;
+      bool ok;
+
+      memcpy (d.perm, perm, sizeof (perm));
+      d.op1 = gen_reg_rtx (d.vmode);
+      start_sequence ();
+      ok = ix86_expand_vec_perm_builtin_1 (&d);
+      seq = get_insns ();
+      end_sequence ();
+      if (ok)
+       {
+         emit_move_insn (d.op1, d.op0);
+         emit_insn (seq);
+         return true;
+       }
+    }
+
+  return false;
 }
 
 /* Implement targetm.vectorize.builtin_vec_perm_ok.  */
--- gcc/testsuite/gcc.dg/torture/vshuf-32.inc.jj        2011-10-17 22:27:41.000000000 +0200
+++ gcc/testsuite/gcc.dg/torture/vshuf-32.inc   2011-10-19 15:16:40.000000000 +0200
@@ -15,7 +15,9 @@ T (11,        13, 40, 7, 33, 51, 21, 59, 46, 47
 T (12, 39, 43, 54, 27, 53, 39, 27, 30, 2, 17, 13, 33, 7, 52, 40, 15, 36, 57, 10, 28, 22, 23, 25, 24, 41, 47, 8, 20, 5, 3, 4, 0) \
 T (13, 7, 51, 13, 61, 25, 4, 19, 58, 35, 33, 29, 15, 40, 2, 39, 16, 38, 3, 54, 63, 15, 6, 48, 21, 14, 52, 17, 50, 34, 55, 57, 50) \
 T (14, 22, 53, 28, 42, 45, 38, 49, 13, 54, 61, 21, 52, 7, 16, 34, 9, 1, 43, 62, 43, 35, 50, 47, 58, 20, 3, 30, 15, 37, 53, 43, 36) \
-T (15, 2, 43, 49, 34, 28, 35, 29, 36, 51, 9, 17, 48, 10, 37, 45, 21, 52, 19, 25, 33, 60, 31, 30, 42, 12, 26, 27, 46, 5, 40, 14, 36)
+T (15, 2, 43, 49, 34, 28, 35, 29, 36, 51, 9, 17, 48, 10, 37, 45, 21, 52, 19, 25, 33, 60, 31, 30, 42, 12, 26, 27, 46, 5, 40, 14, 36) \
+T (16, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62) \
+T (17, 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31, 33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63)
 #define EXPTESTS \
 T (116,        13, 38, 47, 3, 17, 8, 38, 20, 59, 61, 39, 26, 7, 49, 63, 43, 57, 16, 40, 19, 4, 32, 27, 7, 52, 19, 46, 55, 36, 41, 48, 6) \
 T (117,        39, 35, 59, 20, 56, 18, 58, 63, 57, 14, 2, 16, 5, 61, 35, 4, 53, 9, 52, 51, 27, 33, 61, 12, 3, 35, 36, 40, 37, 7, 45, 42) \
--- gcc/testsuite/gcc.dg/torture/vshuf-4.inc.jj 2011-10-17 22:27:41.000000000 +0200
+++ gcc/testsuite/gcc.dg/torture/vshuf-4.inc    2011-10-19 15:17:00.000000000 +0200
@@ -15,7 +15,9 @@ T (11,        1, 4, 0, 7) \
 T (12, 1, 5, 7, 2) \
 T (13, 2, 3, 0, 4) \
 T (14, 7, 6, 4, 2) \
-T (15, 6, 1, 3, 4)
+T (15, 6, 1, 3, 4) \
+T (16, 0, 2, 4, 6) \
+T (17, 1, 3, 5, 7)
 #define EXPTESTS \
 T (116,        1, 2, 4, 3) \
 T (117,        7, 3, 3, 0) \
--- gcc/testsuite/gcc.dg/torture/vshuf-8.inc.jj 2011-10-17 22:27:41.000000000 +0200
+++ gcc/testsuite/gcc.dg/torture/vshuf-8.inc    2011-10-19 15:17:31.000000000 +0200
@@ -15,7 +15,9 @@ T (11,        5, 11, 12, 6, 3, 2, 4, 15) \
 T (12, 5, 13, 14, 8, 4, 10, 4, 12) \
 T (13, 14, 8, 12, 3, 13, 9, 5, 4) \
 T (14, 15, 3, 13, 6, 14, 12, 10, 0) \
-T (15, 0, 5, 11, 7, 4, 6, 14, 1)
+T (15, 0, 5, 11, 7, 4, 6, 14, 1) \
+T (16, 0, 2, 4, 6, 8, 10, 12, 14) \
+T (17, 1, 3, 5, 7, 9, 11, 13, 15)
 #define EXPTESTS \
 T (116,        9, 3, 9, 4, 7, 0, 0, 6) \
 T (117,        4, 14, 12, 8, 9, 6, 0, 10) \
--- gcc/testsuite/gcc.dg/torture/vshuf-16.inc.jj        2011-10-17 22:27:41.000000000 +0200
+++ gcc/testsuite/gcc.dg/torture/vshuf-16.inc   2011-10-19 15:15:42.000000000 +0200
@@ -15,7 +15,9 @@ T (11,        15, 9, 14, 10, 8, 12, 13, 11, 7, 
 T (12, 2, 5, 24, 23, 17, 22, 20, 21, 12, 14, 13, 8, 6, 20, 10, 18) \
 T (13, 23, 11, 15, 9, 0, 14, 8, 12, 10, 13, 19, 11, 2, 26, 24, 30) \
 T (14, 25, 5, 17, 1, 9, 15, 21, 7, 28, 2, 18, 13, 30, 14, 10, 4) \
-T (15, 1, 30, 27, 31, 9, 18, 25, 12, 7, 4, 2, 16, 25, 20, 10, 3)
+T (15, 1, 30, 27, 31, 9, 18, 25, 12, 7, 4, 2, 16, 25, 20, 10, 3) \
+T (16, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30) \
+T (17, 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31)
 #define EXPTESTS \
 T (116,        28, 13, 27, 11, 21, 1, 5, 22, 29, 14, 15, 6, 3, 10, 16, 30) \
 T (117,        22, 26, 1, 13, 29, 3, 18, 18, 11, 21, 12, 28, 19, 5, 7, 4) \

        Jakub

Reply via email to