Hi! This patch adds two new permutations to the vshuf* tests (VEC_PACK_TRUNC style) and fixes problems related to them. The first two fix issues with AVX2, the rest of the patch deals with the problem that if the shuffle mask suggests two operands, testing whether there is a permutation supported by the hw is done with d.op0 != d.op1, but then when we find out both arguments are the same, it is performed for d.op0 == d.op1. Some routines handle just equality and some non-equality, so it is possible the check might succeed even when d.op0 == d.op1 using the adjusted mask won't (which results in ICEs). This patch in that case retries using d.op0 != d.op1.
Bootstrapped/regtested on x86_64-linux and i686-linux, additionally tested with -mavx2 on sde. 2011-10-19 Jakub Jelinek <ja...@redhat.com> * config/i386/i386.c (expand_vec_perm_vpshufb2_vpermq_even_odd): Use d->op1 instead of d->op0 for the second vpshufb. (expand_vec_perm_even_odd_1): For V8SImode fix vpshufd immediates. (ix86_expand_vec_perm_const): If mask indicates two operands are needed, but both are the same and expanding them as d.op0 == d.op1 failed, retry with d.op0 != d.op1. (ix86_expand_vec_perm_builtin): Likewise. Handle sorry printing also for d.nelt == 32. * gcc.dg/torture/vshuf-32.inc: Add interleave permutations. * gcc.dg/torture/vshuf-16.inc: Likewise. * gcc.dg/torture/vshuf-8.inc: Likewise. * gcc.dg/torture/vshuf-4.inc: Likewise. --- gcc/config/i386/i386.c.jj 2011-10-18 23:52:02.000000000 +0200 +++ gcc/config/i386/i386.c 2011-10-19 19:02:57.000000000 +0200 @@ -35992,7 +35992,7 @@ expand_vec_perm_vpshufb2_vpermq_even_odd vperm = force_reg (V32QImode, vperm); h = gen_reg_rtx (V32QImode); - op = gen_lowpart (V32QImode, d->op0); + op = gen_lowpart (V32QImode, d->op1); emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm)); ior = gen_reg_rtx (V32QImode); @@ -36154,9 +36154,9 @@ expand_vec_perm_even_odd_1 (struct expan /* Swap the 2nd and 3rd position in each lane into { 0 2 1 3 8 a 9 b } and { 4 6 5 7 c e d f }. */ emit_insn (gen_avx2_pshufdv3 (t1, t1, - GEN_INT (2 * 2 + 1 * 16 + 3 * 64))); + GEN_INT (2 * 4 + 1 * 16 + 3 * 64))); emit_insn (gen_avx2_pshufdv3 (t2, t2, - GEN_INT (2 * 2 + 1 * 16 + 3 * 64))); + GEN_INT (2 * 4 + 1 * 16 + 3 * 64))); /* Now an vpunpck[lh]qdq will produce { 0 2 4 6 8 a c e } resp. { 1 3 5 7 9 b d f }. */ @@ -36498,6 +36498,7 @@ ix86_expand_vec_perm_builtin (tree exp) { struct expand_vec_perm_d d; tree arg0, arg1, arg2; + bool maybe_retry = false; arg0 = CALL_EXPR_ARG (exp, 0); arg1 = CALL_EXPR_ARG (exp, 1); @@ -36543,6 +36544,7 @@ ix86_expand_vec_perm_builtin (tree exp) for (i = 0; i < nelt; ++i) if (d.perm[i] >= nelt) d.perm[i] -= nelt; + maybe_retry = true; } /* FALLTHRU */ @@ -36563,6 +36565,28 @@ ix86_expand_vec_perm_builtin (tree exp) if (ix86_expand_vec_perm_builtin_1 (&d)) return d.target; + /* If the mask says both arguments are needed, but they are the same, + the above tried to expand with d.op0 == d.op1. If that didn't work, + retry with d.op0 != d.op1 as that is what testing has been done with. */ + if (maybe_retry) + { + rtx seq; + bool ok; + + extract_vec_perm_cst (&d, arg2); + d.op1 = gen_reg_rtx (d.vmode); + start_sequence (); + ok = ix86_expand_vec_perm_builtin_1 (&d); + seq = get_insns (); + end_sequence (); + if (ok) + { + emit_move_insn (d.op1, d.op0); + emit_insn (seq); + return d.target; + } + } + /* For compiler generated permutations, we should never got here, because the compiler should also be checking the ok hook. But since this is a builtin the user has access too, so don't abort. */ @@ -36588,6 +36612,19 @@ ix86_expand_vec_perm_builtin (tree exp) d.perm[8], d.perm[9], d.perm[10], d.perm[11], d.perm[12], d.perm[13], d.perm[14], d.perm[15]); break; + case 32: + sorry ("vector permutation " + "(%d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d " + "%d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d)", + d.perm[0], d.perm[1], d.perm[2], d.perm[3], + d.perm[4], d.perm[5], d.perm[6], d.perm[7], + d.perm[8], d.perm[9], d.perm[10], d.perm[11], + d.perm[12], d.perm[13], d.perm[14], d.perm[15], + d.perm[16], d.perm[17], d.perm[18], d.perm[19], + d.perm[20], d.perm[21], d.perm[22], d.perm[23], + d.perm[24], d.perm[25], d.perm[26], d.perm[27], + d.perm[28], d.perm[29], d.perm[30], d.perm[31]); + break; default: gcc_unreachable (); } @@ -36599,6 +36636,7 @@ bool ix86_expand_vec_perm_const (rtx operands[4]) { struct expand_vec_perm_d d; + unsigned char perm[MAX_VECT_LEN]; int i, nelt, which; rtx sel; @@ -36614,6 +36652,7 @@ ix86_expand_vec_perm_const (rtx operands gcc_assert (GET_CODE (sel) == CONST_VECTOR); gcc_assert (XVECLEN (sel, 0) == nelt); + gcc_checking_assert (sizeof (d.perm) == sizeof (perm)); for (i = which = 0; i < nelt; ++i) { @@ -36622,6 +36661,7 @@ ix86_expand_vec_perm_const (rtx operands which |= (ei < nelt ? 1 : 2); d.perm[i] = ei; + perm[i] = ei; } switch (which) @@ -36653,7 +36693,32 @@ ix86_expand_vec_perm_const (rtx operands break; } - return ix86_expand_vec_perm_builtin_1 (&d); + if (ix86_expand_vec_perm_builtin_1 (&d)) + return true; + + /* If the mask says both arguments are needed, but they are the same, + the above tried to expand with d.op0 == d.op1. If that didn't work, + retry with d.op0 != d.op1 as that is what testing has been done with. */ + if (which == 3 && d.op0 == d.op1) + { + rtx seq; + bool ok; + + memcpy (d.perm, perm, sizeof (perm)); + d.op1 = gen_reg_rtx (d.vmode); + start_sequence (); + ok = ix86_expand_vec_perm_builtin_1 (&d); + seq = get_insns (); + end_sequence (); + if (ok) + { + emit_move_insn (d.op1, d.op0); + emit_insn (seq); + return true; + } + } + + return false; } /* Implement targetm.vectorize.builtin_vec_perm_ok. */ --- gcc/testsuite/gcc.dg/torture/vshuf-32.inc.jj 2011-10-17 22:27:41.000000000 +0200 +++ gcc/testsuite/gcc.dg/torture/vshuf-32.inc 2011-10-19 15:16:40.000000000 +0200 @@ -15,7 +15,9 @@ T (11, 13, 40, 7, 33, 51, 21, 59, 46, 47 T (12, 39, 43, 54, 27, 53, 39, 27, 30, 2, 17, 13, 33, 7, 52, 40, 15, 36, 57, 10, 28, 22, 23, 25, 24, 41, 47, 8, 20, 5, 3, 4, 0) \ T (13, 7, 51, 13, 61, 25, 4, 19, 58, 35, 33, 29, 15, 40, 2, 39, 16, 38, 3, 54, 63, 15, 6, 48, 21, 14, 52, 17, 50, 34, 55, 57, 50) \ T (14, 22, 53, 28, 42, 45, 38, 49, 13, 54, 61, 21, 52, 7, 16, 34, 9, 1, 43, 62, 43, 35, 50, 47, 58, 20, 3, 30, 15, 37, 53, 43, 36) \ -T (15, 2, 43, 49, 34, 28, 35, 29, 36, 51, 9, 17, 48, 10, 37, 45, 21, 52, 19, 25, 33, 60, 31, 30, 42, 12, 26, 27, 46, 5, 40, 14, 36) +T (15, 2, 43, 49, 34, 28, 35, 29, 36, 51, 9, 17, 48, 10, 37, 45, 21, 52, 19, 25, 33, 60, 31, 30, 42, 12, 26, 27, 46, 5, 40, 14, 36) \ +T (16, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62) \ +T (17, 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31, 33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63) #define EXPTESTS \ T (116, 13, 38, 47, 3, 17, 8, 38, 20, 59, 61, 39, 26, 7, 49, 63, 43, 57, 16, 40, 19, 4, 32, 27, 7, 52, 19, 46, 55, 36, 41, 48, 6) \ T (117, 39, 35, 59, 20, 56, 18, 58, 63, 57, 14, 2, 16, 5, 61, 35, 4, 53, 9, 52, 51, 27, 33, 61, 12, 3, 35, 36, 40, 37, 7, 45, 42) \ --- gcc/testsuite/gcc.dg/torture/vshuf-4.inc.jj 2011-10-17 22:27:41.000000000 +0200 +++ gcc/testsuite/gcc.dg/torture/vshuf-4.inc 2011-10-19 15:17:00.000000000 +0200 @@ -15,7 +15,9 @@ T (11, 1, 4, 0, 7) \ T (12, 1, 5, 7, 2) \ T (13, 2, 3, 0, 4) \ T (14, 7, 6, 4, 2) \ -T (15, 6, 1, 3, 4) +T (15, 6, 1, 3, 4) \ +T (16, 0, 2, 4, 6) \ +T (17, 1, 3, 5, 7) #define EXPTESTS \ T (116, 1, 2, 4, 3) \ T (117, 7, 3, 3, 0) \ --- gcc/testsuite/gcc.dg/torture/vshuf-8.inc.jj 2011-10-17 22:27:41.000000000 +0200 +++ gcc/testsuite/gcc.dg/torture/vshuf-8.inc 2011-10-19 15:17:31.000000000 +0200 @@ -15,7 +15,9 @@ T (11, 5, 11, 12, 6, 3, 2, 4, 15) \ T (12, 5, 13, 14, 8, 4, 10, 4, 12) \ T (13, 14, 8, 12, 3, 13, 9, 5, 4) \ T (14, 15, 3, 13, 6, 14, 12, 10, 0) \ -T (15, 0, 5, 11, 7, 4, 6, 14, 1) +T (15, 0, 5, 11, 7, 4, 6, 14, 1) \ +T (16, 0, 2, 4, 6, 8, 10, 12, 14) \ +T (17, 1, 3, 5, 7, 9, 11, 13, 15) #define EXPTESTS \ T (116, 9, 3, 9, 4, 7, 0, 0, 6) \ T (117, 4, 14, 12, 8, 9, 6, 0, 10) \ --- gcc/testsuite/gcc.dg/torture/vshuf-16.inc.jj 2011-10-17 22:27:41.000000000 +0200 +++ gcc/testsuite/gcc.dg/torture/vshuf-16.inc 2011-10-19 15:15:42.000000000 +0200 @@ -15,7 +15,9 @@ T (11, 15, 9, 14, 10, 8, 12, 13, 11, 7, T (12, 2, 5, 24, 23, 17, 22, 20, 21, 12, 14, 13, 8, 6, 20, 10, 18) \ T (13, 23, 11, 15, 9, 0, 14, 8, 12, 10, 13, 19, 11, 2, 26, 24, 30) \ T (14, 25, 5, 17, 1, 9, 15, 21, 7, 28, 2, 18, 13, 30, 14, 10, 4) \ -T (15, 1, 30, 27, 31, 9, 18, 25, 12, 7, 4, 2, 16, 25, 20, 10, 3) +T (15, 1, 30, 27, 31, 9, 18, 25, 12, 7, 4, 2, 16, 25, 20, 10, 3) \ +T (16, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30) \ +T (17, 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31) #define EXPTESTS \ T (116, 28, 13, 27, 11, 21, 1, 5, 22, 29, 14, 15, 6, 3, 10, 16, 30) \ T (117, 22, 26, 1, 13, 29, 3, 18, 18, 11, 21, 12, 28, 19, 5, 7, 4) \ Jakub