Ping.
On Mon, May 5, 2014 at 8:49 PM, Evgeny Stupachenko <evstu...@gmail.com> wrote: > Is the following patch ok? It passes bootstrap and make check. > > diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c > index 88142a8..91f6f21 100644 > --- a/gcc/config/i386/i386.c > +++ b/gcc/config/i386/i386.c > @@ -42807,6 +42807,8 @@ expand_vec_perm_pshufb (struct expand_vec_perm_d *d) > return true; > } > > +static bool expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d); > + > /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to instantiate D > in a single instruction. */ > > @@ -42946,6 +42948,10 @@ expand_vec_perm_1 (struct expand_vec_perm_d *d) > if (expand_vec_perm_pshufb (d)) > return true; > > + /* Try the AVX2 vpshufb. */ > + if (expand_vec_perm_vpshufb2_vpermq (d)) > + return true; > + > /* Try the AVX512F vpermi2 instructions. */ > rtx vec[64]; > enum machine_mode mode = d->vmode; > @@ -43004,7 +43010,7 @@ expand_vec_perm_pshuflw_pshufhw (struct > expand_vec_perm_d *d) > } > > /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify > - the permutation using the SSSE3 palignr instruction. This succeeds > + the permutation using the SSSE3/AVX2 palignr instruction. This succeeds > when all of the elements in PERM fit within one vector and we merely > need to shift them down so that a single vector permutation has a > chance to succeed. */ > @@ -43015,14 +43021,26 @@ expand_vec_perm_palignr (struct expand_vec_perm_d > *d) > unsigned i, nelt = d->nelt; > unsigned min, max; > bool in_order, ok; > - rtx shift, target; > + rtx shift, shift1, target, tmp; > struct expand_vec_perm_d dcopy; > > - /* Even with AVX, palignr only operates on 128-bit vectors. */ > - if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16) > + /* SSSE3 is required to apply PALIGNR on 16 bytes operands. */ > + if (GET_MODE_SIZE (d->vmode) == 16) > + { > + if (!TARGET_SSSE3) > + return false; > + } > + /* AVX2 is required to apply PALIGNR on 32 bytes operands. 
*/ > + else if (GET_MODE_SIZE (d->vmode) == 32) > + { > + if (!TARGET_AVX2) > + return false; > + } > + /* Other sizes are not supported. */ > + else > return false; > > - min = nelt, max = 0; > + min = 2 * nelt, max = 0; > for (i = 0; i < nelt; ++i) > { > unsigned e = d->perm[i]; > @@ -43041,9 +43059,35 @@ expand_vec_perm_palignr (struct expand_vec_perm_d *d) > > dcopy = *d; > shift = GEN_INT (min * GET_MODE_BITSIZE (GET_MODE_INNER (d->vmode))); > - target = gen_reg_rtx (TImode); > - emit_insn (gen_ssse3_palignrti (target, gen_lowpart (TImode, d->op1), > - gen_lowpart (TImode, d->op0), shift)); > + shift1 = GEN_INT ((min - nelt / 2) * > + GET_MODE_BITSIZE (GET_MODE_INNER (d->vmode))); > + > + if (GET_MODE_SIZE (d->vmode) != 32) > + { > + target = gen_reg_rtx (TImode); > + emit_insn (gen_ssse3_palignrti (target, gen_lowpart (TImode, d->op1), > + gen_lowpart (TImode, d->op0), shift)); > + } > + else > + { > + target = gen_reg_rtx (V2TImode); > + tmp = gen_reg_rtx (V4DImode); > + emit_insn (gen_avx2_permv2ti (tmp, > + gen_lowpart (V4DImode, d->op0), > + gen_lowpart (V4DImode, d->op1), > + GEN_INT (33))); > + if (min < nelt / 2) > + emit_insn (gen_avx2_palignrv2ti (target, > + gen_lowpart (V2TImode, tmp), > + gen_lowpart (V2TImode, d->op0), > + shift)); > + else > + emit_insn (gen_avx2_palignrv2ti (target, > + gen_lowpart (V2TImode, d->op1), > + gen_lowpart (V2TImode, tmp), > + shift1)); > + } > + > > dcopy.op0 = dcopy.op1 = gen_lowpart (d->vmode, target); > dcopy.one_operand_p = true; > > On Tue, Apr 29, 2014 at 1:03 AM, Richard Henderson <r...@redhat.com> wrote: >> On 04/28/2014 01:43 PM, Evgeny Stupachenko wrote: >>> Agree on checks: >>> >>> /* PALIGNR of 2 128-bit registers takes only 1 instruction. >>> Requires SSSE3. */ >>> if (GET_MODE_SIZE (d->vmode) == 16) >>> { >>> if(!TARGET_SSSE3) >>> return false; >>> } >>> /* PALIGNR of 2 256-bit registers on AVX2 costs only 2 instructions: >>> PERM and PALIGNR. It is more profitable than 2 PSHUFB and PERM.
*/ >>> else if (GET_MODE_SIZE (d->vmode) == 32) >>> { >>> if(!TARGET_AVX2) >>> return false; >>> } >>> else >>> return false; >> >> Thanks, much better. >> >> >> r~