Ping.

On Mon, May 5, 2014 at 8:49 PM, Evgeny Stupachenko <evstu...@gmail.com> wrote:
> Is the following patch ok? It passes bootstrap and make check.
>
> diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
> index 88142a8..91f6f21 100644
> --- a/gcc/config/i386/i386.c
> +++ b/gcc/config/i386/i386.c
> @@ -42807,6 +42807,8 @@ expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
>    return true;
>  }
>
> +static bool expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d);
> +
>  /* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to instantiate D
>     in a single instruction.  */
>
> @@ -42946,6 +42948,10 @@ expand_vec_perm_1 (struct expand_vec_perm_d *d)
>    if (expand_vec_perm_pshufb (d))
>      return true;
>
> +  /* Try the AVX2 vpshufb.  */
> +  if (expand_vec_perm_vpshufb2_vpermq (d))
> +    return true;
> +
>    /* Try the AVX512F vpermi2 instructions.  */
>    rtx vec[64];
>    enum machine_mode mode = d->vmode;
> @@ -43004,7 +43010,7 @@ expand_vec_perm_pshuflw_pshufhw (struct
> expand_vec_perm_d *d)
>  }
>
>  /* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to simplify
> -   the permutation using the SSSE3 palignr instruction.  This succeeds
> +   the permutation using the SSSE3/AVX2 palignr instruction.  This succeeds
>     when all of the elements in PERM fit within one vector and we merely
>     need to shift them down so that a single vector permutation has a
>     chance to succeed.  */
> @@ -43015,14 +43021,26 @@ expand_vec_perm_palignr (struct expand_vec_perm_d 
> *d)
>    unsigned i, nelt = d->nelt;
>    unsigned min, max;
>    bool in_order, ok;
> -  rtx shift, target;
> +  rtx shift, shift1, target, tmp;
>    struct expand_vec_perm_d dcopy;
>
> -  /* Even with AVX, palignr only operates on 128-bit vectors.  */
> -  if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
> +  /* SSSE3 is required to apply PALIGNR on 16 bytes operands.  */
> +  if (GET_MODE_SIZE (d->vmode) == 16)
> +    {
> +      if (!TARGET_SSSE3)
> +       return false;
> +    }
> +  /* AVX2 is required to apply PALIGNR on 32 bytes operands.  */
> +  else if (GET_MODE_SIZE (d->vmode) == 32)
> +    {
> +      if (!TARGET_AVX2)
> +       return false;
> +    }
> +  /* Other sizes are not supported.  */
> +  else
>      return false;
>
> -  min = nelt, max = 0;
> +  min = 2 * nelt, max = 0;
>    for (i = 0; i < nelt; ++i)
>      {
>        unsigned e = d->perm[i];
> @@ -43041,9 +43059,35 @@ expand_vec_perm_palignr (struct expand_vec_perm_d *d)
>
>    dcopy = *d;
>    shift = GEN_INT (min * GET_MODE_BITSIZE (GET_MODE_INNER (d->vmode)));
> -  target = gen_reg_rtx (TImode);
> -  emit_insn (gen_ssse3_palignrti (target, gen_lowpart (TImode, d->op1),
> -                                 gen_lowpart (TImode, d->op0), shift));
> +  shift1 = GEN_INT ((min - nelt / 2) *
> +          GET_MODE_BITSIZE (GET_MODE_INNER (d->vmode)));
> +
> +  if (GET_MODE_SIZE (d->vmode) != 32)
> +    {
> +      target = gen_reg_rtx (TImode);
> +      emit_insn (gen_ssse3_palignrti (target, gen_lowpart (TImode, d->op1),
> +                                     gen_lowpart (TImode, d->op0), shift));
> +    }
> +  else
> +    {
> +      target = gen_reg_rtx (V2TImode);
> +      tmp = gen_reg_rtx (V4DImode);
> +      emit_insn (gen_avx2_permv2ti (tmp,
> +                                   gen_lowpart (V4DImode, d->op0),
> +                                   gen_lowpart (V4DImode, d->op1),
> +                                   GEN_INT (33)));
> +      if (min < nelt / 2)
> +        emit_insn (gen_avx2_palignrv2ti (target,
> +                                        gen_lowpart (V2TImode, tmp),
> +                                        gen_lowpart (V2TImode, d->op0),
> +                                        shift));
> +      else
> +       emit_insn (gen_avx2_palignrv2ti (target,
> +                                        gen_lowpart (V2TImode, d->op1),
> +                                        gen_lowpart (V2TImode, tmp),
> +                                        shift1));
> +    }
> +
>
>    dcopy.op0 = dcopy.op1 = gen_lowpart (d->vmode, target);
>    dcopy.one_operand_p = true;
>
> On Tue, Apr 29, 2014 at 1:03 AM, Richard Henderson <r...@redhat.com> wrote:
>> On 04/28/2014 01:43 PM, Evgeny Stupachenko wrote:
>>> Agree on checks:
>>>
>>>   /* PALIGNR of 2 128-bits registers takes only 1 instrucion.
>>>      Requires SSSE3.  */
>>>   if (GET_MODE_SIZE (d->vmode) == 16)
>>>     {
>>>       if(!TARGET_SSSE3)
>>>         return false;
>>>     }
>>>   /* PALIGNR of 2 256-bits registers on AVX2 costs only 2 instructions:
>>>      PERM and PALIGNR.  It is more profitable than 2 PSHUFB and PERM.  */
>>>   else if (GET_MODE_SIZE (d->vmode) == 32)
>>>     {
>>>       if(!TARGET_AVX2)
>>>         return false;
>>>     }
>>>   else
>>>     return false;
>>
>> Thanks, much better.
>>
>>
>> r~

Reply via email to