On Thu, Aug 12, 2021 at 01:43:23PM +0800, liuhongt wrote:
> Hi:
>   This is another patch to optimize vec_perm_expr to match vpmov{dw,qd,wb}
> under AVX512.
>   For scenarios (like pr101846-2.c) where the upper half is not used, this
> patch generates better code with only one vpmov{wb,dw,qd} instruction.  For
> scenarios (like pr101846-3.c) where the upper half is actually used, if the
> src vector length is 256/512 bits the patch can still generate better code,
> but for 128 bits the code generation is worse.
> 
> 128 bits, upper half not used:
> 
> -       vpshufb .LC2(%rip), %xmm0, %xmm0
> +       vpmovdw %xmm0, %xmm0
> 
> 128 bits, upper half used:
> -       vpshufb .LC2(%rip), %xmm0, %xmm0
> +       vpmovdw %xmm0, %xmm1
> +       vmovq   %xmm1, %rax
> +       vpinsrq $0, %rax, %xmm0, %xmm0
> 
>   Maybe expand_vec_perm_trunc_vinsert should only deal with 256/512-bit
> vectors, but considering that the real-world likelihood of scenarios like
> the pr101846-3.c foo_*_128 functions is relatively low, I still keep this
> part of the code.
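
For reference, the kind of permutation being matched here looks roughly like
the following (a hypothetical illustration in the spirit of pr101846-2.c,
written with __builtin_shufflevector purely for exposition, not the actual
testcase): the low half of the result selects the even elements of the
source, i.e. a truncation, and the upper half is the identity, so the whole
permutation can be done by a single vpmovdw.

  typedef short v8hi __attribute__ ((vector_size (16)));

  v8hi
  foo_dw_128 (v8hi x)
  {
    /* Low half: even elements 0,2,4,6, i.e. the low word of each dword;
       upper half: identity 4,5,6,7 (the "not used" part).  */
    return __builtin_shufflevector (x, x, 0, 2, 4, 6, 4, 5, 6, 7);
  }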

I actually am not sure if even
 foo_dw_512:
 .LFB0:
        .cfi_startproc
-       vmovdqa64       %zmm0, %zmm1
-       vmovdqa64       .LC0(%rip), %zmm0
-       vpermi2w        %zmm1, %zmm1, %zmm0
+       vpmovdw %zmm0, %ymm1
+       vinserti64x4    $0x0, %ymm1, %zmm0, %zmm0
        ret
is always a win; the permutations we should care most about are in loops,
where the constant load as well as the first move likely go away, and then
it is one permutation insn vs. two.
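
To make that concrete, a rough sketch (a hypothetical testcase, again using
__builtin_shufflevector just for illustration) of the loop situation meant
above:

  typedef short v32hi __attribute__ ((vector_size (64)));

  void
  bar (v32hi *dst, v32hi *src, int n)
  {
    for (int i = 0; i < n; i++)
      /* Truncating permutation in the low half, identity upper half.  */
      dst[i] = __builtin_shufflevector (src[i], src[i],
                                        0, 2, 4, 6, 8, 10, 12, 14,
                                        16, 18, 20, 22, 24, 26, 28, 30,
                                        16, 17, 18, 19, 20, 21, 22, 23,
                                        24, 25, 26, 27, 28, 29, 30, 31);
  }

Here the .LC0 load and the vmovdqa64 copy are loop invariant and likely get
hoisted, so the old sequence costs one vpermi2w per iteration while the new
one costs vpmovdw + vinserti64x4, i.e. two instructions per iteration.
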
A different case is e.g.:
-       vmovdqa64       .LC5(%rip), %zmm2
-       vmovdqa64       %zmm0, %zmm1
-       vmovdqa64       .LC0(%rip), %zmm0
-       vpermi2w        %zmm1, %zmm1, %zmm2
-       vpermi2w        %zmm1, %zmm1, %zmm0
-       vpshufb .LC6(%rip), %zmm0, %zmm0
-       vpshufb .LC7(%rip), %zmm2, %zmm1
-       vporq   %zmm1, %zmm0, %zmm0
+       vpmovwb %zmm0, %ymm1
+       vinserti64x4    $0x0, %ymm1, %zmm0, %zmm0
So, I wonder if your new routine shouldn't instead be done in
ix86_expand_vec_perm_const_1 after expand_vec_perm_1 among the other 2-insn
cases, and whether the other vpmovdw etc. cases shouldn't be handled in
combine splitters (which would see that we only use the low half or quarter
of the result and transform whatever permutation we've used into what we
want).

And perhaps make the routine eventually more general: don't handle just the
identity permutation in the upper half, but allow other permutations there
too (ones where that half can be represented by a single-insn permutation).
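
For instance (again purely a hypothetical illustration), something like

  typedef short v8hi __attribute__ ((vector_size (16)));

  v8hi
  baz (v8hi x)
  {
    /* Low half: truncation as before; upper half: 7,6,5,4, which is not
       the identity but is by itself a single-insn permutation
       (e.g. pshufhw).  */
    return __builtin_shufflevector (x, x, 0, 2, 4, 6, 7, 6, 5, 4);
  }

could then still go through the truncate + insert-low-half path, with the
upper half produced by that single permutation instruction first.
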
> 
>   Bootstrapped and regtested on x86_64-linux-gnu{-m32,}.
>   Ok for trunk?
> 
> gcc/ChangeLog:
> 
>       PR target/101846
>       * config/i386/i386-expand.c (expand_vec_perm_trunc_vinsert):
>       New function.
>       (ix86_vectorize_vec_perm_const): Call
>       expand_vec_perm_trunc_vinsert.
>       * config/i386/sse.md (vec_set_lo_v32hi): New define_insn.
>       (vec_set_lo_v64qi): Ditto.
>       (vec_set_lo_<mode><mask_name>): Extend to no-avx512dq.
> 
> gcc/testsuite/ChangeLog:
> 
>       PR target/101846
>       * gcc.target/i386/pr101846-2.c: New test.
>       * gcc.target/i386/pr101846-3.c: New test.
> ---
>  gcc/config/i386/i386-expand.c              | 125 +++++++++++++++++++++
>  gcc/config/i386/sse.md                     |  60 +++++++++-
>  gcc/testsuite/gcc.target/i386/pr101846-2.c |  81 +++++++++++++
>  gcc/testsuite/gcc.target/i386/pr101846-3.c |  95 ++++++++++++++++
>  4 files changed, 359 insertions(+), 2 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr101846-2.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr101846-3.c
> 
> diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c
> index bd21efa9530..519caac2e15 100644
> --- a/gcc/config/i386/i386-expand.c
> +++ b/gcc/config/i386/i386-expand.c
> @@ -18317,6 +18317,126 @@ expand_vec_perm_1 (struct expand_vec_perm_d *d)
>    return false;
>  }
>  
> +/* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement D
> +   in terms of a pair of vpmovdw + vinserti128 instructions.  */
> +static bool
> +expand_vec_perm_trunc_vinsert (struct expand_vec_perm_d *d)
> +{
> +  unsigned i, nelt = d->nelt, mask = d->nelt - 1;
> +  unsigned half = nelt / 2;
> +  machine_mode half_mode, trunc_mode;
> +
> +  /* vpmov{wb,dw,qd} only available under AVX512.  */
> +  if (!d->one_operand_p || !TARGET_AVX512F
> +      || (!TARGET_AVX512VL  && GET_MODE_SIZE (d->vmode) < 64)

Too many spaces.
> +      || GET_MODE_SIZE (GET_MODE_INNER (d->vmode)) > 4)
> +    return false;
> +
> +  /* TARGET_AVX512BW is needed for vpmovwb.  */
> +  if (GET_MODE_INNER (d->vmode) == E_QImode && !TARGET_AVX512BW)
> +    return false;
> +
> +  for (i = 0; i < nelt; i++)
> +    {
> +      unsigned idx = d->perm[i] & mask;
> +      if (idx != i * 2 && i < half)
> +     return false;
> +      if (idx != i && i >= half)
> +     return false;
> +    }
> +
> +  rtx (*gen_trunc) (rtx, rtx) = NULL;
> +  rtx (*gen_vec_set_lo) (rtx, rtx, rtx) = NULL;
> +  switch (d->vmode)
> +    {
> +    case E_V16QImode:
> +      gen_trunc = gen_truncv8hiv8qi2;
> +      gen_vec_set_lo = gen_vec_setv2di;
> +      half_mode = V8QImode;
> +      trunc_mode = V8HImode;
> +      break;
> +    case E_V32QImode:
> +      gen_trunc = gen_truncv16hiv16qi2;
> +      gen_vec_set_lo = gen_vec_set_lo_v32qi;
> +      half_mode = V16QImode;
> +      trunc_mode = V16HImode;
> +      break;
> +    case E_V64QImode:
> +      gen_trunc = gen_truncv32hiv32qi2;
> +      gen_vec_set_lo = gen_vec_set_lo_v64qi;
> +      half_mode = V32QImode;
> +      trunc_mode = V32HImode;
> +      break;
> +    case E_V8HImode:
> +      gen_trunc = gen_truncv4siv4hi2;
> +      gen_vec_set_lo = gen_vec_setv2di;
> +      half_mode = V4HImode;
> +      trunc_mode = V4SImode;
> +      break;
> +    case E_V16HImode:
> +      gen_trunc = gen_truncv8siv8hi2;
> +      gen_vec_set_lo = gen_vec_set_lo_v16hi;
> +      half_mode = V8HImode;
> +      trunc_mode = V8SImode;
> +      break;
> +    case E_V32HImode:
> +      gen_trunc = gen_truncv16siv16hi2;
> +      gen_vec_set_lo = gen_vec_set_lo_v32hi;
> +      half_mode = V16HImode;
> +      trunc_mode = V16SImode;
> +      break;
> +    case E_V4SImode:
> +      gen_trunc = gen_truncv2div2si2;
> +      gen_vec_set_lo = gen_vec_setv2di;
> +      half_mode = V2SImode;
> +      trunc_mode = V2DImode;
> +      break;
> +    case E_V8SImode:
> +      gen_trunc = gen_truncv4div4si2;
> +      gen_vec_set_lo = gen_vec_set_lo_v8si;
> +      half_mode = V4SImode;
> +      trunc_mode = V4DImode;
> +      break;
> +    case E_V16SImode:
> +      gen_trunc = gen_truncv8div8si2;
> +      gen_vec_set_lo = gen_vec_set_lo_v16si;
> +      half_mode = V8SImode;
> +      trunc_mode = V8DImode;
> +      break;
> +
> +    default:
> +      break;
> +    }
> +
> +  if (gen_trunc == NULL)
> +    return false;

Isn't it simpler to return false; in the default: case?
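
I.e. something like the following for the tail of the switch, which would
also make the separate gen_trunc == NULL check unnecessary:

    default:
      return false;
    }
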
> @@ -21028,6 +21148,11 @@ ix86_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
>    d.op0 = nop0;
>    d.op1 = force_reg (vmode, d.op1);
>  
> +  /* Try to match vpmov{wb,dw,qd}, although vinserti128 will be generated,
> +     it's very likely to be optimized off. So let's put the function here.  */

Two spaces after full stop.

        Jakub
