Re: [PATCH] i386, v2: Optimize _mm_unpacklo_epi8 of 0 vector as second argument or similar VEC_PERM_EXPRs into pmovzx [PR95905]

Uros Bizjak via Gcc-patches Tue, 12 Jan 2021 06:03:39 -0800

On Tue, Jan 12, 2021 at 2:40 PM Jakub Jelinek <ja...@redhat.com> wrote:
>
> On Tue, Jan 12, 2021 at 11:42:44AM +0100, Uros Bizjak via Gcc-patches wrote:
> > You can use post-reload define_insn_and_split here. This way,
> > gen_lowpart on all arguments, including output, can be used. So,
> > instead of generating an insn template, the patterns you introduced
> > should split to "real" sse4_1 zero-extend insns. This approach is
> > preferred to avoid having several pseudo-insns in .md files that do
> > the same thing with slightly different patterns. There are many
> > examples of post-reload splitters that use gen_lowpart in i386.md.
>
> So like this?
>
> If I tweak the vec_perm_const, the other define_insn_and_split will be
> easier, as they won't need the vec_select and variants in what they split
> into, just lowpart_subreg on the operand unconditionally and zero_extend it.
>
> 2021-01-12  Jakub Jelinek  <ja...@redhat.com>
>
>         PR target/95905
>         * config/i386/predicates.md (pmovzx_parallel): New predicate.
>         * config/i386/sse.md (*sse4_1_zero_extendv8qiv8hi2_3,
>         *sse4_1_zero_extendv4hiv4si2_3, *sse4_1_zero_extendv2siv2di2_3): New
>         define_insn_and_split patterns.
>
>         * gcc.target/i386/pr95905-1.c: New test.
>         * gcc.target/i386/pr95905-2.c: New test.


LGTM.

Thanks,
Uros.

>
> --- gcc/config/i386/predicates.md.jj    2021-01-12 11:01:28.458643868 +0100
> +++ gcc/config/i386/predicates.md       2021-01-12 13:58:39.816222121 +0100
> @@ -1600,6 +1600,38 @@ (define_predicate "addsub_vs_parallel"
>    return true;
>  })
>
> +;; Return true if OP is a parallel for a pmovz{bw,wd,dq} vec_select,
> +;; where one of the two operands of the vec_concat is const0_operand.
> +(define_predicate "pmovzx_parallel"
> +  (and (match_code "parallel")
> +       (match_code "const_int" "a"))
> +{
> +  int nelt = XVECLEN (op, 0);
> +  int elt, i;
> +
> +  if (nelt < 2)
> +    return false;
> +
> +  /* Check that the permutation is suitable for pmovz{bw,wd,dq}.
> +     For example { 0 16 1 17 2 18 3 19 4 20 5 21 6 22 7 23 }.  */
> +  elt = INTVAL (XVECEXP (op, 0, 0));
> +  if (elt == 0)
> +    {
> +      for (i = 1; i < nelt; ++i)
> +       if ((i & 1) != 0)
> +         {
> +           if (INTVAL (XVECEXP (op, 0, i)) < nelt)
> +             return false;
> +         }
> +       else if (INTVAL (XVECEXP (op, 0, i)) != i / 2)
> +         return false;
> +    }
> +  else
> +    return false;
> +
> +  return true;
> +})
> +
>  ;; Return true if OP is a parallel for a vbroadcast permute.
>  (define_predicate "avx_vbroadcast_operand"
>    (and (match_code "parallel")
> --- gcc/config/i386/sse.md.jj   2021-01-12 11:01:28.494643460 +0100
> +++ gcc/config/i386/sse.md      2021-01-12 14:30:32.688546846 +0100
> @@ -17683,6 +17683,36 @@ (define_insn_and_split "*sse4_1_<code>v8
>         (any_extend:V8HI (match_dup 1)))]
>    "operands[1] = adjust_address_nv (operands[1], V8QImode, 0);")
>
> +(define_insn_and_split "*sse4_1_zero_extendv8qiv8hi2_3"
> +  [(set (match_operand:V16QI 0 "register_operand" "=Yr,*x,v")
> +       (vec_select:V16QI
> +         (vec_concat:V32QI
> +           (match_operand:V16QI 1 "vector_operand" "Yrm,*xm,vm")
> +           (match_operand:V16QI 2 "const0_operand" "C,C,C"))
> +         (match_parallel 3 "pmovzx_parallel"
> +           [(match_operand 4 "const_int_operand" "n,n,n")])))]
> +  "TARGET_SSE4_1"
> +  "#"
> +  "&& reload_completed"
> +  [(set (match_dup 0)
> +       (zero_extend:V8HI
> +         (vec_select:V8QI
> +           (match_dup 1)
> +           (parallel [(const_int 0) (const_int 1)
> +                      (const_int 2) (const_int 3)
> +                      (const_int 4) (const_int 5)
> +                      (const_int 6) (const_int 7)]))))]
> +{
> +  operands[0] = lowpart_subreg (V8HImode, operands[0], V16QImode);
> +  if (MEM_P (operands[1]))
> +    {
> +      operands[1] = lowpart_subreg (V8QImode, operands[1], V16QImode);
> +      operands[1] = gen_rtx_ZERO_EXTEND (V8HImode, operands[1]);
> +      emit_insn (gen_rtx_SET (operands[0], operands[1]));
> +      DONE;
> +    }
> +})
> +
>  (define_expand "<insn>v8qiv8hi2"
>    [(set (match_operand:V8HI 0 "register_operand")
>         (any_extend:V8HI
> @@ -17929,6 +17959,34 @@ (define_expand "<insn>v4hiv4si2"
>      }
>  })
>
> +(define_insn_and_split "*sse4_1_zero_extendv4hiv4si2_3"
> +  [(set (match_operand:V8HI 0 "register_operand" "=Yr,*x,v")
> +       (vec_select:V8HI
> +         (vec_concat:V16HI
> +           (match_operand:V8HI 1 "vector_operand" "Yrm,*xm,vm")
> +           (match_operand:V8HI 2 "const0_operand" "C,C,C"))
> +         (match_parallel 3 "pmovzx_parallel"
> +           [(match_operand 4 "const_int_operand" "n,n,n")])))]
> +  "TARGET_SSE4_1"
> +  "#"
> +  "&& reload_completed"
> +  [(set (match_dup 0)
> +       (zero_extend:V4SI
> +         (vec_select:V4HI
> +           (match_dup 1)
> +           (parallel [(const_int 0) (const_int 1)
> +                      (const_int 2) (const_int 3)]))))]
> +{
> +  operands[0] = lowpart_subreg (V4SImode, operands[0], V8HImode);
> +  if (MEM_P (operands[1]))
> +    {
> +      operands[1] = lowpart_subreg (V4HImode, operands[1], V8HImode);
> +      operands[1] = gen_rtx_ZERO_EXTEND (V4SImode, operands[1]);
> +      emit_insn (gen_rtx_SET (operands[0], operands[1]));
> +      DONE;
> +    }
> +})
> +
>  (define_insn "avx512f_<code>v8qiv8di2<mask_name>"
>    [(set (match_operand:V8DI 0 "register_operand" "=v")
>         (any_extend:V8DI
> @@ -18283,6 +18341,32 @@ (define_insn_and_split "*sse4_1_<code>v2
>         (any_extend:V2DI (match_dup 1)))]
>    "operands[1] = adjust_address_nv (operands[1], V2SImode, 0);")
>
> +(define_insn_and_split "*sse4_1_zero_extendv2siv2di2_3"
> +  [(set (match_operand:V4SI 0 "register_operand" "=Yr,*x,v")
> +       (vec_select:V4SI
> +         (vec_concat:V8SI
> +           (match_operand:V4SI 1 "vector_operand" "Yrm,*xm,vm")
> +           (match_operand:V4SI 2 "const0_operand" "C,C,C"))
> +         (match_parallel 3 "pmovzx_parallel"
> +           [(match_operand 4 "const_int_operand" "n,n,n")])))]
> +  "TARGET_SSE4_1"
> +  "#"
> +  "&& reload_completed"
> +  [(set (match_dup 0)
> +       (zero_extend:V2DI
> +         (vec_select:V2SI (match_dup 1)
> +                          (parallel [(const_int 0) (const_int 1)]))))]
> +{
> +  operands[0] = lowpart_subreg (V2DImode, operands[0], V4SImode);
> +  if (MEM_P (operands[1]))
> +    {
> +      operands[1] = lowpart_subreg (V2SImode, operands[1], V4SImode);
> +      operands[1] = gen_rtx_ZERO_EXTEND (V2DImode, operands[1]);
> +      emit_insn (gen_rtx_SET (operands[0], operands[1]));
> +      DONE;
> +    }
> +})
> +
>  (define_expand "<insn>v2siv2di2"
>    [(set (match_operand:V2DI 0 "register_operand")
>         (any_extend:V2DI
> --- gcc/testsuite/gcc.target/i386/pr95905-1.c.jj        2021-01-12 13:58:39.820222075 +0100
> +++ gcc/testsuite/gcc.target/i386/pr95905-1.c   2021-01-12 13:58:39.820222075 +0100
> @@ -0,0 +1,26 @@
> +/* PR target/95905 */
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -msse4.1 -mno-avx" } */
> +/* { dg-final { scan-assembler "\tpmovzxbw\t" } } */
> +/* { dg-final { scan-assembler "\tpmovzxwd\t" } } */
> +/* { dg-final { scan-assembler "\tpmovzxdq\t" } } */
> +
> +#include <x86intrin.h>
> +
> +__m128i
> +f1 (__m128i a)
> +{
> +  return _mm_unpacklo_epi8 (a, _mm_setzero_si128 ());
> +}
> +
> +__m128i
> +f2 (__m128i a)
> +{
> +  return _mm_unpacklo_epi16 (a, _mm_setzero_si128 ());
> +}
> +
> +__m128i
> +f3 (__m128i a)
> +{
> +  return _mm_unpacklo_epi32 (a, _mm_setzero_si128 ());
> +}
> --- gcc/testsuite/gcc.target/i386/pr95905-2.c.jj        2021-01-12 13:58:39.820222075 +0100
> +++ gcc/testsuite/gcc.target/i386/pr95905-2.c   2021-01-12 13:58:39.820222075 +0100
> @@ -0,0 +1,46 @@
> +/* PR target/95905 */
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -msse4.1" } */
> +/* { dg-final { scan-assembler "\tv?pmovzxbw\t" } } */
> +/* { dg-final { scan-assembler "\tv?pmovzxwd\t" } } */
> +/* { dg-final { scan-assembler "\tv?pmovzxdq\t" } } */
> +
> +typedef unsigned char V1 __attribute__((vector_size (16)));
> +typedef unsigned short V2 __attribute__((vector_size (16)));
> +typedef unsigned int V3 __attribute__((vector_size (16)));
> +
> +V1
> +f1 (V1 x)
> +{
> +  return __builtin_shuffle (x, (V1) {}, (V1) { 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23 });
> +}
> +
> +V2
> +f2 (V2 x)
> +{
> +  return __builtin_shuffle (x, (V2) {}, (V2) { 0, 8, 1, 9, 2, 10, 3, 11 });
> +}
> +
> +V3
> +f3 (V3 x)
> +{
> +  return __builtin_shuffle (x, (V3) {}, (V3) { 0, 4, 1, 5 });
> +}
> +
> +V1
> +f4 (V1 *x)
> +{
> +  return __builtin_shuffle (*x, (V1) {}, (V1) { 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23 });
> +}
> +
> +V2
> +f5 (V2 *x)
> +{
> +  return __builtin_shuffle (*x, (V2) {}, (V2) { 0, 8, 1, 9, 2, 10, 3, 11 });
> +}
> +
> +V3
> +f6 (V3 *x)
> +{
> +  return __builtin_shuffle (*x, (V3) {}, (V3) { 0, 4, 1, 5 });
> +}
>
>
>         Jakub
>

Re: [PATCH] i386, v2: Optimize _mm_unpacklo_epi8 of 0 vector as second argument or similar VEC_PERM_EXPRs into pmovzx [PR95905]

Reply via email to