On Fri, Oct 27, 2023 at 3:21 PM Hongtao Liu <crazy...@gmail.com> wrote:
>
> On Fri, Oct 27, 2023 at 2:49 PM Richard Biener
> <richard.guent...@gmail.com> wrote:
> >
> >
> >
> > > Am 27.10.2023 um 07:50 schrieb liuhongt <hongtao....@intel.com>:
> > >
> > > > When the 2 vectors are equal, the kmask is all-ones and kortest will set
> > > > CF; otherwise CF will be cleared.
> > >
> > > So CF bit can be used to check for the result of the comparison.
> > >
> > > Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
> > > Ok for trunk?
> >
> > Is that also profitable for 256bit aka AVX10?
> Yes, it's also available for both 128-bit and 256-bit vectors with AVX10,
> and from a performance perspective it's better.
> AVX10:
>   vpcmp + kortest
>  vs
> AVX2:
>  vpxor + vptest
>
>  vptest is more expensive than vpcmp + kortest
>
> > Is there a jump on carry in case the result feeds control flow rather than 
> > a value and is using ktest better then (does combine figure this out?)
> There are JC and JNC. However, there are many pattern matches for ptest
> which can't be automatically adjusted to kortest by the combine pass, so the
> backend needs to transform them manually.
> That's why my patch only handles 64-bit vectors(to avoid regressing
I mean 64-byte (512-bit) vectors.
> those pattern match stuff).
>
> >
> > > Before:
> > >        vmovdqu (%rsi), %ymm0
> > >        vpxorq  (%rdi), %ymm0, %ymm0
> > >        vptest  %ymm0, %ymm0
> > >        jne     .L2
> > >        vmovdqu 32(%rsi), %ymm0
> > >        vpxorq  32(%rdi), %ymm0, %ymm0
> > >        vptest  %ymm0, %ymm0
> > >        je      .L5
> > > .L2:
> > >        movl    $1, %eax
> > >        xorl    $1, %eax
> > >        vzeroupper
> > >        ret
> > >
> > > After:
> > >        vmovdqu64       (%rsi), %zmm0
> > >        xorl    %eax, %eax
> > >        vpcmpeqd        (%rdi), %zmm0, %k0
> > >        kortestw        %k0, %k0
> > >        setc    %al
> > >        vzeroupper
> > >        ret
> > >
> > > gcc/ChangeLog:
> > >
> > >    PR target/104610
> > >    * config/i386/i386-expand.cc (ix86_expand_branch): Handle
> > >    512-bit vector with vpcmpeq + kortest.
> > >    * config/i386/i386.md (cbranchxi4): New expander.
> > > >    * config/i386/sse.md (cbranch<mode>4): Extend to V16SImode
> > >    and V8DImode.
> > >
> > > gcc/testsuite/ChangeLog:
> > >
> > >    * gcc.target/i386/pr104610-2.c: New test.
> > > ---
> > > gcc/config/i386/i386-expand.cc             | 55 +++++++++++++++-------
> > > gcc/config/i386/i386.md                    | 16 +++++++
> > > gcc/config/i386/sse.md                     | 36 +++++++++++---
> > > gcc/testsuite/gcc.target/i386/pr104610-2.c | 14 ++++++
> > > 4 files changed, 99 insertions(+), 22 deletions(-)
> > > create mode 100644 gcc/testsuite/gcc.target/i386/pr104610-2.c
> > >
> > > diff --git a/gcc/config/i386/i386-expand.cc 
> > > b/gcc/config/i386/i386-expand.cc
> > > index 1eae9d7c78c..c664cb61e80 100644
> > > --- a/gcc/config/i386/i386-expand.cc
> > > +++ b/gcc/config/i386/i386-expand.cc
> > > @@ -2411,30 +2411,53 @@ ix86_expand_branch (enum rtx_code code, rtx op0, 
> > > rtx op1, rtx label)
> > >   rtx tmp;
> > >
> > >   /* Handle special case - vector comparsion with boolean result, 
> > > transform
> > > -     it using ptest instruction.  */
> > > +     it using ptest instruction or vpcmpeq + kortest.  */
> > >   if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
> > >       || (mode == TImode && !TARGET_64BIT)
> > > -      || mode == OImode)
> > > +      || mode == OImode
> > > +      || GET_MODE_SIZE (mode) == 64)
> > >     {
> > > -      rtx flag = gen_rtx_REG (CCZmode, FLAGS_REG);
> > > -      machine_mode p_mode = GET_MODE_SIZE (mode) == 32 ? V4DImode : 
> > > V2DImode;
> > > +      unsigned msize = GET_MODE_SIZE (mode);
> > > +      machine_mode p_mode
> > > +    = msize == 64 ? V16SImode : msize == 32 ? V4DImode : V2DImode;
> > > +      /* kortest set CF when result is 0xFFFF (op0 == op1).  */
> > > +      rtx flag = gen_rtx_REG (msize == 64 ? CCCmode : CCZmode, 
> > > FLAGS_REG);
> > >
> > >       gcc_assert (code == EQ || code == NE);
> > >
> > > -      if (GET_MODE_CLASS (mode) != MODE_VECTOR_INT)
> > > +      /* Using vpcmpeq zmm zmm k + kortest for 512-bit vectors.  */
> > > +      if (msize == 64)
> > >    {
> > > -      op0 = lowpart_subreg (p_mode, force_reg (mode, op0), mode);
> > > -      op1 = lowpart_subreg (p_mode, force_reg (mode, op1), mode);
> > > -      mode = p_mode;
> > > +      if (mode != V16SImode)
> > > +        {
> > > +          op0 = lowpart_subreg (p_mode, force_reg (mode, op0), mode);
> > > +          op1 = lowpart_subreg (p_mode, force_reg (mode, op1), mode);
> > > +        }
> > > +
> > > +      tmp = gen_reg_rtx (HImode);
> > > +      emit_insn (gen_avx512f_cmpv16si3 (tmp, op0, op1, GEN_INT (0)));
> > > +      emit_insn (gen_kortesthi_ccc (tmp, tmp));
> > > +    }
> > > +      /* Using ptest for 128/256-bit vectors.  */
> > > +      else
> > > +    {
> > > +      if (GET_MODE_CLASS (mode) != MODE_VECTOR_INT)
> > > +        {
> > > +          op0 = lowpart_subreg (p_mode, force_reg (mode, op0), mode);
> > > +          op1 = lowpart_subreg (p_mode, force_reg (mode, op1), mode);
> > > +          mode = p_mode;
> > > +        }
> > > +
> > > +      /* Generate XOR since we can't check that one operand is zero
> > > +         vector.  */
> > > +      tmp = gen_reg_rtx (mode);
> > > +      emit_insn (gen_rtx_SET (tmp, gen_rtx_XOR (mode, op0, op1)));
> > > +      tmp = gen_lowpart (p_mode, tmp);
> > > +      emit_insn (gen_rtx_SET (gen_rtx_REG (CCZmode, FLAGS_REG),
> > > +                  gen_rtx_UNSPEC (CCZmode,
> > > +                          gen_rtvec (2, tmp, tmp),
> > > +                          UNSPEC_PTEST)));
> > >    }
> > > -      /* Generate XOR since we can't check that one operand is zero 
> > > vector.  */
> > > -      tmp = gen_reg_rtx (mode);
> > > -      emit_insn (gen_rtx_SET (tmp, gen_rtx_XOR (mode, op0, op1)));
> > > -      tmp = gen_lowpart (p_mode, tmp);
> > > -      emit_insn (gen_rtx_SET (gen_rtx_REG (CCZmode, FLAGS_REG),
> > > -                  gen_rtx_UNSPEC (CCZmode,
> > > -                          gen_rtvec (2, tmp, tmp),
> > > -                          UNSPEC_PTEST)));
> > >       tmp = gen_rtx_fmt_ee (code, VOIDmode, flag, const0_rtx);
> > >       tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
> > >                  gen_rtx_LABEL_REF (VOIDmode, label),
> > > diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
> > > index abaf2f311e8..51d8d0c3b97 100644
> > > --- a/gcc/config/i386/i386.md
> > > +++ b/gcc/config/i386/i386.md
> > > @@ -1442,6 +1442,22 @@ (define_expand "cbranchoi4"
> > >   DONE;
> > > })
> > >
> > > +(define_expand "cbranchxi4"
> > > +  [(set (reg:CC FLAGS_REG)
> > > +    (compare:CC (match_operand:XI 1 "nonimmediate_operand")
> > > +            (match_operand:XI 2 "nonimmediate_operand")))
> > > +   (set (pc) (if_then_else
> > > +           (match_operator 0 "bt_comparison_operator"
> > > +        [(reg:CC FLAGS_REG) (const_int 0)])
> > > +           (label_ref (match_operand 3))
> > > +           (pc)))]
> > > +  "TARGET_AVX512F && TARGET_EVEX512 && !TARGET_PREFER_AVX256"
> > > +{
> > > +  ix86_expand_branch (GET_CODE (operands[0]),
> > > +              operands[1], operands[2], operands[3]);
> > > +  DONE;
> > > +})
> > > +
> > > (define_expand "cstore<mode>4"
> > >   [(set (reg:CC FLAGS_REG)
> > >    (compare:CC (match_operand:SDWIM 2 "nonimmediate_operand")
> > > diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
> > > index c988935d4df..88fb1154699 100644
> > > --- a/gcc/config/i386/sse.md
> > > +++ b/gcc/config/i386/sse.md
> > > @@ -2175,9 +2175,9 @@ (define_insn "ktest<mode>"
> > >    (set_attr "type" "msklog")
> > >    (set_attr "prefix" "vex")])
> > >
> > > -(define_insn "kortest<mode>"
> > > -  [(set (reg:CC FLAGS_REG)
> > > -    (unspec:CC
> > > +(define_insn "*kortest<mode>"
> > > +  [(set (reg FLAGS_REG)
> > > +    (unspec
> > >      [(match_operand:SWI1248_AVX512BWDQ 0 "register_operand" "k")
> > >       (match_operand:SWI1248_AVX512BWDQ 1 "register_operand" "k")]
> > >      UNSPEC_KORTEST))]
> > > @@ -2187,6 +2187,30 @@ (define_insn "kortest<mode>"
> > >    (set_attr "type" "msklog")
> > >    (set_attr "prefix" "vex")])
> > >
> > > +(define_insn "kortest<mode>_ccc"
> > > +  [(set (reg:CCC FLAGS_REG)
> > > +    (unspec:CCC
> > > +      [(match_operand:SWI1248_AVX512BWDQ 0 "register_operand")
> > > +       (match_operand:SWI1248_AVX512BWDQ 1 "register_operand")]
> > > +      UNSPEC_KORTEST))]
> > > +  "TARGET_AVX512F")
> > > +
> > > +(define_insn "kortest<mode>_ccz"
> > > +  [(set (reg:CCZ FLAGS_REG)
> > > +    (unspec:CCZ
> > > +      [(match_operand:SWI1248_AVX512BWDQ 0 "register_operand")
> > > +       (match_operand:SWI1248_AVX512BWDQ 1 "register_operand")]
> > > +      UNSPEC_KORTEST))]
> > > +  "TARGET_AVX512F")
> > > +
> > > +(define_expand "kortest<mode>"
> > > +  [(set (reg:CC FLAGS_REG)
> > > +    (unspec:CC
> > > +      [(match_operand:SWI1248_AVX512BWDQ 0 "register_operand")
> > > +       (match_operand:SWI1248_AVX512BWDQ 1 "register_operand")]
> > > +      UNSPEC_KORTEST))]
> > > +  "TARGET_AVX512F")
> > > +
> > > (define_insn "kunpckhi"
> > >   [(set (match_operand:HI 0 "register_operand" "=k")
> > >    (ior:HI
> > > @@ -27840,14 +27864,14 @@ (define_insn "<avx512>_store<mode>_mask"
> > >
> > > (define_expand "cbranch<mode>4"
> > >   [(set (reg:CC FLAGS_REG)
> > > -    (compare:CC (match_operand:VI48_AVX 1 "register_operand")
> > > -            (match_operand:VI48_AVX 2 "nonimmediate_operand")))
> > > +    (compare:CC (match_operand:VI48_AVX_AVX512F 1 "register_operand")
> > > +            (match_operand:VI48_AVX_AVX512F 2 "nonimmediate_operand")))
> > >    (set (pc) (if_then_else
> > >           (match_operator 0 "bt_comparison_operator"
> > >        [(reg:CC FLAGS_REG) (const_int 0)])
> > >           (label_ref (match_operand 3))
> > >           (pc)))]
> > > -  "TARGET_SSE4_1"
> > > +  "TARGET_SSE4_1 && (<MODE_SIZE> != 64 || !TARGET_PREFER_AVX256)"
> > > {
> > >   ix86_expand_branch (GET_CODE (operands[0]),
> > >              operands[1], operands[2], operands[3]);
> > > diff --git a/gcc/testsuite/gcc.target/i386/pr104610-2.c 
> > > b/gcc/testsuite/gcc.target/i386/pr104610-2.c
> > > new file mode 100644
> > > index 00000000000..999ef926a18
> > > --- /dev/null
> > > +++ b/gcc/testsuite/gcc.target/i386/pr104610-2.c
> > > @@ -0,0 +1,14 @@
> > > +/* { dg-do compile } */
> > > +/* { dg-options "-mavx512f -O2 -mtune=generic" } */
> > > +/* { dg-final { scan-assembler-times {(?n)vpcmpeq.*zmm} 2 } } */
> > > +/* { dg-final { scan-assembler-times {(?n)kortest.*k[0-7]} 2 } } */
> > > +
> > > +int compare (const char* s1, const char* s2)
> > > +{
> > > +  return __builtin_memcmp (s1, s2, 64) == 0;
> > > +}
> > > +
> > > +int compare1 (const char* s1, const char* s2)
> > > +{
> > > +  return __builtin_memcmp (s1, s2, 64) != 0;
> > > +}
> > > --
> > > 2.31.1
> > >
>
>
>
> --
> BR,
> Hongtao



-- 
BR,
Hongtao

Reply via email to