On Fri, Oct 27, 2023 at 3:21 PM Hongtao Liu <crazy...@gmail.com> wrote: > > On Fri, Oct 27, 2023 at 2:49 PM Richard Biener > <richard.guent...@gmail.com> wrote: > > > > > > > > > Am 27.10.2023 um 07:50 schrieb liuhongt <hongtao....@intel.com>: > > > > > > When 2 vectors are equal, kmask is allones and kortest will set CF, > > > else CF will be cleared. > > > > > > So CF bit can be used to check for the result of the comparison. > > > > > > Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}. > > > Ok for trunk? > > > > Is that also profitable for 256bit aka AVX10? > Yes, it's also available for both 128-bit and 256-bit with AVX10, from > performance perspective it's better. > AVX10: > vpcmp + kortest > vs > AVX2: > vpxor + vptest > > vptest is more expensive than vpcmp + kortest > > > Is there a jump on carry in case the result feeds control flow rather than > > a value and is using ktest better then (does combine figure this out?) > There are JC and JNC, there're many pattern matches for ptest which > can't be automatically adjusted to kortest by combining, backend needs > to manually transform them. > That's why my patch only handles 64-bit vectors(to avoid regressing I mean 64 bytes. > those pattern match stuff). > > > > > > Before: > > > vmovdqu (%rsi), %ymm0 > > > vpxorq (%rdi), %ymm0, %ymm0 > > > vptest %ymm0, %ymm0 > > > jne .L2 > > > vmovdqu 32(%rsi), %ymm0 > > > vpxorq 32(%rdi), %ymm0, %ymm0 > > > vptest %ymm0, %ymm0 > > > je .L5 > > > .L2: > > > movl $1, %eax > > > xorl $1, %eax > > > vzeroupper > > > ret > > > > > > After: > > > vmovdqu64 (%rsi), %zmm0 > > > xorl %eax, %eax > > > vpcmpeqd (%rdi), %zmm0, %k0 > > > kortestw %k0, %k0 > > > setc %al > > > vzeroupper > > > ret > > > > > > gcc/ChangeLog: > > > > > > PR target/104610 > > > * config/i386/i386-expand.cc (ix86_expand_branch): Handle > > > 512-bit vector with vpcmpeq + kortest. > > > * config/i386/i386.md (cbranchxi4): New expander. > > > * config/i386/sse.md: (cbranch<mode>4): Extend to V16SImode > > > and V8DImode. > > > > > > gcc/testsuite/ChangeLog: > > > > > > * gcc.target/i386/pr104610-2.c: New test. > > > --- > > > gcc/config/i386/i386-expand.cc | 55 +++++++++++++++------- > > > gcc/config/i386/i386.md | 16 +++++++ > > > gcc/config/i386/sse.md | 36 +++++++++++--- > > > gcc/testsuite/gcc.target/i386/pr104610-2.c | 14 ++++++ > > > 4 files changed, 99 insertions(+), 22 deletions(-) > > > create mode 100644 gcc/testsuite/gcc.target/i386/pr104610-2.c > > > > > > diff --git a/gcc/config/i386/i386-expand.cc > > > b/gcc/config/i386/i386-expand.cc > > > index 1eae9d7c78c..c664cb61e80 100644 > > > --- a/gcc/config/i386/i386-expand.cc > > > +++ b/gcc/config/i386/i386-expand.cc > > > @@ -2411,30 +2411,53 @@ ix86_expand_branch (enum rtx_code code, rtx op0, > > > rtx op1, rtx label) > > > rtx tmp; > > > > > > /* Handle special case - vector comparsion with boolean result, > > > transform > > > - it using ptest instruction. */ > > > + it using ptest instruction or vpcmpeq + kortest. */ > > > if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT > > > || (mode == TImode && !TARGET_64BIT) > > > - || mode == OImode) > > > + || mode == OImode > > > + || GET_MODE_SIZE (mode) == 64) > > > { > > > - rtx flag = gen_rtx_REG (CCZmode, FLAGS_REG); > > > - machine_mode p_mode = GET_MODE_SIZE (mode) == 32 ? V4DImode : > > > V2DImode; > > > + unsigned msize = GET_MODE_SIZE (mode); > > > + machine_mode p_mode > > > + = msize == 64 ? V16SImode : msize == 32 ? V4DImode : V2DImode; > > > + /* kortest set CF when result is 0xFFFF (op0 == op1). */ > > > + rtx flag = gen_rtx_REG (msize == 64 ? CCCmode : CCZmode, > > > FLAGS_REG); > > > > > > gcc_assert (code == EQ || code == NE); > > > > > > - if (GET_MODE_CLASS (mode) != MODE_VECTOR_INT) > > > + /* Using vpcmpeq zmm zmm k + kortest for 512-bit vectors. */ > > > + if (msize == 64) > > > { > > > - op0 = lowpart_subreg (p_mode, force_reg (mode, op0), mode); > > > - op1 = lowpart_subreg (p_mode, force_reg (mode, op1), mode); > > > - mode = p_mode; > > > + if (mode != V16SImode) > > > + { > > > + op0 = lowpart_subreg (p_mode, force_reg (mode, op0), mode); > > > + op1 = lowpart_subreg (p_mode, force_reg (mode, op1), mode); > > > + } > > > + > > > + tmp = gen_reg_rtx (HImode); > > > + emit_insn (gen_avx512f_cmpv16si3 (tmp, op0, op1, GEN_INT (0))); > > > + emit_insn (gen_kortesthi_ccc (tmp, tmp)); > > > + } > > > + /* Using ptest for 128/256-bit vectors. */ > > > + else > > > + { > > > + if (GET_MODE_CLASS (mode) != MODE_VECTOR_INT) > > > + { > > > + op0 = lowpart_subreg (p_mode, force_reg (mode, op0), mode); > > > + op1 = lowpart_subreg (p_mode, force_reg (mode, op1), mode); > > > + mode = p_mode; > > > + } > > > + > > > + /* Generate XOR since we can't check that one operand is zero > > > + vector. */ > > > + tmp = gen_reg_rtx (mode); > > > + emit_insn (gen_rtx_SET (tmp, gen_rtx_XOR (mode, op0, op1))); > > > + tmp = gen_lowpart (p_mode, tmp); > > > + emit_insn (gen_rtx_SET (gen_rtx_REG (CCZmode, FLAGS_REG), > > > + gen_rtx_UNSPEC (CCZmode, > > > + gen_rtvec (2, tmp, tmp), > > > + UNSPEC_PTEST))); > > > } > > > - /* Generate XOR since we can't check that one operand is zero > > > vector. */ > > > - tmp = gen_reg_rtx (mode); > > > - emit_insn (gen_rtx_SET (tmp, gen_rtx_XOR (mode, op0, op1))); > > > - tmp = gen_lowpart (p_mode, tmp); > > > - emit_insn (gen_rtx_SET (gen_rtx_REG (CCZmode, FLAGS_REG), > > > - gen_rtx_UNSPEC (CCZmode, > > > - gen_rtvec (2, tmp, tmp), > > > - UNSPEC_PTEST))); > > > tmp = gen_rtx_fmt_ee (code, VOIDmode, flag, const0_rtx); > > > tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp, > > > gen_rtx_LABEL_REF (VOIDmode, label), > > > diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md > > > index abaf2f311e8..51d8d0c3b97 100644 > > > --- a/gcc/config/i386/i386.md > > > +++ b/gcc/config/i386/i386.md > > > @@ -1442,6 +1442,22 @@ (define_expand "cbranchoi4" > > > DONE; > > > }) > > > > > > +(define_expand "cbranchxi4" > > > + [(set (reg:CC FLAGS_REG) > > > + (compare:CC (match_operand:XI 1 "nonimmediate_operand") > > > + (match_operand:XI 2 "nonimmediate_operand"))) > > > + (set (pc) (if_then_else > > > + (match_operator 0 "bt_comparison_operator" > > > + [(reg:CC FLAGS_REG) (const_int 0)]) > > > + (label_ref (match_operand 3)) > > > + (pc)))] > > > + "TARGET_AVX512F && TARGET_EVEX512 && !TARGET_PREFER_AVX256" > > > +{ > > > + ix86_expand_branch (GET_CODE (operands[0]), > > > + operands[1], operands[2], operands[3]); > > > + DONE; > > > +}) > > > + > > > (define_expand "cstore<mode>4" > > > [(set (reg:CC FLAGS_REG) > > > (compare:CC (match_operand:SDWIM 2 "nonimmediate_operand") > > > diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md > > > index c988935d4df..88fb1154699 100644 > > > --- a/gcc/config/i386/sse.md > > > +++ b/gcc/config/i386/sse.md > > > @@ -2175,9 +2175,9 @@ (define_insn "ktest<mode>" > > > (set_attr "type" "msklog") > > > (set_attr "prefix" "vex")]) > > > > > > -(define_insn "kortest<mode>" > > > - [(set (reg:CC FLAGS_REG) > > > - (unspec:CC > > > +(define_insn "*kortest<mode>" > > > + [(set (reg FLAGS_REG) > > > + (unspec > > > [(match_operand:SWI1248_AVX512BWDQ 0 "register_operand" "k") > > > (match_operand:SWI1248_AVX512BWDQ 1 "register_operand" "k")] > > > UNSPEC_KORTEST))] > > > @@ -2187,6 +2187,30 @@ (define_insn "kortest<mode>" > > > (set_attr "type" "msklog") > > > (set_attr "prefix" "vex")]) > > > > > > +(define_insn "kortest<mode>_ccc" > > > + [(set (reg:CCC FLAGS_REG) > > > + (unspec:CCC > > > + [(match_operand:SWI1248_AVX512BWDQ 0 "register_operand") > > > + (match_operand:SWI1248_AVX512BWDQ 1 "register_operand")] > > > + UNSPEC_KORTEST))] > > > + "TARGET_AVX512F") > > > + > > > +(define_insn "kortest<mode>_ccz" > > > + [(set (reg:CCZ FLAGS_REG) > > > + (unspec:CCZ > > > + [(match_operand:SWI1248_AVX512BWDQ 0 "register_operand") > > > + (match_operand:SWI1248_AVX512BWDQ 1 "register_operand")] > > > + UNSPEC_KORTEST))] > > > + "TARGET_AVX512F") > > > + > > > +(define_expand "kortest<mode>" > > > + [(set (reg:CC FLAGS_REG) > > > + (unspec:CC > > > + [(match_operand:SWI1248_AVX512BWDQ 0 "register_operand") > > > + (match_operand:SWI1248_AVX512BWDQ 1 "register_operand")] > > > + UNSPEC_KORTEST))] > > > + "TARGET_AVX512F") > > > + > > > (define_insn "kunpckhi" > > > [(set (match_operand:HI 0 "register_operand" "=k") > > > (ior:HI > > > @@ -27840,14 +27864,14 @@ (define_insn "<avx512>_store<mode>_mask" > > > > > > (define_expand "cbranch<mode>4" > > > [(set (reg:CC FLAGS_REG) > > > - (compare:CC (match_operand:VI48_AVX 1 "register_operand") > > > - (match_operand:VI48_AVX 2 "nonimmediate_operand"))) > > > + (compare:CC (match_operand:VI48_AVX_AVX512F 1 "register_operand") > > > + (match_operand:VI48_AVX_AVX512F 2 "nonimmediate_operand"))) > > > (set (pc) (if_then_else > > > (match_operator 0 "bt_comparison_operator" > > > [(reg:CC FLAGS_REG) (const_int 0)]) > > > (label_ref (match_operand 3)) > > > (pc)))] > > > - "TARGET_SSE4_1" > > > + "TARGET_SSE4_1 && (<MODE_SIZE> != 64 || !TARGET_PREFER_AVX256)" > > > { > > > ix86_expand_branch (GET_CODE (operands[0]), > > > operands[1], operands[2], operands[3]); > > > diff --git a/gcc/testsuite/gcc.target/i386/pr104610-2.c > > > b/gcc/testsuite/gcc.target/i386/pr104610-2.c > > > new file mode 100644 > > > index 00000000000..999ef926a18 > > > --- /dev/null > > > +++ b/gcc/testsuite/gcc.target/i386/pr104610-2.c > > > @@ -0,0 +1,14 @@ > > > +/* { dg-do compile } */ > > > +/* { dg-options "-mavx512f -O2 -mtune=generic" } */ > > > +/* { dg-final { scan-assembler-times {(?n)vpcmpeq.*zmm} 2 } } */ > > > +/* { dg-final { scan-assembler-times {(?n)kortest.*k[0-7]} 2 } } */ > > > + > > > +int compare (const char* s1, const char* s2) > > > +{ > > > + return __builtin_memcmp (s1, s2, 64) == 0; > > > +} > > > + > > > +int compare1 (const char* s1, const char* s2) > > > +{ > > > + return __builtin_memcmp (s1, s2, 64) != 0; > > > +} > > > -- > > > 2.31.1 > > > > > > > -- > BR, > Hongtao
-- BR, Hongtao