On Mon, 10 Jul 2023, liuhongt via Gcc-patches wrote:
> False dependency happens when destination is only updated by > pternlog. There is no false dependency when destination is also used > in source. So either a pxor should be inserted, or input operand > should be set with constraint '0'. > > Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}. > Ready to push to trunk.

Shouldn't this patch also remove uses of vpternlog in standard_sse_constant_opcode?

A couple more questions below:

> --- a/gcc/config/i386/sse.md > +++ b/gcc/config/i386/sse.md > @@ -1382,6 +1382,29 @@ (define_insn "mov<mode>_internal" > ] > (symbol_ref "true")))]) > > +; False dependency happens on destination register which is not really > +; used when moving all ones to vector register > +(define_split > + [(set (match_operand:VMOVE 0 "register_operand") > + (match_operand:VMOVE 1 "int_float_vector_all_ones_operand"))] > + "TARGET_AVX512F && reload_completed > + && (<MODE_SIZE> == 64 || EXT_REX_SSE_REG_P (operands[0])) > + && optimize_function_for_speed_p (cfun)"

Yan's patch used optimize_insn_for_speed_p (), which looks more appropriate. Doesn't it work here as well?

> + [(set (match_dup 0) (match_dup 2)) > + (parallel > + [(set (match_dup 0) (match_dup 1)) > + (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP)])] > + "operands[2] = CONST0_RTX (<MODE>mode);") > + > +(define_insn "*vmov<mode>_constm1_pternlog_false_dep" > + [(set (match_operand:VMOVE 0 "register_operand" "=v") > + (match_operand:VMOVE 1 "int_float_vector_all_ones_operand" > "<sseconstm1>")) > + (unspec [(match_operand:VMOVE 2 "register_operand" "0")] > UNSPEC_INSN_FALSE_DEP)] > + "TARGET_AVX512VL || <MODE_SIZE> == 64" > + "vpternlogd\t{$0xFF, %0, %0, %0|%0, %0, %0, 0xFF}" > + [(set_attr "type" "sselog1") > + (set_attr "prefix" "evex")]) > + > ;; If mem_addr points to a memory region with less than whole vector size > bytes > ;; of accessible memory and k is a mask that would prevent reading the > inaccessible > ;; bytes from mem_addr, add UNSPEC_MASKLOAD to prevent it to be transformed > to vpblendd > @@ -9336,7 +9359,7 @@ (define_expand "<avx512>_cvtmask2<ssemodesuffix><mode>" > operands[3] = CONST0_RTX (<MODE>mode); > }") > > -(define_insn "*<avx512>_cvtmask2<ssemodesuffix><mode>" > +(define_insn_and_split "*<avx512>_cvtmask2<ssemodesuffix><mode>" > [(set (match_operand:VI48_AVX512VL 0 "register_operand" "=v,v") > (vec_merge:VI48_AVX512VL > (match_operand:VI48_AVX512VL 2 "vector_all_ones_operand") > @@ -9346,11 +9369,35 @@ (define_insn "*<avx512>_cvtmask2<ssemodesuffix><mode>" > "@ > vpmovm2<ssemodesuffix>\t{%1, %0|%0, %1} > vpternlog<ssemodesuffix>\t{$0x81, %0, %0, %0%{%1%}%{z%}|%0%{%1%}%{z%}, > %0, %0, 0x81}" > + "&& !TARGET_AVX512DQ && reload_completed > + && optimize_function_for_speed_p (cfun)" > + [(set (match_dup 0) (match_dup 4)) > + (parallel > + [(set (match_dup 0) > + (vec_merge:VI48_AVX512VL > + (match_dup 2) > + (match_dup 3) > + (match_dup 1))) > + (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP)])] > + "operands[4] = CONST0_RTX (<MODE>mode);" > [(set_attr "isa" "avx512dq,*") > (set_attr "length_immediate" "0,1") > (set_attr "prefix" "evex") > 
(set_attr "mode" "<sseinsnmode>")]) > > +(define_insn "*<avx512>_cvtmask2<ssemodesuffix><mode>_pternlog_false_dep" > + [(set (match_operand:VI48_AVX512VL 0 "register_operand" "=v") > + (vec_merge:VI48_AVX512VL > + (match_operand:VI48_AVX512VL 2 "vector_all_ones_operand") > + (match_operand:VI48_AVX512VL 3 "const0_operand") > + (match_operand:<avx512fmaskmode> 1 "register_operand" "Yk"))) > + (unspec [(match_operand:VI48_AVX512VL 4 "register_operand" "0")] > UNSPEC_INSN_FALSE_DEP)] > + "TARGET_AVX512F && !TARGET_AVX512DQ" > + "vpternlog<ssemodesuffix>\t{$0x81, %0, %0, %0%{%1%}%{z%}|%0%{%1%}%{z%}, > %0, %0, 0x81}" > + [(set_attr "length_immediate" "1") > + (set_attr "prefix" "evex") > + (set_attr "mode" "<sseinsnmode>")]) > + > (define_expand "extendv2sfv2df2" > [(set (match_operand:V2DF 0 "register_operand") > (float_extend:V2DF > @@ -17166,20 +17213,32 @@ (define_expand "one_cmpl<mode>2" > operands[2] = force_reg (<MODE>mode, operands[2]); > }) > > -(define_insn "<mask_codefor>one_cmpl<mode>2<mask_name>" > - [(set (match_operand:VI 0 "register_operand" "=v,v") > - (xor:VI (match_operand:VI 1 "bcst_vector_operand" "vBr,m") > - (match_operand:VI 2 "vector_all_ones_operand" "BC,BC")))] > +(define_insn_and_split "<mask_codefor>one_cmpl<mode>2<mask_name>" > + [(set (match_operand:VI 0 "register_operand" "=v,v,v") > + (xor:VI (match_operand:VI 1 "bcst_vector_operand" " 0, m,Br") > + (match_operand:VI 2 "vector_all_ones_operand" "BC,BC,BC")))] > "TARGET_AVX512F > && (!<mask_applied> > || <ssescalarmode>mode == SImode > || <ssescalarmode>mode == DImode)" > { > + if (!<mask_applied> && which_alternative > + && optimize_function_for_speed_p (cfun)) > + return "#"; > + > if (TARGET_AVX512VL) > return "vpternlog<ternlogsuffix>\t{$0x55, %1, %0, > %0<mask_operand3>|%0<mask_operand3>, %0, %1, 0x55}"; > else > return "vpternlog<ternlogsuffix>\t{$0x55, %g1, %g0, > %g0<mask_operand3>|%g0<mask_operand3>, %g0, %g1, 0x55}"; > } > + "&& reload_completed && !REG_P (operands[1]) && 
!<mask_applied> > + && optimize_function_for_speed_p (cfun)" > + [(set (match_dup 0) (match_dup 3)) > + (parallel > + [(set (match_dup 0) > + (xor:VI (match_dup 1) (match_dup 2))) > + (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP)])] > + "operands[3] = CONST0_RTX (<MODE>mode);"

Perhaps I'm misreading this, but this seems to result in

    vpxor zmm0, zmm0
    vpternlog zmm0, zmm0, [mem], 0x55

while in the PR the agreement was to emit

    vmovdq? zmm0, [mem]
    vpternlog zmm0, zmm0, zmm0, 0x55

when the source is in memory, because the former has three uops in fused domain?

> [(set_attr "type" "sselog") > (set_attr "prefix" "evex") > (set (attr "mode") > @@ -17191,6 +17250,30 @@ (define_insn > "<mask_codefor>one_cmpl<mode>2<mask_name>" > (symbol_ref "<MODE_SIZE> == 64 || TARGET_AVX512VL") > (const_int 1)))]) > > +(define_insn "*one_cmpl<mode>2_pternlog_false_dep" > + [(set (match_operand:VI 0 "register_operand" "=v,v") > + (xor:VI (match_operand:VI 1 "bcst_vector_operand" "m, Br") > + (match_operand:VI 2 "vector_all_ones_operand" "BC,BC"))) > + (unspec [(match_operand:VI 3 "register_operand" "0,0")] > + UNSPEC_INSN_FALSE_DEP)] > + "TARGET_AVX512F" > +{ > + if (TARGET_AVX512VL) > + return "vpternlog<ternlogsuffix>\t{$0x55, %1, %0, > %0<mask_operand3>|%0<mask_operand3>, %0, %1, 0x55}"; > + else > + return "vpternlog<ternlogsuffix>\t{$0x55, %g1, %g0, > %g0<mask_operand3>|%g0<mask_operand3>, %g0, %g1, 0x55}"; > +} > + [(set_attr "type" "sselog") > + (set_attr "prefix" "evex") > + (set (attr "mode") > + (if_then_else (match_test "TARGET_AVX512VL") > + (const_string "<sseinsnmode>") > + (const_string "XI"))) > + (set (attr "enabled") > + (if_then_else (eq_attr "alternative" "0") > + (symbol_ref "<MODE_SIZE> == 64 || TARGET_AVX512VL") > + (const_int 1)))]) > + > (define_split > [(set (match_operand:VI48_AVX512F 0 "register_operand") > (vec_duplicate:VI48_AVX512F > @@ -17226,7 +17309,7 @@ (define_insn "*andnot<mode>3" > [(set (match_operand:VI 0 "register_operand" "=x,x,v,v,v") > (and:VI > 
(not:VI (match_operand:VI 1 "bcst_vector_operand" "0,x,v,m,Br")) > - (match_operand:VI 2 "bcst_vector_operand" "xBm,xm,vmBr,v,v")))] > + (match_operand:VI 2 "bcst_vector_operand" "xBm,xm,vmBr,0,0")))] > "TARGET_SSE > && (register_operand (operands[1], <MODE>mode) > || register_operand (operands[2], <MODE>mode))" > @@ -17685,8 +17768,8 @@ (define_insn "*iornot<mode>3" > [(set (match_operand:VI 0 "register_operand" "=v,v,v,v") > (ior:VI > (not:VI > - (match_operand:VI 1 "bcst_vector_operand" "v,Br,v,m")) > - (match_operand:VI 2 "bcst_vector_operand" "vBr,v,m,v")))] > + (match_operand:VI 1 "bcst_vector_operand" "0,m, 0,vBr")) > + (match_operand:VI 2 "bcst_vector_operand" "m,0,vBr, 0")))] > "(<MODE_SIZE> == 64 || TARGET_AVX512VL > || (TARGET_AVX512F && !TARGET_PREFER_AVX256)) > && (register_operand (operands[1], <MODE>mode) > @@ -17710,7 +17793,7 @@ (define_insn "*iornot<mode>3" > (const_string "<sseinsnmode>") > (const_string "XI"))) > (set (attr "enabled") > - (if_then_else (eq_attr "alternative" "2,3") > + (if_then_else (eq_attr "alternative" "0,1") > (symbol_ref "<MODE_SIZE> == 64 || TARGET_AVX512VL") > (const_string "*")))]) > > @@ -17718,8 +17801,8 @@ (define_insn "*xnor<mode>3" > [(set (match_operand:VI 0 "register_operand" "=v,v") > (not:VI > (xor:VI > - (match_operand:VI 1 "bcst_vector_operand" "%v,v") > - (match_operand:VI 2 "bcst_vector_operand" "vBr,m"))))] > + (match_operand:VI 1 "bcst_vector_operand" "%0, 0") > + (match_operand:VI 2 "bcst_vector_operand" " m,vBr"))))] > "(<MODE_SIZE> == 64 || TARGET_AVX512VL > || (TARGET_AVX512F && !TARGET_PREFER_AVX256)) > && (register_operand (operands[1], <MODE>mode) > @@ -17738,7 +17821,7 @@ (define_insn "*xnor<mode>3" > (const_string "<sseinsnmode>") > (const_string "XI"))) > (set (attr "enabled") > - (if_then_else (eq_attr "alternative" "1") > + (if_then_else (eq_attr "alternative" "0") > (symbol_ref "<MODE_SIZE> == 64 || TARGET_AVX512VL") > (const_string "*")))]) > > @@ -17749,8 +17832,8 @@ (define_code_attr 
ternlog_nlogic [(and "0x11") (ior > "0x77")]) > (define_insn "*<nlogic><mode>3" > [(set (match_operand:VI 0 "register_operand" "=v,v") > (andor:VI > - (not:VI (match_operand:VI 1 "bcst_vector_operand" "%v,v")) > - (not:VI (match_operand:VI 2 "bcst_vector_operand" "vBr,m"))))] > + (not:VI (match_operand:VI 1 "bcst_vector_operand" "%0, 0")) > + (not:VI (match_operand:VI 2 "bcst_vector_operand" "m,vBr"))))] > "(<MODE_SIZE> == 64 || TARGET_AVX512VL > || (TARGET_AVX512F && !TARGET_PREFER_AVX256)) > && (register_operand (operands[1], <MODE>mode) > @@ -17769,7 +17852,7 @@ (define_insn "*<nlogic><mode>3" > (const_string "<sseinsnmode>") > (const_string "XI"))) > (set (attr "enabled") > - (if_then_else (eq_attr "alternative" "1") > + (if_then_else (eq_attr "alternative" "0") > (symbol_ref "<MODE_SIZE> == 64 || TARGET_AVX512VL") > (const_string "*")))]) > > diff --git a/gcc/testsuite/gcc.target/i386/pr110438.c > b/gcc/testsuite/gcc.target/i386/pr110438.c > new file mode 100644 > index 00000000000..11b8cc59fd2 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr110438.c > @@ -0,0 +1,30 @@ > +/* { dg-do compile } */ > +/* { dg-options "-mavx512f -O2 -ftree-vectorize -mno-avx512dq -dp > -mprefer-vector-width=512" } */ > +/* { dg-final { scan-assembler-times {cvtmask2.*_pternlog} "1" } } */ > +/* { dg-final { scan-assembler-times {constm1_pternlog} "1" } } */ > +/* { dg-final { scan-assembler-not {(?n)vpternlogd.*\(} } } */ > + > + > +#include <immintrin.h> > + > +__m512i g(void) > +{ > + return (__m512i){ 0 } - 1; > +} > + > +__m512i g1(__m512i* a) > +{ > + return ~(*a); > +} > + > +void > +foo (int* a, int* __restrict b) > +{ > + for (int i = 0; i != 16; i++) > + { > + if (b[i]) > + a[i] = -1; > + else > + a[i] = 0; > + } > +} >