vpternlog is also used for optimizations that do not need any valid input operand. In that case the destination register is used as an input operand of the instruction, which creates a false dependence on the destination.
Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}. Ready to push to trunk. gcc/ChangeLog: PR target/110438 * config/i386/predicates.md (int_float_vector_all_ones_operand): New predicate. * config/i386/sse.md (*vmov<mode>_constm1_pternlog): New define_insn. (*<avx512>_cvtmask2<ssemodesuffix><mode>): Adjust to define_insn_and_split to avoid false dependence. (*<avx512>_cvtmask2<ssemodesuffix><mode>_pternlog): New define_insn. gcc/testsuite/ChangeLog: * gcc.target/i386/pr110438.c: New test. --- gcc/config/i386/predicates.md | 8 ++- gcc/config/i386/sse.md | 69 +++++++++++++++++++----- gcc/testsuite/gcc.target/i386/pr110438.c | 30 +++++++++++ 3 files changed, 94 insertions(+), 13 deletions(-) create mode 100644 gcc/testsuite/gcc.target/i386/pr110438.c diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md index fb07707dcba..df0d9e20def 100644 --- a/gcc/config/i386/predicates.md +++ b/gcc/config/i386/predicates.md @@ -1192,12 +1192,18 @@ (define_predicate "float_vector_all_ones_operand" return false; }) -/* Return true if operand is a vector constant that is all ones. */ +/* Return true if operand is an integral vector constant that is all ones. */ (define_predicate "vector_all_ones_operand" (and (match_code "const_vector") (match_test "INTEGRAL_MODE_P (GET_MODE (op))") (match_test "op == CONSTM1_RTX (GET_MODE (op))"))) +/* Return true if operand is a vector constant that is all ones. */ +(define_predicate "int_float_vector_all_ones_operand" + (ior (match_operand 0 "vector_all_ones_operand") + (match_operand 0 "float_vector_all_ones_operand") + (match_test "op == constm1_rtx"))) + /* Return true if operand is an 128/256bit all ones vector that zero-extends to 256/512bit. 
*/ (define_predicate "vector_all_ones_zero_extend_half_operand" diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index 812cfca4b92..93cdd844026 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -1382,6 +1382,28 @@ (define_insn "mov<mode>_internal" ] (symbol_ref "true")))]) +; False dependency happens on destination register which is not really +; used when moving all ones to vector register +(define_split + [(set (match_operand:VMOVE 0 "register_operand") + (match_operand:VMOVE 1 "int_float_vector_all_ones_operand"))] + "TARGET_AVX512F && reload_completed + && (<MODE_SIZE> == 64 || EXT_REX_SSE_REG_P (operands[0]))" + [(set (match_dup 0) (match_dup 2)) + (parallel + [(set (match_dup 0) (match_dup 1)) + (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP)])] + "operands[2] = CONST0_RTX (<MODE>mode);") + +(define_insn "*vmov<mode>_constm1_pternlog" + [(set (match_operand:VMOVE 0 "register_operand" "=v") + (match_operand:VMOVE 1 "int_float_vector_all_ones_operand" "<sseconstm1>")) + (unspec [(match_operand:VMOVE 2 "register_operand" "0")] UNSPEC_INSN_FALSE_DEP)] + "TARGET_AVX512VL || <MODE_SIZE> == 64" + "vpternlogd\t{$0xFF, %0, %0, %0|%0, %0, %0, 0xFF}" + [(set_attr "type" "sselog1") + (set_attr "prefix" "evex")]) + ;; If mem_addr points to a memory region with less than whole vector size bytes ;; of accessible memory and k is a mask that would prevent reading the inaccessible ;; bytes from mem_addr, add UNSPEC_MASKLOAD to prevent it to be transformed to vpblendd @@ -9336,7 +9358,7 @@ (define_expand "<avx512>_cvtmask2<ssemodesuffix><mode>" operands[3] = CONST0_RTX (<MODE>mode); }") -(define_insn "*<avx512>_cvtmask2<ssemodesuffix><mode>" +(define_insn_and_split "*<avx512>_cvtmask2<ssemodesuffix><mode>" [(set (match_operand:VI48_AVX512VL 0 "register_operand" "=v,v") (vec_merge:VI48_AVX512VL (match_operand:VI48_AVX512VL 2 "vector_all_ones_operand") @@ -9345,12 +9367,35 @@ (define_insn "*<avx512>_cvtmask2<ssemodesuffix><mode>" "TARGET_AVX512F" "@ 
vpmovm2<ssemodesuffix>\t{%1, %0|%0, %1} - vpternlog<ssemodesuffix>\t{$0x81, %0, %0, %0%{%1%}%{z%}|%0%{%1%}%{z%}, %0, %0, 0x81}" + #" + "&& !TARGET_AVX512DQ && reload_completed" + [(set (match_dup 0) (match_dup 4)) + (parallel + [(set (match_dup 0) + (vec_merge:VI48_AVX512VL + (match_dup 2) + (match_dup 3) + (match_dup 1))) + (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP)])] + "operands[4] = CONST0_RTX (<MODE>mode);" [(set_attr "isa" "avx512dq,*") (set_attr "length_immediate" "0,1") (set_attr "prefix" "evex") (set_attr "mode" "<sseinsnmode>")]) +(define_insn "*<avx512>_cvtmask2<ssemodesuffix><mode>_pternlog" + [(set (match_operand:VI48_AVX512VL 0 "register_operand" "=v") + (vec_merge:VI48_AVX512VL + (match_operand:VI48_AVX512VL 2 "vector_all_ones_operand") + (match_operand:VI48_AVX512VL 3 "const0_operand") + (match_operand:<avx512fmaskmode> 1 "register_operand" "Yk"))) + (unspec [(match_operand:VI48_AVX512VL 4 "register_operand" "0")] UNSPEC_INSN_FALSE_DEP)] + "TARGET_AVX512F && !TARGET_AVX512DQ" + "vpternlog<ssemodesuffix>\t{$0x81, %0, %0, %0%{%1%}%{z%}|%0%{%1%}%{z%}, %0, %0, 0x81}" + [(set_attr "length_immediate" "1") + (set_attr "prefix" "evex") + (set_attr "mode" "<sseinsnmode>")]) + (define_expand "extendv2sfv2df2" [(set (match_operand:V2DF 0 "register_operand") (float_extend:V2DF @@ -17164,32 +17209,32 @@ (define_expand "one_cmpl<mode>2" if (!TARGET_AVX512F) operands[2] = force_reg (<MODE>mode, operands[2]); + else + operands[1] = force_reg (<MODE>mode, operands[1]); }) (define_insn "<mask_codefor>one_cmpl<mode>2<mask_name>" - [(set (match_operand:VI 0 "register_operand" "=v,v") - (xor:VI (match_operand:VI 1 "nonimmediate_operand" "v,m") - (match_operand:VI 2 "vector_all_ones_operand" "BC,BC")))] + [(set (match_operand:VI 0 "register_operand" "=v") + (xor:VI (match_operand:VI 1 "register_operand" "v") + (match_operand:VI 2 "vector_all_ones_operand" "BC")))] "TARGET_AVX512F && (!<mask_applied> || <ssescalarmode>mode == SImode || <ssescalarmode>mode == DImode)" 
{ + /* Use vpternlog 0x55, %1, %1, %0 instead of + vpternlog 0x55, %1, %0, %0 to avoid false dependence on %0. */ if (TARGET_AVX512VL) - return "vpternlog<ternlogsuffix>\t{$0x55, %1, %0, %0<mask_operand3>|%0<mask_operand3>, %0, %1, 0x55}"; + return "vpternlog<ternlogsuffix>\t{$0x55, %1, %1, %0<mask_operand3>|%0<mask_operand3>, %1, %1, 0x55}"; else - return "vpternlog<ternlogsuffix>\t{$0x55, %g1, %g0, %g0<mask_operand3>|%g0<mask_operand3>, %g0, %g1, 0x55}"; + return "vpternlog<ternlogsuffix>\t{$0x55, %g1, %g1, %g0<mask_operand3>|%g0<mask_operand3>, %g1, %g1, 0x55}"; } [(set_attr "type" "sselog") (set_attr "prefix" "evex") (set (attr "mode") (if_then_else (match_test "TARGET_AVX512VL") (const_string "<sseinsnmode>") - (const_string "XI"))) - (set (attr "enabled") - (if_then_else (eq_attr "alternative" "1") - (symbol_ref "<MODE_SIZE> == 64 || TARGET_AVX512VL") - (const_int 1)))]) + (const_string "XI")))]) (define_expand "<sse2_avx2>_andnot<mode>3" [(set (match_operand:VI_AVX2 0 "register_operand") diff --git a/gcc/testsuite/gcc.target/i386/pr110438.c b/gcc/testsuite/gcc.target/i386/pr110438.c new file mode 100644 index 00000000000..11b8cc59fd2 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr110438.c @@ -0,0 +1,30 @@ +/* { dg-do compile } */ +/* { dg-options "-mavx512f -O2 -ftree-vectorize -mno-avx512dq -dp -mprefer-vector-width=512" } */ +/* { dg-final { scan-assembler-times {cvtmask2.*_pternlog} "1" } } */ +/* { dg-final { scan-assembler-times {constm1_pternlog} "1" } } */ +/* { dg-final { scan-assembler-not {(?n)vpternlogd.*\(} } } */ + + +#include <immintrin.h> + +__m512i g(void) +{ + return (__m512i){ 0 } - 1; +} + +__m512i g1(__m512i* a) +{ + return ~(*a); +} + +void +foo (int* a, int* __restrict b) +{ + for (int i = 0; i != 16; i++) + { + if (b[i]) + a[i] = -1; + else + a[i] = 0; + } +} -- 2.39.1.388.g2fc9e9ca3c