Hi, Zen5 on some variants has false dependency on tzcnt, blsi, blsr and blsmsk instructions. Those can be tested by the following benchmark
jh@shroud:~> cat ee.c int main() { int a = 10; int b = 0; for (int i = 0; i < 1000000000; i++) { #ifdef BREAK asm volatile ("xor %0, %0": "=r" (b)); #endif asm volatile (INST " %2, %0": "=r"(b): "0"(b),"r"(a)); asm volatile (INST " %2, %0": "=r"(b): "0"(b),"r"(a)); asm volatile (INST " %2, %0": "=r"(b): "0"(b),"r"(a)); asm volatile (INST " %2, %0": "=r"(b): "0"(b),"r"(a)); asm volatile (INST " %2, %0": "=r"(b): "0"(b),"r"(a)); asm volatile (INST " %2, %0": "=r"(b): "0"(b),"r"(a)); asm volatile (INST " %2, %0": "=r"(b): "0"(b),"r"(a)); asm volatile (INST " %2, %0": "=r"(b): "0"(b),"r"(a)); asm volatile (INST " %2, %0": "=r"(b): "0"(b),"r"(a)); asm volatile (INST " %2, %0": "=r"(b): "0"(b),"r"(a)); } return 0; } jh@shroud:~> cat bmk.sh gcc ee.c -DBREAK -DINST=\"$1\" -O2 ; time ./a.out ; gcc ee.c -DINST=\"$1\" -O2 ; time ./a.out jh@shroud:~> sh bmk.sh tzcnt real 0m0.886s user 0m0.886s sys 0m0.000s real 0m0.886s user 0m0.886s sys 0m0.000s jh@shroud:~> sh bmk.sh blsi real 0m0.979s user 0m0.979s sys 0m0.000s real 0m2.418s user 0m2.418s sys 0m0.000s jh@shroud:~> sh bmk.sh blsr real 0m0.986s user 0m0.986s sys 0m0.000s real 0m2.422s user 0m2.421s sys 0m0.000s jh@shroud:~> sh bmk.sh blsmsk real 0m0.973s user 0m0.973s sys 0m0.000s real 0m2.422s user 0m2.422s sys 0m0.000s So zen5 tested seems to have problem with bls* but not with tzcnt. I tested zen1-zen4 and they do not seem to be affected. This can be mitigated by adding a splitter that first clear the destination register. We already have runable that controls tzcnt together with lzcnt and popcnt. Since it seems that only tzcnt is affected I added new tunable to control tzcnt only. I also added splitters for blsi/blsr/blsmsk implemented analogously to existing splitter for lzcnt. The patch is neutral on SPEC. We produce blsi and blsr in some internal loops, but they usually have same destination as source. However it is good to break the dependency chain to avoid patogolical cases and it is quite cheap overall, so I think we want to enable this for generic. I will send followup patch for this. Bootstrapped/regtested x86_64-linux, will commit it shortly. gcc/ChangeLog: * config/i386/i386.h (TARGET_AVOID_FALSE_DEP_FOR_TZCNT): New macro. (TARGET_AVOID_FALSE_DEP_FOR_BLS): New macro. * config/i386/i386.md (*bmi_blsi_<mode>): Add splitter for false dependency. (*bmi_blsi_<mode>_ccno): Add splitter for false dependency. (*bmi_blsi_<mode>_falsedep): New pattern. (*bmi_blsmsk_<mode>): Add splitter for false dependency. (*bmi_blsmsk_<mode>_falsedep): New pattern. (*bmi_blsr_<mode>): Add splitter for false dependency. (*bmi_blsr_<mode>_cmp): Add splitter for false dependency (*bmi_blsr_<mode>_cmp_falsedep): New pattern. * config/i386/x86-tune.def (X86_TUNE_AVOID_FALSE_DEP_FOR_TZCNT): New tune. (X86_TUNE_AVOID_FALSE_DEP_FOR_BLS): New tune. gcc/testsuite/ChangeLog: * gcc.target/i386/blsi.c: New test. * gcc.target/i386/blsmsk.c: New test. * gcc.target/i386/blsr.c: New test. diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h index 40b1aa4e6df..c89f493d180 100644 --- a/gcc/config/i386/i386.h +++ b/gcc/config/i386/i386.h @@ -455,6 +455,10 @@ extern unsigned char ix86_tune_features[X86_TUNE_LAST]; ix86_tune_features[X86_TUNE_ADJUST_UNROLL] #define TARGET_AVOID_FALSE_DEP_FOR_BMI \ ix86_tune_features[X86_TUNE_AVOID_FALSE_DEP_FOR_BMI] +#define TARGET_AVOID_FALSE_DEP_FOR_TZCNT \ + ix86_tune_features[X86_TUNE_AVOID_FALSE_DEP_FOR_TZCNT] +#define TARGET_AVOID_FALSE_DEP_FOR_BLS \ + ix86_tune_features[X86_TUNE_AVOID_FALSE_DEP_FOR_BLS] #define TARGET_ONE_IF_CONV_INSN \ ix86_tune_features[X86_TUNE_ONE_IF_CONV_INSN] #define TARGET_AVOID_MFENCE ix86_tune_features[X86_TUNE_AVOID_MFENCE] diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index 8575fbf40fe..b1cd52382a8 100644 --- a/gcc/config/i386/i386.md +++ b/gcc/config/i386/i386.md @@ -20993,7 +20993,8 @@ (ctz:SWI48 (match_dup 1)))] "TARGET_BMI" "tzcnt{<imodesuffix>}\t{%1, %0|%0, %1}"; - "&& TARGET_AVOID_FALSE_DEP_FOR_BMI && epilogue_completed + "&& (TARGET_AVOID_FALSE_DEP_FOR_BMI || TARGET_AVOID_FALSE_DEP_FOR_TZCNT) + && epilogue_completed && optimize_function_for_speed_p (cfun) && !reg_mentioned_p (operands[0], operands[1])" [(parallel @@ -21060,7 +21061,8 @@ return "bsf{<imodesuffix>}\t{%1, %0|%0, %1}"; } "(TARGET_BMI || TARGET_CPU_P (GENERIC)) - && TARGET_AVOID_FALSE_DEP_FOR_BMI && epilogue_completed + && (TARGET_AVOID_FALSE_DEP_FOR_BMI || TARGET_AVOID_FALSE_DEP_FOR_TZCNT) + && epilogue_completed && optimize_function_for_speed_p (cfun) && !reg_mentioned_p (operands[0], operands[1])" [(parallel @@ -21115,7 +21117,8 @@ (clobber (reg:CC FLAGS_REG))] "TARGET_BMI && TARGET_64BIT" "tzcnt{l}\t{%1, %k0|%k0, %1}" - "&& TARGET_AVOID_FALSE_DEP_FOR_BMI && epilogue_completed + "&& (TARGET_AVOID_FALSE_DEP_FOR_BMI || TARGET_AVOID_FALSE_DEP_FOR_TZCNT) + && epilogue_completed && optimize_function_for_speed_p (cfun) && !reg_mentioned_p (operands[0], operands[1])" [(parallel @@ -21166,7 +21169,8 @@ return "bsf{l}\t{%1, %k0|%k0, %1}"; } "(TARGET_BMI || TARGET_CPU_P (GENERIC)) - && TARGET_AVOID_FALSE_DEP_FOR_BMI && epilogue_completed + && (TARGET_AVOID_FALSE_DEP_FOR_BMI || TARGET_AVOID_FALSE_DEP_FOR_TZCNT) + && epilogue_completed && optimize_function_for_speed_p (cfun) && !reg_mentioned_p (operands[0], operands[1])" [(parallel @@ -21747,7 +21751,7 @@ (set_attr "btver2_decode" "direct, double") (set_attr "mode" "<MODE>")]) -(define_insn "*bmi_blsi_<mode>" +(define_insn_and_split "*bmi_blsi_<mode>" [(set (match_operand:SWI48 0 "register_operand" "=r") (and:SWI48 (neg:SWI48 @@ -21756,6 +21760,20 @@ (clobber (reg:CC FLAGS_REG))] "TARGET_BMI" "blsi\t{%1, %0|%0, %1}" + "&& TARGET_AVOID_FALSE_DEP_FOR_BLS + && epilogue_completed + && optimize_function_for_speed_p (cfun) + && !reg_mentioned_p (operands[0], operands[1])" + [(parallel + [(set (reg:CCNO FLAGS_REG) + (compare:CCNO + (and:SWI48 + (neg:SWI48 (match_dup 1)) + (match_dup 1)) + (const_int 0))) + (set (match_dup 0) (and:SWI48 (neg:SWI48 (match_dup 1)) (match_dup 1))) + (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP)])] + "ix86_expand_clear (operands[0]);" [(set_attr "type" "bitmanip") (set_attr "btver2_decode" "double") (set_attr "mode" "<MODE>")]) @@ -21775,6 +21793,51 @@ (set_attr "btver2_decode" "double") (set_attr "mode" "<MODE>")]) +(define_split + [(set (match_operand 3 "flags_reg_operand") + (match_operator 4 "compare_operator" + [(and:SWI48 + (neg:SWI48 (match_operand:SWI48 1 "nonimmediate_operand")) + (match_dup 1)) + (const_int 0)])) + (set (match_operand:SWI48 0 "register_operand") + (and:SWI48 (neg:SWI48 (match_dup 1)) (match_dup 1)))] + "TARGET_BMI + && TARGET_AVOID_FALSE_DEP_FOR_BLS + && epilogue_completed + && optimize_function_for_speed_p (cfun) + && !reg_mentioned_p (operands[0], operands[1])" + [(parallel + [(set (match_dup 3) + (match_op_dup 4 + [(and:SWI48 + (neg:SWI48 (match_dup 1)) + (match_dup 1)) + (const_int 0)])) + (set (match_dup 0) (and:SWI48 (neg:SWI48 (match_dup 1)) (match_dup 1))) + (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP)])] + "ix86_expand_clear (operands[0]);") + +; False dependency happens when destination is only updated by blsi. +(define_insn "*bmi_blsi_<mode>_falsedep" + [(set (reg FLAGS_REG) + (compare + (and:SWI48 + (neg:SWI48 (match_operand:SWI48 1 "nonimmediate_operand" "rm")) + (match_dup 1)) + (const_int 0))) + (set (match_operand:SWI48 0 "register_operand" "=r") + (and:SWI48 (neg:SWI48 (match_dup 1)) (match_dup 1))) + (unspec [(match_operand:SWI48 2 "register_operand" "0")] + UNSPEC_INSN_FALSE_DEP)] + "TARGET_BMI && ix86_match_ccmode (insn, CCNOmode)" + "blsi\t{%1, %0|%0, %1}" + [(set_attr "type" "bitmanip") + (set_attr "btver2_decode" "double") + (set_attr "mode" "<MODE>")]) + +;; No need for splitter of the false dependency, since the output is unused +;; so it will not extend dependency chain. (define_insn "*bmi_blsi_<mode>_ccno" [(set (reg FLAGS_REG) (compare @@ -21789,7 +21852,7 @@ (set_attr "btver2_decode" "double") (set_attr "mode" "<MODE>")]) -(define_insn "*bmi_blsmsk_<mode>" +(define_insn_and_split "*bmi_blsmsk_<mode>" [(set (match_operand:SWI48 0 "register_operand" "=r") (xor:SWI48 (plus:SWI48 @@ -21799,11 +21862,40 @@ (clobber (reg:CC FLAGS_REG))] "TARGET_BMI" "blsmsk\t{%1, %0|%0, %1}" + "TARGET_AVOID_FALSE_DEP_FOR_BLS + && epilogue_completed + && optimize_function_for_speed_p (cfun) + && !reg_mentioned_p (operands[0], operands[1])" + [(parallel + [(set (match_dup 0) + (xor:SWI48 + (plus:SWI48 (match_dup 1) (const_int -1)) + (match_dup 1))) + (clobber (reg:CC FLAGS_REG)) + (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP)])] + "ix86_expand_clear (operands[0]);" + [(set_attr "type" "bitmanip") + (set_attr "btver2_decode" "double") + (set_attr "mode" "<MODE>")]) + +; False dependency happens when destination is only updated by blmsk. +(define_insn "*bmi_blsmsk_<mode>_falsedep" + [(set (match_operand:SWI48 0 "register_operand" "=r") + (xor:SWI48 + (plus:SWI48 + (match_operand:SWI48 1 "nonimmediate_operand" "rm") + (const_int -1)) + (match_dup 1))) + (clobber (reg:CC FLAGS_REG)) + (unspec [(match_operand:SWI48 2 "register_operand" "0")] + UNSPEC_INSN_FALSE_DEP)] + "TARGET_BMI" + "blsmsk\t{%1, %0|%0, %1}" [(set_attr "type" "bitmanip") (set_attr "btver2_decode" "double") (set_attr "mode" "<MODE>")]) -(define_insn "*bmi_blsr_<mode>" +(define_insn_and_split "*bmi_blsr_<mode>" [(set (match_operand:SWI48 0 "register_operand" "=r") (and:SWI48 (plus:SWI48 @@ -21811,13 +21903,28 @@ (const_int -1)) (match_dup 1))) (clobber (reg:CC FLAGS_REG))] - "TARGET_BMI" - "blsr\t{%1, %0|%0, %1}" + "TARGET_BMI" + "blsr\t{%1, %0|%0, %1}" + "&& TARGET_AVOID_FALSE_DEP_FOR_BLS + && epilogue_completed + && optimize_function_for_speed_p (cfun) + && !reg_mentioned_p (operands[0], operands[1])" + [(parallel + [(set (reg:CCZ FLAGS_REG) + (compare:CCZ + (and:SWI48 + (plus:SWI48 (match_dup 1) (const_int -1)) + (match_dup 1)) + (const_int 0))) + (set (match_dup 0) (and:SWI48 (plus:SWI48 (match_dup 1) (const_int -1)) + (match_dup 1))) + (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP)])] + "ix86_expand_clear (operands[0]);" [(set_attr "type" "bitmanip") (set_attr "btver2_decode" "double") (set_attr "mode" "<MODE>")]) -(define_insn "*bmi_blsr_<mode>_cmp" +(define_insn_and_split "*bmi_blsr_<mode>_cmp" [(set (reg:CCZ FLAGS_REG) (compare:CCZ (and:SWI48 @@ -21832,12 +21939,53 @@ (match_dup 1) (const_int -1)) (match_dup 1)))] + "TARGET_BMI" + "blsr\t{%1, %0|%0, %1}" + "&& TARGET_AVOID_FALSE_DEP_FOR_BLS + && epilogue_completed + && optimize_function_for_speed_p (cfun) + && !reg_mentioned_p (operands[0], operands[1])" + [(parallel + [(set (reg:CCZ FLAGS_REG) + (compare:CCZ + (and:SWI48 + (plus:SWI48 (match_dup 1) (const_int -1)) + (match_dup 1)) + (const_int 0))) + (set (match_dup 0) (and:SWI48 (plus:SWI48 (match_dup 1) (const_int -1)) + (match_dup 1))) + (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP)])] + "ix86_expand_clear (operands[0]);" + [(set_attr "type" "bitmanip") + (set_attr "btver2_decode" "double") + (set_attr "mode" "<MODE>")]) + +; False dependency happens when destination is only updated by bslr. +(define_insn "*bmi_blsr_<mode>_cmp_falsedep" + [(set (reg:CCZ FLAGS_REG) + (compare:CCZ + (and:SWI48 + (plus:SWI48 + (match_operand:SWI48 1 "nonimmediate_operand" "rm") + (const_int -1)) + (match_dup 1)) + (const_int 0))) + (set (match_operand:SWI48 0 "register_operand" "=r") + (and:SWI48 + (plus:SWI48 + (match_dup 1) + (const_int -1)) + (match_dup 1))) + (unspec [(match_operand:SWI48 2 "register_operand" "0")] + UNSPEC_INSN_FALSE_DEP)] "TARGET_BMI" "blsr\t{%1, %0|%0, %1}" [(set_attr "type" "bitmanip") (set_attr "btver2_decode" "double") (set_attr "mode" "<MODE>")]) +;; No need for splitter of the false dependency, since the output is unused +;; so it will not extend dependency chain. (define_insn "*bmi_blsr_<mode>_ccz" [(set (reg:CCZ FLAGS_REG) (compare:CCZ diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def index df7b4ed22bc..b332132cf61 100644 --- a/gcc/config/i386/x86-tune.def +++ b/gcc/config/i386/x86-tune.def @@ -349,6 +349,16 @@ DEF_TUNE (X86_TUNE_AVOID_FALSE_DEP_FOR_BMI, "avoid_false_dep_for_bmi", | m_CANNONLAKE | m_CASCADELAKE | m_COOPERLAKE | m_ZHAOXIN | m_GENERIC) +/* X86_TUNE_AVOID_FALSE_DEP_FOR_TZCNT: Avoid false dependency + for tzcnt instruction (also included in X86_TUNE_AVOID_FALSE_DEP_FOR_BMI). */ +DEF_TUNE (X86_TUNE_AVOID_FALSE_DEP_FOR_TZCNT, "avoid_false_dep_for_tzcnt", + m_ZNVER5) + +/* X86_TUNE_AVOID_FALSE_DEP_FOR_BLS: Avoid false dependency + for blsi, blsr and blsmsk instructions. */ +DEF_TUNE (X86_TUNE_AVOID_FALSE_DEP_FOR_BLS, "avoid_false_dep_for_bls", + m_ZNVER5) + /* X86_TUNE_ADJUST_UNROLL: This enables adjusting the unroll factor based on hardware capabilities. Bdver3 hardware has a loop buffer which makes unrolling small loop less important. For, such architectures we adjust diff --git a/gcc/testsuite/gcc.target/i386/blsi.c b/gcc/testsuite/gcc.target/i386/blsi.c new file mode 100644 index 00000000000..ee68a9d42c3 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/blsi.c @@ -0,0 +1,26 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=znver5" } */ +void bar (); +int +test(int a) +{ + return a & -a; +} +int +test2(int a) +{ + if (a & -a) + bar (); +} +int +test3(int a) +{ + int ret = a & -a; + if (ret) + bar (); + return ret; +} +/* All three functions should produce bslr. + Only test and test3 needs xor to break false dependency. */ +/* { dg-final { scan-assembler-times "blsi\[ \\t\]+" 3 } } */ +/* { dg-final { scan-assembler-times "xor" 2 } } */ diff --git a/gcc/testsuite/gcc.target/i386/blsmsk.c b/gcc/testsuite/gcc.target/i386/blsmsk.c new file mode 100644 index 00000000000..9d15ac03180 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/blsmsk.c @@ -0,0 +1,9 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=znver5" } */ +int +test(int a) +{ + return (a - 1)^a; +} +/* { dg-final { scan-assembler-times "blsmsk\[ \\t\]+" 1 } } */ +/* { dg-final { scan-assembler-times "xor" 1 } } */ diff --git a/gcc/testsuite/gcc.target/i386/blsr.c b/gcc/testsuite/gcc.target/i386/blsr.c new file mode 100644 index 00000000000..340ebb40dd0 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/blsr.c @@ -0,0 +1,26 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=znver5" } */ +void bar (); +int +test(int a) +{ + return a & (a - 1); +} +int +test2(int a) +{ + if (a & (a - 1)) + bar (); +} +int +test3(int a) +{ + int ret = a & (a - 1); + if (ret) + bar (); + return ret; +} +/* All three functions should produce bslr. + Only test and test3 needs xor to break false dependency. */ +/* { dg-final { scan-assembler-times "blsr\[ \\t\]+" 3 } } */ +/* { dg-final { scan-assembler-times "xor" 2 } } */