Hi,
Zen5 on some variants has false dependency on tzcnt, blsi, blsr and blsmsk
instructions.  Those can be tested by the following benchmark

jh@shroud:~> cat ee.c
int
main()
{
       int a = 10;
       int b = 0;
       for (int i = 0; i < 1000000000; i++)
       {
#ifdef BREAK
               asm volatile ("xor %0, %0": "=r" (b));
#endif
               asm volatile (INST " %2, %0": "=r"(b): "0"(b),"r"(a));
               asm volatile (INST " %2, %0": "=r"(b): "0"(b),"r"(a));
               asm volatile (INST " %2, %0": "=r"(b): "0"(b),"r"(a));
               asm volatile (INST " %2, %0": "=r"(b): "0"(b),"r"(a));
               asm volatile (INST " %2, %0": "=r"(b): "0"(b),"r"(a));
               asm volatile (INST " %2, %0": "=r"(b): "0"(b),"r"(a));
               asm volatile (INST " %2, %0": "=r"(b): "0"(b),"r"(a));
               asm volatile (INST " %2, %0": "=r"(b): "0"(b),"r"(a));
               asm volatile (INST " %2, %0": "=r"(b): "0"(b),"r"(a));
               asm volatile (INST " %2, %0": "=r"(b): "0"(b),"r"(a));
       }
       return 0;
}
jh@shroud:~> cat bmk.sh
gcc ee.c -DBREAK -DINST=\"$1\" -O2 ; time ./a.out ; gcc ee.c -DINST=\"$1\" -O2 
; time ./a.out
jh@shroud:~> sh bmk.sh tzcnt

real    0m0.886s
user    0m0.886s
sys     0m0.000s

real    0m0.886s
user    0m0.886s
sys     0m0.000s 

jh@shroud:~> sh bmk.sh blsi

real    0m0.979s
user    0m0.979s
sys     0m0.000s

real    0m2.418s
user    0m2.418s
sys     0m0.000s

jh@shroud:~> sh bmk.sh blsr

real    0m0.986s
user    0m0.986s
sys     0m0.000s

real    0m2.422s
user    0m2.421s
sys     0m0.000s
jh@shroud:~> sh bmk.sh blsmsk

real    0m0.973s
user    0m0.973s
sys     0m0.000s

real    0m2.422s
user    0m2.422s
sys     0m0.000s 

So zen5 tested seems to have problem with bls* but not with tzcnt.
I tested zen1-zen4 and they do not seem to be affected.

This can be mitigated by adding a splitter that first clear the
destination register.  We already have runable that controls tzcnt
together with lzcnt and popcnt.  Since it seems that only tzcnt is
affected I added new tunable to control tzcnt only.  I also added
splitters for blsi/blsr/blsmsk implemented analogously to existing
splitter for lzcnt.

The patch is neutral on SPEC. We produce blsi and blsr in some internal
loops, but they usually have same destination as source. However it is
good to break the dependency chain to avoid patogolical cases and it is
quite cheap overall, so I think we want to enable this for generic.  I
will send followup patch for this.

Bootstrapped/regtested x86_64-linux, will commit it shortly.

gcc/ChangeLog:

        * config/i386/i386.h (TARGET_AVOID_FALSE_DEP_FOR_TZCNT): New macro.
        (TARGET_AVOID_FALSE_DEP_FOR_BLS): New macro.
        * config/i386/i386.md (*bmi_blsi_<mode>): Add splitter for false
        dependency.
        (*bmi_blsi_<mode>_ccno): Add splitter for false dependency.
        (*bmi_blsi_<mode>_falsedep): New pattern.
        (*bmi_blsmsk_<mode>): Add splitter for false dependency.
        (*bmi_blsmsk_<mode>_falsedep): New pattern.
        (*bmi_blsr_<mode>): Add splitter for false dependency.
        (*bmi_blsr_<mode>_cmp): Add splitter for false dependency
        (*bmi_blsr_<mode>_cmp_falsedep): New pattern.
        * config/i386/x86-tune.def (X86_TUNE_AVOID_FALSE_DEP_FOR_TZCNT): New 
tune.
        (X86_TUNE_AVOID_FALSE_DEP_FOR_BLS): New tune.

gcc/testsuite/ChangeLog:

        * gcc.target/i386/blsi.c: New test.
        * gcc.target/i386/blsmsk.c: New test.
        * gcc.target/i386/blsr.c: New test.

diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index 40b1aa4e6df..c89f493d180 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -455,6 +455,10 @@ extern unsigned char ix86_tune_features[X86_TUNE_LAST];
     ix86_tune_features[X86_TUNE_ADJUST_UNROLL]
 #define TARGET_AVOID_FALSE_DEP_FOR_BMI \
        ix86_tune_features[X86_TUNE_AVOID_FALSE_DEP_FOR_BMI]
+#define TARGET_AVOID_FALSE_DEP_FOR_TZCNT \
+       ix86_tune_features[X86_TUNE_AVOID_FALSE_DEP_FOR_TZCNT]
+#define TARGET_AVOID_FALSE_DEP_FOR_BLS \
+       ix86_tune_features[X86_TUNE_AVOID_FALSE_DEP_FOR_BLS]
 #define TARGET_ONE_IF_CONV_INSN \
        ix86_tune_features[X86_TUNE_ONE_IF_CONV_INSN]
 #define TARGET_AVOID_MFENCE ix86_tune_features[X86_TUNE_AVOID_MFENCE]
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index 8575fbf40fe..b1cd52382a8 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -20993,7 +20993,8 @@
        (ctz:SWI48 (match_dup 1)))]
   "TARGET_BMI"
   "tzcnt{<imodesuffix>}\t{%1, %0|%0, %1}";
-  "&& TARGET_AVOID_FALSE_DEP_FOR_BMI && epilogue_completed
+  "&& (TARGET_AVOID_FALSE_DEP_FOR_BMI || TARGET_AVOID_FALSE_DEP_FOR_TZCNT)
+   && epilogue_completed
    && optimize_function_for_speed_p (cfun)
    && !reg_mentioned_p (operands[0], operands[1])"
   [(parallel
@@ -21060,7 +21061,8 @@
   return "bsf{<imodesuffix>}\t{%1, %0|%0, %1}";
 }
   "(TARGET_BMI || TARGET_CPU_P (GENERIC))
-   && TARGET_AVOID_FALSE_DEP_FOR_BMI && epilogue_completed
+   && (TARGET_AVOID_FALSE_DEP_FOR_BMI || TARGET_AVOID_FALSE_DEP_FOR_TZCNT)
+   && epilogue_completed
    && optimize_function_for_speed_p (cfun)
    && !reg_mentioned_p (operands[0], operands[1])"
   [(parallel
@@ -21115,7 +21117,8 @@
    (clobber (reg:CC FLAGS_REG))]
   "TARGET_BMI && TARGET_64BIT"
   "tzcnt{l}\t{%1, %k0|%k0, %1}"
-  "&& TARGET_AVOID_FALSE_DEP_FOR_BMI && epilogue_completed
+  "&& (TARGET_AVOID_FALSE_DEP_FOR_BMI || TARGET_AVOID_FALSE_DEP_FOR_TZCNT)
+   && epilogue_completed
    && optimize_function_for_speed_p (cfun)
    && !reg_mentioned_p (operands[0], operands[1])"
   [(parallel
@@ -21166,7 +21169,8 @@
   return "bsf{l}\t{%1, %k0|%k0, %1}";
 }
   "(TARGET_BMI || TARGET_CPU_P (GENERIC))
-   && TARGET_AVOID_FALSE_DEP_FOR_BMI && epilogue_completed
+   && (TARGET_AVOID_FALSE_DEP_FOR_BMI || TARGET_AVOID_FALSE_DEP_FOR_TZCNT)
+   && epilogue_completed
    && optimize_function_for_speed_p (cfun)
    && !reg_mentioned_p (operands[0], operands[1])"
   [(parallel
@@ -21747,7 +21751,7 @@
    (set_attr "btver2_decode" "direct, double")
    (set_attr "mode" "<MODE>")])
 
-(define_insn "*bmi_blsi_<mode>"
+(define_insn_and_split "*bmi_blsi_<mode>"
   [(set (match_operand:SWI48 0 "register_operand" "=r")
         (and:SWI48
           (neg:SWI48
@@ -21756,6 +21760,20 @@
    (clobber (reg:CC FLAGS_REG))]
   "TARGET_BMI"
   "blsi\t{%1, %0|%0, %1}"
+  "&& TARGET_AVOID_FALSE_DEP_FOR_BLS
+   && epilogue_completed
+   && optimize_function_for_speed_p (cfun)
+   && !reg_mentioned_p (operands[0], operands[1])"
+  [(parallel
+    [(set (reg:CCNO FLAGS_REG)
+      (compare:CCNO
+        (and:SWI48
+          (neg:SWI48 (match_dup 1))
+          (match_dup 1))
+        (const_int 0)))
+     (set (match_dup 0) (and:SWI48 (neg:SWI48 (match_dup 1)) (match_dup 1)))
+     (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP)])]
+  "ix86_expand_clear (operands[0]);"
   [(set_attr "type" "bitmanip")
    (set_attr "btver2_decode" "double")
    (set_attr "mode" "<MODE>")])
@@ -21775,6 +21793,51 @@
    (set_attr "btver2_decode" "double")
    (set_attr "mode" "<MODE>")])
 
+(define_split
+  [(set (match_operand 3 "flags_reg_operand")
+       (match_operator 4 "compare_operator"
+         [(and:SWI48
+           (neg:SWI48 (match_operand:SWI48 1 "nonimmediate_operand"))
+           (match_dup 1))
+          (const_int 0)]))
+   (set (match_operand:SWI48 0 "register_operand")
+       (and:SWI48 (neg:SWI48 (match_dup 1)) (match_dup 1)))]
+  "TARGET_BMI
+   && TARGET_AVOID_FALSE_DEP_FOR_BLS
+   && epilogue_completed
+   && optimize_function_for_speed_p (cfun)
+   && !reg_mentioned_p (operands[0], operands[1])"
+  [(parallel
+    [(set (match_dup 3)
+      (match_op_dup 4
+        [(and:SWI48
+           (neg:SWI48 (match_dup 1))
+           (match_dup 1))
+         (const_int 0)]))
+     (set (match_dup 0) (and:SWI48 (neg:SWI48 (match_dup 1)) (match_dup 1)))
+     (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP)])]
+  "ix86_expand_clear (operands[0]);")
+
+; False dependency happens when destination is only updated by blsi.
+(define_insn "*bmi_blsi_<mode>_falsedep"
+  [(set (reg FLAGS_REG)
+       (compare
+         (and:SWI48
+           (neg:SWI48 (match_operand:SWI48 1 "nonimmediate_operand" "rm"))
+           (match_dup 1))
+         (const_int 0)))
+   (set (match_operand:SWI48 0 "register_operand" "=r")
+       (and:SWI48 (neg:SWI48 (match_dup 1)) (match_dup 1)))
+   (unspec [(match_operand:SWI48 2 "register_operand" "0")]
+          UNSPEC_INSN_FALSE_DEP)]
+   "TARGET_BMI && ix86_match_ccmode (insn, CCNOmode)"
+   "blsi\t{%1, %0|%0, %1}"
+  [(set_attr "type" "bitmanip")
+   (set_attr "btver2_decode" "double")
+   (set_attr "mode" "<MODE>")])
+
+;; No need for splitter of the false dependency, since the output is unused
+;; so it will not extend dependency chain.
 (define_insn "*bmi_blsi_<mode>_ccno"
   [(set (reg FLAGS_REG)
        (compare
@@ -21789,7 +21852,7 @@
    (set_attr "btver2_decode" "double")
    (set_attr "mode" "<MODE>")])
 
-(define_insn "*bmi_blsmsk_<mode>"
+(define_insn_and_split "*bmi_blsmsk_<mode>"
   [(set (match_operand:SWI48 0 "register_operand" "=r")
         (xor:SWI48
           (plus:SWI48
@@ -21799,11 +21862,40 @@
    (clobber (reg:CC FLAGS_REG))]
   "TARGET_BMI"
   "blsmsk\t{%1, %0|%0, %1}"
+  "TARGET_AVOID_FALSE_DEP_FOR_BLS
+   && epilogue_completed
+   && optimize_function_for_speed_p (cfun)
+   && !reg_mentioned_p (operands[0], operands[1])"
+  [(parallel
+    [(set (match_dup 0)
+          (xor:SWI48
+            (plus:SWI48 (match_dup 1) (const_int -1))
+            (match_dup 1)))
+     (clobber (reg:CC FLAGS_REG))
+     (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP)])]
+  "ix86_expand_clear (operands[0]);"
+  [(set_attr "type" "bitmanip")
+   (set_attr "btver2_decode" "double")
+   (set_attr "mode" "<MODE>")])
+
+; False dependency happens when destination is only updated by blmsk.
+(define_insn "*bmi_blsmsk_<mode>_falsedep"
+  [(set (match_operand:SWI48 0 "register_operand" "=r")
+        (xor:SWI48
+          (plus:SWI48
+            (match_operand:SWI48 1 "nonimmediate_operand" "rm")
+            (const_int -1))
+          (match_dup 1)))
+   (clobber (reg:CC FLAGS_REG))
+   (unspec [(match_operand:SWI48 2 "register_operand" "0")]
+          UNSPEC_INSN_FALSE_DEP)]
+  "TARGET_BMI"
+  "blsmsk\t{%1, %0|%0, %1}"
   [(set_attr "type" "bitmanip")
    (set_attr "btver2_decode" "double")
    (set_attr "mode" "<MODE>")])
 
-(define_insn "*bmi_blsr_<mode>"
+(define_insn_and_split "*bmi_blsr_<mode>"
   [(set (match_operand:SWI48 0 "register_operand" "=r")
         (and:SWI48
           (plus:SWI48
@@ -21811,13 +21903,28 @@
             (const_int -1))
           (match_dup 1)))
    (clobber (reg:CC FLAGS_REG))]
-   "TARGET_BMI"
-   "blsr\t{%1, %0|%0, %1}"
+  "TARGET_BMI"
+  "blsr\t{%1, %0|%0, %1}"
+  "&& TARGET_AVOID_FALSE_DEP_FOR_BLS
+   && epilogue_completed
+   && optimize_function_for_speed_p (cfun)
+   && !reg_mentioned_p (operands[0], operands[1])"
+  [(parallel
+    [(set (reg:CCZ FLAGS_REG)
+      (compare:CCZ
+        (and:SWI48
+          (plus:SWI48 (match_dup 1) (const_int -1))
+          (match_dup 1))
+        (const_int 0)))
+     (set (match_dup 0) (and:SWI48 (plus:SWI48 (match_dup 1) (const_int -1))
+                                   (match_dup 1)))
+     (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP)])]
+  "ix86_expand_clear (operands[0]);"
   [(set_attr "type" "bitmanip")
    (set_attr "btver2_decode" "double")
    (set_attr "mode" "<MODE>")])
 
-(define_insn "*bmi_blsr_<mode>_cmp"
+(define_insn_and_split "*bmi_blsr_<mode>_cmp"
   [(set (reg:CCZ FLAGS_REG)
        (compare:CCZ
          (and:SWI48
@@ -21832,12 +21939,53 @@
            (match_dup 1)
            (const_int -1))
          (match_dup 1)))]
+  "TARGET_BMI"
+  "blsr\t{%1, %0|%0, %1}"
+  "&& TARGET_AVOID_FALSE_DEP_FOR_BLS
+   && epilogue_completed
+   && optimize_function_for_speed_p (cfun)
+   && !reg_mentioned_p (operands[0], operands[1])"
+  [(parallel
+    [(set (reg:CCZ FLAGS_REG)
+      (compare:CCZ
+        (and:SWI48
+          (plus:SWI48 (match_dup 1) (const_int -1))
+          (match_dup 1))
+        (const_int 0)))
+     (set (match_dup 0) (and:SWI48 (plus:SWI48 (match_dup 1) (const_int -1))
+                                   (match_dup 1)))
+     (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP)])]
+  "ix86_expand_clear (operands[0]);"
+  [(set_attr "type" "bitmanip")
+   (set_attr "btver2_decode" "double")
+   (set_attr "mode" "<MODE>")])
+
+; False dependency happens when destination is only updated by bslr.
+(define_insn "*bmi_blsr_<mode>_cmp_falsedep"
+  [(set (reg:CCZ FLAGS_REG)
+       (compare:CCZ
+         (and:SWI48
+           (plus:SWI48
+             (match_operand:SWI48 1 "nonimmediate_operand" "rm")
+             (const_int -1))
+           (match_dup 1))
+         (const_int 0)))
+   (set (match_operand:SWI48 0 "register_operand" "=r")
+       (and:SWI48
+         (plus:SWI48
+           (match_dup 1)
+           (const_int -1))
+         (match_dup 1)))
+   (unspec [(match_operand:SWI48 2 "register_operand" "0")]
+          UNSPEC_INSN_FALSE_DEP)]
    "TARGET_BMI"
    "blsr\t{%1, %0|%0, %1}"
   [(set_attr "type" "bitmanip")
    (set_attr "btver2_decode" "double")
    (set_attr "mode" "<MODE>")])
 
+;; No need for splitter of the false dependency, since the output is unused
+;; so it will not extend dependency chain.
 (define_insn "*bmi_blsr_<mode>_ccz"
   [(set (reg:CCZ FLAGS_REG)
        (compare:CCZ
diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def
index df7b4ed22bc..b332132cf61 100644
--- a/gcc/config/i386/x86-tune.def
+++ b/gcc/config/i386/x86-tune.def
@@ -349,6 +349,16 @@ DEF_TUNE (X86_TUNE_AVOID_FALSE_DEP_FOR_BMI, 
"avoid_false_dep_for_bmi",
          | m_CANNONLAKE | m_CASCADELAKE | m_COOPERLAKE
          | m_ZHAOXIN | m_GENERIC)
 
+/* X86_TUNE_AVOID_FALSE_DEP_FOR_TZCNT: Avoid false dependency
+   for tzcnt instruction (also included in X86_TUNE_AVOID_FALSE_DEP_FOR_BMI).  
*/
+DEF_TUNE (X86_TUNE_AVOID_FALSE_DEP_FOR_TZCNT, "avoid_false_dep_for_tzcnt",
+         m_ZNVER5)
+
+/* X86_TUNE_AVOID_FALSE_DEP_FOR_BLS: Avoid false dependency
+   for blsi, blsr and blsmsk instructions.  */
+DEF_TUNE (X86_TUNE_AVOID_FALSE_DEP_FOR_BLS, "avoid_false_dep_for_bls",
+         m_ZNVER5)
+
 /* X86_TUNE_ADJUST_UNROLL: This enables adjusting the unroll factor based
    on hardware capabilities. Bdver3 hardware has a loop buffer which makes
    unrolling small loop less important. For, such architectures we adjust
diff --git a/gcc/testsuite/gcc.target/i386/blsi.c 
b/gcc/testsuite/gcc.target/i386/blsi.c
new file mode 100644
index 00000000000..ee68a9d42c3
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/blsi.c
@@ -0,0 +1,26 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=znver5" } */
+void bar ();
+int
+test(int a)
+{
+       return a & -a;
+}
+int
+test2(int a)
+{
+       if (a & -a)
+               bar ();
+}
+int
+test3(int a)
+{
+       int ret = a & -a;
+       if (ret)
+               bar ();
+       return ret;
+}
+/* All three functions should produce bslr.
+   Only test and test3 needs xor to break false dependency.  */
+/* { dg-final { scan-assembler-times "blsi\[ \\t\]+" 3 } } */
+/* { dg-final { scan-assembler-times "xor" 2 } } */
diff --git a/gcc/testsuite/gcc.target/i386/blsmsk.c 
b/gcc/testsuite/gcc.target/i386/blsmsk.c
new file mode 100644
index 00000000000..9d15ac03180
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/blsmsk.c
@@ -0,0 +1,9 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=znver5" } */
+int
+test(int a)
+{
+       return (a - 1)^a;
+}
+/* { dg-final { scan-assembler-times "blsmsk\[ \\t\]+" 1 } } */
+/* { dg-final { scan-assembler-times "xor" 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/blsr.c 
b/gcc/testsuite/gcc.target/i386/blsr.c
new file mode 100644
index 00000000000..340ebb40dd0
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/blsr.c
@@ -0,0 +1,26 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=znver5" } */
+void bar ();
+int
+test(int a)
+{
+       return a & (a - 1);
+}
+int
+test2(int a)
+{
+       if (a & (a - 1))
+               bar ();
+}
+int
+test3(int a)
+{
+       int ret = a & (a - 1);
+       if (ret)
+               bar ();
+       return ret;
+}
+/* All three functions should produce bslr.
+   Only test and test3 needs xor to break false dependency.  */
+/* { dg-final { scan-assembler-times "blsr\[ \\t\]+" 3 } } */
+/* { dg-final { scan-assembler-times "xor" 2 } } */

Reply via email to