On Sat, Aug 16, 2014 at 6:01 AM, Uros Bizjak <ubiz...@gmail.com> wrote:
> Hello!
>
> Attached patch fixes the problem with false data dependency on output
> register for popcnt, lzcnt and tzcnt insns on sandybridge and haswell
> targets.
>
> The new insn pattern shadows existing one, and after reload, the
> clearing isns is split out of the insn. This way the clearing insn can
> be scheduled by postreload scheduler. The new pattern takes care to
> avoid live registers, so the compiler is always able to clear output
> reg.
>
> The testcase from the PR, compiled with -O3 -march=corei7 improves on
> Ivybridge from:
>
> unsigned        209717360000    3.21002 sec     16.3329 GB/s
> uint64_t        209717360000    4.06517 sec     12.8971 GB/s
>
> to (-O3 -march=corei7 -mtune-ctrl=avoid_false_dep_for_bmi):
>
> unsigned        209717360000    3.14541 sec     16.6683 GB/s
> uint64_t        209717360000    2.3663 sec      22.1564 GB/s
>
> Due to high impact, the new tune flag is enabled by default for Intel
> tunes and generic:
>
> m_SANDYBRIDGE | m_HASWELL | m_INTEL | m_GENERIC
>
> 2014-08-16  Uros Bizjak  <ubiz...@gmail.com>
>
>     PR target/62011
>     * config/i386/x86-tune.def (X86_TUNE_AVOID_FALSE_DEP_FOR_BMI):
>     New tune flag.
>     * config/i386/i386.h (TARGET_AVOID_FALSE_DEP_FOR_BMI): New define.
>     * config/i386/i386.md (unspec) <UNSPEC_INSN_FALSE_DEP>: New unspec.
>     (ffs<mode>2): Do not expand with tzcnt for
>     TARGET_AVOID_FALSE_DEP_FOR_BMI.
>     (ffssi2_no_cmove): Ditto.
>     (*tzcnt<mode>_1): Disable for TARGET_AVOID_FALSE_DEP_FOR_BMI.
>     (ctz<mode>2): New expander.
>     (*ctz<mode>2_falsedep_1): New insn_and_split pattern.
>     (*ctz<mode>2_falsedep): New insn.
>     (*ctz<mode>2): Rename from ctz<mode>2.
>     (clz<mode>2_lzcnt): New expander.
>     (*clz<mode>2_lzcnt_falsedep_1): New insn_and_split pattern.
>     (*clz<mode>2_lzcnt_falsedep): New insn.
>     (*clz<mode>2): Rename from ctz<mode>2.
>     (popcount<mode>2): New expander.
>     (*popcount<mode>2_falsedep_1): New insn_and_split pattern.
>     (*popcount<mode>2_falsedep): New insn.
>     (*popcount<mode>2): Rename from ctz<mode>2.
>     (*popcount<mode>2_cmp): Remove.
>     (*popcountsi2_cmp_zext): Ditto.
>
> The patch was bootstrapped and regression tested on
> x86_64-pc-linux-gnu {,-m32} and will be committed to mainline SVN
> after a couple of days. The patch will be also backported to 4.9
> branch.
>
> Uros.

False dependency happens when destination is only updated by tcnt,
lzcnt or popcnt.  There is no false dependency when destination is
also used in source.  This patch avoids xor when destination is used
in source.  The difference is

@@ -91,15 +91,12 @@ main:
  .p2align 3
 .L23:
  leal 1(%rdx), %ecx
- xorl %r9d, %r9d
- xorl %r10d, %r10d
- popcntq (%rbx,%rax,8), %r10
- popcntq (%rbx,%rcx,8), %r9
+ popcntq (%rbx,%rax,8), %rax
  leal 2(%rdx), %r8d
- movq %r9, %rcx
+ popcntq (%rbx,%rcx,8), %rcx
+ addq %rax, %rcx
  xorl %eax, %eax
  leal 3(%rdx), %esi
- addq %r10, %rcx
  popcntq (%rbx,%r8,8), %rax
  addq %rax, %rcx
  xorl %eax, %eax

and I got

unsigned 41959360000 0.456816 sec 22.954 GB/s
uint64_t 41959360000 0.408019 sec 25.6992 GB/s

vs

unsigned 41959360000 0.531386 sec 19.7328 GB/s
uint64_t 41959360000 0.408081 sec 25.6953 GB/s

on Haswell.  OK for trunk?

Thanks.


-- 
H.J.
---
2014-08-18  H.J. Lu  <hongjiu...@intel.com>

* config/i386/i386.md (*ctz<mode>2_falsedep_1): Don't clear
destination if it is used in source.
(*clz<mode>2_lzcnt_falsedep_1): Likewise.
(*popcount<mode>2_falsedep_1): Likewise.
2014-08-18  H.J. Lu  <hongjiu...@intel.com>

	* config/i386/i386.md (*ctz<mode>2_falsedep_1): Don't clear
	destination if it is used in source.
	(*clz<mode>2_lzcnt_falsedep_1): Likewise.
	(*popcount<mode>2_falsedep_1): Likewise.

diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index 4749b74..993e165 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -12269,8 +12269,11 @@
 	    (match_operand:SWI248 1 "nonimmediate_operand")))
      (clobber (reg:CC FLAGS_REG))])])
 
+; False dependency happens when destination is only updated by tcnt,
+; lzcnt or popcnt.  There is no false dependency when destination is
+; also used in source.
 (define_insn_and_split "*ctz<mode>2_falsedep_1"
-  [(set (match_operand:SWI48 0 "register_operand" "=&r")
+  [(set (match_operand:SWI48 0 "register_operand" "=r")
 	(ctz:SWI48
 	  (match_operand:SWI48 1 "nonimmediate_operand" "rm")))
    (clobber (reg:CC FLAGS_REG))]
@@ -12283,7 +12286,8 @@
 	  (ctz:SWI48 (match_dup 1)))
      (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP)
      (clobber (reg:CC FLAGS_REG))])]
-  "ix86_expand_clear (operands[0]);")
+  "if (!reg_mentioned_p (operands[0], operands[1]))
+    ix86_expand_clear (operands[0]);")
 
 (define_insn "*ctz<mode>2_falsedep"
   [(set (match_operand:SWI48 0 "register_operand" "=r")
@@ -12363,7 +12367,7 @@
   "TARGET_LZCNT")
 
 (define_insn_and_split "*clz<mode>2_lzcnt_falsedep_1"
-  [(set (match_operand:SWI48 0 "register_operand" "=&r")
+  [(set (match_operand:SWI48 0 "register_operand" "=r")
 	(clz:SWI48
 	  (match_operand:SWI48 1 "nonimmediate_operand" "rm")))
    (clobber (reg:CC FLAGS_REG))]
@@ -12376,7 +12380,8 @@
 	  (clz:SWI48 (match_dup 1)))
      (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP)
      (clobber (reg:CC FLAGS_REG))])]
-  "ix86_expand_clear (operands[0]);")
+  "if (!reg_mentioned_p (operands[0], operands[1]))
+    ix86_expand_clear (operands[0]);")
 
 (define_insn "*clz<mode>2_lzcnt_falsedep"
   [(set (match_operand:SWI48 0 "register_operand" "=r")
@@ -12683,7 +12688,7 @@
   "TARGET_POPCNT")
 
 (define_insn_and_split "*popcount<mode>2_falsedep_1"
-  [(set (match_operand:SWI48 0 "register_operand" "=&r")
+  [(set (match_operand:SWI48 0 "register_operand" "=r")
 	(popcount:SWI48
 	  (match_operand:SWI48 1 "nonimmediate_operand" "rm")))
    (clobber (reg:CC FLAGS_REG))]
@@ -12696,7 +12701,8 @@
 	  (popcount:SWI48 (match_dup 1)))
      (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP)
      (clobber (reg:CC FLAGS_REG))])]
-  "ix86_expand_clear (operands[0]);")
+  "if (!reg_mentioned_p (operands[0], operands[1]))
+    ix86_expand_clear (operands[0]);")
 
 (define_insn "*popcount<mode>2_falsedep"
   [(set (match_operand:SWI48 0 "register_operand" "=r")

Reply via email to