For example, in the first loop of the attached test case we do not prepend an xor to the first two popcnt instructions, because each of them already uses its destination register (as part of the memory address):
.L23:
        leal    1(%rdx), %ecx
        popcntq (%rbx,%rax,8), %rax
        leal    2(%rdx), %r8d
        popcntq (%rbx,%rcx,8), %rcx
        addq    %rax, %rcx
        leal    3(%rdx), %esi
        xorq    %rax, %rax
        popcntq (%rbx,%r8,8), %rax
        addq    %rax, %rcx
        xorq    %rax, %rax
        popcntq (%rbx,%rsi,8), %rax
        addq    %rax, %rcx
        leal    4(%rdx), %eax
        addq    %rcx, %r14
        movq    %rax, %rdx
        cmpq    %rax, %r12
        ja      .L23

2014-08-14 18:06 GMT+04:00 Ilya Enkovich <enkovich....@gmail.com>:
> 2014-08-14 18:00 GMT+04:00 Alexander Monakov <amona...@ispras.ru>:
>>
>> On Thu, 14 Aug 2014, Yuri Rumyantsev wrote:
>>
>>> Hi All,
>>>
>>> Here is a fix for PR 62011 - remove the false dependency for unary
>>> bit-manipulation instructions on the latest BigCore chips (Sandybridge
>>> and Haswell) by outputting in the assembly file a zeroing of the
>>> destination register before the bmi instruction. I checked that
>>> performance is restored for the popcnt, lzcnt and tzcnt instructions.
>>
>> I am not an x86 reviewer, but one thing looks a bit superfluous to me:
>>
>>> +/* Retirn true if we need to insert before bit-manipulation instruction
>> note typo^
>>
>>> +   zeroing of its destination register.  */
>>> +bool
>>> +ix86_avoid_false_dep_for_bm (rtx insn, rtx operands[])
>>> +{
>>> +  unsigned int regno0;
>>> +  df_ref use;
>>> +  if (!TARGET_AVOID_FALSE_DEP_FOR_BM || optimize_function_for_size_p (cfun))
>>> +    return false;
>>> +  regno0 = true_regnum (operands[0]);
>>> +  /* Check if insn does not use REGNO0.  */
>>> +  FOR_EACH_INSN_USE (use, insn)
>>> +    if (regno0 == DF_REF_REGNO (use))
>>> +      return false;
>>> +  return true;
>>> +}
>>
>> The loop is to prevent adding the xor when the dest operand is also the
>> source operand.  Looks like a simpler "reg_or_subregno (operands[0]) ==
>> reg_or_subregno (operands[1])" could be used here, as long as the
>> assumption that this is called only for two-operand instructions holds?
>
> This wouldn't cover the memory operand case.
>
> Ilya
>
>>
>> Alexander
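
For context, a minimal sketch of the kind of source loop that yields the code above is shown below. The attached test case itself is not reproduced in this thread, so the function name, the exact types, and the compilation flags here are assumptions.

    /* Hypothetical reconstruction of the hot loop: sum the population
       counts of an array of 64-bit words.  Built with something like
       -O2 -funroll-loops -mpopcnt; the exact flags and the unroll
       factor of 4 seen above are guesses.  */
    unsigned long
    count_bits (const unsigned long *a, unsigned long n)
    {
      unsigned long sum = 0;
      for (unsigned long i = 0; i < n; i++)
        sum += __builtin_popcountll (a[i]);
      return sum;
    }

On Sandy Bridge and Haswell the hardware treats the popcnt destination register as an input, so consecutive iterations serialize on the previous register value; the xorq %rax, %rax zeroing idiom emitted by the patch is recognized at register rename and breaks that dependency chain.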