On Fri, Jun 5, 2020 at 8:45 AM Jakub Jelinek <ja...@redhat.com> wrote:
>
> Hi!
>
> In January I've added patterns to optimize SImode -> DImode sign or zero
> extension of __builtin_popcount, this patch does the same for
> __builtin_c[lt]z. Like most other instructions, the [tl]zcntl instructions
> clear the upper 32 bits of the destination register and as the instructions
> only result in values 0 to 32 inclusive, both sign and zero extensions
> behave the same.
>
> Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?
>
> 2020-06-05 Jakub Jelinek <ja...@redhat.com>
>
> PR target/95535
> * config/i386/i386.md (*ctzsi2_zext, *clzsi2_lzcnt_zext): New
> define_insn_and_split patterns.
> (*ctzsi2_zext_falsedep, *clzsi2_lzcnt_zext_falsedep): New
> define_insn patterns.
>
> * gcc.target/i386/pr95535-1.c: New test.
> * gcc.target/i386/pr95535-2.c: New test.
OK. Thanks, Uros. > --- gcc/config/i386/i386.md.jj 2020-05-25 10:06:59.882176002 +0200 > +++ gcc/config/i386/i386.md 2020-06-04 18:44:26.333963121 +0200 > @@ -13985,6 +13985,50 @@ (define_insn "*ctz<mode>2_falsedep" > (set_attr "prefix_rep" "1") > (set_attr "mode" "<MODE>")]) > > +(define_insn_and_split "*ctzsi2_zext" > + [(set (match_operand:DI 0 "register_operand" "=r") > + (and:DI > + (subreg:DI > + (ctz:SI > + (match_operand:SI 1 "nonimmediate_operand" "rm")) 0) > + (const_int 63))) > + (clobber (reg:CC FLAGS_REG))] > + "TARGET_BMI && TARGET_64BIT" > + "tzcnt{l}\t{%1, %k0|%k0, %1}" > + "&& TARGET_AVOID_FALSE_DEP_FOR_BMI && epilogue_completed > + && optimize_function_for_speed_p (cfun) > + && !reg_mentioned_p (operands[0], operands[1])" > + [(parallel > + [(set (match_dup 0) > + (and:DI (subreg:DI (ctz:SI (match_dup 1)) 0) (const_int 63))) > + (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP) > + (clobber (reg:CC FLAGS_REG))])] > + "ix86_expand_clear (operands[0]);" > + [(set_attr "type" "alu1") > + (set_attr "prefix_0f" "1") > + (set_attr "prefix_rep" "1") > + (set_attr "mode" "SI")]) > + > +; False dependency happens when destination is only updated by tzcnt, > +; lzcnt or popcnt. There is no false dependency when destination is > +; also used in source. 
> +(define_insn "*ctzsi2_zext_falsedep" > + [(set (match_operand:DI 0 "register_operand" "=r") > + (and:DI > + (subreg:DI > + (ctz:SI > + (match_operand:SI 1 "nonimmediate_operand" "rm")) 0) > + (const_int 63))) > + (unspec [(match_operand:DI 2 "register_operand" "0")] > + UNSPEC_INSN_FALSE_DEP) > + (clobber (reg:CC FLAGS_REG))] > + "TARGET_BMI && TARGET_64BIT" > + "tzcnt{l}\t{%1, %k0|%k0, %1}" > + [(set_attr "type" "alu1") > + (set_attr "prefix_0f" "1") > + (set_attr "prefix_rep" "1") > + (set_attr "mode" "SI")]) > + > (define_insn "bsr_rex64" > [(set (match_operand:DI 0 "register_operand" "=r") > (minus:DI (const_int 63) > @@ -14077,6 +14121,48 @@ (define_insn "*clz<mode>2_lzcnt_falsedep > (set_attr "type" "bitmanip") > (set_attr "mode" "<MODE>")]) > > +(define_insn_and_split "*clzsi2_lzcnt_zext" > + [(set (match_operand:DI 0 "register_operand" "=r") > + (and:DI > + (subreg:DI > + (clz:SI > + (match_operand:SI 1 "nonimmediate_operand" "rm")) 0) > + (const_int 63))) > + (clobber (reg:CC FLAGS_REG))] > + "TARGET_LZCNT && TARGET_64BIT" > + "lzcnt{l}\t{%1, %k0|%k0, %1}" > + "&& TARGET_AVOID_FALSE_DEP_FOR_BMI && epilogue_completed > + && optimize_function_for_speed_p (cfun) > + && !reg_mentioned_p (operands[0], operands[1])" > + [(parallel > + [(set (match_dup 0) > + (and:DI (subreg:DI (clz:SI (match_dup 1)) 0) (const_int 63))) > + (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP) > + (clobber (reg:CC FLAGS_REG))])] > + "ix86_expand_clear (operands[0]);" > + [(set_attr "prefix_rep" "1") > + (set_attr "type" "bitmanip") > + (set_attr "mode" "SI")]) > + > +; False dependency happens when destination is only updated by tzcnt, > +; lzcnt or popcnt. There is no false dependency when destination is > +; also used in source. 
> +(define_insn "*clzsi2_lzcnt_zext_falsedep" > + [(set (match_operand:DI 0 "register_operand" "=r") > + (and:DI > + (subreg:DI > + (clz:SI > + (match_operand:SWI48 1 "nonimmediate_operand" "rm")) 0) > + (const_int 63))) > + (unspec [(match_operand:DI 2 "register_operand" "0")] > + UNSPEC_INSN_FALSE_DEP) > + (clobber (reg:CC FLAGS_REG))] > + "TARGET_LZCNT" > + "lzcnt{l}\t{%1, %k0|%k0, %1}" > + [(set_attr "prefix_rep" "1") > + (set_attr "type" "bitmanip") > + (set_attr "mode" "SI")]) > + > (define_int_iterator LT_ZCNT > [(UNSPEC_TZCNT "TARGET_BMI") > (UNSPEC_LZCNT "TARGET_LZCNT")]) > --- gcc/testsuite/gcc.target/i386/pr95535-1.c.jj 2020-06-04 > 18:47:07.642642830 +0200 > +++ gcc/testsuite/gcc.target/i386/pr95535-1.c 2020-06-04 18:47:34.011263537 > +0200 > @@ -0,0 +1,54 @@ > +/* PR target/95535 */ > +/* { dg-do compile { target lp64 } } */ > +/* { dg-options "-O2 -mbmi" } */ > +/* { dg-final { scan-assembler-not "cltq" } } */ > + > +unsigned int foo (void); > + > +unsigned long > +f1 (unsigned int x) > +{ > + return __builtin_ctz (x); > +} > + > +unsigned long > +f2 (unsigned int x) > +{ > + return (unsigned) __builtin_ctz (x); > +} > + > +unsigned long > +f3 (unsigned int x) > +{ > + return __builtin_ctz (x) & 63ULL; > +} > + > +unsigned long > +f4 (unsigned int x) > +{ > + return __builtin_ctz (x) & 1023ULL; > +} > + > +unsigned long > +f5 (void) > +{ > + return __builtin_ctz (foo ()); > +} > + > +unsigned long > +f6 (void) > +{ > + return (unsigned) __builtin_ctz (foo ()); > +} > + > +unsigned long > +f7 (void) > +{ > + return __builtin_ctz (foo ()) & 63ULL; > +} > + > +unsigned long > +f8 (void) > +{ > + return __builtin_ctz (foo ()) & 1023ULL; > +} > --- gcc/testsuite/gcc.target/i386/pr95535-2.c.jj 2020-06-04 > 18:47:10.774597782 +0200 > +++ gcc/testsuite/gcc.target/i386/pr95535-2.c 2020-06-04 18:47:50.576025269 > +0200 > @@ -0,0 +1,54 @@ > +/* PR target/95535 */ > +/* { dg-do compile { target lp64 } } */ > +/* { dg-options "-O2 -mlzcnt" } */ > +/* { dg-final { 
scan-assembler-not "cltq" } } */ > + > +unsigned int foo (void); > + > +unsigned long > +f1 (unsigned int x) > +{ > + return __builtin_clz (x); > +} > + > +unsigned long > +f2 (unsigned int x) > +{ > + return (unsigned) __builtin_clz (x); > +} > + > +unsigned long > +f3 (unsigned int x) > +{ > + return __builtin_clz (x) & 63ULL; > +} > + > +unsigned long > +f4 (unsigned int x) > +{ > + return __builtin_clz (x) & 1023ULL; > +} > + > +unsigned long > +f5 (void) > +{ > + return __builtin_clz (foo ()); > +} > + > +unsigned long > +f6 (void) > +{ > + return (unsigned) __builtin_clz (foo ()); > +} > + > +unsigned long > +f7 (void) > +{ > + return __builtin_clz (foo ()) & 63ULL; > +} > + > +unsigned long > +f8 (void) > +{ > + return __builtin_clz (foo ()) & 1023ULL; > +} > > Jakub >