On Wed, Jan 13, 2021 at 8:18 AM Jakub Jelinek <ja...@redhat.com> wrote: > > Hi! > > In the following testcase we only optimize f2 and f7 to btrl, although we > should optimize that way all of the functions. The problem is the type > demotion/narrowing (which is performed solely during the generic folding and > not later), without it we see the AND performed in SImode and match it as > btrl, but with it while the shifts are still performed in SImode, the > AND is already done in QImode or HImode low part of the shift. > > Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk? > > 2021-01-13 Jakub Jelinek <ja...@redhat.com> > > PR target/96938 > * config/i386/i386.md (*btr<mode>_1, *btr<mode>_2): New > define_insn_and_split patterns. > (splitter after *btr<mode>_2): New splitter. > > * gcc.target/i386/pr96938.c: New test.
OK with a small fix below. Thanks, Uros. > > --- gcc/config/i386/i386.md.jj 2021-01-07 17:18:39.653487482 +0100 > +++ gcc/config/i386/i386.md 2021-01-12 19:01:37.286603961 +0100 > @@ -12419,6 +12419,70 @@ (define_insn_and_split "*btr<mode>_mask_ > (match_dup 3))) > (clobber (reg:CC FLAGS_REG))])]) > > +(define_insn_and_split "*btr<mode>_1" > + [(set (match_operand:SWI12 0 "register_operand") > + (and:SWI12 > + (subreg:SWI12 > + (rotate:SI (const_int -2) > + (match_operand:QI 2 "register_operand")) 0) > + (match_operand:SWI12 1 "nonimmediate_operand"))) > + (clobber (reg:CC FLAGS_REG))] > + "TARGET_USE_BT && ix86_pre_reload_split ()" > + "#" > + "&& 1" > + [(parallel > + [(set (match_dup 0) > + (and:SI (rotate:SI (const_int -2) (match_dup 2)) > + (match_dup 1))) > + (clobber (reg:CC FLAGS_REG))])] > +{ > + operands[0] = lowpart_subreg (SImode, operands[0], <MODE>mode); > + if (MEM_P (operands[1])) > + operands[1] = force_reg (<MODE>mode, operands[1]); > + operands[1] = lowpart_subreg (SImode, operands[1], <MODE>mode); > +}) > + > +(define_insn_and_split "*btr<mode>_2" > + [(set (zero_extract:HI > + (match_operand:SWI12 0 "nonimmediate_operand") > + (const_int 1) > + (zero_extend:SI (match_operand:QI 1 "register_operand"))) > + (const_int 0)) > + (clobber (reg:CC FLAGS_REG))] > + "TARGET_USE_BT && ix86_pre_reload_split ()" > + "#" > + "&& MEM_P (operands[0])" > + [(set (match_dup 2) (match_dup 0)) > + (parallel > + [(set (match_dup 3) (and:SI (rotate:SI (const_int -2) (match_dup 1)) Please move (and ...) to a separate line. > + (match_dup 4))) > + (clobber (reg:CC FLAGS_REG))]) > + (set (match_dup 0) (match_dup 5))] > +{ > + operands[2] = gen_reg_rtx (<MODE>mode); > + operands[5] = gen_reg_rtx (<MODE>mode); > + operands[3] = lowpart_subreg (SImode, operands[5], <MODE>mode); > + operands[4] = lowpart_subreg (SImode, operands[2], <MODE>mode); > +}) > + > +(define_split > + [(set (zero_extract:HI > + (match_operand:SWI12 0 "register_operand") > + (const_int 1) > + (zero_extend:SI (match_operand:QI 1 "register_operand"))) > + (const_int 0)) > + (clobber (reg:CC FLAGS_REG))] > + "TARGET_USE_BT && ix86_pre_reload_split ()" > + [(parallel > + [(set (match_dup 0) > + (and:SI (rotate:SI (const_int -2) (match_dup 1)) > + (match_dup 2))) > + (clobber (reg:CC FLAGS_REG))])] > +{ > + operands[2] = lowpart_subreg (SImode, operands[0], <MODE>mode); > + operands[0] = lowpart_subreg (SImode, operands[0], <MODE>mode); > +}) > + > ;; These instructions are never faster than the corresponding > ;; and/ior/xor operations when using immediate operand, so with > ;; 32-bit there's no point. But in 64-bit, we can't hold the > --- gcc/testsuite/gcc.target/i386/pr96938.c.jj 2021-01-12 19:12:48.285023954 > +0100 > +++ gcc/testsuite/gcc.target/i386/pr96938.c 2021-01-12 19:12:33.209194271 > +0100 > @@ -0,0 +1,66 @@ > +/* PR target/96938 */ > +/* { dg-do compile } */ > +/* { dg-options "-O2 -masm=att" } */ > +/* { dg-final { scan-assembler-times "\tbtrl\t" 10 } } */ > + > +void > +f1 (unsigned char *f, int o, unsigned char v) > +{ > + *f = (*f & ~(1 << o)) | (v << o); > +} > + > +void > +f2 (unsigned char *f, int o, unsigned char v) > +{ > + int t = *f & ~(1 << o); > + *f = t | (v << o); > +} > + > +void > +f3 (unsigned char *f, int o, unsigned char v) > +{ > + *f &= ~(1 << o); > +} > + > +void > +f4 (unsigned char *f, int o, unsigned char v) > +{ > + *f = (*f & ~(1 << (o & 31))) | v; > +} > + > +void > +f5 (unsigned char *f, int o, unsigned char v) > +{ > + *f = (*f & ~(1 << (o & 31))) | (v << (o & 31)); > +} > + > +void > +f6 (unsigned short *f, int o, unsigned short v) > +{ > + *f = (*f & ~(1 << o)) | (v << o); > +} > + > +void > +f7 (unsigned short *f, int o, unsigned short v) > +{ > + int t = *f & ~(1 << o); > + *f = t | (v << o); > +} > + > +void > +f8 (unsigned short *f, int o, unsigned short v) > +{ > + *f &= ~(1 << o); > +} > + > +void > +f9 (unsigned short *f, int o, unsigned short v) > +{ > + *f = (*f & ~(1 << (o & 31))) | v; > +} > + > +void > +f10 (unsigned short *f, int o, unsigned short v) > +{ > + *f = (*f & ~(1 << (o & 31))) | (v << (o & 31)); > +} > > Jakub >