On Wed, Jan 13, 2021 at 8:18 AM Jakub Jelinek <ja...@redhat.com> wrote:
>
> Hi!
>
> In the following testcase we only optimize f2 and f7 to btrl, although we
> should optimize that way all of the functions.  The problem is the type
> demotion/narrowing (which is performed solely during the generic folding and
> not later), without it we see the AND performed in SImode and match it as
> btrl, but with it while the shifts are still performed in SImode, the
> AND is already done in QImode or HImode low part of the shift.
>
> Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?
>
> 2021-01-13  Jakub Jelinek  <ja...@redhat.com>
>
>         PR target/96938
>         * config/i386/i386.md (*btr<mode>_1, *btr<mode>_2): New
>         define_insn_and_split patterns.
>         (splitter after *btr<mode>_2): New splitter.
>
>         * gcc.target/i386/pr96938.c: New test.

OK with a small fix below.

Thanks,
Uros.

>
> --- gcc/config/i386/i386.md.jj  2021-01-07 17:18:39.653487482 +0100
> +++ gcc/config/i386/i386.md     2021-01-12 19:01:37.286603961 +0100
> @@ -12419,6 +12419,70 @@ (define_insn_and_split "*btr<mode>_mask_
>              (match_dup 3)))
>        (clobber (reg:CC FLAGS_REG))])])
>
> +(define_insn_and_split "*btr<mode>_1"
> +  [(set (match_operand:SWI12 0 "register_operand")
> +       (and:SWI12
> +         (subreg:SWI12
> +           (rotate:SI (const_int -2)
> +                      (match_operand:QI 2 "register_operand")) 0)
> +         (match_operand:SWI12 1 "nonimmediate_operand")))
> +   (clobber (reg:CC FLAGS_REG))]
> +  "TARGET_USE_BT && ix86_pre_reload_split ()"
> +  "#"
> +  "&& 1"
> +  [(parallel
> +     [(set (match_dup 0)
> +          (and:SI (rotate:SI (const_int -2) (match_dup 2))
> +                  (match_dup 1)))
> +      (clobber (reg:CC FLAGS_REG))])]
> +{
> +  operands[0] = lowpart_subreg (SImode, operands[0], <MODE>mode);
> +  if (MEM_P (operands[1]))
> +    operands[1] = force_reg (<MODE>mode, operands[1]);
> +  operands[1] = lowpart_subreg (SImode, operands[1], <MODE>mode);
> +})
> +
> +(define_insn_and_split "*btr<mode>_2"
> +  [(set (zero_extract:HI
> +         (match_operand:SWI12 0 "nonimmediate_operand")
> +         (const_int 1)
> +         (zero_extend:SI (match_operand:QI 1 "register_operand")))
> +       (const_int 0))
> +   (clobber (reg:CC FLAGS_REG))]
> +  "TARGET_USE_BT && ix86_pre_reload_split ()"
> +  "#"
> +  "&& MEM_P (operands[0])"
> +  [(set (match_dup 2) (match_dup 0))
> +   (parallel
> +     [(set (match_dup 3) (and:SI (rotate:SI (const_int -2) (match_dup 1))

Please move (and ...) to a separate line.

> +                                (match_dup 4)))
> +      (clobber (reg:CC FLAGS_REG))])
> +   (set (match_dup 0) (match_dup 5))]
> +{
> +  operands[2] = gen_reg_rtx (<MODE>mode);
> +  operands[5] = gen_reg_rtx (<MODE>mode);
> +  operands[3] = lowpart_subreg (SImode, operands[5], <MODE>mode);
> +  operands[4] = lowpart_subreg (SImode, operands[2], <MODE>mode);
> +})
> +
> +(define_split
> +  [(set (zero_extract:HI
> +         (match_operand:SWI12 0 "register_operand")
> +         (const_int 1)
> +         (zero_extend:SI (match_operand:QI 1 "register_operand")))
> +       (const_int 0))
> +   (clobber (reg:CC FLAGS_REG))]
> +  "TARGET_USE_BT && ix86_pre_reload_split ()"
> +  [(parallel
> +     [(set (match_dup 0)
> +          (and:SI (rotate:SI (const_int -2) (match_dup 1))
> +                  (match_dup 2)))
> +      (clobber (reg:CC FLAGS_REG))])]
> +{
> +  operands[2] = lowpart_subreg (SImode, operands[0], <MODE>mode);
> +  operands[0] = lowpart_subreg (SImode, operands[0], <MODE>mode);
> +})
> +
>  ;; These instructions are never faster than the corresponding
>  ;; and/ior/xor operations when using immediate operand, so with
>  ;; 32-bit there's no point.  But in 64-bit, we can't hold the
> --- gcc/testsuite/gcc.target/i386/pr96938.c.jj  2021-01-12 19:12:48.285023954 
> +0100
> +++ gcc/testsuite/gcc.target/i386/pr96938.c     2021-01-12 19:12:33.209194271 
> +0100
> @@ -0,0 +1,66 @@
> +/* PR target/96938 */
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -masm=att" } */
> +/* { dg-final { scan-assembler-times "\tbtrl\t" 10 } } */
> +
> +void
> +f1 (unsigned char *f, int o, unsigned char v)
> +{
> +  *f = (*f & ~(1 << o)) | (v << o);
> +}
> +
> +void
> +f2 (unsigned char *f, int o, unsigned char v)
> +{
> +  int t = *f & ~(1 << o);
> +  *f = t | (v << o);
> +}
> +
> +void
> +f3 (unsigned char *f, int o, unsigned char v)
> +{
> +  *f &= ~(1 << o);
> +}
> +
> +void
> +f4 (unsigned char *f, int o, unsigned char v)
> +{
> +  *f = (*f & ~(1 << (o & 31))) | v;
> +}
> +
> +void
> +f5 (unsigned char *f, int o, unsigned char v)
> +{
> +  *f = (*f & ~(1 << (o & 31))) | (v << (o & 31));
> +}
> +
> +void
> +f6 (unsigned short *f, int o, unsigned short v)
> +{
> +  *f = (*f & ~(1 << o)) | (v << o);
> +}
> +
> +void
> +f7 (unsigned short *f, int o, unsigned short v)
> +{
> +  int t = *f & ~(1 << o);
> +  *f = t | (v << o);
> +}
> +
> +void
> +f8 (unsigned short *f, int o, unsigned short v)
> +{
> +  *f &= ~(1 << o);
> +}
> +
> +void
> +f9 (unsigned short *f, int o, unsigned short v)
> +{
> +  *f = (*f & ~(1 << (o & 31))) | v;
> +}
> +
> +void
> +f10 (unsigned short *f, int o, unsigned short v)
> +{
> +  *f = (*f & ~(1 << (o & 31))) | (v << (o & 31));
> +}
>
>         Jakub
>

Reply via email to