On Thu, May 29, 2025 at 4:56 PM Hu, Lin1 <lin1...@intel.com> wrote:
>
> Hi,
>
> The patch aims to optimize
>          movb    (%rdi), %al
>          movq    %rdi, %rbx
>          xorl    %esi, %eax, %edx
>          movb    %dl, (%rdi)
>          cmpb    %sil, %al
>          jne
> to
>          xorb    %sil, (%rdi)
>          movq    %rdi, %rbx
>          jne
>
> This removes two mov instructions and one cmp instruction.
>
> Because APX NDD allows the destination register to differ from the source
> registers, some of the existing peephole2 patterns no longer match. Add new
> peephole2 patterns for the APX NDD forms.
>
> Bootstrapped and regtested on x86_64-pc-linux-gnu, OK for trunk?
Ok.
>
> BRs,
> Lin
>
> gcc/ChangeLog:
>
>         * config/i386/i386.md (define_peephole2): Define some new peephole2
>         for APX NDD.
>
> gcc/testsuite/ChangeLog:
>
>         * gcc.target/i386/pr49095-2.c: New test.
> ---
>  gcc/config/i386/i386.md                   | 135 ++++++++++++++++++++++
>  gcc/testsuite/gcc.target/i386/pr49095-2.c |  73 ++++++++++++
>  2 files changed, 208 insertions(+)
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr49095-2.c
>
> diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
> index b7a18d583da..398cdf447b3 100644
> --- a/gcc/config/i386/i386.md
> +++ b/gcc/config/i386/i386.md
> @@ -28212,6 +28212,41 @@ (define_peephole2
>                        const0_rtx);
>  })
>
> +;; For APX NDD PLUS/MINUS/LOGIC
> +;; Like cmpelim optimized pattern.
> +;; Reduce an extra mov instruction like
> +;; decl (%rdi), %eax
> +;; mov %eax, (%rdi)
> +;; to
> +;; decl (%rdi)
> +(define_peephole2
> +  [(parallel [(set (reg FLAGS_REG)
> +                  (compare (match_operator:SWI 2 "plusminuslogic_operator"
> +                             [(match_operand:SWI 0 "memory_operand")
> +                              (match_operand:SWI 1 "<nonmemory_operand>")])
> +                           (const_int 0)))
> +             (set (match_operand:SWI 3 "register_operand") (match_dup 2))])
> +   (set (match_dup 0) (match_dup 3))]
> +  "TARGET_APX_NDD
> +   && (TARGET_READ_MODIFY_WRITE || optimize_insn_for_size_p ())
> +   && peep2_reg_dead_p (2, operands[3])
> +   && !reg_overlap_mentioned_p (operands[3], operands[0])
> +   && ix86_match_ccmode (peep2_next_insn (0),
> +                        (GET_CODE (operands[2]) == PLUS
> +                         || GET_CODE (operands[2]) == MINUS)
> +                        ? CCGOCmode : CCNOmode)"
> +  [(parallel [(set (match_dup 4) (match_dup 6))
> +             (set (match_dup 0) (match_dup 5))])]
> +{
> +  operands[4] = SET_DEST (XVECEXP (PATTERN (peep2_next_insn (0)), 0, 0));
> +  operands[5]
> +    = gen_rtx_fmt_ee (GET_CODE (operands[2]), GET_MODE (operands[2]),
> +                     copy_rtx (operands[0]), operands[1]);
> +  operands[6]
> +    = gen_rtx_COMPARE (GET_MODE (operands[4]), copy_rtx (operands[5]),
> +                      const0_rtx);
> +})
> +
>  ;; Likewise for instances where we have a lea pattern.
>  (define_peephole2
>    [(set (match_operand:SWI 0 "register_operand")
> @@ -28500,6 +28535,54 @@ (define_peephole2
>                        const0_rtx);
>  })
>
> +;; For APX NDD XOR
> +;; Reduce 2 mov and 1 cmp instruction.
> +;; from
> +;; movq (%rdi), %rax
> +;; xorq %rsi, %rax, %rdx
> +;; movq %rdx, (%rdi)
> +;; cmpq %rsi, %rax
> +;; jne
> +;; to
> +;; xorq %rsi, (%rdi)
> +;; jne
> +(define_peephole2
> +  [(set (match_operand:SWI 0 "register_operand")
> +       (match_operand:SWI 1 "memory_operand"))
> +   (parallel [(set (match_operand:SWI 4 "register_operand")
> +                  (xor:SWI (match_operand:SWI 3 "register_operand")
> +                           (match_operand:SWI 2 "<nonmemory_operand>")))
> +             (clobber (reg:CC FLAGS_REG))])
> +   (set (match_dup 1) (match_dup 4))
> +   (set (reg:CCZ FLAGS_REG)
> +       (compare:CCZ (match_operand:SWI 5 "register_operand")
> +                    (match_operand:SWI 6 "<nonmemory_operand>")))]
> +  "TARGET_APX_NDD
> +   && (TARGET_READ_MODIFY_WRITE || optimize_insn_for_size_p ())
> +   && REGNO (operands[3]) == REGNO (operands[0])
> +   && (rtx_equal_p (operands[0], operands[5])
> +       ? rtx_equal_p (operands[2], operands[6])
> +       : rtx_equal_p (operands[2], operands[5])
> +        && rtx_equal_p (operands[0], operands[6]))
> +   && peep2_reg_dead_p (3, operands[4])
> +   && peep2_reg_dead_p (4, operands[0])
> +   && !reg_overlap_mentioned_p (operands[0], operands[1])
> +   && !reg_overlap_mentioned_p (operands[0], operands[2])
> +   && (<MODE>mode != QImode
> +       || immediate_operand (operands[2], QImode)
> +       || any_QIreg_operand (operands[2], QImode))"
> +  [(parallel [(set (match_dup 7) (match_dup 9))
> +             (set (match_dup 1) (match_dup 8))])]
> +{
> +  operands[7] = SET_DEST (PATTERN (peep2_next_insn (3)));
> +  operands[8] = gen_rtx_XOR (<MODE>mode, copy_rtx (operands[1]),
> +                            operands[2]);
> +  operands[9]
> +    = gen_rtx_COMPARE (GET_MODE (operands[7]),
> +                      copy_rtx (operands[8]),
> +                      const0_rtx);
> +})
> +
>  (define_peephole2
>    [(set (match_operand:SWI12 0 "register_operand")
>         (match_operand:SWI12 1 "memory_operand"))
> @@ -28548,6 +28631,58 @@ (define_peephole2
>                        const0_rtx);
>  })
>
> +;; For APX NDD XOR
> +;; Reduce 2 mov and 1 cmp instruction.
> +;; from
> +;; movb (%rdi), %al
> +;; xorl %esi, %eax, %edx
> +;; movb %dl, (%rdi)
> +;; cmpb %sil, %al
> +;; jne
> +;; to
> +;; xorb %sil, (%rdi)
> +;; jne
> +(define_peephole2
> +  [(set (match_operand:SWI12 0 "register_operand")
> +       (match_operand:SWI12 1 "memory_operand"))
> +   (parallel [(set (match_operand:SI 4 "register_operand")
> +                  (xor:SI (match_operand:SI 3 "register_operand")
> +                          (match_operand:SI 2 "<nonmemory_operand>")))
> +             (clobber (reg:CC FLAGS_REG))])
> +   (set (match_dup 1) (match_operand:SWI12 5 "register_operand"))
> +   (set (reg:CCZ FLAGS_REG)
> +       (compare:CCZ (match_operand:SWI12 6 "register_operand")
> +                    (match_operand:SWI12 7 "<nonmemory_operand>")))]
> +  "TARGET_APX_NDD
> +   && (TARGET_READ_MODIFY_WRITE || optimize_insn_for_size_p ())
> +   && REGNO (operands[3]) == REGNO (operands[0])
> +   && REGNO (operands[5]) == REGNO (operands[4])
> +   && (rtx_equal_p (operands[0], operands[6])
> +       ? (REG_P (operands[2])
> +         ? REG_P (operands[7]) && REGNO (operands[2]) == REGNO (operands[7])
> +         : rtx_equal_p (operands[2], operands[7]))
> +       : (rtx_equal_p (operands[0], operands[7])
> +         && REG_P (operands[2])
> +         && REGNO (operands[2]) == REGNO (operands[6])))
> +   && peep2_reg_dead_p (3, operands[5])
> +   && peep2_reg_dead_p (4, operands[0])
> +   && !reg_overlap_mentioned_p (operands[0], operands[1])
> +   && !reg_overlap_mentioned_p (operands[0], operands[2])
> +   && (<MODE>mode != QImode
> +       || immediate_operand (operands[2], SImode)
> +       || any_QIreg_operand (operands[2], SImode))"
> +  [(parallel [(set (match_dup 8) (match_dup 10))
> +             (set (match_dup 1) (match_dup 9))])]
> +{
> +  operands[8] = SET_DEST (PATTERN (peep2_next_insn (3)));
> +  operands[9] = gen_rtx_XOR (<MODE>mode, copy_rtx (operands[1]),
> +                            gen_lowpart (<MODE>mode, operands[2]));
> +  operands[10]
> +    = gen_rtx_COMPARE (GET_MODE (operands[8]),
> +                      copy_rtx (operands[9]),
> +                      const0_rtx);
> +})
> +
>  ;; Attempt to optimize away memory stores of values the memory already
>  ;; has.  See PR79593.
>  (define_peephole2
> diff --git a/gcc/testsuite/gcc.target/i386/pr49095-2.c b/gcc/testsuite/gcc.target/i386/pr49095-2.c
> new file mode 100644
> index 00000000000..25bc6b79a43
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr49095-2.c
> @@ -0,0 +1,73 @@
> +/* PR rtl-optimization/49095 */
> +/* { dg-do compile { target { ! ia32 } } } */
> +/* { dg-options "-Os -fno-shrink-wrap -masm=att -mapxf" } */
> +
> +void foo (void *);
> +
> +int *
> +f1 (int *x)
> +{
> +  if (!--*x)
> +    foo (x);
> +  return x;
> +}
> +
> +int
> +g1 (int x)
> +{
> +  if (!--x)
> +    foo ((void *) 0);
> +  return x;
> +}
> +
> +#define F(T, OP, OPN) \
> +T *                    \
> +f##T##OPN (T *x, T y)  \
> +{                      \
> +  *x OP y;             \
> +  if (!*x)             \
> +    foo (x);           \
> +  return x;            \
> +}                      \
> +                       \
> +T                      \
> +g##T##OPN (T x, T y)   \
> +{                      \
> +  x OP y;              \
> +  if (!x)              \
> +    foo ((void *) 0);  \
> +  return x;            \
> +}                      \
> +                       \
> +T *                    \
> +h##T##OPN (T *x)       \
> +{                      \
> +  *x OP 24;            \
> +  if (!*x)             \
> +    foo (x);           \
> +  return x;            \
> +}                      \
> +                       \
> +T                      \
> +i##T##OPN (T x, T y)   \
> +{                      \
> +  x OP 24;             \
> +  if (!x)              \
> +    foo ((void *) 0);  \
> +  return x;            \
> +}
> +
> +#define G(T) \
> +F (T, +=, plus)                \
> +F (T, -=, minus)       \
> +F (T, &=, and)         \
> +F (T, |=, or)          \
> +F (T, ^=, xor)
> +
> +G (char)
> +G (short)
> +G (int)
> +G (long)
> +
> +/* { dg-final { scan-assembler-not "test\[lq\]" } } */
> +/* { dg-final { scan-assembler-not "\\(%\[re\]di\\), %" } } */
> --
> 2.31.1
>


-- 
BR,
Hongtao

Reply via email to