On Thu, May 29, 2025 at 4:56 PM Hu, Lin1 <lin1...@intel.com> wrote: > > Hi, > > The patch aims to optimize > movb (%rdi), %al > movq %rdi, %rbx > xorl %esi, %eax, %edx > movb %dl, (%rdi) > cmpb %sil, %al > jne > to > xorb %sil, (%rdi) > movq %rdi, %rbx > jne > > Reduce 2 mov and 1 cmp instructions. > > Due to APX NDD allowing the dest register and source register to be different, > some original peephole2 are invalid. Add new peephole2 patterns for APX NDD. > > Bootstrapped and regtested on x86_64-pc-linux-gnu, OK for trunk? Ok. > > BRs, > Lin > > gcc/ChangeLog: > > * config/i386/i386.md (define_peephole2): Define some new peephole2 > for > APX NDD. > > gcc/testsuite/ChangeLog: > > * gcc.target/i386/pr49095-2.c: New test. > --- > gcc/config/i386/i386.md | 135 ++++++++++++++++++++++ > gcc/testsuite/gcc.target/i386/pr49095-2.c | 73 ++++++++++++ > 2 files changed, 208 insertions(+) > create mode 100644 gcc/testsuite/gcc.target/i386/pr49095-2.c > > diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md > index b7a18d583da..398cdf447b3 100644 > --- a/gcc/config/i386/i386.md > +++ b/gcc/config/i386/i386.md > @@ -28212,6 +28212,41 @@ (define_peephole2 > const0_rtx); > }) > > +;; For APX NDD PLUS/MINUS/LOGIC > +;; Like cmpelim optimized pattern. 
> +;; Reduce an extra mov instruction like > +;; decl (%rdi), %eax > +;; mov %eax, (%rdi) > +;; to > +;; decl (%rdi) > +(define_peephole2 > + [(parallel [(set (reg FLAGS_REG) > + (compare (match_operator:SWI 2 "plusminuslogic_operator" > + [(match_operand:SWI 0 "memory_operand") > + (match_operand:SWI 1 "<nonmemory_operand>")]) > + (const_int 0))) > + (set (match_operand:SWI 3 "register_operand") (match_dup 2))]) > + (set (match_dup 0) (match_dup 3))] > + "TARGET_APX_NDD > + && (TARGET_READ_MODIFY_WRITE || optimize_insn_for_size_p ()) > + && peep2_reg_dead_p (2, operands[3]) > + && !reg_overlap_mentioned_p (operands[3], operands[0]) > + && ix86_match_ccmode (peep2_next_insn (0), > + (GET_CODE (operands[2]) == PLUS > + || GET_CODE (operands[2]) == MINUS) > + ? CCGOCmode : CCNOmode)" > + [(parallel [(set (match_dup 4) (match_dup 6)) > + (set (match_dup 0) (match_dup 5))])] > +{ > + operands[4] = SET_DEST (XVECEXP (PATTERN (peep2_next_insn (0)), 0, 0)); > + operands[5] > + = gen_rtx_fmt_ee (GET_CODE (operands[2]), GET_MODE (operands[2]), > + copy_rtx (operands[0]), operands[1]); > + operands[6] > + = gen_rtx_COMPARE (GET_MODE (operands[4]), copy_rtx (operands[5]), > + const0_rtx); > +}) > + > ;; Likewise for instances where we have a lea pattern. > (define_peephole2 > [(set (match_operand:SWI 0 "register_operand") > @@ -28500,6 +28535,54 @@ (define_peephole2 > const0_rtx); > }) > > +;; For APX NDD XOR > +;; Reduce 2 mov and 1 cmp instruction. 
> +;; from > +;; movq (%rdi), %rax > +;; xorq %rsi, %rax, %rdx > +;; movq %rdx, (%rdi) > +;; cmpq %rsi, %rax > +;; jne > +;; to > +;; xorq %rsi, (%rdi) > +;; jne > +(define_peephole2 > + [(set (match_operand:SWI 0 "register_operand") > + (match_operand:SWI 1 "memory_operand")) > + (parallel [(set (match_operand:SWI 4 "register_operand") > + (xor:SWI (match_operand:SWI 3 "register_operand") > + (match_operand:SWI 2 "<nonmemory_operand>"))) > + (clobber (reg:CC FLAGS_REG))]) > + (set (match_dup 1) (match_dup 4)) > + (set (reg:CCZ FLAGS_REG) > + (compare:CCZ (match_operand:SWI 5 "register_operand") > + (match_operand:SWI 6 "<nonmemory_operand>")))] > + "TARGET_APX_NDD > + && (TARGET_READ_MODIFY_WRITE || optimize_insn_for_size_p ()) > + && REGNO (operands[3]) == REGNO (operands[0]) > + && (rtx_equal_p (operands[0], operands[5]) > + ? rtx_equal_p (operands[2], operands[6]) > + : rtx_equal_p (operands[2], operands[5]) > + && rtx_equal_p (operands[0], operands[6])) > + && peep2_reg_dead_p (3, operands[4]) > + && peep2_reg_dead_p (4, operands[0]) > + && !reg_overlap_mentioned_p (operands[0], operands[1]) > + && !reg_overlap_mentioned_p (operands[0], operands[2]) > + && (<MODE>mode != QImode > + || immediate_operand (operands[2], QImode) > + || any_QIreg_operand (operands[2], QImode))" > + [(parallel [(set (match_dup 7) (match_dup 9)) > + (set (match_dup 1) (match_dup 8))])] > +{ > + operands[7] = SET_DEST (PATTERN (peep2_next_insn (3))); > + operands[8] = gen_rtx_XOR (<MODE>mode, copy_rtx (operands[1]), > + operands[2]); > + operands[9] > + = gen_rtx_COMPARE (GET_MODE (operands[7]), > + copy_rtx (operands[8]), > + const0_rtx); > +}) > + > (define_peephole2 > [(set (match_operand:SWI12 0 "register_operand") > (match_operand:SWI12 1 "memory_operand")) > @@ -28548,6 +28631,58 @@ (define_peephole2 > const0_rtx); > }) > > +;; For APX NDD XOR > +;; Reduce 2 mov and 1 cmp instruction. 
> +;; from > +;; movb (%rdi), %al > +;; xorl %esi, %eax, %edx > +;; movb %dl, (%rdi) > +;; cmpb %sil, %al > +;; jne > +;; to > +;; xorb %sil, (%rdi) > +;; jne > +(define_peephole2 > + [(set (match_operand:SWI12 0 "register_operand") > + (match_operand:SWI12 1 "memory_operand")) > + (parallel [(set (match_operand:SI 4 "register_operand") > + (xor:SI (match_operand:SI 3 "register_operand") > + (match_operand:SI 2 "<nonmemory_operand>"))) > + (clobber (reg:CC FLAGS_REG))]) > + (set (match_dup 1) (match_operand:SWI12 5 "register_operand")) > + (set (reg:CCZ FLAGS_REG) > + (compare:CCZ (match_operand:SWI12 6 "register_operand") > + (match_operand:SWI12 7 "<nonmemory_operand>")))] > + "TARGET_APX_NDD > + && (TARGET_READ_MODIFY_WRITE || optimize_insn_for_size_p ()) > + && REGNO (operands[3]) == REGNO (operands[0]) > + && REGNO (operands[5]) == REGNO (operands[4]) > + && (rtx_equal_p (operands[0], operands[6]) > + ? (REG_P (operands[2]) > + ? REG_P (operands[7]) && REGNO (operands[2]) == REGNO (operands[7]) > + : rtx_equal_p (operands[2], operands[7])) > + : (rtx_equal_p (operands[0], operands[7]) > + && REG_P (operands[2]) > + && REGNO (operands[2]) == REGNO (operands[6]))) > + && peep2_reg_dead_p (3, operands[5]) > + && peep2_reg_dead_p (4, operands[0]) > + && !reg_overlap_mentioned_p (operands[0], operands[1]) > + && !reg_overlap_mentioned_p (operands[0], operands[2]) > + && (<MODE>mode != QImode > + || immediate_operand (operands[2], SImode) > + || any_QIreg_operand (operands[2], SImode))" > + [(parallel [(set (match_dup 8) (match_dup 10)) > + (set (match_dup 1) (match_dup 9))])] > +{ > + operands[8] = SET_DEST (PATTERN (peep2_next_insn (3))); > + operands[9] = gen_rtx_XOR (<MODE>mode, copy_rtx (operands[1]), > + gen_lowpart (<MODE>mode, operands[2])); > + operands[10] > + = gen_rtx_COMPARE (GET_MODE (operands[8]), > + copy_rtx (operands[9]), > + const0_rtx); > +}) > + > ;; Attempt to optimize away memory stores of values the memory already > ;; has. See PR79593. 
> (define_peephole2 > diff --git a/gcc/testsuite/gcc.target/i386/pr49095-2.c > b/gcc/testsuite/gcc.target/i386/pr49095-2.c > new file mode 100644 > index 00000000000..25bc6b79a43 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr49095-2.c > @@ -0,0 +1,73 @@ > +/* PR rtl-optimization/49095 */ > +/* { dg-do compile { target { ! ia32 } } } */ > +/* { dg-options "-Os -fno-shrink-wrap -masm=att -mapxf" } */ > + > +void foo (void *); > + > +int * > +f1 (int *x) > +{ > + if (!--*x) > + foo (x); > + return x; > +} > + > +int > +g1 (int x) > +{ > + if (!--x) > + foo ((void *) 0); > + return x; > +} > + > +#define F(T, OP, OPN) \ > +T * \ > +f##T##OPN (T *x, T y) \ > +{ \ > + *x OP y; \ > + if (!*x) \ > + foo (x); \ > + return x; \ > +} \ > + \ > +T \ > +g##T##OPN (T x, T y) \ > +{ \ > + x OP y; \ > + if (!x) \ > + foo ((void *) 0); \ > + return x; \ > +} \ > + \ > +T * \ > +h##T##OPN (T *x) \ > +{ \ > + *x OP 24; \ > + if (!*x) \ > + foo (x); \ > + return x; \ > +} \ > + \ > +T \ > +i##T##OPN (T x, T y) \ > +{ \ > + x OP 24; \ > + if (!x) \ > + foo ((void *) 0); \ > + return x; \ > +} > + > +#define G(T) \ > +F (T, +=, plus) \ > +F (T, -=, minus) \ > +F (T, &=, and) \ > +F (T, |=, or) \ > +F (T, ^=, xor) > + > +G (char) > +G (short) > +G (int) > +G (long) > + > +/* { dg-final { scan-assembler-not "test\[lq\]" } } */ > +/* { dg-final { scan-assembler-not "\\(%\[re\]di\\), %" } } */ > -- > 2.31.1 >
-- BR, Hongtao