Hi, The patch aims to optimize movb (%rdi), %al movq %rdi, %rbx xorl %esi, %eax, %edx movb %dl, (%rdi) cmpb %sil, %al jne to xorb %sil, (%rdi) movq %rdi, %rbx jne
Reduce 2 mov and 1 cmp instructions. Due to APX NDD allowing the dest register and source register to be different, some original peephole2 are invalid. Add new peephole2 patterns for APX NDD. Bootstrapped and regtested on x86_64-linux-pc-gnu, OK for trunk? BRs, Lin gcc/ChangeLog: * config/i386/i386.md (define_peephole2): Define some new peephole2 for APX NDD. gcc/testsuite/ChangeLog: * gcc.target/i386/pr49095-2.c: New test. --- gcc/config/i386/i386.md | 135 ++++++++++++++++++++++ gcc/testsuite/gcc.target/i386/pr49095-2.c | 73 ++++++++++++ 2 files changed, 208 insertions(+) create mode 100644 gcc/testsuite/gcc.target/i386/pr49095-2.c diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index b7a18d583da..398cdf447b3 100644 --- a/gcc/config/i386/i386.md +++ b/gcc/config/i386/i386.md @@ -28212,6 +28212,41 @@ (define_peephole2 const0_rtx); }) +;; For APX NDD PLUS/MINUS/LOGIC +;; Like cmpelim optimized pattern. +;; Reduce an extra mov instruction like +;; decl (%rdi), %eax +;; mov %eax, (%rdi) +;; to +;; decl (%rdi) +(define_peephole2 + [(parallel [(set (reg FLAGS_REG) + (compare (match_operator:SWI 2 "plusminuslogic_operator" + [(match_operand:SWI 0 "memory_operand") + (match_operand:SWI 1 "<nonmemory_operand>")]) + (const_int 0))) + (set (match_operand:SWI 3 "register_operand") (match_dup 2))]) + (set (match_dup 0) (match_dup 3))] + "TARGET_APX_NDD + && (TARGET_READ_MODIFY_WRITE || optimize_insn_for_size_p ()) + && peep2_reg_dead_p (2, operands[3]) + && !reg_overlap_mentioned_p (operands[3], operands[0]) + && ix86_match_ccmode (peep2_next_insn (0), + (GET_CODE (operands[2]) == PLUS + || GET_CODE (operands[2]) == MINUS) + ? CCGOCmode : CCNOmode)" + [(parallel [(set (match_dup 4) (match_dup 6)) + (set (match_dup 0) (match_dup 5))])] +{ + operands[4] = SET_DEST (XVECEXP (PATTERN (peep2_next_insn (0)), 0, 0)); + operands[5] + = gen_rtx_fmt_ee (GET_CODE (operands[2]), GET_MODE (operands[2]), + copy_rtx (operands[0]), operands[1]); + operands[6] + = gen_rtx_COMPARE (GET_MODE (operands[4]), copy_rtx (operands[5]), + const0_rtx); +}) + ;; Likewise for instances where we have a lea pattern. (define_peephole2 [(set (match_operand:SWI 0 "register_operand") @@ -28500,6 +28535,54 @@ (define_peephole2 const0_rtx); }) +;; For APX NDD XOR +;; Reduce 2 mov and 1 cmp instruction. +;; from +;; movq (%rdi), %rax +;; xorq %rsi, %rax, %rdx +;; movb %rdx, (%rdi) +;; cmpb %rsi, %rax +;; jne +;; to +;; xorb %rsi, (%rdi) +;; jne +(define_peephole2 + [(set (match_operand:SWI 0 "register_operand") + (match_operand:SWI 1 "memory_operand")) + (parallel [(set (match_operand:SWI 4 "register_operand") + (xor:SWI (match_operand:SWI 3 "register_operand") + (match_operand:SWI 2 "<nonmemory_operand>"))) + (clobber (reg:CC FLAGS_REG))]) + (set (match_dup 1) (match_dup 4)) + (set (reg:CCZ FLAGS_REG) + (compare:CCZ (match_operand:SWI 5 "register_operand") + (match_operand:SWI 6 "<nonmemory_operand>")))] + "TARGET_APX_NDD + && (TARGET_READ_MODIFY_WRITE || optimize_insn_for_size_p ()) + && REGNO (operands[3]) == REGNO (operands[0]) + && (rtx_equal_p (operands[0], operands[5]) + ? rtx_equal_p (operands[2], operands[6]) + : rtx_equal_p (operands[2], operands[5]) + && rtx_equal_p (operands[0], operands[6])) + && peep2_reg_dead_p (3, operands[4]) + && peep2_reg_dead_p (4, operands[0]) + && !reg_overlap_mentioned_p (operands[0], operands[1]) + && !reg_overlap_mentioned_p (operands[0], operands[2]) + && (<MODE>mode != QImode + || immediate_operand (operands[2], QImode) + || any_QIreg_operand (operands[2], QImode))" + [(parallel [(set (match_dup 7) (match_dup 9)) + (set (match_dup 1) (match_dup 8))])] +{ + operands[7] = SET_DEST (PATTERN (peep2_next_insn (3))); + operands[8] = gen_rtx_XOR (<MODE>mode, copy_rtx (operands[1]), + operands[2]); + operands[9] + = gen_rtx_COMPARE (GET_MODE (operands[7]), + copy_rtx (operands[8]), + const0_rtx); +}) + (define_peephole2 [(set (match_operand:SWI12 0 "register_operand") (match_operand:SWI12 1 "memory_operand")) @@ -28548,6 +28631,58 @@ (define_peephole2 const0_rtx); }) +;; For APX NDD XOR +;; Reduce 2 mov and 1 cmp instruction. +;; from +;; movb (%rdi), %al +;; xorl %esi, %eax, %edx +;; movb %dl, (%rdi) +;; cmpb %sil, %al +;; jne +;; to +;; xorl %sil, (%rdi) +;; jne +(define_peephole2 + [(set (match_operand:SWI12 0 "register_operand") + (match_operand:SWI12 1 "memory_operand")) + (parallel [(set (match_operand:SI 4 "register_operand") + (xor:SI (match_operand:SI 3 "register_operand") + (match_operand:SI 2 "<nonmemory_operand>"))) + (clobber (reg:CC FLAGS_REG))]) + (set (match_dup 1) (match_operand:SWI12 5 "register_operand")) + (set (reg:CCZ FLAGS_REG) + (compare:CCZ (match_operand:SWI12 6 "register_operand") + (match_operand:SWI12 7 "<nonmemory_operand>")))] + "TARGET_APX_NDD + && (TARGET_READ_MODIFY_WRITE || optimize_insn_for_size_p ()) + && REGNO (operands[3]) == REGNO (operands[0]) + && REGNO (operands[5]) == REGNO (operands[4]) + && (rtx_equal_p (operands[0], operands[6]) + ? (REG_P (operands[2]) + ? REG_P (operands[7]) && REGNO (operands[2]) == REGNO (operands[7]) + : rtx_equal_p (operands[2], operands[7])) + : (rtx_equal_p (operands[0], operands[7]) + && REG_P (operands[2]) + && REGNO (operands[2]) == REGNO (operands[6]))) + && peep2_reg_dead_p (3, operands[5]) + && peep2_reg_dead_p (4, operands[0]) + && !reg_overlap_mentioned_p (operands[0], operands[1]) + && !reg_overlap_mentioned_p (operands[0], operands[2]) + && (<MODE>mode != QImode + || immediate_operand (operands[2], SImode) + || any_QIreg_operand (operands[2], SImode))" + [(parallel [(set (match_dup 8) (match_dup 10)) + (set (match_dup 1) (match_dup 9))])] +{ + operands[8] = SET_DEST (PATTERN (peep2_next_insn (3))); + operands[9] = gen_rtx_XOR (<MODE>mode, copy_rtx (operands[1]), + gen_lowpart (<MODE>mode, operands[2])); + operands[10] + = gen_rtx_COMPARE (GET_MODE (operands[8]), + copy_rtx (operands[9]), + const0_rtx); +}) + ;; Attempt to optimize away memory stores of values the memory already ;; has. See PR79593. (define_peephole2 diff --git a/gcc/testsuite/gcc.target/i386/pr49095-2.c b/gcc/testsuite/gcc.target/i386/pr49095-2.c new file mode 100644 index 00000000000..25bc6b79a43 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr49095-2.c @@ -0,0 +1,73 @@ +/* PR rtl-optimization/49095 */ +/* { dg-do compile { target { ! ia32 } } } */ +/* { dg-options "-Os -fno-shrink-wrap -masm=att -mapxf" } */ + +void foo (void *); + +int * +f1 (int *x) +{ + if (!--*x) + foo (x); + return x; +} + +int +g1 (int x) +{ + if (!--x) + foo ((void *) 0); + return x; +} + +#define F(T, OP, OPN) \ +T * \ +f##T##OPN (T *x, T y) \ +{ \ + *x OP y; \ + if (!*x) \ + foo (x); \ + return x; \ +} \ + \ +T \ +g##T##OPN (T x, T y) \ +{ \ + x OP y; \ + if (!x) \ + foo ((void *) 0); \ + return x; \ +} \ + \ +T * \ +h##T##OPN (T *x) \ +{ \ + *x OP 24; \ + if (!*x) \ + foo (x); \ + return x; \ +} \ + \ +T \ +i##T##OPN (T x, T y) \ +{ \ + x OP 24; \ + if (!x) \ + foo ((void *) 0); \ + return x; \ +} + +#define G(T) \ +F (T, +=, plus) \ +F (T, -=, minus) \ +F (T, &=, and) \ +F (T, |=, or) \ +F (T, ^=, xor) + +G (char) +G (short) +G (int) +G (long) + +/* { dg-final { scan-assembler-not "test\[lq\]" } } */ +/* { dg-final { scan-assembler-not "\\(%\[re\]di\\), %" } } */ -- 2.31.1