Hi! The following patch improves the last 8 cases (both -m64 and ia32) that were using a RMW cycle; for xor we actually emit "new = old ^ other; new != old" rather than "new = old ^ other; new != 0", and thus the peephole2 needs to recognize that too.
Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk? 2019-03-29 Jakub Jelinek <ja...@redhat.com> PR rtl-optimization/89865 * config/i386/i386.md: Add peepholes for z = x; x ^= y; x != z. * gcc.target/i386/pr49095.c: Don't expect any RMW sequences. --- gcc/config/i386/i386.md.jj 2019-03-29 15:58:05.350731242 +0100 +++ gcc/config/i386/i386.md 2019-03-29 18:43:45.251208879 +0100 @@ -18922,6 +18922,100 @@ (define_peephole2 const0_rtx); }) +;; Special cases for xor, where (x ^= y) != 0 is (misoptimized) +;; into x = z; x ^= y; x != z +(define_peephole2 + [(set (match_operand:SWI 0 "register_operand") + (match_operand:SWI 1 "memory_operand")) + (set (match_operand:SWI 3 "register_operand") (match_dup 0)) + (parallel [(set (match_operand:SWI 4 "register_operand") + (xor:SWI (match_dup 4) + (match_operand:SWI 2 "<nonmemory_operand>"))) + (clobber (reg:CC FLAGS_REG))]) + (set (match_dup 1) (match_dup 4)) + (set (reg:CCZ FLAGS_REG) + (compare:CCZ (match_operand:SWI 5 "register_operand") + (match_operand:SWI 6 "<nonmemory_operand>")))] + "(TARGET_READ_MODIFY_WRITE || optimize_insn_for_size_p ()) + && (REGNO (operands[4]) == REGNO (operands[0]) + || REGNO (operands[4]) == REGNO (operands[3])) + && (rtx_equal_p (operands[REGNO (operands[4]) == REGNO (operands[0]) + ? 3 : 0], operands[5]) + ? rtx_equal_p (operands[2], operands[6]) + : rtx_equal_p (operands[2], operands[5]) + && rtx_equal_p (operands[REGNO (operands[4]) == REGNO (operands[0]) + ? 3 : 0], operands[6])) + && peep2_reg_dead_p (4, operands[4]) + && peep2_reg_dead_p (5, operands[REGNO (operands[4]) == REGNO (operands[0]) + ? 
3 : 0]) + && !reg_overlap_mentioned_p (operands[0], operands[1]) + && !reg_overlap_mentioned_p (operands[0], operands[2]) + && !reg_overlap_mentioned_p (operands[3], operands[0]) + && !reg_overlap_mentioned_p (operands[3], operands[1]) + && !reg_overlap_mentioned_p (operands[3], operands[2]) + && (<MODE>mode != QImode + || immediate_operand (operands[2], QImode) + || any_QIreg_operand (operands[2], QImode))" + [(parallel [(set (match_dup 7) (match_dup 9)) + (set (match_dup 1) (match_dup 8))])] +{ + operands[7] = SET_DEST (PATTERN (peep2_next_insn (4))); + operands[8] = gen_rtx_XOR (<MODE>mode, copy_rtx (operands[1]), + operands[2]); + operands[9] + = gen_rtx_COMPARE (GET_MODE (operands[7]), + copy_rtx (operands[8]), + const0_rtx); +}) + +(define_peephole2 + [(set (match_operand:SWI12 0 "register_operand") + (match_operand:SWI12 1 "memory_operand")) + (set (match_operand:SWI12 3 "register_operand") (match_dup 0)) + (parallel [(set (match_operand:SI 4 "register_operand") + (xor:SI (match_dup 4) + (match_operand:SI 2 "<nonmemory_operand>"))) + (clobber (reg:CC FLAGS_REG))]) + (set (match_dup 1) (match_operand:SWI12 5 "register_operand")) + (set (reg:CCZ FLAGS_REG) + (compare:CCZ (match_operand:SWI12 6 "register_operand") + (match_operand:SWI12 7 "<nonmemory_operand>")))] + "(TARGET_READ_MODIFY_WRITE || optimize_insn_for_size_p ()) + && (REGNO (operands[5]) == REGNO (operands[0]) + || REGNO (operands[5]) == REGNO (operands[3])) + && REGNO (operands[5]) == REGNO (operands[4]) + && (rtx_equal_p (operands[REGNO (operands[5]) == REGNO (operands[0]) + ? 3 : 0], operands[6]) + ? (REG_P (operands[2]) + ? REG_P (operands[7]) && REGNO (operands[2]) == REGNO (operands[7]) + : rtx_equal_p (operands[2], operands[7])) + : (rtx_equal_p (operands[REGNO (operands[5]) == REGNO (operands[0]) + ? 
3 : 0], operands[7]) + && REG_P (operands[2]) + && REGNO (operands[2]) == REGNO (operands[6]))) + && peep2_reg_dead_p (4, operands[5]) + && peep2_reg_dead_p (5, operands[REGNO (operands[5]) == REGNO (operands[0]) + ? 3 : 0]) + && !reg_overlap_mentioned_p (operands[0], operands[1]) + && !reg_overlap_mentioned_p (operands[0], operands[2]) + && !reg_overlap_mentioned_p (operands[3], operands[0]) + && !reg_overlap_mentioned_p (operands[3], operands[1]) + && !reg_overlap_mentioned_p (operands[3], operands[2]) + && (<MODE>mode != QImode + || immediate_operand (operands[2], SImode) + || any_QIreg_operand (operands[2], SImode))" + [(parallel [(set (match_dup 8) (match_dup 10)) + (set (match_dup 1) (match_dup 9))])] +{ + operands[8] = SET_DEST (PATTERN (peep2_next_insn (4))); + operands[9] = gen_rtx_XOR (<MODE>mode, copy_rtx (operands[1]), + gen_lowpart (<MODE>mode, operands[2])); + operands[10] + = gen_rtx_COMPARE (GET_MODE (operands[8]), + copy_rtx (operands[9]), + const0_rtx); +}) + ;; Attempt to optimize away memory stores of values the memory already ;; has. See PR79593. (define_peephole2 --- gcc/testsuite/gcc.target/i386/pr49095.c.jj 2019-03-29 16:00:58.526926046 +0100 +++ gcc/testsuite/gcc.target/i386/pr49095.c 2019-03-29 17:33:38.749010111 +0100 @@ -71,7 +71,5 @@ G (int) G (long) /* { dg-final { scan-assembler-not "test\[lq\]" } } */ -/* The {f,h}{char,short,int,long}xor functions aren't optimized into - a RMW instruction, so need load, modify and store. FIXME eventually. */ -/* { dg-final { scan-assembler-times "\\(%eax\\), %" 8 { target { ia32 } } } } */ -/* { dg-final { scan-assembler-times "\\(%\[re\]di\\), %" 8 { target { ! ia32 } } } } */ +/* { dg-final { scan-assembler-not "\\(%eax\\), %" { target { ia32 } } } } */ +/* { dg-final { scan-assembler-not "\\(%\[re\]di\\), %" { target { ! ia32 } } } } */ Jakub