Hi!

The f{char,short,int,long}minus functions use a separate load, modify and
store sequence instead of a direct memory operation (a regression from 8.3)
on ia32.  The problem is an extra register copy, which regcprop would fix up,
but unfortunately peephole2 runs before regcprop.
Also, in one of the existing peephole2s I've renumbered the operands so that
we don't overwrite existing operands.

Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?

2019-03-29  Jakub Jelinek  <ja...@redhat.com>

        PR rtl-optimization/89865
        * config/i386/i386.md
        (SWI12 peephole for mem {+,-,&,|,^}= x; mem != 0): Fix up operand
        numbers not to clash with the additional operands[4].
        (peepholes for mem {+,-,&,|,^}= x; mem != 0): New peephole2s
        with extra register copy in the middle.

        * gcc.target/i386/pr49095.c: Adjust number of expected RMW spots
        on ia32.

--- gcc/config/i386/i386.md.jj  2019-03-28 23:32:26.466689322 +0100
+++ gcc/config/i386/i386.md     2019-03-29 15:58:05.350731242 +0100
@@ -18795,17 +18795,130 @@ (define_peephole2
                         (GET_CODE (operands[3]) == PLUS
                          || GET_CODE (operands[3]) == MINUS)
                         ? CCGOCmode : CCNOmode)"
-  [(parallel [(set (match_dup 4) (match_dup 6))
-             (set (match_dup 1) (match_dup 5))])]
+  [(parallel [(set (match_dup 5) (match_dup 7))
+             (set (match_dup 1) (match_dup 6))])]
 {
-  operands[4] = SET_DEST (PATTERN (peep2_next_insn (3)));
-  operands[5]
+  operands[5] = SET_DEST (PATTERN (peep2_next_insn (3)));
+  operands[6]
     = gen_rtx_fmt_ee (GET_CODE (operands[3]), <MODE>mode,
                      copy_rtx (operands[1]),
                      gen_lowpart (<MODE>mode, operands[2]));
+  operands[7]
+    = gen_rtx_COMPARE (GET_MODE (operands[5]),
+                      copy_rtx (operands[6]),
+                      const0_rtx);
+})
+
+;; peephole2 comes before regcprop, so deal also with a case that
+;; would be cleaned up by regcprop.
+(define_peephole2
+  [(set (match_operand:SWI 0 "register_operand")
+       (match_operand:SWI 1 "memory_operand"))
+   (parallel [(set (match_dup 0)
+                  (match_operator:SWI 3 "plusminuslogic_operator"
+                    [(match_dup 0)
+                     (match_operand:SWI 2 "<nonmemory_operand>")]))
+             (clobber (reg:CC FLAGS_REG))])
+   (set (match_operand:SWI 4 "register_operand") (match_dup 0))
+   (set (match_dup 1) (match_dup 4))
+   (set (reg FLAGS_REG) (compare (match_dup 4) (const_int 0)))]
+  "(TARGET_READ_MODIFY_WRITE || optimize_insn_for_size_p ())
+   && peep2_reg_dead_p (3, operands[0])
+   && peep2_reg_dead_p (5, operands[4])
+   && !reg_overlap_mentioned_p (operands[0], operands[1])
+   && !reg_overlap_mentioned_p (operands[0], operands[2])
+   && !reg_overlap_mentioned_p (operands[4], operands[1])
+   && (<MODE>mode != QImode
+       || immediate_operand (operands[2], QImode)
+       || any_QIreg_operand (operands[2], QImode))
+   && ix86_match_ccmode (peep2_next_insn (4),
+                        (GET_CODE (operands[3]) == PLUS
+                         || GET_CODE (operands[3]) == MINUS)
+                        ? CCGOCmode : CCNOmode)"
+  [(parallel [(set (match_dup 5) (match_dup 7))
+             (set (match_dup 1) (match_dup 6))])]
+{
+  operands[5] = SET_DEST (PATTERN (peep2_next_insn (4)));
+  operands[6]
+    = gen_rtx_fmt_ee (GET_CODE (operands[3]), GET_MODE (operands[3]),
+                     copy_rtx (operands[1]),
+                     operands[2]);
+  operands[7]
+    = gen_rtx_COMPARE (GET_MODE (operands[5]),
+                      copy_rtx (operands[6]),
+                      const0_rtx);
+})
+
+(define_peephole2
+  [(set (match_operand:SWI12 0 "register_operand")
+       (match_operand:SWI12 1 "memory_operand"))
+   (parallel [(set (match_operand:SI 4 "register_operand")
+                  (match_operator:SI 3 "plusminuslogic_operator"
+                    [(match_dup 4)
+                     (match_operand:SI 2 "nonmemory_operand")]))
+             (clobber (reg:CC FLAGS_REG))])
+   (set (match_operand:SWI12 5 "register_operand") (match_dup 0))
+   (set (match_dup 1) (match_dup 5))
+   (set (reg FLAGS_REG) (compare (match_dup 5) (const_int 0)))]
+  "(TARGET_READ_MODIFY_WRITE || optimize_insn_for_size_p ())
+   && REGNO (operands[0]) == REGNO (operands[4])
+   && peep2_reg_dead_p (3, operands[0])
+   && peep2_reg_dead_p (5, operands[5])
+   && (<MODE>mode != QImode
+       || immediate_operand (operands[2], SImode)
+       || any_QIreg_operand (operands[2], SImode))
+   && !reg_overlap_mentioned_p (operands[0], operands[1])
+   && !reg_overlap_mentioned_p (operands[0], operands[2])
+   && !reg_overlap_mentioned_p (operands[5], operands[1])
+   && ix86_match_ccmode (peep2_next_insn (4),
+                        (GET_CODE (operands[3]) == PLUS
+                         || GET_CODE (operands[3]) == MINUS)
+                        ? CCGOCmode : CCNOmode)"
+  [(parallel [(set (match_dup 6) (match_dup 8))
+             (set (match_dup 1) (match_dup 7))])]
+{
+  operands[6] = SET_DEST (PATTERN (peep2_next_insn (4)));
+  operands[7]
+    = gen_rtx_fmt_ee (GET_CODE (operands[3]), <MODE>mode,
+                     copy_rtx (operands[1]),
+                     gen_lowpart (<MODE>mode, operands[2]));
+  operands[8]
+    = gen_rtx_COMPARE (GET_MODE (operands[6]),
+                      copy_rtx (operands[7]),
+                      const0_rtx);
+})
+
+;; Likewise for cmpelim optimized pattern.
+(define_peephole2
+  [(set (match_operand:SWI 0 "register_operand")
+       (match_operand:SWI 1 "memory_operand"))
+   (parallel [(set (reg FLAGS_REG)
+                  (compare (match_operator:SWI 3 "plusminuslogic_operator"
+                             [(match_dup 0)
+                              (match_operand:SWI 2 "<nonmemory_operand>")])
+                           (const_int 0)))
+             (set (match_dup 0) (match_dup 3))])
+   (set (match_operand:SWI 4 "register_operand") (match_dup 0))
+   (set (match_dup 1) (match_dup 4))]
+  "(TARGET_READ_MODIFY_WRITE || optimize_insn_for_size_p ())
+   && peep2_reg_dead_p (3, operands[0])
+   && peep2_reg_dead_p (4, operands[4])
+   && !reg_overlap_mentioned_p (operands[0], operands[1])
+   && !reg_overlap_mentioned_p (operands[0], operands[2])
+   && !reg_overlap_mentioned_p (operands[4], operands[1])
+   && ix86_match_ccmode (peep2_next_insn (1),
+                        (GET_CODE (operands[3]) == PLUS
+                         || GET_CODE (operands[3]) == MINUS)
+                        ? CCGOCmode : CCNOmode)"
+  [(parallel [(set (match_dup 5) (match_dup 7))
+             (set (match_dup 1) (match_dup 6))])]
+{
+  operands[5] = SET_DEST (XVECEXP (PATTERN (peep2_next_insn (1)), 0, 0));
   operands[6]
-    = gen_rtx_COMPARE (GET_MODE (operands[4]),
-                      copy_rtx (operands[5]),
+    = gen_rtx_fmt_ee (GET_CODE (operands[3]), GET_MODE (operands[3]),
+                     copy_rtx (operands[1]), operands[2]);
+  operands[7]
+    = gen_rtx_COMPARE (GET_MODE (operands[5]), copy_rtx (operands[6]),
                       const0_rtx);
 })
 
--- gcc/testsuite/gcc.target/i386/pr49095.c.jj  2019-03-29 13:11:54.941597147 +0100
+++ gcc/testsuite/gcc.target/i386/pr49095.c     2019-03-29 16:00:58.526926046 +0100
@@ -73,5 +73,5 @@ G (long)
 /* { dg-final { scan-assembler-not "test\[lq\]" } } */
 /* The {f,h}{char,short,int,long}xor functions aren't optimized into
    a RMW instruction, so need load, modify and store.  FIXME eventually.  */
-/* { dg-final { scan-assembler-times "\\(%eax\\), %" 12 { target { ia32 } } } } */
+/* { dg-final { scan-assembler-times "\\(%eax\\), %" 8 { target { ia32 } } } } */
 /* { dg-final { scan-assembler-times "\\(%\[re\]di\\), %" 8 { target { ! ia32 } } } } */

        Jakub

Reply via email to