Hi!

This patch adds a peephole2 for the optimization requested in the PR:
we currently emit awful code for __atomic_sub_fetch (x, y, z) == 0
or __atomic_sub_fetch (x, y, z) != 0 when y is not constant.
The optimization can't be done in the combiner, which punts on combining
an UNSPEC_VOLATILE into other insns.
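
To illustrate, for the bar function from the new testcase below, trunk
currently emits something like the following (exact register allocation
may differ):

	negl	%esi
	movl	%esi, %eax
	lock xaddl	%eax, b(%rip)
	addl	%esi, %eax	# ZF set iff the new value of b is zero
	sete	%al

while with the peephole2 we get just:

	lock subl	%esi, b(%rip)	# lock sub sets ZF from the result
	sete	%al

This is correct because lock sub computes b -= x and sets ZF exactly when
the result, i.e. the value __atomic_sub_fetch returns, is zero.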

For other ops we'd need different peephole2s; this one is specific to the
exact comparison instruction and negation that have to be matched.
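
For instance, something like the following (a hypothetical further
candidate, not handled by this patch, reusing b from the testcase) would
need its own peephole2, because __atomic_and_fetch with a non-constant
mask is expanded as a compare-and-swap loop rather than the neg/xadd
sequence matched here:

	int
	andf (int x)	/* Hypothetical example, not optimized by this patch.  */
	{
	  return __atomic_and_fetch (&b, x, __ATOMIC_RELEASE) == 0;
	}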

Bootstrapped/regtested on x86_64-linux and i686-linux.  Is this ok for trunk
(as an exception), or for GCC 12?

2021-01-27  Jakub Jelinek  <ja...@redhat.com>

        PR target/98737
        * config/i386/sync.md (neg; mov; lock xadd; add peephole2): New
        define_peephole2.
        (*atomic_fetch_sub_cmp<mode>): New define_insn.

        * gcc.target/i386/pr98737.c: New test.

--- gcc/config/i386/sync.md.jj  2021-01-04 10:25:45.392159555 +0100
+++ gcc/config/i386/sync.md     2021-01-26 16:03:13.911100510 +0100
@@ -777,6 +777,63 @@ (define_insn "*atomic_fetch_add_cmp<mode
   return "lock{%;} %K3add{<imodesuffix>}\t{%1, %0|%0, %1}";
 })
 
+;; Similarly, peephole2 to optimize __atomic_sub_fetch (x, y, z) == 0 into
+;; just lock sub followed by testing of flags, instead of lock xadd,
+;; negation and comparison.
+(define_peephole2
+  [(parallel [(set (match_operand 0 "register_operand")
+                  (neg (match_dup 0)))
+             (clobber (reg:CC FLAGS_REG))])
+   (set (match_operand:SWI 1 "register_operand")
+       (match_operand:SWI 2 "register_operand"))
+   (parallel [(set (match_operand:SWI 3 "register_operand")
+                  (unspec_volatile:SWI
+                    [(match_operand:SWI 4 "memory_operand")
+                     (match_operand:SI 5 "const_int_operand")]
+                    UNSPECV_XCHG))
+             (set (match_dup 4)
+                  (plus:SWI (match_dup 4)
+                            (match_dup 3)))
+             (clobber (reg:CC FLAGS_REG))])
+   (parallel [(set (reg:CCZ FLAGS_REG)
+                  (compare:CCZ (neg:SWI
+                                 (match_operand:SWI 6 "register_operand"))
+                               (match_dup 3)))
+             (clobber (match_dup 3))])]
+  "(GET_MODE (operands[0]) == <LEAMODE>mode
+    || GET_MODE (operands[0]) == <MODE>mode)
+   && reg_or_subregno (operands[0]) == reg_or_subregno (operands[2])
+   && (rtx_equal_p (operands[2], operands[3])
+       ? rtx_equal_p (operands[1], operands[6])
+       : (rtx_equal_p (operands[2], operands[6])
+         && rtx_equal_p (operands[1], operands[3])))
+   && peep2_reg_dead_p (4, operands[6])
+   && peep2_reg_dead_p (4, operands[3])
+   && !reg_overlap_mentioned_p (operands[1], operands[4])
+   && !reg_overlap_mentioned_p (operands[2], operands[4])"
+  [(parallel [(set (reg:CCZ FLAGS_REG)
+                  (compare:CCZ
+                    (unspec_volatile:SWI [(match_dup 4) (match_dup 5)]
+                                         UNSPECV_XCHG)
+                    (match_dup 2)))
+             (set (match_dup 4)
+                  (minus:SWI (match_dup 4)
+                             (match_dup 2)))])])
+
+(define_insn "*atomic_fetch_sub_cmp<mode>"
+  [(set (reg:CCZ FLAGS_REG)
+       (compare:CCZ
+         (unspec_volatile:SWI
+           [(match_operand:SWI 0 "memory_operand" "+m")
+            (match_operand:SI 2 "const_int_operand")]          ;; model
+           UNSPECV_XCHG)
+         (match_operand:SWI 1 "register_operand" "r")))
+   (set (match_dup 0)
+       (minus:SWI (match_dup 0)
+                  (match_dup 1)))]
+  ""
+  "lock{%;} %K2sub{<imodesuffix>}\t{%1, %0|%0, %1}")
+
 ;; Recall that xchg implicitly sets LOCK#, so adding it again wastes space.
 ;; In addition, it is always a full barrier, so we can ignore the memory model.
 (define_insn "atomic_exchange<mode>"
--- gcc/testsuite/gcc.target/i386/pr98737.c.jj  2021-01-26 15:59:24.640620178 +0100
+++ gcc/testsuite/gcc.target/i386/pr98737.c     2021-01-26 16:00:02.898205888 +0100
@@ -0,0 +1,38 @@
+/* PR target/98737 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -masm=att" } */
+/* { dg-additional-options "-march=i686" { target ia32 } } */
+/* { dg-final { scan-assembler "lock\[^\n\r]\*subq\t" { target lp64 } } } */
+/* { dg-final { scan-assembler "lock\[^\n\r]\*subl\t" } } */
+/* { dg-final { scan-assembler "lock\[^\n\r]\*subw\t" } } */
+/* { dg-final { scan-assembler "lock\[^\n\r]\*subb\t" } } */
+/* { dg-final { scan-assembler-not "lock\[^\n\r]\*xadd" } } */
+
+long a;
+int b;
+short c;
+char d;
+
+int
+foo (long x)
+{
+  return __atomic_sub_fetch (&a, x, __ATOMIC_RELEASE) == 0;
+}
+
+int
+bar (int x)
+{
+  return __atomic_sub_fetch (&b, x, __ATOMIC_RELEASE) == 0;
+}
+
+int
+baz (short x)
+{
+  return __atomic_sub_fetch (&c, x, __ATOMIC_RELEASE) == 0;
+}
+
+int
+qux (char x)
+{
+  return __atomic_sub_fetch (&d, x, __ATOMIC_RELEASE) == 0;
+}

        Jakub
