Hi!

This patch adds a peephole2 for the optimization requested in the PR, namely that we emit awful code for __atomic_sub_fetch (x, y, z) == 0 or __atomic_sub_fetch (x, y, z) != 0 when y is not constant. This can't be done in the combiner, which punts on combining UNSPEC_VOLATILE patterns into other insns. Other ops would need different peephole2s; this one is specific to subtraction because of the negation and the comparison instruction that need to be matched.
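To illustrate what this buys us, take the first function from the new testcase:

  int
  foo (long x)
  {
    return __atomic_sub_fetch (&a, x, __ATOMIC_RELEASE) == 0;
  }

Before the patch we emit a sequence roughly of this shape (the register choices below are made up for exposition, not necessarily what the compiler picks):

	movq	%rdi, %rax
	negq	%rax
	movq	%rax, %rdx
	lock xaddq	%rax, a(%rip)
	negq	%rdx
	cmpq	%rax, %rdx
	sete	%al
	movzbl	%al, %eax

i.e. negate x, lock xadd to fetch the old value, then undo the negation and compare against the fetched value. With the peephole2 this can become just

	lock subq	%rdi, a(%rip)
	sete	%al
	movzbl	%al, %eax

because lock sub already sets ZF from the new value of the memory operand, which is exactly the __atomic_sub_fetch result being tested against zero.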
Bootstrapped/regtested on x86_64-linux and i686-linux. Is this ok for trunk (as an exception), or for GCC 12?

2021-01-27  Jakub Jelinek  <ja...@redhat.com>

	PR target/98737
	* config/i386/sync.md (neg; mov; lock xadd; add peephole2): New
	define_peephole2.
	(*atomic_fetch_sub_cmp<mode>): New define_insn.
	* gcc.target/i386/pr98737.c: New test.

--- gcc/config/i386/sync.md.jj	2021-01-04 10:25:45.392159555 +0100
+++ gcc/config/i386/sync.md	2021-01-26 16:03:13.911100510 +0100
@@ -777,6 +777,63 @@ (define_insn "*atomic_fetch_add_cmp<mode
   return "lock{%;} %K3add{<imodesuffix>}\t{%1, %0|%0, %1}";
 })
 
+;; Similarly, peephole for __sync_sub_fetch (x, b) == 0 into just
+;; lock sub followed by testing of flags instead of lock xadd, negation and
+;; comparison.
+(define_peephole2
+  [(parallel [(set (match_operand 0 "register_operand")
+		   (neg (match_dup 0)))
+	      (clobber (reg:CC FLAGS_REG))])
+   (set (match_operand:SWI 1 "register_operand")
+	(match_operand:SWI 2 "register_operand"))
+   (parallel [(set (match_operand:SWI 3 "register_operand")
+		   (unspec_volatile:SWI
+		     [(match_operand:SWI 4 "memory_operand")
+		      (match_operand:SI 5 "const_int_operand")]
+		     UNSPECV_XCHG))
+	      (set (match_dup 4)
+		   (plus:SWI (match_dup 4)
+			     (match_dup 3)))
+	      (clobber (reg:CC FLAGS_REG))])
+   (parallel [(set (reg:CCZ FLAGS_REG)
+		   (compare:CCZ (neg:SWI
+				  (match_operand:SWI 6 "register_operand"))
+				(match_dup 3)))
+	      (clobber (match_dup 3))])]
+  "(GET_MODE (operands[0]) == <LEAMODE>mode
+    || GET_MODE (operands[0]) == <MODE>mode)
+   && reg_or_subregno (operands[0]) == reg_or_subregno (operands[2])
+   && (rtx_equal_p (operands[2], operands[3])
+       ? rtx_equal_p (operands[1], operands[6])
+       : (rtx_equal_p (operands[2], operands[6])
+	  && rtx_equal_p (operands[1], operands[3])))
+   && peep2_reg_dead_p (4, operands[6])
+   && peep2_reg_dead_p (4, operands[3])
+   && !reg_overlap_mentioned_p (operands[1], operands[4])
+   && !reg_overlap_mentioned_p (operands[2], operands[4])"
+  [(parallel [(set (reg:CCZ FLAGS_REG)
+		   (compare:CCZ
+		     (unspec_volatile:SWI [(match_dup 4) (match_dup 5)]
+					  UNSPECV_XCHG)
+		     (match_dup 2)))
+	      (set (match_dup 4)
+		   (minus:SWI (match_dup 4)
+			      (match_dup 2)))])])
+
+(define_insn "*atomic_fetch_sub_cmp<mode>"
+  [(set (reg:CCZ FLAGS_REG)
+	(compare:CCZ
+	  (unspec_volatile:SWI
+	    [(match_operand:SWI 0 "memory_operand" "+m")
+	     (match_operand:SI 2 "const_int_operand")]	;; model
+	    UNSPECV_XCHG)
+	  (match_operand:SWI 1 "register_operand" "r")))
+   (set (match_dup 0)
+	(minus:SWI (match_dup 0)
+		   (match_dup 1)))]
+  ""
+  "lock{%;} %K2sub{<imodesuffix>}\t{%1, %0|%0, %1}")
+
 ;; Recall that xchg implicitly sets LOCK#, so adding it again wastes space.
 ;; In addition, it is always a full barrier, so we can ignore the memory model.
 (define_insn "atomic_exchange<mode>"
--- gcc/testsuite/gcc.target/i386/pr98737.c.jj	2021-01-26 15:59:24.640620178 +0100
+++ gcc/testsuite/gcc.target/i386/pr98737.c	2021-01-26 16:00:02.898205888 +0100
@@ -0,0 +1,38 @@
+/* PR target/98737 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -masm=att" } */
+/* { dg-additional-options "-march=i686" { target ia32 } } */
+/* { dg-final { scan-assembler "lock\[^\n\r]\*subq\t" { target lp64 } } } */
+/* { dg-final { scan-assembler "lock\[^\n\r]\*subl\t" } } */
+/* { dg-final { scan-assembler "lock\[^\n\r]\*subw\t" } } */
+/* { dg-final { scan-assembler "lock\[^\n\r]\*subb\t" } } */
+/* { dg-final { scan-assembler-not "lock\[^\n\r]\*xadd" } } */
+
+long a;
+int b;
+short c;
+char d;
+
+int
+foo (long x)
+{
+  return __atomic_sub_fetch (&a, x, __ATOMIC_RELEASE) == 0;
+}
+
+int
+bar (int x)
+{
+  return __atomic_sub_fetch (&b, x, __ATOMIC_RELEASE) == 0;
+}
+
+int
+baz (short x)
+{
+  return __atomic_sub_fetch (&c, x, __ATOMIC_RELEASE) == 0;
+}
+
+int
+qux (char x)
+{
+  return __atomic_sub_fetch (&d, x, __ATOMIC_RELEASE) == 0;
+}

	Jakub