https://gcc.gnu.org/bugzilla/show_bug.cgi?id=88765

            Bug ID: 88765
           Summary: powerpc64le-linux-gnu sub-optimal code generation for
                    builtin atomic ops
           Product: gcc
           Version: 8.2.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: target
          Assignee: unassigned at gcc dot gnu.org
          Reporter: npiggin at gmail dot com
  Target Milestone: ---
            Target: powerpc64le-linux-gnu

gcc version 8.2.0 (Debian 8.2.0-4) 

Linux uses a lot of non-trivial atomic operations, and implementing them with
compare_exchange results in sub-optimal code generation. A common one is
add_unless_zero, which is used with RCU to take a reference count, or to fail
if the last reference has already gone (a very rare case).

---
#include <stdbool.h>

bool add_unless_zero(unsigned long *mem, unsigned long inc)
{
        unsigned long val;
        do {
                val = __atomic_load_n(mem, __ATOMIC_RELAXED);
                if (__builtin_expect(val == 0, false))
                        return false;
        } while (__builtin_expect(!__atomic_compare_exchange_n(mem,
                                &val, val + inc, true,
                                __ATOMIC_RELAXED, __ATOMIC_RELAXED), false));

        return true;
}
---
This compiles to:

add_unless_zero:
.L4:
        ld 10,0(3)
        cmpdi 7,10,0
        add 8,10,4
        beq 7,.L5
        ldarx 9,0,3
        cmpd 0,9,10
        bne 0,.L3
        stdcx. 8,0,3
.L3:
        bne 0,.L4
        li 3,1
        blr
.L5:
        li 3,0
        blr

Better would be:

add_unless_zero:
.L4:
        ldarx 9,0,3
        cmpdi 0,9,0
        add 8,9,4
        beq 0,.L5
        stdcx. 8,0,3
        bne 0,.L4
        li 3,1
        blr
.L5:
        li 3,0
        blr

Using extended inline asm to implement these operations is an adequate
workaround for the 64-bit case; a sketch follows.
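
Roughly, such an inline-asm implementation could look like the sketch below
(the constraints, local labels, and function name are illustrative only, not
taken from the kernel sources):

---
#include <stdbool.h>

static inline bool add_unless_zero_asm(unsigned long *mem, unsigned long inc)
{
        unsigned long old, sum;

        __asm__ __volatile__(
"1:     ldarx   %0,0,%3         # load-reserve current value\n"
"       cmpdi   0,%0,0          # zero means the last reference is gone\n"
"       beq     0,2f            # fail without storing\n"
"       add     %1,%0,%4        # sum = old + inc\n"
"       stdcx.  %1,0,%3         # store-conditional the new count\n"
"       bne     0,1b            # reservation lost, retry\n"
"2:\n"
        : "=&r" (old), "=&r" (sum), "+m" (*mem)
        : "r" (mem), "r" (inc)
        : "cr0");

        return old != 0;
}
---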
Unfortunately that does not work on 128-bit powerpc because there is no
register-pair constraint, and the builtins generate much worse code. Changing
the types to __int128_t gives a bad result:

add_unless_zero:
        lq 10,0(3)
        mr 6,3
        or. 9,10,11
        addc 3,11,4
        mr 7,11
        adde 11,10,5
        beq 0,.L16
        std 28,-32(1)
        std 29,-24(1)
        std 30,-16(1)
        std 31,-8(1)
.L12:
        lqarx 28,0,6
        xor 9,29,7
        xor 10,28,10
        or. 9,9,10
        bne 0,.L4
        mr 30,11
        mr 31,3
        stqcx. 30,0,6
.L4:
        mfcr 3,128
        rlwinm 3,3,3,1
        bne 0,.L17
.L2:
        ld 28,-32(1)
        ld 29,-24(1)
        ld 30,-16(1)
        ld 31,-8(1)
        blr
.L17:
        lq 10,0(6)
        or. 9,10,11
        addc 3,11,4
        mr 7,11
        adde 11,10,5
        bne 0,.L12
        li 3,0
        b .L2
.L16:
        li 3,0
        blr
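
The __int128_t source is just the 64-bit test case above with the types
changed, roughly (sketch):

---
#include <stdbool.h>

bool add_unless_zero(__int128_t *mem, __int128_t inc)
{
        __int128_t val;
        do {
                val = __atomic_load_n(mem, __ATOMIC_RELAXED);
                if (__builtin_expect(val == 0, false))
                        return false;
        } while (__builtin_expect(!__atomic_compare_exchange_n(mem,
                                &val, val + inc, true,
                                __ATOMIC_RELAXED, __ATOMIC_RELAXED), false));

        return true;
}
---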

Closer to ideal would be:

add_unless_zero:
.Lagain:
       lqarx   6,0,3
       or.     8,6,7
       addc    6,6,4
       adde    7,7,5
       beq     0,.Lfail
       stqcx.  6,0,3
       bne     0,.Lagain
       li      3,1
       blr
.Lfail:
       li      3,0
       blr
