On Thu, 2022-07-28 at 16:31 +1000, Nicholas Piggin wrote:
> This uses more optimal ll/sc style access patterns (rather than
> cmpxchg), and also sets the EH=1 lock hint on those operations
> which acquire ownership of the lock.
> ---
>  arch/powerpc/include/asm/qspinlock.h       | 25 +++++--
>  arch/powerpc/include/asm/qspinlock_types.h |  6 +-
>  arch/powerpc/lib/qspinlock.c               | 81 +++++++++++++++-------
>  3 files changed, 79 insertions(+), 33 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/qspinlock.h b/arch/powerpc/include/asm/qspinlock.h
> index 79a1936fb68d..3ab354159e5e 100644
> --- a/arch/powerpc/include/asm/qspinlock.h
> +++ b/arch/powerpc/include/asm/qspinlock.h
> @@ -2,28 +2,43 @@
>  #ifndef _ASM_POWERPC_QSPINLOCK_H
>  #define _ASM_POWERPC_QSPINLOCK_H
>  
> -#include <linux/atomic.h>
>  #include <linux/compiler.h>
>  #include <asm/qspinlock_types.h>
>  
>  static __always_inline int queued_spin_is_locked(struct qspinlock *lock)
>  {
> -	return atomic_read(&lock->val);
> +	return READ_ONCE(lock->val);
>  }
>  
>  static __always_inline int queued_spin_value_unlocked(struct qspinlock lock)
>  {
> -	return !atomic_read(&lock.val);
> +	return !lock.val;
>  }
>  
>  static __always_inline int queued_spin_is_contended(struct qspinlock *lock)
>  {
> -	return !!(atomic_read(&lock->val) & _Q_TAIL_CPU_MASK);
> +	return !!(READ_ONCE(lock->val) & _Q_TAIL_CPU_MASK);
>  }
>  
>  static __always_inline int queued_spin_trylock(struct qspinlock *lock)
>  {
> -	if (atomic_cmpxchg_acquire(&lock->val, 0, _Q_LOCKED_VAL) == 0)
> +	u32 new = _Q_LOCKED_VAL;
> +	u32 prev;
> +
> +	asm volatile(
> +"1:	lwarx	%0,0,%1,%3	# queued_spin_trylock		\n"
> +"	cmpwi	0,%0,0						\n"
> +"	bne-	2f						\n"
> +"	stwcx.	%2,0,%1						\n"
> +"	bne-	1b						\n"
> +"\t"	PPC_ACQUIRE_BARRIER					"\n"
> +"2:								\n"
> +	: "=&r" (prev)
> +	: "r" (&lock->val), "r" (new),
> +	  "i" (IS_ENABLED(CONFIG_PPC64) ? 1 : 0)

btw, IS_ENABLED() already returns 1 or 0.

> +	: "cr0", "memory");

This is the ISA's "test and set" atomic primitive. Do you think it would be
worth separating it as a helper?
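Something along these lines is roughly what I was picturing (completely
untested, and the name is only a placeholder), so queued_spin_trylock() and
trylock_clear_tail_cpu() below could share it:

static __always_inline u32 trylock_cmpxchg_acquire(struct qspinlock *lock,
						   u32 old, u32 new)
{
	u32 prev;

	/* Same ll/sc test-and-set pattern as above, parameterised on old/new. */
	asm volatile(
"1:	lwarx	%0,0,%1,%4	# trylock_cmpxchg_acquire	\n"
"	cmpw	0,%0,%2						\n"
"	bne-	2f						\n"
"	stwcx.	%3,0,%1						\n"
"	bne-	1b						\n"
"\t"	PPC_ACQUIRE_BARRIER					"\n"
"2:								\n"
	: "=&r" (prev)
	: "r" (&lock->val), "r" (old), "r" (new),
	  "i" (IS_ENABLED(CONFIG_PPC64) ? 1 : 0)
	: "cr0", "memory");

	return prev;
}

queued_spin_trylock() would then reduce to

	return likely(trylock_cmpxchg_acquire(lock, 0, _Q_LOCKED_VAL) == 0);

No strong opinion though, just a thought.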
> +
> +	if (likely(prev == 0))
>  		return 1;
>  	return 0;

Same optional style nit: return likely(prev == 0);

>  }
> diff --git a/arch/powerpc/include/asm/qspinlock_types.h b/arch/powerpc/include/asm/qspinlock_types.h
> index 3425dab42576..210adf05b235 100644
> --- a/arch/powerpc/include/asm/qspinlock_types.h
> +++ b/arch/powerpc/include/asm/qspinlock_types.h
> @@ -7,7 +7,7 @@
>  
>  typedef struct qspinlock {
>  	union {
> -		atomic_t val;
> +		u32 val;
>  
>  #ifdef __LITTLE_ENDIAN
>  		struct {
> @@ -23,10 +23,10 @@ typedef struct qspinlock {
>  	};
>  } arch_spinlock_t;
>  
> -#define __ARCH_SPIN_LOCK_UNLOCKED	{ { .val = ATOMIC_INIT(0) } }
> +#define __ARCH_SPIN_LOCK_UNLOCKED	{ { .val = 0 } }
>  
>  /*
> - * Bitfields in the atomic value:
> + * Bitfields in the lock word:
>   *
>   *     0: locked bit
>   * 16-31: tail cpu (+1)
> diff --git a/arch/powerpc/lib/qspinlock.c b/arch/powerpc/lib/qspinlock.c
> index 5ebb88d95636..7c71e5e287df 100644
> --- a/arch/powerpc/lib/qspinlock.c
> +++ b/arch/powerpc/lib/qspinlock.c
> @@ -1,5 +1,4 @@
>  // SPDX-License-Identifier: GPL-2.0-or-later
> -#include <linux/atomic.h>
>  #include <linux/bug.h>
>  #include <linux/compiler.h>
>  #include <linux/export.h>
> @@ -22,32 +21,59 @@ struct qnodes {
>  
>  static DEFINE_PER_CPU_ALIGNED(struct qnodes, qnodes);
>  
> -static inline int encode_tail_cpu(void)
> +static inline u32 encode_tail_cpu(void)
>  {
>  	return (smp_processor_id() + 1) << _Q_TAIL_CPU_OFFSET;
>  }
>  
> -static inline int get_tail_cpu(int val)
> +static inline int get_tail_cpu(u32 val)
>  {
>  	return (val >> _Q_TAIL_CPU_OFFSET) - 1;
>  }
>  
>  /* Take the lock by setting the bit, no other CPUs may concurrently lock it. */

I think you missed deleting the above line.

> +/* Take the lock by setting the lock bit, no other CPUs will touch it. */
>  static __always_inline void lock_set_locked(struct qspinlock *lock)
>  {
> -	atomic_or(_Q_LOCKED_VAL, &lock->val);
> -	__atomic_acquire_fence();
> +	u32 new = _Q_LOCKED_VAL;
> +	u32 prev;
> +
> +	asm volatile(
> +"1:	lwarx	%0,0,%1,%3	# lock_set_locked		\n"
> +"	or	%0,%0,%2					\n"
> +"	stwcx.	%0,0,%1						\n"
> +"	bne-	1b						\n"
> +"\t"	PPC_ACQUIRE_BARRIER					"\n"
> +	: "=&r" (prev)
> +	: "r" (&lock->val), "r" (new),
> +	  "i" (IS_ENABLED(CONFIG_PPC64) ? 1 : 0)
> +	: "cr0", "memory");
>  }

This is pretty similar to the DEFINE_TESTOP() pattern from
arch/powerpc/include/asm/bitops.h (such as test_and_set_bits_lock()), except
for word instead of double word. Do you think it's possible / beneficial to
make use of those macros?
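For reference, a word-sized helper in that style might look something like
the below (untested, the name is invented here, and it is a sketch rather
than a reuse of the existing bitops.h macros):

static __always_inline u32 test_and_set_u32_lock(u32 mask, volatile u32 *p)
{
	u32 old, t;

	/* Word-sized analogue of test_and_set_bits_lock(): lwarx/stwcx. */
	asm volatile(
"1:	lwarx	%0,0,%3,%4	# test_and_set_u32_lock		\n"
"	or	%1,%0,%2					\n"
"	stwcx.	%1,0,%3						\n"
"	bne-	1b						\n"
"\t"	PPC_ACQUIRE_BARRIER					"\n"
	: "=&r" (old), "=&r" (t)
	: "r" (mask), "r" (p),
	  "i" (IS_ENABLED(CONFIG_PPC64) ? 1 : 0)
	: "cr0", "memory");

	return old & mask;
}

lock_set_locked() would then just be
test_and_set_u32_lock(_Q_LOCKED_VAL, &lock->val) with the return value
ignored. Probably only worth it if something else grows a use for it.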
>  
> -/* Take lock, clearing tail, cmpxchg with val (which must not be locked) */
> -static __always_inline int trylock_clear_tail_cpu(struct qspinlock *lock, int val)
> +/* Take lock, clearing tail, cmpxchg with old (which must not be locked) */
> +static __always_inline int trylock_clear_tail_cpu(struct qspinlock *lock, u32 old)
>  {
> -	int newval = _Q_LOCKED_VAL;
> -
> -	if (atomic_cmpxchg_acquire(&lock->val, val, newval) == val)
> +	u32 new = _Q_LOCKED_VAL;
> +	u32 prev;
> +
> +	BUG_ON(old & _Q_LOCKED_VAL);

I think the BUG_ON() could have been introduced in an earlier patch.

> +
> +	asm volatile(
> +"1:	lwarx	%0,0,%1,%4	# trylock_clear_tail_cpu	\n"
> +"	cmpw	0,%0,%2						\n"
> +"	bne-	2f						\n"
> +"	stwcx.	%3,0,%1						\n"
> +"	bne-	1b						\n"
> +"\t"	PPC_ACQUIRE_BARRIER					"\n"
> +"2:								\n"
> +	: "=&r" (prev)
> +	: "r" (&lock->val), "r"(old), "r" (new),

Could this be like "r"(_Q_TAIL_CPU_MASK) below, i.e. "r" (_Q_LOCKED_VAL)?
That would make it clear that new doesn't change.

> +	  "i" (IS_ENABLED(CONFIG_PPC64) ? 1 : 0)
> +	: "cr0", "memory");
> +
> +	if (likely(prev == old))
>  		return 1;
> -	else
> -		return 0;
> +	return 0;
>  }
>  
>  /*
> @@ -56,20 +82,25 @@ static __always_inline int trylock_clear_tail_cpu(struct qspinlock *lock, int va
>   * This provides a release barrier for publishing node, and an acquire barrier

Does the comment mean there needs to be an acquire barrier in this assembly?
(A rough sketch of what I would have expected in that case is at the end of
this mail.)

>   * for getting the old node.
>   */
> -static __always_inline int publish_tail_cpu(struct qspinlock *lock, int tail)
> +static __always_inline u32 publish_tail_cpu(struct qspinlock *lock, u32 tail)
>  {
> -	for (;;) {
> -		int val = atomic_read(&lock->val);
> -		int newval = (val & ~_Q_TAIL_CPU_MASK) | tail;
> -		int old;
> -
> -		old = atomic_cmpxchg(&lock->val, val, newval);
> -		if (old == val)
> -			return old;
> -	}
> +	u32 prev, tmp;
> +
> +	asm volatile(
> +"\t"	PPC_RELEASE_BARRIER					"\n"
> +"1:	lwarx	%0,0,%2		# publish_tail_cpu		\n"
> +"	andc	%1,%0,%4					\n"
> +"	or	%1,%1,%3					\n"
> +"	stwcx.	%1,0,%2						\n"
> +"	bne-	1b						\n"
> +	: "=&r" (prev), "=&r"(tmp)
> +	: "r" (&lock->val), "r" (tail), "r"(_Q_TAIL_CPU_MASK)
> +	: "cr0", "memory");
> +
> +	return prev;
>  }
>  
> -static struct qnode *get_tail_qnode(struct qspinlock *lock, int val)
> +static struct qnode *get_tail_qnode(struct qspinlock *lock, u32 val)
>  {
>  	int cpu = get_tail_cpu(val);
>  	struct qnodes *qnodesp = per_cpu_ptr(&qnodes, cpu);
> @@ -88,7 +119,7 @@ static inline void queued_spin_lock_mcs_queue(struct qspinlock *lock)
>  {
>  	struct qnodes *qnodesp;
>  	struct qnode *next, *node;
> -	int val, old, tail;
> +	u32 val, old, tail;
>  	int idx;
>  
>  	BUILD_BUG_ON(CONFIG_NR_CPUS >= (1U << _Q_TAIL_CPU_BITS));
> @@ -134,7 +165,7 @@ static inline void queued_spin_lock_mcs_queue(struct qspinlock *lock)
>  	}
>  
>  	/* We're at the head of the waitqueue, wait for the lock. */
> -	while ((val = atomic_read(&lock->val)) & _Q_LOCKED_VAL)
> +	while ((val = READ_ONCE(lock->val)) & _Q_LOCKED_VAL)
>  		cpu_relax();
>  
>  	/* If we're the last queued, must clean up the tail. */
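To expand on the acquire barrier question above: if publish_tail_cpu() itself
is meant to provide the acquire ordering for reading the old tail, I would
have expected something along these lines (untested, just to illustrate the
question; it is the same sequence as in the patch with PPC_ACQUIRE_BARRIER
added after the stwcx. loop):

static __always_inline u32 publish_tail_cpu(struct qspinlock *lock, u32 tail)
{
	u32 prev, tmp;

	/* Release for publishing the node, acquire for reading the old tail. */
	asm volatile(
"\t"	PPC_RELEASE_BARRIER					"\n"
"1:	lwarx	%0,0,%2		# publish_tail_cpu		\n"
"	andc	%1,%0,%4					\n"
"	or	%1,%1,%3					\n"
"	stwcx.	%1,0,%2						\n"
"	bne-	1b						\n"
"\t"	PPC_ACQUIRE_BARRIER					"\n"
	: "=&r" (prev), "=&r" (tmp)
	: "r" (&lock->val), "r" (tail), "r" (_Q_TAIL_CPU_MASK)
	: "cr0", "memory");

	return prev;
}

If the acquire is instead provided elsewhere, it might be worth spelling that
out in the comment.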