On 10/11/2022 at 01:39, Jordan Niethe wrote:
> On Thu, 2022-07-28 at 16:31 +1000, Nicholas Piggin wrote:
> [resend as utf-8, not utf-7]
>> This uses more optimal ll/sc style access patterns (rather than
>> cmpxchg), and also sets the EH=1 lock hint on those operations
>> which acquire ownership of the lock.
>> ---
>>  arch/powerpc/include/asm/qspinlock.h       | 25 +++++--
>>  arch/powerpc/include/asm/qspinlock_types.h |  6 +-
>>  arch/powerpc/lib/qspinlock.c               | 81 +++++++++++++++-------
>>  3 files changed, 79 insertions(+), 33 deletions(-)
>>
>> diff --git a/arch/powerpc/include/asm/qspinlock.h b/arch/powerpc/include/asm/qspinlock.h
>> index 79a1936fb68d..3ab354159e5e 100644
>> --- a/arch/powerpc/include/asm/qspinlock.h
>> +++ b/arch/powerpc/include/asm/qspinlock.h
>> @@ -2,28 +2,43 @@
>>  #ifndef _ASM_POWERPC_QSPINLOCK_H
>>  #define _ASM_POWERPC_QSPINLOCK_H
>>
>> -#include <linux/atomic.h>
>>  #include <linux/compiler.h>
>>  #include <asm/qspinlock_types.h>
>>
>>  static __always_inline int queued_spin_is_locked(struct qspinlock *lock)
>>  {
>> -	return atomic_read(&lock->val);
>> +	return READ_ONCE(lock->val);
>>  }
>>
>>  static __always_inline int queued_spin_value_unlocked(struct qspinlock lock)
>>  {
>> -	return !atomic_read(&lock.val);
>> +	return !lock.val;
>>  }
>>
>>  static __always_inline int queued_spin_is_contended(struct qspinlock *lock)
>>  {
>> -	return !!(atomic_read(&lock->val) & _Q_TAIL_CPU_MASK);
>> +	return !!(READ_ONCE(lock->val) & _Q_TAIL_CPU_MASK);
>>  }
>>
>>  static __always_inline int queued_spin_trylock(struct qspinlock *lock)
>>  {
>> -	if (atomic_cmpxchg_acquire(&lock->val, 0, _Q_LOCKED_VAL) == 0)
>> +	u32 new = _Q_LOCKED_VAL;
>> +	u32 prev;
>> +
>> +	asm volatile(
>> +"1:	lwarx	%0,0,%1,%3	# queued_spin_trylock \n"
>> +"	cmpwi	0,%0,0 \n"
>> +"	bne-	2f \n"
>> +"	stwcx.	%2,0,%1 \n"
>> +"	bne-	1b \n"
>> +"\t"	PPC_ACQUIRE_BARRIER " \n"
>> +"2: \n"
>> +	: "=&r" (prev)
>> +	: "r" (&lock->val), "r" (new),
>> +	  "i" (IS_ENABLED(CONFIG_PPC64) ? 1 : 0)
>
> btw IS_ENABLED() already returns 1 or 0
>
>> +	: "cr0", "memory");
>
> This is the ISA's "test and set" atomic primitive. Do you think it would be
> worth separating it out as a helper?
>
>> +
>> +	if (likely(prev == 0))
>>  		return 1;
>>  	return 0;
>
> Same optional style nit: return likely(prev == 0);

return likely(!prev);
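To make the "separate it out as a helper" suggestion above a bit more
concrete, here is roughly what I would imagine (completely untested sketch,
the helper name is made up; the constraints are just copied from the patch):

static __always_inline u32 __trylock_cmpxchg_acquire(struct qspinlock *lock,
						     u32 old, u32 new)
{
	u32 prev;

	/*
	 * Compare-and-swap the whole lock word: store 'new' only if the
	 * word still equals 'old'.  EH hint on 64-bit, and the acquire
	 * barrier is only executed on the success path.
	 */
	asm volatile(
"1:	lwarx	%0,0,%1,%4	# __trylock_cmpxchg_acquire \n"
"	cmpw	0,%0,%2 \n"
"	bne-	2f \n"
"	stwcx.	%3,0,%1 \n"
"	bne-	1b \n"
"\t"	PPC_ACQUIRE_BARRIER " \n"
"2: \n"
	: "=&r" (prev)
	: "r" (&lock->val), "r" (old), "r" (new),
	  "i" (IS_ENABLED(CONFIG_PPC64))
	: "cr0", "memory");

	return prev;
}

static __always_inline int queued_spin_trylock(struct qspinlock *lock)
{
	return likely(__trylock_cmpxchg_acquire(lock, 0, _Q_LOCKED_VAL) == 0);
}

trylock_clear_tail_cpu() below could then presumably use the same helper
instead of open-coding the loop a second time.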
>
>>  }
>> diff --git a/arch/powerpc/include/asm/qspinlock_types.h b/arch/powerpc/include/asm/qspinlock_types.h
>> index 3425dab42576..210adf05b235 100644
>> --- a/arch/powerpc/include/asm/qspinlock_types.h
>> +++ b/arch/powerpc/include/asm/qspinlock_types.h
>> @@ -7,7 +7,7 @@
>>
>>  typedef struct qspinlock {
>>  	union {
>> -		atomic_t val;
>> +		u32 val;
>>
>>  #ifdef __LITTLE_ENDIAN
>>  		struct {
>> @@ -23,10 +23,10 @@ typedef struct qspinlock {
>>  	};
>>  } arch_spinlock_t;
>>
>> -#define __ARCH_SPIN_LOCK_UNLOCKED { { .val = ATOMIC_INIT(0) } }
>> +#define __ARCH_SPIN_LOCK_UNLOCKED { { .val = 0 } }
>>
>>  /*
>> - * Bitfields in the atomic value:
>> + * Bitfields in the lock word:
>>   *
>>   *     0: locked bit
>>   * 16-31: tail cpu (+1)
>> diff --git a/arch/powerpc/lib/qspinlock.c b/arch/powerpc/lib/qspinlock.c
>> index 5ebb88d95636..7c71e5e287df 100644
>> --- a/arch/powerpc/lib/qspinlock.c
>> +++ b/arch/powerpc/lib/qspinlock.c
>> @@ -1,5 +1,4 @@
>>  // SPDX-License-Identifier: GPL-2.0-or-later
>> -#include <linux/atomic.h>
>>  #include <linux/bug.h>
>>  #include <linux/compiler.h>
>>  #include <linux/export.h>
>> @@ -22,32 +21,59 @@ struct qnodes {
>>
>>  static DEFINE_PER_CPU_ALIGNED(struct qnodes, qnodes);
>>
>> -static inline int encode_tail_cpu(void)
>> +static inline u32 encode_tail_cpu(void)
>>  {
>>  	return (smp_processor_id() + 1) << _Q_TAIL_CPU_OFFSET;
>>  }
>>
>> -static inline int get_tail_cpu(int val)
>> +static inline int get_tail_cpu(u32 val)
>>  {
>>  	return (val >> _Q_TAIL_CPU_OFFSET) - 1;
>>  }
>>
>>  /* Take the lock by setting the bit, no other CPUs may concurrently lock it. */
>
> I think you missed deleting the above line.
>
>> +/* Take the lock by setting the lock bit, no other CPUs will touch it. */
>>  static __always_inline void lock_set_locked(struct qspinlock *lock)
>>  {
>> -	atomic_or(_Q_LOCKED_VAL, &lock->val);
>> -	__atomic_acquire_fence();
>> +	u32 new = _Q_LOCKED_VAL;
>> +	u32 prev;
>> +
>> +	asm volatile(
>> +"1:	lwarx	%0,0,%1,%3	# lock_set_locked \n"
>> +"	or	%0,%0,%2 \n"
>> +"	stwcx.	%0,0,%1 \n"
>> +"	bne-	1b \n"
>> +"\t"	PPC_ACQUIRE_BARRIER " \n"
>> +	: "=&r" (prev)
>> +	: "r" (&lock->val), "r" (new),
>> +	  "i" (IS_ENABLED(CONFIG_PPC64) ? 1 : 0)
>> +	: "cr0", "memory");
>>  }
>
> This is pretty similar to the DEFINE_TESTOP() pattern from
> arch/powerpc/include/asm/bitops.h (such as test_and_set_bits_lock()),
> except for word instead of double word. Do you think it's possible /
> beneficial to make use of those macros?
>
>>
>> -/* Take lock, clearing tail, cmpxchg with val (which must not be locked) */
>> -static __always_inline int trylock_clear_tail_cpu(struct qspinlock *lock, int val)
>> +/* Take lock, clearing tail, cmpxchg with old (which must not be locked) */
>> +static __always_inline int trylock_clear_tail_cpu(struct qspinlock *lock, u32 old)
>>  {
>> -	int newval = _Q_LOCKED_VAL;
>> -
>> -	if (atomic_cmpxchg_acquire(&lock->val, val, newval) == val)
>> +	u32 new = _Q_LOCKED_VAL;
>> +	u32 prev;
>> +
>> +	BUG_ON(old & _Q_LOCKED_VAL);
>
> The BUG_ON() could have been introduced in an earlier patch, I think.

Can we avoid the BUG_ON() altogether and replace it with a WARN_ON()?
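Something along these lines, for instance (untested, just to illustrate the
idea; WARN_ON_ONCE() so a broken caller does not flood the log):

	/* Warn once and report failure instead of crashing the box */
	if (WARN_ON_ONCE(old & _Q_LOCKED_VAL))
		return 0;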
>
>> +
>> +	asm volatile(
>> +"1:	lwarx	%0,0,%1,%4	# trylock_clear_tail_cpu \n"
>> +"	cmpw	0,%0,%2 \n"
>> +"	bne-	2f \n"
>> +"	stwcx.	%3,0,%1 \n"
>> +"	bne-	1b \n"
>> +"\t"	PPC_ACQUIRE_BARRIER " \n"
>> +"2: \n"
>> +	: "=&r" (prev)
>> +	: "r" (&lock->val), "r"(old), "r" (new),
>
> Could this be like "r"(_Q_TAIL_CPU_MASK) below?
> i.e. "r" (_Q_LOCKED_VAL)?

Makes it clear new doesn't change.

>
>> +	  "i" (IS_ENABLED(CONFIG_PPC64) ? 1 : 0)
>> +	: "cr0", "memory");
>> +
>> +	if (likely(prev == old))
>>  		return 1;
>> -	else
>> -		return 0;
>> +	return 0;
>>  }
>>
>>  /*
>> @@ -56,20 +82,25 @@ static __always_inline int trylock_clear_tail_cpu(struct qspinlock *lock, int va
>>   * This provides a release barrier for publishing node, and an acquire barrier
>
> Does the comment mean there needs to be an acquire barrier in this assembly?
>
>>   * for getting the old node.
>>   */
>> -static __always_inline int publish_tail_cpu(struct qspinlock *lock, int tail)
>> +static __always_inline u32 publish_tail_cpu(struct qspinlock *lock, u32 tail)
>>  {
>> -	for (;;) {
>> -		int val = atomic_read(&lock->val);
>> -		int newval = (val & ~_Q_TAIL_CPU_MASK) | tail;
>> -		int old;
>> -
>> -		old = atomic_cmpxchg(&lock->val, val, newval);
>> -		if (old == val)
>> -			return old;
>> -	}
>> +	u32 prev, tmp;
>> +
>> +	asm volatile(
>> +"\t"	PPC_RELEASE_BARRIER " \n"
>> +"1:	lwarx	%0,0,%2		# publish_tail_cpu \n"
>> +"	andc	%1,%0,%4 \n"
>> +"	or	%1,%1,%3 \n"
>> +"	stwcx.	%1,0,%2 \n"
>> +"	bne-	1b \n"
>> +	: "=&r" (prev), "=&r"(tmp)
>> +	: "r" (&lock->val), "r" (tail), "r"(_Q_TAIL_CPU_MASK)
>> +	: "cr0", "memory");
>> +
>> +	return prev;
>>  }
>>
>> -static struct qnode *get_tail_qnode(struct qspinlock *lock, int val)
>> +static struct qnode *get_tail_qnode(struct qspinlock *lock, u32 val)
>>  {
>>  	int cpu = get_tail_cpu(val);
>>  	struct qnodes *qnodesp = per_cpu_ptr(&qnodes, cpu);
>> @@ -88,7 +119,7 @@ static inline void queued_spin_lock_mcs_queue(struct qspinlock *lock)
>>  {
>>  	struct qnodes *qnodesp;
>>  	struct qnode *next, *node;
>> -	int val, old, tail;
>> +	u32 val, old, tail;
>>  	int idx;
>>
>>  	BUILD_BUG_ON(CONFIG_NR_CPUS >= (1U << _Q_TAIL_CPU_BITS));
>> @@ -134,7 +165,7 @@ static inline void queued_spin_lock_mcs_queue(struct qspinlock *lock)
>>  	}
>>
>>  	/* We're at the head of the waitqueue, wait for the lock. */
>> -	while ((val = atomic_read(&lock->val)) & _Q_LOCKED_VAL)
>> +	while ((val = READ_ONCE(lock->val)) & _Q_LOCKED_VAL)
>>  		cpu_relax();
>>
>>  	/* If we're the last queued, must clean up the tail. */
>
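And coming back to the acquire-barrier question above: if an acquire barrier
really is intended in publish_tail_cpu(), I would expect it to end up looking
something like this (untested sketch, only the PPC_ACQUIRE_BARRIER line is
new compared to the patch); if it is not needed, maybe the function comment
should be reworded instead:

static __always_inline u32 publish_tail_cpu(struct qspinlock *lock, u32 tail)
{
	u32 prev, tmp;

	asm volatile(
"\t"	PPC_RELEASE_BARRIER " \n"
"1:	lwarx	%0,0,%2		# publish_tail_cpu \n"
"	andc	%1,%0,%4 \n"
"	or	%1,%1,%3 \n"
"	stwcx.	%1,0,%2 \n"
"	bne-	1b \n"
"\t"	PPC_ACQUIRE_BARRIER " \n"	/* added: acquire for reading the old tail */
	: "=&r" (prev), "=&r" (tmp)
	: "r" (&lock->val), "r" (tail), "r" (_Q_TAIL_CPU_MASK)
	: "cr0", "memory");

	return prev;
}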