On Thu Nov 10, 2022 at 10:39 AM AEST, Jordan Niethe wrote:
> On Thu, 2022-07-28 at 16:31 +1000, Nicholas Piggin wrote:
> [resend as utf-8, not utf-7]
> > This uses more optimal ll/sc style access patterns (rather than
> > cmpxchg), and also sets the EH=1 lock hint on those operations
> > which acquire ownership of the lock.
> > ---
> >  arch/powerpc/include/asm/qspinlock.h       | 25 +++++--
> >  arch/powerpc/include/asm/qspinlock_types.h |  6 +-
> >  arch/powerpc/lib/qspinlock.c               | 81 +++++++++++++++-------
> >  3 files changed, 79 insertions(+), 33 deletions(-)
> >
> > diff --git a/arch/powerpc/include/asm/qspinlock.h b/arch/powerpc/include/asm/qspinlock.h
> > index 79a1936fb68d..3ab354159e5e 100644
> > --- a/arch/powerpc/include/asm/qspinlock.h
> > +++ b/arch/powerpc/include/asm/qspinlock.h
> > @@ -2,28 +2,43 @@
> >  #ifndef _ASM_POWERPC_QSPINLOCK_H
> >  #define _ASM_POWERPC_QSPINLOCK_H
> >
> > -#include <linux/atomic.h>
> >  #include <linux/compiler.h>
> >  #include <asm/qspinlock_types.h>
> >
> >  static __always_inline int queued_spin_is_locked(struct qspinlock *lock)
> >  {
> > -	return atomic_read(&lock->val);
> > +	return READ_ONCE(lock->val);
> >  }
> >
> >  static __always_inline int queued_spin_value_unlocked(struct qspinlock lock)
> >  {
> > -	return !atomic_read(&lock.val);
> > +	return !lock.val;
> >  }
> >
> >  static __always_inline int queued_spin_is_contended(struct qspinlock *lock)
> >  {
> > -	return !!(atomic_read(&lock->val) & _Q_TAIL_CPU_MASK);
> > +	return !!(READ_ONCE(lock->val) & _Q_TAIL_CPU_MASK);
> >  }
> >
> >  static __always_inline int queued_spin_trylock(struct qspinlock *lock)
> >  {
> > -	if (atomic_cmpxchg_acquire(&lock->val, 0, _Q_LOCKED_VAL) == 0)
> > +	u32 new = _Q_LOCKED_VAL;
> > +	u32 prev;
> > +
> > +	asm volatile(
> > +"1:	lwarx	%0,0,%1,%3	# queued_spin_trylock		\n"
> > +"	cmpwi	0,%0,0						\n"
> > +"	bne-	2f						\n"
> > +"	stwcx.	%2,0,%1						\n"
> > +"	bne-	1b						\n"
> > +"\t"	PPC_ACQUIRE_BARRIER "					\n"
> > +"2:								\n"
> > +	: "=&r" (prev)
> > +	: "r" (&lock->val), "r" (new),
> > +	  "i" (IS_ENABLED(CONFIG_PPC64) ? 1 : 0)
>
> btw IS_ENABLED() already returns 1 or 0
I guess we already do that in atomic.h too. Okay.

> > +	: "cr0", "memory");
>
> This is the ISA's "test and set" atomic primitive. Do you think it would
> be worth separating it as a helper?

It ends up getting more complex as we go. I might leave some of these
primitives open coded for now; we could possibly look at providing them
or reusing more generic primitives after the series, though.

> > +
> > +	if (likely(prev == 0))
> >  		return 1;
> >  	return 0;
>
> same optional style nit: return likely(prev == 0);

Will do.

> >  }
> > diff --git a/arch/powerpc/include/asm/qspinlock_types.h b/arch/powerpc/include/asm/qspinlock_types.h
> > index 3425dab42576..210adf05b235 100644
> > --- a/arch/powerpc/include/asm/qspinlock_types.h
> > +++ b/arch/powerpc/include/asm/qspinlock_types.h
> > @@ -7,7 +7,7 @@
> >
> >  typedef struct qspinlock {
> >  	union {
> > -		atomic_t val;
> > +		u32 val;
> >
> >  #ifdef __LITTLE_ENDIAN
> >  		struct {
> > @@ -23,10 +23,10 @@ typedef struct qspinlock {
> >  	};
> >  } arch_spinlock_t;
> >
> > -#define __ARCH_SPIN_LOCK_UNLOCKED	{ { .val = ATOMIC_INIT(0) } }
> > +#define __ARCH_SPIN_LOCK_UNLOCKED	{ { .val = 0 } }
> >
> >  /*
> > - * Bitfields in the atomic value:
> > + * Bitfields in the lock word:
> >   *
> >   *     0: locked bit
> >   * 16-31: tail cpu (+1)
> > diff --git a/arch/powerpc/lib/qspinlock.c b/arch/powerpc/lib/qspinlock.c
> > index 5ebb88d95636..7c71e5e287df 100644
> > --- a/arch/powerpc/lib/qspinlock.c
> > +++ b/arch/powerpc/lib/qspinlock.c
> > @@ -1,5 +1,4 @@
> >  // SPDX-License-Identifier: GPL-2.0-or-later
> > -#include <linux/atomic.h>
> >  #include <linux/bug.h>
> >  #include <linux/compiler.h>
> >  #include <linux/export.h>
> > @@ -22,32 +21,59 @@ struct qnodes {
> >
> >  static DEFINE_PER_CPU_ALIGNED(struct qnodes, qnodes);
> >
> > -static inline int encode_tail_cpu(void)
> > +static inline u32 encode_tail_cpu(void)
> >  {
> >  	return (smp_processor_id() + 1) << _Q_TAIL_CPU_OFFSET;
> >  }
> >
> > -static inline int get_tail_cpu(int val)
> > +static inline int get_tail_cpu(u32 val)
> >  {
> >  	return (val >> _Q_TAIL_CPU_OFFSET) - 1;
> >  }
> >
> >  /* Take the lock by setting the bit, no other CPUs may concurrently lock it. */
>
> I think you missed deleting the above line.
>
> > +/* Take the lock by setting the lock bit, no other CPUs will touch it. */
> >  static __always_inline void lock_set_locked(struct qspinlock *lock)
> >  {
> > -	atomic_or(_Q_LOCKED_VAL, &lock->val);
> > -	__atomic_acquire_fence();
> > +	u32 new = _Q_LOCKED_VAL;
> > +	u32 prev;
> > +
> > +	asm volatile(
> > +"1:	lwarx	%0,0,%1,%3	# lock_set_locked		\n"
> > +"	or	%0,%0,%2					\n"
> > +"	stwcx.	%0,0,%1						\n"
> > +"	bne-	1b						\n"
> > +"\t"	PPC_ACQUIRE_BARRIER "					\n"
> > +	: "=&r" (prev)
> > +	: "r" (&lock->val), "r" (new),
> > +	  "i" (IS_ENABLED(CONFIG_PPC64) ? 1 : 0)
> > +	: "cr0", "memory");
> >  }
>
> This is pretty similar to the DEFINE_TESTOP() pattern from
> arch/powerpc/include/asm/bitops.h (such as test_and_set_bits_lock())
> except for word instead of double word. Do you think it's possible /
> beneficial to make use of those macros?

If we could pull almost all our atomic primitives into one place and
make them usable by atomics, bitops, locks, etc., that might be a good
idea.

That function specifically works on a dword so we can't use it here, and
I don't want to modify any files except for the new ones in this series
if possible, but consolidating our primitives a bit more would be nice.
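To illustrate the kind of consolidation I mean, something like the
word-sized primitive below could back both trylock variants. This is an
untested sketch only, not code from this series, and the helper name is
invented here for illustration:

/*
 * Untested sketch (not part of this series): a shared 32-bit
 * "store new if *p == old" primitive with acquire semantics and the
 * EH hint, mirroring the open-coded sequences in the patch.
 */
static __always_inline u32 cmpxchg32_acquire_eh(u32 *p, u32 old, u32 new)
{
	u32 prev;

	asm volatile(
	/* load-reserve, with the EH hint set on 64-bit */
"1:	lwarx	%0,0,%1,%4	# cmpxchg32_acquire_eh		\n"
	/* give up if the word no longer holds the expected value */
"	cmpw	0,%0,%2						\n"
"	bne-	2f						\n"
	/* try to install the new value; retry if the reservation was lost */
"	stwcx.	%3,0,%1						\n"
"	bne-	1b						\n"
"\t"	PPC_ACQUIRE_BARRIER "					\n"
"2:								\n"
	: "=&r" (prev)
	: "r" (p), "r" (old), "r" (new),
	  "i" (IS_ENABLED(CONFIG_PPC64) ? 1 : 0)
	: "cr0", "memory");

	return prev;
}

queued_spin_trylock() would then reduce to

	return likely(cmpxchg32_acquire_eh(&lock->val, 0, _Q_LOCKED_VAL) == 0);

but as above I'd rather keep these open coded in the new files for now
than grow bitops.h or atomic.h in this series.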
> > -/* Take lock, clearing tail, cmpxchg with val (which must not be locked) */
> > -static __always_inline int trylock_clear_tail_cpu(struct qspinlock *lock, int val)
> > +/* Take lock, clearing tail, cmpxchg with old (which must not be locked) */
> > +static __always_inline int trylock_clear_tail_cpu(struct qspinlock *lock, u32 old)
> >  {
> > -	int newval = _Q_LOCKED_VAL;
> > -
> > -	if (atomic_cmpxchg_acquire(&lock->val, val, newval) == val)
> > +	u32 new = _Q_LOCKED_VAL;
> > +	u32 prev;
> > +
> > +	BUG_ON(old & _Q_LOCKED_VAL);
>
> The BUG_ON() could have been introduced in an earlier patch I think.

Yes.

> > +
> > +	asm volatile(
> > +"1:	lwarx	%0,0,%1,%4	# trylock_clear_tail_cpu	\n"
> > +"	cmpw	0,%0,%2						\n"
> > +"	bne-	2f						\n"
> > +"	stwcx.	%3,0,%1						\n"
> > +"	bne-	1b						\n"
> > +"\t"	PPC_ACQUIRE_BARRIER "					\n"
> > +"2:								\n"
> > +	: "=&r" (prev)
> > +	: "r" (&lock->val), "r"(old), "r" (new),
>
> Could this be like "r"(_Q_TAIL_CPU_MASK) below?
> i.e. "r" (_Q_LOCKED_VAL)? Makes it clear new doesn't change.

Sure.

> > +	  "i" (IS_ENABLED(CONFIG_PPC64) ? 1 : 0)
> > +	: "cr0", "memory");
> > +
> > +	if (likely(prev == old))
> >  		return 1;
> > -	else
> > -		return 0;
> > +	return 0;
> >  }
> >
> >  /*
> > @@ -56,20 +82,25 @@ static __always_inline int trylock_clear_tail_cpu(struct qspinlock *lock, int va
> >   * This provides a release barrier for publishing node, and an acquire barrier
>
> Does the comment mean there needs to be an acquire barrier in this assembly?

Yes, another good catch. What I'm going to do instead is add the acquire
to get_tail_qnode(), because that path is only hit when you have multiple
waiters, and I think pairing it that way makes the barriers more obvious.

Thanks,
Nick
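For reference, roughly the shape of the get_tail_qnode() change I have
in mind (untested sketch only; the loop body is paraphrased from the
queueing patch, so names and details may not match what ends up in the
series):

static struct qnode *get_tail_qnode(struct qspinlock *lock, u32 val)
{
	int cpu = get_tail_cpu(val);
	struct qnodes *qnodesp = per_cpu_ptr(&qnodes, cpu);
	int idx;

	/*
	 * We have already observed the previous tail in the lock word,
	 * so the tail publisher's qnode stores must be visible before
	 * we look at its qnodes. This acquire pairs with the release
	 * barrier used when the tail was published.
	 */
	smp_acquire__after_ctrl_dep();

	for (idx = 0; idx < MAX_NODES; idx++) {
		struct qnode *qnode = &qnodesp->nodes[idx];
		if (qnode->lock == lock)
			return qnode;
	}

	BUG();
}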