A number of cmpxchg calls in qspinlock_paravirt.h are replaced with more
relaxed variants to improve performance on architectures that use LL/SC,
where a fully ordered cmpxchg typically needs extra memory barriers around
the LL/SC loop.
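
As a purely illustrative aside (not part of the patch), the sketch below is
a rough userspace analogue of the ordering choices involved, written with
C11 atomics rather than the kernel cmpxchg API; all names are made up. A
successful trylock-style compare-and-swap only needs acquire ordering, a
failed one needs none, and the matching unlock supplies the release:

/* Illustrative only -- C11 atomics, not the kernel cmpxchg API. */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_int demo_lock;		/* 0 = unlocked, 1 = locked */

static bool demo_trylock(void)
{
	int expected = 0;

	/*
	 * Success needs acquire ordering so the critical section cannot
	 * be reordered before the lock acquisition; failure needs no
	 * ordering at all, which is the same split that cmpxchg_acquire()
	 * expresses in the kernel.
	 */
	return atomic_compare_exchange_strong_explicit(&demo_lock,
			&expected, 1,
			memory_order_acquire, memory_order_relaxed);
}

static void demo_unlock(void)
{
	/* The release here pairs with the acquire in demo_trylock(). */
	atomic_store_explicit(&demo_lock, 0, memory_order_release);
}

int main(void)
{
	if (demo_trylock()) {
		printf("lock taken\n");
		demo_unlock();
	}
	return 0;
}

On strongly ordered architectures such as x86 the acquire/relaxed forms
compile to the same instruction as the fully ordered one, so the relaxation
only helps where it is actually cheaper.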
All the locking related cmpxchg's are replaced with the _acquire
variants:

 - pv_queued_spin_steal_lock()
 - trylock_clear_pending()

The cmpxchg's related to hashing are replaced by either the _release
or the _relaxed variants. See the inline comment for details.

Signed-off-by: Waiman Long <long...@redhat.com>

 v1->v2:
  - Add comments in changelog and code for the rationale of the change.

---
 kernel/locking/qspinlock_paravirt.h | 50 +++++++++++++++++++++++++++++++++---------------
 1 file changed, 33 insertions(+), 17 deletions(-)

diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h
index e3b5520..c31d1ab 100644
--- a/kernel/locking/qspinlock_paravirt.h
+++ b/kernel/locking/qspinlock_paravirt.h
@@ -72,7 +72,7 @@ static inline bool pv_queued_spin_steal_lock(struct qspinlock *lock)
 	struct __qspinlock *l = (void *)lock;
 
 	if (!(atomic_read(&lock->val) & _Q_LOCKED_PENDING_MASK) &&
-	    (cmpxchg(&l->locked, 0, _Q_LOCKED_VAL) == 0)) {
+	    (cmpxchg_acquire(&l->locked, 0, _Q_LOCKED_VAL) == 0)) {
 		qstat_inc(qstat_pv_lock_stealing, true);
 		return true;
 	}
@@ -101,16 +101,16 @@ static __always_inline void clear_pending(struct qspinlock *lock)
 
 /*
  * The pending bit check in pv_queued_spin_steal_lock() isn't a memory
- * barrier. Therefore, an atomic cmpxchg() is used to acquire the lock
- * just to be sure that it will get it.
+ * barrier. Therefore, an atomic cmpxchg_acquire() is used to acquire the
+ * lock to provide the proper memory barrier.
  */
 static __always_inline int trylock_clear_pending(struct qspinlock *lock)
 {
 	struct __qspinlock *l = (void *)lock;
 
 	return !READ_ONCE(l->locked) &&
-	       (cmpxchg(&l->locked_pending, _Q_PENDING_VAL, _Q_LOCKED_VAL)
-			== _Q_PENDING_VAL);
+	       (cmpxchg_acquire(&l->locked_pending, _Q_PENDING_VAL,
+			_Q_LOCKED_VAL) == _Q_PENDING_VAL);
 }
 #else /* _Q_PENDING_BITS == 8 */
 static __always_inline void set_pending(struct qspinlock *lock)
@@ -138,7 +138,7 @@ static __always_inline int trylock_clear_pending(struct qspinlock *lock)
 		 */
 		old = val;
 		new = (val & ~_Q_PENDING_MASK) | _Q_LOCKED_VAL;
-		val = atomic_cmpxchg(&lock->val, old, new);
+		val = atomic_cmpxchg_acquire(&lock->val, old, new);
 
 		if (val == old)
 			return 1;
@@ -209,9 +209,15 @@ static struct qspinlock **pv_hash(struct qspinlock *lock, struct pv_node *node)
 	struct pv_hash_entry *he;
 	int hopcnt = 0;
 
+	/*
+	 * Synchronizing with the node state variable will control who does
+	 * the hashing - the lock holder or lock waiter. The control
+	 * dependency will ensure that node value is written after the lock
+	 * value. So we don't need other ordering guarantee.
+	 */
 	for_each_hash_entry(he, offset, hash) {
 		hopcnt++;
-		if (!cmpxchg(&he->lock, NULL, lock)) {
+		if (!cmpxchg_relaxed(&he->lock, NULL, lock)) {
 			WRITE_ONCE(he->node, node);
 			qstat_hop(hopcnt);
 			return &he->lock;
@@ -309,7 +315,7 @@ static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev)
 		 *     MB			     MB
 		 * [L] pn->locked		[RmW] pn->state = vcpu_hashed
 		 *
-		 * Matches the cmpxchg() from pv_kick_node().
+		 * Matches the cmpxchg_release() from pv_kick_node().
 		 */
 		smp_store_mb(pn->state, vcpu_halted);
 
@@ -323,8 +329,14 @@ static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev)
 		 * If pv_kick_node() changed us to vcpu_hashed, retain that
 		 * value so that pv_wait_head_or_lock() knows to not also try
 		 * to hash this lock.
+		 *
+		 * The smp_store_mb() and control dependency above will ensure
+		 * that state change won't happen before that. Synchronizing
+		 * with pv_kick_node() wrt hashing by this waiter or by the
+		 * lock holder is done solely by the state variable. There is
+		 * no other ordering requirement.
 		 */
-		cmpxchg(&pn->state, vcpu_halted, vcpu_running);
+		cmpxchg_relaxed(&pn->state, vcpu_halted, vcpu_running);
 
 		/*
 		 * If the locked flag is still not set after wakeup, it is a
@@ -360,9 +372,12 @@ static void pv_kick_node(struct qspinlock *lock, struct mcs_spinlock *node)
 	 * pv_wait_node(). If OTOH this fails, the vCPU was running and will
 	 * observe its next->locked value and advance itself.
 	 *
-	 * Matches with smp_store_mb() and cmpxchg() in pv_wait_node()
+	 * Matches with smp_store_mb() and cmpxchg_relaxed() in pv_wait_node().
+	 * A release barrier is used here to ensure that node->locked is
+	 * always set before changing the state. See comment in pv_wait_node().
 	 */
-	if (cmpxchg(&pn->state, vcpu_halted, vcpu_hashed) != vcpu_halted)
+	if (cmpxchg_release(&pn->state, vcpu_halted, vcpu_hashed)
+	    != vcpu_halted)
 		return;
 
 	/*
@@ -461,8 +476,8 @@ static void pv_kick_node(struct qspinlock *lock, struct mcs_spinlock *node)
 	}
 
 	/*
-	 * The cmpxchg() or xchg() call before coming here provides the
-	 * acquire semantics for locking. The dummy ORing of _Q_LOCKED_VAL
+	 * The cmpxchg_acquire() or xchg() call before coming here provides
+	 * the acquire semantics for locking. The dummy ORing of _Q_LOCKED_VAL
 	 * here is to indicate to the compiler that the value will always
 	 * be nozero to enable better code optimization.
 	 */
@@ -488,11 +503,12 @@ static void pv_kick_node(struct qspinlock *lock, struct mcs_spinlock *node)
 	}
 
 	/*
-	 * A failed cmpxchg doesn't provide any memory-ordering guarantees,
-	 * so we need a barrier to order the read of the node data in
-	 * pv_unhash *after* we've read the lock being _Q_SLOW_VAL.
+	 * A failed cmpxchg_release doesn't provide any memory-ordering
+	 * guarantees, so we need a barrier to order the read of the node
+	 * data in pv_unhash *after* we've read the lock being _Q_SLOW_VAL.
 	 *
-	 * Matches the cmpxchg() in pv_wait_head_or_lock() setting _Q_SLOW_VAL.
+	 * Matches the cmpxchg_acquire() in pv_wait_head_or_lock() setting
+	 * _Q_SLOW_VAL.
 	 */
 	smp_rmb();
 
-- 
1.8.3.1