On Thu, 2022-07-28 at 16:31 +1000, Nicholas Piggin wrote:
> Having all CPUs poll the lock word for the owner CPU that should be
> yielded to defeats most of the purpose of using MCS queueing for
> scalability. Yet it may be desirable for queued waiters to yield
> to a preempted owner.
> 
> s390 addresses this problem by having queued waiters sample the lock
> word to find the owner much less frequently. In this approach, the
> waiters never sample it directly, but the queue head propagates the
> owner CPU back to the next waiter if it ever finds the owner has
> been preempted. Queued waiters then subsequently propagate the owner
> CPU back to the next waiter, and so on.
> 
> Disable this option by default for now, i.e., no logical change.
> ---
>  arch/powerpc/lib/qspinlock.c | 85 +++++++++++++++++++++++++++++++++++-
>  1 file changed, 84 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/powerpc/lib/qspinlock.c b/arch/powerpc/lib/qspinlock.c
> index 94f007f66942..28c85a2d5635 100644
> --- a/arch/powerpc/lib/qspinlock.c
> +++ b/arch/powerpc/lib/qspinlock.c
> @@ -12,6 +12,7 @@
>  struct qnode {
>  	struct qnode *next;
>  	struct qspinlock *lock;
> +	int yield_cpu;
>  	u8 locked; /* 1 if lock acquired */
>  };
>  
> @@ -28,6 +29,7 @@ static int HEAD_SPINS __read_mostly = (1<<8);
>  static bool pv_yield_owner __read_mostly = true;
>  static bool pv_yield_allow_steal __read_mostly = false;
>  static bool pv_yield_prev __read_mostly = true;
> +static bool pv_yield_propagate_owner __read_mostly = true;
This also seems to be enabled by default, even though the commit message says
the option is disabled by default for now.

>  
>  static DEFINE_PER_CPU_ALIGNED(struct qnodes, qnodes);
>  
> @@ -257,13 +259,66 @@ static __always_inline void yield_head_to_locked_owner(struct qspinlock *lock, u
>  	__yield_to_locked_owner(lock, val, paravirt, clear_mustq);
>  }
>  
> +static __always_inline void propagate_yield_cpu(struct qnode *node, u32 val, int *set_yield_cpu, bool paravirt)
> +{
> +	struct qnode *next;
> +	int owner;
> +
> +	if (!paravirt)
> +		return;
> +	if (!pv_yield_propagate_owner)
> +		return;
> +
> +	owner = get_owner_cpu(val);
> +	if (*set_yield_cpu == owner)
> +		return;
> +
> +	next = READ_ONCE(node->next);
> +	if (!next)
> +		return;
> +
> +	if (vcpu_is_preempted(owner)) {

Is there a difference between using vcpu_is_preempted() here and checking
bit 0 of the yield count as is done in other places? (There is a rough
sketch of what I mean at the end of this mail.)

> +		next->yield_cpu = owner;
> +		*set_yield_cpu = owner;
> +	} else if (*set_yield_cpu != -1) {

It might be worth giving the -1 CPU a #define (also sketched at the end of
this mail).

> +		next->yield_cpu = owner;
> +		*set_yield_cpu = owner;
> +	}
> +}

Does this need to pass set_yield_cpu by reference? Couldn't its new value be
returned instead? To me that would make it clearer that the function exists
to update set_yield_cpu. I think this would work:

int set_yield_cpu = -1;

static __always_inline int propagate_yield_cpu(struct qnode *node, u32 val,
					       int set_yield_cpu, bool paravirt)
{
	struct qnode *next;
	int owner;

	if (!paravirt)
		goto out;
	if (!pv_yield_propagate_owner)
		goto out;

	owner = get_owner_cpu(val);
	if (set_yield_cpu == owner)
		goto out;

	next = READ_ONCE(node->next);
	if (!next)
		goto out;

	if (vcpu_is_preempted(owner)) {
		next->yield_cpu = owner;
		return owner;
	} else if (set_yield_cpu != -1) {
		next->yield_cpu = owner;
		return owner;
	}

out:
	return set_yield_cpu;
}

set_yield_cpu = propagate_yield_cpu(... set_yield_cpu ...);

> +
>  static __always_inline void yield_to_prev(struct qspinlock *lock, struct qnode *node, int prev_cpu, bool paravirt)
>  {
>  	u32 yield_count;
> +	int yield_cpu;
>  
>  	if (!paravirt)
>  		goto relax;
>  
> +	if (!pv_yield_propagate_owner)
> +		goto yield_prev;
> +
> +	yield_cpu = READ_ONCE(node->yield_cpu);
> +	if (yield_cpu == -1) {
> +		/* Propagate back the -1 CPU */
> +		if (node->next && node->next->yield_cpu != -1)
> +			node->next->yield_cpu = yield_cpu;
> +		goto yield_prev;
> +	}
> +
> +	yield_count = yield_count_of(yield_cpu);
> +	if ((yield_count & 1) == 0)
> +		goto yield_prev; /* owner vcpu is running */
> +
> +	smp_rmb();
> +
> +	if (yield_cpu == node->yield_cpu) {
> +		if (node->next && node->next->yield_cpu != yield_cpu)
> +			node->next->yield_cpu = yield_cpu;
> +		yield_to_preempted(yield_cpu, yield_count);
> +		return;
> +	}
> +
> +yield_prev:
>  	if (!pv_yield_prev)
>  		goto relax;
>  
> @@ -337,6 +392,7 @@ static __always_inline void queued_spin_lock_mcs_queue(struct qspinlock *lock, b
>  	node = &qnodesp->nodes[idx];
>  	node->next = NULL;
>  	node->lock = lock;
> +	node->yield_cpu = -1;
>  	node->locked = 0;
>  
>  	tail = encode_tail_cpu();
> @@ -358,13 +414,21 @@ static __always_inline void queued_spin_lock_mcs_queue(struct qspinlock *lock, b
>  		while (!node->locked)
>  			yield_to_prev(lock, node, prev_cpu, paravirt);
>  
> +		/* Clear out stale propagated yield_cpu */
> +		if (paravirt && pv_yield_propagate_owner && node->yield_cpu != -1)
> +			node->yield_cpu = -1;
> +
>  		smp_rmb(); /* acquire barrier for the mcs lock */
>  	}
>  
>  	if (!MAYBE_STEALERS) {
> +		int set_yield_cpu = -1;
> +
>  		/* We're at the head of the waitqueue, wait for the lock. */
> -		while ((val = READ_ONCE(lock->val)) & _Q_LOCKED_VAL)
> +		while ((val = READ_ONCE(lock->val)) & _Q_LOCKED_VAL) {
> +			propagate_yield_cpu(node, val, &set_yield_cpu, paravirt);
>  			yield_head_to_locked_owner(lock, val, paravirt, false);
> +		}
>  
>  		/* If we're the last queued, must clean up the tail. */
>  		if ((val & _Q_TAIL_CPU_MASK) == tail) {
> @@ -376,12 +440,14 @@ static __always_inline void queued_spin_lock_mcs_queue(struct qspinlock *lock, b
>  		/* We must be the owner, just set the lock bit and acquire */
>  		lock_set_locked(lock);
>  	} else {
> +		int set_yield_cpu = -1;
>  		int iters = 0;
>  		bool set_mustq = false;
>  
>  again:
>  		/* We're at the head of the waitqueue, wait for the lock. */
>  		while ((val = READ_ONCE(lock->val)) & _Q_LOCKED_VAL) {
> +			propagate_yield_cpu(node, val, &set_yield_cpu, paravirt);
>  			yield_head_to_locked_owner(lock, val, paravirt,
>  					pv_yield_allow_steal && set_mustq);
>  
> @@ -540,6 +606,22 @@ static int pv_yield_prev_get(void *data, u64 *val)
>  
>  DEFINE_SIMPLE_ATTRIBUTE(fops_pv_yield_prev, pv_yield_prev_get, pv_yield_prev_set, "%llu\n");
>  
> +static int pv_yield_propagate_owner_set(void *data, u64 val)
> +{
> +	pv_yield_propagate_owner = !!val;
> +
> +	return 0;
> +}
> +
> +static int pv_yield_propagate_owner_get(void *data, u64 *val)
> +{
> +	*val = pv_yield_propagate_owner;
> +
> +	return 0;
> +}
> +
> +DEFINE_SIMPLE_ATTRIBUTE(fops_pv_yield_propagate_owner, pv_yield_propagate_owner_get, pv_yield_propagate_owner_set, "%llu\n");
> +
>  static __init int spinlock_debugfs_init(void)
>  {
>  	debugfs_create_file("qspl_steal_spins", 0600, arch_debugfs_dir, NULL, &fops_steal_spins);
> @@ -548,6 +630,7 @@ static __init int spinlock_debugfs_init(void)
>  		debugfs_create_file("qspl_pv_yield_owner", 0600, arch_debugfs_dir, NULL, &fops_pv_yield_owner);
>  		debugfs_create_file("qspl_pv_yield_allow_steal", 0600, arch_debugfs_dir, NULL, &fops_pv_yield_allow_steal);
>  		debugfs_create_file("qspl_pv_yield_prev", 0600, arch_debugfs_dir, NULL, &fops_pv_yield_prev);
> +		debugfs_create_file("qspl_pv_yield_propagate_owner", 0600, arch_debugfs_dir, NULL, &fops_pv_yield_propagate_owner);
>  	}
>  
>  	return 0;
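
To make the vcpu_is_preempted() question above concrete, this is roughly the
distinction I have in mind. The helper below is only an untested sketch and
the name owner_vcpu_is_preempted() is made up; it just reuses the
yield_count_of() bit-0 idiom that yield_to_prev() applies to the propagated
CPU in this patch:

/*
 * Sketch only: the "yield count is odd => vCPU is preempted" test that
 * yield_to_prev() does on yield_cpu, applied to the lock owner instead of
 * calling vcpu_is_preempted().
 */
static __always_inline bool owner_vcpu_is_preempted(int owner)
{
	u32 yield_count = yield_count_of(owner);

	return (yield_count & 1) != 0;
}

i.e. propagate_yield_cpu() could test owner_vcpu_is_preempted(owner) rather
than vcpu_is_preempted(owner), if the two are meant to be equivalent here.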
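
And for the -1 #define suggestion, something like this (the name is only a
placeholder):

/* Sketch: name the "no owner to yield to" sentinel instead of a raw -1 */
#define YIELD_CPU_NONE	(-1)

so that, for example, the initialisation and the propagation check would read

	node->yield_cpu = YIELD_CPU_NONE;
	...
	if (yield_cpu == YIELD_CPU_NONE) {
		/* Propagate back the "no owner" value */
		...
	}

which I find a little clearer than the bare -1.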