On Thu Nov 10, 2022 at 10:44 AM AEST, Jordan Niethe wrote:
> On Thu, 2022-07-28 at 16:31 +1000, Nicholas Piggin wrote:
> [resend as utf-8, not utf-7]
> > Finding the owner or a queued waiter on a lock with a preempted vcpu
> > is indicative of an oversubscribed guest causing the lock to get into
> > trouble. Provide some options to detect this situation and have new
> > CPUs avoid queueing for a longer time (more steal iterations) to
> > minimise the problems caused by vcpu preemption on the queue.
> > ---
> >  arch/powerpc/include/asm/qspinlock_types.h |   7 +-
> >  arch/powerpc/lib/qspinlock.c               | 240 +++++++++++++++++++--
> >  2 files changed, 232 insertions(+), 15 deletions(-)
> > 
> > diff --git a/arch/powerpc/include/asm/qspinlock_types.h b/arch/powerpc/include/asm/qspinlock_types.h
> > index 35f9525381e6..4fbcc8a4230b 100644
> > --- a/arch/powerpc/include/asm/qspinlock_types.h
> > +++ b/arch/powerpc/include/asm/qspinlock_types.h
> > @@ -30,7 +30,7 @@ typedef struct qspinlock {
> >   *
> >   *     0: locked bit
> >   *  1-14: lock holder cpu
> > - *    15: unused bit
> > + *    15: lock owner or queuer vcpus observed to be preempted bit
> >   *    16: must queue bit
> >   * 17-31: tail cpu (+1)
> >   */
> > @@ -49,6 +49,11 @@ typedef struct qspinlock {
> >  #error "qspinlock does not support such large CONFIG_NR_CPUS"
> >  #endif
> >  
> > +#define _Q_SLEEPY_OFFSET   15
> > +#define _Q_SLEEPY_BITS             1
> > +#define _Q_SLEEPY_MASK             _Q_SET_MASK(SLEEPY_OWNER)
> > +#define _Q_SLEEPY_VAL              (1U << _Q_SLEEPY_OFFSET)
> > +
> >  #define _Q_MUST_Q_OFFSET   16
> >  #define _Q_MUST_Q_BITS             1
> >  #define _Q_MUST_Q_MASK             _Q_SET_MASK(MUST_Q)
> > diff --git a/arch/powerpc/lib/qspinlock.c b/arch/powerpc/lib/qspinlock.c
> > index 5cfd69931e31..c18133c01450 100644
> > --- a/arch/powerpc/lib/qspinlock.c
> > +++ b/arch/powerpc/lib/qspinlock.c
> > @@ -5,6 +5,7 @@
> >  #include <linux/percpu.h>
> >  #include <linux/smp.h>
> >  #include <linux/topology.h>
> > +#include <linux/sched/clock.h>
> >  #include <asm/qspinlock.h>
> >  #include <asm/paravirt.h>
> >  
> > @@ -36,24 +37,54 @@ static int HEAD_SPINS __read_mostly = (1<<8);
> >  static bool pv_yield_owner __read_mostly = true;
> >  static bool pv_yield_allow_steal __read_mostly = false;
> >  static bool pv_spin_on_preempted_owner __read_mostly = false;
> > +static bool pv_sleepy_lock __read_mostly = true;
> > +static bool pv_sleepy_lock_sticky __read_mostly = false;
>
> The sticky part could potentially be its own patch.

I'll see how that looks.

> > +static u64 pv_sleepy_lock_interval_ns __read_mostly = 0;
> > +static int pv_sleepy_lock_factor __read_mostly = 256;
> >  static bool pv_yield_prev __read_mostly = true;
> >  static bool pv_yield_propagate_owner __read_mostly = true;
> >  static bool pv_prod_head __read_mostly = false;
> >  
> >  static DEFINE_PER_CPU_ALIGNED(struct qnodes, qnodes);
> > +static DEFINE_PER_CPU_ALIGNED(u64, sleepy_lock_seen_clock);
> >  
> > -static __always_inline int get_steal_spins(bool paravirt, bool remote)
> > +static __always_inline bool recently_sleepy(void)
> > +{
>
> Other users of pv_sleepy_lock_interval_ns first check pv_sleepy_lock.

In this case it should be implied; I've added a comment.
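
Roughly along these lines (sketch only, the exact comment wording may differ
in the next spin):

        /*
         * pv_sleepy_lock is implied here: sleepy_lock_seen_clock is only ever
         * written on the pv_sleepy_lock paths, and the check below is skipped
         * entirely when pv_sleepy_lock_interval_ns is 0.
         */
        static __always_inline bool recently_sleepy(void)
        {
                if (pv_sleepy_lock_interval_ns) {
                        u64 seen = this_cpu_read(sleepy_lock_seen_clock);

                        if (seen) {
                                u64 delta = sched_clock() - seen;
                                if (delta < pv_sleepy_lock_interval_ns)
                                        return true;
                                this_cpu_write(sleepy_lock_seen_clock, 0);
                        }
                }

                return false;
        }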

>
> > +   if (pv_sleepy_lock_interval_ns) {
> > +           u64 seen = this_cpu_read(sleepy_lock_seen_clock);
> > +
> > +           if (seen) {
> > +                   u64 delta = sched_clock() - seen;
> > +                   if (delta < pv_sleepy_lock_interval_ns)
> > +                           return true;
> > +                   this_cpu_write(sleepy_lock_seen_clock, 0);
> > +           }
> > +   }
> > +
> > +   return false;
> > +}
> > +
> > +static __always_inline int get_steal_spins(bool paravirt, bool remote, bool sleepy)
>
> It seems like paravirt is implied by sleepy.
>
> >  {
> >     if (remote) {
> > -           return REMOTE_STEAL_SPINS;
> > +           if (paravirt && sleepy)
> > +                   return REMOTE_STEAL_SPINS * pv_sleepy_lock_factor;
> > +           else
> > +                   return REMOTE_STEAL_SPINS;
> >     } else {
> > -           return STEAL_SPINS;
> > +           if (paravirt && sleepy)
> > +                   return STEAL_SPINS * pv_sleepy_lock_factor;
> > +           else
> > +                   return STEAL_SPINS;
> >     }
> >  }
>
> I think that separate functions would still be nicer, but this could get rid of
> the nested conditionals, like:
>
>
>       int spins;
>       if (remote)
>               spins = REMOTE_STEAL_SPINS;
>       else
>               spins = STEAL_SPINS;
>
>       if (sleepy)
>               return spins * pv_sleepy_lock_factor;
>       return spins;

Yeah it was getting a bit out of hand.
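
Something like this, perhaps (untested sketch, keeping the paravirt test
explicit even though sleepy should imply it):

        static __always_inline int get_steal_spins(bool paravirt, bool remote, bool sleepy)
        {
                int spins;

                if (remote)
                        spins = REMOTE_STEAL_SPINS;
                else
                        spins = STEAL_SPINS;

                /* sleepy is only ever set on the paravirt path */
                if (paravirt && sleepy)
                        spins *= pv_sleepy_lock_factor;

                return spins;
        }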

>
> >  
> > -static __always_inline int get_head_spins(bool paravirt)
> > +static __always_inline int get_head_spins(bool paravirt, bool sleepy)
> >  {
> > -   return HEAD_SPINS;
> > +   if (paravirt && sleepy)
> > +           return HEAD_SPINS * pv_sleepy_lock_factor;
> > +   else
> > +           return HEAD_SPINS;
> >  }
> >  
> >  static inline u32 encode_tail_cpu(void)
> > @@ -206,6 +237,60 @@ static __always_inline u32 lock_clear_mustq(struct qspinlock *lock)
> >     return prev;
> >  }
> >  
> > +static __always_inline bool lock_try_set_sleepy(struct qspinlock *lock, u32 old)
> > +{
> > +   u32 prev;
> > +   u32 new = old | _Q_SLEEPY_VAL;
> > +
> > +   BUG_ON(!(old & _Q_LOCKED_VAL));
> > +   BUG_ON(old & _Q_SLEEPY_VAL);
> > +
> > +   asm volatile(
> > +"1:        lwarx   %0,0,%1         # lock_try_set_sleepy                   
> > \n"
> > +"  cmpw    0,%0,%2                                                 \n"
> > +"  bne-    2f                                                      \n"
> > +"  stwcx.  %3,0,%1                                                 \n"
> > +"  bne-    1b                                                      \n"
> > +"2:                                                                        
> > \n"
> > +   : "=&r" (prev)
> > +   : "r" (&lock->val), "r"(old), "r" (new)
> > +   : "cr0", "memory");
> > +
> > +   if (prev == old)
> > +           return true;
> > +   return false;
> > +}
> > +
> > +static __always_inline void seen_sleepy_owner(struct qspinlock *lock, u32 val)
> > +{
> > +   if (pv_sleepy_lock) {
> > +           if (pv_sleepy_lock_interval_ns)
> > +                   this_cpu_write(sleepy_lock_seen_clock, sched_clock());
> > +           if (!(val & _Q_SLEEPY_VAL))
> > +                   lock_try_set_sleepy(lock, val);
> > +   }
> > +}
> > +
> > +static __always_inline void seen_sleepy_lock(void)
> > +{
> > +   if (pv_sleepy_lock && pv_sleepy_lock_interval_ns)
> > +           this_cpu_write(sleepy_lock_seen_clock, sched_clock());
> > +}
> > +
> > +static __always_inline void seen_sleepy_node(struct qspinlock *lock)
> > +{
>
> If yield_to_prev() was made to take a raw val, that val could be passed to
> seen_sleepy_node(), and it would not need to read lock->val itself.

Yep.
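
i.e., something like this (untested sketch), with yield_to_prev() changed to
take a raw val and hand it down:

        static __always_inline void seen_sleepy_node(struct qspinlock *lock, u32 val)
        {
                if (pv_sleepy_lock) {
                        if (pv_sleepy_lock_interval_ns)
                                this_cpu_write(sleepy_lock_seen_clock, sched_clock());
                        /* Only try to set sleepy if the lock is actually held */
                        if (val & _Q_LOCKED_VAL) {
                                if (!(val & _Q_SLEEPY_VAL))
                                        lock_try_set_sleepy(lock, val);
                        }
                }
        }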

>
> > +   if (pv_sleepy_lock) {
> > +           u32 val = READ_ONCE(lock->val);
> > +
> > +           if (pv_sleepy_lock_interval_ns)
> > +                   this_cpu_write(sleepy_lock_seen_clock, sched_clock());
> > +           if (val & _Q_LOCKED_VAL) {
> > +                   if (!(val & _Q_SLEEPY_VAL))
> > +                           lock_try_set_sleepy(lock, val);
> > +           }
> > +   }
> > +}
> > +
> >  static struct qnode *get_tail_qnode(struct qspinlock *lock, u32 val)
> >  {
> >     int cpu = get_tail_cpu(val);
> > @@ -244,6 +329,7 @@ static __always_inline void __yield_to_locked_owner(struct qspinlock *lock, u32
> >  
> >     spin_end();
> >  
> > +   seen_sleepy_owner(lock, val);
> >     *preempted = true;
> >  
> >     /*
> > @@ -307,11 +393,13 @@ static __always_inline void propagate_yield_cpu(struct qnode *node, u32 val, int
> >     }
> >  }
> >  
> > -static __always_inline void yield_to_prev(struct qspinlock *lock, struct qnode *node, int prev_cpu, bool paravirt)
> > +static __always_inline void yield_to_prev(struct qspinlock *lock, struct qnode *node, int prev_cpu, bool paravirt, bool *preempted)
> >  {
> >     u32 yield_count;
> >     int yield_cpu;
> >  
> > +   *preempted = false;
> > +
> >     if (!paravirt)
> >             goto relax;
> >  
> > @@ -332,6 +420,9 @@ static __always_inline void yield_to_prev(struct qspinlock *lock, struct qnode *
> >  
> >     spin_end();
> >  
> > +   *preempted = true;
> > +   seen_sleepy_node(lock);
> > +
> >     smp_rmb();
> >  
> >     if (yield_cpu == node->yield_cpu) {
> > @@ -353,6 +444,9 @@ static __always_inline void yield_to_prev(struct qspinlock *lock, struct qnode *
> >  
> >     spin_end();
> >  
> > +   *preempted = true;
> > +   seen_sleepy_node(lock);
> > +
> >     smp_rmb(); /* See yield_to_locked_owner comment */
> >  
> >     if (!node->locked) {
> > @@ -369,6 +463,9 @@ static __always_inline void yield_to_prev(struct qspinlock *lock, struct qnode *
> >  
> >  static __always_inline bool try_to_steal_lock(struct qspinlock *lock, bool paravirt)
> >  {
> > +   bool preempted;
> > +   bool seen_preempted = false;
> > +   bool sleepy = false;
> >     int iters = 0;
> >  
> >     if (!STEAL_SPINS) {
> > @@ -376,7 +473,6 @@ static __always_inline bool try_to_steal_lock(struct qspinlock *lock, bool parav
> >                     spin_begin();
> >                     for (;;) {
> >                             u32 val = READ_ONCE(lock->val);
> > -                           bool preempted;
> >  
> >                             if (val & _Q_MUST_Q_VAL)
> >                                     break;
> > @@ -395,7 +491,6 @@ static __always_inline bool try_to_steal_lock(struct qspinlock *lock, bool parav
> >     spin_begin();
> >     for (;;) {
> >             u32 val = READ_ONCE(lock->val);
> > -           bool preempted;
> >  
> >             if (val & _Q_MUST_Q_VAL)
> >                     break;
> > @@ -408,9 +503,29 @@ static __always_inline bool try_to_steal_lock(struct qspinlock *lock, bool parav
> >                     continue;
> >             }
> >  
> > +           if (paravirt && pv_sleepy_lock && !sleepy) {
> > +                   if (!sleepy) {
>
> The enclosing conditional means this would always be true. I think the outer
> conditional should be
> if (paravirt && pv_sleepy_lock)
> otherwise the pv_sleepy_lock_sticky part wouldn't work properly.

Good catch, I think you're right.
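
So the outer test becomes something like (untested sketch):

        if (paravirt && pv_sleepy_lock) {
                if (!sleepy) {
                        if (val & _Q_SLEEPY_VAL) {
                                seen_sleepy_lock();
                                sleepy = true;
                        } else if (recently_sleepy()) {
                                sleepy = true;
                        }
                }

                /* sticky can mark the lock sleepy even after the owner runs again */
                if (pv_sleepy_lock_sticky && seen_preempted &&
                    !(val & _Q_SLEEPY_VAL)) {
                        if (lock_try_set_sleepy(lock, val))
                                val |= _Q_SLEEPY_VAL;
                }
        }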
>
>
> > +                           if (val & _Q_SLEEPY_VAL) {
> > +                                   seen_sleepy_lock();
> > +                                   sleepy = true;
> > +                           } else if (recently_sleepy()) {
> > +                                   sleepy = true;
> > +                           }
> > +
> > +                   if (pv_sleepy_lock_sticky && seen_preempted &&
> > +                                   !(val & _Q_SLEEPY_VAL)) {
> > +                           if (lock_try_set_sleepy(lock, val))
> > +                                   val |= _Q_SLEEPY_VAL;
> > +                   }
> > +
> > +
> >             yield_to_locked_owner(lock, val, paravirt, &preempted);
> > +           if (preempted)
> > +                   seen_preempted = true;
>
> This could belong to the next if statement; there cannot be !paravirt &&
> preempted?

Yep.
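
Since preempted is only ever set on the paravirt path, the minimal form would
be something like (sketch; the following if statement is trimmed from the
quote above):

                yield_to_locked_owner(lock, val, paravirt, &preempted);
                if (paravirt && preempted)
                        seen_preempted = true;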

Thanks,
Nick
