Hi Waiman,

On Thu, May 26, 2016 at 02:21:57PM -0400, Waiman Long wrote:
> Currently, calling pv_hash() and setting _Q_SLOW_VAL is only
> done once for any pv_node. It is either in pv_kick_node() or in
> pv_wait_head_or_lock(). Because of lock stealing, a pv_kick'ed node is
> not guaranteed to get the lock before the spinning threshold expires
> and has to call pv_wait() again. As a result, the new lock holder
> won't see _Q_SLOW_VAL and so won't wake up the sleeping vCPU.
> 
> This patch fixes this missed PV wakeup problem by allowing multiple
> _Q_SLOW_VAL settings within pv_wait_head_or_lock() and matching each
> successful setting of _Q_SLOW_VAL to a pv_hash() call.
> 
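
To restate the problem in code: the unlock side only goes looking for a
hashed node when it sees _Q_SLOW_VAL. Roughly (paraphrasing
__pv_queued_spin_unlock() from memory, details elided):

        u8 locked = cmpxchg_release(&l->locked, _Q_LOCKED_VAL, 0);

        if (likely(locked == _Q_LOCKED_VAL))
                return;         /* nobody hashed the lock, no vCPU to kick */

        /* locked was _Q_SLOW_VAL: unhash the node and kick its vCPU */
        __pv_queued_spin_unlock_slowpath(lock, locked);

So if the sleeping vCPU never manages to (re)set _Q_SLOW_VAL, the new
lock holder takes the fast path above and the vCPU is never kicked.
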
> Reported-by: Pan Xinhui <xinhui....@linux.vnet.ibm.com>
> Signed-off-by: Waiman Long <waiman.l...@hpe.com>
> ---
>  kernel/locking/qspinlock_paravirt.h |   48 ++++++++++++++++++++++------------
>  1 files changed, 31 insertions(+), 17 deletions(-)
> 
> diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h
> index 21ede57..452d06d 100644
> --- a/kernel/locking/qspinlock_paravirt.h
> +++ b/kernel/locking/qspinlock_paravirt.h
> @@ -369,12 +369,16 @@ static void pv_kick_node(struct qspinlock *lock, struct mcs_spinlock *node)
>       /*
>        * Put the lock into the hash table and set the _Q_SLOW_VAL.
>        *
> -      * As this is the same vCPU that will check the _Q_SLOW_VAL value and
> -      * the hash table later on at unlock time, no atomic instruction is
> -      * needed.
> +      * It is very unlikely that this will race with the _Q_SLOW_VAL setting
> +      * in pv_wait_head_or_lock(). However, we use cmpxchg() here to be
> +      * sure that we won't do a double pv_hash().
> +      *
> +      * As it is the lock holder, it won't race with
> +      * __pv_queued_spin_unlock().
>        */
> -     WRITE_ONCE(l->locked, _Q_SLOW_VAL);
> -     (void)pv_hash(lock, pn);
> +     if (likely(cmpxchg(&l->locked, _Q_LOCKED_VAL, _Q_SLOW_VAL)
> +                     == _Q_LOCKED_VAL))
> +             pv_hash(lock, pn);
>  }
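
As I read it, the interleaving this cmpxchg() is meant to make safe is
roughly the following (my own sketch, not from the patch):

        pv_kick_node() (new holder)        pv_wait_head_or_lock() (waiter)
        ===========================        ===============================
                                           lp = pv_hash(lock, pn);
                                           xchg(&l->locked, _Q_SLOW_VAL);
        cmpxchg(&l->locked, _Q_LOCKED_VAL,
                _Q_SLOW_VAL);  /* fails, sees _Q_SLOW_VAL */
        /* pv_hash() skipped, so only one
           hash entry exists for this lock */

i.e. once the waiter has published _Q_SLOW_VAL, pv_kick_node() backs
off. The window I'm curious about below is the one where the waiter has
hashed the lock but not yet done the xchg().
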
>  
>  /*
> @@ -389,18 +393,10 @@ pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node)
>  {
>       struct pv_node *pn = (struct pv_node *)node;
>       struct __qspinlock *l = (void *)lock;
> -     struct qspinlock **lp = NULL;
>       int waitcnt = 0;
>       int loop;
>  
>       /*
> -      * If pv_kick_node() already advanced our state, we don't need to
> -      * insert ourselves into the hash table anymore.
> -      */
> -     if (READ_ONCE(pn->state) == vcpu_hashed)
> -             lp = (struct qspinlock **)1;
> -
> -     /*
>        * Tracking # of slowpath locking operations
>        */
>       qstat_inc(qstat_pv_lock_slowpath, true);
> @@ -422,11 +418,19 @@ pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node)
>                               goto gotlock;
>                       cpu_relax();
>               }
> -             clear_pending(lock);
>  
> +             /*
> +              * Make sure the lock value check below is executed after
> +              * all the previous loads.
> +              */
> +             smp_rmb();
>  
> -             if (!lp) { /* ONCE */
> -                     lp = pv_hash(lock, pn);
> +             /*
> +              * Set _Q_SLOW_VAL and hash the PV node, if necessary.
> +              */
> +             if (READ_ONCE(l->locked) != _Q_SLOW_VAL) {
> +                     struct qspinlock **lp = pv_hash(lock, pn);
> +                     u8 locked;
>  

Just out of curiosity, what if the following sequence happens:

        CPU 0                   CPU 1
        =================       ====================
        spin_lock():            spin_lock():
          pv_kick_node():         pv_wait_head_or_lock():
                                  if (READ_ONCE(l->locked) != _Q_SLOW_VAL) { // True
                                    pv_hash();

            cmpxchg(&l->locked, _Q_LOCKED_VAL, _Q_SLOW_VAL);
            pv_hash();
                                    locked = xchg(&l->locked, _Q_SLOW_VAL);
        do_something();             if(...) {
                                    }
        spin_unlock():
          pv_unhash();
                                    else if (unlikely(locked == _Q_SLOW_VAL)) {
                                        WRITE_ONCE(*lp, NULL);

Because pv_hash() on CPU 1 is called before the one on CPU 0, the hash
entry from CPU 1 precedes the hash entry from CPU 0 in the hash table.
So when pv_unhash() is called, it is the entry from CPU 1 that gets
unhashed; however, the WRITE_ONCE(*lp, NULL) on CPU 1 will also unhash
that same entry, leaving the hash entry from CPU 0 still hashed.
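
For reference, my reading of the hash helpers (paraphrased from
qspinlock_paravirt.h, with the qstat and hop-count bookkeeping elided)
is that pv_unhash() frees the *first* matching entry it finds, while
the lp returned by pv_hash() always points at the caller's own slot:

        static struct qspinlock **pv_hash(struct qspinlock *lock, struct pv_node *node)
        {
                unsigned long offset, hash = hash_ptr(lock, pv_lock_hash_bits);
                struct pv_hash_entry *he;

                for_each_hash_entry(he, offset, hash) {
                        if (!cmpxchg(&he->lock, NULL, lock)) {
                                WRITE_ONCE(he->node, node);
                                return &he->lock;       /* the caller's own slot */
                        }
                }
                BUG();  /* hard assume there is a free entry for us */
        }

        static struct pv_node *pv_unhash(struct qspinlock *lock)
        {
                unsigned long offset, hash = hash_ptr(lock, pv_lock_hash_bits);
                struct pv_hash_entry *he;
                struct pv_node *node;

                for_each_hash_entry(he, offset, hash) {
                        if (READ_ONCE(he->lock) == lock) {
                                node = READ_ONCE(he->node);
                                WRITE_ONCE(he->lock, NULL);     /* first match wins */
                                return node;
                        }
                }
                BUG();  /* hard assume we'll always find a matching entry */
        }

With two entries hashed for the same lock, pv_unhash() and the waiter's
WRITE_ONCE(*lp, NULL) can therefore both end up clearing the first
entry, while the second one is left behind.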

This could result in several interesting problems, right? ;-)

Am I missing something here?

Regards,
Boqun

>                       /*
>                        * We must hash before setting _Q_SLOW_VAL, such that
> @@ -439,7 +443,8 @@ pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node)
>                        *
>                        * Matches the smp_rmb() in __pv_queued_spin_unlock().
>                        */
> -                     if (xchg(&l->locked, _Q_SLOW_VAL) == 0) {
> +                     locked = xchg(&l->locked, _Q_SLOW_VAL);
> +                     if (locked == 0) {
>                               /*
>                                * The lock was free and now we own the lock.
>                                * Change the lock value back to _Q_LOCKED_VAL
> @@ -447,9 +452,18 @@ pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node)
>                                */
>                               WRITE_ONCE(l->locked, _Q_LOCKED_VAL);
>                               WRITE_ONCE(*lp, NULL);
> +                             clear_pending(lock);
>                               goto gotlock;
> +                     } else if (unlikely(locked == _Q_SLOW_VAL)) {
> +                             /*
> +                              * Racing with pv_kick_node(), need to undo
> +                              * the pv_hash().
> +                              */
> +                             WRITE_ONCE(*lp, NULL);
>                       }
>               }
> +             clear_pending(lock);    /* Enable lock stealing */
> +
>               WRITE_ONCE(pn->state, vcpu_halted);
>               qstat_inc(qstat_pv_wait_head, true);
>               qstat_inc(qstat_pv_wait_again, waitcnt);
> -- 
> 1.7.1
> 
