Hello Peter

On 2016年04月20日 20:08, Peter Zijlstra wrote:
> On Thu, Apr 14, 2016 at 02:41:58PM -0400, Waiman Long wrote:
>> Pan Xinhui was asking for a lock holder cpu argument in pv_wait()
>> to help the porting of pvqspinlock to PPC. The new argument can
>> potentially help the hypervisor expedite the execution of the critical
>> section so that the lock holder vCPU can release the lock sooner.
>>
>> This patch does just that by storing the previous node vCPU number.
>> In pv_wait_head_or_lock(), pv_wait() will be called with that vCPU
>> number as it is likely to be the lock holder.
>>
>> In pv_wait_node(), the newly added pv_lookup_hash() function will
>> be called to look up the queue head and pass in the lock holder vCPU
>> number stored there.
>>
>> This patch introduces negligible overhead to the current pvqspinlock
>> code. The extra lockcpu argument isn't currently used in x86
>> architecture.
> 
> This Changelog is completely useless; it does not explain how this
> works at all.
> 
> 
>> diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
>> index ce2f75e..99f31e4 100644
>> --- a/kernel/locking/qspinlock.c
>> +++ b/kernel/locking/qspinlock.c
>> @@ -248,7 +248,8 @@ static __always_inline void set_locked(struct qspinlock 
>> *lock)
>>   */
>>  
>>  static __always_inline void __pv_init_node(struct mcs_spinlock *node) { }
>> -static __always_inline void __pv_wait_node(struct mcs_spinlock *node,
>> +static __always_inline void __pv_wait_node(struct qspinlock *lock,
>> +                                       struct mcs_spinlock *node,
>>                                         struct mcs_spinlock *prev) { }
>>  static __always_inline void __pv_kick_node(struct qspinlock *lock,
>>                                         struct mcs_spinlock *node) { }
>> @@ -407,7 +408,7 @@ queue:
>>              prev = decode_tail(old);
>>              WRITE_ONCE(prev->next, node);
>>  
>> -            pv_wait_node(node, prev);
>> +            pv_wait_node(lock, node, prev);
>>              arch_mcs_spin_lock_contended(&node->locked);
>>  
>>              /*
>> diff --git a/kernel/locking/qspinlock_paravirt.h 
>> b/kernel/locking/qspinlock_paravirt.h
>> index 21ede57..895224e 100644
>> --- a/kernel/locking/qspinlock_paravirt.h
>> +++ b/kernel/locking/qspinlock_paravirt.h
>> @@ -51,6 +51,7 @@ struct pv_node {
>>      struct mcs_spinlock     __res[3];
>>  
>>      int                     cpu;
>> +    int                     prev_cpu;       /* Previous node cpu */
> 
> That is a horrible name; what is a 'node cpu'.
> 
>>      u8                      state;
>>  };
>>  
>> @@ -156,8 +157,7 @@ static __always_inline int trylock_clear_pending(struct 
>> qspinlock *lock)
>>   * 256 (64-bit) or 512 (32-bit) to fully utilize a 4k page.
>>   *
>>   * Since we should not be holding locks from NMI context (very rare indeed) 
>> the
>> - * max load factor is 0.75, which is around the point where open addressing
>> - * breaks down.
>> + * max load factor is 0.75.
> 
> Why? Isn't that true anymore?
> 
>>   *
>>   */
>>  struct pv_hash_entry {
>> @@ -251,6 +251,31 @@ static struct pv_node *pv_unhash(struct qspinlock *lock)
>>  }
>>  
>>  /*
>> + * Look up the given lock in the hash table
>> + * Return the pv_node if found, NULL otherwise
>> + */
>> +static struct pv_node *pv_lookup_hash(struct qspinlock *lock)
>> +{
>> +    unsigned long offset, hash = hash_ptr(lock, pv_lock_hash_bits);
>> +    struct pv_hash_entry *he;
>> +
>> +    for_each_hash_entry(he, offset, hash) {
>> +            struct qspinlock *l = READ_ONCE(he->lock);
>> +
>> +            if (l == lock)
> 
> The other loop writes:
> 
>               if (READ_ONCE(he->lock) == lock)
> 
Maybe because we check whether l is NULL later, so this saves one load.

>> +                    return READ_ONCE(he->node);
>> +            /*
>> +             * Presence of an empty slot signal the end of search. We
>> +             * may miss the entry, but that will limit the amount of
>> +             * time doing the search when the desired entry isn't there.
>> +             */
>> +            else if (!l)
>> +                    break;
> 
> That 'else' is entirely pointless. Also, why isn't this: return NULL;
> 
>> +    }
>> +    return NULL;
> 
> and this BUG() ?
> 
It's not a bug; the lock might not be stored in the hashtable. In the unlock
function we unhash the lock, so the following can happen:

cpu1                            cpu2                            cpu3
pv_kick_node                    pv_wait_head_or_lock            pv_wait_node

                                WRITE_ONCE(pn->state, vcpu_running);
if (cmpxchg(&pn->state,
    vcpu_halted, vcpu_hashed) != vcpu_halted)
        return;
                                                                pv_lookup_hash //no lock in hashtable

So there is a case where we search the whole hashtable and the lock is not
found. :(
Waiman assumes that if l == NULL, the lock is not stored; however, the lock
might actually be there.
But breaking out at the first empty slot lets the lookup finish quickly, which
avoids the worst case I just mentioned above.
So I agree with Waiman.

> +}
>> +
>> +/*
>>   * Return true if when it is time to check the previous node which is not
>>   * in a running state.
>>   */
>> @@ -275,6 +300,7 @@ static void pv_init_node(struct mcs_spinlock *node)
>>  
>>      pn->cpu = smp_processor_id();
>>      pn->state = vcpu_running;
>> +    pn->prev_cpu = -1;
> 
> This does not match the struct element order.
> 
>>  }
>>  
>>  /*
>> @@ -282,7 +308,8 @@ static void pv_init_node(struct mcs_spinlock *node)
>>   * pv_kick_node() is used to set _Q_SLOW_VAL and fill in hash table on its
>>   * behalf.
>>   */
>> -static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock 
>> *prev)
>> +static void pv_wait_node(struct qspinlock *lock, struct mcs_spinlock *node,
>> +                     struct mcs_spinlock *prev)
>>  {
>>      struct pv_node *pn = (struct pv_node *)node;
>>      struct pv_node *pp = (struct pv_node *)prev;
>> @@ -290,6 +317,8 @@ static void pv_wait_node(struct mcs_spinlock *node, 
>> struct mcs_spinlock *prev)
>>      int loop;
>>      bool wait_early;
>>  
>> +    pn->prev_cpu = pp->cpu; /* Save previous node vCPU */
> 
> again a useless comment.
> 
>> +
>>      /* waitcnt processing will be compiled out if !QUEUED_LOCK_STAT */
>>      for (;; waitcnt++) {
>>              for (wait_early = false, loop = SPIN_THRESHOLD; loop; loop--) {
>> @@ -314,10 +343,21 @@ static void pv_wait_node(struct mcs_spinlock *node, 
>> struct mcs_spinlock *prev)
>>              smp_store_mb(pn->state, vcpu_halted);
>>  
>>              if (!READ_ONCE(node->locked)) {
>> +                    struct pv_node *hn;
>> +
>>                      qstat_inc(qstat_pv_wait_node, true);
>>                      qstat_inc(qstat_pv_wait_again, waitcnt);
>>                      qstat_inc(qstat_pv_wait_early, wait_early);
>> -                    pv_wait(&pn->state, vcpu_halted);
>> +
>> +                    /*
>> +                     * We try to locate the queue head pv_node by looking
>> +                     * up the hash table. If it is not found, use the
>> +                     * CPU in the previous node instead.
>> +                     */
>> +                    hn = pv_lookup_hash(lock);
>> +                    if (!hn)
>> +                            hn = pn;
> 
> This is potentially expensive... it does not explain why this lookup can
> fail etc.. nor mentioned that lock stealing caveat.
>
Yes, it's expensive. Normally, PPC phyp doesn't always need the correct holder.
That means the current vcpu can just give up its slice.
There is an LPAR hcall, H_CONFER, for this. I have pasted part of the spec below.

hcall (const uint32 H_CONFER, /* Confer the calling virtual processor’s cycles
                                 to the specified processor */
       int32 proc,            /* Target processor number -- minus 1 is all
                                 partition processors */
       uint32 dispatch);      /* The dispatch number (ignored if proc=caller) */

So we really don't need the correct holder all the time. :)

>> +                    pv_wait(&pn->state, vcpu_halted, hn->prev_cpu);
>>              }
>>  
>>              /*
>> @@ -453,7 +493,15 @@ pv_wait_head_or_lock(struct qspinlock *lock, struct 
>> mcs_spinlock *node)
>>              WRITE_ONCE(pn->state, vcpu_halted);
>>              qstat_inc(qstat_pv_wait_head, true);
>>              qstat_inc(qstat_pv_wait_again, waitcnt);
>> -            pv_wait(&l->locked, _Q_SLOW_VAL);
>> +
>> +            /*
>> +             * Pass in the previous node vCPU nmber which is likely to be
>> +             * the lock holder vCPU. This additional information may help
>> +             * the hypervisor to give more resource to that vCPU so that
>> +             * it can release the lock faster. With lock stealing,
>> +             * however, that vCPU may not be the actual lock holder.
>> +             */
>> +            pv_wait(&l->locked, _Q_SLOW_VAL, pn->prev_cpu);
> 
> 
> urgh..
> 
> 
> With all the holes in, does it really still matter?
> 
> In any case, I would really only want to see this together with the
> patches that make use of it, and then still have it have numbers with
> and without this thing.
> 
> 

I am preparing the patches and will try to send them out this week. :)

> 

Reply via email to