On Mon, Jan 23, 2017 at 01:35:18PM -0800, Lance Roy wrote:
> SRCU uses two per-cpu counters: a nesting counter to count the number of
> active critical sections, and a sequence counter to ensure that the nesting
> counters don't change while they are being added together in
> srcu_readers_active_idx_check().
>
> This patch instead uses per-cpu lock and unlock counters. Because both
> counters only increase and srcu_readers_active_idx_check() reads the unlock
> counter before the lock counter, this achieves the same end without having
> to increment two different counters in srcu_read_lock(). This also saves a
> smp_mb() in srcu_readers_active_idx_check().
>
> Possible bug: There is no guarantee that the lock counter won't overflow
> during srcu_readers_active_idx_check(), as there are no memory barriers
> around srcu_flip() (see comment in srcu_readers_active_idx_check() for
> details). However, this problem was already present before this patch.
>
> Suggested-by: Mathieu Desnoyers <mathieu.desnoy...@efficios.com>
> Signed-off-by: Lance Roy <ldr...@gmail.com>
> Cc: Paul E. McKenney <paul...@linux.vnet.ibm.com>
> Cc: Lai Jiangshan <jiangshan...@gmail.com>
> Cc: Peter Zijlstra <pet...@infradead.org>
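As a reference point, the counting scheme described in the commit message can
be pictured with a minimal userspace sketch. This is not the kernel code:
srcu_sketch, NR_FAKE_CPUS, and the sketch_* names are invented for
illustration, C11 seq_cst fences stand in for smp_mb() A/B/C, and the
two-phase index flip (srcu_flip()) is omitted. Each reader increments
lock_count[] on entry and unlock_count[] on exit, and the grace-period check
sums the unlock counters before the lock counters:

/*
 * Minimal userspace sketch of per-CPU lock/unlock counting (illustration
 * only, hypothetical names, not the kernel implementation).
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define NR_FAKE_CPUS 4

struct srcu_sketch {
        atomic_ulong lock_count[NR_FAKE_CPUS][2];
        atomic_ulong unlock_count[NR_FAKE_CPUS][2];
        atomic_int completed;           /* low bit selects the current index */
};

static int sketch_read_lock(struct srcu_sketch *sp, int cpu)
{
        int idx = atomic_load(&sp->completed) & 0x1;

        atomic_fetch_add(&sp->lock_count[cpu][idx], 1);
        atomic_thread_fence(memory_order_seq_cst);      /* ~ smp_mb() B */
        return idx;
}

static void sketch_read_unlock(struct srcu_sketch *sp, int cpu, int idx)
{
        atomic_thread_fence(memory_order_seq_cst);      /* ~ smp_mb() C */
        atomic_fetch_add(&sp->unlock_count[cpu][idx], 1);
}

/* True if every reader counted on @idx has also been counted as unlocked. */
static bool sketch_readers_gone(struct srcu_sketch *sp, int idx)
{
        unsigned long locks = 0, unlocks = 0;
        int cpu;

        /* Sum the unlock counters first... */
        for (cpu = 0; cpu < NR_FAKE_CPUS; cpu++)
                unlocks += atomic_load(&sp->unlock_count[cpu][idx]);

        atomic_thread_fence(memory_order_seq_cst);      /* ~ smp_mb() A */

        /* ...then the lock counters, so no unlock is counted without its lock. */
        for (cpu = 0; cpu < NR_FAKE_CPUS; cpu++)
                locks += atomic_load(&sp->lock_count[cpu][idx]);

        return locks == unlocks;
}

int main(void)
{
        static struct srcu_sketch sp;
        int idx = sketch_read_lock(&sp, 0);

        printf("readers drained? %d\n", sketch_readers_gone(&sp, idx)); /* 0 */
        sketch_read_unlock(&sp, 0, idx);
        printf("readers drained? %d\n", sketch_readers_gone(&sp, idx)); /* 1 */
        return 0;
}

Because both counters only increase, summing unlocks before locks means any
reader present in the lock sum but missing from the unlock sum was still in
(or only just left) its critical section for that index.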
OK, this has differences only in the comments, so I can reasonably
substitute it, even this near the next merge window.

But let's talk about the potential overflow. If I understand correctly, for
this to happen there need to be ULONG_MAX/2 or thereabouts srcu_read_lock()
calls without matching srcu_read_unlock() calls. Let's focus on 32-bit
systems for the moment. Lockdep allows at most 48 locks held at a given time
by any single task, and RCU does not pass in a non-NULL nest_lock -- if you
acquire more than that, a lockdep kernel will splat. Each task has at least
one 4K page of kernel stack, so there can be at most 1,048,576 tasks
(actually quite a bit fewer due to the size of the task_struct and so on).
This allows at most 50,331,648 unmatched srcu_read_lock() calls in the
system, which is not sufficient to cause overflow. (The arithmetic is
spelled out in the sketch after the quoted patch.) Or am I missing something
here?

                                                        Thanx, Paul

> ---
>  include/linux/srcu.h    |  10 ++--
>  kernel/rcu/rcutorture.c |  19 +++++++-
>  kernel/rcu/srcu.c       | 122 +++++++++++++++++-------------------------
>  3 files changed, 66 insertions(+), 85 deletions(-)
>
> diff --git a/include/linux/srcu.h b/include/linux/srcu.h
> index dc8eb63..a598cf3 100644
> --- a/include/linux/srcu.h
> +++ b/include/linux/srcu.h
> @@ -33,9 +33,9 @@
> #include <linux/rcupdate.h>
> #include <linux/workqueue.h>
>
> -struct srcu_struct_array {
> - unsigned long c[2];
> - unsigned long seq[2];
> +struct srcu_array {
> + unsigned long lock_count[2];
> + unsigned long unlock_count[2];
> };
>
> struct rcu_batch {
> @@ -46,7 +46,7 @@ struct rcu_batch {
>
> struct srcu_struct {
> unsigned long completed;
> - struct srcu_struct_array __percpu *per_cpu_ref;
> + struct srcu_array __percpu *per_cpu_ref;
> spinlock_t queue_lock; /* protect ->batch_queue, ->running */
> bool running;
> /* callbacks just queued */
> @@ -118,7 +118,7 @@ void process_srcu(struct work_struct *work);
> * See include/linux/percpu-defs.h for the rules on per-CPU variables.
> */
> #define __DEFINE_SRCU(name, is_static) \
> - static DEFINE_PER_CPU(struct srcu_struct_array, name##_srcu_array);\
> + static DEFINE_PER_CPU(struct srcu_array, name##_srcu_array);\
> is_static struct srcu_struct name = __SRCU_STRUCT_INIT(name)
> #define DEFINE_SRCU(name) __DEFINE_SRCU(name, /* not static */)
> #define DEFINE_STATIC_SRCU(name) __DEFINE_SRCU(name, static)
> diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
> index 87c5122..d81345b 100644
> --- a/kernel/rcu/rcutorture.c
> +++ b/kernel/rcu/rcutorture.c
> @@ -564,10 +564,25 @@ static void srcu_torture_stats(void)
> pr_alert("%s%s per-CPU(idx=%d):",
> torture_type, TORTURE_FLAG, idx);
> for_each_possible_cpu(cpu) {
> + unsigned long l0, l1;
> + unsigned long u0, u1;
> long c0, c1;
> + struct srcu_array *counts = per_cpu_ptr(srcu_ctlp->per_cpu_ref, cpu);
>
> - c0 = (long)per_cpu_ptr(srcu_ctlp->per_cpu_ref, cpu)->c[!idx];
> - c1 = (long)per_cpu_ptr(srcu_ctlp->per_cpu_ref, cpu)->c[idx];
> + u0 = counts->unlock_count[!idx];
> + u1 = counts->unlock_count[idx];
> +
> + /*
> + * Make sure that a lock is always counted if the corresponding
> + * unlock is counted.
> + */
> + smp_rmb();
> +
> + l0 = counts->lock_count[!idx];
> + l1 = counts->lock_count[idx];
> +
> + c0 = l0 - u0;
> + c1 = l1 - u1;
> pr_cont(" %d(%ld,%ld)", cpu, c0, c1);
> }
> pr_cont("\n");
> diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c
> index 9b9cdd5..c9a0015 100644
> --- a/kernel/rcu/srcu.c
> +++ b/kernel/rcu/srcu.c
> @@ -106,7 +106,7 @@ static int init_srcu_struct_fields(struct srcu_struct *sp)
> rcu_batch_init(&sp->batch_check1);
> rcu_batch_init(&sp->batch_done);
> INIT_DELAYED_WORK(&sp->work, process_srcu);
> - sp->per_cpu_ref = alloc_percpu(struct srcu_struct_array);
> + sp->per_cpu_ref = alloc_percpu(struct srcu_array);
> return sp->per_cpu_ref ? 0 : -ENOMEM;
> }
>
> @@ -141,114 +141,77 @@ EXPORT_SYMBOL_GPL(init_srcu_struct);
> #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */
>
> /*
> - * Returns approximate total of the readers' ->seq[] values for the
> + * Returns approximate total of the readers' ->lock_count[] values for the
> * rank of per-CPU counters specified by idx.
> */
> -static unsigned long srcu_readers_seq_idx(struct srcu_struct *sp, int idx)
> +static unsigned long srcu_readers_lock_idx(struct srcu_struct *sp, int idx)
> {
> int cpu;
> unsigned long sum = 0;
> - unsigned long t;
>
> for_each_possible_cpu(cpu) {
> - t = READ_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->seq[idx]);
> - sum += t;
> + struct srcu_array *cpuc = per_cpu_ptr(sp->per_cpu_ref, cpu);
> +
> + sum += READ_ONCE(cpuc->lock_count[idx]);
> }
> return sum;
> }
>
> /*
> - * Returns approximate number of readers active on the specified rank
> - * of the per-CPU ->c[] counters.
> + * Returns approximate total of the readers' ->unlock_count[] values for the
> + * rank of per-CPU counters specified by idx.
> */
> -static unsigned long srcu_readers_active_idx(struct srcu_struct *sp, int idx)
> +static unsigned long srcu_readers_unlock_idx(struct srcu_struct *sp, int idx)
> {
> int cpu;
> unsigned long sum = 0;
> - unsigned long t;
>
> for_each_possible_cpu(cpu) {
> - t = READ_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[idx]);
> - sum += t;
> + struct srcu_array *cpuc = per_cpu_ptr(sp->per_cpu_ref, cpu);
> +
> + sum += READ_ONCE(cpuc->unlock_count[idx]);
> }
> return sum;
> }
>
> /*
> * Return true if the number of pre-existing readers is determined to
> - * be stably zero. An example unstable zero can occur if the call
> - * to srcu_readers_active_idx() misses an __srcu_read_lock() increment,
> - * but due to task migration, sees the corresponding __srcu_read_unlock()
> - * decrement. This can happen because srcu_readers_active_idx() takes
> - * time to sum the array, and might in fact be interrupted or preempted
> - * partway through the summation.
> + * be zero.
> */
> static bool srcu_readers_active_idx_check(struct srcu_struct *sp, int idx)
> {
> - unsigned long seq;
> + unsigned long unlocks;
>
> - seq = srcu_readers_seq_idx(sp, idx);
> + unlocks = srcu_readers_unlock_idx(sp, idx);
>
> /*
> - * The following smp_mb() A pairs with the smp_mb() B located in
> - * __srcu_read_lock(). This pairing ensures that if an
> - * __srcu_read_lock() increments its counter after the summation
> - * in srcu_readers_active_idx(), then the corresponding SRCU read-side
> - * critical section will see any changes made prior to the start
> - * of the current SRCU grace period.
> + * Make sure that a lock is always counted if the corresponding unlock
> + * is counted. Needs to be a smp_mb() as the read side may contain a
> + * read from a variable that is written to before the synchronize_srcu()
> + * in the write side. In this case smp_mb()s A and B act like the store
> + * buffering pattern.
> *
> - * Also, if the above call to srcu_readers_seq_idx() saw the
> - * increment of ->seq[], then the call to srcu_readers_active_idx()
> - * must see the increment of ->c[].
> + * This smp_mb() also pairs with smp_mb() C to prevent accesses after the
> + * synchronize_srcu() from being executed before the grace period ends.
> */
> smp_mb(); /* A */
>
> /*
> - * Note that srcu_readers_active_idx() can incorrectly return
> - * zero even though there is a pre-existing reader throughout.
> - * To see this, suppose that task A is in a very long SRCU
> - * read-side critical section that started on CPU 0, and that
> - * no other reader exists, so that the sum of the counters
> - * is equal to one. Then suppose that task B starts executing
> - * srcu_readers_active_idx(), summing up to CPU 1, and then that
> - * task C starts reading on CPU 0, so that its increment is not
> - * summed, but finishes reading on CPU 2, so that its decrement
> - * -is- summed. Then when task B completes its sum, it will
> - * incorrectly get zero, despite the fact that task A has been
> - * in its SRCU read-side critical section the whole time.
> + * If the locks are the same as the unlocks, then there must have
> + * been no readers on this index at some time in between. This does not
> + * mean that there are no more readers, as one could have read the
> + * current index but not have incremented the lock counter yet.
> *
> - * We therefore do a validation step should srcu_readers_active_idx()
> - * return zero.
> + * Possible bug: There is no guarantee that there haven't been ULONG_MAX
> + * increments of ->lock_count[] since the unlocks were counted, meaning
> + * that this could return true even if there are still active readers.
> + * Since there are no memory barriers around srcu_flip(), the CPU is not
> + * required to increment ->completed before running
> + * srcu_readers_unlock_idx(), which means that there could be an
> + * arbitrarily large number of critical sections that execute after
> + * srcu_readers_unlock_idx() but use the old value of ->completed.
> */
> - if (srcu_readers_active_idx(sp, idx) != 0)
> - return false;
> -
> - /*
> - * The remainder of this function is the validation step.
> - * The following smp_mb() D pairs with the smp_mb() C in
> - * __srcu_read_unlock(). If the __srcu_read_unlock() was seen
> - * by srcu_readers_active_idx() above, then any destructive
> - * operation performed after the grace period will happen after
> - * the corresponding SRCU read-side critical section.
> - *
> - * Note that there can be at most NR_CPUS worth of readers using
> - * the old index, which is not enough to overflow even a 32-bit
> - * integer. (Yes, this does mean that systems having more than
> - * a billion or so CPUs need to be 64-bit systems.) Therefore,
> - * the sum of the ->seq[] counters cannot possibly overflow.
> - * Therefore, the only way that the return values of the two
> - * calls to srcu_readers_seq_idx() can be equal is if there were
> - * no increments of the corresponding rank of ->seq[] counts
> - * in the interim. But the missed-increment scenario laid out
> - * above includes an increment of the ->seq[] counter by
> - * the corresponding __srcu_read_lock(). Therefore, if this
> - * scenario occurs, the return values from the two calls to
> - * srcu_readers_seq_idx() will differ, and thus the validation
> - * step below suffices.
> - */
> - smp_mb(); /* D */
> -
> - return srcu_readers_seq_idx(sp, idx) == seq;
> + return srcu_readers_lock_idx(sp, idx) == unlocks;
> }
>
> /**
> @@ -266,8 +229,12 @@ static bool srcu_readers_active(struct srcu_struct *sp)
> unsigned long sum = 0;
>
> for_each_possible_cpu(cpu) {
> - sum += READ_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[0]);
> - sum += READ_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[1]);
> + struct srcu_array *cpuc = per_cpu_ptr(sp->per_cpu_ref, cpu);
> +
> + sum += READ_ONCE(cpuc->lock_count[0]);
> + sum += READ_ONCE(cpuc->lock_count[1]);
> + sum -= READ_ONCE(cpuc->unlock_count[0]);
> + sum -= READ_ONCE(cpuc->unlock_count[1]);
> }
> return sum;
> }
> @@ -298,9 +265,8 @@ int __srcu_read_lock(struct srcu_struct *sp)
> int idx;
>
> idx = READ_ONCE(sp->completed) & 0x1;
> - __this_cpu_inc(sp->per_cpu_ref->c[idx]);
> + __this_cpu_inc(sp->per_cpu_ref->lock_count[idx]);
> smp_mb(); /* B */ /* Avoid leaking the critical section. */
> - __this_cpu_inc(sp->per_cpu_ref->seq[idx]);
> return idx;
> }
> EXPORT_SYMBOL_GPL(__srcu_read_lock);
> @@ -314,7 +280,7 @@ EXPORT_SYMBOL_GPL(__srcu_read_lock);
> void __srcu_read_unlock(struct srcu_struct *sp, int idx)
> {
> smp_mb(); /* C */ /* Avoid leaking the critical section. */
> - this_cpu_dec(sp->per_cpu_ref->c[idx]);
> + this_cpu_inc(sp->per_cpu_ref->unlock_count[idx]);
> }
> EXPORT_SYMBOL_GPL(__srcu_read_unlock);
>
> @@ -349,7 +315,7 @@ static bool try_check_zero(struct srcu_struct *sp, int idx, int trycount)
>
> /*
> * Increment the ->completed counter so that future SRCU readers will
> - * use the other rank of the ->c[] and ->seq[] arrays. This allows
> + * use the other rank of the ->(un)lock_count[] arrays. This allows
> * us to wait for pre-existing readers in a starvation-free manner.
> */
> static void srcu_flip(struct srcu_struct *sp)
> --
> 2.9.0
>
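To make the arithmetic in the reply above concrete, here is a small
standalone sketch (hypothetical illustration, not kernel code) that works out
the bound under the assumptions stated there: a 32-bit kernel, lockdep's
limit of 48 held locks per task, and at least one 4K page of kernel stack per
task.

/*
 * Worked-out version of the overflow bound from the reply above.
 * Illustration only; the constants simply restate the reply's assumptions.
 */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
        uint64_t locks_per_task = 48;                   /* lockdep limit cited above */
        uint64_t max_tasks = (1ULL << 32) / 4096;       /* 4GB address space / 4K stacks */
        uint64_t max_unmatched = locks_per_task * max_tasks;
        uint64_t overflow_threshold = 0xffffffffULL / 2; /* ULONG_MAX/2 on 32-bit */

        printf("max tasks:                   %llu\n", (unsigned long long)max_tasks);
        printf("max unmatched read locks:    %llu\n", (unsigned long long)max_unmatched);
        printf("overflow threshold:          %llu\n", (unsigned long long)overflow_threshold);
        printf("overflow reachable this way: %s\n",
               max_unmatched >= overflow_threshold ? "yes" : "no");
        return 0;
}

This prints 1,048,576 tasks and 50,331,648 unmatched srcu_read_lock() calls,
well below the roughly 2,147,483,647 increments needed to overflow a 32-bit
counter, which is the point of the reply.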