On Mon, Jan 23, 2017 at 01:35:18PM -0800, Lance Roy wrote:
> SRCU uses two per-cpu counters: a nesting counter to count the number of
> active critical sections, and a sequence counter to ensure that the nesting
> counters don't change while they are being added together in
> srcu_readers_active_idx_check().
>
> This patch instead uses per-cpu lock and unlock counters. Because both
> counters only increase and srcu_readers_active_idx_check() reads the unlock
> counter before the lock counter, this achieves the same end without having
> to increment two different counters in srcu_read_lock(). This also saves a
> smp_mb() in srcu_readers_active_idx_check().
>
> Possible bug: There is no guarantee that the lock counter won't overflow
> during srcu_readers_active_idx_check(), as there are no memory barriers
> around srcu_flip() (see comment in srcu_readers_active_idx_check() for
> details). However, this problem was already present before this patch.
>
> Suggested-by: Mathieu Desnoyers <mathieu.desnoy...@efficios.com>
> Signed-off-by: Lance Roy <ldr...@gmail.com>
> Cc: Paul E. McKenney <paul...@linux.vnet.ibm.com>
> Cc: Lai Jiangshan <jiangshan...@gmail.com>
> Cc: Peter Zijlstra <pet...@infradead.org>
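As a reference point, the counting scheme described in the commit message can
be pictured with a minimal userspace sketch. This is not the kernel code:
srcu_sketch, NR_FAKE_CPUS, and the sketch_* names are invented for
illustration, C11 seq_cst fences stand in for smp_mb() A/B/C, and the
two-phase index flip (srcu_flip()) is omitted. Each reader increments
lock_count[] on entry and unlock_count[] on exit, and the grace-period check
sums the unlock counters before the lock counters:

/*
 * Minimal userspace sketch of per-CPU lock/unlock counting (illustration
 * only, hypothetical names, not the kernel implementation).
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define NR_FAKE_CPUS 4

struct srcu_sketch {
        atomic_ulong lock_count[NR_FAKE_CPUS][2];
        atomic_ulong unlock_count[NR_FAKE_CPUS][2];
        atomic_int completed;           /* low bit selects the current index */
};

static int sketch_read_lock(struct srcu_sketch *sp, int cpu)
{
        int idx = atomic_load(&sp->completed) & 0x1;

        atomic_fetch_add(&sp->lock_count[cpu][idx], 1);
        atomic_thread_fence(memory_order_seq_cst);      /* ~ smp_mb() B */
        return idx;
}

static void sketch_read_unlock(struct srcu_sketch *sp, int cpu, int idx)
{
        atomic_thread_fence(memory_order_seq_cst);      /* ~ smp_mb() C */
        atomic_fetch_add(&sp->unlock_count[cpu][idx], 1);
}

/* True if every reader counted on @idx has also been counted as unlocked. */
static bool sketch_readers_gone(struct srcu_sketch *sp, int idx)
{
        unsigned long locks = 0, unlocks = 0;
        int cpu;

        /* Sum the unlock counters first... */
        for (cpu = 0; cpu < NR_FAKE_CPUS; cpu++)
                unlocks += atomic_load(&sp->unlock_count[cpu][idx]);

        atomic_thread_fence(memory_order_seq_cst);      /* ~ smp_mb() A */

        /* ...then the lock counters, so no unlock is counted without its lock. */
        for (cpu = 0; cpu < NR_FAKE_CPUS; cpu++)
                locks += atomic_load(&sp->lock_count[cpu][idx]);

        return locks == unlocks;
}

int main(void)
{
        static struct srcu_sketch sp;
        int idx = sketch_read_lock(&sp, 0);

        printf("readers drained? %d\n", sketch_readers_gone(&sp, idx)); /* 0 */
        sketch_read_unlock(&sp, 0, idx);
        printf("readers drained? %d\n", sketch_readers_gone(&sp, idx)); /* 1 */
        return 0;
}

Because both counters only increase, summing unlocks before locks means any
reader present in the lock sum but missing from the unlock sum was still in
(or only just left) its critical section for that index.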
OK, this has differences only in the comments, so I can reasonably
substitute it, even this near the next merge window.

But let's talk about the potential overflow. If I understand correctly, for
this to happen there need to be ULONG_MAX/2 or thereabouts srcu_read_lock()
calls without matching srcu_read_unlock() calls. Let's focus on 32-bit
systems for the moment. Lockdep allows at most 48 locks held at a given time
by any single task, and RCU does not pass in a non-NULL nest_lock -- if you
acquire more than that, a lockdep kernel will splat. Each task has at least
one 4K page of kernel stack, so there can be at most 1,048,576 tasks
(actually quite a bit fewer due to the size of the task_struct and so on).
This allows at most 50,331,648 unmatched srcu_read_lock() calls in the
system, which is not sufficient to cause overflow. (The arithmetic is
spelled out in the sketch after the quoted patch.) Or am I missing something
here?

                                                        Thanx, Paul

> ---
>  include/linux/srcu.h    |  10 ++--
>  kernel/rcu/rcutorture.c |  19 +++++++-
>  kernel/rcu/srcu.c       | 122 +++++++++++++++++-------------------------
>  3 files changed, 66 insertions(+), 85 deletions(-)
>
> diff --git a/include/linux/srcu.h b/include/linux/srcu.h
> index dc8eb63..a598cf3 100644
> --- a/include/linux/srcu.h
> +++ b/include/linux/srcu.h
> @@ -33,9 +33,9 @@
> #include <linux/rcupdate.h>
> #include <linux/workqueue.h>
>
> -struct srcu_struct_array {
> - unsigned long c[2];
> - unsigned long seq[2];
> +struct srcu_array {
> + unsigned long lock_count[2];
> + unsigned long unlock_count[2];
> };
>
> struct rcu_batch {
> @@ -46,7 +46,7 @@ struct rcu_batch {
>
> struct srcu_struct {
> unsigned long completed;
> - struct srcu_struct_array __percpu *per_cpu_ref;
> + struct srcu_array __percpu *per_cpu_ref;
> spinlock_t queue_lock; /* protect ->batch_queue, ->running */
> bool running;
> /* callbacks just queued */
> @@ -118,7 +118,7 @@ void process_srcu(struct work_struct *work);
> * See include/linux/percpu-defs.h for the rules on per-CPU variables.
> */
> #define __DEFINE_SRCU(name, is_static) \
> - static DEFINE_PER_CPU(struct srcu_struct_array, name##_srcu_array);\
> + static DEFINE_PER_CPU(struct srcu_array, name##_srcu_array);\
> is_static struct srcu_struct name = __SRCU_STRUCT_INIT(name)
> #define DEFINE_SRCU(name) __DEFINE_SRCU(name, /* not static */)
> #define DEFINE_STATIC_SRCU(name) __DEFINE_SRCU(name, static)
> diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
> index 87c5122..d81345b 100644
> --- a/kernel/rcu/rcutorture.c
> +++ b/kernel/rcu/rcutorture.c
> @@ -564,10 +564,25 @@ static void srcu_torture_stats(void)
> pr_alert("%s%s per-CPU(idx=%d):",
> torture_type, TORTURE_FLAG, idx);
> for_each_possible_cpu(cpu) {
> + unsigned long l0, l1;
> + unsigned long u0, u1;
> long c0, c1;
> + struct srcu_array *counts = per_cpu_ptr(srcu_ctlp->per_cpu_ref, cpu);
>
> - c0 = (long)per_cpu_ptr(srcu_ctlp->per_cpu_ref, cpu)->c[!idx];
> - c1 = (long)per_cpu_ptr(srcu_ctlp->per_cpu_ref, cpu)->c[idx];
> + u0 = counts->unlock_count[!idx];
> + u1 = counts->unlock_count[idx];
> +
> + /*
> + * Make sure that a lock is always counted if the corresponding
> + * unlock is counted.
> + */
> + smp_rmb();
> +
> + l0 = counts->lock_count[!idx];
> + l1 = counts->lock_count[idx];
> +
> + c0 = l0 - u0;
> + c1 = l1 - u1;
> pr_cont(" %d(%ld,%ld)", cpu, c0, c1);
> }
> pr_cont("\n");
> diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c
> index 9b9cdd5..c9a0015 100644
> --- a/kernel/rcu/srcu.c
> +++ b/kernel/rcu/srcu.c
> @@ -106,7 +106,7 @@ static int init_srcu_struct_fields(struct srcu_struct *sp)
> rcu_batch_init(&sp->batch_check1);
> rcu_batch_init(&sp->batch_done);
> INIT_DELAYED_WORK(&sp->work, process_srcu);
> - sp->per_cpu_ref = alloc_percpu(struct srcu_struct_array);
> + sp->per_cpu_ref = alloc_percpu(struct srcu_array);
> return sp->per_cpu_ref ? 0 : -ENOMEM;
> }
>
> @@ -141,114 +141,77 @@ EXPORT_SYMBOL_GPL(init_srcu_struct);
> #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */
>
> /*
> - * Returns approximate total of the readers' ->seq[] values for the
> + * Returns approximate total of the readers' ->lock_count[] values for the
> * rank of per-CPU counters specified by idx.
> */
> -static unsigned long srcu_readers_seq_idx(struct srcu_struct *sp, int idx)
> +static unsigned long srcu_readers_lock_idx(struct srcu_struct *sp, int idx)
> {
> int cpu;
> unsigned long sum = 0;
> - unsigned long t;
>
> for_each_possible_cpu(cpu) {
> - t = READ_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->seq[idx]);
> - sum += t;
> + struct srcu_array *cpuc = per_cpu_ptr(sp->per_cpu_ref, cpu);
> +
> + sum += READ_ONCE(cpuc->lock_count[idx]);
> }
> return sum;
> }
>
> /*
> - * Returns approximate number of readers active on the specified rank
> - * of the per-CPU ->c[] counters.
> + * Returns approximate total of the readers' ->unlock_count[] values for the
> + * rank of per-CPU counters specified by idx.
> */
> -static unsigned long srcu_readers_active_idx(struct srcu_struct *sp, int idx)
> +static unsigned long srcu_readers_unlock_idx(struct srcu_struct *sp, int idx)
> {
> int cpu;
> unsigned long sum = 0;
> - unsigned long t;
>
> for_each_possible_cpu(cpu) {
> - t = READ_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[idx]);
> - sum += t;
> + struct srcu_array *cpuc = per_cpu_ptr(sp->per_cpu_ref, cpu);
> +
> + sum += READ_ONCE(cpuc->unlock_count[idx]);
> }
> return sum;
> }
>
> /*
> * Return true if the number of pre-existing readers is determined to
> - * be stably zero. An example unstable zero can occur if the call
> - * to srcu_readers_active_idx() misses an __srcu_read_lock() increment,
> - * but due to task migration, sees the corresponding __srcu_read_unlock()
> - * decrement. This can happen because srcu_readers_active_idx() takes
> - * time to sum the array, and might in fact be interrupted or preempted
> - * partway through the summation.
> + * be zero.
> */
> static bool srcu_readers_active_idx_check(struct srcu_struct *sp, int idx)
> {
> - unsigned long seq;
> + unsigned long unlocks;
>
> - seq = srcu_readers_seq_idx(sp, idx);
> + unlocks = srcu_readers_unlock_idx(sp, idx);
>
> /*
> - * The following smp_mb() A pairs with the smp_mb() B located in
> - * __srcu_read_lock(). This pairing ensures that if an
> - * __srcu_read_lock() increments its counter after the summation
> - * in srcu_readers_active_idx(), then the corresponding SRCU read-side
> - * critical section will see any changes made prior to the start
> - * of the current SRCU grace period.
> + * Make sure that a lock is always counted if the corresponding unlock
> + * is counted. Needs to be a smp_mb() as the read side may contain a
> + * read from a variable that is written to before the synchronize_srcu()
> + * in the write side. In this case smp_mb()s A and B act like the store
> + * buffering pattern.
> *
> - * Also, if the above call to srcu_readers_seq_idx() saw the
> - * increment of ->seq[], then the call to srcu_readers_active_idx()
> - * must see the increment of ->c[].
> + * This smp_mb() also pairs with smp_mb() C to prevent accesses after the
> + * synchronize_srcu() from being executed before the grace period ends.
> */
> smp_mb(); /* A */
>
> /*
> - * Note that srcu_readers_active_idx() can incorrectly return
> - * zero even though there is a pre-existing reader throughout.
> - * To see this, suppose that task A is in a very long SRCU
> - * read-side critical section that started on CPU 0, and that
> - * no other reader exists, so that the sum of the counters
> - * is equal to one. Then suppose that task B starts executing
> - * srcu_readers_active_idx(), summing up to CPU 1, and then that
> - * task C starts reading on CPU 0, so that its increment is not
> - * summed, but finishes reading on CPU 2, so that its decrement
> - * -is- summed. Then when task B completes its sum, it will
> - * incorrectly get zero, despite the fact that task A has been
> - * in its SRCU read-side critical section the whole time.
> + * If the locks are the same as the unlocks, then there must have
> + * been no readers on this index at some time in between. This does not
> + * mean that there are no more readers, as one could have read the
> + * current index but not have incremented the lock counter yet.
> *
> - * We therefore do a validation step should srcu_readers_active_idx()
> - * return zero.
> + * Possible bug: There is no guarantee that there haven't been ULONG_MAX
> + * increments of ->lock_count[] since the unlocks were counted, meaning
> + * that this could return true even if there are still active readers.
> + * Since there are no memory barriers around srcu_flip(), the CPU is not
> + * required to increment ->completed before running
> + * srcu_readers_unlock_idx(), which means that there could be an
> + * arbitrarily large number of critical sections that execute after
> + * srcu_readers_unlock_idx() but use the old value of ->completed.
> */
> - if (srcu_readers_active_idx(sp, idx) != 0)
> - return false;
> -
> - /*
> - * The remainder of this function is the validation step.
> - * The following smp_mb() D pairs with the smp_mb() C in
> - * __srcu_read_unlock(). If the __srcu_read_unlock() was seen
> - * by srcu_readers_active_idx() above, then any destructive
> - * operation performed after the grace period will happen after
> - * the corresponding SRCU read-side critical section.
> - *
> - * Note that there can be at most NR_CPUS worth of readers using
> - * the old index, which is not enough to overflow even a 32-bit
> - * integer. (Yes, this does mean that systems having more than
> - * a billion or so CPUs need to be 64-bit systems.) Therefore,
> - * the sum of the ->seq[] counters cannot possibly overflow.
> - * Therefore, the only way that the return values of the two
> - * calls to srcu_readers_seq_idx() can be equal is if there were
> - * no increments of the corresponding rank of ->seq[] counts
> - * in the interim. But the missed-increment scenario laid out
> - * above includes an increment of the ->seq[] counter by
> - * the corresponding __srcu_read_lock(). Therefore, if this
> - * scenario occurs, the return values from the two calls to
> - * srcu_readers_seq_idx() will differ, and thus the validation
> - * step below suffices.
> - */
> - smp_mb(); /* D */
> -
> - return srcu_readers_seq_idx(sp, idx) == seq;
> + return srcu_readers_lock_idx(sp, idx) == unlocks;
> }
>
> /**
> @@ -266,8 +229,12 @@ static bool srcu_readers_active(struct srcu_struct *sp)
> unsigned long sum = 0;
>
> for_each_possible_cpu(cpu) {
> - sum += READ_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[0]);
> - sum += READ_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[1]);
> + struct srcu_array *cpuc = per_cpu_ptr(sp->per_cpu_ref, cpu);
> +
> + sum += READ_ONCE(cpuc->lock_count[0]);
> + sum += READ_ONCE(cpuc->lock_count[1]);
> + sum -= READ_ONCE(cpuc->unlock_count[0]);
> + sum -= READ_ONCE(cpuc->unlock_count[1]);
> }
> return sum;
> }
> @@ -298,9 +265,8 @@ int __srcu_read_lock(struct srcu_struct *sp)
> int idx;
>
> idx = READ_ONCE(sp->completed) & 0x1;
> - __this_cpu_inc(sp->per_cpu_ref->c[idx]);
> + __this_cpu_inc(sp->per_cpu_ref->lock_count[idx]);
> smp_mb(); /* B */ /* Avoid leaking the critical section. */
> - __this_cpu_inc(sp->per_cpu_ref->seq[idx]);
> return idx;
> }
> EXPORT_SYMBOL_GPL(__srcu_read_lock);
> @@ -314,7 +280,7 @@ EXPORT_SYMBOL_GPL(__srcu_read_lock);
> void __srcu_read_unlock(struct srcu_struct *sp, int idx)
> {
> smp_mb(); /* C */ /* Avoid leaking the critical section. */
> - this_cpu_dec(sp->per_cpu_ref->c[idx]);
> + this_cpu_inc(sp->per_cpu_ref->unlock_count[idx]);
> }
> EXPORT_SYMBOL_GPL(__srcu_read_unlock);
>
> @@ -349,7 +315,7 @@ static bool try_check_zero(struct srcu_struct *sp, int idx, int trycount)
>
> /*
> * Increment the ->completed counter so that future SRCU readers will
> - * use the other rank of the ->c[] and ->seq[] arrays. This allows
> + * use the other rank of the ->(un)lock_count[] arrays. This allows
> * us to wait for pre-existing readers in a starvation-free manner.
> */
> static void srcu_flip(struct srcu_struct *sp)
> --
> 2.9.0
>
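To make the arithmetic in the reply above concrete, here is a small
standalone sketch (hypothetical illustration, not kernel code) that works out
the bound under the assumptions stated there: a 32-bit kernel, lockdep's
limit of 48 held locks per task, and at least one 4K page of kernel stack per
task.

/*
 * Worked-out version of the overflow bound from the reply above.
 * Illustration only; the constants simply restate the reply's assumptions.
 */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
        uint64_t locks_per_task = 48;                   /* lockdep limit cited above */
        uint64_t max_tasks = (1ULL << 32) / 4096;       /* 4GB address space / 4K stacks */
        uint64_t max_unmatched = locks_per_task * max_tasks;
        uint64_t overflow_threshold = 0xffffffffULL / 2; /* ULONG_MAX/2 on 32-bit */

        printf("max tasks:                   %llu\n", (unsigned long long)max_tasks);
        printf("max unmatched read locks:    %llu\n", (unsigned long long)max_unmatched);
        printf("overflow threshold:          %llu\n", (unsigned long long)overflow_threshold);
        printf("overflow reachable this way: %s\n",
               max_unmatched >= overflow_threshold ? "yes" : "no");
        return 0;
}

This prints 1,048,576 tasks and 50,331,648 unmatched srcu_read_lock() calls,
well below the roughly 2,147,483,647 increments needed to overflow a 32-bit
counter, which is the point of the reply.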