On Mon, May 15, 2017 at 11:03:18AM +0200, Peter Zijlstra wrote:
> On Sat, May 13, 2017 at 06:40:03AM -0700, Paul E. McKenney wrote:
> > On Fri, May 12, 2017 at 05:34:48PM -0400, Steven Rostedt wrote:
> > > On Fri, 12 May 2017 21:49:56 +0200
> > 
> > [ . . . ]
> > 
> > > This means that text_mutex, which was taken by the alternative code, no
> > > longer is taken in cpu hotplug code. That means there's no longer a
> > > deadlock scenario, as we don't have anyplace(*) that grabs
> > > get_online_cpus() and takes the text_mutex. Removing that will
> > > simplify things tremendously!
> > > 
> > > (*) with one exception: perf.
> > > 
> > > Currently perf does a get_online_cpu() at a high level. Will it be
> > > possible to move that down, such that we don't have it taken when we do
> > > any software events?
> > 
> > Can perf get rid of get_online_cpus(), perhaps using the mutexes acquired
> > by perf_event_init_cpu() or by perf_event_exit_cpu()?
> >
> Best I could come up with is something like that below that moves the
> get_online_cpus() to a possibly less onerous place.
> 
> Compile tested only..
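[ Aside for readers joining here: the two acquisition patterns being
  discussed look roughly like the sketch below.  This is illustrative
  pseudocode, not an actual kernel call chain; the function names are
  invented, and only get_online_cpus()/put_online_cpus(), text_mutex and
  the mutex API are real. ]

#include <linux/cpu.h>          /* get_online_cpus() / put_online_cpus() */
#include <linux/memory.h>       /* text_mutex */
#include <linux/mutex.h>

/*
 * (1) "anyplace that grabs get_online_cpus() and takes the text_mutex":
 *     e.g. text patching that wants CPUs to hold still while it works.
 */
static void example_patch_text(void)
{
        get_online_cpus();
        mutex_lock(&text_mutex);
        /* ... modify kernel text ... */
        mutex_unlock(&text_mutex);
        put_online_cpus();
}

/*
 * (2) text_mutex taken from CPU hotplug code, as the alternatives code
 *     used to do.  Steven's point is that once (2) is gone, the hotplug
 *     path and pattern (1) no longer meet on text_mutex, so that deadlock
 *     scenario disappears; perf is the exception he flags because it
 *     still does get_online_cpus() at a high level.
 */
static int example_hotplug_callback(unsigned int cpu)
{
        mutex_lock(&text_mutex);
        /* ... apply alternatives for the incoming/outgoing CPU ... */
        mutex_unlock(&text_mutex);
        return 0;
}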
I freely confess that I don't fully understand this code, but...

Given that you acquire the global pmus_lock when doing the
get_online_cpus(), and given that CPU hotplug is rare, is it possible to
momentarily acquire the global pmus_lock in perf_event_init_cpu() and
perf_event_exit_cpu() and interact directly with that?  Then perf would
presumably leave alone any outgoing CPU that had already executed
perf_event_exit_cpu(), and also any incoming CPU that had not already
executed perf_event_init_cpu().

What prevents this approach from working?

							Thanx, Paul

> ---
>  include/linux/perf_event.h |  2 ++
>  kernel/events/core.c       | 85 ++++++++++++++++++++++++++++++++++------------
>  2 files changed, 66 insertions(+), 21 deletions(-)
>
> diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
> index 24a635887f28..7d6aa29094b2 100644
> --- a/include/linux/perf_event.h
> +++ b/include/linux/perf_event.h
> @@ -801,6 +801,8 @@ struct perf_cpu_context {
>
>  	struct list_head		sched_cb_entry;
>  	int				sched_cb_usage;
> +
> +	int				online;
>  };
>
>  struct perf_output_handle {
> diff --git a/kernel/events/core.c b/kernel/events/core.c
> index 13f5b942580b..f9a595641d32 100644
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -3812,14 +3812,6 @@ find_get_context(struct pmu *pmu, struct task_struct *task,
>  		if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
>  			return ERR_PTR(-EACCES);
>
> -		/*
> -		 * We could be clever and allow to attach a event to an
> -		 * offline CPU and activate it when the CPU comes up, but
> -		 * that's for later.
> -		 */
> -		if (!cpu_online(cpu))
> -			return ERR_PTR(-ENODEV);
> -
>  		cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
>  		ctx = &cpuctx->ctx;
>  		get_ctx(ctx);
> @@ -9005,6 +8997,7 @@ int perf_pmu_register(struct pmu *pmu, const char *name, int type)
>  {
>  	int cpu, ret;
>
> +	get_online_cpus();
>  	mutex_lock(&pmus_lock);
>  	ret = -ENOMEM;
>  	pmu->pmu_disable_count = alloc_percpu(int);
> @@ -9064,6 +9057,7 @@ int perf_pmu_register(struct pmu *pmu, const char *name, int type)
>  		lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex);
>  		lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock);
>  		cpuctx->ctx.pmu = pmu;
> +		cpuctx->online = cpu_online(cpu);
>
>  		__perf_mux_hrtimer_init(cpuctx, cpu);
>  	}
> @@ -9099,6 +9093,7 @@ int perf_pmu_register(struct pmu *pmu, const char *name, int type)
>  	ret = 0;
> unlock:
>  	mutex_unlock(&pmus_lock);
> +	put_online_cpus();
>
>  	return ret;
>
> @@ -9908,13 +9903,11 @@ SYSCALL_DEFINE5(perf_event_open,
>  	if (flags & PERF_FLAG_PID_CGROUP)
>  		cgroup_fd = pid;
>
> -	get_online_cpus();
> -
>  	event = perf_event_alloc(&attr, cpu, task, group_leader, NULL,
>  				 NULL, NULL, cgroup_fd);
>  	if (IS_ERR(event)) {
>  		err = PTR_ERR(event);
> -		goto err_cpus;
> +		goto err_cred;
>  	}
>
>  	if (is_sampling_event(event)) {
> @@ -10078,6 +10071,23 @@ SYSCALL_DEFINE5(perf_event_open,
>  		goto err_locked;
>  	}
>
> +	if (!task) {
> +		/*
> +		 * Check if the @cpu we're creating an event for is online.
> +		 *
> +		 * We use the perf_cpu_context::ctx::mutex to serialize against
> +		 * the hotplug notifiers. See perf_event_{init,exit}_cpu().
> +		 */
> +		struct perf_cpu_context *cpuctx =
> +			container_of(ctx, struct perf_cpu_context, ctx);
> +
> +		if (!cpuctx->online) {
> +			err = -ENODEV;
> +			goto err_locked;
> +		}
> +	}
> +
> +
>  	/*
>  	 * Must be under the same ctx::mutex as perf_install_in_context(),
>  	 * because we need to serialize with concurrent event creation.
> @@ -10162,8 +10172,6 @@ SYSCALL_DEFINE5(perf_event_open,
>  		perf_event_ctx_unlock(group_leader, gctx);
>  	mutex_unlock(&ctx->mutex);
>
> -	put_online_cpus();
> -
>  	if (task) {
>  		mutex_unlock(&task->signal->cred_guard_mutex);
>  		put_task_struct(task);
> @@ -10205,8 +10213,6 @@ SYSCALL_DEFINE5(perf_event_open,
>  	 */
>  	if (!event_file)
>  		free_event(event);
> -err_cpus:
> -	put_online_cpus();
> err_cred:
>  	if (task)
>  		mutex_unlock(&task->signal->cred_guard_mutex);
> @@ -10237,7 +10243,6 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
>  	struct perf_event *event;
>  	int err;
>
> -	get_online_cpus();
>  	/*
>  	 * Get the target context (task or percpu):
>  	 */
> @@ -10264,6 +10269,21 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
>  		goto err_unlock;
>  	}
>
> +	if (!task) {
> +		/*
> +		 * Check if the @cpu we're creating an event for is online.
> +		 *
> +		 * We use the perf_cpu_context::ctx::mutex to serialize against
> +		 * the hotplug notifiers. See perf_event_{init,exit}_cpu().
> +		 */
> +		struct perf_cpu_context *cpuctx =
> +			container_of(ctx, struct perf_cpu_context, ctx);
> +		if (!cpuctx->online) {
> +			err = -ENODEV;
> +			goto err_unlock;
> +		}
> +	}
> +
>  	if (!exclusive_event_installable(event, ctx)) {
>  		err = -EBUSY;
>  		goto err_unlock;
> @@ -10272,7 +10292,6 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
>  	perf_install_in_context(ctx, event, cpu);
>  	perf_unpin_context(ctx);
>  	mutex_unlock(&ctx->mutex);
> -	put_online_cpus();
>  	return event;
>
> err_unlock:
> @@ -10282,7 +10301,6 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
> err_free:
>  	free_event(event);
> err:
> -	put_online_cpus();
>  	return ERR_PTR(err);
>  }
>  EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter);
> @@ -10951,7 +10969,7 @@ static void __init perf_event_init_all_cpus(void)
>  	}
>  }
>
> -int perf_event_init_cpu(unsigned int cpu)
> +void perf_swevent_init_cpu(unsigned int cpu)
>  {
>  	struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
>
> @@ -10964,7 +10982,6 @@ int perf_event_init_cpu(unsigned int cpu)
>  		rcu_assign_pointer(swhash->swevent_hlist, hlist);
>  	}
>  	mutex_unlock(&swhash->hlist_mutex);
> -	return 0;
>  }
>
>  #if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC_CORE
> @@ -10982,16 +10999,19 @@ static void __perf_event_exit_context(void *__info)
>
>  static void perf_event_exit_cpu_context(int cpu)
>  {
> +	struct perf_cpu_context *cpuctx;
>  	struct perf_event_context *ctx;
>  	struct pmu *pmu;
>  	int idx;
>
>  	idx = srcu_read_lock(&pmus_srcu);
>  	list_for_each_entry_rcu(pmu, &pmus, entry) {
> -		ctx = &per_cpu_ptr(pmu->pmu_cpu_context, cpu)->ctx;
> +		cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
> +		ctx = &cpuctx->ctx;
>
>  		mutex_lock(&ctx->mutex);
>  		smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1);
> +		cpuctx->online = 0;
>  		mutex_unlock(&ctx->mutex);
>  	}
>  	srcu_read_unlock(&pmus_srcu, idx);
> @@ -11002,6 +11022,29 @@ static void perf_event_exit_cpu_context(int cpu) { }
>
>  #endif
>
> +int perf_event_init_cpu(unsigned int cpu)
> +{
> +	struct perf_cpu_context *cpuctx;
> +	struct perf_event_context *ctx;
> +	struct pmu *pmu;
> +	int idx;
> +
> +	perf_swevent_init_cpu(cpu);
> +
> +	idx = srcu_read_lock(&pmus_srcu);
> +	list_for_each_entry_rcu(pmu, &pmus, entry) {
> +		cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
> +		ctx = &cpuctx->ctx;
> +
> +		mutex_lock(&ctx->mutex);
> +		cpuctx->online = 1;
> +		mutex_unlock(&ctx->mutex);
> +	}
> +	srcu_read_unlock(&pmus_srcu, idx);
> +
> +	return 0;
> +}
> +
>  int perf_event_exit_cpu(unsigned int cpu)
>  {
>  	perf_event_exit_cpu_context(cpu);
>
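[ For concreteness, a rough, compile-untested sketch of the alternative
  Paul asks about above: briefly take the global pmus_lock from the
  hotplug callbacks and have CPU-bound event creation check a per-CPU
  flag under that same lock, instead of using get_online_cpus().  The
  per-CPU variable perf_online and the placement of the check are
  invented for illustration; whether this is really enough to serialize
  against an in-flight hotplug operation is exactly the open question. ]

static DEFINE_PER_CPU(bool, perf_online);       /* hypothetical flag */

int perf_event_init_cpu(unsigned int cpu)
{
        perf_swevent_init_cpu(cpu);             /* from the patch above */

        mutex_lock(&pmus_lock);                 /* brief; hotplug is rare */
        per_cpu(perf_online, cpu) = true;
        mutex_unlock(&pmus_lock);

        return 0;
}

int perf_event_exit_cpu(unsigned int cpu)
{
        mutex_lock(&pmus_lock);
        per_cpu(perf_online, cpu) = false;      /* leave this CPU alone from here on */
        mutex_unlock(&pmus_lock);

        perf_event_exit_cpu_context(cpu);
        return 0;
}

/* ...and at event-creation time, for a CPU-bound (!task) event: */

        mutex_lock(&pmus_lock);
        if (!per_cpu(perf_online, event->cpu)) {
                mutex_unlock(&pmus_lock);
                err = -ENODEV;
                goto err_locked;
        }
        mutex_unlock(&pmus_lock);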