Whenever the perf tool is executed inside a container, this patch restricts
the events to the perf namespace in which the perf tool is executing.

This patch is based on the existing support available for tracing with
cgroups.
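For illustration only (not part of the patch): the filtering rule mirrors the
cgroup path. An event created from a non-init perf namespace records that
namespace, the per-CPU context tracks the namespace of the task currently
running, and the event is scheduled in only when the two match. A stand-alone
user-space sketch of that rule, with simplified stand-in types, is:

	/* Simplified stand-ins for the kernel structures; illustration only. */
	#include <stdbool.h>
	#include <stdio.h>

	struct perf_namespace { int id; };

	struct perf_event {
		struct perf_namespace *perf_ns;	/* NULL: event does not care */
	};

	struct perf_cpu_context {
		struct perf_namespace *perf_ns;	/* namespace of the running task */
	};

	/* Mirrors perf_perfns_match(): a namespace-less event always matches,
	 * otherwise it matches only the namespace currently on the CPU. */
	static bool perfns_match(const struct perf_event *event,
				 const struct perf_cpu_context *cpuctx)
	{
		if (!event->perf_ns)
			return true;
		return cpuctx->perf_ns == event->perf_ns;
	}

	int main(void)
	{
		struct perf_namespace ns_a = { 1 }, ns_b = { 2 };
		struct perf_event global = { NULL }, scoped = { &ns_a };
		struct perf_cpu_context cpu = { &ns_b };

		printf("%d\n", perfns_match(&global, &cpu));	/* 1: always counts */
		printf("%d\n", perfns_match(&scoped, &cpu));	/* 0: other namespace */
		cpu.perf_ns = &ns_a;				/* "switch in" ns_a */
		printf("%d\n", perfns_match(&scoped, &cpu));	/* 1: now counts */
		return 0;
	}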
TODO:
 - Avoid code duplication.

Signed-off-by: Aravinda Prasad <aravi...@linux.vnet.ibm.com>
---
 include/linux/perf_event.h     |    8 +
 include/linux/perf_namespace.h |    6 +
 kernel/events/core.c           |  347 ++++++++++++++++++++++++++++++++++++++++
 kernel/perf_namespace.c        |    8 +
 4 files changed, 368 insertions(+), 1 deletion(-)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 1a827ce..8d797d9 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -43,6 +43,7 @@ struct perf_guest_info_callbacks {
 #include <linux/hrtimer.h>
 #include <linux/fs.h>
 #include <linux/pid_namespace.h>
+#include <linux/perf_namespace.h>
 #include <linux/workqueue.h>
 #include <linux/ftrace.h>
 #include <linux/cpu.h>
@@ -656,6 +657,11 @@ struct perf_event {
 	struct rcu_head			rcu_head;
 
 	struct pid_namespace		*ns;
+#ifdef CONFIG_PERF_NS
+	struct perf_namespace		*perf_ns;
+	int				perfns_defer_enabled;
+#endif
+
 	u64				id;
 
 	u64				(*clock)(void);
@@ -725,6 +731,7 @@ struct perf_event_context {
 	u64				generation;
 	int				pin_count;
 	int				nr_cgroups;	 /* cgroup evts */
+	int				nr_perfns;
 	void				*task_ctx_data; /* pmu specific data */
 	struct rcu_head			rcu_head;
 };
@@ -751,6 +758,7 @@ struct perf_cpu_context {
 
 	struct pmu			*unique_pmu;
 	struct perf_cgroup		*cgrp;
+	struct perf_namespace		*perf_ns;
 };
 
 struct perf_output_handle {
diff --git a/include/linux/perf_namespace.h b/include/linux/perf_namespace.h
index 9713724..2aad0e9 100644
--- a/include/linux/perf_namespace.h
+++ b/include/linux/perf_namespace.h
@@ -8,8 +8,14 @@ struct user_namespace;
 extern struct user_namespace init_user_ns;
 
+struct perf_ns_info {
+	u64			time;
+	u64			timestamp;
+};
+
 struct perf_namespace {
 	struct kref kref;
+	struct perf_ns_info __percpu *info;
 
 	struct user_namespace *user_ns;	/* Owning user namespace */
 	struct ns_common ns;
 };
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 274450e..757a169 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -334,6 +334,7 @@ static DEFINE_MUTEX(perf_sched_mutex);
 static atomic_t perf_sched_count;
 
 static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
+static DEFINE_PER_CPU(atomic_t, perf_perfns_events);
 static DEFINE_PER_CPU(int, perf_sched_cb_usages);
 
 static atomic_t nr_mmap_events __read_mostly;
@@ -914,6 +915,288 @@ perf_cgroup_mark_enabled(struct perf_event *event,
 }
 #endif
 
+#ifdef CONFIG_PERF_NS
+static inline bool perf_perfns_match(struct perf_event *event)
+{
+	struct perf_event_context *ctx = event->ctx;
+	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
+
+	/* @event doesn't care about perfns */
+	if (!event->perf_ns)
+		return true;
+
+	if (cpuctx->perf_ns != event->perf_ns)
+		return false;
+
+	return true;
+}
+
+static inline void perf_detach_perfns(struct perf_event *event)
+{
+	event->perf_ns = NULL;
+}
+
+static inline int is_perfns_event(struct perf_event *event)
+{
+	return event->perf_ns != NULL;
+}
+
+static inline u64 perf_perfns_event_time(struct perf_event *event)
+{
+	struct perf_ns_info *t;
+
+	t = per_cpu_ptr(event->perf_ns->info, event->cpu);
+	return t ? t->time : 0;
+}
+
+static inline void __update_perfns_time(struct perf_namespace *p_ns)
+{
+	struct perf_ns_info *info;
+	u64 now;
+
+	now = perf_clock();
+
+	if (!p_ns->info)
+		return;
+
+	info = this_cpu_ptr(p_ns->info);
+
+	info->time += now - info->timestamp;
+	info->timestamp = now;
+}
+
+static inline void update_perfns_time_from_cpuctx(struct perf_cpu_context *cpuctx)
+{
+	struct perf_namespace *perfns_out = cpuctx->perf_ns;
+
+	if (perfns_out)
+		__update_perfns_time(perfns_out);
+}
+
+static inline void update_perfns_time_from_event(struct perf_event *event)
+{
+	struct perf_namespace *perf_ns = current->nsproxy->perf_ns;
+
+	if (!is_perfns_event(event))
+		return;
+
+	if (perf_ns == event->perf_ns)
+		__update_perfns_time(event->perf_ns);
+}
+
+static inline void
+perf_perfns_set_timestamp(struct task_struct *task,
+			  struct perf_event_context *ctx)
+{
+	struct perf_namespace *perf_ns;
+	struct perf_ns_info *info;
+
+	if (!task || !ctx->nr_perfns)
+		return;
+
+	perf_ns = task->nsproxy->perf_ns;
+	if (!perf_ns->info)
+		return;
+
+	info = this_cpu_ptr(perf_ns->info);
+	info->timestamp = ctx->timestamp;
+}
+
+#define PERF_PERFNS_SWOUT	0x1 /* perfns switch out every event */
+#define PERF_PERFNS_SWIN	0x2 /* perfns switch in events based on task */
+
+/*
+ * mode SWOUT : schedule out everything
+ * mode SWIN : schedule in based on perfns for next
+ */
+static void perf_perfns_switch(struct task_struct *task, int mode)
+{
+	struct perf_cpu_context *cpuctx;
+	struct pmu *pmu;
+	unsigned long flags;
+
+	local_irq_save(flags);
+
+	list_for_each_entry_rcu(pmu, &pmus, entry) {
+		cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
+		if (cpuctx->unique_pmu != pmu)
+			continue; /* ensure we process each cpuctx once */
+
+		if (cpuctx->ctx.nr_perfns > 0) {
+			perf_ctx_lock(cpuctx, cpuctx->task_ctx);
+			perf_pmu_disable(cpuctx->ctx.pmu);
+
+			if (mode & PERF_PERFNS_SWOUT) {
+				cpu_ctx_sched_out(cpuctx, EVENT_ALL);
+				/*
+				 * must not be done before ctxswout due
+				 * to event_filter_match() in event_sched_out()
+				 */
+				cpuctx->perf_ns = NULL;
+			}
+
+			if (mode & PERF_PERFNS_SWIN) {
+				WARN_ON_ONCE(cpuctx->perf_ns);
+
+				cpuctx->perf_ns = task->nsproxy->perf_ns;
+				cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
+			}
+			perf_pmu_enable(cpuctx->ctx.pmu);
+			perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
+		}
+	}
+
+	local_irq_restore(flags);
+}
+
+static inline void perf_perfns_sched_out(struct task_struct *task,
+					 struct task_struct *next)
+{
+	rcu_read_lock();
+	perf_perfns_switch(task, PERF_PERFNS_SWOUT);
+	rcu_read_unlock();
+}
+
+static inline void perf_perfns_sched_in(struct task_struct *prev,
+					struct task_struct *task)
+{
+	rcu_read_lock();
+
+	if (task->nsproxy->perf_ns != &init_perf_ns)
+		perf_perfns_switch(task, PERF_PERFNS_SWIN);
+
+	rcu_read_unlock();
+}
+
+static inline int perf_perfns_connect(struct perf_event *event,
+				      struct perf_event *group_leader)
+{
+	if (current->nsproxy->perf_ns != &init_perf_ns) {
+		/*
+		 * If we are called from our own perf namespace, set
+		 * event->perf_ns
+		 */
+		event->perf_ns = current->nsproxy->perf_ns;
+
+		if (group_leader && group_leader->perf_ns != event->perf_ns) {
+			perf_detach_perfns(event);
+			return -EINVAL;
+		}
+	}
+	return 0;
+}
+
+static inline void
+perf_perfns_set_shadow_time(struct perf_event *event, u64 now)
+{
+	struct perf_ns_info *t;
+
+	t = per_cpu_ptr(event->perf_ns->info, event->cpu);
+	event->shadow_ctx_time = now - t->timestamp;
+}
+
+static inline void
+perf_perfns_defer_enabled(struct perf_event *event)
+{
+	if (is_perfns_event(event) && !perf_perfns_match(event))
+		event->perfns_defer_enabled = 1;
+}
+
+static inline void
+perf_perfns_mark_enabled(struct perf_event *event,
+			 struct perf_event_context *ctx)
+{
+	struct perf_event *sub;
+	u64 tstamp = perf_event_time(event);
+
+	if (!event->perfns_defer_enabled)
+		return;
+
+	event->perfns_defer_enabled = 0;
+
+	event->tstamp_enabled = tstamp - event->total_time_enabled;
+	list_for_each_entry(sub, &event->sibling_list, group_entry) {
+		if (sub->state >= PERF_EVENT_STATE_INACTIVE) {
+			sub->tstamp_enabled = tstamp - sub->total_time_enabled;
+			sub->perfns_defer_enabled = 0;
+		}
+	}
+}
+#else /* !CONFIG_PERF_NS */
+static inline bool perf_perfns_match(struct perf_event *event)
+{
+	return true;
+}
+
+static inline void perf_detach_perfns(struct perf_event *event)
+{}
+
+static inline int is_perfns_event(struct perf_event *event)
+{
+	return 0;
+}
+
+static inline u64 perf_perfns_event_perfns_time(struct perf_event *event)
+{
+	return 0;
+}
+
+static inline void update_perfns_time_from_event(struct perf_event *event)
+{
+}
+
+static inline void update_perfns_time_from_cpuctx(struct perf_cpu_context *cpuctx)
+{
+}
+
+static inline void perf_perfns_sched_out(struct task_struct *task,
+					 struct task_struct *next)
+{
+}
+
+static inline void perf_perfns_sched_in(struct task_struct *prev,
+					struct task_struct *task)
+{
+}
+
+static inline void
+perf_perfns_set_timestamp(struct task_struct *task,
+			  struct perf_event_context *ctx)
+{
+}
+
+void
+perf_perfns_switch(struct task_struct *task, struct task_struct *next)
+{
+}
+
+static inline int perf_perfns_connect(struct perf_event *event,
+				      struct perf_event *group_leader)
+{
+	return 0;
+}
+
+static inline void
+perf_perfns_set_shadow_time(struct perf_event *event, u64 now)
+{
+}
+
+static inline u64 perf_perfns_event_time(struct perf_event *event)
+{
+	return 0;
+}
+
+static inline void
+perf_perfns_defer_enabled(struct perf_event *event)
+{
+}
+
+static inline void
+perf_perfns_mark_enabled(struct perf_event *event,
+			 struct perf_event_context *ctx)
+{
+}
+#endif /* CONFIG_PERF_NS */
+
 /*
  * set default to be dependent on timer tick just
  * like original code
@@ -1311,6 +1594,9 @@ static u64 perf_event_time(struct perf_event *event)
 	if (is_cgroup_event(event))
 		return perf_cgroup_event_time(event);
 
+	if (is_perfns_event(event))
+		return perf_perfns_event_time(event);
+
 	return ctx ? ctx->time : 0;
 }
 
@@ -1340,6 +1626,8 @@ static void update_event_times(struct perf_event *event)
 	 */
 	if (is_cgroup_event(event))
 		run_end = perf_cgroup_event_time(event);
+	else if (is_perfns_event(event))
+		run_end = perf_perfns_event_time(event);
 	else if (ctx->is_active)
 		run_end = ctx->time;
 	else
@@ -1407,6 +1695,9 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
 	if (is_cgroup_event(event))
 		ctx->nr_cgroups++;
 
+	if (is_perfns_event(event))
+		ctx->nr_perfns++;
+
 	list_add_rcu(&event->event_entry, &ctx->event_list);
 	ctx->nr_events++;
 	if (event->attr.inherit_stat)
@@ -1601,6 +1892,13 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
 			cpuctx->cgrp = NULL;
 	}
 
+	if (is_perfns_event(event)) {
+		ctx->nr_perfns--;
+		cpuctx = __get_cpu_context(ctx);
+		if (!ctx->nr_perfns)
+			cpuctx->perf_ns = NULL;
+	}
+
 	ctx->nr_events--;
 	if (event->attr.inherit_stat)
 		ctx->nr_stat--;
@@ -1688,7 +1986,8 @@ static inline int
 event_filter_match(struct perf_event *event)
 {
 	return (event->cpu == -1 || event->cpu == smp_processor_id())
-	    && perf_cgroup_match(event) && pmu_filter_match(event);
+	    && perf_cgroup_match(event) && pmu_filter_match(event)
+	    && perf_perfns_match(event);
 }
 
 static void
@@ -1821,6 +2120,7 @@ static void __perf_event_disable(struct perf_event *event,
 
 	update_context_time(ctx);
 	update_cgrp_time_from_event(event);
+	update_perfns_time_from_event(event);
 	update_group_times(event);
 	if (event == event->group_leader)
 		group_sched_out(event, cpuctx, ctx);
@@ -1907,6 +2207,8 @@ static void perf_set_shadow_time(struct perf_event *event,
 	 */
 	if (is_cgroup_event(event))
 		perf_cgroup_set_shadow_time(event, tstamp);
+	else if (is_perfns_event(event))
+		perf_perfns_set_shadow_time(event, tstamp);
 	else
 		event->shadow_ctx_time = tstamp - ctx->timestamp;
 }
@@ -2300,6 +2602,8 @@ static void __perf_event_enable(struct perf_event *event,
 	if (!event_filter_match(event)) {
 		if (is_cgroup_event(event))
 			perf_cgroup_defer_enabled(event);
+		if (is_perfns_event(event))
+			perf_perfns_defer_enabled(event);
 		ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
 		return;
 	}
@@ -2546,6 +2850,7 @@ static void ctx_sched_out(struct perf_event_context *ctx,
 		/* update (and stop) ctx time */
 		update_context_time(ctx);
 		update_cgrp_time_from_cpuctx(cpuctx);
+		update_perfns_time_from_cpuctx(cpuctx);
 	}
 
 	is_active ^= ctx->is_active; /* changed bits */
@@ -2837,6 +3142,9 @@ void __perf_event_task_sched_out(struct task_struct *task,
 	 */
 	if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
 		perf_cgroup_sched_out(task, next);
+
+	if (atomic_read(this_cpu_ptr(&perf_perfns_events)))
+		perf_perfns_sched_out(task, next);
 }
 
 /*
@@ -2864,6 +3172,9 @@ ctx_pinned_sched_in(struct perf_event_context *ctx,
 		if (is_cgroup_event(event))
 			perf_cgroup_mark_enabled(event, ctx);
 
+		if (is_perfns_event(event))
+			perf_perfns_mark_enabled(event, ctx);
+
 		if (group_can_go_on(event, cpuctx, 1))
 			group_sched_in(event, cpuctx, ctx);
 
@@ -2900,6 +3211,9 @@ ctx_flexible_sched_in(struct perf_event_context *ctx,
 		if (is_cgroup_event(event))
 			perf_cgroup_mark_enabled(event, ctx);
 
+		if (is_perfns_event(event))
+			perf_perfns_mark_enabled(event, ctx);
+
 		if (group_can_go_on(event, cpuctx, can_add_hw)) {
 			if (group_sched_in(event, cpuctx, ctx))
 				can_add_hw = 0;
@@ -2936,6 +3250,7 @@ ctx_sched_in(struct perf_event_context *ctx,
 		now = perf_clock();
 		ctx->timestamp = now;
 		perf_cgroup_set_timestamp(task, ctx);
+		perf_perfns_set_timestamp(task, ctx);
 	}
 
 	/*
@@ -3008,6 +3323,9 @@ void __perf_event_task_sched_in(struct task_struct *prev,
 	if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
 		perf_cgroup_sched_in(prev, task);
 
+	if (atomic_read(this_cpu_ptr(&perf_perfns_events)))
+		perf_perfns_sched_in(prev, task);
+
 	for_each_task_context_nr(ctxn) {
 		ctx = task->perf_event_ctxp[ctxn];
 		if (likely(!ctx))
@@ -3353,6 +3671,7 @@ static void __perf_event_read(void *info)
 	if (ctx->is_active) {
 		update_context_time(ctx);
 		update_cgrp_time_from_event(event);
+		update_perfns_time_from_event(event);
 	}
 
 	update_event_times(event);
@@ -3477,6 +3796,7 @@ static int perf_event_read(struct perf_event *event, bool group)
 		if (ctx->is_active) {
 			update_context_time(ctx);
 			update_cgrp_time_from_event(event);
+			update_perfns_time_from_event(event);
 		}
 		if (group)
 			update_group_times(event);
@@ -3672,6 +3992,9 @@ static void unaccount_event_cpu(struct perf_event *event, int cpu)
 
 	if (is_cgroup_event(event))
 		atomic_dec(&per_cpu(perf_cgroup_events, cpu));
+
+	if (is_perfns_event(event))
+		atomic_dec(&per_cpu(perf_perfns_events, cpu));
 }
 
 #ifdef CONFIG_NO_HZ_FULL
@@ -3719,6 +4042,8 @@ static void unaccount_event(struct perf_event *event)
 	}
 	if (is_cgroup_event(event))
 		dec = true;
+	if (is_perfns_event(event))
+		dec = true;
 	if (has_branch_stack(event))
 		dec = true;
 
@@ -3847,6 +4172,9 @@ static void _free_event(struct perf_event *event)
 	if (is_cgroup_event(event))
 		perf_detach_cgroup(event);
 
+	if (is_perfns_event(event))
+		perf_detach_perfns(event);
+
 	if (!event->parent) {
 		if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
 			put_callchain_buffers();
@@ -8655,6 +8983,9 @@ static void account_event_cpu(struct perf_event *event, int cpu)
 
 	if (is_cgroup_event(event))
 		atomic_inc(&per_cpu(perf_cgroup_events, cpu));
+
+	if (is_perfns_event(event))
+		atomic_inc(&per_cpu(perf_perfns_events, cpu));
 }
 
 /* Freq events need the tick to stay alive (see perf_event_task_tick). */
@@ -8703,6 +9034,8 @@ static void account_event(struct perf_event *event)
 		inc = true;
 	if (is_cgroup_event(event))
 		inc = true;
+	if (is_perfns_event(event))
+		inc = true;
 
 	if (inc) {
 		if (atomic_inc_not_zero(&perf_sched_count))
@@ -8851,6 +9184,12 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
 			goto err_ns;
 	}
 
+	if (!task) {
+		err = perf_perfns_connect(event, group_leader);
+		if (err)
+			goto err_ns;
+	}
+
 	pmu = perf_init_event(event);
 	if (!pmu)
 		goto err_ns;
@@ -8900,6 +9239,8 @@ err_pmu:
 err_ns:
 	if (is_cgroup_event(event))
 		perf_detach_cgroup(event);
+	if (is_perfns_event(event))
+		perf_detach_perfns(event);
 	if (event->ns)
 		put_pid_ns(event->ns);
 	kfree(event);
@@ -10367,6 +10708,10 @@ void __init perf_event_init(void)
 	ret = init_hw_breakpoint();
 	WARN(ret, "hw_breakpoint initialization failed with: %d", ret);
 
+	init_perf_ns.info = alloc_percpu(struct perf_ns_info);
+	if (!init_perf_ns.info)
+		WARN(1, "perf namespace memory allocation failed");
+
 	/*
 	 * Build time assertion that we keep the data_head at the intended
 	 * location.  IOW, validation we got the __reserved[] size right.
diff --git a/kernel/perf_namespace.c b/kernel/perf_namespace.c
index 5b76fd8..7991a93 100644
--- a/kernel/perf_namespace.c
+++ b/kernel/perf_namespace.c
@@ -30,6 +30,13 @@ static struct perf_namespace *create_perf_ns(struct user_namespace *user_ns)
 
 	perf_ns->ns.ops = &perfns_operations;
 	perf_ns->user_ns = get_user_ns(user_ns);
+
+	perf_ns->info = alloc_percpu(struct perf_ns_info);
+	if (!perf_ns->info) {
+		kfree(perf_ns);
+		return ERR_PTR(-ENOMEM);
+	}
+
 	return perf_ns;
 }
 
@@ -115,6 +122,7 @@ struct perf_namespace init_perf_ns = {
 	.kref = {
 		.refcount = ATOMIC_INIT(2),
 	},
+	.info = NULL,
 	.user_ns = &init_user_ns,
 	.ns.inum = PROC_PERF_INIT_INO,
 #ifdef CONFIG_PERF_NS