Whenever the perf tool is executed inside a container, this
patch restricts the events to the perf namespace in which
the perf tool is executing.

This patch is based on the existing support for tracing
with cgroups.
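
The heart of the restriction is a namespace check wired into
event_filter_match(), mirroring perf_cgroup_match(). Condensed from
the core.c hunk below, the rule is:

    static inline bool perf_perfns_match(struct perf_event *event)
    {
            struct perf_cpu_context *cpuctx = __get_cpu_context(event->ctx);

            /* @event doesn't care about perfns */
            if (!event->perf_ns)
                    return true;

            return cpuctx->perf_ns == event->perf_ns;
    }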

TODO:
    - Avoid code duplication.

Signed-off-by: Aravinda Prasad <aravi...@linux.vnet.ibm.com>
---
 include/linux/perf_event.h     |    8 +
 include/linux/perf_namespace.h |    6 +
 kernel/events/core.c           |  347 ++++++++++++++++++++++++++++++++++++++++
 kernel/perf_namespace.c        |    8 +
 4 files changed, 368 insertions(+), 1 deletion(-)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 1a827ce..8d797d9 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -43,6 +43,7 @@ struct perf_guest_info_callbacks {
 #include <linux/hrtimer.h>
 #include <linux/fs.h>
 #include <linux/pid_namespace.h>
+#include <linux/perf_namespace.h>
 #include <linux/workqueue.h>
 #include <linux/ftrace.h>
 #include <linux/cpu.h>
@@ -656,6 +657,11 @@ struct perf_event {
        struct rcu_head                 rcu_head;
 
        struct pid_namespace            *ns;
+#ifdef CONFIG_PERF_NS
+       struct perf_namespace           *perf_ns;
+       int     perfns_defer_enabled;
+#endif
+
        u64                             id;
 
        u64                             (*clock)(void);
@@ -725,6 +731,7 @@ struct perf_event_context {
        u64                             generation;
        int                             pin_count;
        int                             nr_cgroups;      /* cgroup evts */
+       int                             nr_perfns;
        void                            *task_ctx_data; /* pmu specific data */
        struct rcu_head                 rcu_head;
 };
@@ -751,6 +758,7 @@ struct perf_cpu_context {
 
        struct pmu                      *unique_pmu;
        struct perf_cgroup              *cgrp;
+       struct perf_namespace           *perf_ns;
 };
 
 struct perf_output_handle {
diff --git a/include/linux/perf_namespace.h b/include/linux/perf_namespace.h
index 9713724..2aad0e9 100644
--- a/include/linux/perf_namespace.h
+++ b/include/linux/perf_namespace.h
@@ -8,8 +8,14 @@
 struct user_namespace;
 extern struct user_namespace init_user_ns;
 
+struct perf_ns_info {
+       u64             time;
+       u64             timestamp;
+};
+
 struct perf_namespace {
        struct kref kref;
+       struct perf_ns_info __percpu *info;
        struct user_namespace *user_ns; /* Owning user namespace */
        struct ns_common ns;
 };
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 274450e..757a169 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -334,6 +334,7 @@ static DEFINE_MUTEX(perf_sched_mutex);
 static atomic_t perf_sched_count;
 
 static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
+static DEFINE_PER_CPU(atomic_t, perf_perfns_events);
 static DEFINE_PER_CPU(int, perf_sched_cb_usages);
 
 static atomic_t nr_mmap_events __read_mostly;
@@ -914,6 +915,288 @@ perf_cgroup_mark_enabled(struct perf_event *event,
 }
 #endif
 
+#ifdef CONFIG_PERF_NS
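+/*
+ * Namespace analogue of perf_cgroup_match(): an event bound to a perf
+ * namespace is only eligible while this CPU context is running that
+ * same namespace.
+ */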
+static inline bool perf_perfns_match(struct perf_event *event)
+{
+       struct perf_event_context *ctx = event->ctx;
+       struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
+
+       /* @event doesn't care about perfns */
+       if (!event->perf_ns)
+               return true;
+
+       if (cpuctx->perf_ns != event->perf_ns)
+               return false;
+
+       return true;
+}
+
+static inline void perf_detach_perfns(struct perf_event *event)
+{
+       event->perf_ns = NULL;
+}
+
+static inline int is_perfns_event(struct perf_event *event)
+{
+       return event->perf_ns != NULL;
+}
+
+static inline u64 perf_perfns_event_time(struct perf_event *event)
+{
+       struct perf_ns_info *t;
+
+       if (!event->perf_ns->info)
+               return 0;
+
+       t = per_cpu_ptr(event->perf_ns->info, event->cpu);
+       return t->time;
+}
+
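+/*
+ * Fold the wall clock elapsed since the last stored timestamp into
+ * the namespace's per-CPU time accumulator.
+ */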
+static inline void __update_perfns_time(struct perf_namespace *p_ns)
+{
+       struct perf_ns_info *info;
+       u64 now;
+
+       if (!p_ns->info)
+               return;
+
+       now = perf_clock();
+       info = this_cpu_ptr(p_ns->info);
+
+       info->time += now - info->timestamp;
+       info->timestamp = now;
+}
+
+static inline void update_perfns_time_from_cpuctx(struct perf_cpu_context *cpuctx)
+{
+       struct perf_namespace *perfns_out = cpuctx->perf_ns;
+
+       if (perfns_out)
+               __update_perfns_time(perfns_out);
+}
+
+static inline void update_perfns_time_from_event(struct perf_event *event)
+{
+       struct perf_namespace *perf_ns = current->nsproxy->perf_ns;
+
+       if (!is_perfns_event(event))
+               return;
+
+       if (perf_ns == event->perf_ns)
+               __update_perfns_time(event->perf_ns);
+}
+
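+/*
+ * Re-base the namespace's per-CPU timestamp on the context timestamp;
+ * called from ctx_sched_in() after ctx->timestamp is refreshed.
+ */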
+static inline void
+perf_perfns_set_timestamp(struct task_struct *task,
+                         struct perf_event_context *ctx)
+{
+       struct perf_namespace *perf_ns;
+       struct perf_ns_info *info;
+
+       if (!task || !ctx->nr_perfns)
+               return;
+
+       perf_ns = task->nsproxy->perf_ns;
+       if (!perf_ns->info)
+               return;
+
+       info = this_cpu_ptr(perf_ns->info);
+       info->timestamp = ctx->timestamp;
+}
+
+#define PERF_PERFNS_SWOUT      0x1 /* perfns switch out every event */
+#define PERF_PERFNS_SWIN       0x2 /* perfns switch in events based on task */
+
+/*
+ * mode SWOUT : schedule out everything
+ * mode SWIN : schedule in based on perfns for next
+ */
+static void perf_perfns_switch(struct task_struct *task, int mode)
+{
+       struct perf_cpu_context *cpuctx;
+       struct pmu *pmu;
+       unsigned long flags;
+
+       local_irq_save(flags);
+
+       list_for_each_entry_rcu(pmu, &pmus, entry) {
+               cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
+               if (cpuctx->unique_pmu != pmu)
+                       continue; /* ensure we process each cpuctx once */
+
+               if (cpuctx->ctx.nr_perfns > 0) {
+                       perf_ctx_lock(cpuctx, cpuctx->task_ctx);
+                       perf_pmu_disable(cpuctx->ctx.pmu);
+
+                       if (mode & PERF_PERFNS_SWOUT) {
+                               cpu_ctx_sched_out(cpuctx, EVENT_ALL);
+                               /*
+                                * must not be done before ctxswout due
+                                * to event_filter_match() in event_sched_out()
+                                */
+                               cpuctx->perf_ns = NULL;
+                       }
+
+                       if (mode & PERF_PERFNS_SWIN) {
+                               WARN_ON_ONCE(cpuctx->perf_ns);
+
+                               cpuctx->perf_ns = task->nsproxy->perf_ns;
+                               cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
+                       }
+                       perf_pmu_enable(cpuctx->ctx.pmu);
+                       perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
+               }
+       }
+
+       local_irq_restore(flags);
+}
+
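+/*
+ * Called from __perf_event_task_sched_out(): unconditionally switch
+ * out any namespace events currently scheduled on this CPU.
+ */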
+static inline void perf_perfns_sched_out(struct task_struct *task,
+                                        struct task_struct *next)
+{
+       rcu_read_lock();
+       perf_perfns_switch(task, PERF_PERFNS_SWOUT);
+       rcu_read_unlock();
+}
+
+static inline void perf_perfns_sched_in(struct task_struct *prev,
+                                       struct task_struct *task)
+{
+       rcu_read_lock();
+
+       if (task->nsproxy->perf_ns != &init_perf_ns)
+               perf_perfns_switch(task, PERF_PERFNS_SWIN);
+
+       rcu_read_unlock();
+}
+
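+/*
+ * Called at event creation: bind the event to the creator's perf
+ * namespace (unless it is the init namespace) and reject groups that
+ * would mix events from different namespaces.
+ */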
+static inline int perf_perfns_connect(struct perf_event *event,
+                                     struct perf_event *group_leader)
+{
+       if (current->nsproxy->perf_ns != &init_perf_ns) {
+               /*
+                * If we are called from our own perf namespace, set
+                * event->perf_ns
+                */
+               event->perf_ns = current->nsproxy->perf_ns;
+
+               if (group_leader && group_leader->perf_ns != event->perf_ns) {
+                       perf_detach_perfns(event);
+                       return -EINVAL;
+               }
+       }
+       return 0;
+}
+
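+/*
+ * Namespace analogue of perf_cgroup_set_shadow_time(): base the
+ * shadow time on the namespace's per-CPU timestamp.
+ */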
+static inline void
+perf_perfns_set_shadow_time(struct perf_event *event, u64 now)
+{
+       struct perf_ns_info *t;
+
+       t = per_cpu_ptr(event->perf_ns->info, event->cpu);
+       event->shadow_ctx_time = now - t->timestamp;
+}
+
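+/*
+ * If the event's namespace is not current on this CPU, defer enabling
+ * it until the namespace is switched in (cf. perf_cgroup_defer_enabled()).
+ */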
+static inline void
+perf_perfns_defer_enabled(struct perf_event *event)
+{
+       if (is_perfns_event(event) && !perf_perfns_match(event))
+               event->perfns_defer_enabled = 1;
+}
+
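+/*
+ * Clear a pending deferred enable and fix up the enable timestamps of
+ * the event and its siblings (cf. perf_cgroup_mark_enabled()).
+ */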
+static inline void
+perf_perfns_mark_enabled(struct perf_event *event,
+                        struct perf_event_context *ctx)
+{
+       struct perf_event *sub;
+       u64 tstamp = perf_event_time(event);
+
+       if (!event->perfns_defer_enabled)
+               return;
+
+       event->perfns_defer_enabled = 0;
+
+       event->tstamp_enabled = tstamp - event->total_time_enabled;
+       list_for_each_entry(sub, &event->sibling_list, group_entry) {
+               if (sub->state >= PERF_EVENT_STATE_INACTIVE) {
+                       sub->tstamp_enabled = tstamp - sub->total_time_enabled;
+                       sub->perfns_defer_enabled = 0;
+               }
+       }
+}
+#else /* !CONFIG_PERF_NS */
+static inline bool perf_perfns_match(struct perf_event *event)
+{
+       return true;
+}
+
+static inline void perf_detach_perfns(struct perf_event *event)
+{}
+
+static inline int is_perfns_event(struct perf_event *event)
+{
+       return 0;
+}
+
+static inline void update_perfns_time_from_event(struct perf_event *event)
+{
+}
+
+static inline void update_perfns_time_from_cpuctx(struct perf_cpu_context *cpuctx)
+{
+}
+
+static inline void perf_perfns_sched_out(struct task_struct *task,
+                                        struct task_struct *next)
+{
+}
+
+static inline void perf_perfns_sched_in(struct task_struct *prev,
+                                       struct task_struct *task)
+{
+}
+
+static inline void
+perf_perfns_set_timestamp(struct task_struct *task,
+                         struct perf_event_context *ctx)
+{
+}
+
+static inline void
+perf_perfns_switch(struct task_struct *task, int mode)
+{
+}
+
+static inline int perf_perfns_connect(struct perf_event *event,
+                                     struct perf_event *group_leader)
+{
+       return 0;
+}
+
+static inline void
+perf_perfns_set_shadow_time(struct perf_event *event, u64 now)
+{
+}
+
+static inline u64 perf_perfns_event_time(struct perf_event *event)
+{
+       return 0;
+}
+
+static inline void
+perf_perfns_defer_enabled(struct perf_event *event)
+{
+}
+
+static inline void
+perf_perfns_mark_enabled(struct perf_event *event,
+                        struct perf_event_context *ctx)
+{
+}
+#endif /* CONFIG_PERF_NS */
+
 /*
  * set default to be dependent on timer tick just
  * like original code
@@ -1311,6 +1594,9 @@ static u64 perf_event_time(struct perf_event *event)
        if (is_cgroup_event(event))
                return perf_cgroup_event_time(event);
 
+       if (is_perfns_event(event))
+               return perf_perfns_event_time(event);
+
        return ctx ? ctx->time : 0;
 }
 
@@ -1340,6 +1626,8 @@ static void update_event_times(struct perf_event *event)
         */
        if (is_cgroup_event(event))
                run_end = perf_cgroup_event_time(event);
+       else if (is_perfns_event(event))
+               run_end = perf_perfns_event_time(event);
        else if (ctx->is_active)
                run_end = ctx->time;
        else
@@ -1407,6 +1695,9 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
        if (is_cgroup_event(event))
                ctx->nr_cgroups++;
 
+       if (is_perfns_event(event))
+               ctx->nr_perfns++;
+
        list_add_rcu(&event->event_entry, &ctx->event_list);
        ctx->nr_events++;
        if (event->attr.inherit_stat)
@@ -1601,6 +1892,13 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
                        cpuctx->cgrp = NULL;
        }
 
+       if (is_perfns_event(event)) {
+               ctx->nr_perfns--;
+               cpuctx = __get_cpu_context(ctx);
+               if (!ctx->nr_perfns)
+                       cpuctx->perf_ns = NULL;
+       }
+
        ctx->nr_events--;
        if (event->attr.inherit_stat)
                ctx->nr_stat--;
@@ -1688,7 +1986,8 @@ static inline int
 event_filter_match(struct perf_event *event)
 {
        return (event->cpu == -1 || event->cpu == smp_processor_id())
-           && perf_cgroup_match(event) && pmu_filter_match(event);
+           && perf_cgroup_match(event) && pmu_filter_match(event)
+           && perf_perfns_match(event);
 }
 
 static void
@@ -1821,6 +2120,7 @@ static void __perf_event_disable(struct perf_event *event,
 
        update_context_time(ctx);
        update_cgrp_time_from_event(event);
+       update_perfns_time_from_event(event);
        update_group_times(event);
        if (event == event->group_leader)
                group_sched_out(event, cpuctx, ctx);
@@ -1907,6 +2207,8 @@ static void perf_set_shadow_time(struct perf_event *event,
         */
        if (is_cgroup_event(event))
                perf_cgroup_set_shadow_time(event, tstamp);
+       else if (is_perfns_event(event))
+               perf_perfns_set_shadow_time(event, tstamp);
        else
                event->shadow_ctx_time = tstamp - ctx->timestamp;
 }
@@ -2300,6 +2602,8 @@ static void __perf_event_enable(struct perf_event *event,
        if (!event_filter_match(event)) {
                if (is_cgroup_event(event))
                        perf_cgroup_defer_enabled(event);
+               if (is_perfns_event(event))
+                       perf_perfns_defer_enabled(event);
                ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
                return;
        }
@@ -2546,6 +2850,7 @@ static void ctx_sched_out(struct perf_event_context *ctx,
                /* update (and stop) ctx time */
                update_context_time(ctx);
                update_cgrp_time_from_cpuctx(cpuctx);
+               update_perfns_time_from_cpuctx(cpuctx);
        }
 
        is_active ^= ctx->is_active; /* changed bits */
@@ -2837,6 +3142,9 @@ void __perf_event_task_sched_out(struct task_struct *task,
         */
        if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
                perf_cgroup_sched_out(task, next);
+
+       if (atomic_read(this_cpu_ptr(&perf_perfns_events)))
+               perf_perfns_sched_out(task, next);
 }
 
 /*
@@ -2864,6 +3172,9 @@ ctx_pinned_sched_in(struct perf_event_context *ctx,
                if (is_cgroup_event(event))
                        perf_cgroup_mark_enabled(event, ctx);
 
+               if (is_perfns_event(event))
+                       perf_perfns_mark_enabled(event, ctx);
+
                if (group_can_go_on(event, cpuctx, 1))
                        group_sched_in(event, cpuctx, ctx);
 
@@ -2900,6 +3211,9 @@ ctx_flexible_sched_in(struct perf_event_context *ctx,
                if (is_cgroup_event(event))
                        perf_cgroup_mark_enabled(event, ctx);
 
+               if (is_perfns_event(event))
+                       perf_perfns_mark_enabled(event, ctx);
+
                if (group_can_go_on(event, cpuctx, can_add_hw)) {
                        if (group_sched_in(event, cpuctx, ctx))
                                can_add_hw = 0;
@@ -2936,6 +3250,7 @@ ctx_sched_in(struct perf_event_context *ctx,
                now = perf_clock();
                ctx->timestamp = now;
                perf_cgroup_set_timestamp(task, ctx);
+               perf_perfns_set_timestamp(task, ctx);
        }
 
        /*
@@ -3008,6 +3323,9 @@ void __perf_event_task_sched_in(struct task_struct *prev,
        if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
                perf_cgroup_sched_in(prev, task);
 
+       if (atomic_read(this_cpu_ptr(&perf_perfns_events)))
+               perf_perfns_sched_in(prev, task);
+
        for_each_task_context_nr(ctxn) {
                ctx = task->perf_event_ctxp[ctxn];
                if (likely(!ctx))
@@ -3353,6 +3671,7 @@ static void __perf_event_read(void *info)
        if (ctx->is_active) {
                update_context_time(ctx);
                update_cgrp_time_from_event(event);
+               update_perfns_time_from_event(event);
        }
 
        update_event_times(event);
@@ -3477,6 +3796,7 @@ static int perf_event_read(struct perf_event *event, bool group)
                if (ctx->is_active) {
                        update_context_time(ctx);
                        update_cgrp_time_from_event(event);
+                       update_perfns_time_from_event(event);
                }
                if (group)
                        update_group_times(event);
@@ -3672,6 +3992,9 @@ static void unaccount_event_cpu(struct perf_event *event, int cpu)
 
        if (is_cgroup_event(event))
                atomic_dec(&per_cpu(perf_cgroup_events, cpu));
+
+       if (is_perfns_event(event))
+               atomic_dec(&per_cpu(perf_perfns_events, cpu));
 }
 
 #ifdef CONFIG_NO_HZ_FULL
@@ -3719,6 +4042,8 @@ static void unaccount_event(struct perf_event *event)
        }
        if (is_cgroup_event(event))
                dec = true;
+       if (is_perfns_event(event))
+               dec = true;
        if (has_branch_stack(event))
                dec = true;
 
@@ -3847,6 +4172,9 @@ static void _free_event(struct perf_event *event)
        if (is_cgroup_event(event))
                perf_detach_cgroup(event);
 
+       if (is_perfns_event(event))
+               perf_detach_perfns(event);
+
        if (!event->parent) {
                if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
                        put_callchain_buffers();
@@ -8655,6 +8983,9 @@ static void account_event_cpu(struct perf_event *event, int cpu)
 
        if (is_cgroup_event(event))
                atomic_inc(&per_cpu(perf_cgroup_events, cpu));
+
+       if (is_perfns_event(event))
+               atomic_inc(&per_cpu(perf_perfns_events, cpu));
 }
 
 /* Freq events need the tick to stay alive (see perf_event_task_tick). */
@@ -8703,6 +9034,8 @@ static void account_event(struct perf_event *event)
                inc = true;
        if (is_cgroup_event(event))
                inc = true;
+       if (is_perfns_event(event))
+               inc = true;
 
        if (inc) {
                if (atomic_inc_not_zero(&perf_sched_count))
@@ -8851,6 +9184,12 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
                        goto err_ns;
        }
 
+       if (!task) {
+               err = perf_perfns_connect(event, group_leader);
+               if (err)
+                       goto err_ns;
+       }
+
        pmu = perf_init_event(event);
        if (!pmu)
                goto err_ns;
@@ -8900,6 +9239,8 @@ err_pmu:
 err_ns:
        if (is_cgroup_event(event))
                perf_detach_cgroup(event);
+       if (is_perfns_event(event))
+               perf_detach_perfns(event);
        if (event->ns)
                put_pid_ns(event->ns);
        kfree(event);
@@ -10367,6 +10708,10 @@ void __init perf_event_init(void)
        ret = init_hw_breakpoint();
        WARN(ret, "hw_breakpoint initialization failed with: %d", ret);
 
+       init_perf_ns.info = alloc_percpu(struct perf_ns_info);
+       WARN(!init_perf_ns.info, "perf namespace memory allocation failed");
+
        /*
         * Build time assertion that we keep the data_head at the intended
         * location.  IOW, validation we got the __reserved[] size right.
diff --git a/kernel/perf_namespace.c b/kernel/perf_namespace.c
index 5b76fd8..7991a93 100644
--- a/kernel/perf_namespace.c
+++ b/kernel/perf_namespace.c
@@ -30,6 +30,13 @@ static struct perf_namespace *create_perf_ns(struct user_namespace *user_ns)
 
        perf_ns->ns.ops = &perfns_operations;
        perf_ns->user_ns = get_user_ns(user_ns);
+
+       perf_ns->info = alloc_percpu(struct perf_ns_info);
+       if (!perf_ns->info) {
+               kfree(perf_ns);
+               return ERR_PTR(-ENOMEM);
+       }
+
        return perf_ns;
 }
 
@@ -115,6 +122,7 @@ struct perf_namespace init_perf_ns = {
        .kref = {
                .refcount = ATOMIC_INIT(2),
        },
+       .info = NULL,
        .user_ns = &init_user_ns,
        .ns.inum = PROC_PERF_INIT_INO,
 #ifdef CONFIG_PERF_NS
