On Mon, Jan 25, 2016 at 12:48:46PM +0100, Peter Zijlstra wrote:
> On Fri, Jan 22, 2016 at 11:44:03AM -0800, Alexei Starovoitov wrote:
> > On Fri, Jan 22, 2016 at 01:38:47PM +0100, Peter Zijlstra wrote:
> > > On Fri, Jan 22, 2016 at 01:35:40PM +0200, Alexander Shishkin wrote:
> > > > Peter Zijlstra <pet...@infradead.org> writes:
> > > > 
> > > > > So I think there's a number of problems still :-(
> > 
> > I've been looking at how perf_event->owner is handled and couldn't
> > figure out how you deal with the case of passing perf_event_fd via 
> > scm_rights.
> > It seems one process can open an event, pass it to another process,
> > but when the current process exits, owner will still point to a dead task,
> > since refcount > 0.
> > Which part am I missing?
> 
> Nothing, you raised a good point. I think this shows we cannot link
> !event->owner to an event being 'dead'.
> 
> If we keep these two states separate, the scm_rights thing should work
> again.

Alexander, Alexei,

How about the below? That uses event->state == PERF_EVENT_STATE_EXIT to
indicate the event has been given up by its 'owner' and decouples us
from the actual event->owner logic.

This retains the event->owner and event->owner_list thing purely for the
prctl(.option = PR_TASK_PERF_EVENTS_{EN,DIS}ABLE) calls, but does give
us strict 'owner' semantics in that:

  struct perf_event *my_event = perf_event_create_kernel_counter();

  /* ... */

  perf_event_release_kernel(my_event);

Or

  int fd = sys_perf_event_open(...);

  close(fd); /* last, calls fops::release */

Will destroy the event dead. event::refcount will 'retain' the object
but it will become non-functional and is strictly meant as a temporal
existence guarantee (for when RCU isn't good enough).

So this should restore the scm_rights case, which preserves the fd but
could result in not having event->owner (and therefore being removed
from its owner_list), which is fine.
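
For completeness, the scm_rights case is plain SCM_RIGHTS fd passing over an
AF_UNIX socket; a rough user-space sketch of the sender side (not from this
thread; send_fd() and the socket setup are made up for illustration, the fd
would come from sys_perf_event_open()):

  #include <string.h>
  #include <sys/socket.h>
  #include <sys/uio.h>

  /* Pass @fd to the peer on the connected AF_UNIX socket @sock. */
  static ssize_t send_fd(int sock, int fd)
  {
          char dummy = 'x';
          struct iovec iov = { .iov_base = &dummy, .iov_len = 1 };
          union {
                  struct cmsghdr align;
                  char buf[CMSG_SPACE(sizeof(int))];
          } u;
          struct msghdr msg = {
                  .msg_iov        = &iov,
                  .msg_iovlen     = 1,
                  .msg_control    = u.buf,
                  .msg_controllen = sizeof(u.buf),
          };
          struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);

          cmsg->cmsg_level = SOL_SOCKET;
          cmsg->cmsg_type  = SCM_RIGHTS;
          cmsg->cmsg_len   = CMSG_LEN(sizeof(int));
          memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));

          return sendmsg(sock, &msg, 0);
  }

Once the fd has been received the opening task can exit; the receiver's fd
keeps event->refcount elevated even though the owner task is gone, so
!event->owner can no longer be used to infer 'dead' -- which is why the
patch keys that off PERF_EVENT_STATE_EXIT instead.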

BPF still needs to get fixed to use filedesc references instead.

---

--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -634,9 +634,6 @@ struct perf_event_context {
        int                             nr_cgroups;      /* cgroup evts */
        void                            *task_ctx_data; /* pmu specific data */
        struct rcu_head                 rcu_head;
-
-       struct delayed_work             orphans_remove;
-       bool                            orphans_remove_sched;
 };
 
 /*
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -49,8 +49,6 @@
 
 #include <asm/irq_regs.h>
 
-static struct workqueue_struct *perf_wq;
-
 typedef int (*remote_function_f)(void *);
 
 struct remote_function_call {
@@ -1086,8 +1084,8 @@ static void put_ctx(struct perf_event_co
  * Lock order:
  *     task_struct::perf_event_mutex
  *       perf_event_context::mutex
- *         perf_event_context::lock
  *         perf_event::child_mutex;
+ *           perf_event_context::lock
  *         perf_event::mmap_mutex
  *         mmap_sem
  */
@@ -1646,45 +1644,11 @@ static void perf_group_detach(struct per
                perf_event__header_size(tmp);
 }
 
-/*
- * User event without the task.
- */
 static bool is_orphaned_event(struct perf_event *event)
 {
-       return event && !is_kernel_event(event) && !event->owner;
-}
-
-/*
- * Event has a parent but parent's task finished and it's
- * alive only because of children holding refference.
- */
-static bool is_orphaned_child(struct perf_event *event)
-{
-       return is_orphaned_event(event->parent);
-}
-
-static void orphans_remove_work(struct work_struct *work);
-
-static void schedule_orphans_remove(struct perf_event_context *ctx)
-{
-       if (!ctx->task || ctx->orphans_remove_sched || !perf_wq)
-               return;
-
-       if (queue_delayed_work(perf_wq, &ctx->orphans_remove, 1)) {
-               get_ctx(ctx);
-               ctx->orphans_remove_sched = true;
-       }
+       return event->state == PERF_EVENT_STATE_EXIT;
 }
 
-static int __init perf_workqueue_init(void)
-{
-       perf_wq = create_singlethread_workqueue("perf");
-       WARN(!perf_wq, "failed to create perf workqueue\n");
-       return perf_wq ? 0 : -1;
-}
-
-core_initcall(perf_workqueue_init);
-
 static inline int pmu_filter_match(struct perf_event *event)
 {
        struct pmu *pmu = event->pmu;
@@ -1745,9 +1709,6 @@ event_sched_out(struct perf_event *event
        if (event->attr.exclusive || !cpuctx->active_oncpu)
                cpuctx->exclusive = 0;
 
-       if (is_orphaned_child(event))
-               schedule_orphans_remove(ctx);
-
        perf_pmu_enable(event->pmu);
 }
 
@@ -1771,6 +1732,9 @@ group_sched_out(struct perf_event *group
                cpuctx->exclusive = 0;
 }
 
+#define DETACH_GROUP   0x01
+#define DETACH_STATE   0x02
+
 /*
  * Cross CPU call to remove a performance event
  *
@@ -1783,13 +1747,19 @@ __perf_remove_from_context(struct perf_e
                           struct perf_event_context *ctx,
                           void *info)
 {
-       bool detach_group = (unsigned long)info;
+       unsigned long flags = (unsigned long)info;
 
        event_sched_out(event, cpuctx, ctx);
-       if (detach_group)
+
+       if (flags & DETACH_GROUP)
                perf_group_detach(event);
        list_del_event(event, ctx);
 
+       if (flags & DETACH_STATE) {
+               event->state = PERF_EVENT_STATE_EXIT;
+               perf_event_wakeup(event);
+       }
+
        if (!ctx->nr_events && ctx->is_active) {
                ctx->is_active = 0;
                if (ctx->task) {
@@ -1809,12 +1779,11 @@ __perf_remove_from_context(struct perf_e
  * When called from perf_event_exit_task, it's OK because the
  * context has been detached from its task.
  */
-static void perf_remove_from_context(struct perf_event *event, bool detach_group)
+static void perf_remove_from_context(struct perf_event *event, unsigned long flags)
 {
        lockdep_assert_held(&event->ctx->mutex);
 
-       event_function_call(event, __perf_remove_from_context,
-                           (void *)(unsigned long)detach_group);
+       event_function_call(event, __perf_remove_from_context, (void *)flags);
 }
 
 /*
@@ -1980,9 +1949,6 @@ event_sched_in(struct perf_event *event,
        if (event->attr.exclusive)
                cpuctx->exclusive = 1;
 
-       if (is_orphaned_child(event))
-               schedule_orphans_remove(ctx);
-
 out:
        perf_pmu_enable(event->pmu);
 
@@ -2253,7 +2219,8 @@ static void __perf_event_enable(struct p
        struct perf_event *leader = event->group_leader;
        struct perf_event_context *task_ctx;
 
-       if (event->state >= PERF_EVENT_STATE_INACTIVE)
+       if (event->state >= PERF_EVENT_STATE_INACTIVE ||
+           event->state <= PERF_EVENT_STATE_ERROR)
                return;
 
        update_context_time(ctx);
@@ -2298,7 +2265,8 @@ static void _perf_event_enable(struct pe
        struct perf_event_context *ctx = event->ctx;
 
        raw_spin_lock_irq(&ctx->lock);
-       if (event->state >= PERF_EVENT_STATE_INACTIVE) {
+       if (event->state >= PERF_EVENT_STATE_INACTIVE ||
+           event->state < PERF_EVENT_STATE_ERROR) {
                raw_spin_unlock_irq(&ctx->lock);
                return;
        }
@@ -3363,7 +3331,6 @@ static void __perf_event_init_context(st
        INIT_LIST_HEAD(&ctx->flexible_groups);
        INIT_LIST_HEAD(&ctx->event_list);
        atomic_set(&ctx->refcount, 1);
-       INIT_DELAYED_WORK(&ctx->orphans_remove, orphans_remove_work);
 }
 
 static struct perf_event_context *
@@ -3665,29 +3632,6 @@ static bool exclusive_event_installable(
        return true;
 }
 
-static void __free_event(struct perf_event *event)
-{
-       if (!event->parent) {
-               if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
-                       put_callchain_buffers();
-       }
-
-       perf_event_free_bpf_prog(event);
-
-       if (event->destroy)
-               event->destroy(event);
-
-       if (event->ctx)
-               put_ctx(event->ctx);
-
-       if (event->pmu) {
-               exclusive_event_destroy(event);
-               module_put(event->pmu->module);
-       }
-
-       call_rcu(&event->rcu_head, free_event_rcu);
-}
-
 static void _free_event(struct perf_event *event)
 {
        irq_work_sync(&event->pending);
@@ -3709,7 +3653,25 @@ static void _free_event(struct perf_even
        if (is_cgroup_event(event))
                perf_detach_cgroup(event);
 
-       __free_event(event);
+       if (!event->parent) {
+               if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
+                       put_callchain_buffers();
+       }
+
+       perf_event_free_bpf_prog(event);
+
+       if (event->destroy)
+               event->destroy(event);
+
+       if (event->ctx)
+               put_ctx(event->ctx);
+
+       if (event->pmu) {
+               exclusive_event_destroy(event);
+               module_put(event->pmu->module);
+       }
+
+       call_rcu(&event->rcu_head, free_event_rcu);
 }
 
 /*
@@ -3771,8 +3733,10 @@ static void perf_remove_from_owner(struc
                 * ensured they're done, and we can proceed with freeing the
                 * event.
                 */
-               if (event->owner)
+               if (event->owner) {
                        list_del_init(&event->owner_entry);
+                       event->owner = NULL;
+               }
                mutex_unlock(&owner->perf_event_mutex);
                put_task_struct(owner);
        }
@@ -3780,11 +3744,23 @@ static void perf_remove_from_owner(struc
 
 static void put_event(struct perf_event *event)
 {
-       struct perf_event_context *ctx;
-
        if (!atomic_long_dec_and_test(&event->refcount))
                return;
 
+       _free_event(event);
+}
+
+/*
+ * Kill an event dead; while event:refcount will preserve the event
+ * object, it will not preserve its functionality. Once the last 'user'
+ * gives up the object, we'll destroy the thing.
+ */
+int perf_event_release_kernel(struct perf_event *event)
+{
+       struct perf_event_context *ctx;
+       struct perf_event *child, *tmp;
+       LIST_HEAD(child_list);
+
        if (!is_kernel_event(event))
                perf_remove_from_owner(event);
 
@@ -3802,14 +3778,57 @@ static void put_event(struct perf_event
         */
        ctx = perf_event_ctx_lock_nested(event, SINGLE_DEPTH_NESTING);
        WARN_ON_ONCE(ctx->parent_ctx);
-       perf_remove_from_context(event, true);
+       perf_remove_from_context(event, DETACH_GROUP | DETACH_STATE);
        perf_event_ctx_unlock(event, ctx);
 
-       _free_event(event);
-}
+       /*
+        * At this point we must have event->state == PERF_EVENT_STATE_EXIT,
+        * either from the above perf_remove_from_context() or through
+        * __perf_event_exit_task().
+        *
+        * Therefore, anybody acquiring event->child_mutex after the below
+        * list_splice_init() _must_ also see this, most importantly
+        * inherit_event() which will avoid placing more children on the
+        * list.
+        *
+        * Thus this guarantees that we will in fact observe and kill _ALL_
+        * child events.
+        */
+       WARN_ON_ONCE(event->state != PERF_EVENT_STATE_EXIT);
 
-int perf_event_release_kernel(struct perf_event *event)
-{
+       /*
+        * event::child_mutex nests inside ctx::mutex, so move children
+        * to a safe place first and avoid inversion
+        */
+       mutex_lock(&event->child_mutex);
+       list_splice_init(&event->child_list, &child_list);
+       mutex_unlock(&event->child_mutex);
+
+       list_for_each_entry_safe(child, tmp, &child_list, child_list) {
+               struct perf_event_context *ctx;
+
+               /*
+                * This is somewhat similar to perf_free_event(),
+                * except that these events are alive and need
+                * proper perf_remove_from_context().
+                */
+               ctx = perf_event_ctx_lock(child);
+               perf_remove_from_context(child, DETACH_GROUP);
+               perf_event_ctx_unlock(child, ctx);
+
+               list_del(&child->child_list);
+
+               /* Children will have exactly one reference */
+               free_event(child);
+
+               /*
+                * This matches the refcount bump in inherit_event();
+                * this can't be the last reference.
+                */
+               put_event(event);
+       }
+
+       /* Must be the last reference */
        put_event(event);
        return 0;
 }
@@ -3820,46 +3839,10 @@ EXPORT_SYMBOL_GPL(perf_event_release_ker
  */
 static int perf_release(struct inode *inode, struct file *file)
 {
-       put_event(file->private_data);
+       perf_event_release_kernel(file->private_data);
        return 0;
 }
 
-/*
- * Remove all orphanes events from the context.
- */
-static void orphans_remove_work(struct work_struct *work)
-{
-       struct perf_event_context *ctx;
-       struct perf_event *event, *tmp;
-
-       ctx = container_of(work, struct perf_event_context,
-                          orphans_remove.work);
-
-       mutex_lock(&ctx->mutex);
-       list_for_each_entry_safe(event, tmp, &ctx->event_list, event_entry) {
-               struct perf_event *parent_event = event->parent;
-
-               if (!is_orphaned_child(event))
-                       continue;
-
-               perf_remove_from_context(event, true);
-
-               mutex_lock(&parent_event->child_mutex);
-               list_del_init(&event->child_list);
-               mutex_unlock(&parent_event->child_mutex);
-
-               free_event(event);
-               put_event(parent_event);
-       }
-
-       raw_spin_lock_irq(&ctx->lock);
-       ctx->orphans_remove_sched = false;
-       raw_spin_unlock_irq(&ctx->lock);
-       mutex_unlock(&ctx->mutex);
-
-       put_ctx(ctx);
-}
-
 u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
 {
        struct perf_event *child;
@@ -8432,11 +8415,11 @@ SYSCALL_DEFINE5(perf_event_open,
                 * See perf_event_ctx_lock() for comments on the details
                 * of swizzling perf_event::ctx.
                 */
-               perf_remove_from_context(group_leader, false);
+               perf_remove_from_context(group_leader, 0);
 
                list_for_each_entry(sibling, &group_leader->sibling_list,
                                    group_entry) {
-                       perf_remove_from_context(sibling, false);
+                       perf_remove_from_context(sibling, 0);
                        put_ctx(gctx);
                }
 
@@ -8616,7 +8599,7 @@ void perf_pmu_migrate_context(struct pmu
        mutex_lock_double(&src_ctx->mutex, &dst_ctx->mutex);
        list_for_each_entry_safe(event, tmp, &src_ctx->event_list,
                                 event_entry) {
-               perf_remove_from_context(event, false);
+               perf_remove_from_context(event, 0);
                unaccount_event_cpu(event, src_cpu);
                put_ctx(src_ctx);
                list_add(&event->migrate_entry, &events);
@@ -8665,7 +8648,7 @@ void perf_pmu_migrate_context(struct pmu
 EXPORT_SYMBOL_GPL(perf_pmu_migrate_context);
 
 static void sync_child_event(struct perf_event *child_event,
-                              struct task_struct *child)
+                            struct task_struct *child)
 {
        struct perf_event *parent_event = child_event->parent;
        u64 child_val;
@@ -8684,25 +8667,6 @@ static void sync_child_event(struct perf
        atomic64_add(child_event->total_time_running,
                     &parent_event->child_total_time_running);
 
-       /*
-        * Remove this event from the parent's list
-        */
-       WARN_ON_ONCE(parent_event->ctx->parent_ctx);
-       mutex_lock(&parent_event->child_mutex);
-       list_del_init(&child_event->child_list);
-       mutex_unlock(&parent_event->child_mutex);
-
-       /*
-        * Make sure user/parent get notified, that we just
-        * lost one event.
-        */
-       perf_event_wakeup(parent_event);
-
-       /*
-        * Release the parent event, if this was the last
-        * reference to it.
-        */
-       put_event(parent_event);
 }
 
 static void
@@ -8710,6 +8674,8 @@ __perf_event_exit_task(struct perf_event
                         struct perf_event_context *child_ctx,
                         struct task_struct *child)
 {
+       struct perf_event *parent_event = child_event->parent;
+
        /*
         * Do not destroy the 'original' grouping; because of the context
         * switch optimization the original events could've ended up in a
@@ -8728,20 +8694,39 @@ __perf_event_exit_task(struct perf_event
        if (!!child_event->parent)
                perf_group_detach(child_event);
        list_del_event(child_event, child_ctx);
+       child_event->state = PERF_EVENT_STATE_EXIT; /* see perf_event_release_kernel() */
        raw_spin_unlock_irq(&child_ctx->lock);
 
        /*
-        * It can happen that the parent exits first, and has events
-        * that are still around due to the child reference. These
-        * events need to be zapped.
+        * Parent events are governed by their filedesc, retain them.
         */
-       if (child_event->parent) {
-               sync_child_event(child_event, child);
-               free_event(child_event);
-       } else {
-               child_event->state = PERF_EVENT_STATE_EXIT;
+       if (!child_event->parent) {
                perf_event_wakeup(child_event);
+               return;
        }
+       /*
+        * This is a child event, cleanup and free.
+        */
+
+       /*
+        * Fold delta back into parent counts.
+        */
+       sync_child_event(child_event, child);
+
+       /*
+        * Remove this event from the parent's list.
+        */
+       WARN_ON_ONCE(parent_event->ctx->parent_ctx);
+       mutex_lock(&parent_event->child_mutex);
+       list_del_init(&child_event->child_list);
+       mutex_unlock(&parent_event->child_mutex);
+
+       /*
+        * Kick perf_poll for is_event_hup().
+        */
+       perf_event_wakeup(parent_event);
+       free_event(child_event);
+       put_event(parent_event);
 }
 
 static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
@@ -8973,8 +8958,15 @@ inherit_event(struct perf_event *parent_
        if (IS_ERR(child_event))
                return child_event;
 
+       /*
+        * is_orphaned_event() and list_add_tail(&parent_event->child_list)
+        * must be under the same lock in order to serialize against
+        * perf_event_release_kernel().
+        */
+       mutex_lock(&parent_event->child_mutex);
        if (is_orphaned_event(parent_event) ||
            !atomic_long_inc_not_zero(&parent_event->refcount)) {
+               mutex_unlock(&parent_event->child_mutex);
                free_event(child_event);
                return NULL;
        }
@@ -9022,8 +9014,6 @@ inherit_event(struct perf_event *parent_
        /*
         * Link this into the parent event's child list
         */
-       WARN_ON_ONCE(parent_event->ctx->parent_ctx);
-       mutex_lock(&parent_event->child_mutex);
        list_add_tail(&child_event->child_list, &parent_event->child_list);
        mutex_unlock(&parent_event->child_mutex);
 
