On Mon, Jun 24, 2019 at 02:19:02PM +0200, Peter Zijlstra wrote:
> 
>       close()                                         clone()
> 
>                                                         copy_process()
>                                                           perf_event_init_task()
>                                                             perf_event_init_context()
>                                                               mutex_lock(parent_ctx->mutex)
>                                                               inherit_task_group()
>                                                                 inherit_group()
>                                                                   inherit_event()
>                                                                     mutex_lock(event->child_mutex)
>                                                                     // expose event on child list
>                                                                     list_add_tail()
>                                                                     mutex_unlock(event->child_mutex)
>                                                               mutex_unlock(parent_ctx->mutex)
> 
>                                                           ...
>                                                           goto bad_fork_*
> 
>                                                         bad_fork_cleanup_perf:
>                                                           
> perf_event_free_task()
> 
>         perf_release()
>           perf_event_release_kernel()
>             list_for_each_entry()
>               mutex_lock(ctx->mutex)
>               mutex_lock(event->child_mutex)
>               // event is from the failing inherit
>               // on the other CPU
>               perf_remove_from_context()
>               list_move()
>               mutex_unlock(event->child_mutex)
>               mutex_unlock(ctx->mutex)
> 
>                                                             mutex_lock(ctx->mutex)
>                                                             list_for_each_entry_safe()
>                                                               // event already stolen
>                                                             mutex_unlock(ctx->mutex)
> 
>                                                           delayed_free_task()
>                                                             free_task()
> 
>            list_for_each_entry_safe()
>              list_del()
>              free_event()
>                _free_event()
>                  // and so event->hw.target
>                  // is the already freed failed clone()
>                  if (event->hw.target)
>                    put_task_struct(event->hw.target)
>                      // WHOOPSIE, already quite dead
> 
> 
> Which puts the lie to the comment on perf_event_free_task();
> 'unexposed, unused context' my ass.
> 
> Which is a 'fun' confluence of fail; copy_process() doing an
> unconditional free_task() and not respecting refcounts, and perf having
> creative locking. In particular:
> 
>   82d94856fa22 ("perf/core: Fix lock inversion between perf,trace,cpuhp")
> 
> seems to have overlooked this 'fun' parade.

The below seems to cure things; it uses the fact that detached events
still have a reference count on their context.

So perf_event_free_task() can detect when (child) events have gotten
stolen and wait for it.

The below (which includes a debug printk) confirms the reproducer
triggered the problem by printing a 2 (where a 1 is expected).

Thoughts?

---
 kernel/events/core.c | 34 ++++++++++++++++++++++++++++++----
 1 file changed, 30 insertions(+), 4 deletions(-)

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 10c1dba9068c..e19d036125d1 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -4463,12 +4463,16 @@ static void _free_event(struct perf_event *event)
        if (event->destroy)
                event->destroy(event);
 
-       if (event->ctx)
-               put_ctx(event->ctx);
-
+       /*
+        * Must be after ->destroy(), due to uprobe_perf_close() using
+        * hw.target, but before put_ctx() because of perf_event_free_task().
+        */
        if (event->hw.target)
                put_task_struct(event->hw.target);
 
+       if (event->ctx)
+               put_ctx(event->ctx);
+
        exclusive_event_destroy(event);
        module_put(event->pmu->module);
 
@@ -4648,10 +4652,20 @@ int perf_event_release_kernel(struct perf_event *event)
        mutex_unlock(&event->child_mutex);
 
        list_for_each_entry_safe(child, tmp, &free_list, child_list) {
+               void *var = &child->ctx->refcount;
+
                list_del(&child->child_list);
                free_event(child);
+
+               /*
+                * Wake any perf_event_free_task() waiting for this event to be
+                * freed.
+                */
+               smp_mb(); /* pairs with wait_var_event() */
+               wake_up_var(var);
        }
 
+
 no_ctx:
        put_event(event); /* Must be the 'last' reference */
        return 0;
@@ -11546,7 +11560,19 @@ void perf_event_free_task(struct task_struct *task)
                        perf_free_event(event, ctx);
 
                mutex_unlock(&ctx->mutex);
-               put_ctx(ctx);
+
+               /*
+                * perf_event_release_kernel() could've stolen some of our
+                * child events and still have them on its free_list. In that
+                * case it can do free_event() after the failing copy_process()
+                * has already freed the task, and get a UaF on
+                * event->hw.target.
+                *
+                * Wait for all events to drop their context reference.
+                */
+               printk("%d\n", refcount_read(&ctx->refcount));
+               wait_var_event(&ctx->refcount, refcount_read(&ctx->refcount) == 1);
+               put_ctx(ctx); /* must be last */
        }
 }
 

Reply via email to