Linus,

please pull the latest perf-urgent-for-linus git tree from:

   git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git perf-urgent-for-linus

Four patches which all address lock inversions and deadlocks in the perf
core code and in the Intel debug store (DS) code.

Thanks,

        tglx

------------------>
Peter Zijlstra (4):
      perf/core: Fix lock inversion between perf,trace,cpuhp
      perf/core: Fix another perf,trace,cpuhp lock inversion
      perf/core: Fix ctx::mutex deadlock
      perf/x86: Fix perf,x86,cpuhp deadlock


 arch/x86/events/intel/ds.c | 33 +++++++++++++++++---------------
 kernel/events/core.c       | 47 +++++++++++++++++++++++++++++++++++++++++-----
 2 files changed, 60 insertions(+), 20 deletions(-)
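
All four fixes revolve around the lock ordering which the kernel/events/core.c
hunk below documents at put_ctx():

    cpu_hotplug_lock
      pmus_lock
       cpuctx->mutex / perf_event_context::mutex

and around callees, like the tracepoint filter code, which would otherwise
acquire a lock that has to nest outside one that is already held. What follows
is a minimal userspace sketch of that ABBA deadlock class and of the "drop and
reacquire" workaround used by perf_tracepoint_set_filter(); the pthread mutexes
and every name in it (abba.c, lock_a, lock_b, path_one, path_two,
callee_needs_a) are illustrative stand-ins, not kernel code.

/* abba.c - illustrative only; build with: cc -Wall -pthread abba.c */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock_a = PTHREAD_MUTEX_INITIALIZER; /* outer lock, think cpu_hotplug_lock */
static pthread_mutex_t lock_b = PTHREAD_MUTEX_INITIALIZER; /* inner lock, think ctx::mutex */

/* Path one nests the locks in the documented order: A outside B. */
static void *path_one(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&lock_a);
	pthread_mutex_lock(&lock_b);
	puts("path_one: took A, then B");
	pthread_mutex_unlock(&lock_b);
	pthread_mutex_unlock(&lock_a);
	return NULL;
}

/* A callee which, like the tracepoint code, internally needs the outer lock. */
static void callee_needs_a(void)
{
	pthread_mutex_lock(&lock_a);
	puts("callee_needs_a: took A");
	pthread_mutex_unlock(&lock_a);
}

/*
 * Path two holds B and must call callee_needs_a().  Calling it with B still
 * held would invert the order against path_one() (B-then-A vs. A-then-B) and
 * can deadlock; dropping B around the call, the way
 * perf_tracepoint_set_filter() drops ctx->mutex, keeps the ordering
 * consistent.
 */
static void *path_two(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&lock_b);
	/* ... work that genuinely needs B ... */
	pthread_mutex_unlock(&lock_b);		/* drop B before taking A */
	callee_needs_a();
	pthread_mutex_lock(&lock_b);		/* reacquire B and finish up */
	puts("path_two: done");
	pthread_mutex_unlock(&lock_b);
	return NULL;
}

int main(void)
{
	pthread_t t1, t2;

	pthread_create(&t1, NULL, path_one, NULL);
	pthread_create(&t2, NULL, path_two, NULL);
	pthread_join(t1, NULL);
	pthread_join(t2, NULL);
	return 0;
}

The kernel variant additionally relies on the fact that, as the comment in the
diff notes, a reference on ctx is already held via perf_event_ctx_lock(), so
the context cannot go away while ctx->mutex is temporarily dropped.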

diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
index 8156e47da7ba..18c25ab28557 100644
--- a/arch/x86/events/intel/ds.c
+++ b/arch/x86/events/intel/ds.c
@@ -372,10 +372,9 @@ static int alloc_pebs_buffer(int cpu)
 static void release_pebs_buffer(int cpu)
 {
        struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu);
-       struct debug_store *ds = hwev->ds;
        void *cea;
 
-       if (!ds || !x86_pmu.pebs)
+       if (!x86_pmu.pebs)
                return;
 
        kfree(per_cpu(insn_buffer, cpu));
@@ -384,7 +383,6 @@ static void release_pebs_buffer(int cpu)
        /* Clear the fixmap */
        cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.pebs_buffer;
        ds_clear_cea(cea, x86_pmu.pebs_buffer_size);
-       ds->pebs_buffer_base = 0;
        dsfree_pages(hwev->ds_pebs_vaddr, x86_pmu.pebs_buffer_size);
        hwev->ds_pebs_vaddr = NULL;
 }
@@ -419,16 +417,14 @@ static int alloc_bts_buffer(int cpu)
 static void release_bts_buffer(int cpu)
 {
        struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu);
-       struct debug_store *ds = hwev->ds;
        void *cea;
 
-       if (!ds || !x86_pmu.bts)
+       if (!x86_pmu.bts)
                return;
 
        /* Clear the fixmap */
        cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.bts_buffer;
        ds_clear_cea(cea, BTS_BUFFER_SIZE);
-       ds->bts_buffer_base = 0;
        dsfree_pages(hwev->ds_bts_vaddr, BTS_BUFFER_SIZE);
        hwev->ds_bts_vaddr = NULL;
 }
@@ -454,16 +450,22 @@ void release_ds_buffers(void)
        if (!x86_pmu.bts && !x86_pmu.pebs)
                return;
 
-       get_online_cpus();
-       for_each_online_cpu(cpu)
+       for_each_possible_cpu(cpu)
+               release_ds_buffer(cpu);
+
+       for_each_possible_cpu(cpu) {
+               /*
+                * Again, ignore errors from offline CPUs, they will no longer
+                * observe cpu_hw_events.ds and not program the DS_AREA when
+                * they come up.
+                */
                fini_debug_store_on_cpu(cpu);
+       }
 
        for_each_possible_cpu(cpu) {
                release_pebs_buffer(cpu);
                release_bts_buffer(cpu);
-               release_ds_buffer(cpu);
        }
-       put_online_cpus();
 }
 
 void reserve_ds_buffers(void)
@@ -483,8 +485,6 @@ void reserve_ds_buffers(void)
        if (!x86_pmu.pebs)
                pebs_err = 1;
 
-       get_online_cpus();
-
        for_each_possible_cpu(cpu) {
                if (alloc_ds_buffer(cpu)) {
                        bts_err = 1;
@@ -521,11 +521,14 @@ void reserve_ds_buffers(void)
                if (x86_pmu.pebs && !pebs_err)
                        x86_pmu.pebs_active = 1;
 
-               for_each_online_cpu(cpu)
+               for_each_possible_cpu(cpu) {
+                       /*
+                        * Ignores wrmsr_on_cpu() errors for offline CPUs they
+                        * will get this call through intel_pmu_cpu_starting().
+                        */
                        init_debug_store_on_cpu(cpu);
+               }
        }
-
-       put_online_cpus();
 }
 
 /*
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 4df5b695bf0d..5d8f4031f8d5 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1231,6 +1231,10 @@ static void put_ctx(struct perf_event_context *ctx)
  *           perf_event_context::lock
  *         perf_event::mmap_mutex
  *         mmap_sem
+ *
+ *    cpu_hotplug_lock
+ *      pmus_lock
+ *       cpuctx->mutex / perf_event_context::mutex
  */
 static struct perf_event_context *
 perf_event_ctx_lock_nested(struct perf_event *event, int nesting)
@@ -4196,6 +4200,7 @@ int perf_event_release_kernel(struct perf_event *event)
 {
        struct perf_event_context *ctx = event->ctx;
        struct perf_event *child, *tmp;
+       LIST_HEAD(free_list);
 
        /*
         * If we got here through err_file: fput(event_file); we will not have
@@ -4268,8 +4273,7 @@ int perf_event_release_kernel(struct perf_event *event)
                                               struct perf_event, child_list);
                if (tmp == child) {
                        perf_remove_from_context(child, DETACH_GROUP);
-                       list_del(&child->child_list);
-                       free_event(child);
+                       list_move(&child->child_list, &free_list);
                        /*
                         * This matches the refcount bump in inherit_event();
                         * this can't be the last reference.
@@ -4284,6 +4288,11 @@ int perf_event_release_kernel(struct perf_event *event)
        }
        mutex_unlock(&event->child_mutex);
 
+       list_for_each_entry_safe(child, tmp, &free_list, child_list) {
+               list_del(&child->child_list);
+               free_event(child);
+       }
+
 no_ctx:
        put_event(event); /* Must be the 'last' reference */
        return 0;
@@ -8516,6 +8525,29 @@ perf_event_set_addr_filter(struct perf_event *event, char *filter_str)
        return ret;
 }
 
+static int
+perf_tracepoint_set_filter(struct perf_event *event, char *filter_str)
+{
+       struct perf_event_context *ctx = event->ctx;
+       int ret;
+
+       /*
+        * Beware, here be dragons!!
+        *
+        * the tracepoint muck will deadlock against ctx->mutex, but the tracepoint
+        * stuff does not actually need it. So temporarily drop ctx->mutex. As per
+        * perf_event_ctx_lock() we already have a reference on ctx.
+        *
+        * This can result in event getting moved to a different ctx, but that
+        * does not affect the tracepoint state.
+        */
+       mutex_unlock(&ctx->mutex);
+       ret = ftrace_profile_set_filter(event, event->attr.config, filter_str);
+       mutex_lock(&ctx->mutex);
+
+       return ret;
+}
+
 static int perf_event_set_filter(struct perf_event *event, void __user *arg)
 {
        char *filter_str;
@@ -8532,8 +8564,7 @@ static int perf_event_set_filter(struct perf_event *event, void __user *arg)
 
        if (IS_ENABLED(CONFIG_EVENT_TRACING) &&
            event->attr.type == PERF_TYPE_TRACEPOINT)
-               ret = ftrace_profile_set_filter(event, event->attr.config,
-                                               filter_str);
+               ret = perf_tracepoint_set_filter(event, filter_str);
        else if (has_addr_filter(event))
                ret = perf_event_set_addr_filter(event, filter_str);
 
@@ -9168,7 +9199,13 @@ static int perf_try_init_event(struct pmu *pmu, struct perf_event *event)
        if (!try_module_get(pmu->module))
                return -ENODEV;
 
-       if (event->group_leader != event) {
+       /*
+        * A number of pmu->event_init() methods iterate the sibling_list to,
+        * for example, validate if the group fits on the PMU. Therefore,
+        * if this is a sibling event, acquire the ctx->mutex to protect
+        * the sibling_list.
+        */
+       if (event->group_leader != event && pmu->task_ctx_nr != perf_sw_context) {
                /*
                 * This ctx->mutex can nest when we're called through
                 * inheritance. See the perf_event_ctx_lock_nested() comment.

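The perf_event_release_kernel() hunk above uses another recurring pattern:
under event->child_mutex the children are only moved (list_move()) onto a
local free_list, and free_event() runs once the mutex has been dropped, so the
teardown work no longer nests inside child_mutex.  Below is a minimal,
hypothetical userspace sketch of that deferred-free idea; struct child,
release_children(), free_child() and the pthread mutex are simplified
stand-ins for the kernel primitives, not the actual implementation.

/* deferred_free.c - illustrative only; build with: cc -Wall -pthread deferred_free.c */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct child {
	int id;
	struct child *next;
};

static pthread_mutex_t child_mutex = PTHREAD_MUTEX_INITIALIZER;
static struct child *child_list;	/* protected by child_mutex */

static void free_child(struct child *c)
{
	/* May take other locks; child_mutex is no longer held here. */
	printf("freeing child %d\n", c->id);
	free(c);
}

static void release_children(void)
{
	struct child *free_list, *c, *next;

	pthread_mutex_lock(&child_mutex);
	/* Detach everything onto a private list; nothing is freed under the lock. */
	free_list = child_list;
	child_list = NULL;
	pthread_mutex_unlock(&child_mutex);

	/* Do the actual destruction with child_mutex dropped. */
	for (c = free_list; c; c = next) {
		next = c->next;
		free_child(c);
	}
}

int main(void)
{
	/* Single-threaded setup, so no locking needed while populating. */
	for (int i = 0; i < 3; i++) {
		struct child *c = malloc(sizeof(*c));

		if (!c)
			return 1;
		c->id = i;
		c->next = child_list;
		child_list = c;
	}

	release_children();
	return 0;
}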