On Mon, Oct 14, 2024 at 09:25:19PM +0200, Peter Zijlstra wrote:

> Let me ponder that a little bit.

So I did the thing on top of the get/put approach that would allow you
to get rid of the ->closed flag, and before I was finished I already
hated all of it :-(

The thing is, if you're going to the effort of adding get/put and
putting the responsibility on the implementation, then the ->closed
flag is only a little extra ask.

So... I wondered how hard it would be for perf_pmu_unregister() to do
what you want.

Time passed, hacks were done...

and while what I have is still riddled with holes (more work is
definitely required), it does pass your dummy_pmu test script.

I'll poke at this a little longer. Afaict it should be possible to make
this good enough for what you want. Just needs more holes filled.
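
For illustration, this is roughly what a driver's exit path could look
like on top of this. Only a sketch: the dummy_pmu bits below are made
up, the int return of perf_pmu_unregister() (and the -EBUSY case for
PMUs that have ->event_unmapped) is what the patch below adds.

    #include <linux/module.h>
    #include <linux/perf_event.h>

    /* hypothetical PMU; ->event_init and friends assumed filled in, omitted here */
    static struct pmu dummy_pmu;

    static int __init dummy_init(void)
    {
            /* type -1: let perf assign a dynamic type */
            return perf_pmu_register(&dummy_pmu, "dummy_pmu", -1);
    }
    module_init(dummy_init);

    static void __exit dummy_exit(void)
    {
            /*
             * perf_pmu_unregister() now detaches and revokes any events
             * that are still alive, so the driver needs neither a
             * ->closed flag nor get/put of its own.
             */
            if (perf_pmu_unregister(&dummy_pmu))
                    pr_warn("dummy_pmu: events still in use, unregister refused\n");
    }
    module_exit(dummy_exit);

    MODULE_LICENSE("GPL");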

---
 include/linux/perf_event.h |  13 ++-
 kernel/events/core.c       | 260 ++++++++++++++++++++++++++++++++++++++-------
 2 files changed, 228 insertions(+), 45 deletions(-)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index fb908843f209..20995d33d27e 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -318,6 +318,9 @@ struct perf_output_handle;
 struct pmu {
        struct list_head                entry;
 
+       spinlock_t                      events_lock;
+       struct list_head                events;
+
        struct module                   *module;
        struct device                   *dev;
        struct device                   *parent;
@@ -612,9 +615,10 @@ struct perf_addr_filter_range {
  * enum perf_event_state - the states of an event:
  */
 enum perf_event_state {
-       PERF_EVENT_STATE_DEAD           = -4,
-       PERF_EVENT_STATE_EXIT           = -3,
-       PERF_EVENT_STATE_ERROR          = -2,
+       PERF_EVENT_STATE_DEAD           = -5,
+       PERF_EVENT_STATE_REVOKED        = -4, /* pmu gone, must not touch */
+       PERF_EVENT_STATE_EXIT           = -3, /* task died, still inherit */
+       PERF_EVENT_STATE_ERROR          = -2, /* scheduling error, can enable */
        PERF_EVENT_STATE_OFF            = -1,
        PERF_EVENT_STATE_INACTIVE       =  0,
        PERF_EVENT_STATE_ACTIVE         =  1,
@@ -853,6 +857,7 @@ struct perf_event {
        void *security;
 #endif
        struct list_head                sb_list;
+       struct list_head                pmu_list;
 
        /*
         * Certain events gets forwarded to another pmu internally by over-
@@ -1103,7 +1108,7 @@ extern void perf_aux_output_flag(struct perf_output_handle *handle, u64 flags);
 extern void perf_event_itrace_started(struct perf_event *event);
 
 extern int perf_pmu_register(struct pmu *pmu, const char *name, int type);
-extern void perf_pmu_unregister(struct pmu *pmu);
+extern int perf_pmu_unregister(struct pmu *pmu);
 
 extern void __perf_event_task_sched_in(struct task_struct *prev,
                                       struct task_struct *task);
diff --git a/kernel/events/core.c b/kernel/events/core.c
index cdd09769e6c5..e66827367a97 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -2406,7 +2406,9 @@ ctx_time_update_event(struct perf_event_context *ctx, struct perf_event *event)
 
 #define DETACH_GROUP   0x01UL
 #define DETACH_CHILD   0x02UL
-#define DETACH_DEAD    0x04UL
+#define DETACH_EXIT    0x04UL
+#define DETACH_REVOKE  0x08UL
+#define DETACH_DEAD    0x10UL
 
 /*
  * Cross CPU call to remove a performance event
@@ -2421,6 +2423,7 @@ __perf_remove_from_context(struct perf_event *event,
                           void *info)
 {
        struct perf_event_pmu_context *pmu_ctx = event->pmu_ctx;
+       enum perf_event_state state = PERF_EVENT_STATE_OFF;
        unsigned long flags = (unsigned long)info;
 
        ctx_time_update(cpuctx, ctx);
@@ -2429,16 +2432,22 @@ __perf_remove_from_context(struct perf_event *event,
         * Ensure event_sched_out() switches to OFF, at the very least
         * this avoids raising perf_pending_task() at this time.
         */
-       if (flags & DETACH_DEAD)
+       if (flags & DETACH_EXIT)
+               state = PERF_EVENT_STATE_EXIT;
+       if (flags & DETACH_REVOKE)
+               state = PERF_EVENT_STATE_REVOKED;
+       if (flags & DETACH_DEAD) {
                event->pending_disable = 1;
+               state = PERF_EVENT_STATE_DEAD;
+       }
        event_sched_out(event, ctx);
        if (flags & DETACH_GROUP)
                perf_group_detach(event);
        if (flags & DETACH_CHILD)
                perf_child_detach(event);
        list_del_event(event, ctx);
-       if (flags & DETACH_DEAD)
-               event->state = PERF_EVENT_STATE_DEAD;
+
+       event->state = state;
 
        if (!pmu_ctx->nr_events) {
                pmu_ctx->rotate_necessary = 0;
@@ -4508,7 +4517,8 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx)
 
 static void perf_remove_from_owner(struct perf_event *event);
 static void perf_event_exit_event(struct perf_event *event,
-                                 struct perf_event_context *ctx);
+                                 struct perf_event_context *ctx,
+                                 bool revoke);
 
 /*
  * Removes all events from the current task that have been marked
@@ -4535,7 +4545,7 @@ static void perf_event_remove_on_exec(struct perf_event_context *ctx)
 
                modified = true;
 
-               perf_event_exit_event(event, ctx);
+               perf_event_exit_event(event, ctx, false);
        }
 
        raw_spin_lock_irqsave(&ctx->lock, flags);
@@ -5121,6 +5131,7 @@ static bool is_sb_event(struct perf_event *event)
            attr->context_switch || attr->text_poke ||
            attr->bpf_event)
                return true;
+
        return false;
 }
 
@@ -5321,6 +5332,8 @@ static void perf_pending_task_sync(struct perf_event *event)
 
 static void _free_event(struct perf_event *event)
 {
+       struct pmu *pmu = event->pmu;
+
        irq_work_sync(&event->pending_irq);
        irq_work_sync(&event->pending_disable_irq);
        perf_pending_task_sync(event);
@@ -5330,6 +5343,7 @@ static void _free_event(struct perf_event *event)
        security_perf_event_free(event);
 
        if (event->rb) {
+               WARN_ON_ONCE(!pmu);
                /*
                 * Can happen when we close an event with re-directed output.
                 *
@@ -5349,12 +5363,16 @@ static void _free_event(struct perf_event *event)
                        put_callchain_buffers();
        }
 
-       perf_event_free_bpf_prog(event);
-       perf_addr_filters_splice(event, NULL);
-       kfree(event->addr_filter_ranges);
+       if (pmu) {
+               perf_event_free_bpf_prog(event);
+               perf_addr_filters_splice(event, NULL);
+               kfree(event->addr_filter_ranges);
+       }
 
-       if (event->destroy)
+       if (event->destroy) {
+               WARN_ON_ONCE(!pmu);
                event->destroy(event);
+       }
 
        /*
         * Must be after ->destroy(), due to uprobe_perf_close() using
@@ -5363,8 +5381,10 @@ static void _free_event(struct perf_event *event)
        if (event->hw.target)
                put_task_struct(event->hw.target);
 
-       if (event->pmu_ctx)
+       if (event->pmu_ctx) {
+               WARN_ON_ONCE(!pmu);
                put_pmu_ctx(event->pmu_ctx);
+       }
 
        /*
         * perf_event_free_task() relies on put_ctx() being 'last', in particular
@@ -5373,8 +5393,14 @@ static void _free_event(struct perf_event *event)
        if (event->ctx)
                put_ctx(event->ctx);
 
-       exclusive_event_destroy(event);
-       module_put(event->pmu->module);
+       if (pmu) {
+               exclusive_event_destroy(event);
+               module_put(pmu->module);
+               scoped_guard(spinlock, &pmu->events_lock) {
+                       list_del(&event->pmu_list);
+                       wake_up_var(pmu);
+               }
+       }
 
        call_rcu(&event->rcu_head, free_event_rcu);
 }
@@ -5492,7 +5518,11 @@ int perf_event_release_kernel(struct perf_event *event)
         * Thus this guarantees that we will in fact observe and kill _ALL_
         * child events.
         */
-       perf_remove_from_context(event, DETACH_GROUP|DETACH_DEAD);
+       if (event->state > PERF_EVENT_STATE_REVOKED) {
+               perf_remove_from_context(event, DETACH_GROUP|DETACH_DEAD);
+       } else {
+               event->state = PERF_EVENT_STATE_DEAD;
+       }
 
        perf_event_ctx_unlock(event, ctx);
 
@@ -5803,7 +5833,7 @@ __perf_read(struct perf_event *event, char __user *buf, size_t count)
         * error state (i.e. because it was pinned but it couldn't be
         * scheduled on to the CPU at some point).
         */
-       if (event->state == PERF_EVENT_STATE_ERROR)
+       if (event->state <= PERF_EVENT_STATE_ERROR)
                return 0;
 
        if (count < event->read_size)
@@ -5842,8 +5872,14 @@ static __poll_t perf_poll(struct file *file, poll_table *wait)
        struct perf_buffer *rb;
        __poll_t events = EPOLLHUP;
 
+       if (event->state <= PERF_EVENT_STATE_REVOKED)
+               return EPOLLERR;
+
        poll_wait(file, &event->waitq, wait);
 
+       if (event->state <= PERF_EVENT_STATE_REVOKED)
+               return EPOLLERR;
+
        if (is_event_hup(event))
                return events;
 
@@ -6013,7 +6049,7 @@ static inline int perf_fget_light(int fd, struct fd *p)
 }
 
 static int perf_event_set_output(struct perf_event *event,
-                                struct perf_event *output_event);
+                                struct perf_event *output_event, bool force);
 static int perf_event_set_filter(struct perf_event *event, void __user *arg);
 static int perf_copy_attr(struct perf_event_attr __user *uattr,
                          struct perf_event_attr *attr);
@@ -6023,6 +6059,9 @@ static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned lon
        void (*func)(struct perf_event *);
        u32 flags = arg;
 
+       if (event->state <= PERF_EVENT_STATE_REVOKED)
+               return -ENODEV;
+
        switch (cmd) {
        case PERF_EVENT_IOC_ENABLE:
                func = _perf_event_enable;
@@ -6065,10 +6104,10 @@ static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned lon
                        if (ret)
                                return ret;
                        output_event = fd_file(output)->private_data;
-                       ret = perf_event_set_output(event, output_event);
+                       ret = perf_event_set_output(event, output_event, false);
                        fdput(output);
                } else {
-                       ret = perf_event_set_output(event, NULL);
+                       ret = perf_event_set_output(event, NULL, false);
                }
                return ret;
        }
@@ -6472,6 +6511,7 @@ static void perf_mmap_close(struct vm_area_struct *vma)
        unsigned long size = perf_data_size(rb);
        bool detach_rest = false;
 
+       /* FIXIES vs perf_pmu_unregister() */
        if (event->pmu->event_unmapped)
                event->pmu->event_unmapped(event, vma->vm_mm);
 
@@ -6591,7 +6631,15 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
        unsigned long vma_size;
        unsigned long nr_pages;
        long user_extra = 0, extra = 0;
-       int ret = 0, flags = 0;
+       int ret, flags = 0;
+
+       ret = security_perf_event_read(event);
+       if (ret)
+               return ret;
+
+       /* XXX needs event->mmap_mutex? */
+       if (event->state <= PERF_EVENT_STATE_REVOKED)
+               return -ENODEV;
 
        /*
         * Don't allow mmap() of inherited per-task counters. This would
@@ -6604,10 +6652,6 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
        if (!(vma->vm_flags & VM_SHARED))
                return -EINVAL;
 
-       ret = security_perf_event_read(event);
-       if (ret)
-               return ret;
-
        vma_size = vma->vm_end - vma->vm_start;
 
        if (vma->vm_pgoff == 0) {
@@ -6810,6 +6854,9 @@ static int perf_fasync(int fd, struct file *filp, int on)
        struct perf_event *event = filp->private_data;
        int retval;
 
+       if (event->state <= PERF_EVENT_STATE_REVOKED)
+               return -ENODEV;
+
        inode_lock(inode);
        retval = fasync_helper(fd, filp, on, &event->fasync);
        inode_unlock(inode);
@@ -11513,6 +11560,7 @@ static int perf_event_idx_default(struct perf_event *event)
 
 static void free_pmu_context(struct pmu *pmu)
 {
+       free_percpu(pmu->pmu_disable_count);
        free_percpu(pmu->cpu_pmu_context);
 }
 
@@ -11753,6 +11801,7 @@ int perf_pmu_register(struct pmu *pmu, const char *name, int type)
        if (type >= 0)
                max = type;
 
+       // XXX broken vs perf_init_event(), this publishes before @pmu is finalized
        ret = idr_alloc(&pmu_idr, pmu, max, 0, GFP_KERNEL);
        if (ret < 0)
                goto free_pdc;
@@ -11809,8 +11858,14 @@ int perf_pmu_register(struct pmu *pmu, const char *name, int type)
        if (!pmu->event_idx)
                pmu->event_idx = perf_event_idx_default;
 
-       list_add_rcu(&pmu->entry, &pmus);
        atomic_set(&pmu->exclusive_cnt, 0);
+       INIT_LIST_HEAD(&pmu->events);
+       spin_lock_init(&pmu->events_lock);
+
+       /*
+        * Publish the pmu after it is fully initialized.
+        */
+       list_add_rcu(&pmu->entry, &pmus);
        ret = 0;
 unlock:
        mutex_unlock(&pmus_lock);
@@ -11832,10 +11887,110 @@ int perf_pmu_register(struct pmu *pmu, const char *name, int type)
 }
 EXPORT_SYMBOL_GPL(perf_pmu_register);
 
-void perf_pmu_unregister(struct pmu *pmu)
+static void __pmu_detach_event(struct pmu *pmu, struct perf_event *event,
+                              struct perf_event_context *ctx)
 {
+       /*
+        * De-schedule the event and mark it EXIT_PMU.
+        */
+       perf_event_exit_event(event, ctx, true);
+
+       /*
+        * All _free_event() bits that rely on event->pmu:
+        */
+       perf_event_set_output(event, NULL, true);
+
+       perf_event_free_bpf_prog(event);
+       perf_addr_filters_splice(event, NULL);
+       kfree(event->addr_filter_ranges);
+
+       if (event->destroy) {
+               event->destroy(event);
+               event->destroy = NULL;
+       }
+
+       if (event->pmu_ctx) {
+               /*
+                * Make sure pmu->cpu_pmu_context is unused. An alternative is
+                * to have it be pointers and make put_pmu_ctx()'s
+                * epc->embedded case be another RCU free case.
+                */
+               put_pmu_ctx(event->pmu_ctx);
+               event->pmu_ctx = NULL;
+       }
+
+       exclusive_event_destroy(event);
+       module_put(pmu->module);
+
+       event->pmu = NULL; /* force fault instead of UAF */
+}
+
+static void pmu_detach_event(struct pmu *pmu, struct perf_event *event)
+{
+       struct perf_event_context *ctx;
+
+       ctx = perf_event_ctx_lock(event);
+       __pmu_detach_event(pmu, event, ctx);
+       perf_event_ctx_unlock(event, ctx);
+
+       scoped_guard(spinlock, &pmu->events_lock)
+               list_del(&event->pmu_list);
+}
+
+static struct perf_event *pmu_get_event(struct pmu *pmu)
+{
+       struct perf_event *event;
+
+       guard(spinlock)(&pmu->events_lock);
+       list_for_each_entry(event, &pmu->events, pmu_list) {
+               if (atomic_long_inc_not_zero(&event->refcount))
+                       return event;
+       }
+
+       return NULL;
+}
+
+static bool pmu_empty(struct pmu *pmu)
+{
+       guard(spinlock)(&pmu->events_lock);
+       return list_empty(&pmu->events);
+}
+
+static void pmu_detach_events(struct pmu *pmu)
+{
+       struct perf_event *event;
+
+       for (;;) {
+               event = pmu_get_event(pmu);
+               if (!event)
+                       break;
+
+               pmu_detach_event(pmu, event);
+               put_event(event);
+       }
+
+       /*
+        * wait for pending _free_event()s
+        */
+       wait_var_event(pmu, pmu_empty(pmu));
+}
+
+int perf_pmu_unregister(struct pmu *pmu)
+{
+       /*
+        * FIXME!
+        *
+        *   perf_mmap_close(); event->pmu->event_unmapped()
+        *
+        * XXX this check is racy vs perf_event_alloc()
+        */
+       if (pmu->event_unmapped && !pmu_empty(pmu))
+               return -EBUSY;
+
        mutex_lock(&pmus_lock);
        list_del_rcu(&pmu->entry);
+       idr_remove(&pmu_idr, pmu->type);
+       mutex_unlock(&pmus_lock);
 
        /*
         * We dereference the pmu list under both SRCU and regular RCU, so
@@ -11844,16 +11999,29 @@ void perf_pmu_unregister(struct pmu *pmu)
        synchronize_srcu(&pmus_srcu);
        synchronize_rcu();
 
-       free_percpu(pmu->pmu_disable_count);
-       idr_remove(&pmu_idr, pmu->type);
+       /*
+        * PMU is removed from the pmus list, so no new events will
+        * be created, now take care of the existing ones.
+        */
+       pmu_detach_events(pmu);
+
+       /*
+        * PMU is unused, make it go away.
+        */
        if (pmu_bus_running && pmu->dev && pmu->dev != PMU_NULL_DEV) {
                if (pmu->nr_addr_filters)
                        device_remove_file(pmu->dev, &dev_attr_nr_addr_filters);
                device_del(pmu->dev);
                put_device(pmu->dev);
        }
+
+       /* 
+        * XXX -- broken? can still contain SW events at this point?
+        * audit more, make sure DETACH_GROUP DTRT
+        */
        free_pmu_context(pmu);
-       mutex_unlock(&pmus_lock);
+
+       return 0;
 }
 EXPORT_SYMBOL_GPL(perf_pmu_unregister);
 
@@ -12330,6 +12498,9 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
        /* symmetric to unaccount_event() in _free_event() */
        account_event(event);
 
+       scoped_guard(spinlock, &pmu->events_lock)
+               list_add(&event->pmu_list, &pmu->events);
+
        return event;
 
 err_callchain_buffer:
@@ -12493,7 +12664,7 @@ static void mutex_lock_double(struct mutex *a, struct mutex *b)
 }
 
 static int
-perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
+perf_event_set_output(struct perf_event *event, struct perf_event *output_event, bool force)
 {
        struct perf_buffer *rb = NULL;
        int ret = -EINVAL;
@@ -12549,8 +12720,12 @@ perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
        mutex_lock_double(&event->mmap_mutex, &output_event->mmap_mutex);
 set:
        /* Can't redirect output if we've got an active mmap() */
-       if (atomic_read(&event->mmap_count))
-               goto unlock;
+       if (atomic_read(&event->mmap_count)) {
+               if (!force)
+                       goto unlock;
+
+               WARN_ON_ONCE(event->pmu->event_unmapped);
+       }
 
        if (output_event) {
                /* get the rb we want to redirect to */
@@ -12740,6 +12915,10 @@ SYSCALL_DEFINE5(perf_event_open,
                if (err)
                        goto err_fd;
                group_leader = fd_file(group)->private_data;
+               if (group_leader->state <= PERF_EVENT_STATE_REVOKED) {
+                       err = -ENODEV;
+                       goto err_group_fd;
+               }
                if (flags & PERF_FLAG_FD_OUTPUT)
                        output_event = group_leader;
                if (flags & PERF_FLAG_FD_NO_GROUP)
@@ -12916,7 +13095,7 @@ SYSCALL_DEFINE5(perf_event_open,
        event->pmu_ctx = pmu_ctx;
 
        if (output_event) {
-               err = perf_event_set_output(event, output_event);
+               err = perf_event_set_output(event, output_event, false);
                if (err)
                        goto err_context;
        }
@@ -13287,10 +13466,11 @@ static void sync_child_event(struct perf_event *child_event)
 }
 
 static void
-perf_event_exit_event(struct perf_event *event, struct perf_event_context *ctx)
+perf_event_exit_event(struct perf_event *event,
+                     struct perf_event_context *ctx, bool revoke)
 {
        struct perf_event *parent_event = event->parent;
-       unsigned long detach_flags = 0;
+       unsigned long detach_flags = DETACH_EXIT;
 
        if (parent_event) {
                /*
@@ -13305,16 +13485,14 @@ perf_event_exit_event(struct perf_event *event, struct perf_event_context *ctx)
                 * Do destroy all inherited groups, we don't care about those
                 * and being thorough is better.
                 */
-               detach_flags = DETACH_GROUP | DETACH_CHILD;
+               detach_flags |= DETACH_GROUP | DETACH_CHILD;
                mutex_lock(&parent_event->child_mutex);
        }
 
-       perf_remove_from_context(event, detach_flags);
+       if (revoke)
+               detach_flags |= DETACH_GROUP | DETACH_REVOKE;
 
-       raw_spin_lock_irq(&ctx->lock);
-       if (event->state > PERF_EVENT_STATE_EXIT)
-               perf_event_set_state(event, PERF_EVENT_STATE_EXIT);
-       raw_spin_unlock_irq(&ctx->lock);
+       perf_remove_from_context(event, detach_flags);
 
        /*
         * Child events can be freed.
@@ -13390,7 +13568,7 @@ static void perf_event_exit_task_context(struct task_struct *child)
        perf_event_task(child, child_ctx, 0);
 
        list_for_each_entry_safe(child_event, next, &child_ctx->event_list, event_entry)
-               perf_event_exit_event(child_event, child_ctx);
+               perf_event_exit_event(child_event, child_ctx, false);
 
        mutex_unlock(&child_ctx->mutex);
 
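Not part of the patch, but to illustrate the intended semantics: a
userspace sketch of what an existing consumer sees once its PMU got
unregistered underneath it. Assumes "fd" came from perf_event_open()
on the now-gone PMU; the specific return values come from the hunks
above (EPOLLERR from perf_poll(), 0 from read(), -ENODEV from ioctl()).

    #include <errno.h>
    #include <poll.h>
    #include <stdio.h>
    #include <sys/ioctl.h>
    #include <unistd.h>
    #include <linux/perf_event.h>

    static void probe_revoked(int fd)
    {
            struct pollfd pfd = { .fd = fd, .events = POLLIN };
            unsigned long long count;

            poll(&pfd, 1, 0);               /* revents carries POLLERR */

            if (read(fd, &count, sizeof(count)) == 0)
                    printf("event revoked, no more counts\n");

            if (ioctl(fd, PERF_EVENT_IOC_ENABLE, 0) < 0 && errno == ENODEV)
                    printf("ioctl: ENODEV as expected\n");
    }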
