* Ingo Molnar <mi...@kernel.org> wrote:

> Note, for tracing the PERF_FLAG_FD_OUTPUT method of multiplexing 
> multiple events onto a single mmap buffer is probably useful (also 
> usable via the PERF_EVENT_IOC_SET_OUTPUT ioctl()), so please make sure 
> the scheme works naturally with that model as well, not just with 1:1 
> event+buffer mappings.
> 
> See the uses of PERF_EVENT_IOC_SET_OUTPUT in tools/perf/.

Note that another facility that would be very useful for tracing is 
PeterZ's and tglx's patch that enables multiple tracepoints to be attached 
to a single event.

See the 2+ year old (bitrotten and unfinished) WIP patch below.

It adds a PERF_EVENT_IOC_ADD_TP ioctl() that adds a new tracepoint to an 
existing event. This makes perf based tracing scale up to an arbitrary 
number of tracepoints in essence.

Thanks,

        Ingo

------------------>
Subject: perf-tracepoint-idr.patch
From: Thomas Gleixner <t...@linutronix.de>
Date: Wed, 24 Nov 2010 12:09:26 +0100

Signed-off-by: Thomas Gleixner <t...@linutronix.de>
Signed-off-by: Peter Zijlstra <a.p.zijls...@chello.nl>
Signed-off-by: Ingo Molnar <mi...@elte.hu>
---
 include/linux/ftrace_event.h    |   10 
 include/linux/perf_event.h      |    9 
 include/linux/sched.h           |    9 
 include/trace/ftrace.h          |    4 
 kernel/events/core.c            |  407 ++++++++++++++++++++++++++++++++++++++--
 kernel/trace/trace_event_perf.c |   95 +++------
 kernel/trace/trace_kprobe.c     |   10 
 kernel/trace/trace_output.c     |  116 +++--------
 kernel/trace/trace_syscalls.c   |    8 
 9 files changed, 498 insertions(+), 170 deletions(-)

Index: linux/include/linux/ftrace_event.h
===================================================================
--- linux.orig/include/linux/ftrace_event.h
+++ linux/include/linux/ftrace_event.h
@@ -87,8 +87,6 @@ struct trace_event_functions {
 };
 
 struct trace_event {
-       struct hlist_node               node;
-       struct list_head                list;
        int                             type;
        struct trace_event_functions    *funcs;
 };
@@ -194,7 +192,6 @@ struct ftrace_event_call {
 
 #ifdef CONFIG_PERF_EVENTS
        int                             perf_refcount;
-       struct hlist_head __percpu      *perf_events;
 #endif
 };
 
@@ -263,8 +260,9 @@ struct perf_event;
 
 DECLARE_PER_CPU(struct pt_regs, perf_trace_regs);
 
-extern int  perf_trace_init(struct perf_event *event);
+extern int  perf_trace_init(struct perf_event *event, int event_id);
 extern void perf_trace_destroy(struct perf_event *event);
+extern void perf_trace_destroy_id(int id);
 extern int  perf_trace_add(struct perf_event *event, int flags);
 extern void perf_trace_del(struct perf_event *event, int flags);
 extern int  ftrace_profile_set_filter(struct perf_event *event, int event_id,
@@ -275,9 +273,9 @@ extern void *perf_trace_buf_prepare(int 
 
 static inline void
 perf_trace_buf_submit(void *raw_data, int size, int rctx, u64 addr,
-                      u64 count, struct pt_regs *regs, void *head)
+                      u64 count, struct pt_regs *regs, int id)
 {
-       perf_tp_event(addr, count, raw_data, size, regs, head, rctx);
+       perf_tp_event(addr, count, raw_data, size, regs, rctx, id);
 }
 #endif
 
Index: linux/include/linux/perf_event.h
===================================================================
--- linux.orig/include/linux/perf_event.h
+++ linux/include/linux/perf_event.h
@@ -247,6 +247,7 @@ struct perf_event_attr {
 #define PERF_EVENT_IOC_PERIOD          _IOW('$', 4, __u64)
 #define PERF_EVENT_IOC_SET_OUTPUT      _IO ('$', 5)
 #define PERF_EVENT_IOC_SET_FILTER      _IOW('$', 6, char *)
+#define PERF_EVENT_IOC_ADD_TP          _IO ('$', 7)
 
 enum perf_event_ioc_flags {
        PERF_IOC_FLAG_GROUP             = 1U << 0,
@@ -568,6 +569,11 @@ struct hw_perf_event {
                        struct task_struct              *bp_target;
                };
 #endif
+               /*
+                * Same fudge as for breakpoints, trace-events needs
+                * it too,.. convert the bp crap over..
+                */
+               struct task_struct *event_target;
        };
        int                             state;
        local64_t                       prev_count;
@@ -859,6 +865,7 @@ struct perf_event {
 #ifdef CONFIG_EVENT_TRACING
        struct ftrace_event_call        *tp_event;
        struct event_filter             *filter;
+       struct perf_tp_idr              tp_idr;
 #endif
 
 #ifdef CONFIG_CGROUP_PERF
@@ -1133,7 +1140,7 @@ static inline bool perf_paranoid_kernel(
 extern void perf_event_init(void);
 extern void perf_tp_event(u64 addr, u64 count, void *record,
                          int entry_size, struct pt_regs *regs,
-                         struct hlist_head *head, int rctx);
+                         int rctx, int id);
 extern void perf_bp_event(struct perf_event *event, void *data);
 
 #ifndef perf_misc_flags
Index: linux/include/linux/sched.h
===================================================================
--- linux.orig/include/linux/sched.h
+++ linux/include/linux/sched.h
@@ -82,6 +82,7 @@ struct sched_param {
 #include <linux/rculist.h>
 #include <linux/rtmutex.h>
 
+#include <linux/idr.h>
 #include <linux/time.h>
 #include <linux/param.h>
 #include <linux/resource.h>
@@ -1199,6 +1200,11 @@ enum perf_event_task_context {
        perf_nr_task_contexts,
 };
 
+struct perf_tp_idr {
+       struct mutex    lock;
+       struct idr      idr;
+};
+
 struct task_struct {
        volatile long state;    /* -1 unrunnable, 0 runnable, >0 stopped */
        void *stack;
@@ -1485,6 +1491,9 @@ struct task_struct {
        struct perf_event_context *perf_event_ctxp[perf_nr_task_contexts];
        struct mutex perf_event_mutex;
        struct list_head perf_event_list;
+#ifdef CONFIG_EVENT_TRACING
+       struct perf_tp_idr *perf_tp_idr;
+#endif
 #endif
 #ifdef CONFIG_NUMA
        struct mempolicy *mempolicy;    /* Protected by alloc_lock */
Index: linux/include/trace/ftrace.h
===================================================================
--- linux.orig/include/trace/ftrace.h
+++ linux/include/trace/ftrace.h
@@ -708,7 +708,6 @@ perf_trace_##call(void *__data, proto)              
        struct ftrace_raw_##call *entry;                                \
        struct pt_regs __regs;                                          \
        u64 __addr = 0, __count = 1;                                    \
-       struct hlist_head *head;                                        \
        int __entry_size;                                               \
        int __data_size;                                                \
        int rctx;                                                       \
@@ -733,9 +732,8 @@ perf_trace_##call(void *__data, proto)              
                                                                        \
        { assign; }                                                     \
                                                                        \
-       head = this_cpu_ptr(event_call->perf_events);                   \
        perf_trace_buf_submit(entry, __entry_size, rctx, __addr,        \
-               __count, &__regs, head);                                \
+               __count, &__regs, event_call->event.type);              \
 }
 
 /*
Index: linux/kernel/events/core.c
===================================================================
--- linux.orig/kernel/events/core.c
+++ linux/kernel/events/core.c
@@ -823,6 +823,7 @@ list_add_event(struct perf_event *event,
        ctx->nr_events++;
        if (event->attr.inherit_stat)
                ctx->nr_stat++;
+       ++ctx->generation;
 }
 
 /*
@@ -976,6 +977,7 @@ list_del_event(struct perf_event *event,
         */
        if (event->state > PERF_EVENT_STATE_OFF)
                event->state = PERF_EVENT_STATE_OFF;
+       ++ctx->generation;
 }
 
 static void perf_group_detach(struct perf_event *event)
@@ -1894,6 +1896,12 @@ static void perf_event_context_sched_out
        if (!cpuctx->task_ctx)
                return;
 
+#if 0
+       /*
+        * Need to sort out how to make task_struct::perf_tp_idr
+        * work with this fancy switching stuff.. tracepoints could be
+        * in multiple contexts due to the software event muck.
+        */
        rcu_read_lock();
        parent = rcu_dereference(ctx->parent_ctx);
        next_ctx = next->perf_event_ctxp[ctxn];
@@ -1927,6 +1935,7 @@ static void perf_event_context_sched_out
                raw_spin_unlock(&ctx->lock);
        }
        rcu_read_unlock();
+#endif
 
        if (do_switch) {
                ctx_sched_out(ctx, cpuctx, EVENT_ALL);
@@ -3261,6 +3270,7 @@ static struct perf_event *perf_fget_ligh
 static int perf_event_set_output(struct perf_event *event,
                                 struct perf_event *output_event);
 static int perf_event_set_filter(struct perf_event *event, void __user *arg);
+static int perf_event_add_tp(struct perf_event *event, int tp_id);
 
 static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 {
@@ -3307,6 +3317,9 @@ static long perf_ioctl(struct file *file
        case PERF_EVENT_IOC_SET_FILTER:
                return perf_event_set_filter(event, (void __user *)arg);
 
+       case PERF_EVENT_IOC_ADD_TP:
+               return perf_event_add_tp(event, arg);
+
        default:
                return -ENOTTY;
        }
@@ -5471,6 +5484,9 @@ static struct pmu perf_swevent = {
 
 #ifdef CONFIG_EVENT_TRACING
 
+#include <linux/ftrace_event.h>
+#include "../trace/trace_output.h"
+
 static int perf_tp_filter_match(struct perf_event *event,
                                struct perf_sample_data *data)
 {
@@ -5485,8 +5501,9 @@ static int perf_tp_event_match(struct pe
                                struct perf_sample_data *data,
                                struct pt_regs *regs)
 {
-       if (event->hw.state & PERF_HES_STOPPED)
+       if (event->state != PERF_EVENT_STATE_ACTIVE)
                return 0;
+
        /*
         * All tracepoints are from kernel-space.
         */
@@ -5499,8 +5516,60 @@ static int perf_tp_event_match(struct pe
        return 1;
 }
 
+static void perf_tp_idr_init(struct perf_tp_idr *idr)
+{
+       idr_init(&idr->idr);
+       mutex_init(&idr->lock);
+}
+
+static DEFINE_PER_CPU(struct perf_tp_idr, perf_tp_idr);
+
+struct perf_tp_node {
+       struct list_head        list;
+       struct perf_event       *event;
+       struct rcu_head         rcu;
+};
+
+static void do_perf_tp_event(struct perf_event *event, u64 count,
+                            struct perf_sample_data *data,
+                            struct pt_regs *regs)
+{
+       if (perf_tp_event_match(event, data, regs))
+               perf_swevent_event(event, count, 1, data, regs);
+}
+
+static void perf_tp_idr_event(struct perf_tp_idr *tp_idr,
+                             int id, u64 count,
+                             struct perf_sample_data *data,
+                             struct pt_regs *regs)
+{
+       struct perf_tp_node *tp_node, *node;
+       struct perf_event *event;
+
+       if (!tp_idr)
+               return;
+
+       /*
+        * Most of this is done under rcu_read_lock_sched(), which doesn't
+        * exclude regular RCU grace periods, but the IDR code uses call_rcu()
+        * so we have to use rcu_read_lock() here as well.
+        */
+       rcu_read_lock();
+       tp_node = idr_find(&tp_idr->idr, id);
+       rcu_read_unlock();
+
+       if (!tp_node)
+               return;
+
+       event = tp_node->event;
+
+       do_perf_tp_event(event, count, data, regs);
+       list_for_each_entry_rcu(node, &tp_node->list, list)
+               do_perf_tp_event(node->event, count, data, regs);
+}
+
 void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
-                  struct pt_regs *regs, struct hlist_head *head, int rctx)
+                  struct pt_regs *regs, int rctx, int id)
 {
        struct perf_sample_data data;
        struct perf_event *event;
@@ -5514,18 +5583,197 @@ void perf_tp_event(u64 addr, u64 count, 
        perf_sample_data_init(&data, addr);
        data.raw = &raw;
 
-       hlist_for_each_entry_rcu(event, node, head, hlist_entry) {
-               if (perf_tp_event_match(event, &data, regs))
-                       perf_swevent_event(event, count, 1, &data, regs);
-       }
+       perf_tp_idr_event(&__get_cpu_var(perf_tp_idr), id, count, &data, regs);
+       perf_tp_idr_event(current->perf_tp_idr, id, count, &data, regs);
 
        perf_swevent_put_recursion_context(rctx);
 }
 EXPORT_SYMBOL_GPL(perf_tp_event);
 
+static struct perf_tp_idr *
+perf_tp_init_task(struct perf_event *event, struct task_struct *task)
+{
+       struct perf_tp_idr *idr;
+
+       mutex_lock(&task->perf_event_mutex);
+       idr = task->perf_tp_idr;
+       if (idr)
+               goto unlock;
+
+       idr = kzalloc(sizeof(struct perf_tp_idr), GFP_KERNEL);
+       if (!idr)
+               goto unlock;
+
+       perf_tp_idr_init(idr);
+
+       task->perf_tp_idr = idr;
+unlock:
+       mutex_unlock(&task->perf_event_mutex);
+
+       return idr;
+}
+
+static struct perf_tp_idr *perf_event_idr(struct perf_event *event, bool create)
+{
+       struct perf_tp_idr *tp_idr;
+       struct task_struct *task;
+       /* Pick the task's idr for task events, else the event cpu's idr. */
+       if (event->attach_state & PERF_ATTACH_TASK) {
+               task = event->hw.event_target;
+               tp_idr = task->perf_tp_idr;
+               if (!tp_idr && create)
+                       tp_idr = perf_tp_init_task(event, task);
+       } else
+               tp_idr = &per_cpu(perf_tp_idr, event->cpu);
+
+       return tp_idr;  /* NULL when the task has no idr */
+}
+
+static void perf_tp_free_node(struct rcu_head *rcu)
+{
+       struct perf_tp_node *node = container_of(rcu, struct perf_tp_node, rcu);
+
+       kfree(node);
+}
+
+static int perf_tp_remove_idr(int id, void *p, void *data)
+{
+       struct perf_tp_node *node = p;
+       struct perf_tp_node *first, *next;
+       struct perf_tp_idr *tp_idr = data;
+       /* idr_for_each() callback: unlink node from tp_idr and RCU-free it. */
+       if (!tp_idr)
+               goto no_idr;
+       /* If node heads the chain, hand its idr slot to the next node. */
+       mutex_lock(&tp_idr->lock);
+       first = idr_find(&tp_idr->idr, id);
+       if (first == node) {
+               next = list_first_entry(&first->list, struct perf_tp_node, list);
+               if (next != first)
+                       idr_replace(&tp_idr->idr, next, id);
+               else
+                       idr_remove(&tp_idr->idr, id);
+       }
+       list_del_rcu(&node->list);
+       mutex_unlock(&tp_idr->lock);
+
+no_idr:
+       perf_trace_destroy_id(id);
+       call_rcu_sched(&node->rcu, perf_tp_free_node);
+       return 0;
+}
+
 static void tp_perf_event_destroy(struct perf_event *event)
 {
-       perf_trace_destroy(event);
+       /*
+        * Free path: the fd is gone, so there can be no concurrency on
+        * event->tp_idr.  Unhook every node, then tear the idr down.
+        */
+
+       idr_for_each(&event->tp_idr.idr, perf_tp_remove_idr,
+                       perf_event_idr(event, false));
+
+       idr_remove_all(&event->tp_idr.idr);
+       idr_destroy(&event->tp_idr.idr);
+}
+
+static int __perf_event_add_tp(struct perf_event *event, int tp_id)
+{
+       struct perf_tp_node *node, *first;
+       struct perf_tp_idr *idr;
+       int tmp_id, err, ret = -ENOMEM;
+       /* Attach tracepoint tp_id to event; returns 0 or a negative errno. */
+       node = kmalloc(sizeof(*node), GFP_KERNEL);
+       if (!node)
+               goto out;
+
+       node->event = event;
+       INIT_LIST_HEAD(&node->list);
+
+       /*
+        * Insert the node into the event->idr, this idr tracks the
+        * tracepoints we're interested in, it has a 1:1 relation
+        * with the node.
+        */
+       idr = &event->tp_idr;
+       mutex_lock(&idr->lock);
+       err = idr_pre_get(&idr->idr, GFP_KERNEL);
+       if (!err) {
+               ret = -ENOMEM;
+               goto free_node;
+       }
+
+       ret = idr_get_new_above(&idr->idr, node, tp_id, &tmp_id);
+       if (ret)
+               goto free_node;
+
+       if (WARN_ON(tp_id != tmp_id)) {
+               printk(KERN_ERR "fail: %d %d\n" , tp_id, tmp_id);
+               ret = -EBUSY;
+               goto free_idr1;
+       }
+       mutex_unlock(&idr->lock);
+
+       /*
+        * Insert the node into the task/cpu idr, this idr tracks
+        * all active tracepoints for the task/cpu, it has a 1:n relation
+        * with the node.
+        */
+       idr = perf_event_idr(event, true);
+       if (!idr) {
+               if (event->attach_state & PERF_ATTACH_CONTEXT)
+                       ret = -ENOMEM;
+               else
+                       ret = -ESRCH;
+               goto free_idr1_set;
+       }
+       mutex_lock(&idr->lock);
+       first = idr_find(&idr->idr, tp_id);
+       if (first) {
+               list_add_rcu(&node->list, &first->list);
+               goto unlock;
+       }
+
+       err = idr_pre_get(&idr->idr, GFP_KERNEL);
+       if (!err) {
+               ret = -ENOMEM;
+               goto free_idr1_set_unlock;
+       }
+
+       ret = idr_get_new_above(&idr->idr, node, tp_id, &tmp_id);
+       if (ret)
+               goto free_idr1_set_unlock;      /* idr->lock still held */
+
+       if (WARN_ON(tp_id != tmp_id)) {
+               ret = -EBUSY;
+               goto free_idr2;
+       }
+unlock:
+       mutex_unlock(&idr->lock);
+
+       ret = perf_trace_init(event, tp_id);
+       if (ret)
+               goto free_all;
+
+out:
+       return ret;
+       /* NOTE(review): free_all mishandles a node chained behind 'first'. */
+free_all:
+       mutex_lock(&idr->lock);
+free_idr2:
+       idr_remove(&idr->idr, tmp_id);
+free_idr1_set_unlock:
+       mutex_unlock(&idr->lock);
+free_idr1_set:
+       idr = &event->tp_idr;
+       tmp_id = tp_id;
+       mutex_lock(&idr->lock);
+free_idr1:
+       idr_remove(&idr->idr, tmp_id);
+free_node:
+       mutex_unlock(&idr->lock);
+       kfree(node);
+       goto out;
 }
 
 static int perf_tp_event_init(struct perf_event *event)
@@ -5535,21 +5783,35 @@ static int perf_tp_event_init(struct per
        if (event->attr.type != PERF_TYPE_TRACEPOINT)
                return -ENOENT;
 
-       err = perf_trace_init(event);
-       if (err)
-               return err;
+       perf_tp_idr_init(&event->tp_idr);
 
        event->destroy = tp_perf_event_destroy;
 
+       if (event->attr.config != ~0ULL) {
+               err = __perf_event_add_tp(event, event->attr.config);
+               if (err)
+                       return err;
+       }
+
        return 0;
 }
 
+static int perf_tp_event_add(struct perf_event *event, int flags)
+{
+       event->hw.state = flags & PERF_EF_START ? 0 : PERF_HES_STOPPED;
+       return 0;
+}
+
+static void perf_tp_event_del(struct perf_event *event, int flags)
+{
+}
+
 static struct pmu perf_tracepoint = {
        .task_ctx_nr    = perf_sw_context,
 
        .event_init     = perf_tp_event_init,
-       .add            = perf_trace_add,
-       .del            = perf_trace_del,
+       .add            = perf_tp_event_add,
+       .del            = perf_tp_event_del,
        .start          = perf_swevent_start,
        .stop           = perf_swevent_stop,
        .read           = perf_swevent_read,
@@ -5557,6 +5819,11 @@ static struct pmu perf_tracepoint = {
 
 static inline void perf_tp_register(void)
 {
+       int cpu;
+
+       for_each_possible_cpu(cpu)
+               perf_tp_idr_init(&per_cpu(perf_tp_idr, cpu));
+
        perf_pmu_register(&perf_tracepoint, "tracepoint", PERF_TYPE_TRACEPOINT);
 }
 
@@ -5565,7 +5832,8 @@ static int perf_event_set_filter(struct 
        char *filter_str;
        int ret;
 
-       if (event->attr.type != PERF_TYPE_TRACEPOINT)
+       if (event->attr.type != PERF_TYPE_TRACEPOINT ||
+           event->attr.config == ~0ULL)
                return -EINVAL;
 
        filter_str = strndup_user(arg, PAGE_SIZE);
@@ -5583,6 +5851,74 @@ static void perf_event_free_filter(struc
        ftrace_profile_free_filter(event);
 }
 
+static int perf_event_add_tp(struct perf_event *event, int tp_id)
+{
+       if (event->attr.type != PERF_TYPE_TRACEPOINT ||
+           event->attr.config != ~0ULL)
+               return -EINVAL;
+       /* Only tracepoint events opened with config == ~0ULL reach here. */
+       return __perf_event_add_tp(event, tp_id);
+}
+
+/*
+ * Called from the exit path, _after_ all events have been detached from it.
+ */
+static void perf_tp_event_exit(struct task_struct *tsk)
+{
+       struct perf_tp_idr *idr = tsk->perf_tp_idr;
+
+       if (!idr)
+               return;
+
+       idr_remove_all(&idr->idr);
+       idr_destroy(&idr->idr);
+}
+
+static void perf_tp_event_delayed_put(struct task_struct *tsk)
+{
+       struct perf_tp_idr *idr = tsk->perf_tp_idr;
+
+       tsk->perf_tp_idr = NULL;
+       kfree(idr);
+}
+
+static int perf_tp_inherit_idr(int id, void *p, void *data)
+{
+       struct perf_event *child = data;
+
+       return __perf_event_add_tp(child, id);
+}
+
+static int perf_tp_event_inherit(struct perf_event *parent_event,
+                                struct perf_event *child_event)
+{
+       int ret;
+
+       if (parent_event->attr.type != PERF_TYPE_TRACEPOINT ||
+           parent_event->attr.config != ~0ULL)
+               return 0;
+
+       /*
+        * The child is not yet exposed, hence no need to serialize things
+        * on that side.
+        */
+       mutex_lock(&parent_event->tp_idr.lock);
+       ret = idr_for_each(&parent_event->tp_idr.idr,
+                       perf_tp_inherit_idr,
+                       child_event);
+       mutex_unlock(&parent_event->tp_idr.lock);
+
+       return ret;
+}
+
+static void perf_tp_event_init_task(struct task_struct *child)
+{
+       /*
+        * Clear the idr pointer copied from the parent.
+        */
+       child->perf_tp_idr = NULL;
+}
+
 #else
 
 static inline void perf_tp_register(void)
@@ -5598,6 +5934,29 @@ static void perf_event_free_filter(struc
 {
 }
 
+static int perf_event_add_tp(struct perf_event *event, int tp_id)
+{
+       return -ENOENT;
+}
+
+static void perf_tp_event_exit(struct task_struct *tsk)
+{
+}
+
+static void perf_tp_event_delayed_put(struct task_struct *tsk)
+{
+}
+
+static int perf_tp_event_inherit(struct perf_event *parent_event,
+                                struct perf_event *child_event)
+{
+       return 0;
+}
+
+static void perf_tp_event_init_task(struct task_struct *child)
+{      /* !CONFIG_EVENT_TRACING: task_struct has no perf_tp_idr to clear */
+}
+
 #endif /* CONFIG_EVENT_TRACING */
 
 #ifdef CONFIG_HAVE_HW_BREAKPOINT
@@ -6173,6 +6532,9 @@ perf_event_alloc(struct perf_event_attr 
        INIT_LIST_HEAD(&event->sibling_list);
        init_waitqueue_head(&event->waitq);
        init_irq_work(&event->pending, perf_pending_event);
+#ifdef CONFIG_EVENT_TRACING
+       perf_tp_idr_init(&event->tp_idr);
+#endif
 
        mutex_init(&event->mmap_mutex);
 
@@ -6191,6 +6553,7 @@ perf_event_alloc(struct perf_event_attr 
 
        if (task) {
                event->attach_state = PERF_ATTACH_TASK;
+               event->hw.event_target = task;
 #ifdef CONFIG_HAVE_HW_BREAKPOINT
                /*
                 * hw_breakpoint is a bit difficult here..
@@ -6236,7 +6599,7 @@ done:
        if (err) {
                if (event->ns)
                        put_pid_ns(event->ns);
-               kfree(event);
+               free_event(event);
                return ERR_PTR(err);
        }
 
@@ -6604,7 +6967,6 @@ SYSCALL_DEFINE5(perf_event_open,
        }
 
        perf_install_in_context(ctx, event, cpu);
-       ++ctx->generation;
        perf_unpin_context(ctx);
        mutex_unlock(&ctx->mutex);
 
@@ -6681,7 +7043,6 @@ perf_event_create_kernel_counter(struct 
        WARN_ON_ONCE(ctx->parent_ctx);
        mutex_lock(&ctx->mutex);
        perf_install_in_context(ctx, event, cpu);
-       ++ctx->generation;
        perf_unpin_context(ctx);
        mutex_unlock(&ctx->mutex);
 
@@ -6858,6 +7219,8 @@ void perf_event_exit_task(struct task_st
 
        for_each_task_context_nr(ctxn)
                perf_event_exit_task_context(child, ctxn);
+
+       perf_tp_event_exit(child);
 }
 
 static void perf_free_event(struct perf_event *event,
@@ -6920,6 +7283,8 @@ void perf_event_delayed_put(struct task_
 
        for_each_task_context_nr(ctxn)
                WARN_ON_ONCE(task->perf_event_ctxp[ctxn]);
+
+       perf_tp_event_delayed_put(task);
 }
 
 /*
@@ -6935,6 +7300,7 @@ inherit_event(struct perf_event *parent_
 {
        struct perf_event *child_event;
        unsigned long flags;
+       int ret;
 
        /*
         * Instead of creating recursive hierarchies of events,
@@ -6952,6 +7318,13 @@ inherit_event(struct perf_event *parent_
                                           NULL);
        if (IS_ERR(child_event))
                return child_event;
+
+       ret = perf_tp_event_inherit(parent_event, child_event);
+       if (ret) {
+               free_event(child_event);
+               return ERR_PTR(ret);
+       }
+
        get_ctx(child_ctx);
 
        /*
@@ -7177,6 +7550,8 @@ int perf_event_init_task(struct task_str
        mutex_init(&child->perf_event_mutex);
        INIT_LIST_HEAD(&child->perf_event_list);
 
+       perf_tp_event_init_task(child);
+
        for_each_task_context_nr(ctxn) {
                ret = perf_event_init_context(child, ctxn);
                if (ret)
Index: linux/kernel/trace/trace_event_perf.c
===================================================================
--- linux.orig/kernel/trace/trace_event_perf.c
+++ linux/kernel/trace/trace_event_perf.c
@@ -8,6 +8,7 @@
 #include <linux/module.h>
 #include <linux/kprobes.h>
 #include "trace.h"
+#include "trace_output.h"
 
 static char __percpu *perf_trace_buf[PERF_NR_CONTEXTS];
 
@@ -47,9 +48,7 @@ static int perf_trace_event_perm(struct 
 static int perf_trace_event_init(struct ftrace_event_call *tp_event,
                                 struct perf_event *p_event)
 {
-       struct hlist_head __percpu *list;
        int ret;
-       int cpu;
 
        ret = perf_trace_event_perm(tp_event, p_event);
        if (ret)
@@ -61,15 +60,6 @@ static int perf_trace_event_init(struct 
 
        ret = -ENOMEM;
 
-       list = alloc_percpu(struct hlist_head);
-       if (!list)
-               goto fail;
-
-       for_each_possible_cpu(cpu)
-               INIT_HLIST_HEAD(per_cpu_ptr(list, cpu));
-
-       tp_event->perf_events = list;
-
        if (!total_ref_count) {
                char __percpu *buf;
                int i;
@@ -100,63 +90,40 @@ fail:
                }
        }
 
-       if (!--tp_event->perf_refcount) {
-               free_percpu(tp_event->perf_events);
-               tp_event->perf_events = NULL;
-       }
+       --tp_event->perf_refcount;
 
        return ret;
 }
 
-int perf_trace_init(struct perf_event *p_event)
+int perf_trace_init(struct perf_event *p_event, int event_id)
 {
        struct ftrace_event_call *tp_event;
-       int event_id = p_event->attr.config;
+       struct trace_event *t_event;
        int ret = -EINVAL;
 
+       trace_event_read_lock();
+       t_event = ftrace_find_event(event_id);
+       if (!t_event)
+               goto out;
+
+       tp_event = container_of(t_event, struct ftrace_event_call, event);
+
        mutex_lock(&event_mutex);
-       list_for_each_entry(tp_event, &ftrace_events, list) {
-               if (tp_event->event.type == event_id &&
-                   tp_event->class && tp_event->class->reg &&
-                   try_module_get(tp_event->mod)) {
-                       ret = perf_trace_event_init(tp_event, p_event);
-                       if (ret)
-                               module_put(tp_event->mod);
-                       break;
-               }
+       if (tp_event->class && tp_event->class->reg &&
+                       try_module_get(tp_event->mod)) {
+               ret = perf_trace_event_init(tp_event, p_event);
+               if (ret)
+                       module_put(tp_event->mod);
        }
        mutex_unlock(&event_mutex);
+out:
+       trace_event_read_unlock();
 
        return ret;
 }
 
-int perf_trace_add(struct perf_event *p_event, int flags)
-{
-       struct ftrace_event_call *tp_event = p_event->tp_event;
-       struct hlist_head __percpu *pcpu_list;
-       struct hlist_head *list;
-
-       pcpu_list = tp_event->perf_events;
-       if (WARN_ON_ONCE(!pcpu_list))
-               return -EINVAL;
-
-       if (!(flags & PERF_EF_START))
-               p_event->hw.state = PERF_HES_STOPPED;
-
-       list = this_cpu_ptr(pcpu_list);
-       hlist_add_head_rcu(&p_event->hlist_entry, list);
-
-       return 0;
-}
-
-void perf_trace_del(struct perf_event *p_event, int flags)
-{
-       hlist_del_rcu(&p_event->hlist_entry);
-}
-
-void perf_trace_destroy(struct perf_event *p_event)
+static void __perf_trace_destroy(struct ftrace_event_call *tp_event)
 {
-       struct ftrace_event_call *tp_event = p_event->tp_event;
        int i;
 
        mutex_lock(&event_mutex);
@@ -171,9 +138,6 @@ void perf_trace_destroy(struct perf_even
         */
        tracepoint_synchronize_unregister();
 
-       free_percpu(tp_event->perf_events);
-       tp_event->perf_events = NULL;
-
        if (!--total_ref_count) {
                for (i = 0; i < PERF_NR_CONTEXTS; i++) {
                        free_percpu(perf_trace_buf[i]);
@@ -185,6 +149,27 @@ out:
        mutex_unlock(&event_mutex);
 }
 
+void perf_trace_destroy(struct perf_event *p_event)
+{
+       __perf_trace_destroy(p_event->tp_event);
+}
+
+void perf_trace_destroy_id(int event_id)
+{
+       struct ftrace_event_call *tp_event;
+       struct trace_event *t_event;
+
+       trace_event_read_lock();
+       t_event = ftrace_find_event(event_id);
+       if (!t_event)
+               goto unlock;
+
+       tp_event = container_of(t_event, struct ftrace_event_call, event);
+       __perf_trace_destroy(tp_event);
+unlock:
+       trace_event_read_unlock();
+}
+
 __kprobes void *perf_trace_buf_prepare(int size, unsigned short type,
                                       struct pt_regs *regs, int *rctxp)
 {
Index: linux/kernel/trace/trace_kprobe.c
===================================================================
--- linux.orig/kernel/trace/trace_kprobe.c
+++ linux/kernel/trace/trace_kprobe.c
@@ -1659,7 +1659,6 @@ static __kprobes void kprobe_perf_func(s
        struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp);
        struct ftrace_event_call *call = &tp->call;
        struct kprobe_trace_entry_head *entry;
-       struct hlist_head *head;
        int size, __size, dsize;
        int rctx;
 
@@ -1679,8 +1678,8 @@ static __kprobes void kprobe_perf_func(s
        memset(&entry[1], 0, dsize);
        store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize);
 
-       head = this_cpu_ptr(call->perf_events);
-       perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, regs, head);
+       perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, regs,
+                             call->event.type);
 }
 
 /* Kretprobe profile handler */
@@ -1690,7 +1689,6 @@ static __kprobes void kretprobe_perf_fun
        struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp);
        struct ftrace_event_call *call = &tp->call;
        struct kretprobe_trace_entry_head *entry;
-       struct hlist_head *head;
        int size, __size, dsize;
        int rctx;
 
@@ -1710,8 +1708,8 @@ static __kprobes void kretprobe_perf_fun
        entry->ret_ip = (unsigned long)ri->ret_addr;
        store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize);
 
-       head = this_cpu_ptr(call->perf_events);
-       perf_trace_buf_submit(entry, size, rctx, entry->ret_ip, 1, regs, head);
+       perf_trace_buf_submit(entry, size, rctx, entry->ret_ip, 1,
+                             regs, call->event.type);
 }
 
 static int probe_perf_enable(struct ftrace_event_call *call)
Index: linux/kernel/trace/trace_output.c
===================================================================
--- linux.orig/kernel/trace/trace_output.c
+++ linux/kernel/trace/trace_output.c
@@ -8,6 +8,7 @@
 #include <linux/module.h>
 #include <linux/mutex.h>
 #include <linux/ftrace.h>
+#include <linux/idr.h>
 
 #include "trace_output.h"
 
@@ -16,9 +17,9 @@
 
 DECLARE_RWSEM(trace_event_mutex);
 
-static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly;
+static const int first_event_type = __TRACE_LAST_TYPE + 1;
 
-static int next_event_type = __TRACE_LAST_TYPE + 1;
+static DEFINE_IDR(trace_type_idr);
 
 int trace_print_seq(struct seq_file *m, struct trace_seq *s)
 {
@@ -664,58 +665,43 @@ static int task_state_char(unsigned long
  */
 struct trace_event *ftrace_find_event(int type)
 {
-       struct trace_event *event;
-       struct hlist_node *n;
-       unsigned key;
-
-       key = type & (EVENT_HASHSIZE - 1);
-
-       hlist_for_each_entry(event, n, &event_hash[key], node) {
-               if (event->type == type)
-                       return event;
-       }
-
-       return NULL;
+       return idr_find(&trace_type_idr, type);
 }
 
-static LIST_HEAD(ftrace_event_list);
+void trace_event_read_lock(void)
+{
+       down_read(&trace_event_mutex);
+}
 
-static int trace_search_list(struct list_head **list)
+void trace_event_read_unlock(void)
 {
-       struct trace_event *e;
-       int last = __TRACE_LAST_TYPE;
+       up_read(&trace_event_mutex);
+}
 
-       if (list_empty(&ftrace_event_list)) {
-               *list = &ftrace_event_list;
-               return last + 1;
-       }
+static int register_event(struct trace_event *event, int id, bool strict)
+{
+       int ret, type;
 
-       /*
-        * We used up all possible max events,
-        * lets see if somebody freed one.
-        */
-       list_for_each_entry(e, &ftrace_event_list, list) {
-               if (e->type != last + 1)
-                       break;
-               last++;
-       }
+       ret = idr_pre_get(&trace_type_idr, GFP_KERNEL);
+       if (!ret)
+               return 0;
 
-       /* Did we used up all 65 thousand events??? */
-       if ((last + 1) > FTRACE_MAX_EVENT)
+       ret = idr_get_new_above(&trace_type_idr, event, id, &type);
+       if (ret)
                return 0;
 
-       *list = &e->list;
-       return last + 1;
-}
+       if (strict && id != type) {
+               idr_remove(&trace_type_idr, type);
+               return 0;
+       }
 
-void trace_event_read_lock(void)
-{
-       down_read(&trace_event_mutex);
-}
+       if (type > FTRACE_MAX_EVENT) {
+               idr_remove(&trace_type_idr, type);
+               return 0;
+       }
 
-void trace_event_read_unlock(void)
-{
-       up_read(&trace_event_mutex);
+       event->type = type;
+       return type;
 }
 
 /**
@@ -735,7 +721,6 @@ void trace_event_read_unlock(void)
  */
 int register_ftrace_event(struct trace_event *event)
 {
-       unsigned key;
        int ret = 0;
 
        down_write(&trace_event_mutex);
@@ -746,35 +731,18 @@ int register_ftrace_event(struct trace_e
        if (WARN_ON(!event->funcs))
                goto out;
 
-       INIT_LIST_HEAD(&event->list);
-
        if (!event->type) {
-               struct list_head *list = NULL;
-
-               if (next_event_type > FTRACE_MAX_EVENT) {
-
-                       event->type = trace_search_list(&list);
-                       if (!event->type)
-                               goto out;
-
-               } else {
-                       
-                       event->type = next_event_type++;
-                       list = &ftrace_event_list;
-               }
-
-               if (WARN_ON(ftrace_find_event(event->type)))
+               ret = register_event(event, first_event_type, false);
+               if (!ret)
                        goto out;
-
-               list_add_tail(&event->list, list);
-
-       } else if (event->type > __TRACE_LAST_TYPE) {
-               printk(KERN_WARNING "Need to add type to trace.h\n");
-               WARN_ON(1);
-               goto out;
        } else {
-               /* Is this event already used */
-               if (ftrace_find_event(event->type))
+               if (event->type > __TRACE_LAST_TYPE) {
+                       printk(KERN_WARNING "Need to add type to trace.h\n");
+                       WARN_ON(1);
+                       goto out;
+               }
+               ret = register_event(event, event->type, true);
+               if (!ret)
                        goto out;
        }
 
@@ -787,11 +755,6 @@ int register_ftrace_event(struct trace_e
        if (event->funcs->binary == NULL)
                event->funcs->binary = trace_nop_print;
 
-       key = event->type & (EVENT_HASHSIZE - 1);
-
-       hlist_add_head(&event->node, &event_hash[key]);
-
-       ret = event->type;
  out:
        up_write(&trace_event_mutex);
 
@@ -804,8 +767,7 @@ EXPORT_SYMBOL_GPL(register_ftrace_event)
  */
 int __unregister_ftrace_event(struct trace_event *event)
 {
-       hlist_del(&event->node);
-       list_del(&event->list);
+       idr_remove(&trace_type_idr, event->type);
        return 0;
 }
 
Index: linux/kernel/trace/trace_syscalls.c
===================================================================
--- linux.orig/kernel/trace/trace_syscalls.c
+++ linux/kernel/trace/trace_syscalls.c
@@ -499,7 +499,6 @@ static void perf_syscall_enter(void *ign
 {
        struct syscall_metadata *sys_data;
        struct syscall_trace_enter *rec;
-       struct hlist_head *head;
        int syscall_nr;
        int rctx;
        int size;
@@ -530,8 +529,7 @@ static void perf_syscall_enter(void *ign
        syscall_get_arguments(current, regs, 0, sys_data->nb_args,
                               (unsigned long *)&rec->args);
 
-       head = this_cpu_ptr(sys_data->enter_event->perf_events);
-       perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head);
+       perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, rec->ent.type);
 }
 
 int perf_sysenter_enable(struct ftrace_event_call *call)
@@ -573,7 +571,6 @@ static void perf_syscall_exit(void *igno
 {
        struct syscall_metadata *sys_data;
        struct syscall_trace_exit *rec;
-       struct hlist_head *head;
        int syscall_nr;
        int rctx;
        int size;
@@ -606,8 +603,7 @@ static void perf_syscall_exit(void *igno
        rec->nr = syscall_nr;
        rec->ret = syscall_get_return_value(current, regs);
 
-       head = this_cpu_ptr(sys_data->exit_event->perf_events);
-       perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head);
+       perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, rec->ent.type);
 }
 
 int perf_sysexit_enable(struct ftrace_event_call *call)
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Reply via email to