I believe that Jiri understands this code better than I do.

On Tue, 10 Oct 2017 17:21:46 +0200
Peter Zijlstra <pet...@infradead.org> wrote:

> On Tue, Oct 10, 2017 at 03:04:48PM +0200, Peter Zijlstra wrote:
> > On Tue, Oct 10, 2017 at 01:33:21PM +0200, Peter Zijlstra wrote:  
> > > But now you've got me looking at 75e8387685f6, which also looks
> > > completely insane.  
> > 
> > The reason I insta stumbled on that patch is that it only addresses the
> > ftrace situation and doesn't mention the other _5_ places that use this
> > interface. It doesn't explain why those don't have the problem and if
> > not, why their solution doesn't work for ftrace.
> > 
> > So all of them (well, syscall and regular tracepoints; didn't check the others)
> > avoid that problem by simply not registering multiple times at the
> > tracepoint. Tracepoints use tp_event->perf_refcount and the syscall
> > things use sys_perf_refcount_{enter,exit} for that.
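
For reference, the syscall side guards the actual registration with that
refcount, so only the first perf event registers the probe. Roughly this
(a from-memory sketch, not the exact kernel/trace/trace_syscalls.c
source):

	static int perf_sysenter_enable(struct trace_event_call *call)
	{
		int num = ((struct syscall_metadata *)call->data)->syscall_nr;
		int ret = 0;

		mutex_lock(&syscall_trace_lock);
		/* Only the first perf event registers the tracepoint probe. */
		if (!sys_perf_refcount_enter)
			ret = register_trace_sys_enter(perf_syscall_enter, NULL);
		if (!ret) {
			set_bit(num, enabled_perf_enter_syscalls);
			sys_perf_refcount_enter++;
		}
		mutex_unlock(&syscall_trace_lock);

		return ret;
	}
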
> > 
> > Doing the same for function trace looks a little something like the
> > below (after reverting 75e8387685f6)
> > 
> > Except the below doesn't compile because of
> > kernel/trace/trace_events_filter.c, which is where I lost the plot.
> 
> OK, so that filter stuff was the entire reason for this trainwreck :/
> 
> Using ftrace_ops filters allows ftrace to patch fewer functions etc. So
> that requires an ftrace_ops per event. Still, that then instantly
> suggests we fix the whole hlist situation instead of making it worse.
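
Right -- the per-event ops works because each perf_event already embeds
its own ftrace_ops. A sketch of the relevant perf_event fields (from
memory, so the exact layout may differ):

	struct perf_event {
		/* ... */
	#ifdef CONFIG_EVENT_TRACING
		struct trace_event_call		*tp_event;
		struct event_filter		*filter;
	#ifdef CONFIG_FUNCTION_TRACER
		/* one ftrace_ops per event, so each event carries its own filter */
		struct ftrace_ops		ftrace_ops;
	#endif
	#endif
		/* ... */
	};

The filter string set on the perf event ends up as an ftrace filter on
exactly that ops, so ftrace only patches the functions this event asked
for.
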
> 
> See below; I now have 3 patches: revert, the below, kill
> FTRACE_OPS_FL_PER_CPU.
> 
> How's this?
> 
> ---
>  kernel/trace/trace_event_perf.c |   68 +++++++++++++++++++++-------------------
>  1 file changed, 36 insertions(+), 32 deletions(-)
> 
> --- a/kernel/trace/trace_event_perf.c
> +++ b/kernel/trace/trace_event_perf.c
> @@ -240,27 +240,31 @@ void perf_trace_destroy(struct perf_even
>  int perf_trace_add(struct perf_event *p_event, int flags)
>  {
>       struct trace_event_call *tp_event = p_event->tp_event;
> -     struct hlist_head __percpu *pcpu_list;
> -     struct hlist_head *list;
>  
> -     pcpu_list = tp_event->perf_events;
> -     if (WARN_ON_ONCE(!pcpu_list))
> -             return -EINVAL;
> +     if (!tp_event->class->reg(tp_event, TRACE_REG_PERF_ADD, p_event)) {

We probably still want to check for reg returning less than zero.

And there really should be a comment here. If I understand this
correctly, ftrace reg functions return 1 and trace_event reg functions
return zero (entering into this path). But the trace_event register
functions can fail, and will return negative values.
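
I.e. something along these lines (untested sketch of what I mean, with
the convention spelled out in a comment):

	int perf_trace_add(struct perf_event *p_event, int flags)
	{
		struct trace_event_call *tp_event = p_event->tp_event;
		int ret;

		/*
		 * class->reg(TRACE_REG_PERF_ADD) returns:
		 *   < 0 - error, nothing was registered
		 *     0 - caller must hook the event onto the per-cpu hlist
		 *         (tracepoints, syscalls, ...)
		 *   > 0 - the class handled ADD itself (function events)
		 */
		ret = tp_event->class->reg(tp_event, TRACE_REG_PERF_ADD, p_event);
		if (ret < 0)
			return ret;

		if (!ret) {
			struct hlist_head __percpu *pcpu_list;
			struct hlist_head *list;

			pcpu_list = tp_event->perf_events;
			if (WARN_ON_ONCE(!pcpu_list))
				return -EINVAL;

			if (!(flags & PERF_EF_START))
				p_event->hw.state = PERF_HES_STOPPED;

			list = this_cpu_ptr(pcpu_list);
			hlist_add_head_rcu(&p_event->hlist_entry, list);
		}

		return 0;
	}
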


> +             struct hlist_head __percpu *pcpu_list;
> +             struct hlist_head *list;
> +
> +             pcpu_list = tp_event->perf_events;
> +             if (WARN_ON_ONCE(!pcpu_list))
> +                     return -EINVAL;
>  
> -     if (!(flags & PERF_EF_START))
> -             p_event->hw.state = PERF_HES_STOPPED;
> +             if (!(flags & PERF_EF_START))
> +                     p_event->hw.state = PERF_HES_STOPPED;
>  
> -     list = this_cpu_ptr(pcpu_list);
> -     hlist_add_head_rcu(&p_event->hlist_entry, list);
> +             list = this_cpu_ptr(pcpu_list);
> +             hlist_add_head_rcu(&p_event->hlist_entry, list);
> +     }
>  
> -     return tp_event->class->reg(tp_event, TRACE_REG_PERF_ADD, p_event);
> +     return 0;
>  }
>  
>  void perf_trace_del(struct perf_event *p_event, int flags)
>  {
>       struct trace_event_call *tp_event = p_event->tp_event;
> -     hlist_del_rcu(&p_event->hlist_entry);
> -     tp_event->class->reg(tp_event, TRACE_REG_PERF_DEL, p_event);
> +
> +     if (!tp_event->class->reg(tp_event, TRACE_REG_PERF_DEL, p_event))

This shouldn't ever fail. I believe all the unregister (DEL) functions
return 0 (for trace_events). But probably add a comment here anyway.
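
Something like (sketch):

	void perf_trace_del(struct perf_event *p_event, int flags)
	{
		struct trace_event_call *tp_event = p_event->tp_event;

		/*
		 * A non-zero return means the class handled the removal
		 * itself (function events); the trace_event reg functions
		 * always return 0 here and we take the event off the
		 * per-cpu hlist ourselves.
		 */
		if (!tp_event->class->reg(tp_event, TRACE_REG_PERF_DEL, p_event))
			hlist_del_rcu(&p_event->hlist_entry);
	}
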

-- Steve

> +             hlist_del_rcu(&p_event->hlist_entry);
>  }
>  
>  void *perf_trace_buf_alloc(int size, struct pt_regs **regs, int *rctxp)
> @@ -306,15 +310,19 @@ static void
>  perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip,
>                         struct ftrace_ops *ops, struct pt_regs *pt_regs)
>  {
> +     struct hlist_head head = HLIST_HEAD_INIT;
>       struct ftrace_entry *entry;
> -     struct hlist_head *head;
> +     struct perf_event *event;
>       struct pt_regs regs;
>       int rctx;
>  
> -     head = this_cpu_ptr(event_function.perf_events);
> -     if (hlist_empty(head))
> +     event = container_of(ops, struct perf_event, ftrace_ops);
> +
> +     if (!event->ftrace_ops.private)
>               return;
>  
> +     hlist_add_head(&event->hlist_entry, &head);
> +
>  #define ENTRY_SIZE (ALIGN(sizeof(struct ftrace_entry) + sizeof(u32), \
>                   sizeof(u64)) - sizeof(u32))
>  
> @@ -330,17 +338,21 @@ perf_ftrace_function_call(unsigned long
>       entry->ip = ip;
>       entry->parent_ip = parent_ip;
>       perf_trace_buf_submit(entry, ENTRY_SIZE, rctx, TRACE_FN,
> -                           1, &regs, head, NULL);
> +                           1, &regs, &head, NULL);
>  
>  #undef ENTRY_SIZE
> +
> +     hlist_del_init(&event->hlist_entry);
>  }
>  
>  static int perf_ftrace_function_register(struct perf_event *event)
>  {
>       struct ftrace_ops *ops = &event->ftrace_ops;
>  
> -     ops->flags |= FTRACE_OPS_FL_PER_CPU | FTRACE_OPS_FL_RCU;
> -     ops->func = perf_ftrace_function_call;
> +     ops->flags   = FTRACE_OPS_FL_RCU;
> +     ops->func    = perf_ftrace_function_call;
> +     ops->private = NULL;
> +
>       return register_ftrace_function(ops);
>  }
>  
> @@ -352,19 +364,11 @@ static int perf_ftrace_function_unregist
>       return ret;
>  }
>  
> -static void perf_ftrace_function_enable(struct perf_event *event)
> -{
> -     ftrace_function_local_enable(&event->ftrace_ops);
> -}
> -
> -static void perf_ftrace_function_disable(struct perf_event *event)
> -{
> -     ftrace_function_local_disable(&event->ftrace_ops);
> -}
> -
>  int perf_ftrace_event_register(struct trace_event_call *call,
>                              enum trace_reg type, void *data)
>  {
> +     struct perf_event *event = data;
> +
>       switch (type) {
>       case TRACE_REG_REGISTER:
>       case TRACE_REG_UNREGISTER:
> @@ -377,11 +381,11 @@ int perf_ftrace_event_register(struct tr
>       case TRACE_REG_PERF_CLOSE:
>               return perf_ftrace_function_unregister(data);
>       case TRACE_REG_PERF_ADD:
> -             perf_ftrace_function_enable(data);
> -             return 0;
> +             event->ftrace_ops.private = (void *)1UL;
> +             return 1;
>       case TRACE_REG_PERF_DEL:
> -             perf_ftrace_function_disable(data);
> -             return 0;
> +             event->ftrace_ops.private = (void *)0UL;
> +             return 1;
>       }
>  
>       return -EINVAL;
