I believe that Jiri understands this code better than I do.

On Tue, 10 Oct 2017 17:21:46 +0200
Peter Zijlstra <pet...@infradead.org> wrote:
> On Tue, Oct 10, 2017 at 03:04:48PM +0200, Peter Zijlstra wrote:
> > On Tue, Oct 10, 2017 at 01:33:21PM +0200, Peter Zijlstra wrote:
> > > But now you've got me looking at 75e8387685f6, which also looks
> > > completely insane.
> >
> > The reason I insta stumbled on that patch is that it only addresses the
> > ftrace situation and doesn't mention the other _5_ places that use this
> > interface. It doesn't explain why those don't have the problem and if
> > not, why their solution doesn't work for ftrace.
> >
> > So all (well syscall and regular tracepoints, didn't check the others)
> > avoid that problem by simply not registering multiple times at the
> > tracepoint. Tracepoints use tp_event->perf_refcount and the syscall
> > things use sys_perf_refcount_{enter,exit} for that.
> >
> > Doing the same for function trace looks a little something like the
> > below (after reverting 75e8387685f6)
> >
> > Except the below doesn't compile because of
> > kernel/trace/trace_event_filter.c, which is where I lost the plot.
>
> OK, so that filter stuff was the entire reason for this trainwreck :/
>
> Using ftrace_ops filters allows ftrace to patch less functions etc.. So
> that requires an ftrace_ops per event. Still that then instantly
> suggests we fix the whole hlist situation instead of making it worse.
>
> See below; I now have 3 patches: revert, the below, kill
> FTRACE_OPS_FL_PER_CPU.
>
> How's this?
>
> ---
>  kernel/trace/trace_event_perf.c | 68 +++++++++++++++++++++-------------------
>  1 file changed, 36 insertions(+), 32 deletions(-)
>
> --- a/kernel/trace/trace_event_perf.c
> +++ b/kernel/trace/trace_event_perf.c
> @@ -240,27 +240,31 @@ void perf_trace_destroy(struct perf_even
>  int perf_trace_add(struct perf_event *p_event, int flags)
>  {
>          struct trace_event_call *tp_event = p_event->tp_event;
> -        struct hlist_head __percpu *pcpu_list;
> -        struct hlist_head *list;
>
> -        pcpu_list = tp_event->perf_events;
> -        if (WARN_ON_ONCE(!pcpu_list))
> -                return -EINVAL;
> +        if (!tp_event->class->reg(tp_event, TRACE_REG_PERF_ADD, p_event)) {

We probably still want to check for reg returning less than zero. And
there really should be a comment here. If I understand this correctly,
ftrace reg functions return 1 and trace_event reg functions return zero
(entering into this path). But the trace_event register functions can
fail, and will return negative values.

> +                struct hlist_head __percpu *pcpu_list;
> +                struct hlist_head *list;
> +
> +                pcpu_list = tp_event->perf_events;
> +                if (WARN_ON_ONCE(!pcpu_list))
> +                        return -EINVAL;
>
> -        if (!(flags & PERF_EF_START))
> -                p_event->hw.state = PERF_HES_STOPPED;
> +                if (!(flags & PERF_EF_START))
> +                        p_event->hw.state = PERF_HES_STOPPED;
>
> -        list = this_cpu_ptr(pcpu_list);
> -        hlist_add_head_rcu(&p_event->hlist_entry, list);
> +                list = this_cpu_ptr(pcpu_list);
> +                hlist_add_head_rcu(&p_event->hlist_entry, list);
> +        }
>
> -        return tp_event->class->reg(tp_event, TRACE_REG_PERF_ADD, p_event);
> +        return 0;
>  }
>
>  void perf_trace_del(struct perf_event *p_event, int flags)
>  {
>          struct trace_event_call *tp_event = p_event->tp_event;
> -        hlist_del_rcu(&p_event->hlist_entry);
> -        tp_event->class->reg(tp_event, TRACE_REG_PERF_DEL, p_event);
> +
> +        if (!tp_event->class->reg(tp_event, TRACE_REG_PERF_DEL, p_event))

This shouldn't ever fail. I believe all unregister (DEL) functions
return 0 (for trace_events). But probably comment anyway.
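Roughly what I have in mind for the ADD path, folding your hunks
together with the error check (an untested sketch; the <0 / 0 / 1
return convention below is just my reading of this patch, it isn't
documented anywhere yet):

int perf_trace_add(struct perf_event *p_event, int flags)
{
        struct trace_event_call *tp_event = p_event->tp_event;
        int ret;

        /*
         * ->reg() for TRACE_REG_PERF_ADD returns:
         *    < 0 - registration failed, propagate the error
         *      1 - the callback handles the event itself (the ftrace
         *          case), no hlist work to do here
         *      0 - regular trace_event, link it into the per-cpu list
         */
        ret = tp_event->class->reg(tp_event, TRACE_REG_PERF_ADD, p_event);
        if (ret < 0)
                return ret;

        if (!ret) {
                struct hlist_head __percpu *pcpu_list;
                struct hlist_head *list;

                pcpu_list = tp_event->perf_events;
                if (WARN_ON_ONCE(!pcpu_list))
                        return -EINVAL;

                if (!(flags & PERF_EF_START))
                        p_event->hw.state = PERF_HES_STOPPED;

                list = this_cpu_ptr(pcpu_list);
                hlist_add_head_rcu(&p_event->hlist_entry, list);
        }

        return 0;
}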
-- Steve

> +                hlist_del_rcu(&p_event->hlist_entry);
>  }
>
>  void *perf_trace_buf_alloc(int size, struct pt_regs **regs, int *rctxp)
> @@ -306,15 +310,19 @@ static void
>  perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip,
>                            struct ftrace_ops *ops, struct pt_regs *pt_regs)
>  {
> +        struct hlist_head head = HLIST_HEAD_INIT;
>          struct ftrace_entry *entry;
> -        struct hlist_head *head;
> +        struct perf_event *event;
>          struct pt_regs regs;
>          int rctx;
>
> -        head = this_cpu_ptr(event_function.perf_events);
> -        if (hlist_empty(head))
> +        event = container_of(ops, struct perf_event, ftrace_ops);
> +
> +        if (!event->ftrace_ops.private)
>                  return;
>
> +        hlist_add_head(&event->hlist_entry, &head);
> +
>  #define ENTRY_SIZE (ALIGN(sizeof(struct ftrace_entry) + sizeof(u32), \
>                      sizeof(u64)) - sizeof(u32))
>
> @@ -330,17 +338,21 @@ perf_ftrace_function_call(unsigned long
>          entry->ip = ip;
>          entry->parent_ip = parent_ip;
>          perf_trace_buf_submit(entry, ENTRY_SIZE, rctx, TRACE_FN,
> -                              1, &regs, head, NULL);
> +                              1, &regs, &head, NULL);
>
>  #undef ENTRY_SIZE
> +
> +        hlist_del_init(&event->hlist_entry);
>  }
>
>  static int perf_ftrace_function_register(struct perf_event *event)
>  {
>          struct ftrace_ops *ops = &event->ftrace_ops;
>
> -        ops->flags |= FTRACE_OPS_FL_PER_CPU | FTRACE_OPS_FL_RCU;
> -        ops->func = perf_ftrace_function_call;
> +        ops->flags   = FTRACE_OPS_FL_RCU;
> +        ops->func    = perf_ftrace_function_call;
> +        ops->private = NULL;
> +
>          return register_ftrace_function(ops);
>  }
>
> @@ -352,19 +364,11 @@ static int perf_ftrace_function_unregist
>          return ret;
>  }
>
> -static void perf_ftrace_function_enable(struct perf_event *event)
> -{
> -        ftrace_function_local_enable(&event->ftrace_ops);
> -}
> -
> -static void perf_ftrace_function_disable(struct perf_event *event)
> -{
> -        ftrace_function_local_disable(&event->ftrace_ops);
> -}
> -
>  int perf_ftrace_event_register(struct trace_event_call *call,
>                                 enum trace_reg type, void *data)
>  {
> +        struct perf_event *event = data;
> +
>          switch (type) {
>          case TRACE_REG_REGISTER:
>          case TRACE_REG_UNREGISTER:
> @@ -377,11 +381,11 @@ int perf_ftrace_event_register(struct tr
>          case TRACE_REG_PERF_CLOSE:
>                  return perf_ftrace_function_unregister(data);
>          case TRACE_REG_PERF_ADD:
> -                perf_ftrace_function_enable(data);
> -                return 0;
> +                event->ftrace_ops.private = (void *)1UL;
> +                return 1;
>          case TRACE_REG_PERF_DEL:
> -                perf_ftrace_function_disable(data);
> -                return 0;
> +                event->ftrace_ops.private = (void *)0UL;
> +                return 1;
>          }
>
>          return -EINVAL;
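[ For readers following along: the rewrite above leans on two generic
kernel idioms -- recovering the enclosing perf_event from its embedded
ftrace_ops with container_of(), and handing perf_trace_buf_submit() a
one-entry list built on the stack so per-event dispatch needs no shared
per-cpu hlist. Below is a minimal userspace illustration of both
idioms; every name in it is invented for the demo, and the list is a
simplified singly-linked stand-in for the kernel's hlist. ]

/* demo.c - userspace sketch of container_of() + an on-stack list.
 * Build with: gcc -std=c99 -o demo demo.c */
#include <stdio.h>
#include <stddef.h>

/* Simplified form of the kernel's container_of() (linux/kernel.h). */
#define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

struct ops {                    /* stand-in for struct ftrace_ops */
        void *private;
};

struct node {                   /* stand-in for the hlist linkage */
        struct node *next;
};

struct event {                  /* stand-in for struct perf_event */
        int cpu;
        struct ops ftrace_ops;  /* embedded, like event->ftrace_ops */
        struct node hlist_entry;
};

/* The callback only receives &event->ftrace_ops; container_of() walks
 * back to the enclosing event, so no global lookup is needed. */
static void callback(struct ops *ops)
{
        struct event *event = container_of(ops, struct event, ftrace_ops);
        struct node head;       /* one-entry list built on the stack */

        event->hlist_entry.next = NULL;
        head.next = &event->hlist_entry;

        /* A consumer that expects "a list of events" sees just this one. */
        for (struct node *n = head.next; n; n = n->next) {
                struct event *e = container_of(n, struct event, hlist_entry);
                printf("event on cpu %d\n", e->cpu);
        }
}

int main(void)
{
        struct event ev = { .cpu = 3 };
        callback(&ev.ftrace_ops);
        return 0;
}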