On Thu, 2018-12-13 at 19:22 +0000, Martin Lau wrote:
> On Wed, Dec 12, 2018 at 04:42:37PM -0800, Matt Mullins wrote:
> > Distributions build drivers as modules, including network and filesystem
> > drivers which export numerous tracepoints. This enables
> > bpf(BPF_RAW_TRACEPOINT_OPEN) to attach to those tracepoints.
> >
> > Signed-off-by: Matt Mullins <mmull...@fb.com>
> > ---
> > v1->v2:
> > * avoid taking the mutex in bpf_event_notify when op is neither COMING
> >   nor GOING.
> > * check that kzalloc actually succeeded
> >
> > I didn't try to check list_empty before taking the mutex since I want to
> > avoid races between bpf_event_notify and bpf_get_raw_tracepoint.
> > Additionally, list_for_each_entry_safe is not strictly necessary upon
> > MODULE_STATE_GOING, but Alexei suggested I use it to protect against
> > fragility if the subsequent break; eventually disappears.
> >
> >  include/linux/module.h       |  4 ++
> >  include/linux/trace_events.h |  8 ++-
> >  kernel/bpf/syscall.c         | 11 ++--
> >  kernel/module.c              |  5 ++
> >  kernel/trace/bpf_trace.c     | 99 +++++++++++++++++++++++++++++++++++-
> >  5 files changed, 120 insertions(+), 7 deletions(-)
> >
> > diff --git a/include/linux/module.h b/include/linux/module.h
> > index fce6b4335e36..5f147dd5e709 100644
> > --- a/include/linux/module.h
> > +++ b/include/linux/module.h
> > @@ -432,6 +432,10 @@ struct module {
> >          unsigned int num_tracepoints;
> >          tracepoint_ptr_t *tracepoints_ptrs;
> >  #endif
> > +#ifdef CONFIG_BPF_EVENTS
> > +        unsigned int num_bpf_raw_events;
> > +        struct bpf_raw_event_map *bpf_raw_events;
> > +#endif
> >  #ifdef HAVE_JUMP_LABEL
> >          struct jump_entry *jump_entries;
> >          unsigned int num_jump_entries;
> > diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
> > index 4130a5497d40..8a62731673f7 100644
> > --- a/include/linux/trace_events.h
> > +++ b/include/linux/trace_events.h
> > @@ -471,7 +471,8 @@ void perf_event_detach_bpf_prog(struct perf_event *event);
> >  int perf_event_query_prog_array(struct perf_event *event, void __user *info);
> >  int bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *prog);
> >  int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_prog *prog);
> > -struct bpf_raw_event_map *bpf_find_raw_tracepoint(const char *name);
> > +struct bpf_raw_event_map *bpf_get_raw_tracepoint(const char *name);
> > +void bpf_put_raw_tracepoint(struct bpf_raw_event_map *btp);
> >  int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id,
> >                              u32 *fd_type, const char **buf,
> >                              u64 *probe_offset, u64 *probe_addr);
> > @@ -502,10 +503,13 @@ static inline int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf
> >  {
> >          return -EOPNOTSUPP;
> >  }
> > -static inline struct bpf_raw_event_map *bpf_find_raw_tracepoint(const char *name)
> > +static inline struct bpf_raw_event_map *bpf_get_raw_tracepoint(const char *name)
> >  {
> >          return NULL;
> >  }
> > +static inline void bpf_put_raw_tracepoint(struct bpf_raw_event_map *btp)
> > +{
> > +}
> >  static inline int bpf_get_perf_event_info(const struct perf_event *event,
> >                                            u32 *prog_id, u32 *fd_type,
> >                                            const char **buf, u64 *probe_offset,
> > diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
> > index 70fb11106fc2..754370e3155e 100644
> > --- a/kernel/bpf/syscall.c
> > +++ b/kernel/bpf/syscall.c
> > @@ -1609,6 +1609,7 @@ static int bpf_raw_tracepoint_release(struct inode *inode, struct file *filp)
> >                  bpf_probe_unregister(raw_tp->btp, raw_tp->prog);
> >                  bpf_prog_put(raw_tp->prog);
> >          }
> > +        bpf_put_raw_tracepoint(raw_tp->btp);
> >          kfree(raw_tp);
> >          return 0;
> >  }
> > @@ -1634,13 +1635,15 @@ static int bpf_raw_tracepoint_open(const union bpf_attr *attr)
> >                  return -EFAULT;
> >          tp_name[sizeof(tp_name) - 1] = 0;
> >
> > -        btp = bpf_find_raw_tracepoint(tp_name);
> > +        btp = bpf_get_raw_tracepoint(tp_name);
> >          if (!btp)
> >                  return -ENOENT;
> >
> >          raw_tp = kzalloc(sizeof(*raw_tp), GFP_USER);
> > -        if (!raw_tp)
> > -                return -ENOMEM;
> > +        if (!raw_tp) {
> > +                err = -ENOMEM;
> > +                goto out_put_btp;
> > +        }
> >          raw_tp->btp = btp;
> >
> >          prog = bpf_prog_get_type(attr->raw_tracepoint.prog_fd,
> > @@ -1668,6 +1671,8 @@ static int bpf_raw_tracepoint_open(const union bpf_attr *attr)
> >          bpf_prog_put(prog);
> >  out_free_tp:
> >          kfree(raw_tp);
> > +out_put_btp:
> > +        bpf_put_raw_tracepoint(btp);
> >          return err;
> >  }
> >
> > diff --git a/kernel/module.c b/kernel/module.c
> > index 49a405891587..06ec68f08387 100644
> > --- a/kernel/module.c
> > +++ b/kernel/module.c
> > @@ -3093,6 +3093,11 @@ static int find_module_sections(struct module *mod, struct load_info *info)
> >                                               sizeof(*mod->tracepoints_ptrs),
> >                                               &mod->num_tracepoints);
> >  #endif
> > +#ifdef CONFIG_BPF_EVENTS
> > +        mod->bpf_raw_events = section_objs(info, "__bpf_raw_tp_map",
> > +                                           sizeof(*mod->bpf_raw_events),
> > +                                           &mod->num_bpf_raw_events);
> > +#endif
> >  #ifdef HAVE_JUMP_LABEL
> >          mod->jump_entries = section_objs(info, "__jump_table",
> >                                           sizeof(*mod->jump_entries),
> > diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
> > index 9864a35c8bb5..9ddb6fddb4e0 100644
> > --- a/kernel/trace/bpf_trace.c
> > +++ b/kernel/trace/bpf_trace.c
> > @@ -17,6 +17,43 @@
> >  #include "trace_probe.h"
> >  #include "trace.h"
> >
> > +#ifdef CONFIG_MODULES
> > +struct bpf_trace_module {
> > +        struct module *module;
> > +        struct list_head list;
> > +};
> > +
> > +static LIST_HEAD(bpf_trace_modules);
> > +static DEFINE_MUTEX(bpf_module_mutex);
> > +
> > +static struct bpf_raw_event_map *bpf_get_raw_tracepoint_module(const char *name)
> > +{
> > +        struct bpf_raw_event_map *btp, *ret = NULL;
> > +        struct bpf_trace_module *btm;
> > +        unsigned int i;
> > +
> > +        mutex_lock(&bpf_module_mutex);
> > +        list_for_each_entry(btm, &bpf_trace_modules, list) {
> > +                for (i = 0; i < btm->module->num_bpf_raw_events; ++i) {
> > +                        btp = &btm->module->bpf_raw_events[i];
> > +                        if (!strcmp(btp->tp->name, name)) {
> > +                                if (try_module_get(btm->module))
> > +                                        ret = btp;
> > +                                goto out;
> > +                        }
> > +                }
> > +        }
> > +out:
> > +        mutex_unlock(&bpf_module_mutex);
> > +        return ret;
> > +}
> > +#else
> > +static struct bpf_raw_event_map *bpf_get_raw_tracepoint_module(const char *name)
> > +{
> > +        return NULL;
> > +}
> > +#endif /* CONFIG_MODULES */
> > +
> >  u64 bpf_get_stackid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
> >  u64 bpf_get_stack(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
> >
> > @@ -1076,7 +1113,7 @@ int perf_event_query_prog_array(struct perf_event *event, void __user *info)
> >  extern struct bpf_raw_event_map __start__bpf_raw_tp[];
> >  extern struct bpf_raw_event_map __stop__bpf_raw_tp[];
> >
> > -struct bpf_raw_event_map *bpf_find_raw_tracepoint(const char *name)
> > +struct bpf_raw_event_map *bpf_get_raw_tracepoint(const char *name)
> >  {
> >          struct bpf_raw_event_map *btp = __start__bpf_raw_tp;
> >
> > @@ -1084,7 +1121,16 @@ struct bpf_raw_event_map *bpf_find_raw_tracepoint(const char *name)
> >                  if (!strcmp(btp->tp->name, name))
> >                          return btp;
> >          }
> > -        return NULL;
> > +
> > +        return bpf_get_raw_tracepoint_module(name);
> > +}
> > +
> > +void bpf_put_raw_tracepoint(struct bpf_raw_event_map *btp)
> > +{
> > +        struct module *mod = __module_address((unsigned long)btp);
> > +
> > +        if (mod)
> > +                module_put(mod);
> >  }
> >
> >  static __always_inline
> > @@ -1222,3 +1268,52 @@ int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id,
> >
> >          return err;
> >  }
> > +
> > +#ifdef CONFIG_MODULES
> > +int bpf_event_notify(struct notifier_block *nb, unsigned long op, void *module)
> > +{
> > +        struct bpf_trace_module *btm, *tmp;
> > +        struct module *mod = module;
> > +
> > +        if (mod->num_bpf_raw_events == 0 ||
> > +            (op != MODULE_STATE_COMING && op != MODULE_STATE_GOING))
> > +                return 0;
> > +
> > +        mutex_lock(&bpf_module_mutex);
> > +
> > +        switch (op) {
> > +        case MODULE_STATE_COMING:
> > +                btm = kzalloc(sizeof(*btm), GFP_KERNEL);
> > +                if (btm) {
> > +                        btm->module = module;
> > +                        list_add(&btm->list, &bpf_trace_modules);
> > +                }
>
> Is it fine to return 0 on !btm case?
That effectively just means we'd ignore the tracepoints of any module
that is loaded while we can't allocate a bpf_trace_module (24 bytes) to
track it: such a module never lands on bpf_trace_modules, so
bpf_get_raw_tracepoint_module() simply won't find its tracepoints.
That feels like reasonable behavior to me.

> Other looks good.

> > +                break;
> > +        case MODULE_STATE_GOING:
> > +                list_for_each_entry_safe(btm, tmp, &bpf_trace_modules, list) {
> > +                        if (btm->module == module) {
> > +                                list_del(&btm->list);
> > +                                kfree(btm);
> > +                                break;
> > +                        }
> > +                }
> > +                break;
> > +        }
> > +
> > +        mutex_unlock(&bpf_module_mutex);
> > +
> > +        return 0;
> > +}
> > +
> > +static struct notifier_block bpf_module_nb = {
> > +        .notifier_call = bpf_event_notify,
> > +};
> > +
> > +int __init bpf_event_init(void)
> > +{
> > +        register_module_notifier(&bpf_module_nb);
> > +        return 0;
> > +}
> > +
> > +fs_initcall(bpf_event_init);
> > +#endif /* CONFIG_MODULES */
> > --
> > 2.17.1
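
To make the list_empty point in the cover note concrete: the shortcut I
decided against would look roughly like the sketch below (racy_lookup is
a hypothetical name, not in the patch). The unlocked emptiness check can
run concurrently with bpf_event_notify() inserting a COMING module, so
it may miss that module's tracepoints; always taking bpf_module_mutex,
as the patch does, avoids that.

/* Hypothetical sketch of the rejected shortcut; NOT part of the patch. */
static struct bpf_raw_event_map *racy_lookup(const char *name)
{
        /*
         * Unlocked read: bpf_event_notify(MODULE_STATE_COMING) may be
         * adding to bpf_trace_modules right now, so we can observe a
         * stale "empty" list and wrongly return NULL for a tracepoint
         * that the just-loaded module exports.
         */
        if (list_empty(&bpf_trace_modules))
                return NULL;

        return bpf_get_raw_tracepoint_module(name);
}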
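
For completeness, the userspace side of what this enables is unchanged
from built-in raw tracepoints; a minimal sketch (raw_tp_attach and its
arguments are illustrative, and prog_fd is assumed to already hold a
loaded BPF_PROG_TYPE_RAW_TRACEPOINT program):

#include <linux/bpf.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

/*
 * Attach an already-loaded raw-tracepoint program to a tracepoint by
 * name.  With this patch, the name may resolve to a tracepoint exported
 * by a module, not just a built-in one.
 */
static int raw_tp_attach(const char *tp_name, int prog_fd)
{
        union bpf_attr attr;

        memset(&attr, 0, sizeof(attr));
        attr.raw_tracepoint.name = (unsigned long)tp_name;
        attr.raw_tracepoint.prog_fd = prog_fd;

        /*
         * The returned fd pins the attachment; closing it detaches the
         * program and, with this patch, drops the module reference
         * taken by bpf_get_raw_tracepoint().
         */
        return syscall(__NR_bpf, BPF_RAW_TRACEPOINT_OPEN, &attr, sizeof(attr));
}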