> On May 4, 2018, at 2:49 PM, Martin KaFai Lau <ka...@fb.com> wrote:
> 
> This patch gives an ID to each loaded BTF. The ID is allocated by
> the idr like the existing prog-id and map-id.
> 
> The btf_put(map->btf) is moved to __bpf_map_put() so that userspace
> can stop seeing the BTF ID ASAP when the last BTF refcnt is gone.
> 
> It also makes BTF accessible from userspace through the
> 1. new BPF_BTF_GET_FD_BY_ID command. It is limited to CAP_SYS_ADMIN,
>    which is in line with the BPF_BTF_LOAD cmd and the existing
>    BPF_[MAP|PROG]_GET_FD_BY_ID cmd.
> 2. new btf_id (and btf_key_id + btf_value_id) in "struct bpf_map_info"
> 
> Once the BTF ID handler is accessible from userspace, freeing a BTF
> object has to go through an RCU grace period. The BPF_BTF_GET_FD_BY_ID
> cmd can then be done under rcu_read_lock() instead of taking the
> spinlock.
> [Note: A similar RCU usage can be applied to the existing
>  bpf_prog_get_fd_by_id() in a follow-up patch]
> 
> When processing the BPF_BTF_GET_FD_BY_ID cmd,
> refcount_inc_not_zero() is needed because the BTF object
> could already be in the RCU dead row. btf_get() is
> removed since its usage is currently limited to btf.c
> alone; refcount_inc() is used directly instead.
> 
> Signed-off-by: Martin KaFai Lau <ka...@fb.com>
> Acked-by: Alexei Starovoitov <a...@fb.com>
> ---
>  include/linux/btf.h      |   2 +
>  include/uapi/linux/bpf.h |   5 +++
>  kernel/bpf/btf.c         | 108 ++++++++++++++++++++++++++++++++++++++++++-----
>  kernel/bpf/syscall.c     |  24 ++++++++++-
>  4 files changed, 128 insertions(+), 11 deletions(-)
> 
> diff --git a/include/linux/btf.h b/include/linux/btf.h
> index a966dc6d61ee..e076c4697049 100644
> --- a/include/linux/btf.h
> +++ b/include/linux/btf.h
> @@ -44,5 +44,7 @@ const struct btf_type *btf_type_id_size(const struct btf *btf,
>  					u32 *ret_size);
>  void btf_type_seq_show(const struct btf *btf, u32 type_id, void *obj,
>  		       struct seq_file *m);
> +int btf_get_fd_by_id(u32 id);
> +u32 btf_id(const struct btf *btf);
>  
>  #endif
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index 93d5a4eeec2a..6106f23a9a8a 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -96,6 +96,7 @@ enum bpf_cmd {
>  	BPF_PROG_QUERY,
>  	BPF_RAW_TRACEPOINT_OPEN,
>  	BPF_BTF_LOAD,
> +	BPF_BTF_GET_FD_BY_ID,
>  };
>  
>  enum bpf_map_type {
> @@ -344,6 +345,7 @@ union bpf_attr {
>  			__u32		start_id;
>  			__u32		prog_id;
>  			__u32		map_id;
> +			__u32		btf_id;
>  		};
>  		__u32		next_id;
>  		__u32		open_flags;
> @@ -2130,6 +2132,9 @@ struct bpf_map_info {
>  	__u32 ifindex;
>  	__u64 netns_dev;
>  	__u64 netns_ino;
> +	__u32 btf_id;
> +	__u32 btf_key_id;
> +	__u32 btf_value_id;
>  } __attribute__((aligned(8)));
>  
>  /* User bpf_sock_addr struct to access socket fields and sockaddr struct passed
> diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
> index fa0dce0452e7..40950b6bf395 100644
> --- a/kernel/bpf/btf.c
> +++ b/kernel/bpf/btf.c
> @@ -11,6 +11,7 @@
>  #include <linux/file.h>
>  #include <linux/uaccess.h>
>  #include <linux/kernel.h>
> +#include <linux/idr.h>
>  #include <linux/bpf_verifier.h>
>  #include <linux/btf.h>
>  
> @@ -179,6 +180,9 @@
>  	     i < btf_type_vlen(struct_type);		\
>  	     i++, member++)
>  
> +static DEFINE_IDR(btf_idr);
> +static DEFINE_SPINLOCK(btf_idr_lock);
> +
>  struct btf {
>  	union {
>  		struct btf_header *hdr;
> @@ -193,6 +197,8 @@ struct btf {
>  	u32 types_size;
>  	u32 data_size;
>  	refcount_t refcnt;
> +	u32 id;
> +	struct rcu_head rcu;
>  };
>  
>  enum verifier_phase {
> @@ -598,6 +604,42 @@ static int btf_add_type(struct btf_verifier_env *env, struct btf_type *t)
>  	return 0;
>  }
>  
> +static int btf_alloc_id(struct btf *btf)
> +{
> +	int id;
> +
> +	idr_preload(GFP_KERNEL);
> +	spin_lock_bh(&btf_idr_lock);
> +	id = idr_alloc_cyclic(&btf_idr, btf, 1, INT_MAX, GFP_ATOMIC);
> +	if (id > 0)
> +		btf->id = id;
> +	spin_unlock_bh(&btf_idr_lock);
> +	idr_preload_end();
> +
> +	if (WARN_ON_ONCE(!id))
> +		return -ENOSPC;
> +
> +	return id > 0 ? 0 : id;
> +}
> +
> +static void btf_free_id(struct btf *btf)
> +{
> +	unsigned long flags;
> +
> +	/*
> +	 * In map-in-map, calling map_delete_elem() on outer
> +	 * map will call bpf_map_put on the inner map.
> +	 * It will then eventually call btf_free_id()
> +	 * on the inner map. Some of the map_delete_elem()
> +	 * implementation may have irq disabled, so
> +	 * we need to use the _irqsave() version instead
> +	 * of the _bh() version.
> +	 */
> +	spin_lock_irqsave(&btf_idr_lock, flags);
> +	idr_remove(&btf_idr, btf->id);
> +	spin_unlock_irqrestore(&btf_idr_lock, flags);
> +}
> +
>  static void btf_free(struct btf *btf)
>  {
>  	kvfree(btf->types);
> @@ -607,15 +649,19 @@ static void btf_free(struct btf *btf)
>  	kfree(btf);
>  }
>  
> -static void btf_get(struct btf *btf)
> +static void btf_free_rcu(struct rcu_head *rcu)
>  {
> -	refcount_inc(&btf->refcnt);
> +	struct btf *btf = container_of(rcu, struct btf, rcu);
> +
> +	btf_free(btf);
>  }
>  
>  void btf_put(struct btf *btf)
>  {
> -	if (btf && refcount_dec_and_test(&btf->refcnt))
> -		btf_free(btf);
> +	if (btf && refcount_dec_and_test(&btf->refcnt)) {
> +		btf_free_id(btf);
> +		call_rcu(&btf->rcu, btf_free_rcu);
> +	}
>  }
>  
>  static int env_resolve_init(struct btf_verifier_env *env)
> @@ -2006,10 +2052,15 @@ const struct file_operations btf_fops = {
>  	.release	= btf_release,
>  };
>  
> +static int __btf_new_fd(struct btf *btf)
> +{
> +	return anon_inode_getfd("btf", &btf_fops, btf, O_RDONLY | O_CLOEXEC);
> +}
> +
>  int btf_new_fd(const union bpf_attr *attr)
>  {
>  	struct btf *btf;
> -	int fd;
> +	int ret;
>  
>  	btf = btf_parse(u64_to_user_ptr(attr->btf),
>  			attr->btf_size, attr->btf_log_level,
> @@ -2018,12 +2069,23 @@ int btf_new_fd(const union bpf_attr *attr)
>  	if (IS_ERR(btf))
>  		return PTR_ERR(btf);
>  
> -	fd = anon_inode_getfd("btf", &btf_fops, btf,
> -			      O_RDONLY | O_CLOEXEC);
> -	if (fd < 0)
> +	ret = btf_alloc_id(btf);
> +	if (ret) {
> +		btf_free(btf);
> +		return ret;
> +	}
> +
> +	/*
> +	 * The BTF ID is published to the userspace.
> +	 * All BTF free must go through call_rcu() from
> +	 * now on (i.e. free by calling btf_put()).
> +	 */
> +
> +	ret = __btf_new_fd(btf);
> +	if (ret < 0)
>  		btf_put(btf);
>  
> -	return fd;
> +	return ret;
>  }
>  
>  struct btf *btf_get_by_fd(int fd)
> @@ -2042,7 +2104,7 @@ struct btf *btf_get_by_fd(int fd)
>  	}
>  
>  	btf = f.file->private_data;
> -	btf_get(btf);
> +	refcount_inc(&btf->refcnt);
>  	fdput(f);
>  
>  	return btf;
> @@ -2062,3 +2124,29 @@ int btf_get_info_by_fd(const struct btf *btf,
>  
>  	return 0;
>  }
> +
> +int btf_get_fd_by_id(u32 id)
> +{
> +	struct btf *btf;
> +	int fd;
> +
> +	rcu_read_lock();
> +	btf = idr_find(&btf_idr, id);
> +	if (!btf || !refcount_inc_not_zero(&btf->refcnt))
> +		btf = ERR_PTR(-ENOENT);
> +	rcu_read_unlock();
> +
> +	if (IS_ERR(btf))
> +		return PTR_ERR(btf);
> +
> +	fd = __btf_new_fd(btf);
> +	if (fd < 0)
> +		btf_put(btf);
> +
> +	return fd;
> +}
> +
> +u32 btf_id(const struct btf *btf)
> +{
> +	return btf->id;
> +}
> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
> index 263e13ede029..8b0a45d65454 100644
> --- a/kernel/bpf/syscall.c
> +++ b/kernel/bpf/syscall.c
> @@ -252,7 +252,6 @@ static void bpf_map_free_deferred(struct work_struct *work)
>  
>  	bpf_map_uncharge_memlock(map);
>  	security_bpf_map_free(map);
> -	btf_put(map->btf);
>  	/* implementation dependent freeing */
>  	map->ops->map_free(map);
>  }
> @@ -273,6 +272,7 @@ static void __bpf_map_put(struct bpf_map *map, bool do_idr_lock)
>  	if (atomic_dec_and_test(&map->refcnt)) {
>  		/* bpf_map_free_id() must be called first */
>  		bpf_map_free_id(map, do_idr_lock);
> +		btf_put(map->btf);
>  		INIT_WORK(&map->work, bpf_map_free_deferred);
>  		schedule_work(&map->work);
>  	}
> @@ -2000,6 +2000,12 @@ static int bpf_map_get_info_by_fd(struct bpf_map *map,
>  	info.map_flags = map->map_flags;
>  	memcpy(info.name, map->name, sizeof(map->name));
>  
> +	if (map->btf) {
> +		info.btf_id = btf_id(map->btf);
> +		info.btf_key_id = map->btf_key_id;
> +		info.btf_value_id = map->btf_value_id;
> +	}
> +
>  	if (bpf_map_is_dev_bound(map)) {
>  		err = bpf_map_offload_info_fill(&info, map);
>  		if (err)
> @@ -2057,6 +2063,19 @@ static int bpf_btf_load(const union bpf_attr *attr)
>  	return btf_new_fd(attr);
>  }
>  
> +#define BPF_BTF_GET_FD_BY_ID_LAST_FIELD btf_id
> +
> +static int bpf_btf_get_fd_by_id(const union bpf_attr *attr)
> +{
> +	if (CHECK_ATTR(BPF_BTF_GET_FD_BY_ID))
> +		return -EINVAL;
> +
> +	if (!capable(CAP_SYS_ADMIN))
> +		return -EPERM;
> +
> +	return btf_get_fd_by_id(attr->btf_id);
> +}
> +
>  SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
>  {
>  	union bpf_attr attr = {};
> @@ -2140,6 +2159,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
>  	case BPF_BTF_LOAD:
>  		err = bpf_btf_load(&attr);
>  		break;
> +	case BPF_BTF_GET_FD_BY_ID:
> +		err = bpf_btf_get_fd_by_id(&attr);
> +		break;
>  	default:
>  		err = -EINVAL;
>  		break;
> -- 
> 2.9.5
> 
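For anyone who wants to poke at the new command before the tooling catches up, here is a rough, untested userspace sketch (not part of the patch) using the raw bpf(2) syscall. It assumes UAPI headers that already contain this patch's additions; map_fd is a placeholder for a map that was created with BTF (btf_fd/btf_key_id/btf_value_id passed at BPF_MAP_CREATE time), and the GET_FD_BY_ID step needs CAP_SYS_ADMIN as described above:

#include <linux/bpf.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>

static int sys_bpf(enum bpf_cmd cmd, union bpf_attr *attr, unsigned int size)
{
	return syscall(__NR_bpf, cmd, attr, size);
}

/* Hypothetical helper: given a map fd, return an fd to its BTF object. */
static int btf_fd_from_map_fd(int map_fd)
{
	struct bpf_map_info info;
	union bpf_attr attr;

	/* Step 1: read the map's info; the kernel fills in the new btf_* fields. */
	memset(&info, 0, sizeof(info));
	memset(&attr, 0, sizeof(attr));
	attr.info.bpf_fd = map_fd;
	attr.info.info_len = sizeof(info);
	attr.info.info = (__u64)(unsigned long)&info;
	if (sys_bpf(BPF_OBJ_GET_INFO_BY_FD, &attr, sizeof(attr)))
		return -1;

	if (!info.btf_id)	/* map was created without BTF */
		return -1;

	printf("btf_id=%u btf_key_id=%u btf_value_id=%u\n",
	       info.btf_id, info.btf_key_id, info.btf_value_id);

	/* Step 2: turn the ID into an fd on the same BTF object. */
	memset(&attr, 0, sizeof(attr));
	attr.btf_id = info.btf_id;
	return sys_bpf(BPF_BTF_GET_FD_BY_ID, &attr, sizeof(attr));
}

Since __btf_new_fd() installs the same btf_fops as BPF_BTF_LOAD, the returned fd should presumably behave exactly like one obtained from BPF_BTF_LOAD.
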
Acked-by: Song Liu <songliubrav...@fb.com>