On Wed, Oct 10, 2018 at 7:06 AM Mauricio Vasquez B <mauricio.vasq...@polito.it> wrote: > > The following patch implements a bpf queue/stack maps that > provides the peek/pop/push functions. There is not a direct > relationship between those functions and the current maps > syscalls, hence a new MAP_LOOKUP_AND_DELETE_ELEM syscall is added, > this is mapped to the pop operation in the queue/stack maps > and it is still to implement in other kind of maps. > > Signed-off-by: Mauricio Vasquez B <mauricio.vasq...@polito.it> > --- > include/linux/bpf.h | 1 + > include/uapi/linux/bpf.h | 1 + > kernel/bpf/syscall.c | 82 > ++++++++++++++++++++++++++++++++++++++++++++++ > 3 files changed, 84 insertions(+) > > diff --git a/include/linux/bpf.h b/include/linux/bpf.h > index 9b558713447f..5793f0c7fbb5 100644 > --- a/include/linux/bpf.h > +++ b/include/linux/bpf.h > @@ -39,6 +39,7 @@ struct bpf_map_ops { > void *(*map_lookup_elem)(struct bpf_map *map, void *key); > int (*map_update_elem)(struct bpf_map *map, void *key, void *value, > u64 flags); > int (*map_delete_elem)(struct bpf_map *map, void *key); > + void *(*map_lookup_and_delete_elem)(struct bpf_map *map, void *key); > > /* funcs called by prog_array and perf_event_array map */ > void *(*map_fd_get_ptr)(struct bpf_map *map, struct file *map_file, > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h > index f9187b41dff6..3bb94aa2d408 100644 > --- a/include/uapi/linux/bpf.h > +++ b/include/uapi/linux/bpf.h > @@ -103,6 +103,7 @@ enum bpf_cmd { > BPF_BTF_LOAD, > BPF_BTF_GET_FD_BY_ID, > BPF_TASK_FD_QUERY, > + BPF_MAP_LOOKUP_AND_DELETE_ELEM, > }; > > enum bpf_map_type { > diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c > index f36c080ad356..6907d661dea5 100644 > --- a/kernel/bpf/syscall.c > +++ b/kernel/bpf/syscall.c > @@ -980,6 +980,85 @@ static int map_get_next_key(union bpf_attr *attr) > return err; > } > > +#define BPF_MAP_LOOKUP_AND_DELETE_ELEM_LAST_FIELD value > + > +static int 
map_lookup_and_delete_elem(union bpf_attr *attr) > +{ > + void __user *ukey = u64_to_user_ptr(attr->key); > + void __user *uvalue = u64_to_user_ptr(attr->value); > + int ufd = attr->map_fd; > + struct bpf_map *map; > + void *key, *value, *ptr; > + u32 value_size; > + struct fd f; > + int err; > + > + if (CHECK_ATTR(BPF_MAP_LOOKUP_AND_DELETE_ELEM)) > + return -EINVAL; > + > + f = fdget(ufd); > + map = __bpf_map_get(f); > + if (IS_ERR(map)) > + return PTR_ERR(map); > + > + if (!(f.file->f_mode & FMODE_CAN_WRITE)) { > + err = -EPERM; > + goto err_put; > + } > + > + key = __bpf_copy_key(ukey, map->key_size); > + if (IS_ERR(key)) { > + err = PTR_ERR(key); > + goto err_put; > + } > + > + value_size = map->value_size; > + > + err = -ENOMEM; > + value = kmalloc(value_size, GFP_USER | __GFP_NOWARN); > + if (!value) > + goto free_key; > + > + err = -EFAULT; > + if (copy_from_user(value, uvalue, value_size) != 0) > + goto free_value; > + > + /* must increment bpf_prog_active to avoid kprobe+bpf triggering from > + * inside bpf map update or delete otherwise deadlocks are possible > + */ > + preempt_disable(); > + __this_cpu_inc(bpf_prog_active); > + if (map->ops->map_lookup_and_delete_elem) { > + rcu_read_lock(); > + ptr = map->ops->map_lookup_and_delete_elem(map, key); > + if (ptr) > + memcpy(value, ptr, value_size); I think we are exposed to a race condition with push and pop in parallel: map_lookup_and_delete_elem() only updates the head/tail, so it gives no protection for the buffer pointed to by ptr.
Thanks, Song > + rcu_read_unlock(); > + err = ptr ? 0 : -ENOENT; > + } else { > + err = -ENOTSUPP; > + } > + > + __this_cpu_dec(bpf_prog_active); > + preempt_enable(); > + > + if (err) > + goto free_value; > + > + if (copy_to_user(uvalue, value, value_size) != 0) > + goto free_value; > + > + err = 0; > + > +free_value: > + kfree(value); > +free_key: > + kfree(key); > +err_put: > + fdput(f); > + return err; > +} > + > static const struct bpf_prog_ops * const bpf_prog_types[] = { > #define BPF_PROG_TYPE(_id, _name) \ > [_id] = & _name ## _prog_ops, > @@ -2453,6 +2532,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, > uattr, unsigned int, siz > case BPF_TASK_FD_QUERY: > err = bpf_task_fd_query(&attr, uattr); > break; > + case BPF_MAP_LOOKUP_AND_DELETE_ELEM: > + err = map_lookup_and_delete_elem(&attr); > + break; > default: > err = -EINVAL; > break; >