On 08/06/2018 03:58 PM, Mauricio Vasquez B wrote:
> Bpf queue implements LIFO/FIFO data containers for ebpf programs.
>
> It allows pushing an element to the queue by using the update operation
> and popping an element from the queue by using the lookup operation.
>
> A use case for this is to keep track of a pool of elements, like
> network ports in a SNAT.
>
> Signed-off-by: Mauricio Vasquez B <mauricio.vasq...@polito.it>
> ---
>  include/linux/bpf_types.h |    1
>  include/uapi/linux/bpf.h  |    5 +
>  kernel/bpf/Makefile       |    2
>  kernel/bpf/queuemap.c     |  287 +++++++++++++++++++++++++++++++++++++++++++++
>  kernel/bpf/syscall.c      |   61 +++++++---
>  kernel/bpf/verifier.c     |   16 ++-
>  6 files changed, 353 insertions(+), 19 deletions(-)
>  create mode 100644 kernel/bpf/queuemap.c
>
> diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
> index c5700c2d5549..6c7a62f3fe43 100644
> --- a/include/linux/bpf_types.h
> +++ b/include/linux/bpf_types.h
> @@ -58,3 +58,4 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_CPUMAP, cpu_map_ops)
>  BPF_MAP_TYPE(BPF_MAP_TYPE_XSKMAP, xsk_map_ops)
>  #endif
>  #endif
> +BPF_MAP_TYPE(BPF_MAP_TYPE_QUEUE, queue_map_ops)
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index 0ebaaf7f3568..2c171c40eb45 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -120,6 +120,7 @@ enum bpf_map_type {
>  	BPF_MAP_TYPE_CPUMAP,
>  	BPF_MAP_TYPE_XSKMAP,
>  	BPF_MAP_TYPE_SOCKHASH,
> +	BPF_MAP_TYPE_QUEUE,
>  };
>
>  enum bpf_prog_type {
> @@ -255,6 +256,10 @@ enum bpf_attach_type {
>  /* Flag for stack_map, store build_id+offset instead of pointer */
>  #define BPF_F_STACK_BUILD_ID	(1U << 5)
>
> +/* Flags for queue_map, type of queue */
> +#define BPF_F_QUEUE_FIFO	(1U << 16)
> +#define BPF_F_QUEUE_LIFO	(2U << 16)
> +
>  enum bpf_stack_build_id_status {
>  	/* user space need an empty entry to identify end of a trace */
>  	BPF_STACK_BUILD_ID_EMPTY = 0,
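
For reference, creating such a map from user space would look roughly
like this (untested sketch; sys_bpf() and create_port_pool() are
made-up names, the former being a hypothetical thin wrapper around the
bpf(2) syscall):

#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/bpf.h>

/* hypothetical thin wrapper around the bpf(2) syscall */
static int sys_bpf(enum bpf_cmd cmd, union bpf_attr *attr, unsigned int size)
{
	return syscall(__NR_bpf, cmd, attr, size);
}

static int create_port_pool(void)
{
	union bpf_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.map_type    = BPF_MAP_TYPE_QUEUE;
	attr.key_size    = 0;			/* queues have no key */
	attr.value_size  = sizeof(__u16);	/* e.g. a SNAT port */
	attr.max_entries = 1024;
	attr.map_flags   = BPF_F_QUEUE_FIFO;	/* or BPF_F_QUEUE_LIFO */

	return sys_bpf(BPF_MAP_CREATE, &attr, sizeof(attr));
}

Note that key_size must be 0 here, which the alloc check below
enforces.
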
> diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
> index f27f5496d6fe..30f02ef66635 100644
> --- a/kernel/bpf/Makefile
> +++ b/kernel/bpf/Makefile
> @@ -2,7 +2,7 @@
>  obj-y := core.o
>
>  obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o
> -obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o
> +obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o queuemap.o
>  obj-$(CONFIG_BPF_SYSCALL) += disasm.o
>  obj-$(CONFIG_BPF_SYSCALL) += btf.o
>  ifeq ($(CONFIG_NET),y)
> diff --git a/kernel/bpf/queuemap.c b/kernel/bpf/queuemap.c
> new file mode 100644
> index 000000000000..ab30af43b4cc
> --- /dev/null
> +++ b/kernel/bpf/queuemap.c
> @@ -0,0 +1,287 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * queuemap.c: BPF queue map
> + *
> + * Copyright (c) 2018 Politecnico di Torino
> + */
> +#include <linux/bpf.h>
> +#include <linux/rculist.h>
> +#include <linux/slab.h>
> +#include "percpu_freelist.h"
> +
> +#define QUEUE_CREATE_FLAG_MASK \
> +	(BPF_F_NO_PREALLOC | BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY | \
> +	 BPF_F_QUEUE_FIFO | BPF_F_QUEUE_LIFO)
> +
> +enum queue_type {
> +	QUEUE_FIFO = (BPF_F_QUEUE_FIFO >> 16),
> +	QUEUE_LIFO = (BPF_F_QUEUE_LIFO >> 16),
> +};
> +
> +struct bpf_queue {
> +	struct bpf_map map;
> +	struct list_head head;
> +	struct pcpu_freelist freelist;
> +	void *nodes;
> +	enum queue_type type;
> +	raw_spinlock_t lock;
> +	atomic_t count;
> +	u32 node_size;
> +};
> +
> +struct queue_node {
> +	struct pcpu_freelist_node fnode;
> +	struct bpf_queue *queue;
> +	struct list_head list;
> +	struct rcu_head rcu;
> +	char element[0] __aligned(8);
> +};
> +
> +static bool queue_map_is_prealloc(struct bpf_queue *queue)
> +{
> +	return !(queue->map.map_flags & BPF_F_NO_PREALLOC);
> +}
> +
> +/* Called from syscall */
> +static int queue_map_alloc_check(union bpf_attr *attr)
> +{
> +	/* check sanity of attributes */
> +	if (attr->max_entries == 0 || attr->key_size != 0 ||
> +	    attr->value_size == 0 || attr->map_flags & ~QUEUE_CREATE_FLAG_MASK)
> +		return -EINVAL;
> +
> +	if ((attr->map_flags >> 16) != QUEUE_FIFO &&
> +	    (attr->map_flags >> 16) != QUEUE_LIFO) {
> +		return -EINVAL;
> +	}
> +
> +	if (attr->value_size > KMALLOC_MAX_SIZE)
> +		/* if value_size is bigger, the user space won't be able to
> +		 * access the elements.
> +		 */
> +		return -E2BIG;
> +
> +	return 0;
> +}
> +
> +static int prealloc_init(struct bpf_queue *queue)
> +{
> +	u32 node_size = sizeof(struct queue_node) +
> +			round_up(queue->map.value_size, 8);
> +	u32 num_entries = queue->map.max_entries;
> +	int err;
> +
> +	queue->nodes = bpf_map_area_alloc(node_size * num_entries,
> +					  queue->map.numa_node);
> +	if (!queue->nodes)
> +		return -ENOMEM;
> +
> +	err = pcpu_freelist_init(&queue->freelist);
> +	if (err)
> +		goto free_nodes;
> +
> +	pcpu_freelist_populate(&queue->freelist,
> +			       queue->nodes +
> +			       offsetof(struct queue_node, fnode),
> +			       node_size, num_entries);
> +
> +	return 0;
> +
> +free_nodes:
> +	bpf_map_area_free(queue->nodes);
> +	return err;
> +}
> +
> +static void prealloc_destroy(struct bpf_queue *queue)
> +{
> +	bpf_map_area_free(queue->nodes);
> +	pcpu_freelist_destroy(&queue->freelist);
> +}
> +
> +static struct bpf_map *queue_map_alloc(union bpf_attr *attr)
> +{
> +	struct bpf_queue *queue;
> +	u64 cost = sizeof(*queue);
> +	int ret;
> +
> +	queue = kzalloc(sizeof(*queue), GFP_USER);
> +	if (!queue)
> +		return ERR_PTR(-ENOMEM);
> +
> +	bpf_map_init_from_attr(&queue->map, attr);
> +
> +	queue->node_size = sizeof(struct queue_node) +
> +			   round_up(attr->value_size, 8);
> +	cost += (u64) attr->max_entries * queue->node_size;
> +	if (cost >= U32_MAX - PAGE_SIZE) {
> +		ret = -E2BIG;
> +		goto free_queue;
> +	}
> +
> +	queue->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
> +
> +	ret = bpf_map_precharge_memlock(queue->map.pages);
> +	if (ret)
> +		goto free_queue;
> +
> +	INIT_LIST_HEAD(&queue->head);
> +
> +	raw_spin_lock_init(&queue->lock);
> +
> +	queue->type = attr->map_flags >> 16;
> +
> +	if (queue_map_is_prealloc(queue))
> +		ret = prealloc_init(queue);
> +	if (ret)
> +		goto free_queue;
> +
> +	return &queue->map;
> +
> +free_queue:
> +	kfree(queue);
> +	return ERR_PTR(ret);
> +}
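
To make the accounting in queue_map_alloc() concrete, a quick worked
example (the struct sizes are assumptions for a 64-bit build; only the
arithmetic is the point). With value_size == 4 and max_entries == 1024:

  node_size = sizeof(struct queue_node) + round_up(4, 8)
            = 48 + 8 = 56 bytes per element
  cost      = sizeof(*queue) + 1024 * 56 = sizeof(*queue) + 57344
  pages     = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT
            = 15 pages of 4 KiB, assuming sizeof(*queue) < 4 KiB

where sizeof(struct queue_node) == 48 is assumed (fnode 8 + queue 8 +
list 16 + rcu 16).
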
> +
> +/* Called when map->refcnt goes to zero, either from workqueue or from syscall */
> +static void queue_map_free(struct bpf_map *map)
> +{
> +	struct bpf_queue *queue = container_of(map, struct bpf_queue, map);
> +	struct queue_node *l;
> +
> +	/* at this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
> +	 * so the programs (can be more than one that used this map) were
> +	 * disconnected from events. Wait for outstanding critical sections in
> +	 * these programs to complete
> +	 */
> +	synchronize_rcu();
> +
> +	/* some of queue_elem_free_rcu() callbacks for elements of this map may
> +	 * not have executed. Wait for them.
> +	 */
> +	rcu_barrier();
> +	if (!queue_map_is_prealloc(queue))
> +		list_for_each_entry_rcu(l, &queue->head, list) {
> +			list_del_rcu(&l->list);
> +			kfree(l);
> +		}
> +	else
> +		prealloc_destroy(queue);
> +	kfree(queue);
> +}
> +
> +static void queue_elem_free_rcu(struct rcu_head *head)
> +{
> +	struct queue_node *l = container_of(head, struct queue_node, rcu);
> +	struct bpf_queue *queue = l->queue;
> +
> +	/* must increment bpf_prog_active to avoid kprobe+bpf triggering while
> +	 * we're calling kfree, otherwise deadlock is possible if kprobes
> +	 * are placed somewhere inside of slub
> +	 */
> +	preempt_disable();
> +	__this_cpu_inc(bpf_prog_active);
> +	if (queue_map_is_prealloc(queue))
> +		pcpu_freelist_push(&queue->freelist, &l->fnode);
> +	else
> +		kfree(l);
> +	__this_cpu_dec(bpf_prog_active);
> +	preempt_enable();
> +}
> +
> +/* Called from syscall or from eBPF program */
> +static void *queue_map_lookup_elem(struct bpf_map *map, void *key)
> +{
> +	struct bpf_queue *queue = container_of(map, struct bpf_queue, map);
> +	unsigned long flags;
> +	struct queue_node *node;
> +
> +	raw_spin_lock_irqsave(&queue->lock, flags);

I think a per-cpu flavor of this map type would be very useful here as
well: it wouldn't need the lock, only the guarantee that preemption is
disabled, and a program could use it as a temporary store, for example.

> +	node = list_first_or_null_rcu(&queue->head, struct queue_node, list);
> +	if (!node) {
> +		raw_spin_unlock_irqrestore(&queue->lock, flags);
> +		return NULL;
> +	}
> +
> +	if (!queue_map_is_prealloc(queue))
> +		atomic_dec(&queue->count);
> +
> +	list_del_rcu(&node->list);
> +	call_rcu(&node->rcu, queue_elem_free_rcu);
> +
> +	raw_spin_unlock_irqrestore(&queue->lock, flags);
> +
> +	return &node->element;
> +}
> +
> +/* Called from syscall or from eBPF program */
> +static int queue_map_update_elem(struct bpf_map *map, void *key, void *value,
> +				 u64 map_flags)
> +{
> +	struct bpf_queue *queue = container_of(map, struct bpf_queue, map);
> +	unsigned long flags;
> +	struct queue_node *new;

Should reject invalid map update flags here.

> +	if (!queue_map_is_prealloc(queue)) {
> +		if (atomic_inc_return(&queue->count) > queue->map.max_entries) {
> +			atomic_dec(&queue->count);
> +			return -E2BIG;
> +		}
> +
> +		new = kmalloc_node(queue->node_size, GFP_ATOMIC | __GFP_NOWARN,
> +				   queue->map.numa_node);
> +		if (!new) {
> +			atomic_dec(&queue->count);
> +			return -ENOMEM;
> +		}
> +	} else {
> +		struct pcpu_freelist_node *l;
> +
> +		l = pcpu_freelist_pop(&queue->freelist);
> +		if (!l)
> +			return -E2BIG;
> +		new = container_of(l, struct queue_node, fnode);
> +	}
> +
> +	memcpy(new->element, value, queue->map.value_size);
> +	new->queue = queue;
> +
> +	raw_spin_lock_irqsave(&queue->lock, flags);
> +	switch (queue->type) {
> +	case QUEUE_FIFO:
> +		list_add_tail_rcu(&new->list, &queue->head);
> +		break;
> +
> +	case QUEUE_LIFO:
> +		list_add_rcu(&new->list, &queue->head);
> +		break;
> +	}
> +
> +	raw_spin_unlock_irqrestore(&queue->lock, flags);
> +
> +	return 0;
> +}
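
From the BPF program side, the SNAT port pool use case from the commit
message would then look something like this (hypothetical program
against the usual selftests bpf_helpers.h definitions; it relies on a
NULL key being accepted, which is what the verifier hunk below is
about):

#include <linux/bpf.h>
#include "bpf_helpers.h"

struct bpf_map_def SEC("maps") port_pool = {
	.type        = BPF_MAP_TYPE_QUEUE,
	.key_size    = 0,
	.value_size  = sizeof(__u16),
	.max_entries = 1024,
	.map_flags   = BPF_F_QUEUE_FIFO,
};

SEC("xdp")
int snat_alloc_port(struct xdp_md *ctx)
{
	__u16 *port;

	/* pop: a lookup with a NULL key returns the head element */
	port = bpf_map_lookup_elem(&port_pool, NULL);
	if (!port)
		return XDP_DROP;	/* pool exhausted */

	/* ... rewrite the packet to use source port *port ... */

	/* push: when the mapping expires, the port would be returned
	 * with bpf_map_update_elem(&port_pool, NULL, port, 0)
	 */
	return XDP_PASS;
}
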
> +
> +/* Called from syscall or from eBPF program */
> +static int queue_map_delete_elem(struct bpf_map *map, void *key)
> +{
> +	return -EINVAL;
> +}
> +
> +/* Called from syscall */
> +static int queue_map_get_next_key(struct bpf_map *map, void *key,
> +				  void *next_key)
> +{
> +	return -EINVAL;
> +}
> +
> +const struct bpf_map_ops queue_map_ops = {
> +	.map_alloc_check = queue_map_alloc_check,
> +	.map_alloc = queue_map_alloc,
> +	.map_free = queue_map_free,
> +	.map_lookup_elem = queue_map_lookup_elem,
> +	.map_update_elem = queue_map_update_elem,
> +	.map_delete_elem = queue_map_delete_elem,
> +	.map_get_next_key = queue_map_get_next_key,
> +};
> +
> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
> index a31a1ba0f8ea..7e9a11d69eef 100644
> --- a/kernel/bpf/syscall.c
> +++ b/kernel/bpf/syscall.c
> @@ -622,11 +622,19 @@ static int map_lookup_elem(union bpf_attr *attr)
>  		err = -EPERM;
>  		goto err_put;
>  	}
> +	if (map->map_type != BPF_MAP_TYPE_QUEUE) {
> +		key = memdup_user(ukey, map->key_size);
> +		if (IS_ERR(key)) {
> +			err = PTR_ERR(key);
> +			goto err_put;
> +		}
> +	} else {
> +		if (ukey) {
> +			err = -EINVAL;
> +			goto err_put;
> +		}

Given you have a fixed key_size of 0, this could probably be made more
generic and refactored a bit into a helper that checks for a zero
map->key_size instead of the map type.
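
Something like this, say (untested sketch; map_key_from_user() is a
made-up name):

/* returns the kernel copy of the key, or NULL for maps with
 * key_size == 0, which must then pass a NULL user pointer
 */
static void *map_key_from_user(struct bpf_map *map, void __user *ukey)
{
	if (!map->key_size)
		return ukey ? ERR_PTR(-EINVAL) : NULL;

	return memdup_user(ukey, map->key_size);
}

Each of the syscall paths could then do key = map_key_from_user(map,
ukey) followed by a single IS_ERR() check, without hard-coding
BPF_MAP_TYPE_QUEUE anywhere.
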
> -	key = memdup_user(ukey, map->key_size);
> -	if (IS_ERR(key)) {
> -		err = PTR_ERR(key);
> -		goto err_put;
> +		key = NULL;
>  	}
>
>  	if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
> @@ -709,10 +717,19 @@ static int map_update_elem(union bpf_attr *attr)
>  		goto err_put;
>  	}
>
> -	key = memdup_user(ukey, map->key_size);
> -	if (IS_ERR(key)) {
> -		err = PTR_ERR(key);
> -		goto err_put;
> +	if (map->map_type != BPF_MAP_TYPE_QUEUE) {
> +		key = memdup_user(ukey, map->key_size);
> +		if (IS_ERR(key)) {
> +			err = PTR_ERR(key);
> +			goto err_put;
> +		}
> +	} else {
> +		if (ukey) {
> +			err = -EINVAL;
> +			goto err_put;
> +		}
> +
> +		key = NULL;
>  	}

Ditto, and the same applies further below for the other syscall cases.

>
>  	if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
> @@ -803,10 +820,19 @@ static int map_delete_elem(union bpf_attr *attr)
>  		goto err_put;
>  	}
>
> -	key = memdup_user(ukey, map->key_size);
> -	if (IS_ERR(key)) {
> -		err = PTR_ERR(key);
> -		goto err_put;
> +	if (map->map_type != BPF_MAP_TYPE_QUEUE) {
> +		key = memdup_user(ukey, map->key_size);
> +		if (IS_ERR(key)) {
> +			err = PTR_ERR(key);
> +			goto err_put;
> +		}
> +	} else {
> +		if (ukey) {
> +			err = -EINVAL;
> +			goto err_put;
> +		}
> +
> +		key = NULL;
>  	}
>
>  	if (bpf_map_is_dev_bound(map)) {
> @@ -855,9 +881,14 @@ static int map_get_next_key(union bpf_attr *attr)
>  	}
>
>  	if (ukey) {
> -		key = memdup_user(ukey, map->key_size);
> -		if (IS_ERR(key)) {
> -			err = PTR_ERR(key);
> +		if (map->map_type != BPF_MAP_TYPE_QUEUE) {
> +			key = memdup_user(ukey, map->key_size);
> +			if (IS_ERR(key)) {
> +				err = PTR_ERR(key);
> +				goto err_put;
> +			}
> +		} else {
> +			err = -EINVAL;
>  			goto err_put;
>  		}
>  	} else {
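
For completeness, popping an element from user space then looks like
this (sketch; pop_element() is a made-up name and it reuses the
hypothetical sys_bpf() wrapper from above):

static int pop_element(int fd, __u16 *val)
{
	union bpf_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.map_fd = fd;
	attr.key    = 0;	/* must be NULL since key_size == 0 */
	attr.value  = (__u64)(unsigned long)val;

	return sys_bpf(BPF_MAP_LOOKUP_ELEM, &attr, sizeof(attr));
}

A push is the same with BPF_MAP_UPDATE_ELEM and attr.value pointing at
the element to queue.
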
> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> index 25e47c195874..83099a9a21d9 100644
> --- a/kernel/bpf/verifier.c
> +++ b/kernel/bpf/verifier.c
> @@ -1976,8 +1976,12 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
>  		return -EACCES;
>  	}
>
> -	if (arg_type == ARG_PTR_TO_MAP_KEY ||
> -	    arg_type == ARG_PTR_TO_MAP_VALUE) {
> +	if (arg_type == ARG_PTR_TO_MAP_KEY) {
> +		expected_type = PTR_TO_STACK;
> +		if (!type_is_pkt_pointer(type) && type != PTR_TO_MAP_VALUE &&
> +		    type != expected_type && type != SCALAR_VALUE)
> +			goto err_type;
> +	} else if (arg_type == ARG_PTR_TO_MAP_VALUE) {
>  		expected_type = PTR_TO_STACK;
>  		if (!type_is_pkt_pointer(type) && type != PTR_TO_MAP_VALUE &&
>  		    type != expected_type)
> @@ -2021,6 +2025,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
>  		/* bpf_map_xxx(map_ptr) call: remember that map_ptr */
>  		meta->map_ptr = reg->map_ptr;
>  	} else if (arg_type == ARG_PTR_TO_MAP_KEY) {
> +		bool zero_size_allowed = false;
>  		/* bpf_map_xxx(..., map_ptr, ..., key) call:
>  		 * check that [key, key + map->key_size) are within
>  		 * stack limits and initialized
>  		 */
> @@ -2034,8 +2039,13 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
>  			verbose(env, "invalid map_ptr to access map->key\n");
>  			return -EACCES;
>  		}
> +
> +		if (meta->map_ptr->map_type == BPF_MAP_TYPE_QUEUE)

Here as well.

> +			zero_size_allowed = true;

Also, the verifier should rather enforce a const NULL key here than
merely allow one to be passed (but not as the only 'valid' input
option); meaning, I don't think it would be good to allow a zero-sized
pointer to stack in this case.

>  		err = check_helper_mem_access(env, regno,
> -					      meta->map_ptr->key_size, false,
> +					      meta->map_ptr->key_size,
> +					      zero_size_allowed,
>  					      NULL);
>  	} else if (arg_type == ARG_PTR_TO_MAP_VALUE) {
>  		/* bpf_map_xxx(..., map_ptr, ..., value) call:
>
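
To expand on the const NULL point: rather than widening the accepted
types and passing zero_size_allowed, the key argument could be required
to be a register known to be zero whenever map->key_size == 0, along
the lines of (rough sketch, untested; register_is_const_null() is a
made-up name):

static bool register_is_const_null(const struct bpf_reg_state *reg)
{
	/* SCALAR_VALUE whose tnum proves it is exactly 0, i.e. NULL */
	return reg->type == SCALAR_VALUE &&
	       tnum_equals_const(reg->var_off, 0);
}

The ARG_PTR_TO_MAP_KEY branch would then reject everything but such a
register for zero-sized keys, instead of calling
check_helper_mem_access() with a zero size allowed.
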