On 08/06/2018 03:58 PM, Mauricio Vasquez B wrote:
> Bpf queue implements LIFO/FIFO data containers for eBPF programs.
> 
> It allows pushing an element to the queue using the update operation
> and popping an element from the queue using the lookup operation.
> 
> A use case for this is to keep track of a pool of elements, like
> network ports in a SNAT.
> 
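
For context, using it from an eBPF program would then look roughly like the
sketch below (map/section names and the SNAT details are made up for
illustration; as described above, push goes through bpf_map_update_elem()
with a NULL key and pop through bpf_map_lookup_elem()):

#include <linux/bpf.h>
#include <linux/pkt_cls.h>
#include "bpf_helpers.h"

struct bpf_map_def SEC("maps") port_pool = {
	.type        = BPF_MAP_TYPE_QUEUE,
	.key_size    = 0,
	.value_size  = sizeof(__u16),
	.max_entries = 1024,
	.map_flags   = BPF_F_QUEUE_FIFO,
};

SEC("classifier")
int snat_pick_port(struct __sk_buff *skb)
{
	__u16 *port, used;

	/* pop: lookup with a NULL key returns the next element, or NULL */
	port = bpf_map_lookup_elem(&port_pool, NULL);
	if (!port)
		return TC_ACT_SHOT;	/* pool exhausted */
	used = *port;

	/* ... rewrite the packet to use 'used' as source port ... */

	/* push: update with a NULL key returns the port to the pool */
	bpf_map_update_elem(&port_pool, NULL, &used, BPF_ANY);
	return TC_ACT_OK;
}
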
> Signed-off-by: Mauricio Vasquez B <mauricio.vasq...@polito.it>
> ---
>  include/linux/bpf_types.h |    1 
>  include/uapi/linux/bpf.h  |    5 +
>  kernel/bpf/Makefile       |    2 
>  kernel/bpf/queuemap.c     |  287 +++++++++++++++++++++++++++++++++++++++++++++
>  kernel/bpf/syscall.c      |   61 +++++++---
>  kernel/bpf/verifier.c     |   16 ++-
>  6 files changed, 353 insertions(+), 19 deletions(-)
>  create mode 100644 kernel/bpf/queuemap.c
> 
> diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
> index c5700c2d5549..6c7a62f3fe43 100644
> --- a/include/linux/bpf_types.h
> +++ b/include/linux/bpf_types.h
> @@ -58,3 +58,4 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_CPUMAP, cpu_map_ops)
>  BPF_MAP_TYPE(BPF_MAP_TYPE_XSKMAP, xsk_map_ops)
>  #endif
>  #endif
> +BPF_MAP_TYPE(BPF_MAP_TYPE_QUEUE, queue_map_ops)
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index 0ebaaf7f3568..2c171c40eb45 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -120,6 +120,7 @@ enum bpf_map_type {
>       BPF_MAP_TYPE_CPUMAP,
>       BPF_MAP_TYPE_XSKMAP,
>       BPF_MAP_TYPE_SOCKHASH,
> +     BPF_MAP_TYPE_QUEUE,
>  };
>  
>  enum bpf_prog_type {
> @@ -255,6 +256,10 @@ enum bpf_attach_type {
>  /* Flag for stack_map, store build_id+offset instead of pointer */
>  #define BPF_F_STACK_BUILD_ID (1U << 5)
>  
> +/* Flags for queue_map, type of queue */
> +#define BPF_F_QUEUE_FIFO     (1U << 16)
> +#define BPF_F_QUEUE_LIFO     (2U << 16)
> +
>  enum bpf_stack_build_id_status {
>       /* user space need an empty entry to identify end of a trace */
>       BPF_STACK_BUILD_ID_EMPTY = 0,
> diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
> index f27f5496d6fe..30f02ef66635 100644
> --- a/kernel/bpf/Makefile
> +++ b/kernel/bpf/Makefile
> @@ -2,7 +2,7 @@
>  obj-y := core.o
>  
>  obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o
> -obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o
> +obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o queuemap.o
>  obj-$(CONFIG_BPF_SYSCALL) += disasm.o
>  obj-$(CONFIG_BPF_SYSCALL) += btf.o
>  ifeq ($(CONFIG_NET),y)
> diff --git a/kernel/bpf/queuemap.c b/kernel/bpf/queuemap.c
> new file mode 100644
> index 000000000000..ab30af43b4cc
> --- /dev/null
> +++ b/kernel/bpf/queuemap.c
> @@ -0,0 +1,287 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * queuemap.c: BPF queue map
> + *
> + * Copyright (c) 2018 Politecnico di Torino
> + */
> +#include <linux/bpf.h>
> +#include <linux/rculist.h>
> +#include <linux/slab.h>
> +#include "percpu_freelist.h"
> +
> +#define QUEUE_CREATE_FLAG_MASK \
> +     (BPF_F_NO_PREALLOC | BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY | \
> +      BPF_F_QUEUE_FIFO | BPF_F_QUEUE_LIFO)
> +
> +enum queue_type {
> +     QUEUE_FIFO = (BPF_F_QUEUE_FIFO >> 16),
> +     QUEUE_LIFO = (BPF_F_QUEUE_LIFO >> 16),
> +};
> +
> +struct bpf_queue {
> +     struct bpf_map map;
> +     struct list_head head;
> +     struct pcpu_freelist freelist;
> +     void *nodes;
> +     enum queue_type type;
> +     raw_spinlock_t lock;
> +     atomic_t count;
> +     u32 node_size;
> +};
> +
> +struct queue_node {
> +     struct pcpu_freelist_node fnode;
> +     struct bpf_queue *queue;
> +     struct list_head list;
> +     struct rcu_head rcu;
> +     char element[0] __aligned(8);
> +};
> +
> +static bool queue_map_is_prealloc(struct bpf_queue *queue)
> +{
> +     return !(queue->map.map_flags & BPF_F_NO_PREALLOC);
> +}
> +
> +/* Called from syscall */
> +static int queue_map_alloc_check(union bpf_attr *attr)
> +{
> +     /* check sanity of attributes */
> +     if (attr->max_entries == 0 || attr->key_size != 0 ||
> +         attr->value_size == 0 || attr->map_flags & ~QUEUE_CREATE_FLAG_MASK)
> +             return -EINVAL;
> +
> +     if ((attr->map_flags >> 16) != QUEUE_FIFO &&
> +         (attr->map_flags >> 16) != QUEUE_LIFO) {
> +             return -EINVAL;
> +     }
> +
> +     if (attr->value_size > KMALLOC_MAX_SIZE)
> +             /* if value_size is bigger, the user space won't be able to
> +              * access the elements.
> +              */
> +             return -E2BIG;
> +
> +     return 0;
> +}
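
To illustrate the creation-time constraints checked here: from user space the
map would presumably be created along these lines (key_size must be 0 and one
of the two queue type flags must be set; the wrapper name is made up):

#include <unistd.h>
#include <sys/syscall.h>
#include <linux/bpf.h>

static int queue_create(unsigned int value_size, unsigned int max_entries)
{
	union bpf_attr attr = {};

	attr.map_type    = BPF_MAP_TYPE_QUEUE;
	attr.key_size    = 0;			/* enforced above */
	attr.value_size  = value_size;
	attr.max_entries = max_entries;
	attr.map_flags   = BPF_F_QUEUE_FIFO;	/* or BPF_F_QUEUE_LIFO */

	return syscall(__NR_bpf, BPF_MAP_CREATE, &attr, sizeof(attr));
}
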
> +
> +static int prealloc_init(struct bpf_queue *queue)
> +{
> +     u32 node_size = sizeof(struct queue_node) +
> +                     round_up(queue->map.value_size, 8);
> +     u32 num_entries = queue->map.max_entries;
> +     int err;
> +
> +     queue->nodes = bpf_map_area_alloc(node_size * num_entries,
> +                                       queue->map.numa_node);
> +     if (!queue->nodes)
> +             return -ENOMEM;
> +
> +     err = pcpu_freelist_init(&queue->freelist);
> +     if (err)
> +             goto free_nodes;
> +
> +     pcpu_freelist_populate(&queue->freelist,
> +                            queue->nodes +
> +                            offsetof(struct queue_node, fnode),
> +                            node_size, num_entries);
> +
> +     return 0;
> +
> +free_nodes:
> +     bpf_map_area_free(queue->nodes);
> +     return err;
> +}
> +
> +static void prealloc_destroy(struct bpf_queue *queue)
> +{
> +     bpf_map_area_free(queue->nodes);
> +     pcpu_freelist_destroy(&queue->freelist);
> +}
> +
> +static struct bpf_map *queue_map_alloc(union bpf_attr *attr)
> +{
> +     struct bpf_queue *queue;
> +     u64 cost = sizeof(*queue);
> +     int ret;
> +
> +     queue = kzalloc(sizeof(*queue), GFP_USER);
> +     if (!queue)
> +             return ERR_PTR(-ENOMEM);
> +
> +     bpf_map_init_from_attr(&queue->map, attr);
> +
> +     queue->node_size = sizeof(struct queue_node) +
> +                        round_up(attr->value_size, 8);
> +     cost += (u64) attr->max_entries * queue->node_size;
> +     if (cost >= U32_MAX - PAGE_SIZE) {
> +             ret = -E2BIG;
> +             goto free_queue;
> +     }
> +
> +     queue->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
> +
> +     ret = bpf_map_precharge_memlock(queue->map.pages);
> +     if (ret)
> +             goto free_queue;
> +
> +     INIT_LIST_HEAD(&queue->head);
> +
> +     raw_spin_lock_init(&queue->lock);
> +
> +     queue->type = attr->map_flags >> 16;
> +
> +     if (queue_map_is_prealloc(queue)) {
> +             ret = prealloc_init(queue);
> +             if (ret)
> +                     goto free_queue;
> +     }
> +
> +     return &queue->map;
> +
> +free_queue:
> +     kfree(queue);
> +     return ERR_PTR(ret);
> +}
> +
> +/* Called when map->refcnt goes to zero, either from workqueue or from syscall */
> +static void queue_map_free(struct bpf_map *map)
> +{
> +     struct bpf_queue *queue = container_of(map, struct bpf_queue, map);
> +     struct queue_node *l;
> +
> +     /* at this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
> +      * so the programs (can be more than one that used this map) were
> +      * disconnected from events. Wait for outstanding critical sections in
> +      * these programs to complete
> +      */
> +     synchronize_rcu();
> +
> +     /* some of queue_elem_free_rcu() callbacks for elements of this map may
> +      * not have executed. Wait for them.
> +      */
> +     rcu_barrier();
> +     if (!queue_map_is_prealloc(queue))
> +             list_for_each_entry_rcu(l, &queue->head, list) {
> +                     list_del_rcu(&l->list);
> +                     kfree(l);
> +             }
> +     else
> +             prealloc_destroy(queue);
> +     kfree(queue);
> +}
> +
> +static void queue_elem_free_rcu(struct rcu_head *head)
> +{
> +     struct queue_node *l = container_of(head, struct queue_node, rcu);
> +     struct bpf_queue *queue = l->queue;
> +
> +     /* must increment bpf_prog_active to avoid kprobe+bpf triggering while
> +      * we're calling kfree, otherwise deadlock is possible if kprobes
> +      * are placed somewhere inside of slub
> +      */
> +     preempt_disable();
> +     __this_cpu_inc(bpf_prog_active);
> +     if (queue_map_is_prealloc(queue))
> +             pcpu_freelist_push(&queue->freelist, &l->fnode);
> +     else
> +             kfree(l);
> +     __this_cpu_dec(bpf_prog_active);
> +     preempt_enable();
> +}
> +
> +/* Called from syscall or from eBPF program */
> +static void *queue_map_lookup_elem(struct bpf_map *map, void *key)
> +{
> +     struct bpf_queue *queue = container_of(map, struct bpf_queue, map);
> +     unsigned long flags;
> +     struct queue_node *node;
> +
> +     raw_spin_lock_irqsave(&queue->lock, flags);

I think a per-cpu flavor of this map type would be very useful here as well:
we wouldn't need a lock, only a guarantee that preemption is disabled, and it
could be used as a temporary store for the program, for example.

> +     node = list_first_or_null_rcu(&queue->head, struct queue_node, list);
> +     if (!node) {
> +             raw_spin_unlock_irqrestore(&queue->lock, flags);
> +             return NULL;
> +     }
> +
> +     if (!queue_map_is_prealloc(queue))
> +             atomic_dec(&queue->count);
> +
> +     list_del_rcu(&node->list);
> +     call_rcu(&node->rcu, queue_elem_free_rcu);
> +
> +     raw_spin_unlock_irqrestore(&queue->lock, flags);
> +
> +     return &node->element;
> +}
> +
> +/* Called from syscall or from eBPF program */
> +static int queue_map_update_elem(struct bpf_map *map, void *key, void *value,
> +                              u64 map_flags)
> +{
> +     struct bpf_queue *queue = container_of(map, struct bpf_queue, map);
> +     unsigned long flags;
> +     struct queue_node *new;

Should reject invalid map update flags here.
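E.g. roughly along the lines of what htab/arraymap already do:

	if (unlikely(map_flags > BPF_EXIST))
		/* unknown flags */
		return -EINVAL;

(and arguably BPF_NOEXIST / BPF_EXIST don't mean much for a queue either.)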

> +     if (!queue_map_is_prealloc(queue)) {
> +             if (atomic_inc_return(&queue->count) > queue->map.max_entries) {
> +                     atomic_dec(&queue->count);
> +                     return -E2BIG;
> +             }
> +
> +             new = kmalloc_node(queue->node_size, GFP_ATOMIC | __GFP_NOWARN,
> +                                queue->map.numa_node);
> +             if (!new) {
> +                     atomic_dec(&queue->count);
> +                     return -ENOMEM;
> +             }
> +     } else {
> +             struct pcpu_freelist_node *l;
> +
> +             l = pcpu_freelist_pop(&queue->freelist);
> +             if (!l)
> +                     return -E2BIG;
> +             new = container_of(l, struct queue_node, fnode);
> +     }
> +
> +     memcpy(new->element, value, queue->map.value_size);
> +     new->queue = queue;
> +
> +     raw_spin_lock_irqsave(&queue->lock, flags);
> +     switch (queue->type) {
> +     case QUEUE_FIFO:
> +             list_add_tail_rcu(&new->list, &queue->head);
> +             break;
> +
> +     case QUEUE_LIFO:
> +             list_add_rcu(&new->list, &queue->head);
> +             break;
> +     }
> +
> +     raw_spin_unlock_irqrestore(&queue->lock, flags);
> +
> +     return 0;
> +}
> +
> +/* Called from syscall or from eBPF program */
> +static int queue_map_delete_elem(struct bpf_map *map, void *key)
> +{
> +     return -EINVAL;
> +}
> +
> +/* Called from syscall */
> +static int queue_map_get_next_key(struct bpf_map *map, void *key,
> +                               void *next_key)
> +{
> +     return -EINVAL;
> +}
> +
> +const struct bpf_map_ops queue_map_ops = {
> +     .map_alloc_check = queue_map_alloc_check,
> +     .map_alloc = queue_map_alloc,
> +     .map_free = queue_map_free,
> +     .map_lookup_elem = queue_map_lookup_elem,
> +     .map_update_elem = queue_map_update_elem,
> +     .map_delete_elem = queue_map_delete_elem,
> +     .map_get_next_key = queue_map_get_next_key,
> +};
> +
> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
> index a31a1ba0f8ea..7e9a11d69eef 100644
> --- a/kernel/bpf/syscall.c
> +++ b/kernel/bpf/syscall.c
> @@ -622,11 +622,19 @@ static int map_lookup_elem(union bpf_attr *attr)
>               err = -EPERM;
>               goto err_put;
>       }
> +     if (map->map_type != BPF_MAP_TYPE_QUEUE) {
> +             key = memdup_user(ukey, map->key_size);
> +             if (IS_ERR(key)) {
> +                     err = PTR_ERR(key);
> +                     goto err_put;
> +             }
> +     } else {
> +             if (ukey) {
> +                     err = -EINVAL;
> +                     goto err_put;
> +             }

Given you have a fixed key_size of 0, this could probably be made more
generic and refactored a bit into a small helper that checks for a
map->key_size of 0 instead of checking the map type.
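Something along these lines (hypothetical helper name) could then be shared
by the lookup/update/delete/get_next_key paths:

	static void *__bpf_copy_key(void __user *ukey, u64 key_size)
	{
		if (key_size)
			return memdup_user(ukey, key_size);

		if (ukey)
			return ERR_PTR(-EINVAL);

		return NULL;
	}

The callers would then just do key = __bpf_copy_key(ukey, map->key_size)
plus an IS_ERR() check.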

> -     key = memdup_user(ukey, map->key_size);
> -     if (IS_ERR(key)) {
> -             err = PTR_ERR(key);
> -             goto err_put;
> +             key = NULL;
>       }
>  
>       if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
> @@ -709,10 +717,19 @@ static int map_update_elem(union bpf_attr *attr)
>               goto err_put;
>       }
>  
> -     key = memdup_user(ukey, map->key_size);
> -     if (IS_ERR(key)) {
> -             err = PTR_ERR(key);
> -             goto err_put;
> +     if (map->map_type != BPF_MAP_TYPE_QUEUE) {
> +             key = memdup_user(ukey, map->key_size);
> +             if (IS_ERR(key)) {
> +                     err = PTR_ERR(key);
> +                     goto err_put;
> +             }
> +     } else {
> +             if (ukey) {
> +                     err = -EINVAL;
> +                     goto err_put;
> +             }
> +
> +             key = NULL;
>       }

Ditto, and also further below as well for the other syscall cases.

>  
>       if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
> @@ -803,10 +820,19 @@ static int map_delete_elem(union bpf_attr *attr)
>               goto err_put;
>       }
>  
> -     key = memdup_user(ukey, map->key_size);
> -     if (IS_ERR(key)) {
> -             err = PTR_ERR(key);
> -             goto err_put;
> +     if (map->map_type != BPF_MAP_TYPE_QUEUE) {
> +             key = memdup_user(ukey, map->key_size);
> +             if (IS_ERR(key)) {
> +                     err = PTR_ERR(key);
> +                     goto err_put;
> +             }
> +     } else {
> +             if (ukey) {
> +                     err = -EINVAL;
> +                     goto err_put;
> +             }
> +
> +             key = NULL;
>       }
>  
>       if (bpf_map_is_dev_bound(map)) {
> @@ -855,9 +881,14 @@ static int map_get_next_key(union bpf_attr *attr)
>       }
>  
>       if (ukey) {
> -             key = memdup_user(ukey, map->key_size);
> -             if (IS_ERR(key)) {
> -                     err = PTR_ERR(key);
> +             if (map->map_type != BPF_MAP_TYPE_QUEUE) {
> +                     key = memdup_user(ukey, map->key_size);
> +                     if (IS_ERR(key)) {
> +                             err = PTR_ERR(key);
> +                             goto err_put;
> +                     }
> +             } else {
> +                     err = -EINVAL;
>                       goto err_put;
>               }
>       } else {
> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> index 25e47c195874..83099a9a21d9 100644
> --- a/kernel/bpf/verifier.c
> +++ b/kernel/bpf/verifier.c
> @@ -1976,8 +1976,12 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
>               return -EACCES;
>       }
>  
> -     if (arg_type == ARG_PTR_TO_MAP_KEY ||
> -         arg_type == ARG_PTR_TO_MAP_VALUE) {
> +     if (arg_type == ARG_PTR_TO_MAP_KEY) {
> +             expected_type = PTR_TO_STACK;
> +             if (!type_is_pkt_pointer(type) && type != PTR_TO_MAP_VALUE &&
> +                 type != expected_type && type != SCALAR_VALUE)
> +                     goto err_type;
> +     } else if (arg_type == ARG_PTR_TO_MAP_VALUE) {
>               expected_type = PTR_TO_STACK;
>               if (!type_is_pkt_pointer(type) && type != PTR_TO_MAP_VALUE &&
>                   type != expected_type)
> @@ -2021,6 +2025,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
>               /* bpf_map_xxx(map_ptr) call: remember that map_ptr */
>               meta->map_ptr = reg->map_ptr;
>       } else if (arg_type == ARG_PTR_TO_MAP_KEY) {
> +             bool zero_size_allowed = false;
>               /* bpf_map_xxx(..., map_ptr, ..., key) call:
>                * check that [key, key + map->key_size) are within
>                * stack limits and initialized
> @@ -2034,8 +2039,13 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
>                       verbose(env, "invalid map_ptr to access map->key\n");
>                       return -EACCES;
>               }
> +
> +             if (meta->map_ptr->map_type == BPF_MAP_TYPE_QUEUE)

Here as well.

> +                     zero_size_allowed = true;

Also, the verifier should rather enforce a const NULL key here than allow one
to be set (but not as the only 'valid' input option), meaning, I don't think
it would be good to allow a 'zero' sized pointer to stack in this case.
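Something in this direction (rough, untested sketch) is what I have in mind,
keying off map->key_size and requiring a known-zero scalar:

	if (arg_type == ARG_PTR_TO_MAP_KEY) {
		if (meta->map_ptr && !meta->map_ptr->key_size) {
			/* zero key_size: only allow a const NULL key */
			if (type != SCALAR_VALUE ||
			    !tnum_equals_const(reg->var_off, 0))
				goto err_type;
		} else {
			expected_type = PTR_TO_STACK;
			if (!type_is_pkt_pointer(type) &&
			    type != PTR_TO_MAP_VALUE &&
			    type != expected_type)
				goto err_type;
		}
	}

with the later check_helper_mem_access() call then deriving zero_size_allowed
from !key_size as well instead of from the map type.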

>               err = check_helper_mem_access(env, regno,
> -                                           meta->map_ptr->key_size, false,
> +                                           meta->map_ptr->key_size,
> +                                           zero_size_allowed,
>                                             NULL);
>       } else if (arg_type == ARG_PTR_TO_MAP_VALUE) {
>               /* bpf_map_xxx(..., map_ptr, ..., value) call:
> 
