From: Björn Töpel <bjorn.to...@intel.com>

The xskmap is yet another BPF map, very much inspired by
dev/cpu/sockmap, and is a holder of AF_XDP sockets. A user application
adds AF_XDP sockets into the map, and by using the bpf_redirect_map
helper, an XDP program can redirect XDP frames to an AF_XDP socket.
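For illustration only (this is not part of the patch), an XDP program
using such a map could look roughly like the sketch below. It assumes
the clang/libbpf-style SEC() conventions from samples/bpf; the map name
"xsks_map", its size of four entries, and the use of the rx queue index
as key are made-up examples:

/* Hypothetical example, not part of this patch: redirect each received
 * frame to the AF_XDP socket stored at the frame's rx queue index.
 */
#include <linux/bpf.h>
#include "bpf_helpers.h"

struct bpf_map_def SEC("maps") xsks_map = {
	.type        = BPF_MAP_TYPE_XSKMAP,
	.key_size    = sizeof(int),
	.value_size  = sizeof(int),
	.max_entries = 4,
};

SEC("xdp_sock")
int xdp_sock_prog(struct xdp_md *ctx)
{
	/* If no socket is stored at this index, the redirect fails and
	 * the frame is dropped.
	 */
	return bpf_redirect_map(&xsks_map, ctx->rx_queue_index, 0);
}

char _license[] SEC("license") = "GPL";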
Note that a socket that is bound to a certain ifindex/queue index will
*only* accept XDP frames from that netdev/queue index. If an XDP
program tries to redirect from a netdev/queue index other than what the
socket is bound to, the frame will not be received on the socket.

A socket can reside in multiple maps.

v2: Removed one indirection in map lookup.

Signed-off-by: Björn Töpel <bjorn.to...@intel.com>
---
 include/linux/bpf.h       |  26 +++++
 include/linux/bpf_types.h |   3 +
 include/net/xdp_sock.h    |   7 ++
 include/uapi/linux/bpf.h  |   1 +
 kernel/bpf/Makefile       |   3 +
 kernel/bpf/verifier.c     |   8 +-
 kernel/bpf/xskmap.c       | 272 ++++++++++++++++++++++++++++++++++++++++++++++
 net/xdp/xsk.c             |   5 +
 8 files changed, 323 insertions(+), 2 deletions(-)
 create mode 100644 kernel/bpf/xskmap.c

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 38ebbc61ed99..6b62509777a2 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -676,6 +676,32 @@ static inline int sock_map_prog(struct bpf_map *map,
 }
 #endif
 
+#if defined(CONFIG_XDP_SOCKETS)
+struct xdp_sock;
+struct xdp_sock *__xsk_map_lookup_elem(struct bpf_map *map, u32 key);
+int __xsk_map_redirect(struct bpf_map *map, u32 index,
+		       struct xdp_buff *xdp, struct xdp_sock *xs);
+void __xsk_map_flush(struct bpf_map *map);
+#else
+struct xdp_sock;
+static inline struct xdp_sock *__xsk_map_lookup_elem(struct bpf_map *map,
+						     u32 key)
+{
+	return NULL;
+}
+
+static inline int __xsk_map_redirect(struct bpf_map *map, u32 index,
+				     struct xdp_buff *xdp,
+				     struct xdp_sock *xs)
+{
+	return -EOPNOTSUPP;
+}
+
+static inline void __xsk_map_flush(struct bpf_map *map)
+{
+}
+#endif
+
 /* verifier prototypes for helper functions called from eBPF programs */
 extern const struct bpf_func_proto bpf_map_lookup_elem_proto;
 extern const struct bpf_func_proto bpf_map_update_elem_proto;
diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index 2b28fcf6f6ae..d7df1b323082 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -49,4 +49,7 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_DEVMAP, dev_map_ops)
 BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKMAP, sock_map_ops)
 #endif
 BPF_MAP_TYPE(BPF_MAP_TYPE_CPUMAP, cpu_map_ops)
+#if defined(CONFIG_XDP_SOCKETS)
+BPF_MAP_TYPE(BPF_MAP_TYPE_XSKMAP, xsk_map_ops)
+#endif
 #endif
diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h
index ea54483f64bb..678a1d293579 100644
--- a/include/net/xdp_sock.h
+++ b/include/net/xdp_sock.h
@@ -28,6 +28,7 @@ struct xdp_sock {
 	struct xsk_queue *rx;
 	struct net_device *dev;
 	struct xdp_umem *umem;
+	struct rcu_head rcu;
 	u64 rx_dropped;
 	u16 queue_id;
 	/* Protects multiple processes in the control path */
@@ -39,6 +40,7 @@ struct xdp_buff;
 int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp);
 int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp);
 void xsk_flush(struct xdp_sock *xs);
+bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs);
 #else
 static inline int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
 {
@@ -53,6 +55,11 @@ static inline int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
 static inline void xsk_flush(struct xdp_sock *xs)
 {
 }
+
+static inline bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs)
+{
+	return false;
+}
 #endif /* CONFIG_XDP_SOCKETS */
 
 #endif /* _LINUX_XDP_SOCK_H */
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index da77a9388947..7c16bc197ff5 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -116,6 +116,7 @@ enum bpf_map_type {
 	BPF_MAP_TYPE_DEVMAP,
 	BPF_MAP_TYPE_SOCKMAP,
 	BPF_MAP_TYPE_CPUMAP,
+	BPF_MAP_TYPE_XSKMAP,
 };
 
 enum bpf_prog_type {
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 35c485fa9ea3..f27f5496d6fe 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -8,6 +8,9 @@ obj-$(CONFIG_BPF_SYSCALL) += btf.o
 ifeq ($(CONFIG_NET),y)
 obj-$(CONFIG_BPF_SYSCALL) += devmap.o
 obj-$(CONFIG_BPF_SYSCALL) += cpumap.o
+ifeq ($(CONFIG_XDP_SOCKETS),y)
+obj-$(CONFIG_BPF_SYSCALL) += xskmap.o
+endif
 obj-$(CONFIG_BPF_SYSCALL) += offload.o
 ifeq ($(CONFIG_STREAM_PARSER),y)
 ifeq ($(CONFIG_INET),y)
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index eb1a596aebd3..0ac49d62dc0c 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -2061,8 +2061,11 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
 		if (func_id != BPF_FUNC_redirect_map)
 			goto error;
 		break;
-	/* Restrict bpf side of cpumap, open when use-cases appear */
+	/* Restrict bpf side of cpumap and xskmap, open when use-cases
+	 * appear.
+	 */
 	case BPF_MAP_TYPE_CPUMAP:
+	case BPF_MAP_TYPE_XSKMAP:
 		if (func_id != BPF_FUNC_redirect_map)
 			goto error;
 		break;
@@ -2109,7 +2112,8 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
 		break;
 	case BPF_FUNC_redirect_map:
 		if (map->map_type != BPF_MAP_TYPE_DEVMAP &&
-		    map->map_type != BPF_MAP_TYPE_CPUMAP)
+		    map->map_type != BPF_MAP_TYPE_CPUMAP &&
+		    map->map_type != BPF_MAP_TYPE_XSKMAP)
 			goto error;
 		break;
 	case BPF_FUNC_sk_redirect_map:
diff --git a/kernel/bpf/xskmap.c b/kernel/bpf/xskmap.c
new file mode 100644
index 000000000000..eb60824af070
--- /dev/null
+++ b/kernel/bpf/xskmap.c
@@ -0,0 +1,272 @@
+// SPDX-License-Identifier: GPL-2.0
+/* XSKMAP used for AF_XDP sockets
+ * Copyright(c) 2018 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ */
+
+#include <linux/bpf.h>
+#include <linux/capability.h>
+#include <net/xdp_sock.h>
+#include <linux/slab.h>
+#include <linux/sched.h>
+
+struct xsk_map {
+	struct bpf_map map;
+	struct xdp_sock **xsk_map;
+	unsigned long __percpu *flush_needed;
+};
+
+static u64 xsk_map_bitmap_size(const union bpf_attr *attr)
+{
+	return BITS_TO_LONGS((u64) attr->max_entries) * sizeof(unsigned long);
+}
+
+static struct bpf_map *xsk_map_alloc(union bpf_attr *attr)
+{
+	struct xsk_map *m;
+	int err = -EINVAL;
+	u64 cost;
+
+	if (!capable(CAP_NET_ADMIN))
+		return ERR_PTR(-EPERM);
+
+	if (attr->max_entries == 0 || attr->key_size != 4 ||
+	    attr->value_size != 4 ||
+	    attr->map_flags & ~(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY))
+		return ERR_PTR(-EINVAL);
+
+	m = kzalloc(sizeof(*m), GFP_USER);
+	if (!m)
+		return ERR_PTR(-ENOMEM);
+
+	bpf_map_init_from_attr(&m->map, attr);
+
+	cost = (u64)m->map.max_entries * sizeof(struct xdp_sock *);
+	cost += xsk_map_bitmap_size(attr) * num_possible_cpus();
+	if (cost >= U32_MAX - PAGE_SIZE)
+		goto free_m;
+
+	m->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
+
+	/* Notice returns -EPERM on if map size is larger than memlock limit */
+	err = bpf_map_precharge_memlock(m->map.pages);
+	if (err)
+		goto free_m;
+
+	m->flush_needed = __alloc_percpu(xsk_map_bitmap_size(attr),
+					 __alignof__(unsigned long));
+	if (!m->flush_needed)
+		goto free_m;
+
+	m->xsk_map = bpf_map_area_alloc(m->map.max_entries *
+					sizeof(struct xdp_sock *),
+					m->map.numa_node);
+	if (!m->xsk_map)
+		goto free_percpu;
+	return &m->map;
+
+free_percpu:
+	free_percpu(m->flush_needed);
+free_m:
+	kfree(m);
+	return ERR_PTR(err);
+}
+
+static void xsk_map_free(struct bpf_map *map)
+{
+	struct xsk_map *m = container_of(map, struct xsk_map, map);
+	int i, cpu;
+
+	/* At this point bpf_prog->aux->refcnt == 0 and this
+	 * map->refcnt == 0, so the programs (can be more than one
+	 * that used this map) were disconnected from events. Wait for
+	 * outstanding critical sections in these programs to
+	 * complete. The rcu critical section only guarantees no
+	 * further reads against xsk_map. It does __not__ ensure
+	 * pending flush operations (if any) are complete.
+	 */
+
+	synchronize_rcu();
+
+	/* To ensure all pending flush operations have completed wait
+	 * for flush bitmap to indicate all flush_needed bits to be
+	 * zero on _all_ cpus. Because the above synchronize_rcu()
+	 * ensures the map is disconnected from the program we can
+	 * assume no new bits will be set.
+	 */
+	for_each_online_cpu(cpu) {
+		unsigned long *bitmap = per_cpu_ptr(m->flush_needed, cpu);
+
+		while (!bitmap_empty(bitmap, map->max_entries))
+			cond_resched();
+	}
+
+	for (i = 0; i < map->max_entries; i++) {
+		struct xdp_sock *xs;
+
+		xs = m->xsk_map[i];
+		if (!xs)
+			continue;
+
+		sock_put((struct sock *)xs);
+	}
+
+	free_percpu(m->flush_needed);
+	bpf_map_area_free(m->xsk_map);
+	kfree(m);
+}
+
+static int xsk_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
+{
+	struct xsk_map *m = container_of(map, struct xsk_map, map);
+	u32 index = key ? *(u32 *)key : U32_MAX;
+	u32 *next = next_key;
+
+	if (index >= m->map.max_entries) {
+		*next = 0;
+		return 0;
+	}
+
+	if (index == m->map.max_entries - 1)
+		return -ENOENT;
+	*next = index + 1;
+	return 0;
+}
+
+struct xdp_sock *__xsk_map_lookup_elem(struct bpf_map *map, u32 key)
+{
+	struct xsk_map *m = container_of(map, struct xsk_map, map);
+	struct xdp_sock *xs;
+
+	if (key >= map->max_entries)
+		return NULL;
+
+	xs = READ_ONCE(m->xsk_map[key]);
+	return xs;
+}
+
+int __xsk_map_redirect(struct bpf_map *map, u32 index,
+		       struct xdp_buff *xdp, struct xdp_sock *xs)
+{
+	struct xsk_map *m = container_of(map, struct xsk_map, map);
+	unsigned long *bitmap = this_cpu_ptr(m->flush_needed);
+	int err;
+
+	err = xsk_rcv(xs, xdp);
+	if (err)
+		return err;
+
+	__set_bit(index, bitmap);
+	return 0;
+}
+
+void __xsk_map_flush(struct bpf_map *map)
+{
+	struct xsk_map *m = container_of(map, struct xsk_map, map);
+	unsigned long *bitmap = this_cpu_ptr(m->flush_needed);
+	u32 bit;
+
+	for_each_set_bit(bit, bitmap, map->max_entries) {
+		struct xdp_sock *xs = READ_ONCE(m->xsk_map[bit]);
+
+		/* This is possible if the entry is removed by user
+		 * space between xdp redirect and flush op.
+		 */
+		if (unlikely(!xs))
+			continue;
+
+		__clear_bit(bit, bitmap);
+		xsk_flush(xs);
+	}
+}
+
+static void *xsk_map_lookup_elem(struct bpf_map *map, void *key)
+{
+	return NULL;
+}
+
+static void __xsk_map_entry_free(struct rcu_head *rcu)
+{
+	struct xdp_sock *xs;
+
+	xs = container_of(rcu, struct xdp_sock, rcu);
+	xsk_flush(xs);
+	sock_put((struct sock *)xs);
+}
+
+static int xsk_map_update_elem(struct bpf_map *map, void *key, void *value,
+			       u64 map_flags)
+{
+	struct xsk_map *m = container_of(map, struct xsk_map, map);
+	u32 i = *(u32 *)key, fd = *(u32 *)value;
+	struct xdp_sock *xs, *old_xs;
+	struct socket *sock;
+	int err;
+
+	if (unlikely(map_flags > BPF_EXIST))
+		return -EINVAL;
+	if (unlikely(i >= m->map.max_entries))
+		return -E2BIG;
+	if (unlikely(map_flags == BPF_NOEXIST))
+		return -EEXIST;
+
+	sock = sockfd_lookup(fd, &err);
+	if (!sock)
+		return err;
+
+	if (sock->sk->sk_family != PF_XDP) {
+		sockfd_put(sock);
+		return -EOPNOTSUPP;
+	}
+
+	xs = (struct xdp_sock *)sock->sk;
+
+	if (!xsk_is_setup_for_bpf_map(xs)) {
+		sockfd_put(sock);
+		return -EOPNOTSUPP;
+	}
+
+	sock_hold(sock->sk);
+
+	old_xs = xchg(&m->xsk_map[i], xs);
+	if (old_xs)
+		call_rcu(&old_xs->rcu, __xsk_map_entry_free);
+
+	sockfd_put(sock);
+	return 0;
+}
+
+static int xsk_map_delete_elem(struct bpf_map *map, void *key)
+{
+	struct xsk_map *m = container_of(map, struct xsk_map, map);
+	struct xdp_sock *old_xs;
+	int k = *(u32 *)key;
+
+	if (k >= map->max_entries)
+		return -EINVAL;
+
+	old_xs = xchg(&m->xsk_map[k], NULL);
+	if (old_xs)
+		call_rcu(&old_xs->rcu, __xsk_map_entry_free);
+
+	return 0;
+}
+
+const struct bpf_map_ops xsk_map_ops = {
+	.map_alloc = xsk_map_alloc,
+	.map_free = xsk_map_free,
+	.map_get_next_key = xsk_map_get_next_key,
+	.map_lookup_elem = xsk_map_lookup_elem,
+	.map_update_elem = xsk_map_update_elem,
+	.map_delete_elem = xsk_map_delete_elem,
+};
+
+
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index 4e1e6c581e1d..b931a0db5588 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -41,6 +41,11 @@ static struct xdp_sock *xdp_sk(struct sock *sk)
 	return (struct xdp_sock *)sk;
 }
 
+bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs)
+{
+	return !!xs->rx;
+}
+
 static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
 {
 	u32 *id, len = xdp->data_end - xdp->data;
-- 
2.14.1
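As noted in the commit message, a user application adds AF_XDP sockets
into the map. Purely as an illustration of that usage (again, not part
of the patch), the user-space side might look roughly like the sketch
below, using libbpf's bpf_map_update_elem() wrapper; the helper name,
the fd variables, and the <bpf/bpf.h> include path are assumptions that
depend on how libbpf is built and installed:

/* Hypothetical user-space helper, not part of this patch: insert an
 * already-bound AF_XDP socket into slot "queue_id" of an xskmap. Both
 * file descriptors are assumed to exist already, e.g. from
 * bpf_create_map(BPF_MAP_TYPE_XSKMAP, ...) and socket(AF_XDP, ...)
 * followed by bind().
 */
#include <linux/types.h>
#include <bpf/bpf.h>

static int add_xsk_to_map(int xsks_map_fd, __u32 queue_id, int xsk_fd)
{
	/* Per xsk_map_update_elem() above, the update fails with
	 * EOPNOTSUPP if the fd is not an AF_XDP socket or the socket
	 * has no rx ring set up yet; on success the kernel holds a
	 * reference on the socket for as long as it sits in the map.
	 */
	return bpf_map_update_elem(xsks_map_fd, &queue_id, &xsk_fd, 0);
}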