On Mon, 18 Feb 2019 09:59:30 +0100 Magnus Karlsson <magnus.karls...@gmail.com> wrote:
> On Fri, Feb 15, 2019 at 6:09 PM Daniel Borkmann <dan...@iogearbox.net> wrote: > > > > On 02/08/2019 02:05 PM, Magnus Karlsson wrote: > > > This commit adds AF_XDP support to libbpf. The main reason for this is > > > to facilitate writing applications that use AF_XDP by offering > > > higher-level APIs that hide many of the details of the AF_XDP > > > uapi. This is in the same vein as libbpf facilitates XDP adoption by > > > offering easy-to-use higher level interfaces of XDP > > > functionality. Hopefully this will facilitate adoption of AF_XDP, make > > > applications using it simpler and smaller, and finally also make it > > > possible for applications to benefit from optimizations in the AF_XDP > > > user space access code. Previously, people just copied and pasted the > > > code from the sample application into their application, which is not > > > desirable. > > > > > > The interface is composed of two parts: > > > > > > * Low-level access interface to the four rings and the packet > > > * High-level control plane interface for creating and setting > > > up umems and af_xdp sockets as well as a simple XDP program. > > > > > > Tested-by: Björn Töpel <bjorn.to...@intel.com> > > > Signed-off-by: Magnus Karlsson <magnus.karls...@intel.com> > > [...] > > > diff --git a/tools/lib/bpf/xsk.c b/tools/lib/bpf/xsk.c > > > new file mode 100644 > > > index 0000000..a982a76 > > > --- /dev/null > > > +++ b/tools/lib/bpf/xsk.c > > > @@ -0,0 +1,742 @@ > > > +// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) > > > + > > > +/* > > > + * AF_XDP user-space access library. > > > + * > > > + * Copyright(c) 2018 - 2019 Intel Corporation. > > > + * > > > + * Author(s): Magnus Karlsson <magnus.karls...@intel.com> > > > + */ > > > + > > > +#include <errno.h> > > > +#include <stdlib.h> > > > +#include <string.h> > > > +#include <unistd.h> > > > +#include <arpa/inet.h> > > > +#include <asm/barrier.h> > > > +#include <linux/compiler.h> > > > +#include <linux/filter.h> > > > +#include <linux/if_ether.h> > > > +#include <linux/if_link.h> > > > +#include <linux/if_packet.h> > > > +#include <linux/if_xdp.h> > > > +#include <linux/rtnetlink.h> > > > +#include <net/if.h> > > > +#include <sys/mman.h> > > > +#include <sys/socket.h> > > > +#include <sys/types.h> > > > + > > > +#include "bpf.h" > > > +#include "libbpf.h" > > > +#include "libbpf_util.h" > > > +#include "nlattr.h" > > > +#include "xsk.h" > > > + > > > +#ifndef SOL_XDP > > > + #define SOL_XDP 283 > > > +#endif > > > + > > > +#ifndef AF_XDP > > > + #define AF_XDP 44 > > > +#endif > > > + > > > +#ifndef PF_XDP > > > + #define PF_XDP AF_XDP > > > +#endif > > > + > > > +struct xsk_umem { > > > + struct xsk_ring_prod *fill; > > > + struct xsk_ring_cons *comp; > > > + char *umem_area; > > > + struct xsk_umem_config config; > > > + int fd; > > > + int refcount; > > > +}; > > > + > > > +struct xsk_socket { > > > + struct xsk_ring_cons *rx; > > > + struct xsk_ring_prod *tx; > > > + __u64 outstanding_tx; > > > + struct xsk_umem *umem; > > > + struct xsk_socket_config config; > > > + int fd; > > > + int xsks_map; > > > + int ifindex; > > > + int prog_fd; > > > + int qidconf_map_fd; > > > + int xsks_map_fd; > > > + __u32 queue_id; > > > +}; > > > + > > > +struct xsk_nl_info { > > > + bool xdp_prog_attached; > > > + int ifindex; > > > + int fd; > > > +}; > > > + > > > +#define MAX_QUEUES 128 > > > > Why is this a fixed constant here, shouldn't this be dynamic due to being > > NIC > > specific anyway? > > It was only here for simplicity. If a NIC had more queues, it would > require a recompile of the lib. Obviously, not desirable in a distro. > What I could do is to read the max "combined" queues (pre-set maximum > in the ethtool output) from the same interface as ethool uses and size > the array after that. Or is there a simpler way? What to do if the NIC > does not have a "combined", or is there no such NIC (seems the common > HW ones set this)? > > > [...] > > > +void *xsk_umem__get_data(struct xsk_umem *umem, __u64 addr) > > > +{ > > > + return &((char *)(umem->umem_area))[addr]; > > > +} > > > > There's also a xsk_umem__get_data_raw() doing the same. Why having both, > > resp. > > when to choose which? ;) > > There is enough to have the xsk_umem__get_data_raw() function. > xsk_umem__get_data() is just a convenience function for which the > application does not have to store the beginning of the umem. But as > the application always has to provide this anyway in the > xsk_umem__create() function, it might as well store this pointer. I > will delete xsk_umem__get_data() and rename xsk_umem__get_data_raw() > to xsk_umem__get_data(). > > > > +int xsk_umem__fd(const struct xsk_umem *umem) > > > +{ > > > + return umem ? umem->fd : -EINVAL; > > > +} > > > + > > > +int xsk_socket__fd(const struct xsk_socket *xsk) > > > +{ > > > + return xsk ? xsk->fd : -EINVAL; > > > +} > > > + > > > +static bool xsk_page_aligned(void *buffer) > > > +{ > > > + unsigned long addr = (unsigned long)buffer; > > > + > > > + return !(addr & (getpagesize() - 1)); > > > +} > > > + > > > +static void xsk_set_umem_config(struct xsk_umem_config *cfg, > > > + const struct xsk_umem_config *usr_cfg) > > > +{ > > > + if (!usr_cfg) { > > > + cfg->fill_size = XSK_RING_PROD__DEFAULT_NUM_DESCS; > > > + cfg->comp_size = XSK_RING_CONS__DEFAULT_NUM_DESCS; > > > + cfg->frame_size = XSK_UMEM__DEFAULT_FRAME_SIZE; > > > + cfg->frame_headroom = XSK_UMEM__DEFAULT_FRAME_HEADROOM; > > > + return; > > > + } > > > + > > > + cfg->fill_size = usr_cfg->fill_size; > > > + cfg->comp_size = usr_cfg->comp_size; > > > + cfg->frame_size = usr_cfg->frame_size; > > > + cfg->frame_headroom = usr_cfg->frame_headroom; > > > > Just optional nit, might be a bit nicer to have it in this form: > > > > cfg->fill_size = usr_cfg ? usr_cfg->fill_size : > > XSK_RING_PROD__DEFAULT_NUM_DESCS; > > I actually think the current form is clearer when there are multiple > lines. If there was only one line, I would agree with you. > > > > +} > > > + > > > +static void xsk_set_xdp_socket_config(struct xsk_socket_config *cfg, > > > + const struct xsk_socket_config > > > *usr_cfg) > > > +{ > > > + if (!usr_cfg) { > > > + cfg->rx_size = XSK_RING_CONS__DEFAULT_NUM_DESCS; > > > + cfg->tx_size = XSK_RING_PROD__DEFAULT_NUM_DESCS; > > > + cfg->libbpf_flags = 0; > > > + cfg->xdp_flags = 0; > > > + cfg->bind_flags = 0; > > > + return; > > > + } > > > + > > > + cfg->rx_size = usr_cfg->rx_size; > > > + cfg->tx_size = usr_cfg->tx_size; > > > + cfg->libbpf_flags = usr_cfg->libbpf_flags; > > > + cfg->xdp_flags = usr_cfg->xdp_flags; > > > + cfg->bind_flags = usr_cfg->bind_flags; > > > > (Ditto) > > > > > +} > > > + > > > +int xsk_umem__create(struct xsk_umem **umem_ptr, void *umem_area, __u64 > > > size, > > > + struct xsk_ring_prod *fill, struct xsk_ring_cons *comp, > > > + const struct xsk_umem_config *usr_config) > > > +{ > > > + struct xdp_mmap_offsets off; > > > + struct xdp_umem_reg mr; > > > + struct xsk_umem *umem; > > > + socklen_t optlen; > > > + void *map; > > > + int err; > > > + > > > + if (!umem_area || !umem_ptr || !fill || !comp) > > > + return -EFAULT; > > > + if (!size && !xsk_page_aligned(umem_area)) > > > + return -EINVAL; > > > + > > > + umem = calloc(1, sizeof(*umem)); > > > + if (!umem) > > > + return -ENOMEM; > > > + > > > + umem->fd = socket(AF_XDP, SOCK_RAW, 0); > > > + if (umem->fd < 0) { > > > + err = -errno; > > > + goto out_umem_alloc; > > > + } > > > + > > > + umem->umem_area = umem_area; > > > + xsk_set_umem_config(&umem->config, usr_config); > > > + > > > + mr.addr = (uintptr_t)umem_area; > > > + mr.len = size; > > > + mr.chunk_size = umem->config.frame_size; > > > + mr.headroom = umem->config.frame_headroom; > > > + > > > + err = setsockopt(umem->fd, SOL_XDP, XDP_UMEM_REG, &mr, sizeof(mr)); > > > + if (err) { > > > + err = -errno; > > > + goto out_socket; > > > + } > > > + err = setsockopt(umem->fd, SOL_XDP, XDP_UMEM_FILL_RING, > > > + &umem->config.fill_size, > > > + sizeof(umem->config.fill_size)); > > > + if (err) { > > > + err = -errno; > > > + goto out_socket; > > > + } > > > + err = setsockopt(umem->fd, SOL_XDP, XDP_UMEM_COMPLETION_RING, > > > + &umem->config.comp_size, > > > + sizeof(umem->config.comp_size)); > > > + if (err) { > > > + err = -errno; > > > + goto out_socket; > > > + } > > > + > > > + optlen = sizeof(off); > > > + err = getsockopt(umem->fd, SOL_XDP, XDP_MMAP_OFFSETS, &off, > > > &optlen); > > > + if (err) { > > > + err = -errno; > > > + goto out_socket; > > > + } > > > + > > > + map = xsk_mmap(NULL, off.fr.desc + > > > + umem->config.fill_size * sizeof(__u64), > > > + PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, > > > + umem->fd, XDP_UMEM_PGOFF_FILL_RING); > > > + if (map == MAP_FAILED) { > > > + err = -errno; > > > + goto out_socket; > > > + } > > > + > > > + umem->fill = fill; > > > + fill->mask = umem->config.fill_size - 1; > > > + fill->size = umem->config.fill_size; > > > + fill->producer = map + off.fr.producer; > > > + fill->consumer = map + off.fr.consumer; > > > + fill->ring = map + off.fr.desc; > > > + fill->cached_cons = umem->config.fill_size; > > > + > > > + map = xsk_mmap(NULL, > > > + off.cr.desc + umem->config.comp_size * sizeof(__u64), > > > + PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, > > > + umem->fd, XDP_UMEM_PGOFF_COMPLETION_RING); > > > + if (map == MAP_FAILED) { > > > + err = -errno; > > > + goto out_mmap; > > > + } > > > + > > > + umem->comp = comp; > > > + comp->mask = umem->config.comp_size - 1; > > > + comp->size = umem->config.comp_size; > > > + comp->producer = map + off.cr.producer; > > > + comp->consumer = map + off.cr.consumer; > > > + comp->ring = map + off.cr.desc; > > > + > > > + *umem_ptr = umem; > > > + return 0; > > > + > > > +out_mmap: > > > + munmap(umem->fill, > > > + off.fr.desc + umem->config.fill_size * sizeof(__u64)); > > > +out_socket: > > > + close(umem->fd); > > > +out_umem_alloc: > > > + free(umem); > > > + return err; > > > +} > > > + > > > +static int xsk_parse_nl(void *cookie, void *msg, struct nlattr **tb) > > > +{ > > > + struct nlattr *tb_parsed[IFLA_XDP_MAX + 1]; > > > + struct xsk_nl_info *nl_info = cookie; > > > + struct ifinfomsg *ifinfo = msg; > > > + unsigned char mode; > > > + int err; > > > + > > > + if (nl_info->ifindex && nl_info->ifindex != ifinfo->ifi_index) > > > + return 0; > > > + > > > + if (!tb[IFLA_XDP]) > > > + return 0; > > > + > > > + err = libbpf_nla_parse_nested(tb_parsed, IFLA_XDP_MAX, tb[IFLA_XDP], > > > + NULL); > > > + if (err) > > > + return err; > > > + > > > + if (!tb_parsed[IFLA_XDP_ATTACHED] || !tb_parsed[IFLA_XDP_FD]) > > > + return 0; > > > + > > > + mode = libbpf_nla_getattr_u8(tb_parsed[IFLA_XDP_ATTACHED]); > > > + if (mode == XDP_ATTACHED_NONE) > > > + return 0; > > > + > > > + nl_info->xdp_prog_attached = true; > > > + nl_info->fd = libbpf_nla_getattr_u32(tb_parsed[IFLA_XDP_FD]); > > > > Hm, I don't think this works if I read the intention of this helper > > correctly. > > > > IFLA_XDP_FD is never set for retrieving the prog from the kernel. So the > > above is a bug. > > > > We also have bpf_get_link_xdp_id(). This should probably just be reused in > > this context here. > > If bpf_get_link_xdp_id() will fit my bill, I will happily use it. I > will check it out and hopefully I can drop all this code. Thanks. > I see that all you need to know is whether there's already attached XDP program to xsk socket's related interface, no? If so, then within the xsk_setup_xdp_prog, you could do something like: u32 prog_id = 0; bpf_get_link_xdp_id(xsk->ifindex, &prog_id, xsk->config.xdp_flags); if (!prog_id) { // create maps // load xdp prog } else { xsk->fd = prog_id; } xsk_update_bpf_maps(xsk, true, xsk->fd); If that's ok then xsk_xdp_prog_attached and xsk_parse_nl could be dropped. > > > + return 0; > > > +} > > > + > > > +static bool xsk_xdp_prog_attached(struct xsk_socket *xsk) > > > +{ > > > + struct xsk_nl_info nl_info; > > > + unsigned int nl_pid; > > > + char err_buf[256]; > > > + int sock, err; > > > + > > > + sock = libbpf_netlink_open(&nl_pid); > > > + if (sock < 0) > > > + return false; > > > + > > > + nl_info.xdp_prog_attached = false; > > > + nl_info.ifindex = xsk->ifindex; > > > + nl_info.fd = -1; > > > + > > > + err = libbpf_nl_get_link(sock, nl_pid, xsk_parse_nl, &nl_info); > > > + if (err) { > > > + libbpf_strerror(err, err_buf, sizeof(err_buf)); > > > + pr_warning("Error:\n%s\n", err_buf); > > > + close(sock); > > > + return false; > > > + } > > > + > > > + close(sock); > > > + xsk->prog_fd = nl_info.fd; > > > + return nl_info.xdp_prog_attached; > > > +} > > > > (See bpf_get_link_xdp_id().) > > > > > + > > > +static int xsk_load_xdp_prog(struct xsk_socket *xsk) > > > +{ > > > + char bpf_log_buf[BPF_LOG_BUF_SIZE]; > > > + int err, prog_fd; > > > + > > > + /* This is the C-program: > > > + * SEC("xdp_sock") int xdp_sock_prog(struct xdp_md *ctx) > > > + * { > > > + * int *qidconf, index = ctx->rx_queue_index; > > [...] > > > + > > > +int xsk_socket__create(struct xsk_socket **xsk_ptr, const char *ifname, > > > + __u32 queue_id, struct xsk_umem *umem, > > > + struct xsk_ring_cons *rx, struct xsk_ring_prod *tx, > > > + const struct xsk_socket_config *usr_config) > > > +{ > > > + struct sockaddr_xdp sxdp = {}; > > > + struct xdp_mmap_offsets off; > > > + struct xsk_socket *xsk; > > > + socklen_t optlen; > > > + void *map; > > > + int err; > > > + > > > + if (!umem || !xsk_ptr || !rx || !tx) > > > + return -EFAULT; > > > + > > > + if (umem->refcount) { > > > + pr_warning("Error: shared umems not supported by > > > libbpf.\n"); > > > + return -EBUSY; > > > + } > > > + > > > + xsk = calloc(1, sizeof(*xsk)); > > > + if (!xsk) > > > + return -ENOMEM; > > > + > > > + if (umem->refcount++ > 0) { > > > > Should this refcount rather be atomic actually? > > Neither our config nor data plane interfaces are reentrant for > performance reasons. Any concurrency has to be handled explicitly on > the application level. This so that it only penalizes apps that really > need this. > > Thanks for all your reviews: Magnus > > > > + xsk->fd = socket(AF_XDP, SOCK_RAW, 0); > > > + if (xsk->fd < 0) { > > > + err = -errno; > > > + goto out_xsk_alloc; > > > + } > > > + } else { > > > + xsk->fd = umem->fd; > > > + } > > > + > > > + xsk->outstanding_tx = 0; > > > + xsk->queue_id = queue_id; > > > + xsk->umem = umem; > > > + xsk->ifindex = if_nametoindex(ifname); > > > + if (!xsk->ifindex) { > > > + err = -errno; > > > + goto out_socket; > > > + } > > > + > > > + xsk_set_xdp_socket_config(&xsk->config, usr_config); > > [...]