On Mon, Apr 23, 2018 at 03:56:06PM +0200, Björn Töpel wrote: > From: Björn Töpel <bjorn.to...@intel.com> > > In this commit the base structure of the AF_XDP address family is set > up. Further, we introduce the ability to register a window of user memory > to the kernel via the XDP_UMEM_REG setsockopt syscall. The memory > window is viewed by an AF_XDP socket as a set of equally large > frames. After a user memory registration all frames are "owned" by the > user application, and not the kernel. > > Co-authored-by: Magnus Karlsson <magnus.karls...@intel.com> > Signed-off-by: Magnus Karlsson <magnus.karls...@intel.com> > Signed-off-by: Björn Töpel <bjorn.to...@intel.com> > --- > include/uapi/linux/if_xdp.h | 34 +++++++ > net/Makefile | 1 + > net/xdp/Makefile | 2 + > net/xdp/xdp_umem.c | 237 > ++++++++++++++++++++++++++++++++++++++++++++ > net/xdp/xdp_umem.h | 42 ++++++++ > net/xdp/xdp_umem_props.h | 23 +++++ > net/xdp/xsk.c | 223 +++++++++++++++++++++++++++++++++++++++++ > 7 files changed, 562 insertions(+) > create mode 100644 include/uapi/linux/if_xdp.h > create mode 100644 net/xdp/Makefile > create mode 100644 net/xdp/xdp_umem.c > create mode 100644 net/xdp/xdp_umem.h > create mode 100644 net/xdp/xdp_umem_props.h > create mode 100644 net/xdp/xsk.c > > diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h > new file mode 100644 > index 000000000000..41252135a0fe > --- /dev/null > +++ b/include/uapi/linux/if_xdp.h > @@ -0,0 +1,34 @@ > +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note > + * > + * if_xdp: XDP socket user-space interface > + * Copyright(c) 2018 Intel Corporation. > + * > + * This program is free software; you can redistribute it and/or modify it > + * under the terms and conditions of the GNU General Public License, > + * version 2, as published by the Free Software Foundation. 
> + * > + * This program is distributed in the hope it will be useful, but WITHOUT > + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or > + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for > + * more details. > + * > + * Author(s): Björn Töpel <bjorn.to...@intel.com> > + * Magnus Karlsson <magnus.karls...@intel.com> > + */ > + > +#ifndef _LINUX_IF_XDP_H > +#define _LINUX_IF_XDP_H > + > +#include <linux/types.h> > + > +/* XDP socket options */ > +#define XDP_UMEM_REG 3 > + > +struct xdp_umem_reg { > + __u64 addr; /* Start of packet data area */ > + __u64 len; /* Length of packet data area */ > + __u32 frame_size; /* Frame size */ > + __u32 frame_headroom; /* Frame head room */ > +}; > + > +#endif /* _LINUX_IF_XDP_H */ > diff --git a/net/Makefile b/net/Makefile > index a6147c61b174..77aaddedbd29 100644 > --- a/net/Makefile > +++ b/net/Makefile > @@ -85,3 +85,4 @@ obj-y += l3mdev/ > endif > obj-$(CONFIG_QRTR) += qrtr/ > obj-$(CONFIG_NET_NCSI) += ncsi/ > +obj-$(CONFIG_XDP_SOCKETS) += xdp/ > diff --git a/net/xdp/Makefile b/net/xdp/Makefile > new file mode 100644 > index 000000000000..a5d736640a0f > --- /dev/null > +++ b/net/xdp/Makefile > @@ -0,0 +1,2 @@ > +obj-$(CONFIG_XDP_SOCKETS) += xsk.o xdp_umem.o > + > diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c > new file mode 100644 > index 000000000000..bff058f5a769 > --- /dev/null > +++ b/net/xdp/xdp_umem.c > @@ -0,0 +1,237 @@ > +// SPDX-License-Identifier: GPL-2.0 > +/* XDP user-space packet buffer > + * Copyright(c) 2018 Intel Corporation. > + * > + * This program is free software; you can redistribute it and/or modify it > + * under the terms and conditions of the GNU General Public License, > + * version 2, as published by the Free Software Foundation. > + * > + * This program is distributed in the hope it will be useful, but WITHOUT > + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or > + * FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU General Public License for > + * more details. > + */ > + > +#include <linux/init.h> > +#include <linux/sched/mm.h> > +#include <linux/sched/signal.h> > +#include <linux/sched/task.h> > +#include <linux/uaccess.h> > +#include <linux/slab.h> > +#include <linux/bpf.h> > +#include <linux/mm.h> > + > +#include "xdp_umem.h" > + > +#define XDP_UMEM_MIN_FRAME_SIZE 2048 > + > +int xdp_umem_create(struct xdp_umem **umem) > +{ > + *umem = kzalloc(sizeof(**umem), GFP_KERNEL); > + > + if (!(*umem)) > + return -ENOMEM; > + > + return 0; > +} > + > +static void xdp_umem_unpin_pages(struct xdp_umem *umem) > +{ > + unsigned int i; > + > + if (umem->pgs) { > + for (i = 0; i < umem->npgs; i++)
Since you pin them with FOLL_WRITE, I assume these pages are written to. Don't you need set_page_dirty_lock here? > + put_page(umem->pgs[i]); > + > + kfree(umem->pgs); > + umem->pgs = NULL; > + } > +} > + > +static void xdp_umem_unaccount_pages(struct xdp_umem *umem) > +{ > + if (umem->user) { > + atomic_long_sub(umem->npgs, &umem->user->locked_vm); > + free_uid(umem->user); > + } > +} > + > +static void xdp_umem_release(struct xdp_umem *umem) > +{ > + struct task_struct *task; > + struct mm_struct *mm; > + unsigned long diff; > + > + if (umem->pgs) { > + xdp_umem_unpin_pages(umem); > + > + task = get_pid_task(umem->pid, PIDTYPE_PID); > + put_pid(umem->pid); > + if (!task) > + goto out; > + mm = get_task_mm(task); > + put_task_struct(task); > + if (!mm) > + goto out; > + > + diff = umem->size >> PAGE_SHIFT; > + > + down_write(&mm->mmap_sem); > + mm->pinned_vm -= diff; > + up_write(&mm->mmap_sem); > + mmput(mm); > + umem->pgs = NULL; > + } > + > + xdp_umem_unaccount_pages(umem); > +out: > + kfree(umem); > +} > + > +void xdp_put_umem(struct xdp_umem *umem) > +{ > + if (!umem) > + return; > + > + if (atomic_dec_and_test(&umem->users)) > + xdp_umem_release(umem); > +} > + > +static int xdp_umem_pin_pages(struct xdp_umem *umem) > +{ > + unsigned int gup_flags = FOLL_WRITE; > + long npgs; > + int err; > + > + umem->pgs = kcalloc(umem->npgs, sizeof(*umem->pgs), GFP_KERNEL); > + if (!umem->pgs) > + return -ENOMEM; > + > + npgs = get_user_pages(umem->address, umem->npgs, > + gup_flags, &umem->pgs[0], NULL); > + if (npgs != umem->npgs) { > + if (npgs >= 0) { > + umem->npgs = npgs; > + err = -ENOMEM; > + goto out_pin; > + } > + err = npgs; > + goto out_pgs; > + } > + return 0; > + > +out_pin: > + xdp_umem_unpin_pages(umem); > +out_pgs: > + kfree(umem->pgs); > + umem->pgs = NULL; > + return err; > +} > + > +static int xdp_umem_account_pages(struct xdp_umem *umem) > +{ > + unsigned long lock_limit, new_npgs, old_npgs; > + > + if (capable(CAP_IPC_LOCK)) > + return 0; > + > + 
lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; > + umem->user = get_uid(current_user()); > + > + do { > + old_npgs = atomic_long_read(&umem->user->locked_vm); > + new_npgs = old_npgs + umem->npgs; > + if (new_npgs > lock_limit) { > + free_uid(umem->user); > + umem->user = NULL; > + return -ENOBUFS; > + } > + } while (atomic_long_cmpxchg(&umem->user->locked_vm, old_npgs, > + new_npgs) != old_npgs); > + return 0; > +} > + > +static int __xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr) > +{ > + u32 frame_size = mr->frame_size, frame_headroom = mr->frame_headroom; > + u64 addr = mr->addr, size = mr->len; > + unsigned int nframes; > + int size_chk, err; > + > + if (frame_size < XDP_UMEM_MIN_FRAME_SIZE || frame_size > PAGE_SIZE) { > + /* Strictly speaking we could support this, if: > + * - huge pages, or* what does "or*" here mean? > + * - using an IOMMU, or > + * - making sure the memory area is consecutive > + * but for now, we simply say "computer says no". > + */ > + return -EINVAL; > + } > + > + if (!is_power_of_2(frame_size)) > + return -EINVAL; > + > + if (!PAGE_ALIGNED(addr)) { > + /* Memory area has to be page size aligned. For > + * simplicity, this might change. 
> + */ > + return -EINVAL; > + } > + > + if ((addr + size) < addr) > + return -EINVAL; > + > + nframes = size / frame_size; > + if (nframes == 0 || nframes > UINT_MAX) > + return -EINVAL; > + > + frame_headroom = ALIGN(frame_headroom, 64); > + > + size_chk = frame_size - frame_headroom - XDP_PACKET_HEADROOM; > + if (size_chk < 0) > + return -EINVAL; > + > + umem->pid = get_task_pid(current, PIDTYPE_PID); > + umem->size = (size_t)size; > + umem->address = (unsigned long)addr; > + umem->props.frame_size = frame_size; > + umem->props.nframes = nframes; > + umem->frame_headroom = frame_headroom; > + umem->npgs = size / PAGE_SIZE; > + umem->pgs = NULL; > + umem->user = NULL; > + > + umem->frame_size_log2 = ilog2(frame_size); > + umem->nfpp_mask = (PAGE_SIZE / frame_size) - 1; > + umem->nfpplog2 = ilog2(PAGE_SIZE / frame_size); > + atomic_set(&umem->users, 1); > + > + err = xdp_umem_account_pages(umem); > + if (err) > + goto out; > + > + err = xdp_umem_pin_pages(umem); > + if (err) > + goto out; > + return 0; > + > +out: > + put_pid(umem->pid); > + return err; > +} > + > +int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr) > +{ > + int err; > + > + if (!umem) > + return -EINVAL; > + > + down_write(&current->mm->mmap_sem); > + > + err = __xdp_umem_reg(umem, mr); > + > + up_write(&current->mm->mmap_sem); > + return err; > +} > + > diff --git a/net/xdp/xdp_umem.h b/net/xdp/xdp_umem.h > new file mode 100644 > index 000000000000..58714f4f7f25 > --- /dev/null > +++ b/net/xdp/xdp_umem.h > @@ -0,0 +1,42 @@ > +/* SPDX-License-Identifier: GPL-2.0 > + * XDP user-space packet buffer > + * Copyright(c) 2018 Intel Corporation. > + * > + * This program is free software; you can redistribute it and/or modify it > + * under the terms and conditions of the GNU General Public License, > + * version 2, as published by the Free Software Foundation. 
> + * > + * This program is distributed in the hope it will be useful, but WITHOUT > + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or > + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for > + * more details. > + */ > + > +#ifndef XDP_UMEM_H_ > +#define XDP_UMEM_H_ > + > +#include <linux/mm.h> > +#include <linux/if_xdp.h> > + > +#include "xdp_umem_props.h" > + > +struct xdp_umem { > + struct page **pgs; > + struct xdp_umem_props props; > + u32 npgs; > + u32 frame_headroom; > + u32 nfpp_mask; > + u32 nfpplog2; > + u32 frame_size_log2; > + struct user_struct *user; > + struct pid *pid; > + unsigned long address; > + size_t size; > + atomic_t users; > +}; > + > +int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr); > +void xdp_put_umem(struct xdp_umem *umem); > +int xdp_umem_create(struct xdp_umem **umem); > + > +#endif /* XDP_UMEM_H_ */ > diff --git a/net/xdp/xdp_umem_props.h b/net/xdp/xdp_umem_props.h > new file mode 100644 > index 000000000000..77fb5daf29f3 > --- /dev/null > +++ b/net/xdp/xdp_umem_props.h > @@ -0,0 +1,23 @@ > +/* SPDX-License-Identifier: GPL-2.0 > + * XDP user-space packet buffer > + * Copyright(c) 2018 Intel Corporation. > + * > + * This program is free software; you can redistribute it and/or modify it > + * under the terms and conditions of the GNU General Public License, > + * version 2, as published by the Free Software Foundation. > + * > + * This program is distributed in the hope it will be useful, but WITHOUT > + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or > + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for > + * more details. 
> + */ > + > +#ifndef XDP_UMEM_PROPS_H_ > +#define XDP_UMEM_PROPS_H_ > + > +struct xdp_umem_props { > + u32 frame_size; > + u32 nframes; > +}; > + > +#endif /* XDP_UMEM_PROPS_H_ */ > diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c > new file mode 100644 > index 000000000000..19fc719cbe0d > --- /dev/null > +++ b/net/xdp/xsk.c > @@ -0,0 +1,223 @@ > +// SPDX-License-Identifier: GPL-2.0 > +/* XDP sockets > + * > + * AF_XDP sockets allow a channel between XDP programs and userspace > + * applications. > + * Copyright(c) 2018 Intel Corporation. > + * > + * This program is free software; you can redistribute it and/or modify it > + * under the terms and conditions of the GNU General Public License, > + * version 2, as published by the Free Software Foundation. > + * > + * This program is distributed in the hope it will be useful, but WITHOUT > + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or > + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for > + * more details. 
> + * > + * Author(s): Björn Töpel <bjorn.to...@intel.com> > + * Magnus Karlsson <magnus.karls...@intel.com> > + */ > + > +#define pr_fmt(fmt) "AF_XDP: %s: " fmt, __func__ > + > +#include <linux/if_xdp.h> > +#include <linux/init.h> > +#include <linux/sched/mm.h> > +#include <linux/sched/signal.h> > +#include <linux/sched/task.h> > +#include <linux/socket.h> > +#include <linux/file.h> > +#include <linux/uaccess.h> > +#include <linux/net.h> > +#include <linux/netdevice.h> > +#include <net/sock.h> > + > +#include "xdp_umem.h" > + > +struct xdp_sock { > + /* struct sock must be the first member of struct xdp_sock */ > + struct sock sk; > + struct xdp_umem *umem; > + /* Protects multiple processes in the control path */ > + struct mutex mutex; > +}; > + > +static struct xdp_sock *xdp_sk(struct sock *sk) > +{ > + return (struct xdp_sock *)sk; > +} > + > +static int xsk_release(struct socket *sock) > +{ > + struct sock *sk = sock->sk; > + struct net *net; > + > + if (!sk) > + return 0; > + > + net = sock_net(sk); > + > + local_bh_disable(); > + sock_prot_inuse_add(net, sk->sk_prot, -1); > + local_bh_enable(); > + > + sock_orphan(sk); > + sock->sk = NULL; > + > + sk_refcnt_debug_release(sk); > + sock_put(sk); > + > + return 0; > +} > + > +static int xsk_setsockopt(struct socket *sock, int level, int optname, > + char __user *optval, unsigned int optlen) > +{ > + struct sock *sk = sock->sk; > + struct xdp_sock *xs = xdp_sk(sk); > + int err; > + > + if (level != SOL_XDP) > + return -ENOPROTOOPT; > + > + switch (optname) { > + case XDP_UMEM_REG: > + { > + struct xdp_umem_reg mr; > + struct xdp_umem *umem; > + > + if (xs->umem) > + return -EBUSY; > + > + if (copy_from_user(&mr, optval, sizeof(mr))) > + return -EFAULT; > + > + mutex_lock(&xs->mutex); > + err = xdp_umem_create(&umem); > + > + err = xdp_umem_reg(umem, &mr); > + if (err) { > + kfree(umem); > + mutex_unlock(&xs->mutex); > + return err; > + } > + > + /* Make sure umem is ready before it can be seen by others */ > + 
smp_wmb(); > + > + xs->umem = umem; > + mutex_unlock(&xs->mutex); > + return 0; > + } > + default: > + break; > + } > + > + return -ENOPROTOOPT; > +} > + > +static struct proto xsk_proto = { > + .name = "XDP", > + .owner = THIS_MODULE, > + .obj_size = sizeof(struct xdp_sock), > +}; > + > +static const struct proto_ops xsk_proto_ops = { > + .family = PF_XDP, > + .owner = THIS_MODULE, > + .release = xsk_release, > + .bind = sock_no_bind, > + .connect = sock_no_connect, > + .socketpair = sock_no_socketpair, > + .accept = sock_no_accept, > + .getname = sock_no_getname, > + .poll = sock_no_poll, > + .ioctl = sock_no_ioctl, > + .listen = sock_no_listen, > + .shutdown = sock_no_shutdown, > + .setsockopt = xsk_setsockopt, > + .getsockopt = sock_no_getsockopt, > + .sendmsg = sock_no_sendmsg, > + .recvmsg = sock_no_recvmsg, > + .mmap = sock_no_mmap, > + .sendpage = sock_no_sendpage, > +}; > + > +static void xsk_destruct(struct sock *sk) > +{ > + struct xdp_sock *xs = xdp_sk(sk); > + > + if (!sock_flag(sk, SOCK_DEAD)) > + return; > + > + xdp_put_umem(xs->umem); > + > + sk_refcnt_debug_dec(sk); > +} > + > +static int xsk_create(struct net *net, struct socket *sock, int protocol, > + int kern) > +{ > + struct sock *sk; > + struct xdp_sock *xs; > + > + if (!ns_capable(net->user_ns, CAP_NET_RAW)) > + return -EPERM; > + if (sock->type != SOCK_RAW) > + return -ESOCKTNOSUPPORT; > + > + if (protocol) > + return -EPROTONOSUPPORT; > + > + sock->state = SS_UNCONNECTED; > + > + sk = sk_alloc(net, PF_XDP, GFP_KERNEL, &xsk_proto, kern); > + if (!sk) > + return -ENOBUFS; > + > + sock->ops = &xsk_proto_ops; > + > + sock_init_data(sock, sk); > + > + sk->sk_family = PF_XDP; > + > + sk->sk_destruct = xsk_destruct; > + sk_refcnt_debug_inc(sk); > + > + xs = xdp_sk(sk); > + mutex_init(&xs->mutex); > + > + local_bh_disable(); > + sock_prot_inuse_add(net, &xsk_proto, 1); > + local_bh_enable(); > + > + return 0; > +} > + > +static const struct net_proto_family xsk_family_ops = { > + .family = PF_XDP, 
> + .create = xsk_create, > + .owner = THIS_MODULE, > +}; > + > +static int __init xsk_init(void) > +{ > + int err; > + > + err = proto_register(&xsk_proto, 0 /* no slab */); > + if (err) > + goto out; > + > + err = sock_register(&xsk_family_ops); > + if (err) > + goto out_proto; > + > + return 0; > + > +out_proto: > + proto_unregister(&xsk_proto); > +out: > + return err; > +} > + > +fs_initcall(xsk_init); > -- > 2.14.1