From: Björn Töpel <bjorn.to...@intel.com>

The XDP_MEM_REG socket option allows a process to register a window of
user space memory with the kernel. This memory will later be used as
the frame data buffer.
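
For reference, a minimal user space sketch of the intended use. This
assumes AF_XDP, SOL_XDP and XDP_MEM_REG are provided by earlier patches
in this series; the 2 MB area and 2048-byte frame size are arbitrary
example values:

  /* Hypothetical usage sketch, not part of this patch. */
  #include <linux/if_xdp.h>
  #include <sys/mman.h>
  #include <sys/socket.h>
  #include <stdio.h>

  int main(void)
  {
          size_t len = 2 * 1024 * 1024;   /* example: 2 MB data area */
          struct xdp_mr_req req;
          void *buf;
          int fd;

          fd = socket(AF_XDP, SOCK_RAW, 0);
          if (fd < 0) {
                  perror("socket");
                  return 1;
          }

          /* The area must be page aligned; mmap() guarantees that. */
          buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
                     MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
          if (buf == MAP_FAILED) {
                  perror("mmap");
                  return 1;
          }

          req.addr = (__u64)(unsigned long)buf;
          req.len = len;
          req.frame_size = 2048;  /* power of two, >= 2048, <= PAGE_SIZE */
          req.data_headroom = 0;

          if (setsockopt(fd, SOL_XDP, XDP_MEM_REG, &req, sizeof(req))) {
                  perror("setsockopt(XDP_MEM_REG)");
                  return 1;
          }

          return 0;
  }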

Signed-off-by: Björn Töpel <bjorn.to...@intel.com>
---
 include/uapi/linux/if_xdp.h |   7 ++
 net/xdp/xsk.c               | 294 +++++++++++++++++++++++++++++++++++++++++++-
 net/xdp/xsk.h               |  19 ++-
 3 files changed, 316 insertions(+), 4 deletions(-)

diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h
index cd09232e16c1..3f8c90c708b4 100644
--- a/include/uapi/linux/if_xdp.h
+++ b/include/uapi/linux/if_xdp.h
@@ -29,4 +29,11 @@ struct sockaddr_xdp {
 #define XDP_RX_RING    2
 #define XDP_TX_RING    3
 
+struct xdp_mr_req {
+       __u64   addr;           /* Start of packet data area */
+       __u64   len;            /* Length of packet data area */
+       __u32   frame_size;     /* Frame size */
+       __u32   data_headroom;  /* Frame head room */
+};
+
 #endif /* _LINUX_IF_XDP_H */
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index 2d7c08a50c60..333ce1450cc7 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -19,18 +19,235 @@
 
 #include <linux/if_xdp.h>
 #include <linux/init.h>
+#include <linux/sched/mm.h>
+#include <linux/sched/signal.h>
+#include <linux/sched/task.h>
 #include <linux/socket.h>
 #include <net/sock.h>
 
 #include "xsk.h"
 
+#define XSK_UMEM_MIN_FRAME_SIZE 2048
+
 struct xdp_sock {
        /* struct sock must be the first member of struct xdp_sock */
        struct sock sk;
+       struct xsk_umem *umem;
 };
 
+static struct xdp_sock *xdp_sk(struct sock *sk)
+{
+       return (struct xdp_sock *)sk;
+}
+
+static void xsk_umem_unpin_pages(struct xsk_umem *umem)
+{
+       unsigned int i;
+
+       if (umem->pgs) {
+               for (i = 0; i < umem->npgs; i++) {
+                       struct page *page = umem->pgs[i];
+
+                       set_page_dirty_lock(page);
+                       put_page(page);
+               }
+
+               kfree(umem->pgs);
+               umem->pgs = NULL;
+       }
+}
+
+static void xsk_umem_destroy(struct xsk_umem *umem)
+{
+       struct mm_struct *mm;
+       struct task_struct *task;
+       unsigned long diff;
+
+       if (!umem)
+               return;
+
+       xsk_umem_unpin_pages(umem);
+
+       task = get_pid_task(umem->pid, PIDTYPE_PID);
+       put_pid(umem->pid);
+       if (!task)
+               goto out;
+       mm = get_task_mm(task);
+       put_task_struct(task);
+       if (!mm)
+               goto out;
+
+       diff = umem->size >> PAGE_SHIFT;
+
+       down_write(&mm->mmap_sem);
+       mm->pinned_vm -= diff;
+       up_write(&mm->mmap_sem);
+       mmput(mm);
+out:
+       kfree(umem);
+}
+
+static struct xsk_umem *xsk_umem_create(u64 addr, u64 size, u32 frame_size,
+                                       u32 data_headroom)
+{
+       struct xsk_umem *umem;
+       unsigned int nframes;
+       int size_chk;
+
+       if (frame_size < XSK_UMEM_MIN_FRAME_SIZE || frame_size > PAGE_SIZE) {
+               /* Strictly speaking we could support this, if:
+                * - huge pages, or
+                * - using an IOMMU, or
+                * - making sure the memory area is consecutive
+                * but for now, we simply say "computer says no".
+                */
+               return ERR_PTR(-EINVAL);
+       }
+
+       if (!is_power_of_2(frame_size))
+               return ERR_PTR(-EINVAL);
+
+       if (!PAGE_ALIGNED(addr)) {
+               /* The memory area has to be page size aligned, for
+                * simplicity. This restriction might be lifted later.
+                */
+               return ERR_PTR(-EINVAL);
+       }
+
+       if ((addr + size) < addr)
+               return ERR_PTR(-EINVAL);
+
+       nframes = size / frame_size;
+       if (nframes == 0)
+               return ERR_PTR(-EINVAL);
+
+       data_headroom = ALIGN(data_headroom, 64);
+
+       size_chk = frame_size - data_headroom - XSK_KERNEL_HEADROOM;
+       if (size_chk < 0)
+               return ERR_PTR(-EINVAL);
+
+       umem = kzalloc(sizeof(*umem), GFP_KERNEL);
+       if (!umem)
+               return ERR_PTR(-ENOMEM);
+
+       umem->pid = get_task_pid(current, PIDTYPE_PID);
+       umem->size = (size_t)size;
+       umem->address = (unsigned long)addr;
+       umem->frame_size = frame_size;
+       umem->nframes = nframes;
+       umem->data_headroom = data_headroom;
+       umem->pgs = NULL;
+
+       return umem;
+}
+
+static int xsk_umem_pin_pages(struct xsk_umem *umem)
+{
+       unsigned int gup_flags = FOLL_WRITE;
+       long npgs;
+       int err;
+
+       /* XXX Fix so that we don't always pin.
+        * "copy to user" from interrupt context, but how?
+        */
+       umem->pgs = kcalloc(umem->npgs, sizeof(*umem->pgs), GFP_ATOMIC);
+       if (!umem->pgs)
+               return -ENOMEM;
+
+       npgs = get_user_pages(umem->address, umem->npgs,
+                             gup_flags, &umem->pgs[0], NULL);
+       if (npgs != umem->npgs) {
+               if (npgs >= 0) {
+                       umem->npgs = npgs;
+                       err = -ENOMEM;
+                       goto out_pin;
+               }
+               err = npgs;
+               goto out_pgs;
+       }
+
+       return 0;
+
+out_pin:
+       xsk_umem_unpin_pages(umem);
+out_pgs:
+       kfree(umem->pgs);
+       umem->pgs = NULL;
+
+       return err;
+}
+
+static struct xsk_umem *xsk_mem_reg(u64 addr, u64 size, u32 frame_size,
+                                   u32 data_headroom)
+{
+       unsigned long lock_limit, locked, npages;
+       int ret = 0;
+       struct xsk_umem *umem;
+
+       if (!can_do_mlock())
+               return ERR_PTR(-EPERM);
+
+       umem = xsk_umem_create(addr, size, frame_size, data_headroom);
+       if (IS_ERR(umem))
+               return umem;
+
+       npages = PAGE_ALIGN(umem->nframes * umem->frame_size) >> PAGE_SHIFT;
+
+       down_write(&current->mm->mmap_sem);
+
+       locked = npages + current->mm->pinned_vm;
+       lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+
+       if (locked > lock_limit && !capable(CAP_IPC_LOCK)) {
+               ret = -ENOMEM;
+               goto out;
+       }
+
+       if (npages == 0 || npages > UINT_MAX) {
+               ret = -EINVAL;
+               goto out;
+       }
+       umem->npgs = npages;
+
+       ret = xsk_umem_pin_pages(umem);
+
+out:
+       if (ret < 0) {
+               put_pid(umem->pid);
+               kfree(umem);
+       } else {
+               current->mm->pinned_vm = locked;
+       }
+
+       up_write(&current->mm->mmap_sem);
+
+       return ret < 0 ? ERR_PTR(ret) : umem;
+}
+
 static int xsk_release(struct socket *sock)
 {
+       struct sock *sk = sock->sk;
+       struct xdp_sock *xs = xdp_sk(sk);
+       struct net *net;
+
+       if (!sk)
+               return 0;
+
+       net = sock_net(sk);
+
+       local_bh_disable();
+       sock_prot_inuse_add(net, sk->sk_prot, -1);
+       local_bh_enable();
+
+       xsk_umem_destroy(xs->umem);
+
+       sock_orphan(sk);
+       sock->sk = NULL;
+
+       sk_refcnt_debug_release(sk);
+       sock_put(sk);
+
        return 0;
 }
 
@@ -48,6 +265,43 @@ static unsigned int xsk_poll(struct file *file, struct socket *sock,
 static int xsk_setsockopt(struct socket *sock, int level, int optname,
                          char __user *optval, unsigned int optlen)
 {
+       struct sock *sk = sock->sk;
+       struct xdp_sock *xs = xdp_sk(sk);
+
+       if (level != SOL_XDP)
+               return -ENOPROTOOPT;
+
+       switch (optname) {
+       case XDP_MEM_REG:
+       {
+               struct xdp_mr_req req;
+               struct xsk_umem *umem;
+
+               if (optlen < sizeof(req))
+                       return -EINVAL;
+               if (copy_from_user(&req, optval, sizeof(req)))
+                       return -EFAULT;
+
+               umem = xsk_mem_reg(req.addr, req.len, req.frame_size,
+                                  req.data_headroom);
+               if (IS_ERR(umem))
+                       return PTR_ERR(umem);
+
+               lock_sock(sk);
+               if (xs->umem) { /* XXX create and check afterwards... really? */
+                       release_sock(sk);
+                       xsk_umem_destroy(umem);
+                       return -EBUSY;
+               }
+               xs->umem = umem;
+               release_sock(sk);
+
+               return 0;
+       }
+       default:
+               break;
+       }
+
        return -ENOPROTOOPT;
 }
 
@@ -97,10 +351,48 @@ static const struct proto_ops xsk_proto_ops = {
        /* the rest vvv, OK to be missing implementation -- checked against NULL. */
 };
 
+static void xsk_destruct(struct sock *sk)
+{
+       if (!sock_flag(sk, SOCK_DEAD))
+               return;
+
+       sk_refcnt_debug_dec(sk);
+}
+
 static int xsk_create(struct net *net, struct socket *sock, int protocol,
                      int kern)
 {
-       return -EOPNOTSUPP;
+       struct sock *sk;
+
+       if (!ns_capable(net->user_ns, CAP_NET_RAW))
+               return -EPERM;
+       if (sock->type != SOCK_RAW)
+               return -ESOCKTNOSUPPORT;
+
+       /* XXX Require ETH_P_IP? Something else? */
+       if (protocol)
+               return -EPROTONOSUPPORT;
+
+       sock->state = SS_UNCONNECTED;
+
+       sk = sk_alloc(net, PF_XDP, GFP_KERNEL, &xsk_proto, kern);
+       if (!sk)
+               return -ENOBUFS;
+
+       sock->ops = &xsk_proto_ops;
+
+       sock_init_data(sock, sk);
+
+       sk->sk_family = PF_XDP;
+
+       sk->sk_destruct = xsk_destruct;
+       sk_refcnt_debug_inc(sk);
+
+       local_bh_disable();
+       sock_prot_inuse_add(net, &xsk_proto, 1);
+       local_bh_enable();
+
+       return 0;
 }
 
 static const struct net_proto_family xsk_family_ops = {
diff --git a/net/xdp/xsk.h b/net/xdp/xsk.h
index 441f8d00a9d5..71559374645b 100644
--- a/net/xdp/xsk.h
+++ b/net/xdp/xsk.h
@@ -12,7 +12,20 @@
  * more details.
  */
 
-#ifndef _LINUX_XDPSOCK_H
-#define _LINUX_XDPSOCK_H
+#ifndef _LINUX_XSK_H
+#define _LINUX_XSK_H
 
-#endif /* _LINUX_XDPSOCK_H */
+#define XSK_KERNEL_HEADROOM 256 /* Headroom for XDP */
+
+struct xsk_umem {
+       struct pid *pid;
+       struct page **pgs;
+       unsigned long address;
+       size_t size;
+       u32 npgs;
+       u32 frame_size;
+       u32 nframes;
+       u32 data_headroom;
+};
+
+#endif /* _LINUX_XSK_H */
-- 
2.14.1
