From: Björn Töpel <bjorn.to...@intel.com>

Here, the PACKET_MEMREG setsockopt is implemented for the AF_PACKET
protocol family. PACKET_MEMREG allows the user to register memory
regions that can be used by AF_PACKET V4 as packet data buffers.

Signed-off-by: Björn Töpel <bjorn.to...@intel.com>
---
 include/linux/tpacket4.h | 101 +++++++++++++++++++++++++++++
 net/packet/af_packet.c   | 163 +++++++++++++++++++++++++++++++++++++++++++++++
 net/packet/internal.h    |   4 ++
 3 files changed, 268 insertions(+)
 create mode 100644 include/linux/tpacket4.h

diff --git a/include/linux/tpacket4.h b/include/linux/tpacket4.h
new file mode 100644
index 000000000000..fcf4c333c78d
--- /dev/null
+++ b/include/linux/tpacket4.h
@@ -0,0 +1,101 @@
+/*
+ *  tpacket v4
+ *  Copyright(c) 2017 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ */
+
+#ifndef _LINUX_TPACKET4_H
+#define _LINUX_TPACKET4_H
+
+#define TP4_UMEM_MIN_FRAME_SIZE 2048
+#define TP4_KERNEL_HEADROOM 256 /* Headroom for XDP */
+
+struct tp4_umem {
+       struct pid *pid; /* pid of the task that created the umem */
+       struct page **pgs; /* pages obtained via get_user_pages() */
+       unsigned int npgs; /* number of entries in pgs */
+       size_t size; /* size of the umem */
+       unsigned long address; /* page-aligned start address */
+       unsigned int frame_size; /* power of two, 2K up to PAGE_SIZE */
+       unsigned int frame_size_log2; /* ilog2(frame_size) */
+       unsigned int nframes; /* size / frame_size */
+       unsigned int nfpplog2; /* num frames per page in log2 */
+       unsigned int data_headroom; /* rounded up to a multiple of 64 */
+};
+
+/*************** V4 QUEUE OPERATIONS *******************************/
+
+/**
+ * tp4q_umem_new - Creates a new umem (packet buffer)
+ *
+ * @addr: The page-aligned start address of the umem
+ * @size: The size of the umem, holding at most UINT_MAX frames
+ * @frame_size: The size of each frame, between 2K and PAGE_SIZE
+ * @data_headroom: The desired data headroom before start of the packet
+ *
+ * Returns a pointer to the new umem, or an ERR_PTR() on failure
+ **/
+static inline struct tp4_umem *tp4q_umem_new(unsigned long addr, size_t size,
+                                            unsigned int frame_size,
+                                            unsigned int data_headroom)
+{
+       struct tp4_umem *umem;
+       unsigned int nframes;
+
+       if (frame_size < TP4_UMEM_MIN_FRAME_SIZE || frame_size > PAGE_SIZE) {
+               /* Strictly speaking we could support this, if:
+                * - huge pages, or
+                * - using an IOMMU, or
+                * - making sure the memory area is consecutive
+                * but for now, we simply say "computer says no".
+                */
+               return ERR_PTR(-EINVAL);
+       }
+
+       if (!is_power_of_2(frame_size))
+               return ERR_PTR(-EINVAL);
+
+       if (!PAGE_ALIGNED(addr)) {
+               /* Memory area has to be page size aligned. For
+                * simplicity, this might change.
+                */
+               return ERR_PTR(-EINVAL);
+       }
+
+       if ((addr + size) < addr || size / frame_size > UINT_MAX)
+               return ERR_PTR(-EINVAL);
+
+       nframes = size / frame_size;
+       if (nframes == 0)
+               return ERR_PTR(-EINVAL);
+
+       /* Unsigned operands: compare before ALIGN() so nothing can wrap. */
+       if (data_headroom > frame_size - TP4_KERNEL_HEADROOM)
+               return ERR_PTR(-EINVAL);
+       data_headroom = ALIGN(data_headroom, 64);
+
+       umem = kzalloc(sizeof(*umem), GFP_KERNEL);
+       if (!umem)
+               return ERR_PTR(-ENOMEM);
+
+       umem->pid = get_task_pid(current, PIDTYPE_PID);
+       umem->size = size;
+       umem->address = addr;
+       umem->frame_size = frame_size;
+       umem->frame_size_log2 = ilog2(frame_size);
+       umem->nframes = nframes;
+       umem->nfpplog2 = ilog2(PAGE_SIZE / frame_size);
+       umem->data_headroom = data_headroom;
+
+       return umem;
+}
+
+#endif /* _LINUX_TPACKET4_H */
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 9603f6ff17a4..b39be424ec0e 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -89,11 +89,15 @@
 #include <linux/errqueue.h>
 #include <linux/net_tstamp.h>
 #include <linux/percpu.h>
+#include <linux/log2.h>
 #ifdef CONFIG_INET
 #include <net/inet_common.h>
 #endif
 #include <linux/bpf.h>
 #include <net/compat.h>
+#include <linux/sched/mm.h>
+#include <linux/sched/task.h>
+#include <linux/sched/signal.h>
 
 #include "internal.h"
 
@@ -2975,6 +2979,132 @@ static int packet_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
                return packet_snd(sock, msg, len);
 }
 
+static void
+packet_umem_unpin_pages(struct tp4_umem *umem)
+{
+       unsigned int idx = 0;
+
+       /* Mark each pinned page dirty, then drop the reference on it. */
+       while (idx < umem->npgs) {
+               set_page_dirty_lock(umem->pgs[idx]);
+               put_page(umem->pgs[idx]);
+               idx++;
+       }
+       kfree(umem->pgs);
+       umem->pgs = NULL;
+}
+
+/* Unpin the pages, uncharge them from the owner's pinned_vm, free umem. */
+static void
+packet_umem_free(struct tp4_umem *umem)
+{
+       /* Uncharge exactly the page count that packet_umem_new() charged. */
+       unsigned long diff = umem->npgs;
+       struct task_struct *task;
+       struct mm_struct *mm;
+
+       packet_umem_unpin_pages(umem);
+
+       task = get_pid_task(umem->pid, PIDTYPE_PID);
+       put_pid(umem->pid);
+       if (!task)
+               goto out;
+       mm = get_task_mm(task);
+       put_task_struct(task);
+       if (!mm)
+               goto out;
+
+       down_write(&mm->mmap_sem);
+       mm->pinned_vm -= diff;
+       up_write(&mm->mmap_sem);
+       mmput(mm);
+out:
+       kfree(umem);
+}
+
+/* Pin a new umem's pages and charge pinned_vm; ERR_PTR() on failure. */
+static struct tp4_umem *
+packet_umem_new(unsigned long addr, size_t size, unsigned int frame_size,
+               unsigned int data_headroom)
+{
+       unsigned long lock_limit, locked, npages;
+       unsigned int gup_flags = FOLL_WRITE;
+       struct page **page_list;
+       struct tp4_umem *umem;
+       int i, ret;
+
+       if (!can_do_mlock())
+               return ERR_PTR(-EPERM);
+
+       umem = tp4q_umem_new(addr, size, frame_size, data_headroom);
+       if (IS_ERR(umem))
+               return umem;
+
+       page_list = (struct page **)__get_free_page(GFP_KERNEL);
+       if (!page_list) {
+               put_pid(umem->pid);
+               kfree(umem);
+               return ERR_PTR(-ENOMEM);
+       }
+
+       npages = umem->nframes; /* widen: the 32-bit product can wrap */
+       npages = PAGE_ALIGN(npages * umem->frame_size) >> PAGE_SHIFT;
+
+       down_write(&current->mm->mmap_sem);
+
+       locked = npages + current->mm->pinned_vm;
+       lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+
+       if (locked > lock_limit && !capable(CAP_IPC_LOCK)) {
+               ret = -ENOMEM;
+               goto out;
+       }
+
+       if (npages == 0 || npages > UINT_MAX) {
+               ret = -EINVAL;
+               goto out;
+       }
+
+       umem->pgs = kcalloc(npages, sizeof(*umem->pgs), GFP_KERNEL);
+       if (!umem->pgs) {
+               ret = -ENOMEM;
+               goto out;
+       }
+
+       while (npages) {
+               ret = get_user_pages(addr,
+                                    min_t(unsigned long, npages,
+                                          PAGE_SIZE / sizeof(struct page *)),
+                                    gup_flags, page_list, NULL);
+
+               if (ret < 0)
+                       goto out;
+
+               addr += ret * PAGE_SIZE;
+               npages -= ret;
+
+               for (i = 0; i < ret; i++)
+                       umem->pgs[umem->npgs++] = page_list[i];
+       }
+
+       ret = 0;
+
+out:
+       if (ret < 0) {
+               /* No-op if nothing was pinned yet (npgs == 0, pgs NULL). */
+               packet_umem_unpin_pages(umem);
+               put_pid(umem->pid);
+               kfree(umem);
+       } else {
+               current->mm->pinned_vm = locked;
+       }
+
+       up_write(&current->mm->mmap_sem);
+       free_page((unsigned long)page_list);
+
+       return ret < 0 ? ERR_PTR(ret) : umem;
+}
+
 /*
  *     Close a PACKET socket. This is fairly simple. We immediately go
  *     to 'closed' state and remove our protocol entry in the device list.
@@ -3024,6 +3154,11 @@ static int packet_release(struct socket *sock)
                packet_set_ring(sk, &req_u, 1, 1);
        }
 
+       if (po->umem) {
+               packet_umem_free(po->umem);
+               po->umem = NULL;
+       }
+
        f = fanout_release(sk);
 
        synchronize_net();
@@ -3828,6 +3963,31 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv
                po->xmit = val ? packet_direct_xmit : dev_queue_xmit;
                return 0;
        }
+       case PACKET_MEMREG:
+       {
+               struct tpacket_memreg_req req;
+               struct tp4_umem *umem;
+
+               if (optlen < sizeof(req))
+                       return -EINVAL;
+               if (copy_from_user(&req, optval, sizeof(req)))
+                       return -EFAULT;
+
+               umem = packet_umem_new(req.addr, req.len, req.frame_size,
+                                      req.data_headroom);
+               if (IS_ERR(umem))
+                       return PTR_ERR(umem);
+
+               lock_sock(sk);
+               if (po->umem) {
+                       release_sock(sk);
+                       packet_umem_free(umem);
+                       return -EBUSY;
+               }
+               po->umem = umem;
+               release_sock(sk);
+               return 0;
+       }
        default:
                return -ENOPROTOOPT;
        }
@@ -4245,6 +4405,9 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
                case TPACKET_V3:
                        po->tp_hdrlen = TPACKET3_HDRLEN;
                        break;
+               default:
+                       err = -EINVAL;
+                       goto out;
                }
 
                err = -EINVAL;
diff --git a/net/packet/internal.h b/net/packet/internal.h
index 94d1d405a116..9c07cfe1b8a3 100644
--- a/net/packet/internal.h
+++ b/net/packet/internal.h
@@ -2,6 +2,7 @@
 #define __PACKET_INTERNAL_H__
 
 #include <linux/refcount.h>
+#include <linux/tpacket4.h>
 
 struct packet_mclist {
        struct packet_mclist    *next;
@@ -109,6 +110,9 @@ struct packet_sock {
        union  tpacket_stats_u  stats;
        struct packet_ring_buffer       rx_ring;
        struct packet_ring_buffer       tx_ring;
+
+       struct tp4_umem                 *umem;
+
        int                     copy_thresh;
        spinlock_t              bind_lock;
        struct mutex            pg_vec_lock;
-- 
2.11.0

Reply via email to