From: Jonathan Lemon <b...@fb.com>

This provides the interface between the netgpu core module and the
Nvidia kernel driver.  It should be built as an external module,
pointing at the Nvidia driver package and the target kernel build
directory.  For example:

export NV_PACKAGE_DIR=/w/nvidia/NVIDIA-Linux-x86_64-440.64
make -C ${kdir} M=`pwd` O=obj $*

Signed-off-by: Jonathan Lemon <jonathan.le...@gmail.com>
---
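[ Not part of the patch: a small userspace sketch of the VA -> DMA
  translation done by netgpu_cuda_get_dma() below, for reviewers.  The
  region is covered by 64K GPU pages, dma[] holds one DMA address per GPU
  page, and the low bits of the VA give the offset within that page.  All
  addresses here are made up for illustration. ]

  #include <stdio.h>
  #include <stdint.h>

  #define GPU_PAGE_SHIFT 16
  #define GPU_PAGE_MASK  ((1UL << GPU_PAGE_SHIFT) - 1)

  /* one toy DMA address per 64K GPU page of the region */
  static const uint64_t dma[] = { 0xfa000000, 0xfb000000, 0xfc000000 };

  static uint64_t get_dma(uint64_t start, uint64_t addr)
  {
          uint64_t base = addr - start;           /* offset into the region */

          return dma[base >> GPU_PAGE_SHIFT] +    /* GPU page base ... */
                 (base & GPU_PAGE_MASK);          /* ... plus in-page offset */
  }

  int main(void)
  {
          uint64_t start = 0x7f0000000000ULL;     /* region->start */

          /* 0x10 bytes into the second GPU page -> 0xfb000010 */
          printf("%#llx\n",
                 (unsigned long long)get_dma(start, start + 0x10010));
          return 0;
  }
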
 drivers/misc/netgpu/nvidia/Kbuild        |   9 +
 drivers/misc/netgpu/nvidia/Kconfig       |  10 +
 drivers/misc/netgpu/nvidia/netgpu_cuda.c | 416 +++++++++++++++++++++++
 3 files changed, 435 insertions(+)
 create mode 100644 drivers/misc/netgpu/nvidia/Kbuild
 create mode 100644 drivers/misc/netgpu/nvidia/Kconfig
 create mode 100644 drivers/misc/netgpu/nvidia/netgpu_cuda.c

diff --git a/drivers/misc/netgpu/nvidia/Kbuild b/drivers/misc/netgpu/nvidia/Kbuild
new file mode 100644
index 000000000000..10a3b3156f30
--- /dev/null
+++ b/drivers/misc/netgpu/nvidia/Kbuild
@@ -0,0 +1,9 @@
+# SPDX-License-Identifier: GPL-2.0-only
+
+nv_dir = $(NV_PACKAGE_DIR)/kernel
+
+KBUILD_EXTRA_SYMBOLS = $(nv_dir)/Module.symvers
+
+obj-m := netgpu_cuda.o
+
+ccflags-y += -I$(nv_dir)
diff --git a/drivers/misc/netgpu/nvidia/Kconfig b/drivers/misc/netgpu/nvidia/Kconfig
new file mode 100644
index 000000000000..6bb8be158943
--- /dev/null
+++ b/drivers/misc/netgpu/nvidia/Kconfig
@@ -0,0 +1,10 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# NetGPU framework
+#
+
+config NETGPU_CUDA
+       tristate "Network/GPU driver for Nvidia"
+       depends on NETGPU && m
+       help
+         Experimental Network / GPU driver for Nvidia
diff --git a/drivers/misc/netgpu/nvidia/netgpu_cuda.c b/drivers/misc/netgpu/nvidia/netgpu_cuda.c
new file mode 100644
index 000000000000..2cd93dab52ad
--- /dev/null
+++ b/drivers/misc/netgpu/nvidia/netgpu_cuda.c
@@ -0,0 +1,416 @@
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/uio.h>
+#include <linux/errno.h>
+#include <linux/netdevice.h>
+#include <linux/pci.h>
+#include <linux/memory.h>
+#include <linux/interval_tree.h>
+
+#include <net/netgpu.h>
+#include "../netgpu_priv.h"
+
+#include "nvidia/nv-p2p.h"
+
+/* nvidia GPU uses 64K pages */
+#define GPU_PAGE_SHIFT         16
+#define GPU_PAGE_SIZE          (1UL << GPU_PAGE_SHIFT)
+#define GPU_PAGE_MASK          (GPU_PAGE_SIZE - 1)
+
+struct netgpu_cuda_region {
+       struct netgpu_region r;                         /* must be first */
+       struct rb_root_cached root;
+       struct nvidia_p2p_page_table *gpu_pgtbl;
+};
+
+struct netgpu_cuda_dmamap {
+       struct netgpu_dmamap map;                       /* must be first */
+       unsigned pg_shift;
+       unsigned long pg_mask;
+       u64 *dma;
+       struct nvidia_p2p_dma_mapping *gpu_map;
+};
+
+/* page_range represents one contiguous GPU PA region */
+struct netgpu_page_range {
+       unsigned long pfn;
+       struct resource *res;
+       struct interval_tree_node va_node;
+};
+
+static int nvidia_pg_shift[] = {
+       [NVIDIA_P2P_PAGE_SIZE_4KB]   = 12,
+       [NVIDIA_P2P_PAGE_SIZE_64KB]  = 16,
+       [NVIDIA_P2P_PAGE_SIZE_128KB] = 17,
+};
+
+#define node2page_range(itn) \
+       container_of(itn, struct netgpu_page_range, va_node)
+
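+/* remove and hand back each node overlapping [first, last], one at a time */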
+#define region_remove_each(root, first, last, itn)                     \
+       while ((itn = interval_tree_iter_first(root, first, last)) &&   \
+              (interval_tree_remove(itn, root), 1))
+
+#define cuda_region_remove_each(r, itn)                                \
+       region_remove_each(&cuda_region(r)->root, r->start,             \
+                          r->start + (r->nr_pages << PAGE_SHIFT) - 1,  \
+                          itn)
+
+static inline struct netgpu_cuda_region *
+cuda_region(struct netgpu_region *r)
+{
+       return (struct netgpu_cuda_region *)r;
+}
+
+static inline struct netgpu_cuda_dmamap *
+cuda_map(struct netgpu_dmamap *map)
+{
+       return (struct netgpu_cuda_dmamap *)map;
+}
+
+static inline struct netgpu_page_range *
+region_find(struct netgpu_region *r, unsigned long start, int count)
+{
+       struct interval_tree_node *itn;
+       unsigned long last;
+
+       last = start + count * PAGE_SIZE - 1;
+
+       itn = interval_tree_iter_first(&cuda_region(r)->root, start, last);
+       return itn ? node2page_range(itn) : NULL;
+}
+
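+/* translate a VA inside the mapping into its DMA address */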
+static dma_addr_t
+netgpu_cuda_get_dma(struct netgpu_dmamap *map, unsigned long addr)
+{
+       unsigned long base, idx;
+
+       base = addr - map->start;
+       idx = base >> cuda_map(map)->pg_shift;
+       return cuda_map(map)->dma[idx] + (base & cuda_map(map)->pg_mask);
+}
+
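+/* return the host page and DMA address for addr; takes a page reference */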
+static int
+netgpu_cuda_get_page(struct netgpu_dmamap *map, unsigned long addr,
+                    struct page **page, dma_addr_t *dma)
+{
+       struct netgpu_page_range *pr;
+       unsigned long idx;
+
+       pr = region_find(map->r, addr, 1);
+       if (!pr)
+               return -EFAULT;
+       idx = (addr - pr->va_node.start) >> PAGE_SHIFT;
+
+       *page = pfn_to_page(pr->pfn + idx);
+       get_page(*page);
+       *dma = netgpu_cuda_get_dma(map, addr);
+
+       return 0;
+}
+
+static void
+region_get_pages(struct page **pages, unsigned long pfn, int n)
+{
+       struct page *p;
+       int i;
+
+       for (i = 0; i < n; i++) {
+               p = pfn_to_page(pfn + i);
+               get_page(p);
+               pages[i] = p;
+       }
+}
+
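+/*
+ * Fill pages[] with up to count refcounted pages starting at addr, limited
+ * to one physically contiguous range.  Returns the number of pages filled.
+ */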
+static int
+netgpu_cuda_get_pages(struct netgpu_region *r, struct page **pages,
+                     unsigned long addr, int count)
+{
+       struct netgpu_page_range *pr;
+       unsigned long idx, end;
+       int n;
+
+       pr = region_find(r, addr, count);
+       if (!pr)
+               return -EFAULT;
+
+       idx = (addr - pr->va_node.start) >> PAGE_SHIFT;
+       end = (pr->va_node.last - pr->va_node.start) >> PAGE_SHIFT;
+       n = end - idx + 1;
+       n = min(count, n);
+
+       region_get_pages(pages, pr->pfn + idx, n);
+
+       return n;
+}
+
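+/* tear down the DMA mapping created by netgpu_cuda_map_region() */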
+static void
+netgpu_cuda_unmap_region(struct netgpu_dmamap *map)
+{
+       struct pci_dev *pdev;
+       int err;
+
+       pdev = cuda_map(map)->gpu_map->pci_dev;
+
+       err = nvidia_p2p_dma_unmap_pages(pdev, cuda_region(map->r)->gpu_pgtbl,
+                                        cuda_map(map)->gpu_map);
+       if (err)
+               pr_err("nvidia_p2p_dma_unmap failed: %d\n", err);
+}
+
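+/* DMA-map the pinned GPU pages for the given PCI device */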
+static struct netgpu_dmamap *
+netgpu_cuda_map_region(struct netgpu_region *r, struct device *device)
+{
+       struct netgpu_cuda_region *cr = cuda_region(r);
+       struct nvidia_p2p_dma_mapping *gpu_map;
+       struct netgpu_dmamap *map;
+       struct pci_dev *pdev;
+       int err;
+
+       map = kmalloc(sizeof(struct netgpu_cuda_dmamap), GFP_KERNEL);
+       if (!map)
+               return ERR_PTR(-ENOMEM);
+
+       pdev = to_pci_dev(device);
+
+       /*
+        * takes PA from pgtbl, performs mapping, saves mapping
+        * dma_mapping holds dma mapped addresses, and pdev.
+        * mem_info contains pgtbl and mapping list.  mapping is added to list.
+        * rm_p2p_dma_map_pages() does the work.
+        */
+       err = nvidia_p2p_dma_map_pages(pdev, cr->gpu_pgtbl, &gpu_map);
+       if (err) {
+               kfree(map);
+               return ERR_PTR(err);
+       }
+
+       cuda_map(map)->gpu_map = gpu_map;
+       cuda_map(map)->dma = gpu_map->dma_addresses;
+       cuda_map(map)->pg_shift = nvidia_pg_shift[gpu_map->page_size_type];
+       cuda_map(map)->pg_mask = (1UL << cuda_map(map)->pg_shift) - 1;
+
+       return map;
+}
+
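+/* hotplug struct pages covering the GPU physical range [start, end) */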
+static struct resource *
+netgpu_add_pages(int nid, u64 start, u64 end)
+{
+       struct mhp_params params = { .pgprot = PAGE_KERNEL };
+
+       return add_memory_pages(nid, start, end - start, &params);
+}
+
+static void
+netgpu_free_pages(struct resource *res)
+{
+       release_memory_pages(res);
+}
+
+static void
+netgpu_free_page_range(struct netgpu_page_range *pr)
+{
+       unsigned long pfn, pfn_end;
+       struct page *page;
+
+       pfn_end = pr->pfn +
+                 ((pr->va_node.last + 1 - pr->va_node.start) >> PAGE_SHIFT);
+
+       /* XXX verify page count is 2! */
+       for (pfn = pr->pfn; pfn < pfn_end; pfn++) {
+               page = pfn_to_page(pfn);
+               set_page_count(page, 0);
+       }
+       netgpu_free_pages(pr->res);
+       kfree(pr);
+}
+
+static void
+netgpu_cuda_release_pages(struct netgpu_region *r)
+{
+       struct interval_tree_node *va_node;
+
+       cuda_region_remove_each(r, va_node)
+               netgpu_free_page_range(node2page_range(va_node));
+}
+
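+/* initialize the new struct pages, recording the owning VA in page_private */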
+static void
+netgpu_init_pages(u64 va, unsigned long pfn_start, unsigned long pfn_end)
+{
+       unsigned long pfn;
+       struct page *page;
+
+       for (pfn = pfn_start; pfn < pfn_end; pfn++) {
+               page = pfn_to_page(pfn);
+               mm_zero_struct_page(page);
+
+               set_page_count(page, 2);        /* matches host logic */
+               page->page_type = 7;            /* XXX differential flag */
+               __SetPageReserved(page);
+
+               SetPagePrivate(page);
+               set_page_private(page, va);
+               va += PAGE_SIZE;
+       }
+}
+
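+/*
+ * Back one physically contiguous GPU range [start, end) with struct pages
+ * and index it by VA in the region's interval tree.
+ */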
+static int
+netgpu_add_page_range(struct netgpu_region *r, u64 va, u64 start, u64 end)
+{
+       struct netgpu_page_range *pr;
+       struct resource *res;
+
+       pr = kmalloc(sizeof(*pr), GFP_KERNEL);
+       if (!pr)
+               return -ENOMEM;
+
+       res = netgpu_add_pages(numa_mem_id(), start, end);
+       if (IS_ERR(res)) {
+               kfree(pr);
+               return PTR_ERR(res);
+       }
+
+       pr->pfn = PHYS_PFN(start);
+       pr->va_node.start = va;
+       pr->va_node.last = va + (end - start) - 1;
+       pr->res = res;
+
+       netgpu_init_pages(va, PHYS_PFN(start), PHYS_PFN(end));
+
+       interval_tree_insert(&pr->va_node, &cuda_region(r)->root);
+
+       return 0;
+}
+
+static void
+netgpu_cuda_pgtbl_cb(void *data)
+{
+       struct netgpu_region *r = data;
+
+       /* This is required - nvidia gets unhappy if the page table is
+        * freed from the page table callback.
+        */
+       cuda_region(r)->gpu_pgtbl = NULL;
+       netgpu_detach_region(r);
+}
+
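+/*
+ * Pin the GPU VA range described by iov and build a page range for each
+ * physically contiguous run of GPU pages.
+ */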
+static struct netgpu_region *
+netgpu_cuda_add_region(struct netgpu_mem *mem, const struct iovec *iov)
+{
+       struct nvidia_p2p_page_table *gpu_pgtbl = NULL;
+       u64 va, pa, len, start, end;
+       struct netgpu_region *r;
+       int err, i, gpu_pgsize;
+
+       err = -ENOMEM;
+       r = kzalloc(sizeof(struct netgpu_cuda_region), GFP_KERNEL);
+       if (!r)
+               return ERR_PTR(err);
+
+       start = (u64)iov->iov_base;
+       r->start = round_down(start, GPU_PAGE_SIZE);
+       len = round_up(start - r->start + iov->iov_len, GPU_PAGE_SIZE);
+       r->nr_pages = len >> PAGE_SHIFT;
+
+       r->mem = mem;
+       INIT_LIST_HEAD(&r->ctx_list);
+       INIT_LIST_HEAD(&r->dma_list);
+       spin_lock_init(&r->lock);
+
+       /*
+        * allocates page table, sets gpu_uuid to owning gpu.
+        * allocates page array, set PA for each page.
+        * sets page_size (64K here)
+        * rm_p2p_get_pages() does the actual work.
+        */
+       err = nvidia_p2p_get_pages(0, 0, r->start, len, &gpu_pgtbl,
+                                  netgpu_cuda_pgtbl_cb, r);
+       if (err)
+               goto out;
+
+       /* gpu pgtbl owns r, will free via netgpu_cuda_pgtbl_cb */
+       cuda_region(r)->gpu_pgtbl = gpu_pgtbl;
+
+       if (!NVIDIA_P2P_PAGE_TABLE_VERSION_COMPATIBLE(gpu_pgtbl)) {
+               pr_err("incompatible page table\n");
+               err = -EINVAL;
+               goto out;
+       }
+
+       gpu_pgsize = 1UL << nvidia_pg_shift[gpu_pgtbl->page_size];
+       if (r->nr_pages != gpu_pgtbl->entries * gpu_pgsize / PAGE_SIZE) {
+               pr_err("GPU page count %ld != host page count %ld\n",
+                      gpu_pgtbl->entries * gpu_pgsize / PAGE_SIZE,
+                      r->nr_pages);
+               err = -EINVAL;
+               goto out;
+       }
+
+       start = U64_MAX;
+       end = 0;
+
+       for (i = 0; i < gpu_pgtbl->entries; i++) {
+               pa = gpu_pgtbl->pages[i]->physical_address;
+               if (pa != end) {
+                       if (end) {
+                               err = netgpu_add_page_range(r, va, start, end);
+                               if (err)
+                                       goto out;
+                       }
+                       start = pa;
+                       va = r->start + i * gpu_pgsize;
+               }
+               end = pa + gpu_pgsize;
+       }
+       err = netgpu_add_page_range(r, va, start, end);
+       if (err)
+               goto out;
+
+       return r;
+
+out:
+       netgpu_cuda_release_pages(r);
+       if (gpu_pgtbl)
+               nvidia_p2p_put_pages(0, 0, r->start, gpu_pgtbl);
+       kfree(r);
+
+       return ERR_PTR(err);
+}
+
+static void
+netgpu_cuda_free_region(struct netgpu_mem *mem, struct netgpu_region *r)
+{
+       netgpu_cuda_release_pages(r);
+       if (cuda_region(r)->gpu_pgtbl)
+               nvidia_p2p_put_pages(0, 0, r->start, cuda_region(r)->gpu_pgtbl);
+       kfree(r);
+}
+
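+/* memory provider hooks registered with the netgpu core */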
+struct netgpu_ops cuda_ops = {
+       .owner          = THIS_MODULE,
+       .memtype        = NETGPU_MEMTYPE_CUDA,
+       .add_region     = netgpu_cuda_add_region,
+       .free_region    = netgpu_cuda_free_region,
+       .map_region     = netgpu_cuda_map_region,
+       .unmap_region   = netgpu_cuda_unmap_region,
+       .get_dma        = netgpu_cuda_get_dma,
+       .get_page       = netgpu_cuda_get_page,
+       .get_pages      = netgpu_cuda_get_pages,
+};
+
+static int __init
+netgpu_cuda_init(void)
+{
+       return netgpu_register(&cuda_ops);
+}
+
+static void __exit
+netgpu_cuda_fini(void)
+{
+       netgpu_unregister(cuda_ops.memtype);
+}
+
+module_init(netgpu_cuda_init);
+module_exit(netgpu_cuda_fini);
+MODULE_LICENSE("GPL v2");
-- 
2.24.1
