The memory region(MR) is very similar to RDMA memory region, however
much simpler. It allows userspace to register a bulky memory to kernel,
which would pin all pages from that memory and returns a reference back
to userspace. Userspace can then just pass the descriptor(start address,
length) of a buffer allocated from the registered MR, together with
returned reference for that MR to kernel to complete data transfer to/from
kernel.

This feature will be used for implementing pcs_krpc.

Signed-off-by: Liu Kui <kui....@virtuozzo.com>
---
 fs/fuse/kio/pcs/pcs_mr.c | 210 +++++++++++++++++++++++++++++++++++++++
 fs/fuse/kio/pcs/pcs_mr.h |  64 ++++++++++++
 2 files changed, 274 insertions(+)
 create mode 100644 fs/fuse/kio/pcs/pcs_mr.c
 create mode 100644 fs/fuse/kio/pcs/pcs_mr.h

diff --git a/fs/fuse/kio/pcs/pcs_mr.c b/fs/fuse/kio/pcs/pcs_mr.c
new file mode 100644
index 000000000000..ad55b0cbad8b
--- /dev/null
+++ b/fs/fuse/kio/pcs/pcs_mr.c
@@ -0,0 +1,210 @@
+/*
+ *  Copyright (c) 2018-2024 Virtuozzo International GmbH. All rights reserved.
+ */
+
+#include <linux/gfp.h>
+#include <linux/dma-mapping.h>
+#include <linux/slab.h>
+#include <linux/sched/mm.h>
+#include <linux/resource.h>
+
+#include "pcs_mr.h"
+
+void pcs_umem_release(struct pcs_umem *umem)
+{
+       struct mm_struct *mm_s = umem->mm;
+
+       unpin_user_pages(umem->pages, umem->npages);
+       atomic64_sub(umem->npages, &mm_s->pinned_vm);
+       mmdrop(mm_s);
+       kfree(umem->pages);
+       kfree(umem);
+}
+
+/*
+ * Pin pages from userspace memory
+ */
+struct pcs_umem *pcs_umem_get(u64 start, u64 len)
+{
+       struct pcs_umem *umem = NULL;
+       struct page **pages;
+       int npages;
+       u64 fp_va;
+       struct mm_struct *mm_s;
+       int got, ret;
+
+       fp_va = start & PAGE_MASK;
+       npages = PAGE_ALIGN(start + len - fp_va) >> PAGE_SHIFT;
+
+       umem = kzalloc(sizeof(*umem), GFP_KERNEL);
+       if (!umem)
+               return ERR_PTR(-ENOMEM);
+
+       /*  pin user pages */
+       mm_s = current->mm;
+       umem->mm = mm_s;
+
+       mmgrab(mm_s);
+       mmap_read_lock(mm_s);
+
+       umem->fp_addr = fp_va;
+       umem->pages = kcalloc(npages, sizeof(struct page *), GFP_KERNEL);
+       if (!umem->pages) {
+               ret = -ENOMEM;
+               goto out_err;
+       }
+
+       got = 0;
+       while (npages) {
+               pages = &umem->pages[got];
+               ret = pin_user_pages(fp_va, npages, FOLL_WRITE | FOLL_LONGTERM, 
pages, NULL);
+               if (ret < 0)
+                       goto out_err;
+
+               WARN_ON(ret == 0);
+               umem->npages += ret;
+               atomic64_add(ret, &mm_s->pinned_vm);
+               fp_va += ret * PAGE_SIZE;
+               npages -= ret;
+               got += ret;
+       }
+       mmap_read_unlock(mm_s);
+
+       return umem;
+
+out_err:
+       mmap_read_unlock(mm_s);
+       pcs_umem_release(umem);
+
+       return ERR_PTR(ret);
+}
+
+static void pcs_mr_free(struct kref *ref)
+{
+       struct pcs_mr *mr = container_of(ref, struct pcs_mr, ref);
+
+       pcs_umem_release(mr->umem);
+       kfree(mr);
+}
+
+void pcs_mr_put(struct pcs_mr *mr)
+{
+       kref_put(&mr->ref, pcs_mr_free);
+}
+
+struct pcs_mr *pcs_mr_get(struct pcs_mr_set *mrs, int idx)
+{
+       struct pcs_mr *mr;
+
+       rcu_read_lock();
+       mr = xa_load(&mrs->mr_xa, idx);
+       if (likely(mr && kref_get_unless_zero(&mr->ref))) {
+               rcu_read_unlock();
+               return mr;
+       }
+       return NULL;
+}
+
+/*
+ * Register a MR
+ */
+int pcs_reg_mr(struct pcs_mr_set *mrs, u64 start, u64 len)
+{
+       int ret = 0;
+       struct pcs_mr *mr;
+       struct pcs_umem *umem;
+
+       if (!len)
+               return -EINVAL;
+
+       if (!can_do_mlock())
+               return -EPERM;
+
+       if (atomic_inc_return(&mrs->mr_num) > PCS_MAX_MR) {
+               atomic_dec(&mrs->mr_num);
+               return -ENOMEM;
+       }
+
+       umem = pcs_umem_get(start, len);
+       if (IS_ERR(umem))
+               return PTR_ERR(umem);
+
+       mr = kzalloc(sizeof(*mr), GFP_KERNEL);
+       if (!mr) {
+               ret = -ENOMEM;
+               goto err_out;
+       }
+
+       mr->mrs = mrs;
+       mr->va = start;
+       mr->len = len;
+       mr->umem = umem;
+       kref_init(&mr->ref);
+
+       ret = xa_alloc_cyclic(&mrs->mr_xa, &mr->id, mr,
+                       XA_LIMIT(1, PCS_MAX_MR), &mrs->mr_next, GFP_KERNEL);
+       if (ret < 0) {
+               kfree(mr);
+               goto err_out;
+       }
+
+       mr->id_valid = 1;
+
+       return mr->id;
+
+err_out:
+       pcs_umem_release(umem);
+       atomic_dec(&mrs->mr_num);
+       return ret;
+}
+
+/*
+ * Deregister a MR
+ */
+int pcs_dereg_mr(struct pcs_mr_set *mrs, u32 id)
+{
+       struct pcs_mr *mr;
+
+       mr = pcs_mr_get(mrs, id);
+       if (!mr)
+               return -ENXIO;
+
+       mr->id_valid = 0;
+       pcs_mr_put(mr);
+
+       /* make sure mr->id_valid is seen */
+       smp_mb();
+
+       xa_erase(&mrs->mr_xa, mr->id);
+       pcs_mr_put(mr);
+
+       atomic_dec(&mrs->mr_num);
+
+       return 0;
+}
+
+void pcs_mrset_init(struct pcs_mr_set *mrs)
+{
+       xa_init_flags(&mrs->mr_xa, XA_FLAGS_ALLOC1);
+       mrs->mr_next = 0;
+       atomic_set(&mrs->mr_num, 0);
+}
+
+void pcs_mrset_fini(struct pcs_mr_set *mrs)
+{
+       struct pcs_mr *mr;
+       unsigned long idx;
+
+       if (atomic_read(&mrs->mr_num) == 0)
+               return;
+
+       /*clean all registered MRs*/
+       xa_for_each(&mrs->mr_xa, idx, mr) {
+               BUG_ON(kref_read(&mr->ref) != 1);
+               pcs_mr_put(mr);
+               atomic_dec(&mrs->mr_num);
+       }
+
+       BUG_ON(atomic_read(&mrs->mr_num) != 0);
+       xa_destroy(&mrs->mr_xa);
+}
diff --git a/fs/fuse/kio/pcs/pcs_mr.h b/fs/fuse/kio/pcs/pcs_mr.h
new file mode 100644
index 000000000000..0eaa9f263090
--- /dev/null
+++ b/fs/fuse/kio/pcs/pcs_mr.h
@@ -0,0 +1,64 @@
+/*
+ *  Copyright (c) 2018-2024 Virtuozzo International GmbH. All rights reserved.
+ */
+
+#ifndef _PCS_MR_H_
+#define _PCS_MR_H_ 1
+
+#include <linux/types.h>
+#include <linux/highmem-internal.h>
+
+struct pcs_umem;
+
+#define PCS_MAX_MR             0x10000
+
+struct pcs_mr_set {
+       struct xarray   mr_xa;          /* array of registered MRs*/
+       u32                             mr_next;    /* next index of the mr 
xarray*/
+
+       atomic_t                mr_num;         /* number of registered MRs*/
+};
+
+struct pcs_umem {
+       u64             fp_addr; /* First page base address */
+       int             npages;    /* number of pinned pages */
+       struct page     **pages;  /* array of pinned pages */
+       struct mm_struct *mm; /* mm the memory belongs to */
+};
+
+struct pcs_mr {
+       struct pcs_mr_set *mrs; /* set holding this mr */
+       struct kref ref;
+       struct pcs_umem *umem;
+       u64 va;         /* starting address of MR */
+       u64 len;        /* length of MR */
+       u32 id; /* index in kdev->mr_xa, returned to userspace */
+       u8  id_valid; /* valid or invalid */
+       u8  rsvd[3];
+};
+
+/*
+ * Get page pointer for the address
+ */
+static inline struct page *pcs_umem_page(struct pcs_umem *umem, u64 addr)
+{
+       unsigned int idx = (addr - umem->fp_addr) >> PAGE_SHIFT;
+
+       if (likely(idx < umem->npages))
+               return umem->pages[idx];
+       return NULL;
+}
+
+struct pcs_umem *pcs_umem_get(u64 start, u64 len);
+void pcs_umem_release(struct pcs_umem *umem);
+
+struct pcs_mr *pcs_mr_get(struct pcs_mr_set *mrs, int idx);
+void pcs_mr_put(struct pcs_mr *mr);
+
+int pcs_reg_mr(struct pcs_mr_set *mrs, u64 start, u64 len);
+int pcs_dereg_mr(struct pcs_mr_set *mrs, u32 id);
+
+void pcs_mrset_init(struct pcs_mr_set *mrs);
+void pcs_mrset_fini(struct pcs_mr_set *mrs);
+
+#endif
-- 
2.39.3 (Apple Git-146)

_______________________________________________
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel

Reply via email to