The commit is pushed to "branch-rh9-5.14.0-427.22.1.vz9.62.x-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git after rh9-5.14.0-427.22.1.vz9.62.3 ------> commit 7d8b13845a929ba67e8bb387501cd476ca473b9f Author: Liu Kui <kui....@virtuozzo.com> Date: Tue Jul 2 23:26:06 2024 +0800
fs/fuse kio: implement memory region to support zero-copy between userspace and kernel. The memory region (MR) is very similar to an RDMA memory region, but much simpler. It allows userspace to register a large chunk of memory with the kernel, which pins all pages of that memory and returns a reference back to userspace. Userspace can then just pass the descriptor (start address, length) of a buffer allocated from the registered MR, together with the returned reference to that MR, to the kernel to complete data transfer to/from the kernel. This feature will be used for implementing pcs_krpc. https://pmc.acronis.work/browse/VSTOR-82613 Signed-off-by: Liu Kui <kui....@virtuozzo.com> ====== Patchset description: fs/fuse kio: introduce pcs_krpc for merging userspace RPC in vstorage-mount with kernel RPC Implement pcs_krpc in kio module to support using kernel RPC directly from userspace. https://pmc.acronis.work/browse/VSTOR-82613 Liu Kui (4): fs/fuse kio: implement memory region to support zero-copy between userspace and kernel. fs/fuse kio: implement pcs_krpc - export kernel RPC to userspace fs/fuse kio: adapt pcs_rpc to support pcs_krpc. fs/fuse kio: integrate pcs_krpc to kio module Feature: fuse: kRPC - single RPC for kernel and userspace --- fs/fuse/kio/pcs/pcs_mr.c | 202 +++++++++++++++++++++++++++++++++++++++++++++++ fs/fuse/kio/pcs/pcs_mr.h | 62 +++++++++++++++ 2 files changed, 264 insertions(+) diff --git a/fs/fuse/kio/pcs/pcs_mr.c b/fs/fuse/kio/pcs/pcs_mr.c new file mode 100644 index 000000000000..c8efc1301bac --- /dev/null +++ b/fs/fuse/kio/pcs/pcs_mr.c @@ -0,0 +1,202 @@ +/* + * Copyright (c) 2018-2024 Virtuozzo International GmbH. All rights reserved. 
+ */
+
+#include <linux/gfp.h>
+#include <linux/dma-mapping.h>
+#include <linux/slab.h>
+#include <linux/sched/mm.h>
+#include <linux/resource.h>
+
+#include "pcs_mr.h"
+
+/*
+ * Undo pcs_umem_get(): unpin every pinned page, subtract the pages from the
+ * owning mm's pinned_vm counter, drop the mm reference and free the
+ * descriptor.
+ *
+ * Also used on the pcs_umem_get() error path, where only the first
+ * umem->npages entries of umem->pages have been pinned.
+ */
+void pcs_umem_release(struct pcs_umem *umem)
+{
+	struct mm_struct *mm_s = umem->mm;
+
+	unpin_user_pages(umem->pages, umem->npages);
+	atomic64_sub(umem->npages, &mm_s->pinned_vm);
+	mmdrop(mm_s);
+	kfree(umem->pages);
+	kfree(umem);
+}
+
+/*
+ * Pin the user pages backing [start, start + len) in the caller's mm.
+ *
+ * The start of the range is rounded down and its end rounded up to page
+ * boundaries. Pages are pinned with FOLL_WRITE | FOLL_LONGTERM and added to
+ * the mm's pinned_vm counter.
+ *
+ * Returns the new descriptor on success. On failure returns
+ * ERR_PTR(-EINVAL) if the caller has no mm, the range wraps around or its
+ * page count does not fit in an int; ERR_PTR(-ENOMEM) on allocation failure;
+ * ERR_PTR(-EFAULT) if pinning makes no progress; or the error reported by
+ * pin_user_pages().
+ */
+struct pcs_umem *pcs_umem_get(u64 start, u64 len)
+{
+	struct mm_struct *mm_s = current->mm;
+	struct pcs_umem *umem;
+	u64 fp_va = start & PAGE_MASK;
+	u64 end = start + len;
+	u64 span;
+	int npages;
+	int got = 0;
+	int ret;
+
+	/* Without an mm there is no user address space to pin from. */
+	if (!mm_s)
+		return ERR_PTR(-EINVAL);
+
+	/*
+	 * Reject ranges that wrap around the address space or whose page
+	 * count does not fit the int counters used below; otherwise npages
+	 * would be silently truncated.
+	 */
+	if (end < start)
+		return ERR_PTR(-EINVAL);
+	span = end - fp_va;
+	if (span > ((u64)INT_MAX << PAGE_SHIFT))
+		return ERR_PTR(-EINVAL);
+	npages = PAGE_ALIGN(span) >> PAGE_SHIFT;
+
+	umem = kzalloc(sizeof(*umem), GFP_KERNEL);
+	if (!umem)
+		return ERR_PTR(-ENOMEM);
+
+	/* Allocate the page array before mmap_lock is taken. */
+	umem->pages = kcalloc(npages, sizeof(struct page *), GFP_KERNEL);
+	if (!umem->pages) {
+		kfree(umem);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	umem->fp_addr = fp_va;
+	umem->mm = mm_s;
+	mmgrab(mm_s);
+
+	/* pin user pages */
+	mmap_read_lock(mm_s);
+	while (npages) {
+		ret = pin_user_pages(fp_va, npages, FOLL_WRITE | FOLL_LONGTERM,
+				     &umem->pages[got], NULL);
+		/*
+		 * Zero pages pinned means no forward progress: fail instead
+		 * of looping forever with mmap_lock held.
+		 */
+		if (WARN_ON(ret == 0))
+			ret = -EFAULT;
+		if (ret < 0)
+			goto out_err;
+
+		umem->npages += ret;
+		atomic64_add(ret, &mm_s->pinned_vm);
+		fp_va += ret * PAGE_SIZE;
+		npages -= ret;
+		got += ret;
+	}
+	mmap_read_unlock(mm_s);
+
+	return umem;
+
+out_err:
+	mmap_read_unlock(mm_s);
+	/* Unpins whatever was pinned so far and drops the mm reference. */
+	pcs_umem_release(umem);
+
+	return ERR_PTR(ret);
+}
+
+/*
+ * kref release callback: releases the pinned memory and frees the MR once
+ * the last reference is dropped.
+ *
+ * NOTE(review): the MR is freed immediately, while pcs_mr_get() reaches it
+ * through xa_load() under rcu_read_lock() only. A lookup racing with the
+ * final put may touch freed memory; consider deferring the free past an RCU
+ * grace period.
+ */
+static void pcs_mr_free(struct kref *ref)
+{
+	struct pcs_mr *mr = container_of(ref, struct pcs_mr, ref);
+
+	pcs_umem_release(mr->umem);
+	kfree(mr);
+}
+
+/*
+ * Drop one reference on @mr; the MR is destroyed when the count hits zero.
+ */
+void pcs_mr_put(struct pcs_mr *mr)
+{
+	kref_put(&mr->ref, pcs_mr_free);
+}
+
+/*
+ * Look up the MR stored under @idx in @mrs and take a reference on it.
+ *
+ * Returns the MR with its refcount raised, or NULL if nothing is stored at
+ * @idx or its refcount has already dropped to zero. A non-NULL result must
+ * be released with pcs_mr_put().
+ */
+struct pcs_mr *pcs_mr_get(struct pcs_mr_set *mrs, int idx)
+{
+	struct pcs_mr *mr;
+
+	rcu_read_lock();
+	mr = xa_load(&mrs->mr_xa, idx);
+	if (unlikely(!mr || !kref_get_unless_zero(&mr->ref)))
+		mr = NULL;
+	/* The read-side section must be left on the failure path as well. */
+	rcu_read_unlock();
+
+	return mr;
+}
+
+/*
+ * Register a MR: pin the user range [start, start + len) and store it in
+ * @mrs under a newly allocated id.
+ *
+ * Returns the id of the new MR (allocated within 1..PCS_MAX_MR) on success,
+ * or a negative errno: -EINVAL for a zero length, -EPERM if can_do_mlock()
+ * refuses the caller, -ENOMEM if the set already holds PCS_MAX_MR regions or
+ * an allocation fails, or the error from pcs_umem_get()/xa_alloc_cyclic().
+ *
+ * NOTE(review): mr->id is read after the MR has been published in the
+ * xarray; a concurrent pcs_dereg_mr() on that id could free the MR first.
+ */
+int pcs_reg_mr(struct pcs_mr_set *mrs, u64 start, u64 len)
+{
+	int ret = 0;
+	struct pcs_mr *mr;
+	struct pcs_umem *umem;
+
+	if (!len)
+		return -EINVAL;
+
+	if (!can_do_mlock())
+		return -EPERM;
+
+	/* Count the new MR first and back out if that exceeds the cap. */
+	if (atomic_inc_return(&mrs->mr_num) > PCS_MAX_MR) {
+		ret = -ENOMEM;
+		goto err_dec;
+	}
+
+	umem = pcs_umem_get(start, len);
+	if (IS_ERR(umem)) {
+		ret = PTR_ERR(umem);
+		goto err_dec;
+	}
+
+	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
+	if (!mr) {
+		ret = -ENOMEM;
+		goto err_umem;
+	}
+
+	mr->mrs = mrs;
+	mr->va = start;
+	mr->len = len;
+	mr->umem = umem;
+	kref_init(&mr->ref);
+
+	ret = xa_alloc_cyclic(&mrs->mr_xa, &mr->id, mr,
+			      XA_LIMIT(1, PCS_MAX_MR), &mrs->mr_next, GFP_KERNEL);
+	if (ret < 0)
+		goto err_mr;
+
+	return mr->id;
+
+err_mr:
+	kfree(mr);
+err_umem:
+	pcs_umem_release(umem);
+err_dec:
+	atomic_dec(&mrs->mr_num);
+	return ret;
+}
+
+/*
+ * Deregister a MR: remove @id from @mrs and drop the reference that the set
+ * held on it.
+ *
+ * Returns 0 on success or -ENXIO if no MR is stored under @id.
+ */
+int pcs_dereg_mr(struct pcs_mr_set *mrs, u32 id)
+{
+	struct pcs_mr *mr;
+
+	mr = xa_erase(&mrs->mr_xa, id);
+	if (unlikely(!mr))
+		return -ENXIO;
+
+	pcs_mr_put(mr);
+	atomic_dec(&mrs->mr_num);
+
+	return 0;
+}
+
+/*
+ * Initialise an empty MR set. XA_FLAGS_ALLOC1 makes id allocation start
+ * at 1.
+ */
+void pcs_mrset_init(struct pcs_mr_set *mrs)
+{
+	xa_init_flags(&mrs->mr_xa, XA_FLAGS_ALLOC1);
+	mrs->mr_next = 0;
+	atomic_set(&mrs->mr_num, 0);
+}
+
+/*
+ * Tear down a MR set, dropping the set's reference on every MR still
+ * registered. Returns early, without touching the xarray, when the set
+ * counts no MRs.
+ *
+ * Each remaining MR must be referenced by the set only; anything else
+ * triggers BUG_ON().
+ */
+void pcs_mrset_fini(struct pcs_mr_set *mrs)
+{
+	struct pcs_mr *mr;
+	unsigned long idx;
+
+	if (atomic_read(&mrs->mr_num) == 0)
+		return;
+
+	/* clean all registered MRs */
+	xa_for_each(&mrs->mr_xa, idx, mr) {
+		BUG_ON(kref_read(&mr->ref) != 1);
+		pcs_mr_put(mr);
+		atomic_dec(&mrs->mr_num);
+	}
+
+	BUG_ON(atomic_read(&mrs->mr_num) != 0);
+	xa_destroy(&mrs->mr_xa);
+}
diff --git a/fs/fuse/kio/pcs/pcs_mr.h b/fs/fuse/kio/pcs/pcs_mr.h
new file mode 100644
index 000000000000..dae9931d9967
--- /dev/null
+++ b/fs/fuse/kio/pcs/pcs_mr.h
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2018-2024 Virtuozzo International GmbH. All rights reserved.
+ */
+
+#ifndef _PCS_MR_H_
+#define _PCS_MR_H_ 1
+
+#include <linux/types.h>
+#include <linux/highmem-internal.h>
+#include <linux/kref.h>
+#include <linux/xarray.h>
+
+struct pcs_umem;
+
+/* Upper bound on both the number of MRs in a set and the ids handed out. */
+#define PCS_MAX_MR 0x10000
+
+/* A set of registered memory regions, indexed by MR id. */
+struct pcs_mr_set {
+	struct xarray mr_xa;	/* array of registered MRs */
+	unsigned int mr_next;	/* next index of the mr xarray */
+	atomic_t mr_num;	/* number of registered MRs */
+};
+
+/* Pinned user memory backing one MR. */
+struct pcs_umem {
+	u64 fp_addr;		/* First page base address */
+	int npages;		/* number of pinned pages */
+	struct page **pages;	/* array of pinned pages */
+	struct mm_struct *mm;	/* mm the memory belongs to */
+};
+
+/* One registered memory region; lifetime is controlled by @ref. */
+struct pcs_mr {
+	struct pcs_mr_set *mrs;	/* set holding this mr */
+	struct kref ref;
+	struct pcs_umem *umem;
+	u64 va;			/* starting address of MR */
+	u64 len;		/* length of MR */
+	u32 id;			/* index in mrs->mr_xa, returned to userspace */
+	u8 rsvd[4];
+};
+
+/*
+ * Get page pointer for the address
+ *
+ * Returns the pinned page that contains @addr, or NULL when @addr lies
+ * outside the pinned pages of @umem.
+ */
+static inline struct page *pcs_umem_page(struct pcs_umem *umem, u64 addr)
+{
+	/*
+	 * Keep the index 64 bits wide: for an address below fp_addr the
+	 * subtraction wraps to a huge value, and truncating that to 32 bits
+	 * could alias a valid slot and hand back the wrong page.
+	 */
+	u64 idx = (addr - umem->fp_addr) >> PAGE_SHIFT;
+
+	if (likely(idx < (u64)umem->npages))
+		return umem->pages[idx];
+	return NULL;
+}
+
+struct pcs_umem *pcs_umem_get(u64 start, u64 len);
+void pcs_umem_release(struct pcs_umem *umem);
+
+struct pcs_mr *pcs_mr_get(struct pcs_mr_set *mrs, int idx);
+void pcs_mr_put(struct pcs_mr *mr);
+
+int pcs_reg_mr(struct pcs_mr_set *mrs, u64 start, u64 len);
+int pcs_dereg_mr(struct pcs_mr_set *mrs, u32 id);
+
+void pcs_mrset_init(struct pcs_mr_set *mrs);
+void pcs_mrset_fini(struct pcs_mr_set *mrs);
+
+#endif
_______________________________________________
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel