On 5/27/24 07:56, Liu Kui wrote: > The memory region(MR) is very similar to RDMA memory region, however > much simpler. It allows userspace to register a bulky memory to kernel, > which would pin all pages from that memory and returns a reference back > to userspace. Userspace can then just pass the descriptor(start address, > length) of a buffer allocated from the registered MR, together with > returned reference for that MR to kernel to complete data transfer to/from > kernel. > > This feature will be used for implementing pcs_krpc. > > Signed-off-by: Liu Kui <kui....@virtuozzo.com> Reviewed-by: Vasily Averin <v...@openvz.org> > --- > fs/fuse/kio/pcs/pcs_mr.c | 212 +++++++++++++++++++++++++++++++++++++++ > fs/fuse/kio/pcs/pcs_mr.h | 64 ++++++++++++ > 2 files changed, 276 insertions(+) > create mode 100644 fs/fuse/kio/pcs/pcs_mr.c > create mode 100644 fs/fuse/kio/pcs/pcs_mr.h > > diff --git a/fs/fuse/kio/pcs/pcs_mr.c b/fs/fuse/kio/pcs/pcs_mr.c > new file mode 100644 > index 000000000000..c2a2c072ba9e > --- /dev/null > +++ b/fs/fuse/kio/pcs/pcs_mr.c > @@ -0,0 +1,212 @@ > +/* > + * Copyright (c) 2018-2024 Virtuozzo International GmbH. All rights > reserved. > + */ > + > +#include <linux/gfp.h> > +#include <linux/dma-mapping.h> > +#include <linux/slab.h> > +#include <linux/sched/mm.h> > +#include <linux/resource.h> > + > +#include "pcs_mr.h" > + > +void pcs_umem_release(struct pcs_umem *umem) > +{ > + struct mm_struct *mm_s = umem->mm; > + > + unpin_user_pages(umem->pages, umem->npages); > + atomic64_sub(umem->npages, &mm_s->pinned_vm); > + mmdrop(mm_s); > + kfree(umem->pages); > + kfree(umem); > +} > + > +/* > + * Pin pages from userspace memory > + */ > +struct pcs_umem *pcs_umem_get(u64 start, u64 len) > +{ > + struct pcs_umem *umem = NULL; > + struct page **pages; > + int npages; > + u64 fp_va; > + struct mm_struct *mm_s; > + int got, ret; > + > + fp_va = start & PAGE_MASK; > + npages = PAGE_ALIGN(start + len - fp_va) >> PAGE_SHIFT; > + > + umem = kzalloc(sizeof(*umem), GFP_KERNEL); > + if (!umem) > + return ERR_PTR(-ENOMEM); > + > + /* pin user pages */ > + mm_s = current->mm; > + umem->mm = mm_s; > + > + mmgrab(mm_s); > + mmap_read_lock(mm_s); > + > + umem->fp_addr = fp_va; > + umem->pages = kcalloc(npages, sizeof(struct page *), GFP_KERNEL); > + if (!umem->pages) { > + ret = -ENOMEM; > + goto out_err; > + } > + > + got = 0; > + while (npages) { > + pages = &umem->pages[got]; > + ret = pin_user_pages(fp_va, npages, FOLL_WRITE | FOLL_LONGTERM, > pages, NULL); > + if (ret < 0) > + goto out_err; > + > + WARN_ON(ret == 0); > + umem->npages += ret; > + atomic64_add(ret, &mm_s->pinned_vm); > + fp_va += ret * PAGE_SIZE; > + npages -= ret; > + got += ret; > + } > + mmap_read_unlock(mm_s); > + > + return umem; > + > +out_err: > + mmap_read_unlock(mm_s); > + pcs_umem_release(umem); > + > + return ERR_PTR(ret); > +} > + > +static void pcs_mr_free(struct kref *ref) > +{ > + struct pcs_mr *mr = container_of(ref, struct pcs_mr, ref); > + > + pcs_umem_release(mr->umem); > + kfree(mr); > +} > + > +void pcs_mr_put(struct pcs_mr *mr) > +{ > + kref_put(&mr->ref, pcs_mr_free); > +} > + > +struct pcs_mr *pcs_mr_get(struct pcs_mr_set *mrs, int idx) > +{ > + struct pcs_mr *mr; > + > + rcu_read_lock(); > + mr = xa_load(&mrs->mr_xa, idx); > + if (likely(mr && kref_get_unless_zero(&mr->ref))) { > + rcu_read_unlock(); > + return mr; > + } > + return NULL; > +} > + > +/* > + * Register a MR > + */ > +int pcs_reg_mr(struct pcs_mr_set *mrs, u64 start, u64 len) > +{ > + int ret = 0; > + struct pcs_mr *mr; > + struct pcs_umem *umem; > + > + if (!len) > + return -EINVAL; > + > + if (!can_do_mlock()) > + return -EPERM; > + > + if (atomic_inc_return(&mrs->mr_num) > PCS_MAX_MR) { > + atomic_dec(&mrs->mr_num); > + return -ENOMEM; > + } > + > + umem = pcs_umem_get(start, len); > + if (IS_ERR(umem)) { > + atomic_dec(&mrs->mr_num); > + return PTR_ERR(umem); > + } > + > + mr = kzalloc(sizeof(*mr), GFP_KERNEL); > + if (!mr) { > + ret = -ENOMEM; > + goto err_out; > + } > + > + mr->mrs = mrs; > + mr->va = start; > + mr->len = len; > + mr->umem = umem; > + kref_init(&mr->ref); > + > + ret = xa_alloc_cyclic(&mrs->mr_xa, &mr->id, mr, > + XA_LIMIT(1, PCS_MAX_MR), &mrs->mr_next, GFP_KERNEL); > + if (ret < 0) { > + kfree(mr); > + goto err_out; > + } > + > + mr->id_valid = 1; > + > + return mr->id; > + > +err_out: > + pcs_umem_release(umem); > + atomic_dec(&mrs->mr_num); > + return ret; > +} > + > +/* > + * Deregister a MR > + */ > +int pcs_dereg_mr(struct pcs_mr_set *mrs, u32 id) > +{ > + struct pcs_mr *mr; > + > + mr = pcs_mr_get(mrs, id); > + if (!mr) > + return -ENXIO; > + > + mr->id_valid = 0; > + pcs_mr_put(mr); > + > + /* make sure mr->id_valid is seen */ > + smp_mb(); > + > + xa_erase(&mrs->mr_xa, mr->id); > + pcs_mr_put(mr); > + > + atomic_dec(&mrs->mr_num); > + > + return 0; > +} > + > +void pcs_mrset_init(struct pcs_mr_set *mrs) > +{ > + xa_init_flags(&mrs->mr_xa, XA_FLAGS_ALLOC1); > + mrs->mr_next = 0; > + atomic_set(&mrs->mr_num, 0); > +} > + > +void pcs_mrset_fini(struct pcs_mr_set *mrs) > +{ > + struct pcs_mr *mr; > + unsigned long idx; > + > + if (atomic_read(&mrs->mr_num) == 0) > + return; > + > + /*clean all registered MRs*/ > + xa_for_each(&mrs->mr_xa, idx, mr) { > + BUG_ON(kref_read(&mr->ref) != 1); > + pcs_mr_put(mr); > + atomic_dec(&mrs->mr_num); > + } > + > + BUG_ON(atomic_read(&mrs->mr_num) != 0); > + xa_destroy(&mrs->mr_xa); > +} > diff --git a/fs/fuse/kio/pcs/pcs_mr.h b/fs/fuse/kio/pcs/pcs_mr.h > new file mode 100644 > index 000000000000..0eaa9f263090 > --- /dev/null > +++ b/fs/fuse/kio/pcs/pcs_mr.h > @@ -0,0 +1,64 @@ > +/* > + * Copyright (c) 2018-2024 Virtuozzo International GmbH. All rights > reserved. > + */ > + > +#ifndef _PCS_MR_H_ > +#define _PCS_MR_H_ 1 > + > +#include <linux/types.h> > +#include <linux/highmem-internal.h> > + > +struct pcs_umem; > + > +#define PCS_MAX_MR 0x10000 > + > +struct pcs_mr_set { > + struct xarray mr_xa; /* array of registered MRs*/ > + u32 mr_next; /* next index of the mr > xarray*/ > + > + atomic_t mr_num; /* number of registered MRs*/ > +}; > + > +struct pcs_umem { > + u64 fp_addr; /* First page base address */ > + int npages; /* number of pinned pages */ > + struct page **pages; /* array of pinned pages */ > + struct mm_struct *mm; /* mm the memory belongs to */ > +}; > + > +struct pcs_mr { > + struct pcs_mr_set *mrs; /* set holding this mr */ > + struct kref ref; > + struct pcs_umem *umem; > + u64 va; /* starting address of MR */ > + u64 len; /* length of MR */ > + u32 id; /* index in kdev->mr_xa, returned to userspace */ > + u8 id_valid; /* valid or invalid */ > + u8 rsvd[3]; > +}; > + > +/* > + * Get page pointer for the address > + */ > +static inline struct page *pcs_umem_page(struct pcs_umem *umem, u64 addr) > +{ > + unsigned int idx = (addr - umem->fp_addr) >> PAGE_SHIFT; > + > + if (likely(idx < umem->npages)) > + return umem->pages[idx]; > + return NULL; > +} > + > +struct pcs_umem *pcs_umem_get(u64 start, u64 len); > +void pcs_umem_release(struct pcs_umem *umem); > + > +struct pcs_mr *pcs_mr_get(struct pcs_mr_set *mrs, int idx); > +void pcs_mr_put(struct pcs_mr *mr); > + > +int pcs_reg_mr(struct pcs_mr_set *mrs, u64 start, u64 len); > +int pcs_dereg_mr(struct pcs_mr_set *mrs, u32 id); > + > +void pcs_mrset_init(struct pcs_mr_set *mrs); > +void pcs_mrset_fini(struct pcs_mr_set *mrs); > + > +#endif
_______________________________________________ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel