> -----Original Message-----
> From: Jason Gunthorpe <[email protected]>
> Sent: Tuesday, October 27, 2020 1:08 PM
> To: Xiong, Jianxin <[email protected]>
> Cc: [email protected]; [email protected]; Doug Ledford
> <[email protected]>; Leon Romanovsky
> <[email protected]>; Sumit Semwal <[email protected]>; Christian Koenig
> <[email protected]>; Vetter, Daniel
> <[email protected]>
> Subject: Re: [PATCH v6 4/4] RDMA/mlx5: Support dma-buf based userspace memory
> region
>
> On Fri, Oct 23, 2020 at 09:40:01AM -0700, Jianxin Xiong wrote:
>
> > diff --git a/drivers/infiniband/hw/mlx5/mr.c
> > b/drivers/infiniband/hw/mlx5/mr.c index b261797..3bc412b 100644
> > +++ b/drivers/infiniband/hw/mlx5/mr.c
> > @@ -1,5 +1,6 @@
> > /*
> > * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved.
> > + * Copyright (c) 2020, Intel Corporation. All rights reserved.
> > *
> > * This software is available to you under a choice of one of two
> > * licenses. You may choose to be licensed under the terms of the GNU
> > @@ -36,6 +37,8 @@
> > #include <linux/debugfs.h>
> > #include <linux/export.h>
> > #include <linux/delay.h>
> > +#include <linux/dma-buf.h>
> > +#include <linux/dma-resv.h>
> > #include <rdma/ib_umem.h>
> > #include <rdma/ib_umem_odp.h>
> > #include <rdma/ib_verbs.h>
> > @@ -1113,6 +1116,8 @@ int mlx5_ib_update_xlt(struct mlx5_ib_mr *mr, u64 idx, int npages,
> > 	dma_sync_single_for_cpu(ddev, dma, size, DMA_TO_DEVICE);
> > if (mr->umem->is_odp) {
> > mlx5_odp_populate_xlt(xlt, idx, npages, mr, flags);
> > +	} else if (mr->umem->is_dmabuf && (flags & MLX5_IB_UPD_XLT_ZAP)) {
> > + memset(xlt, 0, size);
> > } else {
> > 		__mlx5_ib_populate_pas(dev, mr->umem, page_shift, idx, npages, xlt,
> > @@ -1462,6 +1467,111 @@ struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
> > return ERR_PTR(err);
> > }
> >
> > +static void mlx5_ib_dmabuf_invalidate_cb(struct dma_buf_attachment *attach)
> > +{
> > + struct ib_umem_dmabuf *umem_dmabuf = attach->importer_priv;
> > + struct mlx5_ib_mr *mr = umem_dmabuf->device_context;
> > +
> > + mlx5_ib_update_xlt(mr, 0, mr->npages, PAGE_SHIFT, MLX5_IB_UPD_XLT_ZAP);
> > + ib_umem_dmabuf_unmap_pages(umem_dmabuf);
> > +}
> > +
> > +static struct dma_buf_attach_ops mlx5_ib_dmabuf_attach_ops = {
> > + .allow_peer2peer = 1,
> > +	.move_notify = mlx5_ib_dmabuf_invalidate_cb,
> > +};
> > +
> > +struct ib_mr *mlx5_ib_reg_user_mr_dmabuf(struct ib_pd *pd, u64 offset,
> > + u64 length, u64 virt_addr,
> > + int fd, int access_flags,
> > + struct ib_udata *udata)
> > +{
> > + struct mlx5_ib_dev *dev = to_mdev(pd->device);
> > + struct mlx5_ib_mr *mr = NULL;
> > + struct ib_umem *umem;
> > + struct ib_umem_dmabuf *umem_dmabuf;
> > + int npages;
> > + int order;
> > + int err;
> > +
> > + if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM))
> > + return ERR_PTR(-EOPNOTSUPP);
> > +
> > + mlx5_ib_dbg(dev,
> > + "offset 0x%llx, virt_addr 0x%llx, length 0x%llx, fd %d,
> > access_flags 0x%x\n",
> > + offset, virt_addr, length, fd, access_flags);
> > +
> > + if (!mlx5_ib_can_load_pas_with_umr(dev, length))
> > + return ERR_PTR(-EINVAL);
> > +
> > +	umem = ib_umem_dmabuf_get(&dev->ib_dev, offset, length, fd, access_flags,
> > + &mlx5_ib_dmabuf_attach_ops);
> > + if (IS_ERR(umem)) {
> > + mlx5_ib_dbg(dev, "umem get failed (%ld)\n", PTR_ERR(umem));
> > + return ERR_PTR(PTR_ERR(umem));
> > + }
> > +
> > + npages = ib_umem_num_pages(umem);
> > + if (!npages) {
>
> ib_umem_get should reject invalid umems like this
Will move the check to ib_umem_dmabuf_get().
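Roughly like this inside ib_umem_dmabuf_get(), before attaching to the
dma-buf (just a sketch; parameter names follow this series, and the final
check may also want to enforce page alignment of offset and size):

	/* Reject empty or overflowing regions up front so that drivers
	 * such as mlx5 no longer need their own zero-size check.
	 */
	if (!size || offset + size < offset)
		return ERR_PTR(-EINVAL);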
>
> > + mlx5_ib_warn(dev, "avoid zero region\n");
> > + ib_umem_release(umem);
> > + return ERR_PTR(-EINVAL);
> > + }
> > +
> > + order = ilog2(roundup_pow_of_two(npages));
>
> Must always call ib_umem_find_best_pgsz(), specify PAGE_SIZE as the argument
> for this scenario
Will fix.
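Something along these lines in mlx5_ib_reg_user_mr_dmabuf(), replacing the
ilog2() math (sketch; PAGE_SIZE is the only bit set in the bitmap because
the dma-buf sg list is mapped at PAGE_SIZE granularity):

	unsigned long page_size;

	page_size = ib_umem_find_best_pgsz(umem, PAGE_SIZE, virt_addr);
	if (!page_size) {
		ib_umem_release(umem);
		return ERR_PTR(-EINVAL);
	}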
>
> > + mlx5_ib_dbg(dev, "npages %d, ncont %d, order %d, page_shift %d\n",
> > + npages, npages, order, PAGE_SHIFT);
> > +
> > + mr = alloc_mr_from_cache(pd, umem, virt_addr, length, npages,
> > + PAGE_SHIFT, order, access_flags);
> > + if (IS_ERR(mr))
> > + mr = NULL;
> > +
> > + if (!mr) {
> > + mutex_lock(&dev->slow_path_mutex);
> > + mr = reg_create(NULL, pd, virt_addr, length, umem, npages,
> > + PAGE_SHIFT, access_flags, false);
> > + mutex_unlock(&dev->slow_path_mutex);
> > + }
>
> Rebase on the mlx5 series just posted and use their version of this code
> sequence, this is just not quite right
Will do.
>
>
> > + err = mlx5_ib_update_xlt(mr, 0, mr->npages, PAGE_SHIFT,
> > + MLX5_IB_UPD_XLT_ENABLE | MLX5_IB_UPD_XLT_ZAP);
> > +
> > + if (err) {
> > + dereg_mr(dev, mr);
> > + return ERR_PTR(err);
> > + }
>
> The current mlx5 code preloads the buffer with the right data, zapping is
> fairly expensive, mapping and loading is the same cost
Could probably do the same here. Will double check.
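If the same approach works for dma-buf, the registration path would take the
reservation lock, map the pages and load the real translations instead of
ZAP + ENABLE, roughly (sketch; assumes ib_umem_dmabuf_map_pages() from patch 1
of this series and that it has to be called under the dma-resv lock):

	dma_resv_lock(umem_dmabuf->attach->dmabuf->resv, NULL);
	err = ib_umem_dmabuf_map_pages(umem_dmabuf);
	if (!err)
		err = mlx5_ib_update_xlt(mr, 0, mr->npages, PAGE_SHIFT,
					 MLX5_IB_UPD_XLT_ENABLE);
	dma_resv_unlock(umem_dmabuf->attach->dmabuf->resv);
	if (err) {
		dereg_mr(dev, mr);
		return ERR_PTR(err);
	}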
>
> > @@ -1536,7 +1646,7 @@ int mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start,
> > if (!mr->umem)
> > return -EINVAL;
> >
> > - if (is_odp_mr(mr))
> > + if (is_odp_mr(mr) || is_dmabuf_mr(mr))
> > return -EOPNOTSUPP;
> >
> > if (flags & IB_MR_REREG_TRANS) {
> > @@ -1695,7 +1805,7 @@ static void dereg_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
> > struct ib_umem *umem = mr->umem;
> >
> > /* Stop all DMA */
> > - if (is_odp_mr(mr))
> > + if (is_odp_mr(mr) || is_dmabuf_mr(mr))
> > mlx5_ib_fence_odp_mr(mr);
>
> Make a dma buf specific function
Ok.
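The dereg path would then read something like this (sketch;
mlx5_ib_fence_dmabuf_mr() is a placeholder name for a helper that revokes
the mkey and unmaps the dma-buf pages):

	/* Stop all DMA */
	if (is_odp_mr(mr))
		mlx5_ib_fence_odp_mr(mr);
	else if (is_dmabuf_mr(mr))
		mlx5_ib_fence_dmabuf_mr(mr);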
>
> I have another series touching this area too, but I think
>
> > @@ -801,6 +816,52 @@ static int pagefault_implicit_mr(struct mlx5_ib_mr *imr,
> > * Returns:
> > * -EFAULT: The io_virt->bcnt is not within the MR, it covers pages that are
> > * not accessible, or the MR is no longer valid.
> > + * -EAGAIN: The operation should be retried
> > + *
> > + * >0: Number of pages mapped
> > + */
> > +static int pagefault_dmabuf_mr(struct mlx5_ib_mr *mr, struct ib_umem *umem,
> > + u64 io_virt, size_t bcnt, u32 *bytes_mapped,
> > + u32 flags)
> > +{
> > + struct ib_umem_dmabuf *umem_dmabuf = to_ib_umem_dmabuf(umem);
> > + u64 user_va;
> > + u64 end;
> > + int npages;
> > + int err;
> > +
> > + if (unlikely(io_virt < mr->mmkey.iova))
> > + return -EFAULT;
> > + if (check_add_overflow(io_virt - mr->mmkey.iova,
> > + (u64)umem->address, &user_va))
> > + return -EFAULT;
> > +	/* Overflow has already been checked at the umem creation time */
> > + end = umem->address + umem->length;
> > + if (unlikely(user_va >= end || end - user_va < bcnt))
> > + return -EFAULT;
>
> Why duplicate this sequence? Caller does it
The sequence in the caller is for umem_odp only.
>
> > @@ -811,6 +872,10 @@ static int pagefault_mr(struct mlx5_ib_mr *mr, u64 io_virt, size_t bcnt,
> > {
> > struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem);
> >
> > + if (is_dmabuf_mr(mr))
> > + return pagefault_dmabuf_mr(mr, mr->umem, io_virt, bcnt,
> > + bytes_mapped, flags);
> > +
> > lockdep_assert_held(&mr->dev->odp_srcu);
> > if (unlikely(io_virt < mr->mmkey.iova))
> > return -EFAULT;
> > @@ -1747,7 +1812,6 @@ static void destroy_prefetch_work(struct prefetch_mr_work *work)
> > {
> > struct mlx5_ib_dev *dev = to_mdev(pd->device);
> > struct mlx5_core_mkey *mmkey;
> > - struct ib_umem_odp *odp;
> > struct mlx5_ib_mr *mr;
> >
> > lockdep_assert_held(&dev->odp_srcu);
> > @@ -1761,11 +1825,9 @@ static void destroy_prefetch_work(struct prefetch_mr_work *work)
> > if (mr->ibmr.pd != pd)
> > return NULL;
> >
> > - odp = to_ib_umem_odp(mr->umem);
> > -
> > /* prefetch with write-access must be supported by the MR */
> > if (advice == IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_WRITE &&
> > - !odp->umem.writable)
> > + !mr->umem->writable)
>
> ??
There is no need to use umem_odp here; mr->umem is the same as &odp->umem.
This change makes the code work for both umem_odp and umem_dmabuf.
>
> This does look basically right though. I think a little more polishing and it
> can be merged. It does need to go after the mlx5 MR series
> though.
>
> Jason
_______________________________________________
dri-devel mailing list
[email protected]
https://lists.freedesktop.org/mailman/listinfo/dri-devel