On 1/29/2019 6:58 PM, jgli...@redhat.com wrote:
 > Convert ODP to use HMM so that we can build on common infrastructure
 > for different class of devices that want to mirror a process address
 > space into a device. There is no functional changes.

Thanks for sending this patch. I think in general it is a good idea to 
use a common infrastructure for ODP.

I have a couple of questions below.

> -static void ib_umem_notifier_invalidate_range_end(struct mmu_notifier *mn,
> -                             const struct mmu_notifier_range *range)
> -{
> -     struct ib_ucontext_per_mm *per_mm =
> -             container_of(mn, struct ib_ucontext_per_mm, mn);
> -
> -     if (unlikely(!per_mm->active))
> -             return;
> -
> -     rbt_ib_umem_for_each_in_range(&per_mm->umem_tree, range->start,
> -                                   range->end,
> -                                   invalidate_range_end_trampoline, true, NULL);
>       up_read(&per_mm->umem_rwsem);
> +     return ret;
>   }
Previously the code held the umem_rwsem between the range_start and 
range_end calls. I guess that was in order to guarantee that no device 
page fault takes a reference to the pages being invalidated while the 
invalidation is ongoing. I assume this is now handled by HMM instead, 
correct?
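
For reference, my rough understanding of the pattern HMM expects
(following Documentation/vm/hmm.rst) is that the fault path registers a
range, faults it, and then re-checks hmm_range_valid() under the same
driver lock that the mirror's sync_cpu_device_pagetables() callback
takes, before committing anything to the device page table. In the
sketch below, driver_lock and update_device_ptes() are placeholders
rather than names from this patch, and the calls are the HMM API as I
understand it, so the exact signatures may not match this series:

#include <linux/hmm.h>
#include <linux/mutex.h>

static DEFINE_MUTEX(driver_lock);	/* stand-in for the per-driver update lock */

/* Caller is expected to have filled range->pfns/flags/values/pfn_shift. */
static int mirror_one_range(struct mm_struct *mm, struct hmm_range *range,
			    unsigned long start, unsigned long end)
{
	long ret;

again:
	ret = hmm_range_register(range, mm, start, end, PAGE_SHIFT);
	if (ret)
		return ret;

	down_read(&mm->mmap_sem);
	ret = hmm_range_fault(range, true);
	up_read(&mm->mmap_sem);
	if (ret <= 0) {
		hmm_range_unregister(range);
		return ret ? ret : -EFAULT;
	}

	/*
	 * The invalidation callback takes the same lock, so either the
	 * invalidation finished before this point (and hmm_range_valid()
	 * fails), or it waits until the device page table update is done.
	 */
	mutex_lock(&driver_lock);
	if (!hmm_range_valid(range)) {
		mutex_unlock(&driver_lock);
		hmm_range_unregister(range);
		goto again;
	}
	update_device_ptes(range);	/* placeholder: commit snapshot to device */
	mutex_unlock(&driver_lock);
	hmm_range_unregister(range);
	return 0;
}

I'm assuming umem_mutex is what plays the role of driver_lock in this
patch.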

> +
> +static uint64_t odp_hmm_flags[HMM_PFN_FLAG_MAX] = {
> +     ODP_READ_BIT,   /* HMM_PFN_VALID */
> +     ODP_WRITE_BIT,  /* HMM_PFN_WRITE */
> +     ODP_DEVICE_BIT, /* HMM_PFN_DEVICE_PRIVATE */
It seems that the mlx5_ib code in this patch currently ignores the 
ODP_DEVICE_BIT (e.g., in umem_dma_to_mtt). Is that okay? Or is it 
handled implicitly by the HMM_PFN_SPECIAL case?


> @@ -327,9 +287,10 @@ void put_per_mm(struct ib_umem_odp *umem_odp)
>       up_write(&per_mm->umem_rwsem);
>  
>       WARN_ON(!RB_EMPTY_ROOT(&per_mm->umem_tree.rb_root));
> -     mmu_notifier_unregister_no_release(&per_mm->mn, per_mm->mm);
> +     hmm_mirror_unregister(&per_mm->mirror);
>       put_pid(per_mm->tgid);
> -     mmu_notifier_call_srcu(&per_mm->rcu, free_per_mm);
> +
> +     kfree(per_mm);
>  }
Previously the per_mm struct was released through mmu_notifier_call_srcu(), 
but now it is released immediately. Is that safe? I saw that 
hmm_mirror_unregister() calls mmu_notifier_unregister_no_release(), so I 
don't understand what prevents concurrently running invalidations from 
accessing the released per_mm struct.
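
Just to spell out the lifetime rule I thought the SRCU callback was
there for: the code this patch removes deferred the free like this
(free_per_mm is the existing callback being deleted), and as far as I
understand the mmu_notifier core invokes the callbacks under
srcu_read_lock() on the same SRCU domain that mmu_notifier_call_srcu()
uses:

static void free_per_mm(struct rcu_head *rcu)
{
	kfree(container_of(rcu, struct ib_ucontext_per_mm, rcu));
}

	...
	mmu_notifier_unregister_no_release(&per_mm->mn, per_mm->mm);
	put_pid(per_mm->tgid);
	mmu_notifier_call_srcu(&per_mm->rcu, free_per_mm);

So a callback that was already running kept per_mm alive until it
finished. With the immediate kfree() I don't see what provides the
equivalent grace period.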

> @@ -578,11 +578,27 @@ static int pagefault_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr,
>  
>  next_mr:
>       size = min_t(size_t, bcnt, ib_umem_end(&odp->umem) - io_virt);
> -
>       page_shift = mr->umem->page_shift;
>       page_mask = ~(BIT(page_shift) - 1);
> +     off = (io_virt & (~page_mask));
> +     size += (io_virt & (~page_mask));
> +     io_virt = io_virt & page_mask;
> +     off += (size & (~page_mask));
> +     size = ALIGN(size, 1UL << page_shift);
> +
> +     if (io_virt < ib_umem_start(&odp->umem))
> +             return -EINVAL;
> +
>       start_idx = (io_virt - (mr->mmkey.iova & page_mask)) >> page_shift;
>  
> +     if (odp_mr->per_mm == NULL || odp_mr->per_mm->mm == NULL)
> +             return -ENOENT;
> +
> +     ret = hmm_range_register(&range, odp_mr->per_mm->mm,
> +                              io_virt, io_virt + size, page_shift);
> +     if (ret)
> +             return ret;
> +
>       if (prefetch && !downgrade && !mr->umem->writable) {
>               /* prefetch with write-access must
>                * be supported by the MR
Isn't there a mistake in the calculation of the size variable? It is 
first set to the size of the page fault range, but then you add the 
virtual address, so I guess it actually becomes the range end. Then you 
pass io_virt + size to hmm_range_register. Doesn't that double the size 
of the range?

> -void ib_umem_odp_unmap_dma_pages(struct ib_umem_odp *umem_odp, u64 virt,
> -                              u64 bound)
> +void ib_umem_odp_unmap_dma_pages(struct ib_umem_odp *umem_odp,
> +                              u64 virt, u64 bound)
>  {
> +     struct device *device = umem_odp->umem.context->device->dma_device;
>       struct ib_umem *umem = &umem_odp->umem;
> -     int idx;
> -     u64 addr;
> -     struct ib_device *dev = umem->context->device;
> +     unsigned long idx, page_mask;
> +     struct hmm_range range;
> +     long ret;
> +
> +     if (!umem->npages)
> +             return;
> +
> +     bound = ALIGN(bound, 1UL << umem->page_shift);
> +     page_mask = ~(BIT(umem->page_shift) - 1);
> +     virt &= page_mask;
>  
>       virt  = max_t(u64, virt,  ib_umem_start(umem));
>       bound = min_t(u64, bound, ib_umem_end(umem));
> -     /* Note that during the run of this function, the
> -      * notifiers_count of the MR is > 0, preventing any racing
> -      * faults from completion. We might be racing with other
> -      * invalidations, so we must make sure we free each page only
> -      * once. */
> +
> +     idx = ((unsigned long)virt - ib_umem_start(umem)) >> PAGE_SHIFT;
> +
> +     range.page_shift = umem->page_shift;
> +     range.pfns = &umem_odp->pfns[idx];
> +     range.pfn_shift = ODP_FLAGS_BITS;
> +     range.values = odp_hmm_values;
> +     range.flags = odp_hmm_flags;
> +     range.start = virt;
> +     range.end = bound;
> +
>       mutex_lock(&umem_odp->umem_mutex);
> -     for (addr = virt; addr < bound; addr += BIT(umem->page_shift)) {
> -             idx = (addr - ib_umem_start(umem)) >> umem->page_shift;
> -             if (umem_odp->page_list[idx]) {
> -                     struct page *page = umem_odp->page_list[idx];
> -                     dma_addr_t dma = umem_odp->dma_list[idx];
> -                     dma_addr_t dma_addr = dma & ODP_DMA_ADDR_MASK;
> -
> -                     WARN_ON(!dma_addr);
> -
> -                     ib_dma_unmap_page(dev, dma_addr, PAGE_SIZE,
> -                                       DMA_BIDIRECTIONAL);
> -                     if (dma & ODP_WRITE_ALLOWED_BIT) {
> -                             struct page *head_page = compound_head(page);
> -                             /*
> -                              * set_page_dirty prefers being called with
> -                              * the page lock. However, MMU notifiers are
> -                              * called sometimes with and sometimes without
> -                              * the lock. We rely on the umem_mutex instead
> -                              * to prevent other mmu notifiers from
> -                              * continuing and allowing the page mapping to
> -                              * be removed.
> -                              */
> -                             set_page_dirty(head_page);
> -                     }
> -                     /* on demand pinning support */
> -                     if (!umem->context->invalidate_range)
> -                             put_page(page);
> -                     umem_odp->page_list[idx] = NULL;
> -                     umem_odp->dma_list[idx] = 0;
> -                     umem->npages--;
> -             }
> -     }
> +     ret = hmm_range_dma_unmap(&range, NULL, device,
> +             &umem_odp->dma_list[idx], true);
> +     if (ret > 0)
> +             umem->npages -= ret;
Can hmm_range_dma_unmap() fail? If it does, do we simply leak the DMA 
mappings? (See the sketch after the quoted hunk for the kind of handling 
I had in mind.)
>       mutex_unlock(&umem_odp->umem_mutex);
>  }
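
Something along these lines is what I had in mind, just as a sketch
(the WARN and its message are mine):

	ret = hmm_range_dma_unmap(&range, NULL, device,
				  &umem_odp->dma_list[idx], true);
	if (ret > 0) {
		umem->npages -= ret;
	} else if (ret < 0) {
		/*
		 * There is no way to retry here, so at least make the leaked
		 * DMA mappings visible instead of ignoring them silently.
		 */
		WARN_ONCE(1, "hmm_range_dma_unmap() failed: %ld\n", ret);
	}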

Regards,
Haggai
