For the nouveau bits:

Reviewed-by: Lyude Paul <ly...@redhat.com>

On Thu, 2022-10-20 at 14:56 -0700, Dan Williams wrote:
> A 'struct dev_pagemap' (pgmap) represents a collection of ZONE_DEVICE
> pages. The pgmap is a reference counted object that serves a similar
> role as a 'struct request_queue'. Live references are obtained for each
> in flight request / page, and once a page's reference count drops to
> zero the associated pin of the pgmap is dropped as well. While a page is
> idle nothing should be accessing it because that is effectively a
> use-after-free situation. Unfortunately, all current ZONE_DEVICE
> implementations deploy a layering violation to manage requests to
> activate pages owned by a pgmap. Specifically, they take steps like walk
> the pfns that were previously assigned at memremap_pages() time and use
> pfn_to_page() to recall metadata like page->pgmap, or make use of other
> data like page->zone_device_data.
> 
> The first step towards correcting that situation is to provide a
> API to get access to a pgmap page that does not require the caller to
> know the pfn, nor access any fields of an idle page. Ideally this API
> would be able to support dynamic page creation instead of the current
> status quo of pre-allocating and initializing pages.
> 
> On a prompt from Jason, introduce pgmap_request_folio() that operates on
> an offset into a pgmap. It replaces the shortlived
> pgmap_request_folios() that was continuing the layering violation of
> assuming pages are available to be consulted before asking the pgmap to
> make them available.
> 
> For now this only converts the callers to lookup the pgmap and generate
> the pgmap offset, but it does not do the deeper cleanup of teaching
> those call sites to generate those arguments without walking the page
> metadata. For next steps it appears the DEVICE_PRIVATE implementations
> could plumb the pgmap into the necessary callsites and switch to using
> gen_pool_alloc() to track which offsets of a pgmap are allocated. For
> DAX, dax_direct_access() could switch from returning pfns to returning
> the associated @pgmap and @pgmap_offset. Those changes are saved for
> follow-on work.
> 
> Cc: Matthew Wilcox <wi...@infradead.org>
> Cc: Jan Kara <j...@suse.cz>
> Cc: "Darrick J. Wong" <djw...@kernel.org>
> Cc: Christoph Hellwig <h...@lst.de>
> Cc: John Hubbard <jhubb...@nvidia.com>
> Cc: Alistair Popple <apop...@nvidia.com>
> Cc: Felix Kuehling <felix.kuehl...@amd.com>
> Cc: Alex Deucher <alexander.deuc...@amd.com>
> Cc: "Christian König" <christian.koe...@amd.com>
> Cc: "Pan, Xinhui" <xinhui....@amd.com>
> Cc: David Airlie <airl...@linux.ie>
> Cc: Daniel Vetter <dan...@ffwll.ch>
> Cc: Ben Skeggs <bske...@redhat.com>
> Cc: Karol Herbst <kher...@redhat.com>
> Cc: Lyude Paul <ly...@redhat.com>
> Cc: "Jérôme Glisse" <jgli...@redhat.com>
> Suggested-by: Jason Gunthorpe <j...@nvidia.com>
> Signed-off-by: Dan Williams <dan.j.willi...@intel.com>
> ---
> This builds on the dax reference counting reworks in mm-unstable.
> 
>  arch/powerpc/kvm/book3s_hv_uvmem.c       |   11 ++--
>  drivers/dax/mapping.c                    |   10 +++
>  drivers/gpu/drm/amd/amdkfd/kfd_migrate.c |   14 +++--
>  drivers/gpu/drm/nouveau/nouveau_dmem.c   |   13 +++-
>  include/linux/memremap.h                 |   35 ++++++++---
>  lib/test_hmm.c                           |    9 +++
>  mm/memremap.c                            |   92 
> ++++++++++++------------------
>  7 files changed, 106 insertions(+), 78 deletions(-)
> 
> diff --git a/arch/powerpc/kvm/book3s_hv_uvmem.c 
> b/arch/powerpc/kvm/book3s_hv_uvmem.c
> index 884ec112ad43..2ea59396f608 100644
> --- a/arch/powerpc/kvm/book3s_hv_uvmem.c
> +++ b/arch/powerpc/kvm/book3s_hv_uvmem.c
> @@ -689,12 +689,14 @@ unsigned long kvmppc_h_svm_init_abort(struct kvm *kvm)
>   */
>  static struct page *kvmppc_uvmem_get_page(unsigned long gpa, struct kvm *kvm)
>  {
> -     struct page *dpage = NULL;
> +     struct dev_pagemap *pgmap = &kvmppc_uvmem_pgmap;
>       unsigned long bit, uvmem_pfn;
>       struct kvmppc_uvmem_page_pvt *pvt;
>       unsigned long pfn_last, pfn_first;
> +     struct folio *folio;
> +     struct page *dpage;
>  
> -     pfn_first = kvmppc_uvmem_pgmap.range.start >> PAGE_SHIFT;
> +     pfn_first = pgmap->range.start >> PAGE_SHIFT;
>       pfn_last = pfn_first +
>                  (range_len(&kvmppc_uvmem_pgmap.range) >> PAGE_SHIFT);
>  
> @@ -716,9 +718,10 @@ static struct page *kvmppc_uvmem_get_page(unsigned long 
> gpa, struct kvm *kvm)
>       pvt->gpa = gpa;
>       pvt->kvm = kvm;
>  
> -     dpage = pfn_to_page(uvmem_pfn);
> +     folio = pgmap_request_folio(pgmap,
> +                                 pfn_to_pgmap_offset(pgmap, uvmem_pfn), 0);
> +     dpage = &folio->page;
>       dpage->zone_device_data = pvt;
> -     pgmap_request_folios(dpage->pgmap, page_folio(dpage), 1);
>       lock_page(dpage);
>       return dpage;
>  out_clear:
> diff --git a/drivers/dax/mapping.c b/drivers/dax/mapping.c
> index ca06f2515644..b885c75e2dfb 100644
> --- a/drivers/dax/mapping.c
> +++ b/drivers/dax/mapping.c
> @@ -376,8 +376,14 @@ static vm_fault_t dax_associate_entry(void *entry,
>               if (flags & DAX_COW) {
>                       dax_mapping_set_cow(folio);
>               } else {
> +                     struct dev_pagemap *pgmap = folio_pgmap(folio);
> +                     unsigned long pfn = page_to_pfn(&folio->page);
> +
>                       WARN_ON_ONCE(folio->mapping);
> -                     if (!pgmap_request_folios(folio_pgmap(folio), folio, 1))
> +                     if (folio !=
> +                         pgmap_request_folio(pgmap,
> +                                             pfn_to_pgmap_offset(pgmap, pfn),
> +                                             folio_order(folio)))
>                               return VM_FAULT_SIGBUS;
>                       folio->mapping = mapping;
>                       folio->index = index + i;
> @@ -691,7 +697,7 @@ static struct page *dax_zap_pages(struct xa_state *xas, 
> void *entry)
>  
>       dax_for_each_folio(entry, folio, i) {
>               if (zap)
> -                     pgmap_release_folios(folio, 1);
> +                     folio_put(folio);
>               if (!ret && !dax_folio_idle(folio))
>                       ret = folio_page(folio, 0);
>       }
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c 
> b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
> index 8cf97060122b..1cecee358a9e 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
> @@ -215,15 +215,17 @@ svm_migrate_addr_to_pfn(struct amdgpu_device *adev, 
> unsigned long addr)
>       return (addr + adev->kfd.dev->pgmap.range.start) >> PAGE_SHIFT;
>  }
>  
> -static void
> -svm_migrate_get_vram_page(struct svm_range *prange, unsigned long pfn)
> +static void svm_migrate_get_vram_page(struct dev_pagemap *pgmap,
> +                                   struct svm_range *prange,
> +                                   unsigned long pfn)
>  {
> +     struct folio *folio;
>       struct page *page;
>  
> -     page = pfn_to_page(pfn);
> +     folio = pgmap_request_folio(pgmap, pfn_to_pgmap_offset(pgmap, pfn), 0);
> +     page = &folio->page;
>       svm_range_bo_ref(prange->svm_bo);
>       page->zone_device_data = prange->svm_bo;
> -     pgmap_request_folios(page->pgmap, page_folio(page), 1);
>       lock_page(page);
>  }
>  
> @@ -298,6 +300,7 @@ svm_migrate_copy_to_vram(struct amdgpu_device *adev, 
> struct svm_range *prange,
>                        struct migrate_vma *migrate, struct dma_fence **mfence,
>                        dma_addr_t *scratch)
>  {
> +     struct kfd_dev *kfddev = adev->kfd.dev;
>       uint64_t npages = migrate->npages;
>       struct device *dev = adev->dev;
>       struct amdgpu_res_cursor cursor;
> @@ -327,7 +330,8 @@ svm_migrate_copy_to_vram(struct amdgpu_device *adev, 
> struct svm_range *prange,
>               if (spage && !is_zone_device_page(spage)) {
>                       dst[i] = cursor.start + (j << PAGE_SHIFT);
>                       migrate->dst[i] = svm_migrate_addr_to_pfn(adev, dst[i]);
> -                     svm_migrate_get_vram_page(prange, migrate->dst[i]);
> +                     svm_migrate_get_vram_page(&kfddev->pgmap, prange,
> +                                               migrate->dst[i]);
>                       migrate->dst[i] = migrate_pfn(migrate->dst[i]);
>                       src[i] = dma_map_page(dev, spage, 0, PAGE_SIZE,
>                                             DMA_TO_DEVICE);
> diff --git a/drivers/gpu/drm/nouveau/nouveau_dmem.c 
> b/drivers/gpu/drm/nouveau/nouveau_dmem.c
> index 1482533c7ca0..24208a1d7441 100644
> --- a/drivers/gpu/drm/nouveau/nouveau_dmem.c
> +++ b/drivers/gpu/drm/nouveau/nouveau_dmem.c
> @@ -307,6 +307,9 @@ static struct page *
>  nouveau_dmem_page_alloc_locked(struct nouveau_drm *drm)
>  {
>       struct nouveau_dmem_chunk *chunk;
> +     struct dev_pagemap *pgmap;
> +     struct folio *folio;
> +     unsigned long pfn;
>       struct page *page = NULL;
>       int ret;
>  
> @@ -316,17 +319,21 @@ nouveau_dmem_page_alloc_locked(struct nouveau_drm *drm)
>               drm->dmem->free_pages = page->zone_device_data;
>               chunk = nouveau_page_to_chunk(page);
>               chunk->callocated++;
> +             pfn = page_to_pfn(page);
>               spin_unlock(&drm->dmem->lock);
>       } else {
>               spin_unlock(&drm->dmem->lock);
>               ret = nouveau_dmem_chunk_alloc(drm, &page);
>               if (ret)
>                       return NULL;
> +             chunk = nouveau_page_to_chunk(page);
> +             pfn = page_to_pfn(page);
>       }
>  
> -     pgmap_request_folios(page->pgmap, page_folio(page), 1);
> -     lock_page(page);
> -     return page;
> +     pgmap = &chunk->pagemap;
> +     folio = pgmap_request_folio(pgmap, pfn_to_pgmap_offset(pgmap, pfn), 0);
> +     lock_page(&folio->page);
> +     return &folio->page;
>  }
>  
>  static void
> diff --git a/include/linux/memremap.h b/include/linux/memremap.h
> index ddb196ae0696..f11f827883bb 100644
> --- a/include/linux/memremap.h
> +++ b/include/linux/memremap.h
> @@ -139,6 +139,28 @@ struct dev_pagemap {
>       };
>  };
>  
> +/*
> + * Do not use this in new code, this is a transitional helper on the
> + * path to convert all ZONE_DEVICE users to operate in terms of pgmap
> + * offsets rather than pfn and pfn_to_page() to put ZONE_DEVICE pages
> + * into use.
> + */
> +static inline pgoff_t pfn_to_pgmap_offset(struct dev_pagemap *pgmap, 
> unsigned long pfn)
> +{
> +     u64 phys = PFN_PHYS(pfn), sum = 0;
> +     int i;
> +
> +     for (i = 0; i < pgmap->nr_range; i++) {
> +             struct range *range = &pgmap->ranges[i];
> +
> +             if (phys >= range->start && phys <= range->end)
> +                     return PHYS_PFN(phys - range->start + sum);
> +             sum += range_len(range);
> +     }
> +
> +     return -1;
> +}
> +
>  static inline bool pgmap_has_memory_failure(struct dev_pagemap *pgmap)
>  {
>       return pgmap->ops && pgmap->ops->memory_failure;
> @@ -193,9 +215,8 @@ void *devm_memremap_pages(struct device *dev, struct 
> dev_pagemap *pgmap);
>  void devm_memunmap_pages(struct device *dev, struct dev_pagemap *pgmap);
>  struct dev_pagemap *get_dev_pagemap(unsigned long pfn,
>                                   struct dev_pagemap *pgmap);
> -bool pgmap_request_folios(struct dev_pagemap *pgmap, struct folio *folio,
> -                       int nr_folios);
> -void pgmap_release_folios(struct folio *folio, int nr_folios);
> +struct folio *pgmap_request_folio(struct dev_pagemap *pgmap,
> +                               pgoff_t pgmap_offset, int order);
>  bool pgmap_pfn_valid(struct dev_pagemap *pgmap, unsigned long pfn);
>  
>  unsigned long vmem_altmap_offset(struct vmem_altmap *altmap);
> @@ -231,16 +252,12 @@ static inline struct dev_pagemap 
> *get_dev_pagemap(unsigned long pfn,
>       return NULL;
>  }
>  
> -static inline bool pgmap_request_folios(struct dev_pagemap *pgmap,
> -                                     struct folio *folio, int nr_folios)
> +static inline struct folio *pgmap_request_folio(struct dev_pagemap *pgmap,
> +                                             pgoff_t pgmap_offset, int order)
>  {
>       return false;
>  }
>  
> -static inline void pgmap_release_folios(struct folio *folio, int nr_folios)
> -{
> -}
> -
>  static inline bool pgmap_pfn_valid(struct dev_pagemap *pgmap, unsigned long 
> pfn)
>  {
>       return false;
> diff --git a/lib/test_hmm.c b/lib/test_hmm.c
> index e4f7219ae3bb..1f7e00ae62d5 100644
> --- a/lib/test_hmm.c
> +++ b/lib/test_hmm.c
> @@ -605,8 +605,11 @@ static int dmirror_allocate_chunk(struct dmirror_device 
> *mdevice,
>  
>  static struct page *dmirror_devmem_alloc_page(struct dmirror_device *mdevice)
>  {
> +     struct dev_pagemap *pgmap;
>       struct page *dpage = NULL;
>       struct page *rpage = NULL;
> +     struct folio *folio;
> +     unsigned long pfn;
>  
>       /*
>        * For ZONE_DEVICE private type, this is a fake device so we allocate
> @@ -632,7 +635,11 @@ static struct page *dmirror_devmem_alloc_page(struct 
> dmirror_device *mdevice)
>                       goto error;
>       }
>  
> -     pgmap_request_folios(dpage->pgmap, page_folio(dpage), 1);
> +     /* FIXME: Rework allocator to be pgmap offset based */
> +     pgmap = dpage->pgmap;
> +     pfn = page_to_pfn(dpage);
> +     folio = pgmap_request_folio(pgmap, pfn_to_pgmap_offset(pgmap, pfn), 0);
> +     WARN_ON_ONCE(dpage != &folio->page);
>       lock_page(dpage);
>       dpage->zone_device_data = rpage;
>       return dpage;
> diff --git a/mm/memremap.c b/mm/memremap.c
> index 02b796749b72..09b20a337db9 100644
> --- a/mm/memremap.c
> +++ b/mm/memremap.c
> @@ -492,76 +492,60 @@ void free_zone_device_page(struct page *page)
>       put_dev_pagemap(page->pgmap);
>  }
>  
> -static __maybe_unused bool folio_span_valid(struct dev_pagemap *pgmap,
> -                                         struct folio *folio,
> -                                         int nr_folios)
> +static unsigned long pgmap_offset_to_pfn(struct dev_pagemap *pgmap,
> +                                      pgoff_t pgmap_offset)
>  {
> -     unsigned long pfn_start, pfn_end;
> -
> -     pfn_start = page_to_pfn(folio_page(folio, 0));
> -     pfn_end = pfn_start + (1 << folio_order(folio)) * nr_folios - 1;
> +     u64 sum = 0, offset = PFN_PHYS(pgmap_offset);
> +     int i;
>  
> -     if (pgmap != xa_load(&pgmap_array, pfn_start))
> -             return false;
> +     for (i = 0; i < pgmap->nr_range; i++) {
> +             struct range *range = &pgmap->ranges[i];
>  
> -     if (pfn_end > pfn_start && pgmap != xa_load(&pgmap_array, pfn_end))
> -             return false;
> +             if (offset >= sum && offset < (sum + range_len(range)))
> +                     return PHYS_PFN(range->start + offset - sum);
> +             sum += range_len(range);
> +     }
>  
> -     return true;
> +     return -1;
>  }
>  
>  /**
> - * pgmap_request_folios - activate an contiguous span of folios in @pgmap
> - * @pgmap: host page map for the folio array
> - * @folio: start of the folio list, all subsequent folios have same 
> folio_size()
> + * pgmap_request_folio - activate a folio of a given order in @pgmap
> + * @pgmap: host page map of the folio to activate
> + * @pgmap_offset: page-offset into the pgmap to request
> + * @order: expected folio_order() of the folio
>   *
>   * Caller is responsible for @pgmap remaining live for the duration of
> - * this call. Caller is also responsible for not racing requests for the
> - * same folios.
> + * this call. The order (size) of the folios in the pgmap are assumed
> + * stable before this call.
>   */
> -bool pgmap_request_folios(struct dev_pagemap *pgmap, struct folio *folio,
> -                       int nr_folios)
> +struct folio *pgmap_request_folio(struct dev_pagemap *pgmap,
> +                               pgoff_t pgmap_offset, int order)
>  {
> -     struct folio *iter;
> -     int i;
> +     unsigned long pfn = pgmap_offset_to_pfn(pgmap, pgmap_offset);
> +     struct page *page = pfn_to_page(pfn);
> +     struct folio *folio;
> +     int v;
>  
> -     /*
> -      * All of the WARNs below are for catching bugs in future
> -      * development that changes the assumptions of:
> -      * 1/ uniform folios in @pgmap
> -      * 2/ @pgmap death does not race this routine.
> -      */
> -     VM_WARN_ON_ONCE(!folio_span_valid(pgmap, folio, nr_folios));
> +     if (WARN_ON_ONCE(page->pgmap != pgmap))
> +             return NULL;
>  
>       if (WARN_ON_ONCE(percpu_ref_is_dying(&pgmap->ref)))
> -             return false;
> +             return NULL;
>  
> -     for (iter = folio_next(folio), i = 1; i < nr_folios;
> -          iter = folio_next(folio), i++)
> -             if (WARN_ON_ONCE(folio_order(iter) != folio_order(folio)))
> -                     return false;
> +     folio = page_folio(page);
> +     if (WARN_ON_ONCE(folio_order(folio) != order))
> +             return NULL;
>  
> -     for (iter = folio, i = 0; i < nr_folios; iter = folio_next(iter), i++) {
> -             folio_ref_inc(iter);
> -             if (folio_ref_count(iter) == 1)
> -                     percpu_ref_tryget(&pgmap->ref);
> -     }
> -
> -     return true;
> -}
> -EXPORT_SYMBOL_GPL(pgmap_request_folios);
> +     v = folio_ref_inc_return(folio);
> +     if (v > 1)
> +             return folio;
>  
> -/*
> - * A symmetric helper to undo the page references acquired by
> - * pgmap_request_folios(), but the caller can also just arrange
> - * folio_put() on all the folios it acquired previously for the same
> - * effect.
> - */
> -void pgmap_release_folios(struct folio *folio, int nr_folios)
> -{
> -     struct folio *iter;
> -     int i;
> +     if (WARN_ON_ONCE(!percpu_ref_tryget(&pgmap->ref))) {
> +             folio_put(folio);
> +             return NULL;
> +     }
>  
> -     for (iter = folio, i = 0; i < nr_folios; iter = folio_next(folio), i++)
> -             folio_put(iter);
> +     return folio;
>  }
> +EXPORT_SYMBOL_GPL(pgmap_request_folio);
> 

-- 
Cheers,
 Lyude Paul (she/her)
 Software Engineer at Red Hat


Reply via email to