On Mon,  3 Jul 2017 17:14:12 -0400
Jérôme Glisse <jgli...@redhat.com> wrote:

> Platforms with an advanced system bus (like CAPI or CCIX) allow device
> memory to be accessible from the CPU in a cache coherent fashion. Add
> a new type of ZONE_DEVICE to represent such memory. The use cases
> are the same as for un-addressable device memory, but without
> all the corner cases.
>

Looks good overall, some comments inline.
 
> Changed since v1:
>   - Kconfig and #if/#else cleanup
> 
> Signed-off-by: Jérôme Glisse <jgli...@redhat.com>
> Cc: Balbir Singh <balb...@au1.ibm.com>
> Cc: Aneesh Kumar <aneesh.ku...@linux.vnet.ibm.com>
> Cc: Paul E. McKenney <paul...@linux.vnet.ibm.com>
> Cc: Benjamin Herrenschmidt <b...@kernel.crashing.org>
> Cc: Dan Williams <dan.j.willi...@intel.com>
> Cc: Ross Zwisler <ross.zwis...@linux.intel.com>
> ---
>  fs/proc/task_mmu.c       |  2 +-
>  include/linux/hmm.h      |  4 ++--
>  include/linux/ioport.h   |  1 +
>  include/linux/memremap.h | 21 +++++++++++++++++
>  include/linux/mm.h       | 16 ++++++++-----
>  kernel/memremap.c        | 11 ++++++---
>  mm/Kconfig               | 11 +++++++++
>  mm/gup.c                 |  7 ++++++
>  mm/hmm.c                 |  4 ++--
>  mm/madvise.c             |  2 +-
>  mm/memory.c              | 46 +++++++++++++++++++++++++++++++++----
>  mm/migrate.c             | 60 ++++++++++++++++++++++++++++++++----------------
>  mm/swap.c                | 11 +++++++++
>  13 files changed, 156 insertions(+), 40 deletions(-)
> 
> diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
> index 957b6ea80d5f..1f38f2c7cc34 100644
> --- a/fs/proc/task_mmu.c
> +++ b/fs/proc/task_mmu.c
> @@ -1182,7 +1182,7 @@ static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm,
>               if (pm->show_pfn)
>                       frame = pte_pfn(pte);
>               flags |= PM_PRESENT;
> -             page = vm_normal_page(vma, addr, pte);
> +             page = _vm_normal_page(vma, addr, pte, true);
>               if (pte_soft_dirty(pte))
>                       flags |= PM_SOFT_DIRTY;
>       } else if (is_swap_pte(pte)) {
> diff --git a/include/linux/hmm.h b/include/linux/hmm.h
> index 458d0d6d82f3..a40288309fd2 100644
> --- a/include/linux/hmm.h
> +++ b/include/linux/hmm.h
> @@ -327,7 +327,7 @@ int hmm_vma_fault(struct vm_area_struct *vma,
>  #endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */
>  
>  
> -#if IS_ENABLED(CONFIG_DEVICE_PRIVATE)
> +#if IS_ENABLED(CONFIG_DEVICE_PRIVATE) ||  IS_ENABLED(CONFIG_DEVICE_PUBLIC)
>  struct hmm_devmem;
>  
>  struct page *hmm_vma_alloc_locked_page(struct vm_area_struct *vma,
> @@ -443,7 +443,7 @@ struct hmm_device {
>   */
>  struct hmm_device *hmm_device_new(void *drvdata);
>  void hmm_device_put(struct hmm_device *hmm_device);
> -#endif /* IS_ENABLED(CONFIG_DEVICE_PRIVATE) */
> +#endif /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_PUBLIC */
>  
>  
>  /* Below are for HMM internal use only! Not to be used by device driver! */
> diff --git a/include/linux/ioport.h b/include/linux/ioport.h
> index 3a4f69137bc2..f5cf32e80041 100644
> --- a/include/linux/ioport.h
> +++ b/include/linux/ioport.h
> @@ -131,6 +131,7 @@ enum {
>       IORES_DESC_PERSISTENT_MEMORY            = 4,
>       IORES_DESC_PERSISTENT_MEMORY_LEGACY     = 5,
>       IORES_DESC_DEVICE_PRIVATE_MEMORY        = 6,
> +     IORES_DESC_DEVICE_PUBLIC_MEMORY         = 7,
>  };
>  
>  /* helpers to define resources */
> diff --git a/include/linux/memremap.h b/include/linux/memremap.h
> index 2299cc2d387d..916ca1653ced 100644
> --- a/include/linux/memremap.h
> +++ b/include/linux/memremap.h
> @@ -57,10 +57,18 @@ static inline struct vmem_altmap *to_vmem_altmap(unsigned long memmap_start)
>   *
>   * A more complete discussion of unaddressable memory may be found in
>   * include/linux/hmm.h and Documentation/vm/hmm.txt.
> + *
> + * MEMORY_DEVICE_PUBLIC:
> + * Device memory that is cache coherent from device and CPU point of view. This
> + * is used on platforms that have an advanced system bus (like CAPI or CCIX). A
> + * driver can hotplug the device memory using ZONE_DEVICE and with that memory
> + * type. Any page of a process can be migrated to such memory. However no one
> + * should be allowed to pin such memory so that it can always be evicted.
>   */
>  enum memory_type {
>       MEMORY_DEVICE_PERSISTENT = 0,
>       MEMORY_DEVICE_PRIVATE,
> +     MEMORY_DEVICE_PUBLIC,
>  };
>  
>  /*
> @@ -92,6 +100,8 @@ enum memory_type {
>   * The page_free() callback is called once the page refcount reaches 1
>   * (ZONE_DEVICE pages never reach 0 refcount unless there is a refcount bug.
>   * This allows the device driver to implement its own memory management.)
> + *
> + * For MEMORY_DEVICE_PUBLIC only the page_free() callback matters.

Correct, but I wonder if we should in the long term allow for minor faults
(due to coherency) via this interface?
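Something along these lines, say (purely hypothetical, names invented here,
nothing like it exists in this series):

/*
 * Hypothetical sibling of page_free() in the pgmap: give a
 * MEMORY_DEVICE_PUBLIC driver a callback on CPU minor faults so it can
 * fix up coherency state before the pte is (re)installed.
 */
typedef int (*dev_page_minor_fault_t)(struct vm_area_struct *vma,
				      unsigned long addr,
				      struct page *page);

so the driver would at least get a chance to react instead of the fault
being completely transparent to it.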

>   */
>  typedef int (*dev_page_fault_t)(struct vm_area_struct *vma,
>                               unsigned long addr,
> @@ -134,6 +144,12 @@ static inline bool is_device_private_page(const struct page *page)
>       return is_zone_device_page(page) &&
>               page->pgmap->type == MEMORY_DEVICE_PRIVATE;
>  }
> +
> +static inline bool is_device_public_page(const struct page *page)
> +{
> +     return is_zone_device_page(page) &&
> +             page->pgmap->type == MEMORY_DEVICE_PUBLIC;
> +}
>  #else
>  static inline void *devm_memremap_pages(struct device *dev,
>               struct resource *res, struct percpu_ref *ref,
> @@ -157,6 +173,11 @@ static inline bool is_device_private_page(const struct page *page)
>  {
>       return false;
>  }
> +
> +static inline bool is_device_public_page(const struct page *page)
> +{
> +     return false;
> +}
>  #endif
>  
>  /**
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index 330a216ac315..8b72b122de93 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -797,14 +797,15 @@ static inline bool is_zone_device_page(const struct page *page)
>  #endif
>  
>  #ifdef CONFIG_DEVICE_PRIVATE
> -void put_zone_device_private_page(struct page *page);
> +void put_zone_device_private_or_public_page(struct page *page);
>  #else
> -static inline void put_zone_device_private_page(struct page *page)
> +static inline void put_zone_device_private_or_public_page(struct page *page)
>  {
>  }
>  #endif
>  
>  static inline bool is_device_private_page(const struct page *page);
> +static inline bool is_device_public_page(const struct page *page);
>  
>  DECLARE_STATIC_KEY_FALSE(device_private_key);
>  
> @@ -830,8 +831,9 @@ static inline void put_page(struct page *page)
>        * include/linux/memremap.h and HMM for details.
>        */
>       if (static_branch_unlikely(&device_private_key) &&
> -         unlikely(is_device_private_page(page))) {
> -             put_zone_device_private_page(page);
> +         unlikely(is_device_private_page(page) ||
> +                  is_device_public_page(page))) {
> +             put_zone_device_private_or_public_page(page);
>               return;
>       }
>  
> @@ -1220,8 +1222,10 @@ struct zap_details {
>       pgoff_t last_index;                     /* Highest page->index to unmap */
>  };
>  
> -struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
> -             pte_t pte);
> +struct page *_vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
> +                          pte_t pte, bool with_public_device);
> +#define vm_normal_page(vma, addr, pte) _vm_normal_page(vma, addr, pte, false)
> +
>  struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr,
>                               pmd_t pmd);
>  
> diff --git a/kernel/memremap.c b/kernel/memremap.c
> index e82456c39a6a..da74775f2247 100644
> --- a/kernel/memremap.c
> +++ b/kernel/memremap.c
> @@ -466,7 +466,7 @@ struct vmem_altmap *to_vmem_altmap(unsigned long memmap_start)
>  
>  
>  #ifdef CONFIG_DEVICE_PRIVATE

Does the #ifdef above need to be updated as well, to also cover
CONFIG_DEVICE_PUBLIC?
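Something like the below, perhaps, mirroring the guard style this patch
already uses in include/linux/hmm.h (function body unchanged from the hunk
below, only the guard differs):

#if IS_ENABLED(CONFIG_DEVICE_PRIVATE) || IS_ENABLED(CONFIG_DEVICE_PUBLIC)
void put_zone_device_private_or_public_page(struct page *page)
{
	int count = page_ref_dec_return(page);

	/*
	 * If refcount is 1 then page is freed and refcount is stable as
	 * nobody holds a reference on the page.
	 */
	if (count == 1) {
		/* Clear Active bit in case of parallel mark_page_accessed */
		__ClearPageActive(page);
		__ClearPageWaiters(page);

		page->pgmap->page_free(page, page->pgmap->data);
	} else if (!count)
		__put_page(page);
}
EXPORT_SYMBOL(put_zone_device_private_or_public_page);
#endif /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_PUBLIC */

(and presumably the stub guard in include/linux/mm.h wants the same
treatment.)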

> -void put_zone_device_private_page(struct page *page)
> +void put_zone_device_private_or_public_page(struct page *page)
>  {
>       int count = page_ref_dec_return(page);
>  
> @@ -474,10 +474,15 @@ void put_zone_device_private_page(struct page *page)
>        * If refcount is 1 then page is freed and refcount is stable as nobody
>        * holds a reference on the page.
>        */
> -     if (count == 1)
> +     if (count == 1) {
> +             /* Clear Active bit in case of parallel mark_page_accessed */
> +             __ClearPageActive(page);
> +             __ClearPageWaiters(page);
> +
>               page->pgmap->page_free(page, page->pgmap->data);
> +     }
>       else if (!count)
>               __put_page(page);
>  }
> -EXPORT_SYMBOL(put_zone_device_private_page);
> +EXPORT_SYMBOL(put_zone_device_private_or_public_page);
>  #endif /* CONFIG_DEVICE_PRIVATE */
> diff --git a/mm/Kconfig b/mm/Kconfig
> index 3269ff1cc4cd..6002f1e40fcd 100644
> --- a/mm/Kconfig
> +++ b/mm/Kconfig
> @@ -712,12 +712,23 @@ config ZONE_DEVICE
>  config DEVICE_PRIVATE
>       bool "Unaddressable device memory (GPU memory, ...)"
>       depends on ARCH_HAS_HMM
> +     select HMM
>  
>       help
>         Allows creation of struct pages to represent unaddressable device
>         memory; i.e., memory that is only accessible from the device (or
>         group of devices).
>  
> +config DEVICE_PUBLIC
> +     bool "Addressable device memory (like GPU memory)"
> +     depends on ARCH_HAS_HMM
> +     select HMM
> +
> +     help
> +       Allows creation of struct pages to represent addressable device
> +       memory; i.e., memory that is accessible from both the device and
> +       the CPU.
> +
>  config FRAME_VECTOR
>       bool
>  
> diff --git a/mm/gup.c b/mm/gup.c
> index 23f01c40c88f..2f8e8604ff80 100644
> --- a/mm/gup.c
> +++ b/mm/gup.c
> @@ -438,6 +438,13 @@ static int get_gate_page(struct mm_struct *mm, unsigned long address,
>               if ((gup_flags & FOLL_DUMP) || !is_zero_pfn(pte_pfn(*pte)))
>                       goto unmap;
>               *page = pte_page(*pte);
> +
> +             /*
> +              * This should never happen (a device public page in the gate
> +              * area).
> +              */
> +             if (is_device_public_page(*page))
> +                     goto unmap;
>       }
>       get_page(*page);
>  out:
> diff --git a/mm/hmm.c b/mm/hmm.c
> index 4e01c9ba9cc1..eadf70829c34 100644
> --- a/mm/hmm.c
> +++ b/mm/hmm.c
> @@ -747,7 +747,7 @@ EXPORT_SYMBOL(hmm_vma_fault);
>  #endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */
>  
>  
> -#if IS_ENABLED(CONFIG_DEVICE_PRIVATE)
> +#if IS_ENABLED(CONFIG_DEVICE_PRIVATE) ||  IS_ENABLED(CONFIG_DEVICE_PUBLIC)
>  struct page *hmm_vma_alloc_locked_page(struct vm_area_struct *vma,
>                                      unsigned long addr)
>  {
> @@ -1190,4 +1190,4 @@ static int __init hmm_init(void)
>  }
>  
>  device_initcall(hmm_init);
> -#endif /* IS_ENABLED(CONFIG_DEVICE_PRIVATE) */
> +#endif /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_PUBLIC */
> diff --git a/mm/madvise.c b/mm/madvise.c
> index 9976852f1e1c..197277156ce3 100644
> --- a/mm/madvise.c
> +++ b/mm/madvise.c
> @@ -343,7 +343,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
>                       continue;
>               }
>  
> -             page = vm_normal_page(vma, addr, ptent);
> +             page = _vm_normal_page(vma, addr, ptent, true);
>               if (!page)
>                       continue;
>  
> diff --git a/mm/memory.c b/mm/memory.c
> index 4fcdab3ec525..cee2bed01aa0 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -789,8 +789,8 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
>  #else
>  # define HAVE_PTE_SPECIAL 0
>  #endif
> -struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
> -                             pte_t pte)
> +struct page *_vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
> +                          pte_t pte, bool with_public_device)
>  {
>       unsigned long pfn = pte_pfn(pte);
>  
> @@ -801,8 +801,31 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
>                       return vma->vm_ops->find_special_page(vma, addr);
>               if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))
>                       return NULL;
> -             if (!is_zero_pfn(pfn))
> -                     print_bad_pte(vma, addr, pte, NULL);
> +             if (is_zero_pfn(pfn))
> +                     return NULL;
> +
> +             /*
> +              * Device public pages are special pages (they are ZONE_DEVICE
> +              * pages but different from persistent memory). They behave
> +              * almost like normal pages. The difference is that they are
> +              * not on the lru and thus should never be involved with any-
> +              * thing that involves lru manipulation (mlock, numa balancing,
> +              * ...).
> +              *
> +              * This is why we still want to return NULL for such pages from
> +              * vm_normal_page() so that we do not have to special case all
> +              * call sites of vm_normal_page().
> +              */
> +             if (likely(pfn < highest_memmap_pfn)) {
> +                     struct page *page = pfn_to_page(pfn);
> +
> +                     if (is_device_public_page(page)) {
> +                             if (with_public_device)
> +                                     return page;
> +                             return NULL;
> +                     }
> +             }
> +             print_bad_pte(vma, addr, pte, NULL);
>               return NULL;
>       }
>  
> @@ -983,6 +1006,19 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
>               get_page(page);
>               page_dup_rmap(page, false);
>               rss[mm_counter(page)]++;
> +     } else if (pte_devmap(pte)) {
> +             page = pte_page(pte);
> +
> +             /*
> +              * Cache coherent device memory behaves like regular pages and
> +              * not like persistent memory pages. For more information see
> +              * MEMORY_DEVICE_PUBLIC in include/linux/memremap.h.
> +              */
> +             if (is_device_public_page(page)) {
> +                     get_page(page);
> +                     page_dup_rmap(page, false);
> +                     rss[mm_counter(page)]++;
> +             }
>       }
>  
>  out_set_pte:
> @@ -1236,7 +1272,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
>               if (pte_present(ptent)) {
>                       struct page *page;
>  
> -                     page = vm_normal_page(vma, addr, ptent);
> +                     page = _vm_normal_page(vma, addr, ptent, true);
>                       if (unlikely(details) && page) {
>                               /*
>                                * unmap_shared_mapping_pages() wants to
> diff --git a/mm/migrate.c b/mm/migrate.c
> index 643ea61ca9bb..f9ae57f0c7a1 100644
> --- a/mm/migrate.c
> +++ b/mm/migrate.c
> @@ -229,12 +229,19 @@ static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma,
>               if (is_write_migration_entry(entry))
>                       pte = maybe_mkwrite(pte, vma);
>  
> -             if (unlikely(is_zone_device_page(new)) &&
> -                 is_device_private_page(new)) {
> -                     entry = make_device_private_entry(new, pte_write(pte));
> -                     pte = swp_entry_to_pte(entry);
> -                     if (pte_swp_soft_dirty(*pvmw.pte))
> -                             pte = pte_mksoft_dirty(pte);
> +             if (unlikely(is_zone_device_page(new))) {
> +                     if (is_device_private_page(new)) {
> +                             entry = make_device_private_entry(new, pte_write(pte));
> +                             pte = swp_entry_to_pte(entry);
> +                             if (pte_swp_soft_dirty(*pvmw.pte))
> +                                     pte = pte_mksoft_dirty(pte);
> +                     }
> +#if IS_ENABLED(CONFIG_DEVICE_PUBLIC)
> +                     else if (is_device_public_page(new)) {
> +                             pte = pte_mkdevmap(pte);
> +                             flush_dcache_page(new);
> +                     }
> +#endif /* IS_ENABLED(CONFIG_DEVICE_PUBLIC) */
>               } else
>                       flush_dcache_page(new);
>  
> @@ -408,12 +415,11 @@ int migrate_page_move_mapping(struct address_space *mapping,
>       void **pslot;
>  
>       /*
> -      * ZONE_DEVICE pages have 1 refcount always held by their device
> -      *
> -      * Note that DAX memory will never reach that point as it does not have
> -      * the MEMORY_DEVICE_ALLOW_MIGRATE flag set (see memory_hotplug.h).
> +      * Device public or private pages have an extra refcount as they are
> +      * ZONE_DEVICE pages.
>        */
> -     expected_count += is_zone_device_page(page);
> +     expected_count += is_device_private_page(page);
> +     expected_count += is_device_public_page(page);
>  
>       if (!mapping) {
>               /* Anonymous page without mapping */
> @@ -2087,7 +2093,6 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
>  
>  #endif /* CONFIG_NUMA */
>  
> -
>  struct migrate_vma {
>       struct vm_area_struct   *vma;
>       unsigned long           *dst;
> @@ -2186,7 +2191,7 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
>                       if (is_write_device_private_entry(entry))
>                               mpfn |= MIGRATE_PFN_WRITE;
>               } else {
> -                     page = vm_normal_page(migrate->vma, addr, pte);
> +                     page = _vm_normal_page(migrate->vma, addr, pte, true);
>                       mpfn = migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE;
>                       mpfn |= pte_write(pte) ? MIGRATE_PFN_WRITE : 0;
>               }
> @@ -2311,13 +2316,18 @@ static bool migrate_vma_check_page(struct page *page)
>  
>       /* Page from ZONE_DEVICE have one extra reference */
>       if (is_zone_device_page(page)) {
> -             if (is_device_private_page(page)) {
> +             if (is_device_private_page(page) ||
> +                 is_device_public_page(page))
>                       extra++;
> -             } else
> +             else
>                       /* Other ZONE_DEVICE memory type are not supported */
>                       return false;
>       }
>  
> +     /* For file backed pages */
> +     if (page_mapping(page))
> +             extra += 1 + page_has_private(page);
> +
>       if ((page_count(page) - extra) > page_mapcount(page))
>               return false;
>  
> @@ -2541,11 +2551,21 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate,
>        */
>       __SetPageUptodate(page);
>  
> -     if (is_zone_device_page(page) && is_device_private_page(page)) {
> -             swp_entry_t swp_entry;
> +     if (is_zone_device_page(page)) {
> +             if (is_device_private_page(page)) {
> +                     swp_entry_t swp_entry;
>  
> -             swp_entry = make_device_private_entry(page, vma->vm_flags & VM_WRITE);
> -             entry = swp_entry_to_pte(swp_entry);
> +                     swp_entry = make_device_private_entry(page, vma->vm_flags & VM_WRITE);
> +                     entry = swp_entry_to_pte(swp_entry);
> +             }
> +#if IS_ENABLED(CONFIG_DEVICE_PUBLIC)

Do we need this #if check? is_device_public_page(page) will return
false if the config is disabled.
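i.e. something like this, assuming pte_mkdevmap() is available in every
configuration this file builds in (untested, just the hunk below minus the
preprocessor guard):

		else if (is_device_public_page(page)) {
			entry = pte_mkold(mk_pte(page, READ_ONCE(vma->vm_page_prot)));
			if (vma->vm_flags & VM_WRITE)
				entry = pte_mkwrite(pte_mkdirty(entry));
			entry = pte_mkdevmap(entry);
		}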

> +             else if (is_device_public_page(page)) {
> +                     entry = pte_mkold(mk_pte(page, READ_ONCE(vma->vm_page_prot)));
> +                     if (vma->vm_flags & VM_WRITE)
> +                             entry = pte_mkwrite(pte_mkdirty(entry));
> +                     entry = pte_mkdevmap(entry);
> +             }
> +#endif /* IS_ENABLED(CONFIG_DEVICE_PUBLIC) */
