On 26.1.2016 1:06, Dan Williams wrote:
> It appears devices requiring ZONE_DMA are still prevalent (see link
> below).  For this reason the proposal to require turning off ZONE_DMA to
> enable ZONE_DEVICE is untenable in the short term.  We want a single
> kernel image to be able to support legacy devices as well as next
> generation persistent memory platforms.
> 
> Towards this end, alias ZONE_DMA and ZONE_DEVICE to work around needing
> to maintain a unique zone number for ZONE_DEVICE.  Record the geometry
> of ZONE_DMA at init (->init_spanned_pages) and use that information in
> is_zone_device_page() to differentiate pages allocated via
> devm_memremap_pages() vs true ZONE_DMA pages.  Otherwise, use the
> simpler definition of is_zone_device_page() when ZONE_DMA is turned off.
> 
> Note that this also teaches the memory hot remove path that the zone may
> not have sections for all pfn spans (->zone_dyn_start_pfn).
> 
> A user visible implication of this change is potentially an unexpectedly
> high "spanned" value in /proc/zoneinfo for the DMA zone.

[+CC Joonsoo, Laura]

Sounds like quite a hack :( Would it be possible instead to extend the bits
encoding the zone in page->flags? Potentially, ZONE_CMA could be added one day...

> Cc: H. Peter Anvin <h...@zytor.com>
> Cc: Ingo Molnar <mi...@redhat.com>
> Cc: Rik van Riel <r...@redhat.com>
> Cc: Mel Gorman <mgor...@suse.de>
> Cc: Jerome Glisse <j.gli...@gmail.com>
> Cc: Christoph Hellwig <h...@lst.de>
> Cc: Dave Hansen <dave.han...@linux.intel.com>
> Link: https://bugzilla.kernel.org/show_bug.cgi?id=110931
> Fixes: 033fbae988fc ("mm: ZONE_DEVICE for "device memory"")
> Reported-by: Sudip Mukherjee <sudipm.mukher...@gmail.com>
> Signed-off-by: Dan Williams <dan.j.willi...@intel.com>
> ---
>  include/linux/mm.h     |   46 ++++++++++++++++++++++++++++++++--------------
>  include/linux/mmzone.h |   24 ++++++++++++++++++++----
>  mm/Kconfig             |    1 -
>  mm/memory_hotplug.c    |   15 +++++++++++----
>  mm/page_alloc.c        |    9 ++++++---
>  5 files changed, 69 insertions(+), 26 deletions(-)
> 
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index f1cd22f2df1a..b4bccd3d3c41 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -664,12 +664,44 @@ static inline enum zone_type page_zonenum(const struct page *page)
>       return (page->flags >> ZONES_PGSHIFT) & ZONES_MASK;
>  }
>  
> +#ifdef NODE_NOT_IN_PAGE_FLAGS
> +extern int page_to_nid(const struct page *page);
> +#else
> +static inline int page_to_nid(const struct page *page)
> +{
> +     return (page->flags >> NODES_PGSHIFT) & NODES_MASK;
> +}
> +#endif
> +
> +static inline struct zone *page_zone(const struct page *page)
> +{
> +     return &NODE_DATA(page_to_nid(page))->node_zones[page_zonenum(page)];
> +}
> +
>  #ifdef CONFIG_ZONE_DEVICE
>  void get_zone_device_page(struct page *page);
>  void put_zone_device_page(struct page *page);
>  static inline bool is_zone_device_page(const struct page *page)
>  {
> +#ifndef CONFIG_ZONE_DMA
>       return page_zonenum(page) == ZONE_DEVICE;
> +#else /* ZONE_DEVICE == ZONE_DMA */
> +     struct zone *zone;
> +
> +     if (page_zonenum(page) != ZONE_DEVICE)
> +             return false;
> +
> +     /*
> +      * If ZONE_DEVICE is aliased with ZONE_DMA we need to check
> +      * whether this was a dynamically allocated page from
> +      * devm_memremap_pages() by checking against the size of
> +      * ZONE_DMA at boot.
> +      */
> +     zone = page_zone(page);
> +     if (page_to_pfn(page) <= zone_end_pfn_boot(zone))
> +             return false;
> +     return true;
> +#endif
>  }
>  #else
>  static inline void get_zone_device_page(struct page *page)
> @@ -735,15 +767,6 @@ static inline int zone_to_nid(struct zone *zone)
>  #endif
>  }
>  
> -#ifdef NODE_NOT_IN_PAGE_FLAGS
> -extern int page_to_nid(const struct page *page);
> -#else
> -static inline int page_to_nid(const struct page *page)
> -{
> -     return (page->flags >> NODES_PGSHIFT) & NODES_MASK;
> -}
> -#endif
> -
>  #ifdef CONFIG_NUMA_BALANCING
>  static inline int cpu_pid_to_cpupid(int cpu, int pid)
>  {
> @@ -857,11 +880,6 @@ static inline bool cpupid_match_pid(struct task_struct *task, int cpupid)
>  }
>  #endif /* CONFIG_NUMA_BALANCING */
>  
> -static inline struct zone *page_zone(const struct page *page)
> -{
> -     return &NODE_DATA(page_to_nid(page))->node_zones[page_zonenum(page)];
> -}
> -
>  #ifdef SECTION_IN_PAGE_FLAGS
>  static inline void set_page_section(struct page *page, unsigned long section)
>  {
> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> index 33bb1b19273e..a0ef09b7f893 100644
> --- a/include/linux/mmzone.h
> +++ b/include/linux/mmzone.h
> @@ -288,6 +288,13 @@ enum zone_type {
>        */
>       ZONE_DMA,
>  #endif
> +#ifdef CONFIG_ZONE_DEVICE
> +#ifndef CONFIG_ZONE_DMA
> +     ZONE_DEVICE,
> +#else
> +     ZONE_DEVICE = ZONE_DMA,
> +#endif
> +#endif
>  #ifdef CONFIG_ZONE_DMA32
>       /*
>        * x86_64 needs two ZONE_DMAs because it supports devices that are
> @@ -314,11 +321,7 @@ enum zone_type {
>       ZONE_HIGHMEM,
>  #endif
>       ZONE_MOVABLE,
> -#ifdef CONFIG_ZONE_DEVICE
> -     ZONE_DEVICE,
> -#endif
>       __MAX_NR_ZONES
> -
>  };
>  
>  #ifndef __GENERATING_BOUNDS_H
> @@ -379,12 +382,19 @@ struct zone {
>  
>       /* zone_start_pfn == zone_start_paddr >> PAGE_SHIFT */
>       unsigned long           zone_start_pfn;
> +     /* first dynamically added pfn of the zone */
> +     unsigned long           zone_dyn_start_pfn;
>  
>       /*
>        * spanned_pages is the total pages spanned by the zone, including
>        * holes, which is calculated as:
>        *      spanned_pages = zone_end_pfn - zone_start_pfn;
>        *
> +      * init_spanned_pages is the boot/init time total pages spanned
> +      * by the zone for differentiating statically assigned vs
> +      * dynamically hot added memory to a zone.
> +      *      init_spanned_pages = init_zone_end_pfn - zone_start_pfn;
> +      *
>        * present_pages is physical pages existing within the zone, which
>        * is calculated as:
>        *      present_pages = spanned_pages - absent_pages(pages in holes);
> @@ -423,6 +433,7 @@ struct zone {
>        */
>       unsigned long           managed_pages;
>       unsigned long           spanned_pages;
> +     unsigned long           init_spanned_pages;
>       unsigned long           present_pages;
>  
>       const char              *name;
> @@ -546,6 +557,11 @@ static inline unsigned long zone_end_pfn(const struct zone *zone)
>       return zone->zone_start_pfn + zone->spanned_pages;
>  }
>  
> +static inline unsigned long zone_end_pfn_boot(const struct zone *zone)
> +{
> +     return zone->zone_start_pfn + zone->init_spanned_pages;
> +}
> +
>  static inline bool zone_spans_pfn(const struct zone *zone, unsigned long pfn)
>  {
>       return zone->zone_start_pfn <= pfn && pfn < zone_end_pfn(zone);
> diff --git a/mm/Kconfig b/mm/Kconfig
> index 97a4e06b15c0..08a92a9c8fbd 100644
> --- a/mm/Kconfig
> +++ b/mm/Kconfig
> @@ -652,7 +652,6 @@ config IDLE_PAGE_TRACKING
>  config ZONE_DEVICE
>       bool "Device memory (pmem, etc...) hotplug support" if EXPERT
>       default !ZONE_DMA
> -     depends on !ZONE_DMA
>       depends on MEMORY_HOTPLUG
>       depends on MEMORY_HOTREMOVE
>       depends on X86_64 #arch_add_memory() comprehends device memory
> diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
> index 4af58a3a8ffa..c3f0ff45bd47 100644
> --- a/mm/memory_hotplug.c
> +++ b/mm/memory_hotplug.c
> @@ -300,6 +300,8 @@ static void __meminit grow_zone_span(struct zone *zone, unsigned long start_pfn,
>  
>       zone->spanned_pages = max(old_zone_end_pfn, end_pfn) -
>                               zone->zone_start_pfn;
> +     if (!zone->zone_dyn_start_pfn || start_pfn < zone->zone_dyn_start_pfn)
> +             zone->zone_dyn_start_pfn = start_pfn;
>  
>       zone_span_writeunlock(zone);
>  }
> @@ -601,8 +603,9 @@ static int find_biggest_section_pfn(int nid, struct zone *zone,
>  static void shrink_zone_span(struct zone *zone, unsigned long start_pfn,
>                            unsigned long end_pfn)
>  {
> -     unsigned long zone_start_pfn = zone->zone_start_pfn;
> +     unsigned long zone_start_pfn = zone->zone_dyn_start_pfn;
>       unsigned long z = zone_end_pfn(zone); /* zone_end_pfn namespace clash */
> +     bool dyn_zone = zone->zone_start_pfn == zone_start_pfn;
>       unsigned long zone_end_pfn = z;
>       unsigned long pfn;
>       struct mem_section *ms;
> @@ -619,7 +622,9 @@ static void shrink_zone_span(struct zone *zone, unsigned long start_pfn,
>               pfn = find_smallest_section_pfn(nid, zone, end_pfn,
>                                               zone_end_pfn);
>               if (pfn) {
> -                     zone->zone_start_pfn = pfn;
> +                     if (dyn_zone)
> +                             zone->zone_start_pfn = pfn;
> +                     zone->zone_dyn_start_pfn = pfn;
>                       zone->spanned_pages = zone_end_pfn - pfn;
>               }
>       } else if (zone_end_pfn == end_pfn) {
> @@ -661,8 +666,10 @@ static void shrink_zone_span(struct zone *zone, unsigned long start_pfn,
>       }
>  
>       /* The zone has no valid section */
> -     zone->zone_start_pfn = 0;
> -     zone->spanned_pages = 0;
> +     if (dyn_zone)
> +             zone->zone_start_pfn = 0;
> +     zone->zone_dyn_start_pfn = 0;
> +     zone->spanned_pages = zone->init_spanned_pages;
>       zone_span_writeunlock(zone);
>  }
>  
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index 63358d9f9aa9..2d8b1d602ff3 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -209,6 +209,10 @@ EXPORT_SYMBOL(totalram_pages);
>  static char * const zone_names[MAX_NR_ZONES] = {
>  #ifdef CONFIG_ZONE_DMA
>        "DMA",
> +#else
> +#ifdef CONFIG_ZONE_DEVICE
> +      "Device",
> +#endif
>  #endif
>  #ifdef CONFIG_ZONE_DMA32
>        "DMA32",
> @@ -218,9 +222,6 @@ static char * const zone_names[MAX_NR_ZONES] = {
>        "HighMem",
>  #endif
>        "Movable",
> -#ifdef CONFIG_ZONE_DEVICE
> -      "Device",
> -#endif
>  };
>  
>  compound_page_dtor * const compound_page_dtors[] = {
> @@ -5082,6 +5083,8 @@ static void __meminit calculate_node_totalpages(struct pglist_data *pgdat,
>                                                 node_start_pfn, node_end_pfn,
>                                                 zholes_size);
>               zone->spanned_pages = size;
> +             zone->init_spanned_pages = size;
> +             zone->zone_dyn_start_pfn = 0;
>               zone->present_pages = real_size;
>  
>               totalpages += size;
> 
> --
> To unsubscribe, send a message with 'unsubscribe linux-mm' in
> the body to majord...@kvack.org.  For more info on Linux MM,
> see: http://www.linux-mm.org/ .
> Don't email: <a href="mailto:d...@kvack.org">em...@kvack.org</a>
> 

Reply via email to