It appears devices requiring ZONE_DMA are still prevalent (see link
below).  For this reason the proposal to require turning off ZONE_DMA to
enable ZONE_DEVICE is untenable in the short term.  We want a single
kernel image to be able to support legacy devices as well as next
generation persistent memory platforms.

Towards this end, alias ZONE_DMA and ZONE_DEVICE to work around needing
to maintain a unique zone number for ZONE_DEVICE.  Record the geometry
of ZONE_DMA at init (->init_spanned_pages) and use that information in
is_zone_device_page() to differentiate pages allocated via
devm_memremap_pages() vs true ZONE_DMA pages.  Otherwise, use the
simpler definition of is_zone_device_page() when ZONE_DMA is turned off.

Note that this also teaches the memory hot remove path that the zone may
not have sections for all pfn spans (->zone_dyn_start_pfn).

A user visible implication of this change is potentially an unexpectedly
high "spanned" value in /proc/zoneinfo for the DMA zone.

Cc: H. Peter Anvin <h...@zytor.com>
Cc: Ingo Molnar <mi...@redhat.com>
Cc: Rik van Riel <r...@redhat.com>
Cc: Mel Gorman <mgor...@suse.de>
Cc: Jerome Glisse <j.gli...@gmail.com>
Cc: Christoph Hellwig <h...@lst.de>
Cc: Dave Hansen <dave.han...@linux.intel.com>
Link: https://bugzilla.kernel.org/show_bug.cgi?id=110931
Fixes: 033fbae988fc ("mm: ZONE_DEVICE for "device memory"")
Reported-by: Sudip Mukherjee <sudipm.mukher...@gmail.com>
Signed-off-by: Dan Williams <dan.j.willi...@intel.com>
---
 include/linux/mm.h     |   46 ++++++++++++++++++++++++++++++++--------------
 include/linux/mmzone.h |   24 ++++++++++++++++++++----
 mm/Kconfig             |    1 -
 mm/memory_hotplug.c    |   15 +++++++++++----
 mm/page_alloc.c        |    9 ++++++---
 5 files changed, 69 insertions(+), 26 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index f1cd22f2df1a..b4bccd3d3c41 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -664,12 +664,44 @@ static inline enum zone_type page_zonenum(const struct page *page)
        return (page->flags >> ZONES_PGSHIFT) & ZONES_MASK;
 }
 
+#ifdef NODE_NOT_IN_PAGE_FLAGS
+extern int page_to_nid(const struct page *page);
+#else
+static inline int page_to_nid(const struct page *page)
+{
+       return (page->flags >> NODES_PGSHIFT) & NODES_MASK;
+}
+#endif
+
+static inline struct zone *page_zone(const struct page *page)
+{
+       return &NODE_DATA(page_to_nid(page))->node_zones[page_zonenum(page)];
+}
+
 #ifdef CONFIG_ZONE_DEVICE
 void get_zone_device_page(struct page *page);
 void put_zone_device_page(struct page *page);
 static inline bool is_zone_device_page(const struct page *page)
 {
+#ifndef CONFIG_ZONE_DMA
        return page_zonenum(page) == ZONE_DEVICE;
+#else /* ZONE_DEVICE == ZONE_DMA */
+       struct zone *zone;
+
+       if (page_zonenum(page) != ZONE_DEVICE)
+               return false;
+
+       /*
+        * ZONE_DEVICE is aliased with ZONE_DMA, so tell true ZONE_DMA
+        * pages from devm_memremap_pages() pages via the boot-time zone
+        * geometry: boot pfns lie in the half-open range
+        * [zone_start_pfn, zone_end_pfn_boot()), the end pfn excluded.
+        */
+       zone = page_zone(page);
+       if (page_to_pfn(page) < zone_end_pfn_boot(zone))
+               return false;
+       return true;
+#endif
 }
 #else
 static inline void get_zone_device_page(struct page *page)
@@ -735,15 +767,6 @@ static inline int zone_to_nid(struct zone *zone)
 #endif
 }
 
-#ifdef NODE_NOT_IN_PAGE_FLAGS
-extern int page_to_nid(const struct page *page);
-#else
-static inline int page_to_nid(const struct page *page)
-{
-       return (page->flags >> NODES_PGSHIFT) & NODES_MASK;
-}
-#endif
-
 #ifdef CONFIG_NUMA_BALANCING
 static inline int cpu_pid_to_cpupid(int cpu, int pid)
 {
@@ -857,11 +880,6 @@ static inline bool cpupid_match_pid(struct task_struct *task, int cpupid)
 }
 #endif /* CONFIG_NUMA_BALANCING */
 
-static inline struct zone *page_zone(const struct page *page)
-{
-       return &NODE_DATA(page_to_nid(page))->node_zones[page_zonenum(page)];
-}
-
 #ifdef SECTION_IN_PAGE_FLAGS
 static inline void set_page_section(struct page *page, unsigned long section)
 {
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 33bb1b19273e..a0ef09b7f893 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -288,6 +288,13 @@ enum zone_type {
         */
        ZONE_DMA,
 #endif
+#ifdef CONFIG_ZONE_DEVICE
+#ifndef CONFIG_ZONE_DMA
+       ZONE_DEVICE,
+#else
+       ZONE_DEVICE = ZONE_DMA,
+#endif
+#endif
 #ifdef CONFIG_ZONE_DMA32
        /*
         * x86_64 needs two ZONE_DMAs because it supports devices that are
@@ -314,11 +321,7 @@ enum zone_type {
        ZONE_HIGHMEM,
 #endif
        ZONE_MOVABLE,
-#ifdef CONFIG_ZONE_DEVICE
-       ZONE_DEVICE,
-#endif
        __MAX_NR_ZONES
-
 };
 
 #ifndef __GENERATING_BOUNDS_H
@@ -379,12 +382,19 @@ struct zone {
 
        /* zone_start_pfn == zone_start_paddr >> PAGE_SHIFT */
        unsigned long           zone_start_pfn;
+       /* first dynamically added pfn of the zone */
+       unsigned long           zone_dyn_start_pfn;
 
        /*
         * spanned_pages is the total pages spanned by the zone, including
         * holes, which is calculated as:
         *      spanned_pages = zone_end_pfn - zone_start_pfn;
         *
+        * init_spanned_pages is the boot/init time total pages spanned
+        * by the zone for differentiating statically assigned vs
+        * dynamically hot added memory to a zone.
+        *      init_spanned_pages = init_zone_end_pfn - zone_start_pfn;
+        *
         * present_pages is physical pages existing within the zone, which
         * is calculated as:
         *      present_pages = spanned_pages - absent_pages(pages in holes);
@@ -423,6 +433,7 @@ struct zone {
         */
        unsigned long           managed_pages;
        unsigned long           spanned_pages;
+       unsigned long           init_spanned_pages;
        unsigned long           present_pages;
 
        const char              *name;
@@ -546,6 +557,11 @@ static inline unsigned long zone_end_pfn(const struct zone *zone)
        return zone->zone_start_pfn + zone->spanned_pages;
 }
 
+static inline unsigned long zone_end_pfn_boot(const struct zone *zone)
+{
+       return zone->zone_start_pfn + zone->init_spanned_pages;
+}
+
 static inline bool zone_spans_pfn(const struct zone *zone, unsigned long pfn)
 {
        return zone->zone_start_pfn <= pfn && pfn < zone_end_pfn(zone);
diff --git a/mm/Kconfig b/mm/Kconfig
index 97a4e06b15c0..08a92a9c8fbd 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -652,7 +652,6 @@ config IDLE_PAGE_TRACKING
 config ZONE_DEVICE
        bool "Device memory (pmem, etc...) hotplug support" if EXPERT
        default !ZONE_DMA
-       depends on !ZONE_DMA
        depends on MEMORY_HOTPLUG
        depends on MEMORY_HOTREMOVE
        depends on X86_64 #arch_add_memory() comprehends device memory
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 4af58a3a8ffa..c3f0ff45bd47 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -300,6 +300,8 @@ static void __meminit grow_zone_span(struct zone *zone, unsigned long start_pfn,
 
        zone->spanned_pages = max(old_zone_end_pfn, end_pfn) -
                                zone->zone_start_pfn;
+       if (!zone->zone_dyn_start_pfn || start_pfn < zone->zone_dyn_start_pfn)
+               zone->zone_dyn_start_pfn = start_pfn;
 
        zone_span_writeunlock(zone);
 }
@@ -601,8 +603,9 @@ static int find_biggest_section_pfn(int nid, struct zone *zone,
 static void shrink_zone_span(struct zone *zone, unsigned long start_pfn,
                             unsigned long end_pfn)
 {
-       unsigned long zone_start_pfn = zone->zone_start_pfn;
+       unsigned long zone_start_pfn = zone->zone_dyn_start_pfn;
        unsigned long z = zone_end_pfn(zone); /* zone_end_pfn namespace clash */
+       bool dyn_zone = zone->zone_start_pfn == zone_start_pfn;
        unsigned long zone_end_pfn = z;
        unsigned long pfn;
        struct mem_section *ms;
@@ -619,7 +622,9 @@ static void shrink_zone_span(struct zone *zone, unsigned long start_pfn,
                pfn = find_smallest_section_pfn(nid, zone, end_pfn,
                                                zone_end_pfn);
                if (pfn) {
-                       zone->zone_start_pfn = pfn;
+                       if (dyn_zone)
+                               zone->zone_start_pfn = pfn;
+                       zone->zone_dyn_start_pfn = pfn;
                        zone->spanned_pages = zone_end_pfn - pfn;
                }
        } else if (zone_end_pfn == end_pfn) {
@@ -661,8 +666,10 @@ static void shrink_zone_span(struct zone *zone, unsigned long start_pfn,
        }
 
        /* The zone has no valid section */
-       zone->zone_start_pfn = 0;
-       zone->spanned_pages = 0;
+       if (dyn_zone)
+               zone->zone_start_pfn = 0;
+       zone->zone_dyn_start_pfn = 0;
+       zone->spanned_pages = zone->init_spanned_pages;
        zone_span_writeunlock(zone);
 }
 
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 63358d9f9aa9..2d8b1d602ff3 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -209,6 +209,10 @@ EXPORT_SYMBOL(totalram_pages);
 static char * const zone_names[MAX_NR_ZONES] = {
 #ifdef CONFIG_ZONE_DMA
         "DMA",
+#else
+#ifdef CONFIG_ZONE_DEVICE
+        "Device",
+#endif
 #endif
 #ifdef CONFIG_ZONE_DMA32
         "DMA32",
@@ -218,9 +222,6 @@ static char * const zone_names[MAX_NR_ZONES] = {
         "HighMem",
 #endif
         "Movable",
-#ifdef CONFIG_ZONE_DEVICE
-        "Device",
-#endif
 };
 
 compound_page_dtor * const compound_page_dtors[] = {
@@ -5082,6 +5083,8 @@ static void __meminit calculate_node_totalpages(struct pglist_data *pgdat,
                                                  node_start_pfn, node_end_pfn,
                                                  zholes_size);
                zone->spanned_pages = size;
+               zone->init_spanned_pages = size;
+               zone->zone_dyn_start_pfn = 0;
                zone->present_pages = real_size;
 
                totalpages += size;

Reply via email to