Let's refactor that code. We want to check if we can offline memory blocks. Add a new function is_mem_section_offlineable() for that and make it call is_mem_section_offlineable() for each contained section. Within is_mem_section_offlineable(), add some more sanity checks and directly bail out if the section contains holes or if it spans multiple zones.
The old code was inherently racy with concurrent offlining/memory unplug. Let's avoid that and grab the device_hotplug_lock. Luckily we are already holding it when calling from powerpc code. Note1: If somebody wants to export this function for use in driver code, we need a variant that takes the device_hotplug_lock() Note2: If we could have a zombie device (not clear yet), the present section checks would properly bail out early. Note3: I'd prefer the mem_hotplug_lock in read, but as we are about to change the locking on the removal path (IOW, don't hold it when removing memory block devices), I do not want to go down that path. Note4: For now we would have returned "removable" although we would block offlining due to memory holes, multiple zones, or missing sections. Tested with DIMMs on x86-64. Compile-tested on Power. Cc: Benjamin Herrenschmidt <b...@kernel.crashing.org> Cc: Paul Mackerras <pau...@samba.org> Cc: Michael Ellerman <m...@ellerman.id.au> Cc: Greg Kroah-Hartman <gre...@linuxfoundation.org> Cc: "Rafael J. Wysocki" <raf...@kernel.org> Cc: Andrew Morton <a...@linux-foundation.org> Cc: Leonardo Bras <leona...@linux.ibm.com> Cc: Nathan Lynch <nath...@linux.ibm.com> Cc: Allison Randal <alli...@lohutok.net> Cc: Nathan Fontenot <nf...@linux.vnet.ibm.com> Cc: Thomas Gleixner <t...@linutronix.de> Cc: Michal Hocko <mho...@suse.com> Cc: Dan Williams <dan.j.willi...@intel.com> Cc: Stephen Rothwell <s...@canb.auug.org.au> Cc: Anshuman Khandual <anshuman.khand...@arm.com> Cc: lantianyu1...@gmail.com Cc: linuxppc-dev@lists.ozlabs.org Signed-off-by: David Hildenbrand <da...@redhat.com> --- .../platforms/pseries/hotplug-memory.c | 24 ++----- drivers/base/memory.c | 37 ++++++---- include/linux/memory.h | 1 + include/linux/memory_hotplug.h | 5 +- mm/memory_hotplug.c | 68 +++++++++---------- 5 files changed, 67 insertions(+), 68 deletions(-) diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c b/arch/powerpc/platforms/pseries/hotplug-memory.c index c126b94d1943..8d80159465e4 100644 --- a/arch/powerpc/platforms/pseries/hotplug-memory.c +++ b/arch/powerpc/platforms/pseries/hotplug-memory.c @@ -337,34 +337,24 @@ static int pseries_remove_mem_node(struct device_node *np) static bool lmb_is_removable(struct drmem_lmb *lmb) { - int i, scns_per_block; - bool rc = true; - unsigned long pfn, block_sz; - u64 phys_addr; + struct memory_block *mem; + bool rc = false; if (!(lmb->flags & DRCONF_MEM_ASSIGNED)) return false; - block_sz = memory_block_size_bytes(); - scns_per_block = block_sz / MIN_MEMORY_BLOCK_SIZE; - phys_addr = lmb->base_addr; - #ifdef CONFIG_FA_DUMP /* * Don't hot-remove memory that falls in fadump boot memory area * and memory that is reserved for capturing old kernel memory. */ - if (is_fadump_memory_area(phys_addr, block_sz)) + if (is_fadump_memory_area(lmb->base_addr, memory_block_size_bytes())) return false; #endif - - for (i = 0; i < scns_per_block; i++) { - pfn = PFN_DOWN(phys_addr); - if (!pfn_present(pfn)) - continue; - - rc = rc && is_mem_section_removable(pfn, PAGES_PER_SECTION); - phys_addr += MIN_MEMORY_BLOCK_SIZE; + mem = lmb_to_memblock(lmb); + if (mem) { + rc = is_memory_block_offlineable(mem); + put_device(&mem->dev); } return rc; diff --git a/drivers/base/memory.c b/drivers/base/memory.c index c6d288fad493..f744250c34d0 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -104,6 +104,25 @@ static ssize_t phys_index_show(struct device *dev, return sprintf(buf, "%08lx\n", phys_index); } +/* + * Test if a memory block is likely to be offlineable. Returns true if + * the block is already offline. + * + * Called under device_hotplug_lock. + */ +bool is_memory_block_offlineable(struct memory_block *mem) +{ + int i; + + if (mem->state != MEM_ONLINE) + return true; + + for (i = 0; i < sections_per_block; i++) + if (!is_mem_section_offlineable(mem->start_section_nr + i)) + return false; + return true; +} + /* * Show whether the memory block is likely to be offlineable (or is already * offline). Once offline, the memory block could be removed. The return @@ -114,20 +133,14 @@ static ssize_t removable_show(struct device *dev, struct device_attribute *attr, char *buf) { struct memory_block *mem = to_memory_block(dev); - unsigned long pfn; - int ret = 1, i; - - if (mem->state != MEM_ONLINE) - goto out; + int ret; - for (i = 0; i < sections_per_block; i++) { - if (!present_section_nr(mem->start_section_nr + i)) - continue; - pfn = section_nr_to_pfn(mem->start_section_nr + i); - ret &= is_mem_section_removable(pfn, PAGES_PER_SECTION); - } + ret = lock_device_hotplug_sysfs(); + if (ret) + return ret; + ret = is_memory_block_offlineable(mem); + unlock_device_hotplug(); -out: return sprintf(buf, "%d\n", ret); } diff --git a/include/linux/memory.h b/include/linux/memory.h index 0b8d791b6669..faf03eb64ecc 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h @@ -91,6 +91,7 @@ typedef int (*walk_memory_blocks_func_t)(struct memory_block *, void *); extern int walk_memory_blocks(unsigned long start, unsigned long size, void *arg, walk_memory_blocks_func_t func); extern int for_each_memory_block(void *arg, walk_memory_blocks_func_t func); +extern bool is_memory_block_offlineable(struct memory_block *mem); #define CONFIG_MEM_BLOCK_SIZE (PAGES_PER_SECTION<<PAGE_SHIFT) #endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */ diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index f4d59155f3d4..8d087772f748 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -306,15 +306,14 @@ static inline void pgdat_resize_init(struct pglist_data *pgdat) {} #ifdef CONFIG_MEMORY_HOTREMOVE -extern bool is_mem_section_removable(unsigned long pfn, unsigned long nr_pages); +extern bool is_mem_section_offlineable(unsigned long nr); extern void try_offline_node(int nid); extern int offline_pages(unsigned long start_pfn, unsigned long nr_pages); extern int remove_memory(int nid, u64 start, u64 size); extern void __remove_memory(int nid, u64 start, u64 size); #else -static inline bool is_mem_section_removable(unsigned long pfn, - unsigned long nr_pages) +static inline bool is_mem_section_offlineable(unsigned long nr) { return false; } diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 7a6de9b0dcab..a6d14d2b7f0c 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1128,46 +1128,51 @@ static unsigned long next_active_pageblock(unsigned long pfn) return pfn + pageblock_nr_pages; } -static bool is_pageblock_removable_nolock(unsigned long pfn) +static int count_system_ram_pages_cb(unsigned long start_pfn, + unsigned long nr_pages, void *data) { - struct page *page = pfn_to_page(pfn); + unsigned long *nr_system_ram_pages = data; + + *nr_system_ram_pages += nr_pages; + return 0; +} + +/* + * Check if a section is likely to be offlineable. + * + * Called with device_hotplug_lock. + */ +bool is_mem_section_offlineable(unsigned long nr) +{ + const unsigned long start_pfn = section_nr_to_pfn(nr); + const unsigned long end_pfn = start_pfn + PAGES_PER_SECTION; + unsigned long pfn, nr_pages = 0; struct zone *zone; - /* - * We have to be careful here because we are iterating over memory - * sections which are not zone aware so we might end up outside of - * the zone but still within the section. - * We have to take care about the node as well. If the node is offline - * its NODE_DATA will be NULL - see page_zone. - */ - if (!node_online(page_to_nid(page))) + if (!present_section_nr(nr)) return false; - - zone = page_zone(page); - pfn = page_to_pfn(page); - if (!zone_spans_pfn(zone, pfn)) + if (!online_section_nr(nr)) return false; - return !has_unmovable_pages(zone, page, MIGRATE_MOVABLE, - MEMORY_OFFLINE); -} - -/* Checks if this range of memory is likely to be hot-removable. */ -bool is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages) -{ - unsigned long end_pfn, pfn; + /* we don't allow to offline sections with holes */ + walk_system_ram_range(start_pfn, PAGES_PER_SECTION, &nr_pages, + count_system_ram_pages_cb); + if (nr_pages != PAGES_PER_SECTION) + return false; - end_pfn = min(start_pfn + nr_pages, - zone_end_pfn(page_zone(pfn_to_page(start_pfn)))); + /* we don't allow to offline sections with mixed zones/nodes */ + zone = test_pages_in_a_zone(start_pfn, end_pfn); + if (!zone) + return false; - /* Check the starting page of each pageblock within the range */ + /* check each pageblock if it contains unmovable pages */ for (pfn = start_pfn; pfn < end_pfn; pfn = next_active_pageblock(pfn)) { - if (!is_pageblock_removable_nolock(pfn)) + if (has_unmovable_pages(zone, pfn_to_page(pfn), MIGRATE_MOVABLE, + MEMORY_OFFLINE)) return false; cond_resched(); } - /* All pageblocks in the memory block are likely to be hot-removable */ return true; } @@ -1436,15 +1441,6 @@ static void node_states_clear_node(int node, struct memory_notify *arg) node_clear_state(node, N_MEMORY); } -static int count_system_ram_pages_cb(unsigned long start_pfn, - unsigned long nr_pages, void *data) -{ - unsigned long *nr_system_ram_pages = data; - - *nr_system_ram_pages += nr_pages; - return 0; -} - static int __ref __offline_pages(unsigned long start_pfn, unsigned long end_pfn) { -- 2.24.1