On 14.05.19 11:00, Anshuman Khandual wrote:
> Memory removal from an arch perspective involves tearing down two different
> kernel based mappings, i.e. vmemmap and linear, while releasing the related
> page table pages and any mapped pages allocated for the given physical
> memory range to be removed.
>
> Define a common kernel page table tear down helper remove_pagetable() which
> can be used to unmap a given kernel virtual address range. In effect it can
> tear down both vmemmap and kernel linear mappings. This new helper is called
> from both vmemmap_free() and __remove_pgd_mapping() during memory removal.
>
> For the linear mapping there are no actual allocated pages mapped to create
> the translation. Any pfn on a given entry is derived from the physical
> address (__va(PA) --> PA) whose linear translation is to be created. These
> pages need not be freed as they were never allocated in the first place.
> But for vmemmap, which is a real virtual mapping (like vmalloc), physical
> pages are allocated either from buddy or memblock and get mapped in the
> kernel page table. These allocated and mapped pages need to be freed during
> translation tear down. Page table pages, however, need to be freed in both
> cases.
>
> These mappings need to be differentiated when deciding whether a mapped
> page at any level, i.e. [pte|pmd|pud]_page(), should be freed or not.
> Callers of the mapping tear down process should pass the 'sparse_vmap'
> variable identifying kernel vmemmap mappings.
>
> While here, update arch_add_memory() to handle __add_pages() failures by
> just unmapping the recently added kernel linear mapping. Now enable memory
> hot remove on arm64 platforms by default with ARCH_ENABLE_MEMORY_HOTREMOVE.
>
> This implementation is overall inspired by the kernel page table tear down
> procedure on the x86 architecture.
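
The sparse_vmap distinction is easiest to see in the two call sites this
patch introduces (quoted in full below); they differ only in that last
argument:

	/* vmemmap teardown: the mapped pages were really allocated, free them */
	remove_pagetable(start, end, true);

	/* linear map teardown: entries alias physical memory directly, keep pages */
	remove_pagetable(start, start + size, false);
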
>
> Signed-off-by: Anshuman Khandual <anshuman.khand...@arm.com>
> ---
>  arch/arm64/Kconfig  |   3 +
>  arch/arm64/mm/mmu.c | 204 +++++++++++++++++++++++++++++++++++++++++++++++++++-
>  2 files changed, 205 insertions(+), 2 deletions(-)
>
> diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
> index 1c0cb51..bb4e571 100644
> --- a/arch/arm64/Kconfig
> +++ b/arch/arm64/Kconfig
> @@ -268,6 +268,9 @@ config HAVE_GENERIC_GUP
>  config ARCH_ENABLE_MEMORY_HOTPLUG
>  	def_bool y
>  
> +config ARCH_ENABLE_MEMORY_HOTREMOVE
> +	def_bool y
> +
>  config SMP
>  	def_bool y
>  
> diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
> index 37a902c..bd2d003 100644
> --- a/arch/arm64/mm/mmu.c
> +++ b/arch/arm64/mm/mmu.c
> @@ -733,6 +733,177 @@ int kern_addr_valid(unsigned long addr)
>  
>  	return pfn_valid(pte_pfn(pte));
>  }
> +
> +#ifdef CONFIG_MEMORY_HOTPLUG
> +static void free_hotplug_page_range(struct page *page, ssize_t size)
> +{
> +	WARN_ON(PageReserved(page));
> +	free_pages((unsigned long)page_address(page), get_order(size));
> +}
> +
> +static void free_hotplug_pgtable_page(struct page *page)
> +{
> +	free_hotplug_page_range(page, PAGE_SIZE);
> +}
> +
> +static void free_pte_table(pte_t *ptep, pmd_t *pmdp, unsigned long addr)
> +{
> +	struct page *page;
> +	int i;
> +
> +	for (i = 0; i < PTRS_PER_PTE; i++) {
> +		if (!pte_none(ptep[i]))
> +			return;
> +	}
> +
> +	page = pmd_page(*pmdp);
> +	pmd_clear(pmdp);
> +	__flush_tlb_kernel_pgtable(addr);
> +	free_hotplug_pgtable_page(page);
> +}
> +
> +#if (CONFIG_PGTABLE_LEVELS > 2)
> +static void free_pmd_table(pmd_t *pmdp, pud_t *pudp, unsigned long addr)
> +{
> +	struct page *page;
> +	int i;
> +
> +	for (i = 0; i < PTRS_PER_PMD; i++) {
> +		if (!pmd_none(pmdp[i]))
> +			return;
> +	}
> +
> +	page = pud_page(*pudp);
> +	pud_clear(pudp);
> +	__flush_tlb_kernel_pgtable(addr);
> +	free_hotplug_pgtable_page(page);
> +}
> +#else
> +static void free_pmd_table(pmd_t *pmdp, pud_t *pudp, unsigned long addr) { }
> +#endif
> +
> +#if (CONFIG_PGTABLE_LEVELS > 3)
> +static void free_pud_table(pud_t *pudp, pgd_t *pgdp, unsigned long addr)
> +{
> +	struct page *page;
> +	int i;
> +
> +	for (i = 0; i < PTRS_PER_PUD; i++) {
> +		if (!pud_none(pudp[i]))
> +			return;
> +	}
> +
> +	page = pgd_page(*pgdp);
> +	pgd_clear(pgdp);
> +	__flush_tlb_kernel_pgtable(addr);
> +	free_hotplug_pgtable_page(page);
> +}
> +#else
> +static void free_pud_table(pud_t *pudp, pgd_t *pgdp, unsigned long addr) { }
> +#endif
> +
> +static void
> +remove_pte_table(pmd_t *pmdp, unsigned long addr,
> +			unsigned long end, bool sparse_vmap)
> +{
> +	struct page *page;
> +	pte_t *ptep;
> +	unsigned long start = addr;
> +
> +	for (; addr < end; addr += PAGE_SIZE) {
> +		ptep = pte_offset_kernel(pmdp, addr);
> +		if (!pte_present(*ptep))
> +			continue;
> +
> +		if (sparse_vmap) {
> +			page = pte_page(READ_ONCE(*ptep));
> +			free_hotplug_page_range(page, PAGE_SIZE);
> +		}
> +		pte_clear(&init_mm, addr, ptep);
> +	}
> +	flush_tlb_kernel_range(start, end);
> +}
> +
> +static void
> +remove_pmd_table(pud_t *pudp, unsigned long addr,
> +			unsigned long end, bool sparse_vmap)
> +{
> +	unsigned long next;
> +	struct page *page;
> +	pte_t *ptep_base;
> +	pmd_t *pmdp;
> +
> +	for (; addr < end; addr = next) {
> +		next = pmd_addr_end(addr, end);
> +		pmdp = pmd_offset(pudp, addr);
> +		if (!pmd_present(*pmdp))
> +			continue;
> +
> +		if (pmd_sect(*pmdp)) {
> +			if (sparse_vmap) {
> +				page = pmd_page(READ_ONCE(*pmdp));
> +				free_hotplug_page_range(page, PMD_SIZE);
> +			}
> +			pmd_clear(pmdp);
> +			continue;
> +		}
> +		ptep_base = pte_offset_kernel(pmdp, 0UL);
> +		remove_pte_table(pmdp, addr, next, sparse_vmap);
> +		free_pte_table(ptep_base, pmdp, addr);
> +	}
> +}
> +
> +static void
> +remove_pud_table(pgd_t *pgdp, unsigned long addr,
> +			unsigned long end, bool sparse_vmap)
> +{
> +	unsigned long next;
> +	struct page *page;
> +	pmd_t *pmdp_base;
> +	pud_t *pudp;
> +
> +	for (; addr < end; addr = next) {
> +		next = pud_addr_end(addr, end);
> +		pudp = pud_offset(pgdp, addr);
> +		if (!pud_present(*pudp))
> +			continue;
> +
> +		if (pud_sect(*pudp)) {
> +			if (sparse_vmap) {
> +				page = pud_page(READ_ONCE(*pudp));
> +				free_hotplug_page_range(page, PUD_SIZE);
> +			}
> +			pud_clear(pudp);
> +			continue;
> +		}
> +		pmdp_base = pmd_offset(pudp, 0UL);
> +		remove_pmd_table(pudp, addr, next, sparse_vmap);
> +		free_pmd_table(pmdp_base, pudp, addr);
> +	}
> +}
> +
> +static void
> +remove_pagetable(unsigned long start, unsigned long end, bool sparse_vmap)
> +{
> +	unsigned long addr, next;
> +	pud_t *pudp_base;
> +	pgd_t *pgdp;
> +
> +	spin_lock(&init_mm.page_table_lock);
> +	for (addr = start; addr < end; addr = next) {
> +		next = pgd_addr_end(addr, end);
> +		pgdp = pgd_offset_k(addr);
> +		if (!pgd_present(*pgdp))
> +			continue;
> +
> +		pudp_base = pud_offset(pgdp, 0UL);
> +		remove_pud_table(pgdp, addr, next, sparse_vmap);
> +		free_pud_table(pudp_base, pgdp, addr);
> +	}
> +	spin_unlock(&init_mm.page_table_lock);
> +}
> +#endif
> +
>  #ifdef CONFIG_SPARSEMEM_VMEMMAP
>  #if !ARM64_SWAPPER_USES_SECTION_MAPS
>  int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
> @@ -780,6 +951,9 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
>  void vmemmap_free(unsigned long start, unsigned long end,
>  		struct vmem_altmap *altmap)
>  {
> +#ifdef CONFIG_MEMORY_HOTPLUG
> +	remove_pagetable(start, end, true);
> +#endif
>  }
>  #endif /* CONFIG_SPARSEMEM_VMEMMAP */
>  
> @@ -1070,10 +1244,16 @@ int p4d_free_pud_page(p4d_t *p4d, unsigned long addr)
>  }
>  
>  #ifdef CONFIG_MEMORY_HOTPLUG
> +static void __remove_pgd_mapping(pgd_t *pgdir, unsigned long start, u64 size)
> +{
> +	WARN_ON(pgdir != init_mm.pgd);
> +	remove_pagetable(start, start + size, false);
> +}
> +
>  int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap,
>  			bool want_memblock)
>  {
> -	int flags = 0;
> +	int ret, flags = 0;
>  
>  	if (rodata_full || debug_pagealloc_enabled())
>  		flags = NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS;
> @@ -1081,7 +1261,27 @@ int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap,
>  	__create_pgd_mapping(swapper_pg_dir, start, __phys_to_virt(start),
>  			     size, PAGE_KERNEL, __pgd_pgtable_alloc, flags);
>  
> -	return __add_pages(nid, start >> PAGE_SHIFT, size >> PAGE_SHIFT,
> +	ret = __add_pages(nid, start >> PAGE_SHIFT, size >> PAGE_SHIFT,
>  			   altmap, want_memblock);
> +	if (ret)
> +		__remove_pgd_mapping(swapper_pg_dir,
> +				     __phys_to_virt(start), size);
> +	return ret;
>  }
> +
> +#ifdef CONFIG_MEMORY_HOTREMOVE
> +int arch_remove_memory(int nid, u64 start, u64 size,
> +			struct vmem_altmap *altmap)

You need to rebase to -next (Linus' master soon). This function is now
void ...

> +{
> +	unsigned long start_pfn = start >> PAGE_SHIFT;
> +	unsigned long nr_pages = size >> PAGE_SHIFT;
> +	struct zone *zone = page_zone(pfn_to_page(start_pfn));
> +	int ret = 0;
> +
> +	ret = __remove_pages(zone, start_pfn, nr_pages, altmap);

... and this call can no longer fail :) Which simplifies this patch.

> +	if (!ret)
> +		__remove_pgd_mapping(swapper_pg_dir,
> +				     __phys_to_virt(start), size);
> +	return ret;
> +}
> +#endif
>  #endif
> -- 

Thanks,

David / dhildenb
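
P.S.: a minimal sketch of how this function could end up looking after the
rebase. It assumes the -next interface, where arch_remove_memory() returns
void and __remove_pages() cannot fail; treat the exact signatures as
illustrative until the rebase is actually done:

	/*
	 * Sketch only, against the assumed -next interface: since
	 * __remove_pages() cannot fail, the error handling around
	 * __remove_pgd_mapping() goes away entirely.
	 */
	void arch_remove_memory(int nid, u64 start, u64 size,
				struct vmem_altmap *altmap)
	{
		unsigned long start_pfn = start >> PAGE_SHIFT;
		unsigned long nr_pages = size >> PAGE_SHIFT;
		struct zone *zone = page_zone(pfn_to_page(start_pfn));

		/* Remove the sections/memmap first ... */
		__remove_pages(zone, start_pfn, nr_pages, altmap);
		/* ... then unconditionally tear down the linear mapping. */
		__remove_pgd_mapping(swapper_pg_dir,
				     __phys_to_virt(start), size);
	}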