Bharata B Rao <bhar...@linux.ibm.com> writes:

> We hit the following BUG_ON when memory hotplugged before reboot
> is unplugged after reboot:
>
> kernel BUG at arch/powerpc/mm/pgtable-frag.c:113!
>
>  remove_pagetable+0x594/0x6a0 (unreliable)
>  remove_pagetable+0x94/0x6a0
>  vmemmap_free+0x394/0x410
>  sparse_remove_one_section+0x26c/0x2e8
>  __remove_pages+0x428/0x540
>  arch_remove_memory+0xd0/0x170
>  __remove_memory+0xd4/0x1a0
>  dlpar_remove_lmb+0xbc/0x110
>  dlpar_memory+0xa80/0xd20
>  handle_dlpar_errorlog+0xa8/0x160
>  pseries_hp_work_fn+0x2c/0x60
>  process_one_work+0x46c/0x860
>  worker_thread+0x364/0x5e0
>  kthread+0x1b0/0x1c0
>  ret_from_kernel_thread+0x5c/0x68
>
> This occurs because, during reboot-after-hotplug, the hotplugged
> memory range gets initialized as regular memory and its page
> tables are set up using the memblock allocator. This means the
> PMD and PTE fragment counts for those PMD and PTE pages were
> never initialized.
>
> The fix has three parts:
>
> - Re-walk the init_mm page tables from mem_init() and initialize
>   the PMD and PTE fragment counts appropriately, so that PMD and
>   PTE table pages allocated during early boot end up with a
>   fragment count of 1.
> - Convert the pages from memblock pages to regular pages so that
>   they can be freed back to the buddy allocator seamlessly. However,
>   we do this only for PMD and PTE pages, not for PUD pages.
>   PUD pages are freed via kmem_cache_free(), so we need to
>   identify memblock-allocated PUD pages and free them differently.
> - When doing early memblock-based allocation of PMD and PUD pages,
>   allocate at PAGE_SIZE granularity so that we are sure the
>   complete page is used as a page table page. PAGE_SIZE allocations
>   have an implication on the amount of memory used for page tables,
>   an example of which is shown below.
>
> Since we now do PAGE_SIZE allocations for both PUD tables and
> PMD tables (PTE table allocation is already PAGE_SIZE), we end up
> allocating more memory for the same amount of system RAM. Here is
> an example of how much more we end up allocating for page tables
> in the case of 64T of system RAM:
>
> 1. Mapping system RAM
>
> With each PGD entry spanning 512G, 64T of RAM needs 128 entries
> and hence 128 PUD tables. We use 1G mappings at the PUD level
> (level 2).
>
> With the default PUD_TABLE_SIZE (4K): 128 * 4K = 512K (8 64K pages)
> With PAGE_SIZE (64K) allocations: 128 * 64K = 8192K (128 64K pages)
>
> 2. Mapping struct pages (memmap)
>
> 64T of RAM needs 64G of memmap with a struct page size of 64B.
> Since the memmap array is mapped using 64K mappings, we need
> 64 PUD entries, i.e. 64 PMD tables (level 3), in total.
>
> With the default PMD_TABLE_SIZE (4K): 64 * 4K = 256K (4 64K pages)
> With PAGE_SIZE (64K) allocations: 64 * 64K = 4096K (64 64K pages)
>
> There is no change in PTE table (level 4) requirements, as early
> page table allocation already uses PAGE_SIZE PTE tables.
>
> So essentially, with this change we use 180 more 64K pages for a
> 64T system.
>
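
Just to sanity-check the arithmetic above, here is a small standalone
userspace calculation (not part of the patch; the K/G/T helpers and all
names are purely illustrative). It reproduces the 128/64 table counts
and the 180-page overhead quoted in the commit message:

#include <stdio.h>

#define K(x)	((unsigned long long)(x) << 10)
#define G(x)	((unsigned long long)(x) << 30)
#define T(x)	((unsigned long long)(x) << 40)

int main(void)
{
	unsigned long long ram = T(64);
	/* one PUD table per 512G PGD entry: 128 */
	unsigned long long pud_tables = ram / G(512);
	/* 64B of struct page per 64K page of RAM: 64G of memmap */
	unsigned long long memmap = ram / K(64) * 64;
	/* one PMD table per 1G PUD entry of memmap: 64 */
	unsigned long long pmd_tables = memmap / G(1);

	/* new cost: one 64K page per table; old cost: 4K tables
	 * packed 16-to-a-page */
	unsigned long long extra = (pud_tables + pmd_tables) -
		(pud_tables * K(4) + pmd_tables * K(4)) / K(64);

	printf("PUD tables: %llu, PMD tables: %llu, extra 64K pages: %llu\n",
	       pud_tables, pmd_tables, extra);
	return 0;
}

Built with a plain cc, this prints 128, 64 and 180, matching the totals
in the commit message.
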
> Reviewed-by: Aneesh Kumar K.V <aneesh.ku...@linux.ibm.com>
> Reported-by: Srikanth Aithal <srait...@linux.vnet.ibm.com>
> Signed-off-by: Bharata B Rao <bhar...@linux.ibm.com>
> ---
> v0 - https://lists.ozlabs.org/pipermail/linuxppc-dev/2019-June/192242.html
> Changes in v1:
> - Handling PUD table freeing too.
> - Added details about how much extra memory we use up with
>   this approach into the commit message
> - A few cleanups and renames
>
>  arch/powerpc/include/asm/book3s/64/pgalloc.h |  7 +-
>  arch/powerpc/include/asm/book3s/64/radix.h   |  1 +
>  arch/powerpc/include/asm/sparsemem.h         |  1 +
>  arch/powerpc/mm/book3s64/pgtable.c           | 15 +++-
>  arch/powerpc/mm/book3s64/radix_pgtable.c     | 79 +++++++++++++++++++-
>  arch/powerpc/mm/mem.c                        |  5 ++
>  6 files changed, 104 insertions(+), 4 deletions(-)
>
> diff --git a/arch/powerpc/include/asm/book3s/64/pgalloc.h b/arch/powerpc/include/asm/book3s/64/pgalloc.h
> index d5a44912902f..9ae134f260be 100644
> --- a/arch/powerpc/include/asm/book3s/64/pgalloc.h
> +++ b/arch/powerpc/include/asm/book3s/64/pgalloc.h
> @@ -111,7 +111,12 @@ static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
>
>  static inline void pud_free(struct mm_struct *mm, pud_t *pud)
>  {
> -	kmem_cache_free(PGT_CACHE(PUD_CACHE_INDEX), pud);
> +	struct page *page = virt_to_page(pud);
> +
> +	if (PageReserved(page))
> +		free_reserved_page(page);
> +	else
> +		kmem_cache_free(PGT_CACHE(PUD_CACHE_INDEX), pud);
>  }
>
>  static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
> diff --git a/arch/powerpc/include/asm/book3s/64/radix.h b/arch/powerpc/include/asm/book3s/64/radix.h
> index 574eca33f893..4320f2790e8d 100644
> --- a/arch/powerpc/include/asm/book3s/64/radix.h
> +++ b/arch/powerpc/include/asm/book3s/64/radix.h
> @@ -285,6 +285,7 @@ static inline unsigned long radix__get_tree_size(void)
>  #ifdef CONFIG_MEMORY_HOTPLUG
>  int radix__create_section_mapping(unsigned long start, unsigned long end, int nid);
>  int radix__remove_section_mapping(unsigned long start, unsigned long end);
> +void radix__fixup_pgtable_fragments(void);
>  #endif /* CONFIG_MEMORY_HOTPLUG */
>  #endif /* __ASSEMBLY__ */
>  #endif
> diff --git a/arch/powerpc/include/asm/sparsemem.h b/arch/powerpc/include/asm/sparsemem.h
> index 3192d454a733..e662f9232d35 100644
> --- a/arch/powerpc/include/asm/sparsemem.h
> +++ b/arch/powerpc/include/asm/sparsemem.h
> @@ -15,6 +15,7 @@
>  #ifdef CONFIG_MEMORY_HOTPLUG
>  extern int create_section_mapping(unsigned long start, unsigned long end, int nid);
>  extern int remove_section_mapping(unsigned long start, unsigned long end);
> +void fixup_pgtable_fragments(void);
>
>  #ifdef CONFIG_PPC_BOOK3S_64
>  extern int resize_hpt_for_hotplug(unsigned long new_mem_size);
> diff --git a/arch/powerpc/mm/book3s64/pgtable.c b/arch/powerpc/mm/book3s64/pgtable.c
> index 01bc9663360d..c8fa94802620 100644
> --- a/arch/powerpc/mm/book3s64/pgtable.c
> +++ b/arch/powerpc/mm/book3s64/pgtable.c
> @@ -186,6 +186,13 @@ int __meminit remove_section_mapping(unsigned long start, unsigned long end)
>
>  	return hash__remove_section_mapping(start, end);
>  }
> +
> +void fixup_pgtable_fragments(void)
> +{
> +	if (radix_enabled())
> +		radix__fixup_pgtable_fragments();
> +}
> +
>  #endif /* CONFIG_MEMORY_HOTPLUG */
>
>  void __init mmu_partition_table_init(void)
> @@ -326,6 +333,8 @@ void pmd_fragment_free(unsigned long *pmd)
>
>  static inline void pgtable_free(void *table, int index)
>  {
> +	struct page *page;
> +
>  	switch (index) {
>  	case PTE_INDEX:
>  		pte_fragment_free(table, 0);
> @@ -334,7 +343,11 @@ static inline void pgtable_free(void *table, int index)
>  		pmd_fragment_free(table);
>  		break;
>  	case PUD_INDEX:
> -		kmem_cache_free(PGT_CACHE(PUD_CACHE_INDEX), table);
> +		page = virt_to_page(table);
> +		if (PageReserved(page))
> +			free_reserved_page(page);
> +		else
> +			kmem_cache_free(PGT_CACHE(PUD_CACHE_INDEX), table);
>  		break;
>  #if defined(CONFIG_PPC_4K_PAGES) && defined(CONFIG_HUGETLB_PAGE)
>  	/* 16M hugepd directory at pud level */
> diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c
> index 273ae66a9a45..4167e1ba1c58 100644
> --- a/arch/powerpc/mm/book3s64/radix_pgtable.c
> +++ b/arch/powerpc/mm/book3s64/radix_pgtable.c
> @@ -32,6 +32,81 @@
>  unsigned int mmu_pid_bits;
>  unsigned int mmu_base_pid;
>
> +/*
> + * Since we know that this page is a memblock-allocated page,
> + * convert it into a normal page in addition to fixing the fragment
> + * count.
> + */
> +static void fix_fragment_count(struct page *page)
> +{
> +	ClearPageReserved(page);
> +	init_page_count(page);
> +	adjust_managed_page_count(page, 1);
> +	atomic_inc(&page->pt_frag_refcount);
> +}
> +
> +static void fixup_pte_fragments(pmd_t *pmd)
> +{
> +	int i;
> +
> +	for (i = 0; i < PTRS_PER_PMD; i++, pmd++) {
> +		pte_t *pte;
> +
> +		if (pmd_none(*pmd))
> +			continue;
> +		if (pmd_huge(*pmd))
> +			continue;
> +
> +		pte = pte_offset_kernel(pmd, 0);
> +		if (!pte_none(*pte))
> +			fix_fragment_count(virt_to_page(pte));
> +	}
> +}
> +
> +static void fixup_pmd_fragments(pud_t *pud)
> +{
> +	int i;
> +
> +	for (i = 0; i < PTRS_PER_PUD; i++, pud++) {
> +		pmd_t *pmd;
> +
> +		if (pud_none(*pud))
> +			continue;
> +		if (pud_huge(*pud))
> +			continue;
> +
> +		pmd = pmd_offset(pud, 0);
> +		if (!pmd_none(*pmd))
> +			fix_fragment_count(virt_to_page(pmd));
> +		fixup_pte_fragments(pmd);
> +	}
> +}
> +
> +/*
> + * Walk the init_mm page tables and fixup the PMD and PTE fragment
> + * counts. This allows the PUD, PMD and PTE pages to be freed
> + * back to the buddy allocator properly during memory unplug.
> + */
> +void radix__fixup_pgtable_fragments(void)
> +{
> +	int i;
> +	pgd_t *pgd = pgd_offset_k(0UL);
> +
> +	spin_lock(&init_mm.page_table_lock);
> +	for (i = 0; i < PTRS_PER_PGD; i++, pgd++) {
> +		pud_t *pud;
> +
> +		if (pgd_none(*pgd))
> +			continue;
> +		if (pgd_huge(*pgd))
> +			continue;
> +
> +		pud = pud_offset(pgd, 0);
> +		fixup_pmd_fragments(pud);
> +	}
> +	spin_unlock(&init_mm.page_table_lock);
> +}
> +
>  static int native_register_process_table(unsigned long base, unsigned long pg_sz,
>  					  unsigned long table_size)
>  {
> @@ -80,7 +155,7 @@ static int early_map_kernel_page(unsigned long ea, unsigned long pa,
>
>  	pgdp = pgd_offset_k(ea);
>  	if (pgd_none(*pgdp)) {
> -		pudp = early_alloc_pgtable(PUD_TABLE_SIZE, nid,
> +		pudp = early_alloc_pgtable(PAGE_SIZE, nid,
>  					   region_start, region_end);
>  		pgd_populate(&init_mm, pgdp, pudp);
>  	}
> @@ -90,7 +165,7 @@ static int early_map_kernel_page(unsigned long ea, unsigned long pa,
>  		goto set_the_pte;
>  	}
>  	if (pud_none(*pudp)) {
> -		pmdp = early_alloc_pgtable(PMD_TABLE_SIZE, nid,
> +		pmdp = early_alloc_pgtable(PAGE_SIZE, nid,
>  					   region_start, region_end);
>  		pud_populate(&init_mm, pudp, pmdp);
>  	}
> diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
> index cba29131bccc..a8788b404266 100644
> --- a/arch/powerpc/mm/mem.c
> +++ b/arch/powerpc/mm/mem.c
> @@ -51,6 +51,10 @@
>
>  #include <mm/mmu_decl.h>
>
> +void __weak fixup_pgtable_fragments(void)
> +{
> +}
> +
>  #ifndef CPU_FTR_COHERENT_ICACHE
>  #define CPU_FTR_COHERENT_ICACHE	0	/* XXX for now */
>  #define CPU_FTR_NOEXECUTE	0
> @@ -276,6 +280,7 @@ void __init mem_init(void)
>  	set_max_mapnr(max_pfn);
>  	memblock_free_all();
>
> +	fixup_pgtable_fragments();
>  #ifdef CONFIG_HIGHMEM
>  	{
>  		unsigned long pfn, highmem_mapnr;
> --
> 2.17.1
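
For readers tracing the failure: the BUG_ON in the backtrace is the
fragment refcount check at the top of pte_fragment_free(). A simplified
sketch of that free path (paraphrased from arch/powerpc/mm/pgtable-frag.c,
not the verbatim kernel code; error handling and the user/kernel
distinction are omitted) shows why an uninitialized pt_frag_refcount
trips it, and why seeding the count to 1 in the fixup walk is enough:

static void pte_fragment_free_sketch(unsigned long *table)
{
	struct page *page = virt_to_page(table);

	/* Fires if the count was never set up -- the memblock-allocated
	 * case this patch fixes (pgtable-frag.c:113 in the backtrace). */
	BUG_ON(atomic_read(&page->pt_frag_refcount) <= 0);

	/* A count of 1, as seeded by radix__fixup_pgtable_fragments(),
	 * drops to 0 here and the page returns to the buddy allocator. */
	if (atomic_dec_and_test(&page->pt_frag_refcount))
		__free_page(page);
}

With the fixup walk run from mem_init(), every early page table page
starts out with pt_frag_refcount == 1, so the unplug path decrements it
to zero and frees the page instead of hitting the BUG_ON.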