Bharata B Rao <bhar...@linux.ibm.com> writes:

> We can hit the following BUG_ON during memory unplug:
>
> kernel BUG at arch/powerpc/mm/book3s64/pgtable.c:344!
> Oops: Exception in kernel mode, sig: 5 [#1]
> LE PAGE_SIZE=64K MMU=Radix SMP NR_CPUS=2048 NUMA pSeries
> NIP [c000000000097d48] pmd_fragment_free+0x48/0xd0
> LR [c0000000016aaefc] remove_pagetable+0x494/0x530
> Call Trace:
> _raw_spin_lock+0x54/0x80 (unreliable)
> remove_pagetable+0x2b0/0x530
> radix__remove_section_mapping+0x18/0x2c
> remove_section_mapping+0x38/0x5c
> arch_remove_memory+0x124/0x190
> try_remove_memory+0xd0/0x1c0
> __remove_memory+0x20/0x40
> dlpar_remove_lmb+0xbc/0x110
> dlpar_memory+0xa90/0xd40
> handle_dlpar_errorlog+0xa8/0x160
> pseries_hp_work_fn+0x2c/0x60
> process_one_work+0x47c/0x870
> worker_thread+0x364/0x5e0
> kthread+0x1b4/0x1c0
> ret_from_kernel_thread+0x5c/0x74
>
> This occurs when unplug is attempted for memory that was mapped
> using memblock pages as part of early kernel page table setup,
> since we would not have initialized the PMD or PTE fragment count
> for those PMD or PTE pages.
>
> The fix has 3 parts:
>
> - Re-walk the init_mm page tables from mem_init() and initialize
>   the PMD and PTE fragment count to 1.
> - When freeing PUD, PMD and PTE page table pages, check explicitly
>   if they come from memblock and if so, free them appropriately.
> - When we do early memblock-based allocation of PMD and PUD pages,
>   allocate in PAGE_SIZE granularity so that we are sure the
>   complete page is used as a page table page.
>
> Since we now do PAGE_SIZE allocations for both PUD and PMD tables
> (PTE table allocation is already PAGE_SIZE), we end up allocating
> more memory for the same amount of system RAM. Here is a comparison
> of how much more we need for 64T and 2G systems after this patch:
>
> 1. 64T system
> -------------
> 64T RAM would need 64G for vmemmap with struct page size being 64B.
>
> 128 PUD tables for 64T memory (1G mappings)
> 1 PUD table and 64 PMD tables for 64G vmemmap (2M mappings)
>
> With default PUD[PMD]_TABLE_SIZE(4K), (128+1+64)*4K=772K
> With PAGE_SIZE(64K) table allocations, (128+1+64)*64K=12352K
>
> 2. 2G system
> ------------
> 2G RAM would need 2M for vmemmap with struct page size being 64B.
>
> 1 PUD table for 2G memory (1G mapping)
> 1 PUD table and 1 PMD table for 2M vmemmap (2M mappings)
>
> With default PUD[PMD]_TABLE_SIZE(4K), (1+1+1)*4K=12K
> With PAGE_SIZE(64K) table allocations, (1+1+1)*64K=192K
>
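The PageReserved() test is the recurring idea here: it shows up in
pud_free(), pmd_fragment_free(), pgtable_free() and pte_fragment_free()
below. For readers skimming the diff, the rule condenses to the sketch
that follows; free_pgtable_page() is an invented name for illustration,
not a helper the patch adds:

	/*
	 * Illustrative only: page table pages handed out by memblock
	 * during early boot are marked PG_reserved and never went
	 * through the slab or page-table ctor paths, so they cannot
	 * simply be handed back with __free_page()/kmem_cache_free().
	 */
	static void free_pgtable_page(struct page *page)
	{
		if (PageReserved(page))
			free_reserved_page(page); /* early memblock page */
		else
			__free_page(page);        /* runtime allocation */
	}

(The PUD case frees to its kmem cache rather than via __free_page(),
but the PageReserved() dispatch is identical.)
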
> Reviewed-by: Aneesh Kumar K.V <aneesh.ku...@linux.ibm.com>
> Signed-off-by: Bharata B Rao <bhar...@linux.ibm.com>
> ---
>  arch/powerpc/include/asm/book3s/64/pgalloc.h | 11 ++-
>  arch/powerpc/include/asm/book3s/64/radix.h   |  1 +
>  arch/powerpc/include/asm/sparsemem.h         |  1 +
>  arch/powerpc/mm/book3s64/pgtable.c           | 31 ++++++++-
>  arch/powerpc/mm/book3s64/radix_pgtable.c     | 72 ++++++++++++++++++--
>  arch/powerpc/mm/mem.c                        |  5 ++
>  arch/powerpc/mm/pgtable-frag.c               |  9 ++-
>  7 files changed, 121 insertions(+), 9 deletions(-)
>
> diff --git a/arch/powerpc/include/asm/book3s/64/pgalloc.h b/arch/powerpc/include/asm/book3s/64/pgalloc.h
> index a41e91bd0580..e96572fb2871 100644
> --- a/arch/powerpc/include/asm/book3s/64/pgalloc.h
> +++ b/arch/powerpc/include/asm/book3s/64/pgalloc.h
> @@ -109,7 +109,16 @@ static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
>
>  static inline void pud_free(struct mm_struct *mm, pud_t *pud)
>  {
> -	kmem_cache_free(PGT_CACHE(PUD_CACHE_INDEX), pud);
> +	struct page *page = virt_to_page(pud);
> +
> +	/*
> +	 * Early pud pages allocated via memblock allocator
> +	 * can't be directly freed to slab
> +	 */
> +	if (PageReserved(page))
> +		free_reserved_page(page);
> +	else
> +		kmem_cache_free(PGT_CACHE(PUD_CACHE_INDEX), pud);
>  }
>
>  static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
> diff --git a/arch/powerpc/include/asm/book3s/64/radix.h b/arch/powerpc/include/asm/book3s/64/radix.h
> index d97db3ad9aae..0aff8750181a 100644
> --- a/arch/powerpc/include/asm/book3s/64/radix.h
> +++ b/arch/powerpc/include/asm/book3s/64/radix.h
> @@ -291,6 +291,7 @@ static inline unsigned long radix__get_tree_size(void)
>  #ifdef CONFIG_MEMORY_HOTPLUG
>  int radix__create_section_mapping(unsigned long start, unsigned long end, int nid);
>  int radix__remove_section_mapping(unsigned long start, unsigned long end);
> +void radix__fixup_pgtable_fragments(void);
>  #endif /* CONFIG_MEMORY_HOTPLUG */
>  #endif /* __ASSEMBLY__ */
>  #endif
> diff --git a/arch/powerpc/include/asm/sparsemem.h b/arch/powerpc/include/asm/sparsemem.h
> index 3192d454a733..e662f9232d35 100644
> --- a/arch/powerpc/include/asm/sparsemem.h
> +++ b/arch/powerpc/include/asm/sparsemem.h
> @@ -15,6 +15,7 @@
>  #ifdef CONFIG_MEMORY_HOTPLUG
>  extern int create_section_mapping(unsigned long start, unsigned long end, int nid);
>  extern int remove_section_mapping(unsigned long start, unsigned long end);
> +void fixup_pgtable_fragments(void);
>
>  #ifdef CONFIG_PPC_BOOK3S_64
>  extern int resize_hpt_for_hotplug(unsigned long new_mem_size);
> diff --git a/arch/powerpc/mm/book3s64/pgtable.c b/arch/powerpc/mm/book3s64/pgtable.c
> index 2bf7e1b4fd82..be7aa8786747 100644
> --- a/arch/powerpc/mm/book3s64/pgtable.c
> +++ b/arch/powerpc/mm/book3s64/pgtable.c
> @@ -186,6 +186,13 @@ int __meminit remove_section_mapping(unsigned long start, unsigned long end)
>
>  	return hash__remove_section_mapping(start, end);
>  }
> +
> +void fixup_pgtable_fragments(void)
> +{
> +	if (radix_enabled())
> +		radix__fixup_pgtable_fragments();
> +}
> +
>  #endif /* CONFIG_MEMORY_HOTPLUG */
>
>  void __init mmu_partition_table_init(void)
> @@ -343,13 +350,23 @@ void pmd_fragment_free(unsigned long *pmd)
>
>  	BUG_ON(atomic_read(&page->pt_frag_refcount) <= 0);
>  	if (atomic_dec_and_test(&page->pt_frag_refcount)) {
> -		pgtable_pmd_page_dtor(page);
> -		__free_page(page);
> +		/*
> +		 * Early pmd pages allocated via memblock
> +		 * allocator wouldn't have called _ctor
> +		 */
> +		if (PageReserved(page))
> +			free_reserved_page(page);
> +		else {
> +			pgtable_pmd_page_dtor(page);
> +			__free_page(page);
> +		}
>  	}
>  }
>
>  static inline void pgtable_free(void *table, int index)
>  {
> +	struct page *page;
> +
>  	switch (index) {
>  	case PTE_INDEX:
>  		pte_fragment_free(table, 0);
> @@ -358,7 +375,15 @@ static inline void pgtable_free(void *table, int index)
>  		pmd_fragment_free(table);
>  		break;
>  	case PUD_INDEX:
> -		kmem_cache_free(PGT_CACHE(PUD_CACHE_INDEX), table);
> +		page = virt_to_page(table);
> +		/*
> +		 * Early pud pages allocated via memblock
> +		 * allocator need to be freed differently
> +		 */
> +		if (PageReserved(page))
> +			free_reserved_page(page);
> +		else
> +			kmem_cache_free(PGT_CACHE(PUD_CACHE_INDEX), table);
>  		break;
>  #if defined(CONFIG_PPC_4K_PAGES) && defined(CONFIG_HUGETLB_PAGE)
>  	/* 16M hugepd directory at pud level */
> diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c
> index 4a4fb30f6c3d..e675c0bbf9a4 100644
> --- a/arch/powerpc/mm/book3s64/radix_pgtable.c
> +++ b/arch/powerpc/mm/book3s64/radix_pgtable.c
> @@ -36,6 +36,70 @@
>  unsigned int mmu_pid_bits;
>  unsigned int mmu_base_pid;
>
> +static void fixup_pte_fragments(pmd_t *pmd)
> +{
> +	int i;
> +
> +	for (i = 0; i < PTRS_PER_PMD; i++, pmd++) {
> +		pte_t *pte;
> +		struct page *page;
> +
> +		if (pmd_none(*pmd))
> +			continue;
> +		if (pmd_is_leaf(*pmd))
> +			continue;
> +
> +		pte = pte_offset_kernel(pmd, 0);
> +		page = virt_to_page(pte);
> +		atomic_inc(&page->pt_frag_refcount);
> +	}
> +}
> +
> +static void fixup_pmd_fragments(pud_t *pud)
> +{
> +	int i;
> +
> +	for (i = 0; i < PTRS_PER_PUD; i++, pud++) {
> +		pmd_t *pmd;
> +		struct page *page;
> +
> +		if (pud_none(*pud))
> +			continue;
> +		if (pud_is_leaf(*pud))
> +			continue;
> +
> +		pmd = pmd_offset(pud, 0);
> +		page = virt_to_page(pmd);
> +		atomic_inc(&page->pt_frag_refcount);
> +		fixup_pte_fragments(pmd);
> +	}
> +}
> +
> +/*
> + * Walk the init_mm page tables and fixup the PMD and PTE fragment
> + * counts. This allows the PUD, PMD and PTE pages to be freed
> + * back to buddy allocator properly during memory unplug.
> + */
> +void radix__fixup_pgtable_fragments(void)
> +{
> +	int i;
> +	pgd_t *pgd = pgd_offset_k(0UL);
> +
> +	spin_lock(&init_mm.page_table_lock);
> +	for (i = 0; i < PTRS_PER_PGD; i++, pgd++) {
> +		pud_t *pud;
> +
> +		if (pgd_none(*pgd))
> +			continue;
> +		if (pgd_is_leaf(*pgd))
> +			continue;
> +
> +		pud = pud_offset(pgd, 0);
> +		fixup_pmd_fragments(pud);
> +	}
> +	spin_unlock(&init_mm.page_table_lock);
> +}
> +
>  static __ref void *early_alloc_pgtable(unsigned long size, int nid,
>  				       unsigned long region_start, unsigned long region_end)
>  {
> @@ -71,8 +135,8 @@ static int early_map_kernel_page(unsigned long ea, unsigned long pa,
>
>  	pgdp = pgd_offset_k(ea);
>  	if (pgd_none(*pgdp)) {
> -		pudp = early_alloc_pgtable(PUD_TABLE_SIZE, nid,
> -					   region_start, region_end);
> +		pudp = early_alloc_pgtable(PAGE_SIZE, nid, region_start,
> +					   region_end);
>  		pgd_populate(&init_mm, pgdp, pudp);
>  	}
>  	pudp = pud_offset(pgdp, ea);
> @@ -81,8 +145,8 @@ static int early_map_kernel_page(unsigned long ea, unsigned long pa,
>  		goto set_the_pte;
>  	}
>  	if (pud_none(*pudp)) {
> -		pmdp = early_alloc_pgtable(PMD_TABLE_SIZE, nid,
> -					   region_start, region_end);
> +		pmdp = early_alloc_pgtable(PAGE_SIZE, nid, region_start,
> +					   region_end);
>  		pud_populate(&init_mm, pudp, pmdp);
>  	}
>  	pmdp = pmd_offset(pudp, ea);
> diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
> index 1c07d5a3f543..d43ad701f693 100644
> --- a/arch/powerpc/mm/mem.c
> +++ b/arch/powerpc/mm/mem.c
> @@ -53,6 +53,10 @@
>
>  #include <mm/mmu_decl.h>
>
> +void __weak fixup_pgtable_fragments(void)
> +{
> +}
> +
>  #ifndef CPU_FTR_COHERENT_ICACHE
>  #define CPU_FTR_COHERENT_ICACHE	0	/* XXX for now */
>  #define CPU_FTR_NOEXECUTE	0
> @@ -307,6 +311,7 @@ void __init mem_init(void)
>
>  	memblock_free_all();
>
> +	fixup_pgtable_fragments();
>  #ifdef CONFIG_HIGHMEM
>  	{
>  		unsigned long pfn, highmem_mapnr;
> diff --git a/arch/powerpc/mm/pgtable-frag.c b/arch/powerpc/mm/pgtable-frag.c
> index ee4bd6d38602..16213c09896a 100644
> --- a/arch/powerpc/mm/pgtable-frag.c
> +++ b/arch/powerpc/mm/pgtable-frag.c
> @@ -114,6 +114,13 @@ void pte_fragment_free(unsigned long *table, int kernel)
>  	if (atomic_dec_and_test(&page->pt_frag_refcount)) {
>  		if (!kernel)
>  			pgtable_pte_page_dtor(page);
> -		__free_page(page);
> +		/*
> +		 * Early pte pages allocated via memblock
> +		 * allocator need to be freed differently
> +		 */
> +		if (PageReserved(page))
> +			free_reserved_page(page);
> +		else
> +			__free_page(page);
>  	}
>  }
> --
> 2.21.0
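
One aside on the mem.c change, since it is easy to miss: the empty
__weak fixup_pgtable_fragments() is a link-time default, dropped
whenever a strong definition exists, which is how the book3s64 version
takes over without any #ifdef in mem.c. A minimal standalone sketch of
that pattern (hook() is an invented example; in the kernel, __weak
expands to the attribute spelled out below):

	#include <stdio.h>

	/*
	 * Weak default: the linker discards this if any other object
	 * file defines a strong hook(), just as the strong
	 * fixup_pgtable_fragments() in book3s64/pgtable.c replaces
	 * the __weak stub in mem.c.
	 */
	__attribute__((weak)) void hook(void)
	{
		printf("weak default\n");
	}

	int main(void)
	{
		hook();	/* "weak default" unless a strong hook() is linked in */
		return 0;
	}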