On 05/21/2019 03:50 PM, David Hildenbrand wrote:
> On 20.05.19 07:18, Anshuman Khandual wrote:
>> The arch code for hot-remove must tear down portions of the linear map and
>> vmemmap corresponding to memory being removed. In both cases the page
>> tables mapping these regions must be freed, and when sparse vmemmap is in
>> use the memory backing the vmemmap must also be freed.
>>
>> This patch adds a new remove_pagetable() helper which can be used to tear
>> down either region, and calls it from vmemmap_free() and
>> ___remove_pgd_mapping(). The sparse_vmap argument determines whether the
>> backing memory will be freed.
>>
>> While freeing intermediate level page table pages bail out if any of it's
>> entries are still valid. This can happen for partially filled kernel page
>> table either from a previously attempted failed memory hot add or while
>> removing an address range which does not span the entire page table page
>> range.
>>
>> The vmemmap region may share levels of table with the vmalloc region. Take
>> the kernel ptl so that we can safely free potentially-shared tables.
>>
>> While here update arch_add_memory() to handle __add_pages() failures by
>> just unmapping recently added kernel linear mapping. Now enable memory hot
>> remove on arm64 platforms by default with ARCH_ENABLE_MEMORY_HOTREMOVE.
>>
>> This implementation is overall inspired from kernel page table tear down
>> procedure on X86 architecture.
>>
>> Signed-off-by: Anshuman Khandual <anshuman.khand...@arm.com>
>> ---
>>  arch/arm64/Kconfig  |   3 +
>>  arch/arm64/mm/mmu.c | 212 
>> +++++++++++++++++++++++++++++++++++++++++++++++++++-
>>  2 files changed, 213 insertions(+), 2 deletions(-)
>>
>> diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
>> index 4780eb7..ce24427 100644
>> --- a/arch/arm64/Kconfig
>> +++ b/arch/arm64/Kconfig
>> @@ -267,6 +267,9 @@ config HAVE_GENERIC_GUP
>>  config ARCH_ENABLE_MEMORY_HOTPLUG
>>      def_bool y
>>  
>> +config ARCH_ENABLE_MEMORY_HOTREMOVE
>> +    def_bool y
>> +
>>  config SMP
>>      def_bool y
>>  
>> diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
>> index a1bfc44..0cf0d41 100644
>> --- a/arch/arm64/mm/mmu.c
>> +++ b/arch/arm64/mm/mmu.c
>> @@ -733,6 +733,187 @@ int kern_addr_valid(unsigned long addr)
>>  
>>      return pfn_valid(pte_pfn(pte));
>>  }
>> +
>> +#ifdef CONFIG_MEMORY_HOTPLUG
>> +static void free_hotplug_page_range(struct page *page, ssize_t size)
>> +{
>> +    WARN_ON(PageReserved(page));
>> +    free_pages((unsigned long)page_address(page), get_order(size));
>> +}
>> +
>> +static void free_hotplug_pgtable_page(struct page *page)
>> +{
>> +    free_hotplug_page_range(page, PAGE_SIZE);
>> +}
>> +
>> +static void free_pte_table(pte_t *ptep, pmd_t *pmdp, unsigned long addr)
>> +{
>> +    struct page *page;
>> +    int i;
>> +
>> +    for (i = 0; i < PTRS_PER_PTE; i++) {
>> +            if (!pte_none(ptep[i]))
>> +                    return;
>> +    }
>> +
>> +    page = pmd_page(READ_ONCE(*pmdp));
>> +    pmd_clear(pmdp);
>> +    __flush_tlb_kernel_pgtable(addr);
>> +    free_hotplug_pgtable_page(page);
>> +}
>> +
>> +static void free_pmd_table(pmd_t *pmdp, pud_t *pudp, unsigned long addr)
>> +{
>> +    struct page *page;
>> +    int i;
>> +
>> +    if (CONFIG_PGTABLE_LEVELS <= 2)
>> +            return;
>> +
>> +    for (i = 0; i < PTRS_PER_PMD; i++) {
>> +            if (!pmd_none(pmdp[i]))
>> +                    return;
>> +    }
>> +
>> +    page = pud_page(READ_ONCE(*pudp));
>> +    pud_clear(pudp);
>> +    __flush_tlb_kernel_pgtable(addr);
>> +    free_hotplug_pgtable_page(page);
>> +}
>> +
>> +static void free_pud_table(pud_t *pudp, pgd_t *pgdp, unsigned long addr)
>> +{
>> +    struct page *page;
>> +    int i;
>> +
>> +    if (CONFIG_PGTABLE_LEVELS <= 3)
>> +            return;
>> +
>> +    for (i = 0; i < PTRS_PER_PUD; i++) {
>> +            if (!pud_none(pudp[i]))
>> +                    return;
>> +    }
>> +
>> +    page = pgd_page(READ_ONCE(*pgdp));
>> +    pgd_clear(pgdp);
>> +    __flush_tlb_kernel_pgtable(addr);
>> +    free_hotplug_pgtable_page(page);
>> +}
>> +
>> +static void
>> +remove_pte_table(pmd_t *pmdp, unsigned long addr,
>> +                    unsigned long end, bool sparse_vmap)
>> +{
>> +    struct page *page;
>> +    pte_t *ptep, pte;
>> +    unsigned long start = addr;
>> +
>> +    for (; addr < end; addr += PAGE_SIZE) {
>> +            ptep = pte_offset_kernel(pmdp, addr);
>> +            pte = READ_ONCE(*ptep);
>> +
>> +            if (pte_none(pte))
>> +                    continue;
>> +
>> +            WARN_ON(!pte_present(pte));
>> +            if (sparse_vmap) {
>> +                    page = pte_page(pte);
>> +                    free_hotplug_page_range(page, PAGE_SIZE);
>> +            }
>> +            pte_clear(&init_mm, addr, ptep);
>> +    }
>> +    flush_tlb_kernel_range(start, end);
>> +}
>> +
>> +static void
>> +remove_pmd_table(pud_t *pudp, unsigned long addr,
>> +                    unsigned long end, bool sparse_vmap)
>> +{
>> +    unsigned long next;
>> +    struct page *page;
>> +    pte_t *ptep_base;
>> +    pmd_t *pmdp, pmd;
>> +
>> +    for (; addr < end; addr = next) {
>> +            next = pmd_addr_end(addr, end);
>> +            pmdp = pmd_offset(pudp, addr);
>> +            pmd = READ_ONCE(*pmdp);
>> +
>> +            if (pmd_none(pmd))
>> +                    continue;
>> +
>> +            WARN_ON(!pmd_present(pmd));
>> +            if (pmd_sect(pmd)) {
>> +                    if (sparse_vmap) {
>> +                            page = pmd_page(pmd);
>> +                            free_hotplug_page_range(page, PMD_SIZE);
>> +                    }
>> +                    pmd_clear(pmdp);
>> +                    continue;
>> +            }
>> +            ptep_base = pte_offset_kernel(pmdp, 0UL);
>> +            remove_pte_table(pmdp, addr, next, sparse_vmap);
>> +            free_pte_table(ptep_base, pmdp, addr);
>> +    }
>> +}
>> +
>> +static void
>> +remove_pud_table(pgd_t *pgdp, unsigned long addr,
>> +                    unsigned long end, bool sparse_vmap)
>> +{
>> +    unsigned long next;
>> +    struct page *page;
>> +    pmd_t *pmdp_base;
>> +    pud_t *pudp, pud;
>> +
>> +    for (; addr < end; addr = next) {
>> +            next = pud_addr_end(addr, end);
>> +            pudp = pud_offset(pgdp, addr);
>> +            pud = READ_ONCE(*pudp);
>> +
>> +            if (pud_none(pud))
>> +                    continue;
>> +
>> +            WARN_ON(!pud_present(pud));
>> +            if (pud_sect(pud)) {
>> +                    if (sparse_vmap) {
>> +                            page = pud_page(pud);
>> +                            free_hotplug_page_range(page, PUD_SIZE);
>> +                    }
>> +                    pud_clear(pudp);
>> +                    continue;
>> +            }
>> +            pmdp_base = pmd_offset(pudp, 0UL);
>> +            remove_pmd_table(pudp, addr, next, sparse_vmap);
>> +            free_pmd_table(pmdp_base, pudp, addr);
>> +    }
>> +}
>> +
>> +static void
>> +remove_pagetable(unsigned long start, unsigned long end, bool sparse_vmap)
>> +{
>> +    unsigned long addr, next;
>> +    pud_t *pudp_base;
>> +    pgd_t *pgdp, pgd;
>> +
>> +    spin_lock(&init_mm.page_table_lock);
>> +    for (addr = start; addr < end; addr = next) {
>> +            next = pgd_addr_end(addr, end);
>> +            pgdp = pgd_offset_k(addr);
>> +            pgd = READ_ONCE(*pgdp);
>> +
>> +            if (pgd_none(pgd))
>> +                    continue;
>> +
>> +            WARN_ON(!pgd_present(pgd));
>> +            pudp_base = pud_offset(pgdp, 0UL);
>> +            remove_pud_table(pgdp, addr, next, sparse_vmap);
>> +            free_pud_table(pudp_base, pgdp, addr);
>> +    }
>> +    spin_unlock(&init_mm.page_table_lock);
>> +}
>> +#endif
>> +
>>  #ifdef CONFIG_SPARSEMEM_VMEMMAP
>>  #if !ARM64_SWAPPER_USES_SECTION_MAPS
>>  int __meminit vmemmap_populate(unsigned long start, unsigned long end, int 
>> node,
>> @@ -780,6 +961,9 @@ int __meminit vmemmap_populate(unsigned long start, 
>> unsigned long end, int node,
>>  void vmemmap_free(unsigned long start, unsigned long end,
>>              struct vmem_altmap *altmap)
>>  {
>> +#ifdef CONFIG_MEMORY_HOTPLUG
>> +    remove_pagetable(start, end, true);
>> +#endif
>>  }
>>  #endif      /* CONFIG_SPARSEMEM_VMEMMAP */
>>  
>> @@ -1070,10 +1254,16 @@ int p4d_free_pud_page(p4d_t *p4d, unsigned long addr)
>>  }
>>  
>>  #ifdef CONFIG_MEMORY_HOTPLUG
>> +static void __remove_pgd_mapping(pgd_t *pgdir, unsigned long start, u64 
>> size)
>> +{
>> +    WARN_ON(pgdir != init_mm.pgd);
>> +    remove_pagetable(start, start + size, false);
>> +}
>> +
>>  int arch_add_memory(int nid, u64 start, u64 size,
>>                      struct mhp_restrictions *restrictions)
>>  {
>> -    int flags = 0;
>> +    int ret, flags = 0;
>>  
>>      if (rodata_full || debug_pagealloc_enabled())
>>              flags = NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS;
>> @@ -1081,7 +1271,25 @@ int arch_add_memory(int nid, u64 start, u64 size,
>>      __create_pgd_mapping(swapper_pg_dir, start, __phys_to_virt(start),
>>                           size, PAGE_KERNEL, __pgd_pgtable_alloc, flags);
>>  
>> -    return __add_pages(nid, start >> PAGE_SHIFT, size >> PAGE_SHIFT,
>> +    ret = __add_pages(nid, start >> PAGE_SHIFT, size >> PAGE_SHIFT,
>>                         restrictions);
>> +    if (ret)
>> +            __remove_pgd_mapping(swapper_pg_dir,
>> +                                    __phys_to_virt(start), size);
> 
> Nit: Indentation of the parameters looks really weird.
> 
>> +    return ret;
>> +}
>> +
>> +#ifdef CONFIG_MEMORY_HOTREMOVE
>> +void arch_remove_memory(int nid, u64 start, u64 size,
>> +                            struct vmem_altmap *altmap)
>> +{
>> +    unsigned long start_pfn = start >> PAGE_SHIFT;
>> +    unsigned long nr_pages = size >> PAGE_SHIFT;
>> +    struct zone *zone = page_zone(pfn_to_page(start_pfn));
>> +
>> +    __remove_pages(zone, start_pfn, nr_pages, altmap);
>> +    __remove_pgd_mapping(swapper_pg_dir,
>> +                                    __phys_to_virt(start), size);
> 
> Dito, indentation of the parameters.
> 
> For these two changes (arch_*_memory)
> 
> Acked-by: David Hildenbrand <da...@redhat.com>

Thanks David. The following change on this patch should fix the indentation
problem. If there are no other comments I will incorporate this and re-spin
the series once more.

diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index 0cf0d41..a87ba18 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -1273,9 +1273,10 @@ int arch_add_memory(int nid, u64 start, u64 size,
 
        ret = __add_pages(nid, start >> PAGE_SHIFT, size >> PAGE_SHIFT,
                           restrictions);
-       if (ret)
-               __remove_pgd_mapping(swapper_pg_dir,
-                                       __phys_to_virt(start), size);
+       if (!ret)
+               return ret;
+
+       __remove_pgd_mapping(swapper_pg_dir, __phys_to_virt(start), size);
        return ret;
 }
 
@@ -1288,8 +1289,7 @@ void arch_remove_memory(int nid, u64 start, u64 size,
        struct zone *zone = page_zone(pfn_to_page(start_pfn));
 
        __remove_pages(zone, start_pfn, nr_pages, altmap);
-       __remove_pgd_mapping(swapper_pg_dir,
-                                       __phys_to_virt(start), size);
+       __remove_pgd_mapping(swapper_pg_dir, __phys_to_virt(start), size);
 }
 #endif
 #endif

Reply via email to