Patch 99baac21e4 ("mm: fix MADV_[FREE|DONTNEED] TLB flush miss problem") added a force flush mode to the mmu_gather flush, which unconditionally flushes the entire address range being invalidated (even if the actual ptes only covered a smaller range). This solves a problem where concurrent threads invalidating the same PTEs could cause one thread to return while TLB entries in the range still need flushing.
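For reference, the generic force flush is applied in the mmu_gather teardown in mm/memory.c, which at the time of this series looks approximately like this (quoted for context, not part of this patch):

void tlb_finish_mmu(struct mmu_gather *tlb,
		unsigned long start, unsigned long end)
{
	/*
	 * If parallel threads are doing deferred PTE invalidation on the
	 * same range under a non-exclusive lock (e.g. mmap_sem read-side),
	 * one thread can return while the other still has stale TLB
	 * entries to flush, so force a flush of the full range when
	 * nested invalidation is detected.
	 */
	bool force = mm_tlb_flush_nested(tlb->mm);

	arch_tlb_finish_mmu(tlb, start, end, force);
	dec_tlb_flush_pending(tlb->mm);
}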
This does not work with powerpc, which invalidates mmu_gather batches according to page size. Have powerpc flush all possible page sizes in the range if it encounters this concurrency condition.

Patch 4647706ebe ("mm: always flush VMA ranges affected by zap_page_range") does add a TLB flush for all page sizes on powerpc for the zap_page_range case, but that is to be removed and replaced with the mmu_gather flush to avoid redundant flushing. It is also thought not to cover other obscure race conditions:

https://lkml.kernel.org/r/bd3a0ebe-ecf4-41d4-87fa-c755ea9ab...@gmail.com

Hash does not have a problem because it invalidates TLBs inside the page table locks.

Reported-by: Aneesh Kumar K.V <aneesh.ku...@linux.ibm.com>
Signed-off-by: Nicholas Piggin <npig...@gmail.com>
---
Since v1:
- Compile fix for !THP
- Fixed missing PWC flush case
- Fixed concurrent TLB flush test
- Expanded changelog

I think this is a required fix for existing kernels; at least to be safe, and to bring the flushing into line with other architectures, I think we should add this as a fix. For the next kernel release I will remove the duplicate flush in zap_page_range, so this will definitely be needed then.

A rough userspace sketch that drives the concurrency condition in question is appended after the patch.

 arch/powerpc/mm/tlb-radix.c | 92 +++++++++++++++++++++++++++++--------
 include/linux/huge_mm.h     | 10 +---
 2 files changed, 76 insertions(+), 26 deletions(-)

diff --git a/arch/powerpc/mm/tlb-radix.c b/arch/powerpc/mm/tlb-radix.c
index 67a6e86d3e7e..919232a59ea1 100644
--- a/arch/powerpc/mm/tlb-radix.c
+++ b/arch/powerpc/mm/tlb-radix.c
@@ -689,22 +689,17 @@ EXPORT_SYMBOL(radix__flush_tlb_kernel_range);
 static unsigned long tlb_single_page_flush_ceiling __read_mostly = 33;
 static unsigned long tlb_local_single_page_flush_ceiling __read_mostly = POWER9_TLB_SETS_RADIX * 2;
 
-void radix__flush_tlb_range(struct vm_area_struct *vma, unsigned long start,
-			    unsigned long end)
+static inline void __radix__flush_tlb_range(struct mm_struct *mm,
+					unsigned long start, unsigned long end,
+					bool flush_all_sizes)
 {
-	struct mm_struct *mm = vma->vm_mm;
 	unsigned long pid;
 	unsigned int page_shift = mmu_psize_defs[mmu_virtual_psize].shift;
 	unsigned long page_size = 1UL << page_shift;
 	unsigned long nr_pages = (end - start) >> page_shift;
 	bool local, full;
 
-#ifdef CONFIG_HUGETLB_PAGE
-	if (is_vm_hugetlb_page(vma))
-		return radix__flush_hugetlb_tlb_range(vma, start, end);
-#endif
-
 	pid = mm->context.id;
 	if (unlikely(pid == MMU_NO_CONTEXT))
 		return;
@@ -738,16 +733,27 @@ void radix__flush_tlb_range(struct vm_area_struct *vma, unsigned long start,
 			_tlbie_pid(pid, RIC_FLUSH_TLB);
 		}
 	} else {
-		bool hflush = false;
+		bool hflush = flush_all_sizes;
+		bool gflush = flush_all_sizes;
 		unsigned long hstart, hend;
+		unsigned long gstart, gend;
 
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-		hstart = (start + HPAGE_PMD_SIZE - 1) >> HPAGE_PMD_SHIFT;
-		hend = end >> HPAGE_PMD_SHIFT;
-		if (hstart < hend) {
-			hstart <<= HPAGE_PMD_SHIFT;
-			hend <<= HPAGE_PMD_SHIFT;
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLB_PAGE)
+		if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
 			hflush = true;
+
+		if (hflush) {
+			hstart = (start + HPAGE_PMD_SIZE - 1) & HPAGE_PMD_MASK;
+			hend = end & HPAGE_PMD_MASK;
+			if (hstart == hend)
+				hflush = false;
+		}
+
+		if (gflush) {
+			gstart = (start + HPAGE_PUD_SIZE - 1) & HPAGE_PUD_MASK;
+			gend = end & HPAGE_PUD_MASK;
+			if (gstart == gend)
+				gflush = false;
 		}
 #endif
@@ -757,18 +763,36 @@ void radix__flush_tlb_range(struct vm_area_struct *vma, unsigned long start,
 			if (hflush)
 				__tlbiel_va_range(hstart, hend, pid,
 						HPAGE_PMD_SIZE, MMU_PAGE_2M);
+			if (gflush)
+				__tlbiel_va_range(gstart, gend, pid,
+						HPAGE_PUD_SIZE, MMU_PAGE_1G);
 			asm volatile("ptesync": : :"memory");
 		} else {
 			__tlbie_va_range(start, end, pid, page_size,
 					mmu_virtual_psize);
 			if (hflush)
 				__tlbie_va_range(hstart, hend, pid,
 						HPAGE_PMD_SIZE, MMU_PAGE_2M);
+			if (gflush)
+				__tlbie_va_range(gstart, gend, pid,
+						HPAGE_PUD_SIZE, MMU_PAGE_1G);
 			fixup_tlbie();
 			asm volatile("eieio; tlbsync; ptesync": : :"memory");
 		}
 	}
 	preempt_enable();
 }
+
+void radix__flush_tlb_range(struct vm_area_struct *vma, unsigned long start,
+			    unsigned long end)
+{
+#ifdef CONFIG_HUGETLB_PAGE
+	if (is_vm_hugetlb_page(vma))
+		return radix__flush_hugetlb_tlb_range(vma, start, end);
+#endif
+
+	__radix__flush_tlb_range(vma->vm_mm, start, end, false);
+}
 EXPORT_SYMBOL(radix__flush_tlb_range);
 
 static int radix_get_mmu_psize(int page_size)
@@ -837,6 +861,8 @@ void radix__tlb_flush(struct mmu_gather *tlb)
 	int psize = 0;
 	struct mm_struct *mm = tlb->mm;
 	int page_size = tlb->page_size;
+	unsigned long start = tlb->start;
+	unsigned long end = tlb->end;
 
 	/*
 	 * if page size is not something we understand, do a full mm flush
@@ -847,15 +873,45 @@ void radix__tlb_flush(struct mmu_gather *tlb)
 	if (tlb->fullmm) {
 		__flush_all_mm(mm, true);
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLB_PAGE)
+	} else if (mm_tlb_flush_nested(mm)) {
+		/*
+		 * If there is a concurrent invalidation that is clearing ptes,
+		 * then it's possible this invalidation will miss one of those
+		 * cleared ptes and miss flushing the TLB. If this invalidate
+		 * returns before the other one flushes TLBs, that can result
+		 * in it returning while there are still valid TLBs inside the
+		 * range to be invalidated.
+		 *
+		 * See mm/memory.c:tlb_finish_mmu() for more details.
+		 *
+		 * The solution to this is to ensure the entire range is always
+		 * flushed here. The problem for powerpc is that the flushes
+		 * are page size specific, so this "forced flush" would not
+		 * do the right thing if there is a mix of page sizes in
+		 * the range to be invalidated. So use __radix__flush_tlb_range
+		 * which invalidates all possible page sizes in the range.
+		 *
+		 * A PWC flush is probably not required because the core code
+		 * shouldn't free page tables in this path, but accounting
+		 * for the possibility makes us a bit more robust.
+		 *
+		 * need_flush_all is an uncommon case because page table
+		 * teardown should be done with exclusive locks held (but
+		 * after locks are dropped another invalidate could come
+		 * in). It could be optimized further if necessary.
+		 */
+		if (!tlb->need_flush_all)
+			__radix__flush_tlb_range(mm, start, end, true);
+		else
+			radix__flush_all_mm(mm);
+#endif
 	} else if ( (psize = radix_get_mmu_psize(page_size)) == -1) {
 		if (!tlb->need_flush_all)
 			radix__flush_tlb_mm(mm);
 		else
 			radix__flush_all_mm(mm);
 	} else {
-		unsigned long start = tlb->start;
-		unsigned long end = tlb->end;
-
 		if (!tlb->need_flush_all)
 			radix__flush_tlb_range_psize(mm, start, end, psize);
 		else
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index a8a126259bc4..f7fe2b20efb3 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -79,7 +79,6 @@ extern struct kobj_attribute shmem_enabled_attr;
 #define HPAGE_PMD_ORDER (HPAGE_PMD_SHIFT-PAGE_SHIFT)
 #define HPAGE_PMD_NR (1<<HPAGE_PMD_ORDER)
 
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
 #define HPAGE_PMD_SHIFT PMD_SHIFT
 #define HPAGE_PMD_SIZE	((1UL) << HPAGE_PMD_SHIFT)
 #define HPAGE_PMD_MASK	(~(HPAGE_PMD_SIZE - 1))
@@ -88,6 +87,8 @@ extern struct kobj_attribute shmem_enabled_attr;
 #define HPAGE_PUD_SIZE	((1UL) << HPAGE_PUD_SHIFT)
 #define HPAGE_PUD_MASK	(~(HPAGE_PUD_SIZE - 1))
 
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+
 extern bool is_vma_temporary_stack(struct vm_area_struct *vma);
 
 extern unsigned long transparent_hugepage_flags;
@@ -246,13 +247,6 @@ static inline bool thp_migration_supported(void)
 }
 
 #else /* CONFIG_TRANSPARENT_HUGEPAGE */
-#define HPAGE_PMD_SHIFT ({ BUILD_BUG(); 0; })
-#define HPAGE_PMD_MASK ({ BUILD_BUG(); 0; })
-#define HPAGE_PMD_SIZE ({ BUILD_BUG(); 0; })
-
-#define HPAGE_PUD_SHIFT ({ BUILD_BUG(); 0; })
-#define HPAGE_PUD_MASK ({ BUILD_BUG(); 0; })
-#define HPAGE_PUD_SIZE ({ BUILD_BUG(); 0; })
 
 #define hpage_nr_pages(x) 1
 
-- 
2.17.0
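Appendix (not part of the patch): a minimal, hypothetical userspace sketch that drives the mm_tlb_flush_nested() condition on a THP-backed mapping, so radix__tlb_flush() takes the new flush_all_sizes path. It only exercises the code path; it does not by itself detect stale TLB entries. The sizes and iteration counts are arbitrary. Build with -pthread.

#include <pthread.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

#define LEN	(64UL << 20)	/* 64MB, enough to contain 2MB THPs */

static char *map;

static void *zapper(void *arg)
{
	unsigned long off = (unsigned long)arg;
	int i;

	for (i = 0; i < 1000; i++) {
		/* Touch pages so there are ptes (and possibly THPs) to zap */
		memset(map + off, 1, LEN / 2);
		/*
		 * Both threads zap the whole overlapping range under the
		 * mmap_sem read lock, so two mmu_gathers can be in flight
		 * and mm_tlb_flush_nested() becomes true.
		 */
		madvise(map, LEN, MADV_DONTNEED);
	}
	return NULL;
}

int main(void)
{
	pthread_t t1, t2;

	map = mmap(NULL, LEN, PROT_READ | PROT_WRITE,
		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (map == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	/* Ask for THP so the range contains a mix of page sizes */
	madvise(map, LEN, MADV_HUGEPAGE);

	pthread_create(&t1, NULL, zapper, (void *)0UL);
	pthread_create(&t2, NULL, zapper, (void *)(LEN / 2));
	pthread_join(t1, NULL);
	pthread_join(t2, NULL);
	return 0;
}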