commit:     97475e3deeb706adf19de4dc8380076168017fd8
Author:     Mike Pagano <mpagano <AT> gentoo <DOT> org>
AuthorDate: Sun Jul 11 14:46:23 2021 +0000
Commit:     Mike Pagano <mpagano <AT> gentoo <DOT> org>
CommitDate: Sun Jul 11 14:46:23 2021 +0000
URL:        https://gitweb.gentoo.org/proj/linux-patches.git/commit/?id=97475e3d
Linux patch 4.14.239 Signed-off-by: Mike Pagano <mpagano <AT> gentoo.org> 0000_README | 4 + 1238_linux-4.14.239.patch | 872 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 876 insertions(+) diff --git a/0000_README b/0000_README index 487ae9d..a52d064 100644 --- a/0000_README +++ b/0000_README @@ -995,6 +995,10 @@ Patch: 1237_linux-4.14.238.patch From: https://www.kernel.org Desc: Linux 4.14.238 +Patch: 1238_linux-4.14.239.patch +From: https://www.kernel.org +Desc: Linux 4.14.239 + Patch: 1500_XATTR_USER_PREFIX.patch From: https://bugs.gentoo.org/show_bug.cgi?id=470644 Desc: Support for namespace user.pax.* on tmpfs. diff --git a/1238_linux-4.14.239.patch b/1238_linux-4.14.239.patch new file mode 100644 index 0000000..214f7fe --- /dev/null +++ b/1238_linux-4.14.239.patch @@ -0,0 +1,872 @@ +diff --git a/Makefile b/Makefile +index 5442918651e00..3bb379664a96e 100644 +--- a/Makefile ++++ b/Makefile +@@ -1,7 +1,7 @@ + # SPDX-License-Identifier: GPL-2.0 + VERSION = 4 + PATCHLEVEL = 14 +-SUBLEVEL = 238 ++SUBLEVEL = 239 + EXTRAVERSION = + NAME = Petit Gorille + +diff --git a/drivers/gpu/drm/nouveau/nouveau_bo.c b/drivers/gpu/drm/nouveau/nouveau_bo.c +index e427f80344c4d..a2d770acd10a9 100644 +--- a/drivers/gpu/drm/nouveau/nouveau_bo.c ++++ b/drivers/gpu/drm/nouveau/nouveau_bo.c +@@ -450,7 +450,7 @@ nouveau_bo_sync_for_device(struct nouveau_bo *nvbo) + struct ttm_dma_tt *ttm_dma = (struct ttm_dma_tt *)nvbo->bo.ttm; + int i; + +- if (!ttm_dma) ++ if (!ttm_dma || !ttm_dma->dma_address) + return; + + /* Don't waste time looping if the object is coherent */ +@@ -470,7 +470,7 @@ nouveau_bo_sync_for_cpu(struct nouveau_bo *nvbo) + struct ttm_dma_tt *ttm_dma = (struct ttm_dma_tt *)nvbo->bo.ttm; + int i; + +- if (!ttm_dma) ++ if (!ttm_dma || !ttm_dma->dma_address) + return; + + /* Don't waste time looping if the object is coherent */ +diff --git a/drivers/scsi/sr.c b/drivers/scsi/sr.c +index 5be3d6b7991b4..a46fbe2d2ee63 100644 +--- a/drivers/scsi/sr.c ++++ b/drivers/scsi/sr.c +@@ -216,6 +216,8 @@ static unsigned int sr_get_events(struct scsi_device *sdev) + return DISK_EVENT_EJECT_REQUEST; + else if (med->media_event_code == 2) + return DISK_EVENT_MEDIA_CHANGE; ++ else if (med->media_event_code == 3) ++ return DISK_EVENT_EJECT_REQUEST; + return 0; + } + +diff --git a/drivers/xen/events/events_base.c b/drivers/xen/events/events_base.c +index b370144682ed5..a2f8130e18fec 100644 +--- a/drivers/xen/events/events_base.c ++++ b/drivers/xen/events/events_base.c +@@ -524,6 +524,9 @@ static void xen_irq_lateeoi_locked(struct irq_info *info, bool spurious) + } + + info->eoi_time = 0; ++ ++ /* is_active hasn't been reset yet, do it now. */ ++ smp_store_release(&info->is_active, 0); + do_unmask(info, EVT_MASK_REASON_EOI_PENDING); + } + +@@ -1780,10 +1783,22 @@ static void lateeoi_ack_dynirq(struct irq_data *data) + struct irq_info *info = info_for_irq(data->irq); + evtchn_port_t evtchn = info ? 
info->evtchn : 0; + +- if (VALID_EVTCHN(evtchn)) { +- do_mask(info, EVT_MASK_REASON_EOI_PENDING); +- ack_dynirq(data); +- } ++ if (!VALID_EVTCHN(evtchn)) ++ return; ++ ++ do_mask(info, EVT_MASK_REASON_EOI_PENDING); ++ ++ if (unlikely(irqd_is_setaffinity_pending(data)) && ++ likely(!irqd_irq_disabled(data))) { ++ do_mask(info, EVT_MASK_REASON_TEMPORARY); ++ ++ clear_evtchn(evtchn); ++ ++ irq_move_masked_irq(data); ++ ++ do_unmask(info, EVT_MASK_REASON_TEMPORARY); ++ } else ++ clear_evtchn(evtchn); + } + + static void lateeoi_mask_ack_dynirq(struct irq_data *data) +diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h +index fe0ec0a29db7c..d2b5cc8ce54f9 100644 +--- a/include/linux/hugetlb.h ++++ b/include/linux/hugetlb.h +@@ -467,17 +467,6 @@ static inline int hstate_index(struct hstate *h) + return h - hstates; + } + +-pgoff_t __basepage_index(struct page *page); +- +-/* Return page->index in PAGE_SIZE units */ +-static inline pgoff_t basepage_index(struct page *page) +-{ +- if (!PageCompound(page)) +- return page->index; +- +- return __basepage_index(page); +-} +- + extern int dissolve_free_huge_page(struct page *page); + extern int dissolve_free_huge_pages(unsigned long start_pfn, + unsigned long end_pfn); +@@ -572,11 +561,6 @@ static inline int hstate_index(struct hstate *h) + return 0; + } + +-static inline pgoff_t basepage_index(struct page *page) +-{ +- return page->index; +-} +- + static inline int dissolve_free_huge_page(struct page *page) + { + return 0; +diff --git a/include/linux/kfifo.h b/include/linux/kfifo.h +index 41eb6fdf87a84..86b5fb08e96cd 100644 +--- a/include/linux/kfifo.h ++++ b/include/linux/kfifo.h +@@ -113,7 +113,8 @@ struct kfifo_rec_ptr_2 __STRUCT_KFIFO_PTR(unsigned char, 2, void); + * array is a part of the structure and the fifo type where the array is + * outside of the fifo structure. + */ +-#define __is_kfifo_ptr(fifo) (sizeof(*fifo) == sizeof(struct __kfifo)) ++#define __is_kfifo_ptr(fifo) \ ++ (sizeof(*fifo) == sizeof(STRUCT_KFIFO_PTR(typeof(*(fifo)->type)))) + + /** + * DECLARE_KFIFO_PTR - macro to declare a fifo pointer object +diff --git a/include/linux/mmdebug.h b/include/linux/mmdebug.h +index 57b0030d38007..5d0767cb424aa 100644 +--- a/include/linux/mmdebug.h ++++ b/include/linux/mmdebug.h +@@ -37,10 +37,22 @@ void dump_mm(const struct mm_struct *mm); + BUG(); \ + } \ + } while (0) +-#define VM_WARN_ON(cond) WARN_ON(cond) +-#define VM_WARN_ON_ONCE(cond) WARN_ON_ONCE(cond) +-#define VM_WARN_ONCE(cond, format...) WARN_ONCE(cond, format) +-#define VM_WARN(cond, format...) WARN(cond, format) ++#define VM_WARN_ON_ONCE_PAGE(cond, page) ({ \ ++ static bool __section(".data.once") __warned; \ ++ int __ret_warn_once = !!(cond); \ ++ \ ++ if (unlikely(__ret_warn_once && !__warned)) { \ ++ dump_page(page, "VM_WARN_ON_ONCE_PAGE(" __stringify(cond)")");\ ++ __warned = true; \ ++ WARN_ON(1); \ ++ } \ ++ unlikely(__ret_warn_once); \ ++}) ++ ++#define VM_WARN_ON(cond) (void)WARN_ON(cond) ++#define VM_WARN_ON_ONCE(cond) (void)WARN_ON_ONCE(cond) ++#define VM_WARN_ONCE(cond, format...) (void)WARN_ONCE(cond, format) ++#define VM_WARN(cond, format...) 
(void)WARN(cond, format) + #else + #define VM_BUG_ON(cond) BUILD_BUG_ON_INVALID(cond) + #define VM_BUG_ON_PAGE(cond, page) VM_BUG_ON(cond) +@@ -48,6 +60,7 @@ void dump_mm(const struct mm_struct *mm); + #define VM_BUG_ON_MM(cond, mm) VM_BUG_ON(cond) + #define VM_WARN_ON(cond) BUILD_BUG_ON_INVALID(cond) + #define VM_WARN_ON_ONCE(cond) BUILD_BUG_ON_INVALID(cond) ++#define VM_WARN_ON_ONCE_PAGE(cond, page) BUILD_BUG_ON_INVALID(cond) + #define VM_WARN_ONCE(cond, format...) BUILD_BUG_ON_INVALID(cond) + #define VM_WARN(cond, format...) BUILD_BUG_ON_INVALID(cond) + #endif +diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h +index e08b5339023c0..84c7fc7f63e73 100644 +--- a/include/linux/pagemap.h ++++ b/include/linux/pagemap.h +@@ -399,7 +399,7 @@ static inline struct page *read_mapping_page(struct address_space *mapping, + } + + /* +- * Get index of the page with in radix-tree ++ * Get index of the page within radix-tree (but not for hugetlb pages). + * (TODO: remove once hugetlb pages will have ->index in PAGE_SIZE) + */ + static inline pgoff_t page_to_index(struct page *page) +@@ -418,15 +418,16 @@ static inline pgoff_t page_to_index(struct page *page) + return pgoff; + } + ++extern pgoff_t hugetlb_basepage_index(struct page *page); ++ + /* +- * Get the offset in PAGE_SIZE. +- * (TODO: hugepage should have ->index in PAGE_SIZE) ++ * Get the offset in PAGE_SIZE (even for hugetlb pages). ++ * (TODO: hugetlb pages should have ->index in PAGE_SIZE) + */ + static inline pgoff_t page_to_pgoff(struct page *page) + { +- if (unlikely(PageHeadHuge(page))) +- return page->index << compound_order(page); +- ++ if (unlikely(PageHuge(page))) ++ return hugetlb_basepage_index(page); + return page_to_index(page); + } + +diff --git a/include/linux/rmap.h b/include/linux/rmap.h +index d7d6d4eb17949..91ccae9467164 100644 +--- a/include/linux/rmap.h ++++ b/include/linux/rmap.h +@@ -98,7 +98,8 @@ enum ttu_flags { + * do a final flush if necessary */ + TTU_RMAP_LOCKED = 0x80, /* do not grab rmap lock: + * caller holds it */ +- TTU_SPLIT_FREEZE = 0x100, /* freeze pte under splitting thp */ ++ TTU_SPLIT_FREEZE = 0x100, /* freeze pte under splitting thp */ ++ TTU_SYNC = 0x200, /* avoid racy checks with PVMW_SYNC */ + }; + + #ifdef CONFIG_MMU +diff --git a/kernel/futex.c b/kernel/futex.c +index af1d9a9939887..e282c083df59d 100644 +--- a/kernel/futex.c ++++ b/kernel/futex.c +@@ -719,7 +719,7 @@ again: + + key->both.offset |= FUT_OFF_INODE; /* inode-based key */ + key->shared.i_seq = get_inode_sequence_number(inode); +- key->shared.pgoff = basepage_index(tail); ++ key->shared.pgoff = page_to_pgoff(tail); + rcu_read_unlock(); + } + +diff --git a/kernel/kthread.c b/kernel/kthread.c +index fd6f9322312aa..7dd2c8a797d7a 100644 +--- a/kernel/kthread.c ++++ b/kernel/kthread.c +@@ -979,8 +979,38 @@ void kthread_flush_work(struct kthread_work *work) + EXPORT_SYMBOL_GPL(kthread_flush_work); + + /* +- * This function removes the work from the worker queue. Also it makes sure +- * that it won't get queued later via the delayed work's timer. ++ * Make sure that the timer is neither set nor running and could ++ * not manipulate the work list_head any longer. ++ * ++ * The function is called under worker->lock. The lock is temporary ++ * released but the timer can't be set again in the meantime. 
++ */ ++static void kthread_cancel_delayed_work_timer(struct kthread_work *work, ++ unsigned long *flags) ++{ ++ struct kthread_delayed_work *dwork = ++ container_of(work, struct kthread_delayed_work, work); ++ struct kthread_worker *worker = work->worker; ++ ++ /* ++ * del_timer_sync() must be called to make sure that the timer ++ * callback is not running. The lock must be temporary released ++ * to avoid a deadlock with the callback. In the meantime, ++ * any queuing is blocked by setting the canceling counter. ++ */ ++ work->canceling++; ++ spin_unlock_irqrestore(&worker->lock, *flags); ++ del_timer_sync(&dwork->timer); ++ spin_lock_irqsave(&worker->lock, *flags); ++ work->canceling--; ++} ++ ++/* ++ * This function removes the work from the worker queue. ++ * ++ * It is called under worker->lock. The caller must make sure that ++ * the timer used by delayed work is not running, e.g. by calling ++ * kthread_cancel_delayed_work_timer(). + * + * The work might still be in use when this function finishes. See the + * current_work proceed by the worker. +@@ -988,28 +1018,8 @@ EXPORT_SYMBOL_GPL(kthread_flush_work); + * Return: %true if @work was pending and successfully canceled, + * %false if @work was not pending + */ +-static bool __kthread_cancel_work(struct kthread_work *work, bool is_dwork, +- unsigned long *flags) ++static bool __kthread_cancel_work(struct kthread_work *work) + { +- /* Try to cancel the timer if exists. */ +- if (is_dwork) { +- struct kthread_delayed_work *dwork = +- container_of(work, struct kthread_delayed_work, work); +- struct kthread_worker *worker = work->worker; +- +- /* +- * del_timer_sync() must be called to make sure that the timer +- * callback is not running. The lock must be temporary released +- * to avoid a deadlock with the callback. In the meantime, +- * any queuing is blocked by setting the canceling counter. +- */ +- work->canceling++; +- spin_unlock_irqrestore(&worker->lock, *flags); +- del_timer_sync(&dwork->timer); +- spin_lock_irqsave(&worker->lock, *flags); +- work->canceling--; +- } +- + /* + * Try to remove the work from a worker list. It might either + * be from worker->work_list or from worker->delayed_work_list. +@@ -1062,11 +1072,23 @@ bool kthread_mod_delayed_work(struct kthread_worker *worker, + /* Work must not be used with >1 worker, see kthread_queue_work() */ + WARN_ON_ONCE(work->worker != worker); + +- /* Do not fight with another command that is canceling this work. */ ++ /* ++ * Temporary cancel the work but do not fight with another command ++ * that is canceling the work as well. ++ * ++ * It is a bit tricky because of possible races with another ++ * mod_delayed_work() and cancel_delayed_work() callers. ++ * ++ * The timer must be canceled first because worker->lock is released ++ * when doing so. But the work can be removed from the queue (list) ++ * only when it can be queued again so that the return value can ++ * be used for reference counting. ++ */ ++ kthread_cancel_delayed_work_timer(work, &flags); + if (work->canceling) + goto out; ++ ret = __kthread_cancel_work(work); + +- ret = __kthread_cancel_work(work, true, &flags); + fast_queue: + __kthread_queue_delayed_work(worker, dwork, delay); + out: +@@ -1088,7 +1110,10 @@ static bool __kthread_cancel_work_sync(struct kthread_work *work, bool is_dwork) + /* Work must not be used with >1 worker, see kthread_queue_work(). 
*/ + WARN_ON_ONCE(work->worker != worker); + +- ret = __kthread_cancel_work(work, is_dwork, &flags); ++ if (is_dwork) ++ kthread_cancel_delayed_work_timer(work, &flags); ++ ++ ret = __kthread_cancel_work(work); + + if (worker->current_work != work) + goto out_fast; +diff --git a/mm/huge_memory.c b/mm/huge_memory.c +index 513f0cf173ad5..972893908bcda 100644 +--- a/mm/huge_memory.c ++++ b/mm/huge_memory.c +@@ -2324,16 +2324,16 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma, + static void unmap_page(struct page *page) + { + enum ttu_flags ttu_flags = TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS | +- TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD; +- bool unmap_success; ++ TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD | TTU_SYNC; + + VM_BUG_ON_PAGE(!PageHead(page), page); + + if (PageAnon(page)) + ttu_flags |= TTU_SPLIT_FREEZE; + +- unmap_success = try_to_unmap(page, ttu_flags); +- VM_BUG_ON_PAGE(!unmap_success, page); ++ try_to_unmap(page, ttu_flags); ++ ++ VM_WARN_ON_ONCE_PAGE(page_mapped(page), page); + } + + static void remap_page(struct page *page) +@@ -2586,7 +2586,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) + struct pglist_data *pgdata = NODE_DATA(page_to_nid(head)); + struct anon_vma *anon_vma = NULL; + struct address_space *mapping = NULL; +- int count, mapcount, extra_pins, ret; ++ int extra_pins, ret; + bool mlocked; + unsigned long flags; + pgoff_t end; +@@ -2648,7 +2648,6 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) + + mlocked = PageMlocked(page); + unmap_page(head); +- VM_BUG_ON_PAGE(compound_mapcount(head), head); + + /* Make sure the page is not on per-CPU pagevec as it takes pin */ + if (mlocked) +@@ -2674,9 +2673,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) + + /* Prevent deferred_split_scan() touching ->_refcount */ + spin_lock(&pgdata->split_queue_lock); +- count = page_count(head); +- mapcount = total_mapcount(head); +- if (!mapcount && page_ref_freeze(head, 1 + extra_pins)) { ++ if (page_ref_freeze(head, 1 + extra_pins)) { + if (!list_empty(page_deferred_list(head))) { + pgdata->split_queue_len--; + list_del(page_deferred_list(head)); +@@ -2692,16 +2689,9 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) + } else + ret = 0; + } else { +- if (IS_ENABLED(CONFIG_DEBUG_VM) && mapcount) { +- pr_alert("total_mapcount: %u, page_count(): %u\n", +- mapcount, count); +- if (PageTail(page)) +- dump_page(head, NULL); +- dump_page(page, "total_mapcount(head) > 0"); +- BUG(); +- } + spin_unlock(&pgdata->split_queue_lock); +-fail: if (mapping) ++fail: ++ if (mapping) + spin_unlock(&mapping->tree_lock); + spin_unlock_irqrestore(zone_lru_lock(page_zone(head)), flags); + remap_page(head); +diff --git a/mm/hugetlb.c b/mm/hugetlb.c +index 0dc181290d1fb..c765fd01f0aa4 100644 +--- a/mm/hugetlb.c ++++ b/mm/hugetlb.c +@@ -1403,15 +1403,12 @@ int PageHeadHuge(struct page *page_head) + return get_compound_page_dtor(page_head) == free_huge_page; + } + +-pgoff_t __basepage_index(struct page *page) ++pgoff_t hugetlb_basepage_index(struct page *page) + { + struct page *page_head = compound_head(page); + pgoff_t index = page_index(page_head); + unsigned long compound_idx; + +- if (!PageHuge(page_head)) +- return page_index(page); +- + if (compound_order(page_head) >= MAX_ORDER) + compound_idx = page_to_pfn(page) - page_to_pfn(page_head); + else +diff --git a/mm/internal.h b/mm/internal.h +index a182506242c43..97c8e896cd2f6 100644 +--- a/mm/internal.h ++++ b/mm/internal.h +@@ -330,27 +330,52 @@ static 
inline void mlock_migrate_page(struct page *newpage, struct page *page) + extern pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma); + + /* +- * At what user virtual address is page expected in @vma? ++ * At what user virtual address is page expected in vma? ++ * Returns -EFAULT if all of the page is outside the range of vma. ++ * If page is a compound head, the entire compound page is considered. + */ + static inline unsigned long +-__vma_address(struct page *page, struct vm_area_struct *vma) ++vma_address(struct page *page, struct vm_area_struct *vma) + { +- pgoff_t pgoff = page_to_pgoff(page); +- return vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); ++ pgoff_t pgoff; ++ unsigned long address; ++ ++ VM_BUG_ON_PAGE(PageKsm(page), page); /* KSM page->index unusable */ ++ pgoff = page_to_pgoff(page); ++ if (pgoff >= vma->vm_pgoff) { ++ address = vma->vm_start + ++ ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); ++ /* Check for address beyond vma (or wrapped through 0?) */ ++ if (address < vma->vm_start || address >= vma->vm_end) ++ address = -EFAULT; ++ } else if (PageHead(page) && ++ pgoff + (1UL << compound_order(page)) - 1 >= vma->vm_pgoff) { ++ /* Test above avoids possibility of wrap to 0 on 32-bit */ ++ address = vma->vm_start; ++ } else { ++ address = -EFAULT; ++ } ++ return address; + } + ++/* ++ * Then at what user virtual address will none of the page be found in vma? ++ * Assumes that vma_address() already returned a good starting address. ++ * If page is a compound head, the entire compound page is considered. ++ */ + static inline unsigned long +-vma_address(struct page *page, struct vm_area_struct *vma) ++vma_address_end(struct page *page, struct vm_area_struct *vma) + { +- unsigned long start, end; +- +- start = __vma_address(page, vma); +- end = start + PAGE_SIZE * (hpage_nr_pages(page) - 1); +- +- /* page should be within @vma mapping range */ +- VM_BUG_ON_VMA(end < vma->vm_start || start >= vma->vm_end, vma); +- +- return max(start, vma->vm_start); ++ pgoff_t pgoff; ++ unsigned long address; ++ ++ VM_BUG_ON_PAGE(PageKsm(page), page); /* KSM page->index unusable */ ++ pgoff = page_to_pgoff(page) + (1UL << compound_order(page)); ++ address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); ++ /* Check for address beyond vma (or wrapped through 0?) 
*/ ++ if (address < vma->vm_start || address > vma->vm_end) ++ address = vma->vm_end; ++ return address; + } + + #else /* !CONFIG_MMU */ +diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c +index e00d985a51c56..a612daef5f009 100644 +--- a/mm/page_vma_mapped.c ++++ b/mm/page_vma_mapped.c +@@ -110,6 +110,13 @@ static bool check_pte(struct page_vma_mapped_walk *pvmw) + return true; + } + ++static void step_forward(struct page_vma_mapped_walk *pvmw, unsigned long size) ++{ ++ pvmw->address = (pvmw->address + size) & ~(size - 1); ++ if (!pvmw->address) ++ pvmw->address = ULONG_MAX; ++} ++ + /** + * page_vma_mapped_walk - check if @pvmw->page is mapped in @pvmw->vma at + * @pvmw->address +@@ -138,6 +145,7 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw) + { + struct mm_struct *mm = pvmw->vma->vm_mm; + struct page *page = pvmw->page; ++ unsigned long end; + pgd_t *pgd; + p4d_t *p4d; + pud_t *pud; +@@ -147,10 +155,11 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw) + if (pvmw->pmd && !pvmw->pte) + return not_found(pvmw); + +- if (pvmw->pte) +- goto next_pte; ++ if (unlikely(PageHuge(page))) { ++ /* The only possible mapping was handled on last iteration */ ++ if (pvmw->pte) ++ return not_found(pvmw); + +- if (unlikely(PageHuge(pvmw->page))) { + /* when pud is not present, pte will be NULL */ + pvmw->pte = huge_pte_offset(mm, pvmw->address, + PAGE_SIZE << compound_order(page)); +@@ -163,78 +172,108 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw) + return not_found(pvmw); + return true; + } +-restart: +- pgd = pgd_offset(mm, pvmw->address); +- if (!pgd_present(*pgd)) +- return false; +- p4d = p4d_offset(pgd, pvmw->address); +- if (!p4d_present(*p4d)) +- return false; +- pud = pud_offset(p4d, pvmw->address); +- if (!pud_present(*pud)) +- return false; +- pvmw->pmd = pmd_offset(pud, pvmw->address); ++ + /* +- * Make sure the pmd value isn't cached in a register by the +- * compiler and used as a stale value after we've observed a +- * subsequent update. ++ * Seek to next pte only makes sense for THP. ++ * But more important than that optimization, is to filter out ++ * any PageKsm page: whose page->index misleads vma_address() ++ * and vma_address_end() to disaster. + */ +- pmde = READ_ONCE(*pvmw->pmd); +- if (pmd_trans_huge(pmde) || is_pmd_migration_entry(pmde)) { +- pvmw->ptl = pmd_lock(mm, pvmw->pmd); +- if (likely(pmd_trans_huge(*pvmw->pmd))) { +- if (pvmw->flags & PVMW_MIGRATION) +- return not_found(pvmw); +- if (pmd_page(*pvmw->pmd) != page) +- return not_found(pvmw); +- return true; +- } else if (!pmd_present(*pvmw->pmd)) { +- if (thp_migration_supported()) { +- if (!(pvmw->flags & PVMW_MIGRATION)) ++ end = PageTransCompound(page) ? ++ vma_address_end(page, pvmw->vma) : ++ pvmw->address + PAGE_SIZE; ++ if (pvmw->pte) ++ goto next_pte; ++restart: ++ do { ++ pgd = pgd_offset(mm, pvmw->address); ++ if (!pgd_present(*pgd)) { ++ step_forward(pvmw, PGDIR_SIZE); ++ continue; ++ } ++ p4d = p4d_offset(pgd, pvmw->address); ++ if (!p4d_present(*p4d)) { ++ step_forward(pvmw, P4D_SIZE); ++ continue; ++ } ++ pud = pud_offset(p4d, pvmw->address); ++ if (!pud_present(*pud)) { ++ step_forward(pvmw, PUD_SIZE); ++ continue; ++ } ++ ++ pvmw->pmd = pmd_offset(pud, pvmw->address); ++ /* ++ * Make sure the pmd value isn't cached in a register by the ++ * compiler and used as a stale value after we've observed a ++ * subsequent update. 
++ */ ++ pmde = READ_ONCE(*pvmw->pmd); ++ ++ if (pmd_trans_huge(pmde) || is_pmd_migration_entry(pmde)) { ++ pvmw->ptl = pmd_lock(mm, pvmw->pmd); ++ pmde = *pvmw->pmd; ++ if (likely(pmd_trans_huge(pmde))) { ++ if (pvmw->flags & PVMW_MIGRATION) ++ return not_found(pvmw); ++ if (pmd_page(pmde) != page) + return not_found(pvmw); +- if (is_migration_entry(pmd_to_swp_entry(*pvmw->pmd))) { +- swp_entry_t entry = pmd_to_swp_entry(*pvmw->pmd); ++ return true; ++ } ++ if (!pmd_present(pmde)) { ++ swp_entry_t entry; + +- if (migration_entry_to_page(entry) != page) +- return not_found(pvmw); +- return true; +- } ++ if (!thp_migration_supported() || ++ !(pvmw->flags & PVMW_MIGRATION)) ++ return not_found(pvmw); ++ entry = pmd_to_swp_entry(pmde); ++ if (!is_migration_entry(entry) || ++ migration_entry_to_page(entry) != page) ++ return not_found(pvmw); ++ return true; + } +- return not_found(pvmw); +- } else { + /* THP pmd was split under us: handle on pte level */ + spin_unlock(pvmw->ptl); + pvmw->ptl = NULL; ++ } else if (!pmd_present(pmde)) { ++ /* ++ * If PVMW_SYNC, take and drop THP pmd lock so that we ++ * cannot return prematurely, while zap_huge_pmd() has ++ * cleared *pmd but not decremented compound_mapcount(). ++ */ ++ if ((pvmw->flags & PVMW_SYNC) && ++ PageTransCompound(page)) { ++ spinlock_t *ptl = pmd_lock(mm, pvmw->pmd); ++ ++ spin_unlock(ptl); ++ } ++ step_forward(pvmw, PMD_SIZE); ++ continue; + } +- } else if (!pmd_present(pmde)) { +- return false; +- } +- if (!map_pte(pvmw)) +- goto next_pte; +- while (1) { ++ if (!map_pte(pvmw)) ++ goto next_pte; ++this_pte: + if (check_pte(pvmw)) + return true; + next_pte: +- /* Seek to next pte only makes sense for THP */ +- if (!PageTransHuge(pvmw->page) || PageHuge(pvmw->page)) +- return not_found(pvmw); + do { + pvmw->address += PAGE_SIZE; +- if (pvmw->address >= pvmw->vma->vm_end || +- pvmw->address >= +- __vma_address(pvmw->page, pvmw->vma) + +- hpage_nr_pages(pvmw->page) * PAGE_SIZE) ++ if (pvmw->address >= end) + return not_found(pvmw); + /* Did we cross page table boundary? 
*/ +- if (pvmw->address % PMD_SIZE == 0) { +- pte_unmap(pvmw->pte); ++ if ((pvmw->address & (PMD_SIZE - PAGE_SIZE)) == 0) { + if (pvmw->ptl) { + spin_unlock(pvmw->ptl); + pvmw->ptl = NULL; + } ++ pte_unmap(pvmw->pte); ++ pvmw->pte = NULL; + goto restart; +- } else { +- pvmw->pte++; ++ } ++ pvmw->pte++; ++ if ((pvmw->flags & PVMW_SYNC) && !pvmw->ptl) { ++ pvmw->ptl = pte_lockptr(mm, pvmw->pmd); ++ spin_lock(pvmw->ptl); + } + } while (pte_none(*pvmw->pte)); + +@@ -242,7 +281,10 @@ next_pte: + pvmw->ptl = pte_lockptr(mm, pvmw->pmd); + spin_lock(pvmw->ptl); + } +- } ++ goto this_pte; ++ } while (pvmw->address < end); ++ ++ return false; + } + + /** +@@ -261,14 +303,10 @@ int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma) + .vma = vma, + .flags = PVMW_SYNC, + }; +- unsigned long start, end; +- +- start = __vma_address(page, vma); +- end = start + PAGE_SIZE * (hpage_nr_pages(page) - 1); + +- if (unlikely(end < vma->vm_start || start >= vma->vm_end)) ++ pvmw.address = vma_address(page, vma); ++ if (pvmw.address == -EFAULT) + return 0; +- pvmw.address = max(start, vma->vm_start); + if (!page_vma_mapped_walk(&pvmw)) + return 0; + page_vma_mapped_walk_done(&pvmw); +diff --git a/mm/rmap.c b/mm/rmap.c +index 8bd2ddd8febd5..8ed8ec113d5a9 100644 +--- a/mm/rmap.c ++++ b/mm/rmap.c +@@ -686,7 +686,6 @@ static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags) + */ + unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) + { +- unsigned long address; + if (PageAnon(page)) { + struct anon_vma *page__anon_vma = page_anon_vma(page); + /* +@@ -696,15 +695,13 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) + if (!vma->anon_vma || !page__anon_vma || + vma->anon_vma->root != page__anon_vma->root) + return -EFAULT; +- } else if (page->mapping) { +- if (!vma->vm_file || vma->vm_file->f_mapping != page->mapping) +- return -EFAULT; +- } else ++ } else if (!vma->vm_file) { + return -EFAULT; +- address = __vma_address(page, vma); +- if (unlikely(address < vma->vm_start || address >= vma->vm_end)) ++ } else if (vma->vm_file->f_mapping != compound_head(page)->mapping) { + return -EFAULT; +- return address; ++ } ++ ++ return vma_address(page, vma); + } + + pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address) +@@ -896,7 +893,7 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma, + * We have to assume the worse case ie pmd for invalidation. Note that + * the page can not be free from this function. + */ +- end = min(vma->vm_end, start + (PAGE_SIZE << compound_order(page))); ++ end = vma_address_end(page, vma); + mmu_notifier_invalidate_range_start(vma->vm_mm, start, end); + + while (page_vma_mapped_walk(&pvmw)) { +@@ -1344,6 +1341,15 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, + unsigned long start = address, end; + enum ttu_flags flags = (enum ttu_flags)arg; + ++ /* ++ * When racing against e.g. zap_pte_range() on another cpu, ++ * in between its ptep_get_and_clear_full() and page_remove_rmap(), ++ * try_to_unmap() may return false when it is about to become true, ++ * if page table locking is skipped: use TTU_SYNC to wait for that. 
++ */ ++ if (flags & TTU_SYNC) ++ pvmw.flags = PVMW_SYNC; ++ + /* munlock has nothing to gain from examining un-locked vmas */ + if ((flags & TTU_MUNLOCK) && !(vma->vm_flags & VM_LOCKED)) + return true; +@@ -1365,7 +1371,8 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, + * Note that the page can not be free in this function as call of + * try_to_unmap() must hold a reference on the page. + */ +- end = min(vma->vm_end, start + (PAGE_SIZE << compound_order(page))); ++ end = PageKsm(page) ? ++ address + PAGE_SIZE : vma_address_end(page, vma); + if (PageHuge(page)) { + /* + * If sharing is possible, start and end will be adjusted +@@ -1624,9 +1631,9 @@ static bool invalid_migration_vma(struct vm_area_struct *vma, void *arg) + return is_vma_temporary_stack(vma); + } + +-static int page_mapcount_is_zero(struct page *page) ++static int page_not_mapped(struct page *page) + { +- return !total_mapcount(page); ++ return !page_mapped(page); + } + + /** +@@ -1644,7 +1651,7 @@ bool try_to_unmap(struct page *page, enum ttu_flags flags) + struct rmap_walk_control rwc = { + .rmap_one = try_to_unmap_one, + .arg = (void *)flags, +- .done = page_mapcount_is_zero, ++ .done = page_not_mapped, + .anon_lock = page_lock_anon_vma_read, + }; + +@@ -1665,14 +1672,15 @@ bool try_to_unmap(struct page *page, enum ttu_flags flags) + else + rmap_walk(page, &rwc); + +- return !page_mapcount(page) ? true : false; ++ /* ++ * When racing against e.g. zap_pte_range() on another cpu, ++ * in between its ptep_get_and_clear_full() and page_remove_rmap(), ++ * try_to_unmap() may return false when it is about to become true, ++ * if page table locking is skipped: use TTU_SYNC to wait for that. ++ */ ++ return !page_mapcount(page); + } + +-static int page_not_mapped(struct page *page) +-{ +- return !page_mapped(page); +-}; +- + /** + * try_to_munlock - try to munlock a page + * @page: the page to be munlocked +@@ -1767,6 +1775,7 @@ static void rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc, + struct vm_area_struct *vma = avc->vma; + unsigned long address = vma_address(page, vma); + ++ VM_BUG_ON_VMA(address == -EFAULT, vma); + cond_resched(); + + if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg)) +@@ -1821,6 +1830,7 @@ static void rmap_walk_file(struct page *page, struct rmap_walk_control *rwc, + pgoff_start, pgoff_end) { + unsigned long address = vma_address(page, vma); + ++ VM_BUG_ON_VMA(address == -EFAULT, vma); + cond_resched(); + + if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))