On Mon, 16 Oct 2017 23:10:02 -0400
jgli...@redhat.com wrote:

> From: Jérôme Glisse <jgli...@redhat.com>
> 
> +             /*
> +              * No need to call mmu_notifier_invalidate_range() as we are
> +              * downgrading page table protection not changing it to point
> +              * to a new page.
> +              *
> +              * See Documentation/vm/mmu_notifier.txt
> +              */
>               if (pmdp) {
>  #ifdef CONFIG_FS_DAX_PMD
>                       pmd_t pmd;
> @@ -628,7 +635,6 @@ static void dax_mapping_entry_mkclean(struct 
> address_space *mapping,
>                       pmd = pmd_wrprotect(pmd);
>                       pmd = pmd_mkclean(pmd);
>                       set_pmd_at(vma->vm_mm, address, pmdp, pmd);
> -                     mmu_notifier_invalidate_range(vma->vm_mm, start, end);

Could the secondary TLB still see the mapping as dirty and propagate the dirty 
bit back?

>  unlock_pmd:
>                       spin_unlock(ptl);
>  #endif
> @@ -643,7 +649,6 @@ static void dax_mapping_entry_mkclean(struct 
> address_space *mapping,
>                       pte = pte_wrprotect(pte);
>                       pte = pte_mkclean(pte);
>                       set_pte_at(vma->vm_mm, address, ptep, pte);
> -                     mmu_notifier_invalidate_range(vma->vm_mm, start, end);

Ditto

>  unlock_pte:
>                       pte_unmap_unlock(ptep, ptl);
>               }
> diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
> index 6866e8126982..49c925c96b8a 100644
> --- a/include/linux/mmu_notifier.h
> +++ b/include/linux/mmu_notifier.h
> @@ -155,7 +155,8 @@ struct mmu_notifier_ops {
>        * shared page-tables, it not necessary to implement the
>        * invalidate_range_start()/end() notifiers, as
>        * invalidate_range() alread catches the points in time when an
> -      * external TLB range needs to be flushed.
> +      * external TLB range needs to be flushed. For more in depth
> +      * discussion on this see Documentation/vm/mmu_notifier.txt
>        *
>        * The invalidate_range() function is called under the ptl
>        * spin-lock and not allowed to sleep.
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index c037d3d34950..ff5bc647b51d 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -1186,8 +1186,15 @@ static int do_huge_pmd_wp_page_fallback(struct 
> vm_fault *vmf, pmd_t orig_pmd,
>               goto out_free_pages;
>       VM_BUG_ON_PAGE(!PageHead(page), page);
>  
> +     /*
> +      * Leave pmd empty until pte is filled note we must notify here as
> +      * concurrent CPU thread might write to new page before the call to
> +      * mmu_notifier_invalidate_range_end() happens which can lead to a
> +      * device seeing memory write in different order than CPU.
> +      *
> +      * See Documentation/vm/mmu_notifier.txt
> +      */
>       pmdp_huge_clear_flush_notify(vma, haddr, vmf->pmd);
> -     /* leave pmd empty until pte is filled */
>  
>       pgtable = pgtable_trans_huge_withdraw(vma->vm_mm, vmf->pmd);
>       pmd_populate(vma->vm_mm, &_pmd, pgtable);
> @@ -2026,8 +2033,15 @@ static void __split_huge_zero_page_pmd(struct 
> vm_area_struct *vma,
>       pmd_t _pmd;
>       int i;
>  
> -     /* leave pmd empty until pte is filled */
> -     pmdp_huge_clear_flush_notify(vma, haddr, pmd);
> +     /*
> +      * Leave pmd empty until pte is filled note that it is fine to delay
> +      * notification until mmu_notifier_invalidate_range_end() as we are
> +      * replacing a zero pmd write protected page with a zero pte write
> +      * protected page.
> +      *
> +      * See Documentation/vm/mmu_notifier.txt
> +      */
> +     pmdp_huge_clear_flush(vma, haddr, pmd);

Shouldn't the secondary TLB know if the page size changed?

>  
>       pgtable = pgtable_trans_huge_withdraw(mm, pmd);
>       pmd_populate(mm, &_pmd, pgtable);
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index 1768efa4c501..63a63f1b536c 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -3254,9 +3254,14 @@ int copy_hugetlb_page_range(struct mm_struct *dst, 
> struct mm_struct *src,
>                       set_huge_swap_pte_at(dst, addr, dst_pte, entry, sz);
>               } else {
>                       if (cow) {
> +                             /*
> +                              * No need to notify as we are downgrading page
> +                              * table protection not changing it to point
> +                              * to a new page.
> +                              *
> +                              * See Documentation/vm/mmu_notifier.txt
> +                              */
>                               huge_ptep_set_wrprotect(src, addr, src_pte);

OK.. so we could get write faults on write accesses from the device.

> -                             mmu_notifier_invalidate_range(src, mmun_start,
> -                                                                mmun_end);
>                       }
>                       entry = huge_ptep_get(src_pte);
>                       ptepage = pte_page(entry);
> @@ -4288,7 +4293,12 @@ unsigned long hugetlb_change_protection(struct 
> vm_area_struct *vma,
>        * and that page table be reused and filled with junk.
>        */
>       flush_hugetlb_tlb_range(vma, start, end);
> -     mmu_notifier_invalidate_range(mm, start, end);
> +     /*
> +      * No need to call mmu_notifier_invalidate_range() we are downgrading
> +      * page table protection not changing it to point to a new page.
> +      *
> +      * See Documentation/vm/mmu_notifier.txt
> +      */
>       i_mmap_unlock_write(vma->vm_file->f_mapping);
>       mmu_notifier_invalidate_range_end(mm, start, end);
>  
> diff --git a/mm/ksm.c b/mm/ksm.c
> index 6cb60f46cce5..be8f4576f842 100644
> --- a/mm/ksm.c
> +++ b/mm/ksm.c
> @@ -1052,8 +1052,13 @@ static int write_protect_page(struct vm_area_struct 
> *vma, struct page *page,
>                * So we clear the pte and flush the tlb before the check
>                * this assure us that no O_DIRECT can happen after the check
>                * or in the middle of the check.
> +              *
> +              * No need to notify as we are downgrading page table to read
> +              * only not changing it to point to a new page.
> +              *
> +              * See Documentation/vm/mmu_notifier.txt
>                */
> -             entry = ptep_clear_flush_notify(vma, pvmw.address, pvmw.pte);
> +             entry = ptep_clear_flush(vma, pvmw.address, pvmw.pte);
>               /*
>                * Check that no O_DIRECT or similar I/O is in progress on the
>                * page
> @@ -1136,7 +1141,13 @@ static int replace_page(struct vm_area_struct *vma, 
> struct page *page,
>       }
>  
>       flush_cache_page(vma, addr, pte_pfn(*ptep));
> -     ptep_clear_flush_notify(vma, addr, ptep);
> +     /*
> +      * No need to notify as we are replacing a read only page with another
> +      * read only page with the same content.
> +      *
> +      * See Documentation/vm/mmu_notifier.txt
> +      */
> +     ptep_clear_flush(vma, addr, ptep);
>       set_pte_at_notify(mm, addr, ptep, newpte);
>  
>       page_remove_rmap(page, false);
> diff --git a/mm/rmap.c b/mm/rmap.c
> index 061826278520..6b5a0f219ac0 100644
> --- a/mm/rmap.c
> +++ b/mm/rmap.c
> @@ -937,10 +937,15 @@ static bool page_mkclean_one(struct page *page, struct 
> vm_area_struct *vma,
>  #endif
>               }
>  
> -             if (ret) {
> -                     mmu_notifier_invalidate_range(vma->vm_mm, cstart, cend);
> +             /*
> +              * No need to call mmu_notifier_invalidate_range() as we are
> +              * downgrading page table protection not changing it to point
> +              * to a new page.
> +              *
> +              * See Documentation/vm/mmu_notifier.txt
> +              */
> +             if (ret)
>                       (*cleaned)++;
> -             }
>       }
>  
>       mmu_notifier_invalidate_range_end(vma->vm_mm, start, end);
> @@ -1424,6 +1429,10 @@ static bool try_to_unmap_one(struct page *page, struct 
> vm_area_struct *vma,
>                       if (pte_soft_dirty(pteval))
>                               swp_pte = pte_swp_mksoft_dirty(swp_pte);
>                       set_pte_at(mm, pvmw.address, pvmw.pte, swp_pte);
> +                     /*
> +                      * No need to invalidate here it will synchronize on
> +                      * against the special swap migration pte.
> +                      */
>                       goto discard;
>               }
>  
> @@ -1481,6 +1490,9 @@ static bool try_to_unmap_one(struct page *page, struct 
> vm_area_struct *vma,
>                        * will take care of the rest.
>                        */
>                       dec_mm_counter(mm, mm_counter(page));
> +                     /* We have to invalidate as we cleared the pte */
> +                     mmu_notifier_invalidate_range(mm, address,
> +                                                   address + PAGE_SIZE);
>               } else if (IS_ENABLED(CONFIG_MIGRATION) &&
>                               (flags & (TTU_MIGRATION|TTU_SPLIT_FREEZE))) {
>                       swp_entry_t entry;
> @@ -1496,6 +1508,10 @@ static bool try_to_unmap_one(struct page *page, struct 
> vm_area_struct *vma,
>                       if (pte_soft_dirty(pteval))
>                               swp_pte = pte_swp_mksoft_dirty(swp_pte);
>                       set_pte_at(mm, address, pvmw.pte, swp_pte);
> +                     /*
> +                      * No need to invalidate here it will synchronize on
> +                      * against the special swap migration pte.
> +                      */
>               } else if (PageAnon(page)) {
>                       swp_entry_t entry = { .val = page_private(subpage) };
>                       pte_t swp_pte;
> @@ -1507,6 +1523,8 @@ static bool try_to_unmap_one(struct page *page, struct 
> vm_area_struct *vma,
>                               WARN_ON_ONCE(1);
>                               ret = false;
>                               /* We have to invalidate as we cleared the pte 
> */
> +                             mmu_notifier_invalidate_range(mm, address,
> +                                                     address + PAGE_SIZE);
>                               page_vma_mapped_walk_done(&pvmw);
>                               break;
>                       }
> @@ -1514,6 +1532,9 @@ static bool try_to_unmap_one(struct page *page, struct 
> vm_area_struct *vma,
>                       /* MADV_FREE page check */
>                       if (!PageSwapBacked(page)) {
>                               if (!PageDirty(page)) {
> +                                     /* Invalidate as we cleared the pte */
> +                                     mmu_notifier_invalidate_range(mm,
> +                                             address, address + PAGE_SIZE);
>                                       dec_mm_counter(mm, MM_ANONPAGES);
>                                       goto discard;
>                               }
> @@ -1547,13 +1568,39 @@ static bool try_to_unmap_one(struct page *page, 
> struct vm_area_struct *vma,
>                       if (pte_soft_dirty(pteval))
>                               swp_pte = pte_swp_mksoft_dirty(swp_pte);
>                       set_pte_at(mm, address, pvmw.pte, swp_pte);
> -             } else
> +                     /* Invalidate as we cleared the pte */
> +                     mmu_notifier_invalidate_range(mm, address,
> +                                                   address + PAGE_SIZE);
> +             } else {
> +                     /*
> +                      * We should not need to notify here as we reach this
> +                      * case only from freeze_page() itself only call from
> +                      * split_huge_page_to_list() so everything below must
> +                      * be true:
> +                      *   - page is not anonymous
> +                      *   - page is locked
> +                      *
> +                      * So as it is a locked file back page thus it can not
> +                      * be remove from the page cache and replace by a new
> +                      * page before mmu_notifier_invalidate_range_end so no
> +                      * concurrent thread might update its page table to
> +                      * point at new page while a device still is using this
> +                      * page.
> +                      *
> +                      * See Documentation/vm/mmu_notifier.txt
> +                      */
>                       dec_mm_counter(mm, mm_counter_file(page));
> +             }
>  discard:
> +             /*
> +              * No need to call mmu_notifier_invalidate_range() it has be
> +              * done above for all cases requiring it to happen under page
> +              * table lock before mmu_notifier_invalidate_range_end()
> +              *
> +              * See Documentation/vm/mmu_notifier.txt
> +              */
>               page_remove_rmap(subpage, PageHuge(page));
>               put_page(page);
> -             mmu_notifier_invalidate_range(mm, address,
> -                                           address + PAGE_SIZE);
>       }
>  
>       mmu_notifier_invalidate_range_end(vma->vm_mm, start, end);

Looking at the patchset, I understand the efficiency, but I am concerned
with correctness.

Balbir Singh.

Reply via email to