From: William Kucharski <william.kuchar...@oracle.com>

Add filemap_huge_fault() to attempt to satisfy page
faults on memory-mapped read-only text pages using THP when possible.

Signed-off-by: William Kucharski <william.kuchar...@oracle.com>
[rebased on top of mm prep patches -- Matthew]
Signed-off-by: Matthew Wilcox (Oracle) <wi...@infradead.org>
---
 include/linux/mm.h      |  10 +++
 include/linux/pagemap.h |   8 ++
 mm/filemap.c            | 165 ++++++++++++++++++++++++++++++++++++++--
 3 files changed, 178 insertions(+), 5 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 04bea9f9282c..623878f11eaf 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2414,6 +2414,16 @@ extern void truncate_inode_pages_final(struct address_space *);
 
 /* generic vm_area_ops exported for stackable file systems */
 extern vm_fault_t filemap_fault(struct vm_fault *vmf);
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+extern vm_fault_t filemap_huge_fault(struct vm_fault *vmf,
+               enum page_entry_size pe_size);
+#else
+static inline vm_fault_t filemap_huge_fault(struct vm_fault *vmf,
+               enum page_entry_size pe_size)
+{
+       return VM_FAULT_FALLBACK;
+}
+#endif
 extern void filemap_map_pages(struct vm_fault *vmf,
                pgoff_t start_pgoff, pgoff_t end_pgoff);
 extern vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf);
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index d6d97f9fb762..ae09788f5345 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -354,6 +354,14 @@ static inline struct page *grab_cache_page_nowait(struct address_space *mapping,
                        mapping_gfp_mask(mapping));
 }
 
+/* This (head) page should be found at this offset in the page cache */
+static inline void page_cache_assert(struct page *page, pgoff_t offset)
+{
+       VM_BUG_ON_PAGE(PageTail(page), page);
+       VM_BUG_ON_PAGE(page->index != (offset & ~(compound_nr(page) - 1)),
+                       page);
+}
+
 static inline struct page *find_subpage(struct page *page, pgoff_t offset)
 {
        if (PageHuge(page))
diff --git a/mm/filemap.c b/mm/filemap.c
index b07ef9469861..8017e905df7a 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1590,7 +1590,8 @@ static bool pagecache_is_conflict(struct page *page)
  *
  * Looks up the page cache entries at @mapping between @offset and
  * @offset + 2^@order.  If there is a page cache page, it is returned with
- * an increased refcount unless it is smaller than @order.
+ * an increased refcount unless it is smaller than @order.  This function
+ * returns the head page, not a tail page.
  *
  * If the slot holds a shadow entry of a previously evicted page, or a
  * swap entry from shmem/tmpfs, it is returned.
@@ -1601,7 +1602,7 @@ static bool pagecache_is_conflict(struct page *page)
 static struct page *__find_get_page(struct address_space *mapping,
                unsigned long offset, unsigned int order)
 {
-       XA_STATE(xas, &mapping->i_pages, offset);
+       XA_STATE(xas, &mapping->i_pages, offset & ~((1UL << order) - 1));
        struct page *page;
 
        rcu_read_lock();
@@ -1635,7 +1636,6 @@ static struct page *__find_get_page(struct address_space *mapping,
                put_page(page);
                goto repeat;
        }
-       page = find_subpage(page, offset);
 out:
        rcu_read_unlock();
 
@@ -1741,11 +1741,12 @@ struct page *pagecache_get_page(struct address_space *mapping, pgoff_t offset,
                        put_page(page);
                        goto repeat;
                }
-               VM_BUG_ON_PAGE(page->index != offset, page);
+               page_cache_assert(page, offset);
        }
 
        if (fgp_flags & FGP_ACCESSED)
                mark_page_accessed(page);
+       page = find_subpage(page, offset);
 
 no_page:
        if (!page && (fgp_flags & FGP_CREAT)) {
@@ -2638,7 +2639,7 @@ vm_fault_t filemap_fault(struct vm_fault *vmf)
                put_page(page);
                goto retry_find;
        }
-       VM_BUG_ON_PAGE(page_to_pgoff(page) != offset, page);
+       page_cache_assert(page, offset);
 
        /*
         * We have a locked page in the page cache, now we need to check
@@ -2711,6 +2712,160 @@ vm_fault_t filemap_fault(struct vm_fault *vmf)
 }
 EXPORT_SYMBOL(filemap_fault);
 
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+/**
+ * filemap_huge_fault - Read in file data for page fault handling.
+ * @vmf: struct vm_fault containing details of the fault.
+ * @pe_size: Page entry size.
+ *
+ * filemap_huge_fault() is invoked via the vma operations vector for a
+ * mapped memory region to read in file data during a page fault.
+ *
+ * The goto's are kind of ugly, but this streamlines the normal case of having
+ * it in the page cache, and handles the special cases reasonably without
+ * having a lot of duplicated code.
+ *
+ * vma->vm_mm->mmap_sem must be held on entry.
+ *
+ * If our return value has VM_FAULT_RETRY set, it's because the mmap_sem
+ * may be dropped before doing I/O or by lock_page_maybe_drop_mmap().
+ *
+ * If our return value does not have VM_FAULT_RETRY set, the mmap_sem
+ * has not been released.
+ *
+ * We never return with VM_FAULT_RETRY and a bit from VM_FAULT_ERROR set.
+ *
+ * Return: bitwise-OR of %VM_FAULT_ codes.
+ */
+vm_fault_t filemap_huge_fault(struct vm_fault *vmf,
+                               enum page_entry_size pe_size)
+{
+       int error;
+       struct vm_area_struct *vma = vmf->vma;
+       struct file *file = vma->vm_file;
+       struct file *fpin = NULL;
+       struct address_space *mapping = file->f_mapping;
+       struct inode *inode = mapping->host;
+       pgoff_t offset = vmf->pgoff;
+       pgoff_t max_off;
+       struct page *page;
+       vm_fault_t ret = 0;
+
+       if (pe_size != PE_SIZE_PMD)
+               return VM_FAULT_FALLBACK;
+       /* Read-only mappings for now */
+       if (vmf->flags & FAULT_FLAG_WRITE)
+               return VM_FAULT_FALLBACK;
+       if (vma->vm_start & ~HPAGE_PMD_MASK)
+               return VM_FAULT_FALLBACK;
+       /* Fall back rather than allocate a huge page spanning EOF */
+       max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
+       if (unlikely((offset | (HPAGE_PMD_NR - 1)) >= max_off))
+               return VM_FAULT_FALLBACK;
+
+       /*
+        * Do we have something in the page cache already?
+        */
+       page = __find_get_page(mapping, offset, HPAGE_PMD_ORDER);
+       if (likely(page)) {
+               if (pagecache_is_conflict(page))
+                       return VM_FAULT_FALLBACK;
+               /* Readahead the next huge page here? */
+               page = find_subpage(page, offset & ~(HPAGE_PMD_NR - 1));
+       } else {
+               /* No page in the page cache at all */
+               count_vm_event(PGMAJFAULT);
+               count_memcg_event_mm(vma->vm_mm, PGMAJFAULT);
+               ret = VM_FAULT_MAJOR;
+retry_find:
+               page = pagecache_get_page(mapping, offset,
+                                         FGP_CREAT | FGP_FOR_MMAP | FGP_PMD,
+                                         vmf->gfp_mask |
+                                               __GFP_NOWARN | __GFP_NORETRY);
+               if (!page)
+                       return VM_FAULT_FALLBACK;
+       }
+
+       if (!lock_page_maybe_drop_mmap(vmf, page, &fpin))
+               goto out_retry;
+
+       /* Did it get truncated? */
+       if (unlikely(page->mapping != mapping)) {
+               unlock_page(page);
+               put_page(page);
+               goto retry_find;
+       }
+       VM_BUG_ON_PAGE(page_to_index(page) != offset, page);
+
+       /*
+        * We have a locked page in the page cache, now we need to check
+        * that it's up-to-date.  Because we don't readahead in huge_fault,
+        * this may or may not be due to an error.
+        */
+       if (!PageUptodate(page))
+               goto page_not_uptodate;
+
+       /*
+        * We've made it this far and we had to drop our mmap_sem, now is the
+        * time to return to the upper layer and have it re-find the vma and
+        * redo the fault.
+        */
+       if (fpin) {
+               unlock_page(page);
+               goto out_retry;
+       }
+
+       /*
+        * Found the page and have a reference on it.
+        * We must recheck i_size under page lock; the whole huge page must fit.
+        */
+       max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
+       if (unlikely((offset | (HPAGE_PMD_NR - 1)) >= max_off)) {
+               unlock_page(page);
+               put_page(page);
+               return VM_FAULT_SIGBUS;
+       }
+
+       ret |= alloc_set_pte(vmf, NULL, page);
+       unlock_page(page);
+       if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
+               put_page(page);
+       return ret;
+
+page_not_uptodate:
+       ClearPageError(page);
+       fpin = maybe_unlock_mmap_for_io(vmf, fpin);
+       error = mapping->a_ops->readpage(file, page);
+       if (!error) {
+               wait_on_page_locked(page);
+               if (!PageUptodate(page))
+                       error = -EIO;
+       }
+       if (fpin)
+               goto out_retry;
+       put_page(page);
+
+       if (!error || error == AOP_TRUNCATED_PAGE)
+               goto retry_find;
+
+       /* Things didn't work out */
+       return VM_FAULT_SIGBUS;
+
+out_retry:
+       /*
+        * We dropped the mmap_sem, we need to return to the fault handler to
+        * re-find the vma and come back and find our hopefully still populated
+        * page.
+        */
+       if (page)
+               put_page(page);
+       if (fpin)
+               fput(fpin);
+       return ret | VM_FAULT_RETRY;
+}
+EXPORT_SYMBOL(filemap_huge_fault);
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
 void filemap_map_pages(struct vm_fault *vmf,
                pgoff_t start_pgoff, pgoff_t end_pgoff)
 {
-- 
2.23.0

Reply via email to