memory_failure_dev_pagemap() can currently handle only a single-mapped
dax page in fsdax mode.  Once the reflink feature and fsdax mode work
together, a dax page may be mapped by multiple files at multiple
offsets.  Refactor the current implementation so that memory failure is
handled for each file and offset that maps the failing page, and turn
dax_lock_page()/dax_unlock_page() into dax_lock()/dax_unlock(), which
take a (mapping, index) pair and report the pfn of the locked entry
back to the caller.

Signed-off-by: Shiyang Ruan <ruansy.f...@cn.fujitsu.com>
---
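Note for reviewers: dax_lock_page()/dax_unlock_page() become
dax_lock()/dax_unlock(), which take a (mapping, index) pair instead of
a page and report the pfn of the locked entry back through @pfnp.  A
minimal sketch of the new calling convention follows; the helper name
is made up for illustration and is not part of this patch:

    static int example_act_on_entry(struct address_space *mapping,
                                    pgoff_t index)
    {
            unsigned long pfn;
            dax_entry_t cookie;

            /* Lock the entry by file and offset; the pfn is reported back. */
            cookie = dax_lock(mapping, index, &pfn);
            if (!cookie)
                    return -EBUSY;
            /* ... act on pfn_to_page(pfn) while the entry is locked ... */
            dax_unlock(mapping, index, cookie);
            return 0;
    }
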
 fs/dax.c            |  28 ++++++------
 include/linux/dax.h |   5 +-
 include/linux/mm.h  |   9 ++++
 mm/memory-failure.c | 112 +++++++++++++++++++++++++++++++++-----------
 4 files changed, 110 insertions(+), 44 deletions(-)
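
The new mf_dax_mapping_kill_procs() export is intended for a filesystem
that tracks reflink sharing: it handles the failure for one (file,
offset) pair that maps the bad pfn.  A hypothetical caller could
iterate its own reverse-mapping records as in the sketch below; the
my_fs_* names are assumptions for illustration, only
mf_dax_mapping_kill_procs() is introduced by this patch:

    struct my_fs_mapped_range {
            struct address_space *mapping;  /* file that maps the bad block */
            pgoff_t index;                  /* page offset within that file */
    };

    static int my_fs_notify_failure(struct my_fs_mapped_range *ranges,
                                    int nr_ranges, int mf_flags)
    {
            int i, rc;

            /* Handle the failure once per (file, offset) sharing the pfn. */
            for (i = 0; i < nr_ranges; i++) {
                    rc = mf_dax_mapping_kill_procs(ranges[i].mapping,
                                                   ranges[i].index, mf_flags);
                    if (rc)
                            return rc;
            }
            return 0;
    }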

diff --git a/fs/dax.c b/fs/dax.c
index 5b47834f2e1b..1c17c4605bc0 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -379,14 +379,17 @@ static struct page *dax_busy_page(void *entry)
 }
 
 /*
- * dax_lock_mapping_entry - Lock the DAX entry corresponding to a page
- * @page: The page whose entry we want to lock
+ * dax_lock - Lock the DAX entry at the given mapping and index
+ * @mapping: The address_space whose entry we want to lock
+ * @index:   The page offset at which the DAX entry is located
+ * @pfnp:    Page frame number of the locked entry, returned to the caller
  *
  * Context: Process context.
- * Return: A cookie to pass to dax_unlock_page() or 0 if the entry could
- * not be locked.
+ * Return: A cookie to pass to dax_unlock() or 0 if the entry could not
+ * be locked.
  */
-dax_entry_t dax_lock_page(struct page *page)
+dax_entry_t dax_lock(struct address_space *mapping, unsigned long index,
+               unsigned long *pfnp)
 {
        XA_STATE(xas, NULL, 0);
        void *entry;
@@ -394,8 +397,6 @@ dax_entry_t dax_lock_page(struct page *page)
        /* Ensure page->mapping isn't freed while we look at it */
        rcu_read_lock();
        for (;;) {
-               struct address_space *mapping = READ_ONCE(page->mapping);
-
                entry = NULL;
                if (!mapping || !dax_mapping(mapping))
                        break;
@@ -413,11 +414,7 @@ dax_entry_t dax_lock_page(struct page *page)
 
                xas.xa = &mapping->i_pages;
                xas_lock_irq(&xas);
-               if (mapping != page->mapping) {
-                       xas_unlock_irq(&xas);
-                       continue;
-               }
-               xas_set(&xas, page->index);
+               xas_set(&xas, index);
                entry = xas_load(&xas);
                if (dax_is_locked(entry)) {
                        rcu_read_unlock();
@@ -427,16 +424,17 @@ dax_entry_t dax_lock_page(struct page *page)
                }
                dax_lock_entry(&xas, entry);
                xas_unlock_irq(&xas);
+               *pfnp = dax_to_pfn(entry);
                break;
        }
        rcu_read_unlock();
        return (dax_entry_t)entry;
 }
 
-void dax_unlock_page(struct page *page, dax_entry_t cookie)
+void dax_unlock(struct address_space *mapping, unsigned long index,
+               dax_entry_t cookie)
 {
-       struct address_space *mapping = page->mapping;
-       XA_STATE(xas, &mapping->i_pages, page->index);
+       XA_STATE(xas, &mapping->i_pages, index);
 
        if (S_ISCHR(mapping->host->i_mode))
                return;
diff --git a/include/linux/dax.h b/include/linux/dax.h
index b52f084aa643..b24675af1de8 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -150,8 +150,9 @@ int dax_writeback_mapping_range(struct address_space *mapping,
 
 struct page *dax_layout_busy_page(struct address_space *mapping);
 struct page *dax_layout_busy_page_range(struct address_space *mapping, loff_t start, loff_t end);
-dax_entry_t dax_lock_page(struct page *page);
-void dax_unlock_page(struct page *page, dax_entry_t cookie);
+dax_entry_t dax_lock(struct address_space *mapping, unsigned long index, unsigned long *pfnp);
+void dax_unlock(struct address_space *mapping, unsigned long index,
+               dax_entry_t cookie);
 #else
 static inline bool bdev_dax_supported(struct block_device *bdev,
                int blocksize)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index db6ae4d3fb4e..db3059a1853e 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1141,6 +1141,14 @@ static inline bool is_device_private_page(const struct page *page)
                page->pgmap->type == MEMORY_DEVICE_PRIVATE;
 }
 
+static inline bool is_device_fsdax_page(const struct page *page)
+{
+       return IS_ENABLED(CONFIG_DEV_PAGEMAP_OPS) &&
+               IS_ENABLED(CONFIG_FS_DAX) &&
+               is_zone_device_page(page) &&
+               page->pgmap->type == MEMORY_DEVICE_FS_DAX;
+}
+
 static inline bool is_pci_p2pdma_page(const struct page *page)
 {
        return IS_ENABLED(CONFIG_DEV_PAGEMAP_OPS) &&
@@ -3030,6 +3038,7 @@ enum mf_flags {
        MF_MUST_KILL = 1 << 2,
        MF_SOFT_OFFLINE = 1 << 3,
 };
+extern int mf_dax_mapping_kill_procs(struct address_space *mapping, pgoff_t index, int flags);
 extern int memory_failure(unsigned long pfn, int flags);
 extern void memory_failure_queue(unsigned long pfn, int flags);
 extern void memory_failure_queue_kick(int cpu);
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 5d880d4eb9a2..03a4f4c1b803 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -120,6 +120,9 @@ static int hwpoison_filter_dev(struct page *p)
        if (PageSlab(p))
                return -EINVAL;
 
+       if (is_device_fsdax_page(p))
+               return 0;
+
        mapping = page_mapping(p);
        if (mapping == NULL || mapping->host == NULL)
                return -EINVAL;
@@ -290,9 +293,8 @@ void shake_page(struct page *p, int access)
 EXPORT_SYMBOL_GPL(shake_page);
 
 static unsigned long dev_pagemap_mapping_shift(struct page *page,
-               struct vm_area_struct *vma)
+               struct vm_area_struct *vma, unsigned long address)
 {
-       unsigned long address = vma_address(page, vma);
        pgd_t *pgd;
        p4d_t *p4d;
        pud_t *pud;
@@ -333,8 +335,8 @@ static unsigned long dev_pagemap_mapping_shift(struct page *page,
  * Uses GFP_ATOMIC allocations to avoid potential recursions in the VM.
  */
 static void add_to_kill(struct task_struct *tsk, struct page *p,
-                      struct vm_area_struct *vma,
-                      struct list_head *to_kill)
+                       struct address_space *mapping, pgoff_t pgoff,
+                       struct vm_area_struct *vma, struct list_head *to_kill)
 {
        struct to_kill *tk;
 
@@ -345,9 +347,12 @@ static void add_to_kill(struct task_struct *tsk, struct page *p,
        }
 
        tk->addr = page_address_in_vma(p, vma);
-       if (is_zone_device_page(p))
-               tk->size_shift = dev_pagemap_mapping_shift(p, vma);
-       else
+       if (is_zone_device_page(p)) {
+               if (is_device_fsdax_page(p))
+                       tk->addr = vma->vm_start +
+                                       ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
+               tk->size_shift = dev_pagemap_mapping_shift(p, vma, tk->addr);
+       } else
                tk->size_shift = page_shift(compound_head(p));
 
        /*
@@ -495,7 +500,7 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill,
                        if (!page_mapped_in_vma(page, vma))
                                continue;
                        if (vma->vm_mm == t->mm)
-                               add_to_kill(t, page, vma, to_kill);
+                               add_to_kill(t, page, NULL, 0, vma, to_kill);
                }
        }
        read_unlock(&tasklist_lock);
@@ -505,24 +510,19 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill,
 /*
  * Collect processes when the error hit a file mapped page.
  */
-static void collect_procs_file(struct page *page, struct list_head *to_kill,
-                               int force_early)
+static void collect_procs_file(struct page *page, struct address_space *mapping,
+               pgoff_t pgoff, struct list_head *to_kill, int force_early)
 {
        struct vm_area_struct *vma;
        struct task_struct *tsk;
-       struct address_space *mapping = page->mapping;
-       pgoff_t pgoff;
 
        i_mmap_lock_read(mapping);
        read_lock(&tasklist_lock);
-       pgoff = page_to_pgoff(page);
        for_each_process(tsk) {
                struct task_struct *t = task_early_kill(tsk, force_early);
-
                if (!t)
                        continue;
-               vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff,
-                                     pgoff) {
+               vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
                        /*
                         * Send early kill signal to tasks where a vma covers
                         * the page but the corrupted page is not necessarily
@@ -531,7 +531,7 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill,
                         * to be informed of all such data corruptions.
                         */
                        if (vma->vm_mm == t->mm)
-                               add_to_kill(t, page, vma, to_kill);
+                               add_to_kill(t, page, mapping, pgoff, vma, to_kill);
                }
        }
        read_unlock(&tasklist_lock);
@@ -550,7 +550,8 @@ static void collect_procs(struct page *page, struct list_head *tokill,
        if (PageAnon(page))
                collect_procs_anon(page, tokill, force_early);
        else
-               collect_procs_file(page, tokill, force_early);
+               collect_procs_file(page, page->mapping, page_to_pgoff(page),
+                                  tokill, force_early);
 }
 
 static const char *action_name[] = {
@@ -1147,6 +1148,60 @@ static int try_to_split_thp_page(struct page *page, const char *msg)
        return 0;
 }
 
+int mf_dax_mapping_kill_procs(struct address_space *mapping, pgoff_t index, int flags)
+{
+       const bool unmap_success = true;
+       unsigned long pfn, size = 0;
+       struct to_kill *tk;
+       LIST_HEAD(to_kill);
+       int rc = -EBUSY;
+       loff_t start;
+       dax_entry_t cookie;
+
+       /*
+        * Prevent the inode from being freed while we are interrogating
+        * the address_space, typically this would be handled by
+        * lock_page(), but dax pages do not use the page lock. This
+        * also prevents changes to the mapping of this pfn until
+        * poison signaling is complete.
+        */
+       cookie = dax_lock(mapping, index, &pfn);
+       if (!cookie)
+               return rc;
+
+       /*
+        * Unlike System-RAM there is no possibility to swap in a
+        * different physical page at a given virtual address, so all
+        * userspace consumption of ZONE_DEVICE memory necessitates
+        * SIGBUS (i.e. MF_MUST_KILL)
+        */
+       flags |= MF_ACTION_REQUIRED | MF_MUST_KILL;
+       collect_procs_file(pfn_to_page(pfn), mapping, index, &to_kill,
+                          flags & MF_ACTION_REQUIRED);
+
+       list_for_each_entry(tk, &to_kill, nd)
+               if (tk->size_shift)
+                       size = max(size, 1UL << tk->size_shift);
+       if (size) {
+               /*
+                * Unmap the largest mapping to avoid breaking up
+                * device-dax mappings which are constant size. The
+                * actual size of the mapping being torn down is
+                * communicated in siginfo, see kill_proc()
+                */
+               start = (index << PAGE_SHIFT) & ~(size - 1);
+               unmap_mapping_range(mapping, start, start + size, 0);
+       }
+
+       kill_procs(&to_kill, flags & MF_MUST_KILL, !unmap_success,
+                  pfn, flags);
+       rc = 0;
+
+       dax_unlock(mapping, index, cookie);
+       return rc;
+}
+EXPORT_SYMBOL_GPL(mf_dax_mapping_kill_procs);
+
 static int memory_failure_hugetlb(unsigned long pfn, int flags)
 {
        struct page *p = pfn_to_page(pfn);
@@ -1223,10 +1278,12 @@ static int memory_failure_dev_pagemap(unsigned long pfn, int flags,
                struct dev_pagemap *pgmap)
 {
        struct page *page = pfn_to_page(pfn);
+       struct address_space *mapping = page->mapping;
+       pgoff_t index = page->index;
        const bool unmap_success = true;
-       unsigned long size = 0;
+       unsigned long size = 0, dummy_pfn;
        struct to_kill *tk;
-       LIST_HEAD(tokill);
+       LIST_HEAD(to_kill);
        int rc = -EBUSY;
        loff_t start;
        dax_entry_t cookie;
@@ -1238,7 +1295,7 @@ static int memory_failure_dev_pagemap(unsigned long pfn, int flags,
         * also prevents changes to the mapping of this pfn until
         * poison signaling is complete.
         */
-       cookie = dax_lock_page(page);
+       cookie = dax_lock(mapping, index, &dummy_pfn);
        if (!cookie)
                goto out;
 
@@ -1268,9 +1325,10 @@ static int memory_failure_dev_pagemap(unsigned long pfn, int flags,
         * SIGBUS (i.e. MF_MUST_KILL)
         */
        flags |= MF_ACTION_REQUIRED | MF_MUST_KILL;
-       collect_procs(page, &tokill, flags & MF_ACTION_REQUIRED);
+       collect_procs_file(page, mapping, index, &to_kill,
+                          flags & MF_ACTION_REQUIRED);
 
-       list_for_each_entry(tk, &tokill, nd)
+       list_for_each_entry(tk, &to_kill, nd)
                if (tk->size_shift)
                        size = max(size, 1UL << tk->size_shift);
        if (size) {
@@ -1280,13 +1338,13 @@ static int memory_failure_dev_pagemap(unsigned long pfn, int flags,
                 * actual size of the mapping being torn down is
                 * communicated in siginfo, see kill_proc()
                 */
-               start = (page->index << PAGE_SHIFT) & ~(size - 1);
-               unmap_mapping_range(page->mapping, start, start + size, 0);
+               start = (index << PAGE_SHIFT) & ~(size - 1);
+               unmap_mapping_range(mapping, start, start + size, 0);
        }
-       kill_procs(&tokill, flags & MF_MUST_KILL, !unmap_success, pfn, flags);
+       kill_procs(&to_kill, flags & MF_MUST_KILL, !unmap_success, pfn, flags);
        rc = 0;
 unlock:
-       dax_unlock_page(page, cookie);
+       dax_unlock(mapping, index, cookie);
 out:
        /* drop pgmap ref acquired in caller */
        put_dev_pagemap(pgmap);
-- 
2.29.2