This patch makes to use remove_mapping in irq context.
For it, this patch adds irqcontext argument in some functions
(ex, remove_maping and __remove_mapping) but these functions
are not hot path so i believe it's not a problem.

And it exports swap_info_get and check that we can get a
swap_info_struct->s lock in advance in irq context.
Because __swapcache_free must be succesful.
Otherwise, If __swapcache_free can be failed, we should rollback
things done by __delete_from_swap_cache and it could need memory
allocation for radix tree node in irq context.

More concern is handling for mem_cgroup_uncharge_swapcache in
irqcontext, which isn't aware of irqcontext at the moment and it
should be successful like above reason.

After I review that code, I think it's not a big challenge if I missed
somethings.

My rough plan is following as,

1. Make mctz->lock beging aware of irq by changing spin_lock with
   spin_lock_irqsave.
2. Introuduce new argument "locked" in __mem_cgroup_uncharge_common
   so that __mem_cgroup_uncharge_common can avoid lock_page_cgroup in
   irqcontext to avoid deadlock but caller in irqcontext should be held
   it in advance by next patch.
3. Introduce try_lock_page_cgroup, which will be used __swapcache_free.
4. __remove_mapping can held a page_cgroup lock in advance before calling
   __swapcache_free

I'd like to listen memcg people's opinions before diving into coding.

Signed-off-by: Minchan Kim <minc...@kernel.org>
---
 fs/splice.c          |  2 +-
 include/linux/swap.h | 12 ++++++++++-
 mm/swapfile.c        |  2 +-
 mm/truncate.c        |  2 +-
 mm/vmscan.c          | 56 +++++++++++++++++++++++++++++++++++++++++++---------
 5 files changed, 61 insertions(+), 13 deletions(-)

diff --git a/fs/splice.c b/fs/splice.c
index e6b2559..db77694 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -70,7 +70,7 @@ static int page_cache_pipe_buf_steal(struct pipe_inode_info 
*pipe,
                 * If we succeeded in removing the mapping, set LRU flag
                 * and return good.
                 */
-               if (remove_mapping(mapping, page)) {
+               if (remove_mapping(mapping, page, false)) {
                        buf->flags |= PIPE_BUF_FLAG_LRU;
                        return 0;
                }
diff --git a/include/linux/swap.h b/include/linux/swap.h
index ca031f7..eb126d2 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -274,7 +274,8 @@ extern unsigned long mem_cgroup_shrink_node_zone(struct 
mem_cgroup *mem,
                                                unsigned long *nr_scanned);
 extern unsigned long shrink_all_memory(unsigned long nr_pages);
 extern int vm_swappiness;
-extern int remove_mapping(struct address_space *mapping, struct page *page);
+extern int remove_mapping(struct address_space *mapping, struct page *page,
+                               bool irqcontext);
 extern unsigned long vm_total_pages;
 
 #ifdef CONFIG_NUMA
@@ -407,6 +408,9 @@ mem_cgroup_uncharge_swapcache(struct page *page, 
swp_entry_t ent, bool swapout)
 }
 #endif
 
+
+extern struct swap_info_struct *swap_info_get(swp_entry_t entry);
+
 #else /* CONFIG_SWAP */
 
 #define get_nr_swap_pages()                    0L
@@ -430,6 +434,12 @@ static inline void show_swap_cache_info(void)
 #define free_swap_and_cache(swp)       is_migration_entry(swp)
 #define swapcache_prepare(swp)         is_migration_entry(swp)
 
+
+struct swap_info_struct *swap_info_get(swp_entry_t entry)
+{
+       return NULL;
+}
+
 static inline int add_swap_count_continuation(swp_entry_t swp, gfp_t gfp_mask)
 {
        return 0;
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 33ebdd5..8a425d4 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -505,7 +505,7 @@ swp_entry_t get_swap_page_of_type(int type)
        return (swp_entry_t) {0};
 }
 
-static struct swap_info_struct *swap_info_get(swp_entry_t entry)
+struct swap_info_struct *swap_info_get(swp_entry_t entry)
 {
        struct swap_info_struct *p;
        unsigned long offset, type;
diff --git a/mm/truncate.c b/mm/truncate.c
index c75b736..fa1dc60 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -131,7 +131,7 @@ invalidate_complete_page(struct address_space *mapping, 
struct page *page)
        if (page_has_private(page) && !try_to_release_page(page, 0))
                return 0;
 
-       ret = remove_mapping(mapping, page);
+       ret = remove_mapping(mapping, page, false);
 
        return ret;
 }
diff --git a/mm/vmscan.c b/mm/vmscan.c
index fa6a853..d14c9be 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -450,12 +450,18 @@ static pageout_t pageout(struct page *page, struct 
address_space *mapping,
  * Same as remove_mapping, but if the page is removed from the mapping, it
  * gets returned with a refcount of 0.
  */
-static int __remove_mapping(struct address_space *mapping, struct page *page)
+static int __remove_mapping(struct address_space *mapping, struct page *page,
+                               bool irqcontext)
 {
+       unsigned long flags;
+
        BUG_ON(!PageLocked(page));
        BUG_ON(mapping != page_mapping(page));
 
-       spin_lock_irq(&mapping->tree_lock);
+       if (irqcontext)
+               spin_lock_irqsave(&mapping->tree_lock, flags);
+       else
+               spin_lock_irq(&mapping->tree_lock);
        /*
         * The non racy check for a busy page.
         *
@@ -490,17 +496,45 @@ static int __remove_mapping(struct address_space 
*mapping, struct page *page)
        }
 
        if (PageSwapCache(page)) {
+               struct swap_info_struct *p;
                swp_entry_t swap = { .val = page_private(page) };
+               p = swap_info_get(swap);
+               /*
+                * If we are irq context, check that we can get a
+                * swap_info_strcut->lock before removing the page from
+                * swap cache. Because __swapcache_free must be successful.
+                * If __swapcache_free can be failed, we should rollback
+                * things done by __delete_from_swap_cache and it needs
+                * memory allocation for radix tree node in irqcontext
+                * That's thing we really want to avoid.
+                * TODO : memcg mem_cgroup_uncharge_swapcache handling
+                * in irqcontext
+                */
+               if (irqcontext && p && !spin_trylock(&p->lock)) {
+                       page_unfreeze_refs(page, 2);
+                       goto cannot_free;
+               }
+
                __delete_from_swap_cache(page);
-               spin_unlock_irq(&mapping->tree_lock);
-               swapcache_free(swap, page);
+               if (irqcontext) {
+                       spin_unlock_irqrestore(&mapping->tree_lock, flags);
+                       if (p)
+                               __swapcache_free(p, swap, page);
+                       spin_unlock(&p->lock);
+               } else {
+                       spin_unlock_irq(&mapping->tree_lock);
+                       swapcache_free(swap, page);
+               }
        } else {
                void (*freepage)(struct page *);
 
                freepage = mapping->a_ops->freepage;
 
                __delete_from_page_cache(page);
-               spin_unlock_irq(&mapping->tree_lock);
+               if (irqcontext)
+                       spin_unlock_irqrestore(&mapping->tree_lock, flags);
+               else
+                       spin_unlock_irq(&mapping->tree_lock);
                mem_cgroup_uncharge_cache_page(page);
 
                if (freepage != NULL)
@@ -510,7 +544,10 @@ static int __remove_mapping(struct address_space *mapping, 
struct page *page)
        return 1;
 
 cannot_free:
-       spin_unlock_irq(&mapping->tree_lock);
+       if (irqcontext)
+               spin_unlock_irqrestore(&mapping->tree_lock, flags);
+       else
+               spin_unlock_irq(&mapping->tree_lock);
        return 0;
 }
 
@@ -520,9 +557,10 @@ cannot_free:
  * successfully detached, return 1.  Assumes the caller has a single ref on
  * this page.
  */
-int remove_mapping(struct address_space *mapping, struct page *page)
+int remove_mapping(struct address_space *mapping, struct page *page,
+                       bool irqcontext)
 {
-       if (__remove_mapping(mapping, page)) {
+       if (__remove_mapping(mapping, page, irqcontext)) {
                /*
                 * Unfreezing the refcount with 1 rather than 2 effectively
                 * drops the pagecache ref for us without requiring another
@@ -904,7 +942,7 @@ static unsigned long shrink_page_list(struct list_head 
*page_list,
                        }
                }
 
-               if (!mapping || !__remove_mapping(mapping, page))
+               if (!mapping || !__remove_mapping(mapping, page, false))
                        goto keep_locked;
 
                /*
-- 
1.8.2.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Reply via email to