Add the meta_page to the per-container LRU. The reclaim algorithm has been
modified to make isolate_lru_pages() a pluggable component. The
scan_control data structure now carries the container on whose behalf
reclaim is carried out. try_to_free_pages() has been extended to become
container aware.

Signed-off-by: Balbir Singh <[EMAIL PROTECTED]>
---

 include/linux/memcontrol.h  |   11 +++
 include/linux/res_counter.h |   23 +++++++
 include/linux/swap.h        |    3 +
 mm/memcontrol.c             |   88 +++++++++++++++++++++++++++++-
 mm/swap.c                   |    2 
 mm/vmscan.c                 |  129 +++++++++++++++++++++++++++++++++++---------
 6 files changed, 230 insertions(+), 26 deletions(-)
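
A quick orientation note before the patch body: "pluggable" above means that
scan_control now carries an isolate_pages callback together with the container
being reclaimed from, so one shrinker core can serve both global and
per-container reclaim. The following toy, userspace-only C sketch models just
that wiring; the struct and function names here are simplified stand-ins, not
the kernel definitions (the real callback also takes a destination list, a
scanned counter, the order, the isolation mode, the zone and an active flag):

#include <stdio.h>

struct mem_container { int id; };	/* stand-in, not the real struct */

struct scan_control {
	struct mem_container *mem_container;	/* NULL => global reclaim */
	unsigned long (*isolate_pages)(unsigned long nr,
				       struct mem_container *mem);
};

static unsigned long isolate_pages_global(unsigned long nr,
					  struct mem_container *mem)
{
	(void)mem;			/* unused for global reclaim */
	printf("global: scan %lu pages off the zone LRU\n", nr);
	return nr;
}

static unsigned long mem_container_isolate(unsigned long nr,
					   struct mem_container *mem)
{
	printf("container %d: scan %lu pages off its own LRU\n", mem->id, nr);
	return nr;
}

static void shrink_list(struct scan_control *sc)
{
	/* The core reclaim loop is identical for both callers. */
	sc->isolate_pages(32, sc->mem_container);
}

int main(void)
{
	struct mem_container cont = { 1 };
	struct scan_control global = { NULL, isolate_pages_global };
	struct scan_control per_cont = { &cont, mem_container_isolate };

	shrink_list(&global);
	shrink_list(&per_cont);
	return 0;
}

In the patch itself, global reclaim plugs in isolate_pages_global() and
container reclaim plugs in mem_container_isolate_pages(); the rest of
shrink_inactive_list() and shrink_active_list() stays common.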

diff -puN include/linux/memcontrol.h~mem-control-lru-and-reclaim include/linux/memcontrol.h
--- linux-2.6.22-rc6/include/linux/memcontrol.h~mem-control-lru-and-reclaim	2007-07-04 15:05:31.000000000 -0700
+++ linux-2.6.22-rc6-balbir/include/linux/memcontrol.h	2007-07-04 15:05:31.000000000 -0700
@@ -26,6 +26,13 @@ extern void page_assign_meta_page(struct
 extern struct meta_page *page_get_meta_page(struct page *page);
 extern int mem_container_charge(struct page *page, struct mm_struct *mm);
 extern void mem_container_uncharge(struct meta_page *mp);
+extern void mem_container_move_lists(struct meta_page *mp, bool active);
+extern unsigned long mem_container_isolate_pages(unsigned long nr_to_scan,
+                                       struct list_head *dst,
+                                       unsigned long *scanned, int order,
+                                       int mode, struct zone *z,
+                                       struct mem_container *mem_cont,
+                                       int active);
 
 #else /* CONFIG_CONTAINER_MEM_CONT */
 static inline void mm_init_container(struct mm_struct *mm,
@@ -56,6 +63,10 @@ static inline void mem_container_uncharg
 {
 }
 
+static inline void mem_container_move_lists(struct meta_page *mp, bool active)
+{
+}
+
 #endif /* CONFIG_CONTAINER_MEM_CONT */
 
 #endif /* _LINUX_MEMCONTROL_H */
diff -puN include/linux/swap.h~mem-control-lru-and-reclaim include/linux/swap.h
--- linux-2.6.22-rc6/include/linux/swap.h~mem-control-lru-and-reclaim	2007-07-04 15:05:31.000000000 -0700
+++ linux-2.6.22-rc6-balbir/include/linux/swap.h	2007-07-04 15:05:31.000000000 -0700
@@ -6,6 +6,7 @@
 #include <linux/mmzone.h>
 #include <linux/list.h>
 #include <linux/sched.h>
+#include <linux/memcontrol.h>
 
 #include <asm/atomic.h>
 #include <asm/page.h>
@@ -191,6 +192,8 @@ extern void swap_setup(void);
 /* linux/mm/vmscan.c */
 extern unsigned long try_to_free_pages(struct zone **zones, int order,
                                        gfp_t gfp_mask);
+extern unsigned long try_to_free_mem_container_pages(struct mem_container *mem);
+extern int __isolate_lru_page(struct page *page, int mode);
 extern unsigned long shrink_all_memory(unsigned long nr_pages);
 extern int vm_swappiness;
 extern int remove_mapping(struct address_space *mapping, struct page *page);
diff -puN mm/memcontrol.c~mem-control-lru-and-reclaim mm/memcontrol.c
--- linux-2.6.22-rc6/mm/memcontrol.c~mem-control-lru-and-reclaim	2007-07-04 15:05:31.000000000 -0700
+++ linux-2.6.22-rc6-balbir/mm/memcontrol.c	2007-07-04 15:05:31.000000000 -0700
@@ -19,6 +19,8 @@
 #include <linux/page-flags.h>
 #include <linux/bit_spinlock.h>
 #include <linux/rcupdate.h>
+#include <linux/swap.h>
+#include <linux/spinlock.h>
 
 struct container_subsys mem_container_subsys;
 
@@ -103,6 +105,71 @@ void __always_inline unlock_meta_page(st
        bit_spin_unlock(PG_metapage, &page->flags);
 }
 
+unsigned long mem_container_isolate_pages(unsigned long nr_to_scan,
+                                       struct list_head *dst,
+                                       unsigned long *scanned, int order,
+                                       int mode, struct zone *z,
+                                       struct mem_container *mem_cont,
+                                       int active)
+{
+       unsigned long nr_taken = 0;
+       struct page *page;
+       unsigned long scan;
+       LIST_HEAD(mp_list);
+       struct list_head *src;
+       struct meta_page *mp;
+
+       if (active)
+               src = &mem_cont->active_list;
+       else
+               src = &mem_cont->inactive_list;
+
+       for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) {
+               mp = list_entry(src->prev, struct meta_page, list);
+               page = mp->page;
+
+               if (PageActive(page) && !active) {
+                       mem_container_move_lists(mp, true);
+                       scan--;
+                       continue;
+               }
+               if (!PageActive(page) && active) {
+                       mem_container_move_lists(mp, false);
+                       scan--;
+                       continue;
+               }
+
+               /*
+                * Reclaim, per zone
+                * TODO: make the active/inactive lists per zone
+                */
+               if (page_zone(page) != z)
+                       continue;
+
+               list_move(&mp->list, &mp_list);
+               if (__isolate_lru_page(page, mode) == 0) {
+                       list_move(&page->lru, dst);
+                       nr_taken++;
+               }
+       }
+
+       list_splice(&mp_list, src);
+
+       *scanned = scan;
+       return nr_taken;
+}
+
+/*
+ * This routine assumes that the appropriate zone's lru lock is already held
+ */
+void mem_container_move_lists(struct meta_page *mp, bool active)
+{
+       if (active)
+               list_move(&mp->list, &mp->mem_container->active_list);
+       else
+               list_move(&mp->list, &mp->mem_container->inactive_list);
+}
+
 /*
  * Charge the memory controller for page usage.
  * Return
@@ -159,8 +226,22 @@ int mem_container_charge(struct page *pa
         * If we created the meta_page, we should free it on exceeding
         * the container limit.
         */
-       if (res_counter_charge(&mem->res, 1))
+       while (res_counter_charge(&mem->res, 1)) {
+               if (try_to_free_mem_container_pages(mem))
+                       continue;
+
+               /*
+                * try_to_free_mem_container_pages() might not give us a full
+                * picture of reclaim. Some pages are reclaimed and might be
+                * moved to swap cache or just unmapped from the container.
+                * Check the limit again to see if the reclaim reduced the
+                * current usage of the container before giving up
+                */
+               if (res_counter_check_under_limit(&mem->res))
+                       continue;
+
                goto free_mp;
+       }
 
        lock_meta_page(page);
        /*
@@ -177,6 +258,8 @@ int mem_container_charge(struct page *pa
        mp->page = page;
        page_assign_meta_page(page, mp);
 
+       list_add(&mp->list, &mem->active_list);
+
 done:
        unlock_meta_page(page);
        return 0;
@@ -205,12 +288,15 @@ void mem_container_uncharge(struct meta_
 
        if (atomic_dec_and_test(&mp->ref_cnt)) {
                page = mp->page;
+
                lock_meta_page(page);
                mem = mp->mem_container;
                css_put(&mem->css);
                page_assign_meta_page(page, NULL);
                unlock_meta_page(page);
+
                res_counter_uncharge(&mem->res, 1);
+               list_del(&mp->list);
                kfree(mp);
        }
 }
diff -puN mm/swap.c~mem-control-lru-and-reclaim mm/swap.c
--- linux-2.6.22-rc6/mm/swap.c~mem-control-lru-and-reclaim	2007-07-04 15:05:31.000000000 -0700
+++ linux-2.6.22-rc6-balbir/mm/swap.c   2007-07-04 15:05:31.000000000 -0700
@@ -31,6 +31,7 @@
 #include <linux/cpu.h>
 #include <linux/notifier.h>
 #include <linux/init.h>
+#include <linux/memcontrol.h>
 
 /* How many pages do we try to swap or page in/out together? */
 int page_cluster;
@@ -148,6 +149,7 @@ void fastcall activate_page(struct page 
                SetPageActive(page);
                add_page_to_active_list(zone, page);
                __count_vm_event(PGACTIVATE);
+               mem_container_move_lists(page_get_meta_page(page), true);
        }
        spin_unlock_irq(&zone->lru_lock);
 }
diff -puN mm/vmscan.c~mem-control-lru-and-reclaim mm/vmscan.c
--- linux-2.6.22-rc6/mm/vmscan.c~mem-control-lru-and-reclaim	2007-07-04 15:05:31.000000000 -0700
+++ linux-2.6.22-rc6-balbir/mm/vmscan.c 2007-07-04 15:05:31.000000000 -0700
@@ -39,6 +39,7 @@
 #include <linux/delay.h>
 #include <linux/kthread.h>
 #include <linux/freezer.h>
+#include <linux/memcontrol.h>
 
 #include <asm/tlbflush.h>
 #include <asm/div64.h>
@@ -70,6 +71,15 @@ struct scan_control {
        int all_unreclaimable;
 
        int order;
+
+       /* Which container do we reclaim from */
+       struct mem_container *mem_container;
+
+       /* Pluggable isolate pages callback */
+       unsigned long (*isolate_pages)(unsigned long nr, struct list_head *dst,
+                       unsigned long *scanned, int order, int mode,
+                       struct zone *z, struct mem_container *mem_cont,
+                       int active);
 };
 
 #define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
@@ -604,7 +614,7 @@ keep:
  *
  * returns 0 on success, -ve errno on failure.
  */
-static int __isolate_lru_page(struct page *page, int mode)
+int __isolate_lru_page(struct page *page, int mode)
 {
        int ret = -EINVAL;
 
@@ -738,6 +748,21 @@ static unsigned long isolate_lru_pages(u
        return nr_taken;
 }
 
+static unsigned long isolate_pages_global(unsigned long nr,
+                                       struct list_head *dst,
+                                       unsigned long *scanned, int order,
+                                       int mode, struct zone *z,
+                                       struct mem_container *mem_cont,
+                                       int active)
+{
+       if (active)
+               return isolate_lru_pages(nr, &z->active_list, dst,
+                                               scanned, order, mode);
+       else
+               return isolate_lru_pages(nr, &z->inactive_list, dst,
+                                               scanned, order, mode);
+}
+
 /*
  * clear_active_flags() is a helper for shrink_active_list(), clearing
  * any active bits from the pages in the list.
@@ -779,11 +804,11 @@ static unsigned long shrink_inactive_lis
                unsigned long nr_freed;
                unsigned long nr_active;
 
-               nr_taken = isolate_lru_pages(sc->swap_cluster_max,
-                            &zone->inactive_list,
+               nr_taken = sc->isolate_pages(sc->swap_cluster_max,
                             &page_list, &nr_scan, sc->order,
                             (sc->order > PAGE_ALLOC_COSTLY_ORDER)?
-                                            ISOLATE_BOTH : ISOLATE_INACTIVE);
+                                            ISOLATE_BOTH : ISOLATE_INACTIVE,
+                               zone, sc->mem_container, 0);
                nr_active = clear_active_flags(&page_list);
 
                __mod_zone_page_state(zone, NR_ACTIVE, -nr_active);
@@ -932,8 +957,9 @@ force_reclaim_mapped:
 
        lru_add_drain();
        spin_lock_irq(&zone->lru_lock);
-       pgmoved = isolate_lru_pages(nr_pages, &zone->active_list,
-                           &l_hold, &pgscanned, sc->order, ISOLATE_ACTIVE);
+       pgmoved = sc->isolate_pages(nr_pages, &l_hold, &pgscanned, sc->order,
+                                       ISOLATE_ACTIVE, zone,
+                                       sc->mem_container, 1);
        zone->pages_scanned += pgscanned;
        __mod_zone_page_state(zone, NR_ACTIVE, -pgmoved);
        spin_unlock_irq(&zone->lru_lock);
@@ -947,10 +973,14 @@ force_reclaim_mapped:
                            (total_swap_pages == 0 && PageAnon(page)) ||
                            page_referenced(page, 0)) {
                                list_add(&page->lru, &l_active);
+                               mem_container_move_lists(
+                                       page_get_meta_page(page), true);
                                continue;
                        }
                } else if (TestClearPageReferenced(page)) {
                        list_add(&page->lru, &l_active);
+                       mem_container_move_lists(page_get_meta_page(page),
+                                                       true);
                        continue;
                }
                list_add(&page->lru, &l_inactive);
@@ -968,6 +998,7 @@ force_reclaim_mapped:
                ClearPageActive(page);
 
                list_move(&page->lru, &zone->inactive_list);
+               mem_container_move_lists(page_get_meta_page(page), false);
                pgmoved++;
                if (!pagevec_add(&pvec, page)) {
                        __mod_zone_page_state(zone, NR_INACTIVE, pgmoved);
@@ -996,6 +1027,7 @@ force_reclaim_mapped:
                SetPageLRU(page);
                VM_BUG_ON(!PageActive(page));
                list_move(&page->lru, &zone->active_list);
+               mem_container_move_lists(page_get_meta_page(page), true);
                pgmoved++;
                if (!pagevec_add(&pvec, page)) {
                        __mod_zone_page_state(zone, NR_ACTIVE, pgmoved);
@@ -1127,7 +1159,8 @@ static unsigned long shrink_zones(int pr
  * holds filesystem locks which prevent writeout this might not work, and the
  * allocation attempt will fail.
  */
-unsigned long try_to_free_pages(struct zone **zones, int order, gfp_t gfp_mask)
+unsigned long do_try_to_free_pages(struct zone **zones, gfp_t gfp_mask,
+                                       struct scan_control *sc)
 {
        int priority;
        int ret = 0;
@@ -1136,14 +1169,6 @@ unsigned long try_to_free_pages(struct z
        struct reclaim_state *reclaim_state = current->reclaim_state;
        unsigned long lru_pages = 0;
        int i;
-       struct scan_control sc = {
-               .gfp_mask = gfp_mask,
-               .may_writepage = !laptop_mode,
-               .swap_cluster_max = SWAP_CLUSTER_MAX,
-               .may_swap = 1,
-               .swappiness = vm_swappiness,
-               .order = order,
-       };
 
        delay_swap_prefetch();
        count_vm_event(ALLOCSTALL);
@@ -1159,17 +1184,22 @@ unsigned long try_to_free_pages(struct z
        }
 
        for (priority = DEF_PRIORITY; priority >= 0; priority--) {
-               sc.nr_scanned = 0;
+               sc->nr_scanned = 0;
                if (!priority)
                        disable_swap_token();
-               nr_reclaimed += shrink_zones(priority, zones, &sc);
-               shrink_slab(sc.nr_scanned, gfp_mask, lru_pages);
+               nr_reclaimed += shrink_zones(priority, zones, sc);
+               /*
+                * Don't shrink slabs when reclaiming memory from
+                * over limit containers
+                */
+               if (sc->mem_container == NULL)
+                       shrink_slab(sc->nr_scanned, gfp_mask, lru_pages);
                if (reclaim_state) {
                        nr_reclaimed += reclaim_state->reclaimed_slab;
                        reclaim_state->reclaimed_slab = 0;
                }
-               total_scanned += sc.nr_scanned;
-               if (nr_reclaimed >= sc.swap_cluster_max) {
+               total_scanned += sc->nr_scanned;
+               if (nr_reclaimed >= sc->swap_cluster_max) {
                        ret = 1;
                        goto out;
                }
@@ -1181,18 +1211,18 @@ unsigned long try_to_free_pages(struct z
                 * that's undesirable in laptop mode, where we *want* lumpy
                 * writeout.  So in laptop mode, write out the whole world.
                 */
-               if (total_scanned > sc.swap_cluster_max +
-                                       sc.swap_cluster_max / 2) {
+               if (total_scanned > sc->swap_cluster_max +
+                                       sc->swap_cluster_max / 2) {
                        wakeup_pdflush(laptop_mode ? 0 : total_scanned);
-                       sc.may_writepage = 1;
+                       sc->may_writepage = 1;
                }
 
                /* Take a nap, wait for some writeback to complete */
-               if (sc.nr_scanned && priority < DEF_PRIORITY - 2)
+               if (sc->nr_scanned && priority < DEF_PRIORITY - 2)
                        congestion_wait(WRITE, HZ/10);
        }
        /* top priority shrink_caches still had more to do? don't OOM, then */
-       if (!sc.all_unreclaimable)
+       if (!sc->all_unreclaimable && sc->mem_container == NULL)
                ret = 1;
 out:
        /*
@@ -1215,6 +1245,53 @@ out:
        return ret;
 }
 
+unsigned long try_to_free_pages(struct zone **zones, int order, gfp_t gfp_mask)
+{
+       struct scan_control sc = {
+               .gfp_mask = gfp_mask,
+               .may_writepage = !laptop_mode,
+               .swap_cluster_max = SWAP_CLUSTER_MAX,
+               .may_swap = 1,
+               .swappiness = vm_swappiness,
+               .order = order,
+               .mem_container = NULL,
+               .isolate_pages = isolate_pages_global,
+       };
+
+       return do_try_to_free_pages(zones, gfp_mask, &sc);
+}
+
+#ifdef CONFIG_CONTAINER_MEM_CONT
+
+#ifdef CONFIG_HIGHMEM
+#define ZONE_USERPAGES ZONE_HIGHMEM
+#else
+#define ZONE_USERPAGES ZONE_NORMAL
+#endif
+
+unsigned long try_to_free_mem_container_pages(struct mem_container *mem_cont)
+{
+       struct scan_control sc = {
+               .gfp_mask = GFP_KERNEL,
+               .may_swap = 1,
+               .swap_cluster_max = SWAP_CLUSTER_MAX,
+               .swappiness = vm_swappiness,
+               .order = 1,
+               .mem_container = mem_cont,
+               .isolate_pages = mem_container_isolate_pages,
+       };
+       int node;
+       struct zone **zones;
+
+       for_each_online_node(node) {
+               zones = NODE_DATA(node)->node_zonelists[ZONE_USERPAGES].zones;
+               if (do_try_to_free_pages(zones, sc.gfp_mask, &sc))
+                       return 1;
+       }
+       return 0;
+}
+#endif
+
 /*
  * For kswapd, balance_pgdat() will work across all this node's zones until
  * they are all at pages_high.
@@ -1250,6 +1327,8 @@ static unsigned long balance_pgdat(pg_da
                .swap_cluster_max = SWAP_CLUSTER_MAX,
                .swappiness = vm_swappiness,
                .order = order,
+               .mem_container = NULL,
+               .isolate_pages = isolate_pages_global,
        };
        /*
         * temp_priority is used to remember the scanning priority at which
diff -puN include/linux/res_counter.h~mem-control-lru-and-reclaim include/linux/res_counter.h
--- linux-2.6.22-rc6/include/linux/res_counter.h~mem-control-lru-and-reclaim	2007-07-04 15:05:31.000000000 -0700
+++ linux-2.6.22-rc6-balbir/include/linux/res_counter.h	2007-07-04 15:05:31.000000000 -0700
@@ -99,4 +99,27 @@ int res_counter_charge(struct res_counte
 void res_counter_uncharge_locked(struct res_counter *cnt, unsigned long val);
 void res_counter_uncharge(struct res_counter *cnt, unsigned long val);
 
+static inline bool res_counter_limit_check_locked(struct res_counter *cnt)
+{
+       if (cnt->usage < cnt->limit)
+               return true;
+
+       return false;
+}
+
+/*
+ * Helper function to detect if the container is within its limit or
+ * not. It's currently called from container_rss_prepare()
+ */
+static inline bool res_counter_check_under_limit(struct res_counter *cnt)
+{
+       bool ret;
+       unsigned long flags;
+
+       spin_lock_irqsave(&cnt->lock, flags);
+       ret = res_counter_limit_check_locked(cnt);
+       spin_unlock_irqrestore(&cnt->lock, flags);
+       return ret;
+}
+
 #endif
_
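
One more note on the charge path, since the retry loop is subtle: even when
try_to_free_mem_container_pages() reports failure, pages may have been
uncharged concurrently (or moved to swap cache), so the limit is re-checked
before giving up. Below is a toy, userspace-only model of that logic; the
functions are simplified stand-ins, keeping only the kernel API convention
that res_counter_charge() returns nonzero on failure:

#include <stdbool.h>
#include <stdio.h>

struct res_counter { unsigned long usage, limit; };

/* Returns true on failure, mirroring res_counter_charge()'s nonzero. */
static bool res_counter_charge(struct res_counter *cnt, unsigned long val)
{
	if (cnt->usage + val > cnt->limit)
		return true;
	cnt->usage += val;
	return false;
}

static bool res_counter_check_under_limit(struct res_counter *cnt)
{
	return cnt->usage < cnt->limit;
}

/* Simulated reclaim: it reports failure (0), but we pretend a concurrent
 * uncharge dropped usage while we were scanning. */
static int try_to_free_mem_container_pages(struct res_counter *cnt)
{
	cnt->usage -= 2;
	return 0;
}

int main(void)
{
	struct res_counter res = { 10, 10 };
	int tries = 0;

	while (res_counter_charge(&res, 1)) {
		if (++tries > 3)	/* toy guard; the kernel loop has no cap */
			return 1;
		if (try_to_free_mem_container_pages(&res))
			continue;
		/* Reclaim "failed", but usage may have dropped anyway. */
		if (res_counter_check_under_limit(&res))
			continue;
		return 1;		/* genuinely over limit: "goto free_mp" */
	}
	printf("charged after %d reclaim pass(es), usage=%lu\n",
	       tries, res.usage);
	return 0;
}

Running this prints "charged after 1 reclaim pass(es), usage=9": the charge
succeeds only because of the extra limit re-check, which is exactly the case
the comment in mem_container_charge() describes.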

-- 
        Warm Regards,
        Balbir Singh
        Linux Technology Center
        IBM, ISTL