Changelog v2

1. Take reference on mem->css in pushback (YAMAMOTO Takashi)
2. Instead of trying to reclaim all nr_pages over the soft limit at once,
   reclaim one swap cluster (SWAP_CLUSTER_MAX pages) at a time
   (KAMEZAWA Hiroyuki)

The global list of all cgroups over their soft limit is scanned under
memory pressure. We call mem_cgroup_pushback_groups_over_soft_limit
from __alloc_pages() prior to calling try_to_free_pages(), in an attempt
to rescue memory from groups that are using memory above their soft limit.
If this attempt is unsuccessful, we call try_to_free_pages() and take
the normal global reclaim path.


Signed-off-by: Balbir Singh <[EMAIL PROTECTED]>
---

 include/linux/memcontrol.h  |    9 +++++
 include/linux/res_counter.h |   11 ++++++
 include/linux/swap.h        |    4 +-
 mm/memcontrol.c             |   76 ++++++++++++++++++++++++++++++++++++++++----
 mm/page_alloc.c             |   10 +++++
 mm/vmscan.c                 |   12 ++++--
 6 files changed, 110 insertions(+), 12 deletions(-)

diff -puN include/linux/memcontrol.h~memory-controller-reclaim-on-contention include/linux/memcontrol.h
--- linux-2.6.25-rc2/include/linux/memcontrol.h~memory-controller-reclaim-on-contention 2008-02-19 12:31:51.000000000 +0530
+++ linux-2.6.25-rc2-balbir/include/linux/memcontrol.h  2008-02-19 12:31:51.000000000 +0530
@@ -71,6 +71,8 @@ extern long mem_cgroup_calc_reclaim_acti
                                struct zone *zone, int priority);
 extern long mem_cgroup_calc_reclaim_inactive(struct mem_cgroup *mem,
                                struct zone *zone, int priority);
+extern unsigned long
+mem_cgroup_pushback_groups_over_soft_limit(struct zone **zones, gfp_t gfp_mask);
 
 #else /* CONFIG_CGROUP_MEM_CONT */
 static inline void mm_init_cgroup(struct mm_struct *mm,
@@ -179,6 +181,13 @@ static inline long mem_cgroup_calc_recla
 {
        return 0;
 }
+
+static inline unsigned long
+mem_cgroup_pushback_groups_over_soft_limit(struct zone **zones, gfp_t gfp_mask)
+{
+       return 0;       /* !CONFIG_CGROUP_MEM_CONT stub: nothing reclaimed */
+}
+
 #endif /* CONFIG_CGROUP_MEM_CONT */
 
 #endif /* _LINUX_MEMCONTROL_H */
diff -puN include/linux/res_counter.h~memory-controller-reclaim-on-contention include/linux/res_counter.h
--- linux-2.6.25-rc2/include/linux/res_counter.h~memory-controller-reclaim-on-contention        2008-02-19 12:31:51.000000000 +0530
+++ linux-2.6.25-rc2-balbir/include/linux/res_counter.h 2008-02-19 12:31:51.000000000 +0530
@@ -140,4 +140,15 @@ static inline bool res_counter_check_und
        return ret;
 }
 
+static inline long long res_counter_sl_excess(struct res_counter *cnt)
+{
+       unsigned long flags;
+       long long ret;  /* usage minus soft limit; callers treat <= 0 as "not over" */
+
+       spin_lock_irqsave(&cnt->lock, flags);   /* sample both fields under cnt->lock */
+       ret = cnt->usage - cnt->soft_limit;
+       spin_unlock_irqrestore(&cnt->lock, flags);
+       return ret;
+}
+
 #endif
diff -puN include/linux/swap.h~memory-controller-reclaim-on-contention include/linux/swap.h
--- linux-2.6.25-rc2/include/linux/swap.h~memory-controller-reclaim-on-contention       2008-02-19 12:31:51.000000000 +0530
+++ linux-2.6.25-rc2-balbir/include/linux/swap.h        2008-02-19 12:31:51.000000000 +0530
@@ -184,7 +184,9 @@ extern void swap_setup(void);
 extern unsigned long try_to_free_pages(struct zone **zones, int order,
                                        gfp_t gfp_mask);
 extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem,
-                                                       gfp_t gfp_mask);
+                                                       gfp_t gfp_mask,
+                                                       unsigned long nr_pages,
+                                                       struct zone **zones);
 extern int __isolate_lru_page(struct page *page, int mode);
 extern unsigned long shrink_all_memory(unsigned long nr_pages);
 extern int vm_swappiness;
diff -puN mm/memcontrol.c~memory-controller-reclaim-on-contention mm/memcontrol.c
--- linux-2.6.25-rc2/mm/memcontrol.c~memory-controller-reclaim-on-contention    2008-02-19 12:31:51.000000000 +0530
+++ linux-2.6.25-rc2-balbir/mm/memcontrol.c     2008-02-19 12:31:51.000000000 +0530
@@ -35,7 +35,7 @@
 
 struct cgroup_subsys mem_cgroup_subsys;
 static const int MEM_CGROUP_RECLAIM_RETRIES = 5;
-static spinlock_t mem_cgroup_sl_list_lock;     /* spin lock that protects */
+static rwlock_t mem_cgroup_sl_list_lock;       /* spin lock that protects */
                                                /* the list of cgroups over*/
                                                /* their soft limit */
 static struct list_head mem_cgroup_sl_exceeded_list;
@@ -646,7 +646,8 @@ retry:
                if (!(gfp_mask & __GFP_WAIT))
                        goto out;
 
-               if (try_to_free_mem_cgroup_pages(mem, gfp_mask))
+               if (try_to_free_mem_cgroup_pages(mem, gfp_mask,
+                                                       SWAP_CLUSTER_MAX, NULL))
                        continue;
 
                /*
@@ -692,11 +693,11 @@ retry:
         * cgroups over their soft limit
         */
        if (!res_counter_check_under_limit(&mem->res, RES_SOFT_LIMIT)) {
-               spin_lock_irqsave(&mem_cgroup_sl_list_lock, flags);
+               write_lock_irqsave(&mem_cgroup_sl_list_lock, flags);
                if (list_empty(&mem->sl_exceeded_list))
                        list_add_tail(&mem->sl_exceeded_list,
                                                &mem_cgroup_sl_exceeded_list);
-               spin_unlock_irqrestore(&mem_cgroup_sl_list_lock, flags);
+               write_unlock_irqrestore(&mem_cgroup_sl_list_lock, flags);
        }
 
        mz = page_cgroup_zoneinfo(pc);
@@ -928,7 +929,64 @@ out:
        return ret;
 }
 
+/*
+ * Reclaim memory from every control group that is over its soft limit.
+ * @zones and @gfp_mask are handed to try_to_free_mem_cgroup_pages().
+ * Returns the number of groups for which reclaim reported progress.
+ * The list lock is taken for writing: entries are moved between lists,
+ * which a read lock would not protect against concurrent callers.
+ */
+unsigned long mem_cgroup_pushback_groups_over_soft_limit(struct zone **zones,
+                                                               gfp_t gfp_mask)
+{
+       struct mem_cgroup *mem;
+       long long nr_bytes_over_sl;
+       unsigned long ret = 0;
+       unsigned long flags;
+       struct list_head reclaimed_groups;
+
+       INIT_LIST_HEAD(&reclaimed_groups);
+       write_lock_irqsave(&mem_cgroup_sl_list_lock, flags);
+       while (!list_empty(&mem_cgroup_sl_exceeded_list)) {
+               mem = list_first_entry(&mem_cgroup_sl_exceeded_list,
+                               struct mem_cgroup, sl_exceeded_list);
+               css_get(&mem->css);
+               list_move(&mem->sl_exceeded_list, &reclaimed_groups);
+               write_unlock_irqrestore(&mem_cgroup_sl_list_lock, flags);
+
+               nr_bytes_over_sl = res_counter_sl_excess(&mem->res);
+               if (nr_bytes_over_sl <= 0)
+                       goto next;
+               /* Go at it SWAP_CLUSTER_MAX at a time, not the whole excess */
+               ret += try_to_free_mem_cgroup_pages(mem, gfp_mask,
+                                               SWAP_CLUSTER_MAX, zones);
+next:
+               css_put(&mem->css);
+               write_lock_irqsave(&mem_cgroup_sl_list_lock, flags);
+       }
 
+       while (!list_empty(&reclaimed_groups)) {
+               /*
+                * Check again to see if we've gone below the soft
+                * limit. XXX: Consider giving up the &mem_cgroup_sl_list_lock
+                * before calling res_counter_sl_excess.
+                */
+               mem = list_first_entry(&reclaimed_groups, struct mem_cgroup,
+                                       sl_exceeded_list);
+               /*
+                * NOTE: no css reference is taken here; the list lock is
+                * held and pre_destroy unlinks a group under the same lock
+                */
+               nr_bytes_over_sl = res_counter_sl_excess(&mem->res);
+               if (nr_bytes_over_sl <= 0)
+                       list_del_init(&mem->sl_exceeded_list);
+               else
+                       list_move(&mem->sl_exceeded_list,
+                               &mem_cgroup_sl_exceeded_list);
+       }
+       write_unlock_irqrestore(&mem_cgroup_sl_list_lock, flags);
+       return ret;
+}
 
 int mem_cgroup_write_strategy(char *buf, unsigned long long *tmp)
 {
@@ -1124,8 +1182,7 @@ mem_cgroup_create(struct cgroup_subsys *
        if (unlikely((cont->parent) == NULL)) {
                mem = &init_mem_cgroup;
                init_mm.mem_cgroup = mem;
-               INIT_LIST_HEAD(&mem->sl_exceeded_list);
-               spin_lock_init(&mem_cgroup_sl_list_lock);
+               rwlock_init(&mem_cgroup_sl_list_lock);
                INIT_LIST_HEAD(&mem_cgroup_sl_exceeded_list);
        } else
                mem = kzalloc(sizeof(struct mem_cgroup), GFP_KERNEL);
@@ -1155,7 +1212,14 @@ static void mem_cgroup_pre_destroy(struc
                                        struct cgroup *cont)
 {
        struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
+       unsigned long flags;
+
        mem_cgroup_force_empty(mem);
+
+       write_lock_irqsave(&mem_cgroup_sl_list_lock, flags);
+       if (!list_empty(&mem->sl_exceeded_list))
+               list_del_init(&mem->sl_exceeded_list);
+       write_unlock_irqrestore(&mem_cgroup_sl_list_lock, flags);
 }
 
 static void mem_cgroup_destroy(struct cgroup_subsys *ss,
diff -puN mm/page_alloc.c~memory-controller-reclaim-on-contention mm/page_alloc.c
--- linux-2.6.25-rc2/mm/page_alloc.c~memory-controller-reclaim-on-contention    2008-02-19 12:31:51.000000000 +0530
+++ linux-2.6.25-rc2-balbir/mm/page_alloc.c     2008-02-19 12:31:51.000000000 +0530
@@ -1635,7 +1635,15 @@ nofail_alloc:
        reclaim_state.reclaimed_slab = 0;
        p->reclaim_state = &reclaim_state;
 
-       did_some_progress = try_to_free_pages(zonelist->zones, order, gfp_mask);
+       /*
+        * First reclaim from all memory control groups over their
+        * soft limit
+        */
+       did_some_progress = mem_cgroup_pushback_groups_over_soft_limit(
+                                               zonelist->zones, gfp_mask);
+       if (!did_some_progress)
+               did_some_progress =
+                       try_to_free_pages(zonelist->zones, order, gfp_mask);
 
        p->reclaim_state = NULL;
        p->flags &= ~PF_MEMALLOC;
diff -puN mm/vmscan.c~memory-controller-reclaim-on-contention mm/vmscan.c
--- linux-2.6.25-rc2/mm/vmscan.c~memory-controller-reclaim-on-contention        2008-02-19 12:31:51.000000000 +0530
+++ linux-2.6.25-rc2-balbir/mm/vmscan.c 2008-02-19 12:31:51.000000000 +0530
@@ -1440,22 +1440,26 @@ unsigned long try_to_free_pages(struct z
 #ifdef CONFIG_CGROUP_MEM_CONT
 
 unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
-                                               gfp_t gfp_mask)
+                                               gfp_t gfp_mask,
+                                               unsigned long nr_pages,
+                                               struct zone **zones)
 {
        struct scan_control sc = {
                .gfp_mask = gfp_mask,
                .may_writepage = !laptop_mode,
                .may_swap = 1,
-               .swap_cluster_max = SWAP_CLUSTER_MAX,
+               .swap_cluster_max = nr_pages,
                .swappiness = vm_swappiness,
                .order = 0,
                .mem_cgroup = mem_cont,
                .isolate_pages = mem_cgroup_isolate_pages,
        };
-       struct zone **zones;
        int target_zone = gfp_zone(GFP_HIGHUSER_MOVABLE);
 
-       zones = NODE_DATA(numa_node_id())->node_zonelists[target_zone].zones;
+       if (!zones)
+               zones =
+               NODE_DATA(numa_node_id())->node_zonelists[target_zone].zones;
+
        if (do_try_to_free_pages(zones, sc.gfp_mask, &sc))
                return 1;
        return 0;
_

-- 
        Warm Regards,
        Balbir Singh
        Linux Technology Center
        IBM, ISTL
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Reply via email to