Changelog v2
1. Take reference on mem->css in pushback (YAMAMOTO Takashi) 2. Move away from trying to reclaim nr_pages over soft limit to reclaiming a swap cluster at a time (KAMEZAWA Hiroyuki) The global list of all cgroups over their soft limit is scanned under memory pressure. We call mem_cgroup_pushback_groups_over_soft_limit from __alloc_pages() prior to calling try_to_free_pages(), in an attempt to rescue memory from groups that are using memory above their soft limit. If this attempt is unsuccessful, we call try_to_free_pages() and take the normal global reclaim path. Signed-off-by: Balbir Singh <[EMAIL PROTECTED]> --- include/linux/memcontrol.h | 9 +++++ include/linux/res_counter.h | 11 ++++++ include/linux/swap.h | 4 +- mm/memcontrol.c | 76 ++++++++++++++++++++++++++++++++++++++++---- mm/page_alloc.c | 10 +++++ mm/vmscan.c | 12 ++++-- 6 files changed, 110 insertions(+), 12 deletions(-) diff -puN include/linux/memcontrol.h~memory-controller-reclaim-on-contention include/linux/memcontrol.h --- linux-2.6.25-rc2/include/linux/memcontrol.h~memory-controller-reclaim-on-contention 2008-02-19 12:31:51.000000000 +0530 +++ linux-2.6.25-rc2-balbir/include/linux/memcontrol.h 2008-02-19 12:31:51.000000000 +0530 @@ -71,6 +71,8 @@ extern long mem_cgroup_calc_reclaim_acti struct zone *zone, int priority); extern long mem_cgroup_calc_reclaim_inactive(struct mem_cgroup *mem, struct zone *zone, int priority); +extern unsigned long +mem_cgroup_pushback_groups_over_soft_limit(struct zone **zones, gfp_t gfp_mask); #else /* CONFIG_CGROUP_MEM_CONT */ static inline void mm_init_cgroup(struct mm_struct *mm, @@ -179,6 +181,13 @@ static inline long mem_cgroup_calc_recla { return 0; } + +static inline unsigned long +mem_cgroup_pushback_groups_over_soft_limit(struct zone **zones, gfp_t gfp_mask) +{ + return 0; +} + #endif /* CONFIG_CGROUP_MEM_CONT */ #endif /* _LINUX_MEMCONTROL_H */ diff -puN include/linux/res_counter.h~memory-controller-reclaim-on-contention include/linux/res_counter.h --- 
linux-2.6.25-rc2/include/linux/res_counter.h~memory-controller-reclaim-on-contention 2008-02-19 12:31:51.000000000 +0530 +++ linux-2.6.25-rc2-balbir/include/linux/res_counter.h 2008-02-19 12:31:51.000000000 +0530 @@ -140,4 +140,15 @@ static inline bool res_counter_check_und return ret; } +static inline long long res_counter_sl_excess(struct res_counter *cnt) +{ + unsigned long flags; + long long ret; + + spin_lock_irqsave(&cnt->lock, flags); + ret = cnt->usage - cnt->soft_limit; + spin_unlock_irqrestore(&cnt->lock, flags); + return ret; +} + #endif diff -puN include/linux/swap.h~memory-controller-reclaim-on-contention include/linux/swap.h --- linux-2.6.25-rc2/include/linux/swap.h~memory-controller-reclaim-on-contention 2008-02-19 12:31:51.000000000 +0530 +++ linux-2.6.25-rc2-balbir/include/linux/swap.h 2008-02-19 12:31:51.000000000 +0530 @@ -184,7 +184,9 @@ extern void swap_setup(void); extern unsigned long try_to_free_pages(struct zone **zones, int order, gfp_t gfp_mask); extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem, - gfp_t gfp_mask); + gfp_t gfp_mask, + unsigned long nr_pages, + struct zone **zones); extern int __isolate_lru_page(struct page *page, int mode); extern unsigned long shrink_all_memory(unsigned long nr_pages); extern int vm_swappiness; diff -puN mm/memcontrol.c~memory-controller-reclaim-on-contention mm/memcontrol.c --- linux-2.6.25-rc2/mm/memcontrol.c~memory-controller-reclaim-on-contention 2008-02-19 12:31:51.000000000 +0530 +++ linux-2.6.25-rc2-balbir/mm/memcontrol.c 2008-02-19 12:31:51.000000000 +0530 @@ -35,7 +35,7 @@ struct cgroup_subsys mem_cgroup_subsys; static const int MEM_CGROUP_RECLAIM_RETRIES = 5; -static spinlock_t mem_cgroup_sl_list_lock; /* spin lock that protects */ +static rwlock_t mem_cgroup_sl_list_lock; /* spin lock that protects */ /* the list of cgroups over*/ /* their soft limit */ static struct list_head mem_cgroup_sl_exceeded_list; @@ -646,7 +646,8 @@ retry: if (!(gfp_mask & __GFP_WAIT)) goto out; 
- if (try_to_free_mem_cgroup_pages(mem, gfp_mask)) + if (try_to_free_mem_cgroup_pages(mem, gfp_mask, + SWAP_CLUSTER_MAX, NULL)) continue; /* @@ -692,11 +693,11 @@ retry: * cgroups over their soft limit */ if (!res_counter_check_under_limit(&mem->res, RES_SOFT_LIMIT)) { - spin_lock_irqsave(&mem_cgroup_sl_list_lock, flags); + write_lock_irqsave(&mem_cgroup_sl_list_lock, flags); if (list_empty(&mem->sl_exceeded_list)) list_add_tail(&mem->sl_exceeded_list, &mem_cgroup_sl_exceeded_list); - spin_unlock_irqrestore(&mem_cgroup_sl_list_lock, flags); + write_unlock_irqrestore(&mem_cgroup_sl_list_lock, flags); } mz = page_cgroup_zoneinfo(pc); @@ -928,7 +929,64 @@ out: return ret; } +/* + * Free all control groups, which are over their soft limit + */ +unsigned long mem_cgroup_pushback_groups_over_soft_limit(struct zone **zones, + gfp_t gfp_mask) +{ + struct mem_cgroup *mem; + long long nr_bytes_over_sl; + unsigned long ret = 0; + unsigned long flags; + struct list_head reclaimed_groups; + + INIT_LIST_HEAD(&reclaimed_groups); + read_lock_irqsave(&mem_cgroup_sl_list_lock, flags); + while (!list_empty(&mem_cgroup_sl_exceeded_list)) { + mem = list_first_entry(&mem_cgroup_sl_exceeded_list, + struct mem_cgroup, sl_exceeded_list); + css_get(&mem->css); + list_move(&mem->sl_exceeded_list, &reclaimed_groups); + read_unlock_irqrestore(&mem_cgroup_sl_list_lock, flags); + + nr_bytes_over_sl = res_counter_sl_excess(&mem->res); + if (nr_bytes_over_sl <= 0) + goto next; + /* + * Even though we can try and reclaim all memory over limit + * it makes sense to go at it SWAP_CLUSTER_MAX at a time + */ + ret += try_to_free_mem_cgroup_pages(mem, gfp_mask, + SWAP_CLUSTER_MAX, + zones); +next: + css_put(&mem->css); + read_lock_irqsave(&mem_cgroup_sl_list_lock, flags); + } + while (!list_empty(&reclaimed_groups)) { + /* + * Check again to see if we've gone below the soft + * limit. XXX: Consider giving up the &mem_cgroup_sl_list_lock + * before calling res_counter_sl_excess. 
+ */ + mem = list_first_entry(&reclaimed_groups, struct mem_cgroup, + sl_exceeded_list); + /* + * NOTE: we don't need to take a css reference under + * the mem_cgroup_sl_list lock + */ + nr_bytes_over_sl = res_counter_sl_excess(&mem->res); + if (nr_bytes_over_sl <= 0) + list_del_init(&mem->sl_exceeded_list); + else + list_move(&mem->sl_exceeded_list, + &mem_cgroup_sl_exceeded_list); + } + read_unlock_irqrestore(&mem_cgroup_sl_list_lock, flags); + return ret; +} int mem_cgroup_write_strategy(char *buf, unsigned long long *tmp) { @@ -1124,8 +1182,7 @@ mem_cgroup_create(struct cgroup_subsys * if (unlikely((cont->parent) == NULL)) { mem = &init_mem_cgroup; init_mm.mem_cgroup = mem; - INIT_LIST_HEAD(&mem->sl_exceeded_list); - spin_lock_init(&mem_cgroup_sl_list_lock); + rwlock_init(&mem_cgroup_sl_list_lock); INIT_LIST_HEAD(&mem_cgroup_sl_exceeded_list); } else mem = kzalloc(sizeof(struct mem_cgroup), GFP_KERNEL); @@ -1155,7 +1212,14 @@ static void mem_cgroup_pre_destroy(struc struct cgroup *cont) { struct mem_cgroup *mem = mem_cgroup_from_cont(cont); + unsigned long flags; + mem_cgroup_force_empty(mem); + + write_lock_irqsave(&mem_cgroup_sl_list_lock, flags); + if (!list_empty(&mem->sl_exceeded_list)) + list_del_init(&mem->sl_exceeded_list); + write_unlock_irqrestore(&mem_cgroup_sl_list_lock, flags); } static void mem_cgroup_destroy(struct cgroup_subsys *ss, diff -puN mm/page_alloc.c~memory-controller-reclaim-on-contention mm/page_alloc.c --- linux-2.6.25-rc2/mm/page_alloc.c~memory-controller-reclaim-on-contention 2008-02-19 12:31:51.000000000 +0530 +++ linux-2.6.25-rc2-balbir/mm/page_alloc.c 2008-02-19 12:31:51.000000000 +0530 @@ -1635,7 +1635,15 @@ nofail_alloc: reclaim_state.reclaimed_slab = 0; p->reclaim_state = &reclaim_state; - did_some_progress = try_to_free_pages(zonelist->zones, order, gfp_mask); + /* + * First reclaim from all memory control groups over their + * soft limit + */ + did_some_progress = mem_cgroup_pushback_groups_over_soft_limit( + 
zonelist->zones, gfp_mask); + if (!did_some_progress) + did_some_progress = + try_to_free_pages(zonelist->zones, order, gfp_mask); p->reclaim_state = NULL; p->flags &= ~PF_MEMALLOC; diff -puN mm/vmscan.c~memory-controller-reclaim-on-contention mm/vmscan.c --- linux-2.6.25-rc2/mm/vmscan.c~memory-controller-reclaim-on-contention 2008-02-19 12:31:51.000000000 +0530 +++ linux-2.6.25-rc2-balbir/mm/vmscan.c 2008-02-19 12:31:51.000000000 +0530 @@ -1440,22 +1440,26 @@ unsigned long try_to_free_pages(struct z #ifdef CONFIG_CGROUP_MEM_CONT unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, - gfp_t gfp_mask) + gfp_t gfp_mask, + unsigned long nr_pages, + struct zone **zones) { struct scan_control sc = { .gfp_mask = gfp_mask, .may_writepage = !laptop_mode, .may_swap = 1, - .swap_cluster_max = SWAP_CLUSTER_MAX, + .swap_cluster_max = nr_pages, .swappiness = vm_swappiness, .order = 0, .mem_cgroup = mem_cont, .isolate_pages = mem_cgroup_isolate_pages, }; - struct zone **zones; int target_zone = gfp_zone(GFP_HIGHUSER_MOVABLE); - zones = NODE_DATA(numa_node_id())->node_zonelists[target_zone].zones; + if (!zones) + zones = + NODE_DATA(numa_node_id())->node_zonelists[target_zone].zones; + if (do_try_to_free_pages(zones, sc.gfp_mask, &sc)) return 1; return 0; _ -- Warm Regards, Balbir Singh Linux Technology Center IBM, ISTL -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/