On Wed, 7 Nov 2012, Sha Zhengju wrote:

> From: Sha Zhengju <handai....@taobao.com>
> 
> If memcg oom happening, don't scan all system tasks to dump memory state of
> eligible tasks, instead we iterates only over the process attached to the oom
> memcg and avoid the rcu lock.
> 

Avoiding the rcu lock isn't actually that impressive here, the cgroup 
iterator will use it's own lock for that memcg.

> Signed-off-by: Sha Zhengju <handai....@taobao.com>
> Cc: Michal Hocko <mho...@suse.cz>
> Cc: KAMEZAWA Hiroyuki <kamezawa.hir...@jp.fujitsu.com>
> Cc: David Rientjes <rient...@google.com>
> Cc: Andrew Morton <a...@linux-foundation.org>
> ---
>  include/linux/memcontrol.h |    7 +++++
>  include/linux/oom.h        |    2 +
>  mm/memcontrol.c            |   14 +++++++++++
>  mm/oom_kill.c              |   55 ++++++++++++++++++++++++++-----------------
>  4 files changed, 56 insertions(+), 22 deletions(-)
> 
> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> index c91e3c1..4322ca8 100644
> --- a/include/linux/memcontrol.h
> +++ b/include/linux/memcontrol.h
> @@ -122,6 +122,8 @@ unsigned long mem_cgroup_get_lru_size(struct lruvec 
> *lruvec, enum lru_list);
>  void mem_cgroup_update_lru_size(struct lruvec *, enum lru_list, int);
>  extern void mem_cgroup_print_oom_info(struct mem_cgroup *memcg,
>                                       struct task_struct *p);
> +extern void dump_tasks_memcg(const struct mem_cgroup *memcg,
> +                                     const nodemask_t *nodemask);

Shouldn't need the nodemask parameter, just have dump_tasks_memcg() pass 
NULL to dump_per_task(), we won't be isolating to tasks with mempolicies 
restricted to a particular set of nodes since we're in the memcg oom path 
here, not the global page allocator oom path.

>  extern void mem_cgroup_replace_page_cache(struct page *oldpage,
>                                       struct page *newpage);
>  
> @@ -337,6 +339,11 @@ mem_cgroup_print_oom_info(struct mem_cgroup *memcg, 
> struct task_struct *p)
>  {
>  }
>  
> +static inline void
> +dump_tasks_memcg(const struct mem_cgroup *memcg, const nodemask_t *nodemask)
> +{
> +}
> +
>  static inline void mem_cgroup_begin_update_page_stat(struct page *page,
>                                       bool *locked, unsigned long *flags)
>  {
> diff --git a/include/linux/oom.h b/include/linux/oom.h
> index 20b5c46..9ba3344 100644
> --- a/include/linux/oom.h
> +++ b/include/linux/oom.h
> @@ -57,6 +57,8 @@ extern enum oom_scan_t oom_scan_process_thread(struct 
> task_struct *task,
>               unsigned long totalpages, const nodemask_t *nodemask,
>               bool force_kill);
>  
> +extern inline void dump_per_task(struct task_struct *p,
> +                             const nodemask_t *nodemask);

This is a global symbol, so dump_per_task() doesn't make a lot of sense: 
it would need to be prefixed with "oom_" so perhaps oom_dump_task() is 
better?

>  extern void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
>               int order, nodemask_t *mask, bool force_kill);
>  extern int register_oom_notifier(struct notifier_block *nb);
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 2df5e72..fe648f8 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -1665,6 +1665,20 @@ static u64 mem_cgroup_get_limit(struct mem_cgroup 
> *memcg)
>       return min(limit, memsw);
>  }
>  
> +void dump_tasks_memcg(const struct mem_cgroup *memcg, const nodemask_t 
> *nodemask)
> +{
> +     struct cgroup_iter it;
> +     struct task_struct *task;
> +     struct cgroup *cgroup = memcg->css.cgroup;
> +
> +     cgroup_iter_start(cgroup, &it);
> +     while ((task = cgroup_iter_next(cgroup, &it))) {
> +             dump_per_task(task, nodemask);
> +     }
> +
> +     cgroup_iter_end(cgroup, &it);
> +}
> +
>  static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t 
> gfp_mask,
>                                    int order)
>  {
> diff --git a/mm/oom_kill.c b/mm/oom_kill.c
> index 4b8a6dd..aaf6237 100644
> --- a/mm/oom_kill.c
> +++ b/mm/oom_kill.c
> @@ -367,6 +367,32 @@ static struct task_struct *select_bad_process(unsigned 
> int *ppoints,
>       return chosen;
>  }
>  
> +inline void dump_per_task(struct task_struct *p, const nodemask_t *nodemask)

No inline.

> +{
> +     struct task_struct *task;
> +
> +     if (oom_unkillable_task(p, NULL, nodemask))
> +             return;
> +
> +     task = find_lock_task_mm(p);
> +     if (!task) {
> +             /*
> +              * This is a kthread or all of p's threads have already
> +              * detached their mm's.  There's no need to report
> +              * them; they can't be oom killed anyway.
> +              */
> +             return;
> +     }
> +
> +     pr_info("[%5d] %5d %5d %8lu %8lu %7lu %8lu         %5d %s\n",
> +             task->pid, from_kuid(&init_user_ns, task_uid(task)),
> +             task->tgid, task->mm->total_vm, get_mm_rss(task->mm),
> +             task->mm->nr_ptes,
> +             get_mm_counter(task->mm, MM_SWAPENTS),
> +             task->signal->oom_score_adj, task->comm);
> +     task_unlock(task);
> +}
> +
>  /**
>   * dump_tasks - dump current memory state of all system tasks
>   * @memcg: current's memory controller, if constrained
> @@ -381,32 +407,17 @@ static struct task_struct *select_bad_process(unsigned 
> int *ppoints,
>  static void dump_tasks(const struct mem_cgroup *memcg, const nodemask_t 
> *nodemask)
>  {
>       struct task_struct *p;
> -     struct task_struct *task;
>  
>       pr_info("[ pid ]   uid  tgid total_vm      rss nr_ptes swapents 
> oom_score_adj name\n");
> -     rcu_read_lock();
> -     for_each_process(p) {
> -             if (oom_unkillable_task(p, memcg, nodemask))
> -                     continue;
> -
> -             task = find_lock_task_mm(p);
> -             if (!task) {
> -                     /*
> -                      * This is a kthread or all of p's threads have already
> -                      * detached their mm's.  There's no need to report
> -                      * them; they can't be oom killed anyway.
> -                      */
> -                     continue;
> -             }
>  
> -             pr_info("[%5d] %5d %5d %8lu %8lu %7lu %8lu         %5d %s\n",
> -                     task->pid, from_kuid(&init_user_ns, task_uid(task)),
> -                     task->tgid, task->mm->total_vm, get_mm_rss(task->mm),
> -                     task->mm->nr_ptes,
> -                     get_mm_counter(task->mm, MM_SWAPENTS),
> -                     task->signal->oom_score_adj, task->comm);
> -             task_unlock(task);
> +     if (memcg) {
> +             dump_tasks_memcg(memcg, nodemask);
> +             return;
>       }
> +
> +     rcu_read_lock();
> +     for_each_process(p)
> +             dump_per_task(p, nodemask);
>       rcu_read_unlock();
>  }
>  
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Reply via email to