Even when an OOM event occurred in a memory cgroup, oom_berserker()
iterated over all tasks in the system and called task_in_mem_cgroup()
for each of them to find the tasks belonging to that memcg.

There is no need to scan all tasks, though. select_bad_process() and
other functions already rely on mem_cgroup_scan_tasks() to iterate
only over the tasks of a given memory cgroup. oom_berserker() has now
been reworked to use it as well. task_in_mem_cgroup() is no longer
needed after that, so it has been removed.
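For reference, here is a minimal sketch of the callback pattern that
mem_cgroup_scan_tasks() expects. Only the iterator's signature is
taken from mm/memcontrol.c; the helper names (count_task(),
count_memcg_tasks()) and the counting logic are made up for
illustration:

	/* Hypothetical callback: counts the tasks it is called for. */
	static int count_task(struct task_struct *p, void *arg)
	{
		unsigned int *count = arg;

		(*count)++;
		return 0;	/* a non-zero return stops the iteration */
	}

	/* Hypothetical helper: counts the tasks in a memory cgroup. */
	static unsigned int count_memcg_tasks(struct mem_cgroup *memcg)
	{
		unsigned int count = 0;

		/*
		 * mem_cgroup_scan_tasks() walks the tasks of memcg and
		 * its descendants via css_task_iter, so the caller does
		 * not need to take tasklist_lock.
		 */
		mem_cgroup_scan_tasks(memcg, count_task, &count);
		return count;
	}

maybe_rage_kill() in the patch below follows the same pattern: it
returns 1 once enough tasks have been killed, which stops the
iteration.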
There is one potential disadvantage to this approach, though. Walking
the whole list of tasks in the system allowed oom_berserker() to pick
the youngest tasks in the memcg that scored no better than the OOM
killer's previous target. mem_cgroup_scan_tasks(), however, iterates
over the tasks in an unspecified order. It currently seems to process
the cgroups from parent to children, and the tasks within each cgroup
from the oldest added to the newest, but this could change in the
future. As a result, oom_berserker() still selects tasks that score no
better than the target, but not necessarily the youngest ones.

It is not clear whether this is a real problem, though.
oom_berserker() triggers only when the normal OOM killer fires very
often. If some system task eats up lots of memory and gets shot as a
result, something is already wrong with that task; it would likely be
shot by the normal OOM killer anyway, just a bit later.

https://jira.sw.ru/browse/PSBM-131984

Signed-off-by: Evgenii Shatokhin <eshatok...@virtuozzo.com>
---
 include/linux/memcontrol.h |   7 --
 mm/memcontrol.c            |  26 ------
 mm/oom_kill.c              | 164 +++++++++++++++++++++----------------
 3 files changed, 95 insertions(+), 102 deletions(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 5458f18c0ae7..c6244a62f376 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -551,7 +551,6 @@ struct lruvec *mem_cgroup_page_lruvec(struct page *, struct pglist_data *);
 
 struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p);
 
-bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg);
 
 struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm);
 struct mem_cgroup *get_mem_cgroup_from_page(struct page *page);
@@ -1096,12 +1095,6 @@ static inline bool mm_match_cgroup(struct mm_struct *mm,
 	return true;
 }
 
-static inline bool task_in_mem_cgroup(struct task_struct *task,
-				      const struct mem_cgroup *memcg)
-{
-	return true;
-}
-
 static inline struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
 {
 	return NULL;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index f9a1c3c4f85e..424bc1e52b3b 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1462,32 +1462,6 @@ void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
 	*lru_size += nr_pages;
 }
 
-bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg)
-{
-	struct mem_cgroup *task_memcg;
-	struct task_struct *p;
-	bool ret;
-
-	p = find_lock_task_mm(task);
-	if (p) {
-		task_memcg = get_mem_cgroup_from_mm(p->mm);
-		task_unlock(p);
-	} else {
-		/*
-		 * All threads may have already detached their mm's, but the oom
-		 * killer still needs to detect if they have already been oom
-		 * killed to prevent needlessly killing additional tasks.
-		 */
-		rcu_read_lock();
-		task_memcg = mem_cgroup_from_task(task);
-		css_get(&task_memcg->css);
-		rcu_read_unlock();
-	}
-	ret = mem_cgroup_is_descendant(task_memcg, memcg);
-	css_put(&task_memcg->css);
-	return ret;
-}
-
 #ifdef CONFIG_CLEANCACHE
 bool mem_cgroup_cleancache_disabled(struct page *page)
 {
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index a64a9ff7391b..382a85b64c1f 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -951,21 +951,97 @@ static int oom_kill_memcg_member(struct task_struct *task, void *message)
 	return 0;
 }
 
+/*
+ * The data used when iterating over tasks in "rage/berserker" mode.
+ */
+struct rage_kill_data {
+	struct oom_control *oc;
+
+	/* current level of berserker rage */
+	int rage;
+
+	/* number of tasks killed so far */
+	int killed;
+};
+
+/*
+ * Returns 1 if enough tasks have been killed, 0 otherwise.
+ */
+static int
+maybe_rage_kill(struct task_struct *p, void *arg)
+{
+	static DEFINE_RATELIMIT_STATE(berserker_rs,
+				      DEFAULT_RATELIMIT_INTERVAL,
+				      DEFAULT_RATELIMIT_BURST);
+	struct rage_kill_data *rkd = arg;
+	struct oom_control *oc = rkd->oc;
+	unsigned long tsk_points;
+	unsigned long tsk_overdraft;
+
+	if (!p->mm || test_tsk_thread_flag(p, TIF_MEMDIE) ||
+	    fatal_signal_pending(p) || p->flags & PF_EXITING ||
+	    oom_unkillable_task(p))
+		return 0;
+
+	/* p may not have freeable memory in nodemask */
+	if (!is_memcg_oom(oc) && !oom_cpuset_eligible(p, oc))
+		return 0;
+
+	tsk_points = oom_badness(p, oc->totalpages, &tsk_overdraft);
+	if (tsk_overdraft < oc->overdraft)
+		return 0;
+
+	/*
+	 * oom_badness never returns a negative value, even if
+	 * oom_score_adj would make badness so, instead it
+	 * returns 1. So we do not kill task with badness 1 if
+	 * the victim has badness > 1 so as not to risk killing
+	 * protected tasks.
+	 */
+	if (tsk_points <= 1 && oc->chosen_points > 1)
+		return 0;
+
+	/*
+	 * Consider tasks as equally bad if they occupy equal
+	 * percentage of available memory.
+	 */
+	if (tsk_points * 100 / oc->totalpages <
+	    oc->chosen_points * 100 / oc->totalpages)
+		return 0;
+
+	if (__ratelimit(&berserker_rs)) {
+		task_lock(p);
+		pr_err("Rage kill process %d (%s)\n",
+		       task_pid_nr(p), p->comm);
+		task_unlock(p);
+	}
+
+	count_vm_event(OOM_KILL);
+	memcg_memory_event((is_memcg_oom(oc) ? oc->memcg : root_mem_cgroup),
+			   MEMCG_OOM_KILL);
+
+	do_send_sig_info(SIGKILL, SEND_SIG_PRIV, p, PIDTYPE_TGID);
+
+	++rkd->killed;
+	if (rkd->killed >= 1 << rkd->rage)
+		return 1;
+	return 0;
+}
+
 /*
  * Kill more processes if oom happens too often in this context.
  */
 static void oom_berserker(struct oom_control *oc)
 {
-	static DEFINE_RATELIMIT_STATE(berserker_rs,
-				      DEFAULT_RATELIMIT_INTERVAL,
-				      DEFAULT_RATELIMIT_BURST);
 	struct task_struct *p;
 	struct mem_cgroup *memcg;
 	unsigned long now = jiffies;
-	int rage;
-	int killed = 0;
+	struct rage_kill_data rkd = {
+		.oc = oc,
+		.killed = 0,
+	};
 
-	memcg = oc->memcg ?: root_mem_cgroup;
+	memcg = is_memcg_oom(oc) ? oc->memcg : root_mem_cgroup;
 
 	spin_lock(&memcg->oom_rage_lock);
 	memcg->prev_oom_time = memcg->oom_time;
@@ -983,77 +1059,27 @@ static void oom_berserker(struct oom_control *oc)
 		memcg->oom_rage = OOM_BASE_RAGE;
 	else if (memcg->oom_rage < OOM_MAX_RAGE)
 		memcg->oom_rage++;
-	rage = memcg->oom_rage;
+	rkd.rage = memcg->oom_rage;
 	spin_unlock(&memcg->oom_rage_lock);
 
-	if (rage < 0)
+	if (rkd.rage < 0)
 		return;
 
 	/*
-	 * So, we are in rage. Kill (1 << rage) youngest tasks that are
-	 * as bad as the victim.
+	 * So, we are in rage. Kill (1 << rage) tasks that are as bad as
+	 * the victim.
 	 */
-	read_lock(&tasklist_lock);
-	list_for_each_entry_reverse(p, &init_task.tasks, tasks) {
-		unsigned long tsk_points;
-		unsigned long tsk_overdraft;
-
-		if (!p->mm || test_tsk_thread_flag(p, TIF_MEMDIE) ||
-		    fatal_signal_pending(p) || p->flags & PF_EXITING ||
-		    oom_unkillable_task(p))
-			continue;
-
-		/*
-		 * When mem_cgroup_out_of_memory() and
-		 * p is not member of the group.
-		 */
-		if (oc->memcg && !task_in_mem_cgroup(p, oc->memcg))
-			continue;
-
-		/* p may not have freeable memory in nodemask */
-		if (!is_memcg_oom(oc) && !oom_cpuset_eligible(p, oc))
-			continue;
-
-		tsk_points = oom_badness(p, oc->totalpages, &tsk_overdraft);
-		if (tsk_overdraft < oc->overdraft)
-			continue;
-
-		/*
-		 * oom_badness never returns a negative value, even if
-		 * oom_score_adj would make badness so, instead it
-		 * returns 1. So we do not kill task with badness 1 if
-		 * the victim has badness > 1 so as not to risk killing
-		 * protected tasks.
-		 */
-		if (tsk_points <= 1 && oc->chosen_points > 1)
-			continue;
-
-		/*
-		 * Consider tasks as equally bad if they occupy equal
-		 * percentage of available memory.
-		 */
-		if (tsk_points * 100 / oc->totalpages <
-		    oc->chosen_points * 100 / oc->totalpages)
-			continue;
-
-		if (__ratelimit(&berserker_rs)) {
-			task_lock(p);
-			pr_err("Rage kill process %d (%s)\n",
-			       task_pid_nr(p), p->comm);
-			task_unlock(p);
-		}
-
-		count_vm_event(OOM_KILL);
-		memcg_memory_event(memcg, MEMCG_OOM_KILL);
-
-		do_send_sig_info(SIGKILL, SEND_SIG_PRIV, p, PIDTYPE_TGID);
-
-		if (++killed >= 1 << rage)
-			break;
+	if (is_memcg_oom(oc)) {
+		mem_cgroup_scan_tasks(oc->memcg, maybe_rage_kill, &rkd);
+	} else {
+		read_lock(&tasklist_lock);
+		list_for_each_entry_reverse(p, &init_task.tasks, tasks)
+			if (maybe_rage_kill(p, &rkd))
+				break;
+		read_unlock(&tasklist_lock);
 	}
-	read_unlock(&tasklist_lock);
 
-	pr_err("OOM killer in rage %d: %d tasks killed\n", rage, killed);
+	pr_err("OOM killer in rage %d: %d tasks killed\n", rkd.rage, rkd.killed);
 }
 
 static void oom_kill_process(struct oom_control *oc, const char *message)
-- 
2.29.0