Even when an OOM event occurred in a memory cgroup, oom_berserker()
iterated over all tasks in the system and called task_in_mem_cgroup()
for each of them to find the tasks belonging to that memcg.

There is no need to scan all tasks, though. select_bad_process() and
other functions already rely on mem_cgroup_scan_tasks() to iterate
only over the tasks of a given memory cgroup. oom_berserker() has now
been reworked to use it as well. task_in_mem_cgroup() is no longer
needed after that, so it has been removed.
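For reference, here is a minimal sketch of the callback pattern that
mem_cgroup_scan_tasks() expects. Only the iterator's signature is
taken from mm/memcontrol.c; the helper names (count_task(),
count_memcg_tasks()) and the counting logic are made up for
illustration:

	/* Hypothetical callback: counts the tasks it is called for. */
	static int count_task(struct task_struct *p, void *arg)
	{
		unsigned int *count = arg;

		(*count)++;
		return 0;	/* a non-zero return stops the iteration */
	}

	/* Hypothetical helper: counts the tasks in a memory cgroup. */
	static unsigned int count_memcg_tasks(struct mem_cgroup *memcg)
	{
		unsigned int count = 0;

		/*
		 * mem_cgroup_scan_tasks() walks the tasks of memcg and
		 * its descendants via css_task_iter, so the caller does
		 * not need to take tasklist_lock.
		 */
		mem_cgroup_scan_tasks(memcg, count_task, &count);
		return count;
	}

maybe_rage_kill() in the patch below follows the same pattern: it
returns 1 once enough tasks have been killed, which stops the
iteration.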
There is one potential disadvantage to this approach, though. Walking
the whole list of tasks in the system allowed oom_berserker() to pick
the youngest tasks in the memcg that scored no better than the OOM
killer's previous target. mem_cgroup_scan_tasks(), however, iterates
over the tasks in an unspecified order. It currently seems to process
the cgroups from parent to children, and the tasks within each cgroup
from the oldest added to the newest, but this could change in the
future. As a result, oom_berserker() still selects tasks that score no
better than the target, but not necessarily the youngest ones.

It is not clear whether this is a real problem, though.
oom_berserker() triggers only when the normal OOM killer fires very
often. If some system task eats up lots of memory and gets shot as a
result, something is already wrong with that task; it would likely be
shot by the normal OOM killer anyway, just a bit later.

https://jira.sw.ru/browse/PSBM-131984

Signed-off-by: Evgenii Shatokhin <eshatok...@virtuozzo.com>
---
 include/linux/memcontrol.h |   7 --
 mm/memcontrol.c            |  26 ------
 mm/oom_kill.c              | 164 +++++++++++++++++++++----------------
 3 files changed, 95 insertions(+), 102 deletions(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 5458f18c0ae7..c6244a62f376 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -551,7 +551,6 @@ struct lruvec *mem_cgroup_page_lruvec(struct page *, struct pglist_data *);
 
 struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p);
 
-bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg);
 
 struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm);
 struct mem_cgroup *get_mem_cgroup_from_page(struct page *page);
@@ -1096,12 +1095,6 @@ static inline bool mm_match_cgroup(struct mm_struct *mm,
 	return true;
 }
 
-static inline bool task_in_mem_cgroup(struct task_struct *task,
-				      const struct mem_cgroup *memcg)
-{
-	return true;
-}
-
 static inline struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
 {
 	return NULL;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index f9a1c3c4f85e..424bc1e52b3b 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1462,32 +1462,6 @@ void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
 	*lru_size += nr_pages;
 }
 
-bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg)
-{
-	struct mem_cgroup *task_memcg;
-	struct task_struct *p;
-	bool ret;
-
-	p = find_lock_task_mm(task);
-	if (p) {
-		task_memcg = get_mem_cgroup_from_mm(p->mm);
-		task_unlock(p);
-	} else {
-		/*
-		 * All threads may have already detached their mm's, but the oom
-		 * killer still needs to detect if they have already been oom
-		 * killed to prevent needlessly killing additional tasks.
-		 */
-		rcu_read_lock();
-		task_memcg = mem_cgroup_from_task(task);
-		css_get(&task_memcg->css);
-		rcu_read_unlock();
-	}
-	ret = mem_cgroup_is_descendant(task_memcg, memcg);
-	css_put(&task_memcg->css);
-	return ret;
-}
-
 #ifdef CONFIG_CLEANCACHE
 bool mem_cgroup_cleancache_disabled(struct page *page)
 {
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index a64a9ff7391b..382a85b64c1f 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -951,21 +951,97 @@ static int oom_kill_memcg_member(struct task_struct *task, void *message)
 	return 0;
 }
 
+/*
+ * The data used when iterating over tasks in "rage/berserker" mode.
+ */
+struct rage_kill_data {
+	struct oom_control *oc;
+
+	/* current level of berserker rage */
+	int rage;
+
+	/* number of tasks killed so far */
+	int killed;
+};
+
+/*
+ * Returns 1 if enough tasks have been killed, 0 otherwise.
+ */
+static int
+maybe_rage_kill(struct task_struct *p, void *arg)
+{
+	static DEFINE_RATELIMIT_STATE(berserker_rs,
+				      DEFAULT_RATELIMIT_INTERVAL,
+				      DEFAULT_RATELIMIT_BURST);
+	struct rage_kill_data *rkd = arg;
+	struct oom_control *oc = rkd->oc;
+	unsigned long tsk_points;
+	unsigned long tsk_overdraft;
+
+	if (!p->mm || test_tsk_thread_flag(p, TIF_MEMDIE) ||
+	    fatal_signal_pending(p) || p->flags & PF_EXITING ||
+	    oom_unkillable_task(p))
+		return 0;
+
+	/* p may not have freeable memory in nodemask */
+	if (!is_memcg_oom(oc) && !oom_cpuset_eligible(p, oc))
+		return 0;
+
+	tsk_points = oom_badness(p, oc->totalpages, &tsk_overdraft);
+	if (tsk_overdraft < oc->overdraft)
+		return 0;
+
+	/*
+	 * oom_badness never returns a negative value, even if
+	 * oom_score_adj would make badness so, instead it
+	 * returns 1. So we do not kill task with badness 1 if
+	 * the victim has badness > 1 so as not to risk killing
+	 * protected tasks.
+	 */
+	if (tsk_points <= 1 && oc->chosen_points > 1)
+		return 0;
+
+	/*
+	 * Consider tasks as equally bad if they occupy equal
+	 * percentage of available memory.
+	 */
+	if (tsk_points * 100 / oc->totalpages <
+	    oc->chosen_points * 100 / oc->totalpages)
+		return 0;
+
+	if (__ratelimit(&berserker_rs)) {
+		task_lock(p);
+		pr_err("Rage kill process %d (%s)\n",
+		       task_pid_nr(p), p->comm);
+		task_unlock(p);
+	}
+
+	count_vm_event(OOM_KILL);
+	memcg_memory_event((is_memcg_oom(oc) ? oc->memcg : root_mem_cgroup),
+			   MEMCG_OOM_KILL);
+
+	do_send_sig_info(SIGKILL, SEND_SIG_PRIV, p, PIDTYPE_TGID);
+
+	++rkd->killed;
+	if (rkd->killed >= 1 << rkd->rage)
+		return 1;
+	return 0;
+}
+
 /*
  * Kill more processes if oom happens too often in this context.
  */
 static void oom_berserker(struct oom_control *oc)
 {
-	static DEFINE_RATELIMIT_STATE(berserker_rs,
-				      DEFAULT_RATELIMIT_INTERVAL,
-				      DEFAULT_RATELIMIT_BURST);
 	struct task_struct *p;
 	struct mem_cgroup *memcg;
 	unsigned long now = jiffies;
-	int rage;
-	int killed = 0;
+	struct rage_kill_data rkd = {
+		.oc = oc,
+		.killed = 0,
+	};
 
-	memcg = oc->memcg ?: root_mem_cgroup;
+	memcg = is_memcg_oom(oc) ? oc->memcg : root_mem_cgroup;
 
 	spin_lock(&memcg->oom_rage_lock);
 	memcg->prev_oom_time = memcg->oom_time;
@@ -983,77 +1059,27 @@ static void oom_berserker(struct oom_control *oc)
 		memcg->oom_rage = OOM_BASE_RAGE;
 	else if (memcg->oom_rage < OOM_MAX_RAGE)
 		memcg->oom_rage++;
-	rage = memcg->oom_rage;
+	rkd.rage = memcg->oom_rage;
 	spin_unlock(&memcg->oom_rage_lock);
 
-	if (rage < 0)
+	if (rkd.rage < 0)
 		return;
 
 	/*
-	 * So, we are in rage. Kill (1 << rage) youngest tasks that are
-	 * as bad as the victim.
+	 * So, we are in rage. Kill (1 << rage) tasks that are as bad as
+	 * the victim.
 	 */
-	read_lock(&tasklist_lock);
-	list_for_each_entry_reverse(p, &init_task.tasks, tasks) {
-		unsigned long tsk_points;
-		unsigned long tsk_overdraft;
-
-		if (!p->mm || test_tsk_thread_flag(p, TIF_MEMDIE) ||
-		    fatal_signal_pending(p) || p->flags & PF_EXITING ||
-		    oom_unkillable_task(p))
-			continue;
-
-		/*
-		 * When mem_cgroup_out_of_memory() and
-		 * p is not member of the group.
-		 */
-		if (oc->memcg && !task_in_mem_cgroup(p, oc->memcg))
-			continue;
-
-		/* p may not have freeable memory in nodemask */
-		if (!is_memcg_oom(oc) && !oom_cpuset_eligible(p, oc))
-			continue;
-
-		tsk_points = oom_badness(p, oc->totalpages, &tsk_overdraft);
-		if (tsk_overdraft < oc->overdraft)
-			continue;
-
-		/*
-		 * oom_badness never returns a negative value, even if
-		 * oom_score_adj would make badness so, instead it
-		 * returns 1. So we do not kill task with badness 1 if
-		 * the victim has badness > 1 so as not to risk killing
-		 * protected tasks.
-		 */
-		if (tsk_points <= 1 && oc->chosen_points > 1)
-			continue;
-
-		/*
-		 * Consider tasks as equally bad if they occupy equal
-		 * percentage of available memory.
-		 */
-		if (tsk_points * 100 / oc->totalpages <
-		    oc->chosen_points * 100 / oc->totalpages)
-			continue;
-
-		if (__ratelimit(&berserker_rs)) {
-			task_lock(p);
-			pr_err("Rage kill process %d (%s)\n",
-			       task_pid_nr(p), p->comm);
-			task_unlock(p);
-		}
-
-		count_vm_event(OOM_KILL);
-		memcg_memory_event(memcg, MEMCG_OOM_KILL);
-
-		do_send_sig_info(SIGKILL, SEND_SIG_PRIV, p, PIDTYPE_TGID);
-
-		if (++killed >= 1 << rage)
-			break;
+	if (is_memcg_oom(oc)) {
+		mem_cgroup_scan_tasks(oc->memcg, maybe_rage_kill, &rkd);
+	} else {
+		read_lock(&tasklist_lock);
+		list_for_each_entry_reverse(p, &init_task.tasks, tasks)
+			if (maybe_rage_kill(p, &rkd))
+				break;
+		read_unlock(&tasklist_lock);
 	}
-	read_unlock(&tasklist_lock);
 
-	pr_err("OOM killer in rage %d: %d tasks killed\n", rage, killed);
+	pr_err("OOM killer in rage %d: %d tasks killed\n", rkd.rage, rkd.killed);
 }
 
 static void oom_kill_process(struct oom_control *oc, const char *message)
-- 
2.29.0