Michal Hocko wrote:
> On Tue 21-06-16 13:46:43, Michal Hocko wrote:
> > On Tue 21-06-16 20:03:17, Tetsuo Handa wrote:
> > > Michal Hocko wrote:
> > > > On Mon 13-06-16 13:19:43, Michal Hocko wrote:
> > > > [...]
> > > > > I am trying to remember why we are disabling oom killer before kernel
> > > > > threads are frozen but not really sure about that right away.
> > > > 
> > > > OK, I guess I remember now. Say that a task would depend on a freezable
> > > > kernel thread to get to do_exit (stuck in wait_event etc...). We would
> > > > simply get stuck in oom_killer_disable for ever. So we need to address
> > > > it a different way.
> > > > 
> > > > One way would be what you are proposing but I guess it would be more
> > > > systematic to never call exit_oom_victim on a remote task.  After [1] we
> > > > have a solid foundation to rely only on MMF_REAPED even when TIF_MEMDIE
> > > > is set. It is more code than your patch so I can see a reason to go with
> > > > yours if the following one seems too large or ugly.
> > > > 
> > > > [1] 
> > > > http://lkml.kernel.org/r/1466426628-15074-1-git-send-email-mho...@kernel.org
> > > > 
> > > > What do you think about the following?
> > > 
> > > I'm OK with not clearing TIF_MEMDIE from a remote task. But this patch is 
> > > racy.
> > > 
> > > > @@ -567,40 +612,23 @@ static void oom_reap_task(struct task_struct *tsk)
> > > >         while (attempts++ < MAX_OOM_REAP_RETRIES && 
> > > > !__oom_reap_task(tsk))
> > > >                 schedule_timeout_idle(HZ/10);
> > > >  
> > > > -       if (attempts > MAX_OOM_REAP_RETRIES) {
> > > > -               struct task_struct *p;
> > > > +       tsk->oom_reaper_list = NULL;
> > > >  
> > > > +       if (attempts > MAX_OOM_REAP_RETRIES) {
> > > 
> > > attempts > MAX_OOM_REAP_RETRIES would mean that down_read_trylock()
> > > continuously failed. But it does not guarantee that the offending task
> > > shall not call up_write(&mm->mmap_sem) and arrive at mmput() from 
> > > exit_mm()
> > > (as well as other threads which are blocked at down_read(&mm->mmap_sem) in
> > > exit_mm() by the offending task arriving at mmput() from exit_mm()) when the
> > > OOM reaper was preempted at this point.
> > > 
> > > Therefore, find_lock_task_mm() in requeue_oom_victim() could return NULL 
> > > and
> > > the OOM reaper could fail to set MMF_OOM_REAPED (and find_lock_task_mm() 
> > > in
> > > oom_scan_process_thread() could return NULL and the OOM killer could fail 
> > > to
> > > select next OOM victim as well) when __mmput() got stuck.
> > 
> > Fair enough. As this would break no-lockup requirement we cannot go that
> > way. Let me think about it more.
> 
> Hmm, what about the following instead. It is rather a workaround than a
> full-fledged fix but it seems much easier and shouldn't introduce
> new issues.

Yes, I think that will work. But I think the patch below (marking signal_struct
to ignore TIF_MEMDIE instead of clearing TIF_MEMDIE from task_struct) on top of
current linux.git will implement the no-lockup requirement. No race is possible
unlike
"[PATCH 10/10] mm, oom: hide mm which is shared with kthread or global init".

 include/linux/oom.h   |  1 +
 include/linux/sched.h |  2 ++
 mm/memcontrol.c       |  3 ++-
 mm/oom_kill.c         | 60 ++++++++++++++++++++++++++++++---------------------
 4 files changed, 40 insertions(+), 26 deletions(-)

diff --git a/include/linux/oom.h b/include/linux/oom.h
index 8346952..f072c6c 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -69,6 +69,7 @@ static inline bool oom_task_origin(const struct task_struct 
*p)
 
 extern void mark_oom_victim(struct task_struct *tsk);
 
+extern bool task_is_reapable(struct task_struct *tsk);
 #ifdef CONFIG_MMU
 extern void try_oom_reaper(struct task_struct *tsk);
 #else
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 6e42ada..9248f90 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -799,6 +799,7 @@ struct signal_struct {
         * oom
         */
        bool oom_flag_origin;
+       bool oom_ignore_me;
        short oom_score_adj;            /* OOM kill score adjustment */
        short oom_score_adj_min;        /* OOM kill score adjustment min value.
                                         * Only settable by CAP_SYS_RESOURCE. */
@@ -1545,6 +1546,7 @@ struct task_struct {
        /* unserialized, strictly 'current' */
        unsigned in_execve:1; /* bit to tell LSMs we're in execve */
        unsigned in_iowait:1;
+       unsigned oom_shortcut_done:1;
 #ifdef CONFIG_MEMCG
        unsigned memcg_may_oom:1;
 #ifndef CONFIG_SLOB
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 75e7440..af162f6 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1275,7 +1275,8 @@ static bool mem_cgroup_out_of_memory(struct mem_cgroup 
*memcg, gfp_t gfp_mask,
         * select it.  The goal is to allow it to allocate so that it may
         * quickly exit and free its memory.
         */
-       if (fatal_signal_pending(current) || task_will_free_mem(current)) {
+       if (task_is_reapable(current) && !current->oom_shortcut_done) {
+               current->oom_shortcut_done = true;
                mark_oom_victim(current);
                try_oom_reaper(current);
                goto unlock;
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index acbc432..e20d889 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -149,7 +149,7 @@ static bool oom_unkillable_task(struct task_struct *p,
        if (!has_intersects_mems_allowed(p, nodemask))
                return true;
 
-       return false;
+       return p->signal->oom_ignore_me;
 }
 
 /**
@@ -555,15 +555,15 @@ static void oom_reap_task(struct task_struct *tsk)
        }
 
        /*
-        * Clear TIF_MEMDIE because the task shouldn't be sitting on a
+        * Ignore TIF_MEMDIE because the task shouldn't be sitting on a
         * reasonably reclaimable memory anymore or it is not a good candidate
         * for the oom victim right now because it cannot release its memory
         * itself nor by the oom reaper.
         */
        tsk->oom_reaper_list = NULL;
-       exit_oom_victim(tsk);
+       tsk->signal->oom_ignore_me = true;
 
-       /* Drop a reference taken by wake_oom_reaper */
+       /* Drop a reference taken by try_oom_reaper */
        put_task_struct(tsk);
 }
 
@@ -589,7 +589,7 @@ static int oom_reaper(void *unused)
        return 0;
 }
 
-static void wake_oom_reaper(struct task_struct *tsk)
+void try_oom_reaper(struct task_struct *tsk)
 {
        if (!oom_reaper_th)
                return;
@@ -610,13 +610,13 @@ static void wake_oom_reaper(struct task_struct *tsk)
 /* Check if we can reap the given task. This has to be called with stable
  * tsk->mm
  */
-void try_oom_reaper(struct task_struct *tsk)
+bool task_is_reapable(struct task_struct *tsk)
 {
        struct mm_struct *mm = tsk->mm;
        struct task_struct *p;
 
        if (!mm)
-               return;
+               return false;
 
        /*
         * There might be other threads/processes which are either not
@@ -639,12 +639,11 @@ void try_oom_reaper(struct task_struct *tsk)
 
                        /* Give up */
                        rcu_read_unlock();
-                       return;
+                       return false;
                }
                rcu_read_unlock();
        }
-
-       wake_oom_reaper(tsk);
+       return true;
 }
 
 static int __init oom_init(void)
@@ -659,8 +658,10 @@ static int __init oom_init(void)
 }
 subsys_initcall(oom_init)
 #else
-static void wake_oom_reaper(struct task_struct *tsk)
+bool task_is_reapable(struct task_struct *tsk)
 {
+       return tsk->mm &&
+               (fatal_signal_pending(tsk) || task_will_free_mem(tsk));
 }
 #endif
 
@@ -753,20 +754,28 @@ void oom_kill_process(struct oom_control *oc, struct 
task_struct *p,
        unsigned int victim_points = 0;
        static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL,
                                              DEFAULT_RATELIMIT_BURST);
-       bool can_oom_reap = true;
 
        /*
         * If the task is already exiting, don't alarm the sysadmin or kill
         * its children or threads, just set TIF_MEMDIE so it can die quickly
         */
        task_lock(p);
-       if (p->mm && task_will_free_mem(p)) {
+#ifdef CONFIG_MMU
+       if (task_is_reapable(p)) {
                mark_oom_victim(p);
                try_oom_reaper(p);
                task_unlock(p);
                put_task_struct(p);
                return;
        }
+#else
+       if (p->mm && task_will_free_mem(p)) {
+               mark_oom_victim(p);
+               task_unlock(p);
+               put_task_struct(p);
+               return;
+       }
+#endif
        task_unlock(p);
 
        if (__ratelimit(&oom_rs))
@@ -846,21 +855,22 @@ void oom_kill_process(struct oom_control *oc, struct 
task_struct *p,
                if (same_thread_group(p, victim))
                        continue;
                if (unlikely(p->flags & PF_KTHREAD) || is_global_init(p) ||
-                   p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) {
-                       /*
-                        * We cannot use oom_reaper for the mm shared by this
-                        * process because it wouldn't get killed and so the
-                        * memory might be still used.
-                        */
-                       can_oom_reap = false;
+                   p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
                        continue;
-               }
+
                do_send_sig_info(SIGKILL, SEND_SIG_FORCED, p, true);
        }
        rcu_read_unlock();
 
-       if (can_oom_reap)
-               wake_oom_reaper(victim);
+#ifdef CONFIG_MMU
+       p = find_lock_task_mm(victim);
+       if (p && task_is_reapable(p))
+               try_oom_reaper(victim);
+       else
+               victim->signal->oom_ignore_me = true;
+       if (p)
+               task_unlock(p);
+#endif
 
        mmdrop(mm);
        put_task_struct(victim);
@@ -939,8 +949,8 @@ bool out_of_memory(struct oom_control *oc)
         * But don't select if current has already released its mm and cleared
         * TIF_MEMDIE flag at exit_mm(), otherwise an OOM livelock may occur.
         */
-       if (current->mm &&
-           (fatal_signal_pending(current) || task_will_free_mem(current))) {
+       if (!current->oom_shortcut_done && task_is_reapable(current)) {
+               current->oom_shortcut_done = true;
                mark_oom_victim(current);
                try_oom_reaper(current);
                return true;
-- 
1.8.3.1

Reply via email to