>From 2804913f4d21a20a154b93d5437c21e52bf761a1 Mon Sep 17 00:00:00 2001
From: Tetsuo Handa <penguin-ker...@i-love.sakura.ne.jp>
Date: Sun, 13 Dec 2015 00:02:29 +0900
Subject: [PATCH v4] mm/oom: Add memory allocation watchdog kernel thread.

This patch adds a kernel thread which periodically reports number of
memory allocating tasks, dying tasks and OOM victim tasks when some task
is spending too long time inside __alloc_pages_slowpath().

Changes from v1:

  (1) Use per a "struct task_struct" variables. This allows vmcore to
      remember information about last memory allocation request, which
      is useful for understanding last-minute behavior of the kernel.

  (2) Report using accurate timeout. This increases possibility of
      successfully reporting before watchdog timers reset the machine.

  (3) Show memory information (SysRq-m). This makes it easier to know
      the reason of stalling.

  (4) Show both $state_of_allocation and $state_of_task in the same
      line. This makes it easier to grep the output.

  (5) Minimize duration of spinlock held by the kernel thread.

Changes from v2:

  (1) Print sequence number. This makes it easier to know whether
      memory allocation is succeeding (looks like a livelock but making
      forward progress) or not.

  (2) Replace spinlock with cheaper seqlock_t like sequence number based
      method. The caller no longer contend on lock, and major overhead
      for caller side will be two smp_wmb() instead for
      read_lock()/read_unlock().

  (3) Print "exiting" instead for "dying" if an OOM victim is stalling
      at do_exit(), for SIGKILL is removed before arriving at do_exit().

  (4) Moved explanation to Documentation/malloc-watchdog.txt .

Changes from v3:

  (1) Avoid stalls even if there are so many tasks to report.

Signed-off-by: Tetsuo Handa <penguin-ker...@i-love.sakura.ne.jp>
---
 Documentation/malloc-watchdog.txt | 139 +++++++++++++++++++++
 include/linux/sched.h             |  25 ++++
 kernel/fork.c                     |   4 +
 mm/Kconfig                        |  10 ++
 mm/page_alloc.c                   | 254 ++++++++++++++++++++++++++++++++++++++
 5 files changed, 432 insertions(+)
 create mode 100644 Documentation/malloc-watchdog.txt

diff --git a/Documentation/malloc-watchdog.txt 
b/Documentation/malloc-watchdog.txt
new file mode 100644
index 0000000..599d751
--- /dev/null
+++ b/Documentation/malloc-watchdog.txt
@@ -0,0 +1,139 @@
+=========================================
+Memory allocation watchdog kernel thread.
+=========================================
+
+
+- What is it?
+
+This kernel thread resembles khungtaskd kernel thread, but this kernel
+thread is for warning that memory allocation requests are stalling, in
+order to catch unexplained hangups/reboots caused by memory allocation
+stalls.
+
+
+- Why need to use it?
+
+Currently, when something went wrong inside memory allocation request,
+the system will stall with either 100% CPU usage (if memory allocating
+tasks are doing busy loop) or 0% CPU usage (if memory allocating tasks
+are waiting for file data to be flushed to storage).
+But /proc/sys/kernel/hung_task_warnings is not helpful because memory
+allocating tasks unlikely sleep in uninterruptible state for
+/proc/sys/kernel/hung_task_timeout_secs seconds.
+
+People are reporting hang up problems. But we are forcing people to use
+kernels without means to find out what was happening. The means are
+expected to work without knowledge to use trace points functionality,
+are expected to run without memory allocation, are expected to dump
+output without administrator's operation, are expected to work before
+watchdog timers reset the machine. Without this kernel thread, it is
+extremely hard to figure out that the system hung up due to memory
+allocation stalls.
+
+
+- How to configure it?
+
+Build kernels with CONFIG_MEMALLOC_WATCHDOG=y.
+
+Default scan interval is 10 seconds. Scan interval can be changed by passing
+integer value to kmallocwd boot parameter. For example, passing kmallocwd=30
+will emit first stall warnings in 30 seconds, and emit subsequent warnings in
+30 seconds.
+
+Even if you disable this kernel thread by passing kmallocwd=0 boot parameter,
+information about last memory allocation request is kept. That is, you will
+get some hint for understanding last-minute behavior of the kernel when you
+analyze vmcore (or memory snapshot of a virtualized machine).
+
+
+- How memory allocation stalls are reported?
+
+There are two types of memory allocation stalls, one is that we fail to
+solve OOM conditions after the OOM killer is invoked, the other is that
+we fail to solve OOM conditions before the OOM killer is invoked.
+
+The former case is that the OOM killer chose an OOM victim but the chosen
+victim is unable to make forward progress. Although the OOM victim
+receives TIF_MEMDIE by the OOM killer, TIF_MEMDIE helps only if the OOM
+victim was doing memory allocation. That is, if the OOM victim was
+blocked at unkillable locks (e.g. mutex_lock(&inode->i_mutex) or
+down_read(&mm->mmap_sem)), the system will hang up upon global OOM
+condition. This kernel thread will report such situation by printing
+
+  MemAlloc-Info: $X stalling task, $Y dying task, $Z victim task.
+
+line where $X > 0 and $Y > 0 and $Z > 0, followed by at most $X + $Y
+lines of
+
+  MemAlloc: $name($pid) $state_of_allocation $state_of_task
+
+where $name and $pid are comm name and pid of a task.
+
+$state_of_allocation is reported only when that task is stalling inside
+__alloc_pages_slowpath(), in seq=$seq gfp=$gfp order=$order delay=$delay
+format where $seq is the sequence number for allocation request, $gfp is
+the gfp flags used for that allocation request, $order is the order,
+delay is jiffies elapsed since entering into __alloc_pages_slowpath().
+
+$state_of_task is reported only when that task is dying, in combination
+of "uninterruptible" (where that task is in uninterruptible sleep,
+likely due to uninterruptible lock), "exiting" (where that task arrived
+at do_exit() function), "dying" (where that task has pending SIGKILL)
+and "victim" (where that task received TIF_MEMDIE, likely be only 1 task).
+
+The latter case has three possibilities. First possibility is simply
+overloaded (not a livelock but progress is too slow to wait). You can
+check for seq=$seq field for each reported process. If $seq is
+increasing over time, it is not a livelock. Second possibility is that
+at least one task is doing __GFP_FS || __GFP_NOFAIL memory allocation
+request but operation for reclaiming memory is not working as expected
+due to unknown reason (a livelock), which will not invoke the OOM
+killer. Third possibility is that all ongoing memory allocation
+requests are !__GFP_FS && !__GFP_NOFAIL, which does not invoke the OOM
+killer. This kernel thread will report such situation with $X > 0,
+$Y >= 0 and $Z = 0.
+
+
+- How the messages look like?
+
+An example of MemAlloc lines (grep of dmesg output) is shown below.
+You can use serial console and/or netconsole to save these messages
+when the system is stalling.
+
+  [   78.402510] MemAlloc-Info: 7 stalling task, 1 dying task, 1 victim task.
+  [   78.404691] MemAlloc: kthreadd(2) seq=6 gfp=0x27000c0 order=2 delay=9931 
uninterruptible
+  [   78.451201] MemAlloc: systemd-journal(478) seq=73 gfp=0x24201ca order=0 
delay=9842
+  [   78.497058] MemAlloc: irqbalance(747) seq=4 gfp=0x24201ca order=0 
delay=7454
+  [   78.542291] MemAlloc: crond(969) seq=18 gfp=0x24201ca order=0 delay=9842
+  [   78.586270] MemAlloc: vmtoolsd(1912) seq=64 gfp=0x24201ca order=0 
delay=9847
+  [   78.631516] MemAlloc: oom-write(3786) seq=25322 gfp=0x24280ca order=0 
delay=10000 uninterruptible
+  [   78.676193] MemAlloc: write(3787) seq=46308 gfp=0x2400240 order=0 
delay=9847 uninterruptible exiting
+  [   78.755351] MemAlloc: write(3788) uninterruptible dying victim
+  [   88.854456] MemAlloc-Info: 8 stalling task, 1 dying task, 1 victim task.
+  [   88.856533] MemAlloc: kthreadd(2) seq=6 gfp=0x27000c0 order=2 delay=20383 
uninterruptible
+  [   88.900375] MemAlloc: systemd-journal(478) seq=73 gfp=0x24201ca order=0 
delay=20294 uninterruptible
+  [   88.952300] MemAlloc: irqbalance(747) seq=4 gfp=0x24201ca order=0 
delay=17906 uninterruptible
+  [   88.997542] MemAlloc: crond(969) seq=18 gfp=0x24201ca order=0 delay=20294
+  [   89.041480] MemAlloc: vmtoolsd(1912) seq=64 gfp=0x24201ca order=0 
delay=20299
+  [   89.090096] MemAlloc: nmbd(3709) seq=9 gfp=0x24201ca order=0 delay=13855
+  [   89.142032] MemAlloc: oom-write(3786) seq=25322 gfp=0x24280ca order=0 
delay=20452
+  [   89.177999] MemAlloc: write(3787) seq=46308 gfp=0x2400240 order=0 
delay=20299 exiting
+  [   89.254554] MemAlloc: write(3788) uninterruptible dying victim
+  [   99.353664] MemAlloc-Info: 11 stalling task, 1 dying task, 1 victim task.
+  [   99.356044] MemAlloc: kthreadd(2) seq=6 gfp=0x27000c0 order=2 delay=30882 
uninterruptible
+  [   99.403609] MemAlloc: systemd-journal(478) seq=73 gfp=0x24201ca order=0 
delay=30793 uninterruptible
+  [   99.449469] MemAlloc: irqbalance(747) seq=4 gfp=0x24201ca order=0 
delay=28405
+  [   99.493474] MemAlloc: crond(969) seq=18 gfp=0x24201ca order=0 delay=30793 
uninterruptible
+  [   99.536027] MemAlloc: vmtoolsd(1912) seq=64 gfp=0x24201ca order=0 
delay=30798 uninterruptible
+  [   99.582630] MemAlloc: master(3682) seq=2 gfp=0x24201ca order=0 delay=10886
+  [   99.626574] MemAlloc: nmbd(3709) seq=9 gfp=0x24201ca order=0 delay=24354
+  [   99.669191] MemAlloc: smbd(3737) seq=2 gfp=0x24201ca order=0 delay=7130
+  [   99.714555] MemAlloc: smbd(3753) seq=2 gfp=0x24201ca order=0 delay=6616 
uninterruptible
+  [   99.758412] MemAlloc: oom-write(3786) seq=25322 gfp=0x24280ca order=0 
delay=30951
+  [   99.793156] MemAlloc: write(3787) seq=46308 gfp=0x2400240 order=0 
delay=30798 uninterruptible exiting
+  [   99.871842] MemAlloc: write(3788) uninterruptible dying victim
+
+You can check whether memory allocations are making forward progress.
+You can check where memory allocations are stalling using stack trace
+of reported task which follows each MemAlloc line. You can check memory
+information (SysRq-m) which follows end of MemAlloc lines.
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 7b76e39..039b04d 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1379,6 +1379,28 @@ struct tlbflush_unmap_batch {
        bool writable;
 };
 
+struct memalloc_info {
+       /* For locking and progress monitoring. */
+       unsigned int sequence;
+       /*
+        * 0: not doing __GFP_RECLAIM allocation.
+        * 1: doing non-recursive __GFP_RECLAIM allocation.
+        * 2: doing recursive __GFP_RECLAIM allocation.
+        */
+       u8 valid;
+       /*
+        * bit 0: Will be reported as OOM victim.
+        * bit 1: Will be reported as dying task.
+        * bit 2: Will be reported as stalling task.
+        */
+       u8 type;
+       /* Started time in jiffies as of valid == 1. */
+       unsigned long start;
+       /* Requested order and gfp flags as of valid == 1. */
+       unsigned int order;
+       gfp_t gfp;
+};
+
 struct task_struct {
        volatile long state;    /* -1 unrunnable, 0 runnable, >0 stopped */
        void *stack;
@@ -1822,6 +1844,9 @@ struct task_struct {
        unsigned long   task_state_change;
 #endif
        int pagefault_disabled;
+#ifdef CONFIG_MEMALLOC_WATCHDOG
+       struct memalloc_info memalloc;
+#endif
 /* CPU-specific state of this task */
        struct thread_struct thread;
 /*
diff --git a/kernel/fork.c b/kernel/fork.c
index 8cb287a..aed1c89 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1414,6 +1414,10 @@ static struct task_struct *copy_process(unsigned long 
clone_flags,
        p->sequential_io_avg    = 0;
 #endif
 
+#ifdef CONFIG_MEMALLOC_WATCHDOG
+       p->memalloc.sequence = 0;
+#endif
+
        /* Perform scheduler related setup. Assign this task to a CPU. */
        retval = sched_fork(clone_flags, p);
        if (retval)
diff --git a/mm/Kconfig b/mm/Kconfig
index 97a4e06..df05f85 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -668,3 +668,13 @@ config ZONE_DEVICE
 
 config FRAME_VECTOR
        bool
+
+config MEMALLOC_WATCHDOG
+       bool "Memory allocation stalling watchdog"
+       default n
+       help
+         This option emits warning messages and traces when memory
+         allocation requests are stalling, in order to catch unexplained
+         hangups/reboots caused by memory allocation stalls.
+
+         See Documentation/malloc-watchdog.txt for more information.
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index bac8842d..5ff89ae 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -62,6 +62,7 @@
 #include <linux/sched/rt.h>
 #include <linux/page_owner.h>
 #include <linux/kthread.h>
+#include <linux/console.h>
 
 #include <asm/sections.h>
 #include <asm/tlbflush.h>
@@ -3199,6 +3200,257 @@ got_pg:
        return page;
 }
 
+#ifdef CONFIG_MEMALLOC_WATCHDOG
+
+static unsigned long kmallocwd_timeout = 10 * HZ; /* Default scan interval. */
+static struct memalloc_info memalloc; /* Filled by is_stalling_task(). */
+
+/**
+ * is_stalling_task - Check and copy a task's memalloc variable.
+ *
+ * @task:   A task to check.
+ * @expire: Timeout in jiffies.
+ *
+ * Returns true if a task is stalling, false otherwise.
+ */
+static bool is_stalling_task(const struct task_struct *task,
+                            const unsigned long expire)
+{
+       const struct memalloc_info *m = &task->memalloc;
+
+       /*
+        * If start_memalloc_timer() is updating "struct memalloc_info" now,
+        * we can ignore it because timeout jiffies cannot be expired as soon
+        * as updating it completes.
+        */
+       if (!m->valid || (m->sequence & 1))
+               return false;
+       smp_rmb(); /* Block start_memalloc_timer(). */
+       memalloc.start = m->start;
+       memalloc.order = m->order;
+       memalloc.gfp = m->gfp;
+       smp_rmb(); /* Unblock start_memalloc_timer(). */
+       memalloc.sequence = m->sequence;
+       /*
+        * If start_memalloc_timer() started updating it while we read it,
+        * we can ignore it for the same reason.
+        */
+       if (!m->valid || (memalloc.sequence & 1))
+               return false;
+       /* This is a valid "struct memalloc_info". Check for timeout. */
+       return time_after_eq(expire, memalloc.start);
+}
+
+/*
+ * kmallocwd - A kernel thread for monitoring memory allocation stalls.
+ *
+ * @unused: Not used.
+ *
+ * This kernel thread does not terminate.
+ */
+static int kmallocwd(void *unused)
+{
+       char buf[128];
+       struct task_struct *g, *p;
+       unsigned long now;
+       unsigned long expire;
+       unsigned int sigkill_pending;
+       unsigned int memdie_pending;
+       unsigned int stalling_tasks;
+
+ restart:
+       /* Sleep until stalled tasks are found. */
+       while (1) {
+               /*
+                * If memory allocations are not stalling, the value of t after
+                * this for_each_process_thread() loop should remain close to
+                * kmallocwd_timeout. Also, we sleep for kmallocwd_timeout
+                * before retrying if memory allocations are stalling.
+                * Therefore, this while() loop won't waste too much CPU cycles
+                * due to sleeping for too short period.
+                */
+               long t = kmallocwd_timeout;
+               const unsigned long delta = t - jiffies;
+               /*
+                * We might see outdated values in "struct memalloc_info" here.
+                * We will recheck later using is_stalling_task().
+                */
+               preempt_disable();
+               rcu_read_lock();
+               for_each_process_thread(g, p) {
+                       if (likely(!p->memalloc.valid))
+                               continue;
+                       t = min_t(long, t, p->memalloc.start + delta);
+                       if (unlikely(t <= 0))
+                               goto stalling;
+               }
+               rcu_read_unlock();
+               preempt_enable();
+               schedule_timeout_interruptible(t);
+       }
+ stalling:
+       rcu_read_unlock();
+       preempt_enable();
+       cond_resched();
+       now = jiffies;
+       /*
+        * Report tasks that stalled for more than half of timeout duration
+        * because such tasks might be correlated with tasks that already
+        * stalled for full timeout duration.
+        */
+       expire = now - kmallocwd_timeout / 2;
+       /* Count stalling tasks, dying and victim tasks. */
+       sigkill_pending = 0;
+       memdie_pending = 0;
+       stalling_tasks = 0;
+       preempt_disable();
+       rcu_read_lock();
+       for_each_process_thread(g, p) {
+               u8 type = 0;
+
+               if (test_tsk_thread_flag(p, TIF_MEMDIE)) {
+                       type |= 1;
+                       memdie_pending++;
+               }
+               if (fatal_signal_pending(p)) {
+                       type |= 2;
+                       sigkill_pending++;
+               }
+               if (is_stalling_task(p, expire)) {
+                       type |= 4;
+                       stalling_tasks++;
+               }
+               p->memalloc.type = type;
+       }
+       rcu_read_unlock();
+       preempt_enable();
+       if (!stalling_tasks)
+               goto restart;
+       /* Report stalling tasks, dying and victim tasks. */
+       pr_warn("MemAlloc-Info: %u stalling task, %u dying task, %u victim 
task.\n",
+               stalling_tasks, sigkill_pending, memdie_pending);
+       cond_resched();
+       preempt_disable();
+       rcu_read_lock();
+ restart_report:
+       for_each_process_thread(g, p) {
+               bool can_cont;
+               u8 type = p->memalloc.type;
+
+               if (likely(!type))
+                       continue;
+               p->memalloc.type = 0;
+               buf[0] = '\0';
+               /*
+                * Recheck stalling tasks in case they called
+                * stop_memalloc_timer() meanwhile.
+                */
+               if (type & 4) {
+                       if (is_stalling_task(p, expire)) {
+                               snprintf(buf, sizeof(buf),
+                                        " seq=%u gfp=0x%x order=%u delay=%lu",
+                                        memalloc.sequence >> 1, memalloc.gfp,
+                                        memalloc.order, now - memalloc.start);
+                       } else {
+                               type &= ~4;
+                               if (!type)
+                                       continue;
+                       }
+               }
+               /*
+                * Victim tasks get pending SIGKILL removed before arriving at
+                * do_exit(). Therefore, print " exiting" instead for " dying".
+                */
+               pr_warn("MemAlloc: %s(%u)%s%s%s%s%s\n", p->comm, p->pid, buf,
+                       (p->state & TASK_UNINTERRUPTIBLE) ?
+                       " uninterruptible" : "",
+                       (p->flags & PF_EXITING) ? " exiting" : "",
+                       (type & 2) ? " dying" : "",
+                       (type & 1) ? " victim" : "");
+               sched_show_task(p);
+               debug_show_held_locks(p);
+               /*
+                * Since there could be thousands of tasks to report, we always
+                * sleep and try to flush printk() buffer after each report, in
+                * order to avoid RCU stalls and reduce possibility of messages
+                * being dropped by continuous printk() flood.
+                *
+                * Since not yet reported tasks have p->memalloc.type > 0, we
+                * can simply restart this loop in case "g" or "p" went away.
+                */
+               get_task_struct(g);
+               get_task_struct(p);
+               rcu_read_unlock();
+               preempt_enable();
+               schedule_timeout_interruptible(1);
+               console_lock();
+               console_unlock();
+               preempt_disable();
+               rcu_read_lock();
+               can_cont = pid_alive(g) && pid_alive(p);
+               put_task_struct(p);
+               put_task_struct(g);
+               if (!can_cont)
+                       goto restart_report;
+       }
+       rcu_read_unlock();
+       preempt_enable();
+       cond_resched();
+       /* Show memory information. (SysRq-m) */
+       show_mem(0);
+       /* Sleep until next timeout duration. */
+       schedule_timeout_interruptible(kmallocwd_timeout);
+       goto restart;
+       return 0; /* To suppress "no return statement" compiler warning. */
+}
+
+static int __init start_kmallocwd(void)
+{
+       if (kmallocwd_timeout)
+               kthread_run(kmallocwd, NULL, "kmallocwd");
+       return 0;
+}
+late_initcall(start_kmallocwd);
+
+static int __init kmallocwd_config(char *str)
+{
+       if (kstrtoul(str, 10, &kmallocwd_timeout) == 0)
+               kmallocwd_timeout = min(kmallocwd_timeout * HZ,
+                                       (unsigned long) LONG_MAX);
+       return 0;
+}
+__setup("kmallocwd=", kmallocwd_config);
+
+static void start_memalloc_timer(const gfp_t gfp_mask, const int order)
+{
+       struct memalloc_info *m = &current->memalloc;
+
+       /* We don't check for stalls for !__GFP_RECLAIM allocations. */
+       if (!(gfp_mask & __GFP_RECLAIM))
+               return;
+       /* We don't check for stalls for nested __GFP_RECLAIM allocations */
+       if (!m->valid) {
+               m->sequence++;
+               smp_wmb(); /* Block is_stalling_task(). */
+               m->start = jiffies;
+               m->order = order;
+               m->gfp = gfp_mask;
+               smp_wmb(); /* Unblock is_stalling_task(). */
+               m->sequence++;
+       }
+       m->valid++;
+}
+
+static void stop_memalloc_timer(const gfp_t gfp_mask)
+{
+       if (gfp_mask & __GFP_RECLAIM)
+               current->memalloc.valid--;
+}
+#else
+#define start_memalloc_timer(gfp_mask, order) do { } while (0)
+#define stop_memalloc_timer(gfp_mask) do { } while (0)
+#endif
+
 /*
  * This is the 'heart' of the zoned buddy allocator.
  */
@@ -3266,7 +3518,9 @@ retry_cpuset:
                alloc_mask = memalloc_noio_flags(gfp_mask);
                ac.spread_dirty_pages = false;
 
+               start_memalloc_timer(alloc_mask, order);
                page = __alloc_pages_slowpath(alloc_mask, order, &ac);
+               stop_memalloc_timer(alloc_mask);
        }
 
        if (kmemcheck_enabled && page)
-- 
1.8.3.1
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Reply via email to