On Mon, May 13, 2019 at 08:02:11PM +0900, Tetsuo Handa wrote:
> syzbot's second top report is "no output from test machine" where the
> userspace process failed to spawn a new test process for 300 seconds
> for some reason. One of reasons which can result in this report is that
> an already spawned test process was unable to terminate (e.g. trapped at
> an unkillable retry loop due to some bug) after SIGKILL was sent to that
> process. Therefore, reporting when a thread is failing to terminate
> despite a fatal signal is pending would give us more useful information.
> 
> This version shares existing sysctl settings (e.g. check interval,
> timeout, whether to panic) used for detecting TASK_UNINTERRUPTIBLE
> threads, for I don't know whether people want to use a new kernel
> config option and different sysctl settings for monitoring killed
> threads.
> 
> Signed-off-by: Tetsuo Handa <penguin-ker...@i-love.sakura.ne.jp>
> Cc: Dmitry Vyukov <dvyu...@google.com>

Looks good to me.

Acked-by: Paul E. McKenney <paul...@linux.ibm.com>

A few inconsequential comments below.

> ---
>  include/linux/sched.h |  1 +
>  kernel/hung_task.c    | 44 ++++++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 45 insertions(+)
> 
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index a2cd1585..d42bdd7 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -850,6 +850,7 @@ struct task_struct {
>  #ifdef CONFIG_DETECT_HUNG_TASK
>       unsigned long                   last_switch_count;
>       unsigned long                   last_switch_time;
> +     unsigned long                   killed_time;
>  #endif
>       /* Filesystem information: */
>       struct fs_struct                *fs;
> diff --git a/kernel/hung_task.c b/kernel/hung_task.c
> index f108a95..34e7b84 100644
> --- a/kernel/hung_task.c
> +++ b/kernel/hung_task.c
> @@ -141,6 +141,47 @@ static void check_hung_task(struct task_struct *t, 
> unsigned long timeout)
>       touch_nmi_watchdog();
>  }
>  
> +static void check_killed_task(struct task_struct *t, unsigned long timeout)
> +{
> +     unsigned long stamp = t->killed_time;
> +
> +     /*
> +      * Ensure the task is not frozen.
> +      * Also, skip vfork and any other user process that freezer should skip.
> +      */
> +     if (unlikely(t->flags & (PF_FROZEN | PF_FREEZER_SKIP)))
> +             return;
> +     /*
> +      * Skip threads which are already inside do_exit(), for exit_mm() etc.
> +      * might take many seconds.
> +      */
> +     if (t->flags & PF_EXITING)
> +             return;
> +     if (!stamp) {
> +             stamp = jiffies;
> +             if (!stamp)
> +                     stamp++;

Cute trick to avoid issues with jiffy overflow on 32-bit systems.  ;-)

> +             t->killed_time = stamp;
> +             return;
> +     }
> +     if (time_is_after_jiffies(stamp + timeout * HZ))

And if I understand correctly, timeout of zero disables everything, so
we don't get the backwards false-positive comparison above.

> +             return;
> +     trace_sched_process_hang(t);
> +     if (sysctl_hung_task_panic) {
> +             console_verbose();
> +             hung_task_call_panic = true;
> +     }
> +     /*
> +      * This thread failed to terminate for more than
> +      * sysctl_hung_task_timeout_secs seconds, complain:
> +      */
> +     pr_err("INFO: task %s:%d can't die for more than %ld seconds.\n",
> +            t->comm, t->pid, (jiffies - stamp) / HZ);
> +     sched_show_task(t);
> +     hung_task_show_lock = true;
> +     touch_nmi_watchdog();
> +}
> +
>  /*
>   * To avoid extending the RCU grace period for an unbounded amount of time,
>   * periodically exit the critical section and enter a new one.
> @@ -192,6 +233,9 @@ static void check_hung_uninterruptible_tasks(unsigned 
> long timeout)
>                               goto unlock;
>                       last_break = jiffies;
>               }
> +             /* Check threads which are about to terminate. */
> +             if (unlikely(fatal_signal_pending(t)))
> +                     check_killed_task(t, timeout);
>               /* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */
>               if (t->state == TASK_UNINTERRUPTIBLE)
>                       check_hung_task(t, timeout);
> -- 
> 1.8.3.1
> 

Reply via email to