On 04/15/2015 03:37 AM, Chris Metcalf wrote:

> Change the default behavior of watchdog so it only runs on the
> housekeeping cores when nohz_full is enabled at build and boot time.
> Allow modifying the set of cores the watchdog is currently running
> on with a new kernel.watchdog_cpumask sysctl.
> 
> If we allowed the watchdog to run on nohz_full cores, the timer
> interrupts and scheduler work would prevent the desired tickless
> operation on those cores.  But if we disable the watchdog globally,
> then the housekeeping cores can't benefit from the watchdog
> functionality.  So we allow disabling it only on some cores.
> See Documentation/lockup-watchdogs.txt for more information.
> 
> Acked-by: Don Zickus <dzic...@redhat.com>
> Signed-off-by: Chris Metcalf <cmetc...@ezchip.com>
> ---
> v8: use new semantics of smpboot_update_cpumask_percpu_thread() [Frederic]
>     improve documentation in "Documentation/" and in changelong [akpm]


s/changelong/changelog/

> 
> v7: use cpumask field instead of valid_cpu() callback
> 
> v6: use alloc_cpumask_var() [Sasha Levin]
>     switch from watchdog_exclude to watchdog_cpumask [Frederic]
>     simplify the smp_hotplug_thread API to watchdog [Frederic]
>     add Don's Acked-by
> 
>  Documentation/lockup-watchdogs.txt | 18 ++++++++++++++
>  Documentation/sysctl/kernel.txt    | 15 ++++++++++++
>  include/linux/nmi.h                |  3 +++
>  kernel/sysctl.c                    |  7 ++++++
>  kernel/watchdog.c                  | 49 
> ++++++++++++++++++++++++++++++++++++++
>  5 files changed, 92 insertions(+)
> 
> diff --git a/Documentation/lockup-watchdogs.txt 
> b/Documentation/lockup-watchdogs.txt
> index ab0baa692c13..22dd6af2e4bd 100644
> --- a/Documentation/lockup-watchdogs.txt
> +++ b/Documentation/lockup-watchdogs.txt
> @@ -61,3 +61,21 @@ As explained above, a kernel knob is provided that allows
>  administrators to configure the period of the hrtimer and the perf
>  event. The right value for a particular environment is a trade-off
>  between fast response to lockups and detection overhead.
> +
> +By default, the watchdog runs on all online cores.  However, on a
> +kernel configured with NO_HZ_FULL, by default the watchdog runs only
> +on the housekeeping cores, not the cores specified in the "nohz_full"
> +boot argument.  If we allowed the watchdog to run by default on
> +the "nohz_full" cores, we would have to run timer ticks to activate
> +the scheduler, which would prevent the "nohz_full" functionality
> +from protecting the user code on those cores from the kernel.
> +Of course, disabling it by default on the nohz_full cores means that
> +when those cores do enter the kernel, by default we will not be
> +able to detect if they lock up.  However, allowing the watchdog
> +to continue to run on the housekeeping (non-tickless) cores means
> +that we will continue to detect lockups properly on those cores.
> +
> +In either case, the set of cores excluded from running the watchdog
> +may be adjusted via the kernel.watchdog_cpumask sysctl.  For
> +nohz_full cores, this may be useful for debugging a case where the
> +kernel seems to be hanging on the nohz_full cores.
> diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt
> index c831001c45f1..f1697858d71c 100644
> --- a/Documentation/sysctl/kernel.txt
> +++ b/Documentation/sysctl/kernel.txt
> @@ -923,6 +923,21 @@ and nmi_watchdog.
>  
>  ==============================================================
>  
> +watchdog_cpumask:
> +
> +This value can be used to control on which cpus the watchdog may run.
> +The default cpumask is all possible cores, but if NO_HZ_FULL is
> +enabled in the kernel config, and cores are specified with the
> +nohz_full= boot argument, those cores are excluded by default.
> +Offline cores can be included in this mask, and if the core is later
> +brought online, the watchdog will be started based on the mask value.
> +
> +Typically this value would only be touched in the nohz_full case
> +to re-enable cores that by default were not running the watchdog,
> +if a kernel lockup was suspected on those cores.
> +
> +==============================================================
> +
>  watchdog_thresh:
>  
>  This value can be used to control the frequency of hrtimer and NMI
> diff --git a/include/linux/nmi.h b/include/linux/nmi.h
> index 3d46fb4708e0..f94da0e65dea 100644
> --- a/include/linux/nmi.h
> +++ b/include/linux/nmi.h
> @@ -67,6 +67,7 @@ extern int nmi_watchdog_enabled;
>  extern int soft_watchdog_enabled;
>  extern int watchdog_user_enabled;
>  extern int watchdog_thresh;
> +extern unsigned long *watchdog_cpumask_bits;
>  extern int sysctl_softlockup_all_cpu_backtrace;
>  struct ctl_table;
>  extern int proc_watchdog(struct ctl_table *, int ,
> @@ -77,6 +78,8 @@ extern int proc_soft_watchdog(struct ctl_table *, int ,
>                             void __user *, size_t *, loff_t *);
>  extern int proc_watchdog_thresh(struct ctl_table *, int ,
>                               void __user *, size_t *, loff_t *);
> +extern int proc_watchdog_cpumask(struct ctl_table *, int,
> +                              void __user *, size_t *, loff_t *);
>  #endif
>  
>  #ifdef CONFIG_HAVE_ACPI_APEI_NMI
> diff --git a/kernel/sysctl.c b/kernel/sysctl.c
> index 2082b1a88fb9..699571a74e3b 100644
> --- a/kernel/sysctl.c
> +++ b/kernel/sysctl.c
> @@ -881,6 +881,13 @@ static struct ctl_table kern_table[] = {
>               .extra2         = &one,
>       },
>       {
> +             .procname       = "watchdog_cpumask",
> +             .data           = &watchdog_cpumask_bits,
> +             .maxlen         = NR_CPUS,
> +             .mode           = 0644,
> +             .proc_handler   = proc_watchdog_cpumask,
> +     },
> +     {
>               .procname       = "softlockup_panic",
>               .data           = &softlockup_panic,
>               .maxlen         = sizeof(int),
> diff --git a/kernel/watchdog.c b/kernel/watchdog.c
> index 2316f50b07a4..5bd80a212486 100644
> --- a/kernel/watchdog.c
> +++ b/kernel/watchdog.c
> @@ -19,6 +19,7 @@
>  #include <linux/sysctl.h>
>  #include <linux/smpboot.h>
>  #include <linux/sched/rt.h>
> +#include <linux/tick.h>
>  
>  #include <asm/irq_regs.h>
>  #include <linux/kvm_para.h>
> @@ -56,6 +57,9 @@ int __read_mostly sysctl_softlockup_all_cpu_backtrace;
>  #else
>  #define sysctl_softlockup_all_cpu_backtrace 0
>  #endif
> +static cpumask_var_t watchdog_cpumask_for_smpboot;
> +static cpumask_var_t watchdog_cpumask;
> +unsigned long *watchdog_cpumask_bits;
>  
>  static int __read_mostly watchdog_running;
>  static u64 __read_mostly sample_period;
> @@ -694,6 +698,7 @@ static int watchdog_enable_all_cpus(void)
>       int err = 0;
>  
>       if (!watchdog_running) {
> +             cpumask_copy(watchdog_cpumask_for_smpboot, watchdog_cpumask);
>               err = smpboot_register_percpu_thread(&watchdog_threads);
>               if (err)
>                       pr_err("Failed to create watchdog threads, disabled\n");
> @@ -869,12 +874,56 @@ out:
>       mutex_unlock(&watchdog_proc_mutex);
>       return err;
>  }
> +
> +/*
> + * The cpumask is the mask of possible cpus that the watchdog can run
> + * on, not the mask of cpus it is actually running on.  This allows the
> + * user to specify a mask that will include cpus that have not yet
> + * been brought online, if desired.
> + */
> +int proc_watchdog_cpumask(struct ctl_table *table, int write,
> +                       void __user *buffer, size_t *lenp, loff_t *ppos)
> +{
> +     int err;
> +
> +     mutex_lock(&watchdog_proc_mutex);
> +     err = proc_do_large_bitmap(table, write, buffer, lenp, ppos);
> +     if (!err && write) {
> +             /* Remove impossible cpus to keep sysctl output cleaner. */
> +             cpumask_and(watchdog_cpumask, watchdog_cpumask,
> +                         cpu_possible_mask);
> +
> +             if (watchdog_enabled && watchdog_thresh)


If the new mask is the same as the current one, then there is no need to go on?
Perhaps compare with cpumask_equal(watchdog_cpumask, watchdog_cpumask_for_smpboot) or something similar?


> +                     smpboot_update_cpumask_percpu_thread(&watchdog_threads,
> +                                                          watchdog_cpumask);
> +     }
> +     mutex_unlock(&watchdog_proc_mutex);
> +     return err;
> +}
> +
>  #endif /* CONFIG_SYSCTL */
>  
>  void __init lockup_detector_init(void)
>  {
>       set_sample_period();
>  
> +     /* One cpumask is allocated for smpboot to own. */
> +     alloc_cpumask_var(&watchdog_cpumask_for_smpboot, GFP_KERNEL);


alloc_cpumask_var() could fail here — should its return value be checked?

> +     watchdog_threads.cpumask = watchdog_cpumask_for_smpboot;
> +
> +     /* Another cpumask is allocated for /proc to use. */
> +     alloc_cpumask_var(&watchdog_cpumask, GFP_KERNEL);


ditto

thanks
chai wen

> +     watchdog_cpumask_bits = cpumask_bits(watchdog_cpumask);
> +
> +#ifdef CONFIG_NO_HZ_FULL
> +     if (!cpumask_empty(tick_nohz_full_mask))
> +             pr_info("Disabling watchdog on nohz_full cores by default\n");
> +     cpumask_andnot(watchdog_cpumask, cpu_possible_mask,
> +                    tick_nohz_full_mask);
> +#else
> +     cpumask_copy(watchdog_cpumask, cpu_possible_mask);
> +#endif
> +
>       if (watchdog_enabled)
>               watchdog_enable_all_cpus();
>  }



--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Reply via email to