Hi Li,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on linus/master]
[cannot apply to v5.4-rc2 next-20191011]
[if your patch is applied to the wrong git tree, please drop us a note to help
improve the system. BTW, we also suggest using the '--base' option to specify the
base tree in git format-patch; please see https://stackoverflow.com/a/37406982]

url:    https://github.com/0day-ci/linux/commits/Li-RongQing/watchdog-hardlockup-reassign-last_timestamp-when-enable-nmi-event/20191014-022936
config: i386-randconfig-g004-201941 (attached as .config)
compiler: gcc-7 (Debian 7.4.0-13) 7.4.0
reproduce:
        # save the attached .config to linux build tree
        make ARCH=i386 

If you fix the issue, kindly add the following tag
Reported-by: kbuild test robot <l...@intel.com>

All error/warnings (new ones prefixed by >>):

   In file included from include/asm-generic/percpu.h:7:0,
                    from arch/x86/include/asm/percpu.h:556,
                    from arch/x86/include/asm/current.h:6,
                    from include/linux/sched.h:12,
                    from include/linux/nmi.h:8,
                    from kernel/watchdog_hld.c:15:
   kernel/watchdog_hld.c: In function 'hardlockup_detector_perf_enable':
>> kernel/watchdog_hld.c:201:17: error: 'last_timestamp' undeclared (first use in this function); did you mean 'statx_timestamp'?
     this_cpu_write(last_timestamp, now);
                    ^
   include/linux/percpu-defs.h:220:47: note: in definition of macro '__verify_pcpu_ptr'
     const void __percpu *__vpp_verify = (typeof((ptr) + 0))NULL; \
                                                  ^~~
>> include/linux/percpu-defs.h:509:34: note: in expansion of macro '__pcpu_size_call'
    #define this_cpu_write(pcp, val) __pcpu_size_call(this_cpu_write_, pcp, val)
                                     ^~~~~~~~~~~~~~~~
>> kernel/watchdog_hld.c:201:2: note: in expansion of macro 'this_cpu_write'
     this_cpu_write(last_timestamp, now);
     ^~~~~~~~~~~~~~
   kernel/watchdog_hld.c:201:17: note: each undeclared identifier is reported only once for each function it appears in
     this_cpu_write(last_timestamp, now);
                    ^
   include/linux/percpu-defs.h:220:47: note: in definition of macro '__verify_pcpu_ptr'
     const void __percpu *__vpp_verify = (typeof((ptr) + 0))NULL; \
                                                  ^~~
>> include/linux/percpu-defs.h:509:34: note: in expansion of macro '__pcpu_size_call'
    #define this_cpu_write(pcp, val) __pcpu_size_call(this_cpu_write_, pcp, val)
                                     ^~~~~~~~~~~~~~~~
>> kernel/watchdog_hld.c:201:2: note: in expansion of macro 'this_cpu_write'
     this_cpu_write(last_timestamp, now);
     ^~~~~~~~~~~~~~
   kernel/watchdog_hld.c: In function 'hardlockup_detector_perf_restart':
   kernel/watchdog_hld.c:283:12: error: 'last_timestamp' undeclared (first use in this function); did you mean 'statx_timestamp'?
       per_cpu(last_timestamp, cpu) = now;
               ^
   include/linux/percpu-defs.h:220:47: note: in definition of macro '__verify_pcpu_ptr'
     const void __percpu *__vpp_verify = (typeof((ptr) + 0))NULL; \
                                                  ^~~
   include/linux/percpu-defs.h:270:29: note: in expansion of macro 'per_cpu_ptr'
    #define per_cpu(var, cpu) (*per_cpu_ptr(&(var), cpu))
                                ^~~~~~~~~~~
>> kernel/watchdog_hld.c:283:4: note: in expansion of macro 'per_cpu'
       per_cpu(last_timestamp, cpu) = now;
       ^~~~~~~

vim +201 kernel/watchdog_hld.c

    14  
  > 15  #include <linux/nmi.h>
    16  #include <linux/atomic.h>
    17  #include <linux/module.h>
    18  #include <linux/sched/debug.h>
    19  
    20  #include <asm/irq_regs.h>
    21  #include <linux/perf_event.h>
    22  
    23  static DEFINE_PER_CPU(bool, hard_watchdog_warn);
    24  static DEFINE_PER_CPU(bool, watchdog_nmi_touch);
    25  static DEFINE_PER_CPU(struct perf_event *, watchdog_ev);
    26  static DEFINE_PER_CPU(struct perf_event *, dead_event);
    27  static struct cpumask dead_events_mask;
    28  
    29  static unsigned long hardlockup_allcpu_dumped;
    30  static atomic_t watchdog_cpus = ATOMIC_INIT(0);
    31  
    32  notrace void arch_touch_nmi_watchdog(void)
    33  {
    34          /*
    35           * Using __raw here because some code paths have
    36           * preemption enabled.  If preemption is enabled
    37           * then interrupts should be enabled too, in which
    38           * case we shouldn't have to worry about the watchdog
    39           * going off.
    40           */
    41          raw_cpu_write(watchdog_nmi_touch, true);
    42  }
    43  EXPORT_SYMBOL(arch_touch_nmi_watchdog);
    44  
    45  #ifdef CONFIG_HARDLOCKUP_CHECK_TIMESTAMP
    46  static DEFINE_PER_CPU(ktime_t, last_timestamp);
    47  static DEFINE_PER_CPU(unsigned int, nmi_rearmed);
    48  static ktime_t watchdog_hrtimer_sample_threshold __read_mostly;
    49  
    50  void watchdog_update_hrtimer_threshold(u64 period)
    51  {
    52          /*
    53           * The hrtimer runs with a period of (watchdog_threshold * 2) / 5
    54           *
    55           * So it runs effectively with 2.5 times the rate of the NMI
    56           * watchdog. That means the hrtimer should fire 2-3 times before
    57           * the NMI watchdog expires. The NMI watchdog on x86 is based on
    58           * unhalted CPU cycles, so if Turbo-Mode is enabled the CPU cycles
    59           * might run way faster than expected and the NMI fires in a
    60           * smaller period than the one deduced from the nominal CPU
    61           * frequency. Depending on the Turbo-Mode factor this might be fast
    62           * enough to get the NMI period smaller than the hrtimer watchdog
    63           * period and trigger false positives.
    64           *
    65           * The sample threshold is used to check in the NMI handler whether
    66           * the minimum time between two NMI samples has elapsed. That
    67           * prevents false positives.
    68           *
    69           * Set this to 4/5 of the actual watchdog threshold period so the
    70           * hrtimer is guaranteed to fire at least once within the real
    71           * watchdog threshold.
    72           */
    73          watchdog_hrtimer_sample_threshold = period * 2;
    74  }
    75  
    76  static bool watchdog_check_timestamp(void)
    77  {
    78          ktime_t delta, now = ktime_get_mono_fast_ns();
    79  
    80          delta = now - __this_cpu_read(last_timestamp);
    81          if (delta < watchdog_hrtimer_sample_threshold) {
    82                  /*
    83                   * If ktime is jiffies based, a stalled timer would prevent
    84                   * jiffies from being incremented and the filter would look
    85                   * at a stale timestamp and never trigger.
    86                   */
    87                  if (__this_cpu_inc_return(nmi_rearmed) < 10)
    88                          return false;
    89          }
    90          __this_cpu_write(nmi_rearmed, 0);
    91          __this_cpu_write(last_timestamp, now);
    92          return true;
    93  }
    94  #else
    95  static inline bool watchdog_check_timestamp(void)
    96  {
    97          return true;
    98  }
    99  #endif
   100  
   101  static struct perf_event_attr wd_hw_attr = {
   102          .type           = PERF_TYPE_HARDWARE,
   103          .config         = PERF_COUNT_HW_CPU_CYCLES,
   104          .size           = sizeof(struct perf_event_attr),
   105          .pinned         = 1,
   106          .disabled       = 1,
   107  };
   108  
   109  /* Callback function for perf event subsystem */
   110  static void watchdog_overflow_callback(struct perf_event *event,
   111                                         struct perf_sample_data *data,
   112                                         struct pt_regs *regs)
   113  {
   114          /* Ensure the watchdog never gets throttled */
   115          event->hw.interrupts = 0;
   116  
   117          if (__this_cpu_read(watchdog_nmi_touch) == true) {
   118                  __this_cpu_write(watchdog_nmi_touch, false);
   119                  return;
   120          }
   121  
   122          if (!watchdog_check_timestamp())
   123                  return;
   124  
   125          /* check for a hardlockup
   126           * This is done by making sure our timer interrupt
   127           * is incrementing.  The timer interrupt should have
   128           * fired multiple times before we overflow'd.  If it hasn't
   129           * then this is a good indication the cpu is stuck
   130           */
   131          if (is_hardlockup()) {
   132                  int this_cpu = smp_processor_id();
   133  
   134                  /* only print hardlockups once */
   135                  if (__this_cpu_read(hard_watchdog_warn) == true)
   136                          return;
   137  
   138                  pr_emerg("Watchdog detected hard LOCKUP on cpu %d\n",
   139                           this_cpu);
   140                  print_modules();
   141                  print_irqtrace_events(current);
   142                  if (regs)
   143                          show_regs(regs);
   144                  else
   145                          dump_stack();
   146  
   147                  /*
   148                   * Perform all-CPU dump only once to avoid multiple hardlockups
   149                   * generating interleaving traces
   150                   */
   151                  if (sysctl_hardlockup_all_cpu_backtrace &&
   152                                  !test_and_set_bit(0, &hardlockup_allcpu_dumped))
   153                          trigger_allbutself_cpu_backtrace();
   154  
   155                  if (hardlockup_panic)
   156                          nmi_panic(regs, "Hard LOCKUP");
   157  
   158                  __this_cpu_write(hard_watchdog_warn, true);
   159                  return;
   160          }
   161  
   162          __this_cpu_write(hard_watchdog_warn, false);
   163          return;
   164  }
   165  
   166  static int hardlockup_detector_event_create(void)
   167  {
   168          unsigned int cpu = smp_processor_id();
   169          struct perf_event_attr *wd_attr;
   170          struct perf_event *evt;
   171  
   172          wd_attr = &wd_hw_attr;
   173          wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh);
   174  
   175          /* Try to register using hardware perf events */
   176          evt = perf_event_create_kernel_counter(wd_attr, cpu, NULL,
   177                                                 watchdog_overflow_callback, NULL);
   178          if (IS_ERR(evt)) {
   179                  pr_debug("Perf event create on CPU %d failed with %ld\n", cpu,
   180                           PTR_ERR(evt));
   181                  return PTR_ERR(evt);
   182          }
   183          this_cpu_write(watchdog_ev, evt);
   184          return 0;
   185  }
   186  
   187  /**
   188   * hardlockup_detector_perf_enable - Enable the local event
   189   */
   190  void hardlockup_detector_perf_enable(void)
   191  {
   192          ktime_t now = ktime_get_mono_fast_ns();
   193  
   194          if (hardlockup_detector_event_create())
   195                  return;
   196  
   197          /* use original value for check */
   198          if (!atomic_fetch_inc(&watchdog_cpus))
   199                  pr_info("Enabled. Permanently consumes one hw-PMU counter.\n");
   200  
 > 201          this_cpu_write(last_timestamp, now);
   202          perf_event_enable(this_cpu_read(watchdog_ev));
   203  }
   204  
   205  /**
   206   * hardlockup_detector_perf_disable - Disable the local event
   207   */
   208  void hardlockup_detector_perf_disable(void)
   209  {
   210          struct perf_event *event = this_cpu_read(watchdog_ev);
   211  
   212          if (event) {
   213                  perf_event_disable(event);
   214                  this_cpu_write(watchdog_ev, NULL);
   215                  this_cpu_write(dead_event, event);
   216                  cpumask_set_cpu(smp_processor_id(), &dead_events_mask);
   217                  atomic_dec(&watchdog_cpus);
   218          }
   219  }
   220  
   221  /**
   222   * hardlockup_detector_perf_cleanup - Cleanup disabled events and destroy them
   223   *
   224   * Called from lockup_detector_cleanup(). Serialized by the caller.
   225   */
   226  void hardlockup_detector_perf_cleanup(void)
   227  {
   228          int cpu;
   229  
   230          for_each_cpu(cpu, &dead_events_mask) {
   231                  struct perf_event *event = per_cpu(dead_event, cpu);
   232  
   233                  /*
   234                   * Required because for_each_cpu() reports unconditionally
   235                   * CPU0 as set on UP kernels. Sigh.
   236                   */
   237                  if (event)
   238                          perf_event_release_kernel(event);
   239                  per_cpu(dead_event, cpu) = NULL;
   240          }
   241          cpumask_clear(&dead_events_mask);
   242  }
   243  
   244  /**
   245   * hardlockup_detector_perf_stop - Globally stop watchdog events
   246   *
   247   * Special interface for x86 to handle the perf HT bug.
   248   */
   249  void __init hardlockup_detector_perf_stop(void)
   250  {
   251          int cpu;
   252  
   253          lockdep_assert_cpus_held();
   254  
   255          for_each_online_cpu(cpu) {
   256                  struct perf_event *event = per_cpu(watchdog_ev, cpu);
   257  
   258                  if (event)
   259                          perf_event_disable(event);
   260          }
   261  }
   262  
   263  /**
   264   * hardlockup_detector_perf_restart - Globally restart watchdog events
   265   *
   266   * Special interface for x86 to handle the perf HT bug.
   267   */
   268  void __init hardlockup_detector_perf_restart(void)
   269  {
   270          int cpu;
   271  
   272          lockdep_assert_cpus_held();
   273  
   274          if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED))
   275                  return;
   276  
   277          for_each_online_cpu(cpu) {
   278                  struct perf_event *event = per_cpu(watchdog_ev, cpu);
   279  
   280                  if (event) {
   281                          ktime_t now = ktime_get_mono_fast_ns();
   282  
 > 283                          per_cpu(last_timestamp, cpu) = now;
   284                          perf_event_enable(event);
   285                  }
   286          }
   287  }
   288  
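
The two failing lines sit outside the CONFIG_HARDLOCKUP_CHECK_TIMESTAMP block: last_timestamp is only defined at line 46 when that option is set, while the new writes at lines 201 and 283 are compiled unconditionally, which is what this i386 randconfig (built without CONFIG_HARDLOCKUP_CHECK_TIMESTAMP) trips over. Below is only a minimal sketch of one way to keep every access inside the guarded block; the helper name watchdog_set_timestamp() is invented here for illustration and this is not a reviewed fix:

#ifdef CONFIG_HARDLOCKUP_CHECK_TIMESTAMP
/* Hypothetical helper: confines last_timestamp to the #ifdef block. */
static void watchdog_set_timestamp(int cpu)
{
	per_cpu(last_timestamp, cpu) = ktime_get_mono_fast_ns();
}
#else
static inline void watchdog_set_timestamp(int cpu) { }
#endif

With such a helper, hardlockup_detector_perf_enable() could call watchdog_set_timestamp(smp_processor_id()) and hardlockup_detector_perf_restart() could call watchdog_set_timestamp(cpu), so configurations without CONFIG_HARDLOCKUP_CHECK_TIMESTAMP still build.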

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation

Attachment: .config.gz
Description: application/gzip
