On Wed, Dec 14, 2016 at 08:32:39PM +0100, Peter Zijlstra wrote:
> On Wed, Dec 14, 2016 at 07:16:36PM +0100, Jiri Olsa wrote:
> 
> > > > +++ b/arch/x86/events/intel/ds.c
> > > > @@ -1389,9 +1389,13 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
> > > >                         continue;
> > > >  
> > > >                 /* log dropped samples number */
> > > > -               if (error[bit])
> > > > +               if (error[bit]) {
> > > >                         perf_log_lost_samples(event, error[bit]);
> > > >  
> > > > +                       if (perf_event_account_interrupt(event, 1))
> > > 
> > > Seems a bit daft to expose the .throttle argument, since that would be
> > > the only point of calling this.
> > 
> > there's also the other caller from __perf_event_overflow
> 
> See the below patchlet ;-)
> 
> > > > +static int __perf_event_overflow(struct perf_event *event,
> > > > +                                  int throttle, struct perf_sample_data *data,
> > > > +                                  struct pt_regs *regs)
> > > > +{
> > > > +       int events = atomic_read(&event->event_limit);
> > > > +       struct hw_perf_event *hwc = &event->hw;
> > > > +       int ret = 0;
> > > > +
> > > > +       /*
> > > > +        * Non-sampling counters might still use the PMI to fold short
> > > > +        * hardware counters, ignore those.
> > > > +        */
> > > > +       if (unlikely(!is_sampling_event(event)))
> > > > +               return 0;
> > > > +
> > > > +       ret = perf_event_account_interrupt(event, throttle);
> > > > +
> > > >         if (event->attr.freq) {
> > > >                 u64 now = perf_clock();
> > > >                 s64 delta = now - hwc->freq_time_stamp;
> > > 
> > > Arguably, everything in __perf_event_overflow() except for calling of
> > > ->overflow_handler() should be done I think.
> > 
> > well, I was wondering about that period adjustment bit
> > 
> > but I wasn't sure about those pending_kill/pending_wakeup bits,
> > they make sense to me only if we have some data to deliver
> 
> Hmm, maybe. Please add a comment, that way we can at least rediscover we
> thought about this.
> 

new version with full changelog

jirka


---
It's possible to set up PEBS events so that they produce only errors
and no data at all, like on SNB-X (model 45) and IVB-EP (model 62)
via 2 perf commands running simultaneously:

    taskset -c 1 ./perf record -c 4 -e branches:pp -j any -C 10

This leads to a soft lockup, because the error path of
intel_pmu_drain_pebs_nhm() does not account event->hw.interrupts
for error-only PEBS interrupts, so the event is never stopped
when it gets over the max_samples_per_tick limit.

  NMI watchdog: BUG: soft lockup - CPU#22 stuck for 22s! [perf_fuzzer:5816]
  ...
  task: ffff880273148000 task.stack: ffffc90002d58000
  RIP: 0010:[<ffffffff81159232>]  [<ffffffff81159232>] smp_call_function_single+0xe2/0x140
  RSP: 0018:ffffc90002d5bd60  EFLAGS: 00000202
  ...
  Call Trace:
   ? trace_hardirqs_on_caller+0xf5/0x1b0
   ? perf_cgroup_attach+0x70/0x70
   perf_install_in_context+0x199/0x1b0
   ? ctx_resched+0x90/0x90
   SYSC_perf_event_open+0x641/0xf90
   SyS_perf_event_open+0x9/0x10
   do_syscall_64+0x6c/0x1f0
   entry_SYSCALL64_slow_path+0x25/0x25
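
The throttling decision that this error-only path never reaches boils
down to counting PMIs per tick and stopping the event once the count
crosses the max_samples_per_tick limit. A stand-alone sketch of that
decision, with stand-in types and constants rather than the real
perf_event/hw_perf_event ones:

  #include <stdbool.h>

  /* Stand-ins for the kernel's per-event state and limits. */
  #define SKETCH_MAX_SAMPLES_PER_TICK   100UL    /* ~ perf_event_max_sample_rate / HZ */
  #define SKETCH_MAX_INTERRUPTS         (~0UL)   /* marks the event as throttled */

  struct sketch_hw_event {
          unsigned long interrupts;     /* PMIs accounted in the current tick */
  };

  /*
   * Account one PMI and return true when the caller should stop
   * (throttle) the event.  The real code also resets ->interrupts when
   * a new perf_throttled_seq tick starts and emits PERF_RECORD_THROTTLE.
   */
  static bool sketch_account_interrupt(struct sketch_hw_event *hwc, bool throttle)
  {
          hwc->interrupts++;
          if (throttle && hwc->interrupts >= SKETCH_MAX_SAMPLES_PER_TICK) {
                  hwc->interrupts = SKETCH_MAX_INTERRUPTS;
                  return true;    /* e.g. drain_pebs then does x86_pmu_stop() */
          }
          return false;
  }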

Add perf_event_account_interrupt(), which does the event's interrupt
and frequency checks, and call it from drain_pebs's error path.

Keep the pending_kill and pending_wakeup logic only in the
__perf_event_overflow() path, because it makes sense only when
there is some data to deliver.
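
Condensed, the resulting split looks like this; a stand-in sketch with
dummy types, not the real perf_event code (the actual changes are in
the patch below):

  /* Stand-in for the per-event state the real code keeps in perf_event. */
  struct sketch_event {
          unsigned long interrupts;
          int pending_wakeup;
  };

  /* Shared part: interrupt accounting (plus frequency adjustment in the kernel). */
  static int sketch_account_interrupt(struct sketch_event *e, int throttle)
  {
          e->interrupts++;
          return throttle && e->interrupts >= 100;  /* non-zero: stop the event */
  }

  /* What the PEBS drain error path calls: accounting only, throttle always on. */
  static int sketch_event_account_interrupt(struct sketch_event *e)
  {
          return sketch_account_interrupt(e, 1);
  }

  /* Overflow proper: accounting plus side effects tied to delivered data. */
  static int sketch_event_overflow(struct sketch_event *e, int throttle)
  {
          int ret = sketch_account_interrupt(e, throttle);

          e->pending_wakeup = 1;  /* pending_kill/pending_wakeup stay only here */
          return ret;
  }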

Signed-off-by: Jiri Olsa <jo...@kernel.org>
---
 arch/x86/events/intel/ds.c |  6 +++++-
 include/linux/perf_event.h |  1 +
 kernel/events/core.c       | 47 ++++++++++++++++++++++++++++++----------------
 3 files changed, 37 insertions(+), 17 deletions(-)

diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
index be202390bbd3..9dfeeeca0ea8 100644
--- a/arch/x86/events/intel/ds.c
+++ b/arch/x86/events/intel/ds.c
@@ -1389,9 +1389,13 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
                        continue;
 
                /* log dropped samples number */
-               if (error[bit])
+               if (error[bit]) {
                        perf_log_lost_samples(event, error[bit]);
 
+                       if (perf_event_account_interrupt(event))
+                               x86_pmu_stop(event, 0);
+               }
+
                if (counts[bit]) {
                        __intel_pmu_pebs_event(event, iregs, base,
                                               top, bit, counts[bit]);
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 4741ecdb9817..78ed8105e64d 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -1259,6 +1259,7 @@ extern void perf_event_disable(struct perf_event *event);
 extern void perf_event_disable_local(struct perf_event *event);
 extern void perf_event_disable_inatomic(struct perf_event *event);
 extern void perf_event_task_tick(void);
+extern int perf_event_account_interrupt(struct perf_event *event);
 #else /* !CONFIG_PERF_EVENTS: */
 static inline void *
 perf_aux_output_begin(struct perf_output_handle *handle,
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 02c8421f8c01..7c6264f5deb7 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -7034,25 +7034,11 @@ static void perf_log_itrace_start(struct perf_event *event)
        perf_output_end(&handle);
 }
 
-/*
- * Generic event overflow handling, sampling.
- */
-
-static int __perf_event_overflow(struct perf_event *event,
-                                  int throttle, struct perf_sample_data *data,
-                                  struct pt_regs *regs)
+static int __perf_event_account_interrupt(struct perf_event *event, int throttle)
 {
-       int events = atomic_read(&event->event_limit);
        struct hw_perf_event *hwc = &event->hw;
-       u64 seq;
        int ret = 0;
-
-       /*
-        * Non-sampling counters might still use the PMI to fold short
-        * hardware counters, ignore those.
-        */
-       if (unlikely(!is_sampling_event(event)))
-               return 0;
+       u64 seq;
 
        seq = __this_cpu_read(perf_throttled_seq);
        if (seq != hwc->interrupts_seq) {
@@ -7080,6 +7066,35 @@ static int __perf_event_overflow(struct perf_event *event,
                        perf_adjust_period(event, delta, hwc->last_period, true);
        }
 
+       return ret;
+}
+
+int perf_event_account_interrupt(struct perf_event *event)
+{
+       return __perf_event_account_interrupt(event, 1);
+}
+
+/*
+ * Generic event overflow handling, sampling.
+ */
+
+static int __perf_event_overflow(struct perf_event *event,
+                                  int throttle, struct perf_sample_data *data,
+                                  struct pt_regs *regs)
+{
+       int events = atomic_read(&event->event_limit);
+       struct hw_perf_event *hwc = &event->hw;
+       int ret = 0;
+
+       /*
+        * Non-sampling counters might still use the PMI to fold short
+        * hardware counters, ignore those.
+        */
+       if (unlikely(!is_sampling_event(event)))
+               return 0;
+
+       ret = __perf_event_account_interrupt(event, throttle);
+
        /*
         * XXX event_limit might not quite work as expected on inherited
         * events
-- 
2.7.4
