This patch implements a new feature: the tracing_max_latency file, e.g. /sys/kernel/debug/tracing/tracing_max_latency, will receive notifications through the fsnotify framework whenever a new maximum latency is available.
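
As an illustration (not part of this patch), a user space program could consume these notifications with an ordinary inotify watch, roughly along the lines of the sketch below. The path assumes the default debugfs mount point, and the program only prints a message when the file is modified:

  #include <stdio.h>
  #include <sys/inotify.h>
  #include <unistd.h>

  #define LAT_FILE "/sys/kernel/debug/tracing/tracing_max_latency"

  int main(void)
  {
  	char buf[4096];
  	int fd = inotify_init();

  	if (fd < 0 || inotify_add_watch(fd, LAT_FILE, IN_MODIFY) < 0) {
  		perror("inotify");
  		return 1;
  	}
  	for (;;) {
  		/* Blocks until the kernel reports FS_MODIFY on the file. */
  		if (read(fd, buf, sizeof(buf)) < 0)
  			break;
  		/* Here one would read the new latency and save the trace. */
  		printf("tracing_max_latency was updated\n");
  	}
  	return 0;
  }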
One particularly interesting use of this facility is when enabling threshold tracing, through /sys/kernel/debug/tracing/tracing_thresh, together with the preempt/irqsoff tracers. This makes it possible to implement a user space program that can, with equal probability, obtain traces of latencies that occur immediately after each other, in spite of the fact that the preempt/irqsoff tracers operate in overwrite mode.

This facility works with the hwlat, preempt/irqsoff, and wakeup tracers.

This patch also adds four new tracepoints, for entering/exiting __schedule() and do_idle(). Those tracepoints are needed to disable and enable the notification facility, because we cannot wake up the workqueue from these critical sections without risking a deadlock. Similar problems would also arise if we tried to schedule a tasklet, raise a softirq, or wake up a kernel thread. If a notification event happens in one of the forbidden sections, we schedule the fsnotify work as soon as we have exited do_idle()/__schedule().

Signed-off-by: Viktor Rosendahl <viktor.rosend...@gmail.com>
---
 include/trace/events/power.h |  40 +++++++++
 include/trace/events/sched.h |  40 +++++++++
 kernel/sched/core.c          |   2 +
 kernel/sched/idle.c          |   2 +
 kernel/trace/trace.c         | 160 ++++++++++++++++++++++++++++++++++-
 kernel/trace/trace.h         |  10 +++
 kernel/trace/trace_hwlat.c   |   4 +-
 7 files changed, 255 insertions(+), 3 deletions(-)

diff --git a/include/trace/events/power.h b/include/trace/events/power.h
index f7aece721aed..40ab747274bd 100644
--- a/include/trace/events/power.h
+++ b/include/trace/events/power.h
@@ -40,6 +40,46 @@ DEFINE_EVENT(cpu, cpu_idle,
 	TP_ARGS(state, cpu_id)
 );
 
+/*
+ * Tracepoint for entering do_idle():
+ */
+TRACE_EVENT(do_idle_enter,
+
+	TP_PROTO(int cpu),
+
+	TP_ARGS(cpu),
+
+	TP_STRUCT__entry(
+		__field(int, cpu)
+	),
+
+	TP_fast_assign(
+		__entry->cpu = cpu;
+	),
+
+	TP_printk("cpu=%d", __entry->cpu)
+);
+
+/*
+ * Tracepoint for exiting do_idle():
+ */
+TRACE_EVENT(do_idle_exit,
+
+	TP_PROTO(int cpu),
+
+	TP_ARGS(cpu),
+
+	TP_STRUCT__entry(
+		__field(int, cpu)
+	),
+
+	TP_fast_assign(
+		__entry->cpu = cpu;
+	),
+
+	TP_printk("cpu=%d", __entry->cpu)
+);
+
 TRACE_EVENT(powernv_throttle,
 
 	TP_PROTO(int chip_id, const char *reason, int pmax),
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index 9a4bdfadab07..6a3dae7b3249 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -183,6 +183,46 @@ TRACE_EVENT(sched_switch,
 		__entry->next_comm, __entry->next_pid, __entry->next_prio)
 );
 
+/*
+ * Tracepoint for entering __schedule():
+ */
+TRACE_EVENT(sched_schedule_enter,
+
+	TP_PROTO(int cpu),
+
+	TP_ARGS(cpu),
+
+	TP_STRUCT__entry(
+		__field(int, cpu)
+	),
+
+	TP_fast_assign(
+		__entry->cpu = cpu;
+	),
+
+	TP_printk("cpu=%d", __entry->cpu)
+);
+
+/*
+ * Tracepoint for exiting __schedule():
+ */
+TRACE_EVENT(sched_schedule_exit,
+
+	TP_PROTO(int cpu),
+
+	TP_ARGS(cpu),
+
+	TP_STRUCT__entry(
+		__field(int, cpu)
+	),
+
+	TP_fast_assign(
+		__entry->cpu = cpu;
+	),
+
+	TP_printk("cpu=%d", __entry->cpu)
+);
+
 /*
  * Tracepoint for a task being migrated:
  */
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 102dfcf0a29a..c9d86fcc48f5 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3374,6 +3374,7 @@ static void __sched notrace __schedule(bool preempt)
 	int cpu;
 
 	cpu = smp_processor_id();
+	trace_sched_schedule_enter(cpu);
 	rq = cpu_rq(cpu);
 	prev = rq->curr;
 
@@ -3448,6 +3449,7 @@ static void __sched notrace __schedule(bool preempt)
 	}
 
 	balance_callback(rq);
+	trace_sched_schedule_exit(cpu);
 }
 
 void __noreturn do_task_dead(void)
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index f5516bae0c1b..e328e045c6e8 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -224,6 +224,7 @@ static void cpuidle_idle_call(void)
 static void do_idle(void)
 {
 	int cpu = smp_processor_id();
+	trace_do_idle_enter(cpu);
 	/*
 	 * If the arch has a polling bit, we maintain an invariant:
 	 *
@@ -287,6 +288,7 @@ static void do_idle(void)
 
 	if (unlikely(klp_patch_pending(current)))
 		klp_update_patch_state(current);
+	trace_do_idle_exit(cpu);
 }
 
 bool cpu_in_idle(unsigned long pc)
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index ec439999f387..608e0dbdf851 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -44,6 +44,10 @@
 #include <linux/trace.h>
 #include <linux/sched/clock.h>
 #include <linux/sched/rt.h>
+#include <linux/fsnotify.h>
+#include <linux/workqueue.h>
+#include <trace/events/power.h>
+#include <trace/events/sched.h>
 
 #include "trace.h"
 #include "trace_output.h"
@@ -1481,6 +1485,157 @@ static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt)
 
 unsigned long __read_mostly tracing_thresh;
 
+#if (defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER)) && \
+	defined(CONFIG_FSNOTIFY)
+
+static const struct file_operations tracing_max_lat_fops;
+static struct workqueue_struct *fsnotify_wq;
+static DEFINE_PER_CPU(int, notify_disabled);
+static DEFINE_PER_CPU(struct trace_array *, notify_delayed_tr);
+
+static void trace_fsnotify_workfn(struct work_struct *work)
+{
+	struct trace_array *tr = container_of(work, struct trace_array,
+					      fsnotify_work);
+	fsnotify(tr->d_max_latency->d_inode, FS_MODIFY,
+		 tr->d_max_latency->d_inode, FSNOTIFY_EVENT_INODE, NULL, 0);
+}
+
+static void trace_create_maxlat_file(struct trace_array *tr,
+				     struct dentry *d_tracer)
+{
+	INIT_WORK(&tr->fsnotify_work, trace_fsnotify_workfn);
+	tr->d_max_latency = trace_create_file("tracing_max_latency", 0644,
+					      d_tracer, &tr->max_latency,
+					      &tracing_max_lat_fops);
+}
+
+/*
+ * Disable fsnotify while in scheduler and idle code. Trying to wake
+ * anything up from there is prone to deadlock.
+ */
+static inline void notrace trace_disable_fsnotify(void)
+{
+	int cpu;
+	unsigned long flags;
+
+	raw_local_irq_save(flags);
+	cpu = smp_processor_id();
+	per_cpu(notify_disabled, cpu) = 1;
+	raw_local_irq_restore(flags);
+}
+
+static inline void notrace trace_enable_fsnotify(void)
+{
+	int cpu;
+	struct trace_array *tr;
+	unsigned long flags;
+	bool do_queue = false;
+
+	raw_local_irq_save(flags);
+	cpu = smp_processor_id();
+	per_cpu(notify_disabled, cpu) = 0;
+	tr = per_cpu(notify_delayed_tr, cpu);
+	if (tr != NULL) {
+		per_cpu(notify_delayed_tr, cpu) = NULL;
+		do_queue = true;
+	}
+	raw_local_irq_restore(flags);
+
+	if (do_queue && fsnotify_wq)
+		queue_work(fsnotify_wq, &tr->fsnotify_work);
+}
+
+static void notrace probe_sched_enter(void *nihil, int cpu)
+{
+	trace_disable_fsnotify();
+}
+
+static void notrace probe_sched_exit(void *nihil, int cpu)
+{
+	trace_enable_fsnotify();
+}
+
+static __init void trace_maxlat_fsnotify_init(void)
+{
+	int ret;
+	int cpu;
+
+	for_each_possible_cpu(cpu)
+		per_cpu(notify_disabled, cpu) = 1;
+	smp_wmb();
+
+	ret = register_trace_sched_schedule_enter(probe_sched_enter, NULL);
+	if (ret) {
+		pr_info("tracing_max_latency: Could not activate tracepoint probe to sched_schedule_enter\n");
+		return;
+	}
+
+	ret = register_trace_sched_schedule_exit(probe_sched_exit, NULL);
+	if (ret) {
+		pr_info("tracing_max_latency: Could not activate tracepoint probe to sched_schedule_exit\n");
+		goto fail_deprobe_sched_enter;
+	}
+
+	ret = register_trace_do_idle_enter(probe_sched_enter, NULL);
+	if (ret) {
+		pr_info("tracing_max_latency: Could not activate tracepoint probe to do_idle_enter\n");
+		goto fail_deprobe_sched_exit;
+	}
+
+	ret = register_trace_do_idle_exit(probe_sched_exit, NULL);
+	if (ret) {
+		pr_info("tracing_max_latency: Could not activate tracepoint probe to do_idle_exit\n");
+		goto fail_deprobe_idle_enter;
+	}
+
+	fsnotify_wq = alloc_workqueue("tr_max_lat_wq",
+				      WQ_UNBOUND | WQ_HIGHPRI, 0);
+	if (fsnotify_wq)
+		return;
+
+	pr_err("Unable to allocate tr_max_lat_wq");
+
+	unregister_trace_do_idle_exit(probe_sched_exit, NULL);
+
+fail_deprobe_idle_enter:
+	unregister_trace_do_idle_enter(probe_sched_enter, NULL);
+
+fail_deprobe_sched_exit:
+	unregister_trace_sched_schedule_exit(probe_sched_exit, NULL);
+
+fail_deprobe_sched_enter:
+	unregister_trace_sched_schedule_enter(probe_sched_enter, NULL);
+}
+
+void trace_maxlat_fsnotify(struct trace_array *tr)
+{
+	int cpu = smp_processor_id();
+
+	if (!fsnotify_wq)
+		return;
+
+	if (!per_cpu(notify_disabled, cpu))
+		queue_work(fsnotify_wq, &tr->fsnotify_work);
+	else
+		/* queue it in trace_enable_fsnotify() */
+		per_cpu(notify_delayed_tr, cpu) = tr;
+}
+
+/*
+ * (defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER)) && \
+ * defined(CONFIG_FSNOTIFY)
+ */
+#else
+
+#define trace_create_maxlat_file(tr, d_tracer) \
+	trace_create_file("tracing_max_latency", 0644, d_tracer, \
+			  &tr->max_latency, &tracing_max_lat_fops)
+
+#define trace_maxlat_fsnotify_init() do {} while (0)
+
+#endif
+
 #ifdef CONFIG_TRACER_MAX_TRACE
 /*
  * Copy the new maximum trace into the separate maximum-trace
@@ -1519,6 +1674,7 @@ __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
 
 	/* record this tasks comm */
 	tracing_record_cmdline(tsk);
+	trace_maxlat_fsnotify(tr);
 }
 
 /**
@@ -8242,8 +8398,7 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer)
 	create_trace_options_dir(tr);
 
 #if defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER)
-	trace_create_file("tracing_max_latency", 0644, d_tracer,
-			&tr->max_latency, &tracing_max_lat_fops);
+	trace_create_maxlat_file(tr, d_tracer);
 #endif
 
 	if (ftrace_create_function_files(tr, d_tracer))
@@ -8404,6 +8559,7 @@ static __init int tracer_init_tracefs(void)
 {
 	struct dentry *d_tracer;
 
+	trace_maxlat_fsnotify_init();
 	trace_access_lock_init();
 
 	d_tracer = tracing_init_dentry();
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 639047b259d7..3456d6a47a47 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -17,6 +17,7 @@
 #include <linux/compiler.h>
 #include <linux/trace_seq.h>
 #include <linux/glob.h>
+#include <linux/workqueue.h>
 
 #ifdef CONFIG_FTRACE_SYSCALLS
 #include <asm/unistd.h>		/* For NR_SYSCALLS */
@@ -265,6 +266,10 @@ struct trace_array {
 #endif
 #if defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER)
 	unsigned long		max_latency;
+#ifdef CONFIG_FSNOTIFY
+	struct dentry		*d_max_latency;
+	struct work_struct	fsnotify_work;
+#endif
 #endif
 	struct trace_pid_list	__rcu *filtered_pids;
 	/*
@@ -781,6 +786,11 @@ void update_max_tr_single(struct trace_array *tr,
 			  struct task_struct *tsk, int cpu);
 #endif /* CONFIG_TRACER_MAX_TRACE */
 
+#if (defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER)) && \
+	defined(CONFIG_FSNOTIFY)
+void trace_maxlat_fsnotify(struct trace_array *tr);
+#endif
+
 #ifdef CONFIG_STACKTRACE
 void __trace_stack(struct trace_array *tr, unsigned long flags, int skip,
 		   int pc);
diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c
index 1e6db9cbe4dc..44a47f81d949 100644
--- a/kernel/trace/trace_hwlat.c
+++ b/kernel/trace/trace_hwlat.c
@@ -254,8 +254,10 @@ static int get_sample(void)
 		trace_hwlat_sample(&s);
 
 		/* Keep a running maximum ever recorded hardware latency */
-		if (sample > tr->max_latency)
+		if (sample > tr->max_latency) {
 			tr->max_latency = sample;
+			trace_maxlat_fsnotify(tr);
+		}
 	}
 
 out:
-- 
2.17.1