Allow Ftrace, Perf, and LTTng to hook on the irqsoff tracer and trace irqsoff and preemptoff critical sections.
How to setup the irqsoff tracer for this use-case (needed for use with Ftrace, Perf, or LTTng): if the tracing_thresh value is set, all durations beyond the specified threshold will emit a "core_critical_timing_hit" tracepoint. If unset, then the max duration will be tracked. The current max duration can be reset using the tracing_max_latency file. echo 0 > /sys/kernel/debug/tracing/options/function-trace # don't need function tracing echo 1 > /sys/kernel/debug/tracing/options/quiet # silence ftrace buffers echo preemptirqsoff > /sys/kernel/debug/tracing/current_tracer echo 2 > /sys/kernel/debug/tracing/tracing_thresh # set threshold (e.g. 2 µs) echo 1 > /sys/kernel/debug/tracing/tracing_on The following two commands can also be useful to tweak the irqsoff tracer: echo 0 > /sys/kernel/debug/tracing/tracing_max_latency # reset max latency echo 0 > /sys/kernel/debug/tracing/tracing_thresh # disable threshold *** An example usage of Ftrace hooking on this tracepoint: *** echo 1 > /sys/kernel/debug/tracing/events/core/core_critical_timing_hit/enable cat /sys/kernel/debug/tracing/trace_pipe [...] <idle>-0 17d..3 189981402us : core_critical_timing_hit: ip=0xffffffff810fa2f9 parent_ip=0xffffffff8108c31b start_ip=0xffffffff8108c31b delta_ns=2193 irqs_disabled=1 preempt_disabled=1 in_softirq=0 in_irq=0 in_nmi=0 *** An example usage of Perf hooking on this tracepoint: *** perf record -g -e core:core_critical_timing_hit -c 1 sleep 30 perf report -g + 100.00% 100.00% sleep [kernel.vmlinux] [k] check_critical_timing + 92.75% 0.00% sleep [kernel.vmlinux] [k] preempt_count_sub + 92.75% 0.00% sleep [kernel.vmlinux] [k] trace_preempt_on + 65.22% 0.00% sleep [kernel.vmlinux] [k] page_fault + 65.22% 0.00% sleep [kernel.vmlinux] [k] do_page_fault + 63.77% 0.00% sleep [kernel.vmlinux] [k] __do_page_fault + 63.77% 0.00% sleep [kernel.vmlinux] [k] handle_mm_fault + 56.52% 0.00% sleep [kernel.vmlinux] [k] _raw_spin_unlock + 30.43% 0.00% sleep ld-2.19.so [.] _dl_sysdep_start [...] *** An example usage of LTTng hooking on this tracepoint: *** (as root) lttng-sessiond -d (then, as root or tracing group) lttng create --live lttng enable-event -k core_critical_timing_hit lttng start lttng view [...] [05:13:25.432692322] (+0.026847647) compudjdev core_critical_timing_hit: { cpu_id = 3 }, { ip = 0xFFFFFFFF810FA2F9, parent_ip = 0xFFFFFFFF8108C31B, start_ip = 0xFFFFFFFF8108C31B, delta_ns = 2060, irqs_disabled = 1, preempt_disabled = 1, in_softirq = 0, in_irq = 0, in_nmi = 0 } Tracepoints core_critical_timing_start and core_critical_timing_stop are also available. They trigger at a much higher rate than core_critical_timing_hit. Signed-off-by: Mathieu Desnoyers <mathieu.desnoy...@efficios.com> CC: Thomas Gleixner <t...@linutronix.de> CC: Steven Rostedt <rost...@goodmis.org> CC: Ingo Molnar <mi...@redhat.com> CC: Peter Zijlstra <pet...@infradead.org> --- Open question about perf use-case: we currently get " Warning: [core:core_critical_timing_hit] bad op token {" warning when doing "perf record -g -e core:core_critical_timing_hit -c 1 sleep 30". This should be investigated. Changes since v2: - add start ip to hit event, - rebase on per-tracer "quiet" flag commit. Changes since v1: - Remove unneeded tracepoint jump label bypass. - Enhance TP_printk() formatting. - Use new ftrace-buffer option in changelog. - Add core_critical_timing_start/stop tracepoints. --- include/trace/events/core.h | 98 ++++++++++++++++++++++++++++++++++++++++++++ kernel/trace/trace_irqsoff.c | 18 +++++--- 2 files changed, 111 insertions(+), 5 deletions(-) create mode 100644 include/trace/events/core.h diff --git a/include/trace/events/core.h b/include/trace/events/core.h new file mode 100644 index 0000000..76e0efb --- /dev/null +++ b/include/trace/events/core.h @@ -0,0 +1,98 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM core + +#if !defined(_TRACE_CORE_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_CORE_H + +#include <linux/tracepoint.h> +#include <linux/irqflags.h> +#include <linux/preempt.h> + +/* + * Tracepoint for critical timings max/threshold hit. + */ +TRACE_EVENT(core_critical_timing_hit, + + TP_PROTO(unsigned long ip, unsigned long parent_ip, + unsigned long start_ip, unsigned long flags, + int preempt_cnt, cycles_t delta_ns), + + TP_ARGS(ip, parent_ip, start_ip, flags, preempt_cnt, delta_ns), + + TP_STRUCT__entry( + __field( unsigned long, ip ) + __field( unsigned long, parent_ip ) + __field( unsigned long, start_ip ) + __field( cycles_t, delta_ns ) + __field( unsigned long, flags ) + __field( int, preempt_cnt ) + ), + + TP_fast_assign( + __entry->ip = ip; + __entry->parent_ip = parent_ip; + __entry->start_ip = start_ip; + __entry->delta_ns = delta_ns; + __entry->flags = flags; + __entry->preempt_cnt = preempt_cnt; + ), + + TP_printk("ip=0x%lx parent_ip=0x%lx start_ip=0x%lx delta_ns=%llu irqs_disabled=%u preempt_disabled=%u in_softirq=%u in_irq=%u in_nmi=%u", + __entry->ip, __entry->parent_ip, __entry->start_ip, + (unsigned long long) __entry->delta_ns, + !!raw_irqs_disabled_flags(__entry->flags), + !!(__entry->preempt_cnt & PREEMPT_MASK), + !!(__entry->preempt_cnt & SOFTIRQ_MASK), + !!(__entry->preempt_cnt & HARDIRQ_MASK), + !!(__entry->preempt_cnt & NMI_MASK)) +); + +/* + * Tracepoint for critical timings start/stop. + */ +DECLARE_EVENT_CLASS(core_critical_timing, + + TP_PROTO(unsigned long ip, unsigned long parent_ip, + unsigned long flags, int preempt_cnt), + + TP_ARGS(ip, parent_ip, flags, preempt_cnt), + + TP_STRUCT__entry( + __field( unsigned long, ip ) + __field( unsigned long, parent_ip ) + __field( unsigned long, flags ) + __field( int, preempt_cnt ) + ), + + TP_fast_assign( + __entry->ip = ip; + __entry->parent_ip = parent_ip; + __entry->flags = flags; + __entry->preempt_cnt = preempt_cnt; + ), + + TP_printk("ip=0x%lx parent_ip=0x%lx irqs_disabled=%u preempt_disabled=%u in_softirq=%u in_irq=%u in_nmi=%u", + __entry->ip, __entry->parent_ip, + !!raw_irqs_disabled_flags(__entry->flags), + !!(__entry->preempt_cnt & PREEMPT_MASK), + !!(__entry->preempt_cnt & SOFTIRQ_MASK), + !!(__entry->preempt_cnt & HARDIRQ_MASK), + !!(__entry->preempt_cnt & NMI_MASK)) +); + +DEFINE_EVENT(core_critical_timing, core_critical_timing_start, + TP_PROTO(unsigned long ip, unsigned long parent_ip, + unsigned long flags, int preempt_cnt), + TP_ARGS(ip, parent_ip, flags, preempt_cnt) +); + +DEFINE_EVENT(core_critical_timing, core_critical_timing_stop, + TP_PROTO(unsigned long ip, unsigned long parent_ip, + unsigned long flags, int preempt_cnt), + TP_ARGS(ip, parent_ip, flags, preempt_cnt) +); + +#endif /* _TRACE_CORE_H */ + +/* This part must be outside protection */ +#include <trace/define_trace.h> diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 4ed348a..33295b6 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -14,6 +14,9 @@ #include <linux/module.h> #include <linux/ftrace.h> +#define CREATE_TRACE_POINTS +#include <trace/events/core.h> + #include "trace.h" static struct trace_array *irqsoff_trace __read_mostly; @@ -328,6 +331,8 @@ check_critical_timing(struct trace_array *tr, /* Skip 5 functions to get to the irq/preempt enable function */ __trace_stack(tr, flags, 5, pc); } + trace_core_critical_timing_hit(CALLER_ADDR0, parent_ip, + data->critical_start, flags, pc, delta); if (data->critical_sequence != max_sequence) goto out_unlock; @@ -354,7 +359,7 @@ out: static inline void start_critical_timing(unsigned long ip, unsigned long parent_ip) { - int cpu; + int cpu, pc; struct trace_array *tr = irqsoff_trace; struct trace_array_cpu *data; unsigned long flags; @@ -380,9 +385,10 @@ start_critical_timing(unsigned long ip, unsigned long parent_ip) local_save_flags(flags); + pc = preempt_count(); if (!is_quiet(tr)) - __trace_function(tr, ip, parent_ip, flags, preempt_count()); - + __trace_function(tr, ip, parent_ip, flags, pc); + trace_core_critical_timing_start(ip, parent_ip, flags, pc); per_cpu(tracing_cpu, cpu) = 1; atomic_dec(&data->disabled); @@ -391,7 +397,7 @@ start_critical_timing(unsigned long ip, unsigned long parent_ip) static inline void stop_critical_timing(unsigned long ip, unsigned long parent_ip) { - int cpu; + int cpu, pc; struct trace_array *tr = irqsoff_trace; struct trace_array_cpu *data; unsigned long flags; @@ -415,8 +421,10 @@ stop_critical_timing(unsigned long ip, unsigned long parent_ip) atomic_inc(&data->disabled); local_save_flags(flags); + pc = preempt_count(); if (!is_quiet(tr)) - __trace_function(tr, ip, parent_ip, flags, preempt_count()); + __trace_function(tr, ip, parent_ip, flags, pc); + trace_core_critical_timing_stop(ip, parent_ip, flags, pc); check_critical_timing(tr, data, parent_ip ? : ip, cpu); data->critical_start = 0; atomic_dec(&data->disabled); -- 2.1.4 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/