This tracepoint allows to keep track of all priority changes of a task. It outputs the scheduling policy, the nice value, the rt_priority and the deadline-related attributes (dl_runtime, dl_deadline and dl_period).
It is emitted in the code path of the sched_setscheduler, sched_setattr, sched_setparam, nice and the fork system calls. For fork, it is emitted after the sched_process_fork tracepoint for timeline consistency and because the PID is not yet set when sched_fork() is called. This allows the analysis of real-time scheduling delays based on the configured scheduling priorities and policies, which cannot be performed with the current instrumentation in sched_switch. Also, instead of exposing the internal kernel prio field, this tracepoint only outputs the user-visible priority attributes. The effective priority of running threads can also be temporarily changed in the PI code, but a dedicated tracepoint is already in place to cover this case. Here are a few output examples: After fork of a normal task: sched_prio_update: comm=bash pid=2104, policy=SCHED_NORMAL, nice=0, rt_priority=0, dl_runtime=0, dl_deadline=0, dl_period=0 renice -n 10 of a normal task: sched_prio_update: comm=sleep pid=2130, policy=SCHED_NORMAL, nice=10, rt_priority=0, dl_runtime=0, dl_deadline=0, dl_period=0 SCHED_FIFO 60: sched_prio_update: comm=chrt pid=2105, policy=SCHED_FIFO, nice=0, rt_priority=60, dl_runtime=0, dl_deadline=0, dl_period=0 SCHED_RR 60: sched_prio_update: comm=chrt pid=2109, policy=SCHED_RR, nice=0, rt_priority=60, dl_runtime=0, dl_deadline=0, dl_period=0 SCHED_DEADLINE: sched_prio_update: comm=b pid=2110, policy=SCHED_DEADLING, nice=0, rt_priority=0, dl_runtime=10000000, dl_deadline=30000000, dl_period=30000000 Reviewed-by: Mathieu Desnoyers <mathieu.desnoy...@efficios.com> Signed-off-by: Julien Desfossez <jdesfos...@efficios.com> --- include/trace/events/sched.h | 68 ++++++++++++++++++++++++++++++++++++++++++++ kernel/fork.c | 1 + kernel/sched/core.c | 3 ++ 3 files changed, 72 insertions(+) diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index 9b90c57..fcb0f29 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -8,6 +8,34 @@ #include <linux/tracepoint.h> #include <linux/binfmts.h> +#define SCHEDULING_POLICY \ + EM( SCHED_NORMAL, "SCHED_NORMAL") \ + EM( SCHED_FIFO, "SCHED_FIFO") \ + EM( SCHED_RR, "SCHED_RR") \ + EM( SCHED_BATCH, "SCHED_BATCH") \ + EM( SCHED_IDLE, "SCHED_IDLE") \ + EMe(SCHED_DEADLINE, "SCHED_DEADLINE") + +/* + * First define the enums in the above macros to be exported to userspace + * via TRACE_DEFINE_ENUM(). + */ +#undef EM +#undef EMe +#define EM(a, b) TRACE_DEFINE_ENUM(a); +#define EMe(a, b) TRACE_DEFINE_ENUM(a); + +SCHEDULING_POLICY + +/* + * Now redefine the EM() and EMe() macros to map the enums to the strings + * that will be printed in the output. + */ +#undef EM +#undef EMe +#define EM(a, b) {a, b}, +#define EMe(a, b) {a, b} + /* * Tracepoint for calling kthread_stop, performed to end a kthread: */ @@ -562,6 +590,46 @@ TRACE_EVENT(sched_wake_idle_without_ipi, TP_printk("cpu=%d", __entry->cpu) ); + +/* + * Tracepoint for showing scheduling priority changes. + */ +TRACE_EVENT(sched_prio_update, + + TP_PROTO(struct task_struct *tsk), + + TP_ARGS(tsk), + + TP_STRUCT__entry( + __array( char, comm, TASK_COMM_LEN ) + __field( pid_t, pid ) + __field( unsigned int, policy ) + __field( int, nice ) + __field( unsigned int, rt_priority ) + __field( u64, dl_runtime ) + __field( u64, dl_deadline ) + __field( u64, dl_period ) + ), + + TP_fast_assign( + memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN); + __entry->pid = tsk->pid; + __entry->policy = tsk->policy; + __entry->nice = task_nice(tsk); + __entry->rt_priority = tsk->rt_priority; + __entry->dl_runtime = tsk->dl.dl_runtime; + __entry->dl_deadline = tsk->dl.dl_deadline; + __entry->dl_period = tsk->dl.dl_period; + ), + + TP_printk("comm=%s pid=%d, policy=%s, nice=%d, rt_priority=%u, " + "dl_runtime=%Lu, dl_deadline=%Lu, dl_period=%Lu", + __entry->comm, __entry->pid, + __print_symbolic(__entry->policy, SCHEDULING_POLICY), + __entry->nice, __entry->rt_priority, + __entry->dl_runtime, __entry->dl_deadline, + __entry->dl_period) +); #endif /* _TRACE_SCHED_H */ /* This part must be outside protection */ diff --git a/kernel/fork.c b/kernel/fork.c index 7926993..ac4294a 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1773,6 +1773,7 @@ long _do_fork(unsigned long clone_flags, struct pid *pid; trace_sched_process_fork(current, p); + trace_sched_prio_update(p); pid = get_task_pid(p, PIDTYPE_PID); nr = pid_vnr(pid); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index ce83e39..c729425 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3708,6 +3708,7 @@ void set_user_nice(struct task_struct *p, long nice) resched_curr(rq); } out_unlock: + trace_sched_prio_update(p); task_rq_unlock(rq, p, &rf); } EXPORT_SYMBOL(set_user_nice); @@ -3912,6 +3913,8 @@ static void __setscheduler(struct rq *rq, struct task_struct *p, p->sched_class = &rt_sched_class; else p->sched_class = &fair_sched_class; + + trace_sched_prio_update(p); } static void -- 1.9.1