* Peter Zijlstra <[EMAIL PROTECTED]> wrote:

> > I can reproduce priority leakage on my SMP system; any userspace
> > process chrt'ed up and a lot will follow. This makes the system very
> > unresponsive when doing a make -j5. Verified on 51-{6,18,23}.
> >
> > The following patch seems to fix it for me, YMMV.
>
> --- kernel/sched.c~	2005-07-08 10:27:59.000000000 +0200
> +++ kernel/sched.c	2005-07-10 13:00:42.000000000 +0200
> @@ -780,7 +780,8 @@ static void recalc_task_prio(task_t *p,
>  		}
>  	}
>
> -	p->prio = p->normal_prio = effective_prio(p);
> +	p->prio = effective_prio(p);
> +	p->normal_prio = unlikely(rt_prio(p->normal_prio)) ? p->prio : __effective_prio(p);
>  }
ahh, indeed, this code did not take boosting into account. Good catch!
I'm wondering why this only showed up on SMP. I've fixed it a bit
differently in my tree, by making the roles of the various priority
fields and functions more obvious; see the delta patch below. I've also
released the -51-23 patch with these changes included. Does this fix
priority leakage on your SMP system?

	Ingo
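To make the leak concrete before the patch itself: here is a minimal
userspace mock of the old code path (illustration only, not kernel
code; MAX_RT_PRIO, rt_task() and the task fields are simplified
stand-ins, and the interactivity bonus is omitted). Because the old
effective_prio() returned p->prio unchanged for RT tasks, a PI-boosted
task wrote its boosted priority into ->normal_prio, making the boost
permanent:

#include <stdio.h>

#define MAX_RT_PRIO 100

struct task { int prio, normal_prio, static_prio; };

/* mirrors the old effective_prio(): RT tasks get p->prio back as-is */
static int effective_prio(struct task *p)
{
	if (p->prio < MAX_RT_PRIO)	/* rt_task(p) */
		return p->prio;
	return p->static_prio;		/* interactivity bonus omitted */
}

int main(void)
{
	struct task p = { .prio = 120, .normal_prio = 120, .static_prio = 120 };

	p.prio = 50;	/* PI-boost to RT priority 50 */

	/* the old recalc path: the boosted prio leaks into ->normal_prio */
	p.prio = p.normal_prio = effective_prio(&p);

	printf("prio=%d normal_prio=%d\n", p.prio, p.normal_prio);
	/* prints "prio=50 normal_prio=50": the unboosted 120 is lost */
	return 0;
}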
Index: linux/include/linux/sched.h
===================================================================
--- linux.orig/include/linux/sched.h
+++ linux/include/linux/sched.h
@@ -1035,7 +1035,7 @@ extern int idle_cpu(int cpu);
 extern int sched_setscheduler(struct task_struct *, int, struct sched_param *);
 extern task_t *idle_task(int cpu);
 extern void mutex_setprio(task_t *p, int prio);
-extern int mutex_getprio(task_t *p);
+extern int normal_prio(task_t *p);
 
 void yield(void);
 void __yield(void);
Index: linux/kernel/rt.c
===================================================================
--- linux.orig/kernel/rt.c
+++ linux/kernel/rt.c
@@ -720,7 +720,7 @@ static void pi_setprio(struct rt_mutex *
 #ifdef CONFIG_RT_DEADLOCK_DETECT
 	pi_prio++;
 
-	if (p->policy != SCHED_NORMAL && prio > mutex_getprio(p)) {
+	if (p->policy != SCHED_NORMAL && prio > normal_prio(p)) {
 		TRACE_OFF();
 
 		printk("huh? (%d->%d??)\n", p->prio, prio);
Index: linux/kernel/sched.c
===================================================================
--- linux.orig/kernel/sched.c
+++ linux/kernel/sched.c
@@ -645,7 +645,7 @@ static inline void enqueue_task_head(str
 }
 
 /*
- * effective_prio - return the priority that is based on the static
+ * __normal_prio - return the priority that is based on the static
  * priority but is modified by bonuses/penalties.
  *
  * We scale the actual sleep average [0 .... MAX_SLEEP_AVG]
@@ -659,7 +659,7 @@ static inline void enqueue_task_head(str
  * Both properties are important to certain workloads.
  */
 
-static inline int __effective_prio(task_t *p)
+static inline int __normal_prio(task_t *p)
 {
 	int bonus, prio;
 
@@ -670,57 +670,53 @@ static inline int __effective_prio(task_
 		prio = MAX_RT_PRIO;
 	if (prio > MAX_PRIO-1)
 		prio = MAX_PRIO-1;
-	return prio;
-}
 
-static int effective_prio(task_t *p)
-{
-	if (rt_task(p))
-		return p->prio;
-	return __effective_prio(p);
-}
-
-static inline void trace_start_sched_wakeup(task_t *p, runqueue_t *rq)
-{
-	if (TASK_PREEMPTS_CURR(p, rq) && (p != rq->curr))
-		__trace_start_sched_wakeup(p);
+	return prio;
 }
 
 /*
- * __activate_task - move a task to the runqueue.
+ * Calculate the expected normal priority: i.e. priority
+ * without taking RT-inheritance into account. Might be
+ * boosted by interactivity modifiers. Changes upon fork,
+ * setprio syscalls, and whenever the interactivity
+ * estimator recalculates.
  */
-static inline void __activate_task(task_t *p, runqueue_t *rq)
+inline int normal_prio(task_t *p)
 {
-	trace_special_pid(p->pid, p->prio, rq->nr_running);
-	enqueue_task(p, rq->active);
-	rq->nr_running++;
+	int prio;
+
+	if (p->policy != SCHED_NORMAL)
+		prio = MAX_RT_PRIO-1 - p->rt_priority;
+	else
+		prio = __normal_prio(p);
+
+	trace_special_pid(p->pid, p->prio, prio);
+	return prio;
 }
 
 /*
- * __activate_task_after - move a task to the runqueue,
- * to execute after a specific task.
+ * Calculate the current priority, i.e. the priority
+ * taken into account by the scheduler. This value might
+ * be boosted by RT-mutex inheritance, or might be boosted
+ * by interactivity modifiers. Will be RT if the task got
+ * RT-boosted. If not then it returns p->normal_prio.
  */
-static inline
-void __activate_task_after(task_t *p, task_t *parent, runqueue_t *rq)
+static void __recalc_task_prio(task_t *p)
 {
-	// FIXME: to head rather?
-	list_add_tail(&p->run_list, &parent->run_list);
-	p->array = parent->array;
-	p->array->nr_active++;
-	rq->nr_running++;
-	inc_rt_tasks(p, rq);
+	p->normal_prio = normal_prio(p);
+	/*
+	 * If we are an RT task or were boosted to RT priority,
+	 * keep the priority unchanged. Otherwise, update the
+	 * priority to the normal priority:
+	 */
+	if (!rt_prio(p->prio))
+		p->prio = p->normal_prio;
 }
 
 /*
- * __activate_idle_task - move idle task to the _front_ of runqueue.
+ * Recalculate p->normal_prio and p->prio after having slept,
+ * updating the sleep-average too:
  */
-static inline void __activate_idle_task(task_t *p, runqueue_t *rq)
-{
-	enqueue_task_head(p, rq->active);
-	rq->nr_running++;
-	WARN_ON(rt_task(p));
-}
-
 static void recalc_task_prio(task_t *p, unsigned long long now)
 {
 	/* Caller must always ensure 'now >= p->timestamp' */
@@ -780,7 +776,48 @@ static void recalc_task_prio(task_t *p,
 		}
 	}
 
-	p->prio = p->normal_prio = effective_prio(p);
+	__recalc_task_prio(p);
+}
+
+static inline void trace_start_sched_wakeup(task_t *p, runqueue_t *rq)
+{
+	if (TASK_PREEMPTS_CURR(p, rq) && (p != rq->curr))
+		__trace_start_sched_wakeup(p);
+}
+
+/*
+ * __activate_task - move a task to the runqueue.
+ */
+static inline void __activate_task(task_t *p, runqueue_t *rq)
+{
+	trace_special_pid(p->pid, p->prio, rq->nr_running);
+	enqueue_task(p, rq->active);
+	rq->nr_running++;
+}
+
+/*
+ * __activate_task_after - move a task to the runqueue,
+ * to execute after a specific task.
+ */
+static inline
+void __activate_task_after(task_t *p, task_t *parent, runqueue_t *rq)
+{
+	// FIXME: to head rather?
+	list_add_tail(&p->run_list, &parent->run_list);
+	p->array = parent->array;
+	p->array->nr_active++;
+	rq->nr_running++;
+	inc_rt_tasks(p, rq);
+}
+
+/*
+ * __activate_idle_task - move idle task to the _front_ of runqueue.
+ */
+static inline void __activate_idle_task(task_t *p, runqueue_t *rq)
+{
+	enqueue_task_head(p, rq->active);
+	rq->nr_running++;
+	WARN_ON(rt_task(p));
 }
 
 /*
@@ -1415,7 +1452,7 @@ void fastcall wake_up_new_task(task_t *
 	p->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(p) *
 			CHILD_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS);
 
-	p->prio = p->normal_prio = effective_prio(p);
+	__recalc_task_prio(p);
 
 	if (likely(cpu == this_cpu)) {
 		if (!(clone_flags & CLONE_VM)) {
@@ -1427,7 +1464,8 @@ void fastcall wake_up_new_task(task_t *
 			if (unlikely(!current->array))
 				__activate_task(p, rq);
 			else {
-				p->prio = p->normal_prio = current->prio;
+				p->prio = current->prio;
+				p->normal_prio = current->normal_prio;
 				__activate_task_after(p, current, rq);
 			}
 			set_need_resched();
@@ -2730,6 +2768,10 @@ void scheduler_tick(void)
 		/*
 		 * RR tasks need a special form of timeslice management.
 		 * FIFO tasks have no timeslices.
+		 *
+		 * On PREEMPT_RT, boosted tasks will also get into this
+		 * branch and won't get their timeslice decreased until
+		 * they have done their work.
 		 */
 		if ((p->policy == SCHED_RR) && !--p->time_slice) {
 			p->time_slice = task_timeslice(p);
@@ -2744,7 +2786,7 @@ void scheduler_tick(void)
 	if (!--p->time_slice) {
 		dequeue_task(p, rq->active);
 		set_tsk_need_resched(p);
-		p->prio = p->normal_prio = effective_prio(p);
+		__recalc_task_prio(p);
 		p->time_slice = task_timeslice(p);
 		p->first_time_slice = 0;
 
@@ -3682,7 +3724,7 @@ void set_user_nice(task_t *p, long nice)
 	new_prio = NICE_TO_PRIO(nice);
 	delta = new_prio - old_prio;
 	p->static_prio = NICE_TO_PRIO(nice);
-	p->prio += delta;
+	__recalc_task_prio(p);
 
 	if (array) {
 		enqueue_task(p, array);
@@ -3712,18 +3754,6 @@ int can_nice(const task_t *p, const int
 		capable(CAP_SYS_NICE));
 }
 
-int mutex_getprio(task_t *p)
-{
-	int prio;
-
-	if (p->policy != SCHED_NORMAL)
-		prio = MAX_RT_PRIO-1 - p->rt_priority;
-	else
-		prio = __effective_prio(p);
-	trace_special_pid(p->pid, p->prio, prio);
-	return prio;
-}
-
 /*
  * Used by the PREEMPT_RT code to implement
  * priority inheritance logic:
@@ -3880,10 +3910,7 @@ static void __setscheduler(struct task_s
 	BUG_ON(p->array);
 	p->policy = policy;
 	p->rt_priority = prio;
-	if (policy != SCHED_NORMAL)
-		p->prio = p->normal_prio = MAX_RT_PRIO-1 - p->rt_priority;
-	else
-		p->prio = p->normal_prio = p->static_prio;
+	__recalc_task_prio(p);
 }
 
 /**
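For completeness, the same userspace mock as above but with the new
split (again a sketch under the same simplifications: the RT-policy
branch and the interactivity bonus in normal_prio() are elided).
p->normal_prio now always tracks the unboosted priority, and
__recalc_task_prio() leaves an RT-boosted p->prio alone, so the boost
no longer leaks and can be undone later:

#include <stdio.h>

#define MAX_RT_PRIO 100

struct task { int prio, normal_prio, static_prio; };

static int normal_prio(struct task *p)
{
	/* SCHED_NORMAL case only; RT policy and bonus elided */
	return p->static_prio;
}

static void __recalc_task_prio(struct task *p)
{
	p->normal_prio = normal_prio(p);
	/* keep an RT-boosted p->prio; otherwise track normal_prio */
	if (p->prio >= MAX_RT_PRIO)	/* !rt_prio(p->prio) */
		p->prio = p->normal_prio;
}

int main(void)
{
	struct task p = { .prio = 120, .normal_prio = 120, .static_prio = 120 };

	p.prio = 50;			/* PI-boost to RT priority 50 */
	__recalc_task_prio(&p);
	printf("boosted:   prio=%d normal_prio=%d\n", p.prio, p.normal_prio);
	/* prio=50 normal_prio=120: the boost no longer leaks */

	p.prio = p.normal_prio;		/* unboost, as mutex_setprio() would */
	__recalc_task_prio(&p);
	printf("unboosted: prio=%d normal_prio=%d\n", p.prio, p.normal_prio);
	/* prio=120 normal_prio=120 */
	return 0;
}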