You found some scheduling changes made post-2.6.28 which seemed to deal with this bug. However, they cannot be the full story for our purposes, because they modify code that was only added after 2.6.26 and so will not apply to the "lenny" kernel on their own.
The attached patch combines a bug fix made between 2.6.26 and 2.6.28 with most of the changes you identified. Based on my reading of the commit messages and discussion, I think this makes a coherent set of changes. However, there have been many other changes to the scheduler in that time, and as I do not have any great knowledge of it I cannot say for sure whether any of those are also required.

Please try rebuilding the "lenny" kernel (linux-source-2.6.26) with this patch applied and report whether it fixes the bug for you.

Ben.

--
Ben Hutchings
Logic doesn't apply to the real world. - Marvin Minsky
Combination of these scheduler fixes:

commit 1af5f730fc1bf7c62ec9fb2d307206e18bf40a69
Author: Peter Zijlstra <a.p.zijls...@chello.nl>
Date:   Fri Oct 24 11:06:13 2008 +0200

    sched: more accurate min_vruntime accounting

[part of:]
commit 6bc912b71b6f33b041cfde93ca3f019cbaa852bc
Author: Peter Zijlstra <a.p.zijls...@chello.nl>
Date:   Thu Jan 15 14:53:38 2009 +0100

    sched: SCHED_OTHER vs SCHED_IDLE isolation

commit cce7ade803699463ecc62a065ca522004f7ccb3d
Author: Peter Zijlstra <a.p.zijls...@chello.nl>
Date:   Thu Jan 15 14:53:37 2009 +0100

    sched: SCHED_IDLE weight change

commit e17036dac189dd034c092a91df56aa740db7146d
Author: Peter Zijlstra <a.p.zijls...@chello.nl>
Date:   Thu Jan 15 14:53:39 2009 +0100

    sched: fix update_min_vruntime

--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1390,8 +1390,8 @@
  * slice expiry etc.
  */

-#define WEIGHT_IDLEPRIO		2
-#define WMULT_IDLEPRIO		(1 << 31)
+#define WEIGHT_IDLEPRIO		3
+#define WMULT_IDLEPRIO		1431655765

 /*
  * Nice levels are multiplicative, with a gentle 10% change for every
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -221,6 +221,27 @@
 	return se->vruntime - cfs_rq->min_vruntime;
 }

+static void update_min_vruntime(struct cfs_rq *cfs_rq)
+{
+	u64 vruntime = cfs_rq->min_vruntime;
+
+	if (cfs_rq->curr)
+		vruntime = cfs_rq->curr->vruntime;
+
+	if (cfs_rq->rb_leftmost) {
+		struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost,
+						   struct sched_entity,
+						   run_node);
+
+		if (!cfs_rq->curr)
+			vruntime = se->vruntime;
+		else
+			vruntime = min_vruntime(vruntime, se->vruntime);
+	}
+
+	cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
+}
+
 /*
  * Enqueue an entity into the rb-tree:
  */
@@ -254,15 +275,8 @@
 	 * Maintain a cache of leftmost tree entries (it is frequently
 	 * used):
 	 */
-	if (leftmost) {
+	if (leftmost)
 		cfs_rq->rb_leftmost = &se->run_node;
-		/*
-		 * maintain cfs_rq->min_vruntime to be a monotonic increasing
-		 * value tracking the leftmost vruntime in the tree.
-		 */
-		cfs_rq->min_vruntime =
-			max_vruntime(cfs_rq->min_vruntime, se->vruntime);
-	}

 	rb_link_node(&se->run_node, parent, link);
 	rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
@@ -272,18 +286,9 @@
 {
 	if (cfs_rq->rb_leftmost == &se->run_node) {
 		struct rb_node *next_node;
-		struct sched_entity *next;

 		next_node = rb_next(&se->run_node);
 		cfs_rq->rb_leftmost = next_node;
-
-		if (next_node) {
-			next = rb_entry(next_node,
-					struct sched_entity, run_node);
-			cfs_rq->min_vruntime =
-				max_vruntime(cfs_rq->min_vruntime,
-					     next->vruntime);
-		}
 	}

 	if (cfs_rq->next == se)
@@ -425,6 +430,7 @@
 			      &curr->load);
 	}
 	curr->vruntime += delta_exec_weighted;
+	update_min_vruntime(cfs_rq);
 }

 static void update_curr(struct cfs_rq *cfs_rq)
@@ -590,13 +596,7 @@
 static void
 place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
 {
-	u64 vruntime;
-
-	if (first_fair(cfs_rq)) {
-		vruntime = min_vruntime(cfs_rq->min_vruntime,
-				__pick_next_entity(cfs_rq)->vruntime);
-	} else
-		vruntime = cfs_rq->min_vruntime;
+	u64 vruntime = cfs_rq->min_vruntime;

 	/*
 	 * The 'current' period is already promised to the current tasks,
@@ -680,6 +680,7 @@
 	if (se != cfs_rq->curr)
 		__dequeue_entity(cfs_rq, se);
 	account_entity_dequeue(cfs_rq, se);
+	update_min_vruntime(cfs_rq);
 }

 /*
@@ -1184,12 +1185,18 @@
 		cfs_rq_of(pse)->next = pse;

 	/*
-	 * Batch tasks do not preempt (their preemption is driven by
+	 * Batch and idle tasks do not preempt (their preemption is driven by
 	 * the tick):
 	 */
-	if (unlikely(p->policy == SCHED_BATCH))
+	if (unlikely(p->policy != SCHED_NORMAL))
 		return;

+	/* Idle tasks are by definition preempted by everybody. */
+	if (unlikely(curr->policy == SCHED_IDLE)) {
+		resched_task(curr);
+		return;
+	}
+
 	if (!sched_feat(WAKEUP_PREEMPT))
 		return;
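In case it helps with testing: the following is only a rough, untested sketch (not part of the patch) that puts the calling process under SCHED_IDLE and spins. Running it alongside an ordinary CPU-bound task is one way to exercise the SCHED_OTHER vs SCHED_IDLE behaviour the patch changes; the fallback #define assumes SCHED_IDLE's kernel value of 5 in case your libc headers do not provide it.

/*
 * Rough test sketch, NOT part of the patch: switch the calling process to
 * SCHED_IDLE and burn CPU, so the SCHED_OTHER vs SCHED_IDLE paths changed
 * above can be exercised alongside a normal CPU-bound task.
 */
#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

#ifndef SCHED_IDLE
#define SCHED_IDLE 5	/* kernel value; older libc headers may lack it */
#endif

int main(void)
{
	struct sched_param sp = { .sched_priority = 0 };

	/* SCHED_IDLE requires a static priority of 0 */
	if (sched_setscheduler(0, SCHED_IDLE, &sp) == -1) {
		perror("sched_setscheduler(SCHED_IDLE)");
		return 1;
	}

	/* spin indefinitely at idle priority */
	for (;;)
		;
}

With the patched kernel, a SCHED_OTHER task started at the same time should still preempt this promptly rather than being held up by it.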