On Friday 23 March 2007 23:28, Andy Whitcroft wrote: > Andy Whitcroft wrote: > > Con Kolivas wrote: > >> On Friday 23 March 2007 05:17, Andy Whitcroft wrote: > >>> Ok, I have yet a third x86_64 machine that is blowing up with the latest > >>> 2.6.21-rc4-mm1+hotfixes+rsdl-0.32 but working with > >>> 2.6.21-rc4-mm1+hotfixes-RSDL. I have results on various hotfix levels > >>> so I have just fired off a set of tests across the affected machines on > >>> that latest hotfix stack plus the RSDL backout and the results should > >>> be in in the next hour or two. > >>> > >>> I think there is a strong correlation between RSDL and these hangs. > >>> Any suggestions as to the next step? > >> > >> Found a nasty in requeue_task > >> + if (list_empty(old_array->queue + old_prio)) > >> + __clear_bit(old_prio, p->array->prio_bitmap); > >> > >> see anything wrong there? I do :P > >> > >> I'll queue that up with the other changes pending and hopefully that > >> will fix your bug. > > > > Tests queued with your rsdl-0.33 patch (I am assuming it's in there). > > Will let you know how it looks. > > Hmmm, this is good for the original machine (as was 0.32) but not for > either of the other two. I am seeing panics as below on those two.
This machine seems most sensitive to it (first column): elm3b6 amd64 newisys 4cpu config: amd64 Can you throw this debugging patch at it please? The console output might be very helpful. On top of sched-rsdl-0.33 thanks! --- kernel/sched.c | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) Index: linux-2.6.21-rc4-mm1/kernel/sched.c =================================================================== --- linux-2.6.21-rc4-mm1.orig/kernel/sched.c 2007-03-24 08:32:19.000000000 +1100 +++ linux-2.6.21-rc4-mm1/kernel/sched.c 2007-03-24 08:42:04.000000000 +1100 @@ -659,6 +659,25 @@ static inline void set_task_entitlement( p->time_slice = p->quota; } +static int debug_rqbitmap(struct rq *rq) +{ + struct list_head *queue; + int idx = 0, error = 0; + struct prio_array *array = rq->active; + + for (idx = 0; idx < MAX_PRIO; idx++) { + queue = array->queue + idx; + if (!list_empty(queue)) { + if (!test_bit(idx, rq->dyn_bitmap)) { + __set_bit(idx, rq->dyn_bitmap); + error = 1; + printk(KERN_ERR "MISSING DYNAMIC BIT %d\n", idx); + } + } + } + return error; +} + /* * There is no specific hard accounting. The dynamic bits can have * false positives. rt_tasks can only be on the active queue. 
@@ -679,6 +698,7 @@ static void dequeue_task(struct task_str list_del_init(&p->run_list); if (list_empty(p->array->queue + p->prio)) __clear_bit(p->prio, p->array->prio_bitmap); + WARN_ON(debug_rqbitmap(rq)); } /* @@ -797,12 +817,14 @@ static void enqueue_task(struct task_str { __enqueue_task(p, rq); list_add_tail(&p->run_list, p->array->queue + p->prio); + WARN_ON(debug_rqbitmap(rq)); } static inline void enqueue_task_head(struct task_struct *p, struct rq *rq) { __enqueue_task(p, rq); list_add(&p->run_list, p->array->queue + p->prio); + WARN_ON(debug_rqbitmap(rq)); } /* @@ -820,6 +842,7 @@ static void requeue_task(struct task_str __clear_bit(old_prio, old_array->prio_bitmap); set_dynamic_bit(p, rq); } + WARN_ON(debug_rqbitmap(rq)); } /* @@ -906,6 +929,7 @@ static inline void __activate_task(struc { enqueue_task(p, rq); inc_nr_running(p, rq); + WARN_ON(debug_rqbitmap(rq)); } /* @@ -1006,6 +1030,7 @@ static void deactivate_task(struct task_ { dec_nr_running(p, rq); dequeue_task(p, rq); + WARN_ON(debug_rqbitmap(rq)); } /* @@ -1718,9 +1743,11 @@ void fastcall wake_up_new_task(struct ta * Parent and child are on different CPUs, now get the * parent runqueue to update the parent's ->flags: */ + WARN_ON(debug_rqbitmap(rq)); task_rq_unlock(rq, &flags); this_rq = task_rq_lock(current, &flags); } + WARN_ON(debug_rqbitmap(this_rq)); task_rq_unlock(this_rq, &flags); } @@ -3357,6 +3384,7 @@ static inline void major_prio_rotation(s rq->dyn_bitmap = rq->active->prio_bitmap; rq->best_static_prio = MAX_PRIO - 1; rq->prio_rotation++; + WARN_ON(debug_rqbitmap(rq)); } /* @@ -3399,6 +3427,8 @@ static inline void rotate_runqueue_prior } memset(rq->prio_quota, 0, ARRAY_SIZE(rq->prio_quota)); major_prio_rotation(rq); + WARN_ON(debug_rqbitmap(rq)); + } else { /* Minor rotation */ new_prio_level = rq->prio_level + 1; @@ -3409,6 +3439,7 @@ static inline void rotate_runqueue_prior __set_bit(new_prio_level, rq->dyn_bitmap); } rq_quota(rq, rq->prio_level) = 0; + WARN_ON(debug_rqbitmap(rq)); } 
rq->prio_level = new_prio_level; /* @@ -3431,6 +3462,10 @@ static void task_running_tick(struct rq return; spin_lock(&rq->lock); + if (!p->time_slice) { + printk(KERN_ERR "NO TIME_SLICE IN TRT \n"); + p->time_slice++; + } /* * Accounting is performed by both the task and the runqueue. This * allows frequently sleeping tasks to get their proper quota of @@ -3460,6 +3495,7 @@ static void task_running_tick(struct rq set_tsk_need_resched(p); } out_unlock: + WARN_ON(debug_rqbitmap(rq)); spin_unlock(&rq->lock); } @@ -3479,6 +3515,7 @@ void scheduler_tick(void) if (!idle_at_tick) task_running_tick(rq, p, 1); + WARN_ON(debug_rqbitmap(rq)); #ifdef CONFIG_SMP update_load(rq); rq->idle_at_tick = idle_at_tick; @@ -3548,6 +3585,7 @@ static inline struct task_struct *next_d struct prio_array *array = rq->active; int expirations = 0; + WARN_ON(debug_rqbitmap(rq)); retry: if (idx >= MAX_PRIO) { BUG_ON(++expirations > 1); @@ -3601,6 +3639,7 @@ retry: if (next->static_prio < rq->best_static_prio && next->policy != SCHED_BATCH) rq->best_static_prio = next->static_prio; + WARN_ON(debug_rqbitmap(rq)); return next; } -- -ck - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/