On Friday 23 March 2007 23:28, Andy Whitcroft wrote:
> Andy Whitcroft wrote:
> > Con Kolivas wrote:
> >> On Friday 23 March 2007 05:17, Andy Whitcroft wrote:
> >>> Ok, I have yet a third x86_64 machine that is blowing up with the latest
> >>> 2.6.21-rc4-mm1+hotfixes+rsdl-0.32 but working with
> >>> 2.6.21-rc4-mm1+hotfixes-RSDL.  I have results on various hotfix levels
> >>> so I have just fired off a set of tests across the affected machines on
> >>> that latest hotfix stack plus the RSDL backout and the results should
> >>> be in in the next hour or two.
> >>>
> >>> I think there is a strong correlation between RSDL and these hangs. 
> >>> Any suggestions as to the next step?
> >>
> >> Found a nasty in requeue_task
> >> +  if (list_empty(old_array->queue + old_prio))
> >> +          __clear_bit(old_prio, p->array->prio_bitmap);
> >>
> >> see anything wrong there? I do :P
> >>
> >> I'll queue that up with the other changes pending and hopefully that
> >> will fix your bug.
> >
> > Tests queued with your rsdl-0.33 patch (I am assuming it's in there).
> > Will let you know how it looks.
>
> Hmmm, this is good for the original machine (as was 0.32) but not for
> either of the other two.  I am seeing panics as below on those two.

This machine seems most sensitive to it (first column):
elm3b6
amd64
newisys
4cpu
config: amd64

Can you throw this debugging patch at it, please? The console output might be 
very helpful. It applies on top of sched-rsdl-0.33. Thanks!

---
 kernel/sched.c |   39 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 39 insertions(+)

Index: linux-2.6.21-rc4-mm1/kernel/sched.c
===================================================================
--- linux-2.6.21-rc4-mm1.orig/kernel/sched.c    2007-03-24 08:32:19.000000000 +1100
+++ linux-2.6.21-rc4-mm1/kernel/sched.c 2007-03-24 08:42:04.000000000 +1100
@@ -659,6 +659,25 @@ static inline void set_task_entitlement(
        p->time_slice = p->quota;
 }
 
+static int debug_rqbitmap(struct rq *rq)
+{
+       struct list_head *queue;
+       int idx = 0, error = 0;
+       struct prio_array *array = rq->active;
+
+       for (idx = 0; idx < MAX_PRIO; idx++) {
+               queue = array->queue + idx;
+               if (!list_empty(queue)) {
+                       if (!test_bit(idx, rq->dyn_bitmap)) {
+                               __set_bit(idx, rq->dyn_bitmap);
+                               error = 1;
+                               printk(KERN_ERR "MISSING DYNAMIC BIT %d\n", idx);
+                       }
+               }
+       }
+       return error;
+}
+
 /*
  * There is no specific hard accounting. The dynamic bits can have
  * false positives. rt_tasks can only be on the active queue.
@@ -679,6 +698,7 @@ static void dequeue_task(struct task_str
        list_del_init(&p->run_list);
        if (list_empty(p->array->queue + p->prio))
                __clear_bit(p->prio, p->array->prio_bitmap);
+       WARN_ON(debug_rqbitmap(rq));
 }
 
 /*
@@ -797,12 +817,14 @@ static void enqueue_task(struct task_str
 {
        __enqueue_task(p, rq);
        list_add_tail(&p->run_list, p->array->queue + p->prio);
+       WARN_ON(debug_rqbitmap(rq));
 }
 
 static inline void enqueue_task_head(struct task_struct *p, struct rq *rq)
 {
        __enqueue_task(p, rq);
        list_add(&p->run_list, p->array->queue + p->prio);
+       WARN_ON(debug_rqbitmap(rq));
 }
 
 /*
@@ -820,6 +842,7 @@ static void requeue_task(struct task_str
                        __clear_bit(old_prio, old_array->prio_bitmap);
                set_dynamic_bit(p, rq);
        }
+       WARN_ON(debug_rqbitmap(rq));
 }
 
 /*
@@ -906,6 +929,7 @@ static inline void __activate_task(struc
 {
        enqueue_task(p, rq);
        inc_nr_running(p, rq);
+       WARN_ON(debug_rqbitmap(rq));
 }
 
 /*
@@ -1006,6 +1030,7 @@ static void deactivate_task(struct task_
 {
        dec_nr_running(p, rq);
        dequeue_task(p, rq);
+       WARN_ON(debug_rqbitmap(rq));
 }
 
 /*
@@ -1718,9 +1743,11 @@ void fastcall wake_up_new_task(struct ta
                 * Parent and child are on different CPUs, now get the
                 * parent runqueue to update the parent's ->flags:
                 */
+               WARN_ON(debug_rqbitmap(rq));
                task_rq_unlock(rq, &flags);
                this_rq = task_rq_lock(current, &flags);
        }
+       WARN_ON(debug_rqbitmap(this_rq));
        task_rq_unlock(this_rq, &flags);
 }
 
@@ -3357,6 +3384,7 @@ static inline void major_prio_rotation(s
        rq->dyn_bitmap = rq->active->prio_bitmap;
        rq->best_static_prio = MAX_PRIO - 1;
        rq->prio_rotation++;
+       WARN_ON(debug_rqbitmap(rq));
 }
 
 /*
@@ -3399,6 +3427,8 @@ static inline void rotate_runqueue_prior
                }
                memset(rq->prio_quota, 0, ARRAY_SIZE(rq->prio_quota));
                major_prio_rotation(rq);
+               WARN_ON(debug_rqbitmap(rq));
+
        } else {
                /* Minor rotation */
                new_prio_level = rq->prio_level + 1;
@@ -3409,6 +3439,7 @@ static inline void rotate_runqueue_prior
                        __set_bit(new_prio_level, rq->dyn_bitmap);
                }
                rq_quota(rq, rq->prio_level) = 0;
+               WARN_ON(debug_rqbitmap(rq));
        }
        rq->prio_level = new_prio_level;
        /*
@@ -3431,6 +3462,10 @@ static void task_running_tick(struct rq 
                return;
 
        spin_lock(&rq->lock);
+       if (!p->time_slice) {
+               printk(KERN_ERR "NO TIME_SLICE IN TRT \n");
+               p->time_slice++;
+       }
        /*
         * Accounting is performed by both the task and the runqueue. This
         * allows frequently sleeping tasks to get their proper quota of
@@ -3460,6 +3495,7 @@ static void task_running_tick(struct rq 
                set_tsk_need_resched(p);
        }
 out_unlock:
+       WARN_ON(debug_rqbitmap(rq));
        spin_unlock(&rq->lock);
 }
 
@@ -3479,6 +3515,7 @@ void scheduler_tick(void)
 
        if (!idle_at_tick)
                task_running_tick(rq, p, 1);
+       WARN_ON(debug_rqbitmap(rq));
 #ifdef CONFIG_SMP
        update_load(rq);
        rq->idle_at_tick = idle_at_tick;
@@ -3548,6 +3585,7 @@ static inline struct task_struct *next_d
        struct prio_array *array = rq->active;
        int expirations = 0;
 
+       WARN_ON(debug_rqbitmap(rq));
 retry:
        if (idx >= MAX_PRIO) {
                BUG_ON(++expirations > 1);
@@ -3601,6 +3639,7 @@ retry:
        if (next->static_prio < rq->best_static_prio &&
            next->policy != SCHED_BATCH)
                rq->best_static_prio = next->static_prio;
+       WARN_ON(debug_rqbitmap(rq));
        return next;
 }
 


-- 
-ck
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Reply via email to