I added a separate flag to keep track of whether a neighbour struct
is on the timer list or not.  I also added some logic to dump the
stack if the add-timer method was called while we are already on
the timer list.

This dump-stack method is being called quite often in my test that
used to reproduce the netdev hang, but the system has not hung yet.

Could a double-add to the timer list be the problem?

Here is the first error:

ERROR:  trying to add timer, but already there, neigh: d5884780
 [<c02cce45>] neigh_add_timer+0x75/0x80
 [<c02ce077>] neigh_timer_handler+0x97/0x2b0
 [<c012a3f2>] run_timer_softirq+0xe2/0x1b0
 [<c02cdfe0>] neigh_timer_handler+0x0/0x2b0
 [<c012a3f2>] run_timer_softirq+0xe2/0x1b0
 [<c011c1ef>] rebalance_tick+0xff/0x110
 [<c0126309>] __do_softirq+0xd9/0xf0
 [<c012636a>] do_softirq+0x4a/0x50
 [<c0126464>] irq_exit+0x44/0x50
 [<c0103a40>] apic_timer_interrupt+0x1c/0x24
 [<c0100f93>] mwait_idle+0x33/0x50
 [<c02104a7>] acpi_processor_idle+0x0/0x2ae
 [<c02105ad>] acpi_processor_idle+0x106/0x2ae
 [<c02104a7>] acpi_processor_idle+0x0/0x2ae
 [<c0100db1>] cpu_idle+0x61/0x80
 [<c046491c>] start_kernel+0x18c/0x1d0
 [<c0464360>] unknown_bootoption+0x0/0x1c0


Here is the patch, on top of the rest of my debugging code.

diff --git a/include/net/neighbour.h b/include/net/neighbour.h
--- a/include/net/neighbour.h
+++ b/include/net/neighbour.h
@@ -124,7 +124,7 @@ struct neighbour
        struct neighbour        *next;
        struct neigh_table      *tbl;
        struct neigh_parms      *parms;
-       struct net_device               *dev;
+       struct net_device       *dev;
        unsigned long           used;
        unsigned long           confirmed;
        unsigned long           updated;
@@ -136,10 +136,11 @@ struct neighbour
        rwlock_t                lock;
        unsigned char           ha[(MAX_ADDR_LEN+sizeof(unsigned 
long)-1)&~(sizeof(unsigned long)-1)];
        struct hh_cache         *hh;
-       atomic_t                __refcnt; /* modify through neigh_put, 
neigh_hold */
+       atomic_t                __refcnt; /* modify through neigh_release, 
neigh_hold */
        int                     (*output)(struct sk_buff *skb);
        struct sk_buff_head     arp_queue;
        struct timer_list       timer;
+       int                     in_timer; /* boolean */
        struct neigh_ops        *ops;
        u8                      primary_key[0];
 };
diff --git a/net/atm/clip.c b/net/atm/clip.c
--- a/net/atm/clip.c
+++ b/net/atm/clip.c
@@ -146,11 +146,11 @@ static int neigh_check_cb(struct neighbo
        if (entry->vccs || time_before(jiffies, entry->expires))
                return 0;

-       if (atomic_read(&n->refcnt) > 1) {
+       if (atomic_read(&n->__refcnt) > 1) {
                struct sk_buff *skb;

                DPRINTK("destruction postponed with ref %d\n",
-                       atomic_read(&n->refcnt));
+                       atomic_read(&n->__refcnt));

                while ((skb = skb_dequeue(&n->arp_queue)) != NULL)
                        dev_kfree_skb(skb);
@@ -539,7 +539,7 @@ static int clip_setentry(struct atm_vcc
        }
        error = ip_route_output_key(&rt,&fl);
        if (error) return error;
-       neigh = __neigh_lookup(&clip_tbl,&ip,rt->u.dst.dev,1);
+       neigh = __neigh_lookup(&clip_tbl,&ip,rt->u.dst.dev,1, NDRK_GENERIC);
        ip_rt_put(rt);
        if (!neigh)
                return -ENOMEM;
@@ -554,7 +554,7 @@ static int clip_setentry(struct atm_vcc
        }
        error = neigh_update(neigh, llc_oui, NUD_PERMANENT,
                             NEIGH_UPDATE_F_OVERRIDE|NEIGH_UPDATE_F_ADMIN);
-       neigh_release(neigh);
+       neigh_release(neigh, NDRK_GENERIC);
        return error;
 }

@@ -855,7 +855,7 @@ static void atmarp_info(struct seq_file
                        seq_printf(seq, "(resolving)\n");
                else
                        seq_printf(seq, "(expired, ref %d)\n",
-                                  atomic_read(&entry->neigh->refcnt));
+                                  atomic_read(&entry->neigh->__refcnt));
        } else if (!svc) {
                seq_printf(seq, "%d.%d.%d\n",
                           clip_vcc->vcc->dev->number,
diff --git a/net/core/dev.c b/net/core/dev.c
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -593,8 +593,8 @@ static void dump_rfcnt_info(struct net_d
                                                       n->flags, n->nud_state, 
n->type,
                                                       n->dead, 
atomic_read(&n->probes),
                                                       n->hh, 
atomic_read(&n->__refcnt));
-                                               printk("  output: %p  ops: 
%p\n",
-                                                      n->output, n->ops);
+                                               printk("  output: %p  ops: %p  in_timer: 
%d\n",+                                                      n->output, n->ops, 
n->in_timer);
                                                dump_neigh_rfcnt_info(n);

                                                debug_locate_neigh(n);
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -157,11 +157,31 @@ static int neigh_forced_gc(struct neigh_
        return shrunk;
 }

-static int neigh_del_timer(struct neighbour *n)
-{
-       if ((n->nud_state & NUD_IN_TIMER) &&
-           del_timer(&n->timer)) {
-               neigh_release(n, NDRK_NEIGH_TIMER);
+static int neigh_del_timer(struct neighbour *n) {
+       if (n->in_timer) {
+               if (!n->nud_state & NUD_IN_TIMER) {
+                       printk("ERROR:  in_timer true, but NUD_IN_TIMER false, 
neigh: %p\n", n);
+                       dump_stack();
+               }
+               if (del_timer(&n->timer)) {
+                       n->in_timer = 0;
+                       neigh_release(n, NDRK_NEIGH_TIMER);
+                       return 1;
+               }
+       }
+       return 0;
+}
+
+static int neigh_add_timer(struct neighbour *n, u32 expires) {
+       if (n->in_timer) {
+               printk("ERROR:  trying to add timer, but already there, neigh: 
%p\n", n);
+               dump_stack();
+       }
+       else {
+               n->timer.expires = expires;
+               neigh_hold(n, NDRK_NEIGH_TIMER);
+               add_timer(&n->timer);
+               n->in_timer = 1;
                return 1;
        }
        return 0;
@@ -748,6 +768,9 @@ static void neigh_timer_handler(unsigned

        write_lock(&neigh->lock);

+       /* We are out of the timer list at this point. */
+       neigh->in_timer = 0;
+
        state = neigh->nud_state;
        now = jiffies;
        next = now + HZ;
@@ -817,12 +840,9 @@ static void neigh_timer_handler(unsigned
        }

        if (neigh->nud_state & NUD_IN_TIMER) {
-               neigh_hold(neigh, NDRK_NEIGH_TIMER);
                if (time_before(next, jiffies + HZ/2))
                        next = jiffies + HZ/2;
-               neigh->timer.expires = next;
-               neigh->nud_state |= NUD_IN_TIMER;
-               add_timer(&neigh->timer);
+               neigh_add_timer(neigh, next);
        }
        if (neigh->nud_state & (NUD_INCOMPLETE | NUD_PROBE)) {
                struct sk_buff *skb = skb_peek(&neigh->arp_queue);
@@ -863,10 +883,7 @@ int __neigh_event_send(struct neighbour
                if (neigh->parms->mcast_probes + neigh->parms->app_probes) {
                        atomic_set(&neigh->probes, neigh->parms->ucast_probes);
                        neigh->nud_state     = NUD_INCOMPLETE;
-                       neigh_hold(neigh, NDRK_NEIGH_TIMER);
-                       neigh->timer.expires = now + 1;
-                       neigh->nud_state |= NUD_IN_TIMER;
-                       add_timer(&neigh->timer);
+                       neigh_add_timer(neigh, now + 1);
                } else {
                        neigh->nud_state = NUD_FAILED;
                        write_unlock_bh(&neigh->lock);
@@ -877,11 +894,8 @@ int __neigh_event_send(struct neighbour
                }
        } else if (neigh->nud_state & NUD_STALE) {
                NEIGH_PRINTK2("neigh %p is delayed.\n", neigh);
-               neigh_hold(neigh, NDRK_NEIGH_TIMER);
                neigh->nud_state = NUD_DELAY;
-               neigh->timer.expires = jiffies + neigh->parms->delay_probe_time;
-               neigh->nud_state |= NUD_IN_TIMER;
-               add_timer(&neigh->timer);
+               neigh_add_timer(neigh, jiffies + 
neigh->parms->delay_probe_time);
        }

        if (neigh->nud_state == NUD_INCOMPLETE) {
@@ -1026,12 +1040,9 @@ int neigh_update(struct neighbour *neigh
        if (new != old) {
                neigh_del_timer(neigh);
                if (new & NUD_IN_TIMER) {
-                       neigh_hold(neigh, NDRK_NEIGH_TIMER);
-                       neigh->timer.expires = jiffies +
+                       neigh_add_timer(neigh, (jiffies +
                                                ((new & NUD_REACHABLE) ?
-                                                neigh->parms->reachable_time : 
0);
-                       neigh->nud_state |= NUD_IN_TIMER;
-                       add_timer(&neigh->timer);
+                                                neigh->parms->reachable_time : 
0)));
                }
                neigh->nud_state = new;
        }

--
Ben Greear <[EMAIL PROTECTED]>
Candela Technologies Inc  http://www.candelatech.com

-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Reply via email to