I added a separate flag to keep track of whether a neighbour struct is on the timer list or not. I also added some logic to dump the stack if the add-timer method was called while we are already on the timer list.
This dump-stack method is being called quite often in my test that used to reproduce the netdev hang, but the system has not hung yet. Could a double-add to the timer list be the problem? Here is the first error: ERROR: trying to add timer, but already there, neigh: d5884780 [<c02cce45>] neigh_add_timer+0x75/0x80 [<c02ce077>] neigh_timer_handler+0x97/0x2b0 [<c012a3f2>] run_timer_softirq+0xe2/0x1b0 [<c02cdfe0>] neigh_timer_handler+0x0/0x2b0 [<c012a3f2>] run_timer_softirq+0xe2/0x1b0 [<c011c1ef>] rebalance_tick+0xff/0x110 [<c0126309>] __do_softirq+0xd9/0xf0 [<c012636a>] do_softirq+0x4a/0x50 [<c0126464>] irq_exit+0x44/0x50 [<c0103a40>] apic_timer_interrupt+0x1c/0x24 [<c0100f93>] mwait_idle+0x33/0x50 [<c02104a7>] acpi_processor_idle+0x0/0x2ae [<c02105ad>] acpi_processor_idle+0x106/0x2ae [<c02104a7>] acpi_processor_idle+0x0/0x2ae [<c0100db1>] cpu_idle+0x61/0x80 [<c046491c>] start_kernel+0x18c/0x1d0 [<c0464360>] unknown_bootoption+0x0/0x1c0 Here is the patch, on top of the rest of my debugging code. diff --git a/include/net/neighbour.h b/include/net/neighbour.h --- a/include/net/neighbour.h +++ b/include/net/neighbour.h @@ -124,7 +124,7 @@ struct neighbour struct neighbour *next; struct neigh_table *tbl; struct neigh_parms *parms; - struct net_device *dev; + struct net_device *dev; unsigned long used; unsigned long confirmed; unsigned long updated; @@ -136,10 +136,11 @@ struct neighbour rwlock_t lock; unsigned char ha[(MAX_ADDR_LEN+sizeof(unsigned long)-1)&~(sizeof(unsigned long)-1)]; struct hh_cache *hh; - atomic_t __refcnt; /* modify through neigh_put, neigh_hold */ + atomic_t __refcnt; /* modify through neigh_release, neigh_hold */ int (*output)(struct sk_buff *skb); struct sk_buff_head arp_queue; struct timer_list timer; + int in_timer; /* boolean */ struct neigh_ops *ops; u8 primary_key[0]; }; diff --git a/net/atm/clip.c b/net/atm/clip.c --- a/net/atm/clip.c +++ b/net/atm/clip.c @@ -146,11 +146,11 @@ static int neigh_check_cb(struct neighbo if (entry->vccs || time_before(jiffies, entry->expires)) return 0; - if (atomic_read(&n->refcnt) > 1) { + if (atomic_read(&n->__refcnt) > 1) { struct sk_buff *skb; DPRINTK("destruction postponed with ref %d\n", - atomic_read(&n->refcnt)); + atomic_read(&n->__refcnt)); while ((skb = skb_dequeue(&n->arp_queue)) != NULL) dev_kfree_skb(skb); @@ -539,7 +539,7 @@ static int clip_setentry(struct atm_vcc } error = ip_route_output_key(&rt,&fl); if (error) return error; - neigh = __neigh_lookup(&clip_tbl,&ip,rt->u.dst.dev,1); + neigh = __neigh_lookup(&clip_tbl,&ip,rt->u.dst.dev,1, NDRK_GENERIC); ip_rt_put(rt); if (!neigh) return -ENOMEM; @@ -554,7 +554,7 @@ static int clip_setentry(struct atm_vcc } error = neigh_update(neigh, llc_oui, NUD_PERMANENT, NEIGH_UPDATE_F_OVERRIDE|NEIGH_UPDATE_F_ADMIN); - neigh_release(neigh); + neigh_release(neigh, NDRK_GENERIC); return error; } @@ -855,7 +855,7 @@ static void atmarp_info(struct seq_file seq_printf(seq, "(resolving)\n"); else seq_printf(seq, "(expired, ref %d)\n", - atomic_read(&entry->neigh->refcnt)); + atomic_read(&entry->neigh->__refcnt)); } else if (!svc) { seq_printf(seq, "%d.%d.%d\n", clip_vcc->vcc->dev->number, diff --git a/net/core/dev.c b/net/core/dev.c --- a/net/core/dev.c +++ b/net/core/dev.c @@ -593,8 +593,8 @@ static void dump_rfcnt_info(struct net_d n->flags, n->nud_state, n->type, n->dead, atomic_read(&n->probes), n->hh, atomic_read(&n->__refcnt)); - printk(" output: %p ops: %p\n", - n->output, n->ops); + printk(" output: %p ops: %p in_timer: %d\n",+ n->output, n->ops, n->in_timer); dump_neigh_rfcnt_info(n); debug_locate_neigh(n); diff --git a/net/core/neighbour.c b/net/core/neighbour.c --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -157,11 +157,31 @@ static int neigh_forced_gc(struct neigh_ return shrunk; } -static int neigh_del_timer(struct neighbour *n) -{ - if ((n->nud_state & NUD_IN_TIMER) && - del_timer(&n->timer)) { - neigh_release(n, NDRK_NEIGH_TIMER); +static int neigh_del_timer(struct neighbour *n) { + if (n->in_timer) { + if (!n->nud_state & NUD_IN_TIMER) { + printk("ERROR: in_timer true, but NUD_IN_TIMER false, neigh: %p\n", n); + dump_stack(); + } + if (del_timer(&n->timer)) { + n->in_timer = 0; + neigh_release(n, NDRK_NEIGH_TIMER); + return 1; + } + } + return 0; +} + +static int neigh_add_timer(struct neighbour *n, u32 expires) { + if (n->in_timer) { + printk("ERROR: trying to add timer, but already there, neigh: %p\n", n); + dump_stack(); + } + else { + n->timer.expires = expires; + neigh_hold(n, NDRK_NEIGH_TIMER); + add_timer(&n->timer); + n->in_timer = 1; return 1; } return 0; @@ -748,6 +768,9 @@ static void neigh_timer_handler(unsigned write_lock(&neigh->lock); + /* We are out of the timer list at this point. */ + neigh->in_timer = 0; + state = neigh->nud_state; now = jiffies; next = now + HZ; @@ -817,12 +840,9 @@ static void neigh_timer_handler(unsigned } if (neigh->nud_state & NUD_IN_TIMER) { - neigh_hold(neigh, NDRK_NEIGH_TIMER); if (time_before(next, jiffies + HZ/2)) next = jiffies + HZ/2; - neigh->timer.expires = next; - neigh->nud_state |= NUD_IN_TIMER; - add_timer(&neigh->timer); + neigh_add_timer(neigh, next); } if (neigh->nud_state & (NUD_INCOMPLETE | NUD_PROBE)) { struct sk_buff *skb = skb_peek(&neigh->arp_queue); @@ -863,10 +883,7 @@ int __neigh_event_send(struct neighbour if (neigh->parms->mcast_probes + neigh->parms->app_probes) { atomic_set(&neigh->probes, neigh->parms->ucast_probes); neigh->nud_state = NUD_INCOMPLETE; - neigh_hold(neigh, NDRK_NEIGH_TIMER); - neigh->timer.expires = now + 1; - neigh->nud_state |= NUD_IN_TIMER; - add_timer(&neigh->timer); + neigh_add_timer(neigh, now + 1); } else { neigh->nud_state = NUD_FAILED; write_unlock_bh(&neigh->lock); @@ -877,11 +894,8 @@ int __neigh_event_send(struct neighbour } } else if (neigh->nud_state & NUD_STALE) { NEIGH_PRINTK2("neigh %p is delayed.\n", neigh); - neigh_hold(neigh, NDRK_NEIGH_TIMER); neigh->nud_state = NUD_DELAY; - neigh->timer.expires = jiffies + neigh->parms->delay_probe_time; - neigh->nud_state |= NUD_IN_TIMER; - add_timer(&neigh->timer); + neigh_add_timer(neigh, jiffies + neigh->parms->delay_probe_time); } if (neigh->nud_state == NUD_INCOMPLETE) { @@ -1026,12 +1040,9 @@ int neigh_update(struct neighbour *neigh if (new != old) { neigh_del_timer(neigh); if (new & NUD_IN_TIMER) { - neigh_hold(neigh, NDRK_NEIGH_TIMER); - neigh->timer.expires = jiffies + + neigh_add_timer(neigh, (jiffies + ((new & NUD_REACHABLE) ? - neigh->parms->reachable_time : 0); - neigh->nud_state |= NUD_IN_TIMER; - add_timer(&neigh->timer); + neigh->parms->reachable_time : 0))); } neigh->nud_state = new; } -- Ben Greear <[EMAIL PROTECTED]> Candela Technologies Inc http://www.candelatech.com - To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html