On Mon, 2007-07-09 at 15:52 +0200, Patrick McHardy wrote: > Ranko Zivojnovic wrote: > > Patrick, I've taken the liberty to try and implement this myself. Attached > > is the whole new gen_estimator-fix-locking-and-timer-related-bugs.patch > > that is RCU lists based. Please be kind to review. > > Thanks for taking care of this, I'm a bit behind.
Glad to be able to help. > See below for comments .. > > > --- a/net/core/gen_estimator.c 2007-06-25 02:21:48.000000000 +0300 > > +++ b/net/core/gen_estimator.c 2007-07-09 14:27:12.053336875 +0300 > > @@ -79,7 +79,7 @@ > > > > struct gen_estimator > > { > > - struct gen_estimator *next; > > + struct list_head list; > > struct gnet_stats_basic *bstats; > > struct gnet_stats_rate_est *rate_est; > > spinlock_t *stats_lock; > > @@ -89,26 +89,27 @@ > > u32 last_packets; > > u32 avpps; > > u32 avbps; > > + struct rcu_head e_rcu; > > > You could also use synchronize_rcu(), estimator destruction is > not particulary performance critical. I've tried synchronize_rcu(), but it went kaboom with the below, thus call_rcu() must stay: ---cut--- BUG: sleeping function called from invalid context at kernel/sched.c:3928 in_atomic():1, irqs_disabled():0 3 locks held by tc/2933: #0: (rtnl_mutex){--..}, at: [<c0296e0a>] rtnetlink_rcv+0x18/0x42 #1: (&dev->queue_lock){-+..}, at: [<c029c337>] qdisc_lock_tree+0xe/0x1c #2: (&dev->ingress_lock){-...}, at: [<c029dfb8>] tc_get_qdisc+0x192/0x1e9 [<c02eec9e>] wait_for_completion+0x1a/0xb7 [<c01e3cd4>] __spin_lock_init+0x29/0x4b [<c012e410>] synchronize_rcu+0x2a/0x2f [<c012e0bc>] wakeme_after_rcu+0x0/0x8 [<c028da00>] gen_kill_estimator+0x70/0x9b [<f8b7c5a6>] htb_destroy_class+0x27/0x12f [sch_htb] [<f8b7d037>] htb_destroy+0x34/0x70 [sch_htb] [<c029c554>] qdisc_destroy+0x44/0x69 . . . ---cut--- > > static struct gen_estimator_head elist[EST_MAX_INTERVAL+1]; > > > > /* Estimator array lock */ > > -static DEFINE_RWLOCK(est_lock); > > +static DEFINE_SPINLOCK(est_lock); > > > The lock isn't needed anymore since we can rely on the rtnl > for creation/destruction. The 'est_lock' now completely removed. 
> > /** > > @@ -152,6 +153,7 @@ > > { > > struct gen_estimator *est; > > struct gnet_estimator *parm = RTA_DATA(opt); > > + int idx; > > > > if (RTA_PAYLOAD(opt) < sizeof(*parm)) > > return -EINVAL; > > @@ -163,7 +165,8 @@ > > if (est == NULL) > > return -ENOBUFS; > > > > - est->interval = parm->interval + 2; > > + INIT_LIST_HEAD(&est->list); > > > Initializing the members' list_head isn't necessary. > Removed. All the changes above are now included in the attached patch. I've cleanly compiled it with 2.6.22 and am now testing this version. R.
--- a/net/core/gen_estimator.c 2007-06-25 02:21:48.000000000 +0300 +++ b/net/core/gen_estimator.c 2007-07-09 19:08:06.801544963 +0300 @@ -79,7 +79,7 @@ struct gen_estimator { - struct gen_estimator *next; + struct list_head list; struct gnet_stats_basic *bstats; struct gnet_stats_rate_est *rate_est; spinlock_t *stats_lock; @@ -89,26 +89,24 @@ u32 last_packets; u32 avpps; u32 avbps; + struct rcu_head e_rcu; }; struct gen_estimator_head { struct timer_list timer; - struct gen_estimator *list; + struct list_head list; }; static struct gen_estimator_head elist[EST_MAX_INTERVAL+1]; -/* Estimator array lock */ -static DEFINE_RWLOCK(est_lock); - static void est_timer(unsigned long arg) { int idx = (int)arg; struct gen_estimator *e; - read_lock(&est_lock); - for (e = elist[idx].list; e; e = e->next) { + rcu_read_lock(); + list_for_each_entry_rcu(e, &elist[idx].list, list) { u64 nbytes; u32 npackets; u32 rate; @@ -127,9 +125,9 @@ e->rate_est->pps = (e->avpps+0x1FF)>>10; spin_unlock(e->stats_lock); } - - mod_timer(&elist[idx].timer, jiffies + ((HZ<<idx)/4)); - read_unlock(&est_lock); + if (!list_empty(&elist[idx].list)) + mod_timer(&elist[idx].timer, jiffies + ((HZ<<idx)/4)); + rcu_read_unlock(); } /** @@ -152,6 +150,7 @@ { struct gen_estimator *est; struct gnet_estimator *parm = RTA_DATA(opt); + int idx; if (RTA_PAYLOAD(opt) < sizeof(*parm)) return -EINVAL; @@ -163,7 +162,7 @@ if (est == NULL) return -ENOBUFS; - est->interval = parm->interval + 2; + est->interval = idx = parm->interval + 2; est->bstats = bstats; est->rate_est = rate_est; est->stats_lock = stats_lock; @@ -173,20 +172,24 @@ est->last_packets = bstats->packets; est->avpps = rate_est->pps<<10; - est->next = elist[est->interval].list; - if (est->next == NULL) { - init_timer(&elist[est->interval].timer); - elist[est->interval].timer.data = est->interval; - elist[est->interval].timer.expires = jiffies + ((HZ<<est->interval)/4); - elist[est->interval].timer.function = est_timer; - 
add_timer(&elist[est->interval].timer); + if (!elist[idx].timer.function) { + INIT_LIST_HEAD(&elist[idx].list); + setup_timer(&elist[idx].timer, est_timer, est->interval); } - write_lock_bh(&est_lock); - elist[est->interval].list = est; - write_unlock_bh(&est_lock); + + if (list_empty(&elist[est->interval].list)) + mod_timer(&elist[idx].timer, jiffies + ((HZ<<idx)/4)); + + list_add_rcu(&est->list, &elist[idx].list); return 0; } +static void __gen_kill_estimator(struct rcu_head *head) +{ + struct gen_estimator *e = container_of(head, struct gen_estimator, e_rcu); + kfree(e); +} + /** * gen_kill_estimator - remove a rate estimator * @bstats: basic statistics @@ -199,26 +202,21 @@ struct gnet_stats_rate_est *rate_est) { int idx; - struct gen_estimator *est, **pest; + struct gen_estimator *e, *n; for (idx=0; idx <= EST_MAX_INTERVAL; idx++) { - int killed = 0; - pest = &elist[idx].list; - while ((est=*pest) != NULL) { - if (est->rate_est != rate_est || est->bstats != bstats) { - pest = &est->next; - continue; - } - write_lock_bh(&est_lock); - *pest = est->next; - write_unlock_bh(&est_lock); + /* Skip non initialized indexes */ + if (!elist[idx].timer.function) + continue; + + list_for_each_entry_safe(e, n, &elist[idx].list, list) { + if (e->rate_est != rate_est || e->bstats != bstats) + continue; - kfree(est); - killed++; + list_del_rcu(&e->list); + call_rcu(&e->e_rcu, __gen_kill_estimator); } - if (killed && elist[idx].list == NULL) - del_timer(&elist[idx].timer); } }