Currently, deleting a qdisc with a large number of children and filters
can take a long time:
tc qdisc add dev lo root htb
for I in `seq 1 1000`; do
	tc class add dev lo parent 1: classid 1:$I htb rate 100kbit
	tc qdisc add dev lo parent 1:$I handle $((I + 1)): htb
	for J in `seq 1 10`; do
		tc filter add dev lo parent $((I + 1)): u32 match ip src 1.1.1.$J
	done
done
time tc qdisc del dev lo root

real	0m54.764s
user	0m0.023s
sys	0m0.000s

This is due to the multiple rcu_barrier() calls, one for each tcf_block
freed, all invoked while holding the rtnl lock. Most other network-related
tasks will block for that whole timespan.

This change implements bulk freeing of tcf_blocks at destroy() time: when
a qdisc hierarchy is deleted, the to-be-deleted blocks are queued on an
rtnl_lock-protected list, and a single rcu_barrier() is invoked per burst.
The burst is flushed after the deletion of the topmost qdisc of the
destroyed hierarchy, and all the queued blocks are then freed by a single
scheduled work item.

After this change, the previous example gives:

real	0m0.193s
user	0m0.000s
sys	0m0.016s

Signed-off-by: Paolo Abeni <pab...@redhat.com>
---
This patch adds two new list_head fields to tcf_block; they could be
replaced with a single pointer, open-coding the singly linked list
manipulation in cls_api.c, if a lower memory footprint is required.

A stand-alone user-space sketch of the batching idea follows the diffstat
below; a second sketch of the topmost-qdisc flush logic follows the patch.
---
 include/net/pkt_cls.h     |  1 +
 include/net/sch_generic.h |  5 +++
 net/sched/cls_api.c       | 78 +++++++++++++++++++++++++++++++++++------------
 net/sched/sch_api.c       |  1 +
 net/sched/sch_generic.c   | 17 +++++++++++
 net/sched/sch_ingress.c   |  1 +
 6 files changed, 83 insertions(+), 20 deletions(-)
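As an illustration only (this is not kernel code), here is a minimal
user-space model of the batching idea: expensive_barrier() is a
hypothetical stand-in for rcu_barrier(), and the open-coded single-pointer
list corresponds to the TODO in the sch_generic.h hunk. The old scheme
pays one barrier per freed block; the new one queues blocks and pays a
single barrier per burst.

#include <stdio.h>
#include <stdlib.h>

/* Hypothetical stand-in for rcu_barrier(): an expensive, per-call wait. */
static void expensive_barrier(void)
{
	/* imagine a multi-millisecond synchronization here */
}

struct block {
	int id;
	struct block *next;	/* open-coded singly linked list */
};

static struct block *queued_blocks;

/* Old scheme: one barrier for every block freed. */
static void put_block_old(struct block *b)
{
	expensive_barrier();
	free(b);
}

/* New scheme, step 1: deleting a block only queues it (cheap). */
static void queue_for_deletion(struct block *b)
{
	b->next = queued_blocks;
	queued_blocks = b;
}

/* New scheme, step 2: a single barrier, then free the whole burst. */
static void flush_blocks(void)
{
	struct block *b, *next;

	if (!queued_blocks)
		return;

	expensive_barrier();
	for (b = queued_blocks; b; b = next) {
		next = b->next;
		free(b);
	}
	queued_blocks = NULL;
}

int main(void)
{
	struct block *b = malloc(sizeof(*b));

	if (b)
		put_block_old(b);	/* old: 1 barrier for 1 block */

	for (int i = 0; i < 1000; i++) {
		b = malloc(sizeof(*b));
		if (!b)
			return 1;
		b->id = i;
		queue_for_deletion(b);
	}
	flush_blocks();			/* new: 1 barrier for 1000 blocks */
	printf("bulk-freed 1000 blocks with a single barrier\n");
	return 0;
}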
diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h
index 0105445cab83..12777cfae77c 100644
--- a/include/net/pkt_cls.h
+++ b/include/net/pkt_cls.h
@@ -45,6 +45,7 @@ int tcf_block_get_ext(struct tcf_block **p_block, struct Qdisc *q,
 void tcf_block_put(struct tcf_block *block);
 void tcf_block_put_ext(struct tcf_block *block, struct Qdisc *q,
 		       struct tcf_block_ext_info *ei);
+void tcf_flush_blocks(void);
 
 static inline struct Qdisc *tcf_block_q(struct tcf_block *block)
 {
diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 65d0d25f2648..99846ee644a8 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -71,6 +71,7 @@ struct Qdisc {
 				      * qdisc_tree_decrease_qlen() should stop.
 				      */
 #define TCQ_F_INVISIBLE		0x80 /* invisible by default in dump */
+#define TCQ_F_DELETING		0x100
 	u32			limit;
 	const struct Qdisc_ops	*ops;
 	struct qdisc_size_table	__rcu *stab;
@@ -279,6 +280,10 @@ struct tcf_block {
 	struct Qdisc *q;
 	struct list_head cb_list;
 	struct work_struct work;
+
+	/* TODO: use a single list, to avoid wasting too much memory */
+	struct list_head del_list;
+	struct list_head del_head;
 };
 
 static inline void qdisc_cb_private_validate(const struct sk_buff *skb, int sz)
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index 7d97f612c9b9..446b16c1f532 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -37,6 +37,61 @@ static LIST_HEAD(tcf_proto_base);
 /* Protects list of registered TC modules. It is pure SMP lock. */
 static DEFINE_RWLOCK(cls_mod_lock);
 
+/* List of tcf_blocks queued for deletion. By bulk freeing them we avoid
+ * the rcu_barrier() storm at root_qdisc->destroy() time.
+ */
+static LIST_HEAD(queued_blocks);
+
+static void queue_for_deletion(struct tcf_block *block)
+{
+	if (WARN_ON(!list_empty(&block->del_list)))
+		return;
+
+	ASSERT_RTNL();
+	list_add(&block->del_list, &queued_blocks);
+}
+
+static void flush_blocks(struct work_struct *work)
+{
+	struct tcf_block *block, *tmp_block;
+	struct tcf_chain *chain, *tmp;
+	struct list_head *head;
+
+	head = &container_of(work, struct tcf_block, work)->del_head;
+	rtnl_lock();
+	list_for_each_entry(block, head, del_list)
+		/* Only chain 0 should be still here. */
+		list_for_each_entry_safe(chain, tmp, &block->chain_list, list)
+			tcf_chain_put(chain);
+	rtnl_unlock();
+
+	list_for_each_entry_safe(block, tmp_block, head, del_list)
+		kfree(block);
+}
+
+void tcf_flush_blocks(void)
+{
+	struct tcf_block *head;
+	LIST_HEAD(flush_burst);
+
+	ASSERT_RTNL();
+	if (list_empty(&queued_blocks))
+		return;
+
+	head = list_first_entry(&queued_blocks, struct tcf_block, del_list);
+	INIT_LIST_HEAD(&head->del_head);
+	list_splice_init(&queued_blocks, &head->del_head);
+	INIT_WORK(&head->work, flush_blocks);
+
+	/* Wait for existing RCU callbacks to cool down, make sure their works
+	 * have been queued before this. We can not flush pending works here
+	 * because we are holding the RTNL lock.
+	 */
+	rcu_barrier();
+	schedule_work(&head->work);
+}
+EXPORT_SYMBOL_GPL(tcf_flush_blocks);
+
 /* Find classifier type by string name */
 static const struct tcf_proto_ops *tcf_proto_lookup_ops(const char *kind)
@@ -288,6 +343,7 @@ int tcf_block_get_ext(struct tcf_block **p_block, struct Qdisc *q,
 		return -ENOMEM;
 	INIT_LIST_HEAD(&block->chain_list);
 	INIT_LIST_HEAD(&block->cb_list);
+	INIT_LIST_HEAD(&block->del_list);
 
 	/* Create chain 0 by default, it has to be always present. */
 	chain = tcf_chain_create(block, 0);
@@ -330,19 +386,6 @@ int tcf_block_get(struct tcf_block **p_block,
 }
 EXPORT_SYMBOL(tcf_block_get);
 
-static void tcf_block_put_final(struct work_struct *work)
-{
-	struct tcf_block *block = container_of(work, struct tcf_block, work);
-	struct tcf_chain *chain, *tmp;
-
-	rtnl_lock();
-	/* Only chain 0 should be still here. */
-	list_for_each_entry_safe(chain, tmp, &block->chain_list, list)
-		tcf_chain_put(chain);
-	rtnl_unlock();
-	kfree(block);
-}
-
 /* XXX: Standalone actions are not allowed to jump to any chain, and bound
  * actions should be all removed after flushing. However, filters are now
  * destroyed in tc filter workqueue with RTNL lock, they can not race here.
@@ -357,13 +400,7 @@ void tcf_block_put_ext(struct tcf_block *block, struct Qdisc *q,
 
 	tcf_block_offload_unbind(block, q, ei);
 
-	INIT_WORK(&block->work, tcf_block_put_final);
-	/* Wait for existing RCU callbacks to cool down, make sure their works
-	 * have been queued before this. We can not flush pending works here
-	 * because we are holding the RTNL lock.
-	 */
-	rcu_barrier();
-	tcf_queue_work(&block->work);
+	queue_for_deletion(block);
 }
 EXPORT_SYMBOL(tcf_block_put_ext);
 
@@ -920,6 +957,7 @@ static int tc_ctl_tfilter(struct sk_buff *skb, struct nlmsghdr *n,
 		if (tp_created)
 			tcf_proto_destroy(tp);
 	}
+	tcf_flush_blocks();
 
 errout:
 	if (chain)
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index b6c4f536876b..5fe2dcb73bfd 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -1645,6 +1645,7 @@ static int tclass_del_notify(struct net *net,
 		kfree_skb(skb);
 		return err;
 	}
+	tcf_flush_blocks();
 
 	return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
 			      n->nlmsg_flags & NLM_F_ECHO);
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 3839cbbdc32b..299c5d33916a 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -28,6 +28,7 @@
 #include <linux/if_vlan.h>
 #include <net/sch_generic.h>
 #include <net/pkt_sched.h>
+#include <net/pkt_cls.h>
 #include <net/dst.h>
 #include <trace/events/qdisc.h>
 
@@ -708,11 +709,25 @@ static void qdisc_free(struct Qdisc *qdisc)
 void qdisc_destroy(struct Qdisc *qdisc)
 {
 	const struct Qdisc_ops *ops = qdisc->ops;
+	struct Qdisc *p;
+	bool flush;
 
 	if (qdisc->flags & TCQ_F_BUILTIN ||
 	    !refcount_dec_and_test(&qdisc->refcnt))
 		return;
 
+	/* We can avoid flushing the pending blocks if this qdisc is a child
+	 * deleted by a recursive destroy() call and the parent qdisc is
+	 * already being removed.
+	 */
+	qdisc->flags |= TCQ_F_DELETING;
+	if (qdisc->parent != TC_H_ROOT) {
+		p = qdisc_lookup(qdisc_dev(qdisc), TC_H_MAJ(qdisc->parent));
+		flush = p && !(p->flags & TCQ_F_DELETING);
+	} else {
+		flush = true;
+	}
+
 #ifdef CONFIG_NET_SCHED
 	qdisc_hash_del(qdisc);
 
@@ -723,6 +738,8 @@ void qdisc_destroy(struct Qdisc *qdisc)
 		ops->reset(qdisc);
 	if (ops->destroy)
 		ops->destroy(qdisc);
+	if (flush)
+		tcf_flush_blocks();
 
 	module_put(ops->owner);
 	dev_put(qdisc_dev(qdisc));
diff --git a/net/sched/sch_ingress.c b/net/sched/sch_ingress.c
index 5ecc38f35d47..7329b8f55160 100644
--- a/net/sched/sch_ingress.c
+++ b/net/sched/sch_ingress.c
@@ -201,6 +201,7 @@ static int clsact_init(struct Qdisc *sch, struct nlattr *opt)
 
 err_egress_block_get:
	tcf_block_put_ext(q->ingress_block, sch, &q->ingress_block_info);
+	tcf_flush_blocks();
 	return err;
 }
-- 
2.13.6
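Similarly, a user-space model of the TCQ_F_DELETING logic above
(hypothetical names, not kernel code): every node in a destroyed hierarchy
gets flagged on the way down, so only the topmost one, whose parent is not
itself flagged, ends up triggering the flush of the queued burst.

#include <stdbool.h>
#include <stdio.h>

#define F_DELETING 0x1		/* models TCQ_F_DELETING */

struct node {
	unsigned int flags;
	struct node *parent;	/* NULL plays the role of TC_H_ROOT */
	struct node *child;
};

static int flush_count;

static void flush_blocks(void)
{
	flush_count++;		/* stands for tcf_flush_blocks() */
}

static void destroy(struct node *n)
{
	bool flush;

	n->flags |= F_DELETING;
	/* Flush only when the parent is not already being destroyed,
	 * i.e. when this is the topmost node of the hierarchy.
	 */
	flush = !n->parent || !(n->parent->flags & F_DELETING);

	if (n->child)
		destroy(n->child);	/* recursive child teardown */

	if (flush)
		flush_blocks();
}

int main(void)
{
	struct node leaf = { 0, NULL, NULL };
	struct node mid = { 0, NULL, &leaf };
	struct node root = { 0, NULL, &mid };

	mid.parent = &root;
	leaf.parent = &mid;

	destroy(&root);
	printf("flush ran %d time(s) while destroying 3 qdiscs\n",
	       flush_count);
	return 0;
}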