This is a re-post of a previous patch written by David Miller [1]. Phil Karn reported [2] that on busy networks with lots of unresolved multicast routing entries, the creation of new multicast group routes can be extremely slow and unreliable.
The reason is that we hard-code the number of multicast route entries with
unresolved source addresses (cache_resolve_queue_len) to 10. If some multicast
routes never resolve and the unresolved entries keep growing, no new multicast
route cache entries can be created at all.

To resolve this issue, we could either add a sysctl entry to make
cache_resolve_queue_len configurable, or just remove cache_resolve_queue_len
entirely, since we already have the socket receive queue limits of the mrouted
socket, as David pointed out.

From my side, I'd prefer to remove cache_resolve_queue_len instead of creating
two more sysctl entries (an IPv4 and an IPv6 version).

[1] https://lkml.org/lkml/2018/7/22/11
[2] https://lkml.org/lkml/2018/7/21/343

v2: hold the mfc_unres_lock while walking the unresolved list in queue_count(),
    as Nikolay Aleksandrov reminded.

Reported-by: Phil Karn <k...@ka9q.net>
Signed-off-by: Hangbin Liu <liuhang...@gmail.com>
---
 include/linux/mroute_base.h |  2 --
 net/ipv4/ipmr.c             | 30 +++++++++++++++++++++---------
 net/ipv6/ip6mr.c            | 10 +++-------
 3 files changed, 24 insertions(+), 18 deletions(-)

diff --git a/include/linux/mroute_base.h b/include/linux/mroute_base.h
index 34de06b426ef..50fb89533029 100644
--- a/include/linux/mroute_base.h
+++ b/include/linux/mroute_base.h
@@ -235,7 +235,6 @@ struct mr_table_ops {
  * @mfc_hash: Hash table of all resolved routes for easy lookup
  * @mfc_cache_list: list of resovled routes for possible traversal
  * @maxvif: Identifier of highest value vif currently in use
- * @cache_resolve_queue_len: current size of unresolved queue
  * @mroute_do_assert: Whether to inform userspace on wrong ingress
  * @mroute_do_pim: Whether to receive IGMP PIMv1
  * @mroute_reg_vif_num: PIM-device vif index
@@ -252,7 +251,6 @@ struct mr_table {
        struct rhltable         mfc_hash;
        struct list_head        mfc_cache_list;
        int                     maxvif;
-       atomic_t                cache_resolve_queue_len;
        bool                    mroute_do_assert;
        bool                    mroute_do_pim;
        bool                    mroute_do_wrvifwhole;
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index c07bc82cbbe9..4d67c64d94a4 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -744,8 +744,6 @@ static void ipmr_destroy_unres(struct mr_table *mrt, struct mfc_cache *c)
        struct sk_buff *skb;
        struct nlmsgerr *e;
 
-       atomic_dec(&mrt->cache_resolve_queue_len);
-
        while ((skb = skb_dequeue(&c->_c.mfc_un.unres.unresolved))) {
                if (ip_hdr(skb)->version == 0) {
                        struct nlmsghdr *nlh = skb_pull(skb,
@@ -1133,9 +1131,11 @@ static int ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi,
        }
 
        if (!found) {
+               bool was_empty;
+
                /* Create a new entry if allowable */
-               if (atomic_read(&mrt->cache_resolve_queue_len) >= 10 ||
-                   (c = ipmr_cache_alloc_unres()) == NULL) {
+               c = ipmr_cache_alloc_unres();
+               if (!c) {
                        spin_unlock_bh(&mfc_unres_lock);
 
                        kfree_skb(skb);
@@ -1161,11 +1161,11 @@ static int ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi,
                        return err;
                }
 
-               atomic_inc(&mrt->cache_resolve_queue_len);
+               was_empty = list_empty(&mrt->mfc_unres_queue);
                list_add(&c->_c.list, &mrt->mfc_unres_queue);
                mroute_netlink_event(mrt, c, RTM_NEWROUTE);
 
-               if (atomic_read(&mrt->cache_resolve_queue_len) == 1)
+               if (was_empty)
                        mod_timer(&mrt->ipmr_expire_timer,
                                  c->_c.mfc_un.unres.expires);
        }
@@ -1272,7 +1272,6 @@ static int ipmr_mfc_add(struct net *net, struct mr_table *mrt,
                if (uc->mfc_origin == c->mfc_origin &&
                    uc->mfc_mcastgrp == c->mfc_mcastgrp) {
                        list_del(&_uc->list);
-                       atomic_dec(&mrt->cache_resolve_queue_len);
                        found = true;
                        break;
                }
@@ -1328,7 +1327,7 @@ static void mroute_clean_tables(struct mr_table *mrt, int flags)
        }
 
        if (flags & MRT_FLUSH_MFC) {
-               if (atomic_read(&mrt->cache_resolve_queue_len) != 0) {
+               if (!list_empty(&mrt->mfc_unres_queue)) {
                        spin_lock_bh(&mfc_unres_lock);
                        list_for_each_entry_safe(c, tmp, &mrt->mfc_unres_queue, list) {
                                list_del(&c->list);
@@ -2750,9 +2749,22 @@ static int ipmr_rtm_route(struct sk_buff *skb, struct nlmsghdr *nlh,
        return ipmr_mfc_delete(tbl, &mfcc, parent);
 }
 
+static int queue_count(struct mr_table *mrt)
+{
+       struct list_head *pos;
+       int count = 0;
+
+       spin_lock_bh(&mfc_unres_lock);
+       list_for_each(pos, &mrt->mfc_unres_queue)
+               count++;
+       spin_unlock_bh(&mfc_unres_lock);
+
+       return count;
+}
+
 static bool ipmr_fill_table(struct mr_table *mrt, struct sk_buff *skb)
 {
-       u32 queue_len = atomic_read(&mrt->cache_resolve_queue_len);
+       u32 queue_len = queue_count(mrt);
 
        if (nla_put_u32(skb, IPMRA_TABLE_ID, mrt->id) ||
            nla_put_u32(skb, IPMRA_TABLE_CACHE_RES_QUEUE_LEN, queue_len) ||
diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index e80d36c5073d..d02f0f903559 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -768,8 +768,6 @@ static void ip6mr_destroy_unres(struct mr_table *mrt, struct mfc6_cache *c)
        struct net *net = read_pnet(&mrt->net);
        struct sk_buff *skb;
 
-       atomic_dec(&mrt->cache_resolve_queue_len);
-
        while ((skb = skb_dequeue(&c->_c.mfc_un.unres.unresolved)) != NULL) {
                if (ipv6_hdr(skb)->version == 0) {
                        struct nlmsghdr *nlh = skb_pull(skb,
@@ -1148,8 +1146,8 @@ static int ip6mr_cache_unresolved(struct mr_table *mrt, mifi_t mifi,
                 *      Create a new entry if allowable
                 */
 
-               if (atomic_read(&mrt->cache_resolve_queue_len) >= 10 ||
-                   (c = ip6mr_cache_alloc_unres()) == NULL) {
+               c = ip6mr_cache_alloc_unres();
+               if (!c) {
                        spin_unlock_bh(&mfc_unres_lock);
 
                        kfree_skb(skb);
@@ -1176,7 +1174,6 @@ static int ip6mr_cache_unresolved(struct mr_table *mrt, mifi_t mifi,
                        return err;
                }
 
-               atomic_inc(&mrt->cache_resolve_queue_len);
                list_add(&c->_c.list, &mrt->mfc_unres_queue);
                mr6_netlink_event(mrt, c, RTM_NEWROUTE);
 
@@ -1468,7 +1465,6 @@ static int ip6mr_mfc_add(struct net *net, struct mr_table *mrt,
                if (ipv6_addr_equal(&uc->mf6c_origin, &c->mf6c_origin) &&
                    ipv6_addr_equal(&uc->mf6c_mcastgrp, &c->mf6c_mcastgrp)) {
                        list_del(&_uc->list);
-                       atomic_dec(&mrt->cache_resolve_queue_len);
                        found = true;
                        break;
                }
@@ -1526,7 +1522,7 @@ static void mroute_clean_tables(struct mr_table *mrt, int flags)
        }
 
        if (flags & MRT6_FLUSH_MFC) {
-               if (atomic_read(&mrt->cache_resolve_queue_len) != 0) {
+               if (!list_empty(&mrt->mfc_unres_queue)) {
                        spin_lock_bh(&mfc_unres_lock);
                        list_for_each_entry_safe(c, tmp, &mrt->mfc_unres_queue, list) {
                                list_del(&c->list);
-- 
2.19.2
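
Not part of the patch itself, but for reviewers wondering what bounds the unresolved
backlog once the hard-coded 10-entry limit is gone: the IGMPMSG_NOCACHE upcalls queued
to the mrouted control socket are limited by that socket's receive buffer, which the
daemon itself controls. A minimal user-space sketch of that knob follows; the 1 MB
figure and the helper name are only illustrative, nothing this patch mandates.

  /* Sketch of an mrouted-style daemon raising the receive buffer on its
   * mroute control socket; pending IGMPMSG_NOCACHE upcalls are bounded by
   * this buffer rather than by any in-kernel entry count.
   */
  #include <netinet/in.h>
  #include <sys/socket.h>
  #include <linux/mroute.h>
  #include <stdio.h>

  int open_mroute_sock(void)
  {
          int one = 1;
          int rcvbuf = 1024 * 1024;       /* example value only */
          int fd = socket(AF_INET, SOCK_RAW, IPPROTO_IGMP);

          if (fd < 0)
                  return -1;

          /* Become the multicast routing daemon for this namespace */
          if (setsockopt(fd, IPPROTO_IP, MRT_INIT, &one, sizeof(one)) < 0)
                  perror("MRT_INIT");

          /* Enlarge the queue that absorbs unresolved-route upcalls */
          if (setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &rcvbuf, sizeof(rcvbuf)) < 0)
                  perror("SO_RCVBUF");

          return fd;
  }

The kernel clamps the requested size to net.core.rmem_max, so a busy router may also
want to raise that sysctl (or the daemon can use SO_RCVBUFFORCE with CAP_NET_ADMIN).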