Currently all kernels (including vanilla) free the ifa list with rtnl_lock() held, so releasing all entries takes a huge amount of time when we stop the container. Moreover, it's allowed to create an unlimited number of addresses from inside a net namespace if CAP_NET_ADMIN is granted (which is common for containers).
Let's introduce a per-net limit on the number of addresses (4096 by default), which can be tuned via the sysctl entry /proc/sys/net/ipv4/ifa_limit. Reported-by: Solar Designer <so...@openwall.com> Signed-off-by: Cyrill Gorcunov <gorcu...@virtuozzo.com> CC: Vasily Averin <v...@virtuozzo.com> CC: Andrey Vagin <ava...@virtuozzo.com> CC: Pavel Emelianov <xe...@virtuozzo.com> CC: Vladimir Davydov <vdavy...@virtuozzo.com> CC: Konstantin Khorenko <khore...@virtuozzo.com> CC: David Miller <da...@davemloft.net> CC: Eric Dumazet <eric.duma...@gmail.com> --- Please share your ideas if a more elegant way exists to fix this problem; maybe I'm missing something obvious. Thanks! include/net/netns/ipv4.h | 3 +++ net/ipv4/devinet.c | 34 +++++++++++++++++++--------------- net/ipv4/sysctl_net_ipv4.c | 8 ++++++++ 3 files changed, 30 insertions(+), 15 deletions(-) Index: linux-ml.git/include/net/netns/ipv4.h =================================================================== --- linux-ml.git.orig/include/net/netns/ipv4.h +++ linux-ml.git/include/net/netns/ipv4.h @@ -77,6 +77,8 @@ struct netns_ipv4 { struct local_ports ip_local_ports; + int sysctl_ifa_limit; + int sysctl_tcp_ecn; int sysctl_tcp_ecn_fallback; @@ -101,6 +103,7 @@ struct netns_ipv4 { struct ping_group_range ping_group_range; atomic_t dev_addr_genid; + atomic_t ifa_nr; #ifdef CONFIG_SYSCTL unsigned long *sysctl_local_reserved_ports; Index: linux-ml.git/net/ipv4/devinet.c =================================================================== --- linux-ml.git.orig/net/ipv4/devinet.c +++ linux-ml.git/net/ipv4/devinet.c @@ -194,8 +194,11 @@ static void devinet_sysctl_unregister(st /* Locks all the inet devices. 
*/ -static struct in_ifaddr *inet_alloc_ifa(void) +static struct in_ifaddr *inet_alloc_ifa(struct net *net) { + if (atomic_add_return(1, &net->ipv4.ifa_nr) > + net->ipv4.sysctl_ifa_limit) + return NULL; return kzalloc(sizeof(struct in_ifaddr), GFP_KERNEL); } @@ -207,8 +210,9 @@ static void inet_rcu_free_ifa(struct rcu kfree(ifa); } -static void inet_free_ifa(struct in_ifaddr *ifa) +static void inet_free_ifa(struct net *net, struct in_ifaddr *ifa) { + atomic_dec(&net->ipv4.ifa_nr); call_rcu(&ifa->rcu_head, inet_rcu_free_ifa); } @@ -296,7 +300,7 @@ static void inetdev_destroy(struct in_de while ((ifa = in_dev->ifa_list) != NULL) { inet_del_ifa(in_dev, &in_dev->ifa_list, 0); - inet_free_ifa(ifa); + inet_free_ifa(dev_net(dev), ifa); } RCU_INIT_POINTER(dev->ip_ptr, NULL); @@ -361,7 +365,7 @@ static void __inet_del_ifa(struct in_dev rtmsg_ifa(RTM_DELADDR, ifa, nlh, portid); blocking_notifier_call_chain(&inetaddr_chain, NETDEV_DOWN, ifa); - inet_free_ifa(ifa); + inet_free_ifa(dev_net(in_dev->dev), ifa); } else { promote = ifa; break; @@ -420,7 +424,7 @@ static void __inet_del_ifa(struct in_dev } if (destroy) - inet_free_ifa(ifa1); + inet_free_ifa(dev_net(in_dev->dev), ifa1); } static void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap, @@ -442,7 +446,7 @@ static int __inet_insert_ifa(struct in_i ASSERT_RTNL(); if (!ifa->ifa_local) { - inet_free_ifa(ifa); + inet_free_ifa(dev_net(in_dev->dev), ifa); return 0; } @@ -457,11 +461,11 @@ static int __inet_insert_ifa(struct in_i if (ifa1->ifa_mask == ifa->ifa_mask && inet_ifa_match(ifa1->ifa_address, ifa)) { if (ifa1->ifa_local == ifa->ifa_local) { - inet_free_ifa(ifa); + inet_free_ifa(dev_net(in_dev->dev), ifa); return -EEXIST; } if (ifa1->ifa_scope != ifa->ifa_scope) { - inet_free_ifa(ifa); + inet_free_ifa(dev_net(in_dev->dev), ifa); return -EINVAL; } ifa->ifa_flags |= IFA_F_SECONDARY; @@ -502,7 +506,7 @@ static int inet_set_ifa(struct net_devic ASSERT_RTNL(); if (!in_dev) { - inet_free_ifa(ifa); + 
inet_free_ifa(dev_net(dev), ifa); return -ENOBUFS; } ipv4_devconf_setall(in_dev); @@ -768,7 +772,7 @@ static struct in_ifaddr *rtm_to_ifaddr(s if (!in_dev) goto errout; - ifa = inet_alloc_ifa(); + ifa = inet_alloc_ifa(net); if (!ifa) /* * A potential indev allocation can be left alive, it stays @@ -817,7 +821,7 @@ static struct in_ifaddr *rtm_to_ifaddr(s return ifa; errout_free: - inet_free_ifa(ifa); + inet_free_ifa(net, ifa); errout: return ERR_PTR(err); } @@ -865,13 +869,13 @@ static int inet_rtm_newaddr(struct sk_bu true, ifa); if (ret < 0) { - inet_free_ifa(ifa); + inet_free_ifa(net, ifa); return ret; } } return __inet_insert_ifa(ifa, nlh, NETLINK_CB(skb).portid); } else { - inet_free_ifa(ifa); + inet_free_ifa(net, ifa); if (nlh->nlmsg_flags & NLM_F_EXCL || !(nlh->nlmsg_flags & NLM_F_REPLACE)) @@ -1055,7 +1059,7 @@ int devinet_ioctl(struct net *net, unsig if (!ifa) { ret = -ENOBUFS; - ifa = inet_alloc_ifa(); + ifa = inet_alloc_ifa(net); if (!ifa) break; INIT_HLIST_NODE(&ifa->hash); @@ -1408,7 +1412,7 @@ static int inetdev_event(struct notifier if (!inetdev_valid_mtu(dev->mtu)) break; if (dev->flags & IFF_LOOPBACK) { - struct in_ifaddr *ifa = inet_alloc_ifa(); + struct in_ifaddr *ifa = inet_alloc_ifa(dev_net(dev)); if (ifa) { INIT_HLIST_NODE(&ifa->hash); Index: linux-ml.git/net/ipv4/sysctl_net_ipv4.c =================================================================== --- linux-ml.git.orig/net/ipv4/sysctl_net_ipv4.c +++ linux-ml.git/net/ipv4/sysctl_net_ipv4.c @@ -960,6 +960,13 @@ static struct ctl_table ipv4_net_table[] .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, + { + .procname = "ifa_limit", + .data = &init_net.ipv4.sysctl_ifa_limit, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, { } }; @@ -988,6 +995,7 @@ static __net_init int ipv4_sysctl_init_n if (!net->ipv4.sysctl_local_reserved_ports) goto err_ports; + net->ipv4.sysctl_ifa_limit = 4096; return 0; err_ports: