Currently all kernels (including vanilla) free the ifa
list with rtnl_lock() held, so releasing all the entries
takes a huge amount of time when we stop the container.
Moreover, it is allowed to create an unlimited number
of addresses from inside a net namespace if
CAP_NET_ADMIN is granted (which is common for containers).

Let's introduce a per-net limit on the number of addresses
(4096 by default), which can be tuned via the sysctl
entry /proc/sys/net/ipv4/ifa_limit.

Reported-by: Solar Designer <so...@openwall.com>
Signed-off-by: Cyrill Gorcunov <gorcu...@virtuozzo.com>
CC: Vasily Averin <v...@virtuozzo.com>
CC: Andrey Vagin <ava...@virtuozzo.com>
CC: Pavel Emelianov <xe...@virtuozzo.com>
CC: Vladimir Davydov <vdavy...@virtuozzo.com>
CC: Konstantin Khorenko <khore...@virtuozzo.com>
CC: David Miller <da...@davemloft.net>
CC: Eric Dumazet <eric.duma...@gmail.com>
---

Please share your ideas if a more elegant way exists
to fix this problem; maybe I am missing something obvious. Thanks!

 include/net/netns/ipv4.h   |    3 +++
 net/ipv4/devinet.c         |   34 +++++++++++++++++++---------------
 net/ipv4/sysctl_net_ipv4.c |    8 ++++++++
 3 files changed, 30 insertions(+), 15 deletions(-)

Index: linux-ml.git/include/net/netns/ipv4.h
===================================================================
--- linux-ml.git.orig/include/net/netns/ipv4.h
+++ linux-ml.git/include/net/netns/ipv4.h
@@ -77,6 +77,8 @@ struct netns_ipv4 {
 
        struct local_ports ip_local_ports;
 
+       int sysctl_ifa_limit;
+
        int sysctl_tcp_ecn;
        int sysctl_tcp_ecn_fallback;
 
@@ -101,6 +103,7 @@ struct netns_ipv4 {
        struct ping_group_range ping_group_range;
 
        atomic_t dev_addr_genid;
+       atomic_t ifa_nr;
 
 #ifdef CONFIG_SYSCTL
        unsigned long *sysctl_local_reserved_ports;
Index: linux-ml.git/net/ipv4/devinet.c
===================================================================
--- linux-ml.git.orig/net/ipv4/devinet.c
+++ linux-ml.git/net/ipv4/devinet.c
@@ -194,8 +194,11 @@ static void devinet_sysctl_unregister(st
 
 /* Locks all the inet devices. */
 
-static struct in_ifaddr *inet_alloc_ifa(void)
+static struct in_ifaddr *inet_alloc_ifa(struct net *net)
 {
+       if (atomic_add_return(1, &net->ipv4.ifa_nr) >
+           net->ipv4.sysctl_ifa_limit)
+               return NULL;
        return kzalloc(sizeof(struct in_ifaddr), GFP_KERNEL);
 }
 
@@ -207,8 +210,9 @@ static void inet_rcu_free_ifa(struct rcu
        kfree(ifa);
 }
 
-static void inet_free_ifa(struct in_ifaddr *ifa)
+static void inet_free_ifa(struct net *net, struct in_ifaddr *ifa)
 {
+       atomic_dec(&net->ipv4.ifa_nr);
        call_rcu(&ifa->rcu_head, inet_rcu_free_ifa);
 }
 
@@ -296,7 +300,7 @@ static void inetdev_destroy(struct in_de
 
        while ((ifa = in_dev->ifa_list) != NULL) {
                inet_del_ifa(in_dev, &in_dev->ifa_list, 0);
-               inet_free_ifa(ifa);
+               inet_free_ifa(dev_net(dev), ifa);
        }
 
        RCU_INIT_POINTER(dev->ip_ptr, NULL);
@@ -361,7 +365,7 @@ static void __inet_del_ifa(struct in_dev
                                rtmsg_ifa(RTM_DELADDR, ifa, nlh, portid);
                                blocking_notifier_call_chain(&inetaddr_chain,
                                                NETDEV_DOWN, ifa);
-                               inet_free_ifa(ifa);
+                               inet_free_ifa(dev_net(in_dev->dev), ifa);
                        } else {
                                promote = ifa;
                                break;
@@ -420,7 +424,7 @@ static void __inet_del_ifa(struct in_dev
 
        }
        if (destroy)
-               inet_free_ifa(ifa1);
+               inet_free_ifa(dev_net(in_dev->dev), ifa1);
 }
 
 static void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
@@ -442,7 +446,7 @@ static int __inet_insert_ifa(struct in_i
        ASSERT_RTNL();
 
        if (!ifa->ifa_local) {
-               inet_free_ifa(ifa);
+               inet_free_ifa(dev_net(in_dev->dev), ifa);
                return 0;
        }
 
@@ -457,11 +461,11 @@ static int __inet_insert_ifa(struct in_i
                if (ifa1->ifa_mask == ifa->ifa_mask &&
                    inet_ifa_match(ifa1->ifa_address, ifa)) {
                        if (ifa1->ifa_local == ifa->ifa_local) {
-                               inet_free_ifa(ifa);
+                               inet_free_ifa(dev_net(in_dev->dev), ifa);
                                return -EEXIST;
                        }
                        if (ifa1->ifa_scope != ifa->ifa_scope) {
-                               inet_free_ifa(ifa);
+                               inet_free_ifa(dev_net(in_dev->dev), ifa);
                                return -EINVAL;
                        }
                        ifa->ifa_flags |= IFA_F_SECONDARY;
@@ -502,7 +506,7 @@ static int inet_set_ifa(struct net_devic
        ASSERT_RTNL();
 
        if (!in_dev) {
-               inet_free_ifa(ifa);
+               inet_free_ifa(dev_net(dev), ifa);
                return -ENOBUFS;
        }
        ipv4_devconf_setall(in_dev);
@@ -768,7 +772,7 @@ static struct in_ifaddr *rtm_to_ifaddr(s
        if (!in_dev)
                goto errout;
 
-       ifa = inet_alloc_ifa();
+       ifa = inet_alloc_ifa(net);
        if (!ifa)
                /*
                 * A potential indev allocation can be left alive, it stays
@@ -817,7 +821,7 @@ static struct in_ifaddr *rtm_to_ifaddr(s
        return ifa;
 
 errout_free:
-       inet_free_ifa(ifa);
+       inet_free_ifa(net, ifa);
 errout:
        return ERR_PTR(err);
 }
@@ -865,13 +869,13 @@ static int inet_rtm_newaddr(struct sk_bu
                                               true, ifa);
 
                        if (ret < 0) {
-                               inet_free_ifa(ifa);
+                               inet_free_ifa(net, ifa);
                                return ret;
                        }
                }
                return __inet_insert_ifa(ifa, nlh, NETLINK_CB(skb).portid);
        } else {
-               inet_free_ifa(ifa);
+               inet_free_ifa(net, ifa);
 
                if (nlh->nlmsg_flags & NLM_F_EXCL ||
                    !(nlh->nlmsg_flags & NLM_F_REPLACE))
@@ -1055,7 +1059,7 @@ int devinet_ioctl(struct net *net, unsig
 
                if (!ifa) {
                        ret = -ENOBUFS;
-                       ifa = inet_alloc_ifa();
+                       ifa = inet_alloc_ifa(net);
                        if (!ifa)
                                break;
                        INIT_HLIST_NODE(&ifa->hash);
@@ -1408,7 +1412,7 @@ static int inetdev_event(struct notifier
                if (!inetdev_valid_mtu(dev->mtu))
                        break;
                if (dev->flags & IFF_LOOPBACK) {
-                       struct in_ifaddr *ifa = inet_alloc_ifa();
+                       struct in_ifaddr *ifa = inet_alloc_ifa(dev_net(dev));
 
                        if (ifa) {
                                INIT_HLIST_NODE(&ifa->hash);
Index: linux-ml.git/net/ipv4/sysctl_net_ipv4.c
===================================================================
--- linux-ml.git.orig/net/ipv4/sysctl_net_ipv4.c
+++ linux-ml.git/net/ipv4/sysctl_net_ipv4.c
@@ -960,6 +960,13 @@ static struct ctl_table ipv4_net_table[]
                .mode           = 0644,
                .proc_handler   = proc_dointvec_jiffies,
        },
+       {
+               .procname       = "ifa_limit",
+               .data           = &init_net.ipv4.sysctl_ifa_limit,
+               .maxlen         = sizeof(int),
+               .mode           = 0644,
+               .proc_handler   = proc_dointvec,
+       },
        { }
 };
 
@@ -988,6 +995,7 @@ static __net_init int ipv4_sysctl_init_n
        if (!net->ipv4.sysctl_local_reserved_ports)
                goto err_ports;
 
+       net->ipv4.sysctl_ifa_limit = 4096;
        return 0;
 
 err_ports:

Reply via email to