It's a terrible thing to hold dev in iptables target. When the dev is
being removed, unregister_netdevice has to wait for the dev to become
free. dmesg will keep logging the err:

  kernel:unregister_netdevice: waiting for veth0_in to become free. \
  Usage count = 1

until iptables rules with this target are removed manually.

The worse thing is when deleting a netns, a virtual nic will be deleted
instead of reset to init_net in default_device_ops exit/exit_batch. As
it is earlier than to flush the iptables rules in iptable_filter_net_ops
exit, unregister_netdevice will block to wait for the nic to become free.

As unregister_netdevice is actually waiting for iptables rules flushing
while iptables rules have to be flushed after unregister_netdevice. This
'dead lock' will cause unregister_netdevice to block there forever. As
the netns is not available to operate at that moment, iptables rules can
not even be flushed manually either.

The reproducer can be:

  # ip netns add test
  # ip link add veth0_in type veth peer name veth0_out
  # ip link set veth0_in netns test
  # ip netns exec test ip link set lo up
  # ip netns exec test ip link set veth0_in up
  # ip netns exec test iptables -I INPUT -d 1.2.3.4 -i veth0_in -j \
    CLUSTERIP --new --clustermac 89:d4:47:eb:9a:fa --total-nodes 3 \
    --local-node 1 --hashmode sourceip-sourceport
  # ip netns del test

This issue can be triggered by all virtual nics with ipt_CLUSTERIP.

This patch is to fix it by not holding dev in ipt_CLUSTERIP, but only
save dev->ifindex instead of dev. When removing the mc from the dev,
it will get dev by c->ifindex through dev_get_by_index.

Note that it doesn't save dev->name but dev->ifindex, as a dev->name
can be changed, it will confuse ipt_CLUSTERIP.

Reported-by: Jianlin Shi <ji...@redhat.com>
Signed-off-by: Xin Long <lucien....@gmail.com>
---
 net/ipv4/netfilter/ipt_CLUSTERIP.c | 31 ++++++++++++++++++-------------
 1 file changed, 18 insertions(+), 13 deletions(-)

diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c 
b/net/ipv4/netfilter/ipt_CLUSTERIP.c
index 038f293..d1adb2f 100644
--- a/net/ipv4/netfilter/ipt_CLUSTERIP.c
+++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c
@@ -47,7 +47,7 @@ struct clusterip_config {
 
        __be32 clusterip;                       /* the IP address */
        u_int8_t clustermac[ETH_ALEN];          /* the MAC address */
-       struct net_device *dev;                 /* device */
+       int ifindex;                            /* device ifindex */
        u_int16_t num_total_nodes;              /* total number of nodes */
        unsigned long local_nodes;              /* node number array */
 
@@ -98,19 +98,23 @@ clusterip_config_put(struct clusterip_config *c)
  * entry(rule) is removed, remove the config from lists, but don't free it
  * yet, since proc-files could still be holding references */
 static inline void
-clusterip_config_entry_put(struct clusterip_config *c)
+clusterip_config_entry_put(struct net *net, struct clusterip_config *c)
 {
-       struct net *net = dev_net(c->dev);
        struct clusterip_net *cn = net_generic(net, clusterip_net_id);
 
        local_bh_disable();
        if (refcount_dec_and_lock(&c->entries, &cn->lock)) {
+               struct net_device *dev;
+
                list_del_rcu(&c->list);
                spin_unlock(&cn->lock);
                local_bh_enable();
 
-               dev_mc_del(c->dev, c->clustermac);
-               dev_put(c->dev);
+               dev = dev_get_by_index(net, c->ifindex);
+               if (dev) {
+                       dev_mc_del(dev, c->clustermac);
+                       dev_put(dev);
+               }
 
                /* In case anyone still accesses the file, the open/close
                 * functions are also incrementing the refcount on their own,
@@ -182,7 +186,7 @@ clusterip_config_init(const struct ipt_clusterip_tgt_info 
*i, __be32 ip,
        if (!c)
                return ERR_PTR(-ENOMEM);
 
-       c->dev = dev;
+       c->ifindex = dev->ifindex;
        c->clusterip = ip;
        memcpy(&c->clustermac, &i->clustermac, ETH_ALEN);
        c->num_total_nodes = i->num_total_nodes;
@@ -427,12 +431,14 @@ static int clusterip_tg_check(const struct xt_tgchk_param 
*par)
                        }
 
                        config = clusterip_config_init(cipinfo,
-                                                       e->ip.dst.s_addr, dev);
+                                                      e->ip.dst.s_addr, dev);
                        if (IS_ERR(config)) {
                                dev_put(dev);
                                return PTR_ERR(config);
                        }
-                       dev_mc_add(config->dev, config->clustermac);
+
+                       dev_mc_add(dev, config->clustermac);
+                       dev_put(dev);
                }
        }
        cipinfo->config = config;
@@ -458,7 +464,7 @@ static void clusterip_tg_destroy(const struct 
xt_tgdtor_param *par)
 
        /* if no more entries are referencing the config, remove it
         * from the list and destroy the proc entry */
-       clusterip_config_entry_put(cipinfo->config);
+       clusterip_config_entry_put(par->net, cipinfo->config);
 
        clusterip_config_put(cipinfo->config);
 
@@ -558,10 +564,9 @@ arp_mangle(void *priv,
         * addresses on different interfacs.  However, in the CLUSTERIP case
         * this wouldn't work, since we didn't subscribe the mcast group on
         * other interfaces */
-       if (c->dev != state->out) {
-               pr_debug("not mangling arp reply on different "
-                        "interface: cip'%s'-skb'%s'\n",
-                        c->dev->name, state->out->name);
+       if (c->ifindex != state->out->ifindex) {
+               pr_debug("not mangling arp reply on different interface: 
cip'%d'-skb'%d'\n",
+                        c->ifindex, state->out->ifindex);
                clusterip_config_put(c);
                return NF_ACCEPT;
        }
-- 
2.1.0

Reply via email to