Hi, We've encountered a non released device reference upon device unregistration which seems to stem from xfrm policy code.
The setup includes: - an underlay device (e.g. eth0) using IPv4 - an xfrm IPv6 over IPv4 tunnel routed via the underlay device - an ipip6 tunnel over the xfrm IPv6 tunnel When tearing down the underlay device, after traffic had passed via the ipip6 tunnel, log messages of the following form are observed: unregister_netdevice: waiting for eth0 to become free. Usage count = 2 The below synthetic script reproduces this consistently on a fresh ubuntu vm running net-next v4.15-6066-ge9522a5: --------------------------------------------------------- #!/bin/bash ipsec_underlay_dst=192.168.6.1 ipsec_underlay_src=192.168.5.2 ipv6_pfx=1234 local_ipv6_addr="$ipv6_pfx::1" remote_ipv6_addr="$ipv6_pfx::2" # create dummy ipsec underlay ip l add dev dummy1 type dummy ip l set dev dummy1 up ip r add "$ipsec_underlay_dst/32" dev dummy1 ip -6 r add "$ipv6_pfx::/16" dev dummy1 ip a add dev dummy1 "$local_ipv6_addr/128" ip a add dev dummy1 "$ipsec_underlay_src/24" # add xfrm policy and state ip x p add src "$local_ipv6_addr/128" dst "$ipv6_pfx::/16" dir out tmpl src "$ipsec_underlay_src" dst "$ipsec_underlay_dst" proto esp reqid 1 mode tunnel ip x s add src "$ipsec_underlay_src" dst "$ipsec_underlay_dst" proto esp spi 0xcd440ce6 reqid 1 mode tunnel auth-trunc 'hmac(sha1)' 0x34a546d309031628962b814ef073aff1a638ad21 96 enc 'cbc(aes)' 0xf31e14149c328297fe7925ad7448420e encap espinudp 4500 4500 0.0.0.0 # add 4o6 tunnel ip l add tnl46 type ip6tnl mode ipip6 local "$local_ipv6_addr" remote "$remote_ipv6_addr" ip l set dev tnl46 up ip r add 10.64.0.0/10 dev tnl46 # pass traffic so route is cached ping -w 1 -c 1 10.64.0.1 # remove dummy underlay ip l del dummy1 --------------------------------------------------------- Analysis: ip6_tunnel holds a dst_cache which caches its underlay dst objects. When devices are unregistered, non-xfrm dst objects are invlidated by their original creators (ipv4/ipv6/...) and thus are wiped from dst_cache. xfrm created routes otoh are not tracked by xfrm, and are not invalidated upon device unregistration, thus hold the device upon unregistration. The following rough sketch patch illustrates an approach overcoming this issue: --------------------------------------------------------- From e188dc5295e3500bc59e8780049840afa2eb3e24 Mon Sep 17 00:00:00 2001 From: Eyal Birger <e...@metanetworks.com> Date: Sun, 4 Feb 2018 13:08:02 +0200 Subject: [PATCH] net: xfrm_policy: invalidate xfrm_dsts on device unregistration --- include/net/xfrm.h | 10 ++----- net/xfrm/xfrm_policy.c | 75 ++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 78 insertions(+), 7 deletions(-) diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 7d20776..c1c8493 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -994,6 +994,8 @@ struct xfrm_dst { u32 child_mtu_cached; u32 route_cookie; u32 path_cookie; + struct list_head xfrm_dst_gc; + struct xfrm_dst_list *xfrm_dst_gc_list; }; static inline struct dst_entry *xfrm_dst_path(const struct dst_entry *dst) @@ -1025,13 +1027,7 @@ static inline void xfrm_dst_set_child(struct xfrm_dst *xdst, struct dst_entry *c xdst->child = child; } -static inline void xfrm_dst_destroy(struct xfrm_dst *xdst) -{ - xfrm_pols_put(xdst->pols, xdst->num_pols); - dst_release(xdst->route); - if (likely(xdst->u.dst.xfrm)) - xfrm_state_put(xdst->u.dst.xfrm); -} +void xfrm_dst_destroy(struct xfrm_dst *xdst); #endif void xfrm_dst_ifdown(struct dst_entry *dst, struct net_device *dev); diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 7a23078..d446641 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -94,6 +94,58 @@ __xfrm6_selector_match(const struct xfrm_selector *sel, const struct flowi *fl) (fl6->flowi6_oif == sel->ifindex || !sel->ifindex); } +struct xfrm_dst_list { + spinlock_t lock; + struct list_head head; +}; + +static DEFINE_PER_CPU_ALIGNED(struct xfrm_dst_list, xfrm_dst_gc_list); + +static void xfrm_add_dst_list(struct xfrm_dst *xdst) +{ + struct xfrm_dst_list *xl = raw_cpu_ptr(&xfrm_dst_gc_list); + + xdst->xfrm_dst_gc_list = xl; + + spin_lock_bh(&xl->lock); + list_add_tail(&xdst->xfrm_dst_gc, &xl->head); + spin_unlock_bh(&xl->lock); +} + +void xfrm_dst_destroy(struct xfrm_dst *xdst) +{ + xfrm_pols_put(xdst->pols, xdst->num_pols); + dst_release(xdst->route); + if (likely(xdst->u.dst.xfrm)) + xfrm_state_put(xdst->u.dst.xfrm); + if (!list_empty(&xdst->xfrm_dst_gc)) { + struct xfrm_dst_list *xl = xdst->xfrm_dst_gc_list; + + spin_lock_bh(&xl->lock); + list_del(&xdst->xfrm_dst_gc); + spin_unlock_bh(&xl->lock); + } +} +EXPORT_SYMBOL_GPL(xfrm_dst_destroy); + +void xfrm_flush_dev(struct net_device *dev) +{ + struct xfrm_dst *xdst; + int cpu; + + for_each_possible_cpu(cpu) { + struct xfrm_dst_list *xl = &per_cpu(xfrm_dst_gc_list, cpu); + + spin_lock_bh(&xl->lock); + list_for_each_entry(xdst, &xl->head, xfrm_dst_gc) { + if (xdst->u.dst.dev != dev) + continue; + dst_dev_put(&xdst->u.dst); + } + spin_unlock_bh(&xl->lock); + } +} + bool xfrm_selector_match(const struct xfrm_selector *sel, const struct flowi *fl, unsigned short family) { @@ -1581,6 +1633,8 @@ static struct dst_entry *xfrm_bundle_create(struct xfrm_policy *policy, } bundle[i] = xdst; + xfrm_add_dst_list(xdst); + if (!xdst_prev) xdst0 = xdst; else @@ -2984,8 +3038,22 @@ static struct pernet_operations __net_initdata xfrm_net_ops = { .exit = xfrm_net_exit, }; +static int xfrm_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) +{ + struct net_device *dev = netdev_notifier_info_to_dev(ptr); + + if (event == NETDEV_UNREGISTER) + xfrm_flush_dev(dev); + return NOTIFY_DONE; +} + +static struct notifier_block xfrm_netdev_notifier = { + .notifier_call = xfrm_netdev_event, +}; + void __init xfrm_init(void) { + int cpu; int i; xfrm_pcpu_work = kmalloc_array(NR_CPUS, sizeof(*xfrm_pcpu_work), @@ -2998,6 +3066,13 @@ void __init xfrm_init(void) register_pernet_subsys(&xfrm_net_ops); seqcount_init(&xfrm_policy_hash_generation); xfrm_input_init(); + for_each_possible_cpu(cpu) { + struct xfrm_dst_list *xl = &per_cpu(xfrm_dst_gc_list, cpu); + + INIT_LIST_HEAD(&xl->head); + spin_lock_init(&xl->lock); + } + register_netdevice_notifier(&xfrm_netdev_notifier); } #ifdef CONFIG_AUDITSYSCALL -- 2.7.4 --------------------------------------------------------- This approach has the unfortunate side effects of adding a spin lock for the tracked list, as well as increasing struct xfrm_dst. Any thoughts on how to best approach this within xfrm would be most welcomed. Thanks, Eyal.