Cc Wei Wang On Sun, Feb 04, 2018 at 01:21:18PM +0200, Eyal Birger wrote: > Hi, > > We've encountered a non released device reference upon device > unregistration which seems to stem from xfrm policy code. > > The setup includes: > - an underlay device (e.g. eth0) using IPv4 > - an xfrm IPv6 over IPv4 tunnel routed via the underlay device > - an ipip6 tunnel over the xfrm IPv6 tunnel > > When tearing down the underlay device, after traffic had passed via the ipip6 > tunnel, log messages of the following form are observed: > > unregister_netdevice: waiting for eth0 to become free. Usage count = 2
Looks like this happened when the dst garbage collection code was removed. I could not point to a commit that introduced it so I did a bisection and this pointed to: commit 9514528d92d4cbe086499322370155ed69f5d06c ipv6: call dst_dev_put() properly With this commit we leak the one refcount and some further commit leaked the second one. > > The below synthetic script reproduces this consistently on a fresh ubuntu vm > running net-next v4.15-6066-ge9522a5: > --------------------------------------------------------- > #!/bin/bash > > ipsec_underlay_dst=192.168.6.1 > ipsec_underlay_src=192.168.5.2 > ipv6_pfx=1234 > local_ipv6_addr="$ipv6_pfx::1" > remote_ipv6_addr="$ipv6_pfx::2" > > # create dummy ipsec underlay > ip l add dev dummy1 type dummy > ip l set dev dummy1 up > ip r add "$ipsec_underlay_dst/32" dev dummy1 > ip -6 r add "$ipv6_pfx::/16" dev dummy1 > > ip a add dev dummy1 "$local_ipv6_addr/128" > ip a add dev dummy1 "$ipsec_underlay_src/24" > > # add xfrm policy and state > ip x p add src "$local_ipv6_addr/128" dst "$ipv6_pfx::/16" dir out tmpl src > "$ipsec_underlay_src" dst "$ipsec_underlay_dst" proto esp reqid 1 mode tunnel > ip x s add src "$ipsec_underlay_src" dst "$ipsec_underlay_dst" proto esp spi > 0xcd440ce6 reqid 1 mode tunnel auth-trunc 'hmac(sha1)' > 0x34a546d309031628962b814ef073aff1a638ad21 96 enc 'cbc(aes)' > 0xf31e14149c328297fe7925ad7448420e encap espinudp 4500 4500 0.0.0.0 > > # add 4o6 tunnel > ip l add tnl46 type ip6tnl mode ipip6 local "$local_ipv6_addr" remote > "$remote_ipv6_addr" > ip l set dev tnl46 up > ip r add 10.64.0.0/10 dev tnl46 > > # pass traffic so route is cached > ping -w 1 -c 1 10.64.0.1 > > # remove dummy underlay > ip l del dummy1 > --------------------------------------------------------- > > Analysis: > > ip6_tunnel holds a dst_cache which caches its underlay dst objects. > When devices are unregistered, non-xfrm dst objects are invlidated by their > original creators (ipv4/ipv6/...) and thus are wiped from dst_cache. > > xfrm created routes otoh are not tracked by xfrm, and are not invalidated upon > device unregistration, thus hold the device upon unregistration. > > The following rough sketch patch illustrates an approach overcoming this > issue: > --------------------------------------------------------- > >From e188dc5295e3500bc59e8780049840afa2eb3e24 Mon Sep 17 00:00:00 2001 > From: Eyal Birger <e...@metanetworks.com> > Date: Sun, 4 Feb 2018 13:08:02 +0200 > Subject: [PATCH] net: xfrm_policy: invalidate xfrm_dsts on device > unregistration > > --- > include/net/xfrm.h | 10 ++----- > net/xfrm/xfrm_policy.c | 75 > ++++++++++++++++++++++++++++++++++++++++++++++++++ > 2 files changed, 78 insertions(+), 7 deletions(-) > > diff --git a/include/net/xfrm.h b/include/net/xfrm.h > index 7d20776..c1c8493 100644 > --- a/include/net/xfrm.h > +++ b/include/net/xfrm.h > @@ -994,6 +994,8 @@ struct xfrm_dst { > u32 child_mtu_cached; > u32 route_cookie; > u32 path_cookie; > + struct list_head xfrm_dst_gc; > + struct xfrm_dst_list *xfrm_dst_gc_list; > }; > > static inline struct dst_entry *xfrm_dst_path(const struct dst_entry *dst) > @@ -1025,13 +1027,7 @@ static inline void xfrm_dst_set_child(struct xfrm_dst > *xdst, struct dst_entry *c > xdst->child = child; > } > > -static inline void xfrm_dst_destroy(struct xfrm_dst *xdst) > -{ > - xfrm_pols_put(xdst->pols, xdst->num_pols); > - dst_release(xdst->route); > - if (likely(xdst->u.dst.xfrm)) > - xfrm_state_put(xdst->u.dst.xfrm); > -} > +void xfrm_dst_destroy(struct xfrm_dst *xdst); > #endif > > void xfrm_dst_ifdown(struct dst_entry *dst, struct net_device *dev); > diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c > index 7a23078..d446641 100644 > --- a/net/xfrm/xfrm_policy.c > +++ b/net/xfrm/xfrm_policy.c > @@ -94,6 +94,58 @@ __xfrm6_selector_match(const struct xfrm_selector *sel, > const struct flowi *fl) > (fl6->flowi6_oif == sel->ifindex || !sel->ifindex); > } > > +struct xfrm_dst_list { > + spinlock_t lock; > + struct list_head head; > +}; > + > +static DEFINE_PER_CPU_ALIGNED(struct xfrm_dst_list, xfrm_dst_gc_list); > + > +static void xfrm_add_dst_list(struct xfrm_dst *xdst) > +{ > + struct xfrm_dst_list *xl = raw_cpu_ptr(&xfrm_dst_gc_list); > + > + xdst->xfrm_dst_gc_list = xl; > + > + spin_lock_bh(&xl->lock); > + list_add_tail(&xdst->xfrm_dst_gc, &xl->head); > + spin_unlock_bh(&xl->lock); > +} > + > +void xfrm_dst_destroy(struct xfrm_dst *xdst) > +{ > + xfrm_pols_put(xdst->pols, xdst->num_pols); > + dst_release(xdst->route); > + if (likely(xdst->u.dst.xfrm)) > + xfrm_state_put(xdst->u.dst.xfrm); > + if (!list_empty(&xdst->xfrm_dst_gc)) { > + struct xfrm_dst_list *xl = xdst->xfrm_dst_gc_list; > + > + spin_lock_bh(&xl->lock); > + list_del(&xdst->xfrm_dst_gc); > + spin_unlock_bh(&xl->lock); > + } > +} > +EXPORT_SYMBOL_GPL(xfrm_dst_destroy); > + > +void xfrm_flush_dev(struct net_device *dev) > +{ > + struct xfrm_dst *xdst; > + int cpu; > + > + for_each_possible_cpu(cpu) { > + struct xfrm_dst_list *xl = &per_cpu(xfrm_dst_gc_list, cpu); > + > + spin_lock_bh(&xl->lock); > + list_for_each_entry(xdst, &xl->head, xfrm_dst_gc) { > + if (xdst->u.dst.dev != dev) > + continue; > + dst_dev_put(&xdst->u.dst); > + } > + spin_unlock_bh(&xl->lock); > + } > +} > + > bool xfrm_selector_match(const struct xfrm_selector *sel, const struct flowi > *fl, > unsigned short family) > { > @@ -1581,6 +1633,8 @@ static struct dst_entry *xfrm_bundle_create(struct > xfrm_policy *policy, > } > > bundle[i] = xdst; > + xfrm_add_dst_list(xdst); > + > if (!xdst_prev) > xdst0 = xdst; > else > @@ -2984,8 +3038,22 @@ static struct pernet_operations __net_initdata > xfrm_net_ops = { > .exit = xfrm_net_exit, > }; > > +static int xfrm_netdev_event(struct notifier_block *this, unsigned long > event, void *ptr) > +{ > + struct net_device *dev = netdev_notifier_info_to_dev(ptr); > + > + if (event == NETDEV_UNREGISTER) > + xfrm_flush_dev(dev); > + return NOTIFY_DONE; > +} > + > +static struct notifier_block xfrm_netdev_notifier = { > + .notifier_call = xfrm_netdev_event, > +}; > + > void __init xfrm_init(void) > { > + int cpu; > int i; > > xfrm_pcpu_work = kmalloc_array(NR_CPUS, sizeof(*xfrm_pcpu_work), > @@ -2998,6 +3066,13 @@ void __init xfrm_init(void) > register_pernet_subsys(&xfrm_net_ops); > seqcount_init(&xfrm_policy_hash_generation); > xfrm_input_init(); > + for_each_possible_cpu(cpu) { > + struct xfrm_dst_list *xl = &per_cpu(xfrm_dst_gc_list, cpu); > + > + INIT_LIST_HEAD(&xl->head); > + spin_lock_init(&xl->lock); > + } > + register_netdevice_notifier(&xfrm_netdev_notifier); > } > > #ifdef CONFIG_AUDITSYSCALL > -- > 2.7.4 > > --------------------------------------------------------- > > This approach has the unfortunate side effects of adding a spin lock for the > tracked list, as well as increasing struct xfrm_dst. Reintroducing garbage collection is probably not a so good idea. I think the patch below should fix it a bit less intrusive. Subject: [PATCH RFC] xfrm: Fix netdev refcount leak when flushing the percpu dst cache. The dst garbage collection code is removed, so we need to call dst_dev_put() on cached dst entries before we release them. Otherwise we leak the refcount to the netdev. Signed-off-by: Steffen Klassert <steffen.klass...@secunet.com> --- net/xfrm/xfrm_policy.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 7a23078132cf..7836b7601b49 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -1715,8 +1715,10 @@ static int xfrm_expand_policies(const struct flowi *fl, u16 family, static void xfrm_last_dst_update(struct xfrm_dst *xdst, struct xfrm_dst *old) { this_cpu_write(xfrm_last_dst, xdst); - if (old) + if (old) { + dst_dev_put(&old->u.dst); dst_release(&old->u.dst); + } } static void __xfrm_pcpu_work_fn(void) @@ -1787,6 +1789,7 @@ void xfrm_policy_cache_flush(void) old = per_cpu(xfrm_last_dst, cpu); if (old && !xfrm_bundle_ok(old)) { per_cpu(xfrm_last_dst, cpu) = NULL; + dst_dev_put(&old->u.dst); dst_release(&old->u.dst); } rcu_read_unlock(); -- 2.14.1