On Thu, Sep 21, 2017 at 04:15:46PM +0000, Eric Dumazet wrote: > From: Eric Dumazet <eduma...@google.com> > > In linux-4.13, Wei worked hard to convert dst to a traditional > refcounted model, removing GC. > > We now want to make sure a dst refcount cannot transition from 0 back > to 1. > > The problem here is that the input path attached a non-refcounted dst to an > skb. Then later, because the packet is forwarded and hits skb_dst_force() > before exiting the RCU section, we might try to take a refcount on a dst > that is about to be freed, if another CPU saw the 1 -> 0 transition in > dst_release() and queued the dst for freeing after one RCU grace period. > > Let's unify skb_dst_force() and skb_dst_force_safe(), since we should > always perform the complete check against the dst refcount, and not assume > it is not zero. Acked-by: Martin KaFai Lau <ka...@fb.com>
> > Bugzilla : https://bugzilla.kernel.org/show_bug.cgi?id=197005 > > [ 989.919496] skb_dst_force+0x32/0x34 > [ 989.919498] __dev_queue_xmit+0x1ad/0x482 > [ 989.919501] ? eth_header+0x28/0xc6 > [ 989.919502] dev_queue_xmit+0xb/0xd > [ 989.919504] neigh_connected_output+0x9b/0xb4 > [ 989.919507] ip_finish_output2+0x234/0x294 > [ 989.919509] ? ipt_do_table+0x369/0x388 > [ 989.919510] ip_finish_output+0x12c/0x13f > [ 989.919512] ip_output+0x53/0x87 > [ 989.919513] ip_forward_finish+0x53/0x5a > [ 989.919515] ip_forward+0x2cb/0x3e6 > [ 989.919516] ? pskb_trim_rcsum.part.9+0x4b/0x4b > [ 989.919518] ip_rcv_finish+0x2e2/0x321 > [ 989.919519] ip_rcv+0x26f/0x2eb > [ 989.919522] ? vlan_do_receive+0x4f/0x289 > [ 989.919523] __netif_receive_skb_core+0x467/0x50b > [ 989.919526] ? tcp_gro_receive+0x239/0x239 > [ 989.919529] ? inet_gro_receive+0x226/0x238 > [ 989.919530] __netif_receive_skb+0x4d/0x5f > [ 989.919532] netif_receive_skb_internal+0x5c/0xaf > [ 989.919533] napi_gro_receive+0x45/0x81 > [ 989.919536] ixgbe_poll+0xc8a/0xf09 > [ 989.919539] ? kmem_cache_free_bulk+0x1b6/0x1f7 > [ 989.919540] net_rx_action+0xf4/0x266 > [ 989.919543] __do_softirq+0xa8/0x19d > [ 989.919545] irq_exit+0x5d/0x6b > [ 989.919546] do_IRQ+0x9c/0xb5 > [ 989.919548] common_interrupt+0x93/0x93 > [ 989.919548] </IRQ> > > > Similarly dst_clone() can use dst_hold() helper to have additional > debugging, as a follow up to commit 44ebe79149ff ("net: add debug > atomic_inc_not_zero() in dst_hold()") > > In net-next we will convert dst atomic_t to refcount_t for peace of > mind. 
> > Fixes: a4c2fd7f7891 ("net: remove DST_NOCACHE flag") > Signed-off-by: Eric Dumazet <eduma...@google.com> > Cc: Wei Wang <wei...@google.com> > Reported-by: Paweł Staszewski <pstaszew...@itcare.pl> > Bisected-by: Paweł Staszewski <pstaszew...@itcare.pl> > --- > include/net/dst.h | 22 ++++------------------ > include/net/route.h | 2 +- > include/net/sock.h | 2 +- > 3 files changed, 6 insertions(+), 20 deletions(-) > > diff --git a/include/net/dst.h b/include/net/dst.h > index > 93568bd0a3520bb7402f04d90cf04ac99c81cfbe..06a6765da074449e6f1fe42ee05e711e898ad372 > 100644 > --- a/include/net/dst.h > +++ b/include/net/dst.h > @@ -271,7 +271,7 @@ static inline void dst_use_noref(struct dst_entry *dst, > unsigned long time) > static inline struct dst_entry *dst_clone(struct dst_entry *dst) > { > if (dst) > - atomic_inc(&dst->__refcnt); > + dst_hold(dst); > return dst; > } > > @@ -311,21 +311,6 @@ static inline void skb_dst_copy(struct sk_buff *nskb, > const struct sk_buff *oskb > __skb_dst_copy(nskb, oskb->_skb_refdst); > } > > -/** > - * skb_dst_force - makes sure skb dst is refcounted > - * @skb: buffer > - * > - * If dst is not yet refcounted, let's do it > - */ > -static inline void skb_dst_force(struct sk_buff *skb) > -{ > - if (skb_dst_is_noref(skb)) { > - WARN_ON(!rcu_read_lock_held()); > - skb->_skb_refdst &= ~SKB_DST_NOREF; > - dst_clone(skb_dst(skb)); > - } > -} > - > /** > * dst_hold_safe - Take a reference on a dst if possible > * @dst: pointer to dst entry > @@ -339,16 +324,17 @@ static inline bool dst_hold_safe(struct dst_entry *dst) > } > > /** > - * skb_dst_force_safe - makes sure skb dst is refcounted > + * skb_dst_force - makes sure skb dst is refcounted > * @skb: buffer > * > * If dst is not yet refcounted and not destroyed, grab a ref on it. 
> */ > -static inline void skb_dst_force_safe(struct sk_buff *skb) > +static inline void skb_dst_force(struct sk_buff *skb) > { > if (skb_dst_is_noref(skb)) { > struct dst_entry *dst = skb_dst(skb); > > + WARN_ON(!rcu_read_lock_held()); > if (!dst_hold_safe(dst)) > dst = NULL; > > diff --git a/include/net/route.h b/include/net/route.h > index > 1b09a9368c68d46f0c5ee8ce3cefe566000c1ec1..57dfc6850d378e4b96f13b140eef554d66c24cdf > 100644 > --- a/include/net/route.h > +++ b/include/net/route.h > @@ -190,7 +190,7 @@ static inline int ip_route_input(struct sk_buff *skb, > __be32 dst, __be32 src, > rcu_read_lock(); > err = ip_route_input_noref(skb, dst, src, tos, devin); > if (!err) { > - skb_dst_force_safe(skb); > + skb_dst_force(skb); > if (!skb_dst(skb)) > err = -EINVAL; > } > diff --git a/include/net/sock.h b/include/net/sock.h > index > 03a362568357acc7278a318423dd3873103f90ca..a6b9a8d1a6df3f72df8f1aac0f577257fa6452d0 > 100644 > --- a/include/net/sock.h > +++ b/include/net/sock.h > @@ -856,7 +856,7 @@ void sk_stream_write_space(struct sock *sk); > static inline void __sk_add_backlog(struct sock *sk, struct sk_buff *skb) > { > /* dont let skb dst not refcounted, we are going to leave rcu lock */ > - skb_dst_force_safe(skb); > + skb_dst_force(skb); > > if (!sk->sk_backlog.tail) > sk->sk_backlog.head = skb; > >