On Fri, Jun 22, 2018 at 08:21:44AM -0700, John Fastabend wrote:
> First in tcp_close, reduce scope of sk_callback_lock() the lock is
> only needed for protecting maps list the ingress and cork
> lists are protected by sock lock. Having the lock in wider scope is
> harmless but may confuse the reader who may infer it is in fact
> needed.
> 
> Next, in sock_hash_delete_elem() the pattern is as follows,
> 
>   sock_hash_delete_elem()
>   [...]
>   spin_lock(bucket_lock)
>   l = lookup_elem_raw()
>   if (l)
>      hlist_del_rcu()
>      write_lock(sk_callback_lock)
>       .... destroy psock ...
>      write_unlock(sk_callback_lock)
>   spin_unlock(bucket_lock)
> 
> The ordering is necessary because we only know the {p}sock after
> dereferencing the hash table which we can't do unless we have the
> bucket lock held. Once we have the bucket lock and the psock element
> it is deleted from the hashmap to ensure any other path doing a lookup
> will fail. Finally, the refcnt is decremented and if zero the psock
> is destroyed.
> 
> In parallel with the above (or free'ing the map) a tcp close event
> may trigger tcp_close(). Which at the moment omits the bucket lock
> altogether (oops!) where the flow looks like this,
> 
>   bpf_tcp_close()
>   [...]
>   write_lock(sk_callback_lock)
>   for each psock->maps // list of maps this sock is part of
>      hlist_del_rcu(ref_hash_node);
>   .... destroy psock ...
>   write_unlock(sk_callback_lock)
> 
> Obviously, and demonstrated by syzbot, this is broken because
> we can have multiple threads deleting entries via hlist_del_rcu().
> 
> To fix this we might be tempted to wrap the hlist operation in a
> bucket lock but that would create a lock inversion problem. In
> summary to follow locking rules the psocks maps list needs the
> sk_callback_lock but we need the bucket lock to do the hlist_del_rcu.
> To resolve the lock inversion problem pop the head of the maps list
> repeatedly and remove the reference until no more are left. If a
> delete happens in parallel from the BPF API that is OK as well because
> it will do a similar action, lookup the sock in the map/hash, delete
> it from the map/hash, and dec the refcnt. We check for this case
> before doing a destroy on the psock to ensure we don't have two
> threads tearing down a psock. The new logic is as follows,
> 
>   bpf_tcp_close()
>   e = psock_map_pop(psock->maps) // done with sk_callback_lock
>   bucket_lock() // lock hash list bucket
>   l = lookup_elem_raw(head, hash, key, key_size);
>   if (l) {
>      //only get here if elmnt was not already removed
>      hlist_del_rcu()
>      ... destroy psock...
>   }
>   bucket_unlock()
> 
> And finally for all the above to work add missing sk_callback_lock
> around smap_list_remove in sock_hash_ctx_update_elem(). Otherwise
> delete and update may corrupt maps list. Then add RCU annotations and
> use rcu_dereference/rcu_assign_pointer to manage values relying on
> RCU so that the object is not free'd from sock_hash_free() while it
> is being referenced in bpf_tcp_close().
> 
> (As an aside the sk_callback_lock serves two purposes. The
> first, is to update the sock callbacks sk_data_ready, sk_write_space,
> etc. The second is to protect the psock 'maps' list.
> The 'maps' list is used (as shown above) to delete all map/hash
> references to a sock when the sock is closed)
> 
> Reported-by: syzbot+0ce137753c78f7b6a...@syzkaller.appspotmail.com
> Fixes: 81110384441a ("bpf: sockmap, add hash map support")
> Signed-off-by: John Fastabend <john.fastab...@gmail.com>
> ---
>  kernel/bpf/sockmap.c | 120 +++++++++++++++++++++++++++++++++++---------------
>  1 file changed, 84 insertions(+), 36 deletions(-)
> 
> diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
> index 69b26af..333427b 100644
> --- a/kernel/bpf/sockmap.c
> +++ b/kernel/bpf/sockmap.c
> @@ -72,6 +72,7 @@ struct bpf_htab {
>          u32 n_buckets;
>          u32 elem_size;
>          struct bpf_sock_progs progs;
> +        struct rcu_head rcu;
>  };
>  
>  struct htab_elem {
> @@ -89,8 +90,8 @@ enum smap_psock_state {
>  struct smap_psock_map_entry {
>          struct list_head list;
>          struct sock **entry;
> -        struct htab_elem *hash_link;
> -        struct bpf_htab *htab;
> +        struct htab_elem __rcu *hash_link;
> +        struct bpf_htab __rcu *htab;
>  };
>  
>  struct smap_psock {
> @@ -258,16 +259,54 @@ static void bpf_tcp_release(struct sock *sk)
>          rcu_read_unlock();
>  }
>  
> +static struct htab_elem *lookup_elem_raw(struct hlist_head *head,
> +                                         u32 hash, void *key, u32 key_size)
> +{
> +        struct htab_elem *l;
> +
> +        hlist_for_each_entry_rcu(l, head, hash_node) {
> +                if (l->hash == hash && !memcmp(&l->key, key, key_size))
> +                        return l;
> +        }
> +
> +        return NULL;
> +}
> +
> +static inline struct bucket *__select_bucket(struct bpf_htab *htab, u32 hash)
> +{
> +        return &htab->buckets[hash & (htab->n_buckets - 1)];
> +}
> +
> +static inline struct hlist_head *select_bucket(struct bpf_htab *htab, u32 hash)
> +{
> +        return &__select_bucket(htab, hash)->head;
> +}
> +
>  static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
>  {
>          atomic_dec(&htab->count);
>          kfree_rcu(l, rcu);
>  }
>  
> +static struct smap_psock_map_entry *psock_map_pop(struct sock *sk,
> +                                                  struct smap_psock *psock)
> +{
> +        struct smap_psock_map_entry *e;
> +
> +        write_lock_bh(&sk->sk_callback_lock);
> +        e = list_first_entry_or_null(&psock->maps,
> +                                     struct smap_psock_map_entry,
> +                                     list);
> +        if (e)
> +                list_del(&e->list);
> +        write_unlock_bh(&sk->sk_callback_lock);
> +        return e;
> +}
> +
>  static void bpf_tcp_close(struct sock *sk, long timeout)
>  {
>          void (*close_fun)(struct sock *sk, long timeout);
> -        struct smap_psock_map_entry *e, *tmp;
> +        struct smap_psock_map_entry *e;
>          struct sk_msg_buff *md, *mtmp;
>          struct smap_psock *psock;
>          struct sock *osk;
> @@ -286,7 +325,6 @@ static void bpf_tcp_close(struct sock *sk, long timeout)
>           */
>          close_fun = psock->save_close;
>  
> -        write_lock_bh(&sk->sk_callback_lock);
>          if (psock->cork) {
>                  free_start_sg(psock->sock, psock->cork);
>                  kfree(psock->cork);
> @@ -299,20 +337,38 @@ static void bpf_tcp_close(struct sock *sk, long timeout)
>                  kfree(md);
>          }
>  
> -        list_for_each_entry_safe(e, tmp, &psock->maps, list) {
> +        e = psock_map_pop(sk, psock);
> +        while (e) {
>                  if (e->entry) {
>                          osk = cmpxchg(e->entry, sk, NULL);
>                          if (osk == sk) {
> -                                list_del(&e->list);
>                                  smap_release_sock(psock, sk);
>                          }
>                  } else {
> -                        hlist_del_rcu(&e->hash_link->hash_node);
> -                        smap_release_sock(psock, e->hash_link->sk);
> -                        free_htab_elem(e->htab, e->hash_link);
> +                        struct htab_elem *link = rcu_dereference(e->hash_link);
> +                        struct bpf_htab *htab = rcu_dereference(e->htab);
> +                        struct hlist_head *head;
> +                        struct htab_elem *l;
> +                        struct bucket *b;
> +
> +                        b = __select_bucket(htab, link->hash);
> +                        head = &b->head;
> +                        raw_spin_lock_bh(&b->lock);
> +                        l = lookup_elem_raw(head,
> +                                            link->hash, link->key,
> +                                            htab->map.key_size);
> +                        /* If another thread deleted this object skip deletion.
> +                         * The refcnt on psock may or may not be zero.
> +                         */
> +                        if (l) {
> +                                hlist_del_rcu(&link->hash_node);
> +                                smap_release_sock(psock, link->sk);
> +                                free_htab_elem(htab, link);
> +                        }
> +                        raw_spin_unlock_bh(&b->lock);
>                  }
> +                e = psock_map_pop(sk, psock);
>          }
> -        write_unlock_bh(&sk->sk_callback_lock);
>          rcu_read_unlock();
>          close_fun(sk, timeout);
>  }
> @@ -1618,7 +1674,7 @@ static void smap_list_hash_remove(struct smap_psock *psock,
>          struct smap_psock_map_entry *e, *tmp;
>  
>          list_for_each_entry_safe(e, tmp, &psock->maps, list) {
> -                struct htab_elem *c = e->hash_link;
> +                struct htab_elem *c = rcu_dereference(e->hash_link);
>  
>                  if (c == hash_link)
>                          list_del(&e->list);
> @@ -2090,14 +2146,13 @@ static struct bpf_map *sock_hash_alloc(union bpf_attr *attr)
>          return ERR_PTR(err);
>  }
>  
> -static inline struct bucket *__select_bucket(struct bpf_htab *htab, u32 hash)
> +static void __bpf_htab_free(struct rcu_head *rcu)
>  {
> -        return &htab->buckets[hash & (htab->n_buckets - 1)];
> -}
> +        struct bpf_htab *htab;
>  
> -static inline struct hlist_head *select_bucket(struct bpf_htab *htab, u32 hash)
> -{
> -        return &__select_bucket(htab, hash)->head;
> +        htab = container_of(rcu, struct bpf_htab, rcu);
> +        bpf_map_area_free(htab->buckets);
> +        kfree(htab);
>  }
>  
>  static void sock_hash_free(struct bpf_map *map)
> @@ -2116,10 +2171,13 @@ static void sock_hash_free(struct bpf_map *map)
>           */
>          rcu_read_lock();
>          for (i = 0; i < htab->n_buckets; i++) {
> -                struct hlist_head *head = select_bucket(htab, i);
> +                struct bucket *b = __select_bucket(htab, i);
> +                struct hlist_head *head;
>                  struct hlist_node *n;
>                  struct htab_elem *l;
>  
> +                raw_spin_lock_bh(&b->lock);
> +                head = &b->head;
>                  hlist_for_each_entry_safe(l, n, head, hash_node) {
>                          struct sock *sock = l->sk;
>                          struct smap_psock *psock;
> @@ -2137,12 +2195,12 @@ static void sock_hash_free(struct bpf_map *map)
>                                  smap_release_sock(psock, sock);
>                          }
>                          write_unlock_bh(&sock->sk_callback_lock);
> -                        kfree(l);
> +                        free_htab_elem(htab, l);
>                  }
> +                raw_spin_unlock_bh(&b->lock);
>          }
>          rcu_read_unlock();
> -        bpf_map_area_free(htab->buckets);
> -        kfree(htab);
> +        call_rcu(&htab->rcu, __bpf_htab_free);
>  }
>  
>  static struct htab_elem *alloc_sock_hash_elem(struct bpf_htab *htab,
> @@ -2169,19 +2227,6 @@ static struct htab_elem *alloc_sock_hash_elem(struct bpf_htab *htab,
>          return l_new;
>  }
>  
> -static struct htab_elem *lookup_elem_raw(struct hlist_head *head,
> -                                         u32 hash, void *key, u32 key_size)
> -{
> -        struct htab_elem *l;
> -
> -        hlist_for_each_entry_rcu(l, head, hash_node) {
> -                if (l->hash == hash && !memcmp(&l->key, key, key_size))
> -                        return l;
> -        }
> -
> -        return NULL;
> -}
> -
>  static inline u32 htab_map_hash(const void *key, u32 key_len)
>  {
>          return jhash(key, key_len, 0);
> @@ -2301,8 +2346,9 @@ static int sock_hash_ctx_update_elem(struct bpf_sock_ops_kern *skops,
>                  goto bucket_err;
>          }
>  
> -        e->hash_link = l_new;
> -        e->htab = container_of(map, struct bpf_htab, map);
> +        rcu_assign_pointer(e->hash_link, l_new);
> +        rcu_assign_pointer(e->htab,
> +                           container_of(map, struct bpf_htab, map));
>          list_add_tail(&e->list, &psock->maps);

Is sock->sk_callback_lock held here?
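
To make the question concrete, what I had in mind is roughly the
below (untested sketch only, not from the patch; I am reusing the
e/l_new/map/psock names visible in the hunk and assuming the local
socket pointer in this function is called 'sock'):

        /* Sketch: guard the psock->maps update with the sock's callback
         * lock so it cannot race with bpf_tcp_close() and
         * smap_list_hash_remove(), which walk/modify the same list.
         */
        write_lock_bh(&sock->sk_callback_lock);
        rcu_assign_pointer(e->hash_link, l_new);
        rcu_assign_pointer(e->htab,
                           container_of(map, struct bpf_htab, map));
        list_add_tail(&e->list, &psock->maps);
        write_unlock_bh(&sock->sk_callback_lock);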
Others LGTM.

>  
>          /* add new element to the head of the list, so that
> @@ -2313,8 +2359,10 @@ static int sock_hash_ctx_update_elem(struct bpf_sock_ops_kern *skops,
>                  psock = smap_psock_sk(l_old->sk);
>  
>                  hlist_del_rcu(&l_old->hash_node);
> +                write_lock_bh(&l_old->sk->sk_callback_lock);
>                  smap_list_hash_remove(psock, l_old);
>                  smap_release_sock(psock, l_old->sk);
> +                write_unlock_bh(&l_old->sk->sk_callback_lock);
>                  free_htab_elem(htab, l_old);
>          }
>          raw_spin_unlock_bh(&b->lock);
> 
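
For anyone else following along, here is my summary of the lock
ordering this patch ends up with, per the changelog above (my own
restatement written as a comment, not text from the patch):

/*
 * Delete path (sock_hash_delete_elem):
 *     raw_spin_lock_bh(&b->lock)                 bucket lock first,
 *         write_lock(sk_callback_lock)           callback lock nested inside
 *
 * Close path (bpf_tcp_close):
 *     write_lock_bh(&sk->sk_callback_lock)       only inside psock_map_pop(),
 *     write_unlock_bh(&sk->sk_callback_lock)     released before ...
 *     raw_spin_lock_bh(&b->lock)                 ... the bucket lock is taken
 *
 * So the close path never holds sk_callback_lock while acquiring the
 * bucket lock, which is what removes the inversion against the delete path.
 */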