On Thu, Jun 14, 2018 at 09:44:57AM -0700, John Fastabend wrote:
> First, in tcp_close, reduce the scope of sk_callback_lock(): the lock is
> only needed to protect smap_release_sock(); the ingress and cork lists
> are protected by the sock lock. Having the lock in wider scope is
> harmless, but it may confuse the reader into inferring it is in fact
> needed.
>
> Next, in sock_hash_delete_elem() the pattern is as follows,
>
>   sock_hash_delete_elem()
>      [...]
>      spin_lock(bucket_lock)
>      l = lookup_elem_raw()
>      if (l)
>         hlist_del_rcu()
>         write_lock(sk_callback_lock)
>          .... destroy psock ...
>         write_unlock(sk_callback_lock)
>      spin_unlock(bucket_lock)
>
> The ordering is necessary because we only know the {p}sock after
> dereferencing the hash table, which we can't do unless we hold the
> bucket lock. Once we have the bucket lock and the psock element, it is
> deleted from the hashmap to ensure any other path doing a lookup will
> fail. Finally, the refcnt is decremented and, if zero, the psock is
> destroyed.
>
> In parallel with the above (or freeing the map), a tcp close event may
> trigger tcp_close(), which at the moment omits the bucket lock
> altogether (oops!). That flow looks like this,
>
>   bpf_tcp_close()
>      [...]
>      write_lock(sk_callback_lock)
>      for each psock->maps // list of maps this sock is part of
>          hlist_del_rcu(ref_hash_node);
>      .... destroy psock ...
>      write_unlock(sk_callback_lock)
>
> Obviously, and as demonstrated by syzbot, this is broken because we can
> have multiple threads deleting entries via hlist_del_rcu().
>
> To fix this we might be tempted to wrap the hlist operation in a bucket
> lock, but that would create a lock inversion problem. In summary, to
> follow the locking rules, the maps list needs the sk_callback_lock, but
> we need the bucket lock to do the hlist_del_rcu(). To resolve the lock
> inversion, note that when bpf_tcp_close() is called no updates can
> happen in parallel, due to the ESTABLISHED state check in the update
> logic, so pop the head of the list repeatedly and remove the reference
> until no more entries are left. If a delete happens in parallel from
> the BPF API that is OK as well, because it performs a similar action:
> look up the sock in the map/hash, delete it from the map/hash, and
> decrement the refcnt. We check for this case before destroying the
> psock to ensure we don't have two threads tearing down a psock. The new
> logic is as follows,
>
>   bpf_tcp_close()
>     e = psock_map_pop(psock->maps) // done with sk_callback_lock
>     bucket_lock() // lock hash list bucket
>     l = lookup_elem_raw(head, hash, key, key_size);
>     if (l) {
>        // only get here if the element was not already removed
>        hlist_del_rcu()
>        ... destroy psock ...
>     }
>     bucket_unlock()
>
> And finally, for all of the above to work, add the missing
> sk_callback_lock around smap_list_remove() in
> sock_hash_ctx_update_elem(). Otherwise delete and update may corrupt
> the maps list.
>
> (As an aside, the sk_callback_lock serves two purposes. The first is to
> update the sock callbacks sk_data_ready, sk_write_space, etc. The
> second is to protect the psock 'maps' list. The 'maps' list is used, as
> shown above, to delete all map/hash references to a sock when the sock
> is closed.)
>
> (If we did not have the ESTABLISHED state guarantee from tcp_close()
> then we could not ensure completion, because updates could happen
> forever and pin the thread in the delete loop.)
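
To make the ordering concrete for readers following along, here is a
minimal, self-contained userspace sketch of the same close-path pattern.
All names in it (maps_pop(), bucket_remove_if_present(), close_path(),
maps_lock standing in for sk_callback_lock) are made up for illustration;
this is not the kernel code. The point it demonstrates: references are
popped one at a time under the maps-list lock, an element is only torn
down if it is still found in its bucket after the bucket lock is taken,
and the two locks are never held at the same time, so a racing delete can
neither invert the lock order nor free the element twice.

  /* Userspace model of the close path above; hypothetical names throughout. */
  #include <pthread.h>
  #include <stdlib.h>

  struct elem {
          struct elem *next;      /* bucket chain */
          struct elem *maps_next; /* per-sock "maps" list */
  };

  struct bucket {
          pthread_mutex_t lock;   /* stands in for the bucket lock */
          struct elem *head;
  };

  static pthread_mutex_t maps_lock = PTHREAD_MUTEX_INITIALIZER; /* ~ sk_callback_lock */
  static struct elem *maps_head;
  static struct bucket bkt = { PTHREAD_MUTEX_INITIALIZER, NULL };

  /* Pop one reference off the maps list, like psock_map_pop() below. */
  static struct elem *maps_pop(void)
  {
          struct elem *e;

          pthread_mutex_lock(&maps_lock);
          e = maps_head;
          if (e)
                  maps_head = e->maps_next;
          pthread_mutex_unlock(&maps_lock);
          return e;
  }

  /* Unlink e from the bucket only if it is still linked; return 1 if we won. */
  static int bucket_remove_if_present(struct bucket *b, struct elem *e)
  {
          struct elem **pp;
          int found = 0;

          pthread_mutex_lock(&b->lock);
          for (pp = &b->head; *pp; pp = &(*pp)->next) {
                  if (*pp == e) {
                          *pp = e->next;
                          found = 1;
                          break;
                  }
          }
          pthread_mutex_unlock(&b->lock);
          return found;
  }

  /* Close path: never holds maps_lock and the bucket lock together, and
   * only the winner of the bucket re-lookup frees the element.
   */
  static void close_path(void)
  {
          struct elem *e;

          while ((e = maps_pop()) != NULL) {
                  if (bucket_remove_if_present(&bkt, e))
                          free(e);
          }
  }

  int main(void)
  {
          struct elem *e = calloc(1, sizeof(*e));

          if (!e)
                  return 1;
          /* Link the element into both the bucket and the maps list. */
          e->next = bkt.head;
          bkt.head = e;
          e->maps_next = maps_head;
          maps_head = e;

          close_path();           /* frees e exactly once */
          return 0;
  }

This is the same shape the patch below implements with psock_map_pop()
plus the bucket re-lookup via lookup_elem_raw() in bpf_tcp_close().
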
>
> Reported-by: syzbot+0ce137753c78f7b6a...@syzkaller.appspotmail.com
> Fixes: 81110384441a ("bpf: sockmap, add hash map support")
> Signed-off-by: John Fastabend <john.fastab...@gmail.com>
> ---
>  0 files changed
>
> diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
> index f1ab52d..04764f5 100644
> --- a/kernel/bpf/sockmap.c
> +++ b/kernel/bpf/sockmap.c
> @@ -258,16 +258,54 @@ static void bpf_tcp_release(struct sock *sk)
>          rcu_read_unlock();
>  }
>
> +static struct htab_elem *lookup_elem_raw(struct hlist_head *head,
> +                                         u32 hash, void *key, u32 key_size)
> +{
> +        struct htab_elem *l;
> +
> +        hlist_for_each_entry_rcu(l, head, hash_node) {
> +                if (l->hash == hash && !memcmp(&l->key, key, key_size))
> +                        return l;
> +        }
> +
> +        return NULL;
> +}
> +
> +static inline struct bucket *__select_bucket(struct bpf_htab *htab, u32 hash)
> +{
> +        return &htab->buckets[hash & (htab->n_buckets - 1)];
> +}
> +
> +static inline struct hlist_head *select_bucket(struct bpf_htab *htab, u32 hash)
> +{
> +        return &__select_bucket(htab, hash)->head;
> +}
> +
>  static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
>  {
>          atomic_dec(&htab->count);
>          kfree_rcu(l, rcu);
>  }
>
> +struct smap_psock_map_entry *psock_map_pop(struct sock *sk,
> +                                           struct smap_psock *psock)
> +{
> +        struct smap_psock_map_entry *e;
> +
> +        write_lock_bh(&sk->sk_callback_lock);
> +        e = list_first_entry_or_null(&psock->maps,
> +                                     struct smap_psock_map_entry,
> +                                     list);
> +        if (e)
> +                list_del(&e->list);
> +        write_unlock_bh(&sk->sk_callback_lock);
> +        return e;
> +}
> +
>  static void bpf_tcp_close(struct sock *sk, long timeout)
>  {
>          void (*close_fun)(struct sock *sk, long timeout);
> -        struct smap_psock_map_entry *e, *tmp;
> +        struct smap_psock_map_entry *e;
>          struct sk_msg_buff *md, *mtmp;
>          struct smap_psock *psock;
>          struct sock *osk;
> @@ -286,7 +324,6 @@ static void bpf_tcp_close(struct sock *sk, long timeout)
>           */
>          close_fun = psock->save_close;
>
> -        write_lock_bh(&sk->sk_callback_lock);
>          if (psock->cork) {
>                  free_start_sg(psock->sock, psock->cork);
>                  kfree(psock->cork);
> @@ -299,20 +336,48 @@ static void bpf_tcp_close(struct sock *sk, long timeout)
>                  kfree(md);
>          }
>
> -        list_for_each_entry_safe(e, tmp, &psock->maps, list) {
> +        /* Sock is in TCP_CLOSE state so any concurrent adds or updates will be
> +         * blocked by ESTABLISHED check. However, tcp_close() + delete + free
> +         * can all run at the same time. If a tcp_close + delete happens each
> +         * code path will remove the entry for the map/hash before deleting it.
> +         * In the map case a xchg and then check to verify we have a sk protects
> +         * two paths from tearing down the same object. For hash map we lock the
> +         * bucket and remove the object from the hash map before destroying to
> +         * ensure that only one reference exists. By pulling the object off the
> +         * head of the list (with sk_callback_lock) if multiple deleters are
> +         * running we avoid duplicate references.
> +         */
> +        e = psock_map_pop(sk, psock);
> +        while (e) {
>                  if (e->entry) {
>                          osk = cmpxchg(e->entry, sk, NULL);
>                          if (osk == sk) {
> -                                list_del(&e->list);
>                                  smap_release_sock(psock, sk);
>                          }
>                  } else {
> -                        hlist_del_rcu(&e->hash_link->hash_node);
> -                        smap_release_sock(psock, e->hash_link->sk);
> -                        free_htab_elem(e->htab, e->hash_link);
> +                        struct htab_elem *link = e->hash_link;
> +                        struct hlist_head *head;
> +                        struct htab_elem *l;
> +                        struct bucket *b;
> +
> +                        b = __select_bucket(e->htab, link->hash);
> +                        head = &b->head;
> +                        raw_spin_lock_bh(&b->lock);
> +                        l = lookup_elem_raw(head,
> +                                            link->hash, link->key,
> +                                            e->htab->elem_size);
> +                        /* If another thread deleted this object skip deletion.
> +                         * The refcnt on psock may or may not be zero.
> +                         */
> +                        if (l) {
> +                                hlist_del_rcu(&e->hash_link->hash_node);
> +                                smap_release_sock(psock, e->hash_link->sk);
> +                                free_htab_elem(e->htab, e->hash_link);
> +                        }
> +                        raw_spin_unlock_bh(&b->lock);
>                  }
> +                e = psock_map_pop(sk, psock);
>          }
> -        write_unlock_bh(&sk->sk_callback_lock);
>          rcu_read_unlock();
>          close_fun(sk, timeout);
>  }
> @@ -2088,16 +2153,6 @@ static struct bpf_map *sock_hash_alloc(union bpf_attr *attr)
>          return ERR_PTR(err);
>  }
>
> -static inline struct bucket *__select_bucket(struct bpf_htab *htab, u32 hash)
> -{
> -        return &htab->buckets[hash & (htab->n_buckets - 1)];
> -}
> -
> -static inline struct hlist_head *select_bucket(struct bpf_htab *htab, u32 hash)
> -{
> -        return &__select_bucket(htab, hash)->head;
> -}
> -
>  static void sock_hash_free(struct bpf_map *map)
>  {
>          struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
> @@ -2114,10 +2169,13 @@ static void sock_hash_free(struct bpf_map *map)
>           */
>          rcu_read_lock();
>          for (i = 0; i < htab->n_buckets; i++) {
> -                struct hlist_head *head = select_bucket(htab, i);
> +                struct bucket *b = __select_bucket(htab, i);
> +                struct hlist_head *head;
>                  struct hlist_node *n;
>                  struct htab_elem *l;
>
> +                raw_spin_lock_bh(&b->lock);

There is a synchronize_rcu() at the beginning of sock_hash_free(). Taking
the bucket lock of the map being freed at this point is a bit suspicious.
What may still access the map after synchronize_rcu()?

> +                head = &b->head;
>                  hlist_for_each_entry_safe(l, n, head, hash_node) {
>                          struct sock *sock = l->sk;
>                          struct smap_psock *psock;
> @@ -2137,6 +2195,7 @@ static void sock_hash_free(struct bpf_map *map)
>                          write_unlock_bh(&sock->sk_callback_lock);
>                          kfree(l);
>                  }
> +                raw_spin_unlock_bh(&b->lock);
>          }
>          rcu_read_unlock();
>          bpf_map_area_free(htab->buckets);
> @@ -2167,19 +2226,6 @@ static struct htab_elem *alloc_sock_hash_elem(struct bpf_htab *htab,
>          return l_new;
>  }
>
> -static struct htab_elem *lookup_elem_raw(struct hlist_head *head,
> -                                         u32 hash, void *key, u32 key_size)
> -{
> -        struct htab_elem *l;
> -
> -        hlist_for_each_entry_rcu(l, head, hash_node) {
> -                if (l->hash == hash && !memcmp(&l->key, key, key_size))
> -                        return l;
> -        }
> -
> -        return NULL;
> -}
> -
>  static inline u32 htab_map_hash(const void *key, u32 key_len)
>  {
>          return jhash(key, key_len, 0);
> @@ -2307,8 +2353,10 @@ static int sock_hash_ctx_update_elem(struct bpf_sock_ops_kern *skops,
>                  psock = smap_psock_sk(l_old->sk);
>
>                  hlist_del_rcu(&l_old->hash_node);
> +                write_lock_bh(&l_old->sk->sk_callback_lock);
>                  smap_list_remove(psock, NULL, l_old);
>                  smap_release_sock(psock, l_old->sk);
> +                write_unlock_bh(&l_old->sk->sk_callback_lock);
>                  free_htab_elem(htab, l_old);
>          }
>          raw_spin_unlock_bh(&b->lock);
>
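
One more illustrative aside on the map (non-hash) branch: the new comment
in bpf_tcp_close() relies on "a xchg and then check to verify we have a
sk" so that two racing teardown paths cannot both release the same
object. Below is a hypothetical userspace illustration of that guard
using C11 atomics (made-up names: fake_sock, map_slot, try_clear_slot();
this is not the kernel code). Only the caller whose compare-and-swap
still observes the socket pointer wins the teardown; the loser sees the
slot already cleared and backs off.

  /* Hypothetical C11 model of: osk = cmpxchg(e->entry, sk, NULL);
   *                            if (osk == sk) ...release...
   */
  #include <stdatomic.h>
  #include <stdbool.h>
  #include <stdio.h>

  static int fake_sock;                          /* stand-in for a struct sock */
  static _Atomic(void *) map_slot = &fake_sock;  /* stand-in for e->entry */

  /* Returns true only for the single caller that clears the slot. */
  static bool try_clear_slot(void *sk)
  {
          void *expected = sk;

          return atomic_compare_exchange_strong(&map_slot, &expected, NULL);
  }

  int main(void)
  {
          printf("first clear wins:  %d\n", try_clear_slot(&fake_sock)); /* 1 */
          printf("second clear wins: %d\n", try_clear_slot(&fake_sock)); /* 0 */
          return 0;
  }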