On 08/12, Daniel Borkmann wrote:
> On 8/9/19 6:10 PM, Stanislav Fomichev wrote:
> > Add new helper bpf_sk_storage_clone which optionally clones sk storage
> > and call it from sk_clone_lock.
> > 
> > Cc: Martin KaFai Lau <ka...@fb.com>
> > Cc: Yonghong Song <y...@fb.com>
> > Signed-off-by: Stanislav Fomichev <s...@google.com>
> [...]
> > +int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk)
> > +{
> > +   struct bpf_sk_storage *new_sk_storage = NULL;
> > +   struct bpf_sk_storage *sk_storage;
> > +   struct bpf_sk_storage_elem *selem;
> > +   int ret;
> > +
> > +   RCU_INIT_POINTER(newsk->sk_bpf_storage, NULL);
> > +
> > +   rcu_read_lock();
> > +   sk_storage = rcu_dereference(sk->sk_bpf_storage);
> > +
> > +   if (!sk_storage || hlist_empty(&sk_storage->list))
> > +           goto out;
> > +
> > +   hlist_for_each_entry_rcu(selem, &sk_storage->list, snode) {
> > +           struct bpf_sk_storage_elem *copy_selem;
> > +           struct bpf_sk_storage_map *smap;
> > +           struct bpf_map *map;
> > +           int refold;
> > +
> > +           smap = rcu_dereference(SDATA(selem)->smap);
> > +           if (!(smap->map.map_flags & BPF_F_CLONE))
> > +                   continue;
> > +
> > +           map = bpf_map_inc_not_zero(&smap->map, false);
> > +           if (IS_ERR(map))
> > +                   continue;
> > +
> > +           copy_selem = bpf_sk_storage_clone_elem(newsk, smap, selem);
> > +           if (!copy_selem) {
> > +                   ret = -ENOMEM;
> > +                   bpf_map_put(map);
> > +                   goto err;
> > +           }
> > +
> > +           if (new_sk_storage) {
> > +                   selem_link_map(smap, copy_selem);
> > +                   __selem_link_sk(new_sk_storage, copy_selem);
> > +           } else {
> > +                   ret = sk_storage_alloc(newsk, smap, copy_selem);
> > +                   if (ret) {
> > +                           kfree(copy_selem);
> > +                           atomic_sub(smap->elem_size,
> > +                                      &newsk->sk_omem_alloc);
> > +                           bpf_map_put(map);
> > +                           goto err;
> > +                   }
> > +
> > +                   new_sk_storage = rcu_dereference(copy_selem->sk_storage);
> > +           }
> > +           bpf_map_put(map);
> 
> The map get/put combination /under/ RCU read lock seems a bit odd to me, could
> you exactly describe the race that this would be preventing?
There is a race between sk storage release and sk storage clone.
bpf_sk_storage_map_free uses synchronize_rcu to wait for all existing
users to finish and the new ones are prevented via map's refcnt being
zero; we need to do something like that for the clone.
Martin suggested using bpf_map_inc_not_zero/bpf_map_put.
If I read everything correctly, I think without map_inc/map_put we
get the following race:

CPU0                                   CPU1

bpf_map_put
  bpf_sk_storage_map_free(smap)
    synchronize_rcu

    // no more users via bpf or
    // syscall, but clone
    // can still happen

    for each (bucket)
      selem_unlink
        selem_unlink_map(smap)

        // adding anything at
        // this point to the
        // bucket will leak

                                       rcu_read_lock
                                       tcp_v4_rcv
                                         tcp_v4_do_rcv
                                           // sk is lockless TCP_LISTEN
                                           tcp_v4_cookie_check
                                             tcp_v4_syn_recv_sock
                                               bpf_sk_storage_clone
                                                 rcu_dereference(sk->sk_bpf_storage)
                                                 selem_link_map(smap, copy)
                                                 // adding new element to the
                                                 // map -> leak
                                       rcu_read_unlock

      selem_unlink_sk
       sk->sk_bpf_storage = NULL

    synchronize_rcu

Reply via email to