On Thu, 17 Jan 2019 22:50:53 -0800, Alexei Starovoitov wrote: > Introduce BPF_F_LOCK flag for map_lookup and map_update syscall commands > and for map_update() helper function. > In all these cases take a lock of existing element (which was provided > in BTF description) before copying (in or out) the rest of map value. > > Implementation details that are part of uapi: > > Array: > The array map takes the element lock for lookup/update. > > Hash: > hash map also takes the lock for lookup/update and tries to avoid the bucket > lock. > If old element exists it takes the element lock and updates the element in > place. > If element doesn't exist it allocates new one and inserts into hash table > while holding the bucket lock. > In rare case the hashmap has to take both the bucket lock and the element lock > to update old value in place. > > Cgroup local storage: > It is similar to array. update in place and lookup are done with lock taken. > > Signed-off-by: Alexei Starovoitov <a...@kernel.org>
> diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c > index 48a41bf65e1b..6397b12c130e 100644 > --- a/kernel/bpf/hashtab.c > +++ b/kernel/bpf/hashtab.c > @@ -809,11 +809,11 @@ static struct htab_elem *alloc_htab_elem(struct > bpf_htab *htab, void *key, > static int check_flags(struct bpf_htab *htab, struct htab_elem *l_old, > u64 map_flags) > { > - if (l_old && map_flags == BPF_NOEXIST) > + if (l_old && (map_flags & ~BPF_F_LOCK) == BPF_NOEXIST) > /* elem already exists */ > return -EEXIST; > > - if (!l_old && map_flags == BPF_EXIST) > + if (!l_old && (map_flags & ~BPF_F_LOCK) == BPF_EXIST) > /* elem doesn't exist, cannot update it */ > return -ENOENT; > > @@ -832,7 +832,7 @@ static int htab_map_update_elem(struct bpf_map *map, void > *key, void *value, > u32 key_size, hash; > int ret; > > - if (unlikely(map_flags > BPF_EXIST)) > + if (unlikely(map_flags & ~(BPF_F_LOCK | BPF_EXIST | BPF_NOEXIST))) > /* unknown flags */ > return -EINVAL; Perhaps we should check for: hweight(map_flags & (BPF_EXIST | BPF_NOEXIST)) < 2 right now this > BPF_EXIST is a little hacky, as it depends on the fact, that BPF_EXIST is 2.. If I read the code correctly BPF_EXIST | BPF_NOEXIST will be treated as BPF_NOEXIST. Not sure that matters. > @@ -845,6 +845,28 @@ static int htab_map_update_elem(struct bpf_map *map, > void *key, void *value, > b = __select_bucket(htab, hash); > head = &b->head; > > + if (unlikely(map_flags & BPF_F_LOCK)) { > + if (unlikely(!map_value_has_spin_lock(map))) > + return -EINVAL; > + /* find an element without taking the bucket lock */ > + l_old = lookup_nulls_elem_raw(head, hash, key, key_size, > + htab->n_buckets); > + ret = check_flags(htab, l_old, map_flags); > + if (ret) > + return ret; > + if (l_old) { > + /* grab the element lock and update value in place */ > + copy_map_value_locked(map, > + l_old->key + round_up(key_size, > 8), > + value, false); > + return 0; > + } > + /* fall through, grab the bucket lock and lookup again. > + * 99.9% chance that the element won't be found, > + * but second lookup under lock has to be done. > + */ > + } > + > /* bpf_map_update_elem() can be called in_irq() */ > raw_spin_lock_irqsave(&b->lock, flags); > > @@ -854,6 +876,20 @@ static int htab_map_update_elem(struct bpf_map *map, > void *key, void *value, > if (ret) > goto err; > > + if (unlikely(l_old && (map_flags & BPF_F_LOCK))) { > + /* first lookup without the bucket lock didn't find the element, > + * but second lookup with the bucket lock found it. > + * This case is highly unlikely, but has to be dealt with: > + * grab the element lock in addition to the bucket lock > + * and update element in place > + */ > + copy_map_value_locked(map, > + l_old->key + round_up(key_size, 8), > + value, false); > + ret = 0; > + goto err; > + } > + > l_new = alloc_htab_elem(htab, key, value, key_size, hash, false, false, > l_old); > if (IS_ERR(l_new)) { > diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c > index 0295427f06e2..6b572e2de7fb 100644 > --- a/kernel/bpf/local_storage.c > +++ b/kernel/bpf/local_storage.c > @@ -131,7 +131,14 @@ static int cgroup_storage_update_elem(struct bpf_map > *map, void *_key, > struct bpf_cgroup_storage *storage; > struct bpf_storage_buffer *new; > > - if (flags != BPF_ANY && flags != BPF_EXIST) > + if (unlikely(flags & ~(BPF_F_LOCK | BPF_EXIST | BPF_NOEXIST))) > + return -EINVAL; > + > + if (unlikely(flags & BPF_NOEXIST)) > + return -EINVAL; nit: for arrays we have a different return code with BPF_NOEXIST, but here its both EINVAL, so perhaps just drop the BPF_NOEXIST from the mask? > + if (unlikely((flags & BPF_F_LOCK) && > + !map_value_has_spin_lock(map))) > return -EINVAL; > > storage = cgroup_storage_lookup((struct bpf_cgroup_storage_map *)map, > @@ -139,6 +146,11 @@ static int cgroup_storage_update_elem(struct bpf_map > *map, void *_key, > if (!storage) > return -ENOENT; > > + if (flags & BPF_F_LOCK) { > + copy_map_value_locked(map, storage->buf->data, value, false); > + return 0; > + } > + > new = kmalloc_node(sizeof(struct bpf_storage_buffer) + > map->value_size, > __GFP_ZERO | GFP_ATOMIC | __GFP_NOWARN, > diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c > index b29e6dc44650..ed55f03ba586 100644 > --- a/kernel/bpf/syscall.c > +++ b/kernel/bpf/syscall.c > @@ -682,7 +682,7 @@ static void *__bpf_copy_key(void __user *ukey, u64 > key_size) > } > > /* last field in 'union bpf_attr' used by this command */ > -#define BPF_MAP_LOOKUP_ELEM_LAST_FIELD value > +#define BPF_MAP_LOOKUP_ELEM_LAST_FIELD flags > > static int map_lookup_elem(union bpf_attr *attr) > { > @@ -698,6 +698,9 @@ static int map_lookup_elem(union bpf_attr *attr) > if (CHECK_ATTR(BPF_MAP_LOOKUP_ELEM)) > return -EINVAL; > > + if (attr->flags & ~BPF_F_LOCK) > + return -EINVAL; > + > f = fdget(ufd); > map = __bpf_map_get(f); > if (IS_ERR(map)) > @@ -708,6 +711,12 @@ static int map_lookup_elem(union bpf_attr *attr) > goto err_put; > } > > + if ((attr->flags & BPF_F_LOCK) && > + !map_value_has_spin_lock(map)) { > + err = -EINVAL; > + goto err_put; > + } > + > key = __bpf_copy_key(ukey, map->key_size); > if (IS_ERR(key)) { > err = PTR_ERR(key); > @@ -758,7 +767,16 @@ static int map_lookup_elem(union bpf_attr *attr) > err = -ENOENT; > } else { > err = 0; > - copy_map_value(map, value, ptr); > + if (attr->flags & BPF_F_LOCK) { > + /* lock 'ptr' elem and copy > + * everything but the lock > + */ > + copy_map_value_locked(map, value, ptr, true); > + /* mask lock, since value was kmalloced */ > + check_and_init_map_lock(map, value); > + } else { > + copy_map_value(map, value, ptr); > + } > } > rcu_read_unlock(); > } > @@ -818,6 +836,12 @@ static int map_update_elem(union bpf_attr *attr) > goto err_put; > } > > + if ((attr->flags & BPF_F_LOCK) && > + !map_value_has_spin_lock(map)) { nit: do we need this check in syscall.c for update? Just wondering. > + err = -EINVAL; > + goto err_put; > + } > + > key = __bpf_copy_key(ukey, map->key_size); > if (IS_ERR(key)) { > err = PTR_ERR(key);