On Thu, 17 Jan 2019 22:50:53 -0800, Alexei Starovoitov wrote:
> Introduce BPF_F_LOCK flag for map_lookup and map_update syscall commands
> and for map_update() helper function.
> In all these cases take a lock of existing element (which was provided
> in BTF description) before copying (in or out) the rest of map value.
> 
> Implementation details that are part of uapi:
> 
> Array:
> The array map takes the element lock for lookup/update.
> 
> Hash:
> hash map also takes the lock for lookup/update and tries to avoid the bucket 
> lock.
> If old element exists it takes the element lock and updates the element in 
> place.
> If element doesn't exist it allocates new one and inserts into hash table
> while holding the bucket lock.
> In rare case the hashmap has to take both the bucket lock and the element lock
> to update old value in place.
> 
> Cgroup local storage:
> It is similar to array. update in place and lookup are done with lock taken.
> 
> Signed-off-by: Alexei Starovoitov <a...@kernel.org>

> diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
> index 48a41bf65e1b..6397b12c130e 100644
> --- a/kernel/bpf/hashtab.c
> +++ b/kernel/bpf/hashtab.c
> @@ -809,11 +809,11 @@ static struct htab_elem *alloc_htab_elem(struct 
> bpf_htab *htab, void *key,
>  static int check_flags(struct bpf_htab *htab, struct htab_elem *l_old,
>                      u64 map_flags)
>  {
> -     if (l_old && map_flags == BPF_NOEXIST)
> +     if (l_old && (map_flags & ~BPF_F_LOCK) == BPF_NOEXIST)
>               /* elem already exists */
>               return -EEXIST;
>  
> -     if (!l_old && map_flags == BPF_EXIST)
> +     if (!l_old && (map_flags & ~BPF_F_LOCK) == BPF_EXIST)
>               /* elem doesn't exist, cannot update it */
>               return -ENOENT;
>  
> @@ -832,7 +832,7 @@ static int htab_map_update_elem(struct bpf_map *map, void 
> *key, void *value,
>       u32 key_size, hash;
>       int ret;
>  
> -     if (unlikely(map_flags > BPF_EXIST))
> +     if (unlikely(map_flags & ~(BPF_F_LOCK | BPF_EXIST | BPF_NOEXIST)))
>               /* unknown flags */
>               return -EINVAL;

Perhaps we should check for:

        hweight(map_flags & (BPF_EXIST | BPF_NOEXIST)) < 2 

right now this > BPF_EXIST is a little hacky, as it depends on the fact,
that BPF_EXIST is 2..  If I read the code correctly  BPF_EXIST |
BPF_NOEXIST will be treated as BPF_NOEXIST.  Not sure that matters.

> @@ -845,6 +845,28 @@ static int htab_map_update_elem(struct bpf_map *map, 
> void *key, void *value,
>       b = __select_bucket(htab, hash);
>       head = &b->head;
>  
> +     if (unlikely(map_flags & BPF_F_LOCK)) {
> +             if (unlikely(!map_value_has_spin_lock(map)))
> +                     return -EINVAL;
> +             /* find an element without taking the bucket lock */
> +             l_old = lookup_nulls_elem_raw(head, hash, key, key_size,
> +                                           htab->n_buckets);
> +             ret = check_flags(htab, l_old, map_flags);
> +             if (ret)
> +                     return ret;
> +             if (l_old) {
> +                     /* grab the element lock and update value in place */
> +                     copy_map_value_locked(map,
> +                                           l_old->key + round_up(key_size, 
> 8),
> +                                           value, false);
> +                     return 0;
> +             }
> +             /* fall through, grab the bucket lock and lookup again.
> +              * 99.9% chance that the element won't be found,
> +              * but second lookup under lock has to be done.
> +              */
> +     }
> +
>       /* bpf_map_update_elem() can be called in_irq() */
>       raw_spin_lock_irqsave(&b->lock, flags);
>  
> @@ -854,6 +876,20 @@ static int htab_map_update_elem(struct bpf_map *map, 
> void *key, void *value,
>       if (ret)
>               goto err;
>  
> +     if (unlikely(l_old && (map_flags & BPF_F_LOCK))) {
> +             /* first lookup without the bucket lock didn't find the element,
> +              * but second lookup with the bucket lock found it.
> +              * This case is highly unlikely, but has to be dealt with:
> +              * grab the element lock in addition to the bucket lock
> +              * and update element in place
> +              */
> +             copy_map_value_locked(map,
> +                                   l_old->key + round_up(key_size, 8),
> +                                   value, false);
> +             ret = 0;
> +             goto err;
> +     }
> +
>       l_new = alloc_htab_elem(htab, key, value, key_size, hash, false, false,
>                               l_old);
>       if (IS_ERR(l_new)) {

> diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c
> index 0295427f06e2..6b572e2de7fb 100644
> --- a/kernel/bpf/local_storage.c
> +++ b/kernel/bpf/local_storage.c
> @@ -131,7 +131,14 @@ static int cgroup_storage_update_elem(struct bpf_map 
> *map, void *_key,
>       struct bpf_cgroup_storage *storage;
>       struct bpf_storage_buffer *new;
>  
> -     if (flags != BPF_ANY && flags != BPF_EXIST)
> +     if (unlikely(flags & ~(BPF_F_LOCK | BPF_EXIST | BPF_NOEXIST)))
> +             return -EINVAL;
> +
> +     if (unlikely(flags & BPF_NOEXIST))
> +             return -EINVAL;

nit: for arrays we have a different return code with BPF_NOEXIST, but
     here its both EINVAL, so perhaps just drop the BPF_NOEXIST from
     the mask?

> +     if (unlikely((flags & BPF_F_LOCK) &&
> +                  !map_value_has_spin_lock(map)))
>               return -EINVAL;
>  
>       storage = cgroup_storage_lookup((struct bpf_cgroup_storage_map *)map,
> @@ -139,6 +146,11 @@ static int cgroup_storage_update_elem(struct bpf_map 
> *map, void *_key,
>       if (!storage)
>               return -ENOENT;
>  
> +     if (flags & BPF_F_LOCK) {
> +             copy_map_value_locked(map, storage->buf->data, value, false);
> +             return 0;
> +     }
> +
>       new = kmalloc_node(sizeof(struct bpf_storage_buffer) +
>                          map->value_size,
>                          __GFP_ZERO | GFP_ATOMIC | __GFP_NOWARN,
> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
> index b29e6dc44650..ed55f03ba586 100644
> --- a/kernel/bpf/syscall.c
> +++ b/kernel/bpf/syscall.c
> @@ -682,7 +682,7 @@ static void *__bpf_copy_key(void __user *ukey, u64 
> key_size)
>  }
>  
>  /* last field in 'union bpf_attr' used by this command */
> -#define BPF_MAP_LOOKUP_ELEM_LAST_FIELD value
> +#define BPF_MAP_LOOKUP_ELEM_LAST_FIELD flags
>  
>  static int map_lookup_elem(union bpf_attr *attr)
>  {
> @@ -698,6 +698,9 @@ static int map_lookup_elem(union bpf_attr *attr)
>       if (CHECK_ATTR(BPF_MAP_LOOKUP_ELEM))
>               return -EINVAL;
>  
> +     if (attr->flags & ~BPF_F_LOCK)
> +             return -EINVAL;
> +
>       f = fdget(ufd);
>       map = __bpf_map_get(f);
>       if (IS_ERR(map))
> @@ -708,6 +711,12 @@ static int map_lookup_elem(union bpf_attr *attr)
>               goto err_put;
>       }
>  
> +     if ((attr->flags & BPF_F_LOCK) &&
> +         !map_value_has_spin_lock(map)) {
> +             err = -EINVAL;
> +             goto err_put;
> +     }
> +
>       key = __bpf_copy_key(ukey, map->key_size);
>       if (IS_ERR(key)) {
>               err = PTR_ERR(key);
> @@ -758,7 +767,16 @@ static int map_lookup_elem(union bpf_attr *attr)
>                       err = -ENOENT;
>               } else {
>                       err = 0;
> -                     copy_map_value(map, value, ptr);
> +                     if (attr->flags & BPF_F_LOCK) {
> +                             /* lock 'ptr' elem and copy
> +                              * everything but the lock
> +                              */
> +                             copy_map_value_locked(map, value, ptr, true);
> +                             /* mask lock, since value was kmalloced */
> +                             check_and_init_map_lock(map, value);
> +                     } else {
> +                             copy_map_value(map, value, ptr);
> +                     }
>               }
>               rcu_read_unlock();
>       }
> @@ -818,6 +836,12 @@ static int map_update_elem(union bpf_attr *attr)
>               goto err_put;
>       }
>  
> +     if ((attr->flags & BPF_F_LOCK) &&
> +         !map_value_has_spin_lock(map)) {

nit: do we need this check in syscall.c for update?  Just wondering.

> +             err = -EINVAL;
> +             goto err_put;
> +     }
> +
>       key = __bpf_copy_key(ukey, map->key_size);
>       if (IS_ERR(key)) {
>               err = PTR_ERR(key);

Reply via email to