On 01/16/2019 06:08 AM, Alexei Starovoitov wrote:
> Introduce 'struct bpf_spin_lock' and bpf_spin_lock/unlock() helpers to let
> bpf program serialize access to other variables.
>
> Example:
> struct hash_elem {
>    int cnt;
>    struct bpf_spin_lock lock;
> };
> struct hash_elem * val = bpf_map_lookup_elem(&hash_map, &key);
> if (val) {
>    bpf_spin_lock(&val->lock);
>    val->cnt++;
>    bpf_spin_unlock(&val->lock);
> }
>
> Restrictions and safety checks:
> - bpf_spin_lock is only allowed inside HASH and ARRAY maps.
> - BTF description of the map is mandatory for safety analysis.
> - bpf program can take one bpf_spin_lock at a time, since two or more can
>   cause dead locks.
> - only one 'struct bpf_spin_lock' is allowed per map element.
>   It drastically simplifies implementation yet allows bpf program to use
>   any number of bpf_spin_locks.
> - when bpf_spin_lock is taken the calls (either bpf2bpf or helpers) are not
>   allowed.
> - bpf program must bpf_spin_unlock() before return.
> - bpf program can access 'struct bpf_spin_lock' only via
>   bpf_spin_lock()/bpf_spin_unlock() helpers.
> - load/store into 'struct bpf_spin_lock lock;' field is not allowed.
> - to use bpf_spin_lock() helper the BTF description of map value must be
>   a struct and have 'struct bpf_spin_lock anyname;' field at the top level.
>   Nested lock inside another struct is not allowed.
> - syscall map_lookup doesn't copy bpf_spin_lock field to user space.
> - syscall map_update and program map_update do not update bpf_spin_lock field.
> - bpf_spin_lock cannot be on the stack or inside networking packet.
>   bpf_spin_lock can only be inside HASH or ARRAY map value.
> - bpf_spin_lock is available to root only and to all program types.
>
> Implementation details:
> - on !SMP bpf_spin_lock() becomes nop
> - presence of bpf_spin_lock inside map value could have been indicated via
>   extra flag during map_create, but specifying it via BTF is cleaner.
>   It provides introspection for map key/value and reduces user coding
>   mistakes.
>
> Next steps:
> - allow bpf_spin_lock in other map types (like cgroup local storage)
> - introduce BPF_F_LOCK flag for bpf_map_update() syscall and helper
>   to request kernel to grab bpf_spin_lock before rewriting the value.
>   That will serialize access to map elements.
>
> Signed-off-by: Alexei Starovoitov <a...@kernel.org>
[...]
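Just to make sure I read the restrictions correctly, a complete program using this would then look roughly like the sketch below. Untested; it assumes bpf_spin_lock()/bpf_spin_unlock() get the usual helper stubs in the selftests' bpf_helpers.h and that BTF for the key/value types is provided, e.g. via the BPF_ANNOTATE_KV_PAIR macro; map and function names are made up:

#include <linux/bpf.h>
#include "bpf_helpers.h"

struct hash_elem {
	int cnt;
	struct bpf_spin_lock lock;	/* must sit at the top level of the value */
};

struct bpf_map_def SEC("maps") hash_map = {
	.type		= BPF_MAP_TYPE_HASH,
	.key_size	= sizeof(int),
	.value_size	= sizeof(struct hash_elem),
	.max_entries	= 1,
};
BPF_ANNOTATE_KV_PAIR(hash_map, int, struct hash_elem);

SEC("classifier")
int bump_cnt(struct __sk_buff *skb)
{
	struct hash_elem *val;
	int key = 0;

	val = bpf_map_lookup_elem(&hash_map, &key);
	if (val) {
		bpf_spin_lock(&val->lock);
		val->cnt++;	/* no helper/bpf2bpf calls in here, unlock before return */
		bpf_spin_unlock(&val->lock);
	}
	return 0;
}

char _license[] SEC("license") = "GPL";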
> diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
> index a74972b07e74..591fdedae7bf 100644
> --- a/kernel/bpf/helpers.c
> +++ b/kernel/bpf/helpers.c
> @@ -221,6 +221,41 @@ const struct bpf_func_proto bpf_get_current_comm_proto = {
>  	.arg2_type	= ARG_CONST_SIZE,
>  };
>
> +BPF_CALL_1(bpf_spin_lock, struct bpf_spin_lock *, lock)
> +{
> +#if defined(CONFIG_SMP)
> +	struct qspinlock *qlock = (void *)lock;
> +
> +	BUILD_BUG_ON(sizeof(*qlock) != sizeof(*lock));
> +	queued_spin_lock(qlock);
> +#endif
> +	return 0;
> +}
> +
> +const struct bpf_func_proto bpf_spin_lock_proto = {
> +	.func		= bpf_spin_lock,
> +	.gpl_only	= false,
> +	.ret_type	= RET_VOID,
> +	.arg1_type	= ARG_PTR_TO_SPIN_LOCK,
> +};
> +
> +BPF_CALL_1(bpf_spin_unlock, struct bpf_spin_lock *, lock)
> +{
> +#if defined(CONFIG_SMP)
> +	struct qspinlock *qlock = (void *)lock;
> +
> +	queued_spin_unlock(qlock);
> +#endif
> +	return 0;
> +}
> +
> +const struct bpf_func_proto bpf_spin_unlock_proto = {
> +	.func		= bpf_spin_unlock,
> +	.gpl_only	= false,
> +	.ret_type	= RET_VOID,
> +	.arg1_type	= ARG_PTR_TO_SPIN_LOCK,
> +};
> +
>  #ifdef CONFIG_CGROUPS
>  BPF_CALL_0(bpf_get_current_cgroup_id)
>  {
> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
> index b155cd17c1bd..ebf0a673cb83 100644
> --- a/kernel/bpf/syscall.c
> +++ b/kernel/bpf/syscall.c
> @@ -463,7 +463,7 @@ int map_check_no_btf(const struct bpf_map *map,
>  	return -ENOTSUPP;
>  }
>
> -static int map_check_btf(const struct bpf_map *map, const struct btf *btf,
> +static int map_check_btf(struct bpf_map *map, const struct btf *btf,
>  			 u32 btf_key_id, u32 btf_value_id)
>  {
>  	const struct btf_type *key_type, *value_type;
> @@ -478,6 +478,21 @@ static int map_check_btf(const struct bpf_map *map, const struct btf *btf,
>  	if (!value_type || value_size != map->value_size)
>  		return -EINVAL;
>
> +	map->spin_lock_off = btf_find_spin_lock(btf, value_type);
> +
> +	if (map_value_has_spin_lock(map)) {
> +		if (map->map_type != BPF_MAP_TYPE_HASH &&
> +		    map->map_type != BPF_MAP_TYPE_ARRAY)
> +			return -ENOTSUPP;
> +		if (map->spin_lock_off + sizeof(struct bpf_spin_lock) >
> +		    map->value_size) {
> +			WARN_ONCE(1,
> +				  "verifier bug spin_lock_off %d value_size %d\n",
> +				  map->spin_lock_off, map->value_size);
> +			return -EFAULT;
> +		}
> +	}
> +
>  	if (map->ops->map_check_btf)
>  		ret = map->ops->map_check_btf(map, btf, key_type, value_type);
>
> @@ -542,6 +557,8 @@ static int map_create(union bpf_attr *attr)
>  		map->btf = btf;
>  		map->btf_key_type_id = attr->btf_key_type_id;
>  		map->btf_value_type_id = attr->btf_value_type_id;
> +	} else {
> +		map->spin_lock_off = -EINVAL;
>  	}
>
>  	err = security_bpf_map_alloc(map);
> @@ -740,7 +757,7 @@ static int map_lookup_elem(union bpf_attr *attr)
>  			err = -ENOENT;
>  		} else {
>  			err = 0;
> -			memcpy(value, ptr, value_size);
> +			copy_map_value(map, value, ptr);
>  		}
>  		rcu_read_unlock();
>  	}
> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> index 56674a7c3778..0f3d1fb30d7a 100644
> --- a/kernel/bpf/verifier.c
> +++ b/kernel/bpf/verifier.c
> @@ -213,6 +213,7 @@ struct bpf_call_arg_meta {
>  	s64 msize_smax_value;
>  	u64 msize_umax_value;
>  	int ptr_id;
> +	int func_id;
>  };
>
>  static DEFINE_MUTEX(bpf_verifier_lock);
> @@ -351,6 +352,12 @@ static bool reg_is_refcounted(const struct bpf_reg_state *reg)
>  	return type_is_refcounted(reg->type);
>  }
>
> +static bool reg_may_point_to_spin_lock(const struct bpf_reg_state *reg)
> +{
> +	return reg->type == PTR_TO_MAP_VALUE &&
> +		map_value_has_spin_lock(reg->map_ptr);
> +}
> +
>  static bool reg_is_refcounted_or_null(const struct bpf_reg_state *reg)
>  {
>  	return type_is_refcounted_or_null(reg->type);
>  }
> @@ -712,6 +719,7 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state,
>  	}
>  	dst_state->speculative = src->speculative;
>  	dst_state->curframe = src->curframe;
> +	dst_state->active_spin_lock = src->active_spin_lock;
>  	for (i = 0; i <= src->curframe; i++) {
>  		dst = dst_state->frame[i];
>  		if (!dst) {
> @@ -1483,6 +1491,21 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno,
>  	if (err)
>  		verbose(env, "R%d max value is outside of the array range\n",
>  			regno);
> +
> +	if (map_value_has_spin_lock(reg->map_ptr)) {
> +		u32 lock = reg->map_ptr->spin_lock_off;
> +
> +		/* if any part of struct bpf_spin_lock can be touched by
> +		 * load/store reject this program
> +		 */
> +		if ((reg->smin_value + off <= lock &&
> +		     lock < reg->umax_value + off + size) ||
> +		    (reg->smin_value + off < lock + sizeof(struct bpf_spin_lock) &&
> +		     lock + sizeof(struct bpf_spin_lock) <= reg->umax_value + off + size)) {
> +			verbose(env, "bpf_spin_lock cannot be accessed directly by load/store\n");
> +			return -EACCES;
> +		}
> +	}
>  	return err;
>  }
>
> @@ -2192,6 +2215,91 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
>  	}
>  }
>
> +/* Implementation details:
> + * bpf_map_lookup returns PTR_TO_MAP_VALUE_OR_NULL
> + * Two bpf_map_lookups (even with the same key) will have different reg->id.
> + * For traditional PTR_TO_MAP_VALUE the verifier clears reg->id after
> + * value_or_null->value transition, since the verifier only cares about
> + * the range of access to valid map value pointer and doesn't care about actual
> + * address of the map element.
> + * For maps with 'struct bpf_spin_lock' inside map value the verifier keeps
> + * reg->id > 0 after value_or_null->value transition. By doing so
> + * two bpf_map_lookups will be considered two different pointers that
> + * point to different bpf_spin_locks.
> + * The verifier allows taking only one bpf_spin_lock at a time to avoid
> + * dead-locks.
> + * Since only one bpf_spin_lock is allowed the checks are simpler than
> + * reg_is_refcounted() logic. The verifier needs to remember only
> + * one spin_lock instead of array of acquired_refs.
> + * cur_state->active_spin_lock remembers which map value element got locked
> + * and clears it after bpf_spin_unlock.
> + */
> +static int process_spin_lock(struct bpf_verifier_env *env, int regno,
> +			     bool is_lock)
> +{
> +	struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
> +	struct bpf_verifier_state *cur = env->cur_state;
> +	bool is_const = tnum_is_const(reg->var_off);
> +	struct bpf_map *map = reg->map_ptr;
> +	u64 val = reg->var_off.value;
> +
> +	if (reg->type != PTR_TO_MAP_VALUE) {
> +		verbose(env, "R%d is not a pointer to map_value\n", regno);
> +		return -EINVAL;
> +	}
> +	if (!is_const) {
> +		verbose(env,
> +			"R%d doesn't have constant offset. bpf_spin_lock has to be at the constant offset\n",
> +			regno);
> +		return -EINVAL;
> +	}
> +	if (!map->btf) {
> +		verbose(env,
> +			"map '%s' has to have BTF in order to use bpf_spin_lock\n",
> +			map->name);
> +		return -EINVAL;
> +	}
> +	if (!map_value_has_spin_lock(map)) {
> +		if (map->spin_lock_off == -E2BIG)
> +			verbose(env,
> +				"map '%s' has more than one 'struct bpf_spin_lock'\n",
> +				map->name);
> +		else if (map->spin_lock_off == -ENOENT)
> +			verbose(env,
> +				"map '%s' doesn't have 'struct bpf_spin_lock'\n",
> +				map->name);
> +		else
> +			verbose(env,
> +				"map '%s' is not a struct type or bpf_spin_lock is mangled\n",
> +				map->name);
> +		return -EINVAL;
> +	}
> +	if (map->spin_lock_off != val + reg->off) {
> +		verbose(env, "off %lld doesn't point to 'struct bpf_spin_lock'\n",
> +			val + reg->off);
> +		return -EINVAL;
> +	}
> +	if (is_lock) {
> +		if (cur->active_spin_lock) {
> +			verbose(env,
> +				"Locking two bpf_spin_locks are not allowed\n");
> +			return -EINVAL;
> +		}
> +		cur->active_spin_lock = reg->id;
> +	} else {
> +		if (!cur->active_spin_lock) {
> +			verbose(env, "bpf_spin_unlock without taking a lock\n");
> +			return -EINVAL;
> +		}
> +		if (cur->active_spin_lock != reg->id) {
> +			verbose(env, "bpf_spin_unlock of different lock\n");
> +			return -EINVAL;
> +		}
> +		cur->active_spin_lock = 0;
> +	}
> +	return 0;
> +}
> +
>  static bool arg_type_is_mem_ptr(enum bpf_arg_type type)
>  {
>  	return type == ARG_PTR_TO_MEM ||
> @@ -2268,6 +2376,17 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
>  			return -EFAULT;
>  		}
>  		meta->ptr_id = reg->id;
> +	} else if (arg_type == ARG_PTR_TO_SPIN_LOCK) {
> +		if (meta->func_id == BPF_FUNC_spin_lock) {
> +			if (process_spin_lock(env, regno, true))
> +				return -EACCES;
> +		} else if (meta->func_id == BPF_FUNC_spin_unlock) {
> +			if (process_spin_lock(env, regno, false))
> +				return -EACCES;
> +		} else {
> +			verbose(env, "verifier internal error\n");
> +			return -EFAULT;
> +		}
>  	} else if (arg_type_is_mem_ptr(arg_type)) {
>  		expected_type = PTR_TO_STACK;
>  		/* One exception here. In case function allows for NULL to be
> @@ -2887,6 +3006,7 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
>  		return err;
>  	}
>
> +	meta.func_id = func_id;
>  	/* check args */
>  	err = check_func_arg(env, BPF_REG_1, fn->arg1_type, &meta);
>  	if (err)
> @@ -4344,7 +4464,8 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state,
>  	} else if (reg->type == PTR_TO_SOCKET_OR_NULL) {
>  		reg->type = PTR_TO_SOCKET;
>  	}
> -	if (is_null || !reg_is_refcounted(reg)) {
> +	if (is_null || !(reg_is_refcounted(reg) ||
> +			 reg_may_point_to_spin_lock(reg))) {
>  		/* We don't need id from this point onwards anymore,
>  		 * thus we should better reset it, so that state
>  		 * pruning has chances to take effect.
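Just to double check my understanding of keeping reg->id alive here: two lookups of the same element (even with the same key) are then treated as two different locks, so, reusing the hash_map/hash_elem definitions from the sketch earlier in this mail, something like the following should be rejected with "bpf_spin_unlock of different lock", right?

SEC("classifier")
int wrong_unlock(struct __sk_buff *skb)
{
	struct hash_elem *a, *b;
	int key = 0;

	a = bpf_map_lookup_elem(&hash_map, &key);
	b = bpf_map_lookup_elem(&hash_map, &key);
	if (!a || !b)
		return 0;
	bpf_spin_lock(&a->lock);
	a->cnt++;
	/* a and b carry distinct reg->id from the two lookups, so this
	 * unlock does not match cur->active_spin_lock
	 */
	bpf_spin_unlock(&b->lock);
	return 0;
}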
> @@ -5651,6 +5772,9 @@ static bool states_equal(struct bpf_verifier_env *env,
>  	if (old->speculative && !cur->speculative)
>  		return false;
>
> +	if (old->active_spin_lock != cur->active_spin_lock)
> +		return false;
> +
>  	/* for states to be equal callsites have to be the same
>  	 * and all frame states need to be equivalent
>  	 */
> @@ -6068,6 +6192,12 @@ static int do_check(struct bpf_verifier_env *env)
>  					return -EINVAL;
>  				}
>
> +				if (env->cur_state->active_spin_lock &&
> +				    (insn->src_reg == BPF_PSEUDO_CALL ||
> +				     insn->imm != BPF_FUNC_spin_unlock)) {
> +					verbose(env, "function calls are not allowed while holding a lock\n");
> +					return -EINVAL;
> +				}
>  				if (insn->src_reg == BPF_PSEUDO_CALL)
>  					err = check_func_call(env, insn, &env->insn_idx);
>  				else
> @@ -6096,6 +6226,11 @@ static int do_check(struct bpf_verifier_env *env)
>  					return -EINVAL;
>  				}
>
> +				if (env->cur_state->active_spin_lock) {
> +					verbose(env, "bpf_spin_unlock is missing\n");
> +					return -EINVAL;
> +				}
> +
>  				if (state->curframe) {
>  					/* exit from nested function */
>  					env->prev_insn_idx = env->insn_idx;
I think, if I'm not mistaken, there is still a possibility of causing a deadlock: if in the middle of the critical section I use an LD_ABS or LD_IND instruction with an out-of-bounds index, that triggers an implicit return 0 while the lock is still held. At least I don't see this being caught; a test_verifier snippet covering that case would also be good, something along the lines of the sketch below. Wouldn't we also need to mark the queued spinlock functions as notrace, so that one cannot e.g. attach a kprobe to them and cause a deadlock that way?
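Untested, hand-written sketch of the test_verifier case I have in mind; it assumes a new fixup_map_spin_lock-style hook that creates a BTF-described map whose value has the struct bpf_spin_lock at offset 0, and uses SCHED_CLS so that LD_ABS is available:

{
	"spin_lock: LD_ABS inside critical section",
	.insns = {
	BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),	/* LD_ABS expects skb in r6 */
	BPF_ST_MEM(BPF_W, BPF_REG_10, -4, 0),
	BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
	BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4),
	BPF_LD_MAP_FD(BPF_REG_1, 0),
	BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
	BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 2),
	BPF_MOV64_IMM(BPF_REG_0, 0),
	BPF_EXIT_INSN(),
	BPF_MOV64_REG(BPF_REG_7, BPF_REG_0),	/* keep value ptr, lock at offset 0 */
	BPF_MOV64_REG(BPF_REG_1, BPF_REG_7),
	BPF_EMIT_CALL(BPF_FUNC_spin_lock),
	/* oob offset at runtime -> implicit return 0 with the lock still held */
	BPF_LD_ABS(BPF_B, 0x7fff),
	BPF_MOV64_REG(BPF_REG_1, BPF_REG_7),
	BPF_EMIT_CALL(BPF_FUNC_spin_unlock),
	BPF_MOV64_IMM(BPF_REG_0, 0),
	BPF_EXIT_INSN(),
	},
	.fixup_map_spin_lock = { 4 },	/* hypothetical fixup hook, would need adding */
	.result = REJECT,
	.errstr = "...",	/* depends on how the check ends up reporting this */
	.prog_type = BPF_PROG_TYPE_SCHED_CLS,
},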