From: Gyutae Bae <[email protected]> map_delete_elem() now honours BPF_F_COMPARE: it copies the expected bytes, validates the compare window against value_size, and dispatches to a new ->map_delete_elem_cmp(). The hash-map implementation, under the bucket lock, memcmps the [off, off+size) window of the live value and unlinks only on a match (mismatch -EBUSY, absent -ENOENT), making the compare and the delete atomic w.r.t. every bucket-lock holder. Only BPF_MAP_TYPE_HASH wires up the op; every other map type lacks it and returns -EOPNOTSUPP.
Atomicity boundary: the compare is read under the bucket lock, so it is atomic against every other bucket-lock holder (concurrent deletes, element-replacing updates). It is NOT mutual exclusion against a BPF program that writes the value in place via the pointer from bpf_map_lookup_elem(), which takes no bucket lock; against such a writer the compare may observe a torn value within the locked section. For a meaningful result the compared region should be a synchronization variable -- written with a single aligned store such as a monotonic revision, or replaced via an update -- not a multi-field blob mutated in place. It compares the raw stored value, so maps whose value carries BTF-managed fields (bpf_spin_lock, bpf_timer, kptr, ...) are also rejected with -EOPNOTSUPP: those bytes are sanitised on lookup, so a raw compare would never match the caller's snapshot and could expose kernel-internal bytes. BPF_MAP_DELETE_ELEM_LAST_FIELD now reaches compare_size, so the compare* fields pass CHECK_ATTR even without BPF_F_COMPARE. Reject that combination with -EINVAL: a dropped flag must not silently turn a compare-and-delete into an unconditional delete. Co-developed-by: Minsu Jeon <[email protected]> Signed-off-by: Minsu Jeon <[email protected]> Co-developed-by: Siwan Kim <[email protected]> Signed-off-by: Siwan Kim <[email protected]> Co-developed-by: Jonghyeon Kim <[email protected]> Signed-off-by: Jonghyeon Kim <[email protected]> Signed-off-by: Gyutae Bae <[email protected]> --- include/linux/bpf.h | 2 ++ kernel/bpf/hashtab.c | 39 ++++++++++++++++++++++++++++++++ kernel/bpf/syscall.c | 54 +++++++++++++++++++++++++++++++++++++++++--- 3 files changed, 92 insertions(+), 3 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 7719f6528445..2e858272f7a7 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -110,6 +110,8 @@ struct bpf_map_ops { void *(*map_lookup_elem)(struct bpf_map *map, void *key); long (*map_update_elem)(struct bpf_map *map, void *key, void *value, u64 flags); long (*map_delete_elem)(struct bpf_map *map, void *key); + long (*map_delete_elem_cmp)(struct bpf_map *map, void *key, + const void *compare, u32 off, u32 size); long (*map_push_elem)(struct bpf_map *map, void *value, u64 flags); long (*map_pop_elem)(struct bpf_map *map, void *value); long (*map_peek_elem)(struct bpf_map *map, void *value); diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 9f394e1aa2e8..8fc3ab938b9e 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -1534,6 +1534,44 @@ static long htab_map_delete_elem(struct bpf_map *map, void *key) return ret; } +static long htab_map_delete_elem_cmp(struct bpf_map *map, void *key, + const void *compare, u32 off, u32 size) +{ + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + struct hlist_nulls_head *head; + struct bucket *b; + struct htab_elem *l; + unsigned long flags; + u32 hash, key_size; + int ret; + + WARN_ON_ONCE(!bpf_rcu_lock_held()); + + key_size = map->key_size; + + hash = htab_map_hash(key, key_size, htab->hashrnd); + b = __select_bucket(htab, hash); + head = &b->head; + + ret = htab_lock_bucket(b, &flags); + if (ret) + return ret; + + l = lookup_elem_raw(head, hash, key, key_size); + if (!l) + ret = -ENOENT; + else if (memcmp(htab_elem_value(l, key_size) + off, compare, size) != 0) + ret = -EBUSY; + else + hlist_nulls_del_rcu(&l->hash_node); + + htab_unlock_bucket(b, flags); + + if (!ret) + free_htab_elem(htab, l); /* ret==0 implies we unlinked l */ + return ret; +} + static long htab_lru_map_delete_elem(struct bpf_map *map, void *key) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); @@ -2366,6 +2404,7 @@ const struct bpf_map_ops htab_map_ops = { .map_lookup_and_delete_elem = htab_map_lookup_and_delete_elem, .map_update_elem = htab_map_update_elem, .map_delete_elem = htab_map_delete_elem, + .map_delete_elem_cmp = htab_map_delete_elem_cmp, .map_gen_lookup = htab_map_gen_lookup, .map_seq_show_elem = htab_map_seq_show_elem, .map_set_for_each_callback_args = map_set_for_each_callback_args, diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index b44106c8ea75..42fc432f82e1 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1892,18 +1892,30 @@ static int map_update_elem(union bpf_attr *attr, bpfptr_t uattr) return err; } -#define BPF_MAP_DELETE_ELEM_LAST_FIELD key +#define BPF_MAP_DELETE_ELEM_LAST_FIELD compare_size static int map_delete_elem(union bpf_attr *attr, bpfptr_t uattr) { bpfptr_t ukey = make_bpfptr(attr->key, uattr.is_kernel); struct bpf_map *map; - void *key; + void *key, *compare = NULL; + u32 off = 0, size = 0; int err; if (CHECK_ATTR(BPF_MAP_DELETE_ELEM)) return -EINVAL; + if (attr->flags & ~BPF_F_COMPARE) + return -EINVAL; + + /* The compare* fields are meaningful only with BPF_F_COMPARE. Reject them + * when the flag is absent so a dropped BPF_F_COMPARE cannot silently turn a + * compare-and-delete into an unconditional delete. + */ + if (!(attr->flags & BPF_F_COMPARE) && + (attr->compare || attr->compare_offset || attr->compare_size)) + return -EINVAL; + CLASS(fd, f)(attr->map_fd); map = __bpf_map_get(f); if (IS_ERR(map)) @@ -1920,6 +1932,38 @@ static int map_delete_elem(union bpf_attr *attr, bpfptr_t uattr) goto err_put; } + if (attr->flags & BPF_F_COMPARE) { + bpfptr_t ucmp = make_bpfptr(attr->compare, uattr.is_kernel); + + off = attr->compare_offset; + size = attr->compare_size ?: map->value_size; + /* off + size must fit in value_size, overflow-safe */ + if (size > map->value_size || off > map->value_size - size) { + err = -EINVAL; + goto out; + } + if (!map->ops->map_delete_elem_cmp) { + err = -EOPNOTSUPP; + goto out; + } + /* Compare-and-delete reads the raw stored value. Maps whose value carries + * BTF-managed fields (bpf_spin_lock, bpf_timer, kptr, ...) have + * those bytes sanitised on lookup, so a raw compare would never + * match the caller's snapshot and could expose kernel-internal + * bytes. Reject such maps for now. + */ + if (!IS_ERR_OR_NULL(map->record)) { + err = -EOPNOTSUPP; + goto out; + } + compare = kvmemdup_bpfptr(ucmp, size); + if (IS_ERR(compare)) { + err = PTR_ERR(compare); + compare = NULL; + goto out; + } + } + if (bpf_map_is_offloaded(map)) { err = bpf_map_offload_delete_elem(map, key); goto out; @@ -1932,12 +1976,16 @@ static int map_delete_elem(union bpf_attr *attr, bpfptr_t uattr) bpf_disable_instrumentation(); rcu_read_lock(); - err = map->ops->map_delete_elem(map, key); + if (compare) + err = map->ops->map_delete_elem_cmp(map, key, compare, off, size); + else + err = map->ops->map_delete_elem(map, key); rcu_read_unlock(); bpf_enable_instrumentation(); if (!err) maybe_wait_bpf_programs(map); out: + kvfree(compare); kvfree(key); err_put: bpf_map_write_active_dec(map); -- 2.39.5 (Apple Git-154)

