From: Gyutae Bae <[email protected]>

map_delete_elem() now honours BPF_F_COMPARE: it copies the expected
bytes, validates the compare window against value_size, and dispatches to a
new ->map_delete_elem_cmp().  The hash-map implementation, under the bucket
lock, memcmps the [off, off+size) window of the live value and unlinks only
on a match (mismatch -EBUSY, absent -ENOENT), making the compare and the
delete atomic w.r.t. every bucket-lock holder.  Only BPF_MAP_TYPE_HASH
wires up the op; every other map type lacks it and returns -EOPNOTSUPP.

Atomicity boundary: the compare is read under the bucket lock, so it is
atomic against every other bucket-lock holder (concurrent deletes,
element-replacing updates).  It is NOT mutual exclusion against a BPF
program that writes the value in place via the pointer from
bpf_map_lookup_elem(), which takes no bucket lock; against such a writer
the compare may observe a torn value within the locked section.  For a
meaningful result the compared region should be a synchronization variable
-- written with a single aligned store such as a monotonic revision, or
replaced via an update -- not a multi-field blob mutated in place.

It compares the raw stored value, so maps whose value carries BTF-managed
fields (bpf_spin_lock, bpf_timer, kptr, ...) are also rejected with
-EOPNOTSUPP: those bytes are sanitised on lookup, so a raw compare would
never match the caller's snapshot and could expose kernel-internal bytes.

BPF_MAP_DELETE_ELEM_LAST_FIELD now reaches compare_size, so the compare*
fields pass CHECK_ATTR even without BPF_F_COMPARE.  Reject that combination
with -EINVAL: a dropped flag must not silently turn a compare-and-delete
into an unconditional delete.

Co-developed-by: Minsu Jeon <[email protected]>
Signed-off-by: Minsu Jeon <[email protected]>
Co-developed-by: Siwan Kim <[email protected]>
Signed-off-by: Siwan Kim <[email protected]>
Co-developed-by: Jonghyeon Kim <[email protected]>
Signed-off-by: Jonghyeon Kim <[email protected]>
Signed-off-by: Gyutae Bae <[email protected]>
---
 include/linux/bpf.h  |  2 ++
 kernel/bpf/hashtab.c | 39 ++++++++++++++++++++++++++++++++
 kernel/bpf/syscall.c | 54 +++++++++++++++++++++++++++++++++++++++++---
 3 files changed, 92 insertions(+), 3 deletions(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 7719f6528445..2e858272f7a7 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -110,6 +110,8 @@ struct bpf_map_ops {
        void *(*map_lookup_elem)(struct bpf_map *map, void *key);
        long (*map_update_elem)(struct bpf_map *map, void *key, void *value, 
u64 flags);
        long (*map_delete_elem)(struct bpf_map *map, void *key);
+       long (*map_delete_elem_cmp)(struct bpf_map *map, void *key,
+                                   const void *compare, u32 off, u32 size);
        long (*map_push_elem)(struct bpf_map *map, void *value, u64 flags);
        long (*map_pop_elem)(struct bpf_map *map, void *value);
        long (*map_peek_elem)(struct bpf_map *map, void *value);
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 9f394e1aa2e8..8fc3ab938b9e 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -1534,6 +1534,44 @@ static long htab_map_delete_elem(struct bpf_map *map, 
void *key)
        return ret;
 }
 
+static long htab_map_delete_elem_cmp(struct bpf_map *map, void *key,
+                                    const void *compare, u32 off, u32 size)
+{
+       struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+       struct hlist_nulls_head *head;
+       struct bucket *b;
+       struct htab_elem *l;
+       unsigned long flags;
+       u32 hash, key_size;
+       int ret;
+
+       WARN_ON_ONCE(!bpf_rcu_lock_held());
+
+       key_size = map->key_size;
+
+       hash = htab_map_hash(key, key_size, htab->hashrnd);
+       b = __select_bucket(htab, hash);
+       head = &b->head;
+
+       ret = htab_lock_bucket(b, &flags);
+       if (ret)
+               return ret;
+
+       l = lookup_elem_raw(head, hash, key, key_size);
+       if (!l)
+               ret = -ENOENT;
+       else if (memcmp(htab_elem_value(l, key_size) + off, compare, size) != 0)
+               ret = -EBUSY;
+       else
+               hlist_nulls_del_rcu(&l->hash_node);
+
+       htab_unlock_bucket(b, flags);
+
+       if (!ret)
+               free_htab_elem(htab, l);        /* ret==0 implies we unlinked l 
*/
+       return ret;
+}
+
 static long htab_lru_map_delete_elem(struct bpf_map *map, void *key)
 {
        struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
@@ -2366,6 +2404,7 @@ const struct bpf_map_ops htab_map_ops = {
        .map_lookup_and_delete_elem = htab_map_lookup_and_delete_elem,
        .map_update_elem = htab_map_update_elem,
        .map_delete_elem = htab_map_delete_elem,
+       .map_delete_elem_cmp = htab_map_delete_elem_cmp,
        .map_gen_lookup = htab_map_gen_lookup,
        .map_seq_show_elem = htab_map_seq_show_elem,
        .map_set_for_each_callback_args = map_set_for_each_callback_args,
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index b44106c8ea75..42fc432f82e1 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1892,18 +1892,30 @@ static int map_update_elem(union bpf_attr *attr, 
bpfptr_t uattr)
        return err;
 }
 
-#define BPF_MAP_DELETE_ELEM_LAST_FIELD key
+#define BPF_MAP_DELETE_ELEM_LAST_FIELD compare_size
 
 static int map_delete_elem(union bpf_attr *attr, bpfptr_t uattr)
 {
        bpfptr_t ukey = make_bpfptr(attr->key, uattr.is_kernel);
        struct bpf_map *map;
-       void *key;
+       void *key, *compare = NULL;
+       u32 off = 0, size = 0;
        int err;
 
        if (CHECK_ATTR(BPF_MAP_DELETE_ELEM))
                return -EINVAL;
 
+       if (attr->flags & ~BPF_F_COMPARE)
+               return -EINVAL;
+
+       /* The compare* fields are meaningful only with BPF_F_COMPARE.  Reject 
them
+        * when the flag is absent so a dropped BPF_F_COMPARE cannot silently 
turn a
+        * compare-and-delete into an unconditional delete.
+        */
+       if (!(attr->flags & BPF_F_COMPARE) &&
+           (attr->compare || attr->compare_offset || attr->compare_size))
+               return -EINVAL;
+
        CLASS(fd, f)(attr->map_fd);
        map = __bpf_map_get(f);
        if (IS_ERR(map))
@@ -1920,6 +1932,38 @@ static int map_delete_elem(union bpf_attr *attr, 
bpfptr_t uattr)
                goto err_put;
        }
 
+       if (attr->flags & BPF_F_COMPARE) {
+               bpfptr_t ucmp = make_bpfptr(attr->compare, uattr.is_kernel);
+
+               off = attr->compare_offset;
+               size = attr->compare_size ?: map->value_size;
+               /* off + size must fit in value_size, overflow-safe */
+               if (size > map->value_size || off > map->value_size - size) {
+                       err = -EINVAL;
+                       goto out;
+               }
+               if (!map->ops->map_delete_elem_cmp) {
+                       err = -EOPNOTSUPP;
+                       goto out;
+               }
+               /* Compare-and-delete reads the raw stored value.  Maps whose 
value carries
+                * BTF-managed fields (bpf_spin_lock, bpf_timer, kptr, ...) have
+                * those bytes sanitised on lookup, so a raw compare would never
+                * match the caller's snapshot and could expose kernel-internal
+                * bytes.  Reject such maps for now.
+                */
+               if (!IS_ERR_OR_NULL(map->record)) {
+                       err = -EOPNOTSUPP;
+                       goto out;
+               }
+               compare = kvmemdup_bpfptr(ucmp, size);
+               if (IS_ERR(compare)) {
+                       err = PTR_ERR(compare);
+                       compare = NULL;
+                       goto out;
+               }
+       }
+
        if (bpf_map_is_offloaded(map)) {
                err = bpf_map_offload_delete_elem(map, key);
                goto out;
@@ -1932,12 +1976,16 @@ static int map_delete_elem(union bpf_attr *attr, 
bpfptr_t uattr)
 
        bpf_disable_instrumentation();
        rcu_read_lock();
-       err = map->ops->map_delete_elem(map, key);
+       if (compare)
+               err = map->ops->map_delete_elem_cmp(map, key, compare, off, 
size);
+       else
+               err = map->ops->map_delete_elem(map, key);
        rcu_read_unlock();
        bpf_enable_instrumentation();
        if (!err)
                maybe_wait_bpf_programs(map);
 out:
+       kvfree(compare);
        kvfree(key);
 err_put:
        bpf_map_write_active_dec(map);
-- 
2.39.5 (Apple Git-154)


Reply via email to