The commit is pushed to "branch-rh7-3.10.0-1160.119.1.vz7.224.x-ovz" and will appear at g...@bitbucket.org:openvz/vzkernel.git after rh7-3.10.0-1160.119.1.vz7.224.3 ------> commit ae4bea1d495a6601691bf52d3b02ec1dadb331e4 Author: Jozsef Kadlecsik <kad...@netfilter.org> Date: Wed Sep 25 17:35:45 2024 +0800
ms/netfilter: ipset: Fix race between namespace cleanup and gc in the list:set type Lion Ackermann reported that there is a race condition between namespace cleanup in ipset and the garbage collection of the list:set type. The namespace cleanup can destroy the list:set type of sets while the gc of the set type is waiting to run in rcu cleanup. The latter uses data from the destroyed set, which thus leads to a use-after-free. The patch contains the following parts: - When destroying all sets, first remove the garbage collectors, then wait if needed and then destroy the sets. - Fix the badly ordered "wait then remove gc" for the destroy-a-single-set case. - Fix the missing rcu locking in the list:set type in the userspace test case. - Use proper RCU list handling in the list:set type. The patch depends on c1193d9bbbd3 (netfilter: ipset: Add list flush to cancel_gc). Fixes: 97f7cf1cd80e (netfilter: ipset: fix performance regression in swap operation) Reported-by: Lion Ackermann <nnam...@gmail.com> Tested-by: Lion Ackermann <nnam...@gmail.com> Signed-off-by: Jozsef Kadlecsik <kad...@netfilter.org> Signed-off-by: Pablo Neira Ayuso <pa...@netfilter.org> https://virtuozzo.atlassian.net/browse/PSBM-155867 (cherry picked from commit 4e7aaa6b82d63e8ddcbfb56b4fd3d014ca586f10) Signed-off-by: Pavel Tikhomirov <ptikhomi...@virtuozzo.com> ================= Patchset description: netfilter: ipset: Fix possible cause of memory corruption Patch [1] fixes a possible race between swap/destroy and add/del/test. 
Here is a possible order of events in which this race can lead to a double free with kfree_rcu on an already freed hash bucket: # Thread 1 +-> ip_set_add +-> set = ip_set_rcu_get(xt_net(par), index) < pause > # Thread 2 ipset swap ipset destroy +-> mtype_destroy +-> mtype_ahash_destroy +-> n = __ipset_dereference(hbucket(t, i)) +-> kfree(n) # Thread 1 < unpause > +-> ip_set_lock(set) +-> hash_net4_kadt +-> mtype_variant->adt[adt] +-> net4_kadt_add (mtype_add) +-> n = rcu_dereference_bh(hbucket(t, key)) +-> if (n->pos >= n->size) +-> old = n +-> if (old != ERR_PTR(-ENOENT)) +-> kfree_rcu(old, rcu) That can in its turn lead to possible rcu free list corruption if this double-freed memory is reused just after rcu_free and before the actual rcu callback. note1: The patch [1] has a reproducer but, sadly, I was unable to reproduce the situation, even when adding mdelay(100) to ip_set_add/del/test. note2: All other patches are fixups to the original fixing patch. note3: We don't have proof that this is a fix for the original issue; we only know that ipset hash buckets were seen double freed several times just before the crash, which might indirectly indicate that the original problem is related to ipset hash buckets. 
Signed-off-by: Pavel Tikhomirov <ptikhomi...@virtuozzo.com> https://virtuozzo.atlassian.net/browse/PSBM-155867 Alexander Maltsev (1): netfilter: ipset: Add list flush to cancel_gc Eric Dumazet (1): netns: add pre_exit method to struct pernet_operations Jozsef Kadlecsik (5): netfilter: ipset: fix race condition between swap/destroy and kernel side add/del/test [1] netfilter: ipset: fix performance regression in swap operation netfilter: ipset: Missing gc cancellations fixed netfilter: ipset: Fix race between namespace cleanup and gc in the list:set type netfilter: ipset: Fix suspicious rcu_dereference_protected() --- net/netfilter/ipset/ip_set_core.c | 81 ++++++++++++++++++++--------------- net/netfilter/ipset/ip_set_list_set.c | 30 ++++++------- 2 files changed, 60 insertions(+), 51 deletions(-) diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c index 41fb34167a15..61d457961708 100644 --- a/net/netfilter/ipset/ip_set_core.c +++ b/net/netfilter/ipset/ip_set_core.c @@ -1028,23 +1028,50 @@ ip_set_setname_policy[IPSET_ATTR_CMD_MAX + 1] = { .len = IPSET_MAXNAMELEN - 1 }, }; +/* In order to return quickly when destroying a single set, it is split + * into two stages: + * - Cancel garbage collector + * - Destroy the set itself via call_rcu() + */ + static void -ip_set_destroy_set(struct ip_set *set) +ip_set_destroy_set_rcu(struct rcu_head *head) { - pr_debug("set: %s\n", set->name); + struct ip_set *set = container_of(head, struct ip_set, rcu); - /* Must call it without holding any lock */ set->variant->destroy(set); module_put(set->type->me); kfree(set); } static void -ip_set_destroy_set_rcu(struct rcu_head *head) +_destroy_all_sets(struct ip_set_net *inst) { - struct ip_set *set = container_of(head, struct ip_set, rcu); + struct ip_set *set; + ip_set_id_t i; + bool need_wait = false; - ip_set_destroy_set(set); + /* First cancel gc's: set:list sets are flushed as well */ + for (i = 0; i < inst->ip_set_max; i++) { + set = ip_set(inst, i); + 
if (set) { + set->variant->cancel_gc(set); + if (set->type->features & IPSET_TYPE_NAME) + need_wait = true; + } + } + /* Must wait for flush to be really finished */ + if (need_wait) + rcu_barrier(); + for (i = 0; i < inst->ip_set_max; i++) { + set = ip_set(inst, i); + if (set) { + ip_set(inst, i) = NULL; + set->variant->destroy(set); + module_put(set->type->me); + kfree(set); + } + } } static int @@ -1060,11 +1087,10 @@ ip_set_destroy(struct sock *ctnl, struct sk_buff *skb, if (unlikely(protocol_min_failed(attr))) return -IPSET_ERR_PROTOCOL; - /* Commands are serialized and references are * protected by the ip_set_ref_lock. * External systems (i.e. xt_set) must call - * ip_set_put|get_nfnl_* functions, that way we + * ip_set_nfnl_get_* functions, that way we * can safely check references here. * * list:set timer can only decrement the reference @@ -1072,8 +1098,6 @@ ip_set_destroy(struct sock *ctnl, struct sk_buff *skb, * without holding the lock. */ if (!attr[IPSET_ATTR_SETNAME]) { - /* Must wait for flush to be really finished in list:set */ - rcu_barrier(); read_lock_bh(&ip_set_ref_lock); for (i = 0; i < inst->ip_set_max; i++) { s = ip_set(inst, i); @@ -1084,15 +1108,7 @@ ip_set_destroy(struct sock *ctnl, struct sk_buff *skb, } inst->is_destroyed = true; read_unlock_bh(&ip_set_ref_lock); - for (i = 0; i < inst->ip_set_max; i++) { - s = ip_set(inst, i); - if (s) { - ip_set(inst, i) = NULL; - /* Must cancel garbage collectors */ - s->variant->cancel_gc(s); - ip_set_destroy_set(s); - } - } + _destroy_all_sets(inst); /* Modified by ip_set_destroy() only, which is serialized */ inst->is_destroyed = false; } else { @@ -1111,12 +1127,12 @@ ip_set_destroy(struct sock *ctnl, struct sk_buff *skb, features = s->type->features; ip_set(inst, i) = NULL; read_unlock_bh(&ip_set_ref_lock); + /* Must cancel garbage collectors */ + s->variant->cancel_gc(s); if (features & IPSET_TYPE_NAME) { /* Must wait for flush to be really finished */ rcu_barrier(); } - /* Must cancel garbage 
collectors */ - s->variant->cancel_gc(s); call_rcu(&s->rcu, ip_set_destroy_set_rcu); } return 0; @@ -2240,30 +2256,25 @@ ip_set_net_init(struct net *net) } static void __net_exit -ip_set_net_exit(struct net *net) +ip_set_net_pre_exit(struct net *net) { struct ip_set_net *inst = ip_set_pernet(net); - struct ip_set *set = NULL; - ip_set_id_t i; - inst->is_deleted = true; /* flag for ip_set_nfnl_put */ +} - nfnl_lock(NFNL_SUBSYS_IPSET); - for (i = 0; i < inst->ip_set_max; i++) { - set = ip_set(inst, i); - if (set) { - ip_set(inst, i) = NULL; - set->variant->cancel_gc(set); - ip_set_destroy_set(set); - } - } - nfnl_unlock(NFNL_SUBSYS_IPSET); +static void __net_exit +ip_set_net_exit(struct net *net) +{ + struct ip_set_net *inst = ip_set_pernet(net); + + _destroy_all_sets(inst); kvfree(rcu_dereference_protected(inst->ip_set_list, 1)); } static struct pernet_operations ip_set_net_ops = { .init = ip_set_net_init, + .pre_exit = ip_set_net_pre_exit, .exit = ip_set_net_exit, .id = &ip_set_net_id, .size = sizeof(struct ip_set_net) diff --git a/net/netfilter/ipset/ip_set_list_set.c b/net/netfilter/ipset/ip_set_list_set.c index dd0fa9ada492..845c9fd2b22f 100644 --- a/net/netfilter/ipset/ip_set_list_set.c +++ b/net/netfilter/ipset/ip_set_list_set.c @@ -83,7 +83,7 @@ list_set_kadd(struct ip_set *set, const struct sk_buff *skb, struct set_elem *e; int ret; - list_for_each_entry(e, &map->members, list) { + list_for_each_entry_rcu(e, &map->members, list) { if (SET_WITH_TIMEOUT(set) && ip_set_timeout_expired(ext_timeout(e, set))) continue; @@ -103,7 +103,7 @@ list_set_kdel(struct ip_set *set, const struct sk_buff *skb, struct set_elem *e; int ret; - list_for_each_entry(e, &map->members, list) { + list_for_each_entry_rcu(e, &map->members, list) { if (SET_WITH_TIMEOUT(set) && ip_set_timeout_expired(ext_timeout(e, set))) continue; @@ -192,9 +192,10 @@ list_set_utest(struct ip_set *set, void *value, const struct ip_set_ext *ext, struct list_set *map = set->data; struct set_adt_elem *d = 
value; struct set_elem *e, *next, *prev = NULL; - int ret; + int ret = 0; - list_for_each_entry(e, &map->members, list) { + rcu_read_lock(); + list_for_each_entry_rcu(e, &map->members, list) { if (SET_WITH_TIMEOUT(set) && ip_set_timeout_expired(ext_timeout(e, set))) continue; @@ -205,6 +206,7 @@ list_set_utest(struct ip_set *set, void *value, const struct ip_set_ext *ext, if (d->before == 0) { ret = 1; + goto out; } else if (d->before > 0) { next = list_next_entry(e, list); ret = !list_is_last(&e->list, &map->members) && @@ -212,9 +214,11 @@ list_set_utest(struct ip_set *set, void *value, const struct ip_set_ext *ext, } else { ret = prev && prev->id == d->refid; } - return ret; + goto out; } - return 0; +out: + rcu_read_unlock(); + return ret; } static void @@ -243,7 +247,7 @@ list_set_uadd(struct ip_set *set, void *value, const struct ip_set_ext *ext, /* Find where to add the new entry */ n = prev = next = NULL; - list_for_each_entry(e, &map->members, list) { + list_for_each_entry_rcu(e, &map->members, list) { if (SET_WITH_TIMEOUT(set) && ip_set_timeout_expired(ext_timeout(e, set))) continue; @@ -320,9 +324,9 @@ list_set_udel(struct ip_set *set, void *value, const struct ip_set_ext *ext, { struct list_set *map = set->data; struct set_adt_elem *d = value; - struct set_elem *e, *next, *prev = NULL; + struct set_elem *e, *n, *next, *prev = NULL; - list_for_each_entry(e, &map->members, list) { + list_for_each_entry_safe(e, n, &map->members, list) { if (SET_WITH_TIMEOUT(set) && ip_set_timeout_expired(ext_timeout(e, set))) continue; @@ -428,14 +432,8 @@ static void list_set_destroy(struct ip_set *set) { struct list_set *map = set->data; - struct set_elem *e, *n; - list_for_each_entry_safe(e, n, &map->members, list) { - list_del(&e->list); - ip_set_put_byindex(map->net, e->id); - ip_set_ext_destroy(set, e); - kfree(e); - } + WARN_ON_ONCE(!list_empty(&map->members)); kfree(map); set->data = NULL; _______________________________________________ Devel mailing list 
Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel