From: Jozsef Kadlecsik <kad...@netfilter.org>

The patch "netfilter: ipset: fix race condition between swap/destroy
and kernel side add/del/test", commit 28628fa9 fixes a race condition.
But the synchronize_rcu() added to the swap function unnecessarily slows
it down: it can safely be moved to destroy and use call_rcu() instead.

Eric Dumazet pointed out that simply calling the destroy functions as
rcu callback does not work: sets with timeout use garbage collectors
which need cancelling at destroy which can wait. Therefore the destroy
functions are split into two: cancelling garbage collectors safely at
executing the command received by netlink and moving the remaining
part only into the rcu callback.

Link: 
https://lore.kernel.org/lkml/c0829b10-eaa6-4809-874e-e1e9c05a8...@automattic.com/
Fixes: 28628fa952fe ("netfilter: ipset: fix race condition between swap/destroy 
and kernel side add/del/test")
Reported-by: Ale Crismani <ale.crism...@automattic.com>
Reported-by: David Wang <00107...@163.com>
Tested-by: David Wang <00107...@163.com>
Signed-off-by: Jozsef Kadlecsik <kad...@netfilter.org>
Signed-off-by: Pablo Neira Ayuso <pa...@netfilter.org>

https://virtuozzo.atlassian.net/browse/PSBM-155867
(cherry picked from commit 97f7cf1cd80eeed3b7c808b7c12463295c751001)
Signed-off-by: Pavel Tikhomirov <ptikhomi...@virtuozzo.com>
---
 include/linux/netfilter/ipset/ip_set.h  |  4 +++
 net/netfilter/ipset/ip_set_bitmap_gen.h | 14 ++++++++--
 net/netfilter/ipset/ip_set_core.c       | 37 +++++++++++++++++++------
 net/netfilter/ipset/ip_set_hash_gen.h   | 15 ++++++++--
 net/netfilter/ipset/ip_set_list_set.c   | 13 +++++++--
 5 files changed, 65 insertions(+), 18 deletions(-)

diff --git a/include/linux/netfilter/ipset/ip_set.h 
b/include/linux/netfilter/ipset/ip_set.h
index 471363b8862fe..3288022198572 100644
--- a/include/linux/netfilter/ipset/ip_set.h
+++ b/include/linux/netfilter/ipset/ip_set.h
@@ -191,6 +191,8 @@ struct ip_set_type_variant {
        /* Return true if "b" set is the same as "a"
         * according to the create set parameters */
        bool (*same_set)(const struct ip_set *a, const struct ip_set *b);
+       /* Cancel ongoing garbage collectors before destroying the set*/
+       void (*cancel_gc)(struct ip_set *set);
        /* Region-locking is used */
        bool region_lock;
 };
@@ -239,6 +241,8 @@ extern void ip_set_type_unregister(struct ip_set_type 
*set_type);
 
 /* A generic IP set */
 struct ip_set {
+       /* For call_rcu in destroy */
+       struct rcu_head rcu;
        /* The name of the set */
        char name[IPSET_MAXNAMELEN];
        /* Lock protecting the set data */
diff --git a/net/netfilter/ipset/ip_set_bitmap_gen.h 
b/net/netfilter/ipset/ip_set_bitmap_gen.h
index af480ffefaf32..2518258c6e674 100644
--- a/net/netfilter/ipset/ip_set_bitmap_gen.h
+++ b/net/netfilter/ipset/ip_set_bitmap_gen.h
@@ -32,6 +32,7 @@
 #define mtype_del              IPSET_TOKEN(MTYPE, _del)
 #define mtype_list             IPSET_TOKEN(MTYPE, _list)
 #define mtype_gc               IPSET_TOKEN(MTYPE, _gc)
+#define mtype_cancel_gc                IPSET_TOKEN(MTYPE, _cancel_gc)
 #define mtype                  MTYPE
 
 #define get_ext(set, map, id)  ((map)->extensions + ((set)->dsize * (id)))
@@ -61,9 +62,6 @@ mtype_destroy(struct ip_set *set)
 {
        struct mtype *map = set->data;
 
-       if (SET_WITH_TIMEOUT(set))
-               del_timer_sync(&map->gc);
-
        if (set->dsize && set->extensions & IPSET_EXT_DESTROY)
                mtype_ext_cleanup(set);
        ip_set_free(map->members);
@@ -292,6 +290,15 @@ mtype_gc(struct timer_list *t)
        add_timer(&map->gc);
 }
 
+static void
+mtype_cancel_gc(struct ip_set *set)
+{
+       struct mtype *map = set->data;
+
+       if (SET_WITH_TIMEOUT(set))
+               del_timer_sync(&map->gc);
+}
+
 static const struct ip_set_type_variant mtype = {
        .kadt   = mtype_kadt,
        .uadt   = mtype_uadt,
@@ -305,6 +312,7 @@ static const struct ip_set_type_variant mtype = {
        .head   = mtype_head,
        .list   = mtype_list,
        .same_set = mtype_same_set,
+       .cancel_gc = mtype_cancel_gc,
 };
 
 #endif /* __IP_SET_BITMAP_IP_GEN_H */
diff --git a/net/netfilter/ipset/ip_set_core.c 
b/net/netfilter/ipset/ip_set_core.c
index ca30e6495fc74..7cf9da0881d50 100644
--- a/net/netfilter/ipset/ip_set_core.c
+++ b/net/netfilter/ipset/ip_set_core.c
@@ -1038,6 +1038,14 @@ ip_set_destroy_set(struct ip_set *set)
        kfree(set);
 }
 
+static void
+ip_set_destroy_set_rcu(struct rcu_head *head)
+{
+       struct ip_set *set = container_of(head, struct ip_set, rcu);
+
+       ip_set_destroy_set(set);
+}
+
 static int
 ip_set_destroy(struct sock *ctnl, struct sk_buff *skb,
               const struct nlmsghdr *nlh,
@@ -1051,8 +1059,6 @@ ip_set_destroy(struct sock *ctnl, struct sk_buff *skb,
        if (unlikely(protocol_min_failed(attr)))
                return -IPSET_ERR_PROTOCOL;
 
-       /* Must wait for flush to be really finished in list:set */
-       rcu_barrier();
 
        /* Commands are serialized and references are
         * protected by the ip_set_ref_lock.
@@ -1064,8 +1070,10 @@ ip_set_destroy(struct sock *ctnl, struct sk_buff *skb,
         * counter, so if it's already zero, we can proceed
         * without holding the lock.
         */
-       read_lock_bh(&ip_set_ref_lock);
        if (!attr[IPSET_ATTR_SETNAME]) {
+               /* Must wait for flush to be really finished in list:set */
+               rcu_barrier();
+               read_lock_bh(&ip_set_ref_lock);
                for (i = 0; i < inst->ip_set_max; i++) {
                        s = ip_set(inst, i);
                        if (s && (s->ref || s->ref_netlink)) {
@@ -1079,12 +1087,17 @@ ip_set_destroy(struct sock *ctnl, struct sk_buff *skb,
                        s = ip_set(inst, i);
                        if (s) {
                                ip_set(inst, i) = NULL;
+                               /* Must cancel garbage collectors */
+                               s->variant->cancel_gc(s);
                                ip_set_destroy_set(s);
                        }
                }
                /* Modified by ip_set_destroy() only, which is serialized */
                inst->is_destroyed = false;
        } else {
+               u16 features = 0;
+
+               read_lock_bh(&ip_set_ref_lock);
                s = find_set_and_id(inst, nla_data(attr[IPSET_ATTR_SETNAME]),
                                    &i);
                if (!s) {
@@ -1094,10 +1107,16 @@ ip_set_destroy(struct sock *ctnl, struct sk_buff *skb,
                        ret = -IPSET_ERR_BUSY;
                        goto out;
                }
+               features = s->type->features;
                ip_set(inst, i) = NULL;
                read_unlock_bh(&ip_set_ref_lock);
-
-               ip_set_destroy_set(s);
+               if (features & IPSET_TYPE_NAME) {
+                       /* Must wait for flush to be really finished  */
+                       rcu_barrier();
+               }
+               /* Must cancel garbage collectors */
+               s->variant->cancel_gc(s);
+               call_rcu(&s->rcu, ip_set_destroy_set_rcu);
        }
        return 0;
 out:
@@ -1256,9 +1275,6 @@ ip_set_swap(struct sock *ctnl, struct sk_buff *skb,
        ip_set(inst, to_id) = from;
        write_unlock_bh(&ip_set_ref_lock);
 
-       /* Make sure all readers of the old set pointers are completed. */
-       synchronize_rcu();
-
        return 0;
 }
 
@@ -2285,8 +2301,11 @@ ip_set_fini(void)
 {
        nf_unregister_sockopt(&so_set);
        nfnetlink_subsys_unregister(&ip_set_netlink_subsys);
-
        unregister_pernet_subsys(&ip_set_net_ops);
+
+       /* Wait for call_rcu() in destroy */
+       rcu_barrier();
+
        pr_debug("these are the famous last words\n");
 }
 
diff --git a/net/netfilter/ipset/ip_set_hash_gen.h 
b/net/netfilter/ipset/ip_set_hash_gen.h
index 596bb2662b0e9..9c6383592c204 100644
--- a/net/netfilter/ipset/ip_set_hash_gen.h
+++ b/net/netfilter/ipset/ip_set_hash_gen.h
@@ -253,6 +253,7 @@ htable_bits(u32 hashsize)
 #undef mtype_gc_do
 #undef mtype_gc
 #undef mtype_gc_init
+#undef mtype_cancel_gc
 #undef mtype_variant
 #undef mtype_data_match
 
@@ -297,6 +298,7 @@ htable_bits(u32 hashsize)
 #define mtype_gc_do            IPSET_TOKEN(MTYPE, _gc_do)
 #define mtype_gc               IPSET_TOKEN(MTYPE, _gc)
 #define mtype_gc_init          IPSET_TOKEN(MTYPE, _gc_init)
+#define mtype_cancel_gc                IPSET_TOKEN(MTYPE, _cancel_gc)
 #define mtype_variant          IPSET_TOKEN(MTYPE, _variant)
 #define mtype_data_match       IPSET_TOKEN(MTYPE, _data_match)
 
@@ -482,9 +484,6 @@ mtype_destroy(struct ip_set *set)
        struct htype *h = set->data;
        struct list_head *l, *lt;
 
-       if (SET_WITH_TIMEOUT(set))
-               cancel_delayed_work_sync(&h->gc.dwork);
-
        mtype_ahash_destroy(set, ipset_dereference_nfnl(h->table), true);
        list_for_each_safe(l, lt, &h->ad) {
                list_del(l);
@@ -631,6 +630,15 @@ mtype_gc_init(struct htable_gc *gc)
        queue_delayed_work(system_power_efficient_wq, &gc->dwork, HZ);
 }
 
+static void
+mtype_cancel_gc(struct ip_set *set)
+{
+       struct htype *h = set->data;
+
+       if (SET_WITH_TIMEOUT(set))
+               cancel_delayed_work_sync(&h->gc.dwork);
+}
+
 static int
 mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext,
          struct ip_set_ext *mext, u32 flags);
@@ -1447,6 +1455,7 @@ static const struct ip_set_type_variant mtype_variant = {
        .uref   = mtype_uref,
        .resize = mtype_resize,
        .same_set = mtype_same_set,
+       .cancel_gc = mtype_cancel_gc,
        .region_lock = true,
 };
 
diff --git a/net/netfilter/ipset/ip_set_list_set.c 
b/net/netfilter/ipset/ip_set_list_set.c
index 8da228da53ae9..b9dc3063e0d44 100644
--- a/net/netfilter/ipset/ip_set_list_set.c
+++ b/net/netfilter/ipset/ip_set_list_set.c
@@ -430,9 +430,6 @@ list_set_destroy(struct ip_set *set)
        struct list_set *map = set->data;
        struct set_elem *e, *n;
 
-       if (SET_WITH_TIMEOUT(set))
-               del_timer_sync(&map->gc);
-
        list_for_each_entry_safe(e, n, &map->members, list) {
                list_del(&e->list);
                ip_set_put_byindex(map->net, e->id);
@@ -549,6 +546,15 @@ list_set_same_set(const struct ip_set *a, const struct 
ip_set *b)
               a->extensions == b->extensions;
 }
 
+static void
+list_set_cancel_gc(struct ip_set *set)
+{
+       struct list_set *map = set->data;
+
+       if (SET_WITH_TIMEOUT(set))
+               del_timer_sync(&map->gc);
+}
+
 static const struct ip_set_type_variant set_variant = {
        .kadt   = list_set_kadt,
        .uadt   = list_set_uadt,
@@ -562,6 +568,7 @@ static const struct ip_set_type_variant set_variant = {
        .head   = list_set_head,
        .list   = list_set_list,
        .same_set = list_set_same_set,
+       .cancel_gc = list_set_cancel_gc,
 };
 
 static void
-- 
2.46.0

_______________________________________________
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel

Reply via email to