The commit is pushed to "branch-rh9-5.14.vz9.1.x-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh9-5.14.0-4.vz9.10.12
------>
commit ef7de621cc508f32963c85edc6fd5d5df362940f
Author: Konstantin Khorenko <khore...@virtuozzo.com>
Date:   Wed Oct 20 11:39:33 2021 +0300

    ve/netfilter: Implement pernet net->ct.max / virtualize "nf_conntrack_max" sysctl

    Rebasing and splitting the netfilter subsystem (port 66-diff-ve-net-netfilter-combined).
    Part 1.

    https://jira.sw.ru/browse/PSBM-18322

    Signed-off-by: Kirill Tkhai <ktk...@parallels.com>

    (cherry picked from vz7 commit c34a99c00f9d ("ve/netfilter: Implement pernet
    net->ct.max / virtualize "nf_conntrack_max" sysctl"))

    VZ 8 rebase part https://jira.sw.ru/browse/PSBM-127783

    Signed-off-by: Alexander Mikhalitsyn <alexander.mikhalit...@virtuozzo.com>

    +++ ve/nf_conntrack: expose "nf_conntrack_max" in containers

    Series: This series brings to vz7 all the nf_conntrack sysctls which are
    available in vz6.
    https://jira.sw.ru/browse/PSBM-40044

    This sysctl table contains only one entry: "/proc/sys/net/nf_conntrack_max".
    It is now visible inside a CT. Note, however, that
    "/proc/sys/net/netfilter/nf_conntrack_max" and friends (even though they are
    containerized) are behind init_user_ns.

    Signed-off-by: Stanislav Kinsburskiy <skinsbur...@virtuozzo.com>
    Reviewed-by: Kirill Tkhai <ktk...@virtuozzo.com>

    (cherry picked from vz7 commit 9d3a8c692557 ("ve/nf_conntrack: expose
    "nf_conntrack_max" in containers"))

    VZ 8 rebase part https://jira.sw.ru/browse/PSBM-127783

    Signed-off-by: Alexander Mikhalitsyn <alexander.mikhalit...@virtuozzo.com>

    Ported vz8 commit 4c888c1fa5e4 ("ve/netfilter: Implement pernet net->ct.max /
    virtualize "nf_conntrack_max" sysctl")

    The policy of using init_net's value as the upper limit for the value settable
    in other namespaces is not fully reliable:
    - if init_net's value is changed to a lower value, some namespaces can end up
      having a value above the (new) limit,
    - the "zero=unlimited" semantics is not honoured.

    Because of that, the set-time limit has been removed; instead, the value is
    checked at runtime against both init_net's limit and the per-namespace limit.

    Signed-off-by: Nikita Yushchenko <nikita.yushche...@virtuozzo.com>
---
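Not part of the commit itself: below is a minimal, self-contained userspace sketch of the
dual-limit policy described above, where a new conntrack entry is admitted only while the
current count stays within both the per-namespace limit and init_net's limit, and a limit
of 0 means "unlimited". The names in the sketch are illustrative only; the kernel-side
check is the one added to __nf_conntrack_alloc() in the diff below.

/* Standalone sketch (not kernel code) of the runtime check introduced by this
 * patch: a new conntrack entry is allowed only if the resulting count stays
 * within BOTH the per-namespace limit and init_net's limit, where a limit of 0
 * means "unlimited". All helper names here are made up for illustration.
 */
#include <assert.h>
#include <stdbool.h>
#include <stdio.h>

/* Is a post-increment count of 'ct_count' over the limit 'max'?
 * max == 0 means the limit is disabled and can never be exceeded. */
static bool over_limit(unsigned int ct_count, unsigned int max)
{
        return max && ct_count > max;
}

/* Admission decision mirroring the condition in __nf_conntrack_alloc(). */
static bool ct_alloc_allowed(unsigned int ct_count,
                             unsigned int ns_max, unsigned int init_net_max)
{
        return !over_limit(ct_count, ns_max) &&
               !over_limit(ct_count, init_net_max);
}

int main(void)
{
        /* Per-ns limit above init_net's: init_net's limit still applies. */
        assert(!ct_alloc_allowed(70000, 100000, 65536));
        /* Per-ns limit below init_net's: the per-ns limit applies. */
        assert(!ct_alloc_allowed(2000, 1024, 65536));
        /* Zero means unlimited on either side. */
        assert(ct_alloc_allowed(1000000, 0, 0));
        assert(ct_alloc_allowed(70000, 0, 100000));

        printf("dual-limit policy sketch: all checks passed\n");
        return 0;
}

With this check, lowering init_net's nf_conntrack_max takes effect for every namespace at
once, while a container that writes 0 to its own nf_conntrack_max only lifts its private
limit, not the host-wide one.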
 include/net/netfilter/nf_conntrack.h    |  4 +-
 net/netfilter/nf_conntrack_core.c       | 35 ++++++++++++----
 net/netfilter/nf_conntrack_netlink.c    |  9 +++--
 net/netfilter/nf_conntrack_standalone.c | 72 +++++++++++++++++++++------------
 4 files changed, 81 insertions(+), 39 deletions(-)

diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h
index 42dd967fdfbb..81983ac7e28a 100644
--- a/include/net/netfilter/nf_conntrack.h
+++ b/include/net/netfilter/nf_conntrack.h
@@ -46,6 +46,7 @@ union nf_conntrack_expect_proto {
 struct nf_conntrack_net {
         /* only used when new connection is allocated: */
         atomic_t count;
+        unsigned int max;
         unsigned int expect_count;
         unsigned int expect_max;
         u8 sysctl_auto_assign_helper;
@@ -57,6 +58,7 @@ struct nf_conntrack_net {
         unsigned int users_bridge;
 #ifdef CONFIG_SYSCTL
         struct ctl_table_header *sysctl_header;
+        struct ctl_table_header *parent_sysctl_header;
 #endif
 #ifdef CONFIG_NF_CONNTRACK_EVENTS
         struct delayed_work ecache_dwork;
@@ -314,7 +316,6 @@ int nf_conntrack_hash_resize(unsigned int hashsize);
 extern struct hlist_nulls_head *nf_conntrack_hash;
 extern unsigned int nf_conntrack_htable_size;
 extern seqcount_spinlock_t nf_conntrack_generation;
-extern unsigned int nf_conntrack_max;
 
 /* must be called with rcu read lock held */
 static inline void
@@ -340,6 +341,7 @@ void nf_ct_tmpl_free(struct nf_conn *tmpl);
 
 u32 nf_ct_get_id(const struct nf_conn *ct);
 u32 nf_conntrack_count(const struct net *net);
+u32 nf_conntrack_max(const struct net *net);
 
 static inline void
 nf_ct_set(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info info)
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 68209532f0be..8dc77131f2bc 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -183,11 +183,11 @@ static void nf_conntrack_all_unlock(void)
 unsigned int nf_conntrack_htable_size __read_mostly;
 EXPORT_SYMBOL_GPL(nf_conntrack_htable_size);
 
-unsigned int nf_conntrack_max __read_mostly;
-EXPORT_SYMBOL_GPL(nf_conntrack_max);
 seqcount_spinlock_t nf_conntrack_generation __read_mostly;
 static unsigned int nf_conntrack_hash_rnd __read_mostly;
 
+static unsigned int initial_nf_conntrack_max __ro_after_init;
+
 static u32 hash_conntrack_raw(const struct nf_conntrack_tuple *tuple,
                               const struct net *net)
 {
@@ -1361,14 +1361,15 @@ static bool gc_worker_can_early_drop(const struct nf_conn *ct)
 static void gc_worker(struct work_struct *work)
 {
         unsigned long end_time = jiffies + GC_SCAN_MAX_DURATION;
-        unsigned int i, hashsz, nf_conntrack_max95 = 0;
+        unsigned int i, hashsz, init_nf_conntrack_max95 = 0;
         unsigned long next_run = GC_SCAN_INTERVAL;
         struct conntrack_gc_work *gc_work;
         gc_work = container_of(work, struct conntrack_gc_work, dwork.work);
 
         i = gc_work->next_bucket;
         if (gc_work->early_drop)
-                nf_conntrack_max95 = nf_conntrack_max / 100u * 95u;
+                init_nf_conntrack_max95 =
+                        nf_ct_pernet(&init_net)->max / 100u * 95u;
 
         do {
                 struct nf_conntrack_tuple_hash *h;
@@ -1387,6 +1388,8 @@ static void gc_worker(struct work_struct *work)
                 hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[i], hnnode) {
                         struct nf_conntrack_net *cnet;
                         struct net *net;
+                        unsigned int nf_conntrack_max95 = 0;
+                        unsigned int ct_count;
 
                         tmp = nf_ct_tuplehash_to_ctrack(h);
 
@@ -1400,12 +1403,21 @@ static void gc_worker(struct work_struct *work)
                                 continue;
                         }
 
-                        if (nf_conntrack_max95 == 0 || gc_worker_skip_ct(tmp))
+                        if (gc_worker_skip_ct(tmp))
                                 continue;
 
                         net = nf_ct_net(tmp);
                         cnet = nf_ct_pernet(net);
-                        if (atomic_read(&cnet->count) < nf_conntrack_max95)
+                        if (gc_work->early_drop)
+                                nf_conntrack_max95 = cnet->max / 100u * 95u;
+
+                        /* skip if cnet->count is small enough againt both
+                         * global and per-ns limit */
+                        ct_count = atomic_read(&cnet->count);
+                        if ((nf_conntrack_max95 == 0 ||
+                             ct_count < nf_conntrack_max95) &&
+                            (init_nf_conntrack_max95 == 0 ||
+                             ct_count < init_nf_conntrack_max95))
                                 continue;
 
                         /* need to take reference to avoid possible races */
@@ -1469,13 +1481,15 @@ __nf_conntrack_alloc(struct net *net,
                      gfp_t gfp, u32 hash)
 {
         struct nf_conntrack_net *cnet = nf_ct_pernet(net);
+        struct nf_conntrack_net *init_cnet = nf_ct_pernet(&init_net);
         unsigned int ct_count;
         struct nf_conn *ct;
 
         /* We don't want any race condition at early drop stage */
         ct_count = atomic_inc_return(&cnet->count);
 
-        if (nf_conntrack_max && unlikely(ct_count > nf_conntrack_max)) {
+        if ((cnet->max && unlikely(ct_count > cnet->max)) ||
+            (init_cnet->max && unlikely(ct_count > init_cnet->max))) {
                 if (!early_drop(net, hash)) {
                         if (!conntrack_gc_work.early_drop)
                                 conntrack_gc_work.early_drop = true;
@@ -2624,7 +2638,7 @@ int nf_conntrack_init_start(void)
         if (!nf_conntrack_hash)
                 return -ENOMEM;
 
-        nf_conntrack_max = max_factor * nf_conntrack_htable_size;
+        initial_nf_conntrack_max = max_factor * nf_conntrack_htable_size;
 
         nf_conntrack_cachep = kmem_cache_create("nf_conntrack",
                                                 sizeof(struct nf_conn),
@@ -2726,6 +2740,11 @@ int nf_conntrack_init_net(struct net *net)
         BUILD_BUG_ON_NOT_POWER_OF_2(CONNTRACK_LOCKS);
         atomic_set(&cnet->count, 0);
 
+        if (net == &init_net)
+                cnet->max = initial_nf_conntrack_max;
+        else
+                cnet->max = nf_ct_pernet(&init_net)->max;
+
         net->ct.pcpu_lists = alloc_percpu(struct ct_pcpu);
         if (!net->ct.pcpu_lists)
                 goto err_stat;
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index e81af33b233b..b741d9ef5aa9 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -2542,7 +2542,7 @@ ctnetlink_stat_ct_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type,
                             struct net *net)
 {
         unsigned int flags = portid ? NLM_F_MULTI : 0, event;
-        unsigned int nr_conntracks;
+        unsigned int conntrack_count, conntrack_max;
         struct nlmsghdr *nlh;
 
         event = nfnl_msg_type(NFNL_SUBSYS_CTNETLINK, IPCTNL_MSG_CT_GET_STATS);
@@ -2551,11 +2551,12 @@ ctnetlink_stat_ct_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type,
         if (!nlh)
                 goto nlmsg_failure;
 
-        nr_conntracks = nf_conntrack_count(net);
-        if (nla_put_be32(skb, CTA_STATS_GLOBAL_ENTRIES, htonl(nr_conntracks)))
+        conntrack_count = nf_conntrack_count(net);
+        if (nla_put_be32(skb, CTA_STATS_GLOBAL_ENTRIES, htonl(conntrack_count)))
                 goto nla_put_failure;
 
-        if (nla_put_be32(skb, CTA_STATS_GLOBAL_MAX_ENTRIES, htonl(nf_conntrack_max)))
+        conntrack_max = nf_conntrack_max(net);
+        if (nla_put_be32(skb, CTA_STATS_GLOBAL_MAX_ENTRIES, htonl(conntrack_max)))
                 goto nla_put_failure;
 
         nlmsg_end(skb, nlh);
diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c
index 9340a3c993f0..7085d1a94298 100644
--- a/net/netfilter/nf_conntrack_standalone.c
+++ b/net/netfilter/nf_conntrack_standalone.c
@@ -518,6 +518,14 @@ u32 nf_conntrack_count(const struct net *net)
 }
 EXPORT_SYMBOL_GPL(nf_conntrack_count);
 
+u32 nf_conntrack_max(const struct net *net)
+{
+        const struct nf_conntrack_net *cnet = nf_ct_pernet(net);
+
+        return cnet->max;
+}
+EXPORT_SYMBOL_GPL(nf_conntrack_max);
+
 /* Sysctl support */
 
 #ifdef CONFIG_SYSCTL
@@ -545,8 +553,6 @@ nf_conntrack_hash_sysctl(struct ctl_table *table, int write,
         return ret;
 }
 
-static struct ctl_table_header *nf_ct_netfilter_header;
-
 enum nf_ct_sysctl_index {
         NF_SYSCTL_CT_MAX,
         NF_SYSCTL_CT_COUNT,
@@ -621,7 +627,6 @@ enum nf_ct_sysctl_index {
 static struct ctl_table nf_ct_sysctl_table[] = {
         [NF_SYSCTL_CT_MAX] = {
                 .procname = "nf_conntrack_max",
-                .data = &nf_conntrack_max,
                 .maxlen = sizeof(int),
                 .mode = 0644,
                 .proc_handler = proc_dointvec,
@@ -961,10 +966,16 @@ static struct ctl_table nf_ct_sysctl_table[] = {
         {}
 };
 
-static struct ctl_table nf_ct_netfilter_table[] = {
-        {
+enum nf_ct_parent_sysctl_index {
+        NF_PARENT_SYSCTL_CT_MAX,
+        __NF_PARENT_SYSCTL_CT_LAST_SYSCTL,
+};
+
+#define NF_PARENT_SYSCTL_CT_LAST_SYSCTL (__NF_PARENT_SYSCTL_CT_LAST_SYSCTL + 1)
+
+static struct ctl_table nf_ct_parent_sysctl_table[] = {
+        [NF_PARENT_SYSCTL_CT_MAX] = {
                 .procname = "nf_conntrack_max",
-                .data = &nf_conntrack_max,
                 .maxlen = sizeof(int),
                 .mode = 0644,
                 .proc_handler = proc_dointvec,
@@ -1068,7 +1079,7 @@ static int nf_conntrack_standalone_init_sysctl(struct net *net)
 {
         struct nf_conntrack_net *cnet = nf_ct_pernet(net);
         struct nf_udp_net *un = nf_udp_pernet(net);
-        struct ctl_table *table;
+        struct ctl_table *table, *parent_table;
 
         BUILD_BUG_ON(ARRAY_SIZE(nf_ct_sysctl_table) != NF_SYSCTL_CT_LAST_SYSCTL);
 
@@ -1077,6 +1088,8 @@ static int nf_conntrack_standalone_init_sysctl(struct net *net)
         if (!table)
                 return -ENOMEM;
 
+        table[NF_SYSCTL_CT_MAX].data = &cnet->max;
+
         table[NF_SYSCTL_CT_COUNT].data = &cnet->count;
         table[NF_SYSCTL_CT_CHECKSUM].data = &net->ct.sysctl_checksum;
         table[NF_SYSCTL_CT_LOG_INVALID].data = &net->ct.sysctl_log_invalid;
@@ -1105,17 +1118,35 @@ static int nf_conntrack_standalone_init_sysctl(struct net *net)
 
         /* Don't allow non-init_net ns to alter global sysctls */
         if (!net_eq(&init_net, net)) {
-                table[NF_SYSCTL_CT_MAX].mode = 0444;
                 table[NF_SYSCTL_CT_BUCKETS].mode = 0444;
         }
 
         cnet->sysctl_header = register_net_sysctl(net, "net/netfilter", table);
         if (!cnet->sysctl_header)
-                goto out_unregister_netfilter;
+                goto out_free;
+
+        BUILD_BUG_ON(ARRAY_SIZE(nf_ct_parent_sysctl_table) !=
+                     NF_PARENT_SYSCTL_CT_LAST_SYSCTL);
+
+        parent_table = kmemdup(nf_ct_parent_sysctl_table,
+                               sizeof(nf_ct_parent_sysctl_table), GFP_KERNEL);
+        if (!parent_table)
+                goto out_unregister;
+
+        parent_table[NF_PARENT_SYSCTL_CT_MAX].data = &cnet->max;
+
+        cnet->parent_sysctl_header = register_net_sysctl(net, "net",
+                                                         parent_table);
+        if (!cnet->parent_sysctl_header)
+                goto out_free_parent;
 
         return 0;
 
-out_unregister_netfilter:
+out_free_parent:
+        kfree(parent_table);
+out_unregister:
+        unregister_net_sysctl_table(cnet->sysctl_header);
+out_free:
         kfree(table);
         return -ENOMEM;
 }
@@ -1123,7 +1154,11 @@ static int nf_conntrack_standalone_init_sysctl(struct net *net)
 static void nf_conntrack_standalone_fini_sysctl(struct net *net)
 {
         struct nf_conntrack_net *cnet = nf_ct_pernet(net);
-        struct ctl_table *table;
+        struct ctl_table *table, *parent_table;
+
+        parent_table = cnet->parent_sysctl_header->ctl_table_arg;
+        unregister_net_sysctl_table(cnet->parent_sysctl_header);
+        kfree(parent_table);
         table = cnet->sysctl_header->ctl_table_arg;
         unregister_net_sysctl_table(cnet->sysctl_header);
         kfree(table);
@@ -1210,14 +1245,6 @@ static int __init nf_conntrack_standalone_init(void)
         BUILD_BUG_ON(NFCT_INFOMASK <= IP_CT_NUMBER);
 
 #ifdef CONFIG_SYSCTL
-        nf_ct_netfilter_header =
-                register_net_sysctl(&init_net, "net", nf_ct_netfilter_table);
-        if (!nf_ct_netfilter_header) {
-                pr_err("nf_conntrack: can't register to sysctl.\n");
-                ret = -ENOMEM;
-                goto out_sysctl;
-        }
-
         nf_conntrack_htable_size_user = nf_conntrack_htable_size;
 #endif
 
@@ -1229,10 +1256,6 @@ static int __init nf_conntrack_standalone_init(void)
         return 0;
 out_pernet:
-#ifdef CONFIG_SYSCTL
-        unregister_net_sysctl_table(nf_ct_netfilter_header);
-out_sysctl:
-#endif
         nf_conntrack_cleanup_end();
 out_start:
         return ret;
 }
@@ -1242,9 +1265,6 @@ static void __exit nf_conntrack_standalone_fini(void)
 {
         nf_conntrack_cleanup_start();
         unregister_pernet_subsys(&nf_conntrack_net_ops);
-#ifdef CONFIG_SYSCTL
-        unregister_net_sysctl_table(nf_ct_netfilter_header);
-#endif
         nf_conntrack_cleanup_end();
 }

_______________________________________________
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel