Make the default TCP congestion control a per-namespace
value. The congestion control setting of new namespaces is inherited
from the root namespace. Modules are only autoloaded in the root namespace.

Signed-off-by: Stephen Hemminger <step...@networkplumber.org>
---
 include/net/netns/ipv4.h   |  1 +
 include/net/tcp.h          |  6 ++--
 net/ipv4/fib_semantics.c   |  4 +--
 net/ipv4/sysctl_net_ipv4.c | 19 ++++++-----
 net/ipv4/tcp_cong.c        | 78 ++++++++++++++++++++++------------------------
 net/ipv4/tcp_ipv4.c        |  9 ++++++
 net/ipv6/route.c           |  3 +-
 7 files changed, 65 insertions(+), 55 deletions(-)

diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index 379550f8124a..23ddfcfc8afe 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -155,6 +155,7 @@ struct netns_ipv4 {
        int sysctl_tcp_invalid_ratelimit;
        int sysctl_tcp_pacing_ss_ratio;
        int sysctl_tcp_pacing_ca_ratio;
+       const struct tcp_congestion_ops __rcu  *tcp_congestion_control;
        struct inet_timewait_death_row tcp_death_row;
        int sysctl_max_syn_backlog;
        int sysctl_tcp_fastopen;
diff --git a/include/net/tcp.h b/include/net/tcp.h
index babfd4da1515..64d4099d41da 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1005,8 +1005,8 @@ void tcp_unregister_congestion_control(struct tcp_congestion_ops *type);
 void tcp_assign_congestion_control(struct sock *sk);
 void tcp_init_congestion_control(struct sock *sk);
 void tcp_cleanup_congestion_control(struct sock *sk);
-int tcp_set_default_congestion_control(const char *name);
-void tcp_get_default_congestion_control(char *name);
+int tcp_set_default_congestion_control(struct net *net, const char *name);
+void tcp_get_default_congestion_control(struct net *net, char *name);
 void tcp_get_available_congestion_control(char *buf, size_t len);
 void tcp_get_allowed_congestion_control(char *buf, size_t len);
 int tcp_set_allowed_congestion_control(char *allowed);
@@ -1020,7 +1020,7 @@ void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked);
 extern struct tcp_congestion_ops tcp_reno;
 
 struct tcp_congestion_ops *tcp_ca_find_key(u32 key);
-u32 tcp_ca_get_key_by_name(const char *name, bool *ecn_ca);
+u32 tcp_ca_get_key_by_name(struct net *net, const char *name, bool *ecn_ca);
 #ifdef CONFIG_INET
 char *tcp_ca_get_name_by_key(u32 key, char *buffer);
 #else
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 589caaa90613..f04d944f8abe 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -710,7 +710,7 @@ bool fib_metrics_match(struct fib_config *cfg, struct fib_info *fi)
                        bool ecn_ca = false;
 
                        nla_strlcpy(tmp, nla, sizeof(tmp));
-                       val = tcp_ca_get_key_by_name(tmp, &ecn_ca);
+                       val = tcp_ca_get_key_by_name(fi->fib_net, tmp, &ecn_ca);
                } else {
                        val = nla_get_u32(nla);
                }
@@ -1030,7 +1030,7 @@ fib_convert_metrics(struct fib_info *fi, const struct fib_config *cfg)
                        char tmp[TCP_CA_NAME_MAX];
 
                        nla_strlcpy(tmp, nla, sizeof(tmp));
-                       val = tcp_ca_get_key_by_name(tmp, &ecn_ca);
+                       val = tcp_ca_get_key_by_name(fi->fib_net, tmp, &ecn_ca);
                        if (val == TCP_CA_UNSPEC)
                                return -EINVAL;
                } else {
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index a82b44038308..c97d9e614017 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -201,6 +201,8 @@ static int ipv4_ping_group_range(struct ctl_table *table, int write,
 static int proc_tcp_congestion_control(struct ctl_table *ctl, int write,
                                        void __user *buffer, size_t *lenp, loff_t *ppos)
 {
+       struct net *net = container_of(ctl->data, struct net,
+                                      ipv4.tcp_congestion_control);
        char val[TCP_CA_NAME_MAX];
        struct ctl_table tbl = {
                .data = val,
@@ -208,11 +210,11 @@ static int proc_tcp_congestion_control(struct ctl_table *ctl, int write,
        };
        int ret;
 
-       tcp_get_default_congestion_control(val);
+       tcp_get_default_congestion_control(net, val);
 
        ret = proc_dostring(&tbl, write, buffer, lenp, ppos);
        if (write && ret == 0)
-               ret = tcp_set_default_congestion_control(val);
+               ret = tcp_set_default_congestion_control(net, val);
        return ret;
 }
 
@@ -463,12 +465,6 @@ static struct ctl_table ipv4_table[] = {
                .mode           = 0644,
                .proc_handler   = proc_dointvec
        },
-       {
-               .procname       = "tcp_congestion_control",
-               .mode           = 0644,
-               .maxlen         = TCP_CA_NAME_MAX,
-               .proc_handler   = proc_tcp_congestion_control,
-       },
 #ifdef CONFIG_NETLABEL
        {
                .procname       = "cipso_cache_enable",
@@ -780,6 +776,13 @@ static struct ctl_table ipv4_net_table[] = {
        },
 #endif
        {
+               .procname       = "tcp_congestion_control",
+               .data           = &init_net.ipv4.tcp_congestion_control,
+               .mode           = 0644,
+               .maxlen         = TCP_CA_NAME_MAX,
+               .proc_handler   = proc_tcp_congestion_control,
+       },
+       {
                .procname       = "tcp_keepalive_time",
                .data           = &init_net.ipv4.sysctl_tcp_keepalive_time,
                .maxlen         = sizeof(int),
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
index 2f26124fd160..8c54cf1083a6 100644
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -33,11 +33,13 @@ static struct tcp_congestion_ops *tcp_ca_find(const char *name)
 }
 
 /* Must be called with rcu lock held */
-static const struct tcp_congestion_ops *__tcp_ca_find_autoload(const char *name)
+static struct tcp_congestion_ops *tcp_ca_find_autoload(struct net *net,
+                                                      const char *name)
 {
-       const struct tcp_congestion_ops *ca = tcp_ca_find(name);
+       struct tcp_congestion_ops *ca = tcp_ca_find(name);
+
 #ifdef CONFIG_MODULES
-       if (!ca && capable(CAP_NET_ADMIN)) {
+       if (!ca && net == &init_net && capable(CAP_NET_ADMIN)) {
                rcu_read_unlock();
                request_module("tcp_%s", name);
                rcu_read_lock();
@@ -115,7 +117,7 @@ void tcp_unregister_congestion_control(struct tcp_congestion_ops *ca)
 }
 EXPORT_SYMBOL_GPL(tcp_unregister_congestion_control);
 
-u32 tcp_ca_get_key_by_name(const char *name, bool *ecn_ca)
+u32 tcp_ca_get_key_by_name(struct net *net, const char *name, bool *ecn_ca)
 {
        const struct tcp_congestion_ops *ca;
        u32 key = TCP_CA_UNSPEC;
@@ -123,7 +125,7 @@ u32 tcp_ca_get_key_by_name(const char *name, bool *ecn_ca)
        might_sleep();
 
        rcu_read_lock();
-       ca = __tcp_ca_find_autoload(name);
+       ca = tcp_ca_find_autoload(net, name);
        if (ca) {
                key = ca->key;
                *ecn_ca = ca->flags & TCP_CONG_NEEDS_ECN;
@@ -153,23 +155,18 @@ EXPORT_SYMBOL_GPL(tcp_ca_get_name_by_key);
 /* Assign choice of congestion control. */
 void tcp_assign_congestion_control(struct sock *sk)
 {
+       struct net *net = sock_net(sk);
        struct inet_connection_sock *icsk = inet_csk(sk);
-       struct tcp_congestion_ops *ca;
+       const struct tcp_congestion_ops *ca;
 
        rcu_read_lock();
-       list_for_each_entry_rcu(ca, &tcp_cong_list, list) {
-               if (likely(try_module_get(ca->owner))) {
-                       icsk->icsk_ca_ops = ca;
-                       goto out;
-               }
-               /* Fallback to next available. The last really
-                * guaranteed fallback is Reno from this list.
-                */
-       }
-out:
+       ca = rcu_dereference(net->ipv4.tcp_congestion_control);
+       if (unlikely(!try_module_get(ca->owner)))
+               ca = &tcp_reno;
+       icsk->icsk_ca_ops = ca;
        rcu_read_unlock();
-       memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv));
 
+       memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv));
        if (ca->flags & TCP_CONG_NEEDS_ECN)
                INET_ECN_xmit(sk);
        else
@@ -214,29 +211,27 @@ void tcp_cleanup_congestion_control(struct sock *sk)
 }
 
 /* Used by sysctl to change default congestion control */
-int tcp_set_default_congestion_control(const char *name)
+int tcp_set_default_congestion_control(struct net *net, const char *name)
 {
        struct tcp_congestion_ops *ca;
-       int ret = -ENOENT;
+       const struct tcp_congestion_ops *prev;
+       int ret;
 
-       spin_lock(&tcp_cong_list_lock);
-       ca = tcp_ca_find(name);
-#ifdef CONFIG_MODULES
-       if (!ca && capable(CAP_NET_ADMIN)) {
-               spin_unlock(&tcp_cong_list_lock);
-
-               request_module("tcp_%s", name);
-               spin_lock(&tcp_cong_list_lock);
-               ca = tcp_ca_find(name);
-       }
-#endif
+       rcu_read_lock();
+       ca = tcp_ca_find_autoload(net, name);
+       if (!ca) {
+               ret = -ENOENT;
+       } else if (!try_module_get(ca->owner)) {
+               ret = -EBUSY;
+       } else {
+               prev = xchg(&net->ipv4.tcp_congestion_control, ca);
+               if (prev)
+                       module_put(prev->owner);
 
-       if (ca) {
-               ca->flags |= TCP_CONG_NON_RESTRICTED;   /* default is always allowed */
-               list_move(&ca->list, &tcp_cong_list);
+               ca->flags |= TCP_CONG_NON_RESTRICTED;
                ret = 0;
        }
-       spin_unlock(&tcp_cong_list_lock);
+       rcu_read_unlock();
 
        return ret;
 }
@@ -244,7 +239,8 @@ int tcp_set_default_congestion_control(const char *name)
 /* Set default value from kernel configuration at bootup */
 static int __init tcp_congestion_default(void)
 {
-       return tcp_set_default_congestion_control(CONFIG_DEFAULT_TCP_CONG);
+       return tcp_set_default_congestion_control(&init_net,
+                                                 CONFIG_DEFAULT_TCP_CONG);
 }
 late_initcall(tcp_congestion_default);
 
@@ -264,14 +260,12 @@ void tcp_get_available_congestion_control(char *buf, size_t maxlen)
 }
 
 /* Get current default congestion control */
-void tcp_get_default_congestion_control(char *name)
+void tcp_get_default_congestion_control(struct net *net, char *name)
 {
-       struct tcp_congestion_ops *ca;
-       /* We will always have reno... */
-       BUG_ON(list_empty(&tcp_cong_list));
+       const struct tcp_congestion_ops *ca;
 
        rcu_read_lock();
-       ca = list_entry(tcp_cong_list.next, struct tcp_congestion_ops, list);
+       ca = rcu_dereference(net->ipv4.tcp_congestion_control);
        strncpy(name, ca->name, TCP_CA_NAME_MAX);
        rcu_read_unlock();
 }
@@ -351,12 +345,14 @@ int tcp_set_congestion_control(struct sock *sk, const char *name, bool load, boo
        if (!load)
                ca = tcp_ca_find(name);
        else
-               ca = __tcp_ca_find_autoload(name);
+               ca = tcp_ca_find_autoload(sock_net(sk), name);
+
        /* No change asking for existing value */
        if (ca == icsk->icsk_ca_ops) {
                icsk->icsk_ca_setsockopt = 1;
                goto out;
        }
+
        if (!ca) {
                err = -ENOENT;
        } else if (!load) {
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 0162c577bb9c..d5361c0d007c 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2430,6 +2430,8 @@ static void __net_exit tcp_sk_exit(struct net *net)
 {
        int cpu;
 
+       module_put(net->ipv4.tcp_congestion_control->owner);
+
        for_each_possible_cpu(cpu)
                inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
        free_percpu(net->ipv4.tcp_sk);
@@ -2515,6 +2517,13 @@ static int __net_init tcp_sk_init(struct net *net)
        net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60;
        atomic_set(&net->ipv4.tfo_active_disable_times, 0);
 
+       /* Reno is always built in */
+       if (!net_eq(net, &init_net) &&
+           try_module_get(init_net.ipv4.tcp_congestion_control->owner))
+               net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
+       else
+               net->ipv4.tcp_congestion_control = &tcp_reno;
+
        return 0;
 fail:
        tcp_sk_exit(net);
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 70d9659fc1e9..05eb7bc36156 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -2378,6 +2378,7 @@ static int ip6_dst_gc(struct dst_ops *ops)
 static int ip6_convert_metrics(struct mx6_config *mxc,
                               const struct fib6_config *cfg)
 {
+       struct net *net = cfg->fc_nlinfo.nl_net;
        bool ecn_ca = false;
        struct nlattr *nla;
        int remaining;
@@ -2403,7 +2404,7 @@ static int ip6_convert_metrics(struct mx6_config *mxc,
                        char tmp[TCP_CA_NAME_MAX];
 
                        nla_strlcpy(tmp, nla, sizeof(tmp));
-                       val = tcp_ca_get_key_by_name(tmp, &ecn_ca);
+                       val = tcp_ca_get_key_by_name(net, tmp, &ecn_ca);
                        if (val == TCP_CA_UNSPEC)
                                goto err;
                } else {
-- 
2.11.0

Reply via email to