Register pernet subsys init/stop functions that will set up and tear down per-net RDS-TCP listen endpoints. Unregister pernet subsys functions on 'modprobe -r' to clean up these endpoints.
Enable keepalive on both accept and connect socket endpoints. The keepalive timer expiration will ensure that cleanup_net() will eventually complete, allowing the pernet ->exit to be invoked. Signed-off-by: Sowmini Varadhan <sowmini.varad...@oracle.com> --- net/rds/tcp.c | 112 ++++++++++++++++++++++++++++++++++++++++++------ net/rds/tcp.h | 7 ++- net/rds/tcp_connect.c | 6 ++- net/rds/tcp_listen.c | 38 ++++------------- 4 files changed, 115 insertions(+), 48 deletions(-) diff --git a/net/rds/tcp.c b/net/rds/tcp.c index 98f5de3..fadf1a1 100644 --- a/net/rds/tcp.c +++ b/net/rds/tcp.c @@ -35,6 +35,8 @@ #include <linux/in.h> #include <linux/module.h> #include <net/tcp.h> +#include <net/net_namespace.h> +#include <net/netns/generic.h> #include "rds.h" #include "tcp.h" @@ -250,16 +252,32 @@ static void rds_tcp_destroy_conns(void) } } -static void rds_tcp_exit(void) +static void rds_tcp_destroy_conns_for_net(struct net *net) { - rds_info_deregister_func(RDS_INFO_TCP_SOCKETS, rds_tcp_tc_info); - rds_tcp_listen_stop(); - rds_tcp_destroy_conns(); - rds_trans_unregister(&rds_tcp_transport); - rds_tcp_recv_exit(); - kmem_cache_destroy(rds_tcp_conn_slab); + struct rds_tcp_connection *tc, *_tc; + struct list_head tmp_list; + + BUG_ON(!net); + INIT_LIST_HEAD(&tmp_list); + /* avoid calling conn_destroy with irqs off */ + spin_lock_irq(&rds_tcp_conn_lock); + list_for_each_entry_safe(tc, _tc, &rds_tcp_conn_list, t_tcp_node) { + struct net *c_net = read_pnet(&tc->conn->c_net); + + if (net == c_net) { + list_del(&tc->t_tcp_node); + list_add_tail(&tc->t_tcp_node, &tmp_list); + } + } + spin_unlock_irq(&rds_tcp_conn_lock); + list_for_each_entry_safe(tc, _tc, &tmp_list, t_tcp_node) { + if (tc->conn->c_passive) + rds_conn_destroy(tc->conn->c_passive); + rds_conn_destroy(tc->conn); + } } -module_exit(rds_tcp_exit); + +static void rds_tcp_exit(void); struct rds_transport rds_tcp_transport = { .laddr_check = rds_tcp_laddr_check, @@ -281,6 +299,73 @@ struct rds_transport rds_tcp_transport = { 
.t_prefer_loopback = 1, }; +static int rds_tcp_netid; + +/* per-network namespace private data for this module */ +struct rds_tcp_net { + struct socket *rds_tcp_listen_sock; + struct work_struct rds_tcp_accept_w; +}; + +static void rds_tcp_accept_worker(struct work_struct *work) +{ + struct rds_tcp_net *rtn = container_of(work, + struct rds_tcp_net, + rds_tcp_accept_w); + + while (rds_tcp_accept_one(rtn->rds_tcp_listen_sock) == 0) + cond_resched(); +} + +void rds_tcp_accept_work(struct sock *sk) +{ + struct net *net = sock_net(sk); + struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid); + + queue_work(rds_wq, &rtn->rds_tcp_accept_w); +} + +static __net_init int rds_tcp_init_net(struct net *net) +{ + struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid); + + rtn->rds_tcp_listen_sock = rds_tcp_listen_init(net); + if (!rtn->rds_tcp_listen_sock) { + pr_warn("could not set up listen sock\n"); + return -EAFNOSUPPORT; + } + INIT_WORK(&rtn->rds_tcp_accept_w, rds_tcp_accept_worker); + return 0; +} + +static void __net_exit rds_tcp_exit_net(struct net *net) +{ + struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid); + + rds_tcp_listen_stop(rtn->rds_tcp_listen_sock); + rtn->rds_tcp_listen_sock = NULL; + flush_work(&rtn->rds_tcp_accept_w); + rds_tcp_destroy_conns_for_net(net); +} + +static struct pernet_operations rds_tcp_net_ops = { + .init = rds_tcp_init_net, + .exit = rds_tcp_exit_net, + .id = &rds_tcp_netid, + .size = sizeof(struct rds_tcp_net), +}; + +static void rds_tcp_exit(void) +{ + rds_info_deregister_func(RDS_INFO_TCP_SOCKETS, rds_tcp_tc_info); + unregister_pernet_subsys(&rds_tcp_net_ops); + rds_tcp_destroy_conns(); + rds_trans_unregister(&rds_tcp_transport); + rds_tcp_recv_exit(); + kmem_cache_destroy(rds_tcp_conn_slab); +} +module_exit(rds_tcp_exit); + static int rds_tcp_init(void) { int ret; @@ -293,6 +378,10 @@ static int rds_tcp_init(void) goto out; } + ret = register_pernet_subsys(&rds_tcp_net_ops); + if (ret) + goto out_slab; + ret = 
rds_tcp_recv_init(); if (ret) goto out_slab; @@ -301,19 +390,14 @@ static int rds_tcp_init(void) if (ret) goto out_recv; - ret = rds_tcp_listen_init(); - if (ret) - goto out_register; - rds_info_register_func(RDS_INFO_TCP_SOCKETS, rds_tcp_tc_info); goto out; -out_register: - rds_trans_unregister(&rds_tcp_transport); out_recv: rds_tcp_recv_exit(); out_slab: + unregister_pernet_subsys(&rds_tcp_net_ops); kmem_cache_destroy(rds_tcp_conn_slab); out: return ret; diff --git a/net/rds/tcp.h b/net/rds/tcp.h index 0dbdd37..64f873c 100644 --- a/net/rds/tcp.h +++ b/net/rds/tcp.h @@ -52,6 +52,7 @@ u32 rds_tcp_snd_nxt(struct rds_tcp_connection *tc); u32 rds_tcp_snd_una(struct rds_tcp_connection *tc); u64 rds_tcp_map_seq(struct rds_tcp_connection *tc, u32 seq); extern struct rds_transport rds_tcp_transport; +void rds_tcp_accept_work(struct sock *sk); /* tcp_connect.c */ int rds_tcp_conn_connect(struct rds_connection *conn); @@ -59,9 +60,11 @@ void rds_tcp_conn_shutdown(struct rds_connection *conn); void rds_tcp_state_change(struct sock *sk); /* tcp_listen.c */ -int rds_tcp_listen_init(void); -void rds_tcp_listen_stop(void); +struct socket *rds_tcp_listen_init(struct net *); +void rds_tcp_listen_stop(struct socket *); void rds_tcp_listen_data_ready(struct sock *sk); +int rds_tcp_accept_one(struct socket *sock); +int rds_tcp_keepalive(struct socket *sock); /* tcp_recv.c */ int rds_tcp_recv_init(void); diff --git a/net/rds/tcp_connect.c b/net/rds/tcp_connect.c index 54a4609..a1d948e 100644 --- a/net/rds/tcp_connect.c +++ b/net/rds/tcp_connect.c @@ -112,10 +112,12 @@ int rds_tcp_conn_connect(struct rds_connection *conn) rdsdebug("connect to address %pI4 returned %d\n", &conn->c_faddr, ret); if (ret == -EINPROGRESS) ret = 0; - if (ret == 0) + if (ret == 0) { + rds_tcp_keepalive(sock); sock = NULL; - else + } else { rds_tcp_restore_callbacks(sock, conn->c_transport_data); + } out: if (sock) diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c index 398ffe5..444d78d 100644 --- 
a/net/rds/tcp_listen.c +++ b/net/rds/tcp_listen.c @@ -38,14 +38,7 @@ #include "rds.h" #include "tcp.h" -/* - * cheesy, but simple.. - */ -static void rds_tcp_accept_worker(struct work_struct *work); -static DECLARE_WORK(rds_tcp_listen_work, rds_tcp_accept_worker); -static struct socket *rds_tcp_listen_sock; - -static int rds_tcp_keepalive(struct socket *sock) +int rds_tcp_keepalive(struct socket *sock) { /* values below based on xs_udp_default_timeout */ int keepidle = 5; /* send a probe 'keepidle' secs after last data */ @@ -77,7 +70,7 @@ static int rds_tcp_keepalive(struct socket *sock) return ret; } -static int rds_tcp_accept_one(struct socket *sock) +int rds_tcp_accept_one(struct socket *sock) { struct socket *new_sock = NULL; struct rds_connection *conn; @@ -150,12 +143,6 @@ static int rds_tcp_accept_one(struct socket *sock) return ret; } -static void rds_tcp_accept_worker(struct work_struct *work) -{ - while (rds_tcp_accept_one(rds_tcp_listen_sock) == 0) - cond_resched(); -} - void rds_tcp_listen_data_ready(struct sock *sk) { void (*ready)(struct sock *sk); @@ -176,26 +163,20 @@ void rds_tcp_listen_data_ready(struct sock *sk) * socket */ if (sk->sk_state == TCP_LISTEN) - queue_work(rds_wq, &rds_tcp_listen_work); + rds_tcp_accept_work(sk); out: read_unlock(&sk->sk_callback_lock); ready(sk); } -int rds_tcp_listen_init(void) +struct socket *rds_tcp_listen_init(struct net *net) { struct sockaddr_in sin; struct socket *sock = NULL; int ret; - /* MUST call sock_create_kern directly so that we avoid get_net() - * in sk_alloc(). Doing a get_net() will result in cleanup_net() - * never getting invoked, which will leave sock and other things - * in limbo. 
- */ - ret = sock_create_kern(current->nsproxy->net_ns, PF_INET, - SOCK_STREAM, IPPROTO_TCP, &sock); + ret = sock_create_kern(net, PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock); if (ret < 0) goto out; @@ -219,17 +200,15 @@ int rds_tcp_listen_init(void) if (ret < 0) goto out; - rds_tcp_listen_sock = sock; - sock = NULL; + return sock; out: if (sock) sock_release(sock); - return ret; + return NULL; } -void rds_tcp_listen_stop(void) +void rds_tcp_listen_stop(struct socket *sock) { - struct socket *sock = rds_tcp_listen_sock; struct sock *sk; if (!sock) @@ -250,5 +229,4 @@ void rds_tcp_listen_stop(void) /* wait for accepts to stop and close the socket */ flush_workqueue(rds_wq); sock_release(sock); - rds_tcp_listen_sock = NULL; } -- 1.7.1 -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html