From: Peter Krystad <peter.krys...@linux.intel.com>

Add hooks to tcp_output.c that add the MP_CAPABLE option to the outgoing SYN of a subflow socket and to the final ACK of the three-way handshake.
Use the .sk_rx_dst_set() handler in the subflow proto to capture when the responding SYN-ACK is received and notify the MPTCP connection layer. Signed-off-by: Peter Krystad <peter.krys...@linux.intel.com> --- include/net/mptcp.h | 35 ++++++++++++++++++++++++++++ net/ipv4/tcp_input.c | 3 +++ net/ipv4/tcp_output.c | 29 +++++++++++++++++++++-- net/mptcp/options.c | 45 ++++++++++++++++++++++++++++++++++++ net/mptcp/protocol.c | 53 +++++++++++++++++++++++++++++++------------ net/mptcp/protocol.h | 16 +++++++++++-- net/mptcp/subflow.c | 25 ++++++++++++++++++-- 7 files changed, 185 insertions(+), 21 deletions(-) diff --git a/include/net/mptcp.h b/include/net/mptcp.h index 0d3e02c6c817..81255b0f57d7 100644 --- a/include/net/mptcp.h +++ b/include/net/mptcp.h @@ -14,17 +14,30 @@ #define OPTION_MPTCP_MPC_ACK BIT(2) struct mptcp_out_options { +#if IS_ENABLED(CONFIG_MPTCP) u16 suboptions; u64 sndr_key; u64 rcvr_key; +#endif }; #ifdef CONFIG_MPTCP void mptcp_init(void); +static inline bool sk_is_mptcp(const struct sock *sk) +{ + return tcp_sk(sk)->is_mptcp; +} + void mptcp_parse_option(const unsigned char *ptr, int opsize, struct tcp_options_received *opt_rx); +bool mptcp_syn_options(struct sock *sk, unsigned int *size, + struct mptcp_out_options *opts); +void mptcp_rcv_synsent(struct sock *sk); +bool mptcp_established_options(struct sock *sk, unsigned int *size, + struct mptcp_out_options *opts); + void mptcp_write_options(__be32 *ptr, struct mptcp_out_options *opts); #else @@ -33,10 +46,32 @@ static inline void mptcp_init(void) { } +static inline bool sk_is_mptcp(const struct sock *sk) +{ + return false; +} + static inline void mptcp_parse_option(const unsigned char *ptr, int opsize, struct tcp_options_received *opt_rx) { } +static inline bool mptcp_syn_options(struct sock *sk, unsigned int *size, + struct mptcp_out_options *opts) +{ + return false; +} + +static inline void mptcp_rcv_synsent(struct sock *sk) +{ +} + +static inline bool mptcp_established_options(struct sock 
*sk, + unsigned int *size, + struct mptcp_out_options *opts) +{ + return false; +} + #endif /* CONFIG_MPTCP */ #endif /* __NET_MPTCP_H */ diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 117f0efbbad5..4aa60fe0deca 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5901,6 +5901,9 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); tcp_initialize_rcv_mss(sk); + if (sk_is_mptcp(sk)) + mptcp_rcv_synsent(sk); + /* Remember, tcp_poll() does not lock socket! * Change state from SYN-SENT only after copied_seq * is initialized. */ diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 69c4f39efe8b..f46e58347d73 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -438,9 +438,7 @@ struct tcp_out_options { __u8 *hash_location; /* temporary pointer, overloaded */ __u32 tsval, tsecr; /* need to include OPTION_TS */ struct tcp_fastopen_cookie *fastopen_cookie; /* Fast open cookie */ -#if IS_ENABLED(CONFIG_MPTCP) struct mptcp_out_options mptcp; -#endif }; static void mptcp_options_write(__be32 *ptr, struct tcp_out_options *opts) @@ -665,6 +663,15 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb, smc_set_option(tp, opts, &remaining); + if (sk_is_mptcp(sk)) { + unsigned int size; + + if (mptcp_syn_options(sk, &size, &opts->mptcp)) { + opts->options |= OPTION_MPTCP; + remaining -= size; + } + } + return MAX_TCP_OPTION_SPACE - remaining; } @@ -763,6 +770,24 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb size += TCPOLEN_TSTAMP_ALIGNED; } + /* MPTCP options have precedence over SACK for the limited TCP + * option space because a MPTCP connection would be forced to + * fall back to regular TCP if a required multipath option is + * missing. SACK still gets a chance to use whatever space is + * left. 
+ */ + if (sk_is_mptcp(sk)) { + unsigned int remaining = MAX_TCP_OPTION_SPACE - size; + unsigned int opt_size; + + if (mptcp_established_options(sk, &opt_size, &opts->mptcp)) { + if (remaining >= opt_size) { + opts->options |= OPTION_MPTCP; + size += opt_size; + } + } + } + eff_sacks = tp->rx_opt.num_sacks + tp->rx_opt.dsack; if (unlikely(eff_sacks)) { const unsigned int remaining = MAX_TCP_OPTION_SPACE - size; diff --git a/net/mptcp/options.c b/net/mptcp/options.c index 42626cd0a9f7..071e937d5c1f 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -121,6 +121,51 @@ void mptcp_parse_option(const unsigned char *ptr, int opsize, } } +bool mptcp_syn_options(struct sock *sk, unsigned int *size, + struct mptcp_out_options *opts) +{ + struct subflow_context *subflow = subflow_ctx(sk); + + if (subflow->request_mptcp) { + pr_debug("local_key=%llu", subflow->local_key); + opts->suboptions = OPTION_MPTCP_MPC_SYN; + opts->sndr_key = subflow->local_key; + *size = TCPOLEN_MPTCP_MPC_SYN; + return true; + } + return false; +} + +void mptcp_rcv_synsent(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct subflow_context *subflow = subflow_ctx(sk); + + pr_debug("subflow=%p", subflow); + if (subflow->request_mptcp && tp->rx_opt.mptcp.mp_capable) { + subflow->mp_capable = 1; + subflow->remote_key = tp->rx_opt.mptcp.sndr_key; + } +} + +bool mptcp_established_options(struct sock *sk, unsigned int *size, + struct mptcp_out_options *opts) +{ + struct subflow_context *subflow = subflow_ctx(sk); + + if (subflow->mp_capable && !subflow->fourth_ack) { + opts->suboptions = OPTION_MPTCP_MPC_ACK; + opts->sndr_key = subflow->local_key; + opts->rcvr_key = subflow->remote_key; + *size = TCPOLEN_MPTCP_MPC_ACK; + subflow->fourth_ack = 1; + pr_debug("subflow=%p, local_key=%llu, remote_key=%llu", + subflow, subflow->local_key, subflow->remote_key); + return true; + } + return false; +} + void mptcp_write_options(__be32 *ptr, struct mptcp_out_options *opts) { if 
((OPTION_MPTCP_MPC_SYN | diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index ce2374ea7871..56637e4474da 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -18,9 +18,15 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) { struct mptcp_sock *msk = mptcp_sk(sk); - struct socket *subflow = msk->subflow; - - pr_debug("subflow=%p", subflow_ctx(subflow->sk)); + struct socket *subflow; + + if (msk->connection_list) { + subflow = msk->connection_list; + pr_debug("conn_list->subflow=%p", subflow_ctx(subflow->sk)); + } else { + subflow = msk->subflow; + pr_debug("subflow=%p", subflow_ctx(subflow->sk)); + } return sock_sendmsg(subflow, msg); } @@ -29,9 +35,15 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, int flags, int *addr_len) { struct mptcp_sock *msk = mptcp_sk(sk); - struct socket *subflow = msk->subflow; - - pr_debug("subflow=%p", subflow_ctx(subflow->sk)); + struct socket *subflow; + + if (msk->connection_list) { + subflow = msk->connection_list; + pr_debug("conn_list->subflow=%p", subflow_ctx(subflow->sk)); + } else { + subflow = msk->subflow; + pr_debug("subflow=%p", subflow_ctx(subflow->sk)); + } return sock_recvmsg(subflow, msg, flags); } @@ -56,24 +68,36 @@ static void mptcp_close(struct sock *sk, long timeout) sock_release(msk->subflow); } + if (msk->connection_list) { + pr_debug("conn_list->subflow=%p", msk->connection_list->sk); + sock_release(msk->connection_list); + } + sock_orphan(sk); sock_put(sk); } -static int mptcp_connect(struct sock *sk, struct sockaddr *saddr, int len) +static int mptcp_get_port(struct sock *sk, unsigned short snum) { struct mptcp_sock *msk = mptcp_sk(sk); - int err; - - saddr->sa_family = AF_INET; pr_debug("msk=%p, subflow=%p", msk, subflow_ctx(msk->subflow->sk)); - err = kernel_connect(msk->subflow, saddr, len, 0); + return inet_csk_get_port(msk->subflow->sk, snum); +} - sk->sk_state = TCP_ESTABLISHED; +void mptcp_finish_connect(struct sock 
*sk, int mp_capable) +{ + struct mptcp_sock *msk = mptcp_sk(sk); + struct subflow_context *subflow = subflow_ctx(msk->subflow->sk); - return err; + if (mp_capable) { + msk->remote_key = subflow->remote_key; + msk->local_key = subflow->local_key; + msk->connection_list = msk->subflow; + msk->subflow = NULL; + } + sk->sk_state = TCP_ESTABLISHED; } static struct proto mptcp_prot = { @@ -82,13 +106,12 @@ static struct proto mptcp_prot = { .init = mptcp_init_sock, .close = mptcp_close, .accept = inet_csk_accept, - .connect = mptcp_connect, .shutdown = tcp_shutdown, .sendmsg = mptcp_sendmsg, .recvmsg = mptcp_recvmsg, .hash = inet_hash, .unhash = inet_unhash, - .get_port = inet_csk_get_port, + .get_port = mptcp_get_port, .obj_size = sizeof(struct mptcp_sock), .no_autobind = 1, }; diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index b6adc2aa6222..9206e60ef6d3 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -33,7 +33,10 @@ struct mptcp_sock { /* inet_connection_sock must be the first member */ struct inet_connection_sock sk; - struct socket *subflow; + u64 local_key; + u64 remote_key; + struct socket *connection_list; /* @@ needs to be a list */ + struct socket *subflow; /* outgoing connect, listener or !mp_capable */ }; static inline struct mptcp_sock *mptcp_sk(const struct sock *sk) @@ -43,9 +46,14 @@ static inline struct mptcp_sock *mptcp_sk(const struct sock *sk) /* MPTCP subflow context */ struct subflow_context { + u64 local_key; + u64 remote_key; u32 request_mptcp : 1, /* send MP_CAPABLE */ request_cksum : 1, - version : 4; + mp_capable : 1, /* remote is MPTCP capable */ + fourth_ack : 1, /* send initial DSS */ + version : 4, + conn_finished : 1; struct socket *tcp_sock; /* underlying tcp_sock */ struct sock *conn; /* parent mptcp_sock */ }; @@ -65,4 +73,8 @@ mptcp_subflow_tcp_socket(const struct subflow_context *subflow) void subflow_init(void); +extern const struct inet_connection_sock_af_ops ipv4_specific; + +void 
mptcp_finish_connect(struct sock *sk, int mp_capable); + #endif /* __MPTCP_PROTOCOL_H */ diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 8d13713ee159..91df2c4be339 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -15,6 +15,22 @@ #include <net/mptcp.h> #include "protocol.h" +static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) +{ + struct subflow_context *subflow = subflow_ctx(sk); + + inet_sk_rx_dst_set(sk, skb); + + if (subflow->conn && !subflow->conn_finished) { + pr_debug("subflow=%p, remote_key=%llu", subflow_ctx(sk), + subflow->remote_key); + mptcp_finish_connect(subflow->conn, subflow->mp_capable); + subflow->conn_finished = 1; + } +} + +static struct inet_connection_sock_af_ops subflow_specific; + static struct subflow_context *subflow_create_ctx(struct sock *sk, struct socket *sock) { @@ -36,7 +52,8 @@ static struct subflow_context *subflow_create_ctx(struct sock *sk, static int subflow_ulp_init(struct sock *sk) { - struct tcp_sock *tsk = tcp_sk(sk); + struct tcp_sock *tp = tcp_sk(sk); + struct inet_connection_sock *icsk = inet_csk(sk); struct subflow_context *ctx; int err = 0; @@ -48,7 +65,8 @@ static int subflow_ulp_init(struct sock *sk) pr_debug("subflow=%p", ctx); - tsk->is_mptcp = 1; + tp->is_mptcp = 1; + icsk->icsk_af_ops = &subflow_specific; out: return err; } @@ -71,6 +89,9 @@ static struct tcp_ulp_ops subflow_ulp_ops __read_mostly = { void subflow_init(void) { + subflow_specific = ipv4_specific; + subflow_specific.sk_rx_dst_set = subflow_finish_connect; + if (tcp_register_ulp(&subflow_ulp_ops) != 0) panic("MPTCP: failed to register subflows to ULP\n"); } -- 2.22.0