Adopt the extra-option framework for SMC.
This allows us to remove the SMC code from the TCP stack entirely.

The static key is gone, as this is now covered by the static key of the
extra-option framework.

We allocate state (struct tcp_smc_opt) that indicates whether SMC was
successfully negotiated or not and check this state in the relevant
functions.

Cc: Ursula Braun <ubr...@linux.vnet.ibm.com>
Signed-off-by: Christoph Paasch <cpaa...@apple.com>
Reviewed-by: Mat Martineau <mathew.j.martin...@linux.intel.com>
---
 include/linux/tcp.h      |   3 +-
 include/net/inet_sock.h  |   3 +-
 include/net/tcp.h        |   4 -
 net/ipv4/tcp.c           |   5 --
 net/ipv4/tcp_input.c     |  36 ---------
 net/ipv4/tcp_minisocks.c |  18 -----
 net/ipv4/tcp_output.c    |  54 --------------
 net/smc/af_smc.c         | 190 +++++++++++++++++++++++++++++++++++++++++++++--
 8 files changed, 186 insertions(+), 127 deletions(-)

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 6e1f0f29bf24..0958b3760cfc 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -257,8 +257,7 @@ struct tcp_sock {
                syn_fastopen_ch:1, /* Active TFO re-enabling probe */
                syn_data_acked:1,/* data in SYN is acked by SYN-ACK */
                save_syn:1,     /* Save headers of SYN packet */
-               is_cwnd_limited:1,/* forward progress limited by snd_cwnd? */
-               syn_smc:1;      /* SYN includes SMC */
+               is_cwnd_limited:1;/* forward progress limited by snd_cwnd? */
        u32     tlp_high_seq;   /* snd_nxt at the time of TLP retransmit. */
 
 /* RTT measurement */
diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h
index 0a671c32d6b9..4efa6cb14705 100644
--- a/include/net/inet_sock.h
+++ b/include/net/inet_sock.h
@@ -90,8 +90,7 @@ struct inet_request_sock {
                                wscale_ok  : 1,
                                ecn_ok     : 1,
                                acked      : 1,
-                               no_srccheck: 1,
-                               smc_ok     : 1;
+                               no_srccheck: 1;
        u32                     ir_mark;
        union {
                struct ip_options_rcu __rcu     *ireq_opt;
diff --git a/include/net/tcp.h b/include/net/tcp.h
index be6709e380a6..2a565883e2ef 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -2093,10 +2093,6 @@ static inline bool tcp_bpf_ca_needs_ecn(struct sock *sk)
        return (tcp_call_bpf(sk, BPF_SOCK_OPS_NEEDS_ECN, 0, NULL) == 1);
 }
 
-#if IS_ENABLED(CONFIG_SMC)
-extern struct static_key_false tcp_have_smc;
-#endif
-
 struct tcp_extopt_store;
 
 struct tcp_extopt_ops {
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index ffb5f4fbd935..f08542d91e1c 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -292,11 +292,6 @@ EXPORT_SYMBOL(sysctl_tcp_mem);
 atomic_long_t tcp_memory_allocated;    /* Current allocated memory. */
 EXPORT_SYMBOL(tcp_memory_allocated);
 
-#if IS_ENABLED(CONFIG_SMC)
-DEFINE_STATIC_KEY_FALSE(tcp_have_smc);
-EXPORT_SYMBOL(tcp_have_smc);
-#endif
-
 /*
  * Current number of TCP sockets.
  */
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 187e3fa761c8..fd2693baee4a 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -3691,24 +3691,6 @@ static void tcp_parse_fastopen_option(int len, const unsigned char *cookie,
        foc->exp = exp_opt;
 }
 
-static int smc_parse_options(const struct tcphdr *th,
-                            struct tcp_options_received *opt_rx,
-                            const unsigned char *ptr,
-                            int opsize)
-{
-#if IS_ENABLED(CONFIG_SMC)
-       if (static_branch_unlikely(&tcp_have_smc)) {
-               if (th->syn && !(opsize & 1) &&
-                   opsize >= TCPOLEN_EXP_SMC_BASE &&
-                   get_unaligned_be32(ptr) == TCPOPT_SMC_MAGIC) {
-                       opt_rx->smc_ok = 1;
-                       return 1;
-               }
-       }
-#endif
-       return 0;
-}
-
 /* Look for tcp options. Normally only called on SYN and SYNACK packets.
  * But, this can also be called on packets in the established flow when
  * the fast version below fails.
@@ -3816,9 +3798,6 @@ void tcp_parse_options(const struct net *net,
                                        tcp_parse_fastopen_option(opsize -
                                                TCPOLEN_EXP_FASTOPEN_BASE,
                                                ptr + 2, th->syn, foc, true);
-                               else if (smc_parse_options(th, opt_rx, ptr,
-                                                          opsize))
-                                       break;
                                else if (opsize >= TCPOLEN_EXP_BASE)
                                        tcp_extopt_parse(get_unaligned_be32(ptr),
                                                         opsize, ptr, skb,
@@ -5595,16 +5574,6 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
        return false;
 }
 
-static void smc_check_reset_syn(struct tcp_sock *tp)
-{
-#if IS_ENABLED(CONFIG_SMC)
-       if (static_branch_unlikely(&tcp_have_smc)) {
-               if (tp->syn_smc && !tp->rx_opt.smc_ok)
-                       tp->syn_smc = 0;
-       }
-#endif
-}
-
 static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
                                         const struct tcphdr *th)
 {
@@ -5715,8 +5684,6 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
                 * is initialized. */
                tp->copied_seq = tp->rcv_nxt;
 
-               smc_check_reset_syn(tp);
-
                smp_mb();
 
                tcp_finish_connect(sk, skb);
@@ -6173,9 +6140,6 @@ static void tcp_openreq_init(struct request_sock *req,
        ireq->ir_rmt_port = tcp_hdr(skb)->source;
        ireq->ir_num = ntohs(tcp_hdr(skb)->dest);
        ireq->ir_mark = inet_request_mark(sk, skb);
-#if IS_ENABLED(CONFIG_SMC)
-       ireq->smc_ok = rx_opt->smc_ok;
-#endif
 }
 
 struct request_sock *inet_reqsk_alloc(const struct request_sock_ops *ops,
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 46eb5a33aec1..5e08dce49a00 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -435,21 +435,6 @@ void tcp_ca_openreq_child(struct sock *sk, const struct dst_entry *dst)
 }
 EXPORT_SYMBOL_GPL(tcp_ca_openreq_child);
 
-static void smc_check_reset_syn_req(struct tcp_sock *oldtp,
-                                   struct request_sock *req,
-                                   struct tcp_sock *newtp)
-{
-#if IS_ENABLED(CONFIG_SMC)
-       struct inet_request_sock *ireq;
-
-       if (static_branch_unlikely(&tcp_have_smc)) {
-               ireq = inet_rsk(req);
-               if (oldtp->syn_smc && !ireq->smc_ok)
-                       newtp->syn_smc = 0;
-       }
-#endif
-}
-
 /* This is not only more efficient than what we used to do, it eliminates
  * a lot of code duplication between IPv4/IPv6 SYN recv processing. -DaveM
  *
@@ -467,9 +452,6 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
                struct tcp_request_sock *treq = tcp_rsk(req);
                struct inet_connection_sock *newicsk = inet_csk(newsk);
                struct tcp_sock *newtp = tcp_sk(newsk);
-               struct tcp_sock *oldtp = tcp_sk(sk);
-
-               smc_check_reset_syn_req(oldtp, req, newtp);
 
                /* Now setup tcp_sock */
                newtp->pred_flags = 0;
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 6d418ce06b59..549e33a30b41 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -398,21 +398,6 @@ static inline bool tcp_urg_mode(const struct tcp_sock *tp)
        return tp->snd_una != tp->snd_up;
 }
 
-static void smc_options_write(__be32 *ptr, u16 *options)
-{
-#if IS_ENABLED(CONFIG_SMC)
-       if (static_branch_unlikely(&tcp_have_smc)) {
-               if (unlikely(OPTION_SMC & *options)) {
-                       *ptr++ = htonl((TCPOPT_NOP  << 24) |
-                                      (TCPOPT_NOP  << 16) |
-                                      (TCPOPT_EXP <<  8) |
-                                      (TCPOLEN_EXP_SMC_BASE));
-                       *ptr++ = htonl(TCPOPT_SMC_MAGIC);
-               }
-       }
-#endif
-}
-
 /* Write previously computed TCP options to the packet.
  *
  * Beware: Something in the Internet is very sensitive to the ordering of
@@ -527,45 +512,10 @@ static void tcp_options_write(__be32 *ptr, struct sk_buff *skb, struct sock *sk,
                ptr += (len + 3) >> 2;
        }
 
-       smc_options_write(ptr, &options);
-
        if (unlikely(!hlist_empty(extopt_list)))
                tcp_extopt_write(ptr, skb, opts, sk);
 }
 
-static void smc_set_option(const struct tcp_sock *tp,
-                          struct tcp_out_options *opts,
-                          unsigned int *remaining)
-{
-#if IS_ENABLED(CONFIG_SMC)
-       if (static_branch_unlikely(&tcp_have_smc)) {
-               if (tp->syn_smc) {
-                       if (*remaining >= TCPOLEN_EXP_SMC_BASE_ALIGNED) {
-                               opts->options |= OPTION_SMC;
-                               *remaining -= TCPOLEN_EXP_SMC_BASE_ALIGNED;
-                       }
-               }
-       }
-#endif
-}
-
-static void smc_set_option_cond(const struct tcp_sock *tp,
-                               const struct inet_request_sock *ireq,
-                               struct tcp_out_options *opts,
-                               unsigned int *remaining)
-{
-#if IS_ENABLED(CONFIG_SMC)
-       if (static_branch_unlikely(&tcp_have_smc)) {
-               if (tp->syn_smc && ireq->smc_ok) {
-                       if (*remaining >= TCPOLEN_EXP_SMC_BASE_ALIGNED) {
-                               opts->options |= OPTION_SMC;
-                               *remaining -= TCPOLEN_EXP_SMC_BASE_ALIGNED;
-                       }
-               }
-       }
-#endif
-}
-
 /* Compute TCP options for SYN packets. This is not the final
  * network wire format yet.
  */
@@ -631,8 +581,6 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb,
                }
        }
 
-       smc_set_option(tp, opts, &remaining);
-
        if (unlikely(!hlist_empty(&tp->tcp_option_list)))
                remaining -= tcp_extopt_prepare(skb, TCPHDR_SYN, remaining,
                                                opts, tcp_to_sk(tp));
@@ -698,8 +646,6 @@ static unsigned int tcp_synack_options(const struct sock *sk,
                }
        }
 
-       smc_set_option_cond(tcp_sk(sk), ireq, opts, &remaining);
-
        if (unlikely(!hlist_empty(&tcp_rsk(req)->tcp_option_list)))
                remaining -= tcp_extopt_prepare(skb, TCPHDR_SYN | TCPHDR_ACK,
                                                remaining, opts,
diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index 267e68379110..1b942a73609e 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -44,6 +44,149 @@
 #include "smc_rx.h"
 #include "smc_close.h"
 
+static unsigned int tcp_smc_opt_prepare(struct sk_buff *skb, u8 flags,
+                                       unsigned int remaining,
+                                       struct tcp_out_options *opts,
+                                       const struct sock *sk,
+                                       struct tcp_extopt_store *store);
+static __be32 *tcp_smc_opt_write(__be32 *ptr, struct sk_buff *skb,
+                                struct tcp_out_options *opts,
+                                struct sock *sk,
+                                struct tcp_extopt_store *store);
+static void tcp_smc_opt_parse(int opsize, const unsigned char *opptr,
+                             const struct sk_buff *skb,
+                             struct tcp_options_received *opt_rx,
+                             struct sock *sk,
+                             struct tcp_extopt_store *store);
+static void tcp_smc_opt_post_process(struct sock *sk,
+                                    struct tcp_options_received *opt,
+                                    struct tcp_extopt_store *store);
+static struct tcp_extopt_store *tcp_smc_opt_copy(struct sock *listener,
+                                                struct request_sock *req,
+                                                struct tcp_options_received *opt,
+                                                struct tcp_extopt_store *store);
+static void tcp_smc_opt_destroy(struct tcp_extopt_store *store);
+
+struct tcp_smc_opt {
+       struct tcp_extopt_store store;  /* handle registered with TCP */
+       unsigned int            smc_ok:1; /* SMC negotiated; unsigned so 1 fits */
+       struct rcu_head         rcu;    /* for kfree_rcu() in destroy */
+};
+
+static const struct tcp_extopt_ops tcp_smc_extra_ops = {
+       .option_kind    = TCPOPT_SMC_MAGIC,     /* key tcp_smc_opt_find() uses */
+       .parse          = tcp_smc_opt_parse,    /* RX: note option on a SYN */
+       .post_process   = tcp_smc_opt_post_process, /* SYN_SENT: latch smc_ok */
+       .prepare        = tcp_smc_opt_prepare,  /* TX: reserve option space */
+       .write          = tcp_smc_opt_write,    /* TX: emit the option words */
+       .copy           = tcp_smc_opt_copy,     /* new state if peer sent it */
+       .destroy        = tcp_smc_opt_destroy,  /* kfree_rcu() the state */
+       .owner          = THIS_MODULE,
+};
+
+/* Map an embedded extopt store back to its enclosing tcp_smc_opt. */
+static struct tcp_smc_opt *tcp_extopt_to_smc(struct tcp_extopt_store *store)
+{
+       return container_of(store, struct tcp_smc_opt, store);
+}
+
+/* Return the SMC state registered on @sk, or NULL when the lookup fails. */
+static struct tcp_smc_opt *tcp_smc_opt_find(struct sock *sk)
+{
+       struct tcp_extopt_store *s = tcp_extopt_find_kind(TCPOPT_SMC_MAGIC, sk);
+
+       return s ? tcp_extopt_to_smc(s) : NULL;
+}
+
+static unsigned int tcp_smc_opt_prepare(struct sk_buff *skb, u8 flags,
+                                       unsigned int remaining,
+                                       struct tcp_out_options *opts,
+                                       const struct sock *sk,
+                                       struct tcp_extopt_store *store)
+{
+       /* Reserve room only on segments with SYN set, and only if it fits. */
+       if (!(flags & TCPHDR_SYN) ||
+           remaining < TCPOLEN_EXP_SMC_BASE_ALIGNED)
+               return 0;
+
+       /* Flag the option for tcp_smc_opt_write() and report bytes used. */
+       opts->options |= OPTION_SMC;
+
+       return TCPOLEN_EXP_SMC_BASE_ALIGNED;
+}
+
+static __be32 *tcp_smc_opt_write(__be32 *ptr, struct sk_buff *skb,
+                                struct tcp_out_options *opts,
+                                struct sock *sk,
+                                struct tcp_extopt_store *store)
+{
+       /* Emit nothing unless OPTION_SMC is set (tcp_smc_opt_prepare() does). */
+       if (likely(!(opts->options & OPTION_SMC)))
+               return ptr;
+
+       /* Two NOPs pad the kind/length bytes; the SMC magic word follows. */
+       ptr[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
+                      (TCPOPT_EXP << 8) | TCPOLEN_EXP_SMC_BASE);
+       ptr[1] = htonl(TCPOPT_SMC_MAGIC);
+       return ptr + 2;
+}
+
+static void tcp_smc_opt_parse(int opsize, const unsigned char *opptr,
+                             const struct sk_buff *skb,
+                             struct tcp_options_received *opt_rx,
+                             struct sock *sk,
+                             struct tcp_extopt_store *store)
+{
+       /* Note the option only for a SYN whose size is even and big enough. */
+       if (tcp_hdr(skb)->syn && opsize >= TCPOLEN_EXP_SMC_BASE &&
+           !(opsize & 1))
+               opt_rx->smc_ok = 1;
+}
+
+static void tcp_smc_opt_post_process(struct sock *sk,
+                                    struct tcp_options_received *opt,
+                                    struct tcp_extopt_store *store)
+{
+       struct tcp_smc_opt *state;
+
+       /* The stored flag is only touched while @sk is in SYN_SENT. */
+       if (sk->sk_state != TCP_SYN_SENT)
+               return;
+
+       /* Latch whether the parsed options included the SMC option. */
+       state = tcp_extopt_to_smc(store);
+       state->smc_ok = opt->smc_ok ? 1 : 0;
+}
+
+static struct tcp_extopt_store *tcp_smc_opt_copy(struct sock *listener,
+                                                struct request_sock *req,
+                                                struct tcp_options_received *opt,
+                                                struct tcp_extopt_store *store)
+{
+       struct tcp_smc_opt *smc_opt;
+
+       /* Create no state (NULL) unless the peer sent us the smc-opt. */
+       if (!opt->smc_ok)
+               return NULL;
+
+       smc_opt = kzalloc(sizeof(*smc_opt), GFP_ATOMIC);
+       if (!smc_opt)
+               return NULL;
+
+       smc_opt->store.ops = &tcp_smc_extra_ops;
+       smc_opt->smc_ok = 1;
+
+       /* Return the embedded member itself instead of casting the container. */
+       return &smc_opt->store;
+}
+
+static void tcp_smc_opt_destroy(struct tcp_extopt_store *store)
+{
+       struct tcp_smc_opt *smc_opt = tcp_extopt_to_smc(store);
+
+       kfree_rcu(smc_opt, rcu); /* free via the embedded rcu_head */
+}
+
 static DEFINE_MUTEX(smc_create_lgr_pending);   /* serialize link group
                                                 * creation
                                                 */
@@ -389,6 +532,7 @@ static int smc_connect_rdma(struct smc_sock *smc)
        struct smc_clc_msg_accept_confirm aclc;
        int local_contact = SMC_FIRST_CONTACT;
        struct smc_ib_device *smcibdev;
+       struct tcp_smc_opt *smc_opt;
        struct smc_link *link;
        u8 srv_first_contact;
        int reason_code = 0;
@@ -397,7 +541,8 @@ static int smc_connect_rdma(struct smc_sock *smc)
 
        sock_hold(&smc->sk); /* sock put in passive closing */
 
-       if (!tcp_sk(smc->clcsock->sk)->syn_smc) {
+       smc_opt = tcp_smc_opt_find(smc->clcsock->sk);
+       if (!smc_opt || !smc_opt->smc_ok) {
                /* peer has not signalled SMC-capability */
                smc->use_fallback = true;
                goto out_connected;
@@ -548,6 +693,7 @@ static int smc_connect_rdma(struct smc_sock *smc)
 static int smc_connect(struct socket *sock, struct sockaddr *addr,
                       int alen, int flags)
 {
+       struct tcp_smc_opt *smc_opt;
        struct sock *sk = sock->sk;
        struct smc_sock *smc;
        int rc = -EINVAL;
@@ -561,9 +707,17 @@ static int smc_connect(struct socket *sock, struct sockaddr *addr,
                goto out_err;
        smc->addr = addr;       /* needed for nonblocking connect */
 
+       smc_opt = kzalloc(sizeof(*smc_opt), GFP_KERNEL);
+       if (!smc_opt) {
+               rc = -ENOMEM;
+               goto out_err;
+       }
+       smc_opt->store.ops = &tcp_smc_extra_ops;
+
        lock_sock(sk);
        switch (sk->sk_state) {
        default:
+               rc = -EINVAL;
                goto out;
        case SMC_ACTIVE:
                rc = -EISCONN;
@@ -573,8 +727,15 @@ static int smc_connect(struct socket *sock, struct sockaddr *addr,
                break;
        }
 
+       /* We are the only owner of smc->clcsock->sk, so we can be lockless */
+       rc = tcp_register_extopt(&smc_opt->store, smc->clcsock->sk);
+       if (rc) {
+               release_sock(sk); /* pairs with lock_sock(sk) above */
+               kfree(smc_opt);
+               goto out_err;
+       }
+
        smc_copy_sock_settings_to_clc(smc);
-       tcp_sk(smc->clcsock->sk)->syn_smc = 1;
        rc = kernel_connect(smc->clcsock, addr, alen, flags);
        if (rc)
                goto out;
@@ -768,6 +929,7 @@ static void smc_listen_work(struct work_struct *work)
        struct smc_clc_msg_proposal *pclc;
        struct smc_ib_device *smcibdev;
        struct sockaddr_in peeraddr;
+       struct tcp_smc_opt *smc_opt;
        u8 buf[SMC_CLC_MAX_LEN];
        struct smc_link *link;
        int reason_code = 0;
@@ -777,7 +939,8 @@ static void smc_listen_work(struct work_struct *work)
        u8 ibport;
 
        /* check if peer is smc capable */
-       if (!tcp_sk(newclcsock->sk)->syn_smc) {
+       smc_opt = tcp_smc_opt_find(newclcsock->sk);
+       if (!smc_opt || !smc_opt->smc_ok) {
                new_smc->use_fallback = true;
                goto out_connected;
        }
@@ -987,10 +1150,18 @@ static void smc_tcp_listen_work(struct work_struct *work)
 
 static int smc_listen(struct socket *sock, int backlog)
 {
+       struct tcp_smc_opt *smc_opt;
        struct sock *sk = sock->sk;
        struct smc_sock *smc;
        int rc;
 
+       smc_opt = kzalloc(sizeof(*smc_opt), GFP_KERNEL);
+       if (!smc_opt) {
+               rc = -ENOMEM;
+               goto out_err;
+       }
+       smc_opt->store.ops = &tcp_smc_extra_ops;
+
        smc = smc_sk(sk);
        lock_sock(sk);
 
@@ -1003,11 +1174,19 @@ static int smc_listen(struct socket *sock, int backlog)
                sk->sk_max_ack_backlog = backlog;
                goto out;
        }
+
+       /* We are the only owner of smc->clcsock->sk, so we can be lockless */
+       rc = tcp_register_extopt(&smc_opt->store, smc->clcsock->sk);
+       if (rc) {
+               release_sock(sk); /* pairs with lock_sock(sk) above */
+               kfree(smc_opt);
+               goto out_err;
+       }
+
        /* some socket options are handled in core, so we could not apply
         * them to the clc socket -- copy smc socket options to clc socket
         */
        smc_copy_sock_settings_to_clc(smc);
-       tcp_sk(smc->clcsock->sk)->syn_smc = 1;
 
        rc = kernel_listen(smc->clcsock, backlog);
        if (rc)
@@ -1022,6 +1201,7 @@ static int smc_listen(struct socket *sock, int backlog)
 
 out:
        release_sock(sk);
+out_err:
        return rc;
 }
 
@@ -1460,7 +1640,6 @@ static int __init smc_init(void)
                goto out_sock;
        }
 
-       static_branch_enable(&tcp_have_smc);
        return 0;
 
 out_sock:
@@ -1485,7 +1664,6 @@ static void __exit smc_exit(void)
                list_del_init(&lgr->list);
                smc_lgr_free(lgr); /* free link group */
        }
-       static_branch_disable(&tcp_have_smc);
        smc_ib_unregister_client();
        sock_unregister(PF_SMC);
        proto_unregister(&smc_proto);
-- 
2.16.1

Reply via email to