From: Chia-Yu Chang <chia-yu.ch...@nokia-bell-labs.com>

The AccECN option may fail in various ways; handle these cases:
- Remove option from SYN/ACK rexmits to handle blackholes
- If no option arrives in SYN/ACK, assume the option is not usable
        - If an option arrives later, re-enable it
- If option is zeroed, disable AccECN option processing

This patch uses existing padding bits in tcp_request_sock and
holes in tcp_sock without increasing the size of either struct.

Signed-off-by: Ilpo Järvinen <i...@kernel.org>
Signed-off-by: Chia-Yu Chang <chia-yu.ch...@nokia-bell-labs.com>
---
 include/linux/tcp.h      |  4 ++-
 include/net/tcp.h        |  7 +++++
 net/ipv4/tcp.c           |  1 +
 net/ipv4/tcp_input.c     | 68 +++++++++++++++++++++++++++++++++++-----
 net/ipv4/tcp_minisocks.c | 38 ++++++++++++++++++++++
 net/ipv4/tcp_output.c    |  5 ++-
 6 files changed, 113 insertions(+), 10 deletions(-)

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 0740efaaef28..b5066eef8782 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -173,6 +173,7 @@ struct tcp_request_sock {
        u8                              syn_ect_snt: 2,
                                        syn_ect_rcv: 2,
                                        accecn_fail_mode:4;
+       u8                              saw_accecn_opt  :2;
 #ifdef CONFIG_TCP_AO
        u8                              ao_keyid;
        u8                              ao_rcv_next;
@@ -409,7 +410,8 @@ struct tcp_sock {
                syn_fastopen_child:1; /* created TFO passive child socket */
 
        u8      keepalive_probes; /* num of allowed keep alive probes   */
-       u8      accecn_fail_mode:4;     /* AccECN failure handling */
+       u8      accecn_fail_mode:4,     /* AccECN failure handling */
+               saw_accecn_opt:2;       /* An AccECN option was seen */
        u32     tcp_tx_delay;   /* delay (in usec) added to TX packets */
 
 /* RTT measurement */
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 3419618a7891..5e4593e39de4 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -279,6 +279,12 @@ static inline void tcp_accecn_fail_mode_set(struct 
tcp_sock *tp, u8 mode)
        tp->accecn_fail_mode |= mode;
 }
 
+/* tp->saw_accecn_opt states */
+#define TCP_ACCECN_OPT_NOT_SEEN                0x0
+#define TCP_ACCECN_OPT_EMPTY_SEEN      0x1
+#define TCP_ACCECN_OPT_COUNTER_SEEN    0x2
+#define TCP_ACCECN_OPT_FAIL_SEEN       0x3
+
 /* Flags in tp->nonagle */
 #define TCP_NAGLE_OFF          1       /* Nagle's algo is disabled */
 #define TCP_NAGLE_CORK         2       /* Socket is corked         */
@@ -480,6 +486,7 @@ static inline int tcp_accecn_extract_syn_ect(u8 ace)
 bool tcp_accecn_validate_syn_feedback(struct sock *sk, u8 ace, u8 sent_ect);
 void tcp_accecn_third_ack(struct sock *sk, const struct sk_buff *skb,
                          u8 syn_ect_snt);
+u8 tcp_accecn_option_init(const struct sk_buff *skb, u8 opt_offset);
 void tcp_ecn_received_counters(struct sock *sk, const struct sk_buff *skb,
                               u32 payload_len);
 
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 20a2e30e15f3..e68b9706eeff 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -3399,6 +3399,7 @@ int tcp_disconnect(struct sock *sk, int flags)
        tp->delivered_ce = 0;
        tp->wait_third_ack = 0;
        tp->accecn_fail_mode = 0;
+       tp->saw_accecn_opt = TCP_ACCECN_OPT_NOT_SEEN;
        tcp_accecn_init_counters(tp);
        tp->prev_ecnfield = 0;
        tp->accecn_opt_tstamp = 0;
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 10c37a41b9a5..c93e4bffb23e 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -446,9 +446,8 @@ bool tcp_accecn_validate_syn_feedback(struct sock *sk, u8 
ace, u8 sent_ect)
 }
 
 /* See Table 2 of the AccECN draft */
-
-static void tcp_ecn_rcv_synack(struct sock *sk, const struct tcphdr *th,
-                              u8 ip_dsfield)
+static void tcp_ecn_rcv_synack(struct sock *sk, const struct sk_buff *skb,
+                              const struct tcphdr *th, u8 ip_dsfield)
 {
        struct tcp_sock *tp = tcp_sk(sk);
        u8 ace = tcp_accecn_ace(th);
@@ -487,7 +486,19 @@ static void tcp_ecn_rcv_synack(struct sock *sk, const 
struct tcphdr *th,
        default:
                tcp_ecn_mode_set(tp, TCP_ECN_MODE_ACCECN);
                tp->syn_ect_rcv = ip_dsfield & INET_ECN_MASK;
-               tp->accecn_opt_demand = 2;
+               if (tp->rx_opt.accecn &&
+                   tp->saw_accecn_opt < TCP_ACCECN_OPT_COUNTER_SEEN) {
+                       u8 saw_opt = tcp_accecn_option_init(skb,
+                                                           tp->rx_opt.accecn);
+
+                       tp->saw_accecn_opt = saw_opt;
+                       if (tp->saw_accecn_opt == TCP_ACCECN_OPT_FAIL_SEEN) {
+                               u8 fail_mode = TCP_ACCECN_OPT_FAIL_RECV;
+
+                               tcp_accecn_fail_mode_set(tp, fail_mode);
+                       }
+                       tp->accecn_opt_demand = 2;
+               }
                if (INET_ECN_is_ce(ip_dsfield) &&
                    tcp_accecn_validate_syn_feedback(sk, ace,
                                                     tp->syn_ect_snt)) {
@@ -603,7 +614,23 @@ static bool tcp_accecn_process_option(struct tcp_sock *tp,
        unsigned int i;
        u8 *ptr;
 
+       if (tcp_accecn_opt_fail_recv(tp))
+               return false;
+
        if (!(flag & FLAG_SLOWPATH) || !tp->rx_opt.accecn) {
+               if (!tp->saw_accecn_opt) {
+                       /* Too late to enable after this point due to
+                        * potential counter wraps
+                        */
+                       if (tp->bytes_sent >= (1 << 23) - 1) {
+                               u8 fail_mode = TCP_ACCECN_OPT_FAIL_RECV;
+
+                               tp->saw_accecn_opt = TCP_ACCECN_OPT_FAIL_SEEN;
+                               tcp_accecn_fail_mode_set(tp, fail_mode);
+                       }
+                       return false;
+               }
+
                if (estimate_ecnfield) {
                        u8 ecnfield = estimate_ecnfield - 1;
 
@@ -619,6 +646,13 @@ static bool tcp_accecn_process_option(struct tcp_sock *tp,
        order1 = (ptr[0] == TCPOPT_ACCECN1);
        ptr += 2;
 
+       if (tp->saw_accecn_opt < TCP_ACCECN_OPT_COUNTER_SEEN) {
+               tp->saw_accecn_opt = tcp_accecn_option_init(skb,
+                                                           tp->rx_opt.accecn);
+               if (tp->saw_accecn_opt == TCP_ACCECN_OPT_FAIL_SEEN)
+                       tcp_accecn_fail_mode_set(tp, TCP_ACCECN_OPT_FAIL_RECV);
+       }
+
        res = !!estimate_ecnfield;
        for (i = 0; i < 3; i++) {
                if (optlen < TCPOLEN_ACCECN_PERFIELD)
@@ -6481,10 +6515,25 @@ static bool tcp_validate_incoming(struct sock *sk, 
struct sk_buff *skb,
         */
        if (th->syn) {
                if (tcp_ecn_mode_accecn(tp)) {
-                       u8 opt_demand = max_t(u8, 1, tp->accecn_opt_demand);
-
                        accecn_reflector = true;
-                       tp->accecn_opt_demand = opt_demand;
+                       if (tp->rx_opt.accecn &&
+                           tp->saw_accecn_opt < TCP_ACCECN_OPT_COUNTER_SEEN) {
+                               u8 offset = tp->rx_opt.accecn;
+                               u8 opt_demand;
+                               u8 saw_opt;
+
+                               saw_opt = tcp_accecn_option_init(skb, offset);
+                               tp->saw_accecn_opt = saw_opt;
+                               if (tp->saw_accecn_opt ==
+                                   TCP_ACCECN_OPT_FAIL_SEEN) {
+                                       u8 fail_mode = TCP_ACCECN_OPT_FAIL_RECV;
+
+                                       tcp_accecn_fail_mode_set(tp, fail_mode);
+                               }
+                               opt_demand = max_t(u8, 1,
+                                                  tp->accecn_opt_demand);
+                               tp->accecn_opt_demand = opt_demand;
+                       }
                }
                if (sk->sk_state == TCP_SYN_RECV && sk->sk_socket && th->ack &&
                    TCP_SKB_CB(skb)->seq + 1 == TCP_SKB_CB(skb)->end_seq &&
@@ -6974,7 +7023,8 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, 
struct sk_buff *skb,
                 */
 
                if (tcp_ecn_mode_any(tp))
-                       tcp_ecn_rcv_synack(sk, th, TCP_SKB_CB(skb)->ip_dsfield);
+                       tcp_ecn_rcv_synack(sk, skb, th,
+                                          TCP_SKB_CB(skb)->ip_dsfield);
 
                tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);
                tcp_try_undo_spurious_syn(sk);
@@ -7549,6 +7599,8 @@ static void tcp_openreq_init(struct request_sock *req,
        tcp_rsk(req)->snt_tsval_first = 0;
        tcp_rsk(req)->last_oow_ack_time = 0;
        tcp_rsk(req)->accecn_ok = 0;
+       tcp_rsk(req)->saw_accecn_opt = TCP_ACCECN_OPT_NOT_SEEN;
+       tcp_rsk(req)->accecn_fail_mode = 0;
        tcp_rsk(req)->syn_ect_rcv = 0;
        tcp_rsk(req)->syn_ect_snt = 0;
        req->mss = rx_opt->mss_clamp;
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index e0f2bd2cee9e..87b03ee74676 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -501,6 +501,7 @@ static void tcp_ecn_openreq_child(struct sock *sk,
                tcp_ecn_mode_set(tp, TCP_ECN_MODE_ACCECN);
                tp->syn_ect_snt = treq->syn_ect_snt;
                tcp_accecn_third_ack(sk, skb, treq->syn_ect_snt);
+               tp->saw_accecn_opt = treq->saw_accecn_opt;
                tp->prev_ecnfield = treq->syn_ect_rcv;
                tp->accecn_opt_demand = 1;
                tcp_ecn_received_counters(sk, skb, skb->len - th->doff * 4);
@@ -555,6 +556,30 @@ static void smc_check_reset_syn_req(const struct tcp_sock 
*oldtp,
 #endif
 }
 
+u8 tcp_accecn_option_init(const struct sk_buff *skb, u8 opt_offset)
+{
+       u8 *ptr = skb_transport_header(skb) + opt_offset;
+       unsigned int optlen = ptr[1] - 2;
+
+       WARN_ON_ONCE(ptr[0] != TCPOPT_ACCECN0 && ptr[0] != TCPOPT_ACCECN1);
+       ptr += 2;
+
+       /* Detect option zeroing: an AccECN connection "MAY check that the
+        * initial value of the EE0B field or the EE1B field is non-zero"
+        */
+       if (optlen < TCPOLEN_ACCECN_PERFIELD)
+               return TCP_ACCECN_OPT_EMPTY_SEEN;
+       if (get_unaligned_be24(ptr) == 0)
+               return TCP_ACCECN_OPT_FAIL_SEEN;
+       if (optlen < TCPOLEN_ACCECN_PERFIELD * 3)
+               return TCP_ACCECN_OPT_COUNTER_SEEN;
+       ptr += TCPOLEN_ACCECN_PERFIELD * 2;
+       if (get_unaligned_be24(ptr) == 0)
+               return TCP_ACCECN_OPT_FAIL_SEEN;
+
+       return TCP_ACCECN_OPT_COUNTER_SEEN;
+}
+
 /* This is not only more efficient than what we used to do, it eliminates
  * a lot of code duplication between IPv4/IPv6 SYN recv processing. -DaveM
  *
@@ -716,6 +741,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff 
*skb,
        bool own_req;
 
        tmp_opt.saw_tstamp = 0;
+       tmp_opt.accecn = 0;
        if (th->doff > (sizeof(struct tcphdr)>>2)) {
                tcp_parse_options(sock_net(sk), skb, &tmp_opt, 0, NULL);
 
@@ -893,6 +919,18 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff 
*skb,
        if (!(flg & TCP_FLAG_ACK))
                return NULL;
 
+       if (tcp_rsk(req)->accecn_ok && tmp_opt.accecn &&
+           tcp_rsk(req)->saw_accecn_opt < TCP_ACCECN_OPT_COUNTER_SEEN) {
+               u8 saw_opt = tcp_accecn_option_init(skb, tmp_opt.accecn);
+
+               tcp_rsk(req)->saw_accecn_opt = saw_opt;
+               if (tcp_rsk(req)->saw_accecn_opt == TCP_ACCECN_OPT_FAIL_SEEN) {
+                       u8 fail_mode = TCP_ACCECN_OPT_FAIL_RECV;
+
+                       tcp_rsk(req)->accecn_fail_mode |= fail_mode;
+               }
+       }
+
        /* For Fast Open no more processing is needed (sk is the
         * child socket).
         */
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index ea37a30ff71c..b630923c4cef 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1086,6 +1086,7 @@ static unsigned int tcp_syn_options(struct sock *sk, 
struct sk_buff *skb,
        /* Simultaneous open SYN/ACK needs AccECN option but not SYN */
        if (unlikely((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACK) &&
                     tcp_ecn_mode_accecn(tp) &&
+                    inet_csk(sk)->icsk_retransmits < 2 &&
                     sock_net(sk)->ipv4.sysctl_tcp_ecn_option &&
                     remaining >= TCPOLEN_ACCECN_BASE)) {
                u32 saving = tcp_synack_options_combine_saving(opts);
@@ -1175,7 +1176,7 @@ static unsigned int tcp_synack_options(const struct sock 
*sk,
        smc_set_option_cond(tcp_sk(sk), ireq, opts, &remaining);
 
        if (treq->accecn_ok && sock_net(sk)->ipv4.sysctl_tcp_ecn_option &&
-           remaining >= TCPOLEN_ACCECN_BASE) {
+           req->num_timeout < 1 && remaining >= TCPOLEN_ACCECN_BASE) {
                u32 saving = tcp_synack_options_combine_saving(opts);
 
                opts->ecn_bytes = synack_ecn_bytes;
@@ -1254,6 +1255,8 @@ static unsigned int tcp_established_options(struct sock 
*sk, struct sk_buff *skb
 
        if (tcp_ecn_mode_accecn(tp) &&
            sock_net(sk)->ipv4.sysctl_tcp_ecn_option &&
+           tp->saw_accecn_opt &&
+           !tcp_accecn_opt_fail_send(tp) &&
            (sock_net(sk)->ipv4.sysctl_tcp_ecn_option >= TCP_ECN_OPTION_FULL ||
             tp->accecn_opt_demand ||
             tcp_accecn_option_beacon_check(sk))) {
-- 
2.34.1


Reply via email to