On 7/7/21 7:00 AM, Mateusz Guzik wrote:
This breaks NOIP kernel builds.

Thanks for pointing this out,  it should be fixed in 4150a5a87ed

On 7/6/21, Andrew Gallatin <galla...@freebsd.org> wrote:
The branch main has been updated by gallatin:

URL:
https://urldefense.com/v3/__https://cgit.FreeBSD.org/src/commit/?id=28d0a740dd9a67e4a4fa9fda5bb39b5963316f35__;!!OToaGQ!_d4pkzhNaWowgMsR4-c1qtLXr1H9SC_kBWNDvXvVV15lerMV4elltm-V6OZj3iET-A$

commit 28d0a740dd9a67e4a4fa9fda5bb39b5963316f35
Author:     Andrew Gallatin <galla...@freebsd.org>
AuthorDate: 2021-07-06 14:17:33 +0000
Commit:     Andrew Gallatin <galla...@freebsd.org>
CommitDate: 2021-07-06 14:28:32 +0000

     ktls: auto-disable ifnet (inline hw) kTLS

     Ifnet (inline) hw kTLS NICs typically keep state within
     a TLS record, so that when transmitting in-order,
     they can continue encryption on each segment sent without
     DMA'ing extra state from the host.

     This breaks down when transmits are out of order (eg,
     TCP retransmits).  In this case, the NIC must re-DMA
     the entire TLS record up to and including the segment
     being retransmitted.  This means that when re-transmitting
     the last 1448 byte segment of a TLS record, the NIC will
     have to re-DMA the entire 16KB TLS record. This can lead
     to the NIC running out of PCIe bus bandwidth well before
     it saturates the network link if a lot of TCP connections have
     a high retransmoit rate.

     This change introduces a new sysctl
(kern.ipc.tls.ifnet_max_rexmit_pct),
     where TCP connections with higher retransmit rate will be
     switched to SW kTLS so as to conserve PCIe bandwidth.

     Reviewed by:    hselasky, markj, rrs
     Sponsored by:   Netflix
     Differential Revision:  
https://urldefense.com/v3/__https://reviews.freebsd.org/D30908__;!!OToaGQ!_d4pkzhNaWowgMsR4-c1qtLXr1H9SC_kBWNDvXvVV15lerMV4elltm-V6OYOYLaV0A$
---
  sys/kern/uipc_ktls.c  | 107
++++++++++++++++++++++++++++++++++++++++++++++++++
  sys/netinet/tcp_var.h |  13 +++++-
  sys/sys/ktls.h        |  15 ++++++-
  3 files changed, 133 insertions(+), 2 deletions(-)

diff --git a/sys/kern/uipc_ktls.c b/sys/kern/uipc_ktls.c
index 7e87e7c740e3..88e29157289d 100644
--- a/sys/kern/uipc_ktls.c
+++ b/sys/kern/uipc_ktls.c
@@ -30,6 +30,7 @@ __FBSDID("$FreeBSD$");

  #include "opt_inet.h"
  #include "opt_inet6.h"
+#include "opt_kern_tls.h"
  #include "opt_ratelimit.h"
  #include "opt_rss.h"

@@ -121,6 +122,11 @@ SYSCTL_INT(_kern_ipc_tls_stats, OID_AUTO, threads,
CTLFLAG_RD,
      &ktls_number_threads, 0,
      "Number of TLS threads in thread-pool");

+unsigned int ktls_ifnet_max_rexmit_pct = 2;
+SYSCTL_UINT(_kern_ipc_tls, OID_AUTO, ifnet_max_rexmit_pct, CTLFLAG_RWTUN,
+    &ktls_ifnet_max_rexmit_pct, 2,
+    "Max percent bytes retransmitted before ifnet TLS is disabled");
+
  static bool ktls_offload_enable;
  SYSCTL_BOOL(_kern_ipc_tls, OID_AUTO, enable, CTLFLAG_RWTUN,
      &ktls_offload_enable, 0,
@@ -184,6 +190,14 @@ static COUNTER_U64_DEFINE_EARLY(ktls_switch_failed);
  SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, switch_failed,
CTLFLAG_RD,
      &ktls_switch_failed, "TLS sessions unable to switch between SW and
ifnet");

+static COUNTER_U64_DEFINE_EARLY(ktls_ifnet_disable_fail);
+SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, ifnet_disable_failed,
CTLFLAG_RD,
+    &ktls_ifnet_disable_fail, "TLS sessions unable to switch to SW from
ifnet");
+
+static COUNTER_U64_DEFINE_EARLY(ktls_ifnet_disable_ok);
+SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, ifnet_disable_ok,
CTLFLAG_RD,
+    &ktls_ifnet_disable_ok, "TLS sessions able to switch to SW from
ifnet");
+
  SYSCTL_NODE(_kern_ipc_tls, OID_AUTO, sw, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
      "Software TLS session stats");
  SYSCTL_NODE(_kern_ipc_tls, OID_AUTO, ifnet, CTLFLAG_RD | CTLFLAG_MPSAFE,
0,
@@ -2187,3 +2201,96 @@ ktls_work_thread(void *ctx)
                }
        }
  }
+
+static void
+ktls_disable_ifnet_help(void *context, int pending __unused)
+{
+       struct ktls_session *tls;
+       struct inpcb *inp;
+       struct tcpcb *tp;
+       struct socket *so;
+       int err;
+
+       tls = context;
+       inp = tls->inp;
+       if (inp == NULL)
+               return;
+       INP_WLOCK(inp);
+       so = inp->inp_socket;
+       MPASS(so != NULL);
+       if ((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) ||
+           (inp->inp_flags2 & INP_FREED)) {
+               goto out;
+       }
+
+       if (so->so_snd.sb_tls_info != NULL)
+               err = ktls_set_tx_mode(so, TCP_TLS_MODE_SW);
+       else
+               err = ENXIO;
+       if (err == 0) {
+               counter_u64_add(ktls_ifnet_disable_ok, 1);
+               /* ktls_set_tx_mode() drops inp wlock, so recheck flags */
+               if ((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) == 0 &&
+                   (inp->inp_flags2 & INP_FREED) == 0 &&
+                   (tp = intotcpcb(inp)) != NULL &&
+                   tp->t_fb->tfb_hwtls_change != NULL)
+                       (*tp->t_fb->tfb_hwtls_change)(tp, 0);
+       } else {
+               counter_u64_add(ktls_ifnet_disable_fail, 1);
+       }
+
+out:
+       SOCK_LOCK(so);
+       sorele(so);
+       if (!in_pcbrele_wlocked(inp))
+               INP_WUNLOCK(inp);
+       ktls_free(tls);
+}
+
+/*
+ * Called when re-transmits are becoming a substantial portion of the
+ * sends on this connection.  When this happens, we transition the
+ * connection to software TLS.  This is needed because most inline TLS
+ * NICs keep crypto state only for in-order transmits.  This means
+ * that to handle a TCP rexmit (which is out-of-order), the NIC must
+ * re-DMA the entire TLS record up to and including the current
+ * segment.  This means that when re-transmitting the last ~1448 byte
+ * segment of a 16KB TLS record, we could wind up re-DMA'ing an order
+ * of magnitude more data than we are sending.  This can cause the
+ * PCIe link to saturate well before the network, which can cause
+ * output drops, and a general loss of capacity.
+ */
+void
+ktls_disable_ifnet(void *arg)
+{
+       struct tcpcb *tp;
+       struct inpcb *inp;
+       struct socket *so;
+       struct ktls_session *tls;
+
+       tp = arg;
+       inp = tp->t_inpcb;
+       INP_WLOCK_ASSERT(inp);
+       so = inp->inp_socket;
+       SOCK_LOCK(so);
+       tls = so->so_snd.sb_tls_info;
+       if (tls->disable_ifnet_pending) {
+               SOCK_UNLOCK(so);
+               return;
+       }
+
+       /*
+        * note that disable_ifnet_pending is never cleared; disabling
+        * ifnet can only be done once per session, so we never want
+        * to do it again
+        */
+
+       (void)ktls_hold(tls);
+       in_pcbref(inp);
+       soref(so);
+       tls->disable_ifnet_pending = true;
+       tls->inp = inp;
+       SOCK_UNLOCK(so);
+       TASK_INIT(&tls->disable_ifnet_task, 0, ktls_disable_ifnet_help, tls);
+       (void)taskqueue_enqueue(taskqueue_thread, &tls->disable_ifnet_task);
+}
diff --git a/sys/netinet/tcp_var.h b/sys/netinet/tcp_var.h
index dd30f89896d2..3f72a821e71f 100644
--- a/sys/netinet/tcp_var.h
+++ b/sys/netinet/tcp_var.h
@@ -39,8 +39,10 @@
  #include <netinet/tcp_fsm.h>

  #ifdef _KERNEL
+#include "opt_kern_tls.h"
  #include <net/vnet.h>
  #include <sys/mbuf.h>
+#include <sys/ktls.h>
  #endif

  #define TCP_END_BYTE_INFO 8   /* Bytes that makeup the "end information
array" */
@@ -1139,8 +1141,10 @@ tcp_fields_to_net(struct tcphdr *th)

  static inline void
  tcp_account_for_send(struct tcpcb *tp, uint32_t len, uint8_t is_rxt,
-    uint8_t is_tlp, int hw_tls __unused)
+    uint8_t is_tlp, int hw_tls)
  {
+       uint64_t rexmit_percent;
+
        if (is_tlp) {
                tp->t_sndtlppack++;
                tp->t_sndtlpbyte += len;
@@ -1150,6 +1154,13 @@ tcp_account_for_send(struct tcpcb *tp, uint32_t len,
uint8_t is_rxt,
                tp->t_snd_rxt_bytes += len;
        else
                tp->t_sndbytes += len;
+
+       if (hw_tls && is_rxt) {
+               rexmit_percent = (1000ULL * tp->t_snd_rxt_bytes) / (10ULL *
(tp->t_snd_rxt_bytes + tp->t_sndbytes));
+               if (rexmit_percent > ktls_ifnet_max_rexmit_pct)
+                       ktls_disable_ifnet(tp);
+       }
+
  }
  #endif /* _KERNEL */

diff --git a/sys/sys/ktls.h b/sys/sys/ktls.h
index b28c94965c97..7fd8831878b4 100644
--- a/sys/sys/ktls.h
+++ b/sys/sys/ktls.h
@@ -189,10 +189,12 @@ struct ktls_session {
        u_int   wq_index;
        volatile u_int refcount;
        int mode;
-       bool reset_pending;

        struct task reset_tag_task;
+       struct task disable_ifnet_task;
        struct inpcb *inp;
+       bool reset_pending;
+       bool disable_ifnet_pending;
  } __aligned(CACHE_LINE_SIZE);

  void ktls_check_rx(struct sockbuf *sb);
@@ -231,5 +233,16 @@ ktls_free(struct ktls_session *tls)
                ktls_destroy(tls);
  }

+#ifdef KERN_TLS
+extern unsigned int ktls_ifnet_max_rexmit_pct;
+void ktls_disable_ifnet(void *arg);
+#else
+#define ktls_ifnet_max_rexmit_pct 1
+inline void
+ktls_disable_ifnet(void *arg __unused)
+{
+}
+#endif
+
  #endif /* !_KERNEL */
  #endif /* !_SYS_KTLS_H_ */




_______________________________________________
dev-commits-src-main@freebsd.org mailing list
https://lists.freebsd.org/mailman/listinfo/dev-commits-src-main
To unsubscribe, send any mail to "dev-commits-src-main-unsubscr...@freebsd.org"

Reply via email to