Author: np
Date: Sat Jun  1 03:03:48 2019
New Revision: 348491
URL: https://svnweb.freebsd.org/changeset/base/348491

Log:
  cxgbe/t4_tom: adjust the hardware receive window to match changes to the
  receive sockbuf's high water mark.
  
  Calculate rx credits on the spot instead of tracking sbused/sb_cc and
  rx_credits in the toepcb.  The previous method worked when the high
  water mark changed due to SB_AUTOSIZE but not when it was adjusted
  directly (for example, by the soreserve in nfsrvd_addsock).
  
  This fixes a connection hang while running iozone over an NFS mounted
  share where nfsd's TCP sockets are being handled by t4_tom.
  
  MFC after:    3 days
  Sponsored by: Chelsio Communications

Modified:
  head/sys/dev/cxgbe/cxgbei/cxgbei.c
  head/sys/dev/cxgbe/tom/t4_connect.c
  head/sys/dev/cxgbe/tom/t4_cpl_io.c
  head/sys/dev/cxgbe/tom/t4_ddp.c
  head/sys/dev/cxgbe/tom/t4_listen.c
  head/sys/dev/cxgbe/tom/t4_tls.c
  head/sys/dev/cxgbe/tom/t4_tom.h

Modified: head/sys/dev/cxgbe/cxgbei/cxgbei.c
==============================================================================
--- head/sys/dev/cxgbe/cxgbei/cxgbei.c  Sat Jun  1 01:40:14 2019        (r348490)
+++ head/sys/dev/cxgbe/cxgbei/cxgbei.c  Sat Jun  1 03:03:48 2019        (r348491)
@@ -398,7 +398,6 @@ do_rx_iscsi_ddp(struct sge_iq *iq, const struct rss_he
        tp->t_rcvtime = ticks;
 
        /* update rx credits */
-       toep->rx_credits += pdu_len;
        t4_rcvd(&toep->td->tod, tp);    /* XXX: sc->tom_softc.tod */
 
        so = inp->inp_socket;

Modified: head/sys/dev/cxgbe/tom/t4_connect.c
==============================================================================
--- head/sys/dev/cxgbe/tom/t4_connect.c Sat Jun  1 01:40:14 2019        (r348490)
+++ head/sys/dev/cxgbe/tom/t4_connect.c Sat Jun  1 03:03:48 2019        (r348491)
@@ -385,8 +385,7 @@ t4_connect(struct toedev *tod, struct socket *so, stru
        toep->vnet = so->so_vnet;
        set_ulp_mode(toep, select_ulp_mode(so, sc, &settings));
        SOCKBUF_LOCK(&so->so_rcv);
-       /* opt0 rcv_bufsiz initially, assumes its normal meaning later */
-       toep->rx_credits = min(select_rcv_wnd(so) >> 10, M_RCV_BUFSIZ);
+       toep->opt0_rcv_bufsize = min(select_rcv_wnd(so) >> 10, M_RCV_BUFSIZ);
        SOCKBUF_UNLOCK(&so->so_rcv);
 
        /*
@@ -440,7 +439,7 @@ t4_connect(struct toedev *tod, struct socket *so, stru
                cpl->peer_ip_hi = *(uint64_t *)&inp->in6p_faddr.s6_addr[0];
                cpl->peer_ip_lo = *(uint64_t *)&inp->in6p_faddr.s6_addr[8];
                cpl->opt0 = calc_opt0(so, vi, toep->l2te, mtu_idx, rscale,
-                   toep->rx_credits, toep->ulp_mode, &settings);
+                   toep->opt0_rcv_bufsize, toep->ulp_mode, &settings);
                cpl->opt2 = calc_opt2a(so, toep, &settings);
        } else {
                struct cpl_act_open_req *cpl = wrtod(wr);
@@ -469,7 +468,7 @@ t4_connect(struct toedev *tod, struct socket *so, stru
                inp_4tuple_get(inp, &cpl->local_ip, &cpl->local_port,
                    &cpl->peer_ip, &cpl->peer_port);
                cpl->opt0 = calc_opt0(so, vi, toep->l2te, mtu_idx, rscale,
-                   toep->rx_credits, toep->ulp_mode, &settings);
+                   toep->opt0_rcv_bufsize, toep->ulp_mode, &settings);
                cpl->opt2 = calc_opt2a(so, toep, &settings);
        }
 

Modified: head/sys/dev/cxgbe/tom/t4_cpl_io.c
==============================================================================
--- head/sys/dev/cxgbe/tom/t4_cpl_io.c  Sat Jun  1 01:40:14 2019        (r348490)
+++ head/sys/dev/cxgbe/tom/t4_cpl_io.c  Sat Jun  1 03:03:48 2019        (r348491)
@@ -399,20 +399,10 @@ make_established(struct toepcb *toep, uint32_t iss, ui
 
        tp->irs = irs;
        tcp_rcvseqinit(tp);
-       tp->rcv_wnd = toep->rx_credits << 10;
+       tp->rcv_wnd = toep->opt0_rcv_bufsize << 10;
        tp->rcv_adv += tp->rcv_wnd;
        tp->last_ack_sent = tp->rcv_nxt;
 
-       /*
-        * If we were unable to send all rx credits via opt0, save the remainder
-        * in rx_credits so that they can be handed over with the next credit
-        * update.
-        */
-       SOCKBUF_LOCK(&so->so_rcv);
-       bufsize = select_rcv_wnd(so);
-       SOCKBUF_UNLOCK(&so->so_rcv);
-       toep->rx_credits = bufsize - tp->rcv_wnd;
-
        tp->iss = iss;
        tcp_sendseqinit(tp);
        tp->snd_una = iss + 1;
@@ -483,37 +473,29 @@ t4_rcvd_locked(struct toedev *tod, struct tcpcb *tp)
        struct socket *so = inp->inp_socket;
        struct sockbuf *sb = &so->so_rcv;
        struct toepcb *toep = tp->t_toe;
-       int credits;
+       int rx_credits;
 
        INP_WLOCK_ASSERT(inp);
-
        SOCKBUF_LOCK_ASSERT(sb);
-       KASSERT(toep->sb_cc >= sbused(sb),
-           ("%s: sb %p has more data (%d) than last time (%d).",
-           __func__, sb, sbused(sb), toep->sb_cc));
 
-       credits = toep->sb_cc - sbused(sb);
-       toep->sb_cc = sbused(sb);
+       rx_credits = sbspace(sb) > tp->rcv_wnd ? sbspace(sb) - tp->rcv_wnd : 0;
        if (toep->ulp_mode == ULP_MODE_TLS) {
-               if (toep->tls.rcv_over >= credits) {
-                       toep->tls.rcv_over -= credits;
-                       credits = 0;
+               if (toep->tls.rcv_over >= rx_credits) {
+                       toep->tls.rcv_over -= rx_credits;
+                       rx_credits = 0;
                } else {
-                       credits -= toep->tls.rcv_over;
+                       rx_credits -= toep->tls.rcv_over;
                        toep->tls.rcv_over = 0;
                }
        }
-       toep->rx_credits += credits;
 
-       if (toep->rx_credits > 0 &&
-           (tp->rcv_wnd <= 32 * 1024 || toep->rx_credits >= 64 * 1024 ||
-           (toep->rx_credits >= 16 * 1024 && tp->rcv_wnd <= 128 * 1024) ||
-           toep->sb_cc + tp->rcv_wnd < sb->sb_lowat)) {
-
-               credits = send_rx_credits(sc, toep, toep->rx_credits);
-               toep->rx_credits -= credits;
-               tp->rcv_wnd += credits;
-               tp->rcv_adv += credits;
+       if (rx_credits > 0 &&
+           (tp->rcv_wnd <= 32 * 1024 || rx_credits >= 64 * 1024 ||
+           (rx_credits >= 16 * 1024 && tp->rcv_wnd <= 128 * 1024) ||
+           sbused(sb) + tp->rcv_wnd < sb->sb_lowat)) {
+               rx_credits = send_rx_credits(sc, toep, rx_credits);
+               tp->rcv_wnd += rx_credits;
+               tp->rcv_adv += rx_credits;
        } else if (toep->flags & TPF_FORCE_CREDITS)
                send_rx_modulate(sc, toep);
 }
@@ -1551,7 +1533,7 @@ do_rx_data(struct sge_iq *iq, const struct rss_header 
        struct socket *so;
        struct sockbuf *sb;
        struct epoch_tracker et;
-       int len;
+       int len, rx_credits;
        uint32_t ddp_placed = 0;
 
        if (__predict_false(toep->flags & TPF_SYNQE)) {
@@ -1636,8 +1618,6 @@ do_rx_data(struct sge_iq *iq, const struct rss_header 
 
                if (!sbreserve_locked(sb, newsize, so, NULL))
                        sb->sb_flags &= ~SB_AUTOSIZE;
-               else
-                       toep->rx_credits += newsize - hiwat;
        }
 
        if (toep->ulp_mode == ULP_MODE_TCPDDP) {
@@ -1675,19 +1655,12 @@ do_rx_data(struct sge_iq *iq, const struct rss_header 
                }
        }
 
-       KASSERT(toep->sb_cc >= sbused(sb),
-           ("%s: sb %p has more data (%d) than last time (%d).",
-           __func__, sb, sbused(sb), toep->sb_cc));
-       toep->rx_credits += toep->sb_cc - sbused(sb);
        sbappendstream_locked(sb, m, 0);
-       toep->sb_cc = sbused(sb);
-       if (toep->rx_credits > 0 && toep->sb_cc + tp->rcv_wnd < sb->sb_lowat) {
-               int credits;
-
-               credits = send_rx_credits(sc, toep, toep->rx_credits);
-               toep->rx_credits -= credits;
-               tp->rcv_wnd += credits;
-               tp->rcv_adv += credits;
+       rx_credits = sbspace(sb) > tp->rcv_wnd ? sbspace(sb) - tp->rcv_wnd : 0;
+       if (rx_credits > 0 && sbused(sb) + tp->rcv_wnd < sb->sb_lowat) {
+               rx_credits = send_rx_credits(sc, toep, rx_credits);
+               tp->rcv_wnd += rx_credits;
+               tp->rcv_adv += rx_credits;
        }
 
        if (toep->ulp_mode == ULP_MODE_TCPDDP && toep->ddp.waiting_count > 0 &&

Modified: head/sys/dev/cxgbe/tom/t4_ddp.c
==============================================================================
--- head/sys/dev/cxgbe/tom/t4_ddp.c     Sat Jun  1 01:40:14 2019        (r348490)
+++ head/sys/dev/cxgbe/tom/t4_ddp.c     Sat Jun  1 03:03:48 2019        (r348491)
@@ -304,9 +304,6 @@ insert_ddp_data(struct toepcb *toep, uint32_t n)
        KASSERT(tp->rcv_wnd >= n, ("%s: negative window size", __func__));
        tp->rcv_wnd -= n;
 #endif
-#ifndef USE_DDP_RX_FLOW_CONTROL
-       toep->rx_credits += n;
-#endif
        CTR2(KTR_CXGBE, "%s: placed %u bytes before falling out of DDP",
            __func__, n);
        while (toep->ddp.active_count > 0) {
@@ -556,16 +553,10 @@ handle_ddp_data(struct toepcb *toep, __be32 ddp_report
 
                if (!sbreserve_locked(sb, newsize, so, NULL))
                        sb->sb_flags &= ~SB_AUTOSIZE;
-               else
-                       toep->rx_credits += newsize - hiwat;
        }
        SOCKBUF_UNLOCK(sb);
        CURVNET_RESTORE();
 
-#ifndef USE_DDP_RX_FLOW_CONTROL
-       toep->rx_credits += len;
-#endif
-
        job->msgrcv = 1;
        if (db->cancel_pending) {
                /*
@@ -714,12 +705,9 @@ handle_ddp_close(struct toepcb *toep, struct tcpcb *tp
 
        INP_WLOCK_ASSERT(toep->inp);
        DDP_ASSERT_LOCKED(toep);
-       len = be32toh(rcv_nxt) - tp->rcv_nxt;
 
+       len = be32toh(rcv_nxt) - tp->rcv_nxt;
        tp->rcv_nxt += len;
-#ifndef USE_DDP_RX_FLOW_CONTROL
-       toep->rx_credits += len;
-#endif
 
        while (toep->ddp.active_count > 0) {
                MPASS(toep->ddp.active_id != -1);

Modified: head/sys/dev/cxgbe/tom/t4_listen.c
==============================================================================
--- head/sys/dev/cxgbe/tom/t4_listen.c  Sat Jun  1 01:40:14 2019        (r348490)
+++ head/sys/dev/cxgbe/tom/t4_listen.c  Sat Jun  1 03:03:48 2019        (r348491)
@@ -1400,7 +1400,6 @@ found:
 
                mtu_idx = find_best_mtu_idx(sc, &inc, &settings);
                 rscale = cpl->tcpopt.wsf && V_tcp_do_rfc1323 ? select_rcv_wscale() : 0;
-               /* opt0 rcv_bufsiz initially, assumes its normal meaning later */
                wnd = max(so->sol_sbrcv_hiwat, MIN_RCV_WND);
                wnd = min(wnd, MAX_RCV_WND);
                rx_credits = min(wnd >> 10, M_RCV_BUFSIZ);
@@ -1552,8 +1551,7 @@ reset:
        toep->l2te = &sc->l2t->l2tab[synqe->l2e_idx];
        toep->vnet = lctx->vnet;
        set_ulp_mode(toep, synqe->ulp_mode);
-       /* opt0 rcv_bufsiz initially, assumes its normal meaning later */
-       toep->rx_credits = synqe->rcv_bufsize;
+       toep->opt0_rcv_bufsize = synqe->rcv_bufsize;
 
        MPASS(be32toh(cpl->snd_isn) - 1 == synqe->iss);
        MPASS(be32toh(cpl->rcv_isn) - 1 == synqe->irs);

Modified: head/sys/dev/cxgbe/tom/t4_tls.c
==============================================================================
--- head/sys/dev/cxgbe/tom/t4_tls.c     Sat Jun  1 01:40:14 2019        (r348490)
+++ head/sys/dev/cxgbe/tom/t4_tls.c     Sat Jun  1 03:03:48 2019        (r348491)
@@ -1458,7 +1458,7 @@ do_rx_tls_cmp(struct sge_iq *iq, const struct rss_head
        struct socket *so;
        struct sockbuf *sb;
        struct mbuf *tls_data;
-       int len, pdu_length, pdu_overhead, sb_length;
+       int len, pdu_length, rx_credits;
 
        KASSERT(toep->tid == tid, ("%s: toep tid/atid mismatch", __func__));
        KASSERT(!(toep->flags & TPF_SYNQE),
@@ -1562,24 +1562,10 @@ do_rx_tls_cmp(struct sge_iq *iq, const struct rss_head
        }
 
        /*
-        * Not all of the bytes on the wire are included in the socket
-        * buffer (e.g. the MAC of the TLS record).  However, those
-        * bytes are included in the TCP sequence space.  To handle
-        * this, compute the delta for this TLS record in
-        * 'pdu_overhead' and treat those bytes as having already been
-        * "read" by the application for the purposes of expanding the
-        * window.  The meat of the TLS record passed to the
-        * application ('sb_length') will still not be counted as
-        * "read" until userland actually reads the bytes.
-        *
-        * XXX: Some of the calculations below are probably still not
-        * really correct.
+        * Not all of the bytes on the wire are included in the socket buffer
+        * (e.g. the MAC of the TLS record).  However, those bytes are included
+        * in the TCP sequence space.
         */
-       sb_length = m->m_pkthdr.len;
-       pdu_overhead = pdu_length - sb_length;
-       toep->rx_credits += pdu_overhead;
-       tp->rcv_wnd += pdu_overhead;
-       tp->rcv_adv += pdu_overhead;
 
        /* receive buffer autosize */
        MPASS(toep->vnet == so->so_vnet);
@@ -1587,34 +1573,25 @@ do_rx_tls_cmp(struct sge_iq *iq, const struct rss_head
        if (sb->sb_flags & SB_AUTOSIZE &&
            V_tcp_do_autorcvbuf &&
            sb->sb_hiwat < V_tcp_autorcvbuf_max &&
-           sb_length > (sbspace(sb) / 8 * 7)) {
+           m->m_pkthdr.len > (sbspace(sb) / 8 * 7)) {
                unsigned int hiwat = sb->sb_hiwat;
                unsigned int newsize = min(hiwat + sc->tt.autorcvbuf_inc,
                    V_tcp_autorcvbuf_max);
 
                if (!sbreserve_locked(sb, newsize, so, NULL))
                        sb->sb_flags &= ~SB_AUTOSIZE;
-               else
-                       toep->rx_credits += newsize - hiwat;
        }
 
-       KASSERT(toep->sb_cc >= sbused(sb),
-           ("%s: sb %p has more data (%d) than last time (%d).",
-           __func__, sb, sbused(sb), toep->sb_cc));
-       toep->rx_credits += toep->sb_cc - sbused(sb);
        sbappendstream_locked(sb, m, 0);
-       toep->sb_cc = sbused(sb);
+       rx_credits = sbspace(sb) > tp->rcv_wnd ? sbspace(sb) - tp->rcv_wnd : 0;
 #ifdef VERBOSE_TRACES
        CTR5(KTR_CXGBE, "%s: tid %u PDU overhead %d rx_credits %u rcv_wnd %u",
-           __func__, tid, pdu_overhead, toep->rx_credits, tp->rcv_wnd);
+           __func__, tid, pdu_overhead, rx_credits, tp->rcv_wnd);
 #endif
-       if (toep->rx_credits > 0 && toep->sb_cc + tp->rcv_wnd < sb->sb_lowat) {
-               int credits;
-
-               credits = send_rx_credits(sc, toep, toep->rx_credits);
-               toep->rx_credits -= credits;
-               tp->rcv_wnd += credits;
-               tp->rcv_adv += credits;
+       if (rx_credits > 0 && sbused(sb) + tp->rcv_wnd < sb->sb_lowat) {
+               rx_credits = send_rx_credits(sc, toep, rx_credits);
+               tp->rcv_wnd += rx_credits;
+               tp->rcv_adv += rx_credits;
        }
 
        sorwakeup_locked(so);

Modified: head/sys/dev/cxgbe/tom/t4_tom.h
==============================================================================
--- head/sys/dev/cxgbe/tom/t4_tom.h     Sat Jun  1 01:40:14 2019        (r348490)
+++ head/sys/dev/cxgbe/tom/t4_tom.h     Sat Jun  1 03:03:48 2019        (r348491)
@@ -181,9 +181,7 @@ struct toepcb {
        u_int tx_nocompl;       /* tx WR credits since last compl request */
        u_int plen_nocompl;     /* payload since last compl request */
 
-       /* rx credit handling */
-       u_int sb_cc;            /* last noted value of so_rcv->sb_cc */
-       int rx_credits;         /* rx credits (in bytes) to be returned to hw */
+       int opt0_rcv_bufsize;   /* XXX: save full opt0/opt2 for later? */
 
        u_int ulp_mode; /* ULP mode */
        void *ulpcb;
_______________________________________________
svn-src-all@freebsd.org mailing list
https://lists.freebsd.org/mailman/listinfo/svn-src-all
To unsubscribe, send any mail to "svn-src-all-unsubscribe@freebsd.org"

Reply via email to