Well, I'm back from vacation.  I see nobody in the general group has
    commented much on my bandwidth-delay product code.  A couple of people
    have corresponded with me by email, and the response has generally been
    positive.

    Since this code must be enabled via a sysctl I feel it is safe to
    commit it to -current.  I also intend to MFC it to -stable prior
    to the freeze (MFC after: 1 week).  I believe that we can eventually
    enable the sysctl by default.

    I intend to commit this code on Saturday (tomorrow).  I've included the
    patch set below for those who need a reminder of what this is.  Generally
    speaking this code is very similar to, though not intended to duplicate,
    the algorithm described by the TCP Vegas paper.  I will also commit
    manual page updates to tcp(4) and tuning(7) to describe the effects
    of the sysctls.

                                                -Matt

Index: netinet/tcp_input.c
===================================================================
RCS file: /home/ncvs/src/sys/netinet/tcp_input.c,v
retrieving revision 1.169
diff -u -r1.169 tcp_input.c
--- netinet/tcp_input.c 15 Aug 2002 18:51:26 -0000      1.169
+++ netinet/tcp_input.c 17 Aug 2002 02:24:01 -0000
@@ -1018,6 +1018,7 @@
                                else if (tp->t_rtttime &&
                                            SEQ_GT(th->th_ack, tp->t_rtseq))
                                        tcp_xmit_timer(tp, ticks - tp->t_rtttime);
+                               tcp_xmit_bandwidth_limit(tp, th->th_ack);
                                acked = th->th_ack - tp->snd_una;
                                tcpstat.tcps_rcvackpack++;
                                tcpstat.tcps_rcvackbyte += acked;
@@ -1819,6 +1820,7 @@
                        tcp_xmit_timer(tp, ticks - to.to_tsecr + 1);
                else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq))
                        tcp_xmit_timer(tp, ticks - tp->t_rtttime);
+               tcp_xmit_bandwidth_limit(tp, th->th_ack);
 
                /*
                 * If all outstanding data is acked, stop retransmit
@@ -2445,6 +2447,8 @@
                delta -= tp->t_rttvar >> (TCP_RTTVAR_SHIFT - TCP_DELTA_SHIFT);
                if ((tp->t_rttvar += delta) <= 0)
                        tp->t_rttvar = 1;
+               if (tp->t_rttbest > tp->t_srtt + tp->t_rttvar)
+                   tp->t_rttbest = tp->t_srtt + tp->t_rttvar;
        } else {
                /*
                 * No rtt measurement yet - use the unsmoothed rtt.
@@ -2453,6 +2457,7 @@
                 */
                tp->t_srtt = rtt << TCP_RTT_SHIFT;
                tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT - 1);
+               tp->t_rttbest = tp->t_srtt + tp->t_rttvar;
        }
        tp->t_rtttime = 0;
        tp->t_rxtshift = 0;
@@ -2592,6 +2597,7 @@
                if (rt->rt_rmx.rmx_locks & RTV_RTT)
                        tp->t_rttmin = rtt / (RTM_RTTUNIT / hz);
                tp->t_srtt = rtt / (RTM_RTTUNIT / (hz * TCP_RTT_SCALE));
+               tp->t_rttbest = tp->t_srtt + TCP_RTT_SCALE;
                tcpstat.tcps_usedrtt++;
                if (rt->rt_rmx.rmx_rttvar) {
                        tp->t_rttvar = rt->rt_rmx.rmx_rttvar /
Index: netinet/tcp_output.c
===================================================================
RCS file: /home/ncvs/src/sys/netinet/tcp_output.c,v
retrieving revision 1.67
diff -u -r1.67 tcp_output.c
--- netinet/tcp_output.c        12 Aug 2002 03:22:46 -0000      1.67
+++ netinet/tcp_output.c        17 Aug 2002 02:24:01 -0000
@@ -168,6 +168,7 @@
        sendalot = 0;
        off = tp->snd_nxt - tp->snd_una;
        win = min(tp->snd_wnd, tp->snd_cwnd);
+       win = min(win, tp->snd_bwnd);
 
        flags = tcp_outflags[tp->t_state];
        /*
@@ -780,7 +781,7 @@
                        tp->snd_max = tp->snd_nxt;
                        /*
                         * Time this transmission if not a retransmission and
-                        * not currently timing anything.
+                        * not currently timing anything. 
                         */
                        if (tp->t_rtttime == 0) {
                                tp->t_rtttime = ticks;
Index: netinet/tcp_subr.c
===================================================================
RCS file: /home/ncvs/src/sys/netinet/tcp_subr.c,v
retrieving revision 1.140
diff -u -r1.140 tcp_subr.c
--- netinet/tcp_subr.c  1 Aug 2002 03:54:43 -0000       1.140
+++ netinet/tcp_subr.c  17 Aug 2002 02:24:01 -0000
@@ -146,6 +146,32 @@
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, isn_reseed_interval, CTLFLAG_RW,
     &tcp_isn_reseed_interval, 0, "Seconds between reseeding of ISN secret");
 
+static int     tcp_inflight_enable = 0;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, inflight_enable, CTLFLAG_RW,
+    &tcp_inflight_enable, 0, "Enable automatic TCP inflight data limiting");
+
+static int     tcp_inflight_debug = 1;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, inflight_debug, CTLFLAG_RW,
+    &tcp_inflight_debug, 0, "Debug TCP inflight calculations");
+
+static int     tcp_inflight_min = 1024;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, inflight_min, CTLFLAG_RW,
+    &tcp_inflight_min, 0, "Lower-bound for TCP inflight window");
+
+static int     tcp_inflight_max = TCP_MAXWIN << TCP_MAX_WINSHIFT;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, inflight_max, CTLFLAG_RW,
+    &tcp_inflight_max, 0, "Upper-bound for TCP inflight window");
+
+#if 0
+static int     tcp_inflight_attack = 20;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, inflight_attack, CTLFLAG_RW,
+    &tcp_inflight_attack, 0, "TCP inflight compensation attack rate (%)");
+
+static int     tcp_inflight_shift = 0;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, inflight_shift, CTLFLAG_RW,
+    &tcp_inflight_shift, 0, "TCP inflight compensation shift (+/-100) ");
+#endif
+
 static void    tcp_cleartaocache(void);
 static struct inpcb *tcp_notify(struct inpcb *, int);
 
@@ -566,8 +592,10 @@
        tp->t_rttmin = tcp_rexmit_min;
        tp->t_rxtcur = TCPTV_RTOBASE;
        tp->snd_cwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT;
+       tp->snd_bwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT;
        tp->snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT;
        tp->t_rcvtime = ticks;
+       tp->t_bw_rtttime = ticks;
         /*
         * IPv4 TTL initialization is necessary for an IPv6 socket as well,
         * because the socket may be bound to an IPv6 wildcard address,
@@ -1531,3 +1559,131 @@
 tcp_cleartaocache()
 {
 }
+
+/*
+ * This code attempts to calculate the bandwidth-delay product.
+ * The problem with calculating this product is that our manipulation
+ * of the congestion window modifies both the perceived bandwidth
+ * and the srtt.  It is possible to get a fairly stable maximal
+ * bandwidth by increasing the congestion window.  The bandwidth
+ * calculation will be fairly good even if bwnd is set very high.
+ * However, figuring out the minimal srtt is far more difficult
+ * because we do not want the TCP stream to suffer greatly and therefore
+ * cannot reduce the congestion window to something very small.
+ *
+ * What we do is first increase the congestion window to try to
+ * obtain a maximal (or at least a 'larger') bandwidth, then decrease
+ * the congestion window to try to obtain a minimal (or at least a 'smaller')
+ * rtt.  We also have to detect the case where BWND is too high and
+ * neither increasing nor decreasing it has the desired effect on the
+ * calculation.  By detecting this special case we can stabilize the
+ * algorithm and recalculate bwnd within a reasonable period of time.
+ *
+ * tp is the connection being updated and ack_seq is the sequence number
+ * just acknowledged by the peer.  On return tp->snd_bandwidth holds a
+ * decaying average of the measured throughput and tp->snd_bwnd holds the
+ * window limit derived from it.  When the inflight_enable sysctl is off,
+ * snd_bwnd is reset to the maximum window and snd_bandwidth to 0.
+ */
+void
+tcp_xmit_bandwidth_limit(struct tcpcb *tp, tcp_seq ack_seq)
+{
+       u_long bw;
+       u_long bwnd;
+       int save_ticks;
+       int delta_ticks;
+       int32_t delta_seq;
+
+       /*
+        * If inflight_enable is disabled in the middle of a tcp connection,
+        * make sure snd_bwnd is effectively disabled.
+        */
+       if (tcp_inflight_enable == 0) {
+               tp->snd_bwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT;
+               tp->snd_bandwidth = 0;
+               return;
+       }
+
+       /*
+        * Figure out the bandwidth.  Due to the tick granularity this
+        * is a very rough number and it MUST be averaged over a fairly
+        * long period of time.  Wait until at least one tick has elapsed
+        * since the previous sample so the divisor below is never zero.
+        */
+       save_ticks = ticks;
+       if ((u_int)(save_ticks - tp->t_bw_rtttime) < 1)
+               return;
+
+       delta_ticks = save_ticks - tp->t_bw_rtttime;
+       delta_seq = (int32_t)(ack_seq - tp->t_bw_rtseq);
+       tp->t_bw_rtttime = save_ticks;
+       tp->t_bw_rtseq = ack_seq;
+
+       /*
+        * Sequence arithmetic is unsigned, so an ACK that lies behind the
+        * reference point (for example when t_bw_rtseq was never seeded
+        * for this connection) would wrap into an enormous sample and
+        * poison the average.  The reference point has been resynchronized
+        * above; just discard such a sample.  The t_bw_rtttime == 0 test
+        * also skips the sample whenever the tick counter itself reads 0.
+        */
+       if (tp->t_bw_rtttime == 0 || delta_seq < 0)
+               return;
+       bw = (int64_t)delta_seq * hz / delta_ticks;
+       bw = ((int64_t)tp->snd_bandwidth * 15 + bw) >> 4;
+
+       tp->snd_bandwidth = bw;
+
+       /*
+        * Calculate the semi-static bandwidth delay product, plus two maximal
+        * segments.  The additional slop puts us squarely in the sweet
+        * spot and also handles the bandwidth run-up case.  Without the
+        * slop we could be locking ourselves into a lower bandwidth.
+        *
+        * Situations Handled:
+        *      (1) prevents over-queueing of packets on LANs, especially
+        *          high speed LANs, allowing larger TCP buffers to be
+        *          specified.
+        *
+        *      (2) able to handle increased network loads (bandwidth drops
+        *          so bwnd drops).
+        *
+        *      (3) Randomly changes the window size in order to force
+        *          bandwidth balancing between connections.
+        *          NOTE(review): no randomization is performed in this
+        *          function as posted -- confirm whether (3) still applies.
+        */
+#define USERTT ((tp->t_srtt + tp->t_rttbest) / 2)
+       bwnd = (int64_t)bw * USERTT / (hz << TCP_RTT_SHIFT) + 2 * tp->t_maxseg;
+#undef USERTT
+
+       /*
+        * Rate-limited debug trace.  ltime is function-static, so the
+        * limit is shared by every connection rather than kept per tcpcb.
+        */
+       if (tcp_inflight_debug > 0) {
+               static int ltime;
+               if ((u_int)(ticks - ltime) >= hz / tcp_inflight_debug) {
+                       ltime = ticks;
+                       printf("%p bw %lu rttbest %u srtt %d bwnd %lu\n",
+                           tp,
+                           bw,
+                           tp->t_rttbest,
+                           tp->t_srtt,
+                           bwnd
+                       );
+               }
+       }
+
+       /*
+        * Clamp to the sysctl bounds, but never go below two segments.
+        */
+       if ((long)bwnd < tcp_inflight_min)
+               bwnd = tcp_inflight_min;
+       if (bwnd > tcp_inflight_max)
+               bwnd = tcp_inflight_max;
+       if ((long)bwnd < tp->t_maxseg * 2)
+               bwnd = tp->t_maxseg * 2;
+       tp->snd_bwnd = bwnd;
+}
+
Index: netinet/tcp_usrreq.c
===================================================================
RCS file: /home/ncvs/src/sys/netinet/tcp_usrreq.c,v
retrieving revision 1.79
diff -u -r1.79 tcp_usrreq.c
--- netinet/tcp_usrreq.c        29 Jul 2002 09:01:39 -0000      1.79
+++ netinet/tcp_usrreq.c        17 Aug 2002 02:24:01 -0000
@@ -875,6 +875,7 @@
        tp->t_state = TCPS_SYN_SENT;
        callout_reset(tp->tt_keep, tcp_keepinit, tcp_timer_keep, tp);
        tp->iss = tcp_new_isn(tp);
+       tp->t_bw_rtseq = tp->iss;
        tcp_sendseqinit(tp);
 
        /*
@@ -961,6 +962,7 @@
        tp->t_state = TCPS_SYN_SENT;
        callout_reset(tp->tt_keep, tcp_keepinit, tcp_timer_keep, tp);
        tp->iss = tcp_new_isn(tp);
+       tp->t_bw_rtseq = tp->iss;
        tcp_sendseqinit(tp);
 
        /*
Index: netinet/tcp_var.h
===================================================================
RCS file: /home/ncvs/src/sys/netinet/tcp_var.h,v
retrieving revision 1.82
diff -u -r1.82 tcp_var.h
--- netinet/tcp_var.h   19 Jul 2002 18:27:39 -0000      1.82
+++ netinet/tcp_var.h   21 Jul 2002 02:26:36 -0000
@@ -124,10 +124,12 @@
 
        u_long  snd_wnd;                /* send window */
        u_long  snd_cwnd;               /* congestion-controlled window */
+       u_long  snd_bwnd;               /* bandwidth-controlled window */
        u_long  snd_ssthresh;           /* snd_cwnd size threshold for
                                         * for slow start exponential to
                                         * linear switch
                                         */
+       u_long  snd_bandwidth;          /* calculated bandwidth or 0 */
        tcp_seq snd_recover;            /* for use in fast recovery */
 
        u_int   t_maxopd;               /* mss plus options */
@@ -137,6 +139,9 @@
        int     t_rtttime;              /* round trip time */
        tcp_seq t_rtseq;                /* sequence number being timed */
 
+       int     t_bw_rtttime;           /* used for bandwidth calculation */
+       tcp_seq t_bw_rtseq;             /* used for bandwidth calculation */
+
        int     t_rxtcur;               /* current retransmit value (ticks) */
        u_int   t_maxseg;               /* maximum segment size */
        int     t_srtt;                 /* smoothed round-trip time */
@@ -144,6 +149,7 @@
 
        int     t_rxtshift;             /* log(2) of rexmt exp. backoff */
        u_int   t_rttmin;               /* minimum rtt allowed */
+       u_int   t_rttbest;              /* best rtt we've seen */
        u_long  t_rttupdated;           /* number of times rtt sampled */
        u_long  max_sndwnd;             /* largest window peer has offered */
 
@@ -473,6 +479,7 @@
 struct tcpcb *
         tcp_timers(struct tcpcb *, int);
 void    tcp_trace(int, int, struct tcpcb *, void *, struct tcphdr *, int);
+void    tcp_xmit_bandwidth_limit(struct tcpcb *tp, tcp_seq ack_seq);
 void    syncache_init(void);
 void    syncache_unreach(struct in_conninfo *, struct tcphdr *);
 int     syncache_expand(struct in_conninfo *, struct tcphdr *,

To Unsubscribe: send mail to [EMAIL PROTECTED]
with "unsubscribe freebsd-hackers" in the body of the message

Reply via email to