Provide a sysctl called "tcp_tape_broken_windows" to disable limiting
the advertised TCP window to 32767 bytes when the remote has not sent
us a window scale option.

Long, long ago and far away, there were, it seems, some broken TCP stacks
which mistakenly treated the 16-bit TCP window field as a signed
rather than unsigned quantity.  If they ever received a window
advertisement > 32767 (i.e. more than 15 bits' worth) they would think
it was a negative number.  Prior to this patch, the code was using the
negotiation of the window scaling option as an indication that the
remote properly understood the TCP window field.  If no window scaling
option was received, it was assumed the remote TCP was broken with
regard to the unsigned nature of the TCP window field.

Since such stacks are rapidly fading into the mists of time, and since
it is perfectly legal and not entirely uncommon to use a TCP window up
to 65535 bytes when window scaling is not in use, we want to allow an
administrator to let his system advertise a window > 32767 and <=
65535 bytes even when the remote has not told us it is going to use
window scaling.  This is accomplished by setting
"tcp_tape_broken_windows" to a value of 0 rather than the default
value of 1.

In some basic netperf bidirectional tests, with a 2.6.12 initiator
configured to not use window scaling, to a 2.6.15 server with the
patch I see the following performance change:

tcp_tape_broken_windows=1 # default behaviour, matching pre-patch behaviour

loiter:/opt/netperf2_work# src/netperf -I 99,1 -i 30,3 -H 192.168.3.125 -t TCP_RR -l 10 -- -b 1 -r 32K -s 65535 -S 65535
TCP REQUEST/RESPONSE TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 192.168.3.125 (192.168.3.125) port 0 AF_INET : +/-0.5% @ 99% conf. : first burst 1
Local /Remote
Socket Size   Request  Resp.   Elapsed  Trans.
Send   Recv   Size     Size    Time     Rate
bytes  Bytes  bytes    bytes   secs.    per sec

131070 131070 32768    32768   10.00    2035.38
131070 131070

tcp_tape_broken_windows=0 # "new" behaviour, don't limit window

loiter:/opt/netperf2_work# src/netperf -I 99,1 -i 30,3 -H 192.168.3.125 -t TCP_RR -l 10 -- -b 1 -r 32K -s 65535 -S 65535
TCP REQUEST/RESPONSE TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 192.168.3.125 (192.168.3.125) port 0 AF_INET : +/-0.5% @ 99% conf. : first burst 1
Local /Remote
Socket Size   Request  Resp.   Elapsed  Trans.
Send   Recv   Size     Size    Time     Rate
bytes  Bytes  bytes    bytes   secs.    per sec

131070 131070 32768    32768   10.00    2128.91
131070 131070

I expect that had my 2.6.12 system also been patched, the performance
change would have been greater.  The improvement would also likely
grow with the RTT between the two systems.


Signed-off-by: Rick Jones <[EMAIL PROTECTED]>

--- linux-2.6-2.6.15/net/ipv4/tcp_output.c.orig 2006-01-02 19:21:10.000000000 -0800
+++ linux-2.6-2.6.15/net/ipv4/tcp_output.c      2006-03-01 17:46:16.000000000 -0800
@@ -45,6 +45,11 @@
 /* People can turn this off for buggy TCP's found in printers etc. */
 int sysctl_tcp_retrans_collapse = 1;
 
+/* People can turn this off to stop taping-over broken TCPs that 
+ * interpret the window field as a signed quantity.
+ */
+int sysctl_tcp_tape_broken_windows = 1;
+
 /* This limits the percentage of the congestion window which we
  * will allow a single TSO frame to consume.  Building TSO frames
  * which are too large can cause TCP streams to be bursty.
@@ -173,10 +178,15 @@ void tcp_select_initial_window(int __spa
        /* NOTE: offering an initial window larger than 32767
         * will break some buggy TCP stacks. We try to be nice.
         * If we are not window scaling, then this truncates
-        * our initial window offering to 32k. There should also
-        * be a sysctl option to stop being nice.
+        * our initial window offering to 32k-1 unless the admin
+        * has told us we don't have to cover for a buggy stack which
+        * treats the TCP window field as a signed quantity.
         */
-       (*rcv_wnd) = min(space, MAX_TCP_WINDOW);
+       if (sysctl_tcp_tape_broken_windows)
+               (*rcv_wnd) = min(space, MAX_TCP_WINDOW);
+       else
+               (*rcv_wnd) = space;
+
        (*rcv_wscale) = 0;
        if (wscale_ok) {
                /* Set window scaling on max possible window
@@ -235,7 +245,7 @@ static __inline__ u16 tcp_select_window(
        /* Make sure we do not exceed the maximum possible
         * scaled window.
         */
-       if (!tp->rx_opt.rcv_wscale)
+       if (!tp->rx_opt.rcv_wscale && sysctl_tcp_tape_broken_windows)
                new_win = min(new_win, MAX_TCP_WINDOW);
        else
                new_win = min(new_win, (65535U << tp->rx_opt.rcv_wscale));
--- linux-2.6-2.6.15/include/net/tcp.h.orig     2006-02-28 13:40:33.000000000 -0800
+++ linux-2.6-2.6.15/include/net/tcp.h  2006-03-01 17:46:10.000000000 -0800
@@ -219,6 +219,7 @@ extern int sysctl_tcp_nometrics_save;
 extern int sysctl_tcp_moderate_rcvbuf;
 extern int sysctl_tcp_tso_win_divisor;
 extern int sysctl_tcp_abc;
+extern int sysctl_tcp_tape_broken_windows;
 
 extern atomic_t tcp_memory_allocated;
 extern atomic_t tcp_sockets_allocated;
--- linux-2.6-2.6.15/include/linux/sysctl.h.orig        2006-02-28 13:40:46.000000000 -0800
+++ linux-2.6-2.6.15/include/linux/sysctl.h     2006-03-01 17:46:03.000000000 -0800
@@ -390,6 +390,7 @@ enum
        NET_IPV4_ICMP_ERRORS_USE_INBOUND_IFADDR=109,
        NET_TCP_CONG_CONTROL=110,
        NET_TCP_ABC=111,
+       NET_IPV4_TCP_TAPE_BROKEN_WINDOWS=112,
 };
 
 enum {
--- linux-2.6-2.6.15/net/ipv4/sysctl_net_ipv4.c.orig    2006-03-01 17:45:46.000000000 -0800
+++ linux-2.6-2.6.15/net/ipv4/sysctl_net_ipv4.c 2006-03-01 17:46:22.000000000 -0800
@@ -653,7 +653,14 @@ ctl_table ipv4_table[] = {
                .mode           = 0644,
                .proc_handler   = &proc_dointvec,
        },
-
+        {
+               .ctl_name       = NET_IPV4_TCP_TAPE_BROKEN_WINDOWS,
+               .procname       = "tcp_tape_broken_windows",
+               .data           = &sysctl_tcp_tape_broken_windows,
+               .maxlen         = sizeof(int),
+               .mode           = 0644,
+               .proc_handler   = &proc_dointvec
+       },
        { .ctl_name = 0 }
 };
 
--- linux-2.6-2.6.15/Documentation/networking/ip-sysctl.txt.orig        2006-01-02 19:21:10.000000000 -0800
+++ linux-2.6-2.6.15/Documentation/networking/ip-sysctl.txt     2006-03-01 17:44:37.000000000 -0800
@@ -332,6 +332,13 @@ somaxconn - INTEGER
        Defaults to 128.  See also tcp_max_syn_backlog for additional tuning
        for TCP sockets.
 
+tcp_tape_broken_windows - BOOLEAN
+       If set, assume a remote TCP that sends no window scaling option
+       is broken and treats the window as a signed quantity, and so
+       limit the window we advertise to it to 32767 bytes.  If unset,
+       advertise up to 65535 bytes even without window scaling.
+       Default: 1
+
 IP Variables:
 
 ip_local_port_range - 2 INTEGERS
-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Reply via email to