The existing mechanism for detecting thin streams (tcp_stream_is_thin) is based on a static limit of fewer than 4 packets in flight. This treats streams differently depending on the connection's RTT, such that a stream on a high-RTT link may never be considered thin, whereas the same application would produce a stream that would always be thin in a low-RTT scenario (e.g. data center).
By calculating a dynamic packets in flight limit (DPIFL), the thin stream detection will be independent of the RTT and treat streams equally based on the transmission pattern, i.e. the inter-transmission time (ITT). Cc: Andreas Petlund <apetl...@simula.no> Cc: Carsten Griwodz <gr...@simula.no> Cc: Pål Halvorsen <pa...@simula.no> Cc: Jonas Markussen <jona...@ifi.uio.no> Cc: Kristian Evensen <kristian.even...@gmail.com> Cc: Kenneth Klette Jonassen <kenne...@ifi.uio.no> Signed-off-by: Bendik Rønning Opstad <bro.devel+ker...@gmail.com> --- Documentation/networking/ip-sysctl.txt | 8 ++++++++ include/net/tcp.h | 21 +++++++++++++++++++++ net/ipv4/sysctl_net_ipv4.c | 9 +++++++++ net/ipv4/tcp.c | 2 ++ 4 files changed, 40 insertions(+) diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index 2ea4c45..938ae73 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -700,6 +700,14 @@ tcp_thin_dupack - BOOLEAN Documentation/networking/tcp-thin.txt Default: 0 +tcp_thin_dpifl_itt_lower_bound - INTEGER + Controls the lower bound inter-transmission time (ITT) threshold + for when a stream is considered thin. The value is specified in + microseconds, and may not be lower than 10000 (10 ms). Based on + this threshold, a dynamic packets in flight limit (DPIFL) is + calculated, which is used to classify whether a stream is thin. + Default: 10000 + tcp_limit_output_bytes - INTEGER Controls TCP Small Queue limit per tcp socket. TCP bulk sender tends to increase packets in flight until it diff --git a/include/net/tcp.h b/include/net/tcp.h index 4fc457b..deac96f 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -215,6 +215,8 @@ void tcp_time_wait(struct sock *sk, int state, int timeo); /* TCP thin-stream limits */ #define TCP_THIN_LINEAR_RETRIES 6 /* After 6 linear retries, do exp. 
backoff */ +/* Lowest possible DPIFL lower bound ITT is 10 ms (10000 usec) */ +#define TCP_THIN_DPIFL_ITT_LOWER_BOUND_MIN 10000 /* TCP initial congestion window as per draft-hkchu-tcpm-initcwnd-01 */ #define TCP_INIT_CWND 10 @@ -274,6 +276,7 @@ extern int sysctl_tcp_workaround_signed_windows; extern int sysctl_tcp_slow_start_after_idle; extern int sysctl_tcp_thin_linear_timeouts; extern int sysctl_tcp_thin_dupack; +extern int sysctl_tcp_thin_dpifl_itt_lower_bound; extern int sysctl_tcp_early_retrans; extern int sysctl_tcp_limit_output_bytes; extern int sysctl_tcp_challenge_ack_limit; @@ -1631,6 +1634,24 @@ static inline bool tcp_stream_is_thin(struct tcp_sock *tp) return tp->packets_out < 4 && !tcp_in_initial_slowstart(tp); } +/** + * tcp_stream_is_thin_dpifl() - Tests if the stream is thin based on dynamic PIF + * limit + * @tp: the tcp_sock struct + * + * Return: true if current packets in flight (PIF) count is lower than + * the dynamic PIF limit, else false + */ +static inline bool tcp_stream_is_thin_dpifl(const struct tcp_sock *tp) +{ + /* Calculate the maximum allowed PIF limit by dividing the RTT by + * the minimum allowed inter-transmission time (ITT). 
+ * Tests if PIF < RTT / ITT-lower-bound + */ + return (u64) tcp_packets_in_flight(tp) * + sysctl_tcp_thin_dpifl_itt_lower_bound < (tp->srtt_us >> 3); +} + /* /proc */ enum tcp_seq_states { TCP_SEQ_STATE_LISTENING, diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index a0bd7a5..5b12446 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -42,6 +42,7 @@ static int tcp_syn_retries_min = 1; static int tcp_syn_retries_max = MAX_TCP_SYNCNT; static int ip_ping_group_range_min[] = { 0, 0 }; static int ip_ping_group_range_max[] = { GID_T_MAX, GID_T_MAX }; +static int tcp_thin_dpifl_itt_lower_bound_min = TCP_THIN_DPIFL_ITT_LOWER_BOUND_MIN; /* Update system visible IP port range */ static void set_local_port_range(struct net *net, int range[2]) @@ -709,6 +710,14 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec }, { + .procname = "tcp_thin_dpifl_itt_lower_bound", + .data = &sysctl_tcp_thin_dpifl_itt_lower_bound, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .extra1 = &tcp_thin_dpifl_itt_lower_bound_min, + }, + { .procname = "tcp_early_retrans", .data = &sysctl_tcp_early_retrans, .maxlen = sizeof(int), diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index c172877..cb3354d 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -287,6 +287,8 @@ int sysctl_tcp_min_tso_segs __read_mostly = 2; int sysctl_tcp_autocorking __read_mostly = 1; +int sysctl_tcp_thin_dpifl_itt_lower_bound __read_mostly = TCP_THIN_DPIFL_ITT_LOWER_BOUND_MIN; + struct percpu_counter tcp_orphan_count; EXPORT_SYMBOL_GPL(tcp_orphan_count); -- 1.9.1 -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html