On 9/27/19 10:25 AM, Jonathan Maxwell wrote: > Acked-by: Jon Maxwell <jmaxwel...@gmail.com> > > Thanks for fixing that Eric. >
The patch seems to do the job. Tested-by: Marek Majkowski <ma...@cloudflare.com> Here's a selftest: ---8<--- From: Marek Majkowski <ma...@cloudflare.com> Date: Fri, 27 Sep 2019 13:37:52 +0200 Subject: [PATCH] selftests/net: TCP_USER_TIMEOUT in SYN-SENT state Test the TCP_USER_TIMEOUT behavior, overriding TCP_SYNCNT when socket is in SYN-SENT state. Signed-off-by: Marek Majkowski <ma...@cloudflare.com> --- tools/testing/selftests/net/.gitignore | 1 + tools/testing/selftests/net/Makefile | 3 +- .../selftests/net/tcp_user_timeout_syn_sent.c | 322 ++++++++++++++++++ .../net/tcp_user_timeout_syn_sent.sh | 4 + 4 files changed, 329 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/net/tcp_user_timeout_syn_sent.c create mode 100755 tools/testing/selftests/net/tcp_user_timeout_syn_sent.sh diff --git a/tools/testing/selftests/net/.gitignore b/tools/testing/selftests/net/.gitignore index c7cced739c34..bc6a2b7199b6 100644 --- a/tools/testing/selftests/net/.gitignore +++ b/tools/testing/selftests/net/.gitignore @@ -21,3 +21,4 @@ ipv6_flowlabel ipv6_flowlabel_mgr so_txtime tcp_fastopen_backup_key +tcp_user_timeout_syn_sent diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile index 0bd6b23c97ef..065a171b8834 100644 --- a/tools/testing/selftests/net/Makefile +++ b/tools/testing/selftests/net/Makefile @@ -11,13 +11,14 @@ TEST_PROGS += udpgso_bench.sh fib_rule_tests.sh msg_zerocopy.sh psock_snd.sh TEST_PROGS += udpgro_bench.sh udpgro.sh test_vxlan_under_vrf.sh reuseport_addr_any.sh TEST_PROGS += test_vxlan_fdb_changelink.sh so_txtime.sh ipv6_flowlabel.sh TEST_PROGS += tcp_fastopen_backup_key.sh fcnal-test.sh l2tp.sh +TEST_PROGS += tcp_user_timeout_syn_sent.sh TEST_PROGS_EXTENDED := in_netns.sh TEST_GEN_FILES = socket nettest TEST_GEN_FILES += psock_fanout psock_tpacket msg_zerocopy reuseport_addr_any TEST_GEN_FILES += tcp_mmap tcp_inq psock_snd txring_overwrite TEST_GEN_FILES += udpgso udpgso_bench_tx udpgso_bench_rx ip_defrag TEST_GEN_FILES += so_txtime ipv6_flowlabel ipv6_flowlabel_mgr -TEST_GEN_FILES += tcp_fastopen_backup_key +TEST_GEN_FILES += tcp_fastopen_backup_key tcp_user_timeout_syn_sent TEST_GEN_PROGS = reuseport_bpf reuseport_bpf_cpu reuseport_bpf_numa TEST_GEN_PROGS += reuseport_dualstack reuseaddr_conflict tls diff --git a/tools/testing/selftests/net/tcp_user_timeout_syn_sent.c b/tools/testing/selftests/net/tcp_user_timeout_syn_sent.c new file mode 100644 index 000000000000..1c9ec582359a --- /dev/null +++ b/tools/testing/selftests/net/tcp_user_timeout_syn_sent.c @@ -0,0 +1,322 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Testing if TCP_USER_TIMEOUT on SYN-SENT state overides TCP_SYNCNT. + * + * Historically, TCP_USER_TIMEOUT made only sense on synchronized TCP + * states, like ESTABLISHED. There was a bug on SYN-SENT state: with + * TCP_USER_TIMEOUT set, the connect() would ETIMEDOUT after given + * time, but near the end of the timer would flood SYN packets to + * fulfill the TCP_SYNCNT counter. For example for 2000ms user + * timeout and default TCP_SYNCNT=6, the tcpdump would look like: + * + * 00:00.000000 IP 127.0.0.1.1 > 127.0.0.1.2: Flags [S] + * 00:01.029452 IP 127.0.0.1.1 > 127.0.0.1.2: Flags [S] + * 00:02.021354 IP 127.0.0.1.1 > 127.0.0.1.2: Flags [S] + * 00:02.033419 IP 127.0.0.1.1 > 127.0.0.1.2: Flags [S] + * 00:02.041633 IP 127.0.0.1.1 > 127.0.0.1.2: Flags [S] + * 00:02.049263 IP 127.0.0.1.1 > 127.0.0.1.2: Flags [S] + * 00:02.057264 IP 127.0.0.1.1 > 127.0.0.1.2: Flags [S] + * + * Notice, 5 out of 6 retransmissions are aligned to 2s. We fixed + * that, and this code tests for the regression. We do this by + * actively dropping SYN packets on listen socket with ebpf + * SOCKET_FILTER, and counting how many packets did we drop. + * + * See: https://blog.cloudflare.com/when-tcp-sockets-refuse-to-die/ + */ +#include <arpa/inet.h> +#include <errno.h> +#include <error.h> +#include <linux/bpf.h> +#include <linux/tcp.h> +#include <linux/unistd.h> +#include <netinet/in.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/socket.h> +#include <unistd.h> + +int bpf_create_map(enum bpf_map_type map_type, int key_size, int value_size, + int max_entries, uint32_t map_flags) +{ + union bpf_attr attr = {}; + + attr.map_type = map_type; + attr.key_size = key_size; + attr.value_size = value_size; + attr.max_entries = max_entries; + attr.map_flags = map_flags; + return syscall(__NR_bpf, BPF_MAP_CREATE, &attr, sizeof(attr)); +} + +int bpf_load_program(enum bpf_prog_type prog_type, const struct bpf_insn *insns, + size_t insns_cnt, const char *license, + uint32_t kern_version) +{ + union bpf_attr attr = {}; + + attr.prog_type = prog_type; + attr.insns = (long)insns; + attr.insn_cnt = insns_cnt; + attr.license = (long)license; + attr.log_buf = (long)NULL; + attr.log_size = 0; + attr.log_level = 0; + attr.kern_version = kern_version; + + int fd = syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr)); + return fd; +} + +int bpf_map_update_elem(int fd, const void *key, const void *value, + uint64_t flags) +{ + union bpf_attr attr = {}; + + attr.map_fd = fd; + attr.key = (long)key; + attr.value = (long)value; + attr.flags = flags; + + return syscall(__NR_bpf, BPF_MAP_UPDATE_ELEM, &attr, sizeof(attr)); +} + +int bpf_map_lookup_elem(int fd, const void *key, void *value) +{ + union bpf_attr attr = {}; + + attr.map_fd = fd; + attr.key = (long)key; + attr.value = (long)value; + + return syscall(__NR_bpf, BPF_MAP_LOOKUP_ELEM, &attr, sizeof(attr)); +} + +/* +struct bpf_map_def SEC("maps") stats_map = { + .type = BPF_MAP_TYPE_ARRAY, + .key_size = sizeof(uint32_t), + .value_size = sizeof(uint64_t), + .max_entries = 2, +}; + +SEC("socket_filter") +int _socket_filter(struct __sk_buff *skb) +{ + (void)skb; + + uint32_t no = 0; + uint64_t *value = bpf_map_lookup_elem(&stats_map, &no); + if (value) { + __sync_fetch_and_add(value, 1); + } + return 0; // DROP inbound SYN packets +} + */ + +size_t bpf_insn_socket_filter_cnt = 12; +struct bpf_insn bpf_insn_socket_filter[] = { + { + .code = 0xb7, + .dst_reg = BPF_REG_1, + .src_reg = BPF_REG_0, + .off = 0, + .imm = 0 /**/ + }, + { + .code = 0x63, + .dst_reg = BPF_REG_10, + .src_reg = BPF_REG_1, + .off = -4, + .imm = 0 /**/ + }, + { + .code = 0xbf, + .dst_reg = BPF_REG_2, + .src_reg = BPF_REG_10, + .off = 0, + .imm = 0 /**/ + }, + { + .code = 0x7, + .dst_reg = BPF_REG_2, + .src_reg = BPF_REG_0, + .off = 0, + .imm = -4 /**/ + }, + { + .code = 0x18, + .dst_reg = BPF_REG_1, + .src_reg = BPF_REG_0, + .off = 0, + .imm = 0 /* relocation for stats_map */ + }, + { + .code = 0x0, + .dst_reg = BPF_REG_0, + .src_reg = BPF_REG_0, + .off = 0, + .imm = 0 /**/ + }, + { + .code = 0x85, + .dst_reg = BPF_REG_0, + .src_reg = BPF_REG_0, + .off = 0, + .imm = 1 /**/ + }, + { + .code = 0x15, + .dst_reg = BPF_REG_0, + .src_reg = BPF_REG_0, + .off = 2, + .imm = 0 /**/ + }, + { + .code = 0xb7, + .dst_reg = BPF_REG_1, + .src_reg = BPF_REG_0, + .off = 0, + .imm = 1 /**/ + }, + { + .code = 0xdb, + .dst_reg = BPF_REG_0, + .src_reg = BPF_REG_1, + .off = 0, + .imm = 0 /**/ + }, + { + .code = 0xb7, + .dst_reg = BPF_REG_0, + .src_reg = BPF_REG_0, + .off = 0, + .imm = 0 /**/ + }, + { + .code = 0x95, + .dst_reg = BPF_REG_0, + .src_reg = BPF_REG_0, + .off = 0, + .imm = 0 /**/ + } +}; + +void socket_filter_fill_stats_map(int fd) +{ + bpf_insn_socket_filter[4].src_reg = BPF_REG_1; + bpf_insn_socket_filter[4].imm = fd; +} + +static int net_setup_ebpf(int sd) +{ + int stats_map, bpf, r; + + stats_map = bpf_create_map(BPF_MAP_TYPE_ARRAY, sizeof(uint32_t), + sizeof(uint64_t), 2, 0); + if (stats_map < 0) + error(1, errno, "bpf"); + + socket_filter_fill_stats_map(stats_map); + + bpf = bpf_load_program(BPF_PROG_TYPE_SOCKET_FILTER, + bpf_insn_socket_filter, + bpf_insn_socket_filter_cnt, "Dual BSD/GPL", 0); + if (bpf < 0) + error(1, errno, "bpf"); + + r = setsockopt(sd, SOL_SOCKET, SO_ATTACH_BPF, &bpf, sizeof(bpf)); + if (r < 0) + error(1, errno, "setsockopt(SO_ATTACH_FILTER)"); + + return stats_map; +} + +static int setup_server(struct sockaddr_in *addr) +{ + int sd, r; + socklen_t addr_sz; + + sd = socket(AF_INET, SOCK_STREAM, 0); + if (sd < 0) + error(1, errno, "socket()"); + + r = bind(sd, (struct sockaddr *)addr, sizeof(*addr)); + if (r != 0) + error(1, errno, "bind()"); + + r = listen(sd, 16); + if (r != 0) + error(1, errno, "listen()"); + + addr_sz = sizeof(*addr); + r = getsockname(sd, (struct sockaddr *)addr, &addr_sz); + if (r != 0) + error(1, errno, "getsockname()"); + + return sd; +} + +int main(void) +{ + struct sockaddr_in addr = { + .sin_family = AF_INET, + .sin_addr = { inet_addr("127.0.0.1") }, + }; + + int sd = setup_server(&addr); + int stats_map = net_setup_ebpf(sd); + struct { + int user_timeout; + int expected_counter; + } tests[] = { + { 200, 2 }, // TCP_USER_TIMEOUT kicks in on first retranmission + { 1500, 2 }, + { 3500, 3 }, + { -1, -1 }, + }; + + int failed = 0, i; + + for (i = 0; tests[i].user_timeout >= 0; i++) { + int r, cd, v; + uint32_t k = 0; + uint64_t counter = 0; + + r = bpf_map_update_elem(stats_map, &k, &counter, 0); + + cd = socket(AF_INET, SOCK_STREAM, 0); + if (cd < 0) + error(1, errno, "socket()"); + + v = tests[i].user_timeout; + r = setsockopt(cd, IPPROTO_TCP, TCP_USER_TIMEOUT, &v, + sizeof(v)); + if (r != 0) + error(1, errno, "setsockopt()"); + + r = connect(cd, (struct sockaddr *)&addr, sizeof(addr)); + if (r != -1 && errno != ETIMEDOUT) + error(1, errno, "connect()"); + + r = bpf_map_lookup_elem(stats_map, &k, &counter); + if (r != 0) + error(1, errno, "bpf_map_lookup_elem()"); + + if ((int)counter != tests[i].expected_counter) { + failed += 1; + printf("[!] Expecting %d SYN packets on " + "TCP_USER_TIMEOUT=%d, got %d\n", + tests[i].expected_counter, tests[i].user_timeout, + (int)counter); + } + close(cd); + } + close(sd); + close(stats_map); + if (failed == 0) + fprintf(stderr, "PASSED\n"); + return failed; +} diff --git a/tools/testing/selftests/net/tcp_user_timeout_syn_sent.sh b/tools/testing/selftests/net/tcp_user_timeout_syn_sent.sh new file mode 100755 index 000000000000..26765f3a92c6 --- /dev/null +++ b/tools/testing/selftests/net/tcp_user_timeout_syn_sent.sh @@ -0,0 +1,4 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0 + +./in_netns.sh ./tcp_user_timeout_syn_sent -- 2.17.1