With this newly introduced TRACE_EVENT, it will be very easy to monitor TCP/IPv4 state transitions.
A new TRACE_SYSTEM named tcp is added, in which other TCP events can be traced as well. Two helpers are added: static inline void __tcp_set_state(struct sock *sk, int state) and static inline void __sk_state_store(struct sock *sk, int newstate). When performing a TCP/IPv4 state transition, we should use these two helpers or tcp_set_state() instead of assigning a value to sk_state directly. Signed-off-by: Yafang Shao <laoar.s...@gmail.com> --- include/net/tcp.h | 16 ++++++++++++ include/trace/events/tcp.h | 58 +++++++++++++++++++++++++++++++++++++++++ net/ipv4/inet_connection_sock.c | 9 ++++--- net/ipv4/inet_hashtables.c | 2 +- net/ipv4/tcp.c | 2 +- 5 files changed, 82 insertions(+), 5 deletions(-) create mode 100644 include/trace/events/tcp.h diff --git a/include/net/tcp.h b/include/net/tcp.h index 89974c5..a8336d3 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -49,6 +49,7 @@ #include <linux/bpf.h> #include <linux/filter.h> #include <linux/bpf-cgroup.h> +#include <trace/events/tcp.h> extern struct inet_hashinfo tcp_hashinfo; @@ -1284,6 +1285,21 @@ static inline bool tcp_checksum_complete(struct sk_buff *skb) #endif void tcp_set_state(struct sock *sk, int state); +/* + * To trace TCP state transition. 
+ */ +static inline void __tcp_set_state(struct sock *sk, int state) +{ + trace_tcp_set_state(sk, sk->sk_state, state); + sk->sk_state = state; +} + +static inline void __sk_state_store(struct sock *sk, int newstate) +{ + trace_tcp_set_state(sk, sk->sk_state, newstate); + sk_state_store(sk, newstate); +} + void tcp_done(struct sock *sk); int tcp_abort(struct sock *sk, int err); diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h new file mode 100644 index 0000000..abf65af --- /dev/null +++ b/include/trace/events/tcp.h @@ -0,0 +1,58 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM tcp + +#if !defined(_TRACE_TCP_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_TCP_H + +#include <linux/tracepoint.h> +#include <net/sock.h> +#include <net/inet_timewait_sock.h> +#include <net/request_sock.h> +#include <net/inet_sock.h> +#include <net/tcp_states.h> + +TRACE_EVENT(tcp_set_state, + TP_PROTO(struct sock *sk, int oldstate, int newstate), + TP_ARGS(sk, oldstate, newstate), + + TP_STRUCT__entry( + __field(__be32, dst) + __field(__be32, src) + __field(__u16, dport) + __field(__u16, sport) + __field(int, oldstate) + __field(int, newstate) + ), + + TP_fast_assign( + if (oldstate == TCP_TIME_WAIT) { + __entry->dst = inet_twsk(sk)->tw_daddr; + __entry->src = inet_twsk(sk)->tw_rcv_saddr; + __entry->dport = ntohs(inet_twsk(sk)->tw_dport); + __entry->sport = ntohs(inet_twsk(sk)->tw_sport); + } else if (oldstate == TCP_NEW_SYN_RECV) { + __entry->dst = inet_rsk(inet_reqsk(sk))->ir_rmt_addr; + __entry->src = inet_rsk(inet_reqsk(sk))->ir_loc_addr; + __entry->dport = + ntohs(inet_rsk(inet_reqsk(sk))->ir_rmt_port); + __entry->sport = inet_rsk(inet_reqsk(sk))->ir_num; + } else { + __entry->dst = inet_sk(sk)->inet_daddr; + __entry->src = inet_sk(sk)->inet_rcv_saddr; + __entry->dport = ntohs(inet_sk(sk)->inet_dport); + __entry->sport = ntohs(inet_sk(sk)->inet_sport); + } + + __entry->oldstate = oldstate; + __entry->newstate = newstate; + ), + + TP_printk("%08X:%04X 
%08X:%04X, %02x %02x", + __entry->src, __entry->sport, __entry->dst, __entry->dport, + __entry->oldstate, __entry->newstate) +); + +#endif + +/* This part must be outside protection */ +#include <trace/define_trace.h> diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index c039c93..307a046 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -27,6 +27,9 @@ #include <net/sock_reuseport.h> #include <net/addrconf.h> +#define CREATE_TRACE_POINTS +#include <trace/events/tcp.h> + #ifdef INET_CSK_DEBUG const char inet_csk_timer_bug_msg[] = "inet_csk BUG: unknown timer value\n"; EXPORT_SYMBOL(inet_csk_timer_bug_msg); @@ -786,7 +789,7 @@ struct sock *inet_csk_clone_lock(const struct sock *sk, if (newsk) { struct inet_connection_sock *newicsk = inet_csk(newsk); - newsk->sk_state = TCP_SYN_RECV; + __tcp_set_state(newsk, TCP_SYN_RECV); newicsk->icsk_bind_hash = NULL; inet_sk(newsk)->inet_dport = inet_rsk(req)->ir_rmt_port; @@ -880,7 +883,7 @@ int inet_csk_listen_start(struct sock *sk, int backlog) * It is OK, because this socket enters to hash table only * after validation is complete. 
*/ - sk_state_store(sk, TCP_LISTEN); + __sk_state_store(sk, TCP_LISTEN); if (!sk->sk_prot->get_port(sk, inet->inet_num)) { inet->inet_sport = htons(inet->inet_num); @@ -891,7 +894,7 @@ int inet_csk_listen_start(struct sock *sk, int backlog) return 0; } - sk->sk_state = TCP_CLOSE; + __tcp_set_state(sk, TCP_CLOSE); return err; } EXPORT_SYMBOL_GPL(inet_csk_listen_start); diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 597bb4c..0f45d456 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -430,7 +430,7 @@ bool inet_ehash_nolisten(struct sock *sk, struct sock *osk) sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); } else { percpu_counter_inc(sk->sk_prot->orphan_count); - sk->sk_state = TCP_CLOSE; + __tcp_set_state(sk, TCP_CLOSE); sock_set_flag(sk, SOCK_DEAD); inet_csk_destroy_sock(sk); } diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 5091402..984dce6 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2040,7 +2040,7 @@ void tcp_set_state(struct sock *sk, int state) /* Change state AFTER socket is unhashed to avoid closed * socket sitting in hash tables. */ - sk_state_store(sk, state); + __sk_state_store(sk, state); #ifdef STATE_TRACE SOCK_DEBUG(sk, "TCP sk=%p, State %s -> %s\n", sk, statename[oldstate], statename[state]); -- 1.8.3.1