tp->rcv_wnd stores the advertised receive window, but later receive-side accounting needs to recover the hard memory budget that the window represented at the time it was advertised.
Prepare for that by storing the scaling basis alongside tp->rcv_wnd and centralizing the helper API around the paired state. While here, make the existing receive-memory arithmetic use the shared helper names so later behavioral changes can build on one explicit accounting model. This patch is groundwork only. Later patches will refresh the snapshot at window write sites and consume it in the receive-memory paths. Signed-off-by: Wesley Atwell <[email protected]> --- .../networking/net_cachelines/tcp_sock.rst | 1 + include/linux/tcp.h | 1 + include/net/tcp.h | 79 +++++++++++++++++-- net/ipv4/tcp.c | 1 + 4 files changed, 76 insertions(+), 6 deletions(-) diff --git a/Documentation/networking/net_cachelines/tcp_sock.rst b/Documentation/networking/net_cachelines/tcp_sock.rst index 563daea10d6c..1415981b9d8a 100644 --- a/Documentation/networking/net_cachelines/tcp_sock.rst +++ b/Documentation/networking/net_cachelines/tcp_sock.rst @@ -12,6 +12,7 @@ struct inet_connection_sock inet_conn u16 tcp_header_len read_mostly read_mostly tcp_bound_to_half_wnd,tcp_current_mss(tx);tcp_rcv_established(rx) u16 gso_segs read_mostly tcp_xmit_size_goal __be32 pred_flags read_write read_mostly tcp_select_window(tx);tcp_rcv_established(rx) +u8 rcv_wnd_scaling_ratio read_write read_mostly tcp_set_rcv_wnd,tcp_can_ingest,tcp_clamp_window u64 bytes_received read_write tcp_rcv_nxt_update(rx) u32 segs_in read_write tcp_v6_rcv(rx) u32 data_segs_in read_write tcp_v6_rcv(rx) diff --git a/include/linux/tcp.h b/include/linux/tcp.h index f72eef31fa23..ec6b70c1174b 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -297,6 +297,7 @@ struct tcp_sock { est_ecnfield:2,/* ECN field for AccECN delivered estimates */ accecn_opt_demand:2,/* Demand AccECN option for n next ACKs */ prev_ecnfield:2; /* ECN bits from the previous segment */ + u8 rcv_wnd_scaling_ratio; /* 0 if unknown, else tp->rcv_wnd basis */ __be32 pred_flags; u64 tcp_clock_cache; /* cache last tcp_clock_ns() (see 
tcp_mstamp_refresh()) */ u64 tcp_mstamp; /* most recent packet received/sent */ diff --git a/include/net/tcp.h b/include/net/tcp.h index 978eea2d5df0..187e6d660f62 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1702,6 +1702,26 @@ static inline int tcp_space_from_win(const struct sock *sk, int win) return __tcp_space_from_win(tcp_sk(sk)->scaling_ratio, win); } +static inline bool tcp_rcv_wnd_snapshot_valid(const struct tcp_sock *tp) +{ + return tp->rcv_wnd_scaling_ratio != 0; +} + +/* Rebuild hard receive-memory units for data already covered by tp->rcv_wnd if + * the advertise-time basis is known. Legacy TCP_REPAIR restores can only + * recover tp->rcv_wnd itself; callers must fall back when the snapshot is + * unknown. + */ +static inline bool tcp_space_from_rcv_wnd(const struct tcp_sock *tp, int win, + int *space) +{ + if (!tcp_rcv_wnd_snapshot_valid(tp)) + return false; + + *space = __tcp_space_from_win(tp->rcv_wnd_scaling_ratio, win); + return true; +} + /* Assume a 50% default for skb->len/skb->truesize ratio. * This may be adjusted later in tcp_measure_rcv_mss(). */ @@ -1709,15 +1729,62 @@ static inline int tcp_space_from_win(const struct sock *sk, int win) static inline void tcp_scaling_ratio_init(struct sock *sk) { - tcp_sk(sk)->scaling_ratio = TCP_DEFAULT_SCALING_RATIO; + struct tcp_sock *tp = tcp_sk(sk); + + tp->scaling_ratio = TCP_DEFAULT_SCALING_RATIO; + tp->rcv_wnd_scaling_ratio = TCP_DEFAULT_SCALING_RATIO; +} + +/* tp->rcv_wnd is paired with the scaling_ratio that was in force when that + * window was last advertised. Legacy TCP_REPAIR restores can only recover the + * window value itself and use a zero snapshot until a fresh local window + * advertisement refreshes the pair. 
+ */ +static inline void tcp_set_rcv_wnd_snapshot(struct tcp_sock *tp, u32 win, + u8 scaling_ratio) +{ + tp->rcv_wnd = win; + tp->rcv_wnd_scaling_ratio = scaling_ratio; +} + +static inline void tcp_set_rcv_wnd(struct tcp_sock *tp, u32 win) +{ + tcp_set_rcv_wnd_snapshot(tp, win, tp->scaling_ratio); +} + +static inline void tcp_set_rcv_wnd_unknown(struct tcp_sock *tp, u32 win) +{ + tcp_set_rcv_wnd_snapshot(tp, win, 0); +} + +/* TCP receive-side accounting reuses sk_rcvbuf as both a hard memory limit + * and as the source material for the advertised receive window after + * scaling_ratio conversion. Keep the byte accounting explicit so admission, + * pruning, and rwnd selection all start from the same quantities. + */ +static inline int tcp_rmem_used(const struct sock *sk) +{ + return atomic_read(&sk->sk_rmem_alloc); +} + +static inline int tcp_rmem_avail(const struct sock *sk) +{ + return READ_ONCE(sk->sk_rcvbuf) - tcp_rmem_used(sk); +} + +/* Sender-visible rwnd headroom also reserves bytes already queued on backlog. + * Those bytes are not free to advertise again until __release_sock() drains + * backlog and clears sk_backlog.len. 
+ */ +static inline int tcp_rwnd_avail(const struct sock *sk) +{ + return tcp_rmem_avail(sk) - READ_ONCE(sk->sk_backlog.len); } /* Note: caller must be prepared to deal with negative returns */ static inline int tcp_space(const struct sock *sk) { - return tcp_win_from_space(sk, READ_ONCE(sk->sk_rcvbuf) - - READ_ONCE(sk->sk_backlog.len) - - atomic_read(&sk->sk_rmem_alloc)); + return tcp_win_from_space(sk, tcp_rwnd_avail(sk)); } static inline int tcp_full_space(const struct sock *sk) @@ -1760,7 +1827,7 @@ static inline bool tcp_rmem_pressure(const struct sock *sk) rcvbuf = READ_ONCE(sk->sk_rcvbuf); threshold = rcvbuf - (rcvbuf >> 3); - return atomic_read(&sk->sk_rmem_alloc) > threshold; + return tcp_rmem_used(sk) > threshold; } static inline bool tcp_epollin_ready(const struct sock *sk, int target) @@ -1910,7 +1977,7 @@ static inline void tcp_fast_path_check(struct sock *sk) if (RB_EMPTY_ROOT(&tp->out_of_order_queue) && tp->rcv_wnd && - atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf && + tcp_rmem_avail(sk) > 0 && !tp->urg_data) tcp_fast_path_on(tp); } diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 202a4e57a218..cec9ae1bf875 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -5238,6 +5238,7 @@ static void __init tcp_struct_check(void) CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, received_ce); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, received_ecn_bytes); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, app_limited); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, rcv_wnd_scaling_ratio); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, rcv_wnd); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, rcv_tstamp); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, rx_opt); -- 2.34.1
