The branch main has been updated by rrs: URL: https://cgit.FreeBSD.org/src/commit/?id=138e74ceadb237cdd42fcda11ddef18b509f571d
commit 138e74ceadb237cdd42fcda11ddef18b509f571d Author: Randall Stewart <[email protected]> AuthorDate: 2026-01-05 16:30:22 +0000 Commit: Randall Stewart <[email protected]> CommitDate: 2026-01-05 16:30:22 +0000 TCP Stacks, Improve rack to better handle reordering With a recent bug in the igb (and a few other) driver LRO mis-queuing, rack did things ok, better than the base stack, due to the rack reordering protections in rack, but there was still room for improvements. When a series of packets are completely mis-ordered you often times can get the acks shortly after you have entered recovery and retransmitted the first of the packets indicated in the sack stream. Then the cum-ack arrives basically acking all those packets. If you look at the time from when you sent the packet to when the ack came back you can quickly determine that the ack was not to what you just transmitted but instead was original and you had a completely false recovery entry. Dropping out of that you can then restore the congestion state and continue on your way. The Dup-acks that also arrive help increase your reordering windows which makes you less likely to repeat the scenario. Differential Revision:<https://reviews.freebsd.org/D53832> --- sys/netinet/tcp_stacks/rack.c | 94 +++++++++++++++++++++++++++++++++++---- sys/netinet/tcp_stacks/tcp_rack.h | 1 + 2 files changed, 87 insertions(+), 8 deletions(-) diff --git a/sys/netinet/tcp_stacks/rack.c b/sys/netinet/tcp_stacks/rack.c index 76bf8f2e3b17..f55abbf919b7 100644 --- a/sys/netinet/tcp_stacks/rack.c +++ b/sys/netinet/tcp_stacks/rack.c @@ -204,6 +204,7 @@ static int32_t rack_dnd_default = 0; /* For rr_conf = 3, what is the default fo static int32_t rack_rxt_controls = 0; static int32_t rack_fill_cw_state = 0; static uint8_t rack_req_measurements = 1; +static uint32_t rack_rtt_divisor = 2; static int32_t rack_enable_hw_pacing = 0; /* Due to CCSP keep it off by default */ static int32_t rack_hw_rate_caps = 0; /* 1; */ static int32_t rack_hw_rate_cap_per = 0; /* 0 -- off */ @@ -497,7 +498,7 @@ static uint64_t rack_get_gp_est(struct tcp_rack *rack); static void rack_log_sack_passed(struct tcpcb *tp, struct tcp_rack *rack, - struct rack_sendmap *rsm, uint32_t cts); + struct rack_sendmap *rsm, uint32_t cts, int line); static void rack_log_to_event(struct tcp_rack *rack, int32_t to_num, struct rack_sendmap *rsm); static int32_t rack_output(struct tcpcb *tp); @@ -1349,6 +1350,11 @@ rack_init_sysctls(void) OID_AUTO, "reset_ssth_rec_rto", CTLFLAG_RW, &rack_ssthresh_rest_rto_rec, 0, "When doing recovery -> rto -> recovery do we reset SSthresh?"); + SYSCTL_ADD_U32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_timers), + OID_AUTO, "rtt_divisor", CTLFLAG_RW, + &rack_rtt_divisor, 2, + "When calculating the rtt threshold what 1/N is a rtt that indicates reordering"); SYSCTL_ADD_U32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_timers), OID_AUTO, "scoreboard_thresh", CTLFLAG_RW, @@ -2663,6 +2669,8 @@ rack_log_map_chg(struct tcpcb *tp, struct tcp_rack *rack, log.u_bbr.cur_del_rate = (uintptr_t)prev; log.u_bbr.delRate = (uintptr_t)rsm; log.u_bbr.rttProp = (uintptr_t)next; + if (rsm) + log.u_bbr.flex1 = rsm->r_flags; log.u_bbr.flex7 = 0; if (prev) { log.u_bbr.flex1 = prev->r_start; @@ -5584,6 +5592,7 @@ rack_cong_signal(struct tcpcb *tp, uint32_t type, uint32_t ack, int line) rack->r_ctl.rc_prr_delivered = 0; rack->r_ctl.rc_prr_out = 0; rack->r_fast_output = 0; + rack->r_ctl.recovery_rxt_cnt = 0; if (rack->rack_no_prr == 0) { rack->r_ctl.rc_prr_sndcnt = ctf_fixed_maxseg(tp); rack_log_to_prr(rack, 2, in_rec_at_entry, line); @@ -7416,6 +7425,7 @@ rack_remxt_tmr(struct tcpcb *tp) */ TAILQ_INIT(&rack->r_ctl.rc_tmap); + rack->r_ctl.recovery_rxt_cnt = 0; TQHASH_FOREACH(rsm, rack->r_ctl.tqh) { rsm->r_dupack = 0; if (rack_verbose_logging) @@ -8042,6 +8052,7 @@ rack_update_rsm(struct tcpcb *tp, struct tcp_rack *rack, /* We have retransmitted due to the SACK pass */ rsm->r_flags &= ~RACK_SACK_PASSED; rsm->r_flags |= RACK_WAS_SACKPASS; + rack->r_ctl.recovery_rxt_cnt += (rsm->r_end - rsm->r_start); } } @@ -8948,7 +8959,7 @@ ts_not_found: */ static void rack_log_sack_passed(struct tcpcb *tp, - struct tcp_rack *rack, struct rack_sendmap *rsm, uint32_t cts) + struct tcp_rack *rack, struct rack_sendmap *rsm, uint32_t cts, int line) { struct rack_sendmap *nrsm; uint32_t thresh; @@ -8997,6 +9008,7 @@ rack_log_sack_passed(struct tcpcb *tp, */ break; } + rack_log_dsack_event(rack, 12, __LINE__, nrsm->r_start, nrsm->r_end); nrsm->r_flags |= RACK_SACK_PASSED; nrsm->r_flags &= ~RACK_WAS_SACKPASS; } @@ -9172,6 +9184,39 @@ is_rsm_inside_declared_tlp_block(struct tcp_rack *rack, struct rack_sendmap *rsm return (1); } + +static int +rack_check_reorder_ack(struct tcpcb *tp, struct tcp_rack *rack, struct rack_sendmap *rsm, int the_end, uint32_t cts, int can_exit_recovery, int line) +{ + if ((rack_rtt_divisor > 0) && + (rsm->r_rtr_cnt == 2) && + IN_RECOVERY(tp->t_flags) && + (rsm->r_flags & RACK_WAS_SACKPASS)){ + uint32_t fractional, snt_to_ack; + + fractional = (tp->t_srtt / rack_rtt_divisor); + if (fractional == 0) + fractional = 1; + snt_to_ack = cts - (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)]; + if (snt_to_ack <= fractional) { + rack->r_ctl.rc_reorder_ts = cts; + KASSERT((rack->r_ctl.recovery_rxt_cnt >= (the_end - rsm->r_start)), + ("rsm:%p rack:%p recovery_rxt_cnt would go negative recovery_rxt_cnt:%u sub:%u", rsm, rack, rack->r_ctl.recovery_rxt_cnt, (the_end - rsm->r_start))); + rack->r_ctl.recovery_rxt_cnt -= (the_end - rsm->r_start); + rack_log_to_prr(rack, 18, rack->r_ctl.recovery_rxt_cnt, line); + if (can_exit_recovery && (rack->r_ctl.recovery_rxt_cnt == 0)) { + tp->snd_ssthresh = rack->r_ctl.rc_ssthresh_at_erec; + rack_exit_recovery(tp, rack, 4); + rack->r_might_revert = 0; + rack->r_ctl.retran_during_recovery = 0; + rack_log_to_prr(rack, 17, snt_to_ack, line); + } + return (1); + } + } + return (0); +} + static uint32_t rack_proc_sack_blk(struct tcpcb *tp, struct tcp_rack *rack, struct sackblk *sack, struct tcpopt *to, struct rack_sendmap **prsm, uint32_t cts, @@ -9183,6 +9228,7 @@ rack_proc_sack_blk(struct tcpcb *tp, struct tcp_rack *rack, struct sackblk *sack int insret __diagused; int32_t used_ref = 1; int can_use_hookery = 0; + int prohibit_marking = 0; start = sack->start; end = sack->end; @@ -9273,6 +9319,8 @@ do_rest_ofb: (rsm->bindex == next->bindex) && ((rsm->r_flags & RACK_STRADDLE) == 0) && ((next->r_flags & RACK_STRADDLE) == 0) && + ((rsm->r_flags & RACK_WAS_SACKPASS) == 0) && + ((next->r_flags & RACK_WAS_SACKPASS) == 0) && ((rsm->r_flags & RACK_IS_PCM) == 0) && ((next->r_flags & RACK_IS_PCM) == 0) && (rsm->r_flags & RACK_IN_GP_WIN) && @@ -9345,6 +9393,8 @@ do_rest_ofb: rack_log_retran_reason(rack, rsm, __LINE__, 0, 2); /* Now lets make sure our fudge block is right */ nrsm->r_start = start; + /* Check if the ack was too soon i.e. reordering + ack arrives too quickly */ + prohibit_marking = rack_check_reorder_ack(tp, rack, nrsm, nrsm->r_end, cts, 0, __LINE__); /* Now lets update all the stats and such */ rack_update_rtt(tp, rack, nrsm, to, cts, SACKED, 0); if (rack->app_limited_needs_set) @@ -9388,8 +9438,8 @@ do_rest_ofb: * Now that we have the next * one walk backwards from there. */ - if (nrsm && nrsm->r_in_tmap) - rack_log_sack_passed(tp, rack, nrsm, cts); + if (nrsm && nrsm->r_in_tmap && (prohibit_marking == 0)) + rack_log_sack_passed(tp, rack, nrsm, cts, __LINE__); } /* Now are we done? */ if (SEQ_LT(end, next->r_end) || @@ -9445,6 +9495,8 @@ do_rest_ofb: } rack_log_map_chg(tp, rack, NULL, rsm, nrsm, MAP_SACK_M2, end, __LINE__); rsm->r_flags &= (~RACK_HAS_FIN); + /* Check if the ack was too soon i.e. reordering + ack arrives too quickly */ + prohibit_marking = rack_check_reorder_ack(tp, rack, nrsm, nrsm->r_end, cts, 0, __LINE__); /* Position us to point to the new nrsm that starts the sack blk */ rsm = nrsm; } @@ -9521,6 +9573,8 @@ do_rest_ofb: } rack_update_rtt(tp, rack, rsm, to, cts, SACKED, 0); changed += (rsm->r_end - rsm->r_start); + /* Check if the ack was too soon i.e. reordering + ack arrives too quickly */ + prohibit_marking = rack_check_reorder_ack(tp, rack, rsm, rsm->r_end, cts, 0, __LINE__); /* You get a count for acking a whole segment or more */ if (rsm->r_flags & RACK_WAS_LOST) { /* @@ -9530,8 +9584,9 @@ do_rest_ofb: rack_mark_nolonger_lost(rack, rsm); } rack->r_ctl.rc_sacked += (rsm->r_end - rsm->r_start); - if (rsm->r_in_tmap) /* should be true */ - rack_log_sack_passed(tp, rack, rsm, cts); + if (rsm->r_in_tmap && (prohibit_marking == 0)) /* should be true */ + rack_log_sack_passed(tp, rack, rsm, cts, __LINE__); + /* Is Reordering occuring? */ if (rsm->r_flags & RACK_SACK_PASSED) { rsm->r_flags &= ~RACK_SACK_PASSED; @@ -9620,6 +9675,8 @@ do_rest_ofb: (rsm->bindex == prev->bindex) && ((rsm->r_flags & RACK_STRADDLE) == 0) && ((prev->r_flags & RACK_STRADDLE) == 0) && + ((prev->r_flags & RACK_WAS_SACKPASS) == 0) && + ((rsm->r_flags & RACK_WAS_SACKPASS) == 0) && ((rsm->r_flags & RACK_IS_PCM) == 0) && ((prev->r_flags & RACK_IS_PCM) == 0) && (rsm->r_flags & RACK_IN_GP_WIN) && @@ -9658,6 +9715,8 @@ do_rest_ofb: */ nrsm->r_end = end; rsm->r_dupack = 0; + /* Check if the ack was too soon i.e. reordering + ack arrives too quickly */ + prohibit_marking = rack_check_reorder_ack(tp, rack, nrsm, nrsm->r_end, cts, 0, __LINE__); /* * Which timestamp do we keep? It is rather * important in GP measurements to have the @@ -9807,6 +9866,8 @@ do_rest_ofb: nrsm->r_in_tmap = 1; } nrsm->r_dupack = 0; + /* Check if the ack was too soon i.e. reordering + ack arrives too quickly */ + prohibit_marking = rack_check_reorder_ack(tp, rack, nrsm, nrsm->r_end, cts, 0, __LINE__); rack_log_retran_reason(rack, nrsm, __LINE__, 0, 2); rack_update_rtt(tp, rack, rsm, to, cts, SACKED, 0); changed += (rsm->r_end - rsm->r_start); @@ -9818,8 +9879,8 @@ do_rest_ofb: } rack->r_ctl.rc_sacked += (rsm->r_end - rsm->r_start); - if (rsm->r_in_tmap) /* should be true */ - rack_log_sack_passed(tp, rack, rsm, cts); + if (rsm->r_in_tmap && (prohibit_marking == 0)) /* should be true */ + rack_log_sack_passed(tp, rack, rsm, cts, __LINE__); /* Is Reordering occuring? */ if (rsm->r_flags & RACK_SACK_PASSED) { rsm->r_flags &= ~RACK_SACK_PASSED; @@ -9857,6 +9918,10 @@ out: ((rsm->r_flags & RACK_IN_GP_WIN) == 0)) { break; } + /* We can't merge retransmitted with sack-pass set */ + if ((rsm->r_flags & RACK_WAS_SACKPASS) || + (next->r_flags & RACK_WAS_SACKPASS)) + break; if ((rsm->r_flags & RACK_IN_GP_WIN) && ((next->r_flags & RACK_IN_GP_WIN) == 0)) { break; @@ -9888,6 +9953,10 @@ out: ((rsm->r_flags & RACK_IN_GP_WIN) == 0)) { break; } + /* We can't merge retransmitted with sack-pass set */ + if ((rsm->r_flags & RACK_WAS_SACKPASS) || + (prev->r_flags & RACK_WAS_SACKPASS)) + break; if ((rsm->r_flags & RACK_IN_GP_WIN) && ((prev->r_flags & RACK_IN_GP_WIN) == 0)) { break; @@ -10254,6 +10323,7 @@ more: } rack_update_pcm_ack(rack, 1, rsm->r_start, rsm->r_end); } else { + (void)rack_check_reorder_ack(tp, rack, rsm, rsm->r_end, cts, 1, __LINE__); rack_update_pcm_ack(rack, 1, rsm->r_start, rsm->r_end); } if ((rsm->r_flags & RACK_TO_REXT) && @@ -10354,6 +10424,14 @@ more: } rsm->soff += (th_ack - rsm->r_start); rack_rsm_sender_update(rack, tp, rsm, 5); + + /* + * Handle the special case where we retransmitted part of a segment we + * in this case pass in th_ack which is shorter than r_end. + */ + if (rsm->r_flags & RACK_WAS_SACKPASS) { + rack_check_reorder_ack(tp, rack, rsm, th_ack, cts, 1, __LINE__); + } /* The trim will move th_ack into r_start for us */ tqhash_trim(rack->r_ctl.tqh, th_ack); /* Now do we need to move the mbuf fwd too? */ diff --git a/sys/netinet/tcp_stacks/tcp_rack.h b/sys/netinet/tcp_stacks/tcp_rack.h index cac17d9aeb50..2e2ced089f67 100644 --- a/sys/netinet/tcp_stacks/tcp_rack.h +++ b/sys/netinet/tcp_stacks/tcp_rack.h @@ -539,6 +539,7 @@ struct rack_control { uint32_t last_rcv_tstmp_for_rtt; uint32_t last_time_of_arm_rcv; uint32_t rto_ssthresh; + uint32_t recovery_rxt_cnt; uint32_t rc_saved_beta; uint32_t rc_saved_beta_ecn; /* * For newreno cc: rc_saved_beta and
