The branch main has been updated by rrs:

URL: 
https://cgit.FreeBSD.org/src/commit/?id=138e74ceadb237cdd42fcda11ddef18b509f571d

commit 138e74ceadb237cdd42fcda11ddef18b509f571d
Author:     Randall Stewart <[email protected]>
AuthorDate: 2026-01-05 16:30:22 +0000
Commit:     Randall Stewart <[email protected]>
CommitDate: 2026-01-05 16:30:22 +0000

    TCP Stacks, Improve rack to better handle reordering
    
    With a recent bug in the igb (and a few other) driver LRO mis-queuing, rack 
did things ok, better
    than the base stack, due to the rack reordering protections in rack, but 
there was still room for improvements.
    When a series of packets are completely mis-ordered you often times can get 
the acks shortly after you have
    entered recovery and retransmitted the first of the packets indicated in 
the sack stream. Then the cum-ack
    arrives basically acking all those packets. If you look at the time from 
when you sent the packet to when the
    ack came back you can quickly determine that the ack was not to what you 
just transmitted but instead
    was original and you had a completely false recovery entry. Dropping out of 
that you can then restore the
    congestion state and continue on your way. The Dup-acks that also arrive 
help increase your reordering windows
    which makes you less likely to repeat the scenario.
    
    Differential Revision:<https://reviews.freebsd.org/D53832>
---
 sys/netinet/tcp_stacks/rack.c     | 94 +++++++++++++++++++++++++++++++++++----
 sys/netinet/tcp_stacks/tcp_rack.h |  1 +
 2 files changed, 87 insertions(+), 8 deletions(-)

diff --git a/sys/netinet/tcp_stacks/rack.c b/sys/netinet/tcp_stacks/rack.c
index 76bf8f2e3b17..f55abbf919b7 100644
--- a/sys/netinet/tcp_stacks/rack.c
+++ b/sys/netinet/tcp_stacks/rack.c
@@ -204,6 +204,7 @@ static int32_t rack_dnd_default = 0;                /* For 
rr_conf = 3, what is the default fo
 static int32_t rack_rxt_controls = 0;
 static int32_t rack_fill_cw_state = 0;
 static uint8_t rack_req_measurements = 1;
+static uint32_t rack_rtt_divisor = 2;
 static int32_t rack_enable_hw_pacing = 0; /* Due to CCSP keep it off by 
default */
 static int32_t rack_hw_rate_caps = 0; /* 1; */
 static int32_t rack_hw_rate_cap_per = 0;       /* 0 -- off  */
@@ -497,7 +498,7 @@ static uint64_t rack_get_gp_est(struct tcp_rack *rack);
 
 static void
 rack_log_sack_passed(struct tcpcb *tp, struct tcp_rack *rack,
-    struct rack_sendmap *rsm, uint32_t cts);
+    struct rack_sendmap *rsm, uint32_t cts, int line);
 static void rack_log_to_event(struct tcp_rack *rack, int32_t to_num, struct 
rack_sendmap *rsm);
 static int32_t rack_output(struct tcpcb *tp);
 
@@ -1349,6 +1350,11 @@ rack_init_sysctls(void)
            OID_AUTO, "reset_ssth_rec_rto", CTLFLAG_RW,
            &rack_ssthresh_rest_rto_rec, 0,
            "When doing recovery -> rto -> recovery do we reset SSthresh?");
+       SYSCTL_ADD_U32(&rack_sysctl_ctx,
+           SYSCTL_CHILDREN(rack_timers),
+           OID_AUTO, "rtt_divisor", CTLFLAG_RW,
+           &rack_rtt_divisor, 2,
+           "When calculating the rtt threshold what 1/N is a rtt that 
indicates reordering");
        SYSCTL_ADD_U32(&rack_sysctl_ctx,
            SYSCTL_CHILDREN(rack_timers),
            OID_AUTO, "scoreboard_thresh", CTLFLAG_RW,
@@ -2663,6 +2669,8 @@ rack_log_map_chg(struct tcpcb *tp, struct tcp_rack *rack,
                log.u_bbr.cur_del_rate = (uintptr_t)prev;
                log.u_bbr.delRate = (uintptr_t)rsm;
                log.u_bbr.rttProp = (uintptr_t)next;
+               if (rsm)
+                       log.u_bbr.flex1 = rsm->r_flags;
                log.u_bbr.flex7 = 0;
                if (prev) {
                        log.u_bbr.flex1 = prev->r_start;
@@ -5584,6 +5592,7 @@ rack_cong_signal(struct tcpcb *tp, uint32_t type, 
uint32_t ack, int line)
                        rack->r_ctl.rc_prr_delivered = 0;
                        rack->r_ctl.rc_prr_out = 0;
                        rack->r_fast_output = 0;
+                       rack->r_ctl.recovery_rxt_cnt = 0;
                        if (rack->rack_no_prr == 0) {
                                rack->r_ctl.rc_prr_sndcnt = 
ctf_fixed_maxseg(tp);
                                rack_log_to_prr(rack, 2, in_rec_at_entry, line);
@@ -7416,6 +7425,7 @@ rack_remxt_tmr(struct tcpcb *tp)
         */
        TAILQ_INIT(&rack->r_ctl.rc_tmap);
 
+       rack->r_ctl.recovery_rxt_cnt = 0;
        TQHASH_FOREACH(rsm, rack->r_ctl.tqh)  {
                rsm->r_dupack = 0;
                if (rack_verbose_logging)
@@ -8042,6 +8052,7 @@ rack_update_rsm(struct tcpcb *tp, struct tcp_rack *rack,
                /* We have retransmitted due to the SACK pass */
                rsm->r_flags &= ~RACK_SACK_PASSED;
                rsm->r_flags |= RACK_WAS_SACKPASS;
+               rack->r_ctl.recovery_rxt_cnt += (rsm->r_end - rsm->r_start);
        }
 }
 
@@ -8948,7 +8959,7 @@ ts_not_found:
  */
 static void
 rack_log_sack_passed(struct tcpcb *tp,
-    struct tcp_rack *rack, struct rack_sendmap *rsm, uint32_t cts)
+    struct tcp_rack *rack, struct rack_sendmap *rsm, uint32_t cts, int line)
 {
        struct rack_sendmap *nrsm;
        uint32_t thresh;
@@ -8997,6 +9008,7 @@ rack_log_sack_passed(struct tcpcb *tp,
                         */
                        break;
                }
+               rack_log_dsack_event(rack, 12, __LINE__, nrsm->r_start, 
nrsm->r_end);
                nrsm->r_flags |= RACK_SACK_PASSED;
                nrsm->r_flags &= ~RACK_WAS_SACKPASS;
        }
@@ -9172,6 +9184,39 @@ is_rsm_inside_declared_tlp_block(struct tcp_rack *rack, 
struct rack_sendmap *rsm
        return (1);
 }
 
+
+static int
+rack_check_reorder_ack(struct tcpcb *tp, struct tcp_rack *rack, struct 
rack_sendmap *rsm, int the_end, uint32_t cts, int can_exit_recovery, int line)
+{
+       if ((rack_rtt_divisor > 0) &&
+           (rsm->r_rtr_cnt == 2) &&
+           IN_RECOVERY(tp->t_flags) &&
+           (rsm->r_flags & RACK_WAS_SACKPASS)){
+               uint32_t fractional, snt_to_ack;
+
+               fractional = (tp->t_srtt / rack_rtt_divisor);
+               if (fractional == 0)
+                       fractional = 1;
+               snt_to_ack = cts - 
(uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)];
+               if (snt_to_ack <= fractional) {
+                       rack->r_ctl.rc_reorder_ts = cts;
+                       KASSERT((rack->r_ctl.recovery_rxt_cnt >= (the_end - 
rsm->r_start)),
+                               ("rsm:%p rack:%p recovery_rxt_cnt would go 
negative recovery_rxt_cnt:%u sub:%u", rsm,  rack, rack->r_ctl.recovery_rxt_cnt, 
(the_end - rsm->r_start)));
+                       rack->r_ctl.recovery_rxt_cnt -= (the_end - 
rsm->r_start);
+                       rack_log_to_prr(rack, 18, rack->r_ctl.recovery_rxt_cnt, 
line);
+                       if (can_exit_recovery && (rack->r_ctl.recovery_rxt_cnt 
== 0)) {
+                               tp->snd_ssthresh = 
rack->r_ctl.rc_ssthresh_at_erec;
+                               rack_exit_recovery(tp, rack, 4);
+                               rack->r_might_revert = 0;
+                               rack->r_ctl.retran_during_recovery = 0;
+                               rack_log_to_prr(rack, 17, snt_to_ack, line);
+                       }
+                       return (1);
+               }
+       }
+       return (0);
+}
+
 static uint32_t
 rack_proc_sack_blk(struct tcpcb *tp, struct tcp_rack *rack, struct sackblk 
*sack,
                   struct tcpopt *to, struct rack_sendmap **prsm, uint32_t cts,
@@ -9183,6 +9228,7 @@ rack_proc_sack_blk(struct tcpcb *tp, struct tcp_rack 
*rack, struct sackblk *sack
        int insret __diagused;
        int32_t used_ref = 1;
        int can_use_hookery = 0;
+       int prohibit_marking = 0;
 
        start = sack->start;
        end = sack->end;
@@ -9273,6 +9319,8 @@ do_rest_ofb:
                            (rsm->bindex == next->bindex) &&
                            ((rsm->r_flags & RACK_STRADDLE) == 0) &&
                            ((next->r_flags & RACK_STRADDLE) == 0) &&
+                           ((rsm->r_flags & RACK_WAS_SACKPASS) == 0) &&
+                           ((next->r_flags & RACK_WAS_SACKPASS) == 0) &&
                            ((rsm->r_flags & RACK_IS_PCM) == 0) &&
                            ((next->r_flags & RACK_IS_PCM) == 0) &&
                            (rsm->r_flags & RACK_IN_GP_WIN) &&
@@ -9345,6 +9393,8 @@ do_rest_ofb:
                                rack_log_retran_reason(rack, rsm, __LINE__, 0, 
2);
                                /* Now lets make sure our fudge block is right 
*/
                                nrsm->r_start = start;
+                               /* Check if the ack was too soon i.e. 
reordering + ack arrives too quickly */
+                               prohibit_marking = rack_check_reorder_ack(tp, 
rack, nrsm, nrsm->r_end, cts, 0, __LINE__);
                                /* Now lets update all the stats and such */
                                rack_update_rtt(tp, rack, nrsm, to, cts, 
SACKED, 0);
                                if (rack->app_limited_needs_set)
@@ -9388,8 +9438,8 @@ do_rest_ofb:
                                         * Now that we have the next
                                         * one walk backwards from there.
                                         */
-                                       if (nrsm && nrsm->r_in_tmap)
-                                               rack_log_sack_passed(tp, rack, 
nrsm, cts);
+                                       if (nrsm && nrsm->r_in_tmap && 
(prohibit_marking == 0))
+                                               rack_log_sack_passed(tp, rack, 
nrsm, cts, __LINE__);
                                }
                                /* Now are we done? */
                                if (SEQ_LT(end, next->r_end) ||
@@ -9445,6 +9495,8 @@ do_rest_ofb:
                                }
                                rack_log_map_chg(tp, rack, NULL, rsm, nrsm, 
MAP_SACK_M2, end, __LINE__);
                                rsm->r_flags &= (~RACK_HAS_FIN);
+                               /* Check if the ack was too soon i.e. 
reordering + ack arrives too quickly */
+                               prohibit_marking = rack_check_reorder_ack(tp, 
rack, nrsm, nrsm->r_end, cts, 0, __LINE__);
                                /* Position us to point to the new nrsm that 
starts the sack blk */
                                rsm = nrsm;
                        }
@@ -9521,6 +9573,8 @@ do_rest_ofb:
                        }
                        rack_update_rtt(tp, rack, rsm, to, cts, SACKED, 0);
                        changed += (rsm->r_end - rsm->r_start);
+                       /* Check if the ack was too soon i.e. reordering + ack 
arrives too quickly */
+                       prohibit_marking = rack_check_reorder_ack(tp, rack, 
rsm, rsm->r_end, cts, 0, __LINE__);
                        /* You get a count for acking a whole segment or more */
                        if (rsm->r_flags & RACK_WAS_LOST) {
                                /*
@@ -9530,8 +9584,9 @@ do_rest_ofb:
                                rack_mark_nolonger_lost(rack, rsm);
                        }
                        rack->r_ctl.rc_sacked += (rsm->r_end - rsm->r_start);
-                       if (rsm->r_in_tmap) /* should be true */
-                               rack_log_sack_passed(tp, rack, rsm, cts);
+                       if (rsm->r_in_tmap && (prohibit_marking == 0)) /* 
should be true */
+                               rack_log_sack_passed(tp, rack, rsm, cts, 
__LINE__);
+
                        /* Is Reordering occuring? */
                        if (rsm->r_flags & RACK_SACK_PASSED) {
                                rsm->r_flags &= ~RACK_SACK_PASSED;
@@ -9620,6 +9675,8 @@ do_rest_ofb:
                    (rsm->bindex == prev->bindex) &&
                    ((rsm->r_flags & RACK_STRADDLE) == 0) &&
                    ((prev->r_flags & RACK_STRADDLE) == 0) &&
+                   ((prev->r_flags & RACK_WAS_SACKPASS) == 0) &&
+                   ((rsm->r_flags & RACK_WAS_SACKPASS) == 0) &&
                    ((rsm->r_flags & RACK_IS_PCM) == 0) &&
                    ((prev->r_flags & RACK_IS_PCM) == 0) &&
                    (rsm->r_flags & RACK_IN_GP_WIN) &&
@@ -9658,6 +9715,8 @@ do_rest_ofb:
                         */
                        nrsm->r_end = end;
                        rsm->r_dupack = 0;
+                       /* Check if the ack was too soon i.e. reordering + ack 
arrives too quickly */
+                       prohibit_marking = rack_check_reorder_ack(tp, rack, 
nrsm, nrsm->r_end, cts, 0, __LINE__);
                        /*
                         * Which timestamp do we keep? It is rather
                         * important in GP measurements to have the
@@ -9807,6 +9866,8 @@ do_rest_ofb:
                                nrsm->r_in_tmap = 1;
                        }
                        nrsm->r_dupack = 0;
+                       /* Check if the ack was too soon i.e. reordering + ack 
arrives too quickly */
+                       prohibit_marking = rack_check_reorder_ack(tp, rack, 
nrsm, nrsm->r_end, cts, 0, __LINE__);
                        rack_log_retran_reason(rack, nrsm, __LINE__, 0, 2);
                        rack_update_rtt(tp, rack, rsm, to, cts, SACKED, 0);
                        changed += (rsm->r_end - rsm->r_start);
@@ -9818,8 +9879,8 @@ do_rest_ofb:
                        }
                        rack->r_ctl.rc_sacked += (rsm->r_end - rsm->r_start);
 
-                       if (rsm->r_in_tmap) /* should be true */
-                               rack_log_sack_passed(tp, rack, rsm, cts);
+                       if (rsm->r_in_tmap && (prohibit_marking == 0)) /* 
should be true */
+                               rack_log_sack_passed(tp, rack, rsm, cts, 
__LINE__);
                        /* Is Reordering occuring? */
                        if (rsm->r_flags & RACK_SACK_PASSED) {
                                rsm->r_flags &= ~RACK_SACK_PASSED;
@@ -9857,6 +9918,10 @@ out:
                            ((rsm->r_flags & RACK_IN_GP_WIN) == 0)) {
                                break;
                        }
+                       /* We can't merge retransmitted with sack-pass set */
+                       if ((rsm->r_flags & RACK_WAS_SACKPASS) ||
+                           (next->r_flags & RACK_WAS_SACKPASS))
+                               break;
                        if ((rsm->r_flags & RACK_IN_GP_WIN) &&
                            ((next->r_flags & RACK_IN_GP_WIN) == 0)) {
                                break;
@@ -9888,6 +9953,10 @@ out:
                            ((rsm->r_flags & RACK_IN_GP_WIN) == 0)) {
                                break;
                        }
+                       /* We can't merge retransmitted with sack-pass set */
+                       if ((rsm->r_flags & RACK_WAS_SACKPASS) ||
+                           (prev->r_flags & RACK_WAS_SACKPASS))
+                               break;
                        if ((rsm->r_flags & RACK_IN_GP_WIN) &&
                            ((prev->r_flags & RACK_IN_GP_WIN) == 0)) {
                                break;
@@ -10254,6 +10323,7 @@ more:
                        }
                        rack_update_pcm_ack(rack, 1, rsm->r_start, rsm->r_end);
                } else {
+                       (void)rack_check_reorder_ack(tp, rack, rsm, rsm->r_end, 
cts, 1, __LINE__);
                        rack_update_pcm_ack(rack, 1, rsm->r_start, rsm->r_end);
                }
                if ((rsm->r_flags & RACK_TO_REXT) &&
@@ -10354,6 +10424,14 @@ more:
        }
        rsm->soff += (th_ack - rsm->r_start);
        rack_rsm_sender_update(rack, tp, rsm, 5);
+
+       /*
+        * Handle the special case where we retransmitted part of a segment we
+        * in this case pass in th_ack which is shorter than r_end.
+        */
+       if (rsm->r_flags & RACK_WAS_SACKPASS) {
+               rack_check_reorder_ack(tp, rack, rsm, th_ack, cts, 1, __LINE__);
+       }
        /* The trim will move th_ack into r_start for us */
        tqhash_trim(rack->r_ctl.tqh, th_ack);
        /* Now do we need to move the mbuf fwd too? */
diff --git a/sys/netinet/tcp_stacks/tcp_rack.h 
b/sys/netinet/tcp_stacks/tcp_rack.h
index cac17d9aeb50..2e2ced089f67 100644
--- a/sys/netinet/tcp_stacks/tcp_rack.h
+++ b/sys/netinet/tcp_stacks/tcp_rack.h
@@ -539,6 +539,7 @@ struct rack_control {
        uint32_t last_rcv_tstmp_for_rtt;
        uint32_t last_time_of_arm_rcv;
        uint32_t rto_ssthresh;
+       uint32_t recovery_rxt_cnt;
        uint32_t rc_saved_beta;
        uint32_t rc_saved_beta_ecn;     /*
                                         * For newreno cc: rc_saved_beta and

Reply via email to