Use hints to speed up the SACK processing. Various forms
of this have been used by TCP developers (Web100, STCP, BIC)
to avoid the 2x linear search of outstanding segments.

Signed-off-by: Stephen Hemminger <[EMAIL PROTECTED]>

--- net-2.6.orig/include/linux/tcp.h
+++ net-2.6/include/linux/tcp.h
@@ -307,6 +307,21 @@ struct tcp_sock {
        struct tcp_sack_block duplicate_sack[1]; /* D-SACK block */
        struct tcp_sack_block selective_acks[4]; /* The SACKS themselves*/
 
+       struct tcp_sack_block recv_sack_cache[4];
+
+       /* from STCP, retrans queue hinting */
+       struct sk_buff* lost_skb_hint;
+
+       struct sk_buff *scoreboard_skb_hint;
+       struct sk_buff *retransmit_skb_hint;
+       struct sk_buff *forward_skb_hint;
+       struct sk_buff *fastpath_skb_hint;
+
+       int     fastpath_cnt_hint;
+       int     lost_cnt_hint;
+       int     retransmit_cnt_hint;
+       int     forward_cnt_hint;
+
        __u16   advmss;         /* Advertised MSS                       */
        __u16   prior_ssthresh; /* ssthresh saved at recovery start     */
        __u32   lost_out;       /* Lost packets                 */
--- net-2.6.orig/include/net/sock.h
+++ net-2.6/include/net/sock.h
@@ -1247,6 +1247,12 @@ static inline struct page *sk_stream_all
                     (skb != (struct sk_buff *)&(sk)->sk_write_queue);  \
                     skb = skb->next)
 
+/* From STCP (fast SACK): iterate from the caller-set skb up to sk_send_head or the queue head. */
+#define sk_stream_for_retrans_queue_from(skb, sk)                      \
+               for (; (skb != (sk)->sk_send_head) &&                   \
+                    (skb != (struct sk_buff *)&(sk)->sk_write_queue);  \
+                    skb = skb->next)
+
 /*
  *     Default write policy as shown to user space via poll/select/SIGIO
  */
--- net-2.6.orig/include/net/tcp.h
+++ net-2.6/include/net/tcp.h
@@ -1204,6 +1204,15 @@ static inline void tcp_mib_init(void)
        TCP_ADD_STATS_USER(TCP_MIB_MAXCONN, -1);
 }
 
+/* From STCP: NULL all five *_skb_hint pointers; the *_cnt_hint counters are left as-is. */
+static inline void clear_all_retrans_hints(struct tcp_sock *tp){
+       tp->lost_skb_hint = NULL;
+       tp->scoreboard_skb_hint = NULL;
+       tp->retransmit_skb_hint = NULL;
+       tp->forward_skb_hint = NULL;
+       tp->fastpath_skb_hint = NULL;
+}
+
 /* /proc */
 enum tcp_seq_states {
        TCP_SEQ_STATE_LISTENING,
--- net-2.6.orig/net/ipv4/tcp_input.c
+++ net-2.6/net/ipv4/tcp_input.c
@@ -897,18 +897,32 @@ tcp_sacktag_write_queue(struct sock *sk,
        int prior_fackets;
        u32 lost_retrans = 0;
        int flag = 0;
+       int dup_sack = 0;
        int i;
 
        if (!tp->sacked_out)
                tp->fackets_out = 0;
        prior_fackets = tp->fackets_out;
 
-       for (i=0; i<num_sacks; i++, sp++) {
-               struct sk_buff *skb;
-               __u32 start_seq = ntohl(sp->start_seq);
-               __u32 end_seq = ntohl(sp->end_seq);
-               int fack_count = 0;
-               int dup_sack = 0;
+       /* SACK fastpath:
+        * if the only SACK change is the increase of the end_seq of
+        * the first block then only apply that SACK block
+        * and use retrans queue hinting otherwise slowpath */
+       flag = 1;
+       for (i = 0; i< num_sacks; i++) {
+               __u32 start_seq = ntohl(sp[i].start_seq);
+               __u32 end_seq =  ntohl(sp[i].end_seq);
+
+               if (i == 0){
+                       if (tp->recv_sack_cache[i].start_seq != start_seq)
+                               flag = 0;
+               } else {
+                       if ((tp->recv_sack_cache[i].start_seq != start_seq) ||
+                           (tp->recv_sack_cache[i].end_seq != end_seq))
+                               flag = 0;
+               }
+               tp->recv_sack_cache[i].start_seq = start_seq;
+               tp->recv_sack_cache[i].end_seq = end_seq;
 
                /* Check for D-SACK. */
                if (i == 0) {
@@ -940,15 +954,58 @@ tcp_sacktag_write_queue(struct sock *sk,
                        if (before(ack, prior_snd_una - tp->max_window))
                                return 0;
                }
+       }
+
+       if (flag)
+               num_sacks = 1;
+       else {
+               int j;
+               tp->fastpath_skb_hint = NULL;
+
+               /* order SACK blocks to allow in order walk of the retrans queue */
+               for (i = num_sacks-1; i > 0; i--) {
+                       for (j = 0; j < i; j++){
+                               if (after(ntohl(sp[j].start_seq),
+                                         ntohl(sp[j+1].start_seq))){
+                                       sp[j].start_seq = 
htonl(tp->recv_sack_cache[j+1].start_seq);
+                                       sp[j].end_seq = 
htonl(tp->recv_sack_cache[j+1].end_seq);
+                                       sp[j+1].start_seq = 
htonl(tp->recv_sack_cache[j].start_seq);
+                                       sp[j+1].end_seq = 
htonl(tp->recv_sack_cache[j].end_seq);
+                               }
+
+                       }
+               }
+       }
+
+       /* clear flag as used for different purpose in following code */
+       flag = 0;
+
+       for (i=0; i<num_sacks; i++, sp++) {
+               struct sk_buff *skb;
+               __u32 start_seq = ntohl(sp->start_seq);
+               __u32 end_seq = ntohl(sp->end_seq);
+               int fack_count;
+
+               /* Use SACK fastpath hint if valid */
+               if (tp->fastpath_skb_hint) {
+                       skb = tp->fastpath_skb_hint;
+                       fack_count = tp->fastpath_cnt_hint;
+               } else {
+                       skb = sk->sk_write_queue.next;
+                       fack_count = 0;
+               }
 
                /* Event "B" in the comment above. */
                if (after(end_seq, tp->high_seq))
                        flag |= FLAG_DATA_LOST;
 
-               sk_stream_for_retrans_queue(skb, sk) {
+               sk_stream_for_retrans_queue_from(skb, sk) {
                        int in_sack, pcount;
                        u8 sacked;
 
+                       tp->fastpath_skb_hint = skb;
+                       tp->fastpath_cnt_hint = fack_count;
+
                        /* The retransmission queue is always in order, so
                         * we can short-circuit the walk early.
                         */
@@ -1023,6 +1080,9 @@ tcp_sacktag_write_queue(struct sock *sk,
                                                TCP_SKB_CB(skb)->sacked &= 
~(TCPCB_LOST|TCPCB_SACKED_RETRANS);
                                                tp->lost_out -= 
tcp_skb_pcount(skb);
                                                tp->retrans_out -= 
tcp_skb_pcount(skb);
+
+                                               /* clear retransmit hint */
+                                               tp->retransmit_skb_hint = NULL;
                                        }
                                } else {
                                        /* New sack for not retransmitted frame,
@@ -1035,6 +1095,9 @@ tcp_sacktag_write_queue(struct sock *sk,
                                        if (sacked & TCPCB_LOST) {
                                                TCP_SKB_CB(skb)->sacked &= 
~TCPCB_LOST;
                                                tp->lost_out -= 
tcp_skb_pcount(skb);
+
+                                               /* clear retransmit hint */
+                                               tp->retransmit_skb_hint = NULL;
                                        }
                                }
 
@@ -1058,6 +1121,7 @@ tcp_sacktag_write_queue(struct sock *sk,
                            (TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_RETRANS)) {
                                TCP_SKB_CB(skb)->sacked &= 
~TCPCB_SACKED_RETRANS;
                                tp->retrans_out -= tcp_skb_pcount(skb);
+                               tp->retransmit_skb_hint = NULL;
                        }
                }
        }
@@ -1085,6 +1149,9 @@ tcp_sacktag_write_queue(struct sock *sk,
                                TCP_SKB_CB(skb)->sacked &= 
~TCPCB_SACKED_RETRANS;
                                tp->retrans_out -= tcp_skb_pcount(skb);
 
+                               /* clear retransmit hint */
+                               tp->retransmit_skb_hint = NULL;
+
                                if 
(!(TCP_SKB_CB(skb)->sacked&(TCPCB_LOST|TCPCB_SACKED_ACKED))) {
                                        tp->lost_out += tcp_skb_pcount(skb);
                                        TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
@@ -1192,6 +1259,8 @@ static void tcp_enter_frto_loss(struct s
        tcp_set_ca_state(sk, TCP_CA_Loss);
        tp->high_seq = tp->frto_highmark;
        TCP_ECN_queue_cwr(tp);
+
+       clear_all_retrans_hints(tp);
 }
 
 void tcp_clear_retrans(struct tcp_sock *tp)
@@ -1258,6 +1327,8 @@ void tcp_enter_loss(struct sock *sk, int
        tcp_set_ca_state(sk, TCP_CA_Loss);
        tp->high_seq = tp->snd_nxt;
        TCP_ECN_queue_cwr(tp);
+
+       clear_all_retrans_hints(tp);
 }
 
 static int tcp_check_sack_reneging(struct sock *sk)
@@ -1482,17 +1553,37 @@ static void tcp_mark_head_lost(struct so
                               int packets, u32 high_seq)
 {
        struct sk_buff *skb;
-       int cnt = packets;
+       int cnt;
 
-       BUG_TRAP(cnt <= tp->packets_out);
+       BUG_TRAP(packets <= tp->packets_out);
+       if (tp->lost_skb_hint) {
+               skb = tp->lost_skb_hint;
+               cnt = tp->lost_cnt_hint;
+       } else {
+               skb = sk->sk_write_queue.next;
+               cnt = 0;
+       }
 
-       sk_stream_for_retrans_queue(skb, sk) {
-               cnt -= tcp_skb_pcount(skb);
-               if (cnt < 0 || after(TCP_SKB_CB(skb)->end_seq, high_seq))
+       sk_stream_for_retrans_queue_from(skb, sk) {
+               /* TODO: do this better */
+               /* this is not the most efficient way to do this... */
+               tp->lost_skb_hint = skb;
+               tp->lost_cnt_hint = cnt;
+               cnt += tcp_skb_pcount(skb);
+               if (cnt > packets || after(TCP_SKB_CB(skb)->end_seq, high_seq))
                        break;
                if (!(TCP_SKB_CB(skb)->sacked&TCPCB_TAGBITS)) {
                        TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
                        tp->lost_out += tcp_skb_pcount(skb);
+
+                       /* clear xmit_retransmit_queue hints
+                        *  if this is beyond hint */
+                       if(tp->retransmit_skb_hint != NULL &&
+                          before(TCP_SKB_CB(skb)->seq,
+                                 TCP_SKB_CB(tp->retransmit_skb_hint)->seq)) {
+
+                               tp->retransmit_skb_hint = NULL;
+                       }
                }
        }
        tcp_sync_left_out(tp);
@@ -1519,13 +1610,28 @@ static void tcp_update_scoreboard(struct
        if (tcp_head_timedout(sk, tp)) {
                struct sk_buff *skb;
 
-               sk_stream_for_retrans_queue(skb, sk) {
-                       if (tcp_skb_timedout(sk, skb) &&
-                           !(TCP_SKB_CB(skb)->sacked&TCPCB_TAGBITS)) {
+               skb = tp->scoreboard_skb_hint ? tp->scoreboard_skb_hint
+                       : sk->sk_write_queue.next;
+
+               sk_stream_for_retrans_queue_from(skb, sk) {
+                       if (!tcp_skb_timedout(sk, skb))
+                               break;
+
+                       if (!(TCP_SKB_CB(skb)->sacked&TCPCB_TAGBITS)) {
                                TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
                                tp->lost_out += tcp_skb_pcount(skb);
+
+                               /* clear xmit_retrans hint */
+                               if (tp->retransmit_skb_hint &&
+                                   before(TCP_SKB_CB(skb)->seq,
+                                          
TCP_SKB_CB(tp->retransmit_skb_hint)->seq))
+
+                                       tp->retransmit_skb_hint = NULL;
                        }
                }
+
+               tp->scoreboard_skb_hint = skb;
+
                tcp_sync_left_out(tp);
        }
 }
@@ -1605,6 +1711,10 @@ static void tcp_undo_cwr(struct sock *sk
        }
        tcp_moderate_cwnd(tp);
        tp->snd_cwnd_stamp = tcp_time_stamp;
+
+       /* There is something screwy going on with the retrans hints after
+          an undo */
+       clear_all_retrans_hints(tp);
 }
 
 static inline int tcp_may_undo(struct tcp_sock *tp)
@@ -1688,6 +1798,9 @@ static int tcp_try_undo_loss(struct sock
                sk_stream_for_retrans_queue(skb, sk) {
                        TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST;
                }
+
+               clear_all_retrans_hints(tp);
+
                DBGUNDO(sk, tp, "partial loss");
                tp->lost_out = 0;
                tp->left_out = tp->sacked_out;
@@ -2116,6 +2229,7 @@ static int tcp_clean_rtx_queue(struct so
                tcp_packets_out_dec(tp, skb);
                __skb_unlink(skb, &sk->sk_write_queue);
                sk_stream_free_skb(sk, skb);
+               clear_all_retrans_hints(tp);
        }
 
        if (acked&FLAG_ACKED) {
--- net-2.6.orig/net/ipv4/tcp_output.c
+++ net-2.6/net/ipv4/tcp_output.c
@@ -436,6 +436,8 @@ int tcp_fragment(struct sock *sk, struct
        u16 flags;
 
        BUG_ON(len > skb->len);
+
+       clear_all_retrans_hints(tp);
        nsize = skb_headlen(skb) - len;
        if (nsize < 0)
                nsize = 0;
@@ -1260,7 +1262,10 @@ static void tcp_retrans_try_collapse(str
                BUG_ON(tcp_skb_pcount(skb) != 1 ||
                       tcp_skb_pcount(next_skb) != 1);
 
-               /* Ok.  We will be able to collapse the packet. */
+               /* changing transmit queue under us so clear hints */
+               clear_all_retrans_hints(tp);
+
+               /* Ok.  We will be able to collapse the packet. */
                __skb_unlink(next_skb, &sk->sk_write_queue);
 
                memcpy(skb_put(skb, next_skb_size), next_skb->data, 
next_skb_size);
@@ -1330,6 +1335,8 @@ void tcp_simple_retransmit(struct sock *
                }
        }
 
+       clear_all_retrans_hints(tp);
+
        if (!lost)
                return;
 
@@ -1468,13 +1475,25 @@ void tcp_xmit_retransmit_queue(struct so
        const struct inet_connection_sock *icsk = inet_csk(sk);
        struct tcp_sock *tp = tcp_sk(sk);
        struct sk_buff *skb;
-       int packet_cnt = tp->lost_out;
+       int packet_cnt;
+
+       if (tp->retransmit_skb_hint) {
+               skb = tp->retransmit_skb_hint;
+               packet_cnt = tp->retransmit_cnt_hint;
+       }else{
+               skb = sk->sk_write_queue.next;
+               packet_cnt = 0;
+       }
 
        /* First pass: retransmit lost packets. */
-       if (packet_cnt) {
-               sk_stream_for_retrans_queue(skb, sk) {
+       if (tp->lost_out) {
+               sk_stream_for_retrans_queue_from(skb, sk) {
                        __u8 sacked = TCP_SKB_CB(skb)->sacked;
 
+                       /* we could do better than to assign each time */
+                       tp->retransmit_skb_hint = skb;
+                       tp->retransmit_cnt_hint = packet_cnt;
+
                        /* Assume this retransmit will generate
                         * only one packet for congestion window
                         * calculation purposes.  This works because
@@ -1485,10 +1504,12 @@ void tcp_xmit_retransmit_queue(struct so
                        if (tcp_packets_in_flight(tp) >= tp->snd_cwnd)
                                return;
 
-                       if (sacked&TCPCB_LOST) {
+                       if (sacked & TCPCB_LOST) {
                                if 
(!(sacked&(TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS))) {
-                                       if (tcp_retransmit_skb(sk, skb))
+                                       if (tcp_retransmit_skb(sk, skb)) {
+                                               tp->retransmit_skb_hint = NULL;
                                                return;
+                                       }
                                        if (icsk->icsk_ca_state != TCP_CA_Loss)
                                                
NET_INC_STATS_BH(LINUX_MIB_TCPFASTRETRANS);
                                        else
@@ -1501,8 +1522,8 @@ void tcp_xmit_retransmit_queue(struct so
                                                                          
TCP_RTO_MAX);
                                }
 
-                               packet_cnt -= tcp_skb_pcount(skb);
-                               if (packet_cnt <= 0)
+                               packet_cnt += tcp_skb_pcount(skb);
+                               if (packet_cnt >= tp->lost_out)
                                        break;
                        }
                }
@@ -1528,9 +1549,18 @@ void tcp_xmit_retransmit_queue(struct so
        if (tcp_may_send_now(sk, tp))
                return;
 
-       packet_cnt = 0;
+       if (tp->forward_skb_hint) {
+               skb = tp->forward_skb_hint;
+               packet_cnt = tp->forward_cnt_hint;
+       } else{
+               skb = sk->sk_write_queue.next;
+               packet_cnt = 0;
+       }
+
+       sk_stream_for_retrans_queue_from(skb, sk) {
+               tp->forward_cnt_hint = packet_cnt;
+               tp->forward_skb_hint = skb;
 
-       sk_stream_for_retrans_queue(skb, sk) {
                /* Similar to the retransmit loop above we
                 * can pretend that the retransmitted SKB
                 * we send out here will be composed of one
@@ -1547,8 +1577,10 @@ void tcp_xmit_retransmit_queue(struct so
                        continue;
 
                /* Ok, retransmit it. */
-               if (tcp_retransmit_skb(sk, skb))
+               if (tcp_retransmit_skb(sk, skb)) {
+                       tp->forward_skb_hint = NULL;
                        break;
+               }
 
                if (skb == skb_peek(&sk->sk_write_queue))
                        inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,

--
Stephen Hemminger <[EMAIL PROTECTED]>
OSDL http://developer.osdl.org/~shemminger

-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Reply via email to