> Patch 1: David Miller's red-black tree code, tweaked for 2.6.22.6,
> with some bugfixes

oops... The original patch can cause a kernel panic if tcp_sacktag_write_queue
is called when the write queue is empty. Use this one instead.
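
In case the fack_count bookkeeping isn't obvious from the diff alone, here is a
rough sketch of the arithmetic it relies on. This is not part of the patch, just
a throwaway userspace C program with made-up pcount values: each segment queued
at the tail records the cumulative pcount of everything queued before it, and
the SACK walk recovers "packets in the queue before this skb" by subtracting the
head's fack_count (fack_count_base) instead of walking the list.

/* Illustration only, not kernel code: mirrors the fack_count arithmetic
 * from the patch, with made-up segment pcounts.
 */
#include <stdio.h>

struct seg {
        unsigned int pcount;     /* like tcp_skb_pcount(): packets in this skb */
        unsigned int fack_count; /* cumulative pcount of everything queued earlier */
};

int main(void)
{
        unsigned int pcounts[5] = { 1, 3, 2, 1, 4 };
        struct seg q[5];
        unsigned int base, i;

        /* __tcp_add_write_queue_tail(): extend the running counter. */
        for (i = 0; i < 5; i++) {
                q[i].pcount = pcounts[i];
                q[i].fack_count = i ? q[i - 1].fack_count + q[i - 1].pcount : 0;
        }

        /* Pretend the first two segments were acked and unlinked.  The new
         * head keeps its absolute fack_count, so the SACK code subtracts the
         * head's value (fack_count_base) instead of re-walking the queue.
         */
        base = q[2].fack_count;
        for (i = 2; i < 5; i++)
                printf("seg %u: pcount=%u fack_count=%u packets before it=%u\n",
                       i, q[i].pcount, q[i].fack_count, q[i].fack_count - base);

        return 0;
}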
-Tom

---
diff -ur linux-2.6.22.6/include/linux/skbuff.h linux-2.6.22.6-rbtree-davem-fixed/include/linux/skbuff.h
--- linux-2.6.22.6/include/linux/skbuff.h	2007-08-30 23:21:01.000000000 -0700
+++ linux-2.6.22.6-rbtree-davem-fixed/include/linux/skbuff.h	2007-09-13 18:23:16.000000000 -0700
@@ -18,6 +18,7 @@
 #include <linux/compiler.h>
 #include <linux/time.h>
 #include <linux/cache.h>
+#include <linux/rbtree.h>
 
 #include <asm/atomic.h>
 #include <asm/types.h>
@@ -242,6 +243,8 @@
         struct sk_buff *next;
         struct sk_buff *prev;
 
+        struct rb_node rb;
+
         struct sock *sk;
         ktime_t tstamp;
         struct net_device *dev;
diff -ur linux-2.6.22.6/include/linux/tcp.h linux-2.6.22.6-rbtree-davem-fixed/include/linux/tcp.h
--- linux-2.6.22.6/include/linux/tcp.h	2007-08-30 23:21:01.000000000 -0700
+++ linux-2.6.22.6-rbtree-davem-fixed/include/linux/tcp.h	2007-09-13 18:23:16.000000000 -0700
@@ -174,6 +174,7 @@
 
 #include <linux/skbuff.h>
 #include <linux/dmaengine.h>
+#include <linux/rbtree.h>
 #include <net/sock.h>
 #include <net/inet_connection_sock.h>
 #include <net/inet_timewait_sock.h>
@@ -321,6 +322,7 @@
         u32 snd_cwnd_used;
         u32 snd_cwnd_stamp;
 
+        struct rb_root write_queue_rb;
         struct sk_buff_head out_of_order_queue; /* Out of order segments go here */
 
         u32 rcv_wnd; /* Current receiver window */
@@ -339,9 +341,7 @@
         struct sk_buff *scoreboard_skb_hint;
         struct sk_buff *retransmit_skb_hint;
         struct sk_buff *forward_skb_hint;
-        struct sk_buff *fastpath_skb_hint;
 
-        int fastpath_cnt_hint;
         int lost_cnt_hint;
         int retransmit_cnt_hint;
         int forward_cnt_hint;
diff -ur linux-2.6.22.6/include/net/tcp.h linux-2.6.22.6-rbtree-davem-fixed/include/net/tcp.h
--- linux-2.6.22.6/include/net/tcp.h	2007-08-30 23:21:01.000000000 -0700
+++ linux-2.6.22.6-rbtree-davem-fixed/include/net/tcp.h	2007-09-19 17:36:07.000000000 -0700
@@ -540,6 +540,7 @@
         __u32 seq; /* Starting sequence number */
         __u32 end_seq; /* SEQ + FIN + SYN + datalen */
         __u32 when; /* used to compute rtt's */
+        unsigned int fack_count; /* speed up SACK processing */
         __u8 flags; /* TCP header flags. */
 
         /* NOTE: These must match up to the flags byte in a
@@ -1043,12 +1044,12 @@
 }
 
 /*from STCP */
-static inline void clear_all_retrans_hints(struct tcp_sock *tp){
+static inline void clear_all_retrans_hints(struct tcp_sock *tp)
+{
         tp->lost_skb_hint = NULL;
         tp->scoreboard_skb_hint = NULL;
         tp->retransmit_skb_hint = NULL;
         tp->forward_skb_hint = NULL;
-        tp->fastpath_skb_hint = NULL;
 }
 
 /* MD5 Signature */
@@ -1166,6 +1167,7 @@
 
         while ((skb = __skb_dequeue(&sk->sk_write_queue)) != NULL)
                 sk_stream_free_skb(sk, skb);
+        tcp_sk(sk)->write_queue_rb = RB_ROOT;
         sk_stream_mem_reclaim(sk);
 }
 
@@ -1208,7 +1210,7 @@
 {
         struct tcp_sock *tp = tcp_sk(sk);
 
-        sk->sk_send_head = skb->next;
+        sk->sk_send_head = tcp_write_queue_next(sk, skb);
         if (sk->sk_send_head == (struct sk_buff *)&sk->sk_write_queue)
                 sk->sk_send_head = NULL;
         /* Don't override Nagle indefinately with F-RTO */
@@ -1227,9 +1229,61 @@
         sk->sk_send_head = NULL;
 }
 
+static inline struct sk_buff *tcp_write_queue_find(struct sock *sk, __u32 seq)
+{
+        struct rb_node *rb_node = tcp_sk(sk)->write_queue_rb.rb_node;
+        struct sk_buff *skb = NULL;
+
+        while (rb_node) {
+                struct sk_buff *tmp = rb_entry(rb_node,struct sk_buff,rb);
+                if (TCP_SKB_CB(tmp)->end_seq > seq) {
+                        skb = tmp;
+                        if (TCP_SKB_CB(tmp)->seq <= seq)
+                                break;
+                        rb_node = rb_node->rb_left;
+                } else
+                        rb_node = rb_node->rb_right;
+
+        }
+        return skb;
+}
+
+static inline void tcp_rb_insert(struct sk_buff *skb, struct rb_root *root)
+{
+        struct rb_node **rb_link, *rb_parent;
+        __u32 seq = TCP_SKB_CB(skb)->seq;
+
+        rb_link = &root->rb_node;
+        rb_parent = NULL;
+        while (*rb_link != NULL) {
+                struct sk_buff *tmp = rb_entry(*rb_link,struct sk_buff,rb);
+                rb_parent = *rb_link;
+                if (TCP_SKB_CB(tmp)->end_seq > seq) {
+                        BUG_ON(TCP_SKB_CB(tmp)->seq <= seq);
+                        rb_link = &rb_parent->rb_left;
+                } else {
+                        rb_link = &rb_parent->rb_right;
+                }
+        }
+        rb_link_node(&skb->rb, rb_parent, rb_link);
+        rb_insert_color(&skb->rb, root);
+}
+
+static inline void tcp_rb_unlink(struct sk_buff *skb, struct rb_root *root)
+{
+        rb_erase(&skb->rb, root);
+}
+
 static inline void __tcp_add_write_queue_tail(struct sock *sk, struct sk_buff *skb)
 {
+        struct sk_buff *tail = tcp_write_queue_tail(sk);
+        unsigned int fc = 0;
+
+        if (tail)
+                fc = TCP_SKB_CB(tail)->fack_count + tcp_skb_pcount(tail);
+        TCP_SKB_CB(skb)->fack_count = fc;
         __skb_queue_tail(&sk->sk_write_queue, skb);
+        tcp_rb_insert(skb, &tcp_sk(sk)->write_queue_rb);
 }
 
 static inline void tcp_add_write_queue_tail(struct sock *sk, struct sk_buff *skb)
@@ -1241,9 +1295,35 @@
         sk->sk_send_head = skb;
 }
 
+/* This is only used for tcp_send_synack(), so the write queue should
+ * be empty. If that stops being true, the fack_count assignment
+ * will need to be more elaborate.
+ */
 static inline void __tcp_add_write_queue_head(struct sock *sk, struct sk_buff *skb)
 {
+        BUG_ON(!skb_queue_empty(&sk->sk_write_queue));
         __skb_queue_head(&sk->sk_write_queue, skb);
+        TCP_SKB_CB(skb)->fack_count = 0;
+        tcp_rb_insert(skb, &tcp_sk(sk)->write_queue_rb);
+}
+
+/* An insert into the middle of the write queue causes the fack
+ * counts in subsequent packets to become invalid, fix them up.
+ */
+static inline void tcp_reset_fack_counts(struct sock *sk, struct sk_buff *first)
+{
+        struct sk_buff *prev = first->prev;
+        unsigned int fc = 0;
+
+        if (prev != (struct sk_buff *) &sk->sk_write_queue)
+                fc = TCP_SKB_CB(prev)->fack_count + tcp_skb_pcount(prev);
+
+        while (first != (struct sk_buff *)&sk->sk_write_queue) {
+                TCP_SKB_CB(first)->fack_count = fc;
+
+                fc += tcp_skb_pcount(first);
+                first = first->next;
+        }
 }
 
 /* Insert buff after skb on the write queue of sk. */
@@ -1252,19 +1332,24 @@
                                                 struct sock *sk)
 {
         __skb_append(skb, buff, &sk->sk_write_queue);
+        tcp_reset_fack_counts(sk, buff);
+        tcp_rb_insert(buff, &tcp_sk(sk)->write_queue_rb);
 }
 
-/* Insert skb between prev and next on the write queue of sk. */
+/* Insert new before skb on the write queue of sk. */
 static inline void tcp_insert_write_queue_before(struct sk_buff *new,
                                                 struct sk_buff *skb,
                                                 struct sock *sk)
 {
         __skb_insert(new, skb->prev, skb, &sk->sk_write_queue);
+        tcp_reset_fack_counts(sk, new);
+        tcp_rb_insert(new, &tcp_sk(sk)->write_queue_rb);
 }
 
 static inline void tcp_unlink_write_queue(struct sk_buff *skb, struct sock *sk)
 {
         __skb_unlink(skb, &sk->sk_write_queue);
+        tcp_rb_unlink(skb, &tcp_sk(sk)->write_queue_rb);
 }
 
 static inline int tcp_skb_is_last(const struct sock *sk,
diff -ur linux-2.6.22.6/net/ipv4/tcp_input.c linux-2.6.22.6-rbtree-davem-fixed/net/ipv4/tcp_input.c
--- linux-2.6.22.6/net/ipv4/tcp_input.c	2007-08-30 23:21:01.000000000 -0700
+++ linux-2.6.22.6-rbtree-davem-fixed/net/ipv4/tcp_input.c	2007-09-20 14:10:26.000000000 -0700
@@ -947,14 +947,13 @@
         unsigned char *ptr = (skb_transport_header(ack_skb) +
                               TCP_SKB_CB(ack_skb)->sacked);
         struct tcp_sack_block_wire *sp = (struct tcp_sack_block_wire *)(ptr+2);
-        struct sk_buff *cached_skb;
         int num_sacks = (ptr[1] - TCPOLEN_SACK_BASE)>>3;
         int reord = tp->packets_out;
         int prior_fackets;
         u32 lost_retrans = 0;
         int flag = 0;
         int found_dup_sack = 0;
-        int cached_fack_count;
+        int fack_count_base;
         int i;
         int first_sack_index;
 
@@ -1020,7 +1019,6 @@
                 num_sacks = 1;
         else {
                 int j;
-                tp->fastpath_skb_hint = NULL;
 
                 /* order SACK blocks to allow in order walk of the retrans queue */
                 for (i = num_sacks-1; i > 0; i--) {
@@ -1045,13 +1043,8 @@
         /* clear flag as used for different purpose in following code */
         flag = 0;
 
-        /* Use SACK fastpath hint if valid */
-        cached_skb = tp->fastpath_skb_hint;
-        cached_fack_count = tp->fastpath_cnt_hint;
-        if (!cached_skb) {
-                cached_skb = tcp_write_queue_head(sk);
-                cached_fack_count = 0;
-        }
+        if (likely(tcp_write_queue_head(sk)))
+                fack_count_base = TCP_SKB_CB(tcp_write_queue_head(sk))->fack_count;
 
         for (i=0; i<num_sacks; i++, sp++) {
                 struct sk_buff *skb;
@@ -1060,8 +1053,10 @@
                 int fack_count;
                 int dup_sack = (found_dup_sack && (i == first_sack_index));
 
-                skb = cached_skb;
-                fack_count = cached_fack_count;
+                skb = tcp_write_queue_find(sk, start_seq);
+                if (!skb)
+                        continue;
+                fack_count = TCP_SKB_CB(skb)->fack_count - fack_count_base;
 
                 /* Event "B" in the comment above. */
                 if (after(end_seq, tp->high_seq))
@@ -1074,13 +1069,6 @@
                         if (skb == tcp_send_head(sk))
                                 break;
 
-                        cached_skb = skb;
-                        cached_fack_count = fack_count;
-                        if (i == first_sack_index) {
-                                tp->fastpath_skb_hint = skb;
-                                tp->fastpath_cnt_hint = fack_count;
-                        }
-
                         /* The retransmission queue is always in order, so
                          * we can short-circuit the walk early.
                          */
diff -ur linux-2.6.22.6/net/ipv4/tcp_ipv4.c linux-2.6.22.6-rbtree-davem-fixed/net/ipv4/tcp_ipv4.c
--- linux-2.6.22.6/net/ipv4/tcp_ipv4.c	2007-08-30 23:21:01.000000000 -0700
+++ linux-2.6.22.6-rbtree-davem-fixed/net/ipv4/tcp_ipv4.c	2007-09-13 18:23:16.000000000 -0700
@@ -1845,6 +1845,7 @@
         struct inet_connection_sock *icsk = inet_csk(sk);
         struct tcp_sock *tp = tcp_sk(sk);
 
+        tp->write_queue_rb = RB_ROOT;
         skb_queue_head_init(&tp->out_of_order_queue);
         tcp_init_xmit_timers(sk);
         tcp_prequeue_init(tp);
diff -ur linux-2.6.22.6/net/ipv4/tcp_minisocks.c linux-2.6.22.6-rbtree-davem-fixed/net/ipv4/tcp_minisocks.c
--- linux-2.6.22.6/net/ipv4/tcp_minisocks.c	2007-08-30 23:21:01.000000000 -0700
+++ linux-2.6.22.6-rbtree-davem-fixed/net/ipv4/tcp_minisocks.c	2007-09-13 18:23:16.000000000 -0700
@@ -421,6 +421,7 @@
 
                 tcp_set_ca_state(newsk, TCP_CA_Open);
                 tcp_init_xmit_timers(newsk);
+                newtp->write_queue_rb = RB_ROOT;
                 skb_queue_head_init(&newtp->out_of_order_queue);
                 newtp->write_seq = treq->snt_isn + 1;
                 newtp->pushed_seq = newtp->write_seq;
diff -ur linux-2.6.22.6/net/ipv4/tcp_output.c linux-2.6.22.6-rbtree-davem-fixed/net/ipv4/tcp_output.c
--- linux-2.6.22.6/net/ipv4/tcp_output.c	2007-08-30 23:21:01.000000000 -0700
+++ linux-2.6.22.6-rbtree-davem-fixed/net/ipv4/tcp_output.c	2007-09-13 18:23:16.000000000 -0700
@@ -1278,11 +1278,11 @@
         sk_charge_skb(sk, nskb);
 
         skb = tcp_send_head(sk);
+        TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(skb)->seq;
+        TCP_SKB_CB(nskb)->end_seq = TCP_SKB_CB(skb)->seq + probe_size;
         tcp_insert_write_queue_before(nskb, skb, sk);
         tcp_advance_send_head(sk, skb);
 
-        TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(skb)->seq;
-        TCP_SKB_CB(nskb)->end_seq = TCP_SKB_CB(skb)->seq + probe_size;
         TCP_SKB_CB(nskb)->flags = TCPCB_FLAG_ACK;
         TCP_SKB_CB(nskb)->sacked = 0;
         nskb->csum = 0;
diff -ur linux-2.6.22.6/net/ipv6/tcp_ipv6.c linux-2.6.22.6-rbtree-davem-fixed/net/ipv6/tcp_ipv6.c
--- linux-2.6.22.6/net/ipv6/tcp_ipv6.c	2007-08-30 23:21:01.000000000 -0700
+++ linux-2.6.22.6-rbtree-davem-fixed/net/ipv6/tcp_ipv6.c	2007-09-13 18:23:16.000000000 -0700
@@ -1900,6 +1900,7 @@
         struct inet_connection_sock *icsk = inet_csk(sk);
         struct tcp_sock *tp = tcp_sk(sk);
 
+        tp->write_queue_rb = RB_ROOT;
         skb_queue_head_init(&tp->out_of_order_queue);
         tcp_init_xmit_timers(sk);
         tcp_prequeue_init(tp);
-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at http://vger.kernel.org/majordomo-info.html