> Patch 1: David Miller's red-black tree code, tweaked for 2.6.22.6,
> with some bugfixes

oops...

The original patch can cause a kernel panic if tcp_sacktag_write_queue
is called when the write queue is empty. Use this one instead.
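
For reference, the failure mode: tcp_write_queue_head() returns NULL
when sk_write_queue is empty, and as far as I can tell the original
patch read the head's fack_count unconditionally, i.e. something
equivalent to this (reconstructed from the fix below, not a verbatim
quote):

	/* pre-fix: NULL dereference when the write queue is empty */
	fack_count_base = TCP_SKB_CB(tcp_write_queue_head(sk))->fack_count;

The fixed version below reads ->fack_count only behind a NULL check,
and each SACK block bails out early when tcp_write_queue_find()
returns no skb, so fack_count_base is never used uninitialized.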

-Tom

---
diff -ur linux-2.6.22.6/include/linux/skbuff.h linux-2.6.22.6-rbtree-davem-fixed/include/linux/skbuff.h
--- linux-2.6.22.6/include/linux/skbuff.h       2007-08-30 23:21:01.000000000 -0700
+++ linux-2.6.22.6-rbtree-davem-fixed/include/linux/skbuff.h    2007-09-13 18:23:16.000000000 -0700
@@ -18,6 +18,7 @@
 #include <linux/compiler.h>
 #include <linux/time.h>
 #include <linux/cache.h>
+#include <linux/rbtree.h>
 
 #include <asm/atomic.h>
 #include <asm/types.h>
@@ -242,6 +243,8 @@
        struct sk_buff          *next;
        struct sk_buff          *prev;
 
+       struct rb_node          rb;
+
        struct sock             *sk;
        ktime_t                 tstamp;
        struct net_device       *dev;
diff -ur linux-2.6.22.6/include/linux/tcp.h linux-2.6.22.6-rbtree-davem-fixed/include/linux/tcp.h
--- linux-2.6.22.6/include/linux/tcp.h  2007-08-30 23:21:01.000000000 -0700
+++ linux-2.6.22.6-rbtree-davem-fixed/include/linux/tcp.h       2007-09-13 18:23:16.000000000 -0700
@@ -174,6 +174,7 @@
 
 #include <linux/skbuff.h>
 #include <linux/dmaengine.h>
+#include <linux/rbtree.h>
 #include <net/sock.h>
 #include <net/inet_connection_sock.h>
 #include <net/inet_timewait_sock.h>
@@ -321,6 +322,7 @@
        u32     snd_cwnd_used;
        u32     snd_cwnd_stamp;
 
+       struct rb_root          write_queue_rb;
        struct sk_buff_head     out_of_order_queue; /* Out of order segments go here */
 
        u32     rcv_wnd;        /* Current receiver window              */
@@ -339,9 +341,7 @@
        struct sk_buff *scoreboard_skb_hint;
        struct sk_buff *retransmit_skb_hint;
        struct sk_buff *forward_skb_hint;
-       struct sk_buff *fastpath_skb_hint;
 
-       int     fastpath_cnt_hint;
        int     lost_cnt_hint;
        int     retransmit_cnt_hint;
        int     forward_cnt_hint;
diff -ur linux-2.6.22.6/include/net/tcp.h linux-2.6.22.6-rbtree-davem-fixed/include/net/tcp.h
--- linux-2.6.22.6/include/net/tcp.h    2007-08-30 23:21:01.000000000 -0700
+++ linux-2.6.22.6-rbtree-davem-fixed/include/net/tcp.h 2007-09-19 17:36:07.000000000 -0700
@@ -540,6 +540,7 @@
        __u32           seq;            /* Starting sequence number     */
        __u32           end_seq;        /* SEQ + FIN + SYN + datalen    */
        __u32           when;           /* used to compute rtt's        */
+       unsigned int    fack_count;     /* speed up SACK processing     */
        __u8            flags;          /* TCP header flags.            */
 
        /* NOTE: These must match up to the flags byte in a
@@ -1043,12 +1044,12 @@
 }
 
 /*from STCP */
-static inline void clear_all_retrans_hints(struct tcp_sock *tp){
+static inline void clear_all_retrans_hints(struct tcp_sock *tp)
+{
        tp->lost_skb_hint = NULL;
        tp->scoreboard_skb_hint = NULL;
        tp->retransmit_skb_hint = NULL;
        tp->forward_skb_hint = NULL;
-       tp->fastpath_skb_hint = NULL;
 }
 
 /* MD5 Signature */
@@ -1166,6 +1167,7 @@
 
        while ((skb = __skb_dequeue(&sk->sk_write_queue)) != NULL)
                sk_stream_free_skb(sk, skb);
+       tcp_sk(sk)->write_queue_rb = RB_ROOT;
        sk_stream_mem_reclaim(sk);
 }
 
@@ -1208,7 +1210,7 @@
 {
        struct tcp_sock *tp = tcp_sk(sk);
 
-       sk->sk_send_head = skb->next;
+       sk->sk_send_head = tcp_write_queue_next(sk, skb);
        if (sk->sk_send_head == (struct sk_buff *)&sk->sk_write_queue)
                sk->sk_send_head = NULL;
        /* Don't override Nagle indefinately with F-RTO */
@@ -1227,9 +1229,61 @@
        sk->sk_send_head = NULL;
 }
 
+static inline struct sk_buff *tcp_write_queue_find(struct sock *sk, __u32 seq)
+{
+       struct rb_node *rb_node = tcp_sk(sk)->write_queue_rb.rb_node;
+       struct sk_buff *skb = NULL;
+
+       while (rb_node) {
+               struct sk_buff *tmp = rb_entry(rb_node, struct sk_buff, rb);
+               if (TCP_SKB_CB(tmp)->end_seq > seq) {
+                       skb = tmp;
+                       if (TCP_SKB_CB(tmp)->seq <= seq)
+                               break;
+                       rb_node = rb_node->rb_left;
+               } else
+                       rb_node = rb_node->rb_right;
+
+       }
+       return skb;
+}
+
+static inline void tcp_rb_insert(struct sk_buff *skb, struct rb_root *root)
+{
+       struct rb_node **rb_link, *rb_parent;
+       __u32 seq = TCP_SKB_CB(skb)->seq;
+
+       rb_link = &root->rb_node;
+       rb_parent = NULL;
+       while (*rb_link != NULL) {
+               struct sk_buff *tmp = rb_entry(*rb_link, struct sk_buff, rb);
+               rb_parent = *rb_link;
+               if (TCP_SKB_CB(tmp)->end_seq > seq) {
+                       BUG_ON(TCP_SKB_CB(tmp)->seq <= seq);
+                       rb_link = &rb_parent->rb_left;
+               } else {
+                       rb_link = &rb_parent->rb_right;
+               }
+       }
+       rb_link_node(&skb->rb, rb_parent, rb_link);
+       rb_insert_color(&skb->rb, root);
+}
+
+static inline void tcp_rb_unlink(struct sk_buff *skb, struct rb_root *root)
+{
+       rb_erase(&skb->rb, root);
+}
+
 static inline void __tcp_add_write_queue_tail(struct sock *sk, struct sk_buff *skb)
 {
+       struct sk_buff *tail = tcp_write_queue_tail(sk);
+       unsigned int fc = 0;
+
+       if (tail)
+               fc = TCP_SKB_CB(tail)->fack_count + tcp_skb_pcount(tail);
+       TCP_SKB_CB(skb)->fack_count = fc;
        __skb_queue_tail(&sk->sk_write_queue, skb);
+       tcp_rb_insert(skb, &tcp_sk(sk)->write_queue_rb);
 }
 
 static inline void tcp_add_write_queue_tail(struct sock *sk, struct sk_buff *skb)
@@ -1241,9 +1295,35 @@
                sk->sk_send_head = skb;
 }
 
+/* This is only used for tcp_send_synack(), so the write queue should
+ * be empty.  If that stops being true, the fack_count assignment
+ * will need to be more elaborate.
+ */
 static inline void __tcp_add_write_queue_head(struct sock *sk, struct sk_buff *skb)
 {
+       BUG_ON(!skb_queue_empty(&sk->sk_write_queue));
        __skb_queue_head(&sk->sk_write_queue, skb);
+       TCP_SKB_CB(skb)->fack_count = 0;
+       tcp_rb_insert(skb, &tcp_sk(sk)->write_queue_rb);
+}
+
+/* An insert into the middle of the write queue causes the fack
+ * counts in subsequent packets to become invalid, fix them up.
+ */
+static inline void tcp_reset_fack_counts(struct sock *sk, struct sk_buff *first)
+{
+       struct sk_buff *prev = first->prev;
+       unsigned int fc = 0;
+
+       if (prev != (struct sk_buff *) &sk->sk_write_queue)
+               fc = TCP_SKB_CB(prev)->fack_count + tcp_skb_pcount(prev);
+
+       while (first != (struct sk_buff *)&sk->sk_write_queue) {
+               TCP_SKB_CB(first)->fack_count = fc;
+
+               fc += tcp_skb_pcount(first);
+               first = first->next;
+       }
 }
 
 /* Insert buff after skb on the write queue of sk.  */
@@ -1252,19 +1332,24 @@
                                                struct sock *sk)
 {
        __skb_append(skb, buff, &sk->sk_write_queue);
+       tcp_reset_fack_counts(sk, buff);
+       tcp_rb_insert(buff, &tcp_sk(sk)->write_queue_rb);
 }
 
-/* Insert skb between prev and next on the write queue of sk.  */
+/* Insert new before skb on the write queue of sk.  */
 static inline void tcp_insert_write_queue_before(struct sk_buff *new,
                                                  struct sk_buff *skb,
                                                  struct sock *sk)
 {
        __skb_insert(new, skb->prev, skb, &sk->sk_write_queue);
+       tcp_reset_fack_counts(sk, new);
+       tcp_rb_insert(new, &tcp_sk(sk)->write_queue_rb);
 }
 
 static inline void tcp_unlink_write_queue(struct sk_buff *skb, struct sock *sk)
 {
        __skb_unlink(skb, &sk->sk_write_queue);
+       tcp_rb_unlink(skb, &tcp_sk(sk)->write_queue_rb);
 }
 
 static inline int tcp_skb_is_last(const struct sock *sk,
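
To make the fack_count bookkeeping above concrete, here is a small
userspace sketch (mock pcount values, plain C, not kernel code) of
the tail-append rule in __tcp_add_write_queue_tail(): each new tail
skb gets the previous tail's fack_count plus its pcount, so the field
records how many packets were ever queued ahead of it:

	#include <stdio.h>

	int main(void)
	{
		unsigned int pcount[] = { 3, 1, 2 };	/* made-up TSO pcounts */
		unsigned int fc = 0;			/* fack_count of next skb */
		int i;

		for (i = 0; i < 3; i++) {
			printf("skb %d: fack_count = %u\n", i, fc);
			fc += pcount[i];	/* tail rule: prev fc + prev pcount */
		}
		/* prints 0, 3, 4; a fourth skb would get fack_count 6 */
		return 0;
	}

tcp_reset_fack_counts() re-runs this same recurrence from the
insertion point onward whenever an skb is inserted mid-queue, since
every fack_count after the insert becomes stale.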
diff -ur linux-2.6.22.6/net/ipv4/tcp_input.c linux-2.6.22.6-rbtree-davem-fixed/net/ipv4/tcp_input.c
--- linux-2.6.22.6/net/ipv4/tcp_input.c 2007-08-30 23:21:01.000000000 -0700
+++ linux-2.6.22.6-rbtree-davem-fixed/net/ipv4/tcp_input.c      2007-09-20 14:10:26.000000000 -0700
@@ -947,14 +947,13 @@
        unsigned char *ptr = (skb_transport_header(ack_skb) +
                              TCP_SKB_CB(ack_skb)->sacked);
        struct tcp_sack_block_wire *sp = (struct tcp_sack_block_wire *)(ptr+2);
-       struct sk_buff *cached_skb;
        int num_sacks = (ptr[1] - TCPOLEN_SACK_BASE)>>3;
        int reord = tp->packets_out;
        int prior_fackets;
        u32 lost_retrans = 0;
        int flag = 0;
        int found_dup_sack = 0;
-       int cached_fack_count;
+       int fack_count_base;
        int i;
        int first_sack_index;
 
@@ -1020,7 +1019,6 @@
                num_sacks = 1;
        else {
                int j;
-               tp->fastpath_skb_hint = NULL;
 
                /* order SACK blocks to allow in order walk of the retrans queue */
                for (i = num_sacks-1; i > 0; i--) {
@@ -1045,13 +1043,8 @@
        /* clear flag as used for different purpose in following code */
        flag = 0;
 
-       /* Use SACK fastpath hint if valid */
-       cached_skb = tp->fastpath_skb_hint;
-       cached_fack_count = tp->fastpath_cnt_hint;
-       if (!cached_skb) {
-               cached_skb = tcp_write_queue_head(sk);
-               cached_fack_count = 0;
-       }
+       if (likely(tcp_write_queue_head(sk)))
+               fack_count_base = TCP_SKB_CB(tcp_write_queue_head(sk))->fack_count;
 
        for (i=0; i<num_sacks; i++, sp++) {
                struct sk_buff *skb;
@@ -1060,8 +1053,10 @@
                int fack_count;
                int dup_sack = (found_dup_sack && (i == first_sack_index));
 
-               skb = cached_skb;
-               fack_count = cached_fack_count;
+               skb = tcp_write_queue_find(sk, start_seq);
+               if (!skb)
+                       continue;
+               fack_count = TCP_SKB_CB(skb)->fack_count - fack_count_base;
 
                /* Event "B" in the comment above. */
                if (after(end_seq, tp->high_seq))
@@ -1074,13 +1069,6 @@
                        if (skb == tcp_send_head(sk))
                                break;
 
-                       cached_skb = skb;
-                       cached_fack_count = fack_count;
-                       if (i == first_sack_index) {
-                               tp->fastpath_skb_hint = skb;
-                               tp->fastpath_cnt_hint = fack_count;
-                       }
-
                        /* The retransmission queue is always in order, so
                         * we can short-circuit the walk early.
                         */
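
A note on the arithmetic in the sacktag loop above: each skb's
fack_count is absolute (assigned at queue time and, as I read it,
never rebased when acked skbs are unlinked from the head), so
subtracting the current head's value converts it into "packets still
queued ahead of this skb":

	fack_count = TCP_SKB_CB(skb)->fack_count - fack_count_base;

That is the same quantity the removed fastpath_skb_hint /
fastpath_cnt_hint pair used to carry between calls, except it is now
recomputed per SACK block with an O(log n) tcp_write_queue_find()
lookup instead of a cached linear walk.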
diff -ur linux-2.6.22.6/net/ipv4/tcp_ipv4.c linux-2.6.22.6-rbtree-davem-fixed/net/ipv4/tcp_ipv4.c
--- linux-2.6.22.6/net/ipv4/tcp_ipv4.c  2007-08-30 23:21:01.000000000 -0700
+++ linux-2.6.22.6-rbtree-davem-fixed/net/ipv4/tcp_ipv4.c       2007-09-13 18:23:16.000000000 -0700
@@ -1845,6 +1845,7 @@
        struct inet_connection_sock *icsk = inet_csk(sk);
        struct tcp_sock *tp = tcp_sk(sk);
 
+       tp->write_queue_rb = RB_ROOT;
        skb_queue_head_init(&tp->out_of_order_queue);
        tcp_init_xmit_timers(sk);
        tcp_prequeue_init(tp);
diff -ur linux-2.6.22.6/net/ipv4/tcp_minisocks.c linux-2.6.22.6-rbtree-davem-fixed/net/ipv4/tcp_minisocks.c
--- linux-2.6.22.6/net/ipv4/tcp_minisocks.c     2007-08-30 23:21:01.000000000 -0700
+++ linux-2.6.22.6-rbtree-davem-fixed/net/ipv4/tcp_minisocks.c  2007-09-13 18:23:16.000000000 -0700
@@ -421,6 +421,7 @@
 
                tcp_set_ca_state(newsk, TCP_CA_Open);
                tcp_init_xmit_timers(newsk);
+               newtp->write_queue_rb = RB_ROOT;
                skb_queue_head_init(&newtp->out_of_order_queue);
                newtp->write_seq = treq->snt_isn + 1;
                newtp->pushed_seq = newtp->write_seq;
diff -ur linux-2.6.22.6/net/ipv4/tcp_output.c linux-2.6.22.6-rbtree-davem-fixed/net/ipv4/tcp_output.c
--- linux-2.6.22.6/net/ipv4/tcp_output.c        2007-08-30 23:21:01.000000000 -0700
+++ linux-2.6.22.6-rbtree-davem-fixed/net/ipv4/tcp_output.c     2007-09-13 18:23:16.000000000 -0700
@@ -1278,11 +1278,11 @@
        sk_charge_skb(sk, nskb);
 
        skb = tcp_send_head(sk);
+       TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(skb)->seq;
+       TCP_SKB_CB(nskb)->end_seq = TCP_SKB_CB(skb)->seq + probe_size;
        tcp_insert_write_queue_before(nskb, skb, sk);
        tcp_advance_send_head(sk, skb);
 
-       TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(skb)->seq;
-       TCP_SKB_CB(nskb)->end_seq = TCP_SKB_CB(skb)->seq + probe_size;
        TCP_SKB_CB(nskb)->flags = TCPCB_FLAG_ACK;
        TCP_SKB_CB(nskb)->sacked = 0;
        nskb->csum = 0;
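
(Context for the tcp_output.c hunk: the two TCP_SKB_CB(nskb)
assignments move above tcp_insert_write_queue_before() because that
helper now calls tcp_rb_insert(), which keys the tree on
TCP_SKB_CB(skb)->seq, so nskb's sequence numbers must be valid before
it goes into the tree.)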
diff -ur linux-2.6.22.6/net/ipv6/tcp_ipv6.c linux-2.6.22.6-rbtree-davem-fixed/net/ipv6/tcp_ipv6.c
--- linux-2.6.22.6/net/ipv6/tcp_ipv6.c  2007-08-30 23:21:01.000000000 -0700
+++ linux-2.6.22.6-rbtree-davem-fixed/net/ipv6/tcp_ipv6.c       2007-09-13 18:23:16.000000000 -0700
@@ -1900,6 +1900,7 @@
        struct inet_connection_sock *icsk = inet_csk(sk);
        struct tcp_sock *tp = tcp_sk(sk);
 
+       tp->write_queue_rb = RB_ROOT;
        skb_queue_head_init(&tp->out_of_order_queue);
        tcp_init_xmit_timers(sk);
        tcp_prequeue_init(tp);