Resending and requesting comments. (Patch was against 2.6.13-rc1, 
so it's a little stale by now.)

---------- Forwarded message ----------
.....

Version 2 of the rfc/patch is attached. It has been changed 
as indicated in the commentary below.

Diffstat:
 include/linux/sysctl.h     |    1
 net/ipv4/ip_fragment.c     |  195 +++++++++++++++++++++++++++++++++++++++++++++
 net/ipv4/sysctl_net_ipv4.c |   11 ++

Signed-off-by: Arthur Kepner <[EMAIL PROTECTED]>

On Tue, 28 Jun 2005, Arthur Kepner wrote:

> 
> On Sun, 26 Jun 2005, Herbert Xu wrote:
> 
> > 
> > > +struct ipc {
> > > ......
> > > + struct rcu_head         rcu;
> > 
> > Is RCU worth it here? The only time we'd be taking the locks on this
> > is when the first fragment of a packet comes in.  At that point we'll
> > be taking write_lock(&ipfrag_lock) anyway.
> > 

Yes, I think RCU is worth it here. The reason is that not 
using RCU would necessitate grabbing the (global) 
ipfrag_lock an additional time, when we free an ipc.

Adding an "ipc" to the hashtable could be done under the
ipfrag_lock, as you mention. But removing an ipc shouldn't 
be done at the same time that fragments are destroyed, 
because the common case is that another fragment queue will 
soon be created for the same (src,dst,proto). Better to 
save the ipc for a while to avoid freeing and then 
immediately recreating it. 

Since the freeing of the ipc has to be deferred until 
well after the last associated fragment queue has been 
freed, we can't take advantage of the fact that the 
ipfrag_lock is held when the fragment queue is freed.  

So when finally freeing the ipc, we can either grab the 
global ipfrag_lock again, or use some other, finer-grained 
lock to protect the ipc_hash entries. I'd prefer to avoid 
introducing new uses of global locks.

If we use the finer-grained ipc_hash[].lock locks then 
rcu allows us to avoid taking any locks in ipc_find when we 
create a new fragment chain and there already happens to be 
an ipc for the associated (src,dst,proto). (I suspect this 
would be a fairly common case.)

> > The only other use of RCU in your patch is ip_count.  That should be
> > changed to be done in ip_defrag instead.  At that point you can simply
> > find the ipc by dereferencing ipq, so no need for __ipc_find and hence
> > RCU.
> > 
> > The reason you need to change it in this way is because you can't make
> > assumptions about ip_rcv_finish being the first place where a packet
> > is defragmented.  With connection tracking enabled conntrack is the first
> > place where defragmentation occurs.
> >   
> .....

This has been fixed. ip_input.c isn't changed by this version 
of the patch. But there's the caveat that I mentioned earlier:

> 
> There is a (big) advantage to doing this in ip_defrag() - this 
> becomes a no-op for non-fragmented datagrams. The disadvantage 
> is that there could be a situation where you receive:
> 
>   1) first fragment of datagram X [for a particular (src,dst,proto)]
>   2) a zillion non-fragmented datagrams [for the same (src,dst,proto)]
>   3) last fragment of datagram X [for (src,dst,proto)]
> 
> and no "disorder" would be detected for the datagrams associated 
> with (src,dst,proto), even though the ip id could have wrapped in the 
> meantime. This seems like a very uncommon case, however.
> 
> 
> > > +#define IPC_HASHSZ       IPQ_HASHSZ
> > > +static struct {
> > > + struct hlist_head head;
> > > + spinlock_t lock;
> > > +} ipc_hash[IPC_HASHSZ];
> > 
> > I'd store ipc entries in the main ipq hash table since they can use
> > the same keys for lookup as ipq entries.  You just need to set protocol
> > to zero and map the user to values specific to ipc for ipc entries.
> > One mapping would be to set the top bit of user for ipc entries, e.g.
> > 
> > #define IP_DEFRAG_IPC 0x80000000
> >     ipc->user = ipq->user | IP_DEFRAG_IPC;
> > 
> > Of course you also need to make sure that the two structures share
> > the leading elements.  You can then use the user field to distinguish
> > between ipc/ipq entries.
> 

I thought about this point, but I dislike reusing the same 
structure for such different purposes, so I left this unchanged. 

Comments?

--
Arthur
diff -rup linux.orig/include/linux/sysctl.h linux/include/linux/sysctl.h
--- linux.orig/include/linux/sysctl.h   2005-07-06 12:32:03.224546953 -0700
+++ linux/include/linux/sysctl.h        2005-07-07 09:56:42.854845089 -0700
@@ -343,6 +343,7 @@ enum
        NET_TCP_BIC_BETA=108,
        NET_IPV4_ICMP_ERRORS_USE_INBOUND_IFADDR=109,
        NET_TCP_CONG_CONTROL=110,
+       NET_IPV4_REASM_COUNT=111,
 };
 
 enum {
diff -rup linux.orig/net/ipv4/ip_fragment.c linux/net/ipv4/ip_fragment.c
--- linux.orig/net/ipv4/ip_fragment.c   2005-07-06 12:30:51.033380830 -0700
+++ linux/net/ipv4/ip_fragment.c        2005-07-07 09:56:42.856798234 -0700
@@ -56,6 +56,8 @@
 int sysctl_ipfrag_high_thresh = 256*1024;
 int sysctl_ipfrag_low_thresh = 192*1024;
 
+int sysctl_ip_reassembly_count;
+
 /* Important NOTE! Fragment queue must be destroyed before MSL expires.
  * RFC791 is wrong proposing to prolongate timer each fragment arrival by TTL.
  */
@@ -69,6 +71,25 @@ struct ipfrag_skb_cb
 
 #define FRAG_CB(skb)   ((struct ipfrag_skb_cb*)((skb)->cb))
 
+/* struct ipc contains a count of the number of IP datagrams 
+ * received for a (saddr, daddr, protocol) tuple - but one of 
+ * these structures exists for a given (saddr, daddr, protocol) 
+ * if and only if there is a queue of IP fragments associated 
+ * with that 3-tuple and sysctl_ip_reassembly_count is non-zero.
+ */
+struct ipc {
+       struct hlist_node       node;
+       u32                     saddr;
+       u32                     daddr;
+       u8                      protocol;
+       atomic_t                refcnt; /* how many ipqs hold refs to us */
+       atomic_t                seq;    /* how many ip fragments for this 
+                                        * (saddr,daddr,protocol) since we 
+                                        * were created */
+       struct timer_list       timer;
+       struct rcu_head         rcu;
+};
+
 /* Describe an entry in the "incomplete datagrams" queue. */
 struct ipq {
        struct ipq      *next;          /* linked list pointers                 
*/
@@ -92,6 +113,14 @@ struct ipq {
        struct ipq      **pprev;
        int             iif;
        struct timeval  stamp;
+       struct ipc      *ipc;
+       atomic_t        seq;            
+       /* ipq->seq defines the "bottom" of the window of sequence numbers 
+        * that are valid for this fragment - the "top" of the window is 
+        * (ipq->seq + sysctl_ip_reassembly_count). ipq->seq is initialized
+        * to the value in the associated ipc when the fragment queue is 
+        * created, and incremented each time a fragment is added to the 
+        * queue */
 };
 
 /* Hash table. */
@@ -105,6 +134,12 @@ static u32 ipfrag_hash_rnd;
 static LIST_HEAD(ipq_lru_list);
 int ip_frag_nqueues = 0;
 
+#define IPC_HASHSZ     IPQ_HASHSZ
+static struct {
+       struct hlist_head head;
+       spinlock_t lock;
+} ipc_hash[IPC_HASHSZ];
+
 static __inline__ void __ipq_unlink(struct ipq *qp)
 {
        if(qp->next)
@@ -121,6 +156,11 @@ static __inline__ void ipq_unlink(struct
        write_unlock(&ipfrag_lock);
 }
 
+static unsigned int ipchashfn(u32 saddr, u32 daddr, u8 prot)
+{
+       return jhash_3words(prot, saddr, daddr, 0) & (IPC_HASHSZ - 1);
+}
+
 static unsigned int ipqhashfn(u16 id, u32 saddr, u32 daddr, u8 prot)
 {
        return jhash_3words((u32)id << 16 | prot, saddr, daddr,
@@ -231,8 +271,16 @@ static __inline__ void ipq_put(struct ip
  */
 static void ipq_kill(struct ipq *ipq)
 {
+       struct ipc *cp = ipq->ipc;
+
        if (del_timer(&ipq->timer))
                atomic_dec(&ipq->refcnt);
+       if (cp) {
+               atomic_dec(&cp->refcnt);
+               /* no particular reason to use sysctl_ipfrag_time 
+                * for this timer */
+               mod_timer(&cp->timer, jiffies + sysctl_ipfrag_time);
+       }
 
        if (!(ipq->last_in & COMPLETE)) {
                ipq_unlink(ipq);
@@ -348,10 +396,110 @@ static struct ipq *ip_frag_intern(unsign
        return qp;
 }
 
+static inline void __ipc_destroy(struct rcu_head *head)
+{
+       kfree(container_of(head, struct ipc, rcu));
+}
+
+static void ipc_destroy(unsigned long arg) 
+{
+       struct ipc *cp = (struct ipc *) arg;
+       unsigned int hash = ipchashfn(cp->saddr, cp->daddr, cp->protocol);
+
+       spin_lock(&ipc_hash[hash].lock);
+       BUG_ON((atomic_read(&cp->refcnt)) < 0);
+
+       if (atomic_read(&cp->refcnt) == 0) {
+               hlist_del_rcu(&cp->node);
+               call_rcu(&cp->rcu, __ipc_destroy);
+       }
+       spin_unlock(&ipc_hash[hash].lock);
+}
+
+/* 
+ * must hold spinlock for the appropriate hash list head when 
+ * __ipc_create is called 
+ */
+
+static inline struct ipc *__ipc_create(struct iphdr *iph, 
+                                      const unsigned int hash) 
+{
+       struct ipc *cp = kmalloc(sizeof(struct ipc), GFP_ATOMIC);
+       /* XXX should we account size to ip_frag_mem ??? */
+       if (cp) {
+               cp->saddr = iph->saddr;
+               cp->daddr = iph->daddr;
+               cp->protocol = iph->protocol;
+               atomic_set(&cp->seq, 0);
+               atomic_set(&cp->refcnt, 1);
+               INIT_HLIST_NODE(&cp->node);
+               hlist_add_head_rcu(&cp->node, &ipc_hash[hash].head);
+               init_timer(&cp->timer);
+               cp->timer.data = (unsigned long) cp;
+               cp->timer.function = ipc_destroy;
+       } else {
+               NETDEBUG(if (net_ratelimit()) 
+                       printk(KERN_ERR "__ipc_create: no memory left !\n"));
+       }
+       return cp;
+}
+
+/* 
+ * must be "rcu safe" when __ipc_find is called - either use 
+ * rcu_read_lock (if you intend only to read the returned struct) 
+ * or grab the spinlock for the appropriate hash list head (if 
+ * you might modify the returned struct) 
+ */
+static inline struct ipc *__ipc_find(u32 saddr, u32 daddr, u8 protocol, 
+                                    const unsigned int hash)
+{
+       struct hlist_node *p;
+
+       hlist_for_each_rcu(p, &ipc_hash[hash].head) {
+               struct ipc * cp = (struct ipc *)p;
+               if(cp->saddr == saddr &&
+                  cp->daddr == daddr &&
+                  cp->protocol == protocol) {
+                       return cp;
+               }
+       }
+       return NULL;
+}
+
+static struct ipc *ipc_find(struct iphdr *iph)
+{
+       struct ipc *cp;
+       unsigned int hash = ipchashfn(iph->saddr, iph->daddr, iph->protocol);
+
+       rcu_read_lock();
+       if((cp = __ipc_find(iph->saddr, iph->daddr, 
+                           iph->protocol, hash)) != NULL) {
+               atomic_inc(&cp->refcnt);
+               rcu_read_unlock();
+               return cp;
+       }
+       rcu_read_unlock();
+       spin_lock(&ipc_hash[hash].lock);
+       if((cp = __ipc_find(iph->saddr, iph->daddr, 
+                           iph->protocol, hash)) != NULL) {
+               atomic_inc(&cp->refcnt);
+               spin_unlock(&ipc_hash[hash].lock);
+               return cp;
+       }
+       cp = __ipc_create(iph, hash);
+       spin_unlock(&ipc_hash[hash].lock);
+       return cp;
+}
+
+
 /* Add an entry to the 'ipq' queue for a newly received IP datagram. */
 static struct ipq *ip_frag_create(unsigned hash, struct iphdr *iph, u32 user)
 {
        struct ipq *qp;
+       struct ipc *cp = NULL;
+
+       if (sysctl_ip_reassembly_count && (cp = ipc_find(iph)) == NULL)
+               return NULL;
 
        if ((qp = frag_alloc_queue()) == NULL)
                goto out_nomem;
@@ -366,6 +514,10 @@ static struct ipq *ip_frag_create(unsign
        qp->meat = 0;
        qp->fragments = NULL;
        qp->iif = 0;
+       qp->ipc = cp;
+       if (sysctl_ip_reassembly_count && cp) {
+               atomic_set(&qp->seq, atomic_read(&cp->seq));
+       }
 
        /* Initialize a timer for this entry. */
        init_timer(&qp->timer);
@@ -381,6 +533,39 @@ out_nomem:
        return NULL;
 }
 
+static inline int in_window(int bottom, int size, int seq) {
+       return (((seq - bottom) >= 0) && ((seq - (bottom + size)) < 0));
+}
+
+static int __ip_reassembly_count_check(const struct iphdr *iph, struct ipq *qp)
+{
+       struct ipc *cp = qp->ipc;
+       int cseq, qseq;
+
+       /* qp->ipc may be NULL if sysctl_ip_reassembly_count was off 
+        * at the time the fragment queue was created */
+       if (cp == NULL)
+               return 0;
+
+       cseq = atomic_inc_return(&cp->seq);
+       qseq = atomic_inc_return(&qp->seq);
+
+       if (!in_window(qseq, sysctl_ip_reassembly_count, cseq)) {
+               atomic_inc(&qp->refcnt);
+               read_unlock(&ipfrag_lock);
+               spin_lock(&qp->lock);
+               if (!(qp->last_in&COMPLETE))
+                       ipq_kill(qp);
+               spin_unlock(&qp->lock);
+               ipq_put(qp, NULL);
+               IP_INC_STATS_BH(IPSTATS_MIB_REASMFAILS);
+               read_lock(&ipfrag_lock);
+               return 1;
+       }
+       return 0;
+}
+
+
 /* Find the correct entry in the "incomplete datagrams" queue for
  * this IP datagram, and create new one, if nothing is found.
  */
@@ -400,6 +585,10 @@ static inline struct ipq *ip_find(struct
                   qp->daddr == daddr   &&
                   qp->protocol == protocol &&
                   qp->user == user) {
+                       if (sysctl_ip_reassembly_count &&
+                               __ip_reassembly_count_check(iph, qp)) {
+                               break;
+                       }
                        atomic_inc(&qp->refcnt);
                        read_unlock(&ipfrag_lock);
                        return qp;
@@ -679,9 +868,15 @@ struct sk_buff *ip_defrag(struct sk_buff
 
 void ipfrag_init(void)
 {
+       int i;
        ipfrag_hash_rnd = (u32) ((num_physpages ^ (num_physpages>>7)) ^
                                 (jiffies ^ (jiffies >> 6)));
 
+       for (i = 0; i < IPC_HASHSZ; i++ ) {
+               INIT_HLIST_HEAD(&ipc_hash[i].head);
+               spin_lock_init(&ipc_hash[i].lock);
+       }
+
        init_timer(&ipfrag_secret_timer);
        ipfrag_secret_timer.function = ipfrag_secret_rebuild;
        ipfrag_secret_timer.expires = jiffies + sysctl_ipfrag_secret_interval;
diff -rup linux.orig/net/ipv4/sysctl_net_ipv4.c linux/net/ipv4/sysctl_net_ipv4.c
--- linux.orig/net/ipv4/sysctl_net_ipv4.c       2005-07-06 12:34:16.685869538 
-0700
+++ linux/net/ipv4/sysctl_net_ipv4.c    2005-07-07 09:56:42.857774806 -0700
@@ -30,6 +30,7 @@ extern int sysctl_ipfrag_low_thresh;
 extern int sysctl_ipfrag_high_thresh; 
 extern int sysctl_ipfrag_time;
 extern int sysctl_ipfrag_secret_interval;
+extern int sysctl_ip_reassembly_count;
 
 /* From ip_output.c */
 extern int sysctl_ip_dynaddr;
@@ -50,6 +51,7 @@ extern int inet_peer_gc_mintime;
 extern int inet_peer_gc_maxtime;
 
 #ifdef CONFIG_SYSCTL
+static int zero;
 static int tcp_retr1_max = 255; 
 static int ip_local_port_range_min[] = { 1, 1 };
 static int ip_local_port_range_max[] = { 65535, 65535 };
@@ -643,6 +645,15 @@ ctl_table ipv4_table[] = {
                .strategy       = &sysctl_jiffies
        },
        {
+               .ctl_name       = NET_IPV4_REASM_COUNT,
+               .procname       = "ip_reassembly_count",
+               .data           = &sysctl_ip_reassembly_count,
+               .maxlen         = sizeof(int),
+               .mode           = 0644,
+               .proc_handler   = &proc_dointvec_minmax,
+               .extra1         = &zero
+       },
+       {
                .ctl_name       = NET_TCP_NO_METRICS_SAVE,
                .procname       = "tcp_no_metrics_save",
                .data           = &sysctl_tcp_nometrics_save,

Reply via email to