Hi Arthur: On Wed, Jul 27, 2005 at 10:24:44AM -0700, Arthur Kepner wrote: > > Resending and requesting comments. (Patch was against 2.6.13-rc1, > so it's a little stale by now.)
I've finished my patch based on your work. Unfortunately the result isn't as clean as I would've liked. However, it does point out a couple of problems with your patch more clearly. Perhaps you could work on those and come up with a better solution. One problem is that you're not incrementing cp->seq for the first fragment. This could lead to a significant deviation from the desired maximum window. > Yes, I think rcu is worth it here. The reason is that to > not use rcu would necessitate grabbing the (global) > ipfrag_lock an additional time, when we free an ipc. I got around this by unlinking the ipc entry when we're unlinking the ipq entry, which means that you get ipfrag_lock for free. However, your point below is still valid. > Adding an "ipc" to the hashtable could be done under the > ipfrag_lock, as you mention. But removing an ipc shouldn't > be done at the same time that fragments are destroyed, > because the common case is that another fragment queue will > soon be created for the same (src,dst,proto). Better to > save the ipc for a while to avoid freeing and then > immediately recreating it. I agree. Unfortunately, leaving ipc entries around opens us to a potential DoS attack. By spoofing source addresses the attacker could fill up our memory with these ipc entries. Granted we could prune them in ip_evictor. However, that does add significant complexity. So overall I'd say that the kmalloc cost is small enough that this isn't worth it. BTW, if we do this, then we really need to account the memory consumed by the ipc structures so that they're kept in check. I've done that in my patch. > I thought about this point, but I dislike reusing the same > structure for such different purposes, so left this unchanged. I'm still undecided on this one. For now I've chosen to put them in the same table as I couldn't see a lot of simplifications with storing them separately. 
But I'm willing to be convinced otherwise with a smaller ip_fragment.c :) Another thing that I changed is to use only saddr to key ipc. If we receive a long sequence of fragments from the same host, then any fragments we have which are not part of that sequence are likely to have been lost, regardless of the destination/protocol of packets in that sequence. Of course this wouldn't be true with NAT. However, daddr/protocol doesn't really help much with NAT anyway. Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmV>HI~} <[EMAIL PROTECTED]> Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -350,6 +350,7 @@ enum NET_TCP_BIC_BETA=108, NET_IPV4_ICMP_ERRORS_USE_INBOUND_IFADDR=109, NET_TCP_CONG_CONTROL=110, + NET_IPV4_IPFRAG_MAX_DIST=111, }; enum { diff --git a/include/net/ip.h b/include/net/ip.h --- a/include/net/ip.h +++ b/include/net/ip.h @@ -45,6 +45,7 @@ struct inet_skb_parm #define IPSKB_TRANSLATED 2 #define IPSKB_FORWARDED 4 #define IPSKB_XFRM_TUNNEL_SIZE 8 +#define IPSKB_FRAG_COMPLETE 16 }; struct ipcm_cookie @@ -297,11 +298,13 @@ enum ip_defrag_users IP_DEFRAG_NAT_OUT, IP_DEFRAG_VS_IN, IP_DEFRAG_VS_OUT, - IP_DEFRAG_VS_FWD + IP_DEFRAG_VS_FWD, + IP_DEFRAG_INTERNAL, }; struct sk_buff *ip_defrag(struct sk_buff *skb, u32 user); extern int ip_frag_nqueues; +extern int ip_frag_cnum; extern atomic_t ip_frag_mem; /* diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -22,6 +22,7 @@ * Patrick McHardy : LRU queue of frag heads for evictor. */ +#include <linux/compiler.h> #include <linux/config.h> #include <linux/module.h> #include <linux/types.h> @@ -56,6 +57,8 @@ int sysctl_ipfrag_high_thresh = 256*1024; int sysctl_ipfrag_low_thresh = 192*1024; +int sysctl_ipfrag_max_dist; + /* Important NOTE! Fragment queue must be destroyed before MSL expires. * RFC791 is wrong proposing to prolongate timer each fragment arrival by TTL. */ @@ -72,7 +75,7 @@ struct ipfrag_skb_cb /* Describe an entry in the "incomplete datagrams" queue. 
*/ struct ipq { struct ipq *next; /* linked list pointers */ - struct list_head lru_list; /* lru list member */ + struct ipq **pprev; u32 user; u32 saddr; u32 daddr; @@ -83,15 +86,39 @@ struct ipq { #define FIRST_IN 2 #define LAST_IN 1 + struct list_head lru_list; /* lru list member */ struct sk_buff *fragments; /* linked list of received fragments */ int len; /* total length of original datagram */ int meat; spinlock_t lock; atomic_t refcnt; struct timer_list timer; /* when will this queue expire? */ - struct ipq **pprev; - int iif; struct timeval stamp; + int iif; + + unsigned int genid; + struct ipc *cp; +}; + +/* + * struct ipc contains a count of the number of IP datagrams + * received from a specific saddr - but one of these structures + * exists for a given saddr if and only if there is a queue of + * IP fragments from that address and sysctl_ipfrag_max_dist + * is non-zero. + */ +struct ipc { + /* Shared elements with struct ipq. */ + struct ipq *next; + struct ipq **pprev; + u32 user; + u32 saddr; + u32 daddr; + u16 id; + u8 protocol; + + atomic_t genid; + atomic_t refcnt; }; /* Hash table. 
*/ @@ -104,9 +131,25 @@ static DEFINE_RWLOCK(ipfrag_lock); static u32 ipfrag_hash_rnd; static LIST_HEAD(ipq_lru_list); int ip_frag_nqueues = 0; +int ip_frag_cnum = 0; + +static inline struct ipc *__ipc_put(struct ipc *cp) +{ + if (!cp || !atomic_dec_and_test(&cp->refcnt)) + return NULL; + + if (cp->next) + cp->next->pprev = cp->pprev; + *cp->pprev = cp->next; + ip_frag_cnum--; + + return cp; +} static __inline__ void __ipq_unlink(struct ipq *qp) { + qp->cp = __ipc_put(qp->cp); + if(qp->next) qp->next->pprev = qp->pprev; *qp->pprev = qp->next; @@ -178,24 +221,44 @@ static __inline__ void frag_kfree_skb(st kfree_skb(skb); } -static __inline__ void frag_free_queue(struct ipq *qp, int *work) +static inline void frag_free_mem(void *qp, int size, int *work) { if (work) - *work -= sizeof(struct ipq); - atomic_sub(sizeof(struct ipq), &ip_frag_mem); + *work -= size; + atomic_sub(size, &ip_frag_mem); kfree(qp); } -static __inline__ struct ipq *frag_alloc_queue(void) +static inline void *frag_alloc_mem(int size) { - struct ipq *qp = kmalloc(sizeof(struct ipq), GFP_ATOMIC); + void *qp = kmalloc(size, GFP_ATOMIC); if(!qp) return NULL; - atomic_add(sizeof(struct ipq), &ip_frag_mem); + atomic_add(size, &ip_frag_mem); return qp; } +static inline void frag_free_ipc(struct ipc *cp, int *work) +{ + frag_free_mem(cp, sizeof(*cp), work); +} + +static inline struct ipc *frag_alloc_ipc(void) +{ + return frag_alloc_mem(sizeof(struct ipc)); +} + +static inline void frag_free_queue(struct ipq *qp, int *work) +{ + frag_free_mem(qp, sizeof(*qp), work); +} + +static inline struct ipq *frag_alloc_queue(void) +{ + return frag_alloc_mem(sizeof(struct ipq)); +} + /* Destruction primitives. */ @@ -207,6 +270,9 @@ static void ip_frag_destroy(struct ipq * BUG_TRAP(qp->last_in&COMPLETE); BUG_TRAP(del_timer(&qp->timer) == 0); + if (qp->cp) + frag_free_ipc(qp->cp, work); + /* Release all fragment data. */ fp = qp->fragments; while (fp) { @@ -307,28 +373,60 @@ out: /* Creation primitives. 
*/ -static struct ipq *ip_frag_intern(unsigned int hash, struct ipq *qp_in) +static inline void *ip_frag_find(unsigned int hash, u32 user, u32 saddr, + u32 daddr, u16 id, u8 protocol) +{ + struct ipq *qp; + + for (qp = ipq_hash[hash]; qp; qp = qp->next) { + if (qp->id == id && + qp->saddr == saddr && + qp->daddr == daddr && + qp->protocol == protocol && + qp->user == user) + return qp; + } + + return NULL; +} + +static void *__ip_frag_insert(unsigned int hash, struct ipq *qp_in) { struct ipq *qp; - write_lock(&ipfrag_lock); #ifdef CONFIG_SMP /* With SMP race we have to recheck hash table, because * such entry could be created on other cpu, while we * promoted read lock to write lock. */ - for(qp = ipq_hash[hash]; qp; qp = qp->next) { - if(qp->id == qp_in->id && - qp->saddr == qp_in->saddr && - qp->daddr == qp_in->daddr && - qp->protocol == qp_in->protocol && - qp->user == qp_in->user) { - atomic_inc(&qp->refcnt); - write_unlock(&ipfrag_lock); - qp_in->last_in |= COMPLETE; - ipq_put(qp_in, NULL); - return qp; - } + qp = ip_frag_find(hash, qp_in->user, qp_in->saddr, qp_in->daddr, + qp_in->id, qp_in->protocol); + if (unlikely(qp)) + return qp; +#endif + qp = qp_in; + + if ((qp->next = ipq_hash[hash]) != NULL) + qp->next->pprev = &qp->next; + ipq_hash[hash] = qp; + qp->pprev = &ipq_hash[hash]; + return NULL; +} + +static struct ipq *ip_frag_intern(unsigned int hash, struct ipq *qp_in) +{ + struct ipq *qp; + + write_lock(&ipfrag_lock); + qp = __ip_frag_insert(hash, qp_in); +#ifdef CONFIG_SMP + if (unlikely(qp)) { + atomic_inc(&qp->refcnt); + qp_in->cp = __ipc_put(qp_in->cp); + write_unlock(&ipfrag_lock); + qp_in->last_in |= COMPLETE; + ipq_put(qp_in, NULL); + return qp; } #endif qp = qp_in; @@ -337,10 +435,6 @@ static struct ipq *ip_frag_intern(unsign atomic_inc(&qp->refcnt); atomic_inc(&qp->refcnt); - if((qp->next = ipq_hash[hash]) != NULL) - qp->next->pprev = &qp->next; - ipq_hash[hash] = qp; - qp->pprev = &ipq_hash[hash]; INIT_LIST_HEAD(&qp->lru_list); 
list_add_tail(&qp->lru_list, &ipq_lru_list); ip_frag_nqueues++; @@ -348,8 +442,40 @@ static struct ipq *ip_frag_intern(unsign return qp; } +static struct ipc *ipc_insert(unsigned int hash, struct ipc *cp_in) +{ + struct ipc *cp; + + write_lock(&ipfrag_lock); + cp = __ip_frag_insert(hash, (struct ipq *)cp_in); +#ifdef CONFIG_SMP + if (unlikely(cp)) { + atomic_inc(&cp->refcnt); + write_unlock(&ipfrag_lock); + frag_free_ipc(cp_in, NULL); + return cp; + } +#endif + cp = cp_in; + + ip_frag_cnum++; + write_unlock(&ipfrag_lock); + return cp; +} + +static void ipc_put(struct ipc *cp) +{ + write_lock(&ipfrag_lock); + cp = __ipc_put(cp); + write_unlock(&ipfrag_lock); + + if (cp) + frag_free_ipc(cp, NULL); +} + /* Add an entry to the 'ipq' queue for a newly received IP datagram. */ -static struct ipq *ip_frag_create(unsigned hash, struct iphdr *iph, u32 user) +static struct ipq *ip_frag_create(unsigned hash, struct iphdr *iph, u32 user, + struct ipc *cp) { struct ipq *qp; @@ -366,6 +492,9 @@ static struct ipq *ip_frag_create(unsign qp->meat = 0; qp->fragments = NULL; qp->iif = 0; + if (cp) + qp->genid = atomic_inc_return(&cp->genid); + qp->cp = cp; /* Initialize a timer for this entry. 
*/ init_timer(&qp->timer); @@ -377,37 +506,115 @@ static struct ipq *ip_frag_create(unsign return ip_frag_intern(hash, qp); out_nomem: + ipc_put(cp); NETDEBUG(if (net_ratelimit()) printk(KERN_ERR "ip_frag_create: no memory left !\n")); return NULL; } +static struct ipq *ip_frag_create_ipc(unsigned hash, struct iphdr *iph, + u32 user, unsigned hashcp) +{ + struct ipc *cp; + + if (!(cp = frag_alloc_ipc())) + goto out_nomem; + + cp->user = IP_DEFRAG_INTERNAL; + cp->saddr = iph->saddr; + cp->daddr = 0; + cp->id = 0; + cp->protocol = 0; + atomic_set(&cp->genid, 0); + atomic_set(&cp->refcnt, 1); + + cp = ipc_insert(hashcp, cp); + + return ip_frag_create(hash, iph, user, cp); + +out_nomem: + NETDEBUG(if (net_ratelimit()) printk(KERN_ERR "ip_frag_create_ipc: no memory left !\n")); + return NULL; +} + +static inline int ip_frag_count_exceeded(struct ipq *qp) +{ + struct ipc *cp = qp->cp; + unsigned int max = sysctl_ipfrag_max_dist; + unsigned int start, end; + + if (!cp || !max) + return 0; + + start = qp->genid; + end = atomic_inc_return(&cp->genid); + + /* This takes care of a long train of 8-byte fragments. */ + qp->genid = end; + + return (unsigned int)(end - start) >= max; +} + +static inline struct ipq *ip_frag_replace(unsigned hash, struct iphdr *iph, + u32 user, struct ipc *cp, + struct ipq *qp) +{ + spin_lock(&qp->lock); + if (!(qp->last_in & COMPLETE)) + ipq_kill(qp); + spin_unlock(&qp->lock); + + ipq_put(qp, NULL); + + /* + * In this case we are incrementing cp->genid twice. + * This makes sense as it is very likely that a + * packet with the same parameters was transmitted + * and lost. + */ + return ip_frag_create(hash, iph, user, cp); +} + /* Find the correct entry in the "incomplete datagrams" queue for * this IP datagram, and create new one, if nothing is found. 
*/ -static inline struct ipq *ip_find(struct iphdr *iph, u32 user) +static struct ipq *ip_find(struct sk_buff *skb, u32 user) { + struct iphdr *iph = skb->nh.iph; __u16 id = iph->id; __u32 saddr = iph->saddr; __u32 daddr = iph->daddr; __u8 protocol = iph->protocol; unsigned int hash = ipqhashfn(id, saddr, daddr, protocol); + unsigned int hashcp = 0; struct ipq *qp; + struct ipc *cp = NULL; read_lock(&ipfrag_lock); - for(qp = ipq_hash[hash]; qp; qp = qp->next) { - if(qp->id == id && - qp->saddr == saddr && - qp->daddr == daddr && - qp->protocol == protocol && - qp->user == user) { - atomic_inc(&qp->refcnt); - read_unlock(&ipfrag_lock); - return qp; + qp = ip_frag_find(hash, user, saddr, daddr, id, protocol); + if (qp) { + atomic_inc(&qp->refcnt); + if (unlikely(ip_frag_count_exceeded(qp))) { + cp = qp->cp; + atomic_inc(&cp->refcnt); + } + read_unlock(&ipfrag_lock); + return unlikely(cp) ? + ip_frag_replace(hash, iph, user, cp, qp) : qp; + } + + if (sysctl_ipfrag_max_dist && + !(IPCB(skb)->flags & IPSKB_FRAG_COMPLETE)) { + hashcp = ipqhashfn(0, saddr, 0, 0); + cp = ip_frag_find(hashcp, IP_DEFRAG_INTERNAL, saddr, 0, 0, 0); + if (cp) { + hashcp = 0; + atomic_inc(&cp->refcnt); } } read_unlock(&ipfrag_lock); - return ip_frag_create(hash, iph, user); + return hashcp ? ip_frag_create_ipc(hash, iph, user, hashcp) : + ip_frag_create(hash, iph, user, cp); } /* Add new segment to existing queue. */ @@ -643,7 +850,6 @@ out_fail: /* Process an incoming IP datagram fragment. 
*/ struct sk_buff *ip_defrag(struct sk_buff *skb, u32 user) { - struct iphdr *iph = skb->nh.iph; struct ipq *qp; struct net_device *dev; @@ -656,7 +862,7 @@ struct sk_buff *ip_defrag(struct sk_buff dev = skb->dev; /* Lookup (or create) queue header */ - if ((qp = ip_find(iph, user)) != NULL) { + if ((qp = ip_find(skb, user)) != NULL) { struct sk_buff *ret = NULL; spin_lock(&qp->lock); diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -447,6 +447,7 @@ int ip_fragment(struct sk_buff *skb, int hlen = iph->ihl * 4; mtu = dst_mtu(&rt->u.dst) - hlen; /* Size of data space */ + IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE; /* When frag_list is given, use it. First, check its validity: * some transformers could create wrong frag_list or break existing diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -69,8 +69,8 @@ static int sockstat_seq_show(struct seq_ atomic_read(&tcp_memory_allocated)); seq_printf(seq, "UDP: inuse %d\n", fold_prot_inuse(&udp_prot)); seq_printf(seq, "RAW: inuse %d\n", fold_prot_inuse(&raw_prot)); - seq_printf(seq, "FRAG: inuse %d memory %d\n", ip_frag_nqueues, - atomic_read(&ip_frag_mem)); + seq_printf(seq, "FRAG: inuse %d ipc %d memory %d\n", + ip_frag_nqueues, ip_frag_cnum, atomic_read(&ip_frag_mem)); return 0; } diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -30,6 +30,7 @@ extern int sysctl_ipfrag_low_thresh; extern int sysctl_ipfrag_high_thresh; extern int sysctl_ipfrag_time; extern int sysctl_ipfrag_secret_interval; +extern int sysctl_ipfrag_max_dist; /* From ip_output.c */ extern int sysctl_ip_dynaddr; @@ -643,6 +644,14 @@ ctl_table ipv4_table[] = { .strategy = &sysctl_jiffies }, { + .ctl_name = NET_IPV4_IPFRAG_MAX_DIST, + .procname = "ipfrag_max_dist", + .data = &sysctl_ipfrag_max_dist, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = 
&proc_dointvec, + }, + { .ctl_name = NET_TCP_NO_METRICS_SAVE, .procname = "tcp_no_metrics_save", .data = &sysctl_tcp_nometrics_save,