On Sun, 31 Jul 2005, Herbert Xu wrote:
>
> Anyway, here is a patch to use inetpeer instead of that icky ipc
> structure.  It sure cuts down the size of the patch :)
>
> ....

Herbert,

I've been using the updated patch, and I like it. struct inet_peer is the
right place to do this accounting. I made a few bug fixes and have tested
it, tried to break it, etc. It seems to do the trick.

The latest iteration is attached. Please have a look.

Regards.

--
Arthur

diff -pur linux.orig/include/linux/sysctl.h linux.new/include/linux/sysctl.h --- linux.orig/include/linux/sysctl.h 2005-08-03 11:43:40.923892254 -0700 +++ linux.new/include/linux/sysctl.h 2005-08-04 16:58:17.901171101 -0700 @@ -352,6 +352,7 @@ enum NET_TCP_BIC_BETA=108, NET_IPV4_ICMP_ERRORS_USE_INBOUND_IFADDR=109, NET_TCP_CONG_CONTROL=110, + NET_IPV4_IPFRAG_MAX_DIST=111, }; enum { diff -pur linux.orig/include/net/inetpeer.h linux.new/include/net/inetpeer.h --- linux.orig/include/net/inetpeer.h 2005-08-03 11:44:01.778605212 -0700 +++ linux.new/include/net/inetpeer.h 2005-08-04 16:58:38.694360121 -0700 @@ -25,6 +25,7 @@ struct inet_peer __u32 v4daddr; /* peer's address */ __u16 avl_height; __u16 ip_id_count; /* IP ID for the next packet */ + atomic_t rid; /* Frag reception counter */ __u32 tcp_ts; unsigned long tcp_ts_stamp; }; diff -pur linux.orig/include/net/ip.h linux.new/include/net/ip.h --- linux.orig/include/net/ip.h 2005-08-03 11:44:08.654654565 -0700 +++ linux.new/include/net/ip.h 2005-08-04 16:58:50.460109760 -0700 @@ -45,6 +45,7 @@ struct inet_skb_parm #define IPSKB_TRANSLATED 2 #define IPSKB_FORWARDED 4 #define IPSKB_XFRM_TUNNEL_SIZE 8 +#define IPSKB_FRAG_COMPLETE 16 }; struct ipcm_cookie diff -pur linux.orig/net/ipv4/inetpeer.c linux.new/net/ipv4/inetpeer.c --- linux.orig/net/ipv4/inetpeer.c 2005-08-03 11:44:40.086627938 -0700 +++ linux.new/net/ipv4/inetpeer.c 2005-08-04 16:59:20.251440976 -0700 @@ -401,6 +401,7 @@ struct inet_peer *inet_getpeer(__u32 dad return NULL; n->v4daddr = daddr; atomic_set(&n->refcnt, 1); + atomic_set(&n->rid, 0); n->ip_id_count = secure_ip_id(daddr); n->tcp_ts_stamp = 0; diff -pur linux.orig/net/ipv4/ip_fragment.c linux.new/net/ipv4/ip_fragment.c --- linux.orig/net/ipv4/ip_fragment.c 2005-08-03 11:44:48.086712630 -0700 +++ linux.new/net/ipv4/ip_fragment.c 2005-08-04 17:03:02.162971536 -0700 @@ -22,6 +22,7 @@ * Patrick McHardy : LRU queue of frag heads for evictor. 
*/ +#include <linux/compiler.h> #include <linux/config.h> #include <linux/module.h> #include <linux/types.h> @@ -38,6 +39,7 @@ #include <net/ip.h> #include <net/icmp.h> #include <net/checksum.h> +#include <net/inetpeer.h> #include <linux/tcp.h> #include <linux/udp.h> #include <linux/inet.h> @@ -56,6 +58,8 @@ int sysctl_ipfrag_high_thresh = 256*1024; int sysctl_ipfrag_low_thresh = 192*1024; +int sysctl_ipfrag_max_dist = 64; + /* Important NOTE! Fragment queue must be destroyed before MSL expires. * RFC791 is wrong proposing to prolongate timer each fragment arrival by TTL. */ @@ -90,8 +94,11 @@ struct ipq { atomic_t refcnt; struct timer_list timer; /* when will this queue expire? */ struct ipq **pprev; - int iif; struct timeval stamp; + int iif; + + unsigned int rid; + struct inet_peer *peer; }; /* Hash table. */ @@ -207,6 +214,9 @@ static void ip_frag_destroy(struct ipq * BUG_TRAP(qp->last_in&COMPLETE); BUG_TRAP(del_timer(&qp->timer) == 0); + if (qp->peer) + inet_putpeer(qp->peer); + /* Release all fragment data. */ fp = qp->fragments; while (fp) { @@ -366,6 +376,9 @@ static struct ipq *ip_frag_create(unsign qp->meat = 0; qp->fragments = NULL; qp->iif = 0; + qp->peer = sysctl_ipfrag_max_dist ? inet_getpeer(iph->saddr, 1) : NULL; + if (qp->peer) + qp->rid = atomic_read(&qp->peer->rid); /* Initialize a timer for this entry. */ init_timer(&qp->timer); @@ -410,6 +423,63 @@ static inline struct ipq *ip_find(struct return ip_frag_create(hash, iph, user); } +/* Is the fragment too far ahead to be part of ipq? 
*/ +static inline int ip_frag_too_far(struct ipq *qp) +{ + struct inet_peer *peer = qp->peer; + unsigned int max = sysctl_ipfrag_max_dist; + unsigned int start, end; + + int rc; + + if (!peer || !max) + return 0; + + start = ++qp->rid; + end = atomic_inc_return(&peer->rid); + + rc = qp->fragments && (end - start) >= max; + + if (rc) { + IP_INC_STATS_BH(IPSTATS_MIB_REASMFAILS); + } + + return rc; +} + +static int ip_frag_reinit(struct ipq *qp) +{ + struct sk_buff *fp; + + if (!mod_timer(&qp->timer, jiffies + sysctl_ipfrag_time)) { + atomic_inc(&qp->refcnt); + return -ETIMEDOUT; + } + + fp = qp->fragments; + do { + struct sk_buff *xp = fp->next; + frag_kfree_skb(fp, NULL); + fp = xp; + } while (fp); + + qp->last_in = 0; + qp->len = 0; + qp->meat = 0; + qp->fragments = NULL; + qp->iif = 0; + if (sysctl_ipfrag_max_dist) { + if (qp->peer == NULL) { + qp->peer = inet_getpeer(qp->saddr, 1); + } + if (qp->peer) { + qp->rid = atomic_read(&qp->peer->rid); + } + } + + return 0; +} + /* Add new segment to existing queue. */ static void ip_frag_queue(struct ipq *qp, struct sk_buff *skb) { @@ -420,6 +490,12 @@ static void ip_frag_queue(struct ipq *qp if (qp->last_in & COMPLETE) goto err; + if (!(IPCB(skb)->flags & IPSKB_FRAG_COMPLETE) && + unlikely(ip_frag_too_far(qp)) && unlikely(ip_frag_reinit(qp))) { + ipq_kill(qp); + goto err; + } + offset = ntohs(skb->nh.iph->frag_off); flags = offset & ~IP_OFFSET; offset &= IP_OFFSET; diff -pur linux.orig/net/ipv4/ip_output.c linux.new/net/ipv4/ip_output.c --- linux.orig/net/ipv4/ip_output.c 2005-08-03 11:44:53.139500496 -0700 +++ linux.new/net/ipv4/ip_output.c 2005-08-04 16:59:52.609205635 -0700 @@ -447,6 +447,7 @@ int ip_fragment(struct sk_buff *skb, int hlen = iph->ihl * 4; mtu = dst_mtu(&rt->u.dst) - hlen; /* Size of data space */ + IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE; /* When frag_list is given, use it. 
First, check its validity: * some transformers could create wrong frag_list or break existing diff -pur linux.orig/net/ipv4/sysctl_net_ipv4.c linux.new/net/ipv4/sysctl_net_ipv4.c --- linux.orig/net/ipv4/sysctl_net_ipv4.c 2005-08-03 11:45:01.530214323 -0700 +++ linux.new/net/ipv4/sysctl_net_ipv4.c 2005-08-04 17:00:04.807577047 -0700 @@ -30,6 +30,7 @@ extern int sysctl_ipfrag_low_thresh; extern int sysctl_ipfrag_high_thresh; extern int sysctl_ipfrag_time; extern int sysctl_ipfrag_secret_interval; +extern int sysctl_ipfrag_max_dist; /* From ip_output.c */ extern int sysctl_ip_dynaddr; @@ -50,6 +51,7 @@ extern int inet_peer_gc_mintime; extern int inet_peer_gc_maxtime; #ifdef CONFIG_SYSCTL +static int zero; static int tcp_retr1_max = 255; static int ip_local_port_range_min[] = { 1, 1 }; static int ip_local_port_range_max[] = { 65535, 65535 }; @@ -643,6 +645,15 @@ ctl_table ipv4_table[] = { .strategy = &sysctl_jiffies }, { + .ctl_name = NET_IPV4_IPFRAG_MAX_DIST, + .procname = "ipfrag_max_dist", + .data = &sysctl_ipfrag_max_dist, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .extra1 = &zero + }, + { .ctl_name = NET_TCP_NO_METRICS_SAVE, .procname = "tcp_no_metrics_save", .data = &sysctl_tcp_nometrics_save,