Recently we ran into problems with Path MTU Discovery. An Internet host sent an ICMP 3/4 "Fragmentation needed and DF bit set" message, but the included IP header of the original packet had a different TOS field than the one we sent. As far as I know this is permitted, and some ISPs do it to implement their own QoS policies.
Browsing the Web, I found an interesting discussion, "PMTU issues due to TOS field manipulation (for DSCP)", at http://oss.sgi.com/projects/netdev/archive/2003-12/msg00180.html which continues at http://oss.sgi.com/projects/netdev/archive/2004-03/msg00040.html. The symptoms are the same even with recent kernels (I tried 2.6.16-rc3): the kernel still sends packets larger than the MTU recommended by ICMP Fragmentation Needed messages from an intermediate host. I adapted the patch from Julian Anastasov (thanks, Julian!); however, the attached version does not ignore TOS for routing decisions. In other words, it is more conservative. I thought that ICMP redirect handling could (and would) suffer from the same problem, so I kept the corresponding part of the original patch, although it has not been tested. Although the patch was written for 2.6.15.4, it also applies to and works on 2.6.16-rc3. Signed-off-by: Ilia Sotnikov <[EMAIL PROTECTED]> -- Ilia Sotnikov --- linux-2.6.15.4.orig/Documentation/filesystems/proc.txt 2006-02-13 20:05:37.000000000 +0200 +++ linux-2.6.15.4/Documentation/filesystems/proc.txt 2006-02-14 11:12:38.000000000 +0200 @@ -1729,6 +1729,27 @@ algorithm for the routing cache. gc_min_interval is deprecated and replaced by gc_min_interval_ms. +match_tos_for_pmtud +----------------------- + +Boolean: 0 (default) - ignore TOS, 1 - match TOS +Flag to match the TOS value embedded in ICMP errors during the PMTUD process. +By default, the PMTU value is propagated to all routing cache entries, no +matter the received TOS value. This mode is valid only for ignore_tos=0 and is +used when the TOS is manipulated after routing time or from other hosts for +other purposes, eg. QoS. If the flag is set to 1 the propagation is +restricted to cache entries with the same TOS value. + +match_tos_for_redirects +----------------------- + +Boolean: 0 (default) - ignore TOS, 1 - match TOS +Flag to match the TOS value embedded in ICMP redirects. 
+By default, the new gateway value is propagated to all routing cache entries, +no matter the received TOS value. This mode is valid only for ignore_tos=0 +and is used when the TOS is manipulated after routing time or from other hosts +for other purposes, eg. QoS. If the flag is set to 1 the propagation is +restricted to cache entries with the same TOS value. max_size -------- --- linux-2.6.15.4.orig/include/linux/sysctl.h 2006-02-13 20:06:52.000000000 +0200 +++ linux-2.6.15.4/include/linux/sysctl.h 2006-02-14 11:14:00.000000000 +0200 @@ -412,6 +412,8 @@ NET_IPV4_ROUTE_MIN_ADVMSS=17, NET_IPV4_ROUTE_SECRET_INTERVAL=18, NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS=19, + NET_IPV4_ROUTE_MATCH_TOS_FOR_PMTUD=20, + NET_IPV4_ROUTE_MATCH_TOS_FOR_REDIRECTS=21, }; enum --- linux-2.6.15.4.orig/net/ipv4/route.c 2006-01-15 08:16:02.000000000 +0200 +++ linux-2.6.15.4/net/ipv4/route.c 2006-02-14 11:35:06.000000000 +0200 @@ -107,6 +107,12 @@ #include <linux/sysctl.h> #endif +int ip_rt_match_tos_for_pmtud; /* 1=match by iph->tos, 0=ignore TOS */ +int ip_rt_match_tos_for_redirects; /* 1=match by iph->tos, 0=ignore TOS */ + +/* See IPTOS_RT_MASK */ +static u8 all_tos_values[8] = { 0x00, 0x04, 0x08, 0x0C, 0x10, 0x14, 0x18, 0x1C }; + #define RT_FL_TOS(oldflp) \ ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK))) @@ -1113,11 +1119,12 @@ void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw, u32 saddr, u8 tos, struct net_device *dev) { - int i, k; + int i, j, k, ntos; struct in_device *in_dev = in_dev_get(dev); struct rtable *rth, **rthp; u32 skeys[2] = { saddr, 0 }; int ikeys[2] = { dev->ifindex, 0 }; + u8 *tos_values; tos &= IPTOS_RT_MASK; @@ -1138,11 +1145,20 @@ goto reject_redirect; } + if (ip_rt_match_tos_for_redirects) { + tos_values = &tos; + ntos = 1; + } else { + tos_values = all_tos_values; + ntos = ARRAY_SIZE(all_tos_values); + } + + for (j = 0; j < ntos; j++) for (i = 0; i < 2; i++) { for (k = 0; k < 2; k++) { unsigned hash = rt_hash_code(daddr, skeys[i] ^ (ikeys[k] << 5), - tos); + 
tos_values[j]); rthp=&rt_hash_table[hash].chain; @@ -1152,7 +1168,7 @@ if (rth->fl.fl4_dst != daddr || rth->fl.fl4_src != skeys[i] || - rth->fl.fl4_tos != tos || + rth->fl.fl4_tos != tos_values[j] || rth->fl.oif != ikeys[k] || rth->fl.iif != 0) { rthp = &rth->u.rt_next; @@ -1386,19 +1402,29 @@ unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu) { - int i; + int i, j, ntos; unsigned short old_mtu = ntohs(iph->tot_len); struct rtable *rth; u32 skeys[2] = { iph->saddr, 0, }; u32 daddr = iph->daddr; u8 tos = iph->tos & IPTOS_RT_MASK; unsigned short est_mtu = 0; + u8 *tos_values; if (ipv4_config.no_pmtu_disc) return 0; + if (ip_rt_match_tos_for_pmtud) { + tos_values = &tos; + ntos = 1; + } else { + tos_values = all_tos_values; + ntos = ARRAY_SIZE(all_tos_values); + } + + for (j = 0; j < ntos; j++) for (i = 0; i < 2; i++) { - unsigned hash = rt_hash_code(daddr, skeys[i], tos); + unsigned hash = rt_hash_code(daddr, skeys[i], tos_values[j]); rcu_read_lock(); for (rth = rcu_dereference(rt_hash_table[hash].chain); rth; @@ -1407,7 +1433,7 @@ rth->fl.fl4_src == skeys[i] && rth->rt_dst == daddr && rth->rt_src == iph->saddr && - rth->fl.fl4_tos == tos && + rth->fl.fl4_tos == tos_values[j] && rth->fl.iif == 0 && !(dst_metric_locked(&rth->u.dst, RTAX_MTU))) { unsigned short mtu = new_mtu; @@ -3050,6 +3076,22 @@ .proc_handler = &proc_dointvec_jiffies, .strategy = &sysctl_jiffies, }, + { + .ctl_name = NET_IPV4_ROUTE_MATCH_TOS_FOR_PMTUD, + .procname = "match_tos_for_pmtud", + .data = &ip_rt_match_tos_for_pmtud, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_IPV4_ROUTE_MATCH_TOS_FOR_REDIRECTS, + .procname = "match_tos_for_redirects", + .data = &ip_rt_match_tos_for_redirects, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, { .ctl_name = 0 } }; #endif - To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to [EMAIL PROTECTED] More majordomo 
info at http://vger.kernel.org/majordomo-info.html