Recently we ran into problems with Path MTU Discovery.

An Internet host sent an ICMP 3/4 "Fragmentation needed and DF bit set"
message, but the IP header of the original packet included in it had a
different TOS field than the one we sent. As far as I know this is
permitted, and some ISPs do it to implement their own QoS policies.

Browsing the Web, I found an interesting discussion, "PMTU issues due
to TOS field manipulation (for DSCP)", at
http://oss.sgi.com/projects/netdev/archive/2003-12/msg00180.html, which
continues at http://oss.sgi.com/projects/netdev/archive/2004-03/msg00040.html.

The symptoms are the same even with recent kernels (I tried 2.6.16-rc3):
the kernel still sends packets larger than the MTU recommended by the
ICMP Fragmentation Needed messages from an intermediate host.

I adapted the patch from Julian Anastasov (thanks, Julian!). However,
the attached version does not include the part that ignores TOS in
routing decisions; in other words, it is more conservative. I thought
that ICMP redirect handling could (and would) suffer from the same
problem, so I kept the corresponding part of the original patch,
although it has not been tested.

Although the patch was intended for 2.6.15.4, it also applies to and
works on 2.6.16-rc3.

Signed-off-by: Ilia Sotnikov <[EMAIL PROTECTED]>

--
 Ilia Sotnikov

--- linux-2.6.15.4.orig/Documentation/filesystems/proc.txt      2006-02-13 20:05:37.000000000 +0200
+++ linux-2.6.15.4/Documentation/filesystems/proc.txt   2006-02-14 11:12:38.000000000 +0200
@@ -1729,6 +1729,27 @@
 algorithm for the routing cache. gc_min_interval is deprecated and replaced
 by gc_min_interval_ms.

+match_tos_for_pmtud
+-----------------------
+
+Boolean: 0 (default) - ignore TOS, 1 - match TOS
+Flag to match the TOS value embedded in ICMP errors during the PMTUD process.
+By  default, the  PMTU value  is propagated to  all routing  cache entries, no
+matter the received TOS value. This mode is valid only for ignore_tos=0 and is
+used  when the TOS is  manipulated after routing time  or from other hosts for
+other  purposes,  eg.  QoS.  If  the  flag is  set  to  1  the  propagation is
+restricted to cache entries with the same TOS value.
+
+match_tos_for_redirects
+-----------------------
+
+Boolean: 0 (default) - ignore TOS, 1 - match TOS
+Flag to match the TOS value embedded in ICMP redirects.
+By  default, the new gateway value is propagated to all routing cache entries,
+no  matter the received TOS  value.  This mode is  valid only for ignore_tos=0
+and is used when the TOS is manipulated after routing time or from other hosts
+for  other purposes,  eg. QoS.  If the  flag is  set to  1 the  propagation is
+restricted to cache entries with the same TOS value.

 max_size
 --------
--- linux-2.6.15.4.orig/include/linux/sysctl.h  2006-02-13 20:06:52.000000000 +0200
+++ linux-2.6.15.4/include/linux/sysctl.h       2006-02-14 11:14:00.000000000 +0200
@@ -412,6 +412,8 @@
        NET_IPV4_ROUTE_MIN_ADVMSS=17,
        NET_IPV4_ROUTE_SECRET_INTERVAL=18,
        NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS=19,
+       NET_IPV4_ROUTE_MATCH_TOS_FOR_PMTUD=20,
+       NET_IPV4_ROUTE_MATCH_TOS_FOR_REDIRECTS=21,
 };

 enum
--- linux-2.6.15.4.orig/net/ipv4/route.c        2006-01-15 08:16:02.000000000 +0200
+++ linux-2.6.15.4/net/ipv4/route.c     2006-02-14 11:35:06.000000000 +0200
@@ -107,6 +107,12 @@
 #include <linux/sysctl.h>
 #endif

+int ip_rt_match_tos_for_pmtud;         /* 1=match by iph->tos, 0=ignore TOS */
+int ip_rt_match_tos_for_redirects;     /* 1=match by iph->tos, 0=ignore TOS */
+
+/* See IPTOS_RT_MASK */
+static u8 all_tos_values[8] = { 0x00, 0x04, 0x08, 0x0C, 0x10, 0x14, 0x18, 0x1C };
+
 #define RT_FL_TOS(oldflp) \
     ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))

@@ -1113,11 +1119,12 @@
 void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw,
                    u32 saddr, u8 tos, struct net_device *dev)
 {
-       int i, k;
+       int i, j, k, ntos;
        struct in_device *in_dev = in_dev_get(dev);
        struct rtable *rth, **rthp;
        u32  skeys[2] = { saddr, 0 };
        int  ikeys[2] = { dev->ifindex, 0 };
+       u8 *tos_values;

        tos &= IPTOS_RT_MASK;

@@ -1138,11 +1145,20 @@
                        goto reject_redirect;
        }

+       if (ip_rt_match_tos_for_redirects) {
+               tos_values = &tos;
+               ntos = 1;
+       } else {
+               tos_values = all_tos_values;
+               ntos = ARRAY_SIZE(all_tos_values);
+       }
+
+       for (j = 0; j < ntos; j++)
        for (i = 0; i < 2; i++) {
                for (k = 0; k < 2; k++) {
                        unsigned hash = rt_hash_code(daddr,
                                                     skeys[i] ^ (ikeys[k] << 5),
-                                                    tos);
+                                                    tos_values[j]);

                        rthp=&rt_hash_table[hash].chain;

@@ -1152,7 +1168,7 @@

                                if (rth->fl.fl4_dst != daddr ||
                                    rth->fl.fl4_src != skeys[i] ||
-                                   rth->fl.fl4_tos != tos ||
+                                   rth->fl.fl4_tos != tos_values[j] ||
                                    rth->fl.oif != ikeys[k] ||
                                    rth->fl.iif != 0) {
                                        rthp = &rth->u.rt_next;
@@ -1386,19 +1402,29 @@

 unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu)
 {
-       int i;
+       int i, j, ntos;
        unsigned short old_mtu = ntohs(iph->tot_len);
        struct rtable *rth;
        u32  skeys[2] = { iph->saddr, 0, };
        u32  daddr = iph->daddr;
        u8   tos = iph->tos & IPTOS_RT_MASK;
        unsigned short est_mtu = 0;
+       u8   *tos_values;

        if (ipv4_config.no_pmtu_disc)
                return 0;

+       if (ip_rt_match_tos_for_pmtud) {
+               tos_values = &tos;
+               ntos = 1;
+       } else {
+               tos_values = all_tos_values;
+               ntos = ARRAY_SIZE(all_tos_values);
+       }
+
+       for (j = 0; j < ntos; j++)
        for (i = 0; i < 2; i++) {
-               unsigned hash = rt_hash_code(daddr, skeys[i], tos);
+               unsigned hash = rt_hash_code(daddr, skeys[i], tos_values[j]);

                rcu_read_lock();
                for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
@@ -1407,7 +1433,7 @@
                            rth->fl.fl4_src == skeys[i] &&
                            rth->rt_dst  == daddr &&
                            rth->rt_src  == iph->saddr &&
-                           rth->fl.fl4_tos == tos &&
+                           rth->fl.fl4_tos == tos_values[j] &&
                            rth->fl.iif == 0 &&
                            !(dst_metric_locked(&rth->u.dst, RTAX_MTU))) {
                                unsigned short mtu = new_mtu;
@@ -3050,6 +3076,22 @@
                .proc_handler   = &proc_dointvec_jiffies,
                .strategy       = &sysctl_jiffies,
        },
+       {
+               .ctl_name       = NET_IPV4_ROUTE_MATCH_TOS_FOR_PMTUD,
+               .procname       = "match_tos_for_pmtud",
+               .data           = &ip_rt_match_tos_for_pmtud,
+               .maxlen         = sizeof(int),
+               .mode           = 0644,
+               .proc_handler   = &proc_dointvec,
+       },
+       {
+               .ctl_name       = NET_IPV4_ROUTE_MATCH_TOS_FOR_REDIRECTS,
+               .procname       = "match_tos_for_redirects",
+               .data           = &ip_rt_match_tos_for_redirects,
+               .maxlen         = sizeof(int),
+               .mode           = 0644,
+               .proc_handler   = &proc_dointvec,
+       },
        { .ctl_name = 0 }
 };
 #endif
-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Reply via email to