ICMP packets are inspected to let them route together with the flow they
belong to, allowing anycast environments to work with ECMP.

Signed-off-by: Peter Nørlund <p...@ordbogen.com>
---
 net/ipv4/icmp.c  | 27 ++++++++++++++++++-
 net/ipv4/route.c | 80 ++++++++++++++++++++++++++++++++++++++++++++++----------
 2 files changed, 92 insertions(+), 15 deletions(-)

diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 3abcfea..20f1d5e 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -447,6 +447,7 @@ static struct rtable *icmp_route_lookup(struct net *net,
 {
        struct rtable *rt, *rt2;
        struct flowi4 fl4_dec;
+       struct flowi4 mp_flow;
        int err;
 
        memset(fl4, 0, sizeof(*fl4));
@@ -459,7 +460,31 @@ static struct rtable *icmp_route_lookup(struct net *net,
        fl4->fl4_icmp_type = type;
        fl4->fl4_icmp_code = code;
        security_skb_classify_flow(skb_in, flowi4_to_flowi(fl4));
-       rt = __ip_route_output_key(net, fl4, NULL);
+
+       /* Source and destination are swapped. See ip_multipath_flow */
+       mp_flow.saddr = iph->daddr;
+       mp_flow.daddr = iph->saddr;
+       mp_flow.flowi4_proto = iph->protocol;
+       mp_flow.fl4_sport = 0;
+       mp_flow.fl4_dport = 0;
+       if (!ip_is_fragment(iph)) {
+               if (iph->protocol == IPPROTO_TCP ||
+                   iph->protocol == IPPROTO_UDP ||
+                   iph->protocol == IPPROTO_SCTP) {
+                       __be16 _ports[2];
+                       const __be16 *ports;
+
+                       ports = skb_header_pointer(skb_in, iph->ihl * 4,
+                                                  sizeof(_ports),
+                                                  &_ports);
+                       if (ports) {
+                               mp_flow.fl4_sport = ports[1];
+                               mp_flow.fl4_dport = ports[0];
+                       }
+               }
+       }
+
+       rt = __ip_route_output_key(net, fl4, &mp_flow);
        if (IS_ERR(rt))
                return rt;
 
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index a1ec62c..bab4318 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1635,31 +1635,83 @@ out:
 /* Fill flow key data based on packet for use in multipath routing. */
 static void ip_multipath_flow(const struct sk_buff *skb, struct flowi4 *flow)
 {
-       const struct iphdr *iph;
-
-       iph = ip_hdr(skb);
-
-       flow->saddr = iph->saddr;
-       flow->daddr = iph->daddr;
-       flow->flowi4_proto = iph->protocol;
+       struct icmphdr _icmph;
+       struct iphdr _inner_iph;
+       const struct iphdr *outer_iph;
+       const struct icmphdr *icmph;
+       const struct iphdr *inner_iph;
+       unsigned int offset;
+       __be16 _ports[2];
+       const __be16 *ports;
+
+       outer_iph = ip_hdr(skb);
+
+       flow->saddr = outer_iph->saddr;
+       flow->daddr = outer_iph->daddr;
+       flow->flowi4_proto = outer_iph->protocol;
         flow->fl4_sport = 0;
         flow->fl4_dport = 0;
 
-       if (unlikely(ip_is_fragment(iph)))
+       if (unlikely(ip_is_fragment(outer_iph)))
                 return;
 
-       if (iph->protocol == IPPROTO_TCP ||
-           iph->protocol == IPPROTO_UDP ||
-           iph->protocol == IPPROTO_SCTP) {
-               __be16 _ports;
-               const __be16 *ports;
+       offset = outer_iph->ihl * 4;
 
-               ports = skb_header_pointer(skb, iph->ihl * 4, sizeof(_ports),
+       if (outer_iph->protocol == IPPROTO_TCP ||
+           outer_iph->protocol == IPPROTO_UDP ||
+           outer_iph->protocol == IPPROTO_SCTP) {
+               ports = skb_header_pointer(skb, offset, sizeof(_ports),
                                            &_ports);
                 if (ports) {
                         flow->fl4_sport = ports[0];
                         flow->fl4_dport = ports[1];
                 }
+
+               return;
+       }
+
+       if (outer_iph->protocol != IPPROTO_ICMP)
+               return;
+
+       icmph = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph);
+       if (!icmph)
+               return;
+
+       if (icmph->type != ICMP_DEST_UNREACH &&
+           icmph->type != ICMP_SOURCE_QUENCH &&
+           icmph->type != ICMP_REDIRECT &&
+           icmph->type != ICMP_TIME_EXCEEDED &&
+           icmph->type != ICMP_PARAMETERPROB) {
+               return;
+       }
+
+       offset += sizeof(_icmph);
+       inner_iph = skb_header_pointer(skb, offset, sizeof(_inner_iph),
+                                      &_inner_iph);
+       if (!inner_iph)
+               return;
+
+       /* Since the ICMP payload contains a packet sent from the current
+        * recipient, we swap source and destination addresses and ports
+        */
+       flow->saddr = inner_iph->daddr;
+       flow->daddr = inner_iph->saddr;
+       flow->flowi4_proto = inner_iph->protocol;
+
+       if (unlikely(ip_is_fragment(inner_iph)))
+               return;
+
+       if (inner_iph->protocol != IPPROTO_TCP &&
+           inner_iph->protocol != IPPROTO_UDP &&
+           inner_iph->protocol != IPPROTO_SCTP) {
+               return;
+       }
+
+       offset += inner_iph->ihl * 4;
+       ports = skb_header_pointer(skb, offset, sizeof(_ports), &_ports);
+       if (ports) {
+               flow->fl4_sport = ports[1];
+               flow->fl4_dport = ports[0];
         }
 }
 #endif /* CONFIG_IP_ROUTE_MULTIPATH */
-- 
2.1.4

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Reply via email to