Allow clamping of the MSS of packets to be encapsulated by a tunnel.
This provides an alternative to using PMTU discovery.

Cc: Jesse Gross <je...@nicira.com>
Cc: Kyle Mestery <kmest...@cisco.com>
Signed-off-by: Simon Horman <ho...@verge.net.au>

---

v5
* Initial posting
---
 lib/flow.h             |   2 +-
 lib/packets.c          | 134 +++++++++++++++++++++++++++++++++++++++++++++++++
 lib/packets.h          |  20 ++++++++
 ofproto/ofproto-dpif.c |  26 ++++++++--
 ofproto/ofproto.h      |   2 +
 vswitchd/bridge.c      |  19 ++++++-
 vswitchd/vswitch.xml   |  23 +++++++++
 7 files changed, 219 insertions(+), 7 deletions(-)

diff --git a/lib/flow.h b/lib/flow.h
index ea808da..d5fab5f 100644
--- a/lib/flow.h
+++ b/lib/flow.h
@@ -86,7 +86,7 @@ struct flow {
     uint8_t arp_tha[6];         /* ARP/ND target hardware address. */
     uint8_t nw_ttl;             /* IP TTL/Hop Limit. */
     uint8_t nw_frag;            /* FLOW_FRAG_* flags. */
-    uint8_t reserved[2];        /* Reserved for 64-bit packing. */
+    ovs_be16 tun_mss;           /* MSS Clamp size */
 };
 
 /* Represents the metadata fields of struct flow.  The masks are used to
diff --git a/lib/packets.c b/lib/packets.c
index 631abf8..ffc976e 100644
--- a/lib/packets.c
+++ b/lib/packets.c
@@ -20,12 +20,14 @@
 #include <arpa/inet.h>
 #include <sys/socket.h>
 #include <netinet/in.h>
+#include <net/ethernet.h>
 #include <stdlib.h>
 #include "byte-order.h"
 #include "csum.h"
 #include "flow.h"
 #include "dynamic-string.h"
 #include "ofpbuf.h"
+#include "route-table.h"
 
 const struct in6_addr in6addr_exact = IN6ADDR_EXACT_INIT;
 
@@ -555,3 +557,135 @@ packet_format_tcp_flags(struct ds *s, uint8_t tcp_flags)
         ds_put_cstr(s, "[80]");
     }
 }
+
+/* Searches the TCP header that 'packet->l4' points to for an MSS option.
+ *
+ * Returns a pointer to the first byte (the kind byte) of the first option
+ * whose kind is TCP_OPT_MSS and whose length is TCP_OPT_MSS_LEN, or NULL if
+ * there is none.
+ *
+ * The scan covers only the option area announced by the header's data
+ * offset, further limited to the bytes actually present in 'packet'.  It
+ * stops at an End of Option List option and at any option whose length
+ * byte is too small to be valid. */
+static uint8_t *
+find_tcp_mss_opt(struct ofpbuf *packet)
+{
+    const uint8_t opt_eol = 0;      /* End of Option List kind (RFC 793). */
+    struct tcp_header *tcp = packet->l4;
+    uint8_t *start = (uint8_t *) tcp;
+    size_t avail = (uint8_t *) packet->data + packet->size - start;
+    size_t hdr_len = TCP_OFFSET(tcp->tcp_ctl) * 4;
+    size_t ofs = sizeof *tcp;
+
+    /* A bogus data offset must not make us read beyond the buffer. */
+    if (hdr_len > avail) {
+        hdr_len = avail;
+    }
+
+    /* Only look where a complete MSS option still fits in the header. */
+    while (ofs + TCP_OPT_MSS_LEN <= hdr_len) {
+        uint8_t kind = start[ofs];
+        uint8_t len;
+
+        if (kind == opt_eol) {
+            /* Everything after this byte is padding, not options. */
+            return NULL;
+        }
+        if (kind == TCP_OPT_NOP) {
+            /* Single-byte option without a length field. */
+            ofs++;
+            continue;
+        }
+
+        len = start[ofs + 1];
+        if (len < 2) {
+            /* The length counts the kind and length bytes themselves, so
+             * anything smaller is malformed; give up rather than guess. */
+            return NULL;
+        }
+        if (kind == TCP_OPT_MSS && len == TCP_OPT_MSS_LEN) {
+            return start + ofs;
+        }
+        ofs += len;
+    }
+
+    return NULL;
+}
+
+/* Lowers the value of the MSS option that starts at 'option' inside the TCP
+ * header at 'packet->l4' to 'mss' (network byte order), patching the TCP
+ * checksum to match.
+ *
+ * The option is only ever made smaller: if the value already present is
+ * less than or equal to 'mss' the packet is left alone.
+ *
+ * Returns true if the packet was modified, false otherwise. */
+static bool
+replace_tcp_mss_opt(struct ofpbuf *packet, uint8_t *option, ovs_be16 mss)
+{
+    /* The 16-bit value follows the kind and length bytes. */
+    uint8_t *value = option + 2;
+    struct tcp_header *th = packet->l4;
+    ovs_be16 current;
+
+    /* The option may be unaligned, so copy the value out rather than
+     * dereferencing a 16-bit pointer. */
+    memcpy(&current, value, sizeof current);
+
+    if (ntohs(mss) < ntohs(current)) {
+        memcpy(value, &mss, sizeof mss);
+        th->tcp_csum = recalc_csum16(th->tcp_csum, current, mss);
+        return true;
+    }
+
+    /* Existing MSS is already small enough; never raise it. */
+    return false;
+}
+
+/* Inserts an MSS option carrying 'mss' (network byte order) immediately
+ * after the fixed part of the TCP header at 'packet->l4', growing the
+ * packet by TCP_OPT_MSS_LEN bytes.  The TCP data offset, the TCP checksum
+ * and the IPv4 total length and header checksum are all updated to match.
+ *
+ * 'packet->l3' is treated as an IPv4 header ('struct ip_header'), so this
+ * function must not be used on IPv6 packets.
+ *
+ * The packet is left untouched when the option cannot be added without
+ * producing an invalid header: the data offset is below the minimum, the
+ * 4-bit data offset would overflow, or the 16-bit IP total length would
+ * overflow. */
+static void
+add_tcp_mss_opt(struct ofpbuf *packet, ovs_be16 mss)
+{
+    const unsigned int max_offset = 15;     /* Data offset is 4 bits wide. */
+    const unsigned int opt_words = TCP_OPT_MSS_LEN / 4;
+    struct ip_header *nh = packet->l3;
+    struct tcp_header *tcp = packet->l4;
+    unsigned int offset = TCP_OFFSET(tcp->tcp_ctl);
+    ovs_be16 old_ctl, old_tot_len;
+    ovs_be32 opt_word;
+    uint8_t *option;
+
+    if (offset < sizeof *tcp / 4
+        || offset + opt_words > max_offset
+        || ntohs(nh->ip_tot_len) > UINT16_MAX - TCP_OPT_MSS_LEN) {
+        return;
+    }
+
+    /* Growing the buffer may move its data, so pointers into the packet
+     * are only taken after the tailroom has been reserved.
+     * NOTE(review): this relies on ofpbuf keeping 'l3' and 'l4' valid
+     * across a reallocation -- confirm against lib/ofpbuf.c. */
+    ofpbuf_prealloc_tailroom(packet, TCP_OPT_MSS_LEN);
+    nh = packet->l3;
+    tcp = packet->l4;
+    option = (uint8_t *) (tcp + 1);
+
+    /* Open a gap right behind the fixed TCP header and write the option. */
+    memmove(option + TCP_OPT_MSS_LEN, option,
+            packet->size + (uint8_t *) packet->data - option);
+    packet->size += TCP_OPT_MSS_LEN;
+    option[0] = TCP_OPT_MSS;
+    option[1] = TCP_OPT_MSS_LEN;
+    memcpy(option + 2, &mss, sizeof mss);
+
+    /* The option need not be 32-bit aligned, so copy it out instead of
+     * reading it through an ovs_be32 pointer. */
+    memcpy(&opt_word, option, sizeof opt_word);
+    tcp->tcp_csum = recalc_csum32(tcp->tcp_csum, 0, opt_word);
+
+    /* The packet grew: fix the IP total length and its checksum.  The TCP
+     * checksum is adjusted by the same length change. */
+    old_tot_len = nh->ip_tot_len;
+    nh->ip_tot_len = htons(ntohs(nh->ip_tot_len) + TCP_OPT_MSS_LEN);
+    nh->ip_csum = recalc_csum16(nh->ip_csum, old_tot_len, nh->ip_tot_len);
+    tcp->tcp_csum = recalc_csum16(tcp->tcp_csum, old_tot_len, nh->ip_tot_len);
+
+    /* The TCP header is now one 32-bit word longer. */
+    old_ctl = tcp->tcp_ctl;
+    tcp->tcp_ctl = TCP_CTL(TCP_FLAGS(tcp->tcp_ctl), offset + opt_words);
+    tcp->tcp_csum = recalc_csum16(tcp->tcp_csum, old_ctl, tcp->tcp_ctl);
+}
+
+/* Clamps the MSS advertised by 'packet' if it is an empty TCP SYN that
+ * 'flow' says is about to be encapsulated in a tunnel.
+ *
+ * The clamp value comes from 'flow->tun_mss': MSS_NONE disables clamping,
+ * MSS_PMTU derives the value from the MTU reported by
+ * route_table_get_mtu() for the tunnel destination, and any other value is
+ * used as-is (network byte order).
+ *
+ * An existing MSS option is only ever lowered.  If the packet has no MSS
+ * option one is added, but only for IPv4 packets whose TCP header still
+ * has room for it, because add_tcp_mss_opt() rewrites the IPv4 header.
+ *
+ * 'packet' may be NULL, in which case nothing is done.
+ *
+ * Returns true if 'packet' was modified, false otherwise. */
+bool
+set_tun_tcp_mss(struct ofpbuf *packet, const struct flow *flow)
+{
+    struct tcp_header *tcp;
+    size_t l4_len, tcp_hdr_len;
+    ovs_be16 mss;
+    uint8_t *option;
+
+    /* Without packet data there is nothing to rewrite. */
+    if (!packet || !packet->l4) {
+        return false;
+    }
+
+    /* Only for packets to be encapsulated in a tunnel.
+     * Could be made generic by parameterising the mss */
+    if (!flow->tun_key.ipv4_dst) {
+        return false;
+    }
+
+    /* Nothing to do if MSS clamping is disabled */
+    if (flow->tun_mss == MSS_NONE) {
+        return false;
+    }
+
+    /* Skip non-TCP packets */
+    if ((flow->dl_type != htons(ETH_TYPE_IP) &&
+         flow->dl_type != htons(ETH_TYPE_IPV6)) ||
+        flow->nw_proto != IPPROTO_TCP) {
+        return false;
+    }
+
+    /* Skip packets too short to hold even a minimal TCP header */
+    l4_len = (uint8_t *) packet->data + packet->size
+             - (uint8_t *) packet->l4;
+    if (l4_len < sizeof *tcp) {
+        return false;
+    }
+    tcp = packet->l4;
+
+    /* Skip non-SYN packets */
+    if (!(TCP_FLAGS(tcp->tcp_ctl) & TCP_SYN)) {
+        return false;
+    }
+
+    /* Skip non-empty packets, and packets whose TCP header is malformed
+     * or truncated: the L4 data must be exactly one well-formed header. */
+    tcp_hdr_len = TCP_OFFSET(tcp->tcp_ctl) * 4;
+    if (tcp_hdr_len < sizeof *tcp || l4_len != tcp_hdr_len) {
+        return false;
+    }
+
+    /* Calculate desired MSS */
+    if (flow->tun_mss == MSS_PMTU) {
+        /* hlen = outer header length (eth + ip) +
+         *        tunnel header length +
+         *        inner header length (ip + tcp)
+         */
+        int hlen = ETH_HLEN + sizeof(struct ip_header) +
+                flow->tun_key.tun_hdr_len +
+                sizeof(struct ip_header) + sizeof(struct tcp_header);
+        int mtu = route_table_get_mtu(flow->tun_key.ipv4_dst);
+        int payload;
+
+        /* An MTU that leaves no room for payload would yield an MSS of
+         * zero (or a negative number); do not clamp to that. */
+        if (mtu <= hlen) {
+            return false;
+        }
+
+        /* Keep the result within what a 16-bit MSS can usefully carry. */
+        payload = mtu - hlen;
+        if (payload > MSS_MAX) {
+            payload = MSS_MAX;
+        }
+        mss = htons(payload);
+    } else {
+        mss = flow->tun_mss;
+    }
+
+    option = find_tcp_mss_opt(packet);
+    if (option) {
+        return replace_tcp_mss_opt(packet, option, mss);
+    }
+
+    /* Adding an option rewrites the IPv4 header and needs one spare word
+     * in the 4-bit TCP data offset (maximum 15 words). */
+    if (flow->dl_type != htons(ETH_TYPE_IP)
+        || TCP_OFFSET(tcp->tcp_ctl) + TCP_OPT_MSS_LEN / 4 > 15) {
+        return false;
+    }
+
+    add_tcp_mss_opt(packet, mss);
+    return true;
+}
diff --git a/lib/packets.h b/lib/packets.h
index 00c2b75..5b941fe 100644
--- a/lib/packets.h
+++ b/lib/packets.h
@@ -392,6 +392,13 @@ BUILD_ASSERT_DECL(UDP_HEADER_LEN == sizeof(struct 
udp_header));
 #define TCP_ACK 0x10
 #define TCP_URG 0x20
 
+/* TCP Options */
+#define TCP_OPT_NOP         1
+#define TCP_OPT_MSS         2
+
+/* TCP Option Lengths */
+#define TCP_OPT_MSS_LEN     4
+
 #define TCP_CTL(flags, offset) (htons((flags) | ((offset) << 12)))
 #define TCP_FLAGS(tcp_ctl) (ntohs(tcp_ctl) & 0x003f)
 #define TCP_OFFSET(tcp_ctl) (ntohs(tcp_ctl) >> 12)
@@ -494,4 +501,17 @@ void packet_set_udp_port(struct ofpbuf *, ovs_be16 src, 
ovs_be16 dst);
 uint8_t packet_get_tcp_flags(const struct ofpbuf *, const struct flow *);
 void packet_format_tcp_flags(struct ds *, uint8_t);
 
+/* The maximum length of an IPv4 segment is 65536 bytes.
+ * The IP header size is at least 20 bytes.
+ * The MSS does not include the IP header size and thus
+ * has a maximum useful value of 65516. Numbers above that
+ * may be used as special values that can be stored in a 16-bit value.
+ */
+#define MSS_NONE  htons(65534)
+#define MSS_PMTU  htons(65535)
+
+#define MSS_MAX   65516
+
+bool set_tun_tcp_mss(struct ofpbuf *, const struct flow *);
+
 #endif /* packets.h */
diff --git a/ofproto/ofproto-dpif.c b/ofproto/ofproto-dpif.c
index 595c50c..972d895 100644
--- a/ofproto/ofproto-dpif.c
+++ b/ofproto/ofproto-dpif.c
@@ -374,8 +374,7 @@ static void subfacet_reset_dp_stats(struct subfacet *,
 static void subfacet_update_time(struct subfacet *, long long int used);
 static void subfacet_update_stats(struct subfacet *,
                                   const struct dpif_flow_stats *);
-static void subfacet_make_actions(struct subfacet *,
-                                  const struct ofpbuf *packet,
+static void subfacet_make_actions(struct subfacet *, struct ofpbuf *packet,
                                   struct ofpbuf *odp_actions);
 static int subfacet_install(struct subfacet *,
                             const struct nlattr *actions, size_t actions_len,
@@ -2864,6 +2863,7 @@ handle_flow_miss_without_facet(struct flow_miss *miss,
         ctx.resubmit_stats = &stats;
         xlate_actions(&ctx, rule->up.actions, rule->up.n_actions,
                       &odp_actions);
+        set_tun_tcp_mss(packet, &miss->flow);
 
         if (odp_actions.size) {
             struct dpif_execute *execute = &op->dpif_op.u.execute;
@@ -4330,18 +4330,33 @@ subfacet_get_key(struct subfacet *subfacet, struct 
odputil_keybuf *keybuf,
  * Translates the actions into 'odp_actions', which the caller must have
  * initialized and is responsible for uninitializing. */
 static void
-subfacet_make_actions(struct subfacet *subfacet, const struct ofpbuf *packet,
+subfacet_make_actions(struct subfacet *subfacet, struct ofpbuf *packet,
                       struct ofpbuf *odp_actions)
 {
     struct facet *facet = subfacet->facet;
     struct rule_dpif *rule = facet->rule;
     struct ofproto_dpif *ofproto = ofproto_dpif_cast(rule->up.ofproto);
+    enum subfacet_path slow;
 
     struct action_xlate_ctx ctx;
 
     action_xlate_ctx_init(&ctx, ofproto, &facet->flow, subfacet->initial_tci,
                           rule, 0, packet);
     xlate_actions(&ctx, rule->up.actions, rule->up.n_actions, odp_actions);
+    if (set_tun_tcp_mss(packet, &ctx.flow)) {
+        /* If MSS clamping was applied then do not install the subfacet in
+         * the datapath. This is so that if retransmission occurs then the
+         * retransmitted frame will also be clamped.
+         *
+         * MSS clamping takes effect on the first packet of a TCP
+         * connection. Thus the effect of this is to install the subfacet
+         * in the datapath when the second packet from the client is
+         * received.
+         */
+        slow = SF_SLOW_PATH;
+    } else {
+        slow = ctx.slow;
+    }
     facet->tags = ctx.tags;
     facet->has_learn = ctx.has_learn;
     facet->has_normal = ctx.has_normal;
@@ -4349,7 +4364,7 @@ subfacet_make_actions(struct subfacet *subfacet, const 
struct ofpbuf *packet,
     facet->nf_flow.output_iface = ctx.nf_output_iface;
     facet->mirrors = ctx.mirrors;
 
-    subfacet->slow = (subfacet->slow & SLOW_MATCH) | ctx.slow;
+    subfacet->slow = (subfacet->slow & SLOW_MATCH) | slow;
     if (subfacet->actions_len != odp_actions->size
         || memcmp(subfacet->actions, odp_actions->data, odp_actions->size)) {
         free(subfacet->actions);
@@ -4930,6 +4945,7 @@ compose_output_action__(struct action_xlate_ctx *ctx, 
uint16_t ofp_port,
             ctx->flow.tun_key.ipv4_tos = ofport->tun->s.tos;
             ctx->flow.tun_key.ipv4_ttl = ofport->tun->s.ttl;
             ctx->flow.tun_key.tun_hdr_len = ofport->tun->hdr_len;
+            ctx->flow.tun_mss = ofport->tun->s.mss;
         } else {
             ctx->flow.vlan_tci = htons(0);
         }
@@ -6757,7 +6773,7 @@ ofproto_unixctl_trace(struct unixctl_conn *conn, int 
argc, const char *argv[],
         const char *packet_s = argv[5];
         uint16_t in_port = ofp_port_to_odp_port(atoi(in_port_s));
         ovs_be64 tun_id = htonll(strtoull(tun_id_s, NULL, 0));
-        struct ovs_key_ipv4_tunnel tun_key = { .tun_id = tun_id };
+        struct flow_tun_key tun_key = { .tun_id = tun_id };
         uint32_t priority = atoi(priority_s);
         const char *msg;
 
diff --git a/ofproto/ofproto.h b/ofproto/ofproto.h
index 2c0643a..a7feb1c 100644
--- a/ofproto/ofproto.h
+++ b/ofproto/ofproto.h
@@ -393,6 +393,7 @@ struct tunnel_settings {
     ovs_be64 out_key;
     ovs_be32 saddr;
     ovs_be32 daddr;
+    ovs_be16 mss;
     uint8_t tos;
     uint8_t ttl;
     uint16_t flags;
@@ -406,6 +407,7 @@ tunnel_settings_equal(const struct tunnel_settings *a,
         return a->daddr == b->daddr &&
                 a->in_key == b->in_key &&
                 a->out_key == b->out_key &&
+                a->mss == b->mss &&
                 a->saddr == b->saddr &&
                 a->flags == b->flags &&
                 a->tos == b->tos &&
diff --git a/vswitchd/bridge.c b/vswitchd/bridge.c
index 93d0809..32b2713 100644
--- a/vswitchd/bridge.c
+++ b/vswitchd/bridge.c
@@ -1399,7 +1399,7 @@ iface_parse_tunnel(const struct ovsrec_interface 
*iface_cfg,
     bool is_ipsec = false;
     struct shash args;
     struct shash_node *node;
-    struct tunnel_settings s = { .tos = 0 };
+    struct tunnel_settings s = { .mss = MSS_PMTU };
     bool ipsec_mech_set = false;
     int status;
     const char *key;
@@ -1492,6 +1492,23 @@ iface_parse_tunnel(const struct ovsrec_interface 
*iface_cfg,
             }
         } else if (!strcmp(node->name, "psk") && is_ipsec) {
             ipsec_mech_set = true;
+        } else if (!strcmp(node->name, "mss_clamp")) {
+            if (!strcmp(node->data, "false")) {
+                s.mss = MSS_NONE;
+            } else if (!strcmp(node->data, "pmtu")) {
+                s.mss = MSS_PMTU;
+            } else {
+                char *end;
+                long mss;
+
+                mss = strtol(node->data, &end, 0);
+                if (node->data == end || mss < 0 || mss > MSS_MAX) {
+                    VLOG_ERR("%s: 'mss_clamp' value is invalid or "
+                             "or out of range", iface_cfg->name);
+                    goto err;
+                }
+                s.mss = htons(mss);
+            }
         } else if (is_ipsec
                 && (!strcmp(node->name, "certificate")
                     || !strcmp(node->name, "private_key")
diff --git a/vswitchd/vswitch.xml b/vswitchd/vswitch.xml
index 0cd9b30..be52fb9 100644
--- a/vswitchd/vswitch.xml
+++ b/vswitchd/vswitch.xml
@@ -1307,6 +1307,29 @@
         enabled; set to <code>false</code> to disable.
       </column>
 
+      <column name="options" key="mss_clamp">
+        <p> Optional. The value at which to clamp the MSS of outgoing
+            packets, one of:</p>
+        <ul>
+          <li>
+            A non-negative integer no greater than 65516.
+            Used as the value to clamp the MSS of outgoing packets.
+          </li>
+          <li>
+            The word <code>false</code>. This disables MSS clamping.
+          </li>
+          <li>
+            The word <code>pmtu</code>. The value to clamp the MSS of
+            outgoing packets is automatically calculated from the MTU of
+            the likely egress interface.
+          </li>
+        </ul>
+
+        <p>
+          Defaults to the word <code>pmtu</code>.
+        </p>
+      </column>
+
       <group title="Tunnel Options: gre only">
         <p>
           Only <code>gre</code> interfaces support these options.
-- 
1.7.10.2.484.gcd07cc5

_______________________________________________
dev mailing list
dev@openvswitch.org
http://openvswitch.org/mailman/listinfo/dev

Reply via email to