Allow clamping of the MSS of packets to be encapsulated by a tunnel. This provides an alternative to using PMTU discovery.
Cc: Jesse Gross <je...@nicira.com> Cc: Kyle Mestery <kmest...@cisco.com> Signed-off-by: Simon Horman <ho...@verge.net.au> --- v5 * Initial posting --- lib/flow.h | 2 +- lib/packets.c | 134 +++++++++++++++++++++++++++++++++++++++++++++++++ lib/packets.h | 20 ++++++++ ofproto/ofproto-dpif.c | 26 ++++++++-- ofproto/ofproto.h | 2 + vswitchd/bridge.c | 19 ++++++- vswitchd/vswitch.xml | 23 +++++++++ 7 files changed, 219 insertions(+), 7 deletions(-) diff --git a/lib/flow.h b/lib/flow.h index ea808da..d5fab5f 100644 --- a/lib/flow.h +++ b/lib/flow.h @@ -86,7 +86,7 @@ struct flow { uint8_t arp_tha[6]; /* ARP/ND target hardware address. */ uint8_t nw_ttl; /* IP TTL/Hop Limit. */ uint8_t nw_frag; /* FLOW_FRAG_* flags. */ - uint8_t reserved[2]; /* Reserved for 64-bit packing. */ + ovs_be16 tun_mss; /* MSS Clamp size */ }; /* Represents the metadata fields of struct flow. The masks are used to diff --git a/lib/packets.c b/lib/packets.c index 631abf8..ffc976e 100644 --- a/lib/packets.c +++ b/lib/packets.c @@ -20,12 +20,14 @@ #include <arpa/inet.h> #include <sys/socket.h> #include <netinet/in.h> +#include <net/ethernet.h> #include <stdlib.h> #include "byte-order.h" #include "csum.h" #include "flow.h" #include "dynamic-string.h" #include "ofpbuf.h" +#include "route-table.h" const struct in6_addr in6addr_exact = IN6ADDR_EXACT_INIT; @@ -555,3 +557,135 @@ packet_format_tcp_flags(struct ds *s, uint8_t tcp_flags) ds_put_cstr(s, "[80]"); } } + +static uint8_t * +find_tcp_mss_opt(struct ofpbuf *packet) +{ + struct tcp_header *tcp = packet->l4; + uint8_t *start = (uint8_t *)(tcp); + uint8_t *option = (uint8_t *)(tcp + 1); + + while (option - start + TCP_OPT_MSS_LEN <= TCP_OFFSET(tcp->tcp_ctl) * 4) { + if (*(option) == TCP_OPT_MSS && *(option + 1) == TCP_OPT_MSS_LEN) { + return option; + } + + if (*option > TCP_OPT_NOP && *(option + 1)) { + option += *(option + 1); + } else { + option++; + } + } + + return NULL; +} + +static bool +replace_tcp_mss_opt(struct ofpbuf *packet, uint8_t 
*option, ovs_be16 mss) +{ + struct tcp_header *tcp = packet->l4; + ovs_be16 prev_mss; + + memcpy(&prev_mss, option + 2, sizeof(prev_mss)); + + /* Do not increase MSS */ + if (ntohs(prev_mss) <= ntohs(mss)) { + return false; + } + + memcpy(option + 2, &mss, sizeof(mss)); + tcp->tcp_csum = recalc_csum16(tcp->tcp_csum, prev_mss, mss); + return true; +} + +static void +add_tcp_mss_opt(struct ofpbuf *packet, ovs_be16 mss) +{ + struct ip_header *nh = packet->l3; + struct tcp_header *tcp = packet->l4; + uint8_t *option = (uint8_t *)(tcp + 1); + ovs_be16 old_ctl, old_tot_len; + + /* Add MSS option */ + ofpbuf_prealloc_tailroom(packet, TCP_OPT_MSS_LEN); + memmove(option + TCP_OPT_MSS_LEN, option, + packet->size + (uint8_t *)packet->data - option); + packet->size += TCP_OPT_MSS_LEN; + *(option) = TCP_OPT_MSS; + *(option + 1) = TCP_OPT_MSS_LEN; + memcpy(option + 2, &mss, sizeof(mss)); + tcp->tcp_csum = recalc_csum32(tcp->tcp_csum, 0, *(ovs_be32 *)option); + + old_tot_len = nh->ip_tot_len; + nh->ip_tot_len = htons(ntohs(nh->ip_tot_len) + TCP_OPT_MSS_LEN); + nh->ip_csum = recalc_csum16(nh->ip_csum, old_tot_len, nh->ip_tot_len); + tcp->tcp_csum = recalc_csum16(tcp->tcp_csum, old_tot_len, nh->ip_tot_len); + + old_ctl = tcp->tcp_ctl; + tcp->tcp_ctl = TCP_CTL(TCP_FLAGS(tcp->tcp_ctl), + TCP_OFFSET(tcp->tcp_ctl) + TCP_OPT_MSS_LEN / 4); + tcp->tcp_csum = recalc_csum16(tcp->tcp_csum, old_ctl, tcp->tcp_ctl); +} + +bool +set_tun_tcp_mss(struct ofpbuf *packet, const struct flow *flow) +{ + struct tcp_header *tcp = packet->l4; + ovs_be16 mss; + uint8_t *option; + + /* Only for packets to be encapsulated in a tunnel. 
+ * Could be made generic by parameterising the MSS */ + if (!flow->tun_key.ipv4_dst) { + return false; + } + + /* Nothing to do if MSS clamping is disabled */ + if (flow->tun_mss == MSS_NONE) { + return false; + } + + /* Skip non-TCP packets */ + if ((flow->dl_type != htons(ETH_TYPE_IP) && + flow->dl_type != htons(ETH_TYPE_IPV6)) || + flow->nw_proto != IPPROTO_TCP) { + return false; + } + + /* Skip non-SYN packets */ + if (!(TCP_FLAGS(tcp->tcp_ctl) & TCP_SYN)) { + return false; + } + + /* Skip non-empty packets */ + if (packet->size - (packet->l4 - packet->data) > + TCP_OFFSET(tcp->tcp_ctl) * 4) { + return false; + } + + /* Calculate desired MSS */ + if (flow->tun_mss == MSS_PMTU) { + /* hlen = outer header length (eth + ip) + + * tunnel header length + + * inner header length (ip + tcp) + */ + int hlen = ETH_HLEN + sizeof(struct ip_header) + + flow->tun_key.tun_hdr_len + + sizeof(struct ip_header) + sizeof(struct tcp_header); + int mtu = route_table_get_mtu(flow->tun_key.ipv4_dst); + if (mtu < hlen) { + return false; + } + mss = htons(mtu - hlen); + } else { + mss = flow->tun_mss; + } + + option = find_tcp_mss_opt(packet); + if (option) { + return replace_tcp_mss_opt(packet, option, mss); + } + + add_tcp_mss_opt(packet, mss); + return true; +} diff --git a/lib/packets.h b/lib/packets.h index 00c2b75..5b941fe 100644 --- a/lib/packets.h +++ b/lib/packets.h @@ -392,6 +392,13 @@ BUILD_ASSERT_DECL(UDP_HEADER_LEN == sizeof(struct udp_header)); #define TCP_ACK 0x10 #define TCP_URG 0x20 +/* TCP Options */ +#define TCP_OPT_NOP 1 +#define TCP_OPT_MSS 2 + +/* TCP Option Lengths */ +#define TCP_OPT_MSS_LEN 4 + #define TCP_CTL(flags, offset) (htons((flags) | ((offset) << 12))) #define TCP_FLAGS(tcp_ctl) (ntohs(tcp_ctl) & 0x003f) #define TCP_OFFSET(tcp_ctl) (ntohs(tcp_ctl) >> 12) @@ -494,4 +501,17 @@ void packet_set_udp_port(struct ofpbuf *, ovs_be16 src, ovs_be16 dst); uint8_t packet_get_tcp_flags(const struct ofpbuf *, const struct flow *); void packet_format_tcp_flags(struct
ds *, uint8_t); +/* The maximum length of an IPv4 segment is 65536 bytes + * The IP header size is at least 20 bytes + * The MSS does not include the IP header size and thus + * has a maximum useful value of 65516. Numbers above that + * may be used as special values that can be stored in a 16-bit value + */ +#define MSS_NONE htons(65534) +#define MSS_PMTU htons(65535) + +#define MSS_MAX 65516 + +bool set_tun_tcp_mss(struct ofpbuf *, const struct flow *); + #endif /* packets.h */ diff --git a/ofproto/ofproto-dpif.c b/ofproto/ofproto-dpif.c index 595c50c..972d895 100644 --- a/ofproto/ofproto-dpif.c +++ b/ofproto/ofproto-dpif.c @@ -374,8 +374,7 @@ static void subfacet_reset_dp_stats(struct subfacet *, static void subfacet_update_time(struct subfacet *, long long int used); static void subfacet_update_stats(struct subfacet *, const struct dpif_flow_stats *); -static void subfacet_make_actions(struct subfacet *, - const struct ofpbuf *packet, +static void subfacet_make_actions(struct subfacet *, struct ofpbuf *packet, struct ofpbuf *odp_actions); static int subfacet_install(struct subfacet *, const struct nlattr *actions, size_t actions_len, @@ -2864,6 +2863,7 @@ handle_flow_miss_without_facet(struct flow_miss *miss, ctx.resubmit_stats = &stats; xlate_actions(&ctx, rule->up.actions, rule->up.n_actions, &odp_actions); + set_tun_tcp_mss(packet, &miss->flow); if (odp_actions.size) { struct dpif_execute *execute = &op->dpif_op.u.execute; @@ -4330,18 +4330,33 @@ subfacet_get_key(struct subfacet *subfacet, struct odputil_keybuf *keybuf, * Translates the actions into 'odp_actions', which the caller must have * initialized and is responsible for uninitializing.
*/ static void -subfacet_make_actions(struct subfacet *subfacet, const struct ofpbuf *packet, +subfacet_make_actions(struct subfacet *subfacet, struct ofpbuf *packet, struct ofpbuf *odp_actions) { struct facet *facet = subfacet->facet; struct rule_dpif *rule = facet->rule; struct ofproto_dpif *ofproto = ofproto_dpif_cast(rule->up.ofproto); + enum subfacet_path slow; struct action_xlate_ctx ctx; action_xlate_ctx_init(&ctx, ofproto, &facet->flow, subfacet->initial_tci, rule, 0, packet); xlate_actions(&ctx, rule->up.actions, rule->up.n_actions, odp_actions); + if (set_tun_tcp_mss(packet, &ctx.flow)) { + /* If MSS clamping was affected then do not install the subfacet in + * the datapath. This is so that if retransmission occurs then the + * retransmitted frame will also be clamped. + * + * MSS clamping takes effect on the first packet of a TCP + * connection. Thus the effect of this is to install the subfacet + * in the datapath when the second packet from the client is + * received. + */ + slow = SF_SLOW_PATH; + } else { + slow = ctx.slow; + } facet->tags = ctx.tags; facet->has_learn = ctx.has_learn; facet->has_normal = ctx.has_normal; @@ -4349,7 +4364,7 @@ subfacet_make_actions(struct subfacet *subfacet, const struct ofpbuf *packet, facet->nf_flow.output_iface = ctx.nf_output_iface; facet->mirrors = ctx.mirrors; - subfacet->slow = (subfacet->slow & SLOW_MATCH) | ctx.slow; + subfacet->slow = (subfacet->slow & SLOW_MATCH) | slow; if (subfacet->actions_len != odp_actions->size || memcmp(subfacet->actions, odp_actions->data, odp_actions->size)) { free(subfacet->actions); @@ -4930,6 +4945,7 @@ compose_output_action__(struct action_xlate_ctx *ctx, uint16_t ofp_port, ctx->flow.tun_key.ipv4_tos = ofport->tun->s.tos; ctx->flow.tun_key.ipv4_ttl = ofport->tun->s.ttl; ctx->flow.tun_key.tun_hdr_len = ofport->tun->hdr_len; + ctx->flow.tun_mss = ofport->tun->s.mss; } else { ctx->flow.vlan_tci = htons(0); } @@ -6757,7 +6773,7 @@ ofproto_unixctl_trace(struct unixctl_conn *conn, int 
argc, const char *argv[], const char *packet_s = argv[5]; uint16_t in_port = ofp_port_to_odp_port(atoi(in_port_s)); ovs_be64 tun_id = htonll(strtoull(tun_id_s, NULL, 0)); - struct ovs_key_ipv4_tunnel tun_key = { .tun_id = tun_id }; + struct flow_tun_key tun_key = { .tun_id = tun_id }; uint32_t priority = atoi(priority_s); const char *msg; diff --git a/ofproto/ofproto.h b/ofproto/ofproto.h index 2c0643a..a7feb1c 100644 --- a/ofproto/ofproto.h +++ b/ofproto/ofproto.h @@ -393,6 +393,7 @@ struct tunnel_settings { ovs_be64 out_key; ovs_be32 saddr; ovs_be32 daddr; + ovs_be16 mss; uint8_t tos; uint8_t ttl; uint16_t flags; @@ -406,6 +407,7 @@ tunnel_settings_equal(const struct tunnel_settings *a, return a->daddr == b->daddr && a->in_key == b->in_key && a->out_key == b->out_key && + a->mss == b->mss && a->saddr == b->saddr && a->flags == b->flags && a->tos == b->tos && diff --git a/vswitchd/bridge.c b/vswitchd/bridge.c index 93d0809..32b2713 100644 --- a/vswitchd/bridge.c +++ b/vswitchd/bridge.c @@ -1399,7 +1399,7 @@ iface_parse_tunnel(const struct ovsrec_interface *iface_cfg, bool is_ipsec = false; struct shash args; struct shash_node *node; - struct tunnel_settings s = { .tos = 0 }; + struct tunnel_settings s = { .mss = MSS_PMTU }; bool ipsec_mech_set = false; int status; const char *key; @@ -1492,6 +1492,23 @@ iface_parse_tunnel(const struct ovsrec_interface *iface_cfg, } } else if (!strcmp(node->name, "psk") && is_ipsec) { ipsec_mech_set = true; + } else if (!strcmp(node->name, "mss_clamp")) { + if (!strcmp(node->data, "false")) { + s.mss = MSS_NONE; + } else if (!strcmp(node->data, "pmtu")) { + s.mss = MSS_PMTU; + } else { + char *end; + long mss; + + mss = strtol(node->data, &end, 0); + if (node->data == end || mss < 0 || mss > MSS_MAX) { + VLOG_ERR("%s: 'mss_clamp' value is invalid or " + "or out of range", iface_cfg->name); + goto err; + } + s.mss = htons(mss); + } } else if (is_ipsec && (!strcmp(node->name, "certificate") || !strcmp(node->name, "private_key") diff 
--git a/vswitchd/vswitch.xml b/vswitchd/vswitch.xml index 0cd9b30..be52fb9 100644 --- a/vswitchd/vswitch.xml +++ b/vswitchd/vswitch.xml @@ -1307,6 +1307,29 @@ enabled; set to <code>false</code> to disable. </column> + <column name="options" key="mss_clamp"> + <p> Optional. The value at which to clamp the MSS of outgoing + packets, one of:</p> + <ul> + <li> + A positive integer value less than 65496. + Used as the value to clamp the MSS of outgoing packets. + </li> + <li> + The word <code>false</code>. This disables MSS clamping. + </li> + <li> + The word <code>pmtu</code>. The value to clamp the MSS of + outgoing packets is automatically calculated from the MTU of + the likely egress interface. + </li> + </ul> + + <p> + Defaults to the word <code>pmtu</code>. + </p> + </column> + <group title="Tunnel Options: gre only"> <p> Only <code>gre</code> interfaces support these options. -- 1.7.10.2.484.gcd07cc5 _______________________________________________ dev mailing list dev@openvswitch.org http://openvswitch.org/mailman/listinfo/dev