From: Andy Zhou <az...@nicira.com> User space now may receive re-assembled IP fragments. The user space netlink handler can now accept packets with the new OVS_PACKET_ATTR_MRU attribute. This allows the kernel to assemble fragmented packets for the duration of OpenFlow processing, then re-fragment at output time. Most notably this occurs for packets that are sent through the connection tracker.
Signed-off-by: Andy Zhou <az...@nicira.com> --- datapath/linux/compat/include/linux/openvswitch.h | 5 + lib/dpif-netlink.c | 5 + lib/dpif.c | 2 + lib/dpif.h | 3 + ofproto/ofproto-dpif-upcall.c | 17 +- ofproto/ofproto-dpif.c | 3 + tests/system-traffic.at | 221 ++++++++++++++++++++++ 7 files changed, 254 insertions(+), 2 deletions(-) diff --git a/datapath/linux/compat/include/linux/openvswitch.h b/datapath/linux/compat/include/linux/openvswitch.h index bbc6ca4..1997175 100644 --- a/datapath/linux/compat/include/linux/openvswitch.h +++ b/datapath/linux/compat/include/linux/openvswitch.h @@ -187,6 +187,10 @@ enum ovs_packet_cmd { * %OVS_USERSPACE_ATTR_EGRESS_TUN_PORT attribute, which is sent only if the * output port is actually a tunnel port. Contains the output tunnel key * extracted from the packet as nested %OVS_TUNNEL_KEY_ATTR_* attributes. + * @OVS_PACKET_ATTR_MRU: Present for an %OVS_PACKET_CMD_ACTION and + * %OVS_PACKET_ATTR_USERSPACE action specify the Maximum received fragment + * size. + * * These attributes follow the &struct ovs_header within the Generic Netlink * payload for %OVS_PACKET_* commands. */ @@ -202,6 +206,7 @@ enum ovs_packet_attr { OVS_PACKET_ATTR_UNUSED2, OVS_PACKET_ATTR_PROBE, /* Packet operation is a feature probe, error logging should be suppressed. */ + OVS_PACKET_ATTR_MRU, /* Maximum received IP fragment size. */ __OVS_PACKET_ATTR_MAX }; diff --git a/lib/dpif-netlink.c b/lib/dpif-netlink.c index ffeb124..007f4ec 100644 --- a/lib/dpif-netlink.c +++ b/lib/dpif-netlink.c @@ -1547,6 +1547,9 @@ dpif_netlink_encode_execute(int dp_ifindex, const struct dpif_execute *d_exec, if (d_exec->probe) { nl_msg_put_flag(buf, OVS_PACKET_ATTR_PROBE); } + if (d_exec->mru) { + nl_msg_put_u16(buf, OVS_PACKET_ATTR_MRU, d_exec->mru); + } } /* Executes, against 'dpif', up to the first 'n_ops' operations in 'ops'. @@ -1965,6 +1968,7 @@ parse_odp_packet(const struct dpif_netlink *dpif, struct ofpbuf *buf, [OVS_PACKET_ATTR_USERDATA] = { .type = NL_A_UNSPEC, .optional = true }, [OVS_PACKET_ATTR_EGRESS_TUN_KEY] = { .type = NL_A_NESTED, .optional = true }, [OVS_PACKET_ATTR_ACTIONS] = { .type = NL_A_NESTED, .optional = true }, + [OVS_PACKET_ATTR_MRU] = { .type = NL_A_BE16, .optional = true } }; struct ovs_header *ovs_header; @@ -2002,6 +2006,7 @@ parse_odp_packet(const struct dpif_netlink *dpif, struct ofpbuf *buf, upcall->userdata = a[OVS_PACKET_ATTR_USERDATA]; upcall->out_tun_key = a[OVS_PACKET_ATTR_EGRESS_TUN_KEY]; upcall->actions = a[OVS_PACKET_ATTR_ACTIONS]; + upcall->mru = a[OVS_PACKET_ATTR_MRU]; /* Allow overwriting the netlink attribute header without reallocating. */ dp_packet_use_stub(&upcall->packet, diff --git a/lib/dpif.c b/lib/dpif.c index c03aa1d..f56d2f2 100644 --- a/lib/dpif.c +++ b/lib/dpif.c @@ -1126,6 +1126,7 @@ dpif_execute_helper_cb(void *aux_, struct dp_packet **packets, int cnt, execute.packet = packet; execute.needs_help = false; execute.probe = false; + execute.mru = 0; aux->error = dpif_execute(aux->dpif, &execute); log_execute_message(aux->dpif, &execute, true, aux->error); @@ -1693,6 +1694,7 @@ log_execute_message(struct dpif *dpif, const struct dpif_execute *execute, ds_put_format(&ds, " failed (%s)", ovs_strerror(error)); } ds_put_format(&ds, " on packet %s", packet); + ds_put_format(&ds, " mtu %d", execute->mru); vlog(THIS_MODULE, error ? VLL_WARN : VLL_DBG, "%s", ds_cstr(&ds)); ds_destroy(&ds); free(packet); diff --git a/lib/dpif.h b/lib/dpif.h index 1b8a7b9..76317e4 100644 --- a/lib/dpif.h +++ b/lib/dpif.h @@ -697,6 +697,8 @@ struct dpif_execute { size_t actions_len; /* Length of 'actions' in bytes. */ bool needs_help; bool probe; /* Suppress error messages. */ + unsigned int mru; /* packet's Maximum receive unit. + 0 if not a fragmented packet */ /* Input, but possibly modified as a side effect of execution. */ struct dp_packet *packet; /* Packet to execute. */ @@ -780,6 +782,7 @@ struct dpif_upcall { struct nlattr *key; /* Flow key. */ size_t key_len; /* Length of 'key' in bytes. */ ovs_u128 ufid; /* Unique flow identifier for 'key'. */ + struct nlattr *mru; /* Maximum receive unit. */ /* DPIF_UC_ACTION only. */ struct nlattr *userdata; /* Argument to OVS_ACTION_ATTR_USERSPACE. */ diff --git a/ofproto/ofproto-dpif-upcall.c b/ofproto/ofproto-dpif-upcall.c index 8a43bbf..badb4d9 100644 --- a/ofproto/ofproto-dpif-upcall.c +++ b/ofproto/ofproto-dpif-upcall.c @@ -183,6 +183,8 @@ struct upcall { unsigned pmd_id; /* Datapath poll mode driver id. */ const struct dp_packet *packet; /* Packet associated with this upcall. */ ofp_port_t in_port; /* OpenFlow in port, or OFPP_NONE. */ + unsigned int mru; /* If !0, Maximum receive unit of + fragmented IP packet */ enum dpif_upcall_type type; /* Datapath type of the upcall. */ const struct nlattr *userdata; /* Userdata for DPIF_UC_ACTION Upcalls. */ @@ -329,6 +331,7 @@ static enum upcall_type classify_upcall(enum dpif_upcall_type type, static int upcall_receive(struct upcall *, const struct dpif_backer *, const struct dp_packet *packet, enum dpif_upcall_type, const struct nlattr *userdata, const struct flow *, + const unsigned int mru, const ovs_u128 *ufid, const unsigned pmd_id); static void upcall_uninit(struct upcall *); @@ -716,6 +719,7 @@ recv_upcalls(struct handler *handler) struct dpif_upcall *dupcall = &dupcalls[n_upcalls]; struct upcall *upcall = &upcalls[n_upcalls]; struct flow *flow = &flows[n_upcalls]; + unsigned int mru; int error; ofpbuf_use_stub(recv_buf, recv_stubs[n_upcalls], @@ -730,8 +734,14 @@ recv_upcalls(struct handler *handler) goto free_dupcall; } + if (dupcall->mru) { + mru = nl_attr_get_u16(dupcall->mru); + } else { + mru = 0; + } + error = upcall_receive(upcall, udpif->backer, &dupcall->packet, - dupcall->type, dupcall->userdata, flow, + dupcall->type, dupcall->userdata, flow, mru, &dupcall->ufid, PMD_ID_NULL); if (error) { if (error == ENODEV) { @@ -977,6 +987,7 @@ static int upcall_receive(struct upcall *upcall, const struct dpif_backer *backer, const struct dp_packet *packet, enum dpif_upcall_type type, const struct nlattr *userdata, const struct flow *flow, + const unsigned int mru, const ovs_u128 *ufid, const unsigned pmd_id) { int error; @@ -1006,6 +1017,7 @@ upcall_receive(struct upcall *upcall, const struct dpif_backer *backer, upcall->ukey = NULL; upcall->key = NULL; upcall->key_len = 0; + upcall->mru = mru; upcall->out_tun_key = NULL; upcall->actions = NULL; @@ -1133,7 +1145,7 @@ upcall_cb(const struct dp_packet *packet, const struct flow *flow, ovs_u128 *ufi atomic_read_relaxed(&udpif->flow_limit, &flow_limit); error = upcall_receive(&upcall, udpif->backer, packet, type, userdata, - flow, ufid, pmd_id); + flow, 0, ufid, pmd_id); if (error) { return error; } @@ -1355,6 +1367,7 @@ handle_upcalls(struct udpif *udpif, struct upcall *upcalls, op->dop.u.execute.actions_len = upcall->odp_actions.size; op->dop.u.execute.needs_help = (upcall->xout.slow & SLOW_ACTION) != 0; op->dop.u.execute.probe = false; + op->dop.u.execute.mru = upcall->mru; } } diff --git a/ofproto/ofproto-dpif.c b/ofproto/ofproto-dpif.c index ddf177a..b210881 100644 --- a/ofproto/ofproto-dpif.c +++ b/ofproto/ofproto-dpif.c @@ -1116,6 +1116,7 @@ check_variable_length_userdata(struct dpif_backer *backer) execute.packet = &packet; execute.needs_help = false; execute.probe = true; + execute.mru = 0; error = dpif_execute(backer->dpif, &execute); @@ -1214,6 +1215,7 @@ check_masked_set_action(struct dpif_backer *backer) execute.packet = &packet; execute.needs_help = false; execute.probe = true; + execute.mru = 0; error = dpif_execute(backer->dpif, &execute); @@ -3722,6 +3724,7 @@ ofproto_dpif_execute_actions(struct ofproto_dpif *ofproto, execute.packet = packet; execute.needs_help = (xout.slow & SLOW_ACTION) != 0; execute.probe = false; + execute.mru = 0; /* Fix up in_port. */ in_port = flow->in_port.ofp_port; diff --git a/tests/system-traffic.at b/tests/system-traffic.at index cdd0315..4074eaa 100644 --- a/tests/system-traffic.at +++ b/tests/system-traffic.at @@ -923,3 +923,224 @@ TIME_WAIT src=10.1.1.1 dst=10.1.1.2 sport=<cleared> dport=<cleared> src=10.1.1.2 OVS_TRAFFIC_VSWITCHD_STOP AT_CLEANUP + +AT_SETUP([fragmentation - ipv4 icmp ]) +CHECK_CONNTRACK() +OVS_TRAFFIC_VSWITCHD_START( + [set-fail-mode br0 secure -- ]) + +ADD_NAMESPACES(at_ns0, at_ns1) + +ADD_VETH(p0, at_ns0, br0, "10.1.1.1/24") +ADD_VETH(p1, at_ns1, br0, "10.1.1.2/24") + +dnl Sending ping through conntrack +AT_DATA([flows.txt], [dnl +priority=1,action=drop +priority=10,arp,action=normal +in_port=1,icmp,action=ct(commit,zone=9),2 +in_port=2,ct_state=-trk,icmp,action=ct(table=0,zone=9) +in_port=2,ct_state=+trk+est-new,icmp,action=1 +]) + +AT_CHECK([ovs-ofctl add-flows br0 flows.txt]) + +dnl Basic connectivity check. +NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -w 2 10.1.1.2 | FORMAT_PING], [0], [dnl +3 packets transmitted, 3 received, 0% packet loss, time 0ms +]) + +dnl Ipv4 fragmentation connectivity check. +NS_CHECK_EXEC([at_ns0], [ping -s 1600 -q -c 3 -i 0.3 -w 2 10.1.1.2 | FORMAT_PING], [0], [dnl +3 packets transmitted, 3 received, 0% packet loss, time 0ms +]) + +dnl Ipv4 larger fragmentation connectivity check. +NS_CHECK_EXEC([at_ns0], [ping -s 3200 -q -c 3 -i 0.3 -w 2 10.1.1.2 | FORMAT_PING], [0], [dnl +3 packets transmitted, 3 received, 0% packet loss, time 0ms +]) + +OVS_TRAFFIC_VSWITCHD_STOP +AT_CLEANUP + +AT_SETUP([fragmentation - ipv4 icmp + vlan]) +CHECK_CONNTRACK() +OVS_TRAFFIC_VSWITCHD_START( + [set-fail-mode br0 secure -- ]) + +ADD_NAMESPACES(at_ns0, at_ns1) + +ADD_VETH(p0, at_ns0, br0, "10.1.1.1/24") +ADD_VETH(p1, at_ns1, br0, "10.1.1.2/24") +ADD_VLAN(p0, at_ns0, 100, "10.2.2.1/24") +ADD_VLAN(p1, at_ns1, 100, "10.2.2.2/24") + +dnl Sending ping through conntrack +AT_DATA([flows.txt], [dnl +priority=1,action=drop +priority=10,arp,action=normal +in_port=1,icmp,action=ct(commit,zone=9),2 +in_port=2,ct_state=-trk,icmp,action=ct(table=0,zone=9) +in_port=2,ct_state=+trk+est-new,icmp,action=1 +]) + +AT_CHECK([ovs-ofctl add-flows br0 flows.txt]) + +dnl Basic connectivity check. +NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -w 2 10.2.2.2 | FORMAT_PING], [0], [dnl +3 packets transmitted, 3 received, 0% packet loss, time 0ms +]) + +dnl Ipv4 fragmentation connectivity check. +NS_CHECK_EXEC([at_ns0], [ping -s 1600 -q -c 3 -i 0.3 -w 2 10.2.2.2 | FORMAT_PING], [0], [dnl +3 packets transmitted, 3 received, 0% packet loss, time 0ms +]) + +dnl Ipv4 larger fragmentation connectivity check. +NS_CHECK_EXEC([at_ns0], [ping -s 3200 -q -c 3 -i 0.3 -w 2 10.2.2.2 | FORMAT_PING], [0], [dnl +3 packets transmitted, 3 received, 0% packet loss, time 0ms +]) + +OVS_TRAFFIC_VSWITCHD_STOP +AT_CLEANUP + +AT_SETUP([fragmentation - ipv6 icmp ]) +CHECK_CONNTRACK() +OVS_TRAFFIC_VSWITCHD_START( + [set-fail-mode br0 secure -- ]) + +ADD_NAMESPACES(at_ns0, at_ns1) + +ADD_VETH(p0, at_ns0, br0, "fc00::1/96") +ADD_VETH(p1, at_ns1, br0, "fc00::2/96") + +dnl Sending ping through conntrack +AT_DATA([flows.txt], [dnl +priority=1,action=drop +priority=10,in_port=1,ipv6,action=ct(commit,zone=9),2 +priority=10,in_port=2,ct_state=-trk,ipv6,action=ct(table=0,zone=9) +priority=10,in_port=2,ct_state=+trk+est-new,ipv6,action=1 +icmp6,icmp_type=135,action=normal +icmp6,icmp_type=136,action=normal +]) + +AT_CHECK([ovs-ofctl add-flows br0 flows.txt]) + +dnl Without this sleep, we get occasional failures due to the following error: +dnl "connect: Cannot assign requested address" +sleep 2; + +dnl Basic connectivity check. +NS_CHECK_EXEC([at_ns0], [ping6 -q -c 3 -i 0.3 -w 2 fc00::2 | FORMAT_PING], [0], [dnl +3 packets transmitted, 3 received, 0% packet loss, time 0ms +]) + +dnl Ipv4 fragmentation connectivity check. +NS_CHECK_EXEC([at_ns0], [ping6 -s 1600 -q -c 3 -i 0.3 -w 2 fc00::2 | FORMAT_PING], [0], [dnl +3 packets transmitted, 3 received, 0% packet loss, time 0ms +]) + +dnl Ipv4 larger fragmentation connectivity check. +NS_CHECK_EXEC([at_ns0], [ping6 -s 3200 -q -c 3 -i 0.3 -w 2 fc00::2 | FORMAT_PING], [0], [dnl +3 packets transmitted, 3 received, 0% packet loss, time 0ms +]) + +OVS_TRAFFIC_VSWITCHD_STOP +AT_CLEANUP + +AT_SETUP([fragmentation - ipv6 icmp + vlan]) +CHECK_CONNTRACK() +OVS_TRAFFIC_VSWITCHD_START( + [set-fail-mode br0 secure -- ]) + +ADD_NAMESPACES(at_ns0, at_ns1) + +ADD_VETH(p0, at_ns0, br0, "fc00::1/96") +ADD_VETH(p1, at_ns1, br0, "fc00::2/96") + +ADD_VLAN(p0, at_ns0, 100, "fc00:1::3/96") +ADD_VLAN(p1, at_ns1, 100, "fc00:1::4/96") + +dnl Sending ping through conntrack +AT_DATA([flows.txt], [dnl +priority=1,action=drop +priority=10,in_port=1,ipv6,action=ct(commit,zone=9),2 +priority=10,in_port=2,ct_state=-trk,ipv6,action=ct(table=0,zone=9) +priority=10,in_port=2,ct_state=+trk+est-new,ipv6,action=1 +icmp6,icmp_type=135,action=normal +icmp6,icmp_type=136,action=normal +]) + +AT_CHECK([ovs-ofctl add-flows br0 flows.txt]) + +dnl Without this sleep, we get occasional failures due to the following error: +dnl "connect: Cannot assign requested address" +sleep 2; + +dnl Basic connectivity check. +NS_CHECK_EXEC([at_ns0], [ping6 -q -c 3 -i 0.3 -w 2 fc00:1::4 | FORMAT_PING], [0], [dnl +3 packets transmitted, 3 received, 0% packet loss, time 0ms +]) + +dnl Ipv4 fragmentation connectivity check. +NS_CHECK_EXEC([at_ns0], [ping6 -s 1600 -q -c 3 -i 0.3 -w 2 fc00:1::4 | FORMAT_PING], [0], [dnl +3 packets transmitted, 3 received, 0% packet loss, time 0ms +]) + +dnl Ipv4 larger fragmentation connectivity check. +NS_CHECK_EXEC([at_ns0], [ping6 -s 3200 -q -c 3 -i 0.3 -w 2 fc00:1::4 | FORMAT_PING], [0], [dnl +3 packets transmitted, 3 received, 0% packet loss, time 0ms +]) + +OVS_TRAFFIC_VSWITCHD_STOP +AT_CLEANUP + +AT_SETUP([fragmentation over vxlan tunnel]) +AT_SKIP_IF([! ip link help 2>&1 | grep vxlan >/dev/null]) +CHECK_CONNTRACK() + +OVS_TRAFFIC_VSWITCHD_START( + [set-fail-mode br0 standalone --]) +ADD_BR([br-underlay], [set-fail-mode br-underlay standalone]) +ADD_NAMESPACES(at_ns0) + +dnl Sending ping through conntrack +AT_DATA([flows.txt], [dnl +priority=1,action=drop +priority=10,arp,action=normal +in_port=1,icmp,action=ct(commit,zone=9),LOCAL +in_port=LOCAL,ct_state=-trk,icmp,action=ct(table=0,zone=9) +in_port=LOCAL,ct_state=+trk+est-new,icmp,action=1 +]) + +AT_CHECK([ovs-ofctl add-flows br0 flows.txt]) + +dnl Set up underlay link from host into the namespace using veth pair. +ADD_VETH(p0, at_ns0, br-underlay, "172.31.1.1/24") +AT_CHECK([ip addr add dev br-underlay "172.31.1.100/24"]) +AT_CHECK([ip link set dev br-underlay up]) + +dnl Set up tunnel endpoints on OVS outside the namespace and with a native +dnl linux device inside the namespace. +ADD_OVS_TUNNEL([vxlan], [br0], [at_ns0], [172.31.1.1], [10.1.1.100/24]) +ADD_NATIVE_TUNNEL([vxlan], [at_vxlan1], [at_ns0], [172.31.1.100], [10.1.1.1/24], + [id 0 dstport 4789]) + +dnl First, check the underlay +NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -w 2 172.31.1.100 | FORMAT_PING], [0], [dnl +3 packets transmitted, 3 received, 0% packet loss, time 0ms +]) + +dnl Okay, now check the overlay with different packet sizes +NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -w 2 10.1.1.100 | FORMAT_PING], [0], [dnl +3 packets transmitted, 3 received, 0% packet loss, time 0ms +]) +NS_CHECK_EXEC([at_ns0], [ping -s 1600 -q -c 3 -i 0.3 -w 2 10.1.1.100 | FORMAT_PING], [0], [dnl +3 packets transmitted, 3 received, 0% packet loss, time 0ms +]) +NS_CHECK_EXEC([at_ns0], [ping -s 3200 -q -c 3 -i 0.3 -w 2 10.1.1.100 | FORMAT_PING], [0], [dnl +3 packets transmitted, 3 received, 0% packet loss, time 0ms +]) + +OVS_TRAFFIC_VSWITCHD_STOP +AT_CLEANUP -- 2.1.4 _______________________________________________ dev mailing list dev@openvswitch.org http://openvswitch.org/mailman/listinfo/dev