On Oct 11, 2012, at 9:32 AM, "Kyle Mestery (kmestery)" <kmest...@cisco.com> wrote: > Thanks for the review, most of this is easy to address > On Oct 10, 2012, at 6:23 PM, Pravin Shelar <pshe...@nicira.com> wrote: > >> Patch looks good, I have few comments inlined. >> >> >> On Tue, Oct 9, 2012 at 11:49 AM, Kyle Mestery <kmest...@cisco.com> wrote: >>> This is a first pass at providing a tun_key which can be >>> used as the basis for flow-based tunnelling. The >>> tun_key includes and replaces the tun_id in both struct >>> ovs_skb_cb and struct sw_tun_key. >>> >>> This patch allows all existing tun_id behaviour to still work. Existing >>> users of tun_id are redirected to tun_key->tun_id to retain compatibility. >>> However, when the userspace code is updated to make use of the new tun_key, >>> the old behaviour will be deprecated and removed. >>> >>> NOTE: With these changes, the tunneling code no longer assumes input and >>> output keys are symmetric. If they are not, PMTUD needs to be disabled >>> for tunneling to work. >>> >>> Signed-off-by: Kyle Mestery <kmest...@cisco.com> >>> Cc: Simon Horman <ho...@verge.net.au> >>> Cc: Jesse Gross <je...@nicira.com> >>> --- >>> V6: >>> - Fix more comments addressed from Jesse. >>> V5: >>> - Address another round of comments from Jesse. >>> V4: >>> - Address 2 comments from Jesse: >>> - When processing actions, if OVS_CB(skb)->tun_key is NULL, point it at one >>> on the stack temporarily. This goes away when we remove the ability to set >>> tun_id outside the scope of tun_key. >>> - Move tun_key to the end of struct sw_flow_key. >>> V3: >>> - Fix issues found during review by Jesse. >>> - Add a NEWS entry around tunnel code no longer assuming symmetric input and >>> output tunnel keys. >>> >>> V2: >>> - Fix blank line addition/removal found by Simon. >>> - Fix hex printing output found by Simon. >>> --- >>> NEWS | 3 ++ >>> datapath/actions.c | 38 +++++++++++++---- >>> datapath/datapath.c | 10 ++++- >>> datapath/datapath.h | 5 ++- >>> datapath/flow.c | 64 ++++++++++++++++++++++++---- >>> datapath/flow.h | 12 ++++-- >>> datapath/tunnel.c | 101 >>> ++++++++++++++++++++++++++++---------------- >>> datapath/tunnel.h | 15 ++++++- >>> datapath/vport-capwap.c | 12 +++--- >>> datapath/vport-gre.c | 15 ++++--- >>> datapath/vport.c | 2 +- >>> include/linux/openvswitch.h | 18 +++++++- >>> lib/dpif-netdev.c | 1 + >>> lib/odp-util.c | 15 ++++++- >>> lib/odp-util.h | 3 +- >>> 15 files changed, 235 insertions(+), 79 deletions(-) >>> >>> diff --git a/NEWS b/NEWS >>> index 29fd9f3..ae831e3 100644 >>> --- a/NEWS >>> +++ b/NEWS >>> @@ -1,5 +1,8 @@ >>> post-v1.8.0 >>> ------------------------ >>> + - The tunneling code no longer assumes input and output keys are >>> symmetric. >>> + If they are not, PMTUD needs to be disabled for tunneling to work. >>> Note >>> + this only applies to flow-based keys. >>> - FreeBSD is now a supported platform, thanks to code contributions from >>> Gaetano Catalli, Ed Maste, and Giuseppe Lettieri. >>> - ovs-bugtool: New --ovs option to report only OVS related information. >>> diff --git a/datapath/actions.c b/datapath/actions.c >>> index ec9b595..fa8c10d 100644 >>> --- a/datapath/actions.c >>> +++ b/datapath/actions.c >>> @@ -37,7 +37,8 @@ >>> #include "vport.h" >>> >>> static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, >>> - const struct nlattr *attr, int len, bool keep_skb); >>> + const struct nlattr *attr, int len, >>> + struct ovs_key_ipv4_tunnel *tun_key, bool keep_skb); >>> >>> static int make_writable(struct sk_buff *skb, int write_len) >>> { >>> @@ -329,11 +330,14 @@ static int sample(struct datapath *dp, struct sk_buff >>> *skb, >>> } >>> >>> return do_execute_actions(dp, skb, nla_data(acts_list), >>> - nla_len(acts_list), true); >>> + nla_len(acts_list), >>> + OVS_CB(skb)->tun_key, >>> + true); >>> } >>> >>> static int execute_set_action(struct sk_buff *skb, >>> - const struct nlattr *nested_attr) >>> + const struct nlattr *nested_attr, >>> + struct ovs_key_ipv4_tunnel *tun_key) >>> { >>> int err = 0; >>> >>> @@ -343,7 +347,21 @@ static int execute_set_action(struct sk_buff *skb, >>> break; >>> >>> case OVS_KEY_ATTR_TUN_ID: >>> - OVS_CB(skb)->tun_id = nla_get_be64(nested_attr); >>> + if (OVS_CB(skb)->tun_key == NULL) { >>> + /* If tun_key is NULL for this skb, assign it to >>> + * a value the caller passed in for action >>> processing >>> + * and output. This can disappear once we drop >>> support >>> + * for setting tun_id outside of tun_key. >>> + */ >>> + memset(tun_key, 0, sizeof(struct >>> ovs_key_ipv4_tunnel)); >>> + OVS_CB(skb)->tun_key = tun_key; >>> + } >>> + >>> + OVS_CB(skb)->tun_key->tun_id = nla_get_be64(nested_attr); >>> + break; >>> + >>> + case OVS_KEY_ATTR_IPV4_TUNNEL: >>> + OVS_CB(skb)->tun_key = nla_data(nested_attr); >>> break; >>> >>> case OVS_KEY_ATTR_ETHERNET: >>> @@ -368,7 +386,8 @@ static int execute_set_action(struct sk_buff *skb, >>> >>> /* Execute a list of actions against 'skb'. */ >>> static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, >>> - const struct nlattr *attr, int len, bool keep_skb) >>> + const struct nlattr *attr, int len, >>> + struct ovs_key_ipv4_tunnel *tun_key, bool keep_skb) >>> { >>> /* Every output action needs a separate clone of 'skb', but the common >>> * case is just a single output action, so that doing a clone and >>> @@ -407,7 +426,7 @@ static int do_execute_actions(struct datapath *dp, >>> struct sk_buff *skb, >>> break; >>> >>> case OVS_ACTION_ATTR_SET: >>> - err = execute_set_action(skb, nla_data(a)); >>> + err = execute_set_action(skb, nla_data(a), tun_key); >>> break; >>> >>> case OVS_ACTION_ATTR_SAMPLE: >>> @@ -458,6 +477,9 @@ int ovs_execute_actions(struct datapath *dp, struct >>> sk_buff *skb) >>> struct sw_flow_actions *acts = >>> rcu_dereference(OVS_CB(skb)->flow->sf_acts); >>> struct loop_counter *loop; >>> int error; >>> + struct ovs_key_ipv4_tunnel tun_key; >>> + >>> + memset(&tun_key, 0, sizeof(tun_key)); >>> >> tun key will zeroed while executing tunnel set action, Do we still >> need to zero it here? >> >>> /* Check whether we've looped too much. */ >>> loop = &__get_cpu_var(loop_counters); >>> @@ -469,9 +491,9 @@ int ovs_execute_actions(struct datapath *dp, struct >>> sk_buff *skb) >>> goto out_loop; >>> } >>> >>> - OVS_CB(skb)->tun_id = 0; >>> + OVS_CB(skb)->tun_key = NULL; >>> error = do_execute_actions(dp, skb, acts->actions, >>> - acts->actions_len, false); >>> + acts->actions_len, &tun_key, >>> false); >>> >>> /* Check whether sub-actions looped too much. */ >>> if (unlikely(loop->looping)) >>> diff --git a/datapath/datapath.c b/datapath/datapath.c >>> index a6915fb..d8a198e 100644 >>> --- a/datapath/datapath.c >>> +++ b/datapath/datapath.c >>> @@ -587,12 +587,20 @@ static int validate_set(const struct nlattr *a, >>> >>> switch (key_type) { >>> const struct ovs_key_ipv4 *ipv4_key; >>> + const struct ovs_key_ipv4_tunnel *tun_key; >>> >>> case OVS_KEY_ATTR_PRIORITY: >>> case OVS_KEY_ATTR_TUN_ID: >>> case OVS_KEY_ATTR_ETHERNET: >>> break; >>> >>> + case OVS_KEY_ATTR_IPV4_TUNNEL: >>> + tun_key = nla_data(ovs_key); >>> + if (!tun_key->ipv4_dst) { >>> + return -EINVAL; >>> + } >>> + break; >>> + >>> case OVS_KEY_ATTR_IPV4: >>> if (flow_key->eth.type != htons(ETH_P_IP)) >>> return -EINVAL; >>> @@ -785,7 +793,7 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, >>> struct genl_info *info) >>> >>> err = ovs_flow_metadata_from_nlattrs(&flow->key.phy.priority, >>> &flow->key.phy.in_port, >>> - &flow->key.phy.tun_id, >>> + &flow->key.tun.tun_key, >>> a[OVS_PACKET_ATTR_KEY]); >>> if (err) >>> goto err_flow_put; >>> diff --git a/datapath/datapath.h b/datapath/datapath.h >>> index affbf0e..c5df12d 100644 >>> --- a/datapath/datapath.h >>> +++ b/datapath/datapath.h >>> @@ -96,7 +96,8 @@ struct datapath { >>> /** >>> * struct ovs_skb_cb - OVS data in skb CB >>> * @flow: The flow associated with this packet. May be %NULL if no flow. >>> - * @tun_id: ID of the tunnel that encapsulated this packet. It is 0 if the >>> + * @tun_key: Key for the tunnel that encapsulated this packet. NULL if the >>> + * packet is not being tunneled. >>> * @ip_summed: Consistently stores L4 checksumming status across different >>> * kernel versions. >>> * @csum_start: Stores the offset from which to start checksumming >>> independent >>> @@ -107,7 +108,7 @@ struct datapath { >>> */ >>> struct ovs_skb_cb { >>> struct sw_flow *flow; >>> - __be64 tun_id; >>> + struct ovs_key_ipv4_tunnel *tun_key; >>> #ifdef NEED_CSUM_NORMALIZE >>> enum csum_type ip_summed; >>> u16 csum_start; >>> diff --git a/datapath/flow.c b/datapath/flow.c >>> index d07337c..376f4be 100644 >>> --- a/datapath/flow.c >>> +++ b/datapath/flow.c >>> @@ -629,7 +629,8 @@ int ovs_flow_extract(struct sk_buff *skb, u16 in_port, >>> struct sw_flow_key *key, >>> memset(key, 0, sizeof(*key)); >>> >>> key->phy.priority = skb->priority; >>> - key->phy.tun_id = OVS_CB(skb)->tun_id; >>> + if (OVS_CB(skb)->tun_key) >>> + key->tun.tun_key = *OVS_CB(skb)->tun_key; >>> key->phy.in_port = in_port; >>> >>> skb_reset_mac_header(skb); >>> @@ -847,6 +848,7 @@ const int ovs_key_lens[OVS_KEY_ATTR_MAX + 1] = { >>> >>> /* Not upstream. */ >>> [OVS_KEY_ATTR_TUN_ID] = sizeof(__be64), >>> + [OVS_KEY_ATTR_IPV4_TUNNEL] = sizeof(struct ovs_key_ipv4_tunnel), >>> }; >>> >>> static int ipv4_flow_from_nlattrs(struct sw_flow_key *swkey, int *key_len, >>> @@ -1022,9 +1024,30 @@ int ovs_flow_from_nlattrs(struct sw_flow_key *swkey, >>> int *key_lenp, >>> swkey->phy.in_port = DP_MAX_PORTS; >>> } >>> >>> - if (attrs & (1ULL << OVS_KEY_ATTR_TUN_ID)) { >>> - swkey->phy.tun_id = nla_get_be64(a[OVS_KEY_ATTR_TUN_ID]); >>> + /* OVS_KEY_ATTR_TUN_ID and OVS_KEY_ATTR_IPV4_TUNEL must both arrive >>> + * together. >>> + */ >>> + if (attrs & (1ULL << OVS_KEY_ATTR_TUN_ID) && >>> + attrs & (1ULL << OVS_KEY_ATTR_IPV4_TUNNEL)) { >>> + struct ovs_key_ipv4_tunnel *tun_key; >>> + __be64 tun_id = 0; >>> + tun_key = nla_data(a[OVS_KEY_ATTR_IPV4_TUNNEL]); >>> + tun_id = nla_get_be64(a[OVS_KEY_ATTR_TUN_ID]); >>> + >>> + if (tun_id != tun_key->tun_id) >>> + return -EINVAL; >>> + >>> + swkey->tun.tun_key = *tun_key; >>> attrs &= ~(1ULL << OVS_KEY_ATTR_TUN_ID); >>> + attrs &= ~(1ULL << OVS_KEY_ATTR_IPV4_TUNNEL); >>> + } else if (attrs & (1ULL << OVS_KEY_ATTR_TUN_ID)) { >>> + swkey->tun.tun_key.tun_id = >>> nla_get_be64(a[OVS_KEY_ATTR_TUN_ID]); >>> + attrs &= ~(1ULL << OVS_KEY_ATTR_TUN_ID); >>> + } else if (attrs & (1ULL << OVS_KEY_ATTR_IPV4_TUNNEL)) { >>> + struct ovs_key_ipv4_tunnel *tun_key; >>> + tun_key = nla_data(a[OVS_KEY_ATTR_IPV4_TUNNEL]); >>> + swkey->tun.tun_key = *tun_key; >>> + attrs &= ~(1ULL << OVS_KEY_ATTR_IPV4_TUNNEL); >>> } >> >> While validating set action OVS_KEY_ATTR_IPV4_TUNNEL, we are checking >> for ipv4_dst, I think we need similar check here. >> >>> >>> /* Data attributes. */ >>> @@ -1162,14 +1185,16 @@ int ovs_flow_from_nlattrs(struct sw_flow_key >>> *swkey, int *key_lenp, >>> * get the metadata, that is, the parts of the flow key that cannot be >>> * extracted from the packet itself. >>> */ >>> -int ovs_flow_metadata_from_nlattrs(u32 *priority, u16 *in_port, __be64 >>> *tun_id, >>> +int ovs_flow_metadata_from_nlattrs(u32 *priority, u16 *in_port, >>> + struct ovs_key_ipv4_tunnel *tun_key, >>> const struct nlattr *attr) >>> { >>> const struct nlattr *nla; >>> int rem; >>> + __be64 tun_id = 0; >>> >>> *in_port = DP_MAX_PORTS; >>> - *tun_id = 0; >>> + memset(tun_key, 0, sizeof(*tun_key)); >>> *priority = 0; >>> >>> nla_for_each_nested(nla, attr, rem) { >>> @@ -1185,7 +1210,20 @@ int ovs_flow_metadata_from_nlattrs(u32 *priority, >>> u16 *in_port, __be64 *tun_id, >>> break; >>> >>> case OVS_KEY_ATTR_TUN_ID: >>> - *tun_id = nla_get_be64(nla); >>> + tun_id = nla_get_be64(nla); >>> + if ((tun_key->tun_flags & FLOW_TNL_F_KEY) && >>> + tun_key->tun_id != tun_id) >>> + return -EINVAL; >>> + break; >>> + >>> + case OVS_KEY_ATTR_IPV4_TUNNEL: >>> + if (tun_key->tun_flags & FLOW_TNL_F_KEY) >>> + tun_id = tun_key->tun_id; >>> + memcpy(tun_key, nla_data(nla), >>> + sizeof(*tun_key)); >>> + if ((tun_key->tun_flags & FLOW_TNL_F_KEY) && >>> + tun_id != tun_key->tun_id) >>> + return -EINVAL; >>> break; >>> >>> case OVS_KEY_ATTR_IN_PORT: >>> @@ -1210,10 +1248,20 @@ int ovs_flow_to_nlattrs(const struct sw_flow_key >>> *swkey, struct sk_buff *skb) >>> nla_put_u32(skb, OVS_KEY_ATTR_PRIORITY, swkey->phy.priority)) >>> goto nla_put_failure; >>> >>> - if (swkey->phy.tun_id != cpu_to_be64(0) && >>> - nla_put_be64(skb, OVS_KEY_ATTR_TUN_ID, swkey->phy.tun_id)) >>> + if ((swkey->tun.tun_key.tun_flags & FLOW_TNL_F_KEY) && >>> + nla_put_be64(skb, OVS_KEY_ATTR_TUN_ID, >>> swkey->tun.tun_key.tun_id)) >>> goto nla_put_failure; >>> >>> + if (swkey->tun.tun_key.ipv4_dst) { >>> + struct ovs_key_ipv4_tunnel *tun_key; >>> + nla = nla_reserve(skb, OVS_KEY_ATTR_IPV4_TUNNEL, >>> + sizeof(*tun_key)); >>> + if (!nla) >>> + goto nla_put_failure; >>> + tun_key = nla_data(nla); >>> + *tun_key = swkey->tun.tun_key; >>> + } >>> + >> >> Why are we sending both OVS_KEY_ATTR_TUN_ID and >> OVS_KEY_ATTR_IPV4_TUNNEL attributed here? >> we could first check for ipv4_dst else for FLOW_TNL_F_KEY in flow key. >> >>> if (swkey->phy.in_port != DP_MAX_PORTS && >>> nla_put_u32(skb, OVS_KEY_ATTR_IN_PORT, swkey->phy.in_port)) >>> goto nla_put_failure; >>> diff --git a/datapath/flow.h b/datapath/flow.h >>> index 02c563a..4430b32 100644 >>> --- a/datapath/flow.h >>> +++ b/datapath/flow.h >>> @@ -42,10 +42,12 @@ struct sw_flow_actions { >>> >>> struct sw_flow_key { >>> struct { >>> - __be64 tun_id; /* Encapsulating tunnel ID. */ >>> u32 priority; /* Packet QoS priority. */ >>> u16 in_port; /* Input switch port (or >>> DP_MAX_PORTS). */ >>> } phy; >>> + struct { >>> + struct ovs_key_ipv4_tunnel tun_key; /* Encapsulating >>> tunnel key. */ >>> + } tun; >>> struct { >>> u8 src[ETH_ALEN]; /* Ethernet source address. */ >>> u8 dst[ETH_ALEN]; /* Ethernet destination address. */ >>> @@ -150,6 +152,7 @@ u64 ovs_flow_used_time(unsigned long flow_jiffies); >>> * ------ --- ------ ----- >>> * OVS_KEY_ATTR_PRIORITY 4 -- 4 8 >>> * OVS_KEY_ATTR_TUN_ID 8 -- 4 12 >>> + * OVS_KEY_ATTR_IPV4_TUNNEL 24 -- 4 28 >>> * OVS_KEY_ATTR_IN_PORT 4 -- 4 8 >>> * OVS_KEY_ATTR_ETHERNET 12 -- 4 16 >>> * OVS_KEY_ATTR_ETHERTYPE 2 2 4 8 (outer VLAN ethertype) >>> @@ -160,14 +163,15 @@ u64 ovs_flow_used_time(unsigned long flow_jiffies); >>> * OVS_KEY_ATTR_ICMPV6 2 2 4 8 >>> * OVS_KEY_ATTR_ND 28 -- 4 32 >>> * ------------------------------------------------- >>> - * total 156 >>> + * total 184 >>> */ >>> -#define FLOW_BUFSIZE 156 >>> +#define FLOW_BUFSIZE 184 >>> >>> int ovs_flow_to_nlattrs(const struct sw_flow_key *, struct sk_buff *); >>> int ovs_flow_from_nlattrs(struct sw_flow_key *swkey, int *key_lenp, >>> const struct nlattr *); >>> -int ovs_flow_metadata_from_nlattrs(u32 *priority, u16 *in_port, __be64 >>> *tun_id, >>> +int ovs_flow_metadata_from_nlattrs(u32 *priority, u16 *in_port, >>> + struct ovs_key_ipv4_tunnel *tun_key, >>> const struct nlattr *); >>> >>> #define MAX_ACTIONS_BUFSIZE (16 * 1024) >>> diff --git a/datapath/tunnel.c b/datapath/tunnel.c >>> index d651c11..739f098 100644 >>> --- a/datapath/tunnel.c >>> +++ b/datapath/tunnel.c >>> @@ -367,9 +367,9 @@ struct vport *ovs_tnl_find_port(struct net *net, __be32 >>> saddr, __be32 daddr, >>> return NULL; >> ñ> } >>> >>> -static void ecn_decapsulate(struct sk_buff *skb, u8 tos) >>> +static void ecn_decapsulate(struct sk_buff *skb) >>> { >>> - if (unlikely(INET_ECN_is_ce(tos))) { >>> + if (unlikely(INET_ECN_is_ce(OVS_CB(skb)->tun_key->ipv4_tos))) { >>> __be16 protocol = skb->protocol; >>> >>> skb_set_network_header(skb, ETH_HLEN); >>> @@ -416,7 +416,7 @@ static void ecn_decapsulate(struct sk_buff *skb, u8 tos) >>> * - skb->csum does not include the inner Ethernet header. >>> * - The layer pointers are undefined. >>> */ >>> -void ovs_tnl_rcv(struct vport *vport, struct sk_buff *skb, u8 tos) >>> +void ovs_tnl_rcv(struct vport *vport, struct sk_buff *skb) >>> { >>> struct ethhdr *eh; >>> >>> @@ -433,7 +433,7 @@ void ovs_tnl_rcv(struct vport *vport, struct sk_buff >>> *skb, u8 tos) >>> skb_clear_rxhash(skb); >>> secpath_reset(skb); >>> >>> - ecn_decapsulate(skb, tos); >>> + ecn_decapsulate(skb); >>> vlan_set_tci(skb, 0); >>> >>> if (unlikely(compute_ip_summed(skb, false))) { >>> @@ -613,7 +613,7 @@ static void ipv6_build_icmp(struct sk_buff *skb, struct >>> sk_buff *nskb, >>> >>> bool ovs_tnl_frag_needed(struct vport *vport, >>> const struct tnl_mutable_config *mutable, >>> - struct sk_buff *skb, unsigned int mtu, __be64 >>> flow_key) >>> + struct sk_buff *skb, unsigned int mtu) >>> { >>> unsigned int eth_hdr_len = ETH_HLEN; >>> unsigned int total_length = 0, header_length = 0, payload_length; >>> @@ -697,17 +697,6 @@ bool ovs_tnl_frag_needed(struct vport *vport, >>> ipv6_build_icmp(skb, nskb, mtu, payload_length); >>> #endif >>> >>> - /* >>> - * Assume that flow based keys are symmetric with respect to input >>> - * and output and use the key that we were going to put on the >>> - * outgoing packet for the fake received packet. If the keys are >>> - * not symmetric then PMTUD needs to be disabled since we won't have >>> - * any way of synthesizing packets. >>> - */ >>> - if ((mutable->flags & (TNL_F_IN_KEY_MATCH | TNL_F_OUT_KEY_ACTION)) >>> == >>> - (TNL_F_IN_KEY_MATCH | TNL_F_OUT_KEY_ACTION)) >>> - OVS_CB(nskb)->tun_id = flow_key; >>> - >>> if (unlikely(compute_ip_summed(nskb, false))) { >>> kfree_skb(nskb); >>> return false; >>> @@ -723,12 +712,20 @@ static bool check_mtu(struct sk_buff *skb, >>> const struct tnl_mutable_config *mutable, >>> const struct rtable *rt, __be16 *frag_offp) >>> { >>> - bool df_inherit = mutable->flags & TNL_F_DF_INHERIT; >>> + bool df_inherit; >>> bool pmtud = mutable->flags & TNL_F_PMTUD; >>> - __be16 frag_off = mutable->flags & TNL_F_DF_DEFAULT ? htons(IP_DF) >>> : 0; >>> + __be16 frag_off; >>> int mtu = 0; >>> unsigned int packet_length = skb->len - ETH_HLEN; >>> >>> + if (OVS_CB(skb)->tun_key->ipv4_dst != 0) { >>> + df_inherit = false; >>> + frag_off = OVS_CB(skb)->tun_key->tun_flags & >>> FLOW_TNL_F_DONT_FRAGMENT; >>> + } else { >>> + df_inherit = mutable->flags & TNL_F_DF_INHERIT; >>> + frag_off = mutable->flags & TNL_F_DF_DEFAULT ? htons(IP_DF) >>> : 0; >>> + } >>> + >> Why not set pmtud to false incase of flow-based tunneling like you did >> it for df_inherit? >> >>> /* Allow for one level of tagging in the packet length. */ >>> if (!vlan_tx_tag_present(skb) && >>> eth_hdr(skb)->h_proto == htons(ETH_P_8021Q)) >>> @@ -760,8 +757,7 @@ static bool check_mtu(struct sk_buff *skb, >>> mtu = max(mtu, IP_MIN_MTU); >>> >>> if (packet_length > mtu && >>> - ovs_tnl_frag_needed(vport, mutable, skb, mtu, >>> - OVS_CB(skb)->tun_id)) >>> + ovs_tnl_frag_needed(vport, mutable, skb, mtu)) >>> return false; >>> } >>> } >>> @@ -777,8 +773,7 @@ static bool check_mtu(struct sk_buff *skb, >>> mtu = max(mtu, IPV6_MIN_MTU); >>> >>> if (packet_length > mtu && >>> - ovs_tnl_frag_needed(vport, mutable, skb, mtu, >>> - OVS_CB(skb)->tun_id)) >>> + ovs_tnl_frag_needed(vport, mutable, skb, mtu)) >>> return false; >>> } >>> } >>> @@ -1000,15 +995,16 @@ unlock: >>> } >>> >>> static struct rtable *__find_route(const struct tnl_mutable_config *mutable, >>> - u8 ipproto, u8 tos) >>> + __be32 saddr, __be32 daddr, u8 ipproto, >>> + u8 tos) >>> { >>> /* Tunnel configuration keeps DSCP part of TOS bits, But Linux >>> * router expect RT_TOS bits only. */ >>> >>> #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,39) >>> struct flowi fl = { .nl_u = { .ip4_u = { >>> - .daddr = mutable->key.daddr, >>> - .saddr = mutable->key.saddr, >>> + .daddr = daddr, >>> + .saddr = saddr, >>> .tos = RT_TOS(tos) } }, >>> .proto = ipproto }; >>> struct rtable *rt; >>> @@ -1018,8 +1014,8 @@ static struct rtable *__find_route(const struct >>> tnl_mutable_config *mutable, >>> >>> return rt; >>> #else >>> - struct flowi4 fl = { .daddr = mutable->key.daddr, >>> - .saddr = mutable->key.saddr, >>> + struct flowi4 fl = { .daddr = daddr, >>> + .saddr = saddr, >>> .flowi4_tos = RT_TOS(tos), >>> .flowi4_proto = ipproto }; >>> >>> @@ -1029,7 +1025,8 @@ static struct rtable *__find_route(const struct >>> tnl_mutable_config *mutable, >>> >>> static struct rtable *find_route(struct vport *vport, >>> const struct tnl_mutable_config *mutable, >>> - u8 tos, struct tnl_cache **cache) >>> + __be32 saddr, __be32 daddr, u8 tos, >>> + struct tnl_cache **cache) >>> { >>> struct tnl_vport *tnl_vport = tnl_vport_priv(vport); >>> struct tnl_cache *cur_cache = rcu_dereference(tnl_vport->cache); >>> @@ -1037,18 +1034,21 @@ static struct rtable *find_route(struct vport >>> *vport, >>> *cache = NULL; >>> tos = RT_TOS(tos); >>> >>> - if (likely(tos == RT_TOS(mutable->tos) && >>> - check_cache_valid(cur_cache, mutable))) { >>> + if (daddr == mutable->key.daddr && saddr == mutable->key.saddr && >>> + tos == RT_TOS(mutable->tos) && >>> + check_cache_valid(cur_cache, mutable)) { >>> *cache = cur_cache; >>> return cur_cache->rt; >>> } else { >>> struct rtable *rt; >>> >>> - rt = __find_route(mutable, tnl_vport->tnl_ops->ipproto, >>> tos); >>> + rt = __find_route(mutable, saddr, daddr, >>> + tnl_vport->tnl_ops->ipproto, tos); >>> if (IS_ERR(rt)) >>> return NULL; >>> >>> - if (likely(tos == RT_TOS(mutable->tos))) >>> + if (daddr == mutable->key.daddr && saddr == >>> mutable->key.saddr && >>> + tos == RT_TOS(mutable->tos)) >>> *cache = build_cache(vport, mutable, rt); >>> >> I am not sure why checks are added on saddr and daddr here? >> header caching is not set in case of flow based tunneling to >> build_cache call is no-op anyways. >> >>> return rt; >>> @@ -1178,8 +1178,11 @@ int ovs_tnl_send(struct vport *vport, struct sk_buff >>> *skb) >>> struct rtable *rt; >>> struct dst_entry *unattached_dst = NULL; >>> struct tnl_cache *cache; >>> + struct ovs_key_ipv4_tunnel tun_key; >>> int sent_len = 0; >>> __be16 frag_off = 0; >>> + __be32 daddr; >>> + __be32 saddr; >>> u8 ttl; >>> u8 inner_tos; >>> u8 tos; >>> @@ -1207,6 +1210,13 @@ int ovs_tnl_send(struct vport *vport, struct sk_buff >>> *skb) >>> } >>> #endif >>> >>> + /* If OVS_CB(skb)->tun_key is NULL, point it at the local tun_key >>> here, >>> + * and zero it out. >>> + */ >>> + if (OVS_CB(skb)->tun_key == NULL) { >>> + memset(&tun_key, 0, sizeof(tun_key)); >>> + OVS_CB(skb)->tun_key = &tun_key; >>> + } >>> /* ToS */ >>> if (skb->protocol == htons(ETH_P_IP)) >>> inner_tos = ip_hdr(skb)->tos; >>> @@ -1219,11 +1229,23 @@ int ovs_tnl_send(struct vport *vport, struct >>> sk_buff *skb) >>> >>> if (mutable->flags & TNL_F_TOS_INHERIT) >>> tos = inner_tos; >>> + else if (OVS_CB(skb)->tun_key->ipv4_dst != 0) >>> + tos = OVS_CB(skb)->tun_key->ipv4_tos; >>> else >>> tos = mutable->tos; >>> >>> + if (OVS_CB(skb)->tun_key->ipv4_dst != 0) >>> + daddr = OVS_CB(skb)->tun_key->ipv4_dst; >>> + else >>> + daddr = mutable->key.daddr; >>> + >>> + if (OVS_CB(skb)->tun_key->ipv4_dst != 0) >>> + saddr = OVS_CB(skb)->tun_key->ipv4_src; >>> + else >>> + saddr = mutable->key.saddr; >>> + >> >> It will be easier to read if we have one if check on >> OVS_CB(skb)->tun_key->ipv4_dst which is retrieve all tunnel parameter >> for flow-based tunning and else block for campat code. >> >>> /* Route lookup */ >>> - rt = find_route(vport, mutable, tos, &cache); >>> + rt = find_route(vport, mutable, saddr, daddr, tos, &cache); >>> if (unlikely(!rt)) >>> goto error_free; >>> if (unlikely(!cache)) >>> @@ -1260,9 +1282,13 @@ int ovs_tnl_send(struct vport *vport, struct sk_buff >>> *skb) >>> } >>> >>> /* TTL */ >>> - ttl = mutable->ttl; >>> - if (!ttl) >>> - ttl = ip4_dst_hoplimit(&rt_dst(rt)); >>> + if (OVS_CB(skb)->tun_key->ipv4_dst != 0) >>> + ttl = OVS_CB(skb)->tun_key->ipv4_ttl; >>> + else { >>> + ttl = mutable->ttl; >>> + if (!ttl) >>> + ttl = ip4_dst_hoplimit(&rt_dst(rt)); >>> + } >>> >>> if (mutable->flags & TNL_F_TTL_INHERIT) { >> >> This check could be in the compat code block, so it is less confusing. >> >>> if (skb->protocol == htons(ETH_P_IP)) >>> @@ -1442,7 +1468,8 @@ static int tnl_set_config(struct net *net, struct >>> nlattr *options, >>> struct net_device *dev; >>> struct rtable *rt; >>> >>> - rt = __find_route(mutable, tnl_ops->ipproto, mutable->tos); >>> + rt = __find_route(mutable, mutable->key.saddr, >>> mutable->key.daddr, >>> + tnl_ops->ipproto, mutable->tos); >>> if (IS_ERR(rt)) >>> return -EADDRNOTAVAIL; >>> dev = rt_dst(rt).dev; >>> diff --git a/datapath/tunnel.h b/datapath/tunnel.h >>> index 1924017..b7858d3 100644 >>> --- a/datapath/tunnel.h >>> +++ b/datapath/tunnel.h >>> @@ -269,14 +269,14 @@ int ovs_tnl_set_addr(struct vport *vport, const >>> unsigned char *addr); >>> const char *ovs_tnl_get_name(const struct vport *vport); >>> const unsigned char *ovs_tnl_get_addr(const struct vport *vport); >>> int ovs_tnl_send(struct vport *vport, struct sk_buff *skb); >>> -void ovs_tnl_rcv(struct vport *vport, struct sk_buff *skb, u8 tos); >>> +void ovs_tnl_rcv(struct vport *vport, struct sk_buff *skb); >>> >>> struct vport *ovs_tnl_find_port(struct net *net, __be32 saddr, __be32 daddr, >>> __be64 key, int tunnel_type, >>> const struct tnl_mutable_config **mutable); >>> bool ovs_tnl_frag_needed(struct vport *vport, >>> const struct tnl_mutable_config *mutable, >>> - struct sk_buff *skb, unsigned int mtu, __be64 >>> flow_key); >>> + struct sk_buff *skb, unsigned int mtu); >>> void ovs_tnl_free_linked_skbs(struct sk_buff *skb); >>> >>> int ovs_tnl_init(void); >>> @@ -286,4 +286,15 @@ static inline struct tnl_vport *tnl_vport_priv(const >>> struct vport *vport) >>> return vport_priv(vport); >>> } >>> >>> +static inline void tnl_tun_key_init(struct ovs_key_ipv4_tunnel *tun_key, >>> + const struct iphdr *iph, __be64 tun_id) >>> +{ >>> + tun_key->tun_id = tun_id; >>> + tun_key->ipv4_src = iph->saddr; >>> + tun_key->ipv4_dst = iph->daddr; >>> + tun_key->ipv4_tos = iph->tos; >>> + tun_key->ipv4_ttl = iph->ttl; >>> + tun_key->tun_flags = 0; >>> +} >>> + >>> #endif /* tunnel.h */ >>> diff --git a/datapath/vport-capwap.c b/datapath/vport-capwap.c >>> index 05a099d..fbf50d9 100644 >>> --- a/datapath/vport-capwap.c >>> +++ b/datapath/vport-capwap.c >>> @@ -220,7 +220,7 @@ static struct sk_buff *capwap_update_header(const >>> struct vport *vport, >>> struct capwaphdr_wsi *wsi = (struct capwaphdr_wsi *)(cwh + 1); >>> struct capwaphdr_wsi_key *opt = (struct capwaphdr_wsi_key >>> *)(wsi + 1); >>> >>> - opt->key = OVS_CB(skb)->tun_id; >>> + opt->key = OVS_CB(skb)->tun_key->tun_id; >>> } >>> >>> udph->len = htons(skb->len - skb_transport_offset(skb)); >>> @@ -316,6 +316,7 @@ static int capwap_rcv(struct sock *sk, struct sk_buff >>> *skb) >>> struct vport *vport; >>> const struct tnl_mutable_config *mutable; >>> struct iphdr *iph; >>> + struct ovs_key_ipv4_tunnel tun_key; >>> __be64 key = 0; >>> >>> if (unlikely(!pskb_may_pull(skb, CAPWAP_MIN_HLEN + ETH_HLEN))) >>> @@ -333,12 +334,11 @@ static int capwap_rcv(struct sock *sk, struct sk_buff >>> *skb) >>> goto error; >>> } >>> >>> - if (mutable->flags & TNL_F_IN_KEY_MATCH) >>> - OVS_CB(skb)->tun_id = key; >>> - else >>> - OVS_CB(skb)->tun_id = 0; >>> + tnl_tun_key_init(&tun_key, iph, >>> + mutable->flags & TNL_F_IN_KEY_MATCH ? key : 0); >>> + OVS_CB(skb)->tun_key = &tun_key; >>> >>> - ovs_tnl_rcv(vport, skb, iph->tos); >>> + ovs_tnl_rcv(vport, skb); >>> goto out; >>> >>> error: >>> diff --git a/datapath/vport-gre.c b/datapath/vport-gre.c >>> index ab89c5b..8076b1d 100644 >>> --- a/datapath/vport-gre.c >>> +++ b/datapath/vport-gre.c >>> @@ -103,7 +103,7 @@ static struct sk_buff *gre_update_header(const struct >>> vport *vport, >>> >>> /* Work backwards over the options so the checksum is last. */ >>> if (mutable->flags & TNL_F_OUT_KEY_ACTION) >>> - *options = be64_get_low32(OVS_CB(skb)->tun_id); >>> + *options = be64_get_low32(OVS_CB(skb)->tun_key->tun_id); >>> >> Here we do need check if we doing flow-based tunneling. in case of >> flow-based tunneling mutable is shared between all tunnels, so there >> is no-way to we can do port specific flags check from mutable. about >> GRE checksum, we need to check for FLOW_TNL_F_CSUM flag for doing >> checksum on a packet. >> > > For both of these, to make this check I need to modify structurally the code, > because > I don't have access to the skbuff with the current code. So I need to modify > the > tunneling code to access that. Before proceeding down that path, wanted to > verify > this direction seems ok, because this seems like a larger change. > > Thanks, > Kyle > Scratch this comment, apparently I was looking at the wrong code or had not had enough coffee yet today.
Thanks, Kyle >>> if (mutable->out_key || mutable->flags & TNL_F_OUT_KEY_ACTION) >>> options--; >>> @@ -285,7 +285,7 @@ static void gre_err(struct sk_buff *skb, u32 info) >>> #endif >>> >>> __skb_pull(skb, tunnel_hdr_len); >>> - ovs_tnl_frag_needed(vport, mutable, skb, mtu, key); >>> + ovs_tnl_frag_needed(vport, mutable, skb, mtu); >>> __skb_push(skb, tunnel_hdr_len); >>> >> same here, tunnel_hdr_len is stored in mutable which is going to be >> shared for all tunnels for a type in flow based tunneling, so we need >> to calculate it on packet send. >> >>> out: >>> @@ -327,6 +327,7 @@ static int gre_rcv(struct sk_buff *skb) >>> const struct tnl_mutable_config *mutable; >>> int hdr_len; >>> struct iphdr *iph; >>> + struct ovs_key_ipv4_tunnel tun_key; >>> __be16 flags; >>> __be64 key; >>> >>> @@ -351,15 +352,15 @@ static int gre_rcv(struct sk_buff *skb) >>> goto error; >>> } >>> >>> - if (mutable->flags & TNL_F_IN_KEY_MATCH) >>> - OVS_CB(skb)->tun_id = key; >>> - else >>> - OVS_CB(skb)->tun_id = 0; >>> + >>> + tnl_tun_key_init(&tun_key, iph, >>> + mutable->flags & TNL_F_IN_KEY_MATCH ? key : 0); >>> + OVS_CB(skb)->tun_key = &tun_key; >>> >>> __skb_pull(skb, hdr_len); >>> skb_postpull_rcsum(skb, skb_transport_header(skb), hdr_len + >>> ETH_HLEN); >>> >>> - ovs_tnl_rcv(vport, skb, iph->tos); >>> + ovs_tnl_rcv(vport, skb); >>> return 0; >>> >>> error: >>> diff --git a/datapath/vport.c b/datapath/vport.c >>> index dc7adfa..e8279fb 100644 >>> --- a/datapath/vport.c >>> +++ b/datapath/vport.c >>> @@ -462,7 +462,7 @@ void ovs_vport_receive(struct vport *vport, struct >>> sk_buff *skb) >>> OVS_CB(skb)->flow = NULL; >>> >>> if (!(vport->ops->flags & VPORT_F_TUN_ID)) >>> - OVS_CB(skb)->tun_id = 0; >>> + OVS_CB(skb)->tun_key = NULL; >>> >>> ovs_dp_process_received_packet(vport, skb); >>> } >>> diff --git a/include/linux/openvswitch.h b/include/linux/openvswitch.h >>> index f5c9cca..8334cc1 100644 >>> --- a/include/linux/openvswitch.h >>> +++ b/include/linux/openvswitch.h >>> @@ -278,7 +278,8 @@ enum ovs_key_attr { >>> OVS_KEY_ATTR_ICMPV6, /* struct ovs_key_icmpv6 */ >>> OVS_KEY_ATTR_ARP, /* struct ovs_key_arp */ >>> OVS_KEY_ATTR_ND, /* struct ovs_key_nd */ >>> - OVS_KEY_ATTR_TUN_ID = 63, /* be64 tunnel ID */ >>> + OVS_KEY_ATTR_IPV4_TUNNEL = 62, /* struct ovs_key_ipv4_tunnel */ >>> + OVS_KEY_ATTR_TUN_ID = 63, /* be64 tunnel ID */ >>> __OVS_KEY_ATTR_MAX >>> }; >>> >>> @@ -360,6 +361,21 @@ struct ovs_key_nd { >>> __u8 nd_tll[6]; >>> }; >>> >>> +/* Values for ovs_key_ipv4_tunnel->tun_flags */ >>> +#define FLOW_TNL_F_DONT_FRAGMENT (1 << 0) >>> +#define FLOW_TNL_F_CSUM (1 << 1) >>> +#define FLOW_TNL_F_KEY (1 << 2) >>> + >>> +struct ovs_key_ipv4_tunnel { >>> + __be64 tun_id; >>> + __u32 tun_flags; >>> + __be32 ipv4_src; >>> + __be32 ipv4_dst; >>> + __u8 ipv4_tos; >>> + __u8 ipv4_ttl; >>> + __u8 pad[2]; >>> +}; >>> + >>> /** >>> * enum ovs_flow_attr - attributes for %OVS_FLOW_* commands. >>> * @OVS_FLOW_ATTR_KEY: Nested %OVS_KEY_ATTR_* attributes specifying the flow >>> diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c >>> index c9e3210..797cb06 100644 >>> --- a/lib/dpif-netdev.c >>> +++ b/lib/dpif-netdev.c >>> @@ -1179,6 +1179,7 @@ execute_set_action(struct ofpbuf *packet, const >>> struct nlattr *a) >>> case OVS_KEY_ATTR_TUN_ID: >>> case OVS_KEY_ATTR_PRIORITY: >>> case OVS_KEY_ATTR_IPV6: >>> + case OVS_KEY_ATTR_IPV4_TUNNEL: >>> /* not implemented */ >>> break; >>> >>> diff --git a/lib/odp-util.c b/lib/odp-util.c >>> index 257d7a7..9ed17ed 100644 >>> --- a/lib/odp-util.c >>> +++ b/lib/odp-util.c >>> @@ -93,6 +93,8 @@ ovs_key_attr_to_string(enum ovs_key_attr attr) >>> case OVS_KEY_ATTR_UNSPEC: return "unspec"; >>> case OVS_KEY_ATTR_ENCAP: return "encap"; >>> case OVS_KEY_ATTR_PRIORITY: return "priority"; >>> + case OVS_KEY_ATTR_TUN_ID: return "tun_id"; >>> + case OVS_KEY_ATTR_IPV4_TUNNEL: return "ipv4_tunnel"; >>> case OVS_KEY_ATTR_IN_PORT: return "in_port"; >>> case OVS_KEY_ATTR_ETHERNET: return "eth"; >>> case OVS_KEY_ATTR_VLAN: return "vlan"; >>> @@ -105,7 +107,6 @@ ovs_key_attr_to_string(enum ovs_key_attr attr) >>> case OVS_KEY_ATTR_ICMPV6: return "icmpv6"; >>> case OVS_KEY_ATTR_ARP: return "arp"; >>> case OVS_KEY_ATTR_ND: return "nd"; >>> - case OVS_KEY_ATTR_TUN_ID: return "tun_id"; >>> >>> case __OVS_KEY_ATTR_MAX: >>> default: >>> @@ -602,6 +603,7 @@ odp_flow_key_attr_len(uint16_t type) >>> case OVS_KEY_ATTR_ENCAP: return -2; >>> case OVS_KEY_ATTR_PRIORITY: return 4; >>> case OVS_KEY_ATTR_TUN_ID: return 8; >>> + case OVS_KEY_ATTR_IPV4_TUNNEL: return sizeof(struct >>> ovs_key_ipv4_tunnel); >>> case OVS_KEY_ATTR_IN_PORT: return 4; >>> case OVS_KEY_ATTR_ETHERNET: return sizeof(struct ovs_key_ethernet); >>> case OVS_KEY_ATTR_VLAN: return sizeof(ovs_be16); >>> @@ -668,6 +670,7 @@ format_odp_key_attr(const struct nlattr *a, struct ds >>> *ds) >>> const struct ovs_key_icmpv6 *icmpv6_key; >>> const struct ovs_key_arp *arp_key; >>> const struct ovs_key_nd *nd_key; >>> + const struct ovs_key_ipv4_tunnel *ipv4_tun_key; >>> enum ovs_key_attr attr = nl_attr_type(a); >>> int expected_len; >>> >>> @@ -698,6 +701,16 @@ format_odp_key_attr(const struct nlattr *a, struct ds >>> *ds) >>> ds_put_format(ds, "(%#"PRIx64")", ntohll(nl_attr_get_be64(a))); >>> break; >>> >>> + case OVS_KEY_ATTR_IPV4_TUNNEL: >>> + ipv4_tun_key = nl_attr_get(a); >>> + ds_put_format(ds, "(tun_id=0x%"PRIx64",flags=0x%"PRIx32 >>> + >>> ",src="IP_FMT",dst="IP_FMT",tos=0x%"PRIx8",ttl=%"PRIu8")", >>> + ntohll(ipv4_tun_key->tun_id), >>> ipv4_tun_key->tun_flags, >>> + IP_ARGS(&ipv4_tun_key->ipv4_src), >>> + IP_ARGS(&ipv4_tun_key->ipv4_dst), >>> + ipv4_tun_key->ipv4_tos, ipv4_tun_key->ipv4_ttl); >>> + break; >>> + >>> case OVS_KEY_ATTR_IN_PORT: >>> ds_put_format(ds, "(%"PRIu32")", nl_attr_get_u32(a)); >>> break; >>> diff --git a/lib/odp-util.h b/lib/odp-util.h >>> index 16f2b15..57073ba 100644 >>> --- a/lib/odp-util.h >>> +++ b/lib/odp-util.h >>> @@ -80,6 +80,7 @@ int odp_actions_from_string(const char *, const struct >>> simap *port_names, >>> * ------ --- ------ ----- >>> * OVS_KEY_ATTR_PRIORITY 4 -- 4 8 >>> * OVS_KEY_ATTR_TUN_ID 8 -- 4 12 >>> + * OVS_KEY_ATTR_IPV4_TUNNEL 24 -- 4 28 >>> * OVS_KEY_ATTR_IN_PORT 4 -- 4 8 >>> * OVS_KEY_ATTR_ETHERNET 12 -- 4 16 >>> * OVS_KEY_ATTR_ETHERTYPE 2 2 4 8 (outer VLAN ethertype) >>> @@ -90,7 +91,7 @@ int odp_actions_from_string(const char *, const struct >>> simap *port_names, >>> * OVS_KEY_ATTR_ICMPV6 2 2 4 8 >>> * OVS_KEY_ATTR_ND 28 -- 4 32 >>> * ------------------------------------------------- >>> - * total 156 >>> + * total 184 >>> * >>> * We include some slack space in case the calculation isn't quite right or >>> we >>> * add another field and forget to adjust this value. >>> -- >>> 1.7.11.4 >>> >>> _______________________________________________ >>> dev mailing list >>> dev@openvswitch.org >>> http://openvswitch.org/mailman/listinfo/dev > > _______________________________________________ > dev mailing list > dev@openvswitch.org > http://openvswitch.org/mailman/listinfo/dev _______________________________________________ dev mailing list dev@openvswitch.org http://openvswitch.org/mailman/listinfo/dev