On Wed, Oct 03, 2012 at 12:17:16PM -0700, Jesse Gross wrote: > From: Leo Alterman <lalter...@nicira.com> > > Allow datapath to recognize and extract MPLS labels into flow keys > and execute actions which push, pop, and set labels on packets.
Hi, I now have this code working in conjunction with a modified version of Ravi's user-space code. I will post a patchset which works under for my light testing shortly. What follows are details of changes I made to this patch in order for it to work in my setup. Firstly, I needed the following to unsure that diff --git a/datapath/datapath.c b/datapath/datapath.c index 79c50af..8ff244c 100644 --- a/datapath/datapath.c +++ b/datapath/datapath.c @@ -861,6 +864,8 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info) OVS_CB(packet)->flow = flow; packet->priority = flow->key.phy.priority; + skb_cb_set_mpls_stack(packet); + rcu_read_lock(); dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex); err = -ENODEV; The rest of my proposed changes are inline below. > > Signed-off-by: Leo Alterman <lalter...@nicira.com> > -- > I haven't reviewed this but hopefully it can be used a good starting point. > --- > datapath/actions.c | 81 > +++++++++++++++++++++++++++++++++++++++++++ > datapath/datapath.c | 56 ++++++++++++++++++++++++++++++ > datapath/datapath.h | 8 +++++ > datapath/flow.c | 30 ++++++++++++++++ > datapath/flow.h | 7 ++++ > datapath/vport.c | 2 ++ > include/linux/openvswitch.h | 32 +++++++++++++++++ > 7 files changed, 216 insertions(+) > > diff --git a/datapath/actions.c b/datapath/actions.c > index 208f260..a199125 100644 > --- a/datapath/actions.c > +++ b/datapath/actions.c > @@ -47,6 +47,67 @@ static int make_writable(struct sk_buff *skb, int > write_len) > return pskb_expand_head(skb, 0, 0, GFP_ATOMIC); > } > > +static __be16 get_ethertype(const struct sk_buff *skb) > +{ > + struct ethhdr *hdr = (struct ethhdr *)(skb_cb_mpls_stack(skb) - > ETH_HLEN); > + return hdr->h_proto; > +} > + > +static void set_ethertype(struct sk_buff *skb, const __be16 ethertype) > +{ > + struct ethhdr *hdr = (struct ethhdr *)(skb_cb_mpls_stack(skb) - > ETH_HLEN); > + hdr->h_proto = ethertype; > +} > + > +static int push_mpls(struct sk_buff *skb, const struct ovs_action_push_mpls > *mpls) > +{ > + u32 l2_size; > + __be32 *new_mpls_label; > + > + if (skb_cow_head(skb, MPLS_HLEN) < 0) { > + kfree_skb(skb); > + return -ENOMEM; > + } > + > + l2_size = skb_cb_mpls_stack_offset(skb); > + skb_push(skb, MPLS_HLEN); > + memmove(skb_mac_header(skb) - MPLS_HLEN, skb_mac_header(skb), l2_size); > + skb_reset_mac_header(skb); > + > + new_mpls_label = (__be32 *)(skb_mac_header(skb) + l2_size); > + *new_mpls_label = mpls->mpls_label; > + > + set_ethertype(skb, mpls->mpls_ethertype); > + return 0; > +} > + > +static int pop_mpls(struct sk_buff *skb, const __be16 *ethertype) > +{ > + __be16 current_ethertype = get_ethertype(skb); > + if (current_ethertype == htons(ETH_P_MPLS_UC) || > + current_ethertype == htons(ETH_P_MPLS_MC)) { > + u32 l2_size = skb_cb_mpls_stack_offset(skb); > + > + memmove(skb_mac_header(skb) + MPLS_HLEN, skb_mac_header(skb), > l2_size); > + > + skb_pull(skb, MPLS_HLEN); > + skb_reset_mac_header(skb); > + > + set_ethertype(skb, *ethertype); > + } > + return 0; > +} > + > +static int set_mpls(struct sk_buff *skb, const __be32 *mpls_label) > +{ > + __be16 current_ethertype = get_ethertype(skb); > + if (current_ethertype == htons(ETH_P_MPLS_UC) || > + current_ethertype == htons(ETH_P_MPLS_MC)) { > + memcpy(skb_cb_mpls_stack(skb), mpls_label, sizeof(__be32)); > + } > + return 0; > +} > + > /* remove VLAN header from packet and update csum accrodingly. */ > static int __pop_vlan_tci(struct sk_buff *skb, __be16 *current_tci) > { > @@ -100,6 +161,9 @@ static int pop_vlan(struct sk_buff *skb) > return err; > > __vlan_hwaccel_put_tag(skb, ntohs(tci)); > + > + /* update pointer to MPLS label stack */ > + skb_cb_set_mpls_stack(skb); > return 0; > } > > @@ -120,6 +184,9 @@ static int push_vlan(struct sk_buff *skb, const struct > ovs_action_push_vlan *vla > > } > __vlan_hwaccel_put_tag(skb, ntohs(vlan->vlan_tci) & ~VLAN_TAG_PRESENT); > + > + /* update pointer to MPLS label stack */ > + skb_cb_set_mpls_stack(skb); > return 0; > } > > @@ -396,6 +463,20 @@ static int do_execute_actions(struct datapath *dp, > struct sk_buff *skb, > output_userspace(dp, skb, a); > break; > > + case OVS_ACTION_ATTR_PUSH_MPLS: > + err = push_mpls(skb, nla_data(a)); > + if (unlikely(err)) /* skb already freed. */ > + return err; > + break; > + > + case OVS_ACTION_ATTR_POP_MPLS: > + err = pop_mpls(skb, nla_data(a)); > + break; > + > + case OVS_ACTION_ATTR_SET_MPLS: > + err = set_mpls(skb, nla_data(a)); > + break; > + > case OVS_ACTION_ATTR_PUSH_VLAN: > err = push_vlan(skb, nla_data(a)); > if (unlikely(err)) /* skb already freed. */ > diff --git a/datapath/datapath.c b/datapath/datapath.c > index c83ce16..91b70bb 100644 > --- a/datapath/datapath.c > +++ b/datapath/datapath.c > @@ -74,6 +74,41 @@ int ovs_net_id __read_mostly; > int (*ovs_dp_ioctl_hook)(struct net_device *dev, struct ifreq *rq, int cmd); > EXPORT_SYMBOL(ovs_dp_ioctl_hook); > > +void skb_cb_set_mpls_stack(struct sk_buff *skb) > +{ > + struct ethhdr *eth; > + int nh_ofs; > + __be16 dl_type = 0; > + > + eth = eth_hdr(skb); > + nh_ofs = sizeof(struct ethhdr); > + if (likely(eth->h_proto >= htons(ETH_TYPE_MIN))) { > + dl_type = eth->h_proto; > + > + while (dl_type == htons(ETH_P_8021Q) && > + skb->len >= nh_ofs + sizeof(struct vlan_hdr)) { > + struct vlan_hdr *vh = (struct vlan_hdr*)(skb->data + > nh_ofs); > + dl_type = vh->h_vlan_encapsulated_proto; > + nh_ofs += sizeof(struct vlan_hdr); > + } > + > + OVS_CB(skb)->mpls_stack = nh_ofs; > + } else { > + OVS_CB(skb)->mpls_stack = 0; > + } > +} > + > +unsigned char *skb_cb_mpls_stack(const struct sk_buff *skb) > +{ > + return OVS_CB(skb)->mpls_stack ? > + skb_mac_header(skb) + > OVS_CB(skb)->mpls_stack : 0; The indentation on the line above is suboptimal. > +} > + > +ptrdiff_t skb_cb_mpls_stack_offset(const struct sk_buff *skb) > +{ > + return OVS_CB(skb)->mpls_stack; > +} > + > /** > * DOC: Locking: > * > @@ -663,12 +698,17 @@ static int validate_actions(const struct nlattr *attr, > static const u32 action_lens[OVS_ACTION_ATTR_MAX + 1] = { > [OVS_ACTION_ATTR_OUTPUT] = sizeof(u32), > [OVS_ACTION_ATTR_USERSPACE] = (u32)-1, > + [OVS_ACTION_ATTR_PUSH_MPLS] = sizeof(struct > ovs_action_push_mpls), > + [OVS_ACTION_ATTR_POP_MPLS] = sizeof(__be16), > + [OVS_ACTION_ATTR_SET_MPLS] = sizeof(__be32), > [OVS_ACTION_ATTR_PUSH_VLAN] = sizeof(struct > ovs_action_push_vlan), > [OVS_ACTION_ATTR_POP_VLAN] = 0, > [OVS_ACTION_ATTR_SET] = (u32)-1, > [OVS_ACTION_ATTR_SAMPLE] = (u32)-1 > }; > const struct ovs_action_push_vlan *vlan; > + __be16 mpls_ethertype; > + __be32 mpls_label; > int type = nla_type(a); > > if (type > OVS_ACTION_ATTR_MAX || > @@ -691,6 +731,22 @@ static int validate_actions(const struct nlattr *attr, > return -EINVAL; > break; > > + case OVS_ACTION_ATTR_PUSH_MPLS: > + mpls_ethertype = nla_get_be16(a); > + if (mpls_ethertype != htons(ETH_P_MPLS_UC) && > + mpls_ethertype != htons(ETH_P_MPLS_MC)) > + return -EINVAL; > + break; The case above does not seem to function because the attribute is a struct struct ovs_action_push_mpls where the ether_type is preceded by a 32bit label. The following seems to work for me: case OVS_ACTION_ATTR_PUSH_MPLS: { const struct ovs_action_push_mpls *mpls = nla_data(a); if (mpls->mpls_ethertype != htons(ETH_P_MPLS_UC) && mpls->mpls_ethertype != htons(ETH_P_MPLS_MC)) return -EINVAL; break; } > + > + case OVS_ACTION_ATTR_POP_MPLS: > + break; > + > + case OVS_ACTION_ATTR_SET_MPLS: > + mpls_label = nla_get_be32(a); > + if (mpls_label == htonl(0)) { > + return -EINVAL; > + } > + break; > > case OVS_ACTION_ATTR_POP_VLAN: > break; > diff --git a/datapath/datapath.h b/datapath/datapath.h > index affbf0e..1801ccd 100644 > --- a/datapath/datapath.h > +++ b/datapath/datapath.h > @@ -97,6 +97,9 @@ struct datapath { > * struct ovs_skb_cb - OVS data in skb CB > * @flow: The flow associated with this packet. May be %NULL if no flow. > * @tun_id: ID of the tunnel that encapsulated this packet. It is 0 if the > + * packet is not being tunneled. > + * @mpls_stack: Offset of the packet's MPLS stack from the beginning of the > + * ethernet frame. It is 0 if no MPLS stack is present. > * @ip_summed: Consistently stores L4 checksumming status across different > * kernel versions. > * @csum_start: Stores the offset from which to start checksumming > independent > @@ -108,6 +111,7 @@ struct datapath { > struct ovs_skb_cb { > struct sw_flow *flow; > __be64 tun_id; > + ptrdiff_t mpls_stack; > #ifdef NEED_CSUM_NORMALIZE > enum csum_type ip_summed; > u16 csum_start; > @@ -192,4 +196,8 @@ struct sk_buff *ovs_vport_cmd_build_info(struct vport *, > u32 pid, u32 seq, > u8 cmd); > > int ovs_execute_actions(struct datapath *dp, struct sk_buff *skb); > + > +void skb_cb_set_mpls_stack(struct sk_buff *skb); > +unsigned char *skb_cb_mpls_stack(const struct sk_buff *skb); > +ptrdiff_t skb_cb_mpls_stack_offset(const struct sk_buff *skb); > #endif /* datapath.h */ > diff --git a/datapath/flow.c b/datapath/flow.c > index d07337c..40df8ff 100644 > --- a/datapath/flow.c > +++ b/datapath/flow.c > @@ -739,6 +739,14 @@ int ovs_flow_extract(struct sk_buff *skb, u16 in_port, > struct sw_flow_key *key, > key_len = SW_FLOW_KEY_OFFSET(ipv4.arp); > } > } > + } else if (key->eth.type == htons(ETH_P_MPLS_UC) || > + key->eth.type == htons(ETH_P_MPLS_MC)) { > + error = check_header(skb, MPLS_HLEN); > + if (unlikely(error)) > + goto out; > + > + key_len = SW_FLOW_KEY_OFFSET(mpls.top_label); > + memcpy(&key->mpls.top_label, skb_network_header(skb), > MPLS_HLEN); > } else if (key->eth.type == htons(ETH_P_IPV6)) { > int nh_len; /* IPv6 Header + Extensions */ > > @@ -836,6 +844,7 @@ const int ovs_key_lens[OVS_KEY_ATTR_MAX + 1] = { > [OVS_KEY_ATTR_ETHERNET] = sizeof(struct ovs_key_ethernet), > [OVS_KEY_ATTR_VLAN] = sizeof(__be16), > [OVS_KEY_ATTR_ETHERTYPE] = sizeof(__be16), > + [OVS_KEY_ATTR_MPLS] = sizeof(struct ovs_key_mpls), > [OVS_KEY_ATTR_IPV4] = sizeof(struct ovs_key_ipv4), > [OVS_KEY_ATTR_IPV6] = sizeof(struct ovs_key_ipv6), > [OVS_KEY_ATTR_TCP] = sizeof(struct ovs_key_tcp), > @@ -1141,6 +1150,17 @@ int ovs_flow_from_nlattrs(struct sw_flow_key *swkey, > int *key_lenp, > swkey->ip.proto = ntohs(arp_key->arp_op); > memcpy(swkey->ipv4.arp.sha, arp_key->arp_sha, ETH_ALEN); > memcpy(swkey->ipv4.arp.tha, arp_key->arp_tha, ETH_ALEN); > + } else if (swkey->eth.type == htons(ETH_P_MPLS_UC) || > + swkey->eth.type == htons(ETH_P_MPLS_MC)) { > + const struct ovs_key_mpls *mpls_key; > + > + if (!(attrs & (1 << OVS_KEY_ATTR_MPLS))) > + return -EINVAL; > + attrs &= ~(1 << OVS_KEY_ATTR_MPLS); > + > + key_len = SW_FLOW_KEY_OFFSET(mpls.top_label); > + mpls_key = nla_data(a[OVS_KEY_ATTR_MPLS]); > + swkey->mpls.top_label = mpls_key->mpls_top_label; > } > > if (attrs) > @@ -1284,6 +1304,16 @@ int ovs_flow_to_nlattrs(const struct sw_flow_key > *swkey, struct sk_buff *skb) > arp_key->arp_op = htons(swkey->ip.proto); > memcpy(arp_key->arp_sha, swkey->ipv4.arp.sha, ETH_ALEN); > memcpy(arp_key->arp_tha, swkey->ipv4.arp.tha, ETH_ALEN); > + } else if (swkey->eth.type == htons(ETH_P_MPLS_UC) || > + swkey->eth.type == htons(ETH_P_MPLS_MC)) { > + struct ovs_key_mpls *mpls_key; > + > + nla = nla_reserve(skb, OVS_KEY_ATTR_MPLS, sizeof(*mpls_key)); > + if (!nla) > + goto nla_put_failure; > + mpls_key = nla_data(nla); > + memset(mpls_key, 0, sizeof(struct ovs_key_mpls)); > + mpls_key->mpls_top_label = swkey->mpls.top_label; > } > > if ((swkey->eth.type == htons(ETH_P_IP) || > diff --git a/datapath/flow.h b/datapath/flow.h > index 5be481e..c18183b 100644 > --- a/datapath/flow.h > +++ b/datapath/flow.h > @@ -53,6 +53,9 @@ struct sw_flow_key { > __be16 type; /* Ethernet frame type. */ > } eth; > struct { > + __be32 top_label; /* 0 if no MPLS, top label from stack > otherwise */ > + } mpls; > + struct { > u8 proto; /* IP protocol or lower 8 bits of ARP > opcode. */ > u8 tos; /* IP ToS. */ > u8 ttl; /* IP TTL/hop limit. */ > @@ -126,6 +129,10 @@ struct arp_eth_header { > unsigned char ar_tip[4]; /* target IP address */ > } __packed; > > +#define ETH_TYPE_MIN 0x600 > + > +#define MPLS_HLEN 4 > + > int ovs_flow_init(void); > void ovs_flow_exit(void); > > diff --git a/datapath/vport.c b/datapath/vport.c > index 172261a..0664e4c 100644 > --- a/datapath/vport.c > +++ b/datapath/vport.c > @@ -464,6 +464,8 @@ void ovs_vport_receive(struct vport *vport, struct > sk_buff *skb) > if (!(vport->ops->flags & VPORT_F_TUN_ID)) > OVS_CB(skb)->tun_id = 0; I have encountered situations where skb_cb_set_mpls_stack() panics because eth_hdr(skb) returns NULL. I'm unsure if this is the best fix, but adding the following here seems to resolve the problem that I observed. skb_cb_set_mpls_stack(skb); > > + skb_cb_set_mpls_stack(skb); > + > ovs_dp_process_received_packet(vport, skb); > } > > diff --git a/include/linux/openvswitch.h b/include/linux/openvswitch.h > index f5c9cca..78960d1 100644 > --- a/include/linux/openvswitch.h > +++ b/include/linux/openvswitch.h > @@ -278,6 +278,7 @@ enum ovs_key_attr { > OVS_KEY_ATTR_ICMPV6, /* struct ovs_key_icmpv6 */ > OVS_KEY_ATTR_ARP, /* struct ovs_key_arp */ > OVS_KEY_ATTR_ND, /* struct ovs_key_nd */ > + OVS_KEY_ATTR_MPLS, /* struct ovs_key_mpls */ > OVS_KEY_ATTR_TUN_ID = 63, /* be64 tunnel ID */ > __OVS_KEY_ATTR_MAX > }; > @@ -307,6 +308,10 @@ struct ovs_key_ethernet { > __u8 eth_dst[6]; > }; > > +struct ovs_key_mpls { > + __be32 mpls_top_label; > +}; > + > struct ovs_key_ipv4 { > __be32 ipv4_src; > __be32 ipv4_dst; > @@ -437,6 +442,20 @@ enum ovs_userspace_attr { > #define OVS_USERSPACE_ATTR_MAX (__OVS_USERSPACE_ATTR_MAX - 1) > > /** > + * struct ovs_action_push_mpls - %OVS_ACTION_ATTR_PUSH_MPLS action argument. > + * @mpls_label: MPLS label to push. > + * @mpls_ethertype: Ethertype to set in the encapsulating ethernet frame. > + * > + * The only values @mpls_ethertype should ever be given are %ETH_P_MPLS_UC > and > + * %ETH_P_MPLS_MC, indicating MPLS unicast or multicast. Any other values > would > + * produce a corrupt packet. > + */ > +struct ovs_action_push_mpls { > + __be32 mpls_label; > + __be16 mpls_ethertype; /* Either %ETH_P_MPLS_UC or %ETH_P_MPLS_MC */ I believe that this structure will be padded out to be 64 bytes so perhaps it would be worth adding a pad or zero element value to make it clear that some care needs to be taken there? __be16 zero; > +}; > + > +/** > * struct ovs_action_push_vlan - %OVS_ACTION_ATTR_PUSH_VLAN action argument. > * @vlan_tpid: Tag protocol identifier (TPID) to push. > * @vlan_tci: Tag control identifier (TCI) to push. The CFI bit must be set > @@ -461,6 +480,16 @@ struct ovs_action_push_vlan { > * @OVS_ACTION_ATTR_SET: Replaces the contents of an existing header. The > * single nested %OVS_KEY_ATTR_* attribute specifies a header to modify and > its > * value. > + * @OVS_ACTION_ATTR_PUSH_MPLS: Push a new MPLS label onto the top of the > packet > + * MPLS stack. Set the ethertype of the encapsulating frame to either > + * %ETH_P_MPLS_UC or %ETH_P_MPLS_MC to indicate the new packet contents. > + * @OVS_ACTION_ATTR_POP_MPLS: Pop an MPLS label off of the packet's MPLS > stack. > + * Set the encapsulating frame's ethertype to indicate the new packet > contents > + * (this could potentially still be %ETH_P_MPLS_* if there are remaining MPLS > + * labels). If there are no MPLS labels as determined by ethertype, no > action > + * is taken. > + * @OVS_ACTION_ATTR_SET_MPLS: Set the value of the top-most label in the MPLS > + * label stack. If there are no MPLS labels in the packet, no action is > taken. > * @OVS_ACTION_ATTR_PUSH_VLAN: Push a new outermost 802.1Q header onto the > * packet. > * @OVS_ACTION_ATTR_POP_VLAN: Pop the outermost 802.1Q header off the packet. > @@ -477,6 +506,9 @@ enum ovs_action_attr { > OVS_ACTION_ATTR_OUTPUT, /* u32 port number. */ > OVS_ACTION_ATTR_USERSPACE, /* Nested OVS_USERSPACE_ATTR_*. */ > OVS_ACTION_ATTR_SET, /* One nested OVS_KEY_ATTR_*. */ > + OVS_ACTION_ATTR_PUSH_MPLS, /* struct ovs_action_push_mpls. */ > + OVS_ACTION_ATTR_POP_MPLS, /* __be16 ethertype. */ > + OVS_ACTION_ATTR_SET_MPLS, /* __be32 MPLS label */ > OVS_ACTION_ATTR_PUSH_VLAN, /* struct ovs_action_push_vlan. */ > OVS_ACTION_ATTR_POP_VLAN, /* No argument. */ > OVS_ACTION_ATTR_SAMPLE, /* Nested OVS_SAMPLE_ATTR_*. */ > -- > 1.7.9.5 > > _______________________________________________ > dev mailing list > dev@openvswitch.org > http://openvswitch.org/mailman/listinfo/dev > _______________________________________________ dev mailing list dev@openvswitch.org http://openvswitch.org/mailman/listinfo/dev