Hi Jarno, Thanks for reviewing! Replies inline:
On 01/23/13 16:32, Jarno Rajahalme wrote: > Please find my comments below, > > Jarno > > On Jan 22, 2013, at 20:36 , ext Kyle Mestery wrote: > >> From: Lorand Jakab <loja...@cisco.com> >> > ... >> +Flows on br0 are configured as follows: >> + >> + priority=3,dl_dst=02:00:00:00:00:00,action=mod_dl_dst:<VMx_MAC>,NORMAL >> + priority=2,in_port=1,dl_type=0x0806,action=NORMAL >> + >> priority=1,in_port=1,dl_type=0x0800,vlan_tci=0,nw_src=<EID_prefix>,action=output:3 >> + priority=0,action=NORMAL > > I'll be referring to this example below. > >> diff --git a/datapath/vport-lisp.c b/datapath/vport-lisp.c >> new file mode 100644 >> index 0000000..558e5aa >> --- /dev/null >> +++ b/datapath/vport-lisp.c >> @@ -0,0 +1,351 @@ >> +/* >> + * Copyright (c) 2011 Nicira, Inc. >> + * Copyright (c) 2013 Cisco Systems, Inc. >> + * >> + * This program is free software; you can redistribute it and/or >> + * modify it under the terms of version 2 of the GNU General Public >> + * License as published by the Free Software Foundation. >> + * >> + * This program is distributed in the hope that it will be useful, but >> + * WITHOUT ANY WARRANTY; without even the implied warranty of >> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU >> + * General Public License for more details. >> + * >> + * You should have received a copy of the GNU General Public License >> + * along with this program; if not, write to the Free Software >> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA >> + * 02110-1301, USA >> + */ >> + >> +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt >> + >> +#include <linux/version.h> >> +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,26) >> + >> +#include <linux/in.h> >> +#include <linux/ip.h> >> +#include <linux/list.h> >> +#include <linux/net.h> >> +#include <linux/udp.h> >> + >> +#include <net/icmp.h> >> +#include <net/ip.h> >> +#include <net/udp.h> >> + >> +#include "datapath.h" >> +#include "tunnel.h" >> +#include "vport.h" >> + >> +#define LISP_DST_PORT 4341 /* Well known UDP port for LISP data packets. */ >> + >> +struct lisp_net { >> + struct socket *lisp_rcv_socket; >> + int n_tunnels; >> +}; >> +static struct lisp_net lisp_net; > > How does this one shared global instance of lisp_net work with multiple > networking > namespaces? As it is, all namespaces seem to be sharing the same socket? Yes they are. Would it be better to use the same approach as the capwap tunnel, which uses the struct ovs_net->vport_net to store struct capwap_net? > >> + >> + >> +/* >> + * LISP encapsulation header: >> + * >> + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ >> + * |N|L|E|V|I|flags| Nonce/Map-Version | >> + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ >> + * | Instance ID/Locator Status Bits | >> + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ >> + * >> + */ >> + >> +/** >> + * struct lisphdr - LISP header >> + * @nonce_present: Flag indicating the presence of a 24 bit nonce value. >> + * @lsb: Flag indicating the presence of Locator Status Bits (LSB). >> + * @echo_nonce: Flag indicating the use of the echo noncing mechanism. >> + * @map_version: Flag indicating the use of mapping versioning. >> + * @instance_id: Flag indicating the presence of a 24 bit Instance ID (IID). >> + * @rflags: 3 bits reserved for future flags. >> + * @nonce: 24 bit nonce value. >> + * @lsb_bits: 32 bit Locator Status Bits >> + */ >> +struct lisphdr { >> +#ifdef __LITTLE_ENDIAN_BITFIELD >> + __u8 rflags:3; >> + __u8 instance_id:1; >> + __u8 map_version:1; >> + __u8 echo_nonce:1; >> + __u8 lsb:1; >> + __u8 nonce_present:1; >> +#else >> + __u8 nonce_present:1; >> + __u8 lsb:1; >> + __u8 echo_nonce:1; >> + __u8 map_version:1; >> + __u8 instance_id:1; >> + __u8 rflags:3; >> +#endif >> + union { >> + __u8 nonce[3]; >> + __u8 map_version[3]; >> + } u1; >> + union { >> + __be32 lsb_bits; >> + __be32 iid; >> + } u2; >> +}; > > The code below would be more readable if the flags names could not be > confused with the value. E.g., "instance_id" could be called > "have_instance_id", > and "u2.iid" could be called "instance_id"? Agreed, poor naming choice on my part. Will fix. > >> + >> +#define LISP_HLEN (sizeof(struct udphdr) + sizeof(struct lisphdr)) >> + >> +static inline int lisp_hdr_len(const struct tnl_mutable_config *mutable, >> + const struct ovs_key_ipv4_tunnel *tun_key) >> +{ >> + return LISP_HLEN; >> +} >> + >> +static inline struct lisphdr *lisp_hdr(const struct sk_buff *skb) >> +{ >> + return (struct lisphdr *)(udp_hdr(skb) + 1); >> +} >> + >> +/* Compute source port for outgoing packet. >> + * Currently we use the flow hash. >> + */ >> +static u16 get_src_port(struct sk_buff *skb) >> +{ >> + int low; >> + int high; >> + unsigned int range; >> + u32 hash = OVS_CB(skb)->flow->hash; >> + >> + inet_get_local_port_range(&low, &high); >> + range = (high - low) + 1; >> + return (((u64) hash * range) >> 32) + low; >> +} >> + >> +static struct sk_buff *lisp_pre_tunnel(const struct vport *vport, >> + const struct tnl_mutable_config *mutable, >> + struct sk_buff *skb) >> +{ >> + /* Pop off "inner" Ethernet header */ >> + skb_pull(skb, ETH_HLEN); >> + return skb; >> +} > > Here it would be better to pull "skb_network_offset(skb)" instead of > "ETH_HLEN". Agreed. > That way this would work correctly for packets that may have VLAN tags or > MPLS labels. The skb->protocol may need to be updated in this case. > If the above should not be allowed, then there should be a mechanism > by which this could fail (e.g., return NULL, but see my comment on patch 1/2). > > Maybe ARP packets should be filtered out as well? How about using a whitelist instead? For the time being, the LISP protocol only specifies the encapsulation of IPv4 and IPv6 packets, although others may be added by a future revision of the specification. > > Also, the reduction in the packet size will show up in the port stats. This > may or > may not be desired. Currently, tunnel output will not report the tunnel > headers > as bytes sent (see send_frags() in tunnel.c). In the same spirit, maybe the > ETH_HLEN (or skb_network_offset()) should NOT be removed from the reported > bytes, i.e., the stats reflect the bytes given for the vport for transport, > not what the > vport actually sends out. This can be done with your own .send hook as well > as > you would define what it returns. Ok, will do. > >> + >> +/* Returns the least-significant 32 bits of a __be64. */ >> +static __be32 be64_get_low32(__be64 x) >> +{ >> +#ifdef __BIG_ENDIAN >> + return (__force __be32)x; >> +#else >> + return (__force __be32)((__force u64)x >> 32); >> +#endif >> +} >> + >> +static struct sk_buff *lisp_build_header(const struct vport *vport, >> + const struct tnl_mutable_config >> *mutable, >> + struct dst_entry *dst, >> + struct sk_buff *skb, >> + int tunnel_hlen) >> +{ >> + struct udphdr *udph = udp_hdr(skb); >> + struct lisphdr *lisph = (struct lisphdr *)(udph + 1); >> + const struct ovs_key_ipv4_tunnel *tun_key = OVS_CB(skb)->tun_key; >> + __be64 out_key; >> + u32 flags; >> + >> + tnl_get_param(mutable, tun_key, &flags, &out_key); >> + >> + udph->dest = htons(LISP_DST_PORT); >> + udph->source = htons(get_src_port(skb)); >> + udph->check = 0; >> + udph->len = htons(skb->len - skb_transport_offset(skb)); >> + >> + lisph->nonce_present = 1; /* We add a nonce instead of map version */ >> + lisph->lsb = 0; /* No reason to set LSBs, just one RLOC */ >> + lisph->echo_nonce = 0; /* No echo noncing */ >> + lisph->map_version = 0; /* No mapping versioning, nonce instead */ >> + lisph->instance_id = 1; /* Store the tun_id as Instance ID */ >> + lisph->rflags = 1; /* Reserved flags, set to 0 */ >> + >> + lisph->u1.nonce[0] = net_random() & 0xFF; >> + lisph->u1.nonce[1] = net_random() & 0xFF; >> + lisph->u1.nonce[2] = net_random() & 0xFF; >> + >> + lisph->u2.iid = htonl(be64_get_low32(tun_key->tun_id)); > > > This seems wrong. Isn't __be32 already in the network byte order? Indeed. Will remove the htonl() call. > >> + >> + /* >> + * Allow our local IP stack to fragment the outer packet even if the >> + * DF bit is set as a last resort. We also need to force selection of >> + * an IP ID here because Linux will otherwise leave it at 0 if the >> + * packet originally had DF set. >> + */ >> + skb->local_df = 1; >> + __ip_select_ident(ip_hdr(skb), dst, 0); >> + >> + return skb; >> +} >> + >> +/* Called with rcu_read_lock and BH disabled. */ >> +static int lisp_rcv(struct sock *sk, struct sk_buff *skb) >> +{ >> + struct vport *vport; >> + struct lisphdr *lisph; >> + const struct tnl_mutable_config *mutable; >> + struct iphdr *iph; >> + struct ovs_key_ipv4_tunnel tun_key; >> + __be64 key; >> + u32 tunnel_flags = 0; >> + struct ethhdr *ethh; >> + >> + if (unlikely(!pskb_may_pull(skb, LISP_HLEN))) >> + goto error; >> + >> + lisph = lisp_hdr(skb); >> + if (unlikely(lisph->instance_id != 1)) >> + goto error; >> + >> + __skb_pull(skb, LISP_HLEN); >> + skb_postpull_rcsum(skb, skb_transport_header(skb), LISP_HLEN); >> + >> + key = cpu_to_be64(ntohl(lisph->u2.iid)); > > You could add a be32_to_be64() and use that instead? Sure, will do. > >> + >> + iph = ip_hdr(skb); >> + vport = ovs_tnl_find_port(dev_net(skb->dev), iph->daddr, iph->saddr, >> + key, TNL_T_PROTO_LISP, &mutable); >> + if (unlikely(!vport)) { >> + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); >> + goto error; >> + } >> + >> + if (mutable->flags & TNL_F_IN_KEY_MATCH || !mutable->key.daddr) >> + tunnel_flags = OVS_TNL_F_KEY; >> + else >> + key = 0; >> + >> + /* Save outer tunnel values */ >> + tnl_tun_key_init(&tun_key, iph, key, tunnel_flags); >> + OVS_CB(skb)->tun_key = &tun_key; >> + >> + /* Add Ethernet header */ >> + skb_push(skb, ETH_HLEN); >> + >> + ethh = (struct ethhdr *)skb->data; >> + memset(ethh, 0, ETH_HLEN); >> + ethh->h_dest[0] = 0x02; >> + ethh->h_source[0] = 0x02; >> + ethh->h_proto = htons(ETH_P_IP); > > What if the inner packet is IPv6? Right, I only tested v4 for now, will fix. > > Also, it might be nice if the added MAC addresses were configurable. This > may help get rid of the top priority flow entry in your example in some cases? Should it be done per vport, as an option like remote_ip? Or should we wait until all bits and pieces are there for configuring null ports, and add MAC address configurability only then? > >> + >> + ovs_tnl_rcv(vport, skb); >> + goto out; >> + >> +error: >> + kfree_skb(skb); >> +out: >> + return 0; >> +} >> + >> +/* Arbitrary value. Irrelevant as long as it's not 0 since we set the >> handler. */ >> +#define UDP_ENCAP_LISP 7 >> +static int lisp_socket_init(struct net *net) >> +{ >> + int err; >> + struct sockaddr_in sin; >> + >> + if (lisp_net.n_tunnels) { >> + lisp_net.n_tunnels++; >> + return 0; >> + } >> + >> + err = sock_create_kern(AF_INET, SOCK_DGRAM, 0, >> + &lisp_net.lisp_rcv_socket); >> + if (err) >> + goto error; >> + >> + /* release net ref. */ >> + sk_change_net(lisp_net.lisp_rcv_socket->sk, net); >> + >> + sin.sin_family = AF_INET; >> + sin.sin_addr.s_addr = htonl(INADDR_ANY); >> + sin.sin_port = htons(LISP_DST_PORT); >> + >> + err = kernel_bind(lisp_net.lisp_rcv_socket, >> + (struct sockaddr *)&sin, >> + sizeof(struct sockaddr_in)); >> + if (err) >> + goto error_sock; >> + >> + udp_sk(lisp_net.lisp_rcv_socket->sk)->encap_type = UDP_ENCAP_LISP; >> + udp_sk(lisp_net.lisp_rcv_socket->sk)->encap_rcv = lisp_rcv; >> + >> + udp_encap_enable(); >> + lisp_net.n_tunnels++; >> + >> + return 0; >> + >> +error_sock: >> + sk_release_kernel(lisp_net.lisp_rcv_socket->sk); >> +error: >> + pr_warn("cannot register lisp protocol handler: %d\n", err); >> + return err; >> +} >> + >> +static const struct tnl_ops ovs_lisp_tnl_ops = { >> + .tunnel_type = TNL_T_PROTO_LISP, >> + .ipproto = IPPROTO_UDP, >> + .hdr_len = lisp_hdr_len, >> + .pre_tunnel = lisp_pre_tunnel, >> + .build_header = lisp_build_header, >> +}; >> + >> +static void release_socket(struct net *net) >> +{ >> + lisp_net.n_tunnels--; >> + if (lisp_net.n_tunnels) >> + return; >> + >> + sk_release_kernel(lisp_net.lisp_rcv_socket->sk); >> +} >> + >> +static void lisp_tnl_destroy(struct vport *vport) >> +{ >> + ovs_tnl_destroy(vport); >> + release_socket(ovs_dp_get_net(vport->dp)); >> +} >> + >> +static struct vport *lisp_tnl_create(const struct vport_parms *parms) >> +{ >> + int err; >> + struct vport *vport; >> + >> + err = lisp_socket_init(ovs_dp_get_net(parms->dp)); >> + if (err) >> + return ERR_PTR(err); >> + >> + vport = ovs_tnl_create(parms, &ovs_lisp_vport_ops, &ovs_lisp_tnl_ops); >> + if (IS_ERR(vport)) >> + release_socket(ovs_dp_get_net(parms->dp)); >> + >> + return vport; >> +} >> + >> +static int lisp_tnl_init(void) >> +{ >> + lisp_net.n_tunnels = 0; >> + return 0; >> +} >> + >> +const struct vport_ops ovs_lisp_vport_ops = { >> + .type = OVS_VPORT_TYPE_LISP, >> + .flags = VPORT_F_TUN_ID, >> + .init = lisp_tnl_init, >> + .create = lisp_tnl_create, >> + .destroy = lisp_tnl_destroy, >> + .set_addr = ovs_tnl_set_addr, >> + .get_name = ovs_tnl_get_name, >> + .get_addr = ovs_tnl_get_addr, >> + .get_options = ovs_tnl_get_options, >> + .set_options = ovs_tnl_set_options, >> + .send = ovs_tnl_send, > > This .send hook could be used to implement the "pre_tunnel" functionality I > referred to in my comment in patch 1/2. We missed this, thanks for pointing it out. Will use this in the next version. Thanks again, -Lori > >> +}; >> +#else >> +#warning LISP tunneling will not be available on kernels before 2.6.26 >> +#endif /* Linux kernel < 2.6.26 */ >> diff --git a/datapath/vport.c b/datapath/vport.c >> index a78ebfa..bf8c763 100644 >> --- a/datapath/vport.c >> +++ b/datapath/vport.c >> @@ -46,6 +46,7 @@ static const struct vport_ops *base_vport_ops_list[] = { >> #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,26) >> &ovs_capwap_vport_ops, >> &ovs_vxlan_vport_ops, >> + &ovs_lisp_vport_ops, >> #endif >> }; >> >> diff --git a/datapath/vport.h b/datapath/vport.h >> index 91f8836..24f12ae 100644 >> --- a/datapath/vport.h >> +++ b/datapath/vport.h >> @@ -240,5 +240,6 @@ extern const struct vport_ops ovs_gre_ft_vport_ops; >> extern const struct vport_ops ovs_gre64_vport_ops; >> extern const struct vport_ops ovs_capwap_vport_ops; >> extern const struct vport_ops ovs_vxlan_vport_ops; >> +extern const struct vport_ops ovs_lisp_vport_ops; >> >> #endif /* vport.h */ >> diff --git a/include/linux/openvswitch.h b/include/linux/openvswitch.h >> index f471fbc..b007b35 100644 >> --- a/include/linux/openvswitch.h >> +++ b/include/linux/openvswitch.h >> @@ -184,6 +184,7 @@ enum ovs_vport_type { >> OVS_VPORT_TYPE_INTERNAL, /* network device implemented by datapath */ >> OVS_VPORT_TYPE_FT_GRE, /* Flow based GRE tunnel. */ >> OVS_VPORT_TYPE_VXLAN, /* VXLAN tunnel */ >> + OVS_VPORT_TYPE_LISP, /* LISP tunnel */ >> OVS_VPORT_TYPE_PATCH = 100, /* virtual tunnel connecting two vports */ >> OVS_VPORT_TYPE_GRE, /* GRE tunnel */ >> OVS_VPORT_TYPE_CAPWAP, /* CAPWAP tunnel */ >> diff --git a/include/openflow/nicira-ext.h b/include/openflow/nicira-ext.h >> index 91c96b3..f98ab89 100644 >> --- a/include/openflow/nicira-ext.h >> +++ b/include/openflow/nicira-ext.h >> @@ -1579,9 +1579,9 @@ OFP_ASSERT(sizeof(struct nx_action_output_reg) == 24); >> >> /* Tunnel ID. >> * >> - * For a packet received via a GRE or VXLAN tunnel including a (32-bit) >> key, the >> - * key is stored in the low 32-bits and the high bits are zeroed. For other >> - * packets, the value is 0. >> + * For a packet received via a GRE, VXLAN or LISP tunnel including a >> (32-bit) >> + * key, the key is stored in the low 32-bits and the high bits are zeroed. >> For >> + * other packets, the value is 0. >> * >> * All zero bits, for packets not received via a keyed tunnel. >> * >> diff --git a/lib/netdev-vport.c b/lib/netdev-vport.c >> index 60437b9..8020412 100644 >> --- a/lib/netdev-vport.c >> +++ b/lib/netdev-vport.c >> @@ -186,6 +186,9 @@ netdev_vport_get_netdev_type(const struct >> dpif_linux_vport *vport) >> case OVS_VPORT_TYPE_VXLAN: >> return "vxlan"; >> >> + case OVS_VPORT_TYPE_LISP: >> + return "lisp"; >> + >> case OVS_VPORT_TYPE_FT_GRE: >> case __OVS_VPORT_TYPE_MAX: >> break; >> @@ -915,6 +918,7 @@ netdev_vport_register(void) >> TUNNEL_CLASS("ipsec_gre64", OVS_VPORT_TYPE_GRE64), >> TUNNEL_CLASS("capwap", OVS_VPORT_TYPE_CAPWAP), >> TUNNEL_CLASS("vxlan", OVS_VPORT_TYPE_VXLAN), >> + TUNNEL_CLASS("lisp", OVS_VPORT_TYPE_LISP), >> >> { OVS_VPORT_TYPE_PATCH, >> { "patch", VPORT_FUNCTIONS(NULL, NULL) }, >> diff --git a/vswitchd/vswitch.xml b/vswitchd/vswitch.xml >> index 18643c2..295e41c 100644 >> --- a/vswitchd/vswitch.xml >> +++ b/vswitchd/vswitch.xml >> @@ -1278,6 +1278,15 @@ >> </p> >> </dd> >> >> + <dt><code>lisp</code></dt> >> + <dd> >> + A layer 3 tunnel over the experimental, UDP-based LISP >> + protocol described at >> + <code>http://tools.ietf.org/html/draft-ietf-lisp-24</code>. >> + LISP is currently supported only with the Linux kernel datapath >> + with kernel version 2.6.26 or later. >> + </dd> >> + >> <dt><code>patch</code></dt> >> <dd> >> A pair of virtual devices that act as a patch cable. >> -- >> 1.7.11.7 >> >> _______________________________________________ >> dev mailing list >> dev@openvswitch.org >> http://openvswitch.org/mailman/listinfo/dev > > _______________________________________________ > dev mailing list > dev@openvswitch.org > http://openvswitch.org/mailman/listinfo/dev > _______________________________________________ dev mailing list dev@openvswitch.org http://openvswitch.org/mailman/listinfo/dev