Hi Jarno,

Thanks for reviewing! Replies inline:

On 01/23/13 16:32, Jarno Rajahalme wrote:
> Please find my comments below,
> 
>   Jarno
> 
> On Jan 22, 2013, at 20:36 , ext Kyle Mestery wrote:
> 
>> From: Lorand Jakab <loja...@cisco.com>
>>
> ...
>> +Flows on br0 are configured as follows:
>> +
>> +    priority=3,dl_dst=02:00:00:00:00:00,action=mod_dl_dst:<VMx_MAC>,NORMAL
>> +    priority=2,in_port=1,dl_type=0x0806,action=NORMAL
>> +    
>> priority=1,in_port=1,dl_type=0x0800,vlan_tci=0,nw_src=<EID_prefix>,action=output:3
>> +    priority=0,action=NORMAL
> 
> I'll be referring to this example below.
> 
>> diff --git a/datapath/vport-lisp.c b/datapath/vport-lisp.c
>> new file mode 100644
>> index 0000000..558e5aa
>> --- /dev/null
>> +++ b/datapath/vport-lisp.c
>> @@ -0,0 +1,351 @@
>> +/*
>> + * Copyright (c) 2011 Nicira, Inc.
>> + * Copyright (c) 2013 Cisco Systems, Inc.
>> + *
>> + * This program is free software; you can redistribute it and/or
>> + * modify it under the terms of version 2 of the GNU General Public
>> + * License as published by the Free Software Foundation.
>> + *
>> + * This program is distributed in the hope that it will be useful, but
>> + * WITHOUT ANY WARRANTY; without even the implied warranty of
>> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
>> + * General Public License for more details.
>> + *
>> + * You should have received a copy of the GNU General Public License
>> + * along with this program; if not, write to the Free Software
>> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
>> + * 02110-1301, USA
>> + */
>> +
>> +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
>> +
>> +#include <linux/version.h>
>> +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,26)
>> +
>> +#include <linux/in.h>
>> +#include <linux/ip.h>
>> +#include <linux/list.h>
>> +#include <linux/net.h>
>> +#include <linux/udp.h>
>> +
>> +#include <net/icmp.h>
>> +#include <net/ip.h>
>> +#include <net/udp.h>
>> +
>> +#include "datapath.h"
>> +#include "tunnel.h"
>> +#include "vport.h"
>> +
>> +#define LISP_DST_PORT 4341  /* Well known UDP port for LISP data packets. */
>> +
>> +struct lisp_net {
>> +    struct socket *lisp_rcv_socket;
>> +    int n_tunnels;
>> +};
>> +static struct lisp_net lisp_net;
> 
> How does this one shared global instance of lisp_net work with multiple 
> networking
> namespaces?  As it is, all namespaces seem to be sharing the same socket?

Yes they are. Would it be better to use the same approach as the capwap
tunnel, which uses the struct ovs_net->vport_net to store struct capwap_net?

> 
>> +
>> +
>> +/*
>> + *  LISP encapsulation header:
>> + *
>> + *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
>> + *  |N|L|E|V|I|flags|            Nonce/Map-Version                  |
>> + *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
>> + *  |                 Instance ID/Locator Status Bits               |
>> + *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
>> + *
>> + */
>> +
>> +/**
>> + * struct lisphdr - LISP header
>> + * @nonce_present: Flag indicating the presence of a 24 bit nonce value.
>> + * @lsb: Flag indicating the presence of Locator Status Bits (LSB).
>> + * @echo_nonce: Flag indicating the use of the echo noncing mechanism.
>> + * @map_version: Flag indicating the use of mapping versioning.
>> + * @instance_id: Flag indicating the presence of a 24 bit Instance ID (IID).
>> + * @rflags: 3 bits reserved for future flags.
>> + * @nonce: 24 bit nonce value.
>> + * @lsb_bits: 32 bit Locator Status Bits
>> + */
>> +struct lisphdr {
>> +#ifdef __LITTLE_ENDIAN_BITFIELD
>> +    __u8 rflags:3;
>> +    __u8 instance_id:1;
>> +    __u8 map_version:1;
>> +    __u8 echo_nonce:1;
>> +    __u8 lsb:1;
>> +    __u8 nonce_present:1;
>> +#else
>> +    __u8 nonce_present:1;
>> +    __u8 lsb:1;
>> +    __u8 echo_nonce:1;
>> +    __u8 map_version:1;
>> +    __u8 instance_id:1;
>> +    __u8 rflags:3;
>> +#endif
>> +    union {
>> +            __u8 nonce[3];
>> +            __u8 map_version[3];
>> +    } u1;
>> +    union {
>> +            __be32 lsb_bits;
>> +            __be32 iid;
>> +    } u2;
>> +};
> 
> The code below would be more readable if the flags names could not be
> confused with the value. E.g., "instance_id" could be called 
> "have_instance_id",
> and "u2.iid" could be called "instance_id"?

Agreed, poor naming choice on my part. Will fix.

> 
>> +
>> +#define LISP_HLEN (sizeof(struct udphdr) + sizeof(struct lisphdr))
>> +
>> +static inline int lisp_hdr_len(const struct tnl_mutable_config *mutable,
>> +                           const struct ovs_key_ipv4_tunnel *tun_key)
>> +{
>> +    return LISP_HLEN;
>> +}
>> +
>> +static inline struct lisphdr *lisp_hdr(const struct sk_buff *skb)
>> +{
>> +    return (struct lisphdr *)(udp_hdr(skb) + 1);
>> +}
>> +
>> +/* Compute source port for outgoing packet.
>> + * Currently we use the flow hash.
>> + */
>> +static u16 get_src_port(struct sk_buff *skb)
>> +{
>> +    int low;
>> +    int high;
>> +    unsigned int range;
>> +    u32 hash = OVS_CB(skb)->flow->hash;
>> +
>> +    inet_get_local_port_range(&low, &high);
>> +    range = (high - low) + 1;
>> +    return (((u64) hash * range) >> 32) + low;
>> +}
>> +
>> +static struct sk_buff *lisp_pre_tunnel(const struct vport *vport,
>> +                                   const struct tnl_mutable_config *mutable,
>> +                                   struct sk_buff *skb)
>> +{
>> +    /* Pop off "inner" Ethernet header */
>> +    skb_pull(skb, ETH_HLEN);
>> +    return skb;
>> +}
> 
> Here it would be better to pull "skb_network_offset(skb)" instead of 
> "ETH_HLEN".

Agreed.

> That way this would work correctly for packets that may have VLAN tags or
> MPLS labels. The skb->protocol may need to be updated in this case.
> If the above should not be allowed, then there should be a mechanism
> by which this could fail (e.g., return NULL, but see my comment on patch 1/2).
> 
> Maybe ARP packets should be filtered out as well?

How about using a whitelist instead? For the time being, the LISP
protocol only specifies the encapsulation of IPv4 and IPv6 packets,
although others may be added by a future revision of the specification.

> 
> Also, the reduction in the packet size will show up in the port stats. This 
> may or
> may not be desired. Currently, tunnel output will not report the tunnel 
> headers
> as bytes sent (see send_frags() in tunnel.c). In the same spirit, maybe the
> ETH_HLEN (or skb_network_offset()) should NOT be removed from the reported
> bytes, i.e., the stats reflect the bytes given for the vport for transport, 
> not what the
> vport actually sends out.  This can be done with your own .send hook as well 
> as
> you would define what it returns.

Ok, will do.

> 
>> +
>> +/* Returns the least-significant 32 bits of a __be64. */
>> +static __be32 be64_get_low32(__be64 x)
>> +{
>> +#ifdef __BIG_ENDIAN
>> +    return (__force __be32)x;
>> +#else
>> +    return (__force __be32)((__force u64)x >> 32);
>> +#endif
>> +}
>> +
>> +static struct sk_buff *lisp_build_header(const struct vport *vport,
>> +                                     const struct tnl_mutable_config 
>> *mutable,
>> +                                     struct dst_entry *dst,
>> +                                     struct sk_buff *skb,
>> +                                     int tunnel_hlen)
>> +{
>> +    struct udphdr *udph = udp_hdr(skb);
>> +    struct lisphdr *lisph = (struct lisphdr *)(udph + 1);
>> +    const struct ovs_key_ipv4_tunnel *tun_key = OVS_CB(skb)->tun_key;
>> +    __be64 out_key;
>> +    u32 flags;
>> +
>> +    tnl_get_param(mutable, tun_key, &flags, &out_key);
>> +
>> +    udph->dest = htons(LISP_DST_PORT);
>> +    udph->source = htons(get_src_port(skb));
>> +    udph->check = 0;
>> +    udph->len = htons(skb->len - skb_transport_offset(skb));
>> +
>> +    lisph->nonce_present = 1;   /* We add a nonce instead of map version */
>> +    lisph->lsb = 0;             /* No reason to set LSBs, just one RLOC */
>> +    lisph->echo_nonce = 0;      /* No echo noncing */
>> +    lisph->map_version = 0;     /* No mapping versioning, nonce instead */
>> +    lisph->instance_id = 1;     /* Store the tun_id as Instance ID  */
>> +    lisph->rflags = 1;          /* Reserved flags, set to 0  */
>> +
>> +    lisph->u1.nonce[0] = net_random() & 0xFF;
>> +    lisph->u1.nonce[1] = net_random() & 0xFF;
>> +    lisph->u1.nonce[2] = net_random() & 0xFF;
>> +
>> +    lisph->u2.iid = htonl(be64_get_low32(tun_key->tun_id));
> 
> 
> This seems wrong. Isn't __be32 already in the network byte order?

Indeed. Will remove the htonl() call.

> 
>> +
>> +    /*
>> +     * Allow our local IP stack to fragment the outer packet even if the
>> +     * DF bit is set as a last resort.  We also need to force selection of
>> +     * an IP ID here because Linux will otherwise leave it at 0 if the
>> +     * packet originally had DF set.
>> +     */
>> +    skb->local_df = 1;
>> +    __ip_select_ident(ip_hdr(skb), dst, 0);
>> +
>> +    return skb;
>> +}
>> +
>> +/* Called with rcu_read_lock and BH disabled. */
>> +static int lisp_rcv(struct sock *sk, struct sk_buff *skb)
>> +{
>> +    struct vport *vport;
>> +    struct lisphdr *lisph;
>> +    const struct tnl_mutable_config *mutable;
>> +    struct iphdr *iph;
>> +    struct ovs_key_ipv4_tunnel tun_key;
>> +    __be64 key;
>> +    u32 tunnel_flags = 0;
>> +    struct ethhdr *ethh;
>> +
>> +    if (unlikely(!pskb_may_pull(skb, LISP_HLEN)))
>> +            goto error;
>> +
>> +    lisph = lisp_hdr(skb);
>> +    if (unlikely(lisph->instance_id != 1))
>> +            goto error;
>> +
>> +    __skb_pull(skb, LISP_HLEN);
>> +    skb_postpull_rcsum(skb, skb_transport_header(skb), LISP_HLEN);
>> +
>> +    key = cpu_to_be64(ntohl(lisph->u2.iid));
> 
> You could add a be32_to_be64() and use that instead?

Sure, will do.

> 
>> +
>> +    iph = ip_hdr(skb);
>> +    vport = ovs_tnl_find_port(dev_net(skb->dev), iph->daddr, iph->saddr,
>> +                              key, TNL_T_PROTO_LISP, &mutable);
>> +    if (unlikely(!vport)) {
>> +            icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
>> +            goto error;
>> +    }
>> +
>> +    if (mutable->flags & TNL_F_IN_KEY_MATCH || !mutable->key.daddr)
>> +            tunnel_flags = OVS_TNL_F_KEY;
>> +    else
>> +            key = 0;
>> +
>> +    /* Save outer tunnel values */
>> +    tnl_tun_key_init(&tun_key, iph, key, tunnel_flags);
>> +    OVS_CB(skb)->tun_key = &tun_key;
>> +
>> +    /* Add Ethernet header */
>> +    skb_push(skb, ETH_HLEN);
>> +
>> +    ethh = (struct ethhdr *)skb->data;
>> +    memset(ethh, 0, ETH_HLEN);
>> +    ethh->h_dest[0] = 0x02;
>> +    ethh->h_source[0] = 0x02;
>> +    ethh->h_proto = htons(ETH_P_IP);
> 
> What if the inner packet is IPv6?

Right, I only tested v4 for now, will fix.

> 
> Also, it might be nice if the added MAC addresses were configurable. This
> may help get rid of the top priority flow entry in your example in some cases?

Should it be done per vport, as an option like remote_ip? Or should we
wait until all bits and pieces are there for configuring null ports, and
add MAC address configurability only then?

> 
>> +
>> +    ovs_tnl_rcv(vport, skb);
>> +    goto out;
>> +
>> +error:
>> +    kfree_skb(skb);
>> +out:
>> +    return 0;
>> +}
>> +
>> +/* Arbitrary value.  Irrelevant as long as it's not 0 since we set the 
>> handler. */
>> +#define UDP_ENCAP_LISP 7
>> +static int lisp_socket_init(struct net *net)
>> +{
>> +    int err;
>> +    struct sockaddr_in sin;
>> +
>> +    if (lisp_net.n_tunnels) {
>> +            lisp_net.n_tunnels++;
>> +            return 0;
>> +    }
>> +
>> +    err = sock_create_kern(AF_INET, SOCK_DGRAM, 0,
>> +                           &lisp_net.lisp_rcv_socket);
>> +    if (err)
>> +            goto error;
>> +
>> +    /* release net ref. */
>> +    sk_change_net(lisp_net.lisp_rcv_socket->sk, net);
>> +
>> +    sin.sin_family = AF_INET;
>> +    sin.sin_addr.s_addr = htonl(INADDR_ANY);
>> +    sin.sin_port = htons(LISP_DST_PORT);
>> +
>> +    err = kernel_bind(lisp_net.lisp_rcv_socket,
>> +                      (struct sockaddr *)&sin,
>> +                      sizeof(struct sockaddr_in));
>> +    if (err)
>> +            goto error_sock;
>> +
>> +    udp_sk(lisp_net.lisp_rcv_socket->sk)->encap_type = UDP_ENCAP_LISP;
>> +    udp_sk(lisp_net.lisp_rcv_socket->sk)->encap_rcv = lisp_rcv;
>> +
>> +    udp_encap_enable();
>> +    lisp_net.n_tunnels++;
>> +
>> +    return 0;
>> +
>> +error_sock:
>> +    sk_release_kernel(lisp_net.lisp_rcv_socket->sk);
>> +error:
>> +    pr_warn("cannot register lisp protocol handler: %d\n", err);
>> +    return err;
>> +}
>> +
>> +static const struct tnl_ops ovs_lisp_tnl_ops = {
>> +    .tunnel_type    = TNL_T_PROTO_LISP,
>> +    .ipproto        = IPPROTO_UDP,
>> +    .hdr_len        = lisp_hdr_len,
>> +    .pre_tunnel     = lisp_pre_tunnel,
>> +    .build_header   = lisp_build_header,
>> +};
>> +
>> +static void release_socket(struct net *net)
>> +{
>> +    lisp_net.n_tunnels--;
>> +    if (lisp_net.n_tunnels)
>> +            return;
>> +
>> +    sk_release_kernel(lisp_net.lisp_rcv_socket->sk);
>> +}
>> +
>> +static void lisp_tnl_destroy(struct vport *vport)
>> +{
>> +    ovs_tnl_destroy(vport);
>> +    release_socket(ovs_dp_get_net(vport->dp));
>> +}
>> +
>> +static struct vport *lisp_tnl_create(const struct vport_parms *parms)
>> +{
>> +    int err;
>> +    struct vport *vport;
>> +
>> +    err = lisp_socket_init(ovs_dp_get_net(parms->dp));
>> +    if (err)
>> +            return ERR_PTR(err);
>> +
>> +    vport = ovs_tnl_create(parms, &ovs_lisp_vport_ops, &ovs_lisp_tnl_ops);
>> +    if (IS_ERR(vport))
>> +            release_socket(ovs_dp_get_net(parms->dp));
>> +
>> +    return vport;
>> +}
>> +
>> +static int lisp_tnl_init(void)
>> +{
>> +    lisp_net.n_tunnels = 0;
>> +    return 0;
>> +}
>> +
>> +const struct vport_ops ovs_lisp_vport_ops = {
>> +    .type           = OVS_VPORT_TYPE_LISP,
>> +    .flags          = VPORT_F_TUN_ID,
>> +    .init           = lisp_tnl_init,
>> +    .create         = lisp_tnl_create,
>> +    .destroy        = lisp_tnl_destroy,
>> +    .set_addr       = ovs_tnl_set_addr,
>> +    .get_name       = ovs_tnl_get_name,
>> +    .get_addr       = ovs_tnl_get_addr,
>> +    .get_options    = ovs_tnl_get_options,
>> +    .set_options    = ovs_tnl_set_options,
>> +    .send           = ovs_tnl_send,
> 
> This .send hook could be used to implement the "pre_tunnel" functionality I
> referred to in my comment in patch 1/2.

We missed this, thanks for pointing it out. Will use this in the next
version.

Thanks again,
-Lori

> 
>> +};
>> +#else
>> +#warning LISP tunneling will not be available on kernels before 2.6.26
>> +#endif /* Linux kernel < 2.6.26 */
>> diff --git a/datapath/vport.c b/datapath/vport.c
>> index a78ebfa..bf8c763 100644
>> --- a/datapath/vport.c
>> +++ b/datapath/vport.c
>> @@ -46,6 +46,7 @@ static const struct vport_ops *base_vport_ops_list[] = {
>> #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,26)
>>      &ovs_capwap_vport_ops,
>>      &ovs_vxlan_vport_ops,
>> +    &ovs_lisp_vport_ops,
>> #endif
>> };
>>
>> diff --git a/datapath/vport.h b/datapath/vport.h
>> index 91f8836..24f12ae 100644
>> --- a/datapath/vport.h
>> +++ b/datapath/vport.h
>> @@ -240,5 +240,6 @@ extern const struct vport_ops ovs_gre_ft_vport_ops;
>> extern const struct vport_ops ovs_gre64_vport_ops;
>> extern const struct vport_ops ovs_capwap_vport_ops;
>> extern const struct vport_ops ovs_vxlan_vport_ops;
>> +extern const struct vport_ops ovs_lisp_vport_ops;
>>
>> #endif /* vport.h */
>> diff --git a/include/linux/openvswitch.h b/include/linux/openvswitch.h
>> index f471fbc..b007b35 100644
>> --- a/include/linux/openvswitch.h
>> +++ b/include/linux/openvswitch.h
>> @@ -184,6 +184,7 @@ enum ovs_vport_type {
>>      OVS_VPORT_TYPE_INTERNAL, /* network device implemented by datapath */
>>      OVS_VPORT_TYPE_FT_GRE,   /* Flow based GRE tunnel. */
>>      OVS_VPORT_TYPE_VXLAN,    /* VXLAN tunnel */
>> +    OVS_VPORT_TYPE_LISP,     /* LISP tunnel */
>>      OVS_VPORT_TYPE_PATCH = 100, /* virtual tunnel connecting two vports */
>>      OVS_VPORT_TYPE_GRE,      /* GRE tunnel */
>>      OVS_VPORT_TYPE_CAPWAP,   /* CAPWAP tunnel */
>> diff --git a/include/openflow/nicira-ext.h b/include/openflow/nicira-ext.h
>> index 91c96b3..f98ab89 100644
>> --- a/include/openflow/nicira-ext.h
>> +++ b/include/openflow/nicira-ext.h
>> @@ -1579,9 +1579,9 @@ OFP_ASSERT(sizeof(struct nx_action_output_reg) == 24);
>>
>> /* Tunnel ID.
>>  *
>> - * For a packet received via a GRE or VXLAN tunnel including a (32-bit) 
>> key, the
>> - * key is stored in the low 32-bits and the high bits are zeroed.  For other
>> - * packets, the value is 0.
>> + * For a packet received via a GRE, VXLAN or LISP tunnel including a 
>> (32-bit)
>> + * key, the key is stored in the low 32-bits and the high bits are zeroed.  
>> For
>> + * other packets, the value is 0.
>>  *
>>  * All zero bits, for packets not received via a keyed tunnel.
>>  *
>> diff --git a/lib/netdev-vport.c b/lib/netdev-vport.c
>> index 60437b9..8020412 100644
>> --- a/lib/netdev-vport.c
>> +++ b/lib/netdev-vport.c
>> @@ -186,6 +186,9 @@ netdev_vport_get_netdev_type(const struct 
>> dpif_linux_vport *vport)
>>     case OVS_VPORT_TYPE_VXLAN:
>>         return "vxlan";
>>
>> +    case OVS_VPORT_TYPE_LISP:
>> +        return "lisp";
>> +
>>     case OVS_VPORT_TYPE_FT_GRE:
>>     case __OVS_VPORT_TYPE_MAX:
>>         break;
>> @@ -915,6 +918,7 @@ netdev_vport_register(void)
>>         TUNNEL_CLASS("ipsec_gre64", OVS_VPORT_TYPE_GRE64),
>>         TUNNEL_CLASS("capwap", OVS_VPORT_TYPE_CAPWAP),
>>         TUNNEL_CLASS("vxlan", OVS_VPORT_TYPE_VXLAN),
>> +        TUNNEL_CLASS("lisp", OVS_VPORT_TYPE_LISP),
>>
>>         { OVS_VPORT_TYPE_PATCH,
>>           { "patch", VPORT_FUNCTIONS(NULL, NULL) },
>> diff --git a/vswitchd/vswitch.xml b/vswitchd/vswitch.xml
>> index 18643c2..295e41c 100644
>> --- a/vswitchd/vswitch.xml
>> +++ b/vswitchd/vswitch.xml
>> @@ -1278,6 +1278,15 @@
>>          </p>
>>           </dd>
>>
>> +          <dt><code>lisp</code></dt>
>> +          <dd>
>> +            A layer 3 tunnel over the experimental, UDP-based LISP
>> +            protocol described at
>> +            <code>http://tools.ietf.org/html/draft-ietf-lisp-24</code>.
>> +            LISP is currently supported only with the Linux kernel datapath
>> +            with kernel version 2.6.26 or later.
>> +          </dd>
>> +
>>           <dt><code>patch</code></dt>
>>           <dd>
>>             A pair of virtual devices that act as a patch cable.
>> -- 
>> 1.7.11.7
>>
>> _______________________________________________
>> dev mailing list
>> dev@openvswitch.org
>> http://openvswitch.org/mailman/listinfo/dev
> 
> _______________________________________________
> dev mailing list
> dev@openvswitch.org
> http://openvswitch.org/mailman/listinfo/dev
> 
_______________________________________________
dev mailing list
dev@openvswitch.org
http://openvswitch.org/mailman/listinfo/dev

Reply via email to