Robert Shearman <rshea...@brocade.com> writes:

> Allow creating an mpls device for the purposes of encapsulating IP
> packets with:
>
>   ip link add type ipmpls
>
> This device defines its per-nexthop encapsulation data as a stack of
> labels, in the same format as for RTA_NEWDST. It uses the encap data
> which will have been stored in the IP route to encapsulate the packet
> with that stack of labels, with the last label corresponding to a
> local label that defines how the packet will be sent out. The device
> sends packets over loopback to the local MPLS forwarding logic which
> performs all of the work.
>
> Stats are implemented, although any error in the sending via the real
> interface will be handled by the main mpls forwarding code and so not
> accounted by the interface.

Eeek stats!  Lots of unnecessary overhead.  If stats were ok we could
have simply reduced the cost of struct net_device to the point where it
would not matter.

This is really a bad hack to compensate for not being able to get in and
set dst_output the way the xfrm infrastructure does.

What we really want here is xfrm-lite.  By lite I mean the tunnel
selection criteria are simple enough that they fit into the normal
routing table instead of having to do weird flow based magic that
is rarely needed.

I believe what we want is the xfrm stacking of dst entries.

Eric


> This implementation is based on an alternative earlier implementation
> by Eric W. Biederman.
>
> Signed-off-by: Robert Shearman <rshea...@brocade.com>
> ---
>  include/uapi/linux/if_arp.h |   1 +
>  net/mpls/Kconfig            |   5 +
>  net/mpls/Makefile           |   1 +
>  net/mpls/af_mpls.c          |   2 +
>  net/mpls/ipmpls.c           | 284 
> ++++++++++++++++++++++++++++++++++++++++++++
>  5 files changed, 293 insertions(+)
>  create mode 100644 net/mpls/ipmpls.c
>
> diff --git a/include/uapi/linux/if_arp.h b/include/uapi/linux/if_arp.h
> index 4d024d75d64b..17d669fd1781 100644
> --- a/include/uapi/linux/if_arp.h
> +++ b/include/uapi/linux/if_arp.h
> @@ -88,6 +88,7 @@
>  #define ARPHRD_IEEE80211_RADIOTAP 803        /* IEEE 802.11 + radiotap 
> header */
>  #define ARPHRD_IEEE802154      804
>  #define ARPHRD_IEEE802154_MONITOR 805        /* IEEE 802.15.4 network 
> monitor */
> +#define ARPHRD_MPLS  806             /* IP and IPv6 over MPLS tunnels */
>  
>  #define ARPHRD_PHONET        820             /* PhoNet media type            
> */
>  #define ARPHRD_PHONET_PIPE 821               /* PhoNet pipe header           
> */
> diff --git a/net/mpls/Kconfig b/net/mpls/Kconfig
> index 17bde799c854..5264da94733a 100644
> --- a/net/mpls/Kconfig
> +++ b/net/mpls/Kconfig
> @@ -27,4 +27,9 @@ config MPLS_ROUTING
>       help
>        Add support for forwarding of mpls packets.
>  
> +config MPLS_IPTUNNEL
> +     tristate "MPLS: IP over MPLS tunnel support"
> +     help
> +      A network device that encapsulates ip packets as mpls
> +
>  endif # MPLS
> diff --git a/net/mpls/Makefile b/net/mpls/Makefile
> index 65bbe68c72e6..3a93c14b23c5 100644
> --- a/net/mpls/Makefile
> +++ b/net/mpls/Makefile
> @@ -3,5 +3,6 @@
>  #
>  obj-$(CONFIG_NET_MPLS_GSO) += mpls_gso.o
>  obj-$(CONFIG_MPLS_ROUTING) += mpls_router.o
> +obj-$(CONFIG_MPLS_IPTUNNEL) += ipmpls.o
>  
>  mpls_router-y := af_mpls.o
> diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c
> index 7b3f732269e4..68bdfbdddfaf 100644
> --- a/net/mpls/af_mpls.c
> +++ b/net/mpls/af_mpls.c
> @@ -615,6 +615,7 @@ int nla_put_labels(struct sk_buff *skb, int attrtype,
>  
>       return 0;
>  }
> +EXPORT_SYMBOL(nla_put_labels);
>  
>  int nla_get_labels(const struct nlattr *nla,
>                  u32 max_labels, u32 *labels, u32 label[])
> @@ -660,6 +661,7 @@ int nla_get_labels(const struct nlattr *nla,
>       *labels = nla_labels;
>       return 0;
>  }
> +EXPORT_SYMBOL(nla_get_labels);
>  
>  static int rtm_to_route_config(struct sk_buff *skb,  struct nlmsghdr *nlh,
>                              struct mpls_route_config *cfg)
> diff --git a/net/mpls/ipmpls.c b/net/mpls/ipmpls.c
> new file mode 100644
> index 000000000000..cf6894ae0c61
> --- /dev/null
> +++ b/net/mpls/ipmpls.c
> @@ -0,0 +1,284 @@
> +#include <linux/types.h>
> +#include <linux/netdevice.h>
> +#include <linux/if_vlan.h>
> +#include <linux/if_arp.h>
> +#include <linux/ip.h>
> +#include <linux/ipv6.h>
> +#include <linux/module.h>
> +#include <linux/mpls.h>
> +#include "internal.h"
> +
> +static LIST_HEAD(ipmpls_dev_list);
> +
> +#define MAX_NEW_LABELS 2
> +
> +struct ipmpls_dev_priv {
> +     struct net_device *out_dev;
> +     struct list_head list;
> +     struct net_device *dev;
> +};
> +
> +static netdev_tx_t ipmpls_dev_xmit(struct sk_buff *skb, struct net_device 
> *dev)
> +{
> +     struct ipmpls_dev_priv *priv = netdev_priv(dev);
> +     struct net_device *out_dev = priv->out_dev;
> +     struct mpls_shim_hdr *hdr;
> +     bool bottom_of_stack = true;
> +     int len = skb->len;
> +     const void *encap;
> +     int num_labels;
> +     unsigned ttl;
> +     const u32 *labels;
> +     int ret;
> +     int i;
> +
> +     num_labels = dst_get_encap(skb, &encap) / 4;
> +     if (!num_labels)
> +             goto drop;
> +
> +     labels = encap;
> +
> +     /* Obtain the ttl */
> +     if (skb->protocol == htons(ETH_P_IP)) {
> +             ttl = ip_hdr(skb)->ttl;
> +     } else if (skb->protocol == htons(ETH_P_IPV6)) {
> +             ttl = ipv6_hdr(skb)->hop_limit;
> +     } else if (skb->protocol == htons(ETH_P_MPLS_UC)) {
> +             ttl = mpls_entry_decode(mpls_hdr(skb)).ttl;
> +             bottom_of_stack = false;
> +     } else {
> +             goto drop;
> +     }
> +
> +     /* Now that the encap has been retrieved, there's no longer
> +      * any need to keep the dst around so clear it out.
> +      */
> +     skb_dst_drop(skb);
> +     skb_orphan(skb);
> +
> +     skb->inner_protocol = skb->protocol;
> +     skb->inner_network_header = skb->network_header;
> +
> +     skb_push(skb, num_labels * sizeof(*hdr));
> +     skb_reset_network_header(skb);
> +     hdr = mpls_hdr(skb);
> +
> +     for (i = num_labels - 1; i >= 0; i--) {
> +             hdr[i] = mpls_entry_encode(labels[i], ttl, 0, bottom_of_stack);
> +             bottom_of_stack = false;
> +     }
> +
> +     skb->dev = out_dev;
> +     skb->protocol = htons(ETH_P_MPLS_UC);
> +
> +     ret = dev_hard_header(skb, out_dev, ETH_P_MPLS_UC,
> +                           out_dev->dev_addr, NULL, len);
> +     if (ret >= 0)
> +             ret = dev_queue_xmit(skb);
> +     if (ret)
> +             goto drop;
> +
> +     dev->stats.tx_packets++;
> +     dev->stats.tx_bytes += len;
> +
> +     return 0;
> +
> +drop:
> +     dev->stats.tx_dropped++;
> +     kfree_skb(skb);
> +     return NETDEV_TX_OK;
> +}
> +
> +static int ipmpls_dev_init(struct net_device *dev)
> +{
> +     struct ipmpls_dev_priv *priv = netdev_priv(dev);
> +
> +     list_add_tail(&priv->list, &ipmpls_dev_list);
> +
> +     return 0;
> +}
> +
> +static void ipmpls_dev_uninit(struct net_device *dev)
> +{
> +     struct ipmpls_dev_priv *priv = netdev_priv(dev);
> +
> +     list_del_init(&priv->list);
> +}
> +
> +static void ipmpls_dev_free(struct net_device *dev)
> +{
> +     free_netdev(dev);
> +}
> +
> +static const struct net_device_ops ipmpls_netdev_ops = {
> +     .ndo_init               = ipmpls_dev_init,
> +     .ndo_start_xmit         = ipmpls_dev_xmit,
> +     .ndo_uninit             = ipmpls_dev_uninit,
> +};
> +
> +#define IPMPLS_FEATURES (NETIF_F_SG |                        \
> +                      NETIF_F_FRAGLIST |             \
> +                      NETIF_F_HIGHDMA |              \
> +                      NETIF_F_VLAN_CHALLENGED)
> +
> +static void ipmpls_dev_setup(struct net_device *dev)
> +{
> +     dev->netdev_ops         = &ipmpls_netdev_ops;
> +
> +     dev->type               = ARPHRD_MPLS;
> +     dev->flags              = IFF_NOARP;
> +     netif_keep_dst(dev);
> +     dev->addr_len           = 0;
> +     dev->features           |= NETIF_F_LLTX;
> +     dev->features           |= IPMPLS_FEATURES;
> +     dev->hw_features        |= IPMPLS_FEATURES;
> +     dev->vlan_features      = 0;
> +
> +     dev->destructor = ipmpls_dev_free;
> +}
> +
> +static int ipmpls_dev_validate(struct nlattr *tb[], struct nlattr *data[])
> +{
> +     return 0;
> +}
> +
> +static int ipmpls_dev_newlink(struct net *src_net, struct net_device *dev,
> +                           struct nlattr *tb[], struct nlattr *data[])
> +{
> +     struct ipmpls_dev_priv *priv = netdev_priv(dev);
> +
> +     priv->out_dev = src_net->loopback_dev;
> +     priv->dev = dev;
> +
> +     dev->hard_header_len =
> +             priv->out_dev->hard_header_len +
> +             sizeof(struct mpls_shim_hdr) * MAX_NEW_LABELS;
> +
> +     return register_netdevice(dev);
> +}
> +
> +static void ipmpls_dev_dellink(struct net_device *dev, struct list_head 
> *head)
> +{
> +     unregister_netdevice_queue(dev, head);
> +}
> +
> +static int ipmpls_dev_parse_encap(const struct net_device *dev,
> +                               const struct nlattr *nla,
> +                               void *encap)
> +{
> +     u32 labels;
> +
> +     if (nla_len(nla) / 4 > MAX_NEW_LABELS)
> +             return -EINVAL;
> +
> +     if (encap && nla_get_labels(nla, MAX_NEW_LABELS, &labels, encap))
> +             return -EINVAL;
> +
> +     /* Stored encap size is the same as the rtnl encap len */
> +     return nla_len(nla);
> +}
> +
> +static int ipmpls_dev_fill_encap(const struct net_device *dev,
> +                              struct sk_buff *skb, int encap_len,
> +                              const void *encap)
> +{
> +     return nla_put_labels(skb, RTA_ENCAP, encap_len / 4, encap);
> +}
> +
> +static int ipmpls_dev_match_encap(const struct net_device *dev,
> +                               const struct nlattr *nla, int encap_len,
> +                               const void *encap)
> +{
> +     unsigned nla_labels;
> +     struct mpls_shim_hdr *nla_label;
> +     const u32 *stored_labels = encap;
> +     int i;
> +
> +     /* Stored encap size is the same as the rtnl encap len */
> +     if (nla_len(nla) != encap_len)
> +             return 1;
> +
> +     nla_labels = nla_len(nla) / 4;
> +     nla_label = nla_data(nla);
> +
> +     for (i = 0; i < nla_labels; i++) {
> +             struct mpls_entry_decoded dec;
> +
> +             dec = mpls_entry_decode(nla_label + i);
> +
> +             if (stored_labels[i] != dec.label)
> +                     return 1;
> +     }
> +
> +     return 0;
> +}
> +
> +static struct rtnl_link_ops ipmpls_ops = {
> +     .kind           = "ipmpls",
> +     .priv_size      = sizeof(struct ipmpls_dev_priv),
> +     .setup          = ipmpls_dev_setup,
> +     .validate       = ipmpls_dev_validate,
> +     .newlink        = ipmpls_dev_newlink,
> +     .dellink        = ipmpls_dev_dellink,
> +     .parse_encap    = ipmpls_dev_parse_encap,
> +     .fill_encap     = ipmpls_dev_fill_encap,
> +     .match_encap    = ipmpls_dev_match_encap,
> +};
> +
> +static int ipmpls_dev_notify(struct notifier_block *this, unsigned long 
> event,
> +                          void *ptr)
> +{
> +     struct net_device *dev = netdev_notifier_info_to_dev(ptr);
> +
> +     if (event == NETDEV_UNREGISTER) {
> +             struct ipmpls_dev_priv *priv, *priv2;
> +             LIST_HEAD(list_kill);
> +
> +             /* Ignore netns device moves */
> +             if (dev->reg_state != NETREG_UNREGISTERING)
> +                     goto done;
> +
> +             list_for_each_entry_safe(priv, priv2, &ipmpls_dev_list, list) {
> +                     if (priv->out_dev != dev)
> +                             continue;
> +
> +                     ipmpls_dev_dellink(priv->dev, &list_kill);
> +             }
> +             unregister_netdevice_many(&list_kill);
> +     }
> +done:
> +     return NOTIFY_OK;
> +}
> +
> +static struct notifier_block ipmpls_dev_notifier = {
> +     .notifier_call = ipmpls_dev_notify,
> +};
> +
> +static int __init ipmpls_init(void)
> +{
> +     int err;
> +
> +     err = register_netdevice_notifier(&ipmpls_dev_notifier);
> +     if (err)
> +             goto out;
> +
> +     err = rtnl_link_register(&ipmpls_ops);
> +     if (err)
> +             goto out_unregister_notifier;
> +out:
> +     return err;
> +out_unregister_notifier:
> +     unregister_netdevice_notifier(&ipmpls_dev_notifier);
> +     goto out;
> +}
> +module_init(ipmpls_init);
> +
> +static void __exit ipmpls_exit(void)
> +{
> +     rtnl_link_unregister(&ipmpls_ops);
> +     unregister_netdevice_notifier(&ipmpls_dev_notifier);
> +}
> +module_exit(ipmpls_exit);
> +
> +MODULE_LICENSE("GPL v2");
> +MODULE_ALIAS_RTNL_LINK("ipmpls");
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Reply via email to