On 01/31/2019 12:51 AM, Peter Oskolkov wrote: > This patch implements BPF_LWT_ENCAP_IP mode in bpf_lwt_push_encap > BPF helper. It enables BPF programs (specifically, BPF_PROG_TYPE_LWT_IN > and BPF_PROG_TYPE_LWT_XMIT prog types) to add IP encapsulation headers > to packets (e.g. IP/GRE, GUE, IPIP). > > This is useful when thousands of different short-lived flows should be > encapped, each with different and dynamically determined destination. > Although lwtunnels can be used in some of these scenarios, the ability > to dynamically generate encap headers adds more flexibility, e.g. > when routing depends on the state of the host (reflected in global bpf > maps). > > Signed-off-by: Peter Oskolkov <p...@google.com> > --- > include/net/lwtunnel.h | 3 +++ > net/core/filter.c | 3 ++- > net/core/lwt_bpf.c | 59 ++++++++++++++++++++++++++++++++++++++++++ > 3 files changed, 64 insertions(+), 1 deletion(-) > > diff --git a/include/net/lwtunnel.h b/include/net/lwtunnel.h > index 33fd9ba7e0e5..f0973eca8036 100644 > --- a/include/net/lwtunnel.h > +++ b/include/net/lwtunnel.h > @@ -126,6 +126,8 @@ int lwtunnel_cmp_encap(struct lwtunnel_state *a, struct > lwtunnel_state *b); > int lwtunnel_output(struct net *net, struct sock *sk, struct sk_buff *skb); > int lwtunnel_input(struct sk_buff *skb); > int lwtunnel_xmit(struct sk_buff *skb); > +int bpf_lwt_push_ip_encap(struct sk_buff *skb, void *hdr, u32 len, > + bool ingress); > > static inline void lwtunnel_set_redirect(struct dst_entry *dst) > { > @@ -138,6 +140,7 @@ static inline void lwtunnel_set_redirect(struct dst_entry > *dst) > dst->input = lwtunnel_input; > } > } > + > #else > > static inline void lwtstate_free(struct lwtunnel_state *lws) > diff --git a/net/core/filter.c b/net/core/filter.c > index 27d3fbe4b77b..de6bd4b4e0a3 100644 > --- a/net/core/filter.c > +++ b/net/core/filter.c > @@ -73,6 +73,7 @@ > #include <linux/seg6_local.h> > #include <net/seg6.h> > #include <net/seg6_local.h> > +#include <net/lwtunnel.h> > > /** > * 
sk_filter_trim_cap - run a packet through a socket filter > @@ -4804,7 +4805,7 @@ static int bpf_push_seg6_encap(struct sk_buff *skb, u32 > type, void *hdr, u32 len > static int bpf_push_ip_encap(struct sk_buff *skb, void *hdr, u32 len, > bool ingress) > { > - return -EINVAL; /* Implemented in the next patch. */ > + return bpf_lwt_push_ip_encap(skb, hdr, len, ingress); > } > > BPF_CALL_4(bpf_lwt_in_push_encap, struct sk_buff *, skb, u32, type, void *, > hdr, > diff --git a/net/core/lwt_bpf.c b/net/core/lwt_bpf.c > index a648568c5e8f..6a6e9acab73d 100644 > --- a/net/core/lwt_bpf.c > +++ b/net/core/lwt_bpf.c > @@ -390,6 +390,65 @@ static const struct lwtunnel_encap_ops bpf_encap_ops = { > .owner = THIS_MODULE, > }; > > +int bpf_lwt_push_ip_encap(struct sk_buff *skb, void *hdr, u32 len, bool > ingress) > +{ > + struct iphdr *iph; > + bool ipv4; > + int err; > + > + if (unlikely(len < sizeof(struct iphdr) || len > LWT_BPF_MAX_HEADROOM)) > + return -EINVAL; > + > + /* validate protocol and length */ > + iph = (struct iphdr *)hdr; > + if (iph->version == 4) { > + ipv4 = true; > + if (unlikely(len < iph->ihl * 4)) > + return -EINVAL; > + } else if (iph->version == 6) { > + ipv4 = false; > + if (unlikely(len < sizeof(struct ipv6hdr))) > + return -EINVAL; > + } else { > + return -EINVAL; > + } > + > + if (ingress) > + err = skb_cow_head(skb, len + skb->mac_len); > + else > + err = skb_cow_head(skb, > + len + LL_RESERVED_SPACE(skb_dst(skb)->dev)); > + if (unlikely(err)) > + return err; > + > + /* push the encap headers and fix pointers */ > + skb_reset_inner_headers(skb); > + skb->encapsulation = 1; > + skb_push(skb, len); > + if (ingress) > + skb_postpush_rcsum(skb, iph, len); > + skb_reset_network_header(skb); > + memcpy(skb_network_header(skb), hdr, len); > + bpf_compute_data_pointers(skb);
Does this work transparently with GSO as well, or would we need to update the shared info for this (as in the nat64 case, for example)? > + if (ipv4) { > + skb->protocol = htons(ETH_P_IP); > + iph = ip_hdr(skb); > + if (iph->ihl * 4 < len) > + skb_set_transport_header(skb, iph->ihl * 4); > + > + if (!iph->check) > + iph->check = ip_fast_csum((unsigned char *)iph, > + iph->ihl); > + } else { > + skb->protocol = htons(ETH_P_IPV6); > + if (sizeof(struct ipv6hdr) < len) > + skb_set_transport_header(skb, sizeof(struct ipv6hdr)); > + } > + > + return 0; > +} > + > static int __init bpf_lwt_init(void) > { > return lwtunnel_encap_add_ops(&bpf_encap_ops, LWTUNNEL_ENCAP_BPF); >