On Wed, Nov 9, 2016 at 6:57 PM Martin KaFai Lau <ka...@fb.com> wrote: > > If the bpf program calls bpf_redirect(dev, 0) and dev is > an ipip/ip6tnl, it currently includes the mac header. > e.g. If dev is ipip, the end result is IP-EthHdr-IP instead > of IP-IP. > > The fix is to pull the mac header. At ingress, skb_postpull_rcsum() > is not needed because the ethhdr should have been pulled once already > and then got pushed back just before calling the bpf_prog. > At egress, this patch calls skb_postpull_rcsum(). > > If bpf_redirect(dev, BPF_F_INGRESS) is called, > it also fails now because it calls dev_forward_skb() which > eventually calls eth_type_trans(skb, dev). The eth_type_trans() > will set skb->type = PACKET_OTHERHOST because the mac address > does not match the redirecting dev->dev_addr. The PACKET_OTHERHOST > will eventually cause the ip_rcv() errors out. To fix this, > ____dev_forward_skb() is added. > > Joint work with Daniel Borkmann. > > Fixes: cfc7381b3002 ("ip_tunnel: add collect_md mode to IPIP tunnel") > Fixes: 8d79266bc48c ("ip6_tunnel: add collect_md mode to IPv6 tunnels") > Acked-by: Daniel Borkmann <dan...@iogearbox.net> > Acked-by: Alexei Starovoitov <a...@fb.com> > Signed-off-by: Martin KaFai Lau <ka...@fb.com> > --- > include/linux/netdevice.h | 15 +++++++++++ > net/core/dev.c | 17 +++++------- > net/core/filter.c | 68 > +++++++++++++++++++++++++++++++++++++++++------ > 3 files changed, 81 insertions(+), 19 deletions(-) > > diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h > index 91ee364..bf04a46 100644 > --- a/include/linux/netdevice.h > +++ b/include/linux/netdevice.h > @@ -3354,6 +3354,21 @@ int dev_forward_skb(struct net_device *dev, struct > sk_buff *skb); > bool is_skb_forwardable(const struct net_device *dev, > const struct sk_buff *skb); > > +static __always_inline int ____dev_forward_skb(struct net_device *dev, > + struct sk_buff *skb) > +{ > + if (skb_orphan_frags(skb, GFP_ATOMIC) || > + unlikely(!is_skb_forwardable(dev, skb))) { > + atomic_long_inc(&dev->rx_dropped); > + kfree_skb(skb); > + return NET_RX_DROP; > + } > + > + skb_scrub_packet(skb, true); > + skb->priority = 0; > + return 0; > +} > + > void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev); > > extern int netdev_budget; > diff --git a/net/core/dev.c b/net/core/dev.c > index eaad4c2..6666b28 100644 > --- a/net/core/dev.c > +++ b/net/core/dev.c > @@ -1766,19 +1766,14 @@ EXPORT_SYMBOL_GPL(is_skb_forwardable); > > int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb) > { > - if (skb_orphan_frags(skb, GFP_ATOMIC) || > - unlikely(!is_skb_forwardable(dev, skb))) { > - atomic_long_inc(&dev->rx_dropped); > - kfree_skb(skb); > - return NET_RX_DROP; > - } > + int ret = ____dev_forward_skb(dev, skb); > > - skb_scrub_packet(skb, true); > - skb->priority = 0; > - skb->protocol = eth_type_trans(skb, dev); > - skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); > + if (likely(!ret)) { > + skb->protocol = eth_type_trans(skb, dev); > + skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); > + } > > - return 0; > + return ret; > } > EXPORT_SYMBOL_GPL(__dev_forward_skb); > > diff --git a/net/core/filter.c b/net/core/filter.c > index 00351cd..b391209 100644 > --- a/net/core/filter.c > +++ b/net/core/filter.c > @@ -1628,6 +1628,19 @@ static inline int __bpf_rx_skb(struct net_device *dev, > struct sk_buff *skb) > return dev_forward_skb(dev, skb); > } > > +static inline int __bpf_rx_skb_no_mac(struct net_device *dev, > + struct sk_buff *skb) > +{ > + int ret = ____dev_forward_skb(dev, skb); > + > + if (likely(!ret)) { > + skb->dev = dev; > + ret = netif_rx(skb); > + } > + > + return ret; > +} > + > static inline int __bpf_tx_skb(struct net_device *dev, struct sk_buff *skb) > { > int ret; > @@ -1647,6 +1660,51 @@ static inline int __bpf_tx_skb(struct net_device *dev, > struct sk_buff *skb) > return ret; > } > > +static int __bpf_redirect_no_mac(struct sk_buff *skb, struct net_device *dev, > + u32 flags) > +{ > + /* skb->mac_len is not set on normal egress */ > + unsigned int mlen = skb->network_header - skb->mac_header; > + > + __skb_pull(skb, mlen);
syzbot found an issue when redirecting an LWT_XMIT program using BPF_PROG_TEST_RUN. That path produces packets with skb->data at skb->network_header, as is_l2 is false in bpf_prog_test_run_skb. I think the correct fix here is to pull from skb->data up to skb->network_header, instead of unconditionally from skb->mac_header: - unsigned int mlen = skb->network_header - skb->mac_header; + unsigned int mlen = skb_network_offset(skb); - __skb_pull(skb, mlen); - - /* At ingress, the mac header has already been pulled once. - * At egress, skb_pospull_rcsum has to be done in case that - * the skb is originated from ingress (i.e. a forwarded skb) - * to ensure that rcsum starts at net header. - */ - if (!skb_at_tc_ingress(skb)) - skb_postpull_rcsum(skb, skb_mac_header(skb), mlen); + if (mlen) { + __skb_pull(skb, mlen); + + /* At ingress, the mac header has already been pulled once. + * At egress, skb_pospull_rcsum has to be done in case that + * the skb is originated from ingress (i.e. a forwarded skb) + * to ensure that rcsum starts at net header. + */ + if (!skb_at_tc_ingress(skb)) + skb_postpull_rcsum(skb, skb_mac_header(skb), mlen); + } Comments, concerns? > + > + /* At ingress, the mac header has already been pulled once. > + * At egress, skb_pospull_rcsum has to be done in case that > + * the skb is originated from ingress (i.e. a forwarded skb) > + * to ensure that rcsum starts at net header. > + */ > + if (!skb_at_tc_ingress(skb)) > + skb_postpull_rcsum(skb, skb_mac_header(skb), mlen); > + skb_pop_mac_header(skb); > + skb_reset_mac_len(skb); > + return flags & BPF_F_INGRESS ? > + __bpf_rx_skb_no_mac(dev, skb) : __bpf_tx_skb(dev, skb); > +} > + > +static int __bpf_redirect_common(struct sk_buff *skb, struct net_device *dev, > + u32 flags) > +{ This function has since been extended with check + /* Verify that a link layer header is carried */ + if (unlikely(skb->mac_header >= skb->network_header)) { + kfree_skb(skb); + return -ERANGE; + } + I haven't checked yet, but I think that this will stillbe incorrect from LWT_XMIT to a destination that expects skb->data at skb->mac_header. > + bpf_push_mac_rcsum(skb); > + return flags & BPF_F_INGRESS ? > + __bpf_rx_skb(dev, skb) : __bpf_tx_skb(dev, skb); > +}