On Sun, Jun 17, 2018 at 08:18:19AM -0700, dsah...@kernel.org wrote: > From: David Ahern <dsah...@gmail.com> > > For ACLs implemented using either FIB rules or FIB entries, the BPF > program needs the FIB lookup status to be able to drop the packet. Apart from BPF_FIB_LKUP_RET_SUCCESS and BPF_FIB_LKUP_RET_NO_NEIGH, can you give an example of how the xdp_prog may decide between XDP_PASS and XDP_DROP based on the other BPF_FIB_LKUP_RET_* codes?
> Since the bpf_fib_lookup API has not reached a released kernel yet, > change the return code to contain an encoding of the FIB lookup > result and return the nexthop device index in the params struct. > > In addition, inform the BPF program of any post FIB lookup reason as > to why the packet needs to go up the stack. > > Update the sample program per the change in API. > > Signed-off-by: David Ahern <dsah...@gmail.com> > --- > include/uapi/linux/bpf.h | 28 ++++++++++++++---- > net/core/filter.c | 74 > ++++++++++++++++++++++++++++++++-------------- > samples/bpf/xdp_fwd_kern.c | 8 ++--- > 3 files changed, 78 insertions(+), 32 deletions(-) > > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h > index 59b19b6a40d7..ceb80071c341 100644 > --- a/include/uapi/linux/bpf.h > +++ b/include/uapi/linux/bpf.h > @@ -1857,7 +1857,8 @@ union bpf_attr { > * is resolved), the nexthop address is returned in ipv4_dst > * or ipv6_dst based on family, smac is set to mac address of > * egress device, dmac is set to nexthop mac address, rt_metric > - * is set to metric from route (IPv4/IPv6 only). > + * is set to metric from route (IPv4/IPv6 only), and ifindex > + * is set to the device index of the nexthop from the FIB lookup. > * > * *plen* argument is the size of the passed in struct. > * *flags* argument can be a combination of one or more of the > @@ -1873,9 +1874,9 @@ union bpf_attr { > * *ctx* is either **struct xdp_md** for XDP programs or > * **struct sk_buff** tc cls_act programs. > * Return > - * Egress device index on success, 0 if packet needs to continue > - * up the stack for further processing or a negative error in > case > - * of failure. 
> + * < 0 if any input argument is invalid > + * 0 on success (packet is forwarded and nexthop neighbor exists) > + * > 0 one of BPF_FIB_LKUP_RET_ codes on FIB lookup response > * > * int bpf_sock_hash_update(struct bpf_sock_ops_kern *skops, struct bpf_map > *map, void *key, u64 flags) > * Description > @@ -2612,6 +2613,19 @@ struct bpf_raw_tracepoint_args { > #define BPF_FIB_LOOKUP_DIRECT BIT(0) > #define BPF_FIB_LOOKUP_OUTPUT BIT(1) > > +enum { > + BPF_FIB_LKUP_RET_SUCCESS, /* lookup successful */ > + BPF_FIB_LKUP_RET_BLACKHOLE, /* dest is blackholed */ > + BPF_FIB_LKUP_RET_UNREACHABLE, /* dest is unreachable */ > + BPF_FIB_LKUP_RET_PROHIBIT, /* dest not allowed */ > + BPF_FIB_LKUP_RET_NOT_FWDED, /* pkt is not forwardded */ BPF_FIB_LKUP_RET_NOT_FWDED is a catch all? > + BPF_FIB_LKUP_RET_FWD_DISABLED, /* fwding is not enabled on ingress */ > + BPF_FIB_LKUP_RET_UNSUPP_LWT, /* fwd requires unsupported encap */ > + BPF_FIB_LKUP_RET_NO_NHDEV, /* nh device does not exist */ > + BPF_FIB_LKUP_RET_NO_NEIGH, /* no neigh entry for nh */ > + BPF_FIB_LKUP_RET_FRAG_NEEDED, /* pkt too big to fwd */ > +}; > + > struct bpf_fib_lookup { > /* input: network family for lookup (AF_INET, AF_INET6) > * output: network family of egress nexthop > @@ -2625,7 +2639,11 @@ struct bpf_fib_lookup { > > /* total length of packet from network header - used for MTU check */ > __u16 tot_len; > - __u32 ifindex; /* L3 device index for lookup */ > + > + /* input: L3 device index for lookup > + * output: nexthop device index from FIB lookup > + */ > + __u32 ifindex; > > union { > /* inputs to lookup */ > diff --git a/net/core/filter.c b/net/core/filter.c > index e7f12e9f598c..e758ca487878 100644 > --- a/net/core/filter.c > +++ b/net/core/filter.c > @@ -4073,8 +4073,9 @@ static int bpf_fib_set_fwd_params(struct bpf_fib_lookup > *params, > memcpy(params->smac, dev->dev_addr, ETH_ALEN); > params->h_vlan_TCI = 0; > params->h_vlan_proto = 0; > + params->ifindex = dev->ifindex; > > - return dev->ifindex; > 
+ return 0; > } > #endif > > @@ -4098,7 +4099,7 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct > bpf_fib_lookup *params, > /* verify forwarding is enabled on this interface */ > in_dev = __in_dev_get_rcu(dev); > if (unlikely(!in_dev || !IN_DEV_FORWARD(in_dev))) > - return 0; > + return BPF_FIB_LKUP_RET_FWD_DISABLED; > > if (flags & BPF_FIB_LOOKUP_OUTPUT) { > fl4.flowi4_iif = 1; > @@ -4123,7 +4124,7 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct > bpf_fib_lookup *params, > > tb = fib_get_table(net, tbid); > if (unlikely(!tb)) > - return 0; > + return BPF_FIB_LKUP_RET_NOT_FWDED; > > err = fib_table_lookup(tb, &fl4, &res, FIB_LOOKUP_NOREF); > } else { > @@ -4135,8 +4136,20 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct > bpf_fib_lookup *params, > err = fib_lookup(net, &fl4, &res, FIB_LOOKUP_NOREF); > } > > - if (err || res.type != RTN_UNICAST) > - return 0; > + if (err) { > + /* map fib lookup errors to RTN_ type */ > + if (err == -EINVAL) > + return BPF_FIB_LKUP_RET_BLACKHOLE; > + if (err == -EHOSTUNREACH) > + return BPF_FIB_LKUP_RET_UNREACHABLE; > + if (err == -EACCES) > + return BPF_FIB_LKUP_RET_PROHIBIT; > + > + return BPF_FIB_LKUP_RET_NOT_FWDED; > + } > + > + if (res.type != RTN_UNICAST) > + return BPF_FIB_LKUP_RET_NOT_FWDED; > > if (res.fi->fib_nhs > 1) > fib_select_path(net, &res, &fl4, NULL); > @@ -4144,18 +4157,18 @@ static int bpf_ipv4_fib_lookup(struct net *net, > struct bpf_fib_lookup *params, > if (check_mtu) { > mtu = ip_mtu_from_fib_result(&res, params->ipv4_dst); > if (params->tot_len > mtu) > - return 0; > + return BPF_FIB_LKUP_RET_FRAG_NEEDED; > } > > nh = &res.fi->fib_nh[res.nh_sel]; > > /* do not handle lwt encaps right now */ > if (nh->nh_lwtstate) > - return 0; > + return BPF_FIB_LKUP_RET_UNSUPP_LWT; > > dev = nh->nh_dev; > if (unlikely(!dev)) > - return 0; > + return BPF_FIB_LKUP_RET_NO_NHDEV; > > if (nh->nh_gw) > params->ipv4_dst = nh->nh_gw; > @@ -4166,10 +4179,10 @@ static int bpf_ipv4_fib_lookup(struct net 
*net, > struct bpf_fib_lookup *params, > * rcu_read_lock_bh is not needed here > */ > neigh = __ipv4_neigh_lookup_noref(dev, (__force u32)params->ipv4_dst); > - if (neigh) > - return bpf_fib_set_fwd_params(params, neigh, dev); > + if (!neigh) > + return BPF_FIB_LKUP_RET_NO_NEIGH; > > - return 0; > + return bpf_fib_set_fwd_params(params, neigh, dev); > } > #endif > > @@ -4190,7 +4203,7 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct > bpf_fib_lookup *params, > > /* link local addresses are never forwarded */ > if (rt6_need_strict(dst) || rt6_need_strict(src)) > - return 0; > + return BPF_FIB_LKUP_RET_NOT_FWDED; > > dev = dev_get_by_index_rcu(net, params->ifindex); > if (unlikely(!dev)) > @@ -4198,7 +4211,7 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct > bpf_fib_lookup *params, > > idev = __in6_dev_get_safely(dev); > if (unlikely(!idev || !net->ipv6.devconf_all->forwarding)) > - return 0; > + return BPF_FIB_LKUP_RET_FWD_DISABLED; > > if (flags & BPF_FIB_LOOKUP_OUTPUT) { > fl6.flowi6_iif = 1; > @@ -4225,7 +4238,7 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct > bpf_fib_lookup *params, > > tb = ipv6_stub->fib6_get_table(net, tbid); > if (unlikely(!tb)) > - return 0; > + return BPF_FIB_LKUP_RET_NOT_FWDED; > > f6i = ipv6_stub->fib6_table_lookup(net, tb, oif, &fl6, strict); > } else { > @@ -4238,11 +4251,23 @@ static int bpf_ipv6_fib_lookup(struct net *net, > struct bpf_fib_lookup *params, > } > > if (unlikely(IS_ERR_OR_NULL(f6i) || f6i == net->ipv6.fib6_null_entry)) > - return 0; > + return BPF_FIB_LKUP_RET_NOT_FWDED; > + > + if (unlikely(f6i->fib6_flags & RTF_REJECT)) { > + switch (f6i->fib6_type) { > + case RTN_BLACKHOLE: > + return BPF_FIB_LKUP_RET_BLACKHOLE; > + case RTN_UNREACHABLE: > + return BPF_FIB_LKUP_RET_UNREACHABLE; > + case RTN_PROHIBIT: > + return BPF_FIB_LKUP_RET_PROHIBIT; > + default: > + return BPF_FIB_LKUP_RET_NOT_FWDED; > + } > + } > > - if (unlikely(f6i->fib6_flags & RTF_REJECT || > - f6i->fib6_type != RTN_UNICAST)) > 
- return 0; > + if (f6i->fib6_type != RTN_UNICAST) > + return BPF_FIB_LKUP_RET_NOT_FWDED; > > if (f6i->fib6_nsiblings && fl6.flowi6_oif == 0) > f6i = ipv6_stub->fib6_multipath_select(net, f6i, &fl6, > @@ -4252,16 +4277,19 @@ static int bpf_ipv6_fib_lookup(struct net *net, > struct bpf_fib_lookup *params, > if (check_mtu) { > mtu = ipv6_stub->ip6_mtu_from_fib6(f6i, dst, src); > if (params->tot_len > mtu) > - return 0; > + return BPF_FIB_LKUP_RET_FRAG_NEEDED; > } > > if (f6i->fib6_nh.nh_lwtstate) > - return 0; > + return BPF_FIB_LKUP_RET_UNSUPP_LWT; > > if (f6i->fib6_flags & RTF_GATEWAY) > *dst = f6i->fib6_nh.nh_gw; > > dev = f6i->fib6_nh.nh_dev; > + if (unlikely(!dev)) > + return BPF_FIB_LKUP_RET_NO_NHDEV; Is this a bug fix? > + > params->rt_metric = f6i->fib6_metric; > > /* xdp and cls_bpf programs are run in RCU-bh so rcu_read_lock_bh is > @@ -4270,10 +4298,10 @@ static int bpf_ipv6_fib_lookup(struct net *net, > struct bpf_fib_lookup *params, > */ > neigh = ___neigh_lookup_noref(ipv6_stub->nd_tbl, neigh_key_eq128, > ndisc_hashfn, dst, dev); > - if (neigh) > - return bpf_fib_set_fwd_params(params, neigh, dev); > + if (!neigh) > + return BPF_FIB_LKUP_RET_NO_NEIGH; > > - return 0; > + return bpf_fib_set_fwd_params(params, neigh, dev); > } > #endif > > diff --git a/samples/bpf/xdp_fwd_kern.c b/samples/bpf/xdp_fwd_kern.c > index 6673cdb9f55c..a7e94e7ff87d 100644 > --- a/samples/bpf/xdp_fwd_kern.c > +++ b/samples/bpf/xdp_fwd_kern.c > @@ -48,9 +48,9 @@ static __always_inline int xdp_fwd_flags(struct xdp_md > *ctx, u32 flags) > struct ethhdr *eth = data; > struct ipv6hdr *ip6h; > struct iphdr *iph; > - int out_index; > u16 h_proto; > u64 nh_off; > + int rc; > > nh_off = sizeof(*eth); > if (data + nh_off > data_end) > @@ -101,7 +101,7 @@ static __always_inline int xdp_fwd_flags(struct xdp_md > *ctx, u32 flags) > > fib_params.ifindex = ctx->ingress_ifindex; > > - out_index = bpf_fib_lookup(ctx, &fib_params, sizeof(fib_params), flags); > + rc = bpf_fib_lookup(ctx, 
&fib_params, sizeof(fib_params), flags); > > /* verify egress index has xdp support > * TO-DO bpf_map_lookup_elem(&tx_port, &key) fails with > @@ -109,7 +109,7 @@ static __always_inline int xdp_fwd_flags(struct xdp_md > *ctx, u32 flags) > * NOTE: without verification that egress index supports XDP > * forwarding packets are dropped. > */ > - if (out_index > 0) { > + if (rc == 0) { > if (h_proto == htons(ETH_P_IP)) > ip_decrease_ttl(iph); > else if (h_proto == htons(ETH_P_IPV6)) > @@ -117,7 +117,7 @@ static __always_inline int xdp_fwd_flags(struct xdp_md > *ctx, u32 flags) > > memcpy(eth->h_dest, fib_params.dmac, ETH_ALEN); > memcpy(eth->h_source, fib_params.smac, ETH_ALEN); > - return bpf_redirect_map(&tx_port, out_index, 0); > + return bpf_redirect_map(&tx_port, fib_params.ifindex, 0); > } > > return XDP_PASS; > -- > 2.11.0 >