On Fri, 01 Mar 2019 15:12:30 +0100, Toke Høiland-Jørgensen wrote: > An XDP program can redirect packets between interfaces using either the > xdp_redirect() helper or the > xdp_redirect_map() helper. Apart from the > flexibility of updating maps from userspace, the redirect_map() helper also > uses the map structure to batch packets, which results in a significant > (around 50%) performance boost. However, the xdp_redirect() API is simpler > if one just wants to redirect to another interface, which means people tend > to use this interface and then wonder why they get worse performance > than expected. > > This patch seeks to close this performance difference between the two APIs. > It achieves this by changing xdp_redirect() to use a hidden devmap for > looking up destination interfaces, thus gaining the batching benefit with > no visible difference from the user API point of view. > > A hidden per-namespace map is allocated when an XDP program that uses the > non-map xdp_redirect() helper is first loaded. This map is populated with > all available interfaces in its namespace, and kept up to date as > interfaces come and go. Once allocated, the map is kept around until the > namespace is removed. > > The hidden map uses the ifindex as map key, which means they are limited to > ifindexes smaller than the map size of 64. A later patch introduces a new > map type to lift this restriction. > > Performance numbers: > > Before patch: > xdp_redirect: 5426035 pkt/s > xdp_redirect_map: 8412754 pkt/s > > After patch: > xdp_redirect: 8314702 pkt/s > xdp_redirect_map: 8411854 pkt/s > > This corresponds to a 53% increase in xdp_redirect performance, or a > reduction in per-packet processing time by 64 nanoseconds. > > Signed-off-by: Toke Høiland-Jørgensen <t...@redhat.com>
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h > index de18227b3d95..060c5850af3b 100644 > --- a/include/linux/bpf.h > +++ b/include/linux/bpf.h > @@ -25,6 +25,7 @@ struct sock; > struct seq_file; > struct btf; > struct btf_type; > +struct net; > > /* map is generic key/value storage optionally accesible by eBPF programs */ > struct bpf_map_ops { > @@ -533,6 +534,7 @@ extern const struct bpf_verifier_ops > tc_cls_act_analyzer_ops; > extern const struct bpf_verifier_ops xdp_analyzer_ops; > > struct bpf_prog *bpf_prog_get(u32 ufd); > +struct bpf_prog *bpf_prog_get_by_id(u32 id); > struct bpf_prog *bpf_prog_get_type_dev(u32 ufd, enum bpf_prog_type type, > bool attach_drv); > struct bpf_prog * __must_check bpf_prog_add(struct bpf_prog *prog, int i); > @@ -612,6 +614,11 @@ struct xdp_buff; > struct sk_buff; > > struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key); > +struct bpf_map *__dev_map_get_default_map(struct net_device *dev); > +int dev_map_ensure_default_map(struct net *net); > +void dev_map_put_default_map(struct net *net); > +int dev_map_inc_redirect_count(void); > +void dev_map_dec_redirect_count(void); > void __dev_map_insert_ctx(struct bpf_map *map, u32 index); > void __dev_map_flush(struct bpf_map *map); > int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp, > @@ -641,6 +648,11 @@ static inline struct bpf_prog *bpf_prog_get(u32 ufd) > return ERR_PTR(-EOPNOTSUPP); > } > > +static inline struct bpf_prog *bpf_prog_get_by_id(u32 id) > +{ > + return ERR_PTR(-EOPNOTSUPP); > +} > + > static inline struct bpf_prog *bpf_prog_get_type_dev(u32 ufd, > enum bpf_prog_type type, > bool attach_drv) > @@ -693,6 +705,29 @@ static inline struct net_device > *__dev_map_lookup_elem(struct bpf_map *map, > return NULL; > } > > +static inline struct bpf_map *__dev_map_get_default_map(struct net_device > *dev) > +{ > + return NULL; > +} > + > +static inline int dev_map_ensure_default_map(struct net *net) > +{ > + return 0; > +} > 
+ > +static inline void dev_map_put_default_map(struct net *net) > +{ > +} > + > +static inline int dev_map_inc_redirect_count(void) > +{ > + return 0; > +} > + > +static inline void dev_map_dec_redirect_count(void) > +{ > +} > + > static inline void __dev_map_insert_ctx(struct bpf_map *map, u32 index) > { > } > diff --git a/include/linux/filter.h b/include/linux/filter.h > index 95e2d7ebdf21..dd6bbbab32f7 100644 > --- a/include/linux/filter.h > +++ b/include/linux/filter.h > @@ -507,6 +507,8 @@ struct bpf_prog { > gpl_compatible:1, /* Is filter GPL compatible? > */ > cb_access:1, /* Is control block accessed? */ > dst_needed:1, /* Do we need dst entry? */ > + redirect_needed:1, /* Does program need > access to xdp_redirect? */ > + redirect_used:1, /* Does program use > xdp_redirect? */ > blinded:1, /* Was blinded */ > is_func:1, /* program is a bpf function */ > kprobe_override:1, /* Do we override a kprobe? > */ > diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h > index a68ced28d8f4..6706ecc25d8f 100644 > --- a/include/net/net_namespace.h > +++ b/include/net/net_namespace.h > @@ -162,7 +162,7 @@ struct net { > #if IS_ENABLED(CONFIG_CAN) > struct netns_can can; > #endif > -#ifdef CONFIG_XDP_SOCKETS > +#ifdef CONFIG_BPF_SYSCALL > struct netns_xdp xdp; > #endif > struct sock *diag_nlsk; > diff --git a/include/net/netns/xdp.h b/include/net/netns/xdp.h > index e5734261ba0a..4935dfe1cf43 100644 > --- a/include/net/netns/xdp.h > +++ b/include/net/netns/xdp.h > @@ -4,10 +4,21 @@ > > #include <linux/rculist.h> > #include <linux/mutex.h> > +#include <linux/atomic.h> > + > +struct bpf_dtab; > + > +struct bpf_dtab_container { > + struct bpf_dtab __rcu *dtab; > + atomic_t refcnt; refcount_t ? I'm not sure what the rules for non-performance critical stuff are.. 
> +}; > > struct netns_xdp { > +#ifdef CONFIG_XDP_SOCKETS > struct mutex lock; > struct hlist_head list; > +#endif > + struct bpf_dtab_container default_map; > }; > > #endif /* __NETNS_XDP_H__ */ > diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c > index 1037fc08c504..e55707e62b60 100644 > --- a/kernel/bpf/devmap.c > +++ b/kernel/bpf/devmap.c > @@ -56,6 +56,9 @@ > (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) > > #define DEV_MAP_BULK_SIZE 16 > +#define DEV_MAP_DEFAULT_SIZE 8 > +#define BPF_MAX_REFCNT 32768 > + > struct xdp_bulk_queue { > struct xdp_frame *q[DEV_MAP_BULK_SIZE]; > struct net_device *dev_rx; > @@ -80,6 +83,7 @@ struct bpf_dtab { > > static DEFINE_SPINLOCK(dev_map_lock); > static LIST_HEAD(dev_map_list); > +static atomic_t global_redirect_use = {}; REFCOUNT_INIT(0) or ATOMIC_INIT(0) > static u64 dev_map_bitmap_size(const union bpf_attr *attr) > { > @@ -332,6 +336,18 @@ struct bpf_dtab_netdev *__dev_map_lookup_elem(struct > bpf_map *map, u32 key) > return obj; > } > > +/* This is only being called from xdp_do_redirect() if the xdp_redirect > helper > + * is used; the default map is allocated on XDP program load if the helper is > + * used, so will always be available at this point. > + */ Yet it does check for NULL..? > +struct bpf_map *__dev_map_get_default_map(struct net_device *dev) > +{ > + struct net *net = dev_net(dev); > + struct bpf_dtab *dtab = rcu_dereference(net->xdp.default_map.dtab); This init is long and breaks reverse xmas tree rules, separate line perhaps? > + return dtab ? &dtab->map : NULL; > +} > + > /* Runs under RCU-read-side, plus in softirq under NAPI protection. > * Thus, safe percpu variable access. 
> */ > @@ -533,14 +549,210 @@ const struct bpf_map_ops dev_map_ops = { > .map_check_btf = map_check_no_btf, > }; > > +static inline struct net *bpf_default_map_to_net(struct bpf_dtab_container > *cont) > +{ > + struct netns_xdp *xdp = container_of(cont, struct netns_xdp, > default_map); > + > + return container_of(xdp, struct net, xdp); > +} > + > +static void __dev_map_release_default_map(struct bpf_dtab_container *cont) > +{ > + struct bpf_dtab *dtab = NULL; > + > + lockdep_assert_held(&dev_map_lock); > + > + dtab = rcu_dereference(cont->dtab); > + if (dtab) { How can it be NULL if it's refcounted? hm.. > + list_del_rcu(&dtab->list); > + rcu_assign_pointer(cont->dtab, NULL); > + bpf_clear_redirect_map(&dtab->map); > + call_rcu(&dtab->rcu, __dev_map_free); > + } > +} > + > +void dev_map_put_default_map(struct net *net) > +{ > + if (atomic_dec_and_test(&net->xdp.default_map.refcnt)) { > + spin_lock(&dev_map_lock); > + __dev_map_release_default_map(&net->xdp.default_map); > + spin_unlock(&dev_map_lock); > + } > +} > + > +static int __init_default_map(struct bpf_dtab_container *cont) > +{ > + struct net *net = bpf_default_map_to_net(cont); > + struct bpf_dtab *dtab, *old_dtab; > + int size = DEV_MAP_DEFAULT_SIZE; > + struct net_device *netdev; > + union bpf_attr attr = {}; > + u32 idx; > + int err; > + > + lockdep_assert_held(&dev_map_lock); > + > + if (!atomic_read(&global_redirect_use)) > + return 0; > + > + for_each_netdev(net, netdev) > + if (netdev->ifindex >= size) > + size <<= 1; > + > + old_dtab = rcu_dereference(cont->dtab); > + if (old_dtab && old_dtab->map.max_entries == size) > + return 0; > + > + dtab = kzalloc(sizeof(*dtab), GFP_USER); You are calling this with a spin lock held :( > + if (!dtab) > + return -ENOMEM; > + > + attr.map_type = BPF_MAP_TYPE_DEVMAP; > + attr.max_entries = size; > + attr.value_size = 4; > + attr.key_size = 4; > + > + err = dev_map_init_map(dtab, &attr, false); > + if (err) { > + kfree(dtab); > + return err; > + } > + > + 
for_each_netdev(net, netdev) { > + idx = netdev->ifindex; > + err = __dev_map_update_elem(net, &dtab->map, &idx, &idx, 0); > + if (err) { > + __dev_map_free(&dtab->rcu); This map wasn't visible yet, so probably no need to go through RCU? Not that it matters much for an error path.. > + return err; > + } > + } > + > + rcu_assign_pointer(cont->dtab, dtab); > + list_add_tail_rcu(&dtab->list, &dev_map_list); > + > + if (old_dtab) { > + list_del_rcu(&old_dtab->list); > + bpf_clear_redirect_map(&old_dtab->map); > + call_rcu(&old_dtab->rcu, __dev_map_free); > + } > + > + return 0; > +} > + > diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c > index 1b9496c41383..f1b2f01e7ca1 100644 > --- a/kernel/bpf/verifier.c > +++ b/kernel/bpf/verifier.c > @@ -7609,6 +7609,17 @@ static int fixup_bpf_calls(struct bpf_verifier_env > *env) > prog->dst_needed = 1; > if (insn->imm == BPF_FUNC_get_prandom_u32) > bpf_user_rnd_init_once(); > + if (insn->imm == BPF_FUNC_redirect) { > + prog->redirect_needed = true; nit: this field is a bitfield not boolean > + if (!prog->redirect_used) { > + int err = dev_map_inc_redirect_count(); Don't call complex functions as variable init, please > + if (err) > + return err; > + prog->redirect_used = true; > + } > + } > + > if (insn->imm == BPF_FUNC_override_return) > prog->kprobe_override = 1; > if (insn->imm == BPF_FUNC_tail_call) { > @@ -7618,6 +7629,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) > * the program array. 
> */ > prog->cb_access = 1; > + prog->redirect_needed = true; > env->prog->aux->stack_depth = MAX_BPF_STACK; > env->prog->aux->max_pkt_offset = MAX_PACKET_OFF; > > diff --git a/net/core/dev.c b/net/core/dev.c > index 2b67f2aa59dd..1df20d529026 100644 > --- a/net/core/dev.c > +++ b/net/core/dev.c > @@ -7990,6 +7990,21 @@ u32 __dev_xdp_query(struct net_device *dev, bpf_op_t > bpf_op, > return xdp.prog_id; > } > > +static struct bpf_prog *dev_xdp_get_prog(struct net_device *dev) > +{ > + struct bpf_prog *prog; > + u32 prog_id; > + > + prog_id = __dev_xdp_query(dev, dev->netdev_ops->ndo_bpf, > XDP_QUERY_PROG); if (!prog_id) return NULL; ... > + if (prog_id) { > + prog = bpf_prog_get_by_id(prog_id); > + if (!IS_ERR(prog)) > + return prog; > + } > + > + return NULL; > +} > + > static int dev_xdp_install(struct net_device *dev, bpf_op_t bpf_op, > struct netlink_ext_ack *extack, u32 flags, > struct bpf_prog *prog) > @@ -8024,9 +8039,18 @@ static void dev_xdp_uninstall(struct net_device *dev) > memset(&xdp, 0, sizeof(xdp)); > xdp.command = XDP_QUERY_PROG; > WARN_ON(ndo_bpf(dev, &xdp)); > - if (xdp.prog_id) > + if (xdp.prog_id) { > + struct bpf_prog *prog = bpf_prog_get_by_id(xdp.prog_id); > + > + if (!IS_ERR(prog)) { > + if (prog->redirect_needed) > + dev_map_put_default_map(dev_net(dev)); > + bpf_prog_put(prog); > + } > + > WARN_ON(dev_xdp_install(dev, ndo_bpf, NULL, xdp.prog_flags, > NULL)); > + } > > /* Remove HW offload */ > memset(&xdp, 0, sizeof(xdp)); > @@ -8091,6 +8115,23 @@ int dev_change_xdp_fd(struct net_device *dev, struct > netlink_ext_ack *extack, > bpf_prog_put(prog); > return -EINVAL; > } > + > + if (!offload && bpf_op == ops->ndo_bpf && > + prog->redirect_needed) { > + err = dev_map_ensure_default_map(dev_net(dev)); > + if (err) { > + NL_SET_ERR_MSG(extack, "unable to allocate > default map for xdp_redirect()"); > + return err; > + } > + } > + } else { > + struct bpf_prog *old_prog = dev_xdp_get_prog(dev); > + > + if (old_prog) { > + if 
(old_prog->redirect_needed) > + dev_map_put_default_map(dev_net(dev)); I think you should hold the ref to this program and only release the map _after_ new program was installed. You also have to do this for replacing a program with a different one. Could you please add tests for the replacement combinations? > + bpf_prog_put(old_prog); > + } > } > err = dev_xdp_install(dev, bpf_op, extack, flags, prog); > @@ -9333,6 +9374,7 @@ EXPORT_SYMBOL(unregister_netdev); > int dev_change_net_namespace(struct net_device *dev, struct net *net, const > char *pat) > { > int err, new_nsid, new_ifindex; > + struct bpf_prog *prog = NULL; > > ASSERT_RTNL(); > > @@ -9350,6 +9392,15 @@ int dev_change_net_namespace(struct net_device *dev, > struct net *net, const char > if (net_eq(dev_net(dev), net)) > goto out; > > + prog = dev_xdp_get_prog(dev); > + if (prog) { > + if (prog->redirect_needed) > + err = dev_map_ensure_default_map(net); > + > + if (err) > + goto out; > + } prog = dev_xdp_get_prog(dev); if (prog && prog->redirect_needed) { err = dev_map_ensure_default_map(net); if (err) goto out; } > /* Pick the destination device name, and ensure > * we can use it in the destination network namespace. > */ > @@ -9388,6 +9439,9 @@ int dev_change_net_namespace(struct net_device *dev, > struct net *net, const char > call_netdevice_notifiers(NETDEV_UNREGISTER, dev); > rcu_barrier(); > > + if (prog && prog->redirect_needed) > + dev_map_put_default_map(dev_net(dev)); > + > new_nsid = peernet2id_alloc(dev_net(dev), net); > /* If there is an ifindex conflict assign a new one */ > if (__dev_get_by_index(net, dev->ifindex)) > @@ -9435,6 +9489,9 @@ int dev_change_net_namespace(struct net_device *dev, > struct net *net, const char > synchronize_net(); > err = 0; > out: > + if (prog) > + bpf_prog_put(prog); > + > return err; > } > EXPORT_SYMBOL_GPL(dev_change_net_namespace);