On Fri, 01 Mar 2019 15:12:30 +0100, Toke Høiland-Jørgensen wrote:
> An XDP program can redirect packets between interfaces using either the
> xdp_redirect() helper or the xdp_redirect_map() helper. Apart from the
> flexibility of updating maps from userspace, the redirect_map() helper also
> uses the map structure to batch packets, which results in a significant
> (around 50%) performance boost. However, the xdp_redirect() API is simpler
> if one just wants to redirect to another interface, which means people tend
> to use this interface and then wonder why they get worse performance
> than expected.
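
For readers less familiar with the two redirect helpers this refers to, the
difference in use is roughly the following -- a minimal sketch, not part of
the patch, using the samples-style map definition; map name, ifindex and key
values are made up:

#include <linux/bpf.h>
#include "bpf_helpers.h"

struct bpf_map_def SEC("maps") tx_port = {
        .type = BPF_MAP_TYPE_DEVMAP,
        .key_size = sizeof(int),
        .value_size = sizeof(int),
        .max_entries = 64,
};

SEC("xdp_plain")
int xdp_redirect_plain(struct xdp_md *ctx)
{
        /* non-map helper: simple, but (before this patch) no bulking */
        return bpf_redirect(3 /* example ifindex */, 0);
}

SEC("xdp_map")
int xdp_redirect_via_map(struct xdp_md *ctx)
{
        /* map-based helper: key 0 must be populated from userspace;
         * packets are bulked per map, hence the higher throughput
         */
        return bpf_redirect_map(&tx_port, 0, 0);
}

char _license[] SEC("license") = "GPL";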
> 
> This patch seeks to close this performance difference between the two APIs.
> It achieves this by changing xdp_redirect() to use a hidden devmap for
> looking up destination interfaces, thus gaining the batching benefit with
> no visible difference from the user API point of view.
> 
> A hidden per-namespace map is allocated when an XDP program that uses the
> non-map xdp_redirect() helper is first loaded. This map is populated with
> all available interfaces in its namespace, and kept up to date as
> interfaces come and go. Once allocated, the map is kept around until the
> namespace is removed.
> 
> The hidden map uses the ifindex as map key, which means it is limited to
> ifindexes smaller than the map size of 64. A later patch introduces a new
> map type to lift this restriction.
> 
> Performance numbers:
> 
> Before patch:
> xdp_redirect:     5426035 pkt/s
> xdp_redirect_map: 8412754 pkt/s
> 
> After patch:
> xdp_redirect:     8314702 pkt/s
> xdp_redirect_map: 8411854 pkt/s
> 
> This corresponds to a 53% increase in xdp_redirect performance, or a
> reduction in per-packet processing time by 64 nanoseconds.
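
FWIW the numbers are internally consistent: 10^9/5426035 ≈ 184 ns/pkt before
vs 10^9/8314702 ≈ 120 ns/pkt after, which is where the ~64 ns and ~53%
figures come from.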
> 
> Signed-off-by: Toke Høiland-Jørgensen <t...@redhat.com>

> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index de18227b3d95..060c5850af3b 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -25,6 +25,7 @@ struct sock;
>  struct seq_file;
>  struct btf;
>  struct btf_type;
> +struct net;
>  
>  /* map is generic key/value storage optionally accesible by eBPF programs */
>  struct bpf_map_ops {
> @@ -533,6 +534,7 @@ extern const struct bpf_verifier_ops tc_cls_act_analyzer_ops;
>  extern const struct bpf_verifier_ops xdp_analyzer_ops;
>  
>  struct bpf_prog *bpf_prog_get(u32 ufd);
> +struct bpf_prog *bpf_prog_get_by_id(u32 id);
>  struct bpf_prog *bpf_prog_get_type_dev(u32 ufd, enum bpf_prog_type type,
>                                      bool attach_drv);
>  struct bpf_prog * __must_check bpf_prog_add(struct bpf_prog *prog, int i);
> @@ -612,6 +614,11 @@ struct xdp_buff;
>  struct sk_buff;
>  
>  struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key);
> +struct bpf_map *__dev_map_get_default_map(struct net_device *dev);
> +int dev_map_ensure_default_map(struct net *net);
> +void dev_map_put_default_map(struct net *net);
> +int dev_map_inc_redirect_count(void);
> +void dev_map_dec_redirect_count(void);
>  void __dev_map_insert_ctx(struct bpf_map *map, u32 index);
>  void __dev_map_flush(struct bpf_map *map);
>  int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp,
> @@ -641,6 +648,11 @@ static inline struct bpf_prog *bpf_prog_get(u32 ufd)
>       return ERR_PTR(-EOPNOTSUPP);
>  }
>  
> +static inline struct bpf_prog *bpf_prog_get_by_id(u32 id)
> +{
> +     return ERR_PTR(-EOPNOTSUPP);
> +}
> +
>  static inline struct bpf_prog *bpf_prog_get_type_dev(u32 ufd,
>                                                    enum bpf_prog_type type,
>                                                    bool attach_drv)
> @@ -693,6 +705,29 @@ static inline struct net_device *__dev_map_lookup_elem(struct bpf_map *map,
>       return NULL;
>  }
>  
> +static inline struct bpf_map *__dev_map_get_default_map(struct net_device *dev)
> +{
> +     return NULL;
> +}
> +
> +static inline int dev_map_ensure_default_map(struct net *net)
> +{
> +     return 0;
> +}
> +
> +static inline void dev_map_put_default_map(struct net *net)
> +{
> +}
> +
> +static inline int dev_map_inc_redirect_count(void)
> +{
> +     return 0;
> +}
> +
> +static inline void dev_map_dec_redirect_count(void)
> +{
> +}
> +
>  static inline void __dev_map_insert_ctx(struct bpf_map *map, u32 index)
>  {
>  }
> diff --git a/include/linux/filter.h b/include/linux/filter.h
> index 95e2d7ebdf21..dd6bbbab32f7 100644
> --- a/include/linux/filter.h
> +++ b/include/linux/filter.h
> @@ -507,6 +507,8 @@ struct bpf_prog {
>                               gpl_compatible:1, /* Is filter GPL compatible? */
>                               cb_access:1,    /* Is control block accessed? */
>                               dst_needed:1,   /* Do we need dst entry? */
> +                             redirect_needed:1,      /* Does program need access to xdp_redirect? */
> +                             redirect_used:1,        /* Does program use xdp_redirect? */
>                               blinded:1,      /* Was blinded */
>                               is_func:1,      /* program is a bpf function */
>                               kprobe_override:1, /* Do we override a kprobe? */
> diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
> index a68ced28d8f4..6706ecc25d8f 100644
> --- a/include/net/net_namespace.h
> +++ b/include/net/net_namespace.h
> @@ -162,7 +162,7 @@ struct net {
>  #if IS_ENABLED(CONFIG_CAN)
>       struct netns_can        can;
>  #endif
> -#ifdef CONFIG_XDP_SOCKETS
> +#ifdef CONFIG_BPF_SYSCALL
>       struct netns_xdp        xdp;
>  #endif
>       struct sock             *diag_nlsk;
> diff --git a/include/net/netns/xdp.h b/include/net/netns/xdp.h
> index e5734261ba0a..4935dfe1cf43 100644
> --- a/include/net/netns/xdp.h
> +++ b/include/net/netns/xdp.h
> @@ -4,10 +4,21 @@
>  
>  #include <linux/rculist.h>
>  #include <linux/mutex.h>
> +#include <linux/atomic.h>
> +
> +struct bpf_dtab;
> +
> +struct bpf_dtab_container {
> +     struct bpf_dtab __rcu *dtab;
> +     atomic_t refcnt;

refcount_t?  I'm not sure what the rules are for non-performance-critical
stuff..
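
For reference, the refcount_t variant would just be (sketch, not part of the
patch):

struct bpf_dtab_container {
        struct bpf_dtab __rcu *dtab;
        refcount_t refcnt;
};

with refcount_set()/refcount_inc_not_zero()/refcount_dec_and_test() taking
the place of the atomic_*() calls in devmap.c.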

> +};
>  
>  struct netns_xdp {
> +#ifdef CONFIG_XDP_SOCKETS
>       struct mutex            lock;
>       struct hlist_head       list;
> +#endif
> +     struct bpf_dtab_container default_map;
>  };
>  
>  #endif /* __NETNS_XDP_H__ */
> diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
> index 1037fc08c504..e55707e62b60 100644
> --- a/kernel/bpf/devmap.c
> +++ b/kernel/bpf/devmap.c
> @@ -56,6 +56,9 @@
>       (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
>  
>  #define DEV_MAP_BULK_SIZE 16
> +#define DEV_MAP_DEFAULT_SIZE 8
> +#define BPF_MAX_REFCNT 32768
> +
>  struct xdp_bulk_queue {
>       struct xdp_frame *q[DEV_MAP_BULK_SIZE];
>       struct net_device *dev_rx;
> @@ -80,6 +83,7 @@ struct bpf_dtab {
>  
>  static DEFINE_SPINLOCK(dev_map_lock);
>  static LIST_HEAD(dev_map_list);
> +static atomic_t global_redirect_use = {};

REFCOUNT_INIT(0) or ATOMIC_INIT(0)
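
i.e.:

static atomic_t global_redirect_use = ATOMIC_INIT(0);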

>  static u64 dev_map_bitmap_size(const union bpf_attr *attr)
>  {
> @@ -332,6 +336,18 @@ struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key)
>       return obj;
>  }
>  
> +/* This is only being called from xdp_do_redirect() if the xdp_redirect helper
> + * is used; the default map is allocated on XDP program load if the helper is
> + * used, so will always be available at this point.
> + */

Yet it does check for NULL..?

> +struct bpf_map *__dev_map_get_default_map(struct net_device *dev)
> +{
> +     struct net *net = dev_net(dev);
> +     struct bpf_dtab *dtab = rcu_dereference(net->xdp.default_map.dtab);

This init is long and breaks the reverse xmas tree rule; separate line,
perhaps?
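
i.e. something like:

struct bpf_map *__dev_map_get_default_map(struct net_device *dev)
{
        struct net *net = dev_net(dev);
        struct bpf_dtab *dtab;

        dtab = rcu_dereference(net->xdp.default_map.dtab);
        return dtab ? &dtab->map : NULL;
}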

> +     return dtab ? &dtab->map : NULL;
> +}
> +
>  /* Runs under RCU-read-side, plus in softirq under NAPI protection.
>   * Thus, safe percpu variable access.
>   */
> @@ -533,14 +549,210 @@ const struct bpf_map_ops dev_map_ops = {
>       .map_check_btf = map_check_no_btf,
>  };
>  
> +static inline struct net *bpf_default_map_to_net(struct bpf_dtab_container *cont)
> +{
> +     struct netns_xdp *xdp = container_of(cont, struct netns_xdp, default_map);
> +
> +     return container_of(xdp, struct net, xdp);
> +}
> +
> +static void __dev_map_release_default_map(struct bpf_dtab_container *cont)
> +{
> +     struct bpf_dtab *dtab = NULL;
> +
> +     lockdep_assert_held(&dev_map_lock);
> +
> +     dtab = rcu_dereference(cont->dtab);
> +     if (dtab) {

How can it be NULL if it's refcounted? hm..

> +             list_del_rcu(&dtab->list);
> +             rcu_assign_pointer(cont->dtab, NULL);
> +             bpf_clear_redirect_map(&dtab->map);
> +             call_rcu(&dtab->rcu, __dev_map_free);
> +     }
> +}
> +
> +void dev_map_put_default_map(struct net *net)
> +{
> +     if (atomic_dec_and_test(&net->xdp.default_map.refcnt)) {
> +             spin_lock(&dev_map_lock);
> +             __dev_map_release_default_map(&net->xdp.default_map);
> +             spin_unlock(&dev_map_lock);
> +     }
> +}
> +
> +static int __init_default_map(struct bpf_dtab_container *cont)
> +{
> +     struct net *net = bpf_default_map_to_net(cont);
> +     struct bpf_dtab *dtab, *old_dtab;
> +     int size = DEV_MAP_DEFAULT_SIZE;
> +     struct net_device *netdev;
> +     union bpf_attr attr = {};
> +     u32 idx;
> +     int err;
> +
> +     lockdep_assert_held(&dev_map_lock);
> +
> +     if (!atomic_read(&global_redirect_use))
> +             return 0;
> +
> +     for_each_netdev(net, netdev)
> +             if (netdev->ifindex >= size)
> +                     size <<= 1;
> +
> +     old_dtab = rcu_dereference(cont->dtab);
> +     if (old_dtab && old_dtab->map.max_entries == size)
> +             return 0;
> +
> +     dtab = kzalloc(sizeof(*dtab), GFP_USER);

You are calling this with a spin lock held :(
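
If the allocation really has to happen under dev_map_lock, the minimal
band-aid is a non-sleeping allocation:

        dtab = kzalloc(sizeof(*dtab), GFP_ATOMIC);

but dev_map_init_map() presumably allocates as well, so doing all the
allocations before taking the spinlock and only publishing the pointer under
it is probably the cleaner fix.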

> +     if (!dtab)
> +             return -ENOMEM;
> +
> +     attr.map_type = BPF_MAP_TYPE_DEVMAP;
> +     attr.max_entries = size;
> +     attr.value_size = 4;
> +     attr.key_size = 4;
> +
> +     err = dev_map_init_map(dtab, &attr, false);
> +     if (err) {
> +             kfree(dtab);
> +             return err;
> +     }
> +
> +     for_each_netdev(net, netdev) {
> +             idx = netdev->ifindex;
> +             err = __dev_map_update_elem(net, &dtab->map, &idx, &idx, 0);
> +             if (err) {
> +                     __dev_map_free(&dtab->rcu);

This map wasn't visible yet, so probably no need to go through RCU? 
Not that it matters much for an error path..

> +                     return err;
> +             }
> +     }
> +
> +     rcu_assign_pointer(cont->dtab, dtab);
> +     list_add_tail_rcu(&dtab->list, &dev_map_list);
> +
> +     if (old_dtab) {
> +             list_del_rcu(&old_dtab->list);
> +             bpf_clear_redirect_map(&old_dtab->map);
> +             call_rcu(&old_dtab->rcu, __dev_map_free);
> +     }
> +
> +     return 0;
> +}
> +

> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> index 1b9496c41383..f1b2f01e7ca1 100644
> --- a/kernel/bpf/verifier.c
> +++ b/kernel/bpf/verifier.c
> @@ -7609,6 +7609,17 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
>                       prog->dst_needed = 1;
>               if (insn->imm == BPF_FUNC_get_prandom_u32)
>                       bpf_user_rnd_init_once();
> +             if (insn->imm == BPF_FUNC_redirect) {
> +                     prog->redirect_needed = true;

nit: this field is a bitfield, not a boolean

> +                     if (!prog->redirect_used) {
> +                             int err = dev_map_inc_redirect_count();

Don't call complex functions in variable initializers, please

> +                             if (err)
> +                                     return err;
> +                             prog->redirect_used = true;
> +                     }
> +             }
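
Putting the two nits above together, something like (untested sketch):

if (insn->imm == BPF_FUNC_redirect) {
        int err;

        prog->redirect_needed = 1;
        if (!prog->redirect_used) {
                err = dev_map_inc_redirect_count();
                if (err)
                        return err;
                prog->redirect_used = 1;
        }
}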
> +
>               if (insn->imm == BPF_FUNC_override_return)
>                       prog->kprobe_override = 1;
>               if (insn->imm == BPF_FUNC_tail_call) {
> @@ -7618,6 +7629,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
>                        * the program array.
>                        */
>                       prog->cb_access = 1;
> +                     prog->redirect_needed = true;
>                       env->prog->aux->stack_depth = MAX_BPF_STACK;
>                       env->prog->aux->max_pkt_offset = MAX_PACKET_OFF;
>  
> diff --git a/net/core/dev.c b/net/core/dev.c
> index 2b67f2aa59dd..1df20d529026 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -7990,6 +7990,21 @@ u32 __dev_xdp_query(struct net_device *dev, bpf_op_t bpf_op,
>       return xdp.prog_id;
>  }
>  
> +static struct bpf_prog *dev_xdp_get_prog(struct net_device *dev)
> +{
> +     struct bpf_prog *prog;
> +     u32 prog_id;
> +
> +     prog_id = __dev_xdp_query(dev, dev->netdev_ops->ndo_bpf, XDP_QUERY_PROG);

if (!prog_id)
        return NULL;

...
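
Spelled out, the early-return version would be something like:

static struct bpf_prog *dev_xdp_get_prog(struct net_device *dev)
{
        struct bpf_prog *prog;
        u32 prog_id;

        prog_id = __dev_xdp_query(dev, dev->netdev_ops->ndo_bpf,
                                  XDP_QUERY_PROG);
        if (!prog_id)
                return NULL;

        prog = bpf_prog_get_by_id(prog_id);
        if (IS_ERR(prog))
                return NULL;

        return prog;
}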

> +     if (prog_id) {
> +             prog = bpf_prog_get_by_id(prog_id);
> +             if (!IS_ERR(prog))
> +                     return prog;
> +     }
> +
> +     return NULL;
> +}
> +
>  static int dev_xdp_install(struct net_device *dev, bpf_op_t bpf_op,
>                          struct netlink_ext_ack *extack, u32 flags,
>                          struct bpf_prog *prog)
> @@ -8024,9 +8039,18 @@ static void dev_xdp_uninstall(struct net_device *dev)
>       memset(&xdp, 0, sizeof(xdp));
>       xdp.command = XDP_QUERY_PROG;
>       WARN_ON(ndo_bpf(dev, &xdp));
> -     if (xdp.prog_id)
> +     if (xdp.prog_id) {
> +             struct bpf_prog *prog = bpf_prog_get_by_id(xdp.prog_id);
> +
> +             if (!IS_ERR(prog)) {
> +                     if (prog->redirect_needed)
> +                             dev_map_put_default_map(dev_net(dev));
> +                     bpf_prog_put(prog);
> +             }
> +
>               WARN_ON(dev_xdp_install(dev, ndo_bpf, NULL, xdp.prog_flags,
>                                       NULL));
> +     }
>  
>       /* Remove HW offload */
>       memset(&xdp, 0, sizeof(xdp));
> @@ -8091,6 +8115,23 @@ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack,
>                       bpf_prog_put(prog);
>                       return -EINVAL;
>               }
> +
> +             if (!offload && bpf_op == ops->ndo_bpf &&
> +                 prog->redirect_needed) {
> +                     err = dev_map_ensure_default_map(dev_net(dev));
> +                     if (err) {
> +                             NL_SET_ERR_MSG(extack, "unable to allocate default map for xdp_redirect()");
> +                             return err;
> +                     }
> +             }
> +     } else {
> +             struct bpf_prog *old_prog = dev_xdp_get_prog(dev);
> +
> +             if (old_prog) {
> +                     if (old_prog->redirect_needed)
> +                             dev_map_put_default_map(dev_net(dev));

I think you should hold the ref to this program and only release the
map _after_ the new program has been installed.

You also have to do this when replacing one program with a different one.

Could you please add tests for the replacement combinations?
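
Roughly this ordering (sketch only; error handling and the offload case
elided):

old_prog = dev_xdp_get_prog(dev);

err = dev_xdp_install(dev, bpf_op, extack, flags, prog);

/* The old program held a reference on the default map; drop it only
 * once the new program is in place.
 */
if (!err && old_prog && old_prog->redirect_needed)
        dev_map_put_default_map(dev_net(dev));

if (old_prog)
        bpf_prog_put(old_prog);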

> +                     bpf_prog_put(old_prog);
> +             }
>       }

>       err = dev_xdp_install(dev, bpf_op, extack, flags, prog);
> @@ -9333,6 +9374,7 @@ EXPORT_SYMBOL(unregister_netdev);
>  int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
>  {
>       int err, new_nsid, new_ifindex;
> +     struct bpf_prog *prog = NULL;
>  
>       ASSERT_RTNL();
>  
> @@ -9350,6 +9392,15 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char
>       if (net_eq(dev_net(dev), net))
>               goto out;
>  
> +     prog = dev_xdp_get_prog(dev);
> +     if (prog) {
> +             if (prog->redirect_needed)
> +                     err = dev_map_ensure_default_map(net);
> +
> +             if (err)
> +                     goto out;
> +     }

prog = dev_xdp_get_prog(dev);
if (prog && prog->redirect_needed) {
        err = dev_map_ensure_default_map(net);
        if (err)
                goto out;
}

>       /* Pick the destination device name, and ensure
>        * we can use it in the destination network namespace.
>        */
> @@ -9388,6 +9439,9 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char
>       call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
>       rcu_barrier();
>  
> +     if (prog && prog->redirect_needed)
> +             dev_map_put_default_map(dev_net(dev));
> +
>       new_nsid = peernet2id_alloc(dev_net(dev), net);
>       /* If there is an ifindex conflict assign a new one */
>       if (__dev_get_by_index(net, dev->ifindex))
> @@ -9435,6 +9489,9 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char
>       synchronize_net();
>       err = 0;
>  out:
> +     if (prog)
> +             bpf_prog_put(prog);
> +
>       return err;
>  }
>  EXPORT_SYMBOL_GPL(dev_change_net_namespace);
