Re: [bpf-next, v2 1/3] flow_dissector: implements flow dissector BPF hook

Alexei Starovoitov Tue, 11 Sep 2018 20:47:18 -0700

On Fri, Sep 07, 2018 at 05:11:08PM -0700, Petar Penkov wrote:
> From: Petar Penkov <ppen...@google.com>
> 
> Adds a hook for programs of type BPF_PROG_TYPE_FLOW_DISSECTOR and
> attach type BPF_FLOW_DISSECTOR that is executed in the flow dissector
> path. The BPF program is per-network namespace.
> 
> Signed-off-by: Petar Penkov <ppen...@google.com>
> Signed-off-by: Willem de Bruijn <will...@google.com>
> ---
>  include/linux/bpf.h            |   1 +
>  include/linux/bpf_types.h      |   1 +
>  include/linux/skbuff.h         |   7 ++
>  include/net/net_namespace.h    |   3 +
>  include/net/sch_generic.h      |  12 ++-
>  include/uapi/linux/bpf.h       |  25 ++++++
>  kernel/bpf/syscall.c           |   8 ++
>  kernel/bpf/verifier.c          |  32 ++++++++
>  net/core/filter.c              |  67 ++++++++++++++++
>  net/core/flow_dissector.c      | 136 +++++++++++++++++++++++++++++++++
>  tools/bpf/bpftool/prog.c       |   1 +
>  tools/include/uapi/linux/bpf.h |  25 ++++++
>  tools/lib/bpf/libbpf.c         |   2 +


Please split the update to tools/include/uapi/linux/bpf.h out as a separate patch 2.
We often have conflicts in there, so it is best to keep it separate.
Also please split the tools/lib and tools/bpf changes into patch 3.

>  13 files changed, 317 insertions(+), 3 deletions(-)
> 
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index 523481a3471b..988a00797bcd 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -212,6 +212,7 @@ enum bpf_reg_type {
>       PTR_TO_PACKET_META,      /* skb->data - meta_len */
>       PTR_TO_PACKET,           /* reg points to skb->data */
>       PTR_TO_PACKET_END,       /* skb->data + headlen */
> +     PTR_TO_FLOW_KEYS,        /* reg points to bpf_flow_keys */
>  };
>  
>  /* The information passed from prog-specific *_is_valid_access
> diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
> index cd26c090e7c0..22083712dd18 100644
> --- a/include/linux/bpf_types.h
> +++ b/include/linux/bpf_types.h
> @@ -32,6 +32,7 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_LIRC_MODE2, lirc_mode2)
>  #ifdef CONFIG_INET
>  BPF_PROG_TYPE(BPF_PROG_TYPE_SK_REUSEPORT, sk_reuseport)
>  #endif
> +BPF_PROG_TYPE(BPF_PROG_TYPE_FLOW_DISSECTOR, flow_dissector)
>  
>  BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY, array_map_ops)
>  BPF_MAP_TYPE(BPF_MAP_TYPE_PERCPU_ARRAY, percpu_array_map_ops)
> diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
> index 17a13e4785fc..ce0e863f02a2 100644
> --- a/include/linux/skbuff.h
> +++ b/include/linux/skbuff.h
> @@ -243,6 +243,8 @@ struct scatterlist;
>  struct pipe_inode_info;
>  struct iov_iter;
>  struct napi_struct;
> +struct bpf_prog;
> +union bpf_attr;
>  
>  #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
>  struct nf_conntrack {
> @@ -1192,6 +1194,11 @@ void skb_flow_dissector_init(struct flow_dissector 
> *flow_dissector,
>                            const struct flow_dissector_key *key,
>                            unsigned int key_count);
>  
> +int skb_flow_dissector_bpf_prog_attach(const union bpf_attr *attr,
> +                                    struct bpf_prog *prog);
> +
> +int skb_flow_dissector_bpf_prog_detach(const union bpf_attr *attr);
> +
>  bool __skb_flow_dissect(const struct sk_buff *skb,
>                       struct flow_dissector *flow_dissector,
>                       void *target_container,
> diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
> index 9b5fdc50519a..99d4148e0f90 100644
> --- a/include/net/net_namespace.h
> +++ b/include/net/net_namespace.h
> @@ -43,6 +43,7 @@ struct ctl_table_header;
>  struct net_generic;
>  struct uevent_sock;
>  struct netns_ipvs;
> +struct bpf_prog;
>  
>  
>  #define NETDEV_HASHBITS    8
> @@ -145,6 +146,8 @@ struct net {
>  #endif
>       struct net_generic __rcu        *gen;
>  
> +     struct bpf_prog __rcu   *flow_dissector_prog;
> +
>       /* Note : following structs are cache line aligned */
>  #ifdef CONFIG_XFRM
>       struct netns_xfrm       xfrm;
> diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
> index a6d00093f35e..1b81ba85fd2d 100644
> --- a/include/net/sch_generic.h
> +++ b/include/net/sch_generic.h
> @@ -19,6 +19,7 @@ struct Qdisc_ops;
>  struct qdisc_walker;
>  struct tcf_walker;
>  struct module;
> +struct bpf_flow_keys;
>  
>  typedef int tc_setup_cb_t(enum tc_setup_type type,
>                         void *type_data, void *cb_priv);
> @@ -307,9 +308,14 @@ struct tcf_proto {
>  };
>  
>  struct qdisc_skb_cb {
> -     unsigned int            pkt_len;
> -     u16                     slave_dev_queue_mapping;
> -     u16                     tc_classid;
> +     union {
> +             struct {
> +                     unsigned int            pkt_len;
> +                     u16                     slave_dev_queue_mapping;
> +                     u16                     tc_classid;
> +             };
> +             struct bpf_flow_keys *flow_keys;
> +     };

Is this magic really necessary? flow_dissector runs very early in the recv path.
There is no qdisc, and no conflict with tcp/ip use of cb.
I think the whole cb block can be used.

>  #define QDISC_CB_PRIV_LEN 20
>       unsigned char           data[QDISC_CB_PRIV_LEN];
>  };
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index 66917a4eba27..3064706fcaaa 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -152,6 +152,7 @@ enum bpf_prog_type {
>       BPF_PROG_TYPE_LWT_SEG6LOCAL,
>       BPF_PROG_TYPE_LIRC_MODE2,
>       BPF_PROG_TYPE_SK_REUSEPORT,
> +     BPF_PROG_TYPE_FLOW_DISSECTOR,
>  };
>  
>  enum bpf_attach_type {
> @@ -172,6 +173,7 @@ enum bpf_attach_type {
>       BPF_CGROUP_UDP4_SENDMSG,
>       BPF_CGROUP_UDP6_SENDMSG,
>       BPF_LIRC_MODE2,
> +     BPF_FLOW_DISSECTOR,
>       __MAX_BPF_ATTACH_TYPE
>  };
>  
> @@ -2333,6 +2335,7 @@ struct __sk_buff {
>       /* ... here. */
>  
>       __u32 data_meta;
> +     __u32 flow_keys;

please use
struct bpf_flow_keys *flow_keys;
instead.

See what we did in 'struct sk_msg_md' and in 'struct sk_reuseport_md'.
There is no need to hide pointers in u32.

>  };
>  
>  struct bpf_tunnel_key {
> @@ -2778,4 +2781,26 @@ enum bpf_task_fd_type {
>       BPF_FD_TYPE_URETPROBE,          /* filename + offset */
>  };
>  
> +struct bpf_flow_keys {
> +     __u16   thoff;
> +     __u16   addr_proto;                     /* ETH_P_* of valid addrs */
> +     __u8    is_frag;
> +     __u8    is_first_frag;
> +     __u8    is_encap;
> +     __be16  n_proto;
> +     __u8    ip_proto;
> +     union {
> +             struct {
> +                     __be32  ipv4_src;
> +                     __be32  ipv4_dst;
> +             };
> +             struct {
> +                     __u32   ipv6_src[4];    /* in6_addr; network order */
> +                     __u32   ipv6_dst[4];    /* in6_addr; network order */
> +             };
> +     };
> +     __be16  sport;
> +     __be16  dport;
> +};
> +
>  #endif /* _UAPI__LINUX_BPF_H__ */
> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
> index 3c9636f03bb2..b3c2d09bcf7a 100644
> --- a/kernel/bpf/syscall.c
> +++ b/kernel/bpf/syscall.c
> @@ -1615,6 +1615,9 @@ static int bpf_prog_attach(const union bpf_attr *attr)
>       case BPF_LIRC_MODE2:
>               ptype = BPF_PROG_TYPE_LIRC_MODE2;
>               break;
> +     case BPF_FLOW_DISSECTOR:
> +             ptype = BPF_PROG_TYPE_FLOW_DISSECTOR;
> +             break;
>       default:
>               return -EINVAL;
>       }
> @@ -1636,6 +1639,9 @@ static int bpf_prog_attach(const union bpf_attr *attr)
>       case BPF_PROG_TYPE_LIRC_MODE2:
>               ret = lirc_prog_attach(attr, prog);
>               break;
> +     case BPF_PROG_TYPE_FLOW_DISSECTOR:
> +             ret = skb_flow_dissector_bpf_prog_attach(attr, prog);
> +             break;
>       default:
>               ret = cgroup_bpf_prog_attach(attr, ptype, prog);
>       }
> @@ -1688,6 +1694,8 @@ static int bpf_prog_detach(const union bpf_attr *attr)
>               return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_SKB, NULL);
>       case BPF_LIRC_MODE2:
>               return lirc_prog_detach(attr);
> +     case BPF_FLOW_DISSECTOR:
> +             return skb_flow_dissector_bpf_prog_detach(attr);
>       default:
>               return -EINVAL;
>       }
> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> index 6ff1bac1795d..8ccbff4fff93 100644
> --- a/kernel/bpf/verifier.c
> +++ b/kernel/bpf/verifier.c
> @@ -261,6 +261,7 @@ static const char * const reg_type_str[] = {
>       [PTR_TO_PACKET]         = "pkt",
>       [PTR_TO_PACKET_META]    = "pkt_meta",
>       [PTR_TO_PACKET_END]     = "pkt_end",
> +     [PTR_TO_FLOW_KEYS]      = "flow_keys",
>  };
>  
>  static char slot_type_char[] = {
> @@ -965,6 +966,7 @@ static bool is_spillable_regtype(enum bpf_reg_type type)
>       case PTR_TO_PACKET:
>       case PTR_TO_PACKET_META:
>       case PTR_TO_PACKET_END:
> +     case PTR_TO_FLOW_KEYS:
>       case CONST_PTR_TO_MAP:
>               return true;
>       default:
> @@ -1238,6 +1240,7 @@ static bool may_access_direct_pkt_data(struct 
> bpf_verifier_env *env,
>       case BPF_PROG_TYPE_LWT_XMIT:
>       case BPF_PROG_TYPE_SK_SKB:
>       case BPF_PROG_TYPE_SK_MSG:
> +     case BPF_PROG_TYPE_FLOW_DISSECTOR:
>               if (meta)
>                       return meta->pkt_access;
>  
> @@ -1321,6 +1324,18 @@ static int check_ctx_access(struct bpf_verifier_env 
> *env, int insn_idx, int off,
>       return -EACCES;
>  }
>  
> +static int check_flow_keys_access(struct bpf_verifier_env *env, int off,
> +                               int size)
> +{
> +     if (size < 0 || off < 0 ||
> +         (u64)off + size > sizeof(struct bpf_flow_keys)) {
> +             verbose(env, "invalid access to flow keys off=%d size=%d\n",
> +                     off, size);
> +             return -EACCES;
> +     }
> +     return 0;
> +}
> +
>  static bool __is_pointer_value(bool allow_ptr_leaks,
>                              const struct bpf_reg_state *reg)
>  {
> @@ -1422,6 +1437,9 @@ static int check_ptr_alignment(struct bpf_verifier_env 
> *env,
>                * right in front, treat it the very same way.
>                */
>               return check_pkt_ptr_alignment(env, reg, off, size, strict);
> +     case PTR_TO_FLOW_KEYS:
> +             pointer_desc = "flow keys ";
> +             break;
>       case PTR_TO_MAP_VALUE:
>               pointer_desc = "value ";
>               break;
> @@ -1692,6 +1710,17 @@ static int check_mem_access(struct bpf_verifier_env 
> *env, int insn_idx, u32 regn
>               err = check_packet_access(env, regno, off, size, false);
>               if (!err && t == BPF_READ && value_regno >= 0)
>                       mark_reg_unknown(env, regs, value_regno);
> +     } else if (reg->type == PTR_TO_FLOW_KEYS) {
> +             if (t == BPF_WRITE && value_regno >= 0 &&
> +                 is_pointer_value(env, value_regno)) {
> +                     verbose(env, "R%d leaks addr into flow keys\n",
> +                             value_regno);
> +                     return -EACCES;
> +             }
> +
> +             err = check_flow_keys_access(env, off, size);
> +             if (!err && t == BPF_READ && value_regno >= 0)
> +                     mark_reg_unknown(env, regs, value_regno);
>       } else {
>               verbose(env, "R%d invalid mem access '%s'\n", regno,
>                       reg_type_str[reg->type]);
> @@ -1839,6 +1868,8 @@ static int check_helper_mem_access(struct 
> bpf_verifier_env *env, int regno,
>       case PTR_TO_PACKET_META:
>               return check_packet_access(env, regno, reg->off, access_size,
>                                          zero_size_allowed);
> +     case PTR_TO_FLOW_KEYS:
> +             return check_flow_keys_access(env, reg->off, access_size);
>       case PTR_TO_MAP_VALUE:
>               return check_map_access(env, regno, reg->off, access_size,
>                                       zero_size_allowed);
> @@ -4366,6 +4397,7 @@ static bool regsafe(struct bpf_reg_state *rold, struct 
> bpf_reg_state *rcur,
>       case PTR_TO_CTX:
>       case CONST_PTR_TO_MAP:
>       case PTR_TO_PACKET_END:
> +     case PTR_TO_FLOW_KEYS:
>               /* Only valid matches are exact, which memcmp() above
>                * would have accepted
>                */
> diff --git a/net/core/filter.c b/net/core/filter.c
> index 8cb242b4400f..bc3725c26794 100644
> --- a/net/core/filter.c
> +++ b/net/core/filter.c
> @@ -5122,6 +5122,17 @@ sk_skb_func_proto(enum bpf_func_id func_id, const 
> struct bpf_prog *prog)
>       }
>  }
>  
> +static const struct bpf_func_proto *
> +flow_dissector_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
> +{
> +     switch (func_id) {
> +     case BPF_FUNC_skb_load_bytes:
> +             return &bpf_skb_load_bytes_proto;
> +     default:
> +             return bpf_base_func_proto(func_id);
> +     }
> +}
> +
>  static const struct bpf_func_proto *
>  lwt_out_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
>  {
> @@ -5237,6 +5248,7 @@ static bool bpf_skb_is_valid_access(int off, int size, 
> enum bpf_access_type type
>       case bpf_ctx_range(struct __sk_buff, data):
>       case bpf_ctx_range(struct __sk_buff, data_meta):
>       case bpf_ctx_range(struct __sk_buff, data_end):
> +     case bpf_ctx_range(struct __sk_buff, flow_keys):
>               if (size != size_default)
>                       return false;
>               break;
> @@ -5265,6 +5277,7 @@ static bool sk_filter_is_valid_access(int off, int size,
>       case bpf_ctx_range(struct __sk_buff, data):
>       case bpf_ctx_range(struct __sk_buff, data_meta):
>       case bpf_ctx_range(struct __sk_buff, data_end):
> +     case bpf_ctx_range(struct __sk_buff, flow_keys):
>       case bpf_ctx_range_till(struct __sk_buff, family, local_port):
>               return false;
>       }
> @@ -5290,6 +5303,7 @@ static bool lwt_is_valid_access(int off, int size,
>       case bpf_ctx_range(struct __sk_buff, tc_classid):
>       case bpf_ctx_range_till(struct __sk_buff, family, local_port):
>       case bpf_ctx_range(struct __sk_buff, data_meta):
> +     case bpf_ctx_range(struct __sk_buff, flow_keys):
>               return false;
>       }
>  
> @@ -5500,6 +5514,7 @@ static bool tc_cls_act_is_valid_access(int off, int 
> size,
>       case bpf_ctx_range(struct __sk_buff, data_end):
>               info->reg_type = PTR_TO_PACKET_END;
>               break;
> +     case bpf_ctx_range(struct __sk_buff, flow_keys):
>       case bpf_ctx_range_till(struct __sk_buff, family, local_port):
>               return false;
>       }
> @@ -5701,6 +5716,7 @@ static bool sk_skb_is_valid_access(int off, int size,
>       switch (off) {
>       case bpf_ctx_range(struct __sk_buff, tc_classid):
>       case bpf_ctx_range(struct __sk_buff, data_meta):
> +     case bpf_ctx_range(struct __sk_buff, flow_keys):
>               return false;
>       }
>  
> @@ -5760,6 +5776,39 @@ static bool sk_msg_is_valid_access(int off, int size,
>       return true;
>  }
>  
> +static bool flow_dissector_is_valid_access(int off, int size,
> +                                        enum bpf_access_type type,
> +                                        const struct bpf_prog *prog,
> +                                        struct bpf_insn_access_aux *info)
> +{
> +     if (type == BPF_WRITE) {
> +             switch (off) {
> +             case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]):
> +                     break;
> +             default:
> +                     return false;
> +             }
> +     }
> +
> +     switch (off) {
> +     case bpf_ctx_range(struct __sk_buff, data):
> +             info->reg_type = PTR_TO_PACKET;
> +             break;
> +     case bpf_ctx_range(struct __sk_buff, data_end):
> +             info->reg_type = PTR_TO_PACKET_END;
> +             break;
> +     case bpf_ctx_range(struct __sk_buff, flow_keys):
> +             info->reg_type = PTR_TO_FLOW_KEYS;
> +             break;
> +     case bpf_ctx_range(struct __sk_buff, tc_classid):
> +     case bpf_ctx_range(struct __sk_buff, data_meta):
> +     case bpf_ctx_range_till(struct __sk_buff, family, local_port):
> +             return false;
> +     }
> +
> +     return bpf_skb_is_valid_access(off, size, type, prog, info);
> +}
> +
>  static u32 bpf_convert_ctx_access(enum bpf_access_type type,
>                                 const struct bpf_insn *si,
>                                 struct bpf_insn *insn_buf,
> @@ -6054,6 +6103,15 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type 
> type,
>                                     bpf_target_off(struct sock_common,
>                                                    skc_num, 2, target_size));
>               break;
> +
> +     case offsetof(struct __sk_buff, flow_keys):
> +             off  = si->off;
> +             off -= offsetof(struct __sk_buff, flow_keys);
> +             off += offsetof(struct sk_buff, cb);
> +             off += offsetof(struct qdisc_skb_cb, flow_keys);
> +             *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg,
> +                                   si->src_reg, off);
> +             break;
>       }
>  
>       return insn - insn_buf;
> @@ -7017,6 +7075,15 @@ const struct bpf_verifier_ops sk_msg_verifier_ops = {
>  const struct bpf_prog_ops sk_msg_prog_ops = {
>  };
>  
> +const struct bpf_verifier_ops flow_dissector_verifier_ops = {
> +     .get_func_proto         = flow_dissector_func_proto,
> +     .is_valid_access        = flow_dissector_is_valid_access,
> +     .convert_ctx_access     = bpf_convert_ctx_access,
> +};
> +
> +const struct bpf_prog_ops flow_dissector_prog_ops = {
> +};
> +
>  int sk_detach_filter(struct sock *sk)
>  {
>       int ret = -ENOENT;
> diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
> index ce9eeeb7c024..7eed48c46a94 100644
> --- a/net/core/flow_dissector.c
> +++ b/net/core/flow_dissector.c
> @@ -25,6 +25,9 @@
>  #include <net/flow_dissector.h>
>  #include <scsi/fc/fc_fcoe.h>
>  #include <uapi/linux/batadv_packet.h>
> +#include <linux/bpf.h>
> +
> +static DEFINE_MUTEX(flow_dissector_mutex);
>  
>  static void dissector_set_key(struct flow_dissector *flow_dissector,
>                             enum flow_dissector_key_id key_id)
> @@ -62,6 +65,44 @@ void skb_flow_dissector_init(struct flow_dissector 
> *flow_dissector,
>  }
>  EXPORT_SYMBOL(skb_flow_dissector_init);
>  
> +int skb_flow_dissector_bpf_prog_attach(const union bpf_attr *attr,
> +                                    struct bpf_prog *prog)
> +{
> +     struct bpf_prog *attached;
> +     struct net *net;
> +
> +     net = current->nsproxy->net_ns;
> +     mutex_lock(&flow_dissector_mutex);
> +     attached = rcu_dereference_protected(net->flow_dissector_prog,
> +                                          lockdep_is_held(&flow_dissector_mutex));
> +     if (attached) {
> +             /* Only one BPF program can be attached at a time */
> +             mutex_unlock(&flow_dissector_mutex);
> +             return -EEXIST;
> +     }
> +     rcu_assign_pointer(net->flow_dissector_prog, prog);
> +     mutex_unlock(&flow_dissector_mutex);
> +     return 0;
> +}
> +
> +int skb_flow_dissector_bpf_prog_detach(const union bpf_attr *attr)
> +{
> +     struct bpf_prog *attached;
> +     struct net *net;
> +
> +     net = current->nsproxy->net_ns;
> +     mutex_lock(&flow_dissector_mutex);
> +     attached = rcu_dereference_protected(net->flow_dissector_prog,
> +                                          lockdep_is_held(&flow_dissector_mutex));
> +     if (!attached) {
> +             mutex_unlock(&flow_dissector_mutex);
> +             return -ENOENT;
> +     }
> +     bpf_prog_put(attached);
> +     RCU_INIT_POINTER(net->flow_dissector_prog, NULL);
> +     mutex_unlock(&flow_dissector_mutex);
> +     return 0;
> +}
>  /**
>   * skb_flow_get_be16 - extract be16 entity
>   * @skb: sk_buff to extract from
> @@ -588,6 +629,60 @@ static bool skb_flow_dissect_allowed(int *num_hdrs)
>       return (*num_hdrs <= MAX_FLOW_DISSECT_HDRS);
>  }
>  
> +static void __skb_flow_bpf_to_target(const struct bpf_flow_keys *flow_keys,
> +                                  struct flow_dissector *flow_dissector,
> +                                  void *target_container)
> +{
> +     struct flow_dissector_key_control *key_control;
> +     struct flow_dissector_key_basic *key_basic;
> +     struct flow_dissector_key_addrs *key_addrs;
> +     struct flow_dissector_key_ports *key_ports;
> +
> +     key_control = skb_flow_dissector_target(flow_dissector,
> +                                             FLOW_DISSECTOR_KEY_CONTROL,
> +                                             target_container);
> +     key_control->thoff = flow_keys->thoff;
> +     if (flow_keys->is_frag)
> +             key_control->flags |= FLOW_DIS_IS_FRAGMENT;
> +     if (flow_keys->is_first_frag)
> +             key_control->flags |= FLOW_DIS_FIRST_FRAG;
> +     if (flow_keys->is_encap)
> +             key_control->flags |= FLOW_DIS_ENCAPSULATION;
> +
> +     key_basic = skb_flow_dissector_target(flow_dissector,
> +                                           FLOW_DISSECTOR_KEY_BASIC,
> +                                           target_container);
> +     key_basic->n_proto = flow_keys->n_proto;
> +     key_basic->ip_proto = flow_keys->ip_proto;
> +
> +     if (flow_keys->addr_proto == ETH_P_IP &&
> +         dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_IPV4_ADDRS)) {
> +             key_addrs = skb_flow_dissector_target(flow_dissector,
> +                                                   FLOW_DISSECTOR_KEY_IPV4_ADDRS,
> +                                                   target_container);
> +             key_addrs->v4addrs.src = flow_keys->ipv4_src;
> +             key_addrs->v4addrs.dst = flow_keys->ipv4_dst;
> +             key_control->addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
> +     } else if (flow_keys->addr_proto == ETH_P_IPV6 &&
> +                dissector_uses_key(flow_dissector,
> +                                   FLOW_DISSECTOR_KEY_IPV6_ADDRS)) {
> +             key_addrs = skb_flow_dissector_target(flow_dissector,
> +                                                   FLOW_DISSECTOR_KEY_IPV6_ADDRS,
> +                                                   target_container);
> +             memcpy(&key_addrs->v6addrs, &flow_keys->ipv6_src,
> +                    sizeof(key_addrs->v6addrs));
> +             key_control->addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
> +     }
> +
> +     if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_PORTS)) {
> +             key_ports = skb_flow_dissector_target(flow_dissector,
> +                                                   FLOW_DISSECTOR_KEY_PORTS,
> +                                                   target_container);
> +             key_ports->src = flow_keys->sport;
> +             key_ports->dst = flow_keys->dport;
> +     }
> +}
> +
>  /**
>   * __skb_flow_dissect - extract the flow_keys struct and return it
>   * @skb: sk_buff to extract the flow from, can be NULL if the rest are specified
> @@ -619,6 +714,7 @@ bool __skb_flow_dissect(const struct sk_buff *skb,
>       struct flow_dissector_key_vlan *key_vlan;
>       enum flow_dissect_ret fdret;
>       enum flow_dissector_key_id dissector_vlan = FLOW_DISSECTOR_KEY_MAX;
> +     struct bpf_prog *attached;
>       int num_hdrs = 0;
>       u8 ip_proto = 0;
>       bool ret;
> @@ -658,6 +754,46 @@ bool __skb_flow_dissect(const struct sk_buff *skb,
>                                             FLOW_DISSECTOR_KEY_BASIC,
>                                             target_container);
>  
> +     rcu_read_lock();
> +     attached = skb ? rcu_dereference(dev_net(skb->dev)->flow_dissector_prog)
> +                    : NULL;
> +     if (attached) {
> +             /* Note that even though the const qualifier is discarded
> +              * throughout the execution of the BPF program, all changes(the
> +              * control block) are reverted after the BPF program returns.
> +              * Therefore, __skb_flow_dissect does not alter the skb.
> +              */
> +             struct bpf_flow_keys flow_keys = {};
> +             struct qdisc_skb_cb cb_saved;
> +             struct qdisc_skb_cb *cb;
> +             u16 *pseudo_cb;
> +             u32 result;
> +
> +             cb = qdisc_skb_cb(skb);
> +             pseudo_cb = (u16 *)bpf_skb_cb((struct sk_buff *)skb);
> +
> +             /* Save Control Block */
> +             memcpy(&cb_saved, cb, sizeof(cb_saved));
> +             memset(cb, 0, sizeof(cb_saved));
> +
> +             /* Pass parameters to the BPF program */
> +             cb->flow_keys = &flow_keys;
> +             *pseudo_cb = nhoff;

I don't understand this bit.
What is this pseudo_cb, and why does nhoff go in there?
Is it some odd way to pass it into the prog?

> +
> +             bpf_compute_data_pointers((struct sk_buff *)skb);
> +             result = BPF_PROG_RUN(attached, skb);
> +
> +             /* Restore state */
> +             memcpy(cb, &cb_saved, sizeof(cb_saved));
> +
> +             __skb_flow_bpf_to_target(&flow_keys, flow_dissector,
> +                                      target_container);
> +             key_control->thoff = min_t(u16, key_control->thoff, skb->len);
> +             rcu_read_unlock();
> +             return result == BPF_OK;
> +     }
> +     rcu_read_unlock();
> +
>       if (dissector_uses_key(flow_dissector,
>                              FLOW_DISSECTOR_KEY_ETH_ADDRS)) {
>               struct ethhdr *eth = eth_hdr(skb);

Re: [bpf-next, v2 1/3] flow_dissector: implements flow dissector BPF hook

Reply via email to