On 05/14/2018 07:00 PM, John Fastabend wrote:
> Sockmap is currently backed by an array and enforces keys to be
> four bytes. This works well for many use cases and was originally
> modeled after devmap which also uses four bytes keys. However,
> this has become limiting in larger use cases where a hash would
> be more appropriate. For example, users may want to use the 5-tuple
> of the socket as the lookup key.
> 
> To support this, add hash support.
> 
> Signed-off-by: John Fastabend <john.fastab...@gmail.com>
> Acked-by: David S. Miller <da...@davemloft.net>
> ---
>  include/linux/bpf.h       |   8 +
>  include/linux/bpf_types.h |   1 +
>  include/uapi/linux/bpf.h  |  52 ++++-
>  kernel/bpf/core.c         |   1 +
>  kernel/bpf/sockmap.c      | 494 ++++++++++++++++++++++++++++++++++++++++++++--
>  kernel/bpf/verifier.c     |  14 +-
>  net/core/filter.c         |  58 ++++++
>  7 files changed, 610 insertions(+), 18 deletions(-)
> 
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index a38e474..ed0122b 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -668,6 +668,7 @@ static inline void bpf_map_offload_map_free(struct bpf_map *map)
>  
>  #if defined(CONFIG_STREAM_PARSER) && defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_INET)
>  struct sock  *__sock_map_lookup_elem(struct bpf_map *map, u32 key);
> +struct sock  *__sock_hash_lookup_elem(struct bpf_map *map, void *key);
>  int sock_map_prog(struct bpf_map *map, struct bpf_prog *prog, u32 type);
>  #else
>  static inline struct sock  *__sock_map_lookup_elem(struct bpf_map *map, u32 key)
> @@ -675,6 +676,12 @@ static inline struct sock  *__sock_map_lookup_elem(struct bpf_map *map, u32 key)
>       return NULL;
>  }
>  
> +static inline struct sock  *__sock_hash_lookup_elem(struct bpf_map *map,
> +                                                 void *key)
> +{
> +     return NULL;
> +}
> +
>  static inline int sock_map_prog(struct bpf_map *map,
>                               struct bpf_prog *prog,
>                               u32 type)
> @@ -724,6 +731,7 @@ static inline void __xsk_map_flush(struct bpf_map *map)
>  extern const struct bpf_func_proto bpf_get_stackid_proto;
>  extern const struct bpf_func_proto bpf_get_stack_proto;
>  extern const struct bpf_func_proto bpf_sock_map_update_proto;
> +extern const struct bpf_func_proto bpf_sock_hash_update_proto;
>  
>  /* Shared helpers among cBPF and eBPF. */
>  void bpf_user_rnd_init_once(void);
> diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
> index d7df1b32..b67f879 100644
> --- a/include/linux/bpf_types.h
> +++ b/include/linux/bpf_types.h
> @@ -47,6 +47,7 @@
>  BPF_MAP_TYPE(BPF_MAP_TYPE_DEVMAP, dev_map_ops)
>  #if defined(CONFIG_STREAM_PARSER) && defined(CONFIG_INET)
>  BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKMAP, sock_map_ops)
> +BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKHASH, sock_hash_ops)
>  #endif
>  BPF_MAP_TYPE(BPF_MAP_TYPE_CPUMAP, cpu_map_ops)
>  #if defined(CONFIG_XDP_SOCKETS)
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index 02e4112..1205d86 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -118,6 +118,7 @@ enum bpf_map_type {
>       BPF_MAP_TYPE_SOCKMAP,
>       BPF_MAP_TYPE_CPUMAP,
>       BPF_MAP_TYPE_XSKMAP,
> +     BPF_MAP_TYPE_SOCKHASH,
>  };
>  
>  enum bpf_prog_type {
> @@ -1855,6 +1856,52 @@ struct bpf_stack_build_id {
>   *             Egress device index on success, 0 if packet needs to continue
>   *             up the stack for further processing or a negative error in case
>   *             of failure.
> + * int bpf_sock_hash_update(struct bpf_sock_ops_kern *skops, struct bpf_map *map, void *key, u64 flags)

When you rebase, please fix this up properly next time and add a newline
between the helpers. I fixed this up while applying.

> + *   Description
> + *           Add an entry to, or update a sockhash *map* referencing sockets.
> + *           The *skops* is used as a new value for the entry associated to
> + *           *key*. *flags* is one of:
> + *
> + *           **BPF_NOEXIST**
> + *                   The entry for *key* must not exist in the map.
> + *           **BPF_EXIST**
> + *                   The entry for *key* must already exist in the map.
> + *           **BPF_ANY**
> + *                   No condition on the existence of the entry for *key*.
> + *
> + *           If the *map* has eBPF programs (parser and verdict), those will
> + *           be inherited by the socket being added. If the socket is
> + *           already attached to eBPF programs, this results in an error.
> + *   Return
> + *           0 on success, or a negative error in case of failure.
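
Side note for readers following along: from the BPF program side the
context here is struct bpf_sock_ops, and usage would look roughly like
the sketch below. This is a minimal, untested example: struct sock_key
and its field names are hypothetical (any key layout works, e.g. the
5-tuple mentioned in the commit message), and the helper is declared by
hand the way the samples do it. Only value_size == 4 is mandatory, per
the sanity check in sock_hash_alloc() further below.

#include <linux/bpf.h>
#include "bpf_helpers.h"	/* SEC(), struct bpf_map_def */

static int (*bpf_sock_hash_update)(void *skops, void *map, void *key,
				   __u64 flags) =
	(void *) BPF_FUNC_sock_hash_update;

/* Hypothetical 5-tuple key; the layout is entirely up to the user. */
struct sock_key {
	__u32 sip4;
	__u32 dip4;
	__u32 sport;	/* local port, host byte order */
	__u32 dport;	/* remote port, network byte order */
	__u32 family;
};

struct bpf_map_def SEC("maps") sock_hash = {
	.type		= BPF_MAP_TYPE_SOCKHASH,
	.key_size	= sizeof(struct sock_key),
	.value_size	= 4,	/* must be 4, see sock_hash_alloc() */
	.max_entries	= 1024,
};

SEC("sockops")
int bpf_add_to_sockhash(struct bpf_sock_ops *skops)
{
	struct sock_key key = {};

	/* Only track sockets once the connection is established. */
	if (skops->op != BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB &&
	    skops->op != BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB)
		return 0;

	key.sip4   = skops->local_ip4;
	key.dip4   = skops->remote_ip4;
	key.sport  = skops->local_port;
	key.dport  = skops->remote_port;
	key.family = skops->family;

	bpf_sock_hash_update(skops, &sock_hash, &key, BPF_ANY);
	return 0;
}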
[...]
> +static struct bpf_map *sock_hash_alloc(union bpf_attr *attr)
> +{
> +     struct bpf_htab *htab;
> +     int i, err;
> +     u64 cost;
> +
> +     if (!capable(CAP_NET_ADMIN))
> +             return ERR_PTR(-EPERM);
> +
> +     /* check sanity of attributes */
> +     if (attr->max_entries == 0 || attr->value_size != 4 ||
> +         attr->map_flags & ~SOCK_CREATE_FLAG_MASK)
> +             return ERR_PTR(-EINVAL);
> +
> +     err = bpf_tcp_ulp_register();
> +     if (err && err != -EEXIST)
> +             return ERR_PTR(err);
> +
> +     htab = kzalloc(sizeof(*htab), GFP_USER);
> +     if (!htab)
> +             return ERR_PTR(-ENOMEM);
> +
> +     bpf_map_init_from_attr(&htab->map, attr);
> +
> +     htab->n_buckets = roundup_pow_of_two(htab->map.max_entries);
> +     htab->elem_size = sizeof(struct htab_elem) +
> +                       round_up(htab->map.key_size, 8);
> +
> +     if (htab->n_buckets == 0 ||
> +         htab->n_buckets > U32_MAX / sizeof(struct bucket))
> +             goto free_htab;
> +
> +     cost = (u64) htab->n_buckets * sizeof(struct bucket) +
> +            (u64) htab->elem_size * htab->map.max_entries;
> +
> +     if (cost >= U32_MAX - PAGE_SIZE)
> +             goto free_htab;

Also, the above two goto free_htab were buggy! After bpf_tcp_ulp_register(),
err is either 0 or -EEXIST; if it's 0, then bailing out here returns
ERR_PTR(0), i.e. NULL, which will cause a NULL pointer dereference in
subsequent map setup paths. Adapting the same logic from sock_map_alloc()
would have been enough. I've added an err = -EINVAL before the first of
the two checks above this time.
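
In other words, the fixed-up version looks roughly like this (a sketch of
the fixup as applied, only the error path changes):

	err = -EINVAL;
	if (htab->n_buckets == 0 ||
	    htab->n_buckets > U32_MAX / sizeof(struct bucket))
		goto free_htab;

	cost = (u64) htab->n_buckets * sizeof(struct bucket) +
	       (u64) htab->elem_size * htab->map.max_entries;

	/* err is now guaranteed non-zero on any goto free_htab, so
	 * ERR_PTR(err) can no longer evaluate to NULL.
	 */
	if (cost >= U32_MAX - PAGE_SIZE)
		goto free_htab;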

> +     htab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
> +     err = bpf_map_precharge_memlock(htab->map.pages);
> +     if (err)
> +             goto free_htab;
> +
> +     err = -ENOMEM;
> +     htab->buckets = bpf_map_area_alloc(
> +                             htab->n_buckets * sizeof(struct bucket),
> +                             htab->map.numa_node);
> +     if (!htab->buckets)
> +             goto free_htab;
> +
> +     for (i = 0; i < htab->n_buckets; i++) {
> +             INIT_HLIST_HEAD(&htab->buckets[i].head);
> +             raw_spin_lock_init(&htab->buckets[i].lock);
> +     }
> +
> +     return &htab->map;
> +free_htab:
> +     kfree(htab);
> +     return ERR_PTR(err);
> +}
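
And for completeness, creating such a map from user space then looks
roughly like this (again an untested sketch, using bpf_create_map() from
tools/lib/bpf; the key layout just has to match what the BPF program
uses, here the hypothetical 5-tuple struct from the earlier sketch):

#include <stdio.h>
#include <linux/bpf.h>
#include <bpf/bpf.h>	/* bpf_create_map() */

/* Same hypothetical 5-tuple key as on the BPF side. */
struct sock_key {
	__u32 sip4;
	__u32 dip4;
	__u32 sport;
	__u32 dport;
	__u32 family;
};

int main(void)
{
	int fd;

	/* Needs CAP_NET_ADMIN; value_size must be 4, per the sanity
	 * checks in sock_hash_alloc().
	 */
	fd = bpf_create_map(BPF_MAP_TYPE_SOCKHASH, sizeof(struct sock_key),
			    sizeof(int), 1024, 0);
	if (fd < 0) {
		perror("bpf_create_map");
		return 1;
	}

	printf("sockhash map fd: %d\n", fd);
	return 0;
}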
