On Mon, Jul 8, 2019 at 2:55 AM Toke Høiland-Jørgensen <t...@redhat.com> wrote:
>
> Y Song <ys114...@gmail.com> writes:
>
> > On Sat, Jul 6, 2019 at 1:48 AM Toke Høiland-Jørgensen <t...@redhat.com>
> > wrote:
> >>
> >> From: Toke Høiland-Jørgensen <t...@redhat.com>
> >>
> >> A common pattern when using xdp_redirect_map() is to create a device map
> >> where the lookup key is simply ifindex. Because device maps are arrays,
> >> this leaves holes in the map, and the map has to be sized to fit the
> >> largest ifindex, regardless of how many devices are actually needed in
> >> the map.
> >>
> >> This patch adds a second type of device map where the key is looked up
> >> using a hashmap, instead of being used as an array index. This allows
> >> maps to be densely packed, so they can be smaller.
> >>
> >> Signed-off-by: Toke Høiland-Jørgensen <t...@redhat.com>
> >> ---
> >>  include/linux/bpf.h        |    7 ++
> >>  include/linux/bpf_types.h  |    1
> >>  include/trace/events/xdp.h |    3 -
> >>  kernel/bpf/devmap.c        |  192 ++++++++++++++++++++++++++++++++++++++++++++
> >>  kernel/bpf/verifier.c      |    2
> >>  net/core/filter.c          |    9 ++
> >>  6 files changed, 211 insertions(+), 3 deletions(-)
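For readers skimming the thread, the pattern the commit message is
optimizing looks roughly like this on the BPF program side. This is only
an illustrative sketch, not part of the patch: the map name and size are
made up, and it assumes a toolchain (libbpf with BTF-style map
definitions) that already knows about BPF_MAP_TYPE_DEVMAP_HASH:

    /* Illustrative sketch only -- not from the patch. */
    #include <linux/bpf.h>
    #include <bpf/bpf_helpers.h>

    struct {
            __uint(type, BPF_MAP_TYPE_DEVMAP_HASH);
            __uint(max_entries, 64);  /* sized to devices used, not max ifindex */
            __type(key, __u32);       /* ifindex */
            __type(value, __u32);     /* ifindex of egress device */
    } tx_port SEC(".maps");

    SEC("xdp")
    int xdp_redirect_hash(struct xdp_md *ctx)
    {
            /* Key the lookup directly on the ingress ifindex; with the
             * hash-based map type there is no need to size the map to
             * the largest ifindex in the system.
             */
            return bpf_redirect_map(&tx_port, ctx->ingress_ifindex, 0);
    }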
> >>
> >> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> >> index bfdb54dd2ad1..f9a506147c8a 100644
> >> --- a/include/linux/bpf.h
> >> +++ b/include/linux/bpf.h
> >> @@ -713,6 +713,7 @@ struct xdp_buff;
> >>  struct sk_buff;
> >>
> >>  struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key);
> >> +struct bpf_dtab_netdev *__dev_map_hash_lookup_elem(struct bpf_map *map, u32 key);
> >>  void __dev_map_flush(struct bpf_map *map);
> >>  int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp,
> >>                      struct net_device *dev_rx);
> >> @@ -799,6 +800,12 @@ static inline struct net_device *__dev_map_lookup_elem(struct bpf_map *map,
> >>          return NULL;
> >>  }
> >>
> >> +static inline struct net_device *__dev_map_hash_lookup_elem(struct bpf_map *map,
> >> +                                                            u32 key)
> >> +{
> >> +        return NULL;
> >> +}
> >> +
> >>  static inline void __dev_map_flush(struct bpf_map *map)
> >>  {
> >>  }
> >> diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
> >> index eec5aeeeaf92..36a9c2325176 100644
> >> --- a/include/linux/bpf_types.h
> >> +++ b/include/linux/bpf_types.h
> >> @@ -62,6 +62,7 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY_OF_MAPS, array_of_maps_map_ops)
> >>  BPF_MAP_TYPE(BPF_MAP_TYPE_HASH_OF_MAPS, htab_of_maps_map_ops)
> >>  #ifdef CONFIG_NET
> >>  BPF_MAP_TYPE(BPF_MAP_TYPE_DEVMAP, dev_map_ops)
> >> +BPF_MAP_TYPE(BPF_MAP_TYPE_DEVMAP_HASH, dev_map_hash_ops)
> >>  BPF_MAP_TYPE(BPF_MAP_TYPE_SK_STORAGE, sk_storage_map_ops)
> >>  #if defined(CONFIG_BPF_STREAM_PARSER)
> >>  BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKMAP, sock_map_ops)
> >> diff --git a/include/trace/events/xdp.h b/include/trace/events/xdp.h
> >> index 68899fdc985b..8c8420230a10 100644
> >> --- a/include/trace/events/xdp.h
> >> +++ b/include/trace/events/xdp.h
> >> @@ -175,7 +175,8 @@ struct _bpf_dtab_netdev {
> >>  #endif /* __DEVMAP_OBJ_TYPE */
> >>
> >>  #define devmap_ifindex(fwd, map)                                \
> >> -        ((map->map_type == BPF_MAP_TYPE_DEVMAP) ?               \
> >> +        ((map->map_type == BPF_MAP_TYPE_DEVMAP ||               \
> >> +          map->map_type == BPF_MAP_TYPE_DEVMAP_HASH) ?          \
> >>           ((struct _bpf_dtab_netdev *)fwd)->dev->ifindex : 0)
> >>
> >>  #define _trace_xdp_redirect_map(dev, xdp, fwd, map, idx)        \
> >> diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
> >> index a2fe16362129..341af02f049d 100644
> >> --- a/kernel/bpf/devmap.c
> >> +++ b/kernel/bpf/devmap.c
> >> @@ -37,6 +37,12 @@
> >>   * notifier hook walks the map we know that new dev references can not be
> >>   * added by the user because core infrastructure ensures dev_get_by_index()
> >>   * calls will fail at this point.
> >> + *
> >> + * The devmap_hash type is a map type which interprets keys as ifindexes and
> >> + * indexes these using a hashmap. This allows maps that use ifindex as key to be
> >> + * densely packed instead of having holes in the lookup array for unused
> >> + * ifindexes. The setup and packet enqueue/send code is shared between the two
> >> + * types of devmap; only the lookup and insertion is different.
> >>   */
> >>  #include <linux/bpf.h>
> >>  #include <net/xdp.h>
> >> @@ -59,6 +65,7 @@ struct xdp_bulk_queue {
> >>
> >>  struct bpf_dtab_netdev {
> >>          struct net_device *dev; /* must be first member, due to tracepoint */
> >> +        struct hlist_node index_hlist;
> >>          struct bpf_dtab *dtab;
> >>          unsigned int idx; /* keep track of map index for tracepoint */
> >>          struct xdp_bulk_queue __percpu *bulkq;
> >> @@ -70,11 +77,29 @@ struct bpf_dtab {
> >>          struct bpf_dtab_netdev **netdev_map;
> >>          struct list_head __percpu *flush_list;
> >>          struct list_head list;
> >> +
> >> +        /* these are only used for DEVMAP_HASH type maps */
> >> +        unsigned int items;
> >> +        struct hlist_head *dev_index_head;
> >> +        spinlock_t index_lock;
> >>  };
> >>
> >>  static DEFINE_SPINLOCK(dev_map_lock);
> >>  static LIST_HEAD(dev_map_list);
> >>
> >> +static struct hlist_head *dev_map_create_hash(void)
> >> +{
> >> +        int i;
> >> +        struct hlist_head *hash;
> >> +
> >> +        hash = kmalloc_array(NETDEV_HASHENTRIES, sizeof(*hash), GFP_KERNEL);
> >> +        if (hash != NULL)
> >> +                for (i = 0; i < NETDEV_HASHENTRIES; i++)
> >> +                        INIT_HLIST_HEAD(&hash[i]);
> >> +
> >> +        return hash;
> >> +}
> >> +
> >>  static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr,
> >>                              bool check_memlock)
> >>  {
> >> @@ -98,6 +123,9 @@ static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr,
> >>          cost = (u64) dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *);
> >>          cost += sizeof(struct list_head) * num_possible_cpus();
> >>
> >> +        if (attr->map_type == BPF_MAP_TYPE_DEVMAP_HASH)
> >> +                cost += sizeof(struct hlist_head) * NETDEV_HASHENTRIES;
> >> +
> >>          /* if map size is larger than memlock limit, reject it */
> >>          err = bpf_map_charge_init(&dtab->map.memory, cost);
> >>          if (err)
> >> @@ -116,8 +144,18 @@ static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr,
> >>          if (!dtab->netdev_map)
> >>                  goto free_percpu;
> >>
> >> +        if (attr->map_type == BPF_MAP_TYPE_DEVMAP_HASH) {
> >> +                dtab->dev_index_head = dev_map_create_hash();
> >> +                if (!dtab->dev_index_head)
> >> +                        goto free_map_area;
> >> +
> >> +                spin_lock_init(&dtab->index_lock);
> >> +        }
> >> +
> >>          return 0;
> >>
> >> +free_map_area:
> >> +        bpf_map_area_free(dtab->netdev_map);
> >>  free_percpu:
> >>          free_percpu(dtab->flush_list);
> >>  free_charge:
> >> @@ -199,6 +237,7 @@ static void dev_map_free(struct bpf_map *map)
> >>
> >>          free_percpu(dtab->flush_list);
> >>          bpf_map_area_free(dtab->netdev_map);
> >> +        kfree(dtab->dev_index_head);
> >>          kfree(dtab);
> >>  }
> >>
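A small aside on the sizing above: dev_map_create_hash() allocates
NETDEV_HASHENTRIES buckets, and the lookup code below picks a bucket with
idx & (NETDEV_HASHENTRIES - 1). That mask is only equivalent to a modulo
because NETDEV_HASHENTRIES is a power of two (1 << NETDEV_HASHBITS). A
standalone userspace sketch of the bucket arithmetic, assuming the
mainline value of 256:

    #include <stdio.h>

    #define NETDEV_HASHENTRIES 256  /* assumed: 1 << NETDEV_HASHBITS */

    int main(void)
    {
            /* ifindexes 3 and 259 land in the same bucket, which is why
             * __dev_map_hash_lookup_elem() below still has to compare
             * dev->idx while walking the chain.
             */
            unsigned int idx[] = { 3, 259, 260 };
            int i;

            for (i = 0; i < 3; i++)
                    printf("bucket(%u) = %u\n", idx[i],
                           idx[i] & (NETDEV_HASHENTRIES - 1));
            return 0;
    }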
> >> @@ -219,6 +258,70 @@ static int dev_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
> >>          return 0;
> >>  }
> >>
> >> +static inline struct hlist_head *dev_map_index_hash(struct bpf_dtab *dtab,
> >> +                                                    int idx)
> >> +{
> >> +        return &dtab->dev_index_head[idx & (NETDEV_HASHENTRIES - 1)];
> >> +}
> >> +
> >> +struct bpf_dtab_netdev *__dev_map_hash_lookup_elem(struct bpf_map *map, u32 key)
> >> +{
> >> +        struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
> >> +        struct hlist_head *head = dev_map_index_hash(dtab, key);
> >> +        struct bpf_dtab_netdev *dev;
> >> +
> >> +        hlist_for_each_entry_rcu(dev, head, index_hlist)
> >> +                if (dev->idx == key)
> >> +                        return dev;
> >> +
> >> +        return NULL;
> >> +}
> >> +
> >> +static int dev_map_hash_get_next_key(struct bpf_map *map, void *key,
> >> +                                     void *next_key)
> >> +{
> >> +        struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
> >> +        u32 idx, *next = next_key;
> >> +        struct bpf_dtab_netdev *dev, *next_dev;
> >> +        struct hlist_head *head;
> >> +        int i = 0;
> >> +
> >> +        if (!key)
> >> +                goto find_first;
> >> +
> >> +        idx = *(u32 *)key;
> >> +
> >> +        dev = __dev_map_hash_lookup_elem(map, idx);
> >> +        if (!dev)
> >> +                goto find_first;
> >> +
> >> +        next_dev = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(&dev->index_hlist)),
> >> +                                    struct bpf_dtab_netdev, index_hlist);
>
> > Just want to get a better understanding. Why do you want
> > hlist_entry_safe instead of hlist_entry?
>
> Erm, because the entry might not exist? The _safe variant just checks
> the list pointer before casting to the containing struct.
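For reference, hlist_entry_safe() in include/linux/list.h is (roughly)
just hlist_entry() guarded by a NULL check on the list pointer:

    #define hlist_entry_safe(ptr, type, member) \
            ({ typeof(ptr) ____ptr = (ptr); \
               ____ptr ? hlist_entry(____ptr, type, member) : NULL; \
            })

So when the chain is exhausted and hlist_next_rcu() yields NULL, the
_safe variant returns NULL instead of a bogus containing-struct pointer
computed from NULL.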
hlist_entry_safe() is certainly correct; I just wanted to better
understand when the entry might not exist. Looking at the hashtab.c
implementation, it makes the same possible-null-entry assumption, so we
should be okay here.

> > Also, maybe rcu_dereference instead of rcu_dereference_raw?
>
> Well, the _raw() variant comes from the equivalent function in
> hashtab.c, where I originally nicked most of this function, so I think
> it makes more sense to keep them the same.

Okay.

> > dev_map_hash_get_next_key() is called in syscall.c within an
> > rcu_read_lock region.
>
> >> +
> >> +        if (next_dev) {
> >> +                *next = next_dev->idx;
> >> +                return 0;
> >> +        }
> >> +
> >> +        i = idx & (NETDEV_HASHENTRIES - 1);
> >> +        i++;
> >> +
> >> + find_first:
> >> +        for (; i < NETDEV_HASHENTRIES; i++) {
> >> +                head = dev_map_index_hash(dtab, i);
> >> +
> >> +                next_dev = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head)),
> >> +                                            struct bpf_dtab_netdev,
> >> +                                            index_hlist);
>
> > Ditto. The same question as the above.
>
> >> +                if (next_dev) {
> >> +                        *next = next_dev->idx;
> >> +                        return 0;
> >> +                }
> >> +        }
> >> +
> >> +        return -ENOENT;
> >> +}
> >> +
> >>  static int bq_xmit_all(struct xdp_bulk_queue *bq, u32 flags,
> >>                         bool in_napi_ctx)
> >>  {
> >> @@ -374,6 +477,15 @@ static void *dev_map_lookup_elem(struct bpf_map *map, void *key)
> >>          return dev ? &dev->ifindex : NULL;
> >>  }
> >>
> >> +static void *dev_map_hash_lookup_elem(struct bpf_map *map, void *key)
> >> +{
> >> +        struct bpf_dtab_netdev *obj = __dev_map_hash_lookup_elem(map,
> >> +                                                                 *(u32 *)key);
> >> +        struct net_device *dev = obj ? obj->dev : NULL;
>
> > When could obj->dev be NULL?
>
> Never. This could just as well be written as
>
>     return obj ? &obj->dev->ifindex : NULL;
>
> but writing it this way keeps it consistent with the existing
> dev_map_lookup_elem() function.

Okay then, that keeps it consistent with the existing code base.

Thanks for all the explanation!
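As a closing illustration (not from the patch): the get_next_key
semantics discussed above are what make userspace iteration over a
devmap_hash work. A sketch, assuming libbpf and an already-created fd
for a BPF_MAP_TYPE_DEVMAP_HASH map:

    #include <stdio.h>
    #include <bpf/bpf.h>

    /* Walk every entry in a devmap_hash map. Passing a NULL key hits the
     * find_first path in dev_map_hash_get_next_key(); -ENOENT ends the
     * walk.
     */
    static void dump_devmap_hash(int map_fd)
    {
            __u32 cur, next, ifindex;
            __u32 *key = NULL;

            while (bpf_map_get_next_key(map_fd, key, &next) == 0) {
                    if (bpf_map_lookup_elem(map_fd, &next, &ifindex) == 0)
                            printf("key %u -> ifindex %u\n", next, ifindex);
                    cur = next;
                    key = &cur;
            }
    }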