On Fri, 01 Mar 2019 15:12:30 +0100, Toke Høiland-Jørgensen wrote: > The subsequent commits introducing default maps and a hash-based ifindex > devmap require a bit of refactoring of the devmap code. Perform this first > so the subsequent commits become easier to read. > > Signed-off-by: Toke Høiland-Jørgensen <t...@redhat.com>
> diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c > index 191b79948424..1037fc08c504 100644 > --- a/kernel/bpf/devmap.c > +++ b/kernel/bpf/devmap.c > @@ -75,6 +75,7 @@ struct bpf_dtab { > struct bpf_dtab_netdev **netdev_map; > unsigned long __percpu *flush_needed; > struct list_head list; > + struct rcu_head rcu; I think the RCU change may warrant a separate commit or at least a mention in the commit message ;) > }; > > static DEFINE_SPINLOCK(dev_map_lock); > @@ -85,23 +86,11 @@ static u64 dev_map_bitmap_size(const union bpf_attr *attr) > return BITS_TO_LONGS((u64) attr->max_entries) * sizeof(unsigned long); > } > > -static struct bpf_map *dev_map_alloc(union bpf_attr *attr) > +static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr, > + bool check_memlock) > { > - struct bpf_dtab *dtab; > - int err = -EINVAL; > u64 cost; > - > - if (!capable(CAP_NET_ADMIN)) > - return ERR_PTR(-EPERM); > - > - /* check sanity of attributes */ > - if (attr->max_entries == 0 || attr->key_size != 4 || > - attr->value_size != 4 || attr->map_flags & ~DEV_CREATE_FLAG_MASK) > - return ERR_PTR(-EINVAL); perhaps consider moving these to a map_alloc_check callback? 
> - dtab = kzalloc(sizeof(*dtab), GFP_USER); > - if (!dtab) > - return ERR_PTR(-ENOMEM); > + int err; > > bpf_map_init_from_attr(&dtab->map, attr); > > @@ -109,60 +98,72 @@ static struct bpf_map *dev_map_alloc(union bpf_attr > *attr) > cost = (u64) dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *); > cost += dev_map_bitmap_size(attr) * num_possible_cpus(); > if (cost >= U32_MAX - PAGE_SIZE) > - goto free_dtab; > + return -EINVAL; > > dtab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; > > - /* if map size is larger than memlock limit, reject it early */ > - err = bpf_map_precharge_memlock(dtab->map.pages); > - if (err) > - goto free_dtab; > - > - err = -ENOMEM; > + if (check_memlock) { > + /* if map size is larger than memlock limit, reject it early */ > + err = bpf_map_precharge_memlock(dtab->map.pages); > + if (err) > + return -EINVAL; > + } > > /* A per cpu bitfield with a bit per possible net device */ > dtab->flush_needed = __alloc_percpu_gfp(dev_map_bitmap_size(attr), > __alignof__(unsigned long), > GFP_KERNEL | __GFP_NOWARN); > if (!dtab->flush_needed) > - goto free_dtab; > + goto err_alloc; > > dtab->netdev_map = bpf_map_area_alloc(dtab->map.max_entries * > sizeof(struct bpf_dtab_netdev *), > dtab->map.numa_node); > if (!dtab->netdev_map) > - goto free_dtab; > + goto err_map; > > - spin_lock(&dev_map_lock); > - list_add_tail_rcu(&dtab->list, &dev_map_list); > - spin_unlock(&dev_map_lock); > + return 0; > > - return &dtab->map; > -free_dtab: > + err_map: The label should name the first action. So it was correct, you're making it less good :) Also why the space? 
> free_percpu(dtab->flush_needed); > - kfree(dtab); > - return ERR_PTR(err); > + err_alloc: and no need for this one > + return -ENOMEM; > } > > -static void dev_map_free(struct bpf_map *map) > +static struct bpf_map *dev_map_alloc(union bpf_attr *attr) > { > - struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); > - int i, cpu; > + struct bpf_dtab *dtab; > + int err; > > - /* At this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0, > - * so the programs (can be more than one that used this map) were > - * disconnected from events. Wait for outstanding critical sections in > - * these programs to complete. The rcu critical section only guarantees > - * no further reads against netdev_map. It does __not__ ensure pending > - * flush operations (if any) are complete. > - */ > + if (!capable(CAP_NET_ADMIN)) > + return ERR_PTR(-EPERM); > + > + /* check sanity of attributes */ > + if (attr->max_entries == 0 || attr->key_size != 4 || > + attr->value_size != 4 || attr->map_flags & ~DEV_CREATE_FLAG_MASK) > + return ERR_PTR(-EINVAL); > + > + dtab = kzalloc(sizeof(*dtab), GFP_USER); > + if (!dtab) > + return ERR_PTR(-ENOMEM); > + > + err = dev_map_init_map(dtab, attr, true); > + if (err) { > + kfree(dtab); > + return ERR_PTR(err); > + } > > spin_lock(&dev_map_lock); > - list_del_rcu(&dtab->list); > + list_add_tail_rcu(&dtab->list, &dev_map_list); > spin_unlock(&dev_map_lock); > > - bpf_clear_redirect_map(map); > - synchronize_rcu(); > + return &dtab->map; > +} > + > +static void __dev_map_free(struct rcu_head *rcu) > +{ > + struct bpf_dtab *dtab = container_of(rcu, struct bpf_dtab, rcu); > + int i, cpu; > > /* To ensure all pending flush operations have completed wait for flush > * bitmap to indicate all flush_needed bits to be zero on _all_ cpus. > @@ -192,6 +193,26 @@ static void dev_map_free(struct bpf_map *map) There is a cond_resched() here; I don't think you can call cond_resched() from an RCU callback.
> kfree(dtab); > } > > +static void dev_map_free(struct bpf_map *map) > +{ > + struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); > + > + /* At this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0, > + * so the programs (can be more than one that used this map) were > + * disconnected from events. Wait for outstanding critical sections in > + * these programs to complete. The rcu critical section only guarantees > + * no further reads against netdev_map. It does __not__ ensure pending > + * flush operations (if any) are complete. > + */ > + > + spin_lock(&dev_map_lock); > + list_del_rcu(&dtab->list); > + spin_unlock(&dev_map_lock); > + > + bpf_clear_redirect_map(map); > + call_rcu(&dtab->rcu, __dev_map_free); > +} > + > static int dev_map_get_next_key(struct bpf_map *map, void *key, void > *next_key) > { > struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);