On 06/16/2015 07:20 PM, Alexei Starovoitov wrote:
> On 6/16/15 5:38 AM, Daniel Wagner wrote:
>> static int free_thread(void *arg)
>> +{
>> +	unsigned long flags;
>> +	struct htab_elem *l;
>> +
>> +	while (!kthread_should_stop()) {
>> +		spin_lock_irqsave(&elem_freelist_lock, flags);
>> +		while (!list_empty(&elem_freelist)) {
>> +			l = list_entry(elem_freelist.next,
>> +				       struct htab_elem, list);
>> +			list_del(&l->list);
>> +			kfree(l);
>
> that's not right, since such thread defeats rcu protection of lookup.
> We need either kfree_rcu/call_rcu or synchronize_rcu.

D'oh, I have not seen the elephant in the room. /me grabs a brown bag.

> Obviously the former is preferred, that's why I'm still digging into it.
> Probably a thread that does kfree_rcu would be ok, but we shouldn't
> be doing it unconditionally. For all networking programs and 99%
> of tracing programs the existing code is fine and I don't want to
> slow it down to tackle the corner case.
> Extra spin_lock just to add it to the list is also quite costly.

Anyway, I changed the above kfree() to a kfree_rcu() and it explodes
again, with the same stack trace we have already seen.
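For reference, the kfree_rcu() variant of the free thread looks roughly
like this (the tail of the loop is my reconstruction, since the snippet
quoted above is truncated, and the sleep interval is arbitrary):

static int free_thread(void *arg)
{
	unsigned long flags;
	struct htab_elem *l;

	while (!kthread_should_stop()) {
		spin_lock_irqsave(&elem_freelist_lock, flags);
		while (!list_empty(&elem_freelist)) {
			l = list_entry(elem_freelist.next,
				       struct htab_elem, list);
			list_del(&l->list);
			/* defer the actual free until after a grace
			 * period, so concurrent lookups stay valid */
			kfree_rcu(l, rcu);
		}
		spin_unlock_irqrestore(&elem_freelist_lock, flags);
		schedule_timeout_interruptible(HZ);
	}

	return 0;
}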
Steven's suggestion of deferring the work via irq_work results in the
same stack trace. (Now I get cold feet, without the nice heat from the
CPU busy looping...)

It looks like there is something else hidden somewhere.

cheers,
daniel

>From 3215ba9e4f9ecd0c7183f2400d9d579dcd5f4bc3 Mon Sep 17 00:00:00 2001
From: Daniel Wagner <daniel.wag...@bmw-carit.de>
Date: Wed, 17 Jun 2015 09:52:23 +0200
Subject: [PATCH] bpf: Defer kfree_rcu via irq_work

---
 kernel/bpf/hashtab.c | 35 +++++++++++++++++++++++++++++++++--
 1 file changed, 33 insertions(+), 2 deletions(-)

diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 83c209d..f6f1702 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -13,6 +13,7 @@
 #include <linux/jhash.h>
 #include <linux/filter.h>
 #include <linux/vmalloc.h>
+#include <linux/irq_work.h>
 
 struct bpf_htab {
 	struct bpf_map map;
@@ -27,10 +28,39 @@ struct bpf_htab {
 struct htab_elem {
 	struct hlist_node hash_node;
 	struct rcu_head rcu;
+	struct llist_node llist;
 	u32 hash;
 	char key[0] __aligned(8);
 };
 
+static struct irq_work free_work;
+static LLIST_HEAD(free_list);
+static bool free_pending;
+
+static void free_work_cb(struct irq_work *work)
+{
+	struct llist_node *n;
+	struct htab_elem *e;
+
+	free_pending = false;
+
+	n = llist_del_all(&free_list);
+	if (!n)
+		return;
+
+	llist_for_each_entry(e, n, llist)
+		kfree_rcu(e, rcu);
+}
+
+static void free_elem(struct htab_elem *e)
+{
+	llist_add(&e->llist, &free_list);
+	if (!free_pending) {
+		free_pending = true;
+		irq_work_queue(&free_work);
+	}
+}
+
 /* Called from syscall */
 static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
 {
@@ -262,7 +292,7 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
 	hlist_add_head_rcu(&l_new->hash_node, head);
 	if (l_old) {
 		hlist_del_rcu(&l_old->hash_node);
-		kfree_rcu(l_old, rcu);
+		free_elem(l_old);
 	} else {
 		htab->count++;
 	}
@@ -300,7 +330,7 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key)
 	if (l) {
 		hlist_del_rcu(&l->hash_node);
 		htab->count--;
-		kfree_rcu(l, rcu);
+		free_elem(l);
 		ret = 0;
 	}
 
@@ -361,6 +391,7 @@ static struct bpf_map_type_list htab_type __read_mostly = {
 
 static int __init register_htab_map(void)
 {
+	init_irq_work(&free_work, free_work_cb);
 	bpf_register_map_type(&htab_type);
 	return 0;
 }
-- 
2.1.0
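One more thing I noticed while staring at the diff: free_pending is a
plain bool with no synchronization at all. If I read llist.h and
kernel/irq_work.c correctly, llist_add() returns true when the list was
empty before the add, and irq_work_queue() refuses to double-queue a
pending work anyway, so the flag could probably be dropped (untested
sketch):

static void free_elem(struct htab_elem *e)
{
	/*
	 * llist_add() returns true only when it added the first
	 * entry to an empty list, i.e. once per batch, so no
	 * separate pending flag is needed here.
	 */
	if (llist_add(&e->llist, &free_list))
		irq_work_queue(&free_work);
}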