On 02/05/2016 04:22 PM, Vladimir Davydov wrote:
> On Thu, Feb 04, 2016 at 07:39:48PM +0300, Dmitry Safonov wrote:
> ...
>> diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h
>> index b7e57927..a6bf41a 100644
>> --- a/include/linux/slub_def.h
>> +++ b/include/linux/slub_def.h
>> @@ -103,9 +103,10 @@ struct kmem_cache {
>>  #ifdef CONFIG_SYSFS
>>  #define SLAB_SUPPORTS_SYSFS
>> -void sysfs_slab_remove(struct kmem_cache *);
>> +int sysfs_slab_remove(struct kmem_cache *);
>> +void sysfs_slab_remove_cancel(struct kmem_cache *s);
>>  #else
>> -static inline void sysfs_slab_remove(struct kmem_cache *s)
>> +void sysfs_slab_remove_cancel(struct kmem_cache *s)
>>  { }
>>  #endif
>> diff --git a/mm/slab.h b/mm/slab.h
>> index 834ad24..ec87600 100644
>> --- a/mm/slab.h
>> +++ b/mm/slab.h
>> @@ -141,7 +141,7 @@ static inline unsigned long kmem_cache_flags(unsigned long object_size,
>>  int __kmem_cache_shutdown(struct kmem_cache *);
>>  int __kmem_cache_shrink(struct kmem_cache *, bool);
>> -void slab_kmem_cache_release(struct kmem_cache *);
>> +int slab_kmem_cache_release(struct kmem_cache *);
>>
>>  struct seq_file;
>>  struct file;
>> diff --git a/mm/slab_common.c b/mm/slab_common.c
>> index b50aef0..3ad3d22 100644
>> --- a/mm/slab_common.c
>> +++ b/mm/slab_common.c
>> @@ -448,33 +448,58 @@ out_unlock:
>>  }
>>  EXPORT_SYMBOL(kmem_cache_create);
>>
>> -static int shutdown_cache(struct kmem_cache *s,
>> +static void prepare_caches_release(struct kmem_cache *s,
>>  		struct list_head *release, bool *need_rcu_barrier)
>>  {
>> -	if (__kmem_cache_shutdown(s) != 0)
>> -		return -EBUSY;
>> -
>>  	if (s->flags & SLAB_DESTROY_BY_RCU)
>>  		*need_rcu_barrier = true;
>>
>>  	list_move(&s->list, release);
>> -	return 0;
>>  }
>>
>> -static void release_caches(struct list_head *release, bool need_rcu_barrier)
>> +#ifdef SLAB_SUPPORTS_SYSFS
>> +#define release_one_cache sysfs_slab_remove
>> +#else
>> +#define release_one_cache slab_kmem_cache_release
>> +#endif
>> +
>> +static int release_caches_type(struct list_head *release, bool children)
>>  {
>>  	struct kmem_cache *s, *s2;
>> +	int ret = 0;
>>
>> +	list_for_each_entry_safe(s, s2, release, list) {
>> +		if (is_root_cache(s) == children)
>> +			continue;
>> +
>> +		ret += release_one_cache(s);
>> +	}
>> +	return ret;
>> +}
>> +
>> +static void release_caches(struct list_head *release, bool need_rcu_barrier)
>> +{
>>  	if (need_rcu_barrier)
>>  		rcu_barrier();
>
> You must issue the rcu barrier before freeing the kmem_cache structure,
> not before issuing __kmem_cache_shutdown. Otherwise a slab freed by
> __kmem_cache_shutdown might hit a use-after-free bug.
Yeah, I wrongly interpreted it as fast reuse of a deleted object.
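So the fix is to keep __kmem_cache_shutdown() first and issue the barrier only between shutdown and the final free. A sketch of that ordering, reusing the existing release-list scheme (illustrative only, error handling omitted):

static void release_caches(struct list_head *release, bool need_rcu_barrier)
{
	struct kmem_cache *s, *s2;

	/*
	 * Every cache on @release has already passed
	 * __kmem_cache_shutdown(), i.e. its slabs have been freed.
	 * For SLAB_DESTROY_BY_RCU caches those frees go through RCU,
	 * so wait for the callbacks before freeing struct kmem_cache.
	 */
	if (need_rcu_barrier)
		rcu_barrier();

	list_for_each_entry_safe(s, s2, release, list)
		slab_kmem_cache_release(s);
}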
>> -	list_for_each_entry_safe(s, s2, release, list) {
>> -#ifdef SLAB_SUPPORTS_SYSFS
>> -		sysfs_slab_remove(s);
>> -#else
>> -		slab_kmem_cache_release(s);
>> -#endif
>> -	}
>> +	/* remove children in the first place, remove root on success */
>> +	if (!release_caches_type(release, true))
>> +		release_caches_type(release, false);
>> +}
>> +
>> +static void release_cache_cancel(struct kmem_cache *s)
>> +{
>> +	sysfs_slab_remove_cancel(s);
>> +
>> +	get_online_cpus();
>> +	get_online_mems();
>
> What's the point of taking these locks when you just want to add a cache
> back to the slab_caches list? Besides, if cpu/mem hotplug happens
> *between* prepare_caches_release and release_cache_cancel, we'll get a
> cache on the list with not all per-node/per-cpu structures allocated.
Oh, I see, you're right.
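Putting the cache back on slab_caches needs only slab_mutex, so the cancel path could shrink to something like this (a sketch; it still leaves the hotplug race you point out, since hotplug may happen between prepare and cancel):

static void release_cache_cancel(struct kmem_cache *s)
{
	sysfs_slab_remove_cancel(s);

	/* slab_caches is protected by slab_mutex alone */
	mutex_lock(&slab_mutex);
	list_move(&s->list, &slab_caches);
	mutex_unlock(&slab_mutex);
}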
>> +	mutex_lock(&slab_mutex);
>> +
>> +	list_move(&s->list, &slab_caches);
>> +
>> +	mutex_unlock(&slab_mutex);
>> +	put_online_mems();
>> +	put_online_cpus();
>>  }
>>
>>  #if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB)
>> @@ -589,16 +614,14 @@ void memcg_deactivate_kmem_caches(struct mem_cgroup *memcg)
>>  	put_online_cpus();
>>  }
>>
>> -static int __shutdown_memcg_cache(struct kmem_cache *s,
>> +static void prepare_memcg_empty_caches(struct kmem_cache *s,
>>  		struct list_head *release, bool *need_rcu_barrier)
>>  {
>>  	BUG_ON(is_root_cache(s));
>>
>> -	if (shutdown_cache(s, release, need_rcu_barrier))
>> -		return -EBUSY;
>> +	prepare_caches_release(s, release, need_rcu_barrier);
>>
>> -	list_del(&s->memcg_params.list);
>> -	return 0;
>> +	list_del_init(&s->memcg_params.list);
>
> AFAICS, if kmem_cache_destroy fails, you don't add the per-memcg cache
> back to the list. Not good.
>
>>  }
>>
>>  void memcg_destroy_kmem_caches(struct mem_cgroup *memcg)
>> @@ -614,11 +637,12 @@ void memcg_destroy_kmem_caches(struct mem_cgroup *memcg)
>>  	list_for_each_entry_safe(s, s2, &slab_caches, list) {
>>  		if (is_root_cache(s) || s->memcg_params.memcg != memcg)
>>  			continue;
>> +
>>  		/*
>>  		 * The cgroup is about to be freed and therefore has no charges
>>  		 * left. Hence, all its caches must be empty by now.
>>  		 */
>> -		BUG_ON(__shutdown_memcg_cache(s, &release, &need_rcu_barrier));
>
> It was a nice little check that everything works fine on the memcg side,
> and you wiped it out. Sad.
It's still there, but in the form of a WARN().
>> +		prepare_memcg_empty_caches(s, &release, &need_rcu_barrier);
>>  	}
>>  	mutex_unlock(&slab_mutex);
>> @@ -628,81 +652,53 @@ void memcg_destroy_kmem_caches(struct mem_cgroup *memcg)
>>  	release_caches(&release, need_rcu_barrier);
>>  }
>>
>> -static int shutdown_memcg_caches(struct kmem_cache *s,
>> +static void prepare_memcg_filled_caches(struct kmem_cache *s,
>>  		struct list_head *release, bool *need_rcu_barrier)
>>  {
>>  	struct memcg_cache_array *arr;
>>  	struct kmem_cache *c, *c2;
>> -	LIST_HEAD(busy);
>> -	int i;
>>
>>  	BUG_ON(!is_root_cache(s));
>>
>> -	/*
>> -	 * First, shutdown active caches, i.e. caches that belong to online
>> -	 * memory cgroups.
>> -	 */
>>  	arr = rcu_dereference_protected(s->memcg_params.memcg_caches,
>>  					lockdep_is_held(&slab_mutex));
>
> And now what's that for?
>
>> -	for_each_memcg_cache_index(i) {
>> -		c = arr->entries[i];
>> -		if (!c)
>> -			continue;
>> -		if (__shutdown_memcg_cache(c, release, need_rcu_barrier))
>> -			/*
>> -			 * The cache still has objects. Move it to a temporary
>> -			 * list so as not to try to destroy it for a second
>> -			 * time while iterating over inactive caches below.
>> -			 */
>> -			list_move(&c->memcg_params.list, &busy);
>> -		else
>> -			/*
>> -			 * The cache is empty and will be destroyed soon. Clear
>> -			 * the pointer to it in the memcg_caches array so that
>> -			 * it will never be accessed even if the root cache
>> -			 * stays alive.
>> -			 */
>> -			arr->entries[i] = NULL;
>
> So you don't clear arr->entries on global cache destruction. If we fail
> to destroy a cache, this will result in a use-after-free when the global
> cache gets reused.
Yes, I should have cleared them.
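Something like this in prepare_memcg_filled_caches(), before moving the children to the release list (a sketch; i is an int index as in the removed loop):

	for_each_memcg_cache_index(i) {
		c = arr->entries[i];
		if (!c)
			continue;
		/*
		 * Clear the slot in the memcg_caches array so the root
		 * cache can never hand out a pointer to a cache that is
		 * about to be released, even if the root stays alive.
		 */
		arr->entries[i] = NULL;
	}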
>> -	}
>>
>> -	/*
>> -	 * Second, shutdown all caches left from memory cgroups that are now
>> -	 * offline.
>> -	 */
>> +	/* move children caches to release list */
>>  	list_for_each_entry_safe(c, c2, &s->memcg_params.list,
>>  				 memcg_params.list)
>> -		__shutdown_memcg_cache(c, release, need_rcu_barrier);
>> -
>> -	list_splice(&busy, &s->memcg_params.list);
>> +		prepare_caches_release(c, release, need_rcu_barrier);
>>
>> -	/*
>> -	 * A cache being destroyed must be empty. In particular, this means
>> -	 * that all per memcg caches attached to it must be empty too.
>> -	 */
>> -	if (!list_empty(&s->memcg_params.list))
>> -		return -EBUSY;
>> -	return 0;
>> +	/* root cache to the same place */
>> +	prepare_caches_release(s, release, need_rcu_barrier);
>>  }
>> +
>>  #else
>> -static inline int shutdown_memcg_caches(struct kmem_cache *s,
>> -		struct list_head *release, bool *need_rcu_barrier)
>> -{
>> -	return 0;
>> -}
>> +#define prepare_memcg_filled_caches prepare_caches_release
>>  #endif /* CONFIG_MEMCG && !CONFIG_SLOB */
>>
>> -void slab_kmem_cache_release(struct kmem_cache *s)
>> +int slab_kmem_cache_release(struct kmem_cache *s)
>>  {
>> +	if (__kmem_cache_shutdown(s)) {
>> +		WARN(1, "release slub cache %s failed: it still has objects\n",
>> +		     s->name);
>> +		release_cache_cancel(s);
>> +		return 1;
>> +	}
>> +
>> +#ifdef CONFIG_MEMCG
>> +	list_del(&s->memcg_params.list);
>> +#endif
>> +
>>  	destroy_memcg_params(s);
>>  	kfree_const(s->name);
>>  	kmem_cache_free(kmem_cache, s);
>> +	return 0;
>>  }
>>
>>  void kmem_cache_destroy(struct kmem_cache *s)
>>  {
>>  	LIST_HEAD(release);
>>  	bool need_rcu_barrier = false;
>> -	int err;
>>
>>  	if (unlikely(!s))
>>  		return;
>> @@ -716,15 +712,8 @@ void kmem_cache_destroy(struct kmem_cache *s)
>>  	if (s->refcount)
>>  		goto out_unlock;
>>
>> -	err = shutdown_memcg_caches(s, &release, &need_rcu_barrier);
>> -	if (!err)
>> -		err = shutdown_cache(s, &release, &need_rcu_barrier);
>> +	prepare_memcg_filled_caches(s, &release, &need_rcu_barrier);
>>
>> -	if (err) {
>> -		pr_err("kmem_cache_destroy %s: "
>> -		       "Slab cache still has objects\n", s->name);
>> -		dump_stack();
>> -	}
>>  out_unlock:
>>  	mutex_unlock(&slab_mutex);
>> diff --git a/mm/slub.c b/mm/slub.c
>> index 2e1355a..373aa6d 100644
>> --- a/mm/slub.c
>> +++ b/mm/slub.c
>> @@ -5429,14 +5429,14 @@ out_del_kobj:
>>  	goto out;
>>  }
>>
>> -void sysfs_slab_remove(struct kmem_cache *s)
>> +int sysfs_slab_remove(struct kmem_cache *s)
>>  {
>>  	if (slab_state < FULL)
>>  		/*
>>  		 * Sysfs has not been setup yet so no need to remove the
>>  		 * cache from sysfs.
>>  		 */
>> -		return;
>> +		return 0;
>>
>>  #ifdef CONFIG_MEMCG
>>  	kset_unregister(s->memcg_kset);
>> @@ -5444,6 +5444,24 @@ void sysfs_slab_remove(struct kmem_cache *s)
>>  	kobject_uevent(&s->kobj, KOBJ_REMOVE);
>>  	kobject_del(&s->kobj);
>>  	kobject_put(&s->kobj);
>> +	return 0;
>> +}
>> +
>> +void sysfs_slab_remove_cancel(struct kmem_cache *s)
>> +{
>> +	int ret;
>> +
>> +	if (slab_state < FULL)
>> +		return;
>> +
>> +	/* tricky */
>
> What purpose is this comment supposed to serve?
>
>> +	kobject_get(&s->kobj);
>> +	ret = kobject_add(&s->kobj, NULL, "%s", s->name);
>
> ret is not used.
>
>> +	kobject_uevent(&s->kobj, KOBJ_ADD);
>> +
>> +#ifdef CONFIG_MEMCG
>> +	s->memcg_kset = kset_create_and_add("cgroup", NULL, &s->kobj);
>> +#endif
>
> For a per-memcg cache we must not create memcg_kset. And what if any of
> these functions fail? I don't think that silently ignoring failures is
> good.
I thought it was fine, after the WARN(), to try to revert everything as far as possible, without checking whether the revert succeeded.
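But agreed, it should at least check the calls and only create the kset for root caches; a possible shape (a sketch only, not tested):

void sysfs_slab_remove_cancel(struct kmem_cache *s)
{
	int err;

	if (slab_state < FULL)
		return;

	/* re-register the kobject that sysfs_slab_remove() deleted */
	kobject_get(&s->kobj);
	err = kobject_add(&s->kobj, NULL, "%s", s->name);
	if (err) {
		kobject_put(&s->kobj);
		WARN(1, "failed to re-add sysfs entry for cache %s\n",
		     s->name);
		return;
	}
	kobject_uevent(&s->kobj, KOBJ_ADD);

#ifdef CONFIG_MEMCG
	/* only root caches carry a "cgroup" kset */
	if (is_root_cache(s))
		s->memcg_kset = kset_create_and_add("cgroup", NULL,
						    &s->kobj);
#endif
}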
> Anyway, I really don't like how this patch approaches the problem.
> AFAIU, we only want to postpone freeing kmem_cache_node until the sysfs
> file is destroyed. Can't we just move the code that frees
> kmem_cache_node to a separate function, called from the sysfs release,
> instead of messing things up?
Yeah, I'll redo it that way. Thanks for the review.
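For SLUB that would mean freeing the per-node structures from the slab kobject's release callback rather than from __kmem_cache_shutdown(); roughly like this (a sketch on top of the existing slab_ktype/to_slab() code in mm/slub.c, assuming free_kmem_cache_nodes() can safely be deferred to that point):

/* runs once the last /sys/kernel/slab/<cache> reference is dropped */
static void kmem_cache_release(struct kobject *k)
{
	struct kmem_cache *s = to_slab(k);

	free_kmem_cache_nodes(s);	/* deferred from __kmem_cache_shutdown() */
	slab_kmem_cache_release(s);
}

static struct kobj_type slab_ktype = {
	.sysfs_ops	= &slab_sysfs_ops,
	.release	= kmem_cache_release,
};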
> Thanks,
> Vladimir
--
Regards,
Dmitry Safonov