Hello Minchan,

On (03/25/16 08:41), Minchan Kim wrote:
[..]
> > Test #10 iozone -t 10 -R -r 80K -s 0M -I +Z
> >  Initial write       3213973.56     2731512.62     4416466.25*
> >        Rewrite       3066956.44*    2693819.50      332671.94
> >           Read       7769523.25*    2681473.75      462840.44
> >        Re-read       5244861.75     5473037.00*     382183.03
> >   Reverse Read       7479397.25*    4869597.75      374714.06
> >    Stride read       5403282.50*    5385083.75      382473.44
> >    Random read       5131997.25     5176799.75*     380593.56
> > Mixed workload       3998043.25     4219049.00*    1645850.45
> >   Random write       3452832.88     3290861.69     3588531.75*
> >         Pwrite       3757435.81     2711756.47     4561807.88*
> >          Pread       2743595.25*    2635835.00      412947.98
> >         Fwrite      16076549.00    16741977.25*   14797209.38
> >          Fread      23581812.62*   21664184.25     5064296.97
> > = real               0m44.490s      0m44.444s      0m44.609s
> > = user                0m0.054s       0m0.049s       0m0.055s
> > = sys                 0m0.037s       0m0.046s       0m0.148s
> >
> > so when the number of active tasks becomes larger than the number
> > of online CPUs, iozone reports data that is a bit hard to understand.
> > I can assume that since we now keep preemption disabled longer in
> > the write path, a concurrent operation (READ or WRITE) cannot
> > preempt current anymore... slightly suspicious.
> >
> > the other hard to understand thing is why READ-only tests have such
> > huge jitter. READ-only tests don't depend on streams, they don't
> > even use them; we supply compressed data directly to the
> > decompression api.
> >
> > maybe better to retire iozone and never use it again.
> >
> >
> > "118 insertions(+), 238 deletions(-)"  the patches remove a big
> > pile of code.
>
> First of all, I appreciate you very much!
thanks!

> At a glance, on write workload, huge win but worth to investigate
> how such fluctuation/regression happens on read-related test
> (read and mixed workload).

yes, I was going to investigate in more detail but got interrupted;
I will get back to it today/tomorrow.

> Could you send your patchset? I will test it.

oh, sorry, sure! attached (it's not a real patch submission yet, but
the patches look more or less ready, I guess).

the patches are against next-20160324.

	-ss
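(a side note on the "preemption disabled" remark above: the per-cpu
fast path in patch 2 boils down to the usual get_cpu_ptr()/put_cpu_ptr()
pattern. a minimal sketch of what zcomp_strm_find()/zcomp_strm_release()
end up doing, not the exact patch code:

	struct zcomp_strm *zstrm;

	/*
	 * get_cpu_ptr() disables preemption until the matching
	 * put_cpu_ptr(), so this CPU's stream is exclusively ours
	 * for the whole compression.
	 */
	zstrm = *get_cpu_ptr(comp->stream);
	zcomp_compress(comp, zstrm, uncmem, &clen);
	/* ... copy zstrm->buffer into a zsmalloc handle ... */
	put_cpu_ptr(comp->stream);	/* re-enables preemption */

so a writer now pins its CPU for the entire compress+copy window,
which is the behaviour I'm suspicious about in the iozone numbers.)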
From 6bf369f2180dad1c8013a4847ec09d3b9056e910 Mon Sep 17 00:00:00 2001
From: Sergey Senozhatsky <sergey.senozhat...@gmail.com>
Subject: [PATCH 1/2] zsmalloc: require GFP in zs_malloc()

Pass GFP flags to zs_malloc() instead of using the fixed ones set
during pool creation, so callers can be more flexible. Apart from
that, this also aligns the zs_malloc() interface with zpool/zbud.

Signed-off-by: Sergey Senozhatsky <sergey.senozhat...@gmail.com>
---
 drivers/block/zram/zram_drv.c |  2 +-
 include/linux/zsmalloc.h      |  2 +-
 mm/zsmalloc.c                 | 15 ++++++---------
 3 files changed, 8 insertions(+), 11 deletions(-)

diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index 370c2f7..9030992 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -717,7 +717,7 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index,
 		src = uncmem;
 	}
 
-	handle = zs_malloc(meta->mem_pool, clen);
+	handle = zs_malloc(meta->mem_pool, clen, GFP_NOIO | __GFP_HIGHMEM);
 	if (!handle) {
 		pr_err("Error allocating memory for compressed page: %u, size=%zu\n",
 			index, clen);
diff --git a/include/linux/zsmalloc.h b/include/linux/zsmalloc.h
index 34eb160..6d89f8b 100644
--- a/include/linux/zsmalloc.h
+++ b/include/linux/zsmalloc.h
@@ -44,7 +44,7 @@ struct zs_pool;
 struct zs_pool *zs_create_pool(const char *name, gfp_t flags);
 void zs_destroy_pool(struct zs_pool *pool);
 
-unsigned long zs_malloc(struct zs_pool *pool, size_t size);
+unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t flags);
 void zs_free(struct zs_pool *pool, unsigned long obj);
 
 void *zs_map_object(struct zs_pool *pool, unsigned long handle,
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index e72efb1..19027a1 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -247,7 +246,6 @@ struct zs_pool {
 	struct size_class **size_class;
 	struct kmem_cache *handle_cachep;
 
-	gfp_t flags;	/* allocation flags used when growing pool */
 	atomic_long_t pages_allocated;
 
 	struct zs_pool_stats stats;
@@ -295,10 +294,10 @@ static void destroy_handle_cache(struct zs_pool *pool)
 	kmem_cache_destroy(pool->handle_cachep);
 }
 
-static unsigned long alloc_handle(struct zs_pool *pool)
+static unsigned long alloc_handle(struct zs_pool *pool, gfp_t gfp)
 {
 	return (unsigned long)kmem_cache_alloc(pool->handle_cachep,
-		pool->flags & ~__GFP_HIGHMEM);
+		gfp & ~__GFP_HIGHMEM);
 }
 
 static void free_handle(struct zs_pool *pool, unsigned long handle)
@@ -335,7 +334,7 @@ static void zs_zpool_destroy(void *pool)
 static int zs_zpool_malloc(void *pool, size_t size, gfp_t gfp,
 			unsigned long *handle)
 {
-	*handle = zs_malloc(pool, size);
+	*handle = zs_malloc(pool, size, gfp);
 	return *handle ? 0 : -1;
 }
 
 static void zs_zpool_free(void *pool, unsigned long handle)
@@ -1391,7 +1390,7 @@ static unsigned long obj_malloc(struct page *first_page,
  * otherwise 0.
  * Allocation requests with size > ZS_MAX_ALLOC_SIZE will fail.
  */
-unsigned long zs_malloc(struct zs_pool *pool, size_t size)
+unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp)
 {
 	unsigned long handle, obj;
 	struct size_class *class;
@@ -1400,7 +1399,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size)
 	if (unlikely(!size || size > ZS_MAX_ALLOC_SIZE))
 		return 0;
 
-	handle = alloc_handle(pool);
+	handle = alloc_handle(pool, gfp);
 	if (!handle)
 		return 0;
 
@@ -1413,7 +1412,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size)
 
 	if (!first_page) {
 		spin_unlock(&class->lock);
-		first_page = alloc_zspage(class, pool->flags);
+		first_page = alloc_zspage(class, gfp);
 		if (unlikely(!first_page)) {
 			free_handle(pool, handle);
 			return 0;
@@ -1954,8 +1953,6 @@ struct zs_pool *zs_create_pool(const char *name, gfp_t flags)
 		prev_class = class;
 	}
 
-	pool->flags = flags;
-
 	if (zs_pool_stat_create(name, pool))
 		goto err;
 
-- 
2.8.0.rc0.1.gd285ab0
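(the caller-visible effect of patch 1, as a minimal sketch with a
hypothetical pool/clen, not a hunk from the patch: GFP policy is now
chosen per call site instead of being frozen in pool->flags at
zs_create_pool() time:

	unsigned long handle;

	/* e.g. a write path that must not recurse into FS/IO */
	handle = zs_malloc(pool, clen, GFP_NOIO | __GFP_HIGHMEM);
	if (!handle)
		return -ENOMEM;

this per-call flexibility is what lets patch 2 retry the same
allocation with different flags on its slow path.)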
From 02e997f42a77ebf9e2fdeb84c5e7eaf4dca21b37 Mon Sep 17 00:00:00 2001
From: Sergey Senozhatsky <sergey.senozhat...@gmail.com>
Subject: [PATCH 2/2] zram: switch to per-cpu streams

Use per-cpu compression streams instead of the shared 'single' and
'multi' stream backends: each online CPU now owns its own stream,
managed via a CPU hotplug notifier.

Signed-off-by: Sergey Senozhatsky <sergey.senozhat...@gmail.com>
---
 drivers/block/zram/zcomp.c    | 293 ++++++++++++------------------------------
 drivers/block/zram/zcomp.h    |  12 +-
 drivers/block/zram/zram_drv.c |  32 ++++-
 3 files changed, 110 insertions(+), 227 deletions(-)

diff --git a/drivers/block/zram/zcomp.c b/drivers/block/zram/zcomp.c
index 3ef42e5..d4159e4 100644
--- a/drivers/block/zram/zcomp.c
+++ b/drivers/block/zram/zcomp.c
@@ -13,6 +13,7 @@
 #include <linux/slab.h>
 #include <linux/wait.h>
 #include <linux/sched.h>
+#include <linux/cpu.h>
 
 #include "zcomp.h"
 #include "zcomp_lzo.h"
@@ -20,29 +21,6 @@
 #include "zcomp_lz4.h"
 #endif
 
-/*
- * single zcomp_strm backend
- */
-struct zcomp_strm_single {
-	struct mutex strm_lock;
-	struct zcomp_strm *zstrm;
-};
-
-/*
- * multi zcomp_strm backend
- */
-struct zcomp_strm_multi {
-	/* protect strm list */
-	spinlock_t strm_lock;
-	/* max possible number of zstrm streams */
-	int max_strm;
-	/* number of available zstrm streams */
-	int avail_strm;
-	/* list of available strms */
-	struct list_head idle_strm;
-	wait_queue_head_t strm_wait;
-};
-
 static struct zcomp_backend *backends[] = {
 	&zcomp_lzo,
 #ifdef CONFIG_ZRAM_LZ4_COMPRESS
@@ -93,188 +71,6 @@ static struct zcomp_strm *zcomp_strm_alloc(struct zcomp *comp, gfp_t flags)
 	return zstrm;
 }
 
-/*
- * get idle zcomp_strm or wait until other process release
- * (zcomp_strm_release()) one for us
- */
-static struct zcomp_strm *zcomp_strm_multi_find(struct zcomp *comp)
-{
-	struct zcomp_strm_multi *zs = comp->stream;
-	struct zcomp_strm *zstrm;
-
-	while (1) {
-		spin_lock(&zs->strm_lock);
-		if (!list_empty(&zs->idle_strm)) {
-			zstrm = list_entry(zs->idle_strm.next,
-					struct zcomp_strm, list);
-			list_del(&zstrm->list);
-			spin_unlock(&zs->strm_lock);
-			return zstrm;
-		}
-		/* zstrm streams limit reached, wait for idle stream */
-		if (zs->avail_strm >= zs->max_strm) {
-			spin_unlock(&zs->strm_lock);
-			wait_event(zs->strm_wait, !list_empty(&zs->idle_strm));
-			continue;
-		}
-		/* allocate new zstrm stream */
-		zs->avail_strm++;
-		spin_unlock(&zs->strm_lock);
-		/*
-		 * This function can be called in swapout/fs write path
-		 * so we can't use GFP_FS|IO. And it assumes we already
-		 * have at least one stream in zram initialization so we
-		 * don't do best effort to allocate more stream in here.
-		 * A default stream will work well without further multiple
-		 * streams. That's why we use NORETRY | NOWARN.
-		 */
-		zstrm = zcomp_strm_alloc(comp, GFP_NOIO | __GFP_NORETRY |
-					__GFP_NOWARN);
-		if (!zstrm) {
-			spin_lock(&zs->strm_lock);
-			zs->avail_strm--;
-			spin_unlock(&zs->strm_lock);
-			wait_event(zs->strm_wait, !list_empty(&zs->idle_strm));
-			continue;
-		}
-		break;
-	}
-	return zstrm;
-}
-
-/* add stream back to idle list and wake up waiter or free the stream */
-static void zcomp_strm_multi_release(struct zcomp *comp, struct zcomp_strm *zstrm)
-{
-	struct zcomp_strm_multi *zs = comp->stream;
-
-	spin_lock(&zs->strm_lock);
-	if (zs->avail_strm <= zs->max_strm) {
-		list_add(&zstrm->list, &zs->idle_strm);
-		spin_unlock(&zs->strm_lock);
-		wake_up(&zs->strm_wait);
-		return;
-	}
-
-	zs->avail_strm--;
-	spin_unlock(&zs->strm_lock);
-	zcomp_strm_free(comp, zstrm);
-}
-
-/* change max_strm limit */
-static bool zcomp_strm_multi_set_max_streams(struct zcomp *comp, int num_strm)
-{
-	struct zcomp_strm_multi *zs = comp->stream;
-	struct zcomp_strm *zstrm;
-
-	spin_lock(&zs->strm_lock);
-	zs->max_strm = num_strm;
-	/*
-	 * if user has lowered the limit and there are idle streams,
-	 * immediately free as much streams (and memory) as we can.
-	 */
-	while (zs->avail_strm > num_strm && !list_empty(&zs->idle_strm)) {
-		zstrm = list_entry(zs->idle_strm.next,
-				struct zcomp_strm, list);
-		list_del(&zstrm->list);
-		zcomp_strm_free(comp, zstrm);
-		zs->avail_strm--;
-	}
-	spin_unlock(&zs->strm_lock);
-	return true;
-}
-
-static void zcomp_strm_multi_destroy(struct zcomp *comp)
-{
-	struct zcomp_strm_multi *zs = comp->stream;
-	struct zcomp_strm *zstrm;
-
-	while (!list_empty(&zs->idle_strm)) {
-		zstrm = list_entry(zs->idle_strm.next,
-				struct zcomp_strm, list);
-		list_del(&zstrm->list);
-		zcomp_strm_free(comp, zstrm);
-	}
-	kfree(zs);
-}
-
-static int zcomp_strm_multi_create(struct zcomp *comp, int max_strm)
-{
-	struct zcomp_strm *zstrm;
-	struct zcomp_strm_multi *zs;
-
-	comp->destroy = zcomp_strm_multi_destroy;
-	comp->strm_find = zcomp_strm_multi_find;
-	comp->strm_release = zcomp_strm_multi_release;
-	comp->set_max_streams = zcomp_strm_multi_set_max_streams;
-	zs = kmalloc(sizeof(struct zcomp_strm_multi), GFP_KERNEL);
-	if (!zs)
-		return -ENOMEM;
-
-	comp->stream = zs;
-	spin_lock_init(&zs->strm_lock);
-	INIT_LIST_HEAD(&zs->idle_strm);
-	init_waitqueue_head(&zs->strm_wait);
-	zs->max_strm = max_strm;
-	zs->avail_strm = 1;
-
-	zstrm = zcomp_strm_alloc(comp, GFP_KERNEL);
-	if (!zstrm) {
-		kfree(zs);
-		return -ENOMEM;
-	}
-	list_add(&zstrm->list, &zs->idle_strm);
-	return 0;
-}
-
-static struct zcomp_strm *zcomp_strm_single_find(struct zcomp *comp)
-{
-	struct zcomp_strm_single *zs = comp->stream;
-	mutex_lock(&zs->strm_lock);
-	return zs->zstrm;
-}
-
-static void zcomp_strm_single_release(struct zcomp *comp,
-		struct zcomp_strm *zstrm)
-{
-	struct zcomp_strm_single *zs = comp->stream;
-	mutex_unlock(&zs->strm_lock);
-}
-
-static bool zcomp_strm_single_set_max_streams(struct zcomp *comp, int num_strm)
-{
-	/* zcomp_strm_single support only max_comp_streams == 1 */
-	return false;
-}
-
-static void zcomp_strm_single_destroy(struct zcomp *comp)
-{
-	struct zcomp_strm_single *zs = comp->stream;
-	zcomp_strm_free(comp, zs->zstrm);
-	kfree(zs);
-}
-
-static int zcomp_strm_single_create(struct zcomp *comp)
-{
-	struct zcomp_strm_single *zs;
-
-	comp->destroy = zcomp_strm_single_destroy;
-	comp->strm_find = zcomp_strm_single_find;
-	comp->strm_release = zcomp_strm_single_release;
-	comp->set_max_streams = zcomp_strm_single_set_max_streams;
-	zs = kmalloc(sizeof(struct zcomp_strm_single), GFP_KERNEL);
-	if (!zs)
-		return -ENOMEM;
-
-	comp->stream = zs;
-	mutex_init(&zs->strm_lock);
-	zs->zstrm = zcomp_strm_alloc(comp, GFP_KERNEL);
-	if (!zs->zstrm) {
-		kfree(zs);
-		return -ENOMEM;
-	}
-	return 0;
-}
-
 /* show available compressors */
 ssize_t zcomp_available_show(const char *comp, char *buf)
 {
@@ -301,17 +97,17 @@ bool zcomp_available_algorithm(const char *comp)
 
 bool zcomp_set_max_streams(struct zcomp *comp, int num_strm)
 {
-	return comp->set_max_streams(comp, num_strm);
+	return true;
 }
 
 struct zcomp_strm *zcomp_strm_find(struct zcomp *comp)
 {
-	return comp->strm_find(comp);
+	return *get_cpu_ptr(comp->stream);
 }
 
 void zcomp_strm_release(struct zcomp *comp, struct zcomp_strm *zstrm)
 {
-	comp->strm_release(comp, zstrm);
+	put_cpu_ptr(comp->stream);
 }
 
 int zcomp_compress(struct zcomp *comp, struct zcomp_strm *zstrm,
@@ -327,9 +123,83 @@ int zcomp_decompress(struct zcomp *comp, const unsigned char *src,
 	return comp->backend->decompress(src, src_len, dst);
 }
 
+static int __zcomp_cpu_notifier(struct zcomp *comp,
+		unsigned long action, unsigned long cpu)
+{
+	struct zcomp_strm *zstrm;
+
+	switch (action) {
+	case CPU_UP_PREPARE:
+		if (WARN_ON(*per_cpu_ptr(comp->stream, cpu)))
+			break;
+		zstrm = zcomp_strm_alloc(comp, GFP_KERNEL);
+		if (IS_ERR_OR_NULL(zstrm)) {
+			pr_err("Can't allocate a compression stream\n");
+			return NOTIFY_BAD;
+		}
+		*per_cpu_ptr(comp->stream, cpu) = zstrm;
+		break;
+	case CPU_DEAD:
+	case CPU_UP_CANCELED:
+		zstrm = *per_cpu_ptr(comp->stream, cpu);
+		if (!IS_ERR_OR_NULL(zstrm))
+			zcomp_strm_free(comp, zstrm);
+		*per_cpu_ptr(comp->stream, cpu) = NULL;
+		break;
+	default:
+		break;
+	}
+	return NOTIFY_OK;
+}
+
+static int zcomp_cpu_notifier(struct notifier_block *nb,
+		unsigned long action, void *pcpu)
+{
+	unsigned long cpu = (unsigned long)pcpu;
+	struct zcomp *comp = container_of(nb, typeof(*comp), notifier);
+
+	return __zcomp_cpu_notifier(comp, action, cpu);
+}
+
+static int zcomp_init(struct zcomp *comp)
+{
+	unsigned long cpu;
+	int ret;
+
+	comp->notifier.notifier_call = zcomp_cpu_notifier;
+
+	comp->stream = alloc_percpu(struct zcomp_strm *);
+	if (!comp->stream)
+		return -ENOMEM;
+
+	cpu_notifier_register_begin();
+	for_each_online_cpu(cpu) {
+		ret = __zcomp_cpu_notifier(comp, CPU_UP_PREPARE, cpu);
+		if (ret == NOTIFY_BAD)
+			goto cleanup;
+	}
+	__register_cpu_notifier(&comp->notifier);
+	cpu_notifier_register_done();
+	return 0;
+
+cleanup:
+	for_each_online_cpu(cpu)
+		__zcomp_cpu_notifier(comp, CPU_UP_CANCELED, cpu);
+	cpu_notifier_register_done();
+	return -ENOMEM;
+}
+
 void zcomp_destroy(struct zcomp *comp)
 {
-	comp->destroy(comp);
+	unsigned long cpu;
+
+	cpu_notifier_register_begin();
+	for_each_online_cpu(cpu)
+		__zcomp_cpu_notifier(comp, CPU_UP_CANCELED, cpu);
+	__unregister_cpu_notifier(&comp->notifier);
+	cpu_notifier_register_done();
+
+	free_percpu(comp->stream);
 	kfree(comp);
 }
 
@@ -356,10 +226,7 @@ struct zcomp *zcomp_create(const char *compress, int max_strm)
 		return ERR_PTR(-ENOMEM);
 
 	comp->backend = backend;
-	if (max_strm > 1)
-		error = zcomp_strm_multi_create(comp, max_strm);
-	else
-		error = zcomp_strm_single_create(comp);
+	error = zcomp_init(comp);
 	if (error) {
 		kfree(comp);
 		return ERR_PTR(error);
diff --git a/drivers/block/zram/zcomp.h b/drivers/block/zram/zcomp.h
index b7d2a4b..aba8c21 100644
--- a/drivers/block/zram/zcomp.h
+++ b/drivers/block/zram/zcomp.h
@@ -10,8 +10,6 @@
 #ifndef _ZCOMP_H_
 #define _ZCOMP_H_
 
-#include <linux/mutex.h>
-
 struct zcomp_strm {
 	/* compression/decompression buffer */
 	void *buffer;
@@ -21,8 +19,6 @@ struct zcomp_strm {
 	 * working memory)
 	 */
 	void *private;
-	/* used in multi stream backend, protected by backend strm_lock */
-	struct list_head list;
 };
 
 /* static compression backend */
@@ -41,13 +37,9 @@ struct zcomp_backend {
 
 /* dynamic per-device compression frontend */
 struct zcomp {
-	void *stream;
+	struct zcomp_strm * __percpu *stream;
 	struct zcomp_backend *backend;
-
-	struct zcomp_strm *(*strm_find)(struct zcomp *comp);
-	void (*strm_release)(struct zcomp *comp, struct zcomp_strm *zstrm);
-	bool (*set_max_streams)(struct zcomp *comp, int num_strm);
-	void (*destroy)(struct zcomp *comp);
+	struct notifier_block notifier;
 };
 
 ssize_t zcomp_available_show(const char *comp, char *buf);
diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index 9030992..c5e682c 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -650,7 +650,7 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index,
 {
 	int ret = 0;
 	size_t clen;
-	unsigned long handle;
+	unsigned long handle = 0;
 	struct page *page;
 	unsigned char *user_mem, *cmem, *src, *uncmem = NULL;
 	struct zram_meta *meta = zram->meta;
@@ -673,9 +673,8 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index,
 		goto out;
 	}
 
-	zstrm = zcomp_strm_find(zram->comp);
+compress_again:
 	user_mem = kmap_atomic(page);
-
 	if (is_partial_io(bvec)) {
 		memcpy(uncmem + offset, user_mem + bvec->bv_offset,
 		       bvec->bv_len);
@@ -699,6 +698,7 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index,
 		goto out;
 	}
 
+	zstrm = zcomp_strm_find(zram->comp);
 	ret = zcomp_compress(zram->comp, zstrm, uncmem, &clen);
 	if (!is_partial_io(bvec)) {
 		kunmap_atomic(user_mem);
@@ -710,6 +710,7 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index,
 		pr_err("Compression failed! err=%d\n", ret);
 		goto out;
 	}
+
 	src = zstrm->buffer;
 	if (unlikely(clen > max_zpage_size)) {
 		clen = PAGE_SIZE;
@@ -717,8 +718,31 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index,
 		src = uncmem;
 	}
 
-	handle = zs_malloc(meta->mem_pool, clen, GFP_NOIO | __GFP_HIGHMEM);
+	/*
+	 * handle allocation has 2 paths:
+	 * a) fast path is executed with preemption disabled (for
+	 *    per-cpu streams) and has __GFP_DIRECT_RECLAIM bit clear,
+	 *    since we can't sleep;
+	 * b) slow path enables preemption and attempts to allocate
+	 *    the page with __GFP_DIRECT_RECLAIM bit set. we have to
+	 *    put per-cpu compression stream and, thus, to re-do
+	 *    the compression once handle is allocated.
+	 *
+	 * if we have a 'non-null' handle here then we are coming
+	 * from the slow path and handle has already been allocated.
+	 */
+	if (!handle)
+		handle = zs_malloc(meta->mem_pool, clen,
+				__GFP_KSWAPD_RECLAIM |
+				__GFP_NOWARN |
+				__GFP_HIGHMEM);
 	if (!handle) {
+		zcomp_strm_release(zram->comp, zstrm);
+		handle = zs_malloc(meta->mem_pool, clen,
+				GFP_NOIO | __GFP_HIGHMEM);
+		if (handle)
+			goto compress_again;
+
 		pr_err("Error allocating memory for compressed page: %u, size=%zu\n",
 			index, clen);
 		ret = -ENOMEM;
-- 
2.8.0.rc0.1.gd285ab0
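(for reference, the write-path retry from the last hunk, condensed
into a sketch; compress() stands in for the kmap + zcomp_compress()
sequence, and error handling is omitted:

	unsigned long handle = 0;

compress_again:
	zstrm = zcomp_strm_find(zram->comp);	/* disables preemption */
	compress(page, &clen);
	if (!handle)
		/* fast path: no direct reclaim, we can't sleep here */
		handle = zs_malloc(meta->mem_pool, clen,
				__GFP_KSWAPD_RECLAIM | __GFP_NOWARN |
				__GFP_HIGHMEM);
	if (!handle) {
		/*
		 * slow path: put the stream, allocate with direct
		 * reclaim allowed, then re-do the compression, since
		 * we may wake up on a different CPU/stream.
		 */
		zcomp_strm_release(zram->comp, zstrm);
		handle = zs_malloc(meta->mem_pool, clen,
				GFP_NOIO | __GFP_HIGHMEM);
		if (handle)
			goto compress_again;
	}

the recompression on the slow path is the price for never sleeping
while holding a per-cpu stream.)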