Allocating compressed buffers on demand costs extra time for low-latency
algorithms (like lz4), so EROFS uses per-CPU buffers to hold compressed
data when in-place decompression cannot be used.  However, this wastes
memory on devices with hundreds of CPUs, since only a small number of CPUs
decompress concurrently most of the time.

Rename the per-CPU buffers to a 'global buffer pool' and make the number
of buffers configurable via the 'global_buffers' module parameter.  This
allows two or more CPUs to share a common buffer, reducing memory usage.

Suggested-by: Gao Xiang <xi...@kernel.org>
Reviewed-by: Gao Xiang <hsiang...@linux.alibaba.com>
Signed-off-by: Chunhai Guo <guochun...@vivo.com>
---
V1 -> V2: Fix compiling error when CONFIG_EROFS_FS_ZIP is disabled.
---
 fs/erofs/Makefile       |   2 +-
 fs/erofs/decompressor.c |   6 +-
 fs/erofs/internal.h     |  14 ++--
 fs/erofs/pcpubuf.c      | 148 ----------------------------------------
 fs/erofs/super.c        |   9 ++-
 fs/erofs/zutil.c        | 148 ++++++++++++++++++++++++++++++++++++++++
 6 files changed, 166 insertions(+), 161 deletions(-)
 delete mode 100644 fs/erofs/pcpubuf.c

diff --git a/fs/erofs/Makefile b/fs/erofs/Makefile
index 845eafdcee4a..20d1ec422443 100644
--- a/fs/erofs/Makefile
+++ b/fs/erofs/Makefile
@@ -3,7 +3,7 @@
 obj-$(CONFIG_EROFS_FS) += erofs.o
 erofs-objs := super.o inode.o data.o namei.o dir.o sysfs.o
 erofs-$(CONFIG_EROFS_FS_XATTR) += xattr.o
-erofs-$(CONFIG_EROFS_FS_ZIP) += decompressor.o zmap.o zdata.o pcpubuf.o zutil.o
+erofs-$(CONFIG_EROFS_FS_ZIP) += decompressor.o zmap.o zdata.o zutil.o
 erofs-$(CONFIG_EROFS_FS_ZIP_LZMA) += decompressor_lzma.o
 erofs-$(CONFIG_EROFS_FS_ZIP_DEFLATE) += decompressor_deflate.o
 erofs-$(CONFIG_EROFS_FS_ONDEMAND) += fscache.o
diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c
index 2ec9b2bb628d..e1239d886984 100644
--- a/fs/erofs/decompressor.c
+++ b/fs/erofs/decompressor.c
@@ -54,7 +54,7 @@ static int z_erofs_load_lz4_config(struct super_block *sb,
        sbi->lz4.max_distance_pages = distance ?
                                        DIV_ROUND_UP(distance, PAGE_SIZE) + 1 :
                                        LZ4_MAX_DISTANCE_PAGES;
-       return erofs_pcpubuf_growsize(sbi->lz4.max_pclusterblks);
+       return z_erofs_gbuf_growsize(sbi->lz4.max_pclusterblks);
 }
 
 /*
@@ -159,7 +159,7 @@ static void *z_erofs_lz4_handle_overlap(struct z_erofs_lz4_decompress_ctx *ctx,
 docopy:
        /* Or copy compressed data which can be overlapped to per-CPU buffer */
        in = rq->in;
-       src = erofs_get_pcpubuf(ctx->inpages);
+       src = z_erofs_get_gbuf(ctx->inpages);
        if (!src) {
                DBG_BUGON(1);
                kunmap_local(inpage);
@@ -260,7 +260,7 @@ static int z_erofs_lz4_decompress_mem(struct z_erofs_lz4_decompress_ctx *ctx,
        } else if (maptype == 1) {
                vm_unmap_ram(src, ctx->inpages);
        } else if (maptype == 2) {
-               erofs_put_pcpubuf(src);
+               z_erofs_put_gbuf(src);
        } else if (maptype != 3) {
                DBG_BUGON(1);
                return -EFAULT;
diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index 39c67119f43b..2ebbf3333800 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -470,11 +470,11 @@ int erofs_try_to_free_all_cached_folios(struct erofs_sb_info *sbi,
                                        struct erofs_workgroup *egrp);
 int z_erofs_map_blocks_iter(struct inode *inode, struct erofs_map_blocks *map,
                            int flags);
-void *erofs_get_pcpubuf(unsigned int requiredpages);
-void erofs_put_pcpubuf(void *ptr);
-int erofs_pcpubuf_growsize(unsigned int nrpages);
-void __init erofs_pcpubuf_init(void);
-void erofs_pcpubuf_exit(void);
+void *z_erofs_get_gbuf(unsigned int requiredpages);
+void z_erofs_put_gbuf(void *ptr);
+int z_erofs_gbuf_growsize(unsigned int nrpages);
+int __init z_erofs_gbuf_init(void);
+void z_erofs_gbuf_exit(void);
 int erofs_init_managed_cache(struct super_block *sb);
 int z_erofs_parse_cfgs(struct super_block *sb, struct erofs_super_block *dsb);
 #else
@@ -484,8 +484,8 @@ static inline int erofs_init_shrinker(void) { return 0; }
 static inline void erofs_exit_shrinker(void) {}
 static inline int z_erofs_init_zip_subsystem(void) { return 0; }
 static inline void z_erofs_exit_zip_subsystem(void) {}
-static inline void erofs_pcpubuf_init(void) {}
-static inline void erofs_pcpubuf_exit(void) {}
+static inline int z_erofs_gbuf_init(void) { return 0; }
+static inline void z_erofs_gbuf_exit(void) {}
 static inline int erofs_init_managed_cache(struct super_block *sb) { return 0; }
 #endif /* !CONFIG_EROFS_FS_ZIP */
 
diff --git a/fs/erofs/pcpubuf.c b/fs/erofs/pcpubuf.c
deleted file mode 100644
index c7a4b1d77069..000000000000
--- a/fs/erofs/pcpubuf.c
+++ /dev/null
@@ -1,148 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Copyright (C) Gao Xiang <xi...@kernel.org>
- *
- * For low-latency decompression algorithms (e.g. lz4), reserve consecutive
- * per-CPU virtual memory (in pages) in advance to store such inplace I/O
- * data if inplace decompression is failed (due to unmet inplace margin for
- * example).
- */
-#include "internal.h"
-
-struct erofs_pcpubuf {
-       raw_spinlock_t lock;
-       void *ptr;
-       struct page **pages;
-       unsigned int nrpages;
-};
-
-static DEFINE_PER_CPU(struct erofs_pcpubuf, erofs_pcb);
-
-void *erofs_get_pcpubuf(unsigned int requiredpages)
-       __acquires(pcb->lock)
-{
-       struct erofs_pcpubuf *pcb = &get_cpu_var(erofs_pcb);
-
-       raw_spin_lock(&pcb->lock);
-       /* check if the per-CPU buffer is too small */
-       if (requiredpages > pcb->nrpages) {
-               raw_spin_unlock(&pcb->lock);
-               put_cpu_var(erofs_pcb);
-               /* (for sparse checker) pretend pcb->lock is still taken */
-               __acquire(pcb->lock);
-               return NULL;
-       }
-       return pcb->ptr;
-}
-
-void erofs_put_pcpubuf(void *ptr) __releases(pcb->lock)
-{
-       struct erofs_pcpubuf *pcb = &per_cpu(erofs_pcb, smp_processor_id());
-
-       DBG_BUGON(pcb->ptr != ptr);
-       raw_spin_unlock(&pcb->lock);
-       put_cpu_var(erofs_pcb);
-}
-
-/* the next step: support per-CPU page buffers hotplug */
-int erofs_pcpubuf_growsize(unsigned int nrpages)
-{
-       static DEFINE_MUTEX(pcb_resize_mutex);
-       static unsigned int pcb_nrpages;
-       struct page *pagepool = NULL;
-       int delta, cpu, ret, i;
-
-       mutex_lock(&pcb_resize_mutex);
-       delta = nrpages - pcb_nrpages;
-       ret = 0;
-       /* avoid shrinking pcpubuf, since no idea how many fses rely on */
-       if (delta <= 0)
-               goto out;
-
-       for_each_possible_cpu(cpu) {
-               struct erofs_pcpubuf *pcb = &per_cpu(erofs_pcb, cpu);
-               struct page **pages, **oldpages;
-               void *ptr, *old_ptr;
-
-               pages = kmalloc_array(nrpages, sizeof(*pages), GFP_KERNEL);
-               if (!pages) {
-                       ret = -ENOMEM;
-                       break;
-               }
-
-               for (i = 0; i < nrpages; ++i) {
-                       pages[i] = erofs_allocpage(&pagepool, GFP_KERNEL);
-                       if (!pages[i]) {
-                               ret = -ENOMEM;
-                               oldpages = pages;
-                               goto free_pagearray;
-                       }
-               }
-               ptr = vmap(pages, nrpages, VM_MAP, PAGE_KERNEL);
-               if (!ptr) {
-                       ret = -ENOMEM;
-                       oldpages = pages;
-                       goto free_pagearray;
-               }
-               raw_spin_lock(&pcb->lock);
-               old_ptr = pcb->ptr;
-               pcb->ptr = ptr;
-               oldpages = pcb->pages;
-               pcb->pages = pages;
-               i = pcb->nrpages;
-               pcb->nrpages = nrpages;
-               raw_spin_unlock(&pcb->lock);
-
-               if (!oldpages) {
-                       DBG_BUGON(old_ptr);
-                       continue;
-               }
-
-               if (old_ptr)
-                       vunmap(old_ptr);
-free_pagearray:
-               while (i)
-                       erofs_pagepool_add(&pagepool, oldpages[--i]);
-               kfree(oldpages);
-               if (ret)
-                       break;
-       }
-       pcb_nrpages = nrpages;
-       erofs_release_pages(&pagepool);
-out:
-       mutex_unlock(&pcb_resize_mutex);
-       return ret;
-}
-
-void __init erofs_pcpubuf_init(void)
-{
-       int cpu;
-
-       for_each_possible_cpu(cpu) {
-               struct erofs_pcpubuf *pcb = &per_cpu(erofs_pcb, cpu);
-
-               raw_spin_lock_init(&pcb->lock);
-       }
-}
-
-void erofs_pcpubuf_exit(void)
-{
-       int cpu, i;
-
-       for_each_possible_cpu(cpu) {
-               struct erofs_pcpubuf *pcb = &per_cpu(erofs_pcb, cpu);
-
-               if (pcb->ptr) {
-                       vunmap(pcb->ptr);
-                       pcb->ptr = NULL;
-               }
-               if (!pcb->pages)
-                       continue;
-
-               for (i = 0; i < pcb->nrpages; ++i)
-                       if (pcb->pages[i])
-                               put_page(pcb->pages[i]);
-               kfree(pcb->pages);
-               pcb->pages = NULL;
-       }
-}
diff --git a/fs/erofs/super.c b/fs/erofs/super.c
index c0eb139adb07..7d8718420136 100644
--- a/fs/erofs/super.c
+++ b/fs/erofs/super.c
@@ -873,7 +873,10 @@ static int __init erofs_module_init(void)
        if (err)
                goto deflate_err;
 
-       erofs_pcpubuf_init();
+       err = z_erofs_gbuf_init();
+       if (err)
+               goto gbuf_err;
+
        err = z_erofs_init_zip_subsystem();
        if (err)
                goto zip_err;
@@ -893,6 +896,8 @@ static int __init erofs_module_init(void)
 sysfs_err:
        z_erofs_exit_zip_subsystem();
 zip_err:
+       z_erofs_gbuf_exit();
+gbuf_err:
        z_erofs_deflate_exit();
 deflate_err:
        z_erofs_lzma_exit();
@@ -916,7 +921,7 @@ static void __exit erofs_module_exit(void)
        z_erofs_lzma_exit();
        erofs_exit_shrinker();
        kmem_cache_destroy(erofs_inode_cachep);
-       erofs_pcpubuf_exit();
+       z_erofs_gbuf_exit();
 }
 
 static int erofs_statfs(struct dentry *dentry, struct kstatfs *buf)
diff --git a/fs/erofs/zutil.c b/fs/erofs/zutil.c
index 8cd30ac2091f..e13806681763 100644
--- a/fs/erofs/zutil.c
+++ b/fs/erofs/zutil.c
@@ -5,6 +5,18 @@
  */
 #include "internal.h"
 
+struct z_erofs_gbuf {
+       spinlock_t lock;
+       void *ptr;
+       struct page **pages;
+       unsigned int nrpages;
+};
+
+static struct z_erofs_gbuf *z_erofs_gbufpool;
+static unsigned int z_erofs_gbuf_count, z_erofs_gbuf_nrpages;
+
+module_param_named(global_buffers, z_erofs_gbuf_count, uint, 0444);
+
 static atomic_long_t erofs_global_shrink_cnt;  /* for all mounted instances */
 /* protected by 'erofs_sb_list_lock' */
 static unsigned int shrinker_run_no;
@@ -14,6 +26,142 @@ static DEFINE_SPINLOCK(erofs_sb_list_lock);
 static LIST_HEAD(erofs_sb_list);
 static struct shrinker *erofs_shrinker_info;
 
+static unsigned int z_erofs_gbuf_id(void)
+{
+       return smp_processor_id() % z_erofs_gbuf_count;
+}
+
+void *z_erofs_get_gbuf(unsigned int requiredpages)
+       __acquires(gbuf->lock)
+{
+       struct z_erofs_gbuf *gbuf;
+
+       gbuf = &z_erofs_gbufpool[z_erofs_gbuf_id()];
+       spin_lock(&gbuf->lock);
+       /* check if the buffer is too small */
+       if (requiredpages > gbuf->nrpages) {
+               spin_unlock(&gbuf->lock);
+               /* (for sparse checker) pretend gbuf->lock is still taken */
+               __acquire(gbuf->lock);
+               return NULL;
+       }
+       return gbuf->ptr;
+}
+
+void z_erofs_put_gbuf(void *ptr) __releases(gbuf->lock)
+{
+       struct z_erofs_gbuf *gbuf;
+
+       gbuf = &z_erofs_gbufpool[z_erofs_gbuf_id()];
+       DBG_BUGON(gbuf->ptr != ptr);
+       spin_unlock(&gbuf->lock);
+}
+
+int z_erofs_gbuf_growsize(unsigned int nrpages)
+{
+       static DEFINE_MUTEX(gbuf_resize_mutex);
+       struct page *pagepool = NULL;
+       int delta, ret, i, j;
+
+       mutex_lock(&gbuf_resize_mutex);
+       delta = nrpages - z_erofs_gbuf_nrpages;
+       ret = 0;
+       /* avoid shrinking gbufs, since no idea how many fses rely on */
+       if (delta <= 0)
+               goto out;
+
+       for (i = 0; i < z_erofs_gbuf_count; ++i) {
+               struct z_erofs_gbuf *gbuf = &z_erofs_gbufpool[i];
+               struct page **pages, **tmp_pages;
+               void *ptr, *old_ptr = NULL;
+
+               ret = -ENOMEM;
+               tmp_pages = kcalloc(nrpages, sizeof(*tmp_pages), GFP_KERNEL);
+               if (!tmp_pages)
+                       break;
+               for (j = 0; j < nrpages; ++j) {
+                       tmp_pages[j] = erofs_allocpage(&pagepool, GFP_KERNEL);
+                       if (!tmp_pages[j])
+                               goto free_pagearray;
+               }
+               ptr = vmap(tmp_pages, nrpages, VM_MAP, PAGE_KERNEL);
+               if (!ptr)
+                       goto free_pagearray;
+
+               pages = tmp_pages;
+               spin_lock(&gbuf->lock);
+               old_ptr = gbuf->ptr;
+               gbuf->ptr = ptr;
+               tmp_pages = gbuf->pages;
+               gbuf->pages = pages;
+               j = gbuf->nrpages;
+               gbuf->nrpages = nrpages;
+               spin_unlock(&gbuf->lock);
+               ret = 0;
+               if (!tmp_pages) {
+                       DBG_BUGON(old_ptr);
+                       continue;
+               }
+
+               if (old_ptr)
+                       vunmap(old_ptr);
+free_pagearray:
+               while (j)
+                       erofs_pagepool_add(&pagepool, tmp_pages[--j]);
+               kfree(tmp_pages);
+               if (ret)
+                       break;
+       }
+       z_erofs_gbuf_nrpages = nrpages;
+       erofs_release_pages(&pagepool);
+out:
+       mutex_unlock(&gbuf_resize_mutex);
+       return ret;
+}
+
+int __init z_erofs_gbuf_init(void)
+{
+       unsigned int i = num_possible_cpus();
+
+       if (!z_erofs_gbuf_count)
+               z_erofs_gbuf_count = i;
+       else
+               z_erofs_gbuf_count = min(z_erofs_gbuf_count, i);
+
+       z_erofs_gbufpool = kcalloc(z_erofs_gbuf_count,
+                       sizeof(*z_erofs_gbufpool), GFP_KERNEL);
+       if (!z_erofs_gbufpool)
+               return -ENOMEM;
+
+       for (i = 0; i < z_erofs_gbuf_count; ++i)
+               spin_lock_init(&z_erofs_gbufpool[i].lock);
+       return 0;
+}
+
+void z_erofs_gbuf_exit(void)
+{
+       int i, j;       /* i: pool slot; j: page in that slot (must not clobber i) */
+
+       for (i = 0; i < z_erofs_gbuf_count; ++i) {
+               struct z_erofs_gbuf *gbuf = &z_erofs_gbufpool[i];
+
+               if (gbuf->ptr) {
+                       vunmap(gbuf->ptr);
+                       gbuf->ptr = NULL;
+               }
+
+               if (!gbuf->pages)
+                       continue;
+
+               for (j = 0; j < gbuf->nrpages; ++j)
+                       if (gbuf->pages[j])
+                               put_page(gbuf->pages[j]);
+               kfree(gbuf->pages);
+               gbuf->pages = NULL;
+       }
+       kfree(z_erofs_gbufpool);
+}
+
 struct page *erofs_allocpage(struct page **pagepool, gfp_t gfp)
 {
        struct page *page = *pagepool;
-- 
2.25.1

Reply via email to