Got it. I will send patch v2 following your suggestions.
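
For the init path, here is roughly what I have in mind for v2 based on your
comments below. It is only an untested sketch: kcalloc() and min() are my own
picks rather than something from your mail, and I am assuming the trailing
loop only needs to initialize each buffer's spinlock as the old per-CPU code
did:

struct z_erofs_gbuf *z_erofs_gbufpool;
unsigned int z_erofs_gbuf_count, z_erofs_gbuf_nrpages;

module_param_named(global_buffers, z_erofs_gbuf_count, uint, 0444);

void __init z_erofs_gbuf_init(void)
{
	unsigned int i, cpus = num_possible_cpus();

	/* clamp the configured buffer count to [1, num_possible_cpus()] */
	if (!z_erofs_gbuf_count)
		z_erofs_gbuf_count = cpus;
	else
		z_erofs_gbuf_count = min(z_erofs_gbuf_count, cpus);

	/* zeroed array, so ptr/pages/nrpages start out as NULL/0 */
	z_erofs_gbufpool = kcalloc(z_erofs_gbuf_count,
				   sizeof(*z_erofs_gbufpool), GFP_KERNEL);
	if (!z_erofs_gbufpool) {
		erofs_err(NULL, "failed to allocate global buffer array");
		return;
	}
	for (i = 0; i < z_erofs_gbuf_count; ++i)
		spin_lock_init(&z_erofs_gbufpool[i].lock);
}

Thanks,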
On 2024/3/22 12:10, Gao Xiang wrote:
>
> On 2024/3/22 11:47, Chunhai Guo wrote:
>> It will cost more time if compressed buffers are allocated on demand for
>> low-latency algorithms (like lz4) so EROFS uses per-CPU buffers to keep
>> compressed data if in-place decompression is unfulfilled. While it is
>> kind of wasteful of memory for a device with hundreds of CPUs, and only
>> a small number of CPUs are concurrently decompressing most of the time.
> a small number of CPUs concurrently decompress most of the time.
>
>> This patch renames it as 'global buffer pool' and makes it configurable.
>> This allows two or more CPUs to share a common buffer to reduce memory
>> occupation.
>>
>> Signed-off-by: Chunhai Guo <guochun...@vivo.com>
>> Suggested-by: Gao Xiang <xi...@kernel.org>
>> ---
>>   fs/erofs/Makefile       |   2 +-
>>   fs/erofs/decompressor.c |   6 +-
>>   fs/erofs/internal.h     |  14 ++--
>>   fs/erofs/pcpubuf.c      | 148 --------------------------------------
>>   fs/erofs/super.c        |   4 +-
>>   fs/erofs/utils.c        | 155 ++++++++++++++++++++++++++++++++++++++++
>>   6 files changed, 168 insertions(+), 161 deletions(-)
>>   delete mode 100644 fs/erofs/pcpubuf.c
>>
>> diff --git a/fs/erofs/Makefile b/fs/erofs/Makefile
>> index 994d0b9deddf..6cf1504c63e6 100644
>> --- a/fs/erofs/Makefile
>> +++ b/fs/erofs/Makefile
>> @@ -3,7 +3,7 @@
>>  obj-$(CONFIG_EROFS_FS) += erofs.o
>>  erofs-objs := super.o inode.o data.o namei.o dir.o utils.o sysfs.o
>>  erofs-$(CONFIG_EROFS_FS_XATTR) += xattr.o
>> -erofs-$(CONFIG_EROFS_FS_ZIP) += decompressor.o zmap.o zdata.o pcpubuf.o
>> +erofs-$(CONFIG_EROFS_FS_ZIP) += decompressor.o zmap.o zdata.o
>>  erofs-$(CONFIG_EROFS_FS_ZIP_LZMA) += decompressor_lzma.o
>>  erofs-$(CONFIG_EROFS_FS_ZIP_DEFLATE) += decompressor_deflate.o
>>  erofs-$(CONFIG_EROFS_FS_ONDEMAND) += fscache.o
>> diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c
>> index d4cee95af14c..99a344684132 100644
>> --- a/fs/erofs/decompressor.c
>> +++ b/fs/erofs/decompressor.c
>> @@ -54,7 +54,7 @@ static int z_erofs_load_lz4_config(struct super_block *sb,
>>  	sbi->lz4.max_distance_pages = distance ?
>>  			DIV_ROUND_UP(distance, PAGE_SIZE) + 1 :
>>  			LZ4_MAX_DISTANCE_PAGES;
>> -	return erofs_pcpubuf_growsize(sbi->lz4.max_pclusterblks);
>> +	return erofs_gbuf_growsize(sbi->lz4.max_pclusterblks);
>>  }
>>
>>  /*
>> @@ -159,7 +159,7 @@ static void *z_erofs_lz4_handle_overlap(struct z_erofs_lz4_decompress_ctx *ctx,
>>  docopy:
>>  	/* Or copy compressed data which can be overlapped to per-CPU buffer */
>>  	in = rq->in;
>> -	src = erofs_get_pcpubuf(ctx->inpages);
>> +	src = erofs_get_gbuf(ctx->inpages);
>>  	if (!src) {
>>  		DBG_BUGON(1);
>>  		kunmap_local(inpage);
>> @@ -260,7 +260,7 @@ static int z_erofs_lz4_decompress_mem(struct z_erofs_lz4_decompress_ctx *ctx,
>>  	} else if (maptype == 1) {
>>  		vm_unmap_ram(src, ctx->inpages);
>>  	} else if (maptype == 2) {
>> -		erofs_put_pcpubuf(src);
>> +		erofs_put_gbuf(src);
>>  	} else if (maptype != 3) {
>>  		DBG_BUGON(1);
>>  		return -EFAULT;
>> diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
>> index b0409badb017..320d3d0d3526 100644
>> --- a/fs/erofs/internal.h
>> +++ b/fs/erofs/internal.h
>> @@ -471,11 +471,11 @@ int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi,
>>  				       struct erofs_workgroup *egrp);
>>  int z_erofs_map_blocks_iter(struct inode *inode, struct erofs_map_blocks *map,
>>  			    int flags);
>> -void *erofs_get_pcpubuf(unsigned int requiredpages);
>> -void erofs_put_pcpubuf(void *ptr);
>> -int erofs_pcpubuf_growsize(unsigned int nrpages);
>> -void __init erofs_pcpubuf_init(void);
>> -void erofs_pcpubuf_exit(void);
>> +void *erofs_get_gbuf(unsigned int requiredpages);
>> +void erofs_put_gbuf(void *ptr);
>> +int erofs_gbuf_growsize(unsigned int nrpages);
>> +void __init erofs_gbuf_init(void);
>> +void erofs_gbuf_exit(void);
>>  int erofs_init_managed_cache(struct super_block *sb);
>>  int z_erofs_parse_cfgs(struct super_block *sb, struct erofs_super_block *dsb);
>>  #else
>> @@ -485,8 +485,8 @@ static inline int erofs_init_shrinker(void) { return 0; }
>>  static inline void erofs_exit_shrinker(void) {}
>>  static inline int z_erofs_init_zip_subsystem(void) { return 0; }
>>  static inline void z_erofs_exit_zip_subsystem(void) {}
>> -static inline void erofs_pcpubuf_init(void) {}
>> -static inline void erofs_pcpubuf_exit(void) {}
>> +static inline void erofs_gbuf_init(void) {}
>> +static inline void erofs_gbuf_exit(void) {}
>>  static inline int erofs_init_managed_cache(struct super_block *sb) { return 0; }
>>  #endif /* !CONFIG_EROFS_FS_ZIP */
>>
>> diff --git a/fs/erofs/pcpubuf.c b/fs/erofs/pcpubuf.c
>> deleted file mode 100644
>> index c7a4b1d77069..000000000000
>> --- a/fs/erofs/pcpubuf.c
>> +++ /dev/null
>> @@ -1,148 +0,0 @@
>> -// SPDX-License-Identifier: GPL-2.0-only
>> -/*
>> - * Copyright (C) Gao Xiang <xi...@kernel.org>
>> - *
>> - * For low-latency decompression algorithms (e.g. lz4), reserve consecutive
>> - * per-CPU virtual memory (in pages) in advance to store such inplace I/O
>> - * data if inplace decompression is failed (due to unmet inplace margin for
>> - * example).
>> - */
>> -#include "internal.h"
>> -
>> -struct erofs_pcpubuf {
>> -	raw_spinlock_t lock;
>> -	void *ptr;
>> -	struct page **pages;
>> -	unsigned int nrpages;
>> -};
>> -
>> -static DEFINE_PER_CPU(struct erofs_pcpubuf, erofs_pcb);
>> -
>> -void *erofs_get_pcpubuf(unsigned int requiredpages)
>> -	__acquires(pcb->lock)
>> -{
>> -	struct erofs_pcpubuf *pcb = &get_cpu_var(erofs_pcb);
>> -
>> -	raw_spin_lock(&pcb->lock);
>> -	/* check if the per-CPU buffer is too small */
>> -	if (requiredpages > pcb->nrpages) {
>> -		raw_spin_unlock(&pcb->lock);
>> -		put_cpu_var(erofs_pcb);
>> -		/* (for sparse checker) pretend pcb->lock is still taken */
>> -		__acquire(pcb->lock);
>> -		return NULL;
>> -	}
>> -	return pcb->ptr;
>> -}
>> -
>> -void erofs_put_pcpubuf(void *ptr) __releases(pcb->lock)
>> -{
>> -	struct erofs_pcpubuf *pcb = &per_cpu(erofs_pcb, smp_processor_id());
>> -
>> -	DBG_BUGON(pcb->ptr != ptr);
>> -	raw_spin_unlock(&pcb->lock);
>> -	put_cpu_var(erofs_pcb);
>> -}
>> -
>> -/* the next step: support per-CPU page buffers hotplug */
>> -int erofs_pcpubuf_growsize(unsigned int nrpages)
>> -{
>> -	static DEFINE_MUTEX(pcb_resize_mutex);
>> -	static unsigned int pcb_nrpages;
>> -	struct page *pagepool = NULL;
>> -	int delta, cpu, ret, i;
>> -
>> -	mutex_lock(&pcb_resize_mutex);
>> -	delta = nrpages - pcb_nrpages;
>> -	ret = 0;
>> -	/* avoid shrinking pcpubuf, since no idea how many fses rely on */
>> -	if (delta <= 0)
>> -		goto out;
>> -
>> -	for_each_possible_cpu(cpu) {
>> -		struct erofs_pcpubuf *pcb = &per_cpu(erofs_pcb, cpu);
>> -		struct page **pages, **oldpages;
>> -		void *ptr, *old_ptr;
>> -
>> -		pages = kmalloc_array(nrpages, sizeof(*pages), GFP_KERNEL);
>> -		if (!pages) {
>> -			ret = -ENOMEM;
>> -			break;
>> -		}
>> -
>> -		for (i = 0; i < nrpages; ++i) {
>> -			pages[i] = erofs_allocpage(&pagepool, GFP_KERNEL);
>> -			if (!pages[i]) {
>> -				ret = -ENOMEM;
>> -				oldpages = pages;
>> -				goto free_pagearray;
>> -			}
>> -		}
>> -		ptr = vmap(pages, nrpages, VM_MAP, PAGE_KERNEL);
>> -		if (!ptr) {
>> -			ret = -ENOMEM;
>> -			oldpages = pages;
>> -			goto free_pagearray;
>> -		}
>> -		raw_spin_lock(&pcb->lock);
>> -		old_ptr = pcb->ptr;
>> -		pcb->ptr = ptr;
>> -		oldpages = pcb->pages;
>> -		pcb->pages = pages;
>> -		i = pcb->nrpages;
>> -		pcb->nrpages = nrpages;
>> -		raw_spin_unlock(&pcb->lock);
>> -
>> -		if (!oldpages) {
>> -			DBG_BUGON(old_ptr);
>> -			continue;
>> -		}
>> -
>> -		if (old_ptr)
>> -			vunmap(old_ptr);
>> -free_pagearray:
>> -		while (i)
>> -			erofs_pagepool_add(&pagepool, oldpages[--i]);
>> -		kfree(oldpages);
>> -		if (ret)
>> -			break;
>> -	}
>> -	pcb_nrpages = nrpages;
>> -	erofs_release_pages(&pagepool);
>> -out:
>> -	mutex_unlock(&pcb_resize_mutex);
>> -	return ret;
>> -}
>> -
>> -void __init erofs_pcpubuf_init(void)
>> -{
>> -	int cpu;
>> -
>> -	for_each_possible_cpu(cpu) {
>> -		struct erofs_pcpubuf *pcb = &per_cpu(erofs_pcb, cpu);
>> -
>> -		raw_spin_lock_init(&pcb->lock);
>> -	}
>> -}
>> -
>> -void erofs_pcpubuf_exit(void)
>> -{
>> -	int cpu, i;
>> -
>> -	for_each_possible_cpu(cpu) {
>> -		struct erofs_pcpubuf *pcb = &per_cpu(erofs_pcb, cpu);
>> -
>> -		if (pcb->ptr) {
>> -			vunmap(pcb->ptr);
>> -			pcb->ptr = NULL;
>> -		}
>> -		if (!pcb->pages)
>> -			continue;
>> -
>> -		for (i = 0; i < pcb->nrpages; ++i)
>> -			if (pcb->pages[i])
>> -				put_page(pcb->pages[i]);
>> -		kfree(pcb->pages);
>> -		pcb->pages = NULL;
>> -	}
>> -}
>> diff --git a/fs/erofs/super.c b/fs/erofs/super.c
>> index 5f60f163bd56..5161923c33fc 100644
>> --- a/fs/erofs/super.c
>> +++ b/fs/erofs/super.c
>> @@ -902,7 +902,7 @@ static int __init erofs_module_init(void)
>>  	if (err)
>>  		goto deflate_err;
>>
>> -	erofs_pcpubuf_init();
>> +	erofs_gbuf_init();
>>  	err = z_erofs_init_zip_subsystem();
>>  	if (err)
>>  		goto zip_err;
>> @@ -945,7 +945,7 @@ static void __exit erofs_module_exit(void)
>>  	z_erofs_lzma_exit();
>>  	erofs_exit_shrinker();
>>  	kmem_cache_destroy(erofs_inode_cachep);
>> -	erofs_pcpubuf_exit();
>> +	erofs_gbuf_exit();
>>  }
>>
>>  static int erofs_statfs(struct dentry *dentry, struct kstatfs *buf)
>> diff --git a/fs/erofs/utils.c b/fs/erofs/utils.c
>> index e146d09151af..7bdff1c7ce19 100644
>> --- a/fs/erofs/utils.c
>> +++ b/fs/erofs/utils.c
>> @@ -284,4 +284,159 @@ void erofs_exit_shrinker(void)
>>  {
>>  	shrinker_free(erofs_shrinker_info);
>>  }
>> +
>> +struct erofs_gbuf {
> struct z_erofs_gbuf {
>
>> +	spinlock_t lock;
>> +	void *ptr;
>> +	struct page **pages;
>> +	unsigned int nrpages, ref;
>> +	struct list_head list;
>> +};
>> +
>> +struct erofs_gbufpool {
>> +	struct erofs_gbuf *gbuf_array;
>> +	unsigned int gbuf_num, gbuf_nrpages;
>> +};
> could we avoid this structure? I think it's only used once:
>
> struct z_erofs_gbuf *z_erofs_gbufpool;
> unsigned int z_erofs_gbuf_count, z_erofs_gbuf_nrpages;
>
> module_param_named(global_buffers, z_erofs_gbuf_count, uint, 0444);
>
>> +
>> +struct erofs_gbufpool z_erofs_gbufpool;
>> +
>> +module_param_named(pcluster_buf_num, z_erofs_gbufpool.gbuf_num, uint, 0444);
>> +
>> +static inline unsigned int erofs_cur_gbuf(void)
> inline here is unnecessary, and
>
> static unsigned int z_erofs_gbuf_id(void)
> {
>
>> +{
>> +	return smp_processor_id() % z_erofs_gbufpool.gbuf_num;
>> +}
>> +
>> +void *erofs_get_gbuf(unsigned int requiredpages)
> z_erofs_get_gbuf
>
>> +	__acquires(gbuf->lock)
>> +{
>> +	struct erofs_gbuf *gbuf;
>> +
>> +	gbuf = &z_erofs_gbufpool.gbuf_array[erofs_cur_gbuf()];
>> +	spin_lock(&gbuf->lock);
>> +	/* check if the buffer is too small */
>> +	if (requiredpages > gbuf->nrpages) {
>> +		spin_unlock(&gbuf->lock);
>> +		/* (for sparse checker) pretend gbuf->lock is still taken */
>> +		__acquire(gbuf->lock);
>> +		return NULL;
>> +	}
>> +	return gbuf->ptr;
>> +}
>> +
>> +void erofs_put_gbuf(void *ptr) __releases(gbuf->lock)
> z_erofs_put_gbuf
>
>> +{
>> +	struct erofs_gbuf *gbuf;
>> +
>> +	gbuf = &z_erofs_gbufpool.gbuf_array[erofs_cur_gbuf()];
>> +	DBG_BUGON(gbuf->ptr != ptr);
>> +	spin_unlock(&gbuf->lock);
>> +}
>> +
>> +int erofs_gbuf_growsize(unsigned int nrpages)
> z_erofs_gbuf_growsize
>
>> +{
>> +	static DEFINE_MUTEX(gbuf_resize_mutex);
>> +	struct page *pagepool = NULL;
> Since we just grow global buffers, I think we could just copy
> the old array and fill the rest? so that no pagepool is needed
> and we don't touch `struct page` anymore so `folio` is not
> needed.
>
> But it can be done with a separate patch.
>
>> +	int delta, ret, i, j;
>> +
>> +	if (!z_erofs_gbufpool.gbuf_array)
>> +		return -ENOMEM;
>> +
>> +	mutex_lock(&gbuf_resize_mutex);
>> +	delta = nrpages - z_erofs_gbufpool.gbuf_nrpages;
>> +	ret = 0;
> ret = -EINVAL;
>
>> +	/* avoid shrinking pcl_buf, since no idea how many fses rely on */
> /* avoid shrinking gbufs ... */
>
>> +	if (delta <= 0)
>> +		goto out;
>> +
>> +	for (i = 0; i < z_erofs_gbufpool.gbuf_num; ++i) {
>> +		struct erofs_gbuf *gbuf = &z_erofs_gbufpool.gbuf_array[i];
>> +		struct page **pages, **tmp_pages;
>> +		void *ptr, *old_ptr = NULL;
>> +
>> +		ret = -ENOMEM;
>> +		tmp_pages = kmalloc_array(nrpages, sizeof(*tmp_pages),
>> +					  GFP_KERNEL);
>> +		if (!tmp_pages)
>> +			break;
>> +		for (j = 0; j < nrpages; ++j) {
>> +			tmp_pages[j] = erofs_allocpage(&pagepool, GFP_KERNEL);
>> +			if (!tmp_pages[j])
>> +				goto free_pagearray;
>> +		}
>> +		ptr = vmap(tmp_pages, nrpages, VM_MAP, PAGE_KERNEL);
>> +		if (!ptr)
>> +			goto free_pagearray;
>> +
>> +		pages = tmp_pages;
>> +		spin_lock(&gbuf->lock);
>> +		old_ptr = gbuf->ptr;
>> +		gbuf->ptr = ptr;
>> +		tmp_pages = gbuf->pages;
>> +		gbuf->pages = pages;
>> +		j = gbuf->nrpages;
>> +		gbuf->nrpages = nrpages;
>> +		spin_unlock(&gbuf->lock);
>> +		ret = 0;
>> +		if (!tmp_pages) {
>> +			DBG_BUGON(old_ptr);
>> +			continue;
>> +		}
>> +
>> +		if (old_ptr)
>> +			vunmap(old_ptr);
>> +free_pagearray:
>> +		while (j)
>> +			erofs_pagepool_add(&pagepool, tmp_pages[--j]);
>> +		kfree(tmp_pages);
>> +		if (ret)
>> +			break;
>> +	}
>> +	z_erofs_gbufpool.gbuf_nrpages = nrpages;
>> +	erofs_release_pages(&pagepool);
>> +out:
>> +	mutex_unlock(&gbuf_resize_mutex);
>> +	return ret;
>> +}
>> +
>> +void __init erofs_gbuf_init(void)
>> +{
>> +	if (!z_erofs_gbufpool.gbuf_num)
>> +		z_erofs_gbufpool.gbuf_num = num_possible_cpus();
>> +	else if (z_erofs_gbufpool.gbuf_num > num_possible_cpus())
>> +		z_erofs_gbufpool.gbuf_num = num_possible_cpus();
> unsigned int i = num_possible_cpus();
>
> if (!z_erofs_gbuf_count)
> 	z_erofs_gbuf_count = i;
> else
> 	z_erofs_gbuf_count = min(z_erofs_gbuf_count, i);
>
>> +
>> +	z_erofs_gbufpool.gbuf_array = kmalloc_array(z_erofs_gbufpool.gbuf_num,
>> +			sizeof(*z_erofs_gbufpool.gbuf_array),
>> +			GFP_KERNEL | __GFP_ZERO);
>> +	if (!z_erofs_gbufpool.gbuf_array) {
>> +		erofs_err(NULL, "failed to alloc page for erofs gbuf_array\n");
>
> erofs_err(NULL, "failed to allocate global buffer array");
>
>> +		return;
>> +	}
>> +
>> +	for (int i = 0; i < z_erofs_gbufpool.gbuf_num; ++i)
> please don't use `for (int i..`.
>
> so for (i = 0; i < z_erofs_gbufpool.gbuf_num; ++i)
>
> Thanks,
> Gao Xiang
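
Regarding the growsize simplification above (copying the old array and only
filling the rest), I will follow up with a separate patch as you suggested.
If I understand the idea correctly, the per-buffer part would become
something like the untested sketch below, so the pagepool goes away and only
the delta is allocated; alloc_page()/__free_page() here are just placeholders
for whatever allocation helper ends up being used:

	tmp_pages = kcalloc(nrpages, sizeof(*tmp_pages), GFP_KERNEL);
	if (!tmp_pages)
		break;
	/* carry over the pages already backing this gbuf */
	for (j = 0; j < gbuf->nrpages; ++j)
		tmp_pages[j] = gbuf->pages[j];
	/* allocate only the delta of new pages */
	for (; j < nrpages; ++j) {
		tmp_pages[j] = alloc_page(GFP_KERNEL);
		if (!tmp_pages[j]) {
			/* on failure, free only the newly allocated pages */
			while (j > gbuf->nrpages)
				__free_page(tmp_pages[--j]);
			kfree(tmp_pages);
			goto out;	/* ret is still -ENOMEM here */
		}
	}
	ptr = vmap(tmp_pages, nrpages, VM_MAP, PAGE_KERNEL);

The old mapping would still need vunmap() after the switch, but the
underlying pages themselves would stay in place.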