Got it. I will send patch v2 following your suggestions.
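
For the init path, here is roughly what I have in mind for v2 based on your
comments below. It is only an untested sketch: kcalloc() and min() are my own
picks rather than something from your mail, and I am assuming the trailing
loop only needs to initialize each buffer's spinlock as the old per-CPU code
did:

struct z_erofs_gbuf *z_erofs_gbufpool;
unsigned int z_erofs_gbuf_count, z_erofs_gbuf_nrpages;

module_param_named(global_buffers, z_erofs_gbuf_count, uint, 0444);

void __init z_erofs_gbuf_init(void)
{
	unsigned int i, cpus = num_possible_cpus();

	/* clamp the configured buffer count to [1, num_possible_cpus()] */
	if (!z_erofs_gbuf_count)
		z_erofs_gbuf_count = cpus;
	else
		z_erofs_gbuf_count = min(z_erofs_gbuf_count, cpus);

	/* zeroed array, so ptr/pages/nrpages start out as NULL/0 */
	z_erofs_gbufpool = kcalloc(z_erofs_gbuf_count,
				   sizeof(*z_erofs_gbufpool), GFP_KERNEL);
	if (!z_erofs_gbufpool) {
		erofs_err(NULL, "failed to allocate global buffer array");
		return;
	}
	for (i = 0; i < z_erofs_gbuf_count; ++i)
		spin_lock_init(&z_erofs_gbufpool[i].lock);
}

Thanks,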
On 2024/3/22 12:10, Gao Xiang wrote:
>
> On 2024/3/22 11:47, Chunhai Guo wrote:
>> It will cost more time if compressed buffers are allocated on demand for
>> low-latency algorithms (like lz4) so EROFS uses per-CPU buffers to keep
>> compressed data if in-place decompression is unfulfilled. While it is
>> kind of wasteful of memory for a device with hundreds of CPUs, and only
>> a small number of CPUs are concurrently decompressing most of the time.
> a small number of CPUs concurrently decompress most of the time.
>
>> This patch renames it as 'global buffer pool' and makes it configurable.
>> This allows two or more CPUs to share a common buffer to reduce memory
>> occupation.
>>
>> Signed-off-by: Chunhai Guo <guochun...@vivo.com>
>> Suggested-by: Gao Xiang <xi...@kernel.org>
>> ---
>>   fs/erofs/Makefile       |   2 +-
>>   fs/erofs/decompressor.c |   6 +-
>>   fs/erofs/internal.h     |  14 ++--
>>   fs/erofs/pcpubuf.c      | 148 --------------------------------------
>>   fs/erofs/super.c        |   4 +-
>>   fs/erofs/utils.c        | 155 ++++++++++++++++++++++++++++++++++++++++
>>   6 files changed, 168 insertions(+), 161 deletions(-)
>>   delete mode 100644 fs/erofs/pcpubuf.c
>>
>> diff --git a/fs/erofs/Makefile b/fs/erofs/Makefile
>> index 994d0b9deddf..6cf1504c63e6 100644
>> --- a/fs/erofs/Makefile
>> +++ b/fs/erofs/Makefile
>> @@ -3,7 +3,7 @@
>>  obj-$(CONFIG_EROFS_FS) += erofs.o
>>  erofs-objs := super.o inode.o data.o namei.o dir.o utils.o sysfs.o
>>  erofs-$(CONFIG_EROFS_FS_XATTR) += xattr.o
>> -erofs-$(CONFIG_EROFS_FS_ZIP) += decompressor.o zmap.o zdata.o pcpubuf.o
>> +erofs-$(CONFIG_EROFS_FS_ZIP) += decompressor.o zmap.o zdata.o
>>  erofs-$(CONFIG_EROFS_FS_ZIP_LZMA) += decompressor_lzma.o
>>  erofs-$(CONFIG_EROFS_FS_ZIP_DEFLATE) += decompressor_deflate.o
>>  erofs-$(CONFIG_EROFS_FS_ONDEMAND) += fscache.o
>> diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c
>> index d4cee95af14c..99a344684132 100644
>> --- a/fs/erofs/decompressor.c
>> +++ b/fs/erofs/decompressor.c
>> @@ -54,7 +54,7 @@ static int z_erofs_load_lz4_config(struct super_block *sb,
>>  	sbi->lz4.max_distance_pages = distance ?
>>  			DIV_ROUND_UP(distance, PAGE_SIZE) + 1 :
>>  			LZ4_MAX_DISTANCE_PAGES;
>> -	return erofs_pcpubuf_growsize(sbi->lz4.max_pclusterblks);
>> +	return erofs_gbuf_growsize(sbi->lz4.max_pclusterblks);
>>  }
>>
>>  /*
>> @@ -159,7 +159,7 @@ static void *z_erofs_lz4_handle_overlap(struct z_erofs_lz4_decompress_ctx *ctx,
>>  docopy:
>>  	/* Or copy compressed data which can be overlapped to per-CPU buffer */
>>  	in = rq->in;
>> -	src = erofs_get_pcpubuf(ctx->inpages);
>> +	src = erofs_get_gbuf(ctx->inpages);
>>  	if (!src) {
>>  		DBG_BUGON(1);
>>  		kunmap_local(inpage);
>> @@ -260,7 +260,7 @@ static int z_erofs_lz4_decompress_mem(struct z_erofs_lz4_decompress_ctx *ctx,
>>  	} else if (maptype == 1) {
>>  		vm_unmap_ram(src, ctx->inpages);
>>  	} else if (maptype == 2) {
>> -		erofs_put_pcpubuf(src);
>> +		erofs_put_gbuf(src);
>>  	} else if (maptype != 3) {
>>  		DBG_BUGON(1);
>>  		return -EFAULT;
>> diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
>> index b0409badb017..320d3d0d3526 100644
>> --- a/fs/erofs/internal.h
>> +++ b/fs/erofs/internal.h
>> @@ -471,11 +471,11 @@ int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi,
>>  				       struct erofs_workgroup *egrp);
>>  int z_erofs_map_blocks_iter(struct inode *inode, struct erofs_map_blocks *map,
>>  			    int flags);
>> -void *erofs_get_pcpubuf(unsigned int requiredpages);
>> -void erofs_put_pcpubuf(void *ptr);
>> -int erofs_pcpubuf_growsize(unsigned int nrpages);
>> -void __init erofs_pcpubuf_init(void);
>> -void erofs_pcpubuf_exit(void);
>> +void *erofs_get_gbuf(unsigned int requiredpages);
>> +void erofs_put_gbuf(void *ptr);
>> +int erofs_gbuf_growsize(unsigned int nrpages);
>> +void __init erofs_gbuf_init(void);
>> +void erofs_gbuf_exit(void);
>>  int erofs_init_managed_cache(struct super_block *sb);
>>  int z_erofs_parse_cfgs(struct super_block *sb, struct erofs_super_block *dsb);
>>  #else
>> @@ -485,8 +485,8 @@ static inline int erofs_init_shrinker(void) { return 0; }
>>  static inline void erofs_exit_shrinker(void) {}
>>  static inline int z_erofs_init_zip_subsystem(void) { return 0; }
>>  static inline void z_erofs_exit_zip_subsystem(void) {}
>> -static inline void erofs_pcpubuf_init(void) {}
>> -static inline void erofs_pcpubuf_exit(void) {}
>> +static inline void erofs_gbuf_init(void) {}
>> +static inline void erofs_gbuf_exit(void) {}
>>  static inline int erofs_init_managed_cache(struct super_block *sb) { return 0; }
>>  #endif /* !CONFIG_EROFS_FS_ZIP */
>>
>> diff --git a/fs/erofs/pcpubuf.c b/fs/erofs/pcpubuf.c
>> deleted file mode 100644
>> index c7a4b1d77069..000000000000
>> --- a/fs/erofs/pcpubuf.c
>> +++ /dev/null
>> @@ -1,148 +0,0 @@
>> -// SPDX-License-Identifier: GPL-2.0-only
>> -/*
>> - * Copyright (C) Gao Xiang <xi...@kernel.org>
>> - *
>> - * For low-latency decompression algorithms (e.g. lz4), reserve consecutive
>> - * per-CPU virtual memory (in pages) in advance to store such inplace I/O
>> - * data if inplace decompression is failed (due to unmet inplace margin for
>> - * example).
>> - */
>> -#include "internal.h"
>> -
>> -struct erofs_pcpubuf {
>> -	raw_spinlock_t lock;
>> -	void *ptr;
>> -	struct page **pages;
>> -	unsigned int nrpages;
>> -};
>> -
>> -static DEFINE_PER_CPU(struct erofs_pcpubuf, erofs_pcb);
>> -
>> -void *erofs_get_pcpubuf(unsigned int requiredpages)
>> -	__acquires(pcb->lock)
>> -{
>> -	struct erofs_pcpubuf *pcb = &get_cpu_var(erofs_pcb);
>> -
>> -	raw_spin_lock(&pcb->lock);
>> -	/* check if the per-CPU buffer is too small */
>> -	if (requiredpages > pcb->nrpages) {
>> -		raw_spin_unlock(&pcb->lock);
>> -		put_cpu_var(erofs_pcb);
>> -		/* (for sparse checker) pretend pcb->lock is still taken */
>> -		__acquire(pcb->lock);
>> -		return NULL;
>> -	}
>> -	return pcb->ptr;
>> -}
>> -
>> -void erofs_put_pcpubuf(void *ptr) __releases(pcb->lock)
>> -{
>> -	struct erofs_pcpubuf *pcb = &per_cpu(erofs_pcb, smp_processor_id());
>> -
>> -	DBG_BUGON(pcb->ptr != ptr);
>> -	raw_spin_unlock(&pcb->lock);
>> -	put_cpu_var(erofs_pcb);
>> -}
>> -
>> -/* the next step: support per-CPU page buffers hotplug */
>> -int erofs_pcpubuf_growsize(unsigned int nrpages)
>> -{
>> -	static DEFINE_MUTEX(pcb_resize_mutex);
>> -	static unsigned int pcb_nrpages;
>> -	struct page *pagepool = NULL;
>> -	int delta, cpu, ret, i;
>> -
>> -	mutex_lock(&pcb_resize_mutex);
>> -	delta = nrpages - pcb_nrpages;
>> -	ret = 0;
>> -	/* avoid shrinking pcpubuf, since no idea how many fses rely on */
>> -	if (delta <= 0)
>> -		goto out;
>> -
>> -	for_each_possible_cpu(cpu) {
>> -		struct erofs_pcpubuf *pcb = &per_cpu(erofs_pcb, cpu);
>> -		struct page **pages, **oldpages;
>> -		void *ptr, *old_ptr;
>> -
>> -		pages = kmalloc_array(nrpages, sizeof(*pages), GFP_KERNEL);
>> -		if (!pages) {
>> -			ret = -ENOMEM;
>> -			break;
>> -		}
>> -
>> -		for (i = 0; i < nrpages; ++i) {
>> -			pages[i] = erofs_allocpage(&pagepool, GFP_KERNEL);
>> -			if (!pages[i]) {
>> -				ret = -ENOMEM;
>> -				oldpages = pages;
>> -				goto free_pagearray;
>> -			}
>> -		}
>> -		ptr = vmap(pages, nrpages, VM_MAP, PAGE_KERNEL);
>> -		if (!ptr) {
>> -			ret = -ENOMEM;
>> -			oldpages = pages;
>> -			goto free_pagearray;
>> -		}
>> -		raw_spin_lock(&pcb->lock);
>> -		old_ptr = pcb->ptr;
>> -		pcb->ptr = ptr;
>> -		oldpages = pcb->pages;
>> -		pcb->pages = pages;
>> -		i = pcb->nrpages;
>> -		pcb->nrpages = nrpages;
>> -		raw_spin_unlock(&pcb->lock);
>> -
>> -		if (!oldpages) {
>> -			DBG_BUGON(old_ptr);
>> -			continue;
>> -		}
>> -
>> -		if (old_ptr)
>> -			vunmap(old_ptr);
>> -free_pagearray:
>> -		while (i)
>> -			erofs_pagepool_add(&pagepool, oldpages[--i]);
>> -		kfree(oldpages);
>> -		if (ret)
>> -			break;
>> -	}
>> -	pcb_nrpages = nrpages;
>> -	erofs_release_pages(&pagepool);
>> -out:
>> -	mutex_unlock(&pcb_resize_mutex);
>> -	return ret;
>> -}
>> -
>> -void __init erofs_pcpubuf_init(void)
>> -{
>> -	int cpu;
>> -
>> -	for_each_possible_cpu(cpu) {
>> -		struct erofs_pcpubuf *pcb = &per_cpu(erofs_pcb, cpu);
>> -
>> -		raw_spin_lock_init(&pcb->lock);
>> -	}
>> -}
>> -
>> -void erofs_pcpubuf_exit(void)
>> -{
>> -	int cpu, i;
>> -
>> -	for_each_possible_cpu(cpu) {
>> -		struct erofs_pcpubuf *pcb = &per_cpu(erofs_pcb, cpu);
>> -
>> -		if (pcb->ptr) {
>> -			vunmap(pcb->ptr);
>> -			pcb->ptr = NULL;
>> -		}
>> -		if (!pcb->pages)
>> -			continue;
>> -
>> -		for (i = 0; i < pcb->nrpages; ++i)
>> -			if (pcb->pages[i])
>> -				put_page(pcb->pages[i]);
>> -		kfree(pcb->pages);
>> -		pcb->pages = NULL;
>> -	}
>> -}
>> diff --git a/fs/erofs/super.c b/fs/erofs/super.c
>> index 5f60f163bd56..5161923c33fc 100644
>> --- a/fs/erofs/super.c
>> +++ b/fs/erofs/super.c
>> @@ -902,7 +902,7 @@ static int __init erofs_module_init(void)
>>  	if (err)
>>  		goto deflate_err;
>>
>> -	erofs_pcpubuf_init();
>> +	erofs_gbuf_init();
>>  	err = z_erofs_init_zip_subsystem();
>>  	if (err)
>>  		goto zip_err;
>> @@ -945,7 +945,7 @@ static void __exit erofs_module_exit(void)
>>  	z_erofs_lzma_exit();
>>  	erofs_exit_shrinker();
>>  	kmem_cache_destroy(erofs_inode_cachep);
>> -	erofs_pcpubuf_exit();
>> +	erofs_gbuf_exit();
>>  }
>>
>>  static int erofs_statfs(struct dentry *dentry, struct kstatfs *buf)
>> diff --git a/fs/erofs/utils.c b/fs/erofs/utils.c
>> index e146d09151af..7bdff1c7ce19 100644
>> --- a/fs/erofs/utils.c
>> +++ b/fs/erofs/utils.c
>> @@ -284,4 +284,159 @@ void erofs_exit_shrinker(void)
>>  {
>>  	shrinker_free(erofs_shrinker_info);
>>  }
>> +
>> +struct erofs_gbuf {
> struct z_erofs_gbuf {
>
>> +	spinlock_t lock;
>> +	void *ptr;
>> +	struct page **pages;
>> +	unsigned int nrpages, ref;
>> +	struct list_head list;
>> +};
>> +
>> +struct erofs_gbufpool {
>> +	struct erofs_gbuf *gbuf_array;
>> +	unsigned int gbuf_num, gbuf_nrpages;
>> +};
> could we avoid this structure? I think it's only used once:
>
> struct z_erofs_gbuf *z_erofs_gbufpool;
> unsigned int z_erofs_gbuf_count, z_erofs_gbuf_nrpages;
>
> module_param_named(global_buffers, z_erofs_gbuf_count, uint, 0444);
>
>> +
>> +struct erofs_gbufpool z_erofs_gbufpool;
>> +
>> +module_param_named(pcluster_buf_num, z_erofs_gbufpool.gbuf_num, uint, 0444);
>> +
>> +static inline unsigned int erofs_cur_gbuf(void)
> inline here is unnecessary, and
>
> static unsigned int z_erofs_gbuf_id(void)
> {
>
>> +{
>> +	return smp_processor_id() % z_erofs_gbufpool.gbuf_num;
>> +}
>> +
>> +void *erofs_get_gbuf(unsigned int requiredpages)
> z_erofs_get_gbuf
>
>> +	__acquires(gbuf->lock)
>> +{
>> +	struct erofs_gbuf *gbuf;
>> +
>> +	gbuf = &z_erofs_gbufpool.gbuf_array[erofs_cur_gbuf()];
>> +	spin_lock(&gbuf->lock);
>> +	/* check if the buffer is too small */
>> +	if (requiredpages > gbuf->nrpages) {
>> +		spin_unlock(&gbuf->lock);
>> +		/* (for sparse checker) pretend gbuf->lock is still taken */
>> +		__acquire(gbuf->lock);
>> +		return NULL;
>> +	}
>> +	return gbuf->ptr;
>> +}
>> +
>> +void erofs_put_gbuf(void *ptr) __releases(gbuf->lock)
> z_erofs_put_gbuf
>
>> +{
>> +	struct erofs_gbuf *gbuf;
>> +
>> +	gbuf = &z_erofs_gbufpool.gbuf_array[erofs_cur_gbuf()];
>> +	DBG_BUGON(gbuf->ptr != ptr);
>> +	spin_unlock(&gbuf->lock);
>> +}
>> +
>> +int erofs_gbuf_growsize(unsigned int nrpages)
> z_erofs_gbuf_growsize
>
>> +{
>> +	static DEFINE_MUTEX(gbuf_resize_mutex);
>> +	struct page *pagepool = NULL;
> Since we just grow global buffers, I think we could just copy
> the old array and fill the rest? so that no pagepool is needed
> and we don't touch `struct page` anymore so `folio` is not
> needed.
>
> But it can be done with a separate patch.
>
>> +	int delta, ret, i, j;
>> +
>> +	if (!z_erofs_gbufpool.gbuf_array)
>> +		return -ENOMEM;
>> +
>> +	mutex_lock(&gbuf_resize_mutex);
>> +	delta = nrpages - z_erofs_gbufpool.gbuf_nrpages;
>> +	ret = 0;
> ret = -EINVAL;
>
>> +	/* avoid shrinking pcl_buf, since no idea how many fses rely on */
> /* avoid shrinking gbufs ... */
>
>> +	if (delta <= 0)
>> +		goto out;
>> +
>> +	for (i = 0; i < z_erofs_gbufpool.gbuf_num; ++i) {
>> +		struct erofs_gbuf *gbuf = &z_erofs_gbufpool.gbuf_array[i];
>> +		struct page **pages, **tmp_pages;
>> +		void *ptr, *old_ptr = NULL;
>> +
>> +		ret = -ENOMEM;
>> +		tmp_pages = kmalloc_array(nrpages, sizeof(*tmp_pages),
>> +					  GFP_KERNEL);
>> +		if (!tmp_pages)
>> +			break;
>> +		for (j = 0; j < nrpages; ++j) {
>> +			tmp_pages[j] = erofs_allocpage(&pagepool, GFP_KERNEL);
>> +			if (!tmp_pages[j])
>> +				goto free_pagearray;
>> +		}
>> +		ptr = vmap(tmp_pages, nrpages, VM_MAP, PAGE_KERNEL);
>> +		if (!ptr)
>> +			goto free_pagearray;
>> +
>> +		pages = tmp_pages;
>> +		spin_lock(&gbuf->lock);
>> +		old_ptr = gbuf->ptr;
>> +		gbuf->ptr = ptr;
>> +		tmp_pages = gbuf->pages;
>> +		gbuf->pages = pages;
>> +		j = gbuf->nrpages;
>> +		gbuf->nrpages = nrpages;
>> +		spin_unlock(&gbuf->lock);
>> +		ret = 0;
>> +		if (!tmp_pages) {
>> +			DBG_BUGON(old_ptr);
>> +			continue;
>> +		}
>> +
>> +		if (old_ptr)
>> +			vunmap(old_ptr);
>> +free_pagearray:
>> +		while (j)
>> +			erofs_pagepool_add(&pagepool, tmp_pages[--j]);
>> +		kfree(tmp_pages);
>> +		if (ret)
>> +			break;
>> +	}
>> +	z_erofs_gbufpool.gbuf_nrpages = nrpages;
>> +	erofs_release_pages(&pagepool);
>> +out:
>> +	mutex_unlock(&gbuf_resize_mutex);
>> +	return ret;
>> +}
>> +
>> +void __init erofs_gbuf_init(void)
>> +{
>> +	if (!z_erofs_gbufpool.gbuf_num)
>> +		z_erofs_gbufpool.gbuf_num = num_possible_cpus();
>> +	else if (z_erofs_gbufpool.gbuf_num > num_possible_cpus())
>> +		z_erofs_gbufpool.gbuf_num = num_possible_cpus();
> unsigned int i = num_possible_cpus();
>
> if (!z_erofs_gbuf_count)
> 	z_erofs_gbuf_count = i;
> else
> 	z_erofs_gbuf_count = min(z_erofs_gbuf_count, i);
>
>> +
>> +	z_erofs_gbufpool.gbuf_array = kmalloc_array(z_erofs_gbufpool.gbuf_num,
>> +			sizeof(*z_erofs_gbufpool.gbuf_array),
>> +			GFP_KERNEL | __GFP_ZERO);
>> +	if (!z_erofs_gbufpool.gbuf_array) {
>> +		erofs_err(NULL, "failed to alloc page for erofs gbuf_array\n");
>
> erofs_err(NULL, "failed to allocate global buffer array");
>
>> +		return;
>> +	}
>> +
>> +	for (int i = 0; i < z_erofs_gbufpool.gbuf_num; ++i)
> please don't use `for (int i..`.
>
> so for (i = 0; i < z_erofs_gbufpool.gbuf_num; ++i)
>
> Thanks,
> Gao Xiang
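
Regarding the growsize simplification above (copying the old array and only
filling the rest), I will follow up with a separate patch as you suggested.
If I understand the idea correctly, the per-buffer part would become
something like the untested sketch below, so the pagepool goes away and only
the delta is allocated; alloc_page()/__free_page() here are just placeholders
for whatever allocation helper ends up being used:

	tmp_pages = kcalloc(nrpages, sizeof(*tmp_pages), GFP_KERNEL);
	if (!tmp_pages)
		break;
	/* carry over the pages already backing this gbuf */
	for (j = 0; j < gbuf->nrpages; ++j)
		tmp_pages[j] = gbuf->pages[j];
	/* allocate only the delta of new pages */
	for (; j < nrpages; ++j) {
		tmp_pages[j] = alloc_page(GFP_KERNEL);
		if (!tmp_pages[j]) {
			/* on failure, free only the newly allocated pages */
			while (j > gbuf->nrpages)
				__free_page(tmp_pages[--j]);
			kfree(tmp_pages);
			goto out;	/* ret is still -ENOMEM here */
		}
	}
	ptr = vmap(tmp_pages, nrpages, VM_MAP, PAGE_KERNEL);

The old mapping would still need vunmap() after the switch, but the
underlying pages themselves would stay in place.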