This patch modifies the relevant functions to apply the page cache sharing feature. The table below shows the memory usage after reading all files in two different minor versions of each container image:
+-------------------+------------------+-------------+---------------+ | Image | Page Cache Share | Memory (MB) | Memory | | | | | Reduction (%) | +-------------------+------------------+-------------+---------------+ | | No | 241 | - | | redis +------------------+-------------+---------------+ | 7.2.4 & 7.2.5 | Yes | 163 | 33% | +-------------------+------------------+-------------+---------------+ | | No | 872 | - | | postgres +------------------+-------------+---------------+ | 16.1 & 16.2 | Yes | 630 | 28% | +-------------------+------------------+-------------+---------------+ | | No | 2771 | - | | tensorflow +------------------+-------------+---------------+ | 1.11.0 & 2.11.1 | Yes | 2340 | 16% | +-------------------+------------------+-------------+---------------+ | | No | 926 | - | | mysql +------------------+-------------+---------------+ | 8.0.11 & 8.0.12 | Yes | 735 | 21% | +-------------------+------------------+-------------+---------------+ | | No | 390 | - | | nginx +------------------+-------------+---------------+ | 7.2.4 & 7.2.5 | Yes | 219 | 44% | +-------------------+------------------+-------------+---------------+ | tomcat | No | 924 | - | | 10.1.25 & 10.1.26 +------------------+-------------+---------------+ | | Yes | 474 | 49% | +-------------------+------------------+-------------+---------------+ Additionally, the table below shows the runtime memory usage of the container: +-------------------+------------------+-------------+---------------+ | Image | Page Cache Share | Memory (MB) | Memory | | | | | Reduction (%) | +-------------------+------------------+-------------+---------------+ | | No | 35 | - | | redis +------------------+-------------+---------------+ | 7.2.4 & 7.2.5 | Yes | 28 | 20% | +-------------------+------------------+-------------+---------------+ | | No | 149 | - | | postgres +------------------+-------------+---------------+ | 16.1 & 16.2 | Yes | 95 | 37% | 
+-------------------+------------------+-------------+---------------+ | | No | 1028 | - | | tensorflow +------------------+-------------+---------------+ | 1.11.0 & 2.11.1 | Yes | 930 | 10% | +-------------------+------------------+-------------+---------------+ | | No | 155 | - | | mysql +------------------+-------------+---------------+ | 8.0.11 & 8.0.12 | Yes | 132 | 15% | +-------------------+------------------+-------------+---------------+ | | No | 25 | - | | nginx +------------------+-------------+---------------+ | 7.2.4 & 7.2.5 | Yes | 20 | 20% | +-------------------+------------------+-------------+---------------+ | tomcat | No | 186 | - | | 10.1.25 & 10.1.26 +------------------+-------------+---------------+ | | Yes | 98 | 48% | +-------------------+------------------+-------------+---------------+ Signed-off-by: Hongzhen Luo <hongz...@linux.alibaba.com> --- fs/erofs/data.c | 14 +++++++-- fs/erofs/inode.c | 5 ++- fs/erofs/pagecache_share.c | 63 ++++++++++++++++++++++++++++++++++++++ fs/erofs/pagecache_share.h | 11 +++++++ fs/erofs/super.c | 7 +++++ fs/erofs/zdata.c | 9 ++++-- 6 files changed, 104 insertions(+), 5 deletions(-) diff --git a/fs/erofs/data.c b/fs/erofs/data.c index 0cd6b5c4df98..fb08acbeaab6 100644 --- a/fs/erofs/data.c +++ b/fs/erofs/data.c @@ -5,6 +5,7 @@ * Copyright (C) 2021, Alibaba Cloud */ #include "internal.h" +#include "pagecache_share.h" #include <linux/sched/mm.h> #include <trace/events/erofs.h> @@ -370,12 +371,21 @@ int erofs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, */ static int erofs_read_folio(struct file *file, struct folio *folio) { - return iomap_read_folio(folio, &erofs_iomap_ops); + int ret, pcshr; + + pcshr = erofs_pcshr_read_begin(file, folio); + ret = iomap_read_folio(folio, &erofs_iomap_ops); + erofs_pcshr_read_end(file, folio, pcshr); + return ret; } static void erofs_readahead(struct readahead_control *rac) { - return iomap_readahead(rac, &erofs_iomap_ops); + int pcshr; + + pcshr = 
erofs_pcshr_readahead_begin(rac); + iomap_readahead(rac, &erofs_iomap_ops); + erofs_pcshr_readahead_end(rac, pcshr); } static sector_t erofs_bmap(struct address_space *mapping, sector_t block) diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c index d4b89407822a..0b070f4b46b8 100644 --- a/fs/erofs/inode.c +++ b/fs/erofs/inode.c @@ -5,6 +5,7 @@ * Copyright (C) 2021, Alibaba Cloud */ #include "xattr.h" +#include "pagecache_share.h" #include <trace/events/erofs.h> static int erofs_fill_symlink(struct inode *inode, void *kaddr, @@ -212,7 +213,9 @@ static int erofs_fill_inode(struct inode *inode) switch (inode->i_mode & S_IFMT) { case S_IFREG: inode->i_op = &erofs_generic_iops; - if (erofs_inode_is_data_compressed(vi->datalayout)) + if (erofs_pcshr_fill_inode(inode) == 0) + inode->i_fop = &erofs_pcshr_fops; + else if (erofs_inode_is_data_compressed(vi->datalayout)) inode->i_fop = &generic_ro_fops; else inode->i_fop = &erofs_file_fops; diff --git a/fs/erofs/pagecache_share.c b/fs/erofs/pagecache_share.c index 703fd17c002c..22172b5e21c7 100644 --- a/fs/erofs/pagecache_share.c +++ b/fs/erofs/pagecache_share.c @@ -22,6 +22,7 @@ struct erofs_pcshr_counter { struct erofs_pcshr_private { char fprt[PCSHR_FPRT_MAXLEN]; + struct mutex mutex; }; static struct erofs_pcshr_counter mnt_counter = { @@ -84,6 +85,7 @@ static int erofs_fprt_set(struct inode *inode, void *data) if (!ano_private) return -ENOMEM; memcpy(ano_private, data, sizeof(size_t) + *(size_t *)data); + mutex_init(&ano_private->mutex); inode->i_private = ano_private; return 0; } @@ -226,3 +228,64 @@ const struct file_operations erofs_pcshr_fops = { .get_unmapped_area = thp_get_unmapped_area, .splice_read = filemap_splice_read, }; + +int erofs_pcshr_read_begin(struct file *file, struct folio *folio) +{ + struct erofs_inode *vi; + struct erofs_pcshr_private *ano_private; + + if (!(file && file->private_data)) + return 0; + + vi = file->private_data; + if (vi->ano_inode != file_inode(file)) + return 0; + + ano_private = 
vi->ano_inode->i_private; + mutex_lock(&ano_private->mutex); + folio->mapping->host = &vi->vfs_inode; + return 1; +} + +void erofs_pcshr_read_end(struct file *file, struct folio *folio, int pcshr) +{ + struct erofs_pcshr_private *ano_private; + + if (pcshr == 0) + return; + + ano_private = file_inode(file)->i_private; + folio->mapping->host = file_inode(file); + mutex_unlock(&ano_private->mutex); +} + +int erofs_pcshr_readahead_begin(struct readahead_control *rac) +{ + struct erofs_inode *vi; + struct file *file = rac->file; + struct erofs_pcshr_private *ano_private; + + if (!(file && file->private_data)) + return 0; + + vi = file->private_data; + if (vi->ano_inode != file_inode(file)) + return 0; + + ano_private = file_inode(file)->i_private; + mutex_lock(&ano_private->mutex); + rac->mapping->host = &vi->vfs_inode; + return 1; +} + +void erofs_pcshr_readahead_end(struct readahead_control *rac, int pcshr) +{ + struct erofs_pcshr_private *ano_private; + + if (pcshr == 0) + return; + + ano_private = file_inode(rac->file)->i_private; + rac->mapping->host = file_inode(rac->file); + mutex_unlock(&ano_private->mutex); +} diff --git a/fs/erofs/pagecache_share.h b/fs/erofs/pagecache_share.h index f3889d6889e5..abda2a60278b 100644 --- a/fs/erofs/pagecache_share.h +++ b/fs/erofs/pagecache_share.h @@ -14,6 +14,12 @@ void erofs_pcshr_free_mnt(void); int erofs_pcshr_fill_inode(struct inode *inode); void erofs_pcshr_free_inode(struct inode *inode); +/* switch between the anonymous inode and the real inode */ +int erofs_pcshr_read_begin(struct file *file, struct folio *folio); +void erofs_pcshr_read_end(struct file *file, struct folio *folio, int pcshr); +int erofs_pcshr_readahead_begin(struct readahead_control *rac); +void erofs_pcshr_readahead_end(struct readahead_control *rac, int pcshr); + #else static inline int erofs_pcshr_init_mnt(void) { return 0; } @@ -21,6 +27,11 @@ static inline void erofs_pcshr_free_mnt(void) {} static inline int erofs_pcshr_fill_inode(struct inode 
*inode) { return -1; } static inline void erofs_pcshr_free_inode(struct inode *inode) {} +static inline int erofs_pcshr_read_begin(struct file *file, struct folio *folio) { return 0; } +static inline void erofs_pcshr_read_end(struct file *file, struct folio *folio, int pcshr) {} +static inline int erofs_pcshr_readahead_begin(struct readahead_control *rac) { return 0; } +static inline void erofs_pcshr_readahead_end(struct readahead_control *rac, int pcshr) {} + #endif // CONFIG_EROFS_FS_PAGE_CACHE_SHARE #endif diff --git a/fs/erofs/super.c b/fs/erofs/super.c index b4ce07dc931c..1b690eb6c1f1 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -13,6 +13,7 @@ #include <linux/backing-dev.h> #include <linux/pseudo_fs.h> #include "xattr.h" +#include "pagecache_share.h" #define CREATE_TRACE_POINTS #include <trace/events/erofs.h> @@ -81,6 +82,7 @@ static void erofs_free_inode(struct inode *inode) { struct erofs_inode *vi = EROFS_I(inode); + erofs_pcshr_free_inode(inode); if (inode->i_op == &erofs_fast_symlink_iops) kfree(inode->i_link); kfree(vi->xattr_shared_xattrs); @@ -683,6 +685,10 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc) if (err) return err; + err = erofs_pcshr_init_mnt(); + if (err) + return err; + erofs_info(sb, "mounted with root inode @ nid %llu.", sbi->root_nid); return 0; } @@ -818,6 +824,7 @@ static void erofs_kill_sb(struct super_block *sb) kill_anon_super(sb); else kill_block_super(sb); + erofs_pcshr_free_mnt(); fs_put_dax(sbi->dif0.dax_dev, NULL); erofs_fscache_unregister_fs(sb); erofs_sb_free(sbi); diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 19ef4ff2a134..fc2ed01eaabe 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -5,6 +5,7 @@ * Copyright (C) 2022 Alibaba Cloud */ #include "compress.h" +#include "pagecache_share.h" #include <linux/psi.h> #include <linux/cpuhotplug.h> #include <trace/events/erofs.h> @@ -1891,9 +1892,10 @@ static int z_erofs_read_folio(struct file *file, struct folio *folio) { 
struct inode *const inode = folio->mapping->host; struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode); - int err; + int err, pcshr; trace_erofs_read_folio(folio, false); + pcshr = erofs_pcshr_read_begin(file, folio); f.headoffset = (erofs_off_t)folio->index << PAGE_SHIFT; z_erofs_pcluster_readmore(&f, NULL, true); @@ -1909,6 +1911,7 @@ static int z_erofs_read_folio(struct file *file, struct folio *folio) erofs_put_metabuf(&f.map.buf); erofs_release_pages(&f.pagepool); + erofs_pcshr_read_end(file, folio, pcshr); return err; } @@ -1918,8 +1921,9 @@ static void z_erofs_readahead(struct readahead_control *rac) struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode); struct folio *head = NULL, *folio; unsigned int nr_folios; - int err; + int err, pcshr; + pcshr = erofs_pcshr_readahead_begin(rac); f.headoffset = readahead_pos(rac); z_erofs_pcluster_readmore(&f, rac, true); @@ -1947,6 +1951,7 @@ static void z_erofs_readahead(struct readahead_control *rac) (void)z_erofs_runqueue(&f, nr_folios); erofs_put_metabuf(&f.map.buf); erofs_release_pages(&f.pagepool); + erofs_pcshr_readahead_end(rac, pcshr); } const struct address_space_operations z_erofs_aops = { -- 2.43.5