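Hook the erofs read paths (->read_folio and ->readahead, for both
uncompressed and compressed data) into the page cache sharing feature,
and wire up its per-inode and per-mount setup and teardown.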

The table below shows the memory usage after reading all files in two
different minor versions of each container image:

+-------------------+------------------+-------------+---------------+
|       Image       | Page Cache Share | Memory (MB) |    Memory     |
|                   |                  |             | Reduction (%) |
+-------------------+------------------+-------------+---------------+
|                   |        No        |     241     |       -       |
|       redis       +------------------+-------------+---------------+
|   7.2.4 & 7.2.5   |        Yes       |     163     |      33%      |
+-------------------+------------------+-------------+---------------+
|                   |        No        |     872     |       -       |
|      postgres     +------------------+-------------+---------------+
|    16.1 & 16.2    |        Yes       |     630     |      28%      |
+-------------------+------------------+-------------+---------------+
|                   |        No        |     2771    |       -       |
|     tensorflow    +------------------+-------------+---------------+
|  1.11.0 & 2.11.1  |        Yes       |     2340    |      16%      |
+-------------------+------------------+-------------+---------------+
|                   |        No        |     926     |       -       |
|       mysql       +------------------+-------------+---------------+
|  8.0.11 & 8.0.12  |        Yes       |     735     |      21%      |
+-------------------+------------------+-------------+---------------+
|                   |        No        |     390     |       -       |
|       nginx       +------------------+-------------+---------------+
|   7.2.4 & 7.2.5   |        Yes       |     219     |      44%      |
+-------------------+------------------+-------------+---------------+
|                   |        No        |     924     |       -       |
|       tomcat      +------------------+-------------+---------------+
| 10.1.25 & 10.1.26 |        Yes       |     474     |      49%      |
+-------------------+------------------+-------------+---------------+

Additionally, the table below shows the runtime memory usage of the
same containers:

+-------------------+------------------+-------------+---------------+
|       Image       | Page Cache Share | Memory (MB) |    Memory     |
|                   |                  |             | Reduction (%) |
+-------------------+------------------+-------------+---------------+
|                   |        No        |      35     |       -       |
|       redis       +------------------+-------------+---------------+
|   7.2.4 & 7.2.5   |        Yes       |      28     |      20%      |
+-------------------+------------------+-------------+---------------+
|                   |        No        |     149     |       -       |
|      postgres     +------------------+-------------+---------------+
|    16.1 & 16.2    |        Yes       |      95     |      37%      |
+-------------------+------------------+-------------+---------------+
|                   |        No        |     1028    |       -       |
|     tensorflow    +------------------+-------------+---------------+
|  1.11.0 & 2.11.1  |        Yes       |     930     |      10%      |
+-------------------+------------------+-------------+---------------+
|                   |        No        |     155     |       -       |
|       mysql       +------------------+-------------+---------------+
|  8.0.11 & 8.0.12  |        Yes       |     132     |      15%      |
+-------------------+------------------+-------------+---------------+
|                   |        No        |      25     |       -       |
|       nginx       +------------------+-------------+---------------+
|   7.2.4 & 7.2.5   |        Yes       |      20     |      20%      |
+-------------------+------------------+-------------+---------------+
|                   |        No        |     186     |       -       |
|       tomcat      +------------------+-------------+---------------+
| 10.1.25 & 10.1.26 |        Yes       |      98     |      48%      |
+-------------------+------------------+-------------+---------------+

Signed-off-by: Hongzhen Luo <hongz...@linux.alibaba.com>
---
 fs/erofs/data.c            | 14 +++++++--
 fs/erofs/inode.c           |  5 ++-
 fs/erofs/pagecache_share.c | 63 ++++++++++++++++++++++++++++++++++++++
 fs/erofs/pagecache_share.h | 11 +++++++
 fs/erofs/super.c           |  7 +++++
 fs/erofs/zdata.c           |  9 ++++--
 6 files changed, 104 insertions(+), 5 deletions(-)

diff --git a/fs/erofs/data.c b/fs/erofs/data.c
index 0cd6b5c4df98..fb08acbeaab6 100644
--- a/fs/erofs/data.c
+++ b/fs/erofs/data.c
@@ -5,6 +5,7 @@
  * Copyright (C) 2021, Alibaba Cloud
  */
 #include "internal.h"
+#include "pagecache_share.h"
 #include <linux/sched/mm.h>
 #include <trace/events/erofs.h>
 
@@ -370,12 +371,21 @@ int erofs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
  */
 static int erofs_read_folio(struct file *file, struct folio *folio)
 {
-       return iomap_read_folio(folio, &erofs_iomap_ops);
+       int ret, pcshr;
+
+       pcshr = erofs_pcshr_read_begin(file, folio);
+       ret = iomap_read_folio(folio, &erofs_iomap_ops);
+       erofs_pcshr_read_end(file, folio, pcshr);
+       return ret;
 }
 
 static void erofs_readahead(struct readahead_control *rac)
 {
-       return iomap_readahead(rac, &erofs_iomap_ops);
+       int pcshr;
+
+       pcshr = erofs_pcshr_readahead_begin(rac);
+       iomap_readahead(rac, &erofs_iomap_ops);
+       erofs_pcshr_readahead_end(rac, pcshr);
 }
 
 static sector_t erofs_bmap(struct address_space *mapping, sector_t block)
diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c
index d4b89407822a..0b070f4b46b8 100644
--- a/fs/erofs/inode.c
+++ b/fs/erofs/inode.c
@@ -5,6 +5,7 @@
  * Copyright (C) 2021, Alibaba Cloud
  */
 #include "xattr.h"
+#include "pagecache_share.h"
 #include <trace/events/erofs.h>
 
 static int erofs_fill_symlink(struct inode *inode, void *kaddr,
@@ -212,7 +213,9 @@ static int erofs_fill_inode(struct inode *inode)
        switch (inode->i_mode & S_IFMT) {
        case S_IFREG:
                inode->i_op = &erofs_generic_iops;
-               if (erofs_inode_is_data_compressed(vi->datalayout))
+               if (erofs_pcshr_fill_inode(inode) == 0)
+                       inode->i_fop = &erofs_pcshr_fops;
+               else if (erofs_inode_is_data_compressed(vi->datalayout))
                        inode->i_fop = &generic_ro_fops;
                else
                        inode->i_fop = &erofs_file_fops;
diff --git a/fs/erofs/pagecache_share.c b/fs/erofs/pagecache_share.c
index 703fd17c002c..22172b5e21c7 100644
--- a/fs/erofs/pagecache_share.c
+++ b/fs/erofs/pagecache_share.c
@@ -22,6 +22,7 @@ struct erofs_pcshr_counter {
 
 struct erofs_pcshr_private {
        char fprt[PCSHR_FPRT_MAXLEN];
+       struct mutex mutex;
 };
 
 static struct erofs_pcshr_counter mnt_counter = {
@@ -84,6 +85,7 @@ static int erofs_fprt_set(struct inode *inode, void *data)
        if (!ano_private)
                return -ENOMEM;
        memcpy(ano_private, data, sizeof(size_t) + *(size_t *)data);
+       mutex_init(&ano_private->mutex);
        inode->i_private = ano_private;
        return 0;
 }
@@ -226,3 +228,64 @@ const struct file_operations erofs_pcshr_fops = {
        .get_unmapped_area = thp_get_unmapped_area,
        .splice_read    = filemap_splice_read,
 };
+
+int erofs_pcshr_read_begin(struct file *file, struct folio *folio)
+{
+       struct erofs_inode *vi;
+       struct erofs_pcshr_private *ano_private;
+
+       if (!(file && file->private_data))
+               return 0;
+
+       vi = file->private_data;
+       if (vi->ano_inode != file_inode(file))
+               return 0;
+
+       ano_private = vi->ano_inode->i_private;
+       mutex_lock(&ano_private->mutex);
+       folio->mapping->host = &vi->vfs_inode;
+       return 1;
+}
+
+void erofs_pcshr_read_end(struct file *file, struct folio *folio, int pcshr)
+{
+       struct erofs_pcshr_private *ano_private;
+
+       if (pcshr == 0)
+               return;
+
+       ano_private = file_inode(file)->i_private;
+       folio->mapping->host = file_inode(file);
+       mutex_unlock(&ano_private->mutex);
+}
+
+int erofs_pcshr_readahead_begin(struct readahead_control *rac)
+{
+       struct erofs_inode *vi;
+       struct file *file = rac->file;
+       struct erofs_pcshr_private *ano_private;
+
+       if (!(file && file->private_data))
+               return 0;
+
+       vi = file->private_data;
+       if (vi->ano_inode != file_inode(file))
+               return 0;
+
+       ano_private = file_inode(file)->i_private;
+       mutex_lock(&ano_private->mutex);
+       rac->mapping->host = &vi->vfs_inode;
+       return 1;
+}
+
+void erofs_pcshr_readahead_end(struct readahead_control *rac, int pcshr)
+{
+       struct erofs_pcshr_private *ano_private;
+
+       if (pcshr == 0)
+               return;
+
+       ano_private = file_inode(rac->file)->i_private;
+       rac->mapping->host = file_inode(rac->file);
+       mutex_unlock(&ano_private->mutex);
+}
diff --git a/fs/erofs/pagecache_share.h b/fs/erofs/pagecache_share.h
index f3889d6889e5..abda2a60278b 100644
--- a/fs/erofs/pagecache_share.h
+++ b/fs/erofs/pagecache_share.h
@@ -14,6 +14,12 @@ void erofs_pcshr_free_mnt(void);
 int erofs_pcshr_fill_inode(struct inode *inode);
 void erofs_pcshr_free_inode(struct inode *inode);
 
+/* switch between the anonymous inode and the real inode */
+int erofs_pcshr_read_begin(struct file *file, struct folio *folio);
+void erofs_pcshr_read_end(struct file *file, struct folio *folio, int pcshr);
+int erofs_pcshr_readahead_begin(struct readahead_control *rac);
+void erofs_pcshr_readahead_end(struct readahead_control *rac, int pcshr);
+
 #else
 
 static inline int erofs_pcshr_init_mnt(void) { return 0; }
@@ -21,6 +27,11 @@ static inline void erofs_pcshr_free_mnt(void) {}
 static inline int erofs_pcshr_fill_inode(struct inode *inode) { return -1; }
 static inline void erofs_pcshr_free_inode(struct inode *inode) {}
 
+static inline int erofs_pcshr_read_begin(struct file *file, struct folio *folio) { return 0; }
+static inline void erofs_pcshr_read_end(struct file *file, struct folio *folio, int pcshr) {}
+static inline int erofs_pcshr_readahead_begin(struct readahead_control *rac) { return 0; }
+static inline void erofs_pcshr_readahead_end(struct readahead_control *rac, int pcshr) {}
+
 #endif // CONFIG_EROFS_FS_PAGE_CACHE_SHARE
 
 #endif
diff --git a/fs/erofs/super.c b/fs/erofs/super.c
index b4ce07dc931c..1b690eb6c1f1 100644
--- a/fs/erofs/super.c
+++ b/fs/erofs/super.c
@@ -13,6 +13,7 @@
 #include <linux/backing-dev.h>
 #include <linux/pseudo_fs.h>
 #include "xattr.h"
+#include "pagecache_share.h"
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/erofs.h>
@@ -81,6 +82,7 @@ static void erofs_free_inode(struct inode *inode)
 {
        struct erofs_inode *vi = EROFS_I(inode);
 
+       erofs_pcshr_free_inode(inode);
        if (inode->i_op == &erofs_fast_symlink_iops)
                kfree(inode->i_link);
        kfree(vi->xattr_shared_xattrs);
@@ -683,6 +685,10 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc)
        if (err)
                return err;
 
+       err = erofs_pcshr_init_mnt();
+       if (err)
+               return err;
+
        erofs_info(sb, "mounted with root inode @ nid %llu.", sbi->root_nid);
        return 0;
 }
@@ -818,6 +824,7 @@ static void erofs_kill_sb(struct super_block *sb)
                kill_anon_super(sb);
        else
                kill_block_super(sb);
+       erofs_pcshr_free_mnt();
        fs_put_dax(sbi->dif0.dax_dev, NULL);
        erofs_fscache_unregister_fs(sb);
        erofs_sb_free(sbi);
diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c
index 19ef4ff2a134..fc2ed01eaabe 100644
--- a/fs/erofs/zdata.c
+++ b/fs/erofs/zdata.c
@@ -5,6 +5,7 @@
  * Copyright (C) 2022 Alibaba Cloud
  */
 #include "compress.h"
+#include "pagecache_share.h"
 #include <linux/psi.h>
 #include <linux/cpuhotplug.h>
 #include <trace/events/erofs.h>
@@ -1891,9 +1892,10 @@ static int z_erofs_read_folio(struct file *file, struct folio *folio)
 {
        struct inode *const inode = folio->mapping->host;
        struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode);
-       int err;
+       int err, pcshr;
 
        trace_erofs_read_folio(folio, false);
+       pcshr = erofs_pcshr_read_begin(file, folio);
        f.headoffset = (erofs_off_t)folio->index << PAGE_SHIFT;
 
        z_erofs_pcluster_readmore(&f, NULL, true);
@@ -1909,6 +1911,7 @@ static int z_erofs_read_folio(struct file *file, struct folio *folio)
 
        erofs_put_metabuf(&f.map.buf);
        erofs_release_pages(&f.pagepool);
+       erofs_pcshr_read_end(file, folio, pcshr);
        return err;
 }
 
@@ -1918,8 +1921,9 @@ static void z_erofs_readahead(struct readahead_control *rac)
        struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode);
        struct folio *head = NULL, *folio;
        unsigned int nr_folios;
-       int err;
+       int err, pcshr;
 
+       pcshr = erofs_pcshr_readahead_begin(rac);
        f.headoffset = readahead_pos(rac);
 
        z_erofs_pcluster_readmore(&f, rac, true);
@@ -1947,6 +1951,7 @@ static void z_erofs_readahead(struct readahead_control *rac)
        (void)z_erofs_runqueue(&f, nr_folios);
        erofs_put_metabuf(&f.map.buf);
        erofs_release_pages(&f.pagepool);
+       erofs_pcshr_readahead_end(rac, pcshr);
 }
 
 const struct address_space_operations z_erofs_aops = {
-- 
2.43.5

Reply via email to