[RFC PATCH v5 3/4] erofs: apply the page cache share feature
    Hongzhen Luo 
    hongzhen at linux.alibaba.com
       
    Wed Jan 22 01:48:33 AEDT 2025
    
    
  
On 2025/1/21 19:59, Hongbo Li wrote:
>
>
> On 2025/1/5 23:12, Hongzhen Luo wrote:
>> This modifies relevant functions to apply the page cache
>> share feature.
>>
>> Below is the memory usage for reading all files in two different minor
>> versions of container images:
>>
>> +-------------------+------------------+-------------+---------------+
>> |       Image       | Page Cache Share | Memory (MB) |    Memory     |
>> |                   |                  |             | Reduction (%) |
>> +-------------------+------------------+-------------+---------------+
>> |                   |        No        |     241     |       -       |
>> |       redis       +------------------+-------------+---------------+
>> |   7.2.4 & 7.2.5   |        Yes       |     163     |      33%      |
>> +-------------------+------------------+-------------+---------------+
>> |                   |        No        |     872     |       -       |
>> |      postgres     +------------------+-------------+---------------+
>> |    16.1 & 16.2    |        Yes       |     630     |      28%      |
>> +-------------------+------------------+-------------+---------------+
>> |                   |        No        |     2771    |       -       |
>> |     tensorflow    +------------------+-------------+---------------+
>> |  1.11.0 & 2.11.1  |        Yes       |     2340    |      16%      |
>> +-------------------+------------------+-------------+---------------+
>> |                   |        No        |     926     |       -       |
>> |       mysql       +------------------+-------------+---------------+
>> |  8.0.11 & 8.0.12  |        Yes       |     735     |      21%      |
>> +-------------------+------------------+-------------+---------------+
>> |                   |        No        |     390     |       -       |
>> |       nginx       +------------------+-------------+---------------+
>> |   7.2.4 & 7.2.5   |        Yes       |     219     |      44%      |
>> +-------------------+------------------+-------------+---------------+
>> |       tomcat      |        No        |     924     |       -       |
>> | 10.1.25 & 10.1.26 +------------------+-------------+---------------+
>> |                   |        Yes       |     474     |      49%      |
>> +-------------------+------------------+-------------+---------------+
>>
>> Additionally, the table below shows the runtime memory usage of the
>> container:
>>
>> +-------------------+------------------+-------------+---------------+
>> |       Image       | Page Cache Share | Memory (MB) |    Memory     |
>> |                   |                  |             | Reduction (%) |
>> +-------------------+------------------+-------------+---------------+
>> |                   |        No        |      35     |       -       |
>> |       redis       +------------------+-------------+---------------+
>> |   7.2.4 & 7.2.5   |        Yes       |      28     |      20%      |
>> +-------------------+------------------+-------------+---------------+
>> |                   |        No        |     149     |       -       |
>> |      postgres     +------------------+-------------+---------------+
>> |    16.1 & 16.2    |        Yes       |      95     |      37%      |
>> +-------------------+------------------+-------------+---------------+
>> |                   |        No        |     1028    |       -       |
>> |     tensorflow    +------------------+-------------+---------------+
>> |  1.11.0 & 2.11.1  |        Yes       |     930     |      10%      |
>> +-------------------+------------------+-------------+---------------+
>> |                   |        No        |     155     |       -       |
>> |       mysql       +------------------+-------------+---------------+
>> |  8.0.11 & 8.0.12  |        Yes       |     132     |      15%      |
>> +-------------------+------------------+-------------+---------------+
>> |                   |        No        |      25     |       -       |
>> |       nginx       +------------------+-------------+---------------+
>> |   7.2.4 & 7.2.5   |        Yes       |      20     |      20%      |
>> +-------------------+------------------+-------------+---------------+
>> |       tomcat      |        No        |     186     |       -       |
>> | 10.1.25 & 10.1.26 +------------------+-------------+---------------+
>> |                   |        Yes       |      98     |      48%      |
>> +-------------------+------------------+-------------+---------------+
>>
>> Signed-off-by: Hongzhen Luo <hongzhen at linux.alibaba.com>
>> ---
>>   fs/erofs/data.c            | 14 +++++++--
>>   fs/erofs/inode.c           |  5 ++-
>>   fs/erofs/pagecache_share.c | 63 ++++++++++++++++++++++++++++++++++++++
>>   fs/erofs/pagecache_share.h | 11 +++++++
>>   fs/erofs/super.c           |  7 +++++
>>   fs/erofs/zdata.c           |  9 ++++--
>>   6 files changed, 104 insertions(+), 5 deletions(-)
>>
>> diff --git a/fs/erofs/data.c b/fs/erofs/data.c
>> index 0cd6b5c4df98..fb08acbeaab6 100644
>> --- a/fs/erofs/data.c
>> +++ b/fs/erofs/data.c
>> @@ -5,6 +5,7 @@
>>    * Copyright (C) 2021, Alibaba Cloud
>>    */
>>   #include "internal.h"
>> +#include "pagecache_share.h"
>>   #include <linux/sched/mm.h>
>>   #include <trace/events/erofs.h>
>>   @@ -370,12 +371,21 @@ int erofs_fiemap(struct inode *inode, struct 
>> fiemap_extent_info *fieinfo,
>>    */
>>   static int erofs_read_folio(struct file *file, struct folio *folio)
>>   {
>> -    return iomap_read_folio(folio, &erofs_iomap_ops);
>> +    int ret, pcshr;
>> +
>> +    pcshr = erofs_pcshr_read_begin(file, folio);
>> +    ret = iomap_read_folio(folio, &erofs_iomap_ops);
>> +    erofs_pcshr_read_end(file, folio, pcshr);
>> +    return ret;
>>   }
>>     static void erofs_readahead(struct readahead_control *rac)
>>   {
>> -    return iomap_readahead(rac, &erofs_iomap_ops);
>> +    int pcshr;
>> +
>> +    pcshr = erofs_pcshr_readahead_begin(rac);
>> +    iomap_readahead(rac, &erofs_iomap_ops);
>> +    erofs_pcshr_readahead_end(rac, pcshr);
>>   }
>>     static sector_t erofs_bmap(struct address_space *mapping, 
>> sector_t block)
>> diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c
>> index d4b89407822a..0b070f4b46b8 100644
>> --- a/fs/erofs/inode.c
>> +++ b/fs/erofs/inode.c
>> @@ -5,6 +5,7 @@
>>    * Copyright (C) 2021, Alibaba Cloud
>>    */
>>   #include "xattr.h"
>> +#include "pagecache_share.h"
>>   #include <trace/events/erofs.h>
>>     static int erofs_fill_symlink(struct inode *inode, void *kaddr,
>> @@ -212,7 +213,9 @@ static int erofs_fill_inode(struct inode *inode)
>>       switch (inode->i_mode & S_IFMT) {
>>       case S_IFREG:
>>           inode->i_op = &erofs_generic_iops;
>> -        if (erofs_inode_is_data_compressed(vi->datalayout))
>> +        if (erofs_pcshr_fill_inode(inode) == 0)
>> +            inode->i_fop = &erofs_pcshr_fops;
>> +        else if (erofs_inode_is_data_compressed(vi->datalayout))
>>               inode->i_fop = &generic_ro_fops;
>>           else
>>               inode->i_fop = &erofs_file_fops;
>> diff --git a/fs/erofs/pagecache_share.c b/fs/erofs/pagecache_share.c
>> index 703fd17c002c..22172b5e21c7 100644
>> --- a/fs/erofs/pagecache_share.c
>> +++ b/fs/erofs/pagecache_share.c
>> @@ -22,6 +22,7 @@ struct erofs_pcshr_counter {
>>     struct erofs_pcshr_private {
>>       char fprt[PCSHR_FPRT_MAXLEN];
>> +    struct mutex mutex;
>>   };
>>     static struct erofs_pcshr_counter mnt_counter = {
>> @@ -84,6 +85,7 @@ static int erofs_fprt_set(struct inode *inode, void 
>> *data)
>>       if (!ano_private)
>>           return -ENOMEM;
>>       memcpy(ano_private, data, sizeof(size_t) + *(size_t *)data);
>> +    mutex_init(&ano_private->mutex);
>>       inode->i_private = ano_private;
>>       return 0;
>>   }
>> @@ -226,3 +228,64 @@ const struct file_operations erofs_pcshr_fops = {
>>       .get_unmapped_area = thp_get_unmapped_area,
>>       .splice_read    = filemap_splice_read,
>>   };
>> +
>> +int erofs_pcshr_read_begin(struct file *file, struct folio *folio)
>> +{
>> +    struct erofs_inode *vi;
>> +    struct erofs_pcshr_private *ano_private;
>> +
>> +    if (!(file && file->private_data))
>> +        return 0;
>> +
>> +    vi = file->private_data;
>> +    if (vi->ano_inode != file_inode(file))
>> +        return 0;
>> +
>> +    ano_private = vi->ano_inode->i_private;
>> +    mutex_lock(&ano_private->mutex);
> Can we lock at folio granularity? The erofs_pcshr_private mutex may 
> limit read concurrency.
I’m sorry for the delay in responding; I just saw this message. I will
send an improved version of the patch soon. Thanks for this suggestion.
>> +    folio->mapping->host = &vi->vfs_inode;
>> +    return 1;
>> +}
>> +
>> +void erofs_pcshr_read_end(struct file *file, struct folio *folio, 
>> int pcshr)
>> +{
>> +    struct erofs_pcshr_private *ano_private;
>> +
>> +    if (pcshr == 0)
>> +        return;
>> +
>> +    ano_private = file_inode(file)->i_private;
>> +    folio->mapping->host = file_inode(file);
>> +    mutex_unlock(&ano_private->mutex);
>> +}
>> +
>> +int erofs_pcshr_readahead_begin(struct readahead_control *rac)
>> +{
> Maybe the begin/end helpers for read and readahead could share the 
> same implementation, since they perform similar logic.
Okay, indeed! I will send an improved version later.
Best wishes,
Hongzhen Luo
>> +    struct erofs_inode *vi;
>> +    struct file *file = rac->file;
>> +    struct erofs_pcshr_private *ano_private;
>> +
>> +    if (!(file && file->private_data))
>> +        return 0;
>> +
>> +    vi = file->private_data;
>> +    if (vi->ano_inode != file_inode(file))
>> +        return 0;
>> +
>> +    ano_private = file_inode(file)->i_private;
>> +    mutex_lock(&ano_private->mutex);
>> +    rac->mapping->host = &vi->vfs_inode;
>> +    return 1;
>> +}
>> +
>> +void erofs_pcshr_readahead_end(struct readahead_control *rac, int 
>> pcshr)
>> +{
>> +    struct erofs_pcshr_private *ano_private;
>> +
>> +    if (pcshr == 0)
>> +        return;
>> +
>> +    ano_private = file_inode(rac->file)->i_private;
>> +    rac->mapping->host = file_inode(rac->file);
>> +    mutex_unlock(&ano_private->mutex);
>> +}
>> diff --git a/fs/erofs/pagecache_share.h b/fs/erofs/pagecache_share.h
>> index f3889d6889e5..abda2a60278b 100644
>> --- a/fs/erofs/pagecache_share.h
>> +++ b/fs/erofs/pagecache_share.h
>> @@ -14,6 +14,12 @@ void erofs_pcshr_free_mnt(void);
>>   int erofs_pcshr_fill_inode(struct inode *inode);
>>   void erofs_pcshr_free_inode(struct inode *inode);
>>   +/* switch between the anonymous inode and the real inode */
>> +int erofs_pcshr_read_begin(struct file *file, struct folio *folio);
>> +void erofs_pcshr_read_end(struct file *file, struct folio *folio, 
>> int pcshr);
>> +int erofs_pcshr_readahead_begin(struct readahead_control *rac);
>> +void erofs_pcshr_readahead_end(struct readahead_control *rac, int 
>> pcshr);
>> +
>>   #else
>>     static inline int erofs_pcshr_init_mnt(void) { return 0; }
>> @@ -21,6 +27,11 @@ static inline void erofs_pcshr_free_mnt(void) {}
>>   static inline int erofs_pcshr_fill_inode(struct inode *inode) { 
>> return -1; }
>>   static inline void erofs_pcshr_free_inode(struct inode *inode) {}
>>   +static inline int erofs_pcshr_read_begin(struct file *file, struct 
>> folio *folio) { return 0; }
>> +static inline void erofs_pcshr_read_end(struct file *file, struct 
>> folio *folio, int pcshr) {}
>> +static inline int erofs_pcshr_readahead_begin(struct 
>> readahead_control *rac) { return 0; }
>> +static inline void erofs_pcshr_readahead_end(struct 
>> readahead_control *rac, int pcshr) {}
>> +
>>   #endif // CONFIG_EROFS_FS_PAGE_CACHE_SHARE
>>     #endif
>> diff --git a/fs/erofs/super.c b/fs/erofs/super.c
>> index b4ce07dc931c..1b690eb6c1f1 100644
>> --- a/fs/erofs/super.c
>> +++ b/fs/erofs/super.c
>> @@ -13,6 +13,7 @@
>>   #include <linux/backing-dev.h>
>>   #include <linux/pseudo_fs.h>
>>   #include "xattr.h"
>> +#include "pagecache_share.h"
>>     #define CREATE_TRACE_POINTS
>>   #include <trace/events/erofs.h>
>> @@ -81,6 +82,7 @@ static void erofs_free_inode(struct inode *inode)
>>   {
>>       struct erofs_inode *vi = EROFS_I(inode);
>>   +    erofs_pcshr_free_inode(inode);
>>       if (inode->i_op == &erofs_fast_symlink_iops)
>>           kfree(inode->i_link);
>>       kfree(vi->xattr_shared_xattrs);
>> @@ -683,6 +685,10 @@ static int erofs_fc_fill_super(struct 
>> super_block *sb, struct fs_context *fc)
>>       if (err)
>>           return err;
>>   +    err = erofs_pcshr_init_mnt();
>> +    if (err)
>> +        return err;
>> +
>>       erofs_info(sb, "mounted with root inode @ nid %llu.", 
>> sbi->root_nid);
>>       return 0;
>>   }
>> @@ -818,6 +824,7 @@ static void erofs_kill_sb(struct super_block *sb)
>>           kill_anon_super(sb);
>>       else
>>           kill_block_super(sb);
>> +    erofs_pcshr_free_mnt();
>>       fs_put_dax(sbi->dif0.dax_dev, NULL);
>>       erofs_fscache_unregister_fs(sb);
>>       erofs_sb_free(sbi);
>> diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c
>> index 19ef4ff2a134..fc2ed01eaabe 100644
>> --- a/fs/erofs/zdata.c
>> +++ b/fs/erofs/zdata.c
>> @@ -5,6 +5,7 @@
>>    * Copyright (C) 2022 Alibaba Cloud
>>    */
>>   #include "compress.h"
>> +#include "pagecache_share.h"
>>   #include <linux/psi.h>
>>   #include <linux/cpuhotplug.h>
>>   #include <trace/events/erofs.h>
>> @@ -1891,9 +1892,10 @@ static int z_erofs_read_folio(struct file 
>> *file, struct folio *folio)
>>   {
>>       struct inode *const inode = folio->mapping->host;
>>       struct z_erofs_decompress_frontend f = 
>> DECOMPRESS_FRONTEND_INIT(inode);
>> -    int err;
>> +    int err, pcshr;
>>         trace_erofs_read_folio(folio, false);
>> +    pcshr = erofs_pcshr_read_begin(file, folio);
>>       f.headoffset = (erofs_off_t)folio->index << PAGE_SHIFT;
>>         z_erofs_pcluster_readmore(&f, NULL, true);
>> @@ -1909,6 +1911,7 @@ static int z_erofs_read_folio(struct file 
>> *file, struct folio *folio)
>>         erofs_put_metabuf(&f.map.buf);
>>       erofs_release_pages(&f.pagepool);
>> +    erofs_pcshr_read_end(file, folio, pcshr);
>>       return err;
>>   }
>>   @@ -1918,8 +1921,9 @@ static void z_erofs_readahead(struct 
>> readahead_control *rac)
>>       struct z_erofs_decompress_frontend f = 
>> DECOMPRESS_FRONTEND_INIT(inode);
>>       struct folio *head = NULL, *folio;
>>       unsigned int nr_folios;
>> -    int err;
>> +    int err, pcshr;
>>   +    pcshr = erofs_pcshr_readahead_begin(rac);
>>       f.headoffset = readahead_pos(rac);
>>         z_erofs_pcluster_readmore(&f, rac, true);
>> @@ -1947,6 +1951,7 @@ static void z_erofs_readahead(struct 
>> readahead_control *rac)
>>       (void)z_erofs_runqueue(&f, nr_folios);
>>       erofs_put_metabuf(&f.map.buf);
>>       erofs_release_pages(&f.pagepool);
>> +    erofs_pcshr_readahead_end(rac, pcshr);
>>   }
>>     const struct address_space_operations z_erofs_aops = {
    
    
More information about the Linux-erofs
mailing list