[PATCH RFC 4/4] erofs: introduce .fadvise for page cache share

Christian Brauner brauner at kernel.org
Thu Jul 3 22:23:13 AEST 2025


From: Hongzhen Luo <hongzhen at linux.alibaba.com>

When .fadvise is used to release a file's page cache, only the page
cache pages that were first read through that file are freed. To
achieve this, an interval tree is added to the file's inode to track
the segments (page ranges) that were first read through that inode.

Signed-off-by: Hongzhen Luo <hongzhen at linux.alibaba.com>
Link: https://lore.kernel.org/20240902110620.2202586-5-hongzhen@linux.alibaba.com
Signed-off-by: Christian Brauner <brauner at kernel.org>
---
 fs/erofs/data.c            | 38 ++++++++++++++++++++--
 fs/erofs/internal.h        |  5 +++
 fs/erofs/pagecache_share.c | 81 ++++++++++++++++++++++++++++++++++++++++++++--
 fs/erofs/pagecache_share.h |  2 ++
 fs/erofs/super.c           |  9 ++++++
 5 files changed, 131 insertions(+), 4 deletions(-)

diff --git a/fs/erofs/data.c b/fs/erofs/data.c
index fb54162f4c54..61a42a95d26b 100644
--- a/fs/erofs/data.c
+++ b/fs/erofs/data.c
@@ -7,6 +7,7 @@
 #include "internal.h"
 #include <linux/sched/mm.h>
 #include <trace/events/erofs.h>
+#include "pagecache_share.h"
 
 void erofs_unmap_metabuf(struct erofs_buf *buf)
 {
@@ -353,6 +354,7 @@ static int erofs_read_folio(struct file *file, struct folio *folio)
 {
 #ifdef CONFIG_EROFS_FS_PAGE_CACHE_SHARE
 	struct erofs_inode *vi = NULL;
+	struct interval_tree_node *seg;
 	int ret;
 
 	if (file && file->private_data) {
@@ -363,8 +365,32 @@ static int erofs_read_folio(struct file *file, struct folio *folio)
 			vi = NULL;
 	}
 	ret = iomap_read_folio(folio, &erofs_iomap_ops);
-	if (vi)
+	if (vi) {
+		loff_t isize = vi->vfs_inode.i_size;
+
 		folio->mapping->host = file_inode(file);
+		/*
+		 * Remember the page range first read through this inode so
+		 * that POSIX_FADV_DONTNEED can drop it later.  Tracking is
+		 * best-effort: failing to allocate a segment must not turn
+		 * an already submitted read into an error.
+		 */
+		seg = isize > 0 ? erofs_pcs_alloc_seg() : NULL;
+		if (seg) {
+			/* interval tree bounds are inclusive page indices */
+			seg->start = folio->index;
+			seg->last = min_t(pgoff_t, seg->start +
+					  (folio_size(folio) >> PAGE_SHIFT) - 1,
+					  (isize - 1) >> PAGE_SHIFT);
+			if (seg->last >= seg->start) {
+				mutex_lock(&vi->segs_mutex);
+				interval_tree_insert(seg, &vi->segs);
+				mutex_unlock(&vi->segs_mutex);
+			} else {
+				erofs_pcs_free_seg(seg);
+			}
+		}
+	}
 	return ret;
 #else
 	return iomap_read_folio(folio, &erofs_iomap_ops);
@@ -376,6 +392,8 @@ static void erofs_readahead(struct readahead_control *rac)
 #ifdef CONFIG_EROFS_FS_PAGE_CACHE_SHARE
 	struct erofs_inode *vi = NULL;
 	struct file *file = rac->file;
+	struct interval_tree_node *seg;
+	erofs_off_t start, end;
 
 	if (file && file->private_data) {
 		vi = file->private_data;
@@ -383,10 +401,31 @@ static void erofs_readahead(struct readahead_control *rac)
 			rac->mapping->host = &vi->vfs_inode;
 		else
 			vi = NULL;
 	}
+	if (vi) {
+		/*
+		 * Sample the request before iomap_readahead() advances rac.
+		 * vi may have been reset to NULL just above, so it has to
+		 * be re-checked before it is dereferenced.
+		 */
+		start = readahead_pos(rac);
+		end = min_t(erofs_off_t, start + readahead_length(rac),
+			    vi->vfs_inode.i_size);
+	}
 	iomap_readahead(rac, &erofs_iomap_ops);
-	if (vi)
+	if (vi) {
 		rac->mapping->host = file_inode(file);
+		/* best-effort bookkeeping; an empty range records nothing */
+		seg = end > start ? erofs_pcs_alloc_seg() : NULL;
+		if (seg) {
+			/* [start, end) in bytes -> inclusive page indices */
+			seg->start = start >> PAGE_SHIFT;
+			seg->last = (end - 1) >> PAGE_SHIFT;
+			mutex_lock(&vi->segs_mutex);
+			interval_tree_insert(seg, &vi->segs);
+			mutex_unlock(&vi->segs_mutex);
+		}
+	}
 #else
 	return iomap_readahead(rac, &erofs_iomap_ops);
 #endif
diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index 47136894d17d..5aa1215ce734 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -18,6 +18,8 @@
 #include <linux/slab.h>
 #include <linux/vmalloc.h>
 #include <linux/iomap.h>
+#include <linux/interval_tree.h>
+#include <linux/mutex.h>
 #include "erofs_fs.h"
 
 __printf(2, 3) void _erofs_printk(struct super_block *sb, const char *fmt, ...);
@@ -275,6 +277,9 @@ struct erofs_inode {
 	};
 #ifdef CONFIG_EROFS_FS_PAGE_CACHE_SHARE
 	struct inode *ano_inode;
+	/* segments attributed by this inode */
+	struct rb_root_cached segs;
+	struct mutex segs_mutex;
 #endif
 	/* the corresponding vfs inode */
 	struct inode vfs_inode;
diff --git a/fs/erofs/pagecache_share.c b/fs/erofs/pagecache_share.c
index 309b33cc6c30..84713c0f20c8 100644
--- a/fs/erofs/pagecache_share.c
+++ b/fs/erofs/pagecache_share.c
@@ -4,6 +4,9 @@
  */
 #include <linux/xxhash.h>
 #include <linux/refcount.h>
+#include <uapi/linux/fadvise.h>
+#include <linux/slab.h>
+#include <linux/pagemap.h>
 #include "pagecache_share.h"
 #include "internal.h"
 #include "xattr.h"
@@ -15,10 +18,12 @@
 static DEFINE_MUTEX(pseudo_mnt_lock);
 static refcount_t pseudo_mnt_count;
 static struct vfsmount *erofs_pcs_mnt;
+struct kmem_cache *erofs_pcs_segsp;
 
 int erofs_pcs_init_mnt(void)
 {
 	struct vfsmount *mnt;
+	struct kmem_cache *cache;
 
 	if (refcount_inc_not_zero(&pseudo_mnt_count))
 		return 0;
@@ -29,12 +34,21 @@ int erofs_pcs_init_mnt(void)
 		return 0;
 	}
 
+	cache = kmem_cache_create("erofs_pcs_segs",
+				  sizeof(struct interval_tree_node), 0,
+				  SLAB_RECLAIM_ACCOUNT | SLAB_ACCOUNT, NULL);
+	if (!cache)
+		return -ENOMEM;
+
 	mnt = kern_mount(&erofs_anon_fs_type);
-	if (IS_ERR(mnt))
+	if (IS_ERR(mnt)) {
+		kmem_cache_destroy(cache);
 		return PTR_ERR(mnt);
+	}
 
 	rcu_read_lock();
 	rcu_assign_pointer(erofs_pcs_mnt, mnt);
+	rcu_assign_pointer(erofs_pcs_segsp, cache);
 	rcu_read_unlock();
 	refcount_set_release(&pseudo_mnt_count, 1);
 	return 0;
@@ -43,18 +57,34 @@ int erofs_pcs_init_mnt(void)
 void erofs_pcs_free_mnt(void)
 {
 	struct vfsmount *mnt = NULL;
+	struct kmem_cache *cache = NULL;
 
 	if (refcount_dec_not_one(&pseudo_mnt_count))
 		return;
 
 	scoped_guard(mutex, &pseudo_mnt_lock) {
 		rcu_read_lock();
-		if (refcount_dec_and_test(&pseudo_mnt_count))
+		if (refcount_dec_and_test(&pseudo_mnt_count)) {
 			mnt = rcu_replace_pointer(erofs_pcs_mnt, NULL, true);
+			cache = rcu_replace_pointer(erofs_pcs_segsp, NULL, true);
+		}
 		rcu_read_unlock();
 	}
+
 	if (mnt)
 		kern_unmount(mnt);
+	if (cache)
+		kmem_cache_destroy(cache);
+}
+
+struct interval_tree_node *erofs_pcs_alloc_seg(void)
+{
+	return kmem_cache_alloc(erofs_pcs_segsp, GFP_KERNEL);
+}
+
+void erofs_pcs_free_seg(struct interval_tree_node *seg)
+{
+	kmem_cache_free(erofs_pcs_segsp, seg);
 }
 
 static int erofs_pcs_eq(struct inode *inode, void *data)
@@ -90,6 +120,8 @@ void erofs_pcs_fill_inode(struct inode *inode)
 					 erofs_pcs_eq, erofs_pcs_set_fprt,
 					 fprt);
 		vi->ano_inode = ano_inode;
+		vi->segs = RB_ROOT_CACHED;
+		mutex_init(&vi->segs_mutex);
 		if (ano_inode->i_state & I_NEW) {
 			if (erofs_inode_is_data_compressed(vi->datalayout))
 				ano_inode->i_mapping->a_ops = &z_erofs_aops;
@@ -189,6 +221,91 @@ static int erofs_pcs_mmap(struct file *file, struct vm_area_struct *vma)
 	return 0;
 }
 
+/*
+ * ->fadvise() handler for page cache sharing files.
+ *
+ * Any advice other than POSIX_FADV_DONTNEED is forwarded to the file kept in
+ * file->private_data.  POSIX_FADV_DONTNEED only invalidates the page ranges
+ * recorded in this inode's segment tree that intersect the request, and trims
+ * the released part out of the tree.
+ *
+ * Returns 0 on success, -EINVAL for a negative @len and -ENOMEM if a segment
+ * could not be split; segments processed before the failure stay released.
+ */
+static int erofs_pcs_fadvise(struct file *file, loff_t offset, loff_t len, int advice)
+{
+	struct erofs_inode *vi = EROFS_I(file_inode(file));
+	struct interval_tree_node *seg, *next_seg, *tail;
+	struct file *ano_file = file->private_data;
+	pgoff_t start, end, first, last, l, r;
+	u64 endbyte;
+	int err = 0;
+
+	if (advice != POSIX_FADV_DONTNEED)
+		return generic_fadvise(ano_file, offset, len, advice);
+	if (len < 0)
+		return -EINVAL;
+
+	/* unsigned math: offset + len may overflow loff_t; len = 0 means EOF */
+	endbyte = (u64)offset + (u64)len;
+	if (!len || endbyte < (u64)len)
+		endbyte = LLONG_MAX;
+	else
+		endbyte--;	/* last byte, inclusive */
+	start = min_t(u64, (u64)offset >> PAGE_SHIFT, ULONG_MAX);
+	end = min_t(u64, endbyte >> PAGE_SHIFT, ULONG_MAX);
+	if (start > end)
+		return 0;
+
+	mutex_lock(&vi->segs_mutex);
+	for (seg = interval_tree_iter_first(&vi->segs, start, end); seg;
+	     seg = next_seg) {
+		/* look up the successor first, seg is unlinked below */
+		next_seg = interval_tree_iter_next(seg, start, end);
+		first = seg->start;
+		last = seg->last;
+		/* the iterator only returns overlapping nodes, so l <= r */
+		l = max(first, start);
+		r = min(last, end);
+
+		/*
+		 * Cutting the middle out of a segment leaves two remainders
+		 * and needs one extra node.  Allocate it before touching the
+		 * page cache or the tree so that a failure leaves this
+		 * segment untouched.
+		 */
+		tail = NULL;
+		if (first < l && r < last) {
+			tail = erofs_pcs_alloc_seg();
+			if (!tail) {
+				err = -ENOMEM;
+				break;
+			}
+		}
+
+		(void)invalidate_mapping_pages(ano_file->f_mapping, l, r);
+		interval_tree_remove(seg, &vi->segs);
+		if (tail) {
+			tail->start = r + 1;
+			tail->last = last;
+			interval_tree_insert(tail, &vi->segs);
+		}
+		if (first < l) {
+			/* keep [first, l - 1] in the old node */
+			seg->last = l - 1;
+			interval_tree_insert(seg, &vi->segs);
+		} else if (r < last) {
+			/* keep [r + 1, last] in the old node */
+			seg->start = r + 1;
+			interval_tree_insert(seg, &vi->segs);
+		} else {
+			/* nothing left of this segment */
+			erofs_pcs_free_seg(seg);
+		}
+	}
+	mutex_unlock(&vi->segs_mutex);
+	return err;
+}
+
 const struct file_operations erofs_pcs_file_fops = {
 	.open		= erofs_pcs_file_open,
 	/*
@@ -201,4 +277,5 @@ const struct file_operations erofs_pcs_file_fops = {
 	.release	= erofs_pcs_file_release,
 	.get_unmapped_area = thp_get_unmapped_area,
 	.splice_read	= filemap_splice_read,
+	.fadvise	= erofs_pcs_fadvise,
 };
diff --git a/fs/erofs/pagecache_share.h b/fs/erofs/pagecache_share.h
index b8111291cf79..eb5869070d4b 100644
--- a/fs/erofs/pagecache_share.h
+++ b/fs/erofs/pagecache_share.h
@@ -14,6 +14,8 @@
 int erofs_pcs_init_mnt(void);
 void erofs_pcs_free_mnt(void);
 void erofs_pcs_fill_inode(struct inode *inode);
+struct interval_tree_node *erofs_pcs_alloc_seg(void);
+void erofs_pcs_free_seg(struct interval_tree_node *seg);
 
 extern const struct vm_operations_struct generic_file_vm_ops;
 
diff --git a/fs/erofs/super.c b/fs/erofs/super.c
index b9a71840cc45..607dc94a45a0 100644
--- a/fs/erofs/super.c
+++ b/fs/erofs/super.c
@@ -83,10 +83,19 @@ static void erofs_free_inode(struct inode *inode)
 	struct erofs_inode *vi = EROFS_I(inode);
 
 #ifdef CONFIG_EROFS_FS_PAGE_CACHE_SHARE
+	struct interval_tree_node *seg, *next_seg;
+
 	if (S_ISREG(inode->i_mode) &&  vi->ano_inode) {
 		iput(vi->ano_inode);
 		vi->ano_inode = NULL;
 	}
+	seg = interval_tree_iter_first(&vi->segs, 0, ULONG_MAX);
+	while (seg) {
+		next_seg = interval_tree_iter_next(seg, 0, ULONG_MAX);
+		interval_tree_remove(seg, &vi->segs);
+		erofs_pcs_free_seg(seg);
+		seg = next_seg;
+	}
 #endif
 	if (inode->i_op == &erofs_fast_symlink_iops)
 		kfree(inode->i_link);

-- 
2.47.2



More information about the Linux-erofs mailing list