[PATCH] erofs: rename per-CPU buffers to global buffer pool and make it configurable

Gao Xiang hsiangkao at linux.alibaba.com
Fri Mar 22 15:10:32 AEDT 2024

On 2024/3/22 11:47, Chunhai Guo wrote:
> It will cost more time if compressed buffers are allocated on demand for
> low-latency algorithms (like lz4) so EROFS uses per-CPU buffers to keep
> compressed data if in-place decompression is unfulfilled.  While it is
> kind of wasteful of memory for a device with hundreds of CPUs, and only
> a small number of CPUs are concurrently decompressing most of the time.

a small number of CPUs concurrently decompress most of the time.

> 
> This patch renames it as 'global buffer pool' and makes it configurable.
> This allows two or more CPUs to share a common buffer to reduce memory
> occupation.
> 
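(A concrete illustration with made-up numbers: if each buffer maps
16KiB, then 256 per-CPU buffers pin 256 * 16KiB = 4MiB, while a pool
capped at e.g. 16 shared buffers pins only 256KiB.)
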
> Signed-off-by: Chunhai Guo <guochunhai at vivo.com>
> Suggested-by: Gao Xiang <xiang at kernel.org>
> ---
>   fs/erofs/Makefile       |   2 +-
>   fs/erofs/decompressor.c |   6 +-
>   fs/erofs/internal.h     |  14 ++--
>   fs/erofs/pcpubuf.c      | 148 --------------------------------------
>   fs/erofs/super.c        |   4 +-
>   fs/erofs/utils.c        | 155 ++++++++++++++++++++++++++++++++++++++++
>   6 files changed, 168 insertions(+), 161 deletions(-)
>   delete mode 100644 fs/erofs/pcpubuf.c
> 
> diff --git a/fs/erofs/Makefile b/fs/erofs/Makefile
> index 994d0b9deddf..6cf1504c63e6 100644
> --- a/fs/erofs/Makefile
> +++ b/fs/erofs/Makefile
> @@ -3,7 +3,7 @@
>   obj-$(CONFIG_EROFS_FS) += erofs.o
>   erofs-objs := super.o inode.o data.o namei.o dir.o utils.o sysfs.o
>   erofs-$(CONFIG_EROFS_FS_XATTR) += xattr.o
> -erofs-$(CONFIG_EROFS_FS_ZIP) += decompressor.o zmap.o zdata.o pcpubuf.o
> +erofs-$(CONFIG_EROFS_FS_ZIP) += decompressor.o zmap.o zdata.o
>   erofs-$(CONFIG_EROFS_FS_ZIP_LZMA) += decompressor_lzma.o
>   erofs-$(CONFIG_EROFS_FS_ZIP_DEFLATE) += decompressor_deflate.o
>   erofs-$(CONFIG_EROFS_FS_ONDEMAND) += fscache.o
> diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c
> index d4cee95af14c..99a344684132 100644
> --- a/fs/erofs/decompressor.c
> +++ b/fs/erofs/decompressor.c
> @@ -54,7 +54,7 @@ static int z_erofs_load_lz4_config(struct super_block *sb,
>   	sbi->lz4.max_distance_pages = distance ?
>   					DIV_ROUND_UP(distance, PAGE_SIZE) + 1 :
>   					LZ4_MAX_DISTANCE_PAGES;
> -	return erofs_pcpubuf_growsize(sbi->lz4.max_pclusterblks);
> +	return erofs_gbuf_growsize(sbi->lz4.max_pclusterblks);
>   }
>   
>   /*
> @@ -159,7 +159,7 @@ static void *z_erofs_lz4_handle_overlap(struct z_erofs_lz4_decompress_ctx *ctx,
>   docopy:
>   	/* Or copy compressed data which can be overlapped to per-CPU buffer */
>   	in = rq->in;
> -	src = erofs_get_pcpubuf(ctx->inpages);
> +	src = erofs_get_gbuf(ctx->inpages);
>   	if (!src) {
>   		DBG_BUGON(1);
>   		kunmap_local(inpage);
> @@ -260,7 +260,7 @@ static int z_erofs_lz4_decompress_mem(struct z_erofs_lz4_decompress_ctx *ctx,
>   	} else if (maptype == 1) {
>   		vm_unmap_ram(src, ctx->inpages);
>   	} else if (maptype == 2) {
> -		erofs_put_pcpubuf(src);
> +		erofs_put_gbuf(src);
>   	} else if (maptype != 3) {
>   		DBG_BUGON(1);
>   		return -EFAULT;
> diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
> index b0409badb017..320d3d0d3526 100644
> --- a/fs/erofs/internal.h
> +++ b/fs/erofs/internal.h
> @@ -471,11 +471,11 @@ int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi,
>   				       struct erofs_workgroup *egrp);
>   int z_erofs_map_blocks_iter(struct inode *inode, struct erofs_map_blocks *map,
>   			    int flags);
> -void *erofs_get_pcpubuf(unsigned int requiredpages);
> -void erofs_put_pcpubuf(void *ptr);
> -int erofs_pcpubuf_growsize(unsigned int nrpages);
> -void __init erofs_pcpubuf_init(void);
> -void erofs_pcpubuf_exit(void);
> +void *erofs_get_gbuf(unsigned int requiredpages);
> +void erofs_put_gbuf(void *ptr);
> +int erofs_gbuf_growsize(unsigned int nrpages);
> +void __init erofs_gbuf_init(void);
> +void erofs_gbuf_exit(void);
>   int erofs_init_managed_cache(struct super_block *sb);
>   int z_erofs_parse_cfgs(struct super_block *sb, struct erofs_super_block *dsb);
>   #else
> @@ -485,8 +485,8 @@ static inline int erofs_init_shrinker(void) { return 0; }
>   static inline void erofs_exit_shrinker(void) {}
>   static inline int z_erofs_init_zip_subsystem(void) { return 0; }
>   static inline void z_erofs_exit_zip_subsystem(void) {}
> -static inline void erofs_pcpubuf_init(void) {}
> -static inline void erofs_pcpubuf_exit(void) {}
> +static inline void erofs_gbuf_init(void) {}
> +static inline void erofs_gbuf_exit(void) {}
>   static inline int erofs_init_managed_cache(struct super_block *sb) { return 0; }
>   #endif	/* !CONFIG_EROFS_FS_ZIP */
>   
> diff --git a/fs/erofs/pcpubuf.c b/fs/erofs/pcpubuf.c
> deleted file mode 100644
> index c7a4b1d77069..000000000000
> --- a/fs/erofs/pcpubuf.c
> +++ /dev/null
> @@ -1,148 +0,0 @@
> -// SPDX-License-Identifier: GPL-2.0-only
> -/*
> - * Copyright (C) Gao Xiang <xiang at kernel.org>
> - *
> - * For low-latency decompression algorithms (e.g. lz4), reserve consecutive
> - * per-CPU virtual memory (in pages) in advance to store such inplace I/O
> - * data if inplace decompression is failed (due to unmet inplace margin for
> - * example).
> - */
> -#include "internal.h"
> -
> -struct erofs_pcpubuf {
> -	raw_spinlock_t lock;
> -	void *ptr;
> -	struct page **pages;
> -	unsigned int nrpages;
> -};
> -
> -static DEFINE_PER_CPU(struct erofs_pcpubuf, erofs_pcb);
> -
> -void *erofs_get_pcpubuf(unsigned int requiredpages)
> -	__acquires(pcb->lock)
> -{
> -	struct erofs_pcpubuf *pcb = &get_cpu_var(erofs_pcb);
> -
> -	raw_spin_lock(&pcb->lock);
> -	/* check if the per-CPU buffer is too small */
> -	if (requiredpages > pcb->nrpages) {
> -		raw_spin_unlock(&pcb->lock);
> -		put_cpu_var(erofs_pcb);
> -		/* (for sparse checker) pretend pcb->lock is still taken */
> -		__acquire(pcb->lock);
> -		return NULL;
> -	}
> -	return pcb->ptr;
> -}
> -
> -void erofs_put_pcpubuf(void *ptr) __releases(pcb->lock)
> -{
> -	struct erofs_pcpubuf *pcb = &per_cpu(erofs_pcb, smp_processor_id());
> -
> -	DBG_BUGON(pcb->ptr != ptr);
> -	raw_spin_unlock(&pcb->lock);
> -	put_cpu_var(erofs_pcb);
> -}
> -
> -/* the next step: support per-CPU page buffers hotplug */
> -int erofs_pcpubuf_growsize(unsigned int nrpages)
> -{
> -	static DEFINE_MUTEX(pcb_resize_mutex);
> -	static unsigned int pcb_nrpages;
> -	struct page *pagepool = NULL;
> -	int delta, cpu, ret, i;
> -
> -	mutex_lock(&pcb_resize_mutex);
> -	delta = nrpages - pcb_nrpages;
> -	ret = 0;
> -	/* avoid shrinking pcpubuf, since no idea how many fses rely on */
> -	if (delta <= 0)
> -		goto out;
> -
> -	for_each_possible_cpu(cpu) {
> -		struct erofs_pcpubuf *pcb = &per_cpu(erofs_pcb, cpu);
> -		struct page **pages, **oldpages;
> -		void *ptr, *old_ptr;
> -
> -		pages = kmalloc_array(nrpages, sizeof(*pages), GFP_KERNEL);
> -		if (!pages) {
> -			ret = -ENOMEM;
> -			break;
> -		}
> -
> -		for (i = 0; i < nrpages; ++i) {
> -			pages[i] = erofs_allocpage(&pagepool, GFP_KERNEL);
> -			if (!pages[i]) {
> -				ret = -ENOMEM;
> -				oldpages = pages;
> -				goto free_pagearray;
> -			}
> -		}
> -		ptr = vmap(pages, nrpages, VM_MAP, PAGE_KERNEL);
> -		if (!ptr) {
> -			ret = -ENOMEM;
> -			oldpages = pages;
> -			goto free_pagearray;
> -		}
> -		raw_spin_lock(&pcb->lock);
> -		old_ptr = pcb->ptr;
> -		pcb->ptr = ptr;
> -		oldpages = pcb->pages;
> -		pcb->pages = pages;
> -		i = pcb->nrpages;
> -		pcb->nrpages = nrpages;
> -		raw_spin_unlock(&pcb->lock);
> -
> -		if (!oldpages) {
> -			DBG_BUGON(old_ptr);
> -			continue;
> -		}
> -
> -		if (old_ptr)
> -			vunmap(old_ptr);
> -free_pagearray:
> -		while (i)
> -			erofs_pagepool_add(&pagepool, oldpages[--i]);
> -		kfree(oldpages);
> -		if (ret)
> -			break;
> -	}
> -	pcb_nrpages = nrpages;
> -	erofs_release_pages(&pagepool);
> -out:
> -	mutex_unlock(&pcb_resize_mutex);
> -	return ret;
> -}
> -
> -void __init erofs_pcpubuf_init(void)
> -{
> -	int cpu;
> -
> -	for_each_possible_cpu(cpu) {
> -		struct erofs_pcpubuf *pcb = &per_cpu(erofs_pcb, cpu);
> -
> -		raw_spin_lock_init(&pcb->lock);
> -	}
> -}
> -
> -void erofs_pcpubuf_exit(void)
> -{
> -	int cpu, i;
> -
> -	for_each_possible_cpu(cpu) {
> -		struct erofs_pcpubuf *pcb = &per_cpu(erofs_pcb, cpu);
> -
> -		if (pcb->ptr) {
> -			vunmap(pcb->ptr);
> -			pcb->ptr = NULL;
> -		}
> -		if (!pcb->pages)
> -			continue;
> -
> -		for (i = 0; i < pcb->nrpages; ++i)
> -			if (pcb->pages[i])
> -				put_page(pcb->pages[i]);
> -		kfree(pcb->pages);
> -		pcb->pages = NULL;
> -	}
> -}
> diff --git a/fs/erofs/super.c b/fs/erofs/super.c
> index 5f60f163bd56..5161923c33fc 100644
> --- a/fs/erofs/super.c
> +++ b/fs/erofs/super.c
> @@ -902,7 +902,7 @@ static int __init erofs_module_init(void)
>   	if (err)
>   		goto deflate_err;
>   
> -	erofs_pcpubuf_init();
> +	erofs_gbuf_init();
>   	err = z_erofs_init_zip_subsystem();
>   	if (err)
>   		goto zip_err;
> @@ -945,7 +945,7 @@ static void __exit erofs_module_exit(void)
>   	z_erofs_lzma_exit();
>   	erofs_exit_shrinker();
>   	kmem_cache_destroy(erofs_inode_cachep);
> -	erofs_pcpubuf_exit();
> +	erofs_gbuf_exit();
>   }
>   
>   static int erofs_statfs(struct dentry *dentry, struct kstatfs *buf)
> diff --git a/fs/erofs/utils.c b/fs/erofs/utils.c
> index e146d09151af..7bdff1c7ce19 100644
> --- a/fs/erofs/utils.c
> +++ b/fs/erofs/utils.c
> @@ -284,4 +284,159 @@ void erofs_exit_shrinker(void)
>   {
>   	shrinker_free(erofs_shrinker_info);
>   }
> +
> +struct erofs_gbuf {

struct z_erofs_gbuf {

> +	spinlock_t lock;
> +	void *ptr;
> +	struct page **pages;
> +	unsigned int nrpages, ref;
> +	struct list_head list;
> +};
> +
> +struct erofs_gbufpool {
> +	struct erofs_gbuf *gbuf_array;
> +	unsigned int gbuf_num, gbuf_nrpages;
> +};

could we avoid this structure? I think it's only used once:

struct z_erofs_gbuf *z_erofs_gbufpool;
unsigned int z_erofs_gbuf_count, z_erofs_gbuf_nrpages;

module_param_named(global_buffers, z_erofs_gbuf_count, uint, 0444);
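
(With 0444 permissions, such a parameter would also show up read-only
under /sys/module/erofs/parameters/, and could be set at boot via
erofs.global_buffers=<n> on the kernel command line for a built-in
erofs, or as a modprobe option; the `global_buffers` name here is just
the suggestion above.)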

> +
> +struct erofs_gbufpool z_erofs_gbufpool;
> +
> +module_param_named(pcluster_buf_num, z_erofs_gbufpool.gbuf_num, uint, 0444);
> +
> +static inline unsigned int erofs_cur_gbuf(void)

inline here is unnecessary, and

static unsigned int z_erofs_gbuf_id(void)
{

> +{
> +	return smp_processor_id() % z_erofs_gbufpool.gbuf_num;
> +}
> +
> +void *erofs_get_gbuf(unsigned int requiredpages)

z_erofs_get_gbuf

> +	__acquires(gbuf->lock)
> +{
> +	struct erofs_gbuf *gbuf;
> +
> +	gbuf = &z_erofs_gbufpool.gbuf_array[erofs_cur_gbuf()];
> +	spin_lock(&gbuf->lock);
> +	/* check if the buffer is too small */
> +	if (requiredpages > gbuf->nrpages) {
> +		spin_unlock(&gbuf->lock);
> +		/* (for sparse checker) pretend gbuf->lock is still taken */
> +		__acquire(gbuf->lock);
> +		return NULL;
> +	}
> +	return gbuf->ptr;
> +}
> +
> +void erofs_put_gbuf(void *ptr) __releases(gbuf->lock)

z_erofs_put_gbuf

> +{
> +	struct erofs_gbuf *gbuf;
> +
> +	gbuf = &z_erofs_gbufpool.gbuf_array[erofs_cur_gbuf()];
> +	DBG_BUGON(gbuf->ptr != ptr);
> +	spin_unlock(&gbuf->lock);
> +}
> +
> +int erofs_gbuf_growsize(unsigned int nrpages)

z_erofs_gbuf_growsize

> +{
> +	static DEFINE_MUTEX(gbuf_resize_mutex);
> +	struct page *pagepool = NULL;

Since we just grow global buffers, I think we could just copy
the old page array and fill in the rest, so that the pagepool is
not needed and we don't touch `struct page` anymore, so no
`folio` conversion is needed either.

But it can be done with a separate patch.
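
A rough, untested sketch of that idea (reusing the z_erofs_gbuf naming
suggested above; z_erofs_gbuf_grow_one() is a hypothetical helper the
resize loop would call per buffer, still under the resize mutex):

static int z_erofs_gbuf_grow_one(struct z_erofs_gbuf *gbuf,
				 unsigned int nrpages)
{
	struct page **pages, **old_pages;
	void *ptr, *old_ptr;
	unsigned int i;

	pages = kcalloc(nrpages, sizeof(*pages), GFP_KERNEL);
	if (!pages)
		return -ENOMEM;
	/* keep the pages already backing this buffer... */
	for (i = 0; i < gbuf->nrpages; ++i)
		pages[i] = gbuf->pages[i];
	/* ...and allocate only the delta */
	for (; i < nrpages; ++i) {
		pages[i] = alloc_page(GFP_KERNEL);
		if (!pages[i])
			goto err_pages;
	}
	ptr = vmap(pages, nrpages, VM_MAP, PAGE_KERNEL);
	if (!ptr)
		goto err_pages;

	spin_lock(&gbuf->lock);
	old_ptr = gbuf->ptr;
	old_pages = gbuf->pages;
	gbuf->ptr = ptr;
	gbuf->pages = pages;
	gbuf->nrpages = nrpages;
	spin_unlock(&gbuf->lock);
	/* old pages stay in use via the new mapping, so only drop these */
	if (old_ptr)
		vunmap(old_ptr);
	kfree(old_pages);
	return 0;

err_pages:
	/* only free the newly allocated delta */
	while (i > gbuf->nrpages)
		__free_page(pages[--i]);
	kfree(pages);
	return -ENOMEM;
}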


> +	int delta, ret, i, j;
> +
> +	if (!z_erofs_gbufpool.gbuf_array)
> +		return -ENOMEM;
> +
> +	mutex_lock(&gbuf_resize_mutex);
> +	delta = nrpages - z_erofs_gbufpool.gbuf_nrpages;
> +	ret = 0;

	ret = -EINVAL;

> +	/* avoid shrinking pcl_buf, since no idea how many fses rely on */

/* avoid shrinking gbufs ... */

> +	if (delta <= 0)
> +		goto out;
> +
> +	for (i = 0; i < z_erofs_gbufpool.gbuf_num; ++i) {
> +		struct erofs_gbuf *gbuf = &z_erofs_gbufpool.gbuf_array[i];
> +		struct page **pages, **tmp_pages;
> +		void *ptr, *old_ptr = NULL;
> +
> +		ret = -ENOMEM;
> +		tmp_pages = kmalloc_array(nrpages, sizeof(*tmp_pages),
> +				GFP_KERNEL);
> +		if (!tmp_pages)
> +			break;
> +		for (j = 0; j < nrpages; ++j) {
> +			tmp_pages[j] = erofs_allocpage(&pagepool, GFP_KERNEL);
> +			if (!tmp_pages[j])
> +				goto free_pagearray;
> +		}
> +		ptr = vmap(tmp_pages, nrpages, VM_MAP, PAGE_KERNEL);
> +		if (!ptr)
> +			goto free_pagearray;
> +
> +		pages = tmp_pages;
> +		spin_lock(&gbuf->lock);
> +		old_ptr = gbuf->ptr;
> +		gbuf->ptr = ptr;
> +		tmp_pages = gbuf->pages;
> +		gbuf->pages = pages;
> +		j = gbuf->nrpages;
> +		gbuf->nrpages = nrpages;
> +		spin_unlock(&gbuf->lock);
> +		ret = 0;
> +		if (!tmp_pages) {
> +			DBG_BUGON(old_ptr);
> +			continue;
> +		}
> +
> +		if (old_ptr)
> +			vunmap(old_ptr);
> +free_pagearray:
> +		while (j)
> +			erofs_pagepool_add(&pagepool, tmp_pages[--j]);
> +		kfree(tmp_pages);
> +		if (ret)
> +			break;
> +	}
> +	z_erofs_gbufpool.gbuf_nrpages = nrpages;
> +	erofs_release_pages(&pagepool);
> +out:
> +	mutex_unlock(&gbuf_resize_mutex);
> +	return ret;
> +}
> +
> +void __init erofs_gbuf_init(void)
> +{
> +	if (!z_erofs_gbufpool.gbuf_num)
> +		z_erofs_gbufpool.gbuf_num = num_possible_cpus();
> +	else if (z_erofs_gbufpool.gbuf_num > num_possible_cpus())
> +		z_erofs_gbufpool.gbuf_num = num_possible_cpus();

	unsigned int i = num_possible_cpus();

	if (!z_erofs_gbuf_count)
		z_erofs_gbuf_count = i;
	else
		z_erofs_gbuf_count = min(z_erofs_gbuf_count, i);
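
(That keeps the default at one shared buffer per possible CPU, matching
the old per-CPU behaviour, while letting users cap the pool lower on
large machines.)
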
> +
> +	z_erofs_gbufpool.gbuf_array = kmalloc_array(z_erofs_gbufpool.gbuf_num,
> +			sizeof(*z_erofs_gbufpool.gbuf_array),
> +			GFP_KERNEL | __GFP_ZERO);
> +	if (!z_erofs_gbufpool.gbuf_array) {
> +		erofs_err(NULL, "failed to alloc page for erofs gbuf_array\n");

		erofs_err(NULL, "failed to allocate global buffer array");

> +		return;
> +	}
> +
> +	for (int i = 0; i < z_erofs_gbufpool.gbuf_num; ++i)

please don't use `for (int i..`.

so	for (i = 0; i < z_erofs_gbufpool.gbuf_num; ++i)

Thanks,
Gao Xiang