[PATCH 04/16] erofs: introduce bufvec to store decompressed buffers

Yue Hu zbestahu at gmail.com
Fri Jul 15 16:29:30 AEST 2022


On Thu, 14 Jul 2022 21:20:39 +0800
Gao Xiang <hsiangkao at linux.alibaba.com> wrote:

> For each pcluster, the total compressed buffers are determined in
> advance, yet the number of decompressed buffers actually vary.  Too
> many decompressed pages can be recorded if one pcluster is highly
> compressed or its pcluster size is large.  That takes extra memory
> footprints compared to uncompressed filesystems, especially a lot of
> I/O in flight on low-ended devices.
> 
> Therefore, similar to inplace I/O, pagevec was introduced to reuse
> page cache to store these pointers in the time-sharing way since
> these pages are actually unused before decompressing.
> 
> In order to make it more flexable, a cleaner bufvec is used to
> replace the old pagevec stuffs so that
> 
>  - Decompressed offsets can be stored inline, thus it can be used
>    for the upcoming feature like compressed data deduplication;
> 
>  - Towards supporting large folios for compressed inodes since
>    our final goal is to completely avoid page->private but use
>    folio->private only for all page cache pages.
> 
> Signed-off-by: Gao Xiang <hsiangkao at linux.alibaba.com>
> ---
>  fs/erofs/zdata.c | 177 +++++++++++++++++++++++++++++++++++------------
>  fs/erofs/zdata.h |  26 +++++--
>  2 files changed, 153 insertions(+), 50 deletions(-)
> 
> diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c
> index c183cd0bc42b..f52c54058f31 100644
> --- a/fs/erofs/zdata.c
> +++ b/fs/erofs/zdata.c
> @@ -2,6 +2,7 @@
>  /*
>   * Copyright (C) 2018 HUAWEI, Inc.
>   *             https://www.huawei.com/
> + * Copyright (C) 2022 Alibaba Cloud
>   */
>  #include "zdata.h"
>  #include "compress.h"
> @@ -26,6 +27,82 @@ static struct z_erofs_pcluster_slab pcluster_pool[] __read_mostly = {
>  	_PCLP(Z_EROFS_PCLUSTER_MAX_PAGES)
>  };
>  
> +struct z_erofs_bvec_iter {
> +	struct page *bvpage;
> +	struct z_erofs_bvset *bvset;
> +	unsigned int nr, cur;
> +};
> +
> +static struct page *z_erofs_bvec_iter_end(struct z_erofs_bvec_iter *iter)
> +{
> +	if (iter->bvpage)
> +		kunmap_local(iter->bvset);
> +	return iter->bvpage;
> +}
> +
> +static struct page *z_erofs_bvset_flip(struct z_erofs_bvec_iter *iter)
> +{
> +	unsigned long base = (unsigned long)((struct z_erofs_bvset *)0)->bvec;
> +	/* have to access nextpage in advance, otherwise it will be unmapped */
> +	struct page *nextpage = iter->bvset->nextpage;
> +	struct page *oldpage;
> +
> +	DBG_BUGON(!nextpage);
> +	oldpage = z_erofs_bvec_iter_end(iter);
> +	iter->bvpage = nextpage;
> +	iter->bvset = kmap_local_page(nextpage);
> +	iter->nr = (PAGE_SIZE - base) / sizeof(struct z_erofs_bvec);
> +	iter->cur = 0;
> +	return oldpage;
> +}
> +
> +static void z_erofs_bvec_iter_begin(struct z_erofs_bvec_iter *iter,
> +				    struct z_erofs_bvset_inline *bvset,
> +				    unsigned int bootstrap_nr,
> +				    unsigned int cur)
> +{
> +	*iter = (struct z_erofs_bvec_iter) {
> +		.nr = bootstrap_nr,
> +		.bvset = (struct z_erofs_bvset *)bvset,
> +	};
> +
> +	while (cur > iter->nr) {
> +		cur -= iter->nr;
> +		z_erofs_bvset_flip(iter);
> +	}
> +	iter->cur = cur;
> +}
> +
> +static int z_erofs_bvec_enqueue(struct z_erofs_bvec_iter *iter,
> +				struct z_erofs_bvec *bvec,
> +				struct page **candidate_bvpage)
> +{
> +	if (iter->cur == iter->nr) {
> +		if (!*candidate_bvpage)
> +			return -EAGAIN;
> +
> +		DBG_BUGON(iter->bvset->nextpage);
> +		iter->bvset->nextpage = *candidate_bvpage;
> +		z_erofs_bvset_flip(iter);
> +
> +		iter->bvset->nextpage = NULL;
> +		*candidate_bvpage = NULL;
> +	}
> +	iter->bvset->bvec[iter->cur++] = *bvec;
> +	return 0;
> +}
> +
> +static void z_erofs_bvec_dequeue(struct z_erofs_bvec_iter *iter,
> +				 struct z_erofs_bvec *bvec,
> +				 struct page **old_bvpage)
> +{
> +	if (iter->cur == iter->nr)
> +		*old_bvpage = z_erofs_bvset_flip(iter);
> +	else
> +		*old_bvpage = NULL;
> +	*bvec = iter->bvset->bvec[iter->cur++];
> +}
> +

Touch a new file to include bufvec related code? call it zbvec.c/h?

>  static void z_erofs_destroy_pcluster_pool(void)
>  {
>  	int i;
> @@ -195,9 +272,10 @@ enum z_erofs_collectmode {
>  struct z_erofs_decompress_frontend {
>  	struct inode *const inode;
>  	struct erofs_map_blocks map;
> -
> +	struct z_erofs_bvec_iter biter;
>  	struct z_erofs_pagevec_ctor vector;
>  
> +	struct page *candidate_bvpage;
>  	struct z_erofs_pcluster *pcl, *tailpcl;
>  	/* a pointer used to pick up inplace I/O pages */
>  	struct page **icpage_ptr;
> @@ -358,21 +436,24 @@ static bool z_erofs_try_inplace_io(struct z_erofs_decompress_frontend *fe,
>  
>  /* callers must be with pcluster lock held */
>  static int z_erofs_attach_page(struct z_erofs_decompress_frontend *fe,
> -			       struct page *page, enum z_erofs_page_type type,
> -			       bool pvec_safereuse)
> +			       struct z_erofs_bvec *bvec,
> +			       enum z_erofs_page_type type)
>  {
>  	int ret;
>  
> -	/* give priority for inplaceio */
>  	if (fe->mode >= COLLECT_PRIMARY &&
> -	    type == Z_EROFS_PAGE_TYPE_EXCLUSIVE &&
> -	    z_erofs_try_inplace_io(fe, page))
> -		return 0;
> -
> -	ret = z_erofs_pagevec_enqueue(&fe->vector, page, type,
> -				      pvec_safereuse);
> -	fe->pcl->vcnt += (unsigned int)ret;
> -	return ret ? 0 : -EAGAIN;
> +	    type == Z_EROFS_PAGE_TYPE_EXCLUSIVE) {
> +		/* give priority for inplaceio to use file pages first */
> +		if (z_erofs_try_inplace_io(fe, bvec->page))
> +			return 0;
> +		/* otherwise, check if it can be used as a bvpage */
> +		if (fe->mode >= COLLECT_PRIMARY_FOLLOWED &&
> +		    !fe->candidate_bvpage)
> +			fe->candidate_bvpage = bvec->page;
> +	}
> +	ret = z_erofs_bvec_enqueue(&fe->biter, bvec, &fe->candidate_bvpage);
> +	fe->pcl->vcnt += (ret >= 0);
> +	return ret;
>  }
>  
>  static void z_erofs_try_to_claim_pcluster(struct z_erofs_decompress_frontend *f)
> @@ -554,9 +635,8 @@ static int z_erofs_collector_begin(struct z_erofs_decompress_frontend *fe)
>  	} else if (ret) {
>  		return ret;
>  	}
> -
> -	z_erofs_pagevec_ctor_init(&fe->vector, Z_EROFS_NR_INLINE_PAGEVECS,
> -				  fe->pcl->pagevec, fe->pcl->vcnt);
> +	z_erofs_bvec_iter_begin(&fe->biter, &fe->pcl->bvset,
> +				Z_EROFS_NR_INLINE_PAGEVECS, fe->pcl->vcnt);
>  	/* since file-backed online pages are traversed in reverse order */
>  	fe->icpage_ptr = fe->pcl->compressed_pages +
>  			z_erofs_pclusterpages(fe->pcl);
> @@ -588,9 +668,14 @@ static bool z_erofs_collector_end(struct z_erofs_decompress_frontend *fe)
>  	if (!pcl)
>  		return false;
>  
> -	z_erofs_pagevec_ctor_exit(&fe->vector, false);
> +	z_erofs_bvec_iter_end(&fe->biter);
>  	mutex_unlock(&pcl->lock);
>  
> +	if (fe->candidate_bvpage) {
> +		DBG_BUGON(z_erofs_is_shortlived_page(fe->candidate_bvpage));
> +		fe->candidate_bvpage = NULL;
> +	}
> +
>  	/*
>  	 * if all pending pages are added, don't hold its reference
>  	 * any longer if the pcluster isn't hosted by ourselves.
> @@ -712,22 +797,23 @@ static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe,
>  		tight &= (fe->mode >= COLLECT_PRIMARY_FOLLOWED);
>  
>  retry:
> -	err = z_erofs_attach_page(fe, page, page_type,
> -				  fe->mode >= COLLECT_PRIMARY_FOLLOWED);
> -	/* should allocate an additional short-lived page for pagevec */
> -	if (err == -EAGAIN) {
> -		struct page *const newpage =
> -				alloc_page(GFP_NOFS | __GFP_NOFAIL);
> -
> -		set_page_private(newpage, Z_EROFS_SHORTLIVED_PAGE);
> -		err = z_erofs_attach_page(fe, newpage,
> -					  Z_EROFS_PAGE_TYPE_EXCLUSIVE, true);
> -		if (!err)
> -			goto retry;
> +	err = z_erofs_attach_page(fe, &((struct z_erofs_bvec) {
> +					.page = page,
> +					.offset = offset - map->m_la,
> +					.end = end,
> +				  }), page_type);
> +	/* should allocate an additional short-lived page for bvset */
> +	if (err == -EAGAIN && !fe->candidate_bvpage) {
> +		fe->candidate_bvpage = alloc_page(GFP_NOFS | __GFP_NOFAIL);
> +		set_page_private(fe->candidate_bvpage,
> +				 Z_EROFS_SHORTLIVED_PAGE);
> +		goto retry;
>  	}
>  
> -	if (err)
> +	if (err) {
> +		DBG_BUGON(err == -EAGAIN && fe->candidate_bvpage);
>  		goto err_out;
> +	}
>  
>  	index = page->index - (map->m_la >> PAGE_SHIFT);
>  
> @@ -781,29 +867,24 @@ static bool z_erofs_page_is_invalidated(struct page *page)
>  static int z_erofs_parse_out_bvecs(struct z_erofs_pcluster *pcl,
>  				   struct page **pages, struct page **pagepool)
>  {
> -	struct z_erofs_pagevec_ctor ctor;
> -	enum z_erofs_page_type page_type;
> +	struct z_erofs_bvec_iter biter;
> +	struct page *old_bvpage;
>  	int i, err = 0;
>  
> -	z_erofs_pagevec_ctor_init(&ctor, Z_EROFS_NR_INLINE_PAGEVECS,
> -				  pcl->pagevec, 0);
> +	z_erofs_bvec_iter_begin(&biter, &pcl->bvset,
> +				Z_EROFS_NR_INLINE_PAGEVECS, 0);
>  	for (i = 0; i < pcl->vcnt; ++i) {
> -		struct page *page = z_erofs_pagevec_dequeue(&ctor, &page_type);
> +		struct z_erofs_bvec bvec;
>  		unsigned int pagenr;
>  
> -		/* all pages in pagevec ought to be valid */
> -		DBG_BUGON(!page);
> -		DBG_BUGON(z_erofs_page_is_invalidated(page));
> -
> -		if (z_erofs_put_shortlivedpage(pagepool, page))
> -			continue;
> +		z_erofs_bvec_dequeue(&biter, &bvec, &old_bvpage);
>  
> -		if (page_type == Z_EROFS_VLE_PAGE_TYPE_HEAD)
> -			pagenr = 0;
> -		else
> -			pagenr = z_erofs_onlinepage_index(page);
> +		if (old_bvpage)
> +			z_erofs_put_shortlivedpage(pagepool, old_bvpage);
>  
> +		pagenr = (bvec.offset + pcl->pageofs_out) >> PAGE_SHIFT;
>  		DBG_BUGON(pagenr >= pcl->nr_pages);
> +		DBG_BUGON(z_erofs_page_is_invalidated(bvec.page));
>  		/*
>  		 * currently EROFS doesn't support multiref(dedup),
>  		 * so here erroring out one multiref page.
> @@ -814,9 +895,12 @@ static int z_erofs_parse_out_bvecs(struct z_erofs_pcluster *pcl,
>  			z_erofs_onlinepage_endio(pages[pagenr]);
>  			err = -EFSCORRUPTED;
>  		}
> -		pages[pagenr] = page;
> +		pages[pagenr] = bvec.page;
>  	}
> -	z_erofs_pagevec_ctor_exit(&ctor, true);
> +
> +	old_bvpage = z_erofs_bvec_iter_end(&biter);
> +	if (old_bvpage)
> +		z_erofs_put_shortlivedpage(pagepool, old_bvpage);
>  	return err;
>  }
>  
> @@ -986,6 +1070,7 @@ static int z_erofs_decompress_pcluster(struct super_block *sb,
>  		kvfree(pages);
>  
>  	pcl->nr_pages = 0;
> +	pcl->bvset.nextpage = NULL;
>  	pcl->vcnt = 0;
>  
>  	/* pcluster lock MUST be taken before the following line */
> diff --git a/fs/erofs/zdata.h b/fs/erofs/zdata.h
> index 58053bb5066f..d03e333e4fde 100644
> --- a/fs/erofs/zdata.h
> +++ b/fs/erofs/zdata.h
> @@ -21,6 +21,21 @@
>   */
>  typedef void *z_erofs_next_pcluster_t;
>  
> +struct z_erofs_bvec {
> +	struct page *page;
> +	int offset;
> +	unsigned int end;
> +};
> +
> +#define __Z_EROFS_BVSET(name, total) \
> +struct name { \
> +	/* point to the next page which contains the following bvecs */ \
> +	struct page *nextpage; \
> +	struct z_erofs_bvec bvec[total]; \
> +}
> +__Z_EROFS_BVSET(z_erofs_bvset,);
> +__Z_EROFS_BVSET(z_erofs_bvset_inline, Z_EROFS_NR_INLINE_PAGEVECS);
> +
>  /*
>   * Structure fields follow one of the following exclusion rules.
>   *
> @@ -41,22 +56,25 @@ struct z_erofs_pcluster {
>  	/* A: lower limit of decompressed length and if full length or not */
>  	unsigned int length;
>  
> +	/* L: total number of bvecs */
> +	unsigned int vcnt;
> +
>  	/* I: page offset of start position of decompression */
>  	unsigned short pageofs_out;
>  
>  	/* I: page offset of inline compressed data */
>  	unsigned short pageofs_in;
>  
> -	/* L: maximum relative page index in pagevec[] */
> +	/* L: maximum relative page index in bvecs */
>  	unsigned short nr_pages;
>  
> -	/* L: total number of pages in pagevec[] */
> -	unsigned int vcnt;
> -
>  	union {
>  		/* L: inline a certain number of pagevecs for bootstrap */
>  		erofs_vtptr_t pagevec[Z_EROFS_NR_INLINE_PAGEVECS];
>  
> +		/* L: inline a certain number of bvec for bootstrap */
> +		struct z_erofs_bvset_inline bvset;
> +
>  		/* I: can be used to free the pcluster by RCU. */
>  		struct rcu_head rcu;
>  	};



More information about the Linux-erofs mailing list