[PATCH 3/3] erofs-utils: lib: enhance erofs_rebuild_get_dentry with bloom filter

Gao Xiang hsiangkao at linux.alibaba.com
Fri Jul 19 18:19:00 AEST 2024



On 2024/7/18 13:40, Hongzhen Luo wrote:
> Add a bloom filter to exclude entries that are not in `pwd->i_subdirs`,
> thereby improving the performance of `erofs_rebuild_get_dentry`. Below
> are the results for different # of files in the same directory:
> 
> +---------+--------------------+
> | # files | time reduction (%) |
> +---------+--------------------+
> |   1e4   |         55%        |
> +---------+--------------------+
> |   1e5   |         98%        |
> +---------+--------------------+
> |   2e5   |         98%        |
> +---------+--------------------+
> |   3e5   |         99%        |
> +---------+--------------------+
> 
> Signed-off-by: Hongzhen Luo <hongzhen at linux.alibaba.com>
> ---
>   lib/rebuild.c | 61 ++++++++++++++++++++++++++++++++++++++++++++-------
>   lib/super.c   |  2 ++
>   mkfs/main.c   |  4 ++++
>   3 files changed, 59 insertions(+), 8 deletions(-)
> 
> diff --git a/lib/rebuild.c b/lib/rebuild.c
> index 9e8bf8f..3fd3ea0 100644
> --- a/lib/rebuild.c
> +++ b/lib/rebuild.c
> @@ -15,6 +15,7 @@
>   #include "erofs/xattr.h"
>   #include "erofs/blobchunk.h"
>   #include "erofs/internal.h"
> +#include "erofs/bloom_filter.h"
>   #include "liberofs_uuid.h"
>   
>   #ifdef HAVE_LINUX_AUFS_TYPE_H
> @@ -62,14 +63,48 @@ struct erofs_dentry *erofs_rebuild_get_dentry(struct erofs_inode *pwd,
>   		char *path, bool aufs, bool *whout, bool *opq, bool to_head)
>   {
>   	struct erofs_dentry *d = NULL;
> -	unsigned int len = strlen(path);
> -	char *s = path;
> +	unsigned int len, p = 0;
> +	char *s;
> +	struct erofs_sb_info *sbi = pwd->sbi;
> +	char buf[4096];
>   
>   	*whout = false;
>   	*opq = false;
>   
> +	s = pwd->i_srcpath;
> +	len = strlen(pwd->i_srcpath);
> +	/* normalize the pwd path, e.g., /./../xxx/yyy -> /xxx/yyy */
> +	buf[p++] = '/';
> +	while (s < pwd->i_srcpath + len) {

Why should we need that?


> +		char *slash = memchr(s, '/', pwd->i_srcpath + len - s);
> +		if (slash) {
> +			if (s == slash) {
> +				while(*++s == '/');
> +				continue;;
> +			}
> +			*slash = '\0';
> +		}
> +		if (memcmp(s, ".", 2) && memcmp(s, "..", 3)) {
> +			memcpy(buf + p, s, strlen(s));
> +			p += strlen(s);
> +			buf[p++] = '/';
> +
> +		}
> +		if (slash) {
> +			*slash = '/';
> +			s = slash + 1;
> +		} else{
> +			break;
> +		}
> +	}
> +	if (buf[p - 1] != '/')
> +		buf[p++] = '/';
> +
> +	s = path;
> +	len = strlen(path);
>   	while (s < path + len) {
>   		char *slash = memchr(s, '/', path + len - s);
> +		int bloom, slen;
>   
>   		if (slash) {
>   			if (s == slash) {
> @@ -97,13 +132,19 @@ struct erofs_dentry *erofs_rebuild_get_dentry(struct erofs_inode *pwd,
>   				}
>   			}
>   
> -			list_for_each_entry(d, &pwd->i_subdirs, d_child) {
> -				if (!strcmp(d->name, s)) {
> -					if (d->type != EROFS_FT_DIR && slash)
> -						return ERR_PTR(-EIO);
> -					inode = d->inode;
> -					break;
> +			slen = strlen(s);
> +			memcpy(buf + p, s, slen);
> +			p += slen;
> +			if ((bloom = erofs_bloom_test(sbi, buf, p)) > 0) {
> +				list_for_each_entry(d, &pwd->i_subdirs, d_child) {
> +					if (!strcmp(d->name, s)) {
> +						if (d->type != EROFS_FT_DIR && slash)
> +							return ERR_PTR(-EIO);
> +						inode = d->inode;
> +						break;
> +					}
>   				}
> +
>   			}
>   
>   			if (inode) {
> @@ -124,6 +165,10 @@ struct erofs_dentry *erofs_rebuild_get_dentry(struct erofs_inode *pwd,
>   					return d;
>   				pwd = d->inode;
>   			}
> +			if (!bloom && erofs_bloom_add(sbi, buf, p))
> +				return ERR_PTR(-EINVAL);
> +			if (slash)
> +				buf[p++] = '/';
>   		}
>   		if (slash) {
>   			*slash = '/';
> diff --git a/lib/super.c b/lib/super.c
> index 45233c4..7289236 100644
> --- a/lib/super.c
> +++ b/lib/super.c
> @@ -7,6 +7,7 @@
>   #include "erofs/print.h"
>   #include "erofs/xattr.h"
>   #include "erofs/cache.h"
> +#include "erofs/bloom_filter.h"
>   
>   static bool check_layout_compatibility(struct erofs_sb_info *sbi,
>   				       struct erofs_super_block *dsb)
> @@ -153,6 +154,7 @@ void erofs_put_super(struct erofs_sb_info *sbi)
>   		erofs_buffer_exit(sbi->bmgr);
>   		sbi->bmgr = NULL;
>   	}
> +	erofs_bloom_exit(sbi);
>   }
>   
>   int erofs_writesb(struct erofs_sb_info *sbi, struct erofs_buffer_head *sb_bh,
> diff --git a/mkfs/main.c b/mkfs/main.c
> index 20f12fc..27a2ea0 100644
> --- a/mkfs/main.c
> +++ b/mkfs/main.c
> @@ -31,6 +31,7 @@
>   #include "../lib/liberofs_private.h"
>   #include "../lib/liberofs_uuid.h"
>   #include "../lib/compressor.h"
> +#include "erofs/bloom_filter.h"
>   
>   static struct option long_options[] = {
>   	{"version", no_argument, 0, 'V'},
> @@ -1344,6 +1345,9 @@ int main(int argc, char **argv)
>   	}
>   
>   	erofs_inode_manager_init();
> +	srand(time(NULL));
> +	/* 1000000 should be enough */
> +	erofs_bloom_init(&g_sbi, 5, 1000000, rand());

I don't want to add any new init function anymore, please wrap
all init functions into a common one.

Also what's the meaning of hard-coded 1000000?

Thanks,
Gao Xiang


More information about the Linux-erofs mailing list