[PATCH 3/3] erofs-utils: lib: enhance erofs_rebuild_get_dentry with bloom filter
Hongzhen Luo
hongzhen at linux.alibaba.com
Mon Jul 22 12:25:35 AEST 2024
On 2024/7/19 16:19, Gao Xiang wrote:
>
>
> On 2024/7/18 13:40, Hongzhen Luo wrote:
>> Add a bloom filter to exclude entries that are not in `pwd->i_subdirs`,
>> thereby improving the performance of `erofs_rebuild_get_dentry`. Below
>> are the results for different # of files in the same directory:
>>
>> +---------+--------------------+
>> | # files | time reduction (%) |
>> +---------+--------------------+
>> | 1e4 | 55% |
>> +---------+--------------------+
>> | 1e5 | 98% |
>> +---------+--------------------+
>> | 2e5 | 98% |
>> +---------+--------------------+
>> | 3e5 | 99% |
>> +---------+--------------------+
>>
>> Signed-off-by: Hongzhen Luo <hongzhen at linux.alibaba.com>
>> ---
>> lib/rebuild.c | 61 ++++++++++++++++++++++++++++++++++++++++++++-------
>> lib/super.c | 2 ++
>> mkfs/main.c | 4 ++++
>> 3 files changed, 59 insertions(+), 8 deletions(-)
>>
>> diff --git a/lib/rebuild.c b/lib/rebuild.c
>> index 9e8bf8f..3fd3ea0 100644
>> --- a/lib/rebuild.c
>> +++ b/lib/rebuild.c
>> @@ -15,6 +15,7 @@
>> #include "erofs/xattr.h"
>> #include "erofs/blobchunk.h"
>> #include "erofs/internal.h"
>> +#include "erofs/bloom_filter.h"
>> #include "liberofs_uuid.h"
>> #ifdef HAVE_LINUX_AUFS_TYPE_H
>> @@ -62,14 +63,48 @@ struct erofs_dentry
>> *erofs_rebuild_get_dentry(struct erofs_inode *pwd,
>> char *path, bool aufs, bool *whout, bool *opq, bool to_head)
>> {
>> struct erofs_dentry *d = NULL;
>> - unsigned int len = strlen(path);
>> - char *s = path;
>> + unsigned int len, p = 0;
>> + char *s;
>> + struct erofs_sb_info *sbi = pwd->sbi;
>> + char buf[4096];
>> *whout = false;
>> *opq = false;
>> + s = pwd->i_srcpath;
>> + len = strlen(pwd->i_srcpath);
>> + /* normalize the pwd path, e.g., /./../xxx/yyy -> /xxx/yyy */
>> + buf[p++] = '/';
>> + while (s < pwd->i_srcpath + len) {
>
> Why should we need that?
>
We need that because in the incremental build scenario, the `i_srcpath`
of the inodes in tarerofs_parse_tar() could be "denormalized" (e.g.,
./data/1.txt) while the path in erofs_rebuild_load_basedir() is
"normalized" (e.g., /data/1.txt). This inconsistency might cause the
bloom filter to determine that the path does not exist, thereby
preventing it from traversing the `/data` directory, which would result
in an error.
>> + char *slash = memchr(s, '/', pwd->i_srcpath + len - s);
>> + if (slash) {
>> + if (s == slash) {
>> + while(*++s == '/');
>> + continue;;
>> + }
>> + *slash = '\0';
>> + }
>> + if (memcmp(s, ".", 2) && memcmp(s, "..", 3)) {
>> + memcpy(buf + p, s, strlen(s));
>> + p += strlen(s);
>> + buf[p++] = '/';
>> +
>> + }
>> + if (slash) {
>> + *slash = '/';
>> + s = slash + 1;
>> + } else{
>> + break;
>> + }
>> + }
>> + if (buf[p - 1] != '/')
>> + buf[p++] = '/';
>> +
>> + s = path;
>> + len = strlen(path);
>> while (s < path + len) {
>> char *slash = memchr(s, '/', path + len - s);
>> + int bloom, slen;
>> if (slash) {
>> if (s == slash) {
>> @@ -97,13 +132,19 @@ struct erofs_dentry
>> *erofs_rebuild_get_dentry(struct erofs_inode *pwd,
>> }
>> }
>> - list_for_each_entry(d, &pwd->i_subdirs, d_child) {
>> - if (!strcmp(d->name, s)) {
>> - if (d->type != EROFS_FT_DIR && slash)
>> - return ERR_PTR(-EIO);
>> - inode = d->inode;
>> - break;
>> + slen = strlen(s);
>> + memcpy(buf + p, s, slen);
>> + p += slen;
>> + if ((bloom = erofs_bloom_test(sbi, buf, p)) > 0) {
>> + list_for_each_entry(d, &pwd->i_subdirs, d_child) {
>> + if (!strcmp(d->name, s)) {
>> + if (d->type != EROFS_FT_DIR && slash)
>> + return ERR_PTR(-EIO);
>> + inode = d->inode;
>> + break;
>> + }
>> }
>> +
>> }
>> if (inode) {
>> @@ -124,6 +165,10 @@ struct erofs_dentry
>> *erofs_rebuild_get_dentry(struct erofs_inode *pwd,
>> return d;
>> pwd = d->inode;
>> }
>> + if (!bloom && erofs_bloom_add(sbi, buf, p))
>> + return ERR_PTR(-EINVAL);
>> + if (slash)
>> + buf[p++] = '/';
>> }
>> if (slash) {
>> *slash = '/';
>> diff --git a/lib/super.c b/lib/super.c
>> index 45233c4..7289236 100644
>> --- a/lib/super.c
>> +++ b/lib/super.c
>> @@ -7,6 +7,7 @@
>> #include "erofs/print.h"
>> #include "erofs/xattr.h"
>> #include "erofs/cache.h"
>> +#include "erofs/bloom_filter.h"
>> static bool check_layout_compatibility(struct erofs_sb_info *sbi,
>> struct erofs_super_block *dsb)
>> @@ -153,6 +154,7 @@ void erofs_put_super(struct erofs_sb_info *sbi)
>> erofs_buffer_exit(sbi->bmgr);
>> sbi->bmgr = NULL;
>> }
>> + erofs_bloom_exit(sbi);
>> }
>> int erofs_writesb(struct erofs_sb_info *sbi, struct
>> erofs_buffer_head *sb_bh,
>> diff --git a/mkfs/main.c b/mkfs/main.c
>> index 20f12fc..27a2ea0 100644
>> --- a/mkfs/main.c
>> +++ b/mkfs/main.c
>> @@ -31,6 +31,7 @@
>> #include "../lib/liberofs_private.h"
>> #include "../lib/liberofs_uuid.h"
>> #include "../lib/compressor.h"
>> +#include "erofs/bloom_filter.h"
>> static struct option long_options[] = {
>> {"version", no_argument, 0, 'V'},
>> @@ -1344,6 +1345,9 @@ int main(int argc, char **argv)
>> }
>> erofs_inode_manager_init();
>> + srand(time(NULL));
>> + /* 1000000 should be enough */
>> + erofs_bloom_init(&g_sbi, 5, 1000000, rand());
>
> I don't want to add any new init function anymore, please wrap
> all init functions into a common one.
>
Sure. I plan to use an erofs_init_super() function to initialize the
relevant fields of sbi. However, other initialization functions like
erofs_buffer_init() are coupled with the main() function, so it might
not be easy to wrap them into this function.
> Also what's the meaning of hard-coded 1000000?
>
How about adding an option to mkfs to allow configuring the number of
hash functions and the capacity of the bloom filter? If the user does
not specify, a default value will be set. Below is the implementation:
struct erofs_configure {
// ...
u32 c_bloom_nrfuc;
unsigned long c_bloom_bitsize;
};
Thanks,
Hongzhen Luo
> Thanks,
> Gao Xiang
More information about the Linux-erofs
mailing list