[PATCH 3/3] erofs-utils: lib: enhance erofs_rebuild_get_dentry with bloom filter

Hongzhen Luo hongzhen at linux.alibaba.com
Mon Jul 22 12:25:35 AEST 2024


On 2024/7/19 16:19, Gao Xiang wrote:
>
>
> On 2024/7/18 13:40, Hongzhen Luo wrote:
>> Add a bloom filter to exclude entries that are not in `pwd->i_subdirs`,
>> thereby improving the performance of `erofs_rebuild_get_dentry`. Below
>> are the results for different # of files in the same directory:
>>
>> +---------+--------------------+
>> | # files | time reduction (%) |
>> +---------+--------------------+
>> |   1e4   |         55%        |
>> +---------+--------------------+
>> |   1e5   |         98%        |
>> +---------+--------------------+
>> |   2e5   |         98%        |
>> +---------+--------------------+
>> |   3e5   |         99%        |
>> +---------+--------------------+
>>
>> Signed-off-by: Hongzhen Luo <hongzhen at linux.alibaba.com>
>> ---
>>   lib/rebuild.c | 61 ++++++++++++++++++++++++++++++++++++++++++++-------
>>   lib/super.c   |  2 ++
>>   mkfs/main.c   |  4 ++++
>>   3 files changed, 59 insertions(+), 8 deletions(-)
>>
>> diff --git a/lib/rebuild.c b/lib/rebuild.c
>> index 9e8bf8f..3fd3ea0 100644
>> --- a/lib/rebuild.c
>> +++ b/lib/rebuild.c
>> @@ -15,6 +15,7 @@
>>   #include "erofs/xattr.h"
>>   #include "erofs/blobchunk.h"
>>   #include "erofs/internal.h"
>> +#include "erofs/bloom_filter.h"
>>   #include "liberofs_uuid.h"
>>     #ifdef HAVE_LINUX_AUFS_TYPE_H
>> @@ -62,14 +63,48 @@ struct erofs_dentry 
>> *erofs_rebuild_get_dentry(struct erofs_inode *pwd,
>>           char *path, bool aufs, bool *whout, bool *opq, bool to_head)
>>   {
>>       struct erofs_dentry *d = NULL;
>> -    unsigned int len = strlen(path);
>> -    char *s = path;
>> +    unsigned int len, p = 0;
>> +    char *s;
>> +    struct erofs_sb_info *sbi = pwd->sbi;
>> +    char buf[4096];
>>         *whout = false;
>>       *opq = false;
>>   +    s = pwd->i_srcpath;
>> +    len = strlen(pwd->i_srcpath);
>> +    /* normalize the pwd path, e.g., /./../xxx/yyy -> /xxx/yyy */
>> +    buf[p++] = '/';
>> +    while (s < pwd->i_srcpath + len) {
>
> Why should we need that?
>
We need that because in the incremental build scenario, the `i_srcpath` 
of the inodes in tarerofs_parse_tar() could be "denormalized" (e.g., 
./data/1.txt) while the path in erofs_rebuild_load_basedir() is 
"normalized" (e.g., /data/1.txt). This inconsistency might cause the 
bloom filter to determine that the path does not exist, thereby 
preventing it from traversing the `/data` directory, which would result 
in an error.
>> +        char *slash = memchr(s, '/', pwd->i_srcpath + len - s);
>> +        if (slash) {
>> +            if (s == slash) {
>> +                while(*++s == '/');
>> +                continue;;
>> +            }
>> +            *slash = '\0';
>> +        }
>> +        if (memcmp(s, ".", 2) && memcmp(s, "..", 3)) {
>> +            memcpy(buf + p, s, strlen(s));
>> +            p += strlen(s);
>> +            buf[p++] = '/';
>> +
>> +        }
>> +        if (slash) {
>> +            *slash = '/';
>> +            s = slash + 1;
>> +        } else{
>> +            break;
>> +        }
>> +    }
>> +    if (buf[p - 1] != '/')
>> +        buf[p++] = '/';
>> +
>> +    s = path;
>> +    len = strlen(path);
>>       while (s < path + len) {
>>           char *slash = memchr(s, '/', path + len - s);
>> +        int bloom, slen;
>>             if (slash) {
>>               if (s == slash) {
>> @@ -97,13 +132,19 @@ struct erofs_dentry 
>> *erofs_rebuild_get_dentry(struct erofs_inode *pwd,
>>                   }
>>               }
>>   -            list_for_each_entry(d, &pwd->i_subdirs, d_child) {
>> -                if (!strcmp(d->name, s)) {
>> -                    if (d->type != EROFS_FT_DIR && slash)
>> -                        return ERR_PTR(-EIO);
>> -                    inode = d->inode;
>> -                    break;
>> +            slen = strlen(s);
>> +            memcpy(buf + p, s, slen);
>> +            p += slen;
>> +            if ((bloom = erofs_bloom_test(sbi, buf, p)) > 0) {
>> +                list_for_each_entry(d, &pwd->i_subdirs, d_child) {
>> +                    if (!strcmp(d->name, s)) {
>> +                        if (d->type != EROFS_FT_DIR && slash)
>> +                            return ERR_PTR(-EIO);
>> +                        inode = d->inode;
>> +                        break;
>> +                    }
>>                   }
>> +
>>               }
>>                 if (inode) {
>> @@ -124,6 +165,10 @@ struct erofs_dentry 
>> *erofs_rebuild_get_dentry(struct erofs_inode *pwd,
>>                       return d;
>>                   pwd = d->inode;
>>               }
>> +            if (!bloom && erofs_bloom_add(sbi, buf, p))
>> +                return ERR_PTR(-EINVAL);
>> +            if (slash)
>> +                buf[p++] = '/';
>>           }
>>           if (slash) {
>>               *slash = '/';
>> diff --git a/lib/super.c b/lib/super.c
>> index 45233c4..7289236 100644
>> --- a/lib/super.c
>> +++ b/lib/super.c
>> @@ -7,6 +7,7 @@
>>   #include "erofs/print.h"
>>   #include "erofs/xattr.h"
>>   #include "erofs/cache.h"
>> +#include "erofs/bloom_filter.h"
>>     static bool check_layout_compatibility(struct erofs_sb_info *sbi,
>>                          struct erofs_super_block *dsb)
>> @@ -153,6 +154,7 @@ void erofs_put_super(struct erofs_sb_info *sbi)
>>           erofs_buffer_exit(sbi->bmgr);
>>           sbi->bmgr = NULL;
>>       }
>> +    erofs_bloom_exit(sbi);
>>   }
>>     int erofs_writesb(struct erofs_sb_info *sbi, struct 
>> erofs_buffer_head *sb_bh,
>> diff --git a/mkfs/main.c b/mkfs/main.c
>> index 20f12fc..27a2ea0 100644
>> --- a/mkfs/main.c
>> +++ b/mkfs/main.c
>> @@ -31,6 +31,7 @@
>>   #include "../lib/liberofs_private.h"
>>   #include "../lib/liberofs_uuid.h"
>>   #include "../lib/compressor.h"
>> +#include "erofs/bloom_filter.h"
>>     static struct option long_options[] = {
>>       {"version", no_argument, 0, 'V'},
>> @@ -1344,6 +1345,9 @@ int main(int argc, char **argv)
>>       }
>>         erofs_inode_manager_init();
>> +    srand(time(NULL));
>> +    /* 1000000 should be enough */
>> +    erofs_bloom_init(&g_sbi, 5, 1000000, rand());
>
> I don't want to add any new init function anymore, please wrap
> all init functions into a common one.
>
Sure. I plan to use an erofs_init_super() function to initialize the 
relevant fields of sbi. However, other initialization functions like 
erofs_buffer_init() are coupled with the main() function, so it might 
not be easy to wrap them into this function.
> Also what's the meaning of hard-coded 1000000?
>
How about adding an option to mkfs to allow configuring the number of 
hash functions and the capacity of the bloom filter? If the user does 
not specify, a default value will be set. Below is the implementation:

struct erofs_configure {

// ...

u32 c_bloom_nrfuc;

unsigned long c_bloom_bitsize;

  };

Thanks,

Hongzhen Luo


> Thanks,
> Gao Xiang


More information about the Linux-erofs mailing list