[PATCH 2/3] erofs-utils: lib: add bloom filter

Gao Xiang hsiangkao at linux.alibaba.com
Fri Jul 19 18:16:32 AEST 2024



On 2024/7/18 13:40, Hongzhen Luo wrote:
> Introduce following bloom filter helpers:
> 
> 	erofs_bloom_init
> 	erofs_bloom_add
> 	erofs_bloom_test
> 	erofs_bloom_exit
> 
> Signed-off-by: Hongzhen Luo <hongzhen at linux.alibaba.com>
> ---
>   include/erofs/bloom_filter.h | 30 ++++++++++++
>   include/erofs/internal.h     |  2 +
>   lib/Makefile.am              |  2 +-
>   lib/bloom_filter.c           | 92 ++++++++++++++++++++++++++++++++++++
>   4 files changed, 125 insertions(+), 1 deletion(-)
>   create mode 100644 include/erofs/bloom_filter.h
>   create mode 100644 lib/bloom_filter.c
> 
> diff --git a/include/erofs/bloom_filter.h b/include/erofs/bloom_filter.h
> new file mode 100644
> index 0000000..a0915e4
> --- /dev/null
> +++ b/include/erofs/bloom_filter.h
> @@ -0,0 +1,30 @@
> +/* SPDX-License-Identifier: GPL-2.0+ OR Apache-2.0 */
> +#ifndef __EROFS_BLOOM_FILTER_H
> +#define __EROFS_BLOOM_FILTER_H
> +
> +#ifdef __cplusplus
> +extern "C"
> +{
> +#endif
> +
> +#include "internal.h"
> +#include "bitops.h"
> +
> +struct erofs_bloom_filter {
> +        struct bitmap bmap;
> +        unsigned long bitmap_mask;
> +        u32 hash_seed;
> +        u32 nr_funcs;
> +};
> +
> +int erofs_bloom_init(struct erofs_sb_info *sbi, u32 nr_funcs,
> +                     unsigned long entries, u32 seed);
> +long erofs_bloom_add(struct erofs_sb_info *sbi, void *data, size_t length);
> +long erofs_bloom_test(struct erofs_sb_info *sbi, void *data, size_t length);
> +void erofs_bloom_exit(struct erofs_sb_info *sbi);
> +
> +#ifdef __cplusplus
> +}
> +#endif
> +
> +#endif
> \ No newline at end of file

What editor are you using?  It lacks a blank line at the end of line.


> diff --git a/include/erofs/internal.h b/include/erofs/internal.h
> index 708e51e..d3dd676 100644
> --- a/include/erofs/internal.h
> +++ b/include/erofs/internal.h
> @@ -74,6 +74,7 @@ struct erofs_xattr_prefix_item {
>   #define EROFS_PACKED_NID_UNALLOCATED	-1
>   
>   struct erofs_mkfs_dfops;
> +struct erofs_bloom_filter;
>   struct erofs_sb_info {
>   	struct erofs_device_info *devs;
>   	char *devname;
> @@ -134,6 +135,7 @@ struct erofs_sb_info {
>   #endif
>   	struct erofs_bufmgr *bmgr;
>   	bool useqpl;
> +	struct erofs_bloom_filter *bf;
>   };
>   
>   #define EROFS_SUPER_END (EROFS_SUPER_OFFSET + sizeof(struct erofs_super_block))
> diff --git a/lib/Makefile.am b/lib/Makefile.am
> index 6b52470..78140e7 100644
> --- a/lib/Makefile.am
> +++ b/lib/Makefile.am
> @@ -35,7 +35,7 @@ liberofs_la_SOURCES = config.c io.c cache.c super.c inode.c xattr.c exclude.c \
>   		      namei.c data.c compress.c compressor.c zmap.c decompress.c \
>   		      compress_hints.c hashmap.c sha256.c blobchunk.c dir.c \
>   		      fragments.c dedupe.c uuid_unparse.c uuid.c tar.c \
> -		      block_list.c xxhash.c rebuild.c diskbuf.c
> +		      block_list.c xxhash.c rebuild.c diskbuf.c bloom_filter.c
>   
>   liberofs_la_CFLAGS = -Wall ${libuuid_CFLAGS} -I$(top_srcdir)/include
>   if ENABLE_LZ4
> diff --git a/lib/bloom_filter.c b/lib/bloom_filter.c
> new file mode 100644
> index 0000000..c460261
> --- /dev/null
> +++ b/lib/bloom_filter.c
> @@ -0,0 +1,92 @@
> +// SPDX-License-Identifier: GPL-2.0+ OR Apache-2.0
> +#include "erofs/bloom_filter.h"
> +#include "xxhash.h"
> +#include <stdlib.h>
> +
> +static u32 erofs_bloom_hash(struct erofs_bloom_filter *bf, void *data,
> +                            size_t length, u32 index)
> +{
> +        u32 h;
> +
> +        h = xxh32(data, length, bf->hash_seed + index);
> +        return h & bf->bitmap_mask;
> +}
> +
> +/* The optimal bit array size that minimizes the false positive is
> + * m * k / ln(2) where m is the # of elements inserted into the bloom
> + * filter and k is the # of hash functions. Here, 1.44 is used to approximate
> + * 1 / ln(2).
> + */

Wrong comment style.

> +int erofs_bloom_init(struct erofs_sb_info *sbi, u32 nr_funcs,
> +                     unsigned long entries, u32 seed)
> +{
> +        struct erofs_bloom_filter *bf;
> +
> +        bf = calloc(1, sizeof(struct erofs_bloom_filter));
> +        if (!bf)
> +                return -EINVAL;
> +
> +        bf->nr_funcs = nr_funcs;
> +        bf->hash_seed = seed;
> +        bf->bmap.size = roundup_pow_of_two(((long)(entries * nr_funcs * 1.44)));
> +        bf->bitmap_mask = bf->bmap.size - 1;
> +        bf->bmap.map = calloc(BITS_TO_LONGS(bf->bmap.size), sizeof(long));
> +        if (!bf->bmap.map) {
> +                free(bf);
> +                return -ENOMEM;
> +        }
> +        sbi->bf = bf;
> +
> +        return 0;
> +}
> +
> +long erofs_bloom_add(struct erofs_sb_info *sbi, void *data, size_t length)
> +{
> +        u32 i, h;
> +        struct erofs_bloom_filter *bf;

reverse chrismas tree makes this better, like:
	struct erofs_bloom_filter *bf;
	u32 i, h;

> +
> +        bf = sbi->bf;
> +        if (!bf)
> +                return -EINVAL;
> +
> +        for (i = 0; i < bf->nr_funcs; i ++) {
> +                h = erofs_bloom_hash(bf, data, length, i);
> +                set_bit(h, bf->bmap.map);
> +        }
> +
> +        return 0;
> +}
> +
> +/*
> + * Return negative error code on failure, 0 if the key is not in the bloom filter
> + * and 1 if the key might be in the bloom filter.
> + */
> +long erofs_bloom_test(struct erofs_sb_info *sbi, void *data, size_t length)
> +{
> +        u32 i, h;
> +        struct erofs_bloom_filter *bf;
> +
> +        bf = sbi->bf;
> +        if (!bf)
> +                return -EINVAL;
> +
> +        for (i = 0; i < bf->nr_funcs; i ++) {
> +                h = erofs_bloom_hash(bf, data, length, i);
> +                if (!test_bit(h, bf->bmap.map))
> +                        return 0;
> +        }
> +
> +        return 1;
> +}
> +
> +void erofs_bloom_exit(struct erofs_sb_info *sbi)
> +{
> +        if (sbi->bf) {
> +                if (sbi->bf->bmap.map) {
> +                        free(sbi->bf->bmap.map);
> +                        sbi->bf->bmap.map = NULL;
> +                }
> +                free(sbi->bf);
> +                sbi->bf = NULL;
> +        }
> +}
> \ No newline at end of file

Same here.


More information about the Linux-erofs mailing list