[PATCH 2/3] erofs-utils: lib: add bloom filter
Hongzhen Luo
hongzhen at linux.alibaba.com
Thu Jul 18 15:40:24 AEST 2024
Introduce the following bloom filter helpers:
erofs_bloom_init
erofs_bloom_add
erofs_bloom_test
erofs_bloom_exit
Signed-off-by: Hongzhen Luo <hongzhen at linux.alibaba.com>
---
include/erofs/bloom_filter.h | 30 ++++++++++++
include/erofs/internal.h | 2 +
lib/Makefile.am | 2 +-
lib/bloom_filter.c | 92 ++++++++++++++++++++++++++++++++++++
4 files changed, 125 insertions(+), 1 deletion(-)
create mode 100644 include/erofs/bloom_filter.h
create mode 100644 lib/bloom_filter.c
diff --git a/include/erofs/bloom_filter.h b/include/erofs/bloom_filter.h
new file mode 100644
index 0000000..a0915e4
--- /dev/null
+++ b/include/erofs/bloom_filter.h
@@ -0,0 +1,30 @@
+/* SPDX-License-Identifier: GPL-2.0+ OR Apache-2.0 */
+#ifndef __EROFS_BLOOM_FILTER_H
+#define __EROFS_BLOOM_FILTER_H
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+#include "internal.h"
+#include "bitops.h"
+
+/*
+ * In-memory bloom filter state; a superblock refers to it through sbi->bf.
+ */
+struct erofs_bloom_filter {
+ /* bit array; bmap.size is a power of two chosen by erofs_bloom_init() */
+ struct bitmap bmap;
+ /* bmap.size - 1, ANDed with each hash value to obtain a bit index */
+ unsigned long bitmap_mask;
+ /* base xxh32 seed; the i-th hash function is seeded with hash_seed + i */
+ u32 hash_seed;
+ /* number of hash functions applied to every key */
+ u32 nr_funcs;
+};
+
+/*
+ * Allocate a bloom filter sized for @entries keys hashed by @nr_funcs hash
+ * functions and store it in sbi->bf. Returns 0 on success or a negative
+ * errno on failure.
+ */
+int erofs_bloom_init(struct erofs_sb_info *sbi, u32 nr_funcs,
+ unsigned long entries, u32 seed);
+/* Set the bits of a key; returns 0, or -EINVAL if sbi->bf is unset. */
+long erofs_bloom_add(struct erofs_sb_info *sbi, void *data, size_t length);
+/*
+ * Returns 1 if the key might be in the filter, 0 if it is not, or -EINVAL
+ * if sbi->bf is unset.
+ */
+long erofs_bloom_test(struct erofs_sb_info *sbi, void *data, size_t length);
+/* Free sbi->bf (if any) and reset it to NULL. */
+void erofs_bloom_exit(struct erofs_sb_info *sbi);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
\ No newline at end of file
diff --git a/include/erofs/internal.h b/include/erofs/internal.h
index 708e51e..d3dd676 100644
--- a/include/erofs/internal.h
+++ b/include/erofs/internal.h
@@ -74,6 +74,7 @@ struct erofs_xattr_prefix_item {
#define EROFS_PACKED_NID_UNALLOCATED -1
struct erofs_mkfs_dfops;
+struct erofs_bloom_filter;
struct erofs_sb_info {
struct erofs_device_info *devs;
char *devname;
@@ -134,6 +135,7 @@ struct erofs_sb_info {
#endif
struct erofs_bufmgr *bmgr;
bool useqpl;
+ struct erofs_bloom_filter *bf;
};
#define EROFS_SUPER_END (EROFS_SUPER_OFFSET + sizeof(struct erofs_super_block))
diff --git a/lib/Makefile.am b/lib/Makefile.am
index 6b52470..78140e7 100644
--- a/lib/Makefile.am
+++ b/lib/Makefile.am
@@ -35,7 +35,7 @@ liberofs_la_SOURCES = config.c io.c cache.c super.c inode.c xattr.c exclude.c \
namei.c data.c compress.c compressor.c zmap.c decompress.c \
compress_hints.c hashmap.c sha256.c blobchunk.c dir.c \
fragments.c dedupe.c uuid_unparse.c uuid.c tar.c \
- block_list.c xxhash.c rebuild.c diskbuf.c
+ block_list.c xxhash.c rebuild.c diskbuf.c bloom_filter.c
liberofs_la_CFLAGS = -Wall ${libuuid_CFLAGS} -I$(top_srcdir)/include
if ENABLE_LZ4
diff --git a/lib/bloom_filter.c b/lib/bloom_filter.c
new file mode 100644
index 0000000..c460261
--- /dev/null
+++ b/lib/bloom_filter.c
@@ -0,0 +1,92 @@
+// SPDX-License-Identifier: GPL-2.0+ OR Apache-2.0
+#include "erofs/bloom_filter.h"
+#include "xxhash.h"
+#include <stdlib.h>
+
+/*
+ * Compute the index-th hash of a key and reduce it to a bit position.
+ *
+ * Each hash function is xxh32 seeded with hash_seed + index; the result is
+ * masked with bitmap_mask to produce the returned bit index.
+ */
+static u32 erofs_bloom_hash(struct erofs_bloom_filter *bf, void *data,
+			    size_t length, u32 index)
+{
+	const u32 seed = bf->hash_seed + index;
+
+	return xxh32(data, length, seed) & bf->bitmap_mask;
+}
+
+/*
+ * Create the bloom filter for @sbi.
+ *
+ * The bit array size that minimizes the false-positive rate is
+ * m * k / ln(2), where m is the number of elements inserted into the
+ * filter and k is the number of hash functions; 1.44 approximates
+ * 1 / ln(2). The size is then rounded up to a power of two so that a hash
+ * value can be reduced to a bit index with a simple mask.
+ *
+ * @sbi:      superblock info that receives the new filter in sbi->bf
+ * @nr_funcs: number of hash functions (must be non-zero)
+ * @entries:  expected number of inserted keys (must be non-zero)
+ * @seed:     base seed for the hash functions
+ *
+ * Return: 0 on success, -EINVAL if @nr_funcs or @entries is zero, or
+ * -ENOMEM if an allocation fails.
+ */
+int erofs_bloom_init(struct erofs_sb_info *sbi, u32 nr_funcs,
+		     unsigned long entries, u32 seed)
+{
+	struct erofs_bloom_filter *bf;
+
+	/*
+	 * A filter without hash functions would report every key as present,
+	 * and a zero-sized request would pass 0 to roundup_pow_of_two().
+	 * NOTE(review): roundup_pow_of_two(0) is assumed to be undefined, as
+	 * with the kernel helper of the same name -- confirm.
+	 */
+	if (!nr_funcs || !entries)
+		return -EINVAL;
+
+	bf = calloc(1, sizeof(*bf));
+	if (!bf)
+		return -ENOMEM;
+
+	bf->nr_funcs = nr_funcs;
+	bf->hash_seed = seed;
+	bf->bmap.size = roundup_pow_of_two(((long)(entries * nr_funcs * 1.44)));
+	bf->bitmap_mask = bf->bmap.size - 1;
+	bf->bmap.map = calloc(BITS_TO_LONGS(bf->bmap.size), sizeof(long));
+	if (!bf->bmap.map) {
+		free(bf);
+		return -ENOMEM;
+	}
+	sbi->bf = bf;
+
+	return 0;
+}
+
+/*
+ * Insert a key into the bloom filter of @sbi.
+ *
+ * Sets one bit per hash function. Returns 0 on success or -EINVAL when no
+ * filter is present in sbi->bf.
+ */
+long erofs_bloom_add(struct erofs_sb_info *sbi, void *data, size_t length)
+{
+	struct erofs_bloom_filter *filter = sbi->bf;
+	u32 k, bit;
+
+	if (!filter)
+		return -EINVAL;
+
+	for (k = 0; k < filter->nr_funcs; ++k) {
+		bit = erofs_bloom_hash(filter, data, length, k);
+		set_bit(bit, filter->bmap.map);
+	}
+
+	return 0;
+}
+
+/*
+ * Query the bloom filter of @sbi for a key.
+ *
+ * Return: a negative error code on failure (-EINVAL when sbi->bf is unset),
+ * 0 if the key is not in the bloom filter, or 1 if the key might be in the
+ * bloom filter.
+ */
+long erofs_bloom_test(struct erofs_sb_info *sbi, void *data, size_t length)
+{
+	struct erofs_bloom_filter *filter = sbi->bf;
+	u32 k, bit;
+
+	if (!filter)
+		return -EINVAL;
+
+	/* A single clear bit proves the key was never added. */
+	for (k = 0; k < filter->nr_funcs; ++k) {
+		bit = erofs_bloom_hash(filter, data, length, k);
+		if (!test_bit(bit, filter->bmap.map))
+			return 0;
+	}
+
+	return 1;
+}
+
+/*
+ * Tear down the bloom filter of @sbi, if one exists.
+ *
+ * Frees the bitmap and the filter itself, then clears sbi->bf so that a
+ * repeated call finds nothing to free.
+ */
+void erofs_bloom_exit(struct erofs_sb_info *sbi)
+{
+	struct erofs_bloom_filter *bf = sbi->bf;
+
+	if (!bf)
+		return;
+
+	/* free(NULL) is a no-op, so the bitmap pointer needs no guard. */
+	free(bf->bmap.map);
+	free(bf);
+	sbi->bf = NULL;
+}
\ No newline at end of file
--
2.43.5
More information about the Linux-erofs
mailing list