[PATCH v2] erofs-utils: lib: treat data blocks filled with 0s as a hole

Sandeep Dhavale dhavale at google.com
Wed Apr 10 08:14:30 AEST 2024


Add optimization to treat data blocks filled with 0s as a hole.
Even though diskspace savings are comparable to chunk based or dedupe,
having no block assigned saves us redundant disk IOs during read.

To detect blocks filled with zeros during chunking, we insert block
filled with zeros (zerochunk) in the hashmap. If we detect a possible
dedupe, we map it to the hole so there is no physical block assigned.

Signed-off-by: Sandeep Dhavale <dhavale at google.com>
---
Changes since v1:
	- Instead of checking every block for 0s word by word,
	  add a zerochunk in blobs during init. So we effectively
	  detect the zero blocks by comparing the hash.
 include/erofs/blobchunk.h |  2 +-
 lib/blobchunk.c           | 41 ++++++++++++++++++++++++++++++++++++---
 mkfs/main.c               |  2 +-
 3 files changed, 40 insertions(+), 5 deletions(-)

diff --git a/include/erofs/blobchunk.h b/include/erofs/blobchunk.h
index a674640..ebe2efe 100644
--- a/include/erofs/blobchunk.h
+++ b/include/erofs/blobchunk.h
@@ -23,7 +23,7 @@ int erofs_write_zero_inode(struct erofs_inode *inode);
 int tarerofs_write_chunkes(struct erofs_inode *inode, erofs_off_t data_offset);
 int erofs_mkfs_dump_blobs(struct erofs_sb_info *sbi);
 void erofs_blob_exit(void);
-int erofs_blob_init(const char *blobfile_path);
+int erofs_blob_init(const char *blobfile_path, erofs_off_t chunksize);
 int erofs_mkfs_init_devices(struct erofs_sb_info *sbi, unsigned int devices);
 
 #ifdef __cplusplus
diff --git a/lib/blobchunk.c b/lib/blobchunk.c
index 641e3d4..87c153f 100644
--- a/lib/blobchunk.c
+++ b/lib/blobchunk.c
@@ -323,13 +323,21 @@ int erofs_blob_write_chunked_file(struct erofs_inode *inode, int fd,
 			ret = -EIO;
 			goto err;
 		}
-
 		chunk = erofs_blob_getchunk(sbi, chunkdata, len);
 		if (IS_ERR(chunk)) {
 			ret = PTR_ERR(chunk);
 			goto err;
 		}
 
+		if (chunk->blkaddr == erofs_holechunk.blkaddr) {
+			*(void **)idx++ = &erofs_holechunk;
+			erofs_update_minextblks(sbi, interval_start, pos,
+						&minextblks);
+			interval_start = pos + len;
+			lastch = NULL;
+			continue;
+		}
+
 		if (lastch && (lastch->device_id != chunk->device_id ||
 		    erofs_pos(sbi, lastch->blkaddr) + lastch->chunksize !=
 		    erofs_pos(sbi, chunk->blkaddr))) {
@@ -540,7 +548,34 @@ void erofs_blob_exit(void)
 	}
 }
 
-int erofs_blob_init(const char *blobfile_path)
+static int erofs_insert_zerochunk(erofs_off_t chunksize)
+{
+	u8 *zeros;
+	struct erofs_blobchunk *chunk;
+	u8 sha256[32];
+	unsigned int hash;
+
+	zeros = calloc(1, chunksize);
+	if (!zeros)
+		return -ENOMEM;
+
+	erofs_sha256(zeros, chunksize, sha256);
+	hash = memhash(sha256, sizeof(sha256));
+	chunk = malloc(sizeof(struct erofs_blobchunk));
+	if (!chunk)
+		return -ENOMEM;
+
+	chunk->chunksize = chunksize;
+	/* treat chunk filled with zeros as hole */
+	chunk->blkaddr = erofs_holechunk.blkaddr;
+	memcpy(chunk->sha256, sha256, sizeof(sha256));
+
+	hashmap_entry_init(&chunk->ent, hash);
+	hashmap_add(&blob_hashmap, chunk);
+	return 0;
+}
+
+int erofs_blob_init(const char *blobfile_path, erofs_off_t chunksize)
 {
 	if (!blobfile_path) {
 #ifdef HAVE_TMPFILE64
@@ -557,7 +592,7 @@ int erofs_blob_init(const char *blobfile_path)
 		return -EACCES;
 
 	hashmap_init(&blob_hashmap, erofs_blob_hashmap_cmp, 0);
-	return 0;
+	return erofs_insert_zerochunk(chunksize);
 }
 
 int erofs_mkfs_init_devices(struct erofs_sb_info *sbi, unsigned int devices)
diff --git a/mkfs/main.c b/mkfs/main.c
index 2fb4a57..d632f74 100644
--- a/mkfs/main.c
+++ b/mkfs/main.c
@@ -1255,7 +1255,7 @@ int main(int argc, char **argv)
 	}
 
 	if (cfg.c_chunkbits) {
-		err = erofs_blob_init(cfg.c_blobdev_path);
+		err = erofs_blob_init(cfg.c_blobdev_path, 1 << cfg.c_chunkbits);
 		if (err)
 			return 1;
 	}
-- 
2.44.0.478.gd926399ef9-goog



More information about the Linux-erofs mailing list