[PATCH v2] erofs-utils: lib: treat data blocks filled with 0s as a hole
Gao Xiang
hsiangkao at linux.alibaba.com
Sat Apr 13 10:07:03 AEST 2024
Hi Sandeep,
On 2024/4/10 06:14, Sandeep Dhavale wrote:
> Add optimization to treat data blocks filled with 0s as a hole.
> Even though diskspace savings are comparable to chunk based or dedupe,
> having no block assigned saves us redundant disk IOs during read.
>
> To detect blocks filled with zeros during chunking, we insert block
> filled with zeros (zerochunk) in the hashmap. If we detect a possible
> dedupe, we map it to the hole so there is no physical block assigned.
>
> Signed-off-by: Sandeep Dhavale <dhavale at google.com>
> ---
> Changes since v1:
> - Instead of checking every block for 0s word by word,
> add a zerochunk in blobs during init. So we effectively
> detect the zero blocks by comparing the hash.
> include/erofs/blobchunk.h | 2 +-
> lib/blobchunk.c | 41 ++++++++++++++++++++++++++++++++++++---
> mkfs/main.c | 2 +-
> 3 files changed, 40 insertions(+), 5 deletions(-)
>
> diff --git a/include/erofs/blobchunk.h b/include/erofs/blobchunk.h
> index a674640..ebe2efe 100644
> --- a/include/erofs/blobchunk.h
> +++ b/include/erofs/blobchunk.h
> @@ -23,7 +23,7 @@ int erofs_write_zero_inode(struct erofs_inode *inode);
> int tarerofs_write_chunkes(struct erofs_inode *inode, erofs_off_t data_offset);
> int erofs_mkfs_dump_blobs(struct erofs_sb_info *sbi);
> void erofs_blob_exit(void);
> -int erofs_blob_init(const char *blobfile_path);
> +int erofs_blob_init(const char *blobfile_path, erofs_off_t chunksize);
> int erofs_mkfs_init_devices(struct erofs_sb_info *sbi, unsigned int devices);
>
> #ifdef __cplusplus
> diff --git a/lib/blobchunk.c b/lib/blobchunk.c
> index 641e3d4..87c153f 100644
> --- a/lib/blobchunk.c
> +++ b/lib/blobchunk.c
> @@ -323,13 +323,21 @@ int erofs_blob_write_chunked_file(struct erofs_inode *inode, int fd,
> ret = -EIO;
> goto err;
> }
> -
> chunk = erofs_blob_getchunk(sbi, chunkdata, len);
> if (IS_ERR(chunk)) {
> ret = PTR_ERR(chunk);
> goto err;
> }
Sorry for late reply since I'm working on multi-threaded mkfs.
Can erofs_blob_getchunk directly return &erofs_holechunk? I mean,
static struct erofs_blobchunk *erofs_blob_getchunk(struct erofs_sb_info *sbi,
u8 *buf, erofs_off_t chunksize)
{
...
chunk = hashmap_get_from_hash(&blob_hashmap, hash, sha256);
if (chunk) {
DBG_BUGON(chunksize != chunk->chunksize);
if (chunk->blkaddr == erofs_holechunk.blkaddr)
chunk = &erofs_holechunk;
sbi->saved_by_deduplication += chunksize;
erofs_dbg("Found duplicated chunk at %u", chunk->blkaddr);
return chunk;
}
...
}
>
> + if (chunk->blkaddr == erofs_holechunk.blkaddr) {
> + *(void **)idx++ = &erofs_holechunk;
> + erofs_update_minextblks(sbi, interval_start, pos,
> + &minextblks);
> + interval_start = pos + len;
I guess several zerochunks can also be merged? is this line
an expected behavior?
> + lastch = NULL;
> + continue;
> + }
> +
> if (lastch && (lastch->device_id != chunk->device_id ||
> erofs_pos(sbi, lastch->blkaddr) + lastch->chunksize !=
> erofs_pos(sbi, chunk->blkaddr))) {
I guess we could form a helper like
static bool erofs_blob_can_merge(struct erofs_sb_info *sbi,
struct erofs_blobchunk *lastch,
struct erofs_blobchunk *chunk)
{
if (lastch == &erofs_holechunk && chunk == &erofs_holechunk)
return true;
if (lastch->device_id == chunk->device_id &&
erofs_pos(sbi, lastch->blkaddr) + lastch->chunksize ==
erofs_pos(sbi, chunk->blkaddr))
return true;
return false;
}
if (lastch && erofs_blob_can_merge(sbi, lastch, chunk)) {
...
}
> @@ -540,7 +548,34 @@ void erofs_blob_exit(void)
> }
> }
>
> -int erofs_blob_init(const char *blobfile_path)
> +static int erofs_insert_zerochunk(erofs_off_t chunksize)
> +{
> + u8 *zeros;
> + struct erofs_blobchunk *chunk;
> + u8 sha256[32];
> + unsigned int hash;
> +
> + zeros = calloc(1, chunksize);
> + if (!zeros)
> + return -ENOMEM;
> +
> + erofs_sha256(zeros, chunksize, sha256);
> + hash = memhash(sha256, sizeof(sha256));
`zeros` needs to be freed here I guess:
free(zeros);
Thanks,
Gao Xiang
> + chunk = malloc(sizeof(struct erofs_blobchunk));
> + if (!chunk)
> + return -ENOMEM;> +
> + chunk->chunksize = chunksize;
> + /* treat chunk filled with zeros as hole */
> + chunk->blkaddr = erofs_holechunk.blkaddr;
> + memcpy(chunk->sha256, sha256, sizeof(sha256));
> +
> + hashmap_entry_init(&chunk->ent, hash);
> + hashmap_add(&blob_hashmap, chunk);
> + return 0;
> +}
> +
More information about the Linux-erofs
mailing list