[PATCH v2] erofs-utils: lib: treat data blocks filled with 0s as a hole

Gao Xiang hsiangkao at linux.alibaba.com
Sat Apr 13 10:07:03 AEST 2024


Hi Sandeep,

On 2024/4/10 06:14, Sandeep Dhavale wrote:
> Add optimization to treat data blocks filled with 0s as a hole.
> Even though diskspace savings are comparable to chunk based or dedupe,
> having no block assigned saves us redundant disk IOs during read.
> 
> To detect blocks filled with zeros during chunking, we insert block
> filled with zeros (zerochunk) in the hashmap. If we detect a possible
> dedupe, we map it to the hole so there is no physical block assigned.
> 
> Signed-off-by: Sandeep Dhavale <dhavale at google.com>
> ---
> Changes since v1:
> 	- Instead of checking every block for 0s word by word,
> 	  add a zerochunk in blobs during init. So we effectively
> 	  detect the zero blocks by comparing the hash.
>   include/erofs/blobchunk.h |  2 +-
>   lib/blobchunk.c           | 41 ++++++++++++++++++++++++++++++++++++---
>   mkfs/main.c               |  2 +-
>   3 files changed, 40 insertions(+), 5 deletions(-)
> 
> diff --git a/include/erofs/blobchunk.h b/include/erofs/blobchunk.h
> index a674640..ebe2efe 100644
> --- a/include/erofs/blobchunk.h
> +++ b/include/erofs/blobchunk.h
> @@ -23,7 +23,7 @@ int erofs_write_zero_inode(struct erofs_inode *inode);
>   int tarerofs_write_chunkes(struct erofs_inode *inode, erofs_off_t data_offset);
>   int erofs_mkfs_dump_blobs(struct erofs_sb_info *sbi);
>   void erofs_blob_exit(void);
> -int erofs_blob_init(const char *blobfile_path);
> +int erofs_blob_init(const char *blobfile_path, erofs_off_t chunksize);
>   int erofs_mkfs_init_devices(struct erofs_sb_info *sbi, unsigned int devices);
>   
>   #ifdef __cplusplus
> diff --git a/lib/blobchunk.c b/lib/blobchunk.c
> index 641e3d4..87c153f 100644
> --- a/lib/blobchunk.c
> +++ b/lib/blobchunk.c
> @@ -323,13 +323,21 @@ int erofs_blob_write_chunked_file(struct erofs_inode *inode, int fd,
>   			ret = -EIO;
>   			goto err;
>   		}
> -
>   		chunk = erofs_blob_getchunk(sbi, chunkdata, len);
>   		if (IS_ERR(chunk)) {
>   			ret = PTR_ERR(chunk);
>   			goto err;
>   		}


Sorry for late reply since I'm working on multi-threaded mkfs.

Can erofs_blob_getchunk directly return &erofs_holechunk? I mean,

static struct erofs_blobchunk *erofs_blob_getchunk(struct erofs_sb_info *sbi,
						u8 *buf, erofs_off_t chunksize)
{
...
	chunk = hashmap_get_from_hash(&blob_hashmap, hash, sha256);
	if (chunk) {
		DBG_BUGON(chunksize != chunk->chunksize);

		if (chunk->blkaddr == erofs_holechunk.blkaddr)
			chunk = &erofs_holechunk;

		sbi->saved_by_deduplication += chunksize;
		erofs_dbg("Found duplicated chunk at %u", chunk->blkaddr);
		return chunk;
	}
...
}

>   
> +		if (chunk->blkaddr == erofs_holechunk.blkaddr) {
> +			*(void **)idx++ = &erofs_holechunk;
> +			erofs_update_minextblks(sbi, interval_start, pos,
> +						&minextblks);
> +			interval_start = pos + len;


I guess several zerochunks can also be merged?  is this line
an expected behavior?

> +			lastch = NULL;
> +			continue;
> +		}
> +
>   		if (lastch && (lastch->device_id != chunk->device_id ||
>   		    erofs_pos(sbi, lastch->blkaddr) + lastch->chunksize !=
>   		    erofs_pos(sbi, chunk->blkaddr))) {

I guess we could form a helper like
static bool erofs_blob_can_merge(struct erofs_sb_info *sbi,
			    struct erofs_blobchunk *lastch,
			    struct erofs_blobchunk *chunk)
{
	if (lastch == &erofs_holechunk && chunk == &erofs_holechunk)
		return true;
	if (lastch->device_id == chunk->device_id &&
	    erofs_pos(sbi, lastch->blkaddr) + lastch->chunksize ==
	    erofs_pos(sbi, chunk->blkaddr))
		return true;
	return false;
}

		if (lastch && erofs_blob_can_merge(sbi, lastch, chunk)) {
			...
		}



> @@ -540,7 +548,34 @@ void erofs_blob_exit(void)
>   	}
>   }
>   
> -int erofs_blob_init(const char *blobfile_path)
> +static int erofs_insert_zerochunk(erofs_off_t chunksize)
> +{
> +	u8 *zeros;
> +	struct erofs_blobchunk *chunk;
> +	u8 sha256[32];
> +	unsigned int hash;
> +
> +	zeros = calloc(1, chunksize);
> +	if (!zeros)
> +		return -ENOMEM;
> +
> +	erofs_sha256(zeros, chunksize, sha256);
> +	hash = memhash(sha256, sizeof(sha256));


`zeros` needs to be freed here I guess:
	free(zeros);

Thanks,
Gao Xiang

> +	chunk = malloc(sizeof(struct erofs_blobchunk));
> +	if (!chunk)
> +		return -ENOMEM;> +
> +	chunk->chunksize = chunksize;
> +	/* treat chunk filled with zeros as hole */
> +	chunk->blkaddr = erofs_holechunk.blkaddr;
> +	memcpy(chunk->sha256, sha256, sizeof(sha256));
> +
> +	hashmap_entry_init(&chunk->ent, hash);
> +	hashmap_add(&blob_hashmap, chunk);
> +	return 0;
> +}
> +


More information about the Linux-erofs mailing list