[PATCH v2 applied] erofs-utils: mkfs: Implement 'dsunit' alignment on blobdev

Friendy Su friendy.su at sony.com
Sat Aug 23 18:34:53 AEST 2025


Align inode data to huge pages on blobdev, where dsunit * blocksize =
2MiB.

When a file is mmap()'ed with dax=always, aligning to huge pages allows
the kernel to map a 2MiB huge page per page fault, instead of mapping
a 4KiB normal page for each page fault.

This greatly improves mmap() performance by reducing the number of page
faults that are triggered.

Note that `chunksize` should not be smaller than `dsunit` so that
data alignment is preserved after deduplication.

Signed-off-by: Friendy Su <friendy.su at sony.com>
Reviewed-by: Yuezhang Mo <Yuezhang.Mo at sony.com>
Reviewed-by: Daniel Palmer <daniel.palmer at sony.com>
[ Gao Xiang: refine some informational messages. ]
Signed-off-by: Gao Xiang <hsiangkao at linux.alibaba.com>
---
 lib/blobchunk.c  | 19 +++++++++++++++++++
 man/mkfs.erofs.1 | 13 +++++++++++++
 mkfs/main.c      | 15 +++++++++++++++
 3 files changed, 47 insertions(+)

diff --git a/lib/blobchunk.c b/lib/blobchunk.c
index af6ddd7..4ed463f 100644
--- a/lib/blobchunk.c
+++ b/lib/blobchunk.c
@@ -309,6 +309,25 @@ int erofs_blob_write_chunked_file(struct erofs_inode *inode, int fd,
 	minextblks = BLK_ROUND_UP(sbi, inode->i_size);
 	interval_start = 0;
 
+	/*
+	 * If dsunit <= chunksize, deduplication will not cause misalignment,
+	 * so it's uncontroversial to apply the current data alignment policy.
+	 */
+	if (sbi->bmgr->dsunit > 1 &&
+	    sbi->bmgr->dsunit <= 1u << (chunkbits - sbi->blkszbits)) {
+		off_t off = lseek(blobfile, 0, SEEK_CUR);
+
+		off = roundup(off, sbi->bmgr->dsunit * erofs_blksiz(sbi));
+		if (lseek(blobfile, off, SEEK_SET) != off) {
+			ret = -errno;
+			erofs_err("failed to lseek blobdev at 0x%llx: %s", off,
+				  erofs_strerror(ret));
+			goto err;
+		}
+		erofs_dbg("Align /%s on block #%d (0x%llx)",
+			  erofs_fspath(inode->i_srcpath), erofs_blknr(sbi, off), off);
+	}
+
 	for (pos = 0; pos < inode->i_size; pos += len) {
 #ifdef SEEK_DATA
 		off_t offset = lseek(fd, pos + startoff, SEEK_DATA);
diff --git a/man/mkfs.erofs.1 b/man/mkfs.erofs.1
index 63f7a2f..cc5a310 100644
--- a/man/mkfs.erofs.1
+++ b/man/mkfs.erofs.1
@@ -168,6 +168,19 @@ the output filesystem, with no leading /.
 .TP
 .BI "\-\-dsunit=" #
 Align all data block addresses to multiples of #.
+
+If \fI--dsunit\fR and \fI--chunksize\fR are both set, \fI--dsunit\fR will be
+ignored if it is larger than \fI--chunksize\fR.
+
+If \fI--dsunit\fR is larger, it spans multiple chunks, for example:
+\fI-b 4096\fR, \fI--dsunit 512\fR (2MiB), \fI--chunksize 4096\fR
+
+Once a chunk is deduplicated, all subsequent chunks will no longer be
+aligned. For optimal performance, it is recommended to set \fI--dsunit\fR to
+the same value as \fI--chunksize\fR:
+
+E.g. \fI-b\fR 4096, \fI--dsunit 512\fR (2MiB), \fI--chunksize $((4096*512))\fR
+
 .TP
 .BI "\-\-exclude-path=" path
 Ignore file that matches the exact literal path.
diff --git a/mkfs/main.c b/mkfs/main.c
index e0ba55d..2e6de00 100644
--- a/mkfs/main.c
+++ b/mkfs/main.c
@@ -1298,6 +1298,21 @@ static int mkfs_parse_options_cfg(struct erofs_importer_params *params,
 		return -EINVAL;
 	}
 
+	/*
+	 * chunksize must be greater than or equal to dsunit to keep
+	 * data alignment working.
+	 *
+	 * If chunksize is smaller than dsunit (e.g., chunksize=4K, dsunit=2M),
+	 * deduplicating a chunk will cause all subsequent data to become
+	 * unaligned. Therefore, let's issue a warning here and still skip
+	 * alignment for now.
+	 */
+	if (cfg.c_chunkbits && dsunit &&
+	    1u << (cfg.c_chunkbits - g_sbi.blkszbits) < dsunit) {
+		erofs_warn("chunksize %u bytes is smaller than dsunit %u blocks, ignore dsunit !",
+			   1u << cfg.c_chunkbits, dsunit);
+	}
+
 	if (pclustersize_packed) {
 		if (pclustersize_packed < (1U << mkfs_blkszbits) ||
 		    pclustersize_packed % (1U << mkfs_blkszbits)) {
-- 
2.43.5



More information about the Linux-erofs mailing list