[PATCH v2 applied] erofs-utils: mkfs: Implement 'dsunit' alignment on blobdev

Friendy Su friendy.su at sony.com
Sat Aug 23 18:34:53 AEST 2025


Align inode data to huge pages on blobdev, where dsunit * blocksize =
2MiB.

When a file is mmap()'ed with dax=always, aligning to huge pages allows
the kernel to map a 2M huge page per page fault, instead of mapping
a 4KiB normal page for each page fault.

This greatly improves mmap() performance by reducing the number of
page faults triggered.

Note that `chunksize` should not be smaller than `dsunit` so that
data alignment is preserved after deduplication.

Signed-off-by: Friendy Su <friendy.su at sony.com>
Reviewed-by: Yuezhang Mo <Yuezhang.Mo at sony.com>
Reviewed-by: Daniel Palmer <daniel.palmer at sony.com>
[ Gao Xiang: refine some informational messages. ]
Signed-off-by: Gao Xiang <hsiangkao at linux.alibaba.com>
---
 lib/blobchunk.c  | 19 +++++++++++++++++++
 man/mkfs.erofs.1 | 13 +++++++++++++
 mkfs/main.c      | 15 +++++++++++++++
 3 files changed, 47 insertions(+)

diff --git a/lib/blobchunk.c b/lib/blobchunk.c
index af6ddd7..4ed463f 100644
--- a/lib/blobchunk.c
+++ b/lib/blobchunk.c
@@ -309,6 +309,25 @@ int erofs_blob_write_chunked_file(struct erofs_inode *=
inode, int fd,
        minextblks =3D BLK_ROUND_UP(sbi, inode->i_size);
        interval_start =3D 0;

+       /*
+        * If dsunit <= chunksize, deduplication will not cause misalignment,
+        * so it's uncontroversial to apply the current data alignment policy.
+        */
+       if (sbi->bmgr->dsunit > 1 &&
+           sbi->bmgr->dsunit <=3D 1u << (chunkbits - sbi->blkszbits)) {
+               off_t off =3D lseek(blobfile, 0, SEEK_CUR);
+
+               off =3D roundup(off, sbi->bmgr->dsunit * erofs_blksiz(sbi));
+               if (lseek(blobfile, off, SEEK_SET) !=3D off) {
+                       ret =3D -errno;
+                       erofs_err("failed to lseek blobdev at 0x%llx: %s", off,
+                                 erofs_strerror(ret));
+                       goto err;
+               }
+               erofs_dbg("Align /%s on block #%d (0x%llx)",
+                         erofs_fspath(inode->i_srcpath), erofs_blknr(sbi, =
off), off);
+       }
+
        for (pos =3D 0; pos < inode->i_size; pos +=3D len) {
 #ifdef SEEK_DATA
                off_t offset =3D lseek(fd, pos + startoff, SEEK_DATA);
diff --git a/man/mkfs.erofs.1 b/man/mkfs.erofs.1
index 63f7a2f..cc5a310 100644
--- a/man/mkfs.erofs.1
+++ b/man/mkfs.erofs.1
@@ -168,6 +168,19 @@ the output filesystem, with no leading /.
 .TP
 .BI "\-\-dsunit=" #
 Align all data block addresses to multiples of #.
+
+If \fI--dsunit\fR and \fI--chunksize\fR are both set, \fI--dsunit\fR will be
+ignored if it is larger than \fI--chunksize\fR.
+
+If \fI--dsunit\fR is larger, it spans multiple chunks, for example:
+\fI-b 4096\fR, \fI--dsunit 512\fR (2MiB), \fI--chunksize 4096\fR
+
+Once a chunk is deduplicated, all subsequent chunks will no longer be
+aligned. For optimal performance, it is recommended to set \fI--dsunit\fR to
+the same value as \fI--chunksize\fR:
+
+E.g. \fI-b\fR 4096, \fI--dsunit 512\fR (2MiB), \fI--chunksize $((4096*512))\fR
+
 .TP
 .BI "\-\-exclude-path=" path
 Ignore file that matches the exact literal path.
diff --git a/mkfs/main.c b/mkfs/main.c
index e0ba55d..2e6de00 100644
--- a/mkfs/main.c
+++ b/mkfs/main.c
@@ -1298,6 +1298,21 @@ static int mkfs_parse_options_cfg(struct erofs_impor=
ter_params *params,
                return -EINVAL;
        }

+       /*
+        * chunksize must be greater than or equal to dsunit to keep
+        * data alignment working.
+        *
+        * If chunksize is smaller than dsunit (e.g., chunksize=4K, dsunit=2M),
+        * deduplicating a chunk will cause all subsequent data to become
+        * unaligned. Therefore, let's issue a warning here and still skip
+        * alignment for now.
+        */
+       if (cfg.c_chunkbits && dsunit &&
+           1u << (cfg.c_chunkbits - g_sbi.blkszbits) < dsunit) {
+               erofs_warn("chunksize %u bytes is smaller than dsunit %u bl=
ocks, ignore dsunit !",
+                          1u << cfg.c_chunkbits, dsunit);
+       }
+
        if (pclustersize_packed) {
                if (pclustersize_packed < (1U << mkfs_blkszbits) ||
                    pclustersize_packed % (1U << mkfs_blkszbits)) {
--
2.43.5




More information about the Linux-erofs mailing list