[PATCH] erofs-utils: mkfs: enable directory data in the metadata zone

Gao Xiang hsiangkao at linux.alibaba.com
Fri Jan 16 20:45:13 AEDT 2026


This patch allows directory data to be stored in the metadata zone so
that it is kept as close as possible to inode metadata, significantly
improving metadata performance for long-latency remote image use cases.

Usage:
 $ mkfs.erofs --MZ foo.erofs foo/

Signed-off-by: Gao Xiang <hsiangkao at linux.alibaba.com>
---
 include/erofs/importer.h |  1 +
 include/erofs/inode.h    |  3 +-
 include/erofs/internal.h |  8 ++++
 lib/cache.c              |  4 ++
 lib/inode.c              | 98 ++++++++++++++++++++++++++++------------
 lib/io.c                 |  5 +-
 lib/metabox.c            | 22 +++++----
 lib/remotes/s3.c         |  3 +-
 lib/super.c              |  4 +-
 lib/tar.c                |  2 +-
 mkfs/main.c              | 27 +++++++++--
 11 files changed, 129 insertions(+), 48 deletions(-)

diff --git a/include/erofs/importer.h b/include/erofs/importer.h
index a525b474f1d5..60160d6bea05 100644
--- a/include/erofs/importer.h
+++ b/include/erofs/importer.h
@@ -46,6 +46,7 @@ struct erofs_importer_params {
 	bool no_datainline;
 	/* Issue directory data (except inline data) separately from regular inodes */
 	bool grouped_dirdata;
+	bool dirdata_in_metazone;
 	bool hard_dereference;
 	bool ovlfs_strip;
 	bool dot_omitted;
diff --git a/include/erofs/inode.h b/include/erofs/inode.h
index 89bd16aecc06..ba62ece9a7cc 100644
--- a/include/erofs/inode.h
+++ b/include/erofs/inode.h
@@ -38,7 +38,8 @@ erofs_nid_t erofs_lookupnid(struct erofs_inode *inode);
 int erofs_iflush(struct erofs_inode *inode);
 struct erofs_dentry *erofs_d_alloc(struct erofs_inode *parent,
 				   const char *name);
-int erofs_allocate_inode_bh_data(struct erofs_inode *inode, erofs_blk_t nblocks);
+int erofs_allocate_inode_bh_data(struct erofs_inode *inode, erofs_blk_t nblocks,
+				 bool in_metazone);
 bool erofs_dentry_is_wht(struct erofs_sb_info *sbi, struct erofs_dentry *d);
 int __erofs_fill_inode(struct erofs_importer *im, struct erofs_inode *inode,
 		       struct stat *st, const char *path);
diff --git a/include/erofs/internal.h b/include/erofs/internal.h
index 5798f10e89c2..2fe4514b3d23 100644
--- a/include/erofs/internal.h
+++ b/include/erofs/internal.h
@@ -157,6 +157,7 @@ struct erofs_sb_info {
 	struct erofs_buffer_head *bh_devt;
 	bool useqpl;
 	bool sb_valid;
+	u32 metazone_startblk;
 };
 
 /* make sure that any user of the erofs headers has atleast 64bit off_t type */
@@ -205,6 +206,8 @@ struct erofs_diskbuf;
 #define EROFS_INODE_DATA_SOURCE_DISKBUF		2
 #define EROFS_INODE_DATA_SOURCE_RESVSP		3
 
+#define EROFS_I_BLKADDR_DEV_ID_BIT		48
+
 struct erofs_inode {
 	struct list_head i_hash, i_subdirs, i_xattrs;
 
@@ -308,6 +311,11 @@ static inline bool erofs_inode_in_metabox(struct erofs_inode *inode)
 	return inode->nid >> EROFS_DIRENT_NID_METABOX_BIT;
 }
 
+static inline erofs_blk_t erofs_inode_dev_baddr(struct erofs_inode *inode)
+{
+	return inode->u.i_blkaddr & (BIT(EROFS_I_BLKADDR_DEV_ID_BIT) - 1);
+}
+
 static inline erofs_off_t erofs_iloc(struct erofs_inode *inode)
 {
 	struct erofs_sb_info *sbi = inode->sbi;
diff --git a/lib/cache.c b/lib/cache.c
index a87575ad74d1..f23dbb06264a 100644
--- a/lib/cache.c
+++ b/lib/cache.c
@@ -479,6 +479,10 @@ static int __erofs_bflush(struct erofs_bufmgr *bmgr,
 
 			/* flush and remove bh */
 			ret = bh->op->flush(bh);
+			if (__erofs_unlikely(ret == -EBUSY && !forget)) {
+				skip = true;
+				continue;
+			}
 			if (ret < 0)
 				return ret;
 		}
diff --git a/lib/inode.c b/lib/inode.c
index e44e03cf460f..88dc41b19e5b 100644
--- a/lib/inode.c
+++ b/lib/inode.c
@@ -194,9 +194,12 @@ struct erofs_dentry *erofs_d_alloc(struct erofs_inode *parent,
 }
 
 /* allocate main data for an inode */
-int erofs_allocate_inode_bh_data(struct erofs_inode *inode, erofs_blk_t nblocks)
+int erofs_allocate_inode_bh_data(struct erofs_inode *inode, erofs_blk_t nblocks,
+				 bool in_metazone)
 {
-	struct erofs_bufmgr *bmgr = inode->sbi->bmgr;
+	struct erofs_sb_info *sbi = inode->sbi;
+	struct erofs_bufmgr *bmgr = in_metazone ?
+		erofs_metadata_bmgr(sbi, false) : sbi->bmgr;
 	struct erofs_buffer_head *bh;
 	int ret, type;
 
@@ -206,9 +209,15 @@ int erofs_allocate_inode_bh_data(struct erofs_inode *inode, erofs_blk_t nblocks)
 		return 0;
 	}
 
+	if (in_metazone && !bmgr) {
+		erofs_err("cannot allocate data in the metazone when unavailable for %s",
+			  inode->i_srcpath);
+		return -EINVAL;
+	}
+
 	/* allocate main data buffer */
 	type = S_ISDIR(inode->i_mode) ? DIRA : DATA;
-	bh = erofs_balloc(bmgr, type, erofs_pos(inode->sbi, nblocks), 0);
+	bh = erofs_balloc(bmgr, type, erofs_pos(sbi, nblocks), 0);
 	if (IS_ERR(bh))
 		return PTR_ERR(bh);
 
@@ -220,7 +229,8 @@ int erofs_allocate_inode_bh_data(struct erofs_inode *inode, erofs_blk_t nblocks)
 	DBG_BUGON(ret < 0);
 
 	/* write blocks except for the tail-end block */
-	inode->u.i_blkaddr = bh->block->blkaddr;
+	inode->u.i_blkaddr = bh->block->blkaddr | (in_metazone ?
+		(sbi->extra_devices + 1ULL) << EROFS_I_BLKADDR_DEV_ID_BIT : 0);
 	return 0;
 }
 
@@ -591,7 +601,7 @@ int erofs_write_file_from_buffer(struct erofs_inode *inode, char *buf)
 
 	inode->datalayout = EROFS_INODE_FLAT_INLINE;
 
-	ret = erofs_allocate_inode_bh_data(inode, nblocks);
+	ret = erofs_allocate_inode_bh_data(inode, nblocks, false);
 	if (ret)
 		return ret;
 
@@ -622,16 +632,17 @@ static bool erofs_file_is_compressible(struct erofs_importer *im,
 
 static int erofs_write_unencoded_data(struct erofs_inode *inode,
 				      struct erofs_vfile *vf, erofs_off_t fpos,
-				      bool noseek)
+				      bool noseek, bool in_metazone)
 {
 	struct erofs_sb_info *sbi = inode->sbi;
-	erofs_blk_t nblocks, i;
+	struct erofs_bufmgr *bmgr;
+	erofs_off_t remaining, pos;
 	unsigned int len;
 	int ret;
 
 	if (!noseek && erofs_sb_has_48bit(sbi)) {
-		if (erofs_io_lseek(vf, fpos, SEEK_DATA) < 0 && errno == ENXIO) {
-			ret = erofs_allocate_inode_bh_data(inode, 0);
+		if (erofs_io_lseek(vf, fpos, SEEK_DATA) == -ENXIO) {
+			ret = erofs_allocate_inode_bh_data(inode, 0, false);
 			if (ret)
 				return ret;
 			inode->datalayout = EROFS_INODE_FLAT_PLAIN;
@@ -640,27 +651,31 @@ static int erofs_write_unencoded_data(struct erofs_inode *inode,
 		ret = erofs_io_lseek(vf, fpos, SEEK_SET);
 		if (ret < 0)
 			return ret;
-		else if (ret != fpos)
+		if (ret != fpos)
 			return -EIO;
 	}
 
-	nblocks = inode->i_size >> sbi->blkszbits;
-	ret = erofs_allocate_inode_bh_data(inode, nblocks);
+	inode->idata_size = inode->i_size % erofs_blksiz(sbi);
+	remaining = inode->i_size - inode->idata_size;
+
+	ret = erofs_allocate_inode_bh_data(inode, remaining >> sbi->blkszbits,
+					   in_metazone);
 	if (ret)
 		return ret;
 
-	for (i = 0; i < nblocks; i += (len >> sbi->blkszbits)) {
+	bmgr = in_metazone ? erofs_metadata_bmgr(sbi, false) : sbi->bmgr;
+	pos = erofs_pos(sbi, erofs_inode_dev_baddr(inode));
+	while (remaining) {
 		len = min_t(u64, round_down(UINT_MAX, 1U << sbi->blkszbits),
-			    erofs_pos(sbi, nblocks - i));
-		ret = erofs_io_xcopy(&sbi->bdev,
-				     erofs_pos(sbi, inode->u.i_blkaddr + i),
-				     vf, len, noseek);
+			    remaining);
+		ret = erofs_io_xcopy(bmgr->vf, pos, vf, len, noseek);
 		if (ret)
 			return ret;
+		pos += len;
+		remaining -= len;
 	}
 
 	/* read the tail-end data */
-	inode->idata_size = inode->i_size % erofs_blksiz(sbi);
 	if (inode->idata_size) {
 		inode->idata = malloc(inode->idata_size);
 		if (!inode->idata)
@@ -691,10 +706,11 @@ int erofs_write_unencoded_file(struct erofs_inode *inode, int fd, u64 fpos)
 	/* fallback to all data uncompressed */
 	return erofs_write_unencoded_data(inode,
 			&(struct erofs_vfile){ .fd = fd }, fpos,
-			inode->datasource == EROFS_INODE_DATA_SOURCE_DISKBUF);
+			inode->datasource == EROFS_INODE_DATA_SOURCE_DISKBUF, false);
 }
 
-static int erofs_write_dir_file(struct erofs_inode *dir)
+static int erofs_write_dir_file(const struct erofs_importer *im,
+				struct erofs_inode *dir)
 {
 	unsigned int bsz = erofs_blksiz(dir->sbi);
 	struct erofs_vfile *vf;
@@ -708,7 +724,8 @@ static int erofs_write_dir_file(struct erofs_inode *dir)
 		err = erofs_write_compress_dir(dir, vf);
 	} else {
 		DBG_BUGON(dir->idata_size != (dir->i_size & (bsz - 1)));
-		err = erofs_write_unencoded_data(dir, vf, 0, true);
+		err = erofs_write_unencoded_data(dir, vf, 0, true,
+					im->params->dirdata_in_metazone);
 	}
 	erofs_io_close(vf);
 	return err;
@@ -732,19 +749,39 @@ int erofs_iflush(struct erofs_inode *inode)
 	struct iovec iov[2];
 	char *xattrs = NULL;
 	bool nlink_1 = true;
-	int ret, fmt;
+	int ret, fmt, dev_id;
 
 	DBG_BUGON(bh && erofs_btell(bh, false) != off);
-
 	if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) ||
-	    S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode))
+	    S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) {
 		u1.rdev = cpu_to_le32(inode->u.i_rdev);
-	else if (is_inode_layout_compression(inode))
+	} else if (is_inode_layout_compression(inode)) {
 		u1.blocks_lo = cpu_to_le32(inode->u.i_blocks);
-	else if (inode->datalayout == EROFS_INODE_CHUNK_BASED)
+	} else if (inode->datalayout == EROFS_INODE_CHUNK_BASED) {
 		u1.c.format = cpu_to_le16(inode->u.chunkformat);
-	else
+	} else {
+		if (inode->u.i_blkaddr != EROFS_NULL_ADDR) {
+			dev_id = inode->u.i_blkaddr >> EROFS_I_BLKADDR_DEV_ID_BIT;
+
+			if (dev_id) {
+				if (dev_id <= sbi->extra_devices) {
+					if (!sbi->devs[dev_id - 1].uniaddr) {
+						DBG_BUGON(1);	/* impossible now */
+						return -EBUSY;
+					}
+					inode->u.i_blkaddr += sbi->devs[dev_id - 1].uniaddr;
+				} else {
+					if (sbi->metazone_startblk == EROFS_META_NEW_ADDR) {
+						DBG_BUGON(1);	/* impossible now */
+						return -EBUSY;
+					}
+					DBG_BUGON(dev_id != sbi->extra_devices + 1);
+					inode->u.i_blkaddr += sbi->metazone_startblk;
+				}
+			}
+		}
 		u1.startblk_lo = cpu_to_le32(inode->u.i_blkaddr);
+	}
 
 	if (is_inode_layout_compression(inode) &&
 	    inode->u.i_blocks > UINT32_MAX) {
@@ -894,7 +931,7 @@ static bool erofs_inode_need_48bit(struct erofs_inode *inode)
 			return true;
 	} else if (!is_inode_layout_compression(inode)) {
 		if (inode->u.i_blkaddr != EROFS_NULL_ADDR &&
-		    inode->u.i_blkaddr > UINT32_MAX)
+		    erofs_inode_dev_baddr(inode) > UINT32_MAX)
 			return true;
 	}
 	return false;
@@ -1564,7 +1601,7 @@ static int erofs_mkfs_jobfn(const struct erofs_mkfs_btctx *ctx,
 		return erofs_mkfs_create_directory(ctx, inode);
 
 	if (item->type == EROFS_MKFS_JOB_DIR_BH) {
-		ret = erofs_write_dir_file(inode);
+		ret = erofs_write_dir_file(ctx->im, inode);
 		if (ret)
 			return ret;
 		erofs_write_tail_end(inode);
@@ -2313,7 +2350,8 @@ struct erofs_inode *erofs_mkfs_build_special_from_fd(struct erofs_importer *im,
 	inode->datalayout = EROFS_INODE_FLAT_INLINE;
 	ret = erofs_write_unencoded_data(inode,
 			&(struct erofs_vfile){ .fd = fd }, 0,
-			inode->datasource == EROFS_INODE_DATA_SOURCE_DISKBUF);
+			inode->datasource == EROFS_INODE_DATA_SOURCE_DISKBUF,
+			false);
 	if (ret)
 		return ERR_PTR(ret);
 out:
diff --git a/lib/io.c b/lib/io.c
index 37a74f63c45e..0c5eb2c29989 100644
--- a/lib/io.c
+++ b/lib/io.c
@@ -571,10 +571,13 @@ ssize_t erofs_io_write(struct erofs_vfile *vf, void *buf, size_t len)
 
 off_t erofs_io_lseek(struct erofs_vfile *vf, u64 offset, int whence)
 {
+	off_t ret;
+
 	if (vf->ops)
 		return vf->ops->lseek(vf, offset, whence);
 
-	return lseek(vf->fd, offset, whence);
+	ret = lseek(vf->fd, offset, whence);
+	return ret < 0 ? -errno : ret;
 }
 
 ssize_t erofs_io_sendfile(struct erofs_vfile *vout, struct erofs_vfile *vin,
diff --git a/lib/metabox.c b/lib/metabox.c
index 37267ddb73cf..d6abd5123cc8 100644
--- a/lib/metabox.c
+++ b/lib/metabox.c
@@ -54,7 +54,7 @@ int erofs_metadata_init(struct erofs_sb_info *sbi)
 	struct erofs_metamgr *m2gr;
 	int ret;
 
-	if (!sbi->m2gr && sbi->meta_blkaddr == EROFS_META_NEW_ADDR) {
+	if (!sbi->m2gr && sbi->metazone_startblk == EROFS_META_NEW_ADDR) {
 		m2gr = malloc(sizeof(*m2gr));
 		if (!m2gr)
 			return -ENOMEM;
@@ -62,6 +62,8 @@ int erofs_metadata_init(struct erofs_sb_info *sbi)
 		if (ret)
 			goto err_free;
 		sbi->m2gr = m2gr;
+		/* FIXME: sbi->meta_blkaddr should be 0 for 48-bit layouts */
+		sbi->meta_blkaddr = EROFS_META_NEW_ADDR;
 	}
 
 	if (!sbi->mxgr && erofs_sb_has_metabox(sbi)) {
@@ -124,20 +126,24 @@ int erofs_metazone_flush(struct erofs_sb_info *sbi)
 
 	if (!m2gr)
 		return 0;
-	m2bgr = m2gr->bmgr;
+	bh = erofs_balloc(sbi->bmgr, DATA, 0, 0);
+	if (IS_ERR(bh))	/* balloc errors are ERR_PTR-encoded, as checked in lib/inode.c */
+		return PTR_ERR(bh);
+	erofs_mapbh(NULL, bh->block);
+	pos_out = erofs_btell(bh, false);
+	meta_blkaddr = pos_out >> sbi->blkszbits;
+	sbi->metazone_startblk = meta_blkaddr;
 
+	m2bgr = m2gr->bmgr;
 	ret = erofs_bflush(m2bgr, NULL);
 	if (ret)
 		return ret;
 
 	length = erofs_mapbh(m2bgr, NULL) << sbi->blkszbits;
-	bh = erofs_balloc(sbi->bmgr, DATA, length, 0);
-	if (!bh)
-		return PTR_ERR(bh);
+	ret = erofs_bh_balloon(bh, length);
+	if (ret < 0)
+		return ret;
 
-	erofs_mapbh(NULL, bh->block);
-	pos_out = erofs_btell(bh, false);
-	meta_blkaddr = pos_out >> sbi->blkszbits;
 	do {
 		count = min_t(erofs_off_t, length, INT_MAX);
 		ret = erofs_io_xcopy(sbi->bmgr->vf, pos_out,
diff --git a/lib/remotes/s3.c b/lib/remotes/s3.c
index 223c3e89d6fd..b0ca84b51afc 100644
--- a/lib/remotes/s3.c
+++ b/lib/remotes/s3.c
@@ -1032,7 +1032,8 @@ static int s3erofs_remote_getobject(struct erofs_importer *im,
 		inode->datalayout = EROFS_INODE_FLAT_PLAIN;
 		inode->idata_size = 0;
 		ret = erofs_allocate_inode_bh_data(inode,
-				DIV_ROUND_UP(inode->i_size, 1U << sbi->blkszbits));
+				DIV_ROUND_UP(inode->i_size, 1U << sbi->blkszbits),
+				false);
 		if (ret)
 			return ret;
 		resp.vf = &sbi->bdev;
diff --git a/lib/super.c b/lib/super.c
index a4837e5702ed..0180087e184e 100644
--- a/lib/super.c
+++ b/lib/super.c
@@ -445,9 +445,9 @@ int erofs_mkfs_format_fs(struct erofs_sb_info *sbi, unsigned int blkszbits,
 	sbi->bmgr = bmgr;
 	bmgr->dsunit = dsunit;
 	if (metazone)
-		sbi->meta_blkaddr = EROFS_META_NEW_ADDR;
+		sbi->metazone_startblk = EROFS_META_NEW_ADDR;
 	else
-		sbi->meta_blkaddr = 0;
+		sbi->metazone_startblk = 0;
 	bh = erofs_reserve_sb(bmgr);
 	if (IS_ERR(bh))
 		return PTR_ERR(bh);
diff --git a/lib/tar.c b/lib/tar.c
index d5095169f9ba..1f3092566bd9 100644
--- a/lib/tar.c
+++ b/lib/tar.c
@@ -632,7 +632,7 @@ static int tarerofs_write_uncompressed_file(struct erofs_inode *inode,
 	inode->datalayout = EROFS_INODE_FLAT_PLAIN;
 	nblocks = DIV_ROUND_UP(inode->i_size, 1U << sbi->blkszbits);
 
-	ret = erofs_allocate_inode_bh_data(inode, nblocks);
+	ret = erofs_allocate_inode_bh_data(inode, nblocks, false);
 	if (ret)
 		return ret;
 
diff --git a/mkfs/main.c b/mkfs/main.c
index 620b1ed2b0c3..ffcb8cf75225 100644
--- a/mkfs/main.c
+++ b/mkfs/main.c
@@ -100,7 +100,7 @@ static struct option long_options[] = {
 	{"oci", optional_argument, NULL, 534},
 #endif
 	{"zD", optional_argument, NULL, 536},
-	{"ZI", optional_argument, NULL, 537},
+	{"MZ", optional_argument, NULL, 537},
 	{"xattr-prefix", required_argument, NULL, 538},
 	{"xattr-inode-digest", required_argument, NULL, 539},
 	{0, 0, 0, 0},
@@ -178,7 +178,8 @@ static void usage(int argc, char **argv)
 		"    --mkfs-time         the timestamp is applied as build time only\n"
 		" -UX                    use a given filesystem UUID\n"
 		" --zD[=<0|1>]           specify directory compression: 0=disable [default], 1=enable\n"
-		" --ZI[=<0|1>]           specify the separate inode metadata zone availability: 0=disable [default], 1=enable\n"
+		" --MZ[=<0|[id]>]        put inode metadata ('i') and/or directory data ('d') into the separate metadata zone.\n"
+		"                        No argument enables both.  0=disable [default].\n"
 		" --all-root             make all files owned by root\n"
 #ifdef EROFS_MT_ENABLED
 		" --async-queue-limit=#  specify the maximum number of entries in the multi-threaded job queue\n"
@@ -1411,10 +1412,28 @@ static int mkfs_parse_options_cfg(struct erofs_importer_params *params,
 			}
 			break;
 		case 537:
-			if (!optarg || strcmp(optarg, "1"))
+			if (!optarg) {
 				mkfscfg.inode_metazone = true;
-			else
+				params->dirdata_in_metazone = true;
+			} else if (!strcmp(optarg, "0")) {
 				mkfscfg.inode_metazone = false;
+				params->dirdata_in_metazone = false;
+			} else {
+				for (i = 0; optarg[i]; ++i) {
+					if (optarg[i] == 'i') {
+						mkfscfg.inode_metazone = true;
+					} else if (optarg[i] == 'd') {
+						params->dirdata_in_metazone = true;
+					} else {
+						erofs_err("invalid metazone flags `%s`", optarg);
+						return -EINVAL;
+					}
+				}
+				if (params->dirdata_in_metazone && !mkfscfg.inode_metazone) {
+					erofs_err("inode metadata must be in the metadata zone if directory data is stored there");
+					return -EINVAL;
+				}
+			}
 			break;
 		case 538:
 			errno = 0;
-- 
2.43.5



More information about the Linux-erofs mailing list