[PATCH 3/3] erofs-utils: mkfs: add optional support for inode meta zone

Gao Xiang hsiangkao at linux.alibaba.com
Fri Nov 7 21:06:09 AEDT 2025


Many use cases benefit from concentrating inode metadata, such as
image filesystems primarily accessed over a network (e.g., EROFS
native full container images).  Otherwise, scattered on-disk inodes
increase network access overhead and make metadata prefetching
difficult to implement (prefetching keeps systems from getting stuck
on metadata I/O when, for example, the network fails).

Usage:
 `--ZI` or `--ZI=1`	Enable inode meta zone;
 `--ZI=0`               Disable inode meta zone (default).

Closes: https://lore.kernel.org/r/20250422123612.261764-1-lihongbo22@huawei.com
Cc: Hongbo Li <lihongbo22 at huawei.com>
Signed-off-by: Gao Xiang <hsiangkao at linux.alibaba.com>
---
 include/erofs/internal.h |  12 ++--
 lib/importer.c           |  10 ++-
 lib/inode.c              |  46 ++++++++------
 lib/liberofs_metabox.h   |  14 ++++-
 lib/metabox.c            | 127 ++++++++++++++++++++++++++++++---------
 lib/super.c              |  10 ++-
 lib/xattr.c              |   2 +-
 mkfs/main.c              |  12 +++-
 8 files changed, 168 insertions(+), 65 deletions(-)

diff --git a/include/erofs/internal.h b/include/erofs/internal.h
index 610650138bee..62594b877151 100644
--- a/include/erofs/internal.h
+++ b/include/erofs/internal.h
@@ -97,7 +97,7 @@ struct erofs_sb_info {
 	u64 total_blocks;
 	u64 primarydevice_blocks;
 
-	u32 meta_blkaddr;
+	s32 meta_blkaddr;
 	u32 xattr_blkaddr;
 
 	u32 feature_compat;
@@ -150,7 +150,7 @@ struct erofs_sb_info {
 	struct erofs_bufmgr *bmgr;
 	struct erofs_xattrmgr *xamgr;
 	struct z_erofs_mgr *zmgr;
-	struct erofs_metaboxmgr *m2gr;
+	struct erofs_metamgr *m2gr, *mxgr;
 	struct erofs_packed_inode *packedinode;
 	struct erofs_buffer_head *bh_sb;
 	struct erofs_buffer_head *bh_devt;
@@ -309,8 +309,8 @@ static inline bool erofs_inode_in_metabox(struct erofs_inode *inode)
 static inline erofs_off_t erofs_iloc(struct erofs_inode *inode)
 {
 	struct erofs_sb_info *sbi = inode->sbi;
-	erofs_off_t base = erofs_inode_in_metabox(inode) ? 0 :
-			erofs_pos(sbi, sbi->meta_blkaddr);
+	s64 base = erofs_inode_in_metabox(inode) ? 0 :
+		(s64)erofs_pos(sbi, sbi->meta_blkaddr);
 
 	return base + ((inode->nid & EROFS_DIRENT_NID_MASK) << EROFS_ISLOTBITS);
 }
@@ -434,8 +434,8 @@ int erofs_mkfs_init_devices(struct erofs_sb_info *sbi, unsigned int devices);
 int erofs_write_device_table(struct erofs_sb_info *sbi);
 int erofs_enable_sb_chksum(struct erofs_sb_info *sbi, u32 *crc);
 int erofs_superblock_csum_verify(struct erofs_sb_info *sbi);
-int erofs_mkfs_format_fs(struct erofs_sb_info *sbi,
-			 unsigned int blkszbits, unsigned int dsunit);
+int erofs_mkfs_format_fs(struct erofs_sb_info *sbi, unsigned int blkszbits,
+			 unsigned int dsunit, bool metazone);
 int erofs_mkfs_load_fs(struct erofs_sb_info *sbi, unsigned int dsunit);
 
 /* namei.c */
diff --git a/lib/importer.c b/lib/importer.c
index c73dde2529b7..958a433b9eaa 100644
--- a/lib/importer.c
+++ b/lib/importer.c
@@ -69,8 +69,8 @@ int erofs_importer_init(struct erofs_importer *im)
 			goto out_err;
 	}
 
-	subsys = "metabox";
-	err = erofs_metabox_init(sbi);
+	subsys = "metadata";
+	err = erofs_metadata_init(sbi);
 	if (err)
 		goto out_err;
 
@@ -107,6 +107,10 @@ int erofs_importer_flush_all(struct erofs_importer *im)
 	if (err)
 		return err;
 
+	err = erofs_metazone_flush(sbi);
+	if (err)
+		return err;
+
 	fsalignblks = im->params->fsalignblks ?
 		roundup_pow_of_two(im->params->fsalignblks) : 1;
 	sbi->primarydevice_blocks = roundup(erofs_mapbh(sbi->bmgr, NULL),
@@ -128,6 +132,6 @@ void erofs_importer_exit(struct erofs_importer *im)
 	struct erofs_sb_info *sbi = im->sbi;
 
 	z_erofs_dedupe_ext_exit();
-	erofs_metabox_exit(sbi);
+	erofs_metadata_exit(sbi);
 	erofs_packedfile_exit(sbi);
 }
diff --git a/lib/inode.c b/lib/inode.c
index 09b2e507c609..64f6bc34610f 100644
--- a/lib/inode.c
+++ b/lib/inode.c
@@ -376,18 +376,19 @@ erofs_nid_t erofs_lookupnid(struct erofs_inode *inode)
 {
 	struct erofs_buffer_head *const bh = inode->bh;
 	struct erofs_sb_info *sbi = inode->sbi;
-	erofs_off_t off, meta_offset;
+	erofs_off_t off;
+	s64 meta_offset;
 	erofs_nid_t nid;
 
 	if (bh && inode->nid == EROFS_NID_UNALLOCATED) {
 		erofs_mapbh(NULL, bh->block);
 		off = erofs_btell(bh, false);
 
-		if (!inode->in_metabox) {
-			meta_offset = erofs_pos(sbi, sbi->meta_blkaddr);
-			DBG_BUGON(off < meta_offset);
-		} else {
+		if (inode->in_metabox) {
 			meta_offset = 0;
+		} else {
+			meta_offset = (s64)erofs_pos(sbi, sbi->meta_blkaddr);
+			DBG_BUGON(off < meta_offset && !sbi->m2gr);
 		}
 
 		nid = (off - meta_offset) >> EROFS_ISLOTBITS;
@@ -718,8 +719,8 @@ int erofs_iflush(struct erofs_inode *inode)
 	struct erofs_sb_info *sbi = inode->sbi;
 	struct erofs_buffer_head *bh = inode->bh;
 	erofs_off_t off = erofs_iloc(inode);
-	struct erofs_bufmgr *ibmgr = inode->in_metabox ?
-				erofs_metabox_bmgr(sbi) : sbi->bmgr;
+	struct erofs_bufmgr *ibmgr =
+		erofs_metadata_bmgr(sbi, inode->in_metabox) ?: sbi->bmgr;
 	union {
 		struct erofs_inode_compact dic;
 		struct erofs_inode_extended die;
@@ -921,12 +922,9 @@ static int erofs_prepare_inode_buffer(struct erofs_importer *im,
 	if (inode->extent_isize)
 		inodesize = roundup(inodesize, 8) + inode->extent_isize;
 
-	if (!erofs_is_special_identifier(inode->i_srcpath) &&
-	    erofs_metabox_bmgr(sbi))
+	if (!erofs_is_special_identifier(inode->i_srcpath) && sbi->mxgr)
 		inode->in_metabox = true;
-
-	if (inode->in_metabox)
-		ibmgr = erofs_metabox_bmgr(sbi) ?: sbi->bmgr;
+	ibmgr = erofs_metadata_bmgr(sbi, inode->in_metabox) ?: sbi->bmgr;
 
 	if (inode->datalayout == EROFS_INODE_FLAT_PLAIN)
 		goto noinline;
@@ -1000,8 +998,8 @@ static int erofs_bh_flush_write_inline(struct erofs_buffer_head *bh)
 {
 	struct erofs_inode *const inode = bh->fsprivate;
 	struct erofs_sb_info *sbi = inode->sbi;
-	struct erofs_bufmgr *ibmgr = inode->in_metabox ?
-				erofs_metabox_bmgr(sbi) : sbi->bmgr;
+	struct erofs_bufmgr *ibmgr =
+		erofs_metadata_bmgr(sbi, inode->in_metabox) ?: sbi->bmgr;
 	const erofs_off_t off = erofs_btell(bh, false);
 	int ret;
 
@@ -1360,21 +1358,29 @@ static void erofs_fixup_meta_blkaddr(struct erofs_inode *root)
 	const erofs_off_t rootnid_maxoffset = 0xffff << EROFS_ISLOTBITS;
 	struct erofs_buffer_head *const bh = root->bh;
 	struct erofs_sb_info *sbi = root->sbi;
-	erofs_off_t meta_offset = 0;
+	int bsz = erofs_blksiz(sbi);
+	s64 meta_offset = 0;	/* stays 0 for metabox roots */
 	erofs_off_t off;
 
 	erofs_mapbh(NULL, bh->block);
 	off = erofs_btell(bh, false);
-	if (!root->in_metabox && off > rootnid_maxoffset)
-		meta_offset = round_up(off - rootnid_maxoffset,
-				       erofs_blksiz(sbi));
-	else if (root->in_metabox && !erofs_sb_has_48bit(sbi)) {
+	if (!root->in_metabox) {
+		if (!off) {
+			DBG_BUGON(!sbi->m2gr);
+			DBG_BUGON(sbi->meta_blkaddr != -1);
+			meta_offset = -bsz;	/* avoid NID 0 */
+		} else if (off > rootnid_maxoffset) {
+			meta_offset = round_up(off - rootnid_maxoffset, bsz);
+			sbi->meta_blkaddr = erofs_blknr(sbi, meta_offset);
+		} else {
+			meta_offset = 0;
+		}
+	} else if (!erofs_sb_has_48bit(sbi)) {
 		sbi->build_time = sbi->epoch;
 		sbi->epoch = max_t(s64, 0, (s64)sbi->build_time - UINT32_MAX);
 		sbi->build_time -= sbi->epoch;
 		erofs_sb_set_48bit(sbi);
 	}
-	sbi->meta_blkaddr = erofs_blknr(sbi, meta_offset);
 	root->nid = ((off - meta_offset) >> EROFS_ISLOTBITS) |
 		((u64)root->in_metabox << EROFS_DIRENT_NID_METABOX_BIT);
 }
diff --git a/lib/liberofs_metabox.h b/lib/liberofs_metabox.h
index d8896c01c298..bf4051cf18e2 100644
--- a/lib/liberofs_metabox.h
+++ b/lib/liberofs_metabox.h
@@ -4,6 +4,8 @@
 
 #include "erofs/internal.h"
 
+#define EROFS_META_NEW_ADDR	((u32)-1ULL)
+
 extern const char *erofs_metabox_identifier;
 #define EROFS_METABOX_INODE	erofs_metabox_identifier
 
@@ -12,11 +14,17 @@ static inline bool erofs_is_metabox_inode(struct erofs_inode *inode)
 	return inode->i_srcpath == EROFS_METABOX_INODE;
 }
 
+static inline bool erofs_has_meta_zone(struct erofs_sb_info *sbi)
+{
+	return sbi->m2gr || sbi->meta_blkaddr == EROFS_META_NEW_ADDR;
+}
+
 struct erofs_importer;
 
-void erofs_metabox_exit(struct erofs_sb_info *sbi);
-int erofs_metabox_init(struct erofs_sb_info *sbi);
-struct erofs_bufmgr *erofs_metabox_bmgr(struct erofs_sb_info *sbi);
+void erofs_metadata_exit(struct erofs_sb_info *sbi);
+int erofs_metadata_init(struct erofs_sb_info *sbi);
+struct erofs_bufmgr *erofs_metadata_bmgr(struct erofs_sb_info *sbi, bool mbox);
 int erofs_metabox_iflush(struct erofs_importer *im);
+int erofs_metazone_flush(struct erofs_sb_info *sbi);
 
 #endif
diff --git a/lib/metabox.c b/lib/metabox.c
index bf188f6db0f5..37267ddb73cf 100644
--- a/lib/metabox.c
+++ b/lib/metabox.c
@@ -2,81 +2,152 @@
 #include <stdlib.h>
 #include "erofs/inode.h"
 #include "erofs/importer.h"
+#include "erofs/print.h"
 #include "liberofs_cache.h"
 #include "liberofs_private.h"
 #include "liberofs_metabox.h"
 
 const char *erofs_metabox_identifier = "metabox";
 
-struct erofs_metaboxmgr {
+struct erofs_metamgr {
 	struct erofs_vfile vf;
 	struct erofs_bufmgr *bmgr;
 };
 
-void erofs_metabox_exit(struct erofs_sb_info *sbi)
+static void erofs_metamgr_exit(struct erofs_metamgr *m2gr)
 {
-	struct erofs_metaboxmgr *m2gr = sbi->m2gr;
-
-	if (!m2gr)
-		return;
 	DBG_BUGON(!m2gr->bmgr);
 	erofs_buffer_exit(m2gr->bmgr);
 	erofs_io_close(&m2gr->vf);
 	free(m2gr);
 }
-
-int erofs_metabox_init(struct erofs_sb_info *sbi)
+static int erofs_metamgr_init(struct erofs_sb_info *sbi,
+			      struct erofs_metamgr *m2gr)
 {
-	struct erofs_metaboxmgr *m2gr;
 	int ret;
 
-	if (!erofs_sb_has_metabox(sbi))
-		return 0;
-
-	m2gr = malloc(sizeof(*m2gr));
-	if (!m2gr)
-		return -ENOMEM;
-
 	ret = erofs_tmpfile();
 	if (ret < 0)
-		goto out_err;
+		return ret;
 
 	m2gr->vf = (struct erofs_vfile){ .fd = ret };
 	m2gr->bmgr = erofs_buffer_init(sbi, 0, &m2gr->vf);
-	if (m2gr->bmgr) {
+	if (!m2gr->bmgr)
+		return -ENOMEM;
+	return 0;
+}
+
+void erofs_metadata_exit(struct erofs_sb_info *sbi)
+{
+	if (sbi->m2gr) {
+		erofs_metamgr_exit(sbi->m2gr);
+		sbi->m2gr = NULL;
+	}
+	if (sbi->mxgr) {
+		erofs_metamgr_exit(sbi->mxgr);
+		sbi->mxgr = NULL;
+	}
+}
+
+int erofs_metadata_init(struct erofs_sb_info *sbi)
+{
+	struct erofs_metamgr *m2gr;
+	int ret;
+
+	if (!sbi->m2gr && sbi->meta_blkaddr == EROFS_META_NEW_ADDR) {
+		m2gr = malloc(sizeof(*m2gr));
+		if (!m2gr)
+			return -ENOMEM;
+		ret = erofs_metamgr_init(sbi, m2gr);
+		if (ret)
+			goto err_free;
 		sbi->m2gr = m2gr;
-		return 0;
 	}
-	ret = -ENOMEM;
-out_err:
+
+	if (!sbi->mxgr && erofs_sb_has_metabox(sbi)) {
+		m2gr = malloc(sizeof(*m2gr));
+		if (!m2gr)
+			return -ENOMEM;
+		ret = erofs_metamgr_init(sbi, m2gr);
+		if (ret)
+			goto err_free;
+		sbi->mxgr = m2gr;
+	}
+	return 0;
+err_free:
 	free(m2gr);
 	return ret;
 }
 
-struct erofs_bufmgr *erofs_metabox_bmgr(struct erofs_sb_info *sbi)
+struct erofs_bufmgr *erofs_metadata_bmgr(struct erofs_sb_info *sbi, bool mbox)
 {
-	return sbi->m2gr ? sbi->m2gr->bmgr : NULL;
+	if (mbox) {
+		if (sbi->mxgr)
+			return sbi->mxgr->bmgr;
+	} else if (sbi->m2gr) {
+		return sbi->m2gr->bmgr;
+	}
+	return NULL;
 }
 
 int erofs_metabox_iflush(struct erofs_importer *im)
 {
 	struct erofs_sb_info *sbi = im->sbi;
-	struct erofs_metaboxmgr *m2gr = sbi->m2gr;
+	struct erofs_metamgr *mxgr = sbi->mxgr;
 	struct erofs_inode *inode;
 	int err;
 
-	if (!m2gr || !erofs_sb_has_metabox(sbi))
+	if (!mxgr || !erofs_sb_has_metabox(sbi))
 		return -EINVAL;
 
-	err = erofs_bflush(m2gr->bmgr, NULL);
+	err = erofs_bflush(mxgr->bmgr, NULL);
 	if (err)
 		return err;
 
-	if (erofs_io_lseek(&m2gr->vf, 0, SEEK_END) <= 0)
+	if (erofs_io_lseek(&mxgr->vf, 0, SEEK_END) <= 0)
 		return 0;
-	inode = erofs_mkfs_build_special_from_fd(im, m2gr->vf.fd,
+	inode = erofs_mkfs_build_special_from_fd(im, mxgr->vf.fd,
 						 EROFS_METABOX_INODE);
 	sbi->metabox_nid = erofs_lookupnid(inode);
 	erofs_iput(inode);
 	return 0;
 }
+
+int erofs_metazone_flush(struct erofs_sb_info *sbi)
+{
+	struct erofs_metamgr *m2gr = sbi->m2gr;
+	struct erofs_buffer_head *bh;
+	struct erofs_bufmgr *m2bgr;
+	erofs_blk_t meta_blkaddr;
+	u64 length, pos_out;
+	int ret, count;
+
+	if (!m2gr)	/* no separate inode meta zone in use */
+		return 0;
+	m2bgr = m2gr->bmgr;
+
+	ret = erofs_bflush(m2bgr, NULL);
+	if (ret)
+		return ret;
+
+	length = erofs_mapbh(m2bgr, NULL) << sbi->blkszbits;
+	bh = erofs_balloc(sbi->bmgr, DATA, length, 0);
+	if (IS_ERR(bh))	/* failure is checked with IS_ERR(), as for erofs_reserve_sb() */
+		return PTR_ERR(bh);
+
+	erofs_mapbh(NULL, bh->block);
+	pos_out = erofs_btell(bh, false);
+	meta_blkaddr = pos_out >> sbi->blkszbits;
+	do {	/* copy the staged zone into the image in <= INT_MAX chunks */
+		count = min_t(erofs_off_t, length, INT_MAX);
+		ret = erofs_io_xcopy(sbi->bmgr->vf, pos_out,
+				     &m2gr->vf, count, false);
+		if (ret < 0)
+			break;
+		pos_out += count;
+	} while (length -= count);
+	bh->op = &erofs_drop_directly_bhops;
+	erofs_bdrop(bh, false);
+	sbi->meta_blkaddr += meta_blkaddr;
+	return ret < 0 ? ret : 0;	/* do not swallow a failed copy */
+}
diff --git a/lib/super.c b/lib/super.c
index 9760265aa754..d626c7cdc76f 100644
--- a/lib/super.c
+++ b/lib/super.c
@@ -8,6 +8,7 @@
 #include "erofs/xattr.h"
 #include "liberofs_cache.h"
 #include "liberofs_compress.h"
+#include "liberofs_metabox.h"
 
 static bool check_layout_compatibility(struct erofs_sb_info *sbi,
 				       struct erofs_super_block *dsb)
@@ -418,8 +419,8 @@ out:
 	return 0;
 }
 
-int erofs_mkfs_format_fs(struct erofs_sb_info *sbi,
-			 unsigned int blkszbits, unsigned int dsunit)
+int erofs_mkfs_format_fs(struct erofs_sb_info *sbi, unsigned int blkszbits,
+			 unsigned int dsunit, bool metazone)
 {
 	struct erofs_buffer_head *bh;
 	struct erofs_bufmgr *bmgr;
@@ -430,7 +431,10 @@ int erofs_mkfs_format_fs(struct erofs_sb_info *sbi,
 		return -ENOMEM;
 	sbi->bmgr = bmgr;
 	bmgr->dsunit = dsunit;
-
+	if (metazone)
+		sbi->meta_blkaddr = EROFS_META_NEW_ADDR;
+	else
+		sbi->meta_blkaddr = 0;
 	bh = erofs_reserve_sb(bmgr);
 	if (IS_ERR(bh))
 		return PTR_ERR(bh);
diff --git a/lib/xattr.c b/lib/xattr.c
index fc22c817f136..8f0332b44a02 100644
--- a/lib/xattr.c
+++ b/lib/xattr.c
@@ -828,7 +828,7 @@ int erofs_xattr_flush_name_prefixes(struct erofs_importer *im, bool plain)
 
 	if (!plain) {
 		if (erofs_sb_has_metabox(sbi)) {
-			bmgr = erofs_metabox_bmgr(sbi);
+			bmgr = erofs_metadata_bmgr(sbi, true);
 			vf = bmgr->vf;
 		} else if (may_fragments) {
 			erofs_sb_set_fragments(sbi);
diff --git a/mkfs/main.c b/mkfs/main.c
index 4de298b6dedd..76bf84348364 100644
--- a/mkfs/main.c
+++ b/mkfs/main.c
@@ -101,6 +101,7 @@ static struct option long_options[] = {
 	{"oci", optional_argument, NULL, 534},
 #endif
 	{"zD", optional_argument, NULL, 536},
+	{"ZI", optional_argument, NULL, 537},
 	{0, 0, 0, 0},
 };
 
@@ -176,6 +177,7 @@ static void usage(int argc, char **argv)
 		"    --mkfs-time        the timestamp is applied as build time only\n"
 		" -UX                   use a given filesystem UUID\n"
 		" --zD[=<0|1>]          specify directory compression: 0=disable [default], 1=enable\n"
+		" --ZI[=<0|1>]          specify the separate inode metadata zone availability: 0=disable [default], 1=enable\n"
 		" --all-root            make all files owned by root\n"
 #ifdef EROFS_MT_ENABLED
 		" --async-queue-limit=# specify the maximum number of entries in the multi-threaded job queue\n"
@@ -269,6 +271,7 @@ static void version(void)
 static struct erofsmkfs_cfg {
 	/* < 0, xattr disabled and >= INT_MAX, always use inline xattrs */
 	long inlinexattr_tolerance;
+	bool inode_metazone;
 } mkfscfg = {
 	.inlinexattr_tolerance = 2,
 };
@@ -1412,6 +1415,12 @@ static int mkfs_parse_options_cfg(struct erofs_importer_params *params,
 			else
 				params->compress_dir = false;
 			break;
+		case 537:	/* --ZI[=<0|1>]: bare option or "1" enables */
+			if (!optarg || !strcmp(optarg, "1"))
+				mkfscfg.inode_metazone = true;
+			else
+				mkfscfg.inode_metazone = false;
+			break;
 		case 'V':
 			version();
 			exit(0);
@@ -1787,7 +1796,8 @@ int main(int argc, char **argv)
 	}
 
 	if (!incremental_mode)
-		err = erofs_mkfs_format_fs(&g_sbi, mkfs_blkszbits, dsunit);
+		err = erofs_mkfs_format_fs(&g_sbi, mkfs_blkszbits, dsunit,
+					   mkfscfg.inode_metazone);
 	else
 		err = erofs_mkfs_load_fs(&g_sbi, dsunit);
 	if (err)
-- 
2.43.5



More information about the Linux-erofs mailing list