[PATCH] erofs-utils: mkfs: enable directory data in the metadata zone
Gao Xiang
hsiangkao at linux.alibaba.com
Fri Jan 16 20:45:13 AEDT 2026
This patch allows directory data to be stored in the metadata zone, so
that directory data and inode metadata are kept as close together as
possible. This significantly improves metadata performance for
long-latency remote image use cases.
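Usage:
$ mkfs.erofs --MZ foo.erofs foo/
(`--MZ` without an argument places both inode metadata and directory data
in the metadata zone; `--MZ=i` selects inode metadata only, `--MZ=id`
selects both explicitly, and `--MZ=0` disables the metadata zone.)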
Signed-off-by: Gao Xiang <hsiangkao at linux.alibaba.com>
---
include/erofs/importer.h | 1 +
include/erofs/inode.h | 3 +-
include/erofs/internal.h | 8 ++++
lib/cache.c | 4 ++
lib/inode.c | 98 ++++++++++++++++++++++++++++------------
lib/io.c | 5 +-
lib/metabox.c | 22 +++++----
lib/remotes/s3.c | 3 +-
lib/super.c | 4 +-
lib/tar.c | 2 +-
mkfs/main.c | 27 +++++++++--
11 files changed, 129 insertions(+), 48 deletions(-)
diff --git a/include/erofs/importer.h b/include/erofs/importer.h
index a525b474f1d5..60160d6bea05 100644
--- a/include/erofs/importer.h
+++ b/include/erofs/importer.h
@@ -46,6 +46,7 @@ struct erofs_importer_params {
bool no_datainline;
/* Issue directory data (except inline data) separately from regular inodes */
bool grouped_dirdata;
+ bool dirdata_in_metazone;
bool hard_dereference;
bool ovlfs_strip;
bool dot_omitted;
diff --git a/include/erofs/inode.h b/include/erofs/inode.h
index 89bd16aecc06..ba62ece9a7cc 100644
--- a/include/erofs/inode.h
+++ b/include/erofs/inode.h
@@ -38,7 +38,8 @@ erofs_nid_t erofs_lookupnid(struct erofs_inode *inode);
int erofs_iflush(struct erofs_inode *inode);
struct erofs_dentry *erofs_d_alloc(struct erofs_inode *parent,
const char *name);
-int erofs_allocate_inode_bh_data(struct erofs_inode *inode, erofs_blk_t nblocks);
+int erofs_allocate_inode_bh_data(struct erofs_inode *inode, erofs_blk_t nblocks,
+ bool in_metazone);
bool erofs_dentry_is_wht(struct erofs_sb_info *sbi, struct erofs_dentry *d);
int __erofs_fill_inode(struct erofs_importer *im, struct erofs_inode *inode,
struct stat *st, const char *path);
diff --git a/include/erofs/internal.h b/include/erofs/internal.h
index 5798f10e89c2..2fe4514b3d23 100644
--- a/include/erofs/internal.h
+++ b/include/erofs/internal.h
@@ -157,6 +157,7 @@ struct erofs_sb_info {
struct erofs_buffer_head *bh_devt;
bool useqpl;
bool sb_valid;
+ u32 metazone_startblk;
};
/* make sure that any user of the erofs headers has atleast 64bit off_t type */
@@ -205,6 +206,8 @@ struct erofs_diskbuf;
#define EROFS_INODE_DATA_SOURCE_DISKBUF 2
#define EROFS_INODE_DATA_SOURCE_RESVSP 3
+#define EROFS_I_BLKADDR_DEV_ID_BIT 48
+
struct erofs_inode {
struct list_head i_hash, i_subdirs, i_xattrs;
@@ -308,6 +311,11 @@ static inline bool erofs_inode_in_metabox(struct erofs_inode *inode)
return inode->nid >> EROFS_DIRENT_NID_METABOX_BIT;
}
+static inline erofs_blk_t erofs_inode_dev_baddr(struct erofs_inode *inode)
+{
+ return inode->u.i_blkaddr & (BIT(EROFS_I_BLKADDR_DEV_ID_BIT) - 1);
+}
+
static inline erofs_off_t erofs_iloc(struct erofs_inode *inode)
{
struct erofs_sb_info *sbi = inode->sbi;
diff --git a/lib/cache.c b/lib/cache.c
index a87575ad74d1..f23dbb06264a 100644
--- a/lib/cache.c
+++ b/lib/cache.c
@@ -479,6 +479,10 @@ static int __erofs_bflush(struct erofs_bufmgr *bmgr,
/* flush and remove bh */
ret = bh->op->flush(bh);
+ if (__erofs_unlikely(ret == -EBUSY && !forget)) {
+ skip = true;
+ continue;
+ }
if (ret < 0)
return ret;
}
diff --git a/lib/inode.c b/lib/inode.c
index e44e03cf460f..88dc41b19e5b 100644
--- a/lib/inode.c
+++ b/lib/inode.c
@@ -194,9 +194,12 @@ struct erofs_dentry *erofs_d_alloc(struct erofs_inode *parent,
}
/* allocate main data for an inode */
-int erofs_allocate_inode_bh_data(struct erofs_inode *inode, erofs_blk_t nblocks)
+int erofs_allocate_inode_bh_data(struct erofs_inode *inode, erofs_blk_t nblocks,
+ bool in_metazone)
{
- struct erofs_bufmgr *bmgr = inode->sbi->bmgr;
+ struct erofs_sb_info *sbi = inode->sbi;
+ struct erofs_bufmgr *bmgr = in_metazone ?
+ erofs_metadata_bmgr(sbi, false) : sbi->bmgr;
struct erofs_buffer_head *bh;
int ret, type;
@@ -206,9 +209,15 @@ int erofs_allocate_inode_bh_data(struct erofs_inode *inode, erofs_blk_t nblocks)
return 0;
}
+ if (in_metazone && !bmgr) {
+ erofs_err("cannot allocate data in the metazone when unavailable for %s",
+ inode->i_srcpath);
+ return -EINVAL;
+ }
+
/* allocate main data buffer */
type = S_ISDIR(inode->i_mode) ? DIRA : DATA;
- bh = erofs_balloc(bmgr, type, erofs_pos(inode->sbi, nblocks), 0);
+ bh = erofs_balloc(bmgr, type, erofs_pos(sbi, nblocks), 0);
if (IS_ERR(bh))
return PTR_ERR(bh);
@@ -220,7 +229,8 @@ int erofs_allocate_inode_bh_data(struct erofs_inode *inode, erofs_blk_t nblocks)
DBG_BUGON(ret < 0);
/* write blocks except for the tail-end block */
- inode->u.i_blkaddr = bh->block->blkaddr;
+ inode->u.i_blkaddr = bh->block->blkaddr | (in_metazone ?
+ (sbi->extra_devices + 1ULL) << EROFS_I_BLKADDR_DEV_ID_BIT : 0);
return 0;
}
@@ -591,7 +601,7 @@ int erofs_write_file_from_buffer(struct erofs_inode *inode, char *buf)
inode->datalayout = EROFS_INODE_FLAT_INLINE;
- ret = erofs_allocate_inode_bh_data(inode, nblocks);
+ ret = erofs_allocate_inode_bh_data(inode, nblocks, false);
if (ret)
return ret;
@@ -622,16 +632,17 @@ static bool erofs_file_is_compressible(struct erofs_importer *im,
static int erofs_write_unencoded_data(struct erofs_inode *inode,
struct erofs_vfile *vf, erofs_off_t fpos,
- bool noseek)
+ bool noseek, bool in_metazone)
{
struct erofs_sb_info *sbi = inode->sbi;
- erofs_blk_t nblocks, i;
+ struct erofs_bufmgr *bmgr;
+ erofs_off_t remaining, pos;
unsigned int len;
int ret;
if (!noseek && erofs_sb_has_48bit(sbi)) {
- if (erofs_io_lseek(vf, fpos, SEEK_DATA) < 0 && errno == ENXIO) {
- ret = erofs_allocate_inode_bh_data(inode, 0);
+ if (erofs_io_lseek(vf, fpos, SEEK_DATA) == -ENXIO) {
+ ret = erofs_allocate_inode_bh_data(inode, 0, false);
if (ret)
return ret;
inode->datalayout = EROFS_INODE_FLAT_PLAIN;
@@ -640,27 +651,31 @@ static int erofs_write_unencoded_data(struct erofs_inode *inode,
ret = erofs_io_lseek(vf, fpos, SEEK_SET);
if (ret < 0)
return ret;
- else if (ret != fpos)
+ if (ret != fpos)
return -EIO;
}
- nblocks = inode->i_size >> sbi->blkszbits;
- ret = erofs_allocate_inode_bh_data(inode, nblocks);
+ inode->idata_size = inode->i_size % erofs_blksiz(sbi);
+ remaining = inode->i_size - inode->idata_size;
+
+ ret = erofs_allocate_inode_bh_data(inode, remaining >> sbi->blkszbits,
+ in_metazone);
if (ret)
return ret;
- for (i = 0; i < nblocks; i += (len >> sbi->blkszbits)) {
+ bmgr = in_metazone ? erofs_metadata_bmgr(sbi, false) : sbi->bmgr;
+ pos = erofs_pos(sbi, erofs_inode_dev_baddr(inode));
+ while (remaining) {
len = min_t(u64, round_down(UINT_MAX, 1U << sbi->blkszbits),
- erofs_pos(sbi, nblocks - i));
- ret = erofs_io_xcopy(&sbi->bdev,
- erofs_pos(sbi, inode->u.i_blkaddr + i),
- vf, len, noseek);
+ remaining);
+ ret = erofs_io_xcopy(bmgr->vf, pos, vf, len, noseek);
if (ret)
return ret;
+ pos += len;
+ remaining -= len;
}
/* read the tail-end data */
- inode->idata_size = inode->i_size % erofs_blksiz(sbi);
if (inode->idata_size) {
inode->idata = malloc(inode->idata_size);
if (!inode->idata)
@@ -691,10 +706,11 @@ int erofs_write_unencoded_file(struct erofs_inode *inode, int fd, u64 fpos)
/* fallback to all data uncompressed */
return erofs_write_unencoded_data(inode,
&(struct erofs_vfile){ .fd = fd }, fpos,
- inode->datasource == EROFS_INODE_DATA_SOURCE_DISKBUF);
+ inode->datasource == EROFS_INODE_DATA_SOURCE_DISKBUF, false);
}
-static int erofs_write_dir_file(struct erofs_inode *dir)
+static int erofs_write_dir_file(const struct erofs_importer *im,
+ struct erofs_inode *dir)
{
unsigned int bsz = erofs_blksiz(dir->sbi);
struct erofs_vfile *vf;
@@ -708,7 +724,8 @@ static int erofs_write_dir_file(struct erofs_inode *dir)
err = erofs_write_compress_dir(dir, vf);
} else {
DBG_BUGON(dir->idata_size != (dir->i_size & (bsz - 1)));
- err = erofs_write_unencoded_data(dir, vf, 0, true);
+ err = erofs_write_unencoded_data(dir, vf, 0, true,
+ im->params->dirdata_in_metazone);
}
erofs_io_close(vf);
return err;
@@ -732,19 +749,39 @@ int erofs_iflush(struct erofs_inode *inode)
struct iovec iov[2];
char *xattrs = NULL;
bool nlink_1 = true;
- int ret, fmt;
+ int ret, fmt, dev_id;
DBG_BUGON(bh && erofs_btell(bh, false) != off);
-
if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) ||
- S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode))
+ S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) {
u1.rdev = cpu_to_le32(inode->u.i_rdev);
- else if (is_inode_layout_compression(inode))
+ } else if (is_inode_layout_compression(inode)) {
u1.blocks_lo = cpu_to_le32(inode->u.i_blocks);
- else if (inode->datalayout == EROFS_INODE_CHUNK_BASED)
+ } else if (inode->datalayout == EROFS_INODE_CHUNK_BASED) {
u1.c.format = cpu_to_le16(inode->u.chunkformat);
- else
+ } else {
+ if (inode->u.i_blkaddr != EROFS_NULL_ADDR) {
+ dev_id = inode->u.i_blkaddr >> EROFS_I_BLKADDR_DEV_ID_BIT;
+
+ if (dev_id) {
+ if (dev_id <= sbi->extra_devices) {
+ if (!sbi->devs[dev_id - 1].uniaddr) {
+ DBG_BUGON(1); /* impossible now */
+ return -EBUSY;
+ }
+ inode->u.i_blkaddr += sbi->devs[dev_id - 1].uniaddr;
+ } else {
+ if (sbi->metazone_startblk == EROFS_META_NEW_ADDR) {
+ DBG_BUGON(1); /* impossible now */
+ return -EBUSY;
+ }
+ DBG_BUGON(dev_id != sbi->extra_devices + 1);
+ inode->u.i_blkaddr += sbi->metazone_startblk;
+ }
+ }
+ }
u1.startblk_lo = cpu_to_le32(inode->u.i_blkaddr);
+ }
if (is_inode_layout_compression(inode) &&
inode->u.i_blocks > UINT32_MAX) {
@@ -894,7 +931,7 @@ static bool erofs_inode_need_48bit(struct erofs_inode *inode)
return true;
} else if (!is_inode_layout_compression(inode)) {
if (inode->u.i_blkaddr != EROFS_NULL_ADDR &&
- inode->u.i_blkaddr > UINT32_MAX)
+ erofs_inode_dev_baddr(inode) > UINT32_MAX)
return true;
}
return false;
@@ -1564,7 +1601,7 @@ static int erofs_mkfs_jobfn(const struct erofs_mkfs_btctx *ctx,
return erofs_mkfs_create_directory(ctx, inode);
if (item->type == EROFS_MKFS_JOB_DIR_BH) {
- ret = erofs_write_dir_file(inode);
+ ret = erofs_write_dir_file(ctx->im, inode);
if (ret)
return ret;
erofs_write_tail_end(inode);
@@ -2313,7 +2350,8 @@ struct erofs_inode *erofs_mkfs_build_special_from_fd(struct erofs_importer *im,
inode->datalayout = EROFS_INODE_FLAT_INLINE;
ret = erofs_write_unencoded_data(inode,
&(struct erofs_vfile){ .fd = fd }, 0,
- inode->datasource == EROFS_INODE_DATA_SOURCE_DISKBUF);
+ inode->datasource == EROFS_INODE_DATA_SOURCE_DISKBUF,
+ false);
if (ret)
return ERR_PTR(ret);
out:
diff --git a/lib/io.c b/lib/io.c
index 37a74f63c45e..0c5eb2c29989 100644
--- a/lib/io.c
+++ b/lib/io.c
@@ -571,10 +571,13 @@ ssize_t erofs_io_write(struct erofs_vfile *vf, void *buf, size_t len)
off_t erofs_io_lseek(struct erofs_vfile *vf, u64 offset, int whence)
{
+ off_t ret;
+
if (vf->ops)
return vf->ops->lseek(vf, offset, whence);
- return lseek(vf->fd, offset, whence);
+ ret = lseek(vf->fd, offset, whence);
+ return ret < 0 ? -errno : ret;
}
ssize_t erofs_io_sendfile(struct erofs_vfile *vout, struct erofs_vfile *vin,
diff --git a/lib/metabox.c b/lib/metabox.c
index 37267ddb73cf..d6abd5123cc8 100644
--- a/lib/metabox.c
+++ b/lib/metabox.c
@@ -54,7 +54,7 @@ int erofs_metadata_init(struct erofs_sb_info *sbi)
struct erofs_metamgr *m2gr;
int ret;
- if (!sbi->m2gr && sbi->meta_blkaddr == EROFS_META_NEW_ADDR) {
+ if (!sbi->m2gr && sbi->metazone_startblk == EROFS_META_NEW_ADDR) {
m2gr = malloc(sizeof(*m2gr));
if (!m2gr)
return -ENOMEM;
@@ -62,6 +62,8 @@ int erofs_metadata_init(struct erofs_sb_info *sbi)
if (ret)
goto err_free;
sbi->m2gr = m2gr;
+ /* FIXME: sbi->meta_blkaddr should be 0 for 48-bit layouts */
+ sbi->meta_blkaddr = EROFS_META_NEW_ADDR;
}
if (!sbi->mxgr && erofs_sb_has_metabox(sbi)) {
@@ -124,20 +126,24 @@ int erofs_metazone_flush(struct erofs_sb_info *sbi)
if (!m2gr)
return 0;
- m2bgr = m2gr->bmgr;
+ bh = erofs_balloc(sbi->bmgr, DATA, 0, 0);
+ if (!bh)
+ return PTR_ERR(bh);
+ erofs_mapbh(NULL, bh->block);
+ pos_out = erofs_btell(bh, false);
+ meta_blkaddr = pos_out >> sbi->blkszbits;
+ sbi->metazone_startblk = meta_blkaddr;
+ m2bgr = m2gr->bmgr;
ret = erofs_bflush(m2bgr, NULL);
if (ret)
return ret;
length = erofs_mapbh(m2bgr, NULL) << sbi->blkszbits;
- bh = erofs_balloc(sbi->bmgr, DATA, length, 0);
- if (!bh)
- return PTR_ERR(bh);
+ ret = erofs_bh_balloon(bh, length);
+ if (ret < 0)
+ return ret;
- erofs_mapbh(NULL, bh->block);
- pos_out = erofs_btell(bh, false);
- meta_blkaddr = pos_out >> sbi->blkszbits;
do {
count = min_t(erofs_off_t, length, INT_MAX);
ret = erofs_io_xcopy(sbi->bmgr->vf, pos_out,
diff --git a/lib/remotes/s3.c b/lib/remotes/s3.c
index 223c3e89d6fd..b0ca84b51afc 100644
--- a/lib/remotes/s3.c
+++ b/lib/remotes/s3.c
@@ -1032,7 +1032,8 @@ static int s3erofs_remote_getobject(struct erofs_importer *im,
inode->datalayout = EROFS_INODE_FLAT_PLAIN;
inode->idata_size = 0;
ret = erofs_allocate_inode_bh_data(inode,
- DIV_ROUND_UP(inode->i_size, 1U << sbi->blkszbits));
+ DIV_ROUND_UP(inode->i_size, 1U << sbi->blkszbits),
+ false);
if (ret)
return ret;
resp.vf = &sbi->bdev;
diff --git a/lib/super.c b/lib/super.c
index a4837e5702ed..0180087e184e 100644
--- a/lib/super.c
+++ b/lib/super.c
@@ -445,9 +445,9 @@ int erofs_mkfs_format_fs(struct erofs_sb_info *sbi, unsigned int blkszbits,
sbi->bmgr = bmgr;
bmgr->dsunit = dsunit;
if (metazone)
- sbi->meta_blkaddr = EROFS_META_NEW_ADDR;
+ sbi->metazone_startblk = EROFS_META_NEW_ADDR;
else
- sbi->meta_blkaddr = 0;
+ sbi->metazone_startblk = 0;
bh = erofs_reserve_sb(bmgr);
if (IS_ERR(bh))
return PTR_ERR(bh);
diff --git a/lib/tar.c b/lib/tar.c
index d5095169f9ba..1f3092566bd9 100644
--- a/lib/tar.c
+++ b/lib/tar.c
@@ -632,7 +632,7 @@ static int tarerofs_write_uncompressed_file(struct erofs_inode *inode,
inode->datalayout = EROFS_INODE_FLAT_PLAIN;
nblocks = DIV_ROUND_UP(inode->i_size, 1U << sbi->blkszbits);
- ret = erofs_allocate_inode_bh_data(inode, nblocks);
+ ret = erofs_allocate_inode_bh_data(inode, nblocks, false);
if (ret)
return ret;
diff --git a/mkfs/main.c b/mkfs/main.c
index 620b1ed2b0c3..ffcb8cf75225 100644
--- a/mkfs/main.c
+++ b/mkfs/main.c
@@ -100,7 +100,7 @@ static struct option long_options[] = {
{"oci", optional_argument, NULL, 534},
#endif
{"zD", optional_argument, NULL, 536},
- {"ZI", optional_argument, NULL, 537},
+ {"MZ", optional_argument, NULL, 537},
{"xattr-prefix", required_argument, NULL, 538},
{"xattr-inode-digest", required_argument, NULL, 539},
{0, 0, 0, 0},
@@ -178,7 +178,8 @@ static void usage(int argc, char **argv)
" --mkfs-time the timestamp is applied as build time only\n"
" -UX use a given filesystem UUID\n"
" --zD[=<0|1>] specify directory compression: 0=disable [default], 1=enable\n"
- " --ZI[=<0|1>] specify the separate inode metadata zone availability: 0=disable [default], 1=enable\n"
+ " --MZ[=<0|[id]>] put inode metadata ('i') and/or directory data ('d') into the separate metadata zone.\n"
+ " No argument enables both. 0=disable [default].\n"
" --all-root make all files owned by root\n"
#ifdef EROFS_MT_ENABLED
" --async-queue-limit=# specify the maximum number of entries in the multi-threaded job queue\n"
@@ -1411,10 +1412,28 @@ static int mkfs_parse_options_cfg(struct erofs_importer_params *params,
}
break;
case 537:
- if (!optarg || strcmp(optarg, "1"))
+ if (!optarg) {
mkfscfg.inode_metazone = true;
- else
+ params->dirdata_in_metazone = true;
+ } else if (!strcmp(optarg, "0")) {
mkfscfg.inode_metazone = false;
+ params->dirdata_in_metazone = false;
+ } else {
+ for (i = 0; optarg[i]; ++i) {
+ if (optarg[i] == 'i') {
+ mkfscfg.inode_metazone = true;
+ } else if (optarg[i] == 'd') {
+ params->dirdata_in_metazone = true;
+ } else {
+ erofs_err("invalid metazone flags `%s`", optarg);
+ return -EINVAL;
+ }
+ }
+ if (params->dirdata_in_metazone && !mkfscfg.inode_metazone) {
+ erofs_err("inode metadata must be in the metadata zone if directory data is stored there");
+ return -EINVAL;
+ }
+ }
break;
case 538:
errno = 0;
--
2.43.5
More information about the Linux-erofs
mailing list