[PATCH] erofs-utils: mkfs: avoid partial writes in erofs_write_tail_end()
Gao Xiang
hsiangkao at linux.alibaba.com
Wed Jun 11 13:25:09 AEST 2025
Partial-block writes to a physical block device (rather than into a hole
of a regular file) can cause unnecessary read-modify-write cycles. In
particular, the redundant reads can incur noticeable overhead on cloud disks.
Write full blocks with pwritev(2) instead.
Signed-off-by: Gao Xiang <hsiangkao at linux.alibaba.com>
---
configure.ac | 2 ++
include/erofs/io.h | 5 +++++
lib/inode.c | 33 +++++++++++++++------------------
lib/io.c | 33 +++++++++++++++++++++++++++++++++
4 files changed, 55 insertions(+), 18 deletions(-)
diff --git a/configure.ac b/configure.ac
index 88f1cbe..a73a9ba 100644
--- a/configure.ac
+++ b/configure.ac
@@ -216,6 +216,7 @@ AC_CHECK_HEADERS(m4_flatten([
sys/statfs.h
sys/sysmacros.h
sys/time.h
+ sys/uio.h
unistd.h
]))
@@ -274,6 +275,7 @@ AC_CHECK_FUNCS(m4_flatten([
ftello64
pread64
pwrite64
+ pwritev
posix_fadvise
fstatfs
sendfile
diff --git a/include/erofs/io.h b/include/erofs/io.h
index 3179ea1..101a5ba 100644
--- a/include/erofs/io.h
+++ b/include/erofs/io.h
@@ -16,6 +16,7 @@ extern "C"
#define _GNU_SOURCE
#endif
#include <unistd.h>
+#include <sys/uio.h>
#include "defs.h"
#ifndef O_BINARY
@@ -27,6 +28,8 @@ struct erofs_vfile;
struct erofs_vfops {
ssize_t (*pread)(struct erofs_vfile *vf, void *buf, u64 offset, size_t len);
ssize_t (*pwrite)(struct erofs_vfile *vf, const void *buf, u64 offset, size_t len);
+ ssize_t (*pwritev)(struct erofs_vfile *vf, const struct iovec *iov,
+ int iovcnt, u64 pos);
int (*fsync)(struct erofs_vfile *vf);
int (*fallocate)(struct erofs_vfile *vf, u64 offset, size_t len, bool pad);
int (*ftruncate)(struct erofs_vfile *vf, u64 length);
@@ -53,6 +56,8 @@ ssize_t __erofs_io_write(int fd, const void *buf, size_t len);
int erofs_io_fstat(struct erofs_vfile *vf, struct stat *buf);
ssize_t erofs_io_pwrite(struct erofs_vfile *vf, const void *buf, u64 pos, size_t len);
+ssize_t erofs_io_pwritev(struct erofs_vfile *vf, const struct iovec *iov,
+ int iovcnt, u64 pos);
int erofs_io_fsync(struct erofs_vfile *vf);
ssize_t erofs_io_fallocate(struct erofs_vfile *vf, u64 offset, size_t len, bool pad);
int erofs_io_ftruncate(struct erofs_vfile *vf, u64 length);
diff --git a/lib/inode.c b/lib/inode.c
index a36ade2..09f519b 100644
--- a/lib/inode.c
+++ b/lib/inode.c
@@ -827,6 +827,7 @@ static struct erofs_bhops erofs_write_inline_bhops = {
static int erofs_write_tail_end(struct erofs_inode *inode)
{
+ static const u8 zeroed[EROFS_MAX_BLOCK_SIZE];
struct erofs_sb_info *sbi = inode->sbi;
struct erofs_buffer_head *bh, *ibh;
@@ -843,8 +844,10 @@ static int erofs_write_tail_end(struct erofs_inode *inode)
ibh->fsprivate = erofs_igrab(inode);
ibh->op = &erofs_write_inline_bhops;
} else {
+ struct iovec iov[2];
+ erofs_off_t pos;
int ret;
- erofs_off_t pos, zero_pos;
+ bool h0;
if (!bh) {
bh = erofs_balloc(sbi->bmgr,
@@ -874,25 +877,19 @@ static int erofs_write_tail_end(struct erofs_inode *inode)
pos = erofs_btell(bh, true) - erofs_blksiz(sbi);
/* 0'ed data should be padded at head for 0padding conversion */
- if (erofs_sb_has_lz4_0padding(sbi) && inode->compressed_idata) {
- zero_pos = pos;
- pos += erofs_blksiz(sbi) - inode->idata_size;
- } else {
- /* pad 0'ed data for the other cases */
- zero_pos = pos + inode->idata_size;
- }
- ret = erofs_dev_write(sbi, inode->idata, pos, inode->idata_size);
- if (ret)
+ h0 = erofs_sb_has_lz4_0padding(sbi) && inode->compressed_idata;
+ DBG_BUGON(inode->idata_size > erofs_blksiz(sbi));
+
+ iov[h0] = (struct iovec) { .iov_base = inode->idata,
+ .iov_len = inode->idata_size };
+ iov[!h0] = (struct iovec) { .iov_base = (u8 *)zeroed,
+ erofs_blksiz(sbi) - inode->idata_size };
+ ret = erofs_io_pwritev(&sbi->bdev, iov, 2, pos);
+ if (ret < 0)
return ret;
+ else if (ret < erofs_blksiz(sbi))
+ return -EIO;
- DBG_BUGON(inode->idata_size > erofs_blksiz(sbi));
- if (inode->idata_size < erofs_blksiz(sbi)) {
- ret = erofs_dev_fillzero(sbi, zero_pos,
- erofs_blksiz(sbi) - inode->idata_size,
- false);
- if (ret)
- return ret;
- }
inode->idata_size = 0;
free(inode->idata);
inode->idata = NULL;
diff --git a/lib/io.c b/lib/io.c
index 5c3d263..aa043ca 100644
--- a/lib/io.c
+++ b/lib/io.c
@@ -96,6 +96,39 @@ ssize_t erofs_io_pwrite(struct erofs_vfile *vf, const void *buf,
return written;
}
+/*
+ * erofs_io_pwritev - gather-write @iovcnt buffers from @iov to @vf at @pos
+ *
+ * Dispatch order:
+ *  1) pwritev(2) on vf->fd (at pos + vf->offset) when HAVE_PWRITEV is
+ *     defined and the vfile has no ->ops table;
+ *  2) the ->pwritev hook of the vfile's ops table, if it provides one;
+ *  3) otherwise one erofs_io_pwrite() call per iovec segment.
+ *
+ * Returns the number of bytes written (possibly short), or a negative
+ * value as reported by the underlying writer (-errno for pwritev(2)).
+ * Returns 0 without doing any I/O when cfg.c_dry_run is set.
+ *
+ * NOTE(review): erofs_write_tail_end() maps a return value smaller than
+ * the block size to -EIO, so the dry-run return of 0 would be reported as
+ * an I/O error there -- confirm whether that is intended.
+ */
+ssize_t erofs_io_pwritev(struct erofs_vfile *vf, const struct iovec *iov,
+			 int iovcnt, u64 pos)
+{
+	ssize_t ret, written;
+	int i;
+
+	if (__erofs_unlikely(cfg.c_dry_run))
+		return 0;
+
+#ifdef HAVE_PWRITEV
+	if (!vf->ops) {
+		ret = pwritev(vf->fd, iov, iovcnt, pos + vf->offset);
+		if (ret < 0)
+			return -errno;
+		return ret;
+	}
+#endif
+	if (vf->ops && vf->ops->pwritev)
+		return vf->ops->pwritev(vf, iov, iovcnt, pos);
+
+	/* No vectored primitive available: write the segments one by one. */
+	written = 0;
+	for (i = 0; i < iovcnt; ++i) {
+		ret = erofs_io_pwrite(vf, iov[i].iov_base, pos, iov[i].iov_len);
+		/*
+		 * Test the sign first: comparing a negative ssize_t directly
+		 * against the size_t iov_len converts it to a huge unsigned
+		 * value, so the error would be mistaken for a full write.
+		 */
+		if (ret < 0)
+			return ret;
+		/* Short write: report what has been written so far. */
+		if ((size_t)ret < iov[i].iov_len)
+			return written + ret;
+		written += iov[i].iov_len;
+		pos += iov[i].iov_len;
+	}
+	return written;
+}
+
int erofs_io_fsync(struct erofs_vfile *vf)
{
int ret;
--
2.43.5
More information about the Linux-erofs
mailing list