[PATCH] erofs-utils: mkfs: avoid partial writes in erofs_write_tail_end()

Gao Xiang hsiangkao at linux.alibaba.com
Wed Jun 11 13:25:09 AEST 2025


Partial writes to a physical block device (rather than to some hole in
a regular file) can cause unnecessary read-modify-write cycles.  In
particular, the redundant reads can incur noticeable overhead on cloud
disks.

Write full blocks with pwritev(2) instead.

Signed-off-by: Gao Xiang <hsiangkao at linux.alibaba.com>
---
 configure.ac       |  2 ++
 include/erofs/io.h |  5 +++++
 lib/inode.c        | 33 +++++++++++++++------------------
 lib/io.c           | 33 +++++++++++++++++++++++++++++++++
 4 files changed, 55 insertions(+), 18 deletions(-)

diff --git a/configure.ac b/configure.ac
index 88f1cbe..a73a9ba 100644
--- a/configure.ac
+++ b/configure.ac
@@ -216,6 +216,7 @@ AC_CHECK_HEADERS(m4_flatten([
 	sys/statfs.h
 	sys/sysmacros.h
 	sys/time.h
+	sys/uio.h
 	unistd.h
 ]))
 
@@ -274,6 +275,7 @@ AC_CHECK_FUNCS(m4_flatten([
 	ftello64
 	pread64
 	pwrite64
+	pwritev
 	posix_fadvise
 	fstatfs
 	sendfile
diff --git a/include/erofs/io.h b/include/erofs/io.h
index 3179ea1..101a5ba 100644
--- a/include/erofs/io.h
+++ b/include/erofs/io.h
@@ -16,6 +16,7 @@ extern "C"
 #define _GNU_SOURCE
 #endif
 #include <unistd.h>
+#include <sys/uio.h>
 #include "defs.h"
 
 #ifndef O_BINARY
@@ -27,6 +28,8 @@ struct erofs_vfile;
 struct erofs_vfops {
 	ssize_t (*pread)(struct erofs_vfile *vf, void *buf, u64 offset, size_t len);
 	ssize_t (*pwrite)(struct erofs_vfile *vf, const void *buf, u64 offset, size_t len);
+	ssize_t (*pwritev)(struct erofs_vfile *vf, const struct iovec *iov,
+			   int iovcnt, u64 pos);
 	int (*fsync)(struct erofs_vfile *vf);
 	int (*fallocate)(struct erofs_vfile *vf, u64 offset, size_t len, bool pad);
 	int (*ftruncate)(struct erofs_vfile *vf, u64 length);
@@ -53,6 +56,8 @@ ssize_t __erofs_io_write(int fd, const void *buf, size_t len);
 
 int erofs_io_fstat(struct erofs_vfile *vf, struct stat *buf);
 ssize_t erofs_io_pwrite(struct erofs_vfile *vf, const void *buf, u64 pos, size_t len);
+ssize_t erofs_io_pwritev(struct erofs_vfile *vf, const struct iovec *iov,
+			 int iovcnt, u64 pos);
 int erofs_io_fsync(struct erofs_vfile *vf);
 ssize_t erofs_io_fallocate(struct erofs_vfile *vf, u64 offset, size_t len, bool pad);
 int erofs_io_ftruncate(struct erofs_vfile *vf, u64 length);
diff --git a/lib/inode.c b/lib/inode.c
index a36ade2..09f519b 100644
--- a/lib/inode.c
+++ b/lib/inode.c
@@ -827,6 +827,7 @@ static struct erofs_bhops erofs_write_inline_bhops = {
 
 static int erofs_write_tail_end(struct erofs_inode *inode)
 {
+	static const u8 zeroed[EROFS_MAX_BLOCK_SIZE];
 	struct erofs_sb_info *sbi = inode->sbi;
 	struct erofs_buffer_head *bh, *ibh;
 
@@ -843,8 +844,10 @@ static int erofs_write_tail_end(struct erofs_inode *inode)
 		ibh->fsprivate = erofs_igrab(inode);
 		ibh->op = &erofs_write_inline_bhops;
 	} else {
+		struct iovec iov[2];
+		erofs_off_t pos;
 		int ret;
-		erofs_off_t pos, zero_pos;
+		bool h0;
 
 		if (!bh) {
 			bh = erofs_balloc(sbi->bmgr,
@@ -874,25 +877,19 @@ static int erofs_write_tail_end(struct erofs_inode *inode)
 		pos = erofs_btell(bh, true) - erofs_blksiz(sbi);
 
 		/* 0'ed data should be padded at head for 0padding conversion */
-		if (erofs_sb_has_lz4_0padding(sbi) && inode->compressed_idata) {
-			zero_pos = pos;
-			pos += erofs_blksiz(sbi) - inode->idata_size;
-		} else {
-			/* pad 0'ed data for the other cases */
-			zero_pos = pos + inode->idata_size;
-		}
-		ret = erofs_dev_write(sbi, inode->idata, pos, inode->idata_size);
-		if (ret)
+		h0 = erofs_sb_has_lz4_0padding(sbi) && inode->compressed_idata;
+		DBG_BUGON(inode->idata_size > erofs_blksiz(sbi));
+
+		iov[h0] = (struct iovec) { .iov_base = inode->idata,
+					   .iov_len = inode->idata_size };
+		iov[!h0] = (struct iovec) { .iov_base = (u8 *)zeroed,
+				erofs_blksiz(sbi) - inode->idata_size };
+		ret = erofs_io_pwritev(&sbi->bdev, iov, 2, pos);
+		if (ret < 0)
 			return ret;
+		else if (ret < erofs_blksiz(sbi))
+			return -EIO;
 
-		DBG_BUGON(inode->idata_size > erofs_blksiz(sbi));
-		if (inode->idata_size < erofs_blksiz(sbi)) {
-			ret = erofs_dev_fillzero(sbi, zero_pos,
-					   erofs_blksiz(sbi) - inode->idata_size,
-					   false);
-			if (ret)
-				return ret;
-		}
 		inode->idata_size = 0;
 		free(inode->idata);
 		inode->idata = NULL;
diff --git a/lib/io.c b/lib/io.c
index 5c3d263..aa043ca 100644
--- a/lib/io.c
+++ b/lib/io.c
@@ -96,6 +96,59 @@ ssize_t erofs_io_pwrite(struct erofs_vfile *vf, const void *buf,
 	return written;
 }
 
+/*
+ * Write the @iovcnt buffers described by @iov back-to-back, starting at
+ * @pos.  Uses a single pwritev(2) when @vf has no custom ops and the
+ * syscall is available, @vf's own pwritev hook if it provides one, and
+ * otherwise one erofs_io_pwrite() call per buffer.
+ *
+ * Returns the number of bytes written (possibly short) or a negative
+ * value on error.
+ *
+ * NOTE(review): dry-run mode returns 0, which erofs_write_tail_end()
+ * reports as -EIO since it is less than a full block -- confirm that
+ * this is intended.
+ */
+ssize_t erofs_io_pwritev(struct erofs_vfile *vf, const struct iovec *iov,
+			 int iovcnt, u64 pos)
+{
+	ssize_t ret, written;
+	int i;
+
+	if (__erofs_unlikely(cfg.c_dry_run))
+		return 0;
+
+#ifdef HAVE_PWRITEV
+	/* no custom ops: hand all buffers to vf->fd in one pwritev(2) call */
+	if (!vf->ops) {
+		ret = pwritev(vf->fd, iov, iovcnt, pos + vf->offset);
+		if (ret < 0)
+			return -errno;
+		return ret;
+	}
+#endif
+	if (vf->ops && vf->ops->pwritev)
+		return vf->ops->pwritev(vf, iov, iovcnt, pos);
+
+	/* fallback: emulate the vectored write one buffer at a time */
+	written = 0;
+	for (i = 0; i < iovcnt; ++i) {
+		ret = erofs_io_pwrite(vf, iov[i].iov_base, pos, iov[i].iov_len);
+		/*
+		 * Check for errors before the short-write test: comparing a
+		 * negative ssize_t with the unsigned iov_len would convert it
+		 * to a huge value and silently treat the error as success.
+		 */
+		if (ret < 0)
+			return ret;
+		if ((size_t)ret < iov[i].iov_len)
+			return written + ret;
+		written += ret;
+		pos += ret;
+	}
+	return written;
+}
+
 int erofs_io_fsync(struct erofs_vfile *vf)
 {
 	int ret;
-- 
2.43.5



More information about the Linux-erofs mailing list