[PATCH] erofs-utils: mount: add NBD backend support

Gao Xiang hsiangkao at linux.alibaba.com
Wed Aug 27 16:11:39 AEST 2025


Currently, only local images are supported, so the current NBD backend
is mostly useful for functional testing.  It is selected with the `nbd`
filesystem subtype (i.e. `erofs.nbd`).

However, it already uses the vfile interface, so extending it to
remote sources should be straightforward.

NBD failover is not supported yet and should be implemented via the
new netlink interface.

Signed-off-by: Gao Xiang <hsiangkao at linux.alibaba.com>
---
 include/erofs/defs.h |   9 ++
 include/erofs/io.h   |   6 ++
 lib/Makefile.am      |   4 +
 lib/backends/nbd.c   | 223 +++++++++++++++++++++++++++++++++++++++++++
 lib/io.c             |  69 ++++++++++++-
 lib/liberofs_nbd.h   |  39 ++++++++
 mount/main.c         | 156 +++++++++++++++++++++++++++++-
 7 files changed, 499 insertions(+), 7 deletions(-)
 create mode 100644 lib/backends/nbd.c
 create mode 100644 lib/liberofs_nbd.h

diff --git a/include/erofs/defs.h b/include/erofs/defs.h
index 0f3e754..8af99ae 100644
--- a/include/erofs/defs.h
+++ b/include/erofs/defs.h
@@ -88,6 +88,10 @@ typedef int64_t         s64;
 #define le32_to_cpu(x) ((__u32)(x))
 #define le64_to_cpu(x) ((__u64)(x))
 
+/* host is little-endian in this branch: big-endian values need a byte swap */
+#define cpu_to_be32(x) ((__be32)__builtin_bswap32(x))
+#define cpu_to_be64(x) ((__be64)__builtin_bswap64(x))
+#define be32_to_cpu(x) (__builtin_bswap32(x))
+#define be64_to_cpu(x) (__builtin_bswap64(x))
 #else
 #if __BYTE_ORDER == __BIG_ENDIAN
 #define cpu_to_le16(x) (__builtin_bswap16(x))
@@ -96,6 +100,11 @@ typedef int64_t         s64;
 #define le16_to_cpu(x) (__builtin_bswap16(x))
 #define le32_to_cpu(x) (__builtin_bswap32(x))
 #define le64_to_cpu(x) (__builtin_bswap64(x))
+
+/* host is big-endian in this branch: big-endian values are already native */
+#define cpu_to_be32(x) ((__be32)(x))
+#define cpu_to_be64(x) ((__be64)(x))
+#define be32_to_cpu(x) ((__u32)(x))
+#define be64_to_cpu(x) ((__u64)(x))
 #else
 #pragma error
 #endif
diff --git a/include/erofs/io.h b/include/erofs/io.h
index cc7a3cd..370765f 100644
--- a/include/erofs/io.h
+++ b/include/erofs/io.h
@@ -16,6 +16,7 @@ extern "C"
 #define _GNU_SOURCE
 #endif
 #include <unistd.h>
+#include <sys/stat.h>
 #include <sys/uio.h>
 #include "defs.h"
 
@@ -36,6 +37,8 @@ struct erofs_vfops {
 	ssize_t (*read)(struct erofs_vfile *vf, void *buf, size_t len);
 	off_t (*lseek)(struct erofs_vfile *vf, u64 offset, int whence);
 	int (*fstat)(struct erofs_vfile *vf, struct stat *buf);
+	ssize_t (*sendfile)(struct erofs_vfile *vout, struct erofs_vfile *vin,
+			    off_t *pos, size_t count);
 	int (*xcopy)(struct erofs_vfile *vout, off_t pos,
 		     struct erofs_vfile *vin, unsigned int len, bool noseek);
 };
@@ -53,6 +56,7 @@ struct erofs_vfile {
 };
 
 ssize_t __erofs_io_write(int fd, const void *buf, size_t len);
+/*
+ * Write `len` zero bytes to `fd`.  Returns 0 once everything has been
+ * written, the number of bytes still outstanding if write() stopped making
+ * progress, or a negative errno if write() failed.
+ */
+int __erofs_0write(int fd, size_t len);
 
 int erofs_io_fstat(struct erofs_vfile *vf, struct stat *buf);
 ssize_t erofs_io_pwrite(struct erofs_vfile *vf, const void *buf, u64 pos, size_t len);
@@ -67,6 +71,8 @@ off_t erofs_io_lseek(struct erofs_vfile *vf, u64 offset, int whence);
 
 ssize_t erofs_copy_file_range(int fd_in, u64 *off_in, int fd_out, u64 *off_out,
 			      size_t length);
+/*
+ * Move up to `count` bytes from `vin` to `vout`.  A non-NULL `pos` is used as
+ * the input offset; otherwise the input is read sequentially.  Returns the
+ * number of bytes that were NOT moved (0 on full success) or a negative
+ * error code.
+ */
+ssize_t erofs_io_sendfile(struct erofs_vfile *vout, struct erofs_vfile *vin,
+			  off_t *pos, size_t count);
 int erofs_io_xcopy(struct erofs_vfile *vout, off_t pos,
 		   struct erofs_vfile *vin, unsigned int len, bool noseek);
 
diff --git a/lib/Makefile.am b/lib/Makefile.am
index 955495d..4f8e767 100644
--- a/lib/Makefile.am
+++ b/lib/Makefile.am
@@ -31,6 +31,7 @@ noinst_HEADERS = $(top_srcdir)/include/erofs_fs.h \
       $(top_srcdir)/lib/liberofs_private.h \
       $(top_srcdir)/lib/liberofs_xxhash.h \
       $(top_srcdir)/lib/liberofs_metabox.h \
+      $(top_srcdir)/lib/liberofs_nbd.h \
       $(top_srcdir)/lib/liberofs_s3.h
 
 noinst_HEADERS += compressor.h
@@ -76,3 +77,6 @@ if ENABLE_EROFS_MT
 liberofs_la_LDFLAGS = -lpthread
 liberofs_la_SOURCES += workqueue.c
 endif
+if OS_LINUX
+liberofs_la_SOURCES += backends/nbd.c
+endif
diff --git a/lib/backends/nbd.c b/lib/backends/nbd.c
new file mode 100644
index 0000000..398a1e9
--- /dev/null
+++ b/lib/backends/nbd.c
@@ -0,0 +1,223 @@
+// SPDX-License-Identifier: GPL-2.0+ OR Apache-2.0
+/*
+ * Copyright (C) 2025 Alibaba Cloud
+ */
+#include <errno.h>
+#include <sys/socket.h>
+#include <arpa/inet.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <sys/un.h>
+#include <sys/ioctl.h>
+#include <sys/types.h>
+#include <dirent.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include "erofs/io.h"
+#include "erofs/err.h"
+#include "erofs/print.h"
+#include "liberofs_nbd.h"
+
+/* NBD ioctl request codes (ioctl type 0xab), defined locally in this file */
+#define NBD_SET_SOCK		_IO( 0xab, 0 )
+#define NBD_SET_BLKSIZE		_IO( 0xab, 1 )
+#define NBD_DO_IT		_IO( 0xab, 3 )
+#define NBD_CLEAR_SOCK		_IO( 0xab, 4 )
+#define NBD_SET_SIZE_BLOCKS     _IO( 0xab, 7 )
+#define NBD_SET_TIMEOUT		_IO( 0xab, 9 )
+#define NBD_SET_FLAGS		_IO( 0xab, 10)
+
+/* magic values checked on incoming requests and stamped on outgoing replies */
+#define NBD_REQUEST_MAGIC	0x25609513
+#define NBD_REPLY_MAGIC		0x67446698
+
+/* flag bit handed to NBD_SET_FLAGS by erofs_nbd_connect() */
+#define NBD_FLAG_READ_ONLY	(1 << 1)	/* device is read-only */
+
+/*
+ * This is the reply packet that nbd-server sends back to the client after
+ * it has completed an I/O request (or an error occurs).
+ *
+ * `magic` and `error` are stored big-endian; `cookie` is copied back
+ * unchanged from the request it answers.  In this file the structure is
+ * only built and written by erofs_nbd_send_reply_header().
+ */
+struct nbd_reply {
+	__be32 magic;		/* NBD_REPLY_MAGIC */
+	__be32 error;		/* 0 = ok, else error */
+	union {
+		__be64 cookie;	/* Opaque identifier from request */
+		char handle[8];	/* older spelling of cookie */
+	};
+} __packed;
+
+/*
+ * Report whether /dev/nbd<nbdnum> currently has a backend attached.
+ *
+ * Returns the number parsed from /sys/block/nbd<nbdnum>/pid when the device
+ * reports a size other than "0", -ENOTCONN when the size is "0", or a
+ * negative errno if either sysfs attribute cannot be opened or read.
+ */
+long erofs_nbd_in_service(int nbdnum)
+{
+	char s[32];
+	ssize_t n;
+	int fd, err;
+
+	(void)snprintf(s, sizeof(s), "/sys/block/nbd%d/size", nbdnum);
+	fd = open(s, O_RDONLY);
+	if (fd < 0)
+		return -errno;
+	/* read() does not NUL-terminate: keep one byte for the terminator */
+	n = read(fd, s, sizeof(s) - 1);
+	err = n < 0 ? -errno : 0;	/* capture errno before close() */
+	close(fd);
+	if (err)
+		return err;
+	s[n] = '\0';
+	if (!strncmp(s, "0\n", 2))
+		return -ENOTCONN;
+
+	(void)snprintf(s, sizeof(s), "/sys/block/nbd%d/pid", nbdnum);
+	fd = open(s, O_RDONLY);
+	if (fd < 0)
+		return -errno;
+	n = read(fd, s, sizeof(s) - 1);
+	err = n < 0 ? -errno : 0;
+	close(fd);
+	if (err)
+		return err;
+	/* terminate so strtol() never runs into stale bytes of the path */
+	s[n] = '\0';
+	return strtol(s, NULL, 10);
+}
+
+/*
+ * Scan /sys/block for an NBD device that is not in use.
+ *
+ * A device counts as free when neither its `pid` nor its `backend` sysfs
+ * attribute exists.  Returns the device index (the number after "nbd"),
+ * -EBUSY if the directory was exhausted without a match, or a negative
+ * errno if /sys/block could not be opened or read.
+ */
+int erofs_nbd_devscan(void)
+{
+	DIR *_dir;
+	int err;
+
+	_dir = opendir("/sys/block");
+	if (!_dir) {
+		/* save errno first: fprintf() is allowed to overwrite it */
+		err = -errno;
+		fprintf(stderr, "failed to opendir /sys/block: %s\n",
+			strerror(-err));
+		return err;
+	}
+
+	while (1) {
+		struct dirent *dp;
+		char path[64];
+
+		/*
+		 * set errno to 0 before calling readdir() in order to
+		 * distinguish end of stream from an error.
+		 */
+		errno = 0;
+		dp = readdir(_dir);
+		if (!dp) {
+			if (errno)
+				err = -errno;
+			else
+				err = -EBUSY;
+			break;
+		}
+
+		/* only "nbd<digit>..." entries name NBD devices */
+		if (strncmp(dp->d_name, "nbd", 3) ||
+		    dp->d_name[3] < '0' || dp->d_name[3] > '9')
+			continue;
+
+		/* Skip nbdX with valid `pid` or `backend` */
+		err = snprintf(path, sizeof(path), "%s/pid", dp->d_name);
+		if (err < 0 || err >= (int)sizeof(path))
+			continue;	/* never probe a truncated path */
+		if (!faccessat(dirfd(_dir), path, F_OK, 0))
+			continue;
+		err = snprintf(path, sizeof(path), "%s/backend", dp->d_name);
+		if (err < 0 || err >= (int)sizeof(path))
+			continue;
+		if (!faccessat(dirfd(_dir), path, F_OK, 0))
+			continue;
+		err = atoi(dp->d_name + 3);
+		break;
+	}
+	closedir(_dir);
+	return err;
+}
+
+/*
+ * Configure the NBD device open on `nbdfd` as a read-only device of
+ * `blocks` blocks of (1 << blkbits) bytes and attach one end of a fresh
+ * AF_UNIX socket pair to it.
+ *
+ * Returns the other end of the socket pair (the descriptor on which
+ * requests are to be served) or a negative errno.  On success the
+ * descriptor passed to NBD_SET_SOCK is left open.
+ */
+int erofs_nbd_connect(int nbdfd, int blkbits, u64 blocks)
+{
+	int sv[2], err;
+
+	err = socketpair(AF_UNIX, SOCK_STREAM, 0, sv);
+	if (err < 0)
+		return -errno;
+
+	/* the chain stops at the first failing ioctl, so errno is its error */
+	if (ioctl(nbdfd, NBD_CLEAR_SOCK, 0) < 0 ||
+	    ioctl(nbdfd, NBD_SET_BLKSIZE, 1U << blkbits) < 0 ||
+	    ioctl(nbdfd, NBD_SET_SIZE_BLOCKS, blocks) < 0 ||
+	    ioctl(nbdfd, NBD_SET_TIMEOUT, 0) < 0 ||
+	    ioctl(nbdfd, NBD_SET_FLAGS, NBD_FLAG_READ_ONLY) < 0 ||
+	    ioctl(nbdfd, NBD_SET_SOCK, sv[1]) < 0) {
+		/* report the real errno instead of ioctl()'s bare -1 */
+		err = -errno;
+		close(sv[0]);
+		close(sv[1]);
+		return err;
+	}
+	return sv[0];
+}
+
+/*
+ * Issue NBD_DO_IT on `nbdfd` and close the descriptor afterwards.
+ *
+ * Returns 0 when the ioctl succeeded or failed with EPIPE; any other
+ * failure is logged and returned as a negative errno.
+ */
+int erofs_nbd_do_it(int nbdfd)
+{
+	int ret = ioctl(nbdfd, NBD_DO_IT, 0);
+
+	/*
+	 * EPIPE is what `ioctl(NBD_DO_IT)` normally reports once someone has
+	 * disconnected the socket via NBD_DISCONNECT, so treat it as a clean
+	 * exit rather than a failure.
+	 */
+	if (ret < 0)
+		ret = (errno == EPIPE) ? 0 : -errno;
+
+	if (ret != 0)
+		erofs_err("NBD_DO_IT ends with %s", erofs_strerror(ret));
+	close(nbdfd);
+	return ret;
+}
+
+/*
+ * Read one NBD request header from `skfd` into `rq`.
+ *
+ * On success (return 0) `type`, `from` and `len` have been converted to
+ * host byte order; `magic` and `cookie` are left as received.  Returns the
+ * negative error from the read, -EPIPE on a short read (including EOF), or
+ * -EIO when the request magic does not match.
+ */
+int erofs_nbd_get_request(int skfd, struct erofs_nbd_request *rq)
+{
+	struct erofs_vfile vf = { .fd = skfd };
+	ssize_t ret;
+
+	ret = erofs_io_read(&vf, rq, sizeof(*rq));
+	/*
+	 * Check the sign before comparing against sizeof(): a negative value
+	 * converted to size_t would look like a very large read.
+	 */
+	if (ret < 0)
+		return ret;
+	if ((size_t)ret < sizeof(*rq))
+		return -EPIPE;
+
+	if (rq->magic != cpu_to_be32(NBD_REQUEST_MAGIC))
+		return -EIO;
+
+	rq->type = be32_to_cpu((__be32)rq->type);
+	rq->from = be64_to_cpu((__be64)rq->from);
+	rq->len = be32_to_cpu((__be32)rq->len);
+	return 0;
+}
+
+/*
+ * Send an NBD reply header for the request identified by `cookie`.
+ *
+ * `cookie` is copied into the reply unchanged.  `err` is 0 for success or
+ * an errno value of either sign; its magnitude is what goes on the wire.
+ * Returns 0 if the whole header was written, a negative errno if write()
+ * failed, or -EIO on a short write.
+ */
+int erofs_nbd_send_reply_header(int skfd, __le64 cookie, int err)
+{
+	/* callers use negative errno; the reply field is an unsigned code */
+	u32 code = err < 0 ? 0U - (u32)err : (u32)err;
+	struct nbd_reply reply = {
+		.magic = cpu_to_be32(NBD_REPLY_MAGIC),
+		.error = cpu_to_be32(code),
+		.cookie = cookie,
+	};
+	ssize_t ret;
+
+	ret = write(skfd, &reply, sizeof(reply));
+	if (ret == (ssize_t)sizeof(reply))
+		return 0;
+	return ret < 0 ? -errno : -EIO;
+}
diff --git a/lib/io.c b/lib/io.c
index b91c93c..ff3b794 100644
--- a/lib/io.c
+++ b/lib/io.c
@@ -147,10 +147,29 @@ int erofs_io_fsync(struct erofs_vfile *vf)
 	return 0;
 }
 
+/* shared all-zero buffer used as the source for zero-filling writes */
+static const char erofs_zeroed[EROFS_MAX_BLOCK_SIZE];
+
+/*
+ * Write `len` zero bytes to `fd`, at most sizeof(erofs_zeroed) per write().
+ *
+ * Returns a negative errno if write() fails; otherwise the number of bytes
+ * left unwritten, which is 0 unless write() returned 0 part-way through.
+ */
+int __erofs_0write(int fd, size_t len)
+{
+	while (len > 0) {
+		size_t chunk = len;
+		ssize_t n;
+
+		if (chunk > sizeof(erofs_zeroed))
+			chunk = sizeof(erofs_zeroed);
+
+		n = write(fd, erofs_zeroed, chunk);
+		if (n < 0)
+			return -errno;
+		if (n == 0)
+			break;		/* no progress: report what is left */
+		len -= n;
+	}
+	return len;
+}
+
 int erofs_io_fallocate(struct erofs_vfile *vf, u64 offset,
 		       size_t len, bool zeroout)
 {
-	static const char zero[EROFS_MAX_BLOCK_SIZE] = {0};
 	ssize_t ret;
 
 	if (__erofs_unlikely(cfg.c_dry_run))
@@ -164,14 +183,15 @@ int erofs_io_fallocate(struct erofs_vfile *vf, u64 offset,
 		    FALLOC_FL_KEEP_SIZE, offset + vf->offset, len) >= 0)
 		return 0;
 #endif
-	while (len > EROFS_MAX_BLOCK_SIZE) {
-		ret = erofs_io_pwrite(vf, zero, offset, EROFS_MAX_BLOCK_SIZE);
+	while (len > sizeof(erofs_zeroed)) {
+		ret = erofs_io_pwrite(vf, erofs_zeroed, offset,
+				      sizeof(erofs_zeroed));
 		if (ret < 0)
 			return (int)ret;
 		len -= ret;
 		offset += ret;
 	}
-	return erofs_io_pwrite(vf, zero, offset, len) == len ? 0 : -EIO;
+	return erofs_io_pwrite(vf, erofs_zeroed, offset, len) == len ? 0 : -EIO;
 }
 
 int erofs_io_ftruncate(struct erofs_vfile *vf, u64 length)
@@ -551,6 +571,47 @@ off_t erofs_io_lseek(struct erofs_vfile *vf, u64 offset, int whence)
 	return lseek(vf->fd, offset, whence);
 }
 
+/*
+ * Move up to `count` bytes from `vin` to `vout`.
+ *
+ * If either file has custom ops, the request is delegated to its
+ * `sendfile` hook (the input file takes precedence).  Otherwise sendfile(2)
+ * is tried when available, and whatever it did not transfer is copied
+ * through a bounce buffer.  A non-NULL `pos` gives the input offset and is
+ * advanced by the copy loop; with a NULL `pos` the input is read
+ * sequentially.
+ *
+ * Returns the number of bytes that could not be moved (0 on full success)
+ * or a negative error code.
+ */
+ssize_t erofs_io_sendfile(struct erofs_vfile *vout, struct erofs_vfile *vin,
+			  off_t *pos, size_t count)
+{
+	/* initialised so the final test is defined even if no loop runs */
+	ssize_t written = 0;
+
+	if (vin->ops || vout->ops) {
+		if (vin->ops)
+			return vin->ops->sendfile(vout, vin, pos, count);
+		return vout->ops->sendfile(vout, vin, pos, count);
+	}
+#if defined(HAVE_SYS_SENDFILE_H) && defined(HAVE_SENDFILE)
+	do {
+		written = sendfile(vout->fd, vin->fd, pos, count);
+		if (written <= 0) {
+			if (written < 0) {
+				written = -errno;
+				if (written == -EOVERFLOW && pos)
+					written = 0;
+			}
+			break;
+		}
+		count -= written;
+	} while (written);
+#endif
+	/* copy whatever is left (or everything) through a bounce buffer */
+	while (count) {
+		char buf[EROFS_MAX_BLOCK_SIZE];
+		ssize_t nread;
+
+		nread = min_t(u64, count, sizeof(buf));
+		if (pos)
+			nread = erofs_io_pread(vin, buf, nread, *pos);
+		else
+			nread = erofs_io_read(vin, buf, nread);
+		if (nread <= 0) {
+			written = nread;
+			break;
+		}
+
+		/*
+		 * Both files are plain descriptors here (no ops), so push the
+		 * data out with the raw fd helper.  Without this step the
+		 * bytes read above would simply be dropped.
+		 * NOTE(review): assumes __erofs_io_write() returns the byte
+		 * count written or a negative errno - confirm.
+		 */
+		written = __erofs_io_write(vout->fd, buf, nread);
+		if (written != nread) {
+			if (written >= 0)
+				written = -EIO;	/* short write */
+			break;
+		}
+		count -= nread;
+		if (pos)
+			*pos += nread;
+	}
+	return written < 0 ? written : count;
+}
+
 int erofs_io_xcopy(struct erofs_vfile *vout, off_t pos,
 		   struct erofs_vfile *vin, unsigned int len, bool noseek)
 {
diff --git a/lib/liberofs_nbd.h b/lib/liberofs_nbd.h
new file mode 100644
index 0000000..6660df1
--- /dev/null
+++ b/lib/liberofs_nbd.h
@@ -0,0 +1,39 @@
+/* SPDX-License-Identifier: GPL-2.0+ OR Apache-2.0 */
+/*
+ * Copyright (C) 2025 Alibaba Cloud
+ */
+#ifndef __EROFS_LIB_LIBEROFS_NBD_H
+#define __EROFS_LIB_LIBEROFS_NBD_H
+
+#include "erofs/defs.h"
+
+/*
+ * Supported request types: the values found in erofs_nbd_request.type once
+ * erofs_nbd_get_request() has converted it to host byte order.
+ */
+enum {
+	EROFS_NBD_CMD_READ		= 0,
+	EROFS_NBD_CMD_WRITE		= 1,
+	EROFS_NBD_CMD_DISC		= 2,
+	EROFS_NBD_CMD_FLUSH		= 3,
+	EROFS_NBD_CMD_TRIM		= 4,
+	/* userspace defines additional extension commands */
+	EROFS_NBD_CMD_WRITE_ZEROES	= 6,
+};
+
+/*
+ * NBD request header as read from the socket.
+ *
+ * `type`, `from` and `len` arrive big-endian and are rewritten in place to
+ * host byte order by erofs_nbd_get_request(); `magic` and `cookie` keep
+ * their on-wire representation.
+ */
+struct erofs_nbd_request {
+	__be32 magic;			/* NBD_REQUEST_MAGIC */
+	u32 type;			/* See NBD_CMD_* */
+	union {
+		__be64 cookie;		/* Opaque identifier for request */
+		char   handle[8];	/* older spelling of cookie */
+	};
+	u64 from;			/* starting offset of the request */
+        u32 len;			/* length of the request */
+} __packed;
+
+/* PID recorded for nbd<nbdnum>, -ENOTCONN if its size is 0, or -errno */
+long erofs_nbd_in_service(int nbdnum);
+/* index of an NBD device without `pid`/`backend` in sysfs, or -EBUSY/-errno */
+int erofs_nbd_devscan(void);
+/* set up a read-only device on nbdfd; returns the serving socket fd or < 0 */
+int erofs_nbd_connect(int nbdfd, int blkbits, u64 blocks);
+/* issue NBD_DO_IT on nbdfd and close it; 0 or a negative errno */
+int erofs_nbd_do_it(int nbdfd);
+/* read one request into rq (host byte order for type/from/len); 0 or < 0 */
+int erofs_nbd_get_request(int skfd, struct erofs_nbd_request *rq);
+/* write a reply header carrying cookie and err; 0 or a negative errno */
+int erofs_nbd_send_reply_header(int skfd, __le64 cookie, int err);
+
+#endif
diff --git a/mount/main.c b/mount/main.c
index 0f7538a..9cb203f 100644
--- a/mount/main.c
+++ b/mount/main.c
@@ -6,10 +6,13 @@
 #include <stdlib.h>
 #include <string.h>
 #include <sys/mount.h>
+#include <pthread.h>
 #include <unistd.h>
 #include "erofs/config.h"
 #include "erofs/print.h"
 #include "erofs/err.h"
+#include "erofs/io.h"
+#include "../lib/liberofs_nbd.h"
 #ifdef HAVE_LINUX_LOOP_H
 #include <linux/loop.h>
 #else
@@ -30,6 +33,7 @@ enum erofs_backend_drv {
 	EROFSAUTO,
 	EROFSLOCAL,
 	EROFSFUSE,
+	EROFSNBD,
 };
 
 static struct erofsmount_cfg {
@@ -132,6 +136,8 @@ static int erofsmount_parse_options(int argc, char **argv)
 					mountcfg.backend = EROFSFUSE;
 				} else if (!strcmp(dot + 1, "local")) {
 					mountcfg.backend = EROFSLOCAL;
+				} else if (!strcmp(dot + 1, "nbd")) {
+					mountcfg.backend = EROFSNBD;
 				} else {
 					erofs_err("invalid filesystem subtype `%s`", dot + 1);
 					return -EINVAL;
@@ -196,11 +202,148 @@ static int erofsmount_fuse(const char *source, const char *mountpoint,
 	return 0;
 }
 
+/*
+ * State handed to the NBD worker thread: where the data comes from and the
+ * socket on which requests are received and answered.
+ */
+struct erofsmount_nbd_ctx {
+	struct erofs_vfile vd;		/* virtual device */
+	struct erofs_vfile sk;		/* socket file */
+};
+
+/*
+ * NBD worker thread: serve requests arriving on ctx->sk from ctx->vd.
+ *
+ * READ requests are answered with a success header followed by the data,
+ * zero-padded when the source yields fewer bytes than requested.  A
+ * disconnect request ends the loop; every other request type is rejected
+ * with an error reply.  Both descriptors in `ctx` are closed on exit.
+ *
+ * Returns 0 (as a pointer-sized integer) after an orderly shutdown, or a
+ * negative error code.
+ */
+static void *erofsmount_nbd_loopfn(void *arg)
+{
+	struct erofsmount_nbd_ctx *ctx = arg;
+	int err;
+
+	while (1) {
+		struct erofs_nbd_request rq;
+		ssize_t rem;
+		off_t pos;
+
+		err = erofs_nbd_get_request(ctx->sk.fd, &rq);
+		if (err < 0) {
+			/* -EPIPE means the peer went away: normal shutdown */
+			if (err == -EPIPE)
+				err = 0;
+			break;
+		}
+
+		/* a disconnect request gets no reply, just stop serving */
+		if (rq.type == EROFS_NBD_CMD_DISC) {
+			err = 0;
+			break;
+		}
+
+		if (rq.type != EROFS_NBD_CMD_READ) {
+			err = erofs_nbd_send_reply_header(ctx->sk.fd,
+						rq.cookie, -EIO);
+			if (err)
+				break;
+			/* exactly one reply per request: no data follows */
+			continue;
+		}
+
+		err = erofs_nbd_send_reply_header(ctx->sk.fd, rq.cookie, 0);
+		if (err)
+			break;
+		pos = rq.from;
+		rem = erofs_io_sendfile(&ctx->sk, &ctx->vd, &pos, rq.len);
+		if (rem < 0) {
+			/* the helper returns the error itself, not via errno */
+			err = rem;
+			break;
+		}
+		/* pad with zeroes so the reply always carries rq.len bytes */
+		err = __erofs_0write(ctx->sk.fd, rem);
+		if (err) {
+			if (err > 0)
+				err = -EIO;
+			break;
+		}
+	}
+	close(ctx->vd.fd);
+	close(ctx->sk.fd);
+	return (void *)(uintptr_t)err;
+}
+
+/*
+ * Back the NBD device open on `nbdfd` with the local file `source`.
+ *
+ * Opens the image, connects the device, starts the worker thread and then
+ * runs erofs_nbd_do_it() on this thread until it returns.  `nbdfd` is always
+ * consumed: erofs_nbd_do_it() closes it on the normal path and it is closed
+ * here on every early failure.
+ *
+ * Returns the erofs_nbd_do_it() result if non-zero, otherwise the worker's
+ * (or pthread_join()'s) result.
+ */
+static int erofsmount_startnbd(int nbdfd, const char *source)
+{
+	struct erofsmount_nbd_ctx ctx = {};
+	uintptr_t retcode;
+	pthread_t th;
+	int imgfd, skfd, ret, ret2;
+
+	imgfd = open(source, O_RDONLY);
+	if (imgfd < 0) {
+		ret = -errno;
+		goto out_closefd;
+	}
+	ctx.vd.fd = imgfd;
+
+	/* 512-byte blocks; the device size is set to the largest value */
+	skfd = erofs_nbd_connect(nbdfd, 9, INT64_MAX >> 9);
+	if (skfd < 0) {
+		ret = skfd;
+		close(imgfd);
+		goto out_closefd;
+	}
+	ctx.sk.fd = skfd;
+
+	/* pthread functions return a positive errno: flip the sign */
+	ret = -pthread_create(&th, NULL, erofsmount_nbd_loopfn, &ctx);
+	if (ret) {
+		close(imgfd);
+		close(skfd);
+		goto out_closefd;
+	}
+
+	ret = erofs_nbd_do_it(nbdfd);
+	ret2 = -pthread_join(th, (void **)&retcode);
+	if (!ret2 && retcode) {
+		erofs_err("NBD worker failed with %s",
+		          erofs_strerror(retcode));
+		ret2 = retcode;
+	}
+	return ret ? ret : ret2;
+
+out_closefd:
+	close(nbdfd);
+	return ret;
+}
+
+/*
+ * Mount `source` on `mountpoint` through an NBD device served by a forked
+ * child process.
+ *
+ * The parent picks a free NBD device, forks, waits until sysfs shows that
+ * device being served by the child's PID, then mounts it read-only.
+ * Returns 0 or a negative errno (-EBUSY if another process ended up owning
+ * the device).  In the child, the value returned is EXIT_SUCCESS or
+ * EXIT_FAILURE from running the NBD server.
+ */
+static int erofsmount_nbd(const char *source, const char *mountpoint,
+			  const char *fstype, int flags,
+			  const char *options)
+{
+	char nbdpath[32];
+	int num, nbdfd;
+	pid_t pid;
+	long err;
+
+	if (strcmp(fstype, "erofs")) {
+		/* report the type that was actually checked */
+		fprintf(stderr, "unsupported filesystem type `%s`\n",
+			fstype);
+		return -ENODEV;
+	}
+	flags |= MS_RDONLY;
+
+	num = erofs_nbd_devscan();
+	if (num < 0)
+		return num;
+
+	(void)snprintf(nbdpath, sizeof(nbdpath), "/dev/nbd%d", num);
+	nbdfd = open(nbdpath, O_RDWR);
+	if (nbdfd < 0)
+		return -errno;
+
+	pid = fork();
+	if (pid < 0) {
+		/* without a child nobody will ever serve the device */
+		err = -errno;
+		close(nbdfd);
+		return err;
+	}
+	if (pid == 0)
+		return erofsmount_startnbd(nbdfd, source) ?
+			EXIT_FAILURE : EXIT_SUCCESS;
+	close(nbdfd);
+
+	/*
+	 * Poll sysfs until the device is up.
+	 * NOTE(review): this never gives up if the child exits before the
+	 * device becomes ready - consider also checking the child's status.
+	 */
+	while (1) {
+		err = erofs_nbd_in_service(num);
+		if (err == -ENOENT || err == -ENOTCONN) {
+			usleep(50000);
+			continue;
+		}
+		if (err >= 0)
+			err = (err != pid ? -EBUSY : 0);
+		break;
+	}
+	if (!err) {
+		err = mount(nbdpath, mountpoint, fstype, flags, options);
+		if (err < 0)
+			err = -errno;
+	}
+	return err;
+}
+
 #define EROFSMOUNT_LOOPDEV_RETRIES	3
 
-int erofsmount_loopmount(const char *source, const char *mountpoint,
-			 const char *fstype, int flags,
-			 const char *options)
+static int erofsmount_loopmount(const char *source, const char *mountpoint,
+				const char *fstype, int flags,
+				const char *options)
 {
 	int fd, dfd, num;
 	struct loop_info li = {};
@@ -269,6 +412,13 @@ int main(int argc, char *argv[])
 		goto exit;
 	}
 
+	if (mountcfg.backend == EROFSNBD) {
+		err = erofsmount_nbd(mountcfg.device, mountcfg.mountpoint,
+				     mountcfg.fstype, mountcfg.flags,
+				     mountcfg.options);
+		goto exit;
+	}
+
 	err = mount(mountcfg.device, mountcfg.mountpoint, mountcfg.fstype,
 		    mountcfg.flags, mountcfg.options);
 	if (err < 0)
-- 
2.43.0



More information about the Linux-erofs mailing list