[PATCH] erofs-utils: mount: add NBD backend support
Gao Xiang
hsiangkao at linux.alibaba.com
Wed Aug 27 16:11:39 AEST 2025
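Currently, only local images are supported (so the current NBD
backend is mostly useful for functional testing). It is selected via
the `nbd` filesystem subtype (the usage example that followed here was
lost in this archived copy).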
However, it already uses the vfile interface, so extending it to
remote sources should be straightforward.
NBD failover is not supported yet and should be implemented via the
new netlink interface.
Signed-off-by: Gao Xiang <hsiangkao at linux.alibaba.com>
---
include/erofs/defs.h | 9 ++
include/erofs/io.h | 6 ++
lib/Makefile.am | 4 +
lib/backends/nbd.c | 223 +++++++++++++++++++++++++++++++++++++++++++
lib/io.c | 69 ++++++++++++-
lib/liberofs_nbd.h | 39 ++++++++
mount/main.c | 156 +++++++++++++++++++++++++++++-
7 files changed, 499 insertions(+), 7 deletions(-)
create mode 100644 lib/backends/nbd.c
create mode 100644 lib/liberofs_nbd.h
diff --git a/include/erofs/defs.h b/include/erofs/defs.h
index 0f3e754..8af99ae 100644
--- a/include/erofs/defs.h
+++ b/include/erofs/defs.h
@@ -88,6 +88,10 @@ typedef int64_t s64;
#define le32_to_cpu(x) ((__u32)(x))
#define le64_to_cpu(x) ((__u64)(x))
+#define cpu_to_be32(x) ((__be32)__builtin_bswap32(x))
+#define cpu_to_be64(x) ((__be64)__builtin_bswap64(x))
+#define be32_to_cpu(x) (__builtin_bswap32(x))
+#define be64_to_cpu(x) (__builtin_bswap64(x))
#else
#if __BYTE_ORDER == __BIG_ENDIAN
#define cpu_to_le16(x) (__builtin_bswap16(x))
@@ -96,6 +100,11 @@ typedef int64_t s64;
#define le16_to_cpu(x) (__builtin_bswap16(x))
#define le32_to_cpu(x) (__builtin_bswap32(x))
#define le64_to_cpu(x) (__builtin_bswap64(x))
+
+#define cpu_to_be32(x) ((__be32)(x))
+#define cpu_to_be64(x) ((__be64)(x))
+#define be32_to_cpu(x) ((__u32)(x))
+#define be64_to_cpu(x) ((__u64)(x))
#else
#pragma error
#endif
diff --git a/include/erofs/io.h b/include/erofs/io.h
index cc7a3cd..370765f 100644
--- a/include/erofs/io.h
+++ b/include/erofs/io.h
@@ -16,6 +16,7 @@ extern "C"
#define _GNU_SOURCE
#endif
#include <unistd.h>
+#include <sys/stat.h>
#include <sys/uio.h>
#include "defs.h"
@@ -36,6 +37,8 @@ struct erofs_vfops {
ssize_t (*read)(struct erofs_vfile *vf, void *buf, size_t len);
off_t (*lseek)(struct erofs_vfile *vf, u64 offset, int whence);
int (*fstat)(struct erofs_vfile *vf, struct stat *buf);
+ ssize_t (*sendfile)(struct erofs_vfile *vout, struct erofs_vfile *vin,
+ off_t *pos, size_t count);
int (*xcopy)(struct erofs_vfile *vout, off_t pos,
struct erofs_vfile *vin, unsigned int len, bool noseek);
};
@@ -53,6 +56,7 @@ struct erofs_vfile {
};
ssize_t __erofs_io_write(int fd, const void *buf, size_t len);
+int __erofs_0write(int fd, size_t len);
int erofs_io_fstat(struct erofs_vfile *vf, struct stat *buf);
ssize_t erofs_io_pwrite(struct erofs_vfile *vf, const void *buf, u64 pos, size_t len);
@@ -67,6 +71,8 @@ off_t erofs_io_lseek(struct erofs_vfile *vf, u64 offset, int whence);
ssize_t erofs_copy_file_range(int fd_in, u64 *off_in, int fd_out, u64 *off_out,
size_t length);
+ssize_t erofs_io_sendfile(struct erofs_vfile *vout, struct erofs_vfile *vin,
+ off_t *pos, size_t count);
int erofs_io_xcopy(struct erofs_vfile *vout, off_t pos,
struct erofs_vfile *vin, unsigned int len, bool noseek);
diff --git a/lib/Makefile.am b/lib/Makefile.am
index 955495d..4f8e767 100644
--- a/lib/Makefile.am
+++ b/lib/Makefile.am
@@ -31,6 +31,7 @@ noinst_HEADERS = $(top_srcdir)/include/erofs_fs.h \
$(top_srcdir)/lib/liberofs_private.h \
$(top_srcdir)/lib/liberofs_xxhash.h \
$(top_srcdir)/lib/liberofs_metabox.h \
+ $(top_srcdir)/lib/liberofs_nbd.h \
$(top_srcdir)/lib/liberofs_s3.h
noinst_HEADERS += compressor.h
@@ -76,3 +77,6 @@ if ENABLE_EROFS_MT
liberofs_la_LDFLAGS = -lpthread
liberofs_la_SOURCES += workqueue.c
endif
+if OS_LINUX
+liberofs_la_SOURCES += backends/nbd.c
+endif
diff --git a/lib/backends/nbd.c b/lib/backends/nbd.c
new file mode 100644
index 0000000..398a1e9
--- /dev/null
+++ b/lib/backends/nbd.c
@@ -0,0 +1,223 @@
+// SPDX-License-Identifier: GPL-2.0+ OR Apache-2.0
+/*
+ * Copyright (C) 2025 Alibaba Cloud
+ */
+#include <errno.h>
+#include <sys/socket.h>
+#include <arpa/inet.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <sys/un.h>
+#include <sys/ioctl.h>
+#include <sys/types.h>
+#include <dirent.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include "erofs/io.h"
+#include "erofs/err.h"
+#include "erofs/print.h"
+#include "liberofs_nbd.h"
+
+#define NBD_SET_SOCK _IO( 0xab, 0 )
+#define NBD_SET_BLKSIZE _IO( 0xab, 1 )
+#define NBD_DO_IT _IO( 0xab, 3 )
+#define NBD_CLEAR_SOCK _IO( 0xab, 4 )
+#define NBD_SET_SIZE_BLOCKS _IO( 0xab, 7 )
+#define NBD_SET_TIMEOUT _IO( 0xab, 9 )
+#define NBD_SET_FLAGS _IO( 0xab, 10)
+
+#define NBD_REQUEST_MAGIC 0x25609513
+#define NBD_REPLY_MAGIC 0x67446698
+
+#define NBD_FLAG_READ_ONLY (1 << 1) /* device is read-only */
+
+/*
+ * This is the reply packet that nbd-server sends back to the client after
+ * it has completed an I/O request (or an error occurs).
+ *
+ * The fields are big-endian and the structure is packed; in this file it
+ * is filled in and written to the socket as one unit by
+ * erofs_nbd_send_reply_header().
+ */
+struct nbd_reply {
+	__be32 magic;		/* NBD_REPLY_MAGIC */
+	__be32 error;		/* 0 = ok, else error */
+	union {
+		__be64 cookie;	/* Opaque identifier from request */
+		char handle[8];	/* older spelling of cookie */
+	};
+} __packed;
+
+/*
+ * Check whether /dev/nbd<nbdnum> is currently being served.
+ *
+ * Reads the sysfs attributes /sys/block/nbd<nbdnum>/size and
+ * /sys/block/nbd<nbdnum>/pid.
+ *
+ * Returns the number parsed from the `pid` attribute, -ENOTCONN if the
+ * `size` attribute starts with "0\n", or a negative errno if either
+ * attribute cannot be opened or read.
+ */
+long erofs_nbd_in_service(int nbdnum)
+{
+	int fd, err;
+	char s[32];
+
+	(void)snprintf(s, sizeof(s), "/sys/block/nbd%d/size", nbdnum);
+	fd = open(s, O_RDONLY);
+	if (fd < 0)
+		return -errno;
+	/* read() does not terminate the buffer: keep one byte for '\0' */
+	err = read(fd, s, sizeof(s) - 1);
+	if (err < 0) {
+		err = -errno;
+		close(fd);
+		return err;
+	}
+	close(fd);
+	s[err] = '\0';
+	/* only compare bytes that were really read */
+	if (err >= (int)(sizeof("0\n") - 1) &&
+	    !memcmp(s, "0\n", sizeof("0\n") - 1))
+		return -ENOTCONN;
+
+	(void)snprintf(s, sizeof(s), "/sys/block/nbd%d/pid", nbdnum);
+	fd = open(s, O_RDONLY);
+	if (fd < 0)
+		return -errno;
+	err = read(fd, s, sizeof(s) - 1);
+	if (err < 0) {
+		err = -errno;
+		close(fd);
+		return err;
+	}
+	close(fd);
+	/* strtol() needs a NUL-terminated string */
+	s[err] = '\0';
+	return strtol(s, NULL, 10);
+}
+
+/*
+ * Scan /sys/block for an NBD device that looks unused, i.e. an entry
+ * named nbd<N> that has neither a `pid` nor a `backend` attribute.
+ *
+ * Returns the device index <N> of the first such entry, -EBUSY if the
+ * directory was exhausted without finding one, or a negative errno if
+ * /sys/block cannot be opened or read.
+ */
+int erofs_nbd_devscan(void)
+{
+	DIR *_dir;
+	int err;
+
+	_dir = opendir("/sys/block");
+	if (!_dir) {
+		/* latch errno first: fprintf() is allowed to change it */
+		err = -errno;
+		fprintf(stderr, "failed to opendir /sys/block: %s\n",
+			strerror(-err));
+		return err;
+	}
+
+	while (1) {
+		struct dirent *dp;
+		char path[64];
+
+		/*
+		 * Set errno to 0 before calling readdir() in order to
+		 * distinguish the end of the stream from an error.
+		 */
+		errno = 0;
+		dp = readdir(_dir);
+		if (!dp) {
+			if (errno)
+				err = -errno;
+			else
+				err = -EBUSY;
+			break;
+		}
+
+		if (strncmp(dp->d_name, "nbd", 3))
+			continue;
+
+		/* Skip nbdX with valid `pid` or `backend` */
+		err = snprintf(path, sizeof(path), "%s/pid", dp->d_name);
+		/* a truncated path would probe the wrong name, so skip it */
+		if (err < 0 || (size_t)err >= sizeof(path))
+			continue;
+		if (!faccessat(dirfd(_dir), path, F_OK, 0))
+			continue;
+		err = snprintf(path, sizeof(path), "%s/backend", dp->d_name);
+		if (err < 0 || (size_t)err >= sizeof(path))
+			continue;
+		if (!faccessat(dirfd(_dir), path, F_OK, 0))
+			continue;
+		err = atoi(dp->d_name + 3);
+		break;
+	}
+	closedir(_dir);
+	return err;
+}
+
+/*
+ * Configure the NBD device @nbdfd as a read-only device of @blocks blocks
+ * of (1 << @blkbits) bytes each and attach one end of a fresh AF_UNIX
+ * socketpair to it.
+ *
+ * Returns the file descriptor of the other end of the socketpair, on
+ * which requests are to be served, or a negative errno on failure.
+ */
+int erofs_nbd_connect(int nbdfd, int blkbits, u64 blocks)
+{
+	int sv[2], err;
+
+	err = socketpair(AF_UNIX, SOCK_STREAM, 0, sv);
+	if (err < 0)
+		return -errno;
+
+	/*
+	 * ioctl() is variadic, so pass every argument explicitly as an
+	 * unsigned long rather than relying on the type of the expression.
+	 */
+	err = ioctl(nbdfd, NBD_CLEAR_SOCK, 0UL);
+	if (err < 0)
+		goto err_out;
+
+	err = ioctl(nbdfd, NBD_SET_BLKSIZE, 1UL << blkbits);
+	if (err < 0)
+		goto err_out;
+
+	err = ioctl(nbdfd, NBD_SET_SIZE_BLOCKS, (unsigned long)blocks);
+	if (err < 0)
+		goto err_out;
+
+	err = ioctl(nbdfd, NBD_SET_TIMEOUT, 0UL);
+	if (err < 0)
+		goto err_out;
+
+	err = ioctl(nbdfd, NBD_SET_FLAGS, (unsigned long)NBD_FLAG_READ_ONLY);
+	if (err < 0)
+		goto err_out;
+
+	err = ioctl(nbdfd, NBD_SET_SOCK, (unsigned long)sv[1]);
+	if (err < 0)
+		goto err_out;
+	/*
+	 * NOTE(review): sv[1] stays open in this process, as before;
+	 * confirm whether it can be closed once it has been handed over.
+	 */
+	return sv[0];
+err_out:
+	/*
+	 * Every jump here directly follows a failed ioctl(), so errno is
+	 * still valid; latch it before close() can overwrite it.  The raw
+	 * ioctl() result (-1) must not be returned as an error code.
+	 */
+	err = -errno;
+	close(sv[0]);
+	close(sv[1]);
+	return err;
+}
+
+/*
+ * Issue NBD_DO_IT on @nbdfd and close @nbdfd once the ioctl has returned.
+ *
+ * Returns the ioctl result, except that a failure is reported as -errno
+ * and EPIPE is treated as success.
+ */
+int erofs_nbd_do_it(int nbdfd)
+{
+	int ret = ioctl(nbdfd, NBD_DO_IT, 0);
+
+	/*
+	 * `ioctl(NBD_DO_IT)` normally fails with EPIPE when someone has
+	 * disconnected the socket via NBD_DISCONNECT; that is not reported
+	 * as a failure.
+	 */
+	if (ret < 0)
+		ret = (errno == EPIPE) ? 0 : -errno;
+
+	if (ret != 0)
+		erofs_err("NBD_DO_IT ends with %s", erofs_strerror(ret));
+	close(nbdfd);
+	return ret;
+}
+
+/*
+ * Read one request header from the socket @skfd into @rq.
+ *
+ * On success the `type`, `from` and `len` fields of @rq have been
+ * converted from big-endian to host byte order; `cookie` is left as
+ * received.
+ *
+ * Returns 0 on success, -EPIPE if fewer bytes than a full header could
+ * be read, -EIO on a bad magic number, or the negative value returned
+ * by erofs_io_read() (presumably a negative errno, as elsewhere in
+ * this library).
+ */
+int erofs_nbd_get_request(int skfd, struct erofs_nbd_request *rq)
+{
+	struct erofs_vfile vf = { .fd = skfd };
+	ssize_t ret;
+
+	ret = erofs_io_read(&vf, rq, sizeof(*rq));
+	/*
+	 * Test the sign on its own: comparing a negative value against
+	 * sizeof() converts it to a huge unsigned number, which would let
+	 * a failed read through with *rq uninitialized.
+	 */
+	if (ret < 0)
+		return ret;
+	if ((size_t)ret < sizeof(*rq))
+		return -EPIPE;
+
+	if (rq->magic != cpu_to_be32(NBD_REQUEST_MAGIC))
+		return -EIO;
+
+	rq->type = be32_to_cpu((__be32)rq->type);
+	rq->from = be64_to_cpu((__be64)rq->from);
+	rq->len = be32_to_cpu((__be32)rq->len);
+	return 0;
+}
+
+/*
+ * Write a reply header for the request identified by @cookie to @skfd.
+ *
+ * @cookie is copied verbatim from the request.  @err is 0 for success or
+ * an errno value; either sign is accepted and the magnitude is sent.
+ *
+ * Returns 0 if the whole header was written, -errno if write() failed,
+ * or -EIO on a short write.
+ */
+int erofs_nbd_send_reply_header(int skfd, __le64 cookie, int err)
+{
+	struct nbd_reply reply = {
+		.magic = cpu_to_be32(NBD_REPLY_MAGIC),
+		/*
+		 * Callers in this code base pass negative errno values
+		 * (e.g. -EIO); put the positive error number on the wire.
+		 */
+		.error = cpu_to_be32(err < 0 ? -err : err),
+		.cookie = cookie,
+	};
+	ssize_t ret;
+
+	/* an interrupted write has sent nothing, so simply retry it */
+	do {
+		ret = write(skfd, &reply, sizeof(reply));
+	} while (ret < 0 && errno == EINTR);
+
+	if (ret < 0)
+		return -errno;
+	return (size_t)ret == sizeof(reply) ? 0 : -EIO;
+}
diff --git a/lib/io.c b/lib/io.c
index b91c93c..ff3b794 100644
--- a/lib/io.c
+++ b/lib/io.c
@@ -147,10 +147,29 @@ int erofs_io_fsync(struct erofs_vfile *vf)
return 0;
}
+static const char erofs_zeroed[EROFS_MAX_BLOCK_SIZE];
+
+/*
+ * Write @len zero bytes to @fd, at most sizeof(erofs_zeroed) per write().
+ *
+ * Returns -errno if a write() fails, otherwise the number of bytes that
+ * are still unwritten (0 when everything went out; non-zero only if a
+ * write() returned 0).
+ */
+int __erofs_0write(int fd, size_t len)
+{
+	int ret;
+
+	for (ret = 0; len; len -= ret) {
+		u32 chunk = min_t(u64, sizeof(erofs_zeroed), len);
+
+		ret = write(fd, erofs_zeroed, chunk);
+		if (ret < 0)
+			return -errno;
+		if (!ret)
+			break;
+	}
+	return len;
+}
+
int erofs_io_fallocate(struct erofs_vfile *vf, u64 offset,
size_t len, bool zeroout)
{
- static const char zero[EROFS_MAX_BLOCK_SIZE] = {0};
ssize_t ret;
if (__erofs_unlikely(cfg.c_dry_run))
@@ -164,14 +183,15 @@ int erofs_io_fallocate(struct erofs_vfile *vf, u64 offset,
FALLOC_FL_KEEP_SIZE, offset + vf->offset, len) >= 0)
return 0;
#endif
- while (len > EROFS_MAX_BLOCK_SIZE) {
- ret = erofs_io_pwrite(vf, zero, offset, EROFS_MAX_BLOCK_SIZE);
+ while (len > sizeof(erofs_zeroed)) {
+ ret = erofs_io_pwrite(vf, erofs_zeroed, offset,
+ sizeof(erofs_zeroed));
if (ret < 0)
return (int)ret;
len -= ret;
offset += ret;
}
- return erofs_io_pwrite(vf, zero, offset, len) == len ? 0 : -EIO;
+ return erofs_io_pwrite(vf, erofs_zeroed, offset, len) == len ? 0 : -EIO;
}
int erofs_io_ftruncate(struct erofs_vfile *vf, u64 length)
@@ -551,6 +571,47 @@ off_t erofs_io_lseek(struct erofs_vfile *vf, u64 offset, int whence)
return lseek(vf->fd, offset, whence);
}
+/*
+ * Copy up to @count bytes from @vin to @vout.
+ *
+ * If @pos is non-NULL the data is read from offset *@pos of @vin and
+ * *@pos is advanced; otherwise @vin is read sequentially.  If either
+ * file has custom ops, the request is handed to its ->sendfile()
+ * (@vin's first).  Otherwise sendfile(2) is tried where available and a
+ * plain read/write loop handles whatever is left.
+ *
+ * Returns the number of bytes that were NOT transferred (0 if all of
+ * @count was copied), or a negative error code.
+ */
+ssize_t erofs_io_sendfile(struct erofs_vfile *vout, struct erofs_vfile *vin,
+			  off_t *pos, size_t count)
+{
+	/* must be defined even if neither loop below runs */
+	ssize_t written = 0;
+
+	if (vin->ops || vout->ops) {
+		if (vin->ops)
+			return vin->ops->sendfile(vout, vin, pos, count);
+		return vout->ops->sendfile(vout, vin, pos, count);
+	}
+#if defined(HAVE_SYS_SENDFILE_H) && defined(HAVE_SENDFILE)
+	do {
+		written = sendfile(vout->fd, vin->fd, pos, count);
+		if (written <= 0) {
+			if (written < 0) {
+				written = -errno;
+				if (written == -EOVERFLOW && pos)
+					written = 0;
+			}
+			break;
+		}
+		count -= written;
+	} while (written);
+#endif
+	/*
+	 * Generic fallback for whatever sendfile() did not (or could not)
+	 * transfer.  Neither file has custom ops here, so vout->fd is a
+	 * plain descriptor and can be written to directly.
+	 */
+	while (count) {
+		char buf[EROFS_MAX_BLOCK_SIZE];
+		ssize_t nread;
+
+		nread = min_t(u64, count, sizeof(buf));
+		if (pos)
+			nread = erofs_io_pread(vin, buf, nread, *pos);
+		else
+			nread = erofs_io_read(vin, buf, nread);
+		if (nread <= 0) {
+			written = nread;
+			break;
+		}
+
+		/* the data must really reach vout before it is counted */
+		written = __erofs_io_write(vout->fd, buf, nread);
+		if (written < 0)
+			break;
+		count -= written;
+		if (pos)
+			*pos += written;
+		if (written < nread)	/* the sink accepts no more data */
+			break;
+	}
+	return written < 0 ? written : count;
+}
+
int erofs_io_xcopy(struct erofs_vfile *vout, off_t pos,
struct erofs_vfile *vin, unsigned int len, bool noseek)
{
diff --git a/lib/liberofs_nbd.h b/lib/liberofs_nbd.h
new file mode 100644
index 0000000..6660df1
--- /dev/null
+++ b/lib/liberofs_nbd.h
@@ -0,0 +1,39 @@
+/* SPDX-License-Identifier: GPL-2.0+ OR Apache-2.0 */
+/*
+ * Copyright (C) 2025 Alibaba Cloud
+ */
+#ifndef __EROFS_LIB_LIBEROFS_NBD_H
+#define __EROFS_LIB_LIBEROFS_NBD_H
+
+#include "erofs/defs.h"
+
+/*
+ * Request types, as stored in erofs_nbd_request.type once the header
+ * has been converted to host byte order.
+ */
+enum {
+	EROFS_NBD_CMD_READ		= 0,
+	EROFS_NBD_CMD_WRITE		= 1,
+	EROFS_NBD_CMD_DISC		= 2,
+	EROFS_NBD_CMD_FLUSH		= 3,
+	EROFS_NBD_CMD_TRIM		= 4,
+	/* userspace defines additional extension commands */
+	EROFS_NBD_CMD_WRITE_ZEROES	= 6,
+};
+
+/*
+ * Request header as read from the NBD socket (packed, big-endian on the
+ * wire).  `type`, `from` and `len` are declared with host types because
+ * erofs_nbd_get_request() converts them in place after reading; `magic`
+ * and `cookie` are left in their on-wire form.
+ */
+struct erofs_nbd_request {
+	__be32 magic;		/* NBD_REQUEST_MAGIC */
+	u32 type;		/* See NBD_CMD_* */
+	union {
+		__be64 cookie;	/* Opaque identifier for request */
+		char handle[8];	/* older spelling of cookie */
+	};
+	u64 from;		/* offset of the request */
+	u32 len;		/* length of the request */
+} __packed;
+
+/*
+ * Returns the value of /sys/block/nbd<nbdnum>/pid, -ENOTCONN if the
+ * device size reads as 0, or another negative error code.
+ */
+long erofs_nbd_in_service(int nbdnum);
+/*
+ * Looks in /sys/block for an nbd<N> entry without a `pid` or `backend`
+ * attribute; returns <N>, or a negative error code (-EBUSY if none).
+ */
+int erofs_nbd_devscan(void);
+/*
+ * Sets up @nbdfd as a read-only device of @blocks blocks of
+ * (1 << @blkbits) bytes backed by a socketpair; returns the socket to
+ * serve requests on, or a negative value on failure.
+ */
+int erofs_nbd_connect(int nbdfd, int blkbits, u64 blocks);
+/* Issues NBD_DO_IT on @nbdfd and closes @nbdfd afterwards. */
+int erofs_nbd_do_it(int nbdfd);
+/*
+ * Reads one request header from @skfd into @rq (type/from/len in host
+ * byte order); returns 0 or a negative error code.
+ */
+int erofs_nbd_get_request(int skfd, struct erofs_nbd_request *rq);
+/*
+ * Writes a reply header carrying @cookie and @err to @skfd; returns 0
+ * or a negative error code.
+ */
+int erofs_nbd_send_reply_header(int skfd, __le64 cookie, int err);
+
+#endif
diff --git a/mount/main.c b/mount/main.c
index 0f7538a..9cb203f 100644
--- a/mount/main.c
+++ b/mount/main.c
@@ -6,10 +6,13 @@
#include <stdlib.h>
#include <string.h>
#include <sys/mount.h>
+#include <sys/wait.h>
+#include <pthread.h>
#include <unistd.h>
#include "erofs/config.h"
#include "erofs/print.h"
#include "erofs/err.h"
+#include "erofs/io.h"
+#include "../lib/liberofs_nbd.h"
#ifdef HAVE_LINUX_LOOP_H
#include <linux/loop.h>
#else
@@ -30,6 +33,7 @@ enum erofs_backend_drv {
EROFSAUTO,
EROFSLOCAL,
EROFSFUSE,
+ EROFSNBD,
};
static struct erofsmount_cfg {
@@ -132,6 +136,8 @@ static int erofsmount_parse_options(int argc, char **argv)
mountcfg.backend = EROFSFUSE;
} else if (!strcmp(dot + 1, "local")) {
mountcfg.backend = EROFSLOCAL;
+ } else if (!strcmp(dot + 1, "nbd")) {
+ mountcfg.backend = EROFSNBD;
} else {
erofs_err("invalid filesystem subtype `%s`", dot + 1);
return -EINVAL;
@@ -196,11 +202,148 @@ static int erofsmount_fuse(const char *source, const char *mountpoint,
return 0;
}
+/*
+ * State handed to the NBD worker thread: where to read image data from
+ * and which socket to serve requests on.
+ */
+struct erofsmount_nbd_ctx {
+	struct erofs_vfile vd;		/* virtual device */
+	struct erofs_vfile sk;		/* socket file */
+};
+
+/*
+ * NBD worker thread: serve requests arriving on ctx->sk from ctx->vd.
+ *
+ * Only EROFS_NBD_CMD_READ is served; every other request type is
+ * answered with an -EIO reply.  A read that reaches beyond the data
+ * available in ctx->vd is padded with zeroes.
+ *
+ * @arg is a struct erofsmount_nbd_ctx; both of its descriptors are
+ * closed before the thread exits.  The thread's exit value is 0 or a
+ * negative error code cast to a pointer (-EPIPE from
+ * erofs_nbd_get_request() counts as a normal end).
+ */
+static void *erofsmount_nbd_loopfn(void *arg)
+{
+	struct erofsmount_nbd_ctx *ctx = arg;
+	int err;
+
+	while (1) {
+		struct erofs_nbd_request rq;
+		ssize_t rem;
+		off_t pos;
+
+		err = erofs_nbd_get_request(ctx->sk.fd, &rq);
+		if (err < 0) {
+			if (err == -EPIPE)
+				err = 0;
+			break;
+		}
+
+		if (rq.type != EROFS_NBD_CMD_READ) {
+			err = erofs_nbd_send_reply_header(ctx->sk.fd,
+						rq.cookie, -EIO);
+			if (err)
+				break;
+			/*
+			 * The request has been answered: do not fall
+			 * through and send a second (success) reply plus
+			 * data for it.
+			 */
+			continue;
+		}
+
+		err = erofs_nbd_send_reply_header(ctx->sk.fd, rq.cookie, 0);
+		if (err)
+			break;
+		pos = rq.from;
+		rem = erofs_io_sendfile(&ctx->sk, &ctx->vd, &pos, rq.len);
+		if (rem < 0) {
+			/* already a negative error code; errno may be stale */
+			err = rem;
+			break;
+		}
+		/* pad whatever could not be read from the image */
+		err = __erofs_0write(ctx->sk.fd, rem);
+		if (err) {
+			if (err > 0)
+				err = -EIO;
+			break;
+		}
+	}
+	close(ctx->vd.fd);
+	close(ctx->sk.fd);
+	return (void *)(uintptr_t)err;
+}
+
+/*
+ * Serve the image @source through the NBD device @nbdfd.
+ *
+ * Opens @source read-only, connects @nbdfd, starts the worker thread and
+ * then sits in erofs_nbd_do_it() until it returns.  @nbdfd is closed on
+ * every path.
+ *
+ * Returns 0 on success; otherwise the first failure among setup,
+ * erofs_nbd_do_it(), pthread_join() and the worker's own result.
+ */
+static int erofsmount_startnbd(int nbdfd, const char *source)
+{
+	struct erofsmount_nbd_ctx ctx = {};
+	uintptr_t retcode;
+	pthread_t th;
+	int ret, joinret;
+
+	ctx.vd.fd = open(source, O_RDONLY);
+	if (ctx.vd.fd < 0) {
+		ret = -errno;
+		goto out_closenbd;
+	}
+
+	ret = erofs_nbd_connect(nbdfd, 9, INT64_MAX >> 9);
+	if (ret < 0)
+		goto out_closevd;
+	ctx.sk.fd = ret;
+
+	ret = -pthread_create(&th, NULL, erofsmount_nbd_loopfn, &ctx);
+	if (ret)
+		goto out_closesk;
+
+	/* from here on the worker owns ctx.vd.fd and ctx.sk.fd */
+	ret = erofs_nbd_do_it(nbdfd);
+	joinret = -pthread_join(th, (void **)&retcode);
+	if (!joinret && retcode) {
+		erofs_err("NBD worker failed with %s",
+			  erofs_strerror(retcode));
+		joinret = retcode;
+	}
+	return ret ? ret : joinret;
+
+out_closesk:
+	close(ctx.sk.fd);
+out_closevd:
+	close(ctx.vd.fd);
+out_closenbd:
+	close(nbdfd);
+	return ret;
+}
+
+/*
+ * Mount @source on @mountpoint through an NBD device.
+ *
+ * Picks an NBD device with erofs_nbd_devscan(), forks a child that
+ * serves @source on it, waits until sysfs reports the device as served
+ * by that child and then mounts the device read-only.
+ *
+ * In the parent, returns 0 on success or a negative error code.  In the
+ * forked child, returns EXIT_SUCCESS or EXIT_FAILURE once serving has
+ * ended.
+ */
+static int erofsmount_nbd(const char *source, const char *mountpoint,
+			  const char *fstype, int flags,
+			  const char *options)
+{
+	char nbdpath[32];
+	int num, nbdfd;
+	pid_t pid;
+	long err;
+
+	if (strcmp(fstype, "erofs")) {
+		fprintf(stderr, "unsupported filesystem type `%s`\n",
+			fstype);
+		return -ENODEV;
+	}
+	flags |= MS_RDONLY;
+
+	num = erofs_nbd_devscan();
+	if (num < 0)
+		return num;
+
+	(void)snprintf(nbdpath, sizeof(nbdpath), "/dev/nbd%d", num);
+	nbdfd = open(nbdpath, O_RDWR);
+	if (nbdfd < 0)
+		return -errno;
+
+	pid = fork();
+	if (pid < 0) {
+		/* no server was started, so there is nothing to wait for */
+		err = -errno;
+		close(nbdfd);
+		return err;
+	}
+	if (pid == 0)
+		return erofsmount_startnbd(nbdfd, source) ?
+			EXIT_FAILURE : EXIT_SUCCESS;
+	close(nbdfd);
+
+	while (1) {
+		err = erofs_nbd_in_service(num);
+		if (err == -ENOENT || err == -ENOTCONN) {
+			pid_t w = waitpid(pid, NULL, WNOHANG);
+
+			/*
+			 * If the child has already exited (e.g. it could
+			 * not open the image) the device will never come
+			 * up: fail instead of polling forever.
+			 */
+			if (w < 0)
+				return -errno;
+			if (w > 0)
+				return -EIO;
+			usleep(50000);
+			continue;
+		}
+		/* the device must be served by our child, not someone else */
+		if (err >= 0)
+			err = (err != pid ? -EBUSY : 0);
+		break;
+	}
+	if (!err) {
+		err = mount(nbdpath, mountpoint, fstype, flags, options);
+		if (err < 0)
+			err = -errno;
+	}
+	return err;
+}
+
#define EROFSMOUNT_LOOPDEV_RETRIES 3
-int erofsmount_loopmount(const char *source, const char *mountpoint,
- const char *fstype, int flags,
- const char *options)
+static int erofsmount_loopmount(const char *source, const char *mountpoint,
+ const char *fstype, int flags,
+ const char *options)
{
int fd, dfd, num;
struct loop_info li = {};
@@ -269,6 +412,13 @@ int main(int argc, char *argv[])
goto exit;
}
+ if (mountcfg.backend == EROFSNBD) {
+ err = erofsmount_nbd(mountcfg.device, mountcfg.mountpoint,
+ mountcfg.fstype, mountcfg.flags,
+ mountcfg.options);
+ goto exit;
+ }
+
err = mount(mountcfg.device, mountcfg.mountpoint, mountcfg.fstype,
mountcfg.flags, mountcfg.options);
if (err < 0)
--
2.43.0
More information about the Linux-erofs
mailing list