[PATCH 1/4] erofs-utils: lib: nbd: add support for the netlink interface
Gao Xiang
hsiangkao at linux.alibaba.com
Wed Sep 3 01:06:07 AEST 2025
Since Linux 4.12, the kernel has provided a netlink-based NBD interface [1],
developed by Meta (formerly Facebook), which replaces the old ioctl-based
interface and allows crash recovery and daemon hot upgrade.
[1] https://lore.kernel.org/r/1491512527-4286-1-git-send-email-jbacik@fb.com
Signed-off-by: Gao Xiang <hsiangkao at linux.alibaba.com>
---
configure.ac | 29 +++++++
lib/Makefile.am | 1 +
lib/backends/nbd.c | 191 +++++++++++++++++++++++++++++++++++++++++++++
lib/liberofs_nbd.h | 2 +
mount/Makefile.am | 2 +-
mount/main.c | 79 ++++++++++++++++---
6 files changed, 290 insertions(+), 14 deletions(-)
diff --git a/configure.ac b/configure.ac
index 7db4489..0c03a1d 100644
--- a/configure.ac
+++ b/configure.ac
@@ -181,6 +181,10 @@ AC_ARG_WITH(json_c,
[AS_HELP_STRING([--with-json-c],
[Enable and build with json-c support @<:@default=auto@:>@])])
+# Optional libnl3 support; auto-detected unless given explicitly
+AC_ARG_WITH(libnl3,
+ [AS_HELP_STRING([--with-libnl3],
+ [Enable and build with libnl3 support @<:@default=auto@:>@])])
+
AC_ARG_ENABLE(s3,
[AS_HELP_STRING([--enable-s3], [enable s3 image generation support @<:@default=no@:>@])],
[enable_s3="$enableval"], [enable_s3="no"])
@@ -718,6 +722,31 @@ AS_IF([test "x$with_libxml2" != "xno"], [
])
])
+# Configure libnl3
+#
+# Unless --with-libnl3=no is given, look up libnl-genl-3.0 (>= 3.1) via
+# pkg-config and then double-check the result with the reported flags
+# temporarily applied: the genl header is probed, and genl_connect must be
+# both linkable and declared.  have_libnl3 becomes "yes" only when the
+# declaration check succeeds; a failing link or declaration check aborts
+# configure.  When pkg-config cannot find the module, configure aborts only
+# if --with-libnl3=yes was requested explicitly.
+#
+# NOTE(review): a missing genl header is not reported as an error here, even
+# with --with-libnl3=yes; have_libnl3 just stays "no".
+have_libnl3="no"
+AS_IF([test "x$with_libnl3" != "xno"], [
+ PKG_CHECK_MODULES([libnl3], [libnl-genl-3.0 >= 3.1], [
+ # Paranoia: don't trust the result reported by pkgconfig before trying out
+ saved_LIBS="$LIBS"
+ saved_CPPFLAGS=${CPPFLAGS}
+ CPPFLAGS="${libnl3_CFLAGS} ${CPPFLAGS}"
+ LIBS="${libnl3_LIBS} $LIBS"
+ AC_CHECK_HEADERS([netlink/genl/genl.h],[
+ AC_CHECK_LIB(nl-genl-3, genl_connect, [], [
+ AC_MSG_ERROR([libnl3 doesn't work properly])])
+ AC_CHECK_DECL(genl_connect, [have_libnl3="yes"],
+ [AC_MSG_ERROR([libnl3 doesn't work properly])], [[
+#include <netlink/genl/genl.h>
+ ]])
+ ])
+ LIBS="${saved_LIBS}"
+ CPPFLAGS="${saved_CPPFLAGS}"], [
+ AS_IF([test "x$with_libnl3" = "xyes"], [
+ AC_MSG_ERROR([Cannot find proper libnl3])
+ ])
+ ])
+])
+
AS_IF([test "x$enable_s3" != "xno"], [
AS_IF(
[test "x$have_libcurl" = "xyes" && \
diff --git a/lib/Makefile.am b/lib/Makefile.am
index 1c8be2c..1d7958b 100644
--- a/lib/Makefile.am
+++ b/lib/Makefile.am
@@ -79,6 +79,7 @@ liberofs_la_LDFLAGS += -lpthread
liberofs_la_SOURCES += workqueue.c
endif
if OS_LINUX
+liberofs_la_CFLAGS += ${libnl3_CFLAGS}
liberofs_la_SOURCES += backends/nbd.c
endif
if ENABLE_OCI
diff --git a/lib/backends/nbd.c b/lib/backends/nbd.c
index 43630f0..7058a81 100644
--- a/lib/backends/nbd.c
+++ b/lib/backends/nbd.c
@@ -19,6 +19,12 @@
#include "erofs/print.h"
#include "liberofs_nbd.h"
+#ifdef HAVE_NETLINK_GENL_GENL_H
+#include <netlink/netlink.h>
+#include <netlink/genl/genl.h>
+#include <netlink/genl/ctrl.h>
+#endif
+
#define NBD_SET_SOCK _IO( 0xab, 0 )
#define NBD_SET_BLKSIZE _IO( 0xab, 1 )
#define NBD_DO_IT _IO( 0xab, 3 )
@@ -168,6 +174,191 @@ err_out:
return err;
}
+#if defined(HAVE_NETLINK_GENL_GENL_H) && defined(HAVE_LIBNL_GENL_3)
+/*
+ * Local definitions of the NBD generic netlink attribute and command
+ * numbers used by erofs_nbd_nl_connect() below.
+ * NOTE(review): presumably these mirror the kernel's NBD netlink UAPI
+ * header; verify the values against it before changing anything here.
+ */
+
+/* top-level attributes of an NBD netlink message */
+enum {
+ NBD_ATTR_UNSPEC,
+ NBD_ATTR_INDEX,
+ NBD_ATTR_SIZE_BYTES,
+ NBD_ATTR_BLOCK_SIZE_BYTES,
+ NBD_ATTR_TIMEOUT,
+ NBD_ATTR_SERVER_FLAGS,
+ NBD_ATTR_CLIENT_FLAGS,
+ NBD_ATTR_SOCKETS,
+ NBD_ATTR_DEAD_CONN_TIMEOUT,
+ NBD_ATTR_DEVICE_LIST,
+ NBD_ATTR_BACKEND_IDENTIFIER,
+ __NBD_ATTR_MAX,
+};
+/* highest valid attribute number; also sizes the nla_parse() table */
+#define NBD_ATTR_MAX (__NBD_ATTR_MAX - 1)
+
+/* entries nested inside NBD_ATTR_SOCKETS */
+enum {
+ NBD_SOCK_ITEM_UNSPEC,
+ NBD_SOCK_ITEM,
+ __NBD_SOCK_ITEM_MAX,
+};
+#define NBD_SOCK_ITEM_MAX (__NBD_SOCK_ITEM_MAX - 1)
+
+/* attributes nested inside each NBD_SOCK_ITEM */
+enum {
+ NBD_SOCK_UNSPEC,
+ NBD_SOCK_FD,
+ __NBD_SOCK_MAX,
+};
+#define NBD_SOCK_MAX (__NBD_SOCK_MAX - 1)
+
+/* generic netlink commands of the "nbd" family */
+enum {
+ NBD_CMD_UNSPEC,
+ NBD_CMD_CONNECT,
+ NBD_CMD_DISCONNECT,
+ __NBD_CMD_MAX,
+};
+
+/* client behavior specific flags */
+/* delete the nbd device on disconnect */
+#define NBD_CFLAG_DESTROY_ON_DISCONNECT (1 << 0)
+/* disconnect the nbd device on close by last opener */
+#define NBD_CFLAG_DISCONNECT_ON_CLOSE (1 << 1)
+
+/*
+ * Allocate a generic netlink socket, connect it and resolve the "nbd"
+ * family.
+ *
+ * On success the resolved family id is stored in *driver_id and the
+ * connected socket is returned; the caller releases it with
+ * nl_socket_free().
+ *
+ * On failure an ERR_PTR()-encoded negative value is returned (-ENOMEM, or
+ * the negative code reported by libnl, passed through unchanged) and the
+ * socket allocated here is freed before returning.
+ */
+static struct nl_sock *erofs_nbd_get_nl_sock(int *driver_id)
+{
+	struct nl_sock *socket;
+	int err;
+
+	socket = nl_socket_alloc();
+	if (!socket) {
+		erofs_err("Couldn't allocate netlink socket");
+		return ERR_PTR(-ENOMEM);
+	}
+
+	err = genl_connect(socket);
+	if (err) {
+		erofs_err("Couldn't connect to the generic netlink socket");
+		goto err_free;
+	}
+
+	/* a non-negative result is the numeric id of the family */
+	err = genl_ctrl_resolve(socket, "nbd");
+	if (err < 0) {
+		erofs_err("Failed to resolve NBD netlink family. Ensure the NBD module is loaded and it supports netlink.");
+		goto err_free;
+	}
+	*driver_id = err;
+	return socket;
+
+err_free:
+	/* don't leak the socket on the error paths */
+	nl_socket_free(socket);
+	return ERR_PTR(err);
+}
+
+/* State shared between erofs_nbd_nl_connect() and its netlink reply callback */
+struct erofs_nbd_nl_cfg_cbctx {
+ /* where the callback stores the NBD_ATTR_INDEX value from the reply */
+ int *index;
+ /* status written by the callback; checked after nl_send_sync() returns */
+ int errcode;
+};
+
+/*
+ * Callback for valid netlink messages received in reply to the connect
+ * request.
+ *
+ * @msg: the received netlink message
+ * @arg: the struct erofs_nbd_nl_cfg_cbctx registered together with the
+ *       callback
+ *
+ * Parses the NBD attributes of the reply.  When NBD_ATTR_INDEX is present,
+ * its value is stored through ctx->index, ctx->errcode is cleared and NL_OK
+ * is returned.  When the attributes cannot be parsed or the index is
+ * missing, ctx->errcode receives a negative value and NL_SKIP is returned,
+ * so the attribute table is never dereferenced in those cases and the
+ * recorded error is not overwritten.
+ */
+static int erofs_nbd_nl_cfg_cb(struct nl_msg *msg, void *arg)
+{
+	struct genlmsghdr *gnlh = nlmsg_data(nlmsg_hdr(msg));
+	struct nlattr *msg_attr[NBD_ATTR_MAX + 1];
+	struct erofs_nbd_nl_cfg_cbctx *ctx = arg;
+	int err;
+
+	err = nla_parse(msg_attr, NBD_ATTR_MAX, genlmsg_attrdata(gnlh, 0),
+			genlmsg_attrlen(gnlh, 0), NULL);
+	if (err) {
+		erofs_err("Invalid response from the kernel");
+		ctx->errcode = err;
+		/* msg_attr[] is not trustworthy after a parse failure */
+		return NL_SKIP;
+	}
+
+	if (!msg_attr[NBD_ATTR_INDEX]) {
+		erofs_err("Did not receive index from the kernel");
+		ctx->errcode = -EBADMSG;
+		return NL_SKIP;
+	}
+	*ctx->index = nla_get_u32(msg_attr[NBD_ATTR_INDEX]);
+	erofs_dbg("Connected /dev/nbd%d\n", *ctx->index);
+	ctx->errcode = 0;
+	return NL_OK;
+}
+
+/*
+ * Connect an NBD device through the generic netlink interface.
+ *
+ * A UNIX stream socket pair is created; one end (sv[1]) is passed to the
+ * kernel as NBD_SOCK_FD in an NBD_CMD_CONNECT request and the other end
+ * (sv[0]) is returned to the caller.
+ *
+ * @index:      in/out.  If *index >= 0 it is sent as NBD_ATTR_INDEX,
+ *              otherwise no index attribute is sent.  The reply callback
+ *              stores the index found in the kernel reply into *index.
+ * @blkbits:    log2 of the block size; 1 << blkbits is sent as
+ *              NBD_ATTR_BLOCK_SIZE_BYTES
+ * @blocks:     device size in blocks; blocks << blkbits is sent as
+ *              NBD_ATTR_SIZE_BYTES
+ * @identifier: optional string sent as NBD_ATTR_BACKEND_IDENTIFIER; may be
+ *              NULL
+ *
+ * Returns sv[0] on success.  On failure a negative value is returned
+ * (-errno from socketpair(), -ENOMEM, -EINVAL, a negative libnl code, or
+ * the error recorded by the reply callback) and both ends of the socket
+ * pair are closed.
+ *
+ * NOTE(review): on success sv[1] is left open in this process; confirm
+ * whether it should be closed once the kernel has accepted it.
+ */
+int erofs_nbd_nl_connect(int *index, int blkbits, u64 blocks,
+			 const char *identifier)
+{
+	struct erofs_nbd_nl_cfg_cbctx cbctx = {
+		.index = index,
+	};
+	struct nl_sock *socket;
+	struct nlattr *sock_attr, *sock_opt;
+	struct nl_msg *msg;
+	int sv[2], err;
+	int driver_id;
+
+	err = socketpair(AF_UNIX, SOCK_STREAM, 0, sv);
+	if (err < 0)
+		return -errno;
+
+	socket = erofs_nbd_get_nl_sock(&driver_id);
+	if (IS_ERR(socket)) {
+		err = PTR_ERR(socket);
+		goto err_out;
+	}
+	nl_socket_modify_cb(socket, NL_CB_VALID, NL_CB_CUSTOM,
+			    erofs_nbd_nl_cfg_cb, &cbctx);
+
+	msg = nlmsg_alloc();
+	if (!msg) {
+		erofs_err("Couldn't allocate netlink message");
+		err = -ENOMEM;
+		goto err_nls_free;
+	}
+
+	/*
+	 * Every failure while building the message reports -EINVAL.  This must
+	 * be set before the first NLA_PUT_*(), since those macros jump to
+	 * nla_put_failure and err would otherwise still hold the 0 returned
+	 * by socketpair().
+	 */
+	err = -EINVAL;
+	if (!genlmsg_put(msg, NL_AUTO_PORT, NL_AUTO_SEQ, driver_id, 0, 0,
+			 NBD_CMD_CONNECT, 0)) {
+		erofs_err("Couldn't add the generic netlink header");
+		goto err_nlm_free;
+	}
+	if (*index >= 0)
+		NLA_PUT_U32(msg, NBD_ATTR_INDEX, *index);
+	NLA_PUT_U64(msg, NBD_ATTR_BLOCK_SIZE_BYTES, 1u << blkbits);
+	NLA_PUT_U64(msg, NBD_ATTR_SIZE_BYTES, blocks << blkbits);
+	NLA_PUT_U64(msg, NBD_ATTR_SERVER_FLAGS, NBD_FLAG_READ_ONLY);
+	NLA_PUT_U64(msg, NBD_ATTR_TIMEOUT, 0);
+	if (identifier)
+		NLA_PUT_STRING(msg, NBD_ATTR_BACKEND_IDENTIFIER, identifier);
+
+	sock_attr = nla_nest_start(msg, NBD_ATTR_SOCKETS);
+	if (!sock_attr) {
+		erofs_err("Couldn't nest the sockets for our connection");
+		goto err_nlm_free;
+	}
+
+	sock_opt = nla_nest_start(msg, NBD_SOCK_ITEM);
+	if (!sock_opt)
+		goto err_nlm_free;
+	NLA_PUT_U32(msg, NBD_SOCK_FD, sv[1]);
+	nla_nest_end(msg, sock_opt);
+	nla_nest_end(msg, sock_attr);
+
+	/* nl_send_sync() releases msg itself, whether it succeeds or not */
+	err = nl_send_sync(socket, msg);
+	if (err)
+		goto err_nls_free;
+	nl_socket_free(socket);
+	if (cbctx.errcode) {
+		/* the reply was unusable: don't leak the socket pair */
+		err = cbctx.errcode;
+		goto err_out;
+	}
+	return sv[0];
+
+nla_put_failure:
+	/*
+	 * The whole message is dropped below, so open nests need no explicit
+	 * cancellation (and the nest pointers may not even be set yet).
+	 */
+err_nlm_free:
+	nlmsg_free(msg);
+err_nls_free:
+	nl_socket_free(socket);
+err_out:
+	close(sv[0]);
+	close(sv[1]);
+	return err;
+}
+#else
+/*
+ * Fallback used when the libnl generic netlink header or library was not
+ * detected at build time: the netlink interface is unavailable, so this
+ * always fails with -EOPNOTSUPP and ignores all arguments.
+ */
+int erofs_nbd_nl_connect(int *index, int blkbits, u64 blocks,
+ const char *identifier)
+{
+ return -EOPNOTSUPP;
+}
+#endif
+
int erofs_nbd_do_it(int nbdfd)
{
int err;
diff --git a/lib/liberofs_nbd.h b/lib/liberofs_nbd.h
index c493aca..89c4cf2 100644
--- a/lib/liberofs_nbd.h
+++ b/lib/liberofs_nbd.h
@@ -39,4 +39,6 @@ int erofs_nbd_get_request(int skfd, struct erofs_nbd_request *rq);
int erofs_nbd_send_reply_header(int skfd, __le64 cookie, int err);
int erofs_nbd_disconnect(int nbdfd);
+int erofs_nbd_nl_connect(int *index, int blkbits, u64 blocks,
+ const char *identifier);
#endif
diff --git a/mount/Makefile.am b/mount/Makefile.am
index b76e336..d93f3f4 100644
--- a/mount/Makefile.am
+++ b/mount/Makefile.am
@@ -9,5 +9,5 @@ mount_erofs_SOURCES = main.c
mount_erofs_CFLAGS = -Wall -I$(top_srcdir)/include
mount_erofs_LDADD = $(top_builddir)/lib/liberofs.la ${libselinux_LIBS} \
${liblz4_LIBS} ${liblzma_LIBS} ${zlib_LIBS} ${libdeflate_LIBS} \
- ${libzstd_LIBS} ${libqpl_LIBS} ${libxxhash_LIBS}
+ ${libzstd_LIBS} ${libqpl_LIBS} ${libxxhash_LIBS} ${libnl3_LIBS}
endif
diff --git a/mount/main.c b/mount/main.c
index c9deae2..0df885b 100644
--- a/mount/main.c
+++ b/mount/main.c
@@ -299,6 +299,53 @@ out_closefd:
return err;
}
+/*
+ * Fork a child which connects an NBD device through the netlink interface
+ * and then serves it.
+ *
+ * The child calls erofs_nbd_nl_connect() (blkbits 9, INT64_MAX >> 9
+ * blocks, no identifier), writes the resulting device index to a pipe and
+ * runs erofsmount_nbd_loopfn() until it returns; the child never returns
+ * from this function.
+ *
+ * @pid:    receives the return value of fork()
+ * @source: path of the backing file, opened read-only and stored in
+ *          ctx.vd.fd for the child
+ *
+ * Returns the NBD device index read from the pipe, -errno if open(),
+ * pipe() or fork() fails, or -EPIPE if the child did not deliver a
+ * complete index.
+ *
+ * NOTE(review): when -EPIPE is returned the child has been forked but is
+ * not reaped here; confirm the caller deals with *pid in that case.
+ */
+static int erofsmount_startnbd_nl(pid_t *pid, const char *source)
+{
+	struct erofsmount_nbd_ctx ctx = {};
+	int err, num = -1;
+	int pipefd[2];
+
+	err = open(source, O_RDONLY);
+	if (err < 0)
+		return -errno;
+	ctx.vd.fd = err;
+
+	err = pipe(pipefd);
+	if (err < 0) {
+		err = -errno;
+		close(ctx.vd.fd);
+		return err;
+	}
+
+	*pid = fork();
+	if (*pid < 0) {
+		/* save errno first: close() may overwrite it */
+		err = -errno;
+		close(pipefd[0]);
+		close(pipefd[1]);
+		close(ctx.vd.fd);
+		return err;
+	}
+
+	if (*pid == 0) {
+		/* Otherwise, NBD disconnect sends SIGPIPE, skipping cleanup */
+		if (signal(SIGPIPE, SIG_IGN) == SIG_ERR) {
+			close(ctx.vd.fd);
+			exit(EXIT_FAILURE);
+		}
+
+		num = -1;
+		err = erofs_nbd_nl_connect(&num, 9, INT64_MAX >> 9, NULL);
+		if (err >= 0) {
+			ctx.sk.fd = err;
+			err = write(pipefd[1], &num, sizeof(num));
+			/*
+			 * Compare as signed values: a -1 from write() must
+			 * not be converted to a huge unsigned number.
+			 */
+			if (err == (int)sizeof(num)) {
+				close(pipefd[1]);
+				close(pipefd[0]);
+				err = (int)(uintptr_t)erofsmount_nbd_loopfn(&ctx);
+				exit(err ? EXIT_FAILURE : EXIT_SUCCESS);
+			}
+		}
+		close(ctx.vd.fd);
+		exit(EXIT_FAILURE);
+	}
+
+	/* parent: only the child uses the source fd */
+	close(ctx.vd.fd);
+	/* drop our write end so that read() sees EOF if the child dies */
+	close(pipefd[1]);
+	err = read(pipefd[0], &num, sizeof(num));
+	close(pipefd[0]);
+	/* covers read errors (-1), EOF (0) and short reads alike */
+	if (err != (int)sizeof(num))
+		return -EPIPE;
+	return num;
+}
+
static int erofsmount_nbd(const char *source, const char *mountpoint,
const char *fstype, int flags,
const char *options)
@@ -315,19 +362,25 @@ static int erofsmount_nbd(const char *source, const char *mountpoint,
}
flags |= MS_RDONLY;
- num = erofs_nbd_devscan();
- if (num < 0)
- return num;
-
- (void)snprintf(nbdpath, sizeof(nbdpath), "/dev/nbd%d", num);
- nbdfd = open(nbdpath, O_RDWR);
- if (nbdfd < 0)
- return -errno;
-
- if ((pid = fork()) == 0)
- return erofsmount_startnbd(nbdfd, source) ?
- EXIT_FAILURE : EXIT_SUCCESS;
- close(nbdfd);
+ err = erofsmount_startnbd_nl(&pid, source);
+ if (err < 0) {
+ num = erofs_nbd_devscan();
+ if (num < 0)
+ return num;
+
+ (void)snprintf(nbdpath, sizeof(nbdpath), "/dev/nbd%d", num);
+ nbdfd = open(nbdpath, O_RDWR);
+ if (nbdfd < 0)
+ return -errno;
+
+ if ((pid = fork()) == 0)
+ return erofsmount_startnbd(nbdfd, source) ?
+ EXIT_FAILURE : EXIT_SUCCESS;
+ close(nbdfd);
+ } else {
+ num = err;
+ (void)snprintf(nbdpath, sizeof(nbdpath), "/dev/nbd%d", num);
+ }
while (1) {
err = erofs_nbd_in_service(num);
--
2.43.5
More information about the Linux-erofs
mailing list