[PATCH 3/4] erofs-utils: add ublk userspace block device backend
Chengyu
hudson at cyzhu.com
Mon Mar 16 01:27:44 AEDT 2026
From: Chengyu Zhu <hudsonzhu at tencent.com>
Add a ublk userspace block device backend driven by io_uring, as an
alternative to the nbd backend for exposing an erofs image as a
read-only block device.

The backend runs one io_uring and one pthread per hardware queue,
uses UBLK_F_USER_COPY to move I/O data through the ublk char device,
and optionally supports unprivileged devices and user recovery.
Signed-off-by: Chengyu Zhu <hudsonzhu at tencent.com>
---
configure.ac | 34 +
lib/Makefile.am | 9 +-
lib/backends/ublk.c | 1429 +++++++++++++++++++++++++++++++++++++++++++
lib/liberofs_ublk.h | 53 ++
4 files changed, 1524 insertions(+), 1 deletion(-)
create mode 100644 lib/backends/ublk.c
create mode 100644 lib/liberofs_ublk.h
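
For reviewers: build with ./configure --enable-ublk (requires liburing >= 2.0).
Below is a minimal, untested sketch (not part of the patch) of how a frontend
is expected to drive the new liberofs_ublk API; my_image_pread(), image_ctx and
image_size stand in for the caller's own erofs plumbing:

    #include "liberofs_ublk.h"

    /* my_image_pread() is a placeholder for the caller's erofs read helper */
    static int ublk_read_handler(void *ctx, struct erofs_ublk_request *req)
    {
            if (req->op != EROFS_UBLK_OP_READ)
                    return -EOPNOTSUPP;
            /* fill req->buf; the queue thread pushes it to the kernel afterwards */
            return my_image_pread(ctx, req->buf,
                                  (u64)req->nr_sectors << 9,
                                  req->start_sector << 9);
    }

    static int serve_image(void *image_ctx, u64 image_size)
    {
            struct erofs_ublk_dev_info info = {
                    .nr_hw_queues = 1,
                    .queue_depth = 128,
                    .max_io_buf_bytes = 512 * 1024,
                    .blkbits = 12,
                    .dev_id = (u32)-1,      /* let the kernel pick an id */
                    .dev_size = image_size, /* in bytes */
            };
            int dev_id, ret;

            ret = erofs_ublk_init();
            if (ret)
                    return ret;             /* no /dev/ublk-control */
            dev_id = erofs_ublk_create_dev(&info, ublk_read_handler, image_ctx);
            if (dev_id < 0)
                    return dev_id;
            ret = erofs_ublk_start(dev_id, -1); /* blocks until stop is requested */
            erofs_ublk_destroy(dev_id);
            return ret;
    }
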
diff --git a/configure.ac b/configure.ac
index 8a8e9b3..08f16e5 100644
--- a/configure.ac
+++ b/configure.ac
@@ -185,6 +185,10 @@ AC_ARG_WITH(libnl3,
[AS_HELP_STRING([--with-libnl3],
[Enable and build with libnl3 support @<:@default=auto@:>@])])
+AC_ARG_ENABLE(ublk,
+ [AS_HELP_STRING([--enable-ublk], [enable ublk block device support @<:@default=no@:>@])],
+ [enable_ublk="$enableval"], [enable_ublk="no"])
+
AC_ARG_ENABLE(s3,
[AS_HELP_STRING([--enable-s3], [enable s3 image generation support @<:@default=no@:>@])],
[enable_s3="$enableval"], [enable_s3="no"])
@@ -727,6 +731,29 @@ AS_IF([test "x$with_libnl3" != "xno"], [
])
])
+# Configure ublk (requires liburing)
+have_ublk="no"
+AS_IF([test "x$enable_ublk" = "xyes"], [
+ PKG_CHECK_MODULES([liburing], [liburing >= 2.0], [
+ # Paranoia: don't trust the result reported by pkgconfig before trying out
+ saved_LIBS="$LIBS"
+ saved_CPPFLAGS=${CPPFLAGS}
+ CPPFLAGS="${liburing_CFLAGS} ${CPPFLAGS}"
+ LIBS="${liburing_LIBS} $LIBS"
+ AC_CHECK_HEADERS([liburing.h],[
+ AC_CHECK_LIB(uring, io_uring_queue_init, [], [
+ AC_MSG_ERROR([liburing doesn't work properly])])
+ AC_CHECK_DECL(io_uring_queue_init, [have_ublk="yes"],
+ [AC_MSG_ERROR([liburing doesn't work properly])], [[
+#include <liburing.h>
+ ]])
+ ])
+ LIBS="${saved_LIBS}"
+ CPPFLAGS="${saved_CPPFLAGS}"], [
+ AC_MSG_ERROR([ublk requires liburing >= 2.0])
+ ])
+])
+
AS_IF([test "x$enable_s3" != "xno"], [
AS_IF(
[test "x$have_libcurl" = "xyes" && \
@@ -766,6 +793,7 @@ AM_CONDITIONAL([ENABLE_LIBXML2], [test "x${have_libxml2}" = "xyes"])
AM_CONDITIONAL([ENABLE_S3], [test "x${have_s3}" = "xyes"])
AM_CONDITIONAL([ENABLE_STATIC_FUSE], [test "x${enable_static_fuse}" = "xyes"])
AM_CONDITIONAL([ENABLE_OCI], [test "x${have_oci}" = "xyes"])
+AM_CONDITIONAL([ENABLE_UBLK], [test "x${have_ublk}" = "xyes"])
if test "x$have_uuid" = "xyes"; then
AC_DEFINE([HAVE_LIBUUID], 1, [Define to 1 if libuuid is found])
@@ -842,6 +870,12 @@ if test "x$have_oci" = "xyes"; then
AC_DEFINE([OCIEROFS_ENABLED], 1, [Define to 1 if OCI registry is enabled])
fi
+if test "x$have_ublk" = "xyes"; then
+ AC_DEFINE([HAVE_LIBURING], 1, [Define to 1 if liburing is found])
+ AC_SUBST([liburing_LIBS])
+ AC_SUBST([liburing_CFLAGS])
+fi
+
# Dump maximum block size
AS_IF([test "x$erofs_cv_max_block_size" = "x"],
[$erofs_cv_max_block_size = 4096], [])
diff --git a/lib/Makefile.am b/lib/Makefile.am
index 77f6fd8..fd94e46 100644
--- a/lib/Makefile.am
+++ b/lib/Makefile.am
@@ -33,7 +33,8 @@ noinst_HEADERS = $(top_srcdir)/include/erofs_fs.h \
$(top_srcdir)/lib/liberofs_gzran.h \
$(top_srcdir)/lib/liberofs_metabox.h \
$(top_srcdir)/lib/liberofs_nbd.h \
- $(top_srcdir)/lib/liberofs_s3.h
+ $(top_srcdir)/lib/liberofs_s3.h \
+ $(top_srcdir)/lib/liberofs_ublk.h
noinst_HEADERS += compressor.h
liberofs_la_SOURCES = config.c io.c cache.c super.c inode.c xattr.c exclude.c \
@@ -89,6 +90,12 @@ liberofs_la_CFLAGS += ${libnl3_CFLAGS}
liberofs_la_LDFLAGS += ${libnl3_LIBS}
liberofs_la_SOURCES += backends/nbd.c
endif
+
+if ENABLE_UBLK
+liberofs_la_CFLAGS += ${liburing_CFLAGS}
+liberofs_la_LDFLAGS += ${liburing_LIBS}
+liberofs_la_SOURCES += backends/ublk.c
+endif
liberofs_la_SOURCES += remotes/oci.c remotes/docker_config.c
liberofs_la_CFLAGS += ${json_c_CFLAGS}
liberofs_la_LDFLAGS += ${json_c_LIBS}
diff --git a/lib/backends/ublk.c b/lib/backends/ublk.c
new file mode 100644
index 0000000..c1ef0a9
--- /dev/null
+++ b/lib/backends/ublk.c
@@ -0,0 +1,1429 @@
+// SPDX-License-Identifier: GPL-2.0+ OR Apache-2.0
+/*
+ * Copyright (C) 2026 Tencent, Inc.
+ * http://www.tencent.com/
+ */
+#define _GNU_SOURCE
+#include <errno.h>
+#include <fcntl.h>
+#include <pthread.h>
+#include <sched.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <sys/resource.h>
+#include <sys/types.h>
+#include <sys/eventfd.h>
+#include <sys/prctl.h>
+#include <unistd.h>
+#include <linux/io_uring.h>
+
+#include "erofs/err.h"
+#include "erofs/print.h"
+#include "liberofs_ublk.h"
+
+#ifdef HAVE_LIBURING
+#include <liburing.h>
+#endif
+
+#ifndef PR_SET_IO_FLUSHER
+#define PR_SET_IO_FLUSHER 49
+#endif
+
+#define UBLK_CMD_GET_QUEUE_AFFINITY 0x01
+#define UBLK_CMD_GET_DEV_INFO 0x02
+#define UBLK_CMD_ADD_DEV 0x04
+#define UBLK_CMD_DEL_DEV 0x05
+#define UBLK_CMD_START_DEV 0x06
+#define UBLK_CMD_STOP_DEV 0x07
+#define UBLK_CMD_SET_PARAMS 0x08
+#define UBLK_CMD_GET_PARAMS 0x09
+#define UBLK_CMD_START_USER_RECOVERY 0x10
+#define UBLK_CMD_END_USER_RECOVERY 0x11
+
+#define UBLK_IO_FETCH_REQ 0x20
+#define UBLK_IO_COMMIT_AND_FETCH_REQ 0x21
+#define UBLK_IO_NEED_GET_DATA 0x22
+
+#define UBLK_U_CMD_GET_QUEUE_AFFINITY \
+ _IOR('u', UBLK_CMD_GET_QUEUE_AFFINITY, struct ublksrv_ctrl_cmd)
+#define UBLK_U_CMD_GET_DEV_INFO \
+ _IOR('u', UBLK_CMD_GET_DEV_INFO, struct ublksrv_ctrl_cmd)
+#define UBLK_U_CMD_ADD_DEV \
+ _IOWR('u', UBLK_CMD_ADD_DEV, struct ublksrv_ctrl_cmd)
+#define UBLK_U_CMD_DEL_DEV \
+ _IOWR('u', UBLK_CMD_DEL_DEV, struct ublksrv_ctrl_cmd)
+#define UBLK_U_CMD_START_DEV \
+ _IOWR('u', UBLK_CMD_START_DEV, struct ublksrv_ctrl_cmd)
+#define UBLK_U_CMD_STOP_DEV \
+ _IOWR('u', UBLK_CMD_STOP_DEV, struct ublksrv_ctrl_cmd)
+#define UBLK_U_CMD_SET_PARAMS \
+ _IOWR('u', UBLK_CMD_SET_PARAMS, struct ublksrv_ctrl_cmd)
+#define UBLK_U_CMD_GET_PARAMS \
+ _IOR('u', UBLK_CMD_GET_PARAMS, struct ublksrv_ctrl_cmd)
+#define UBLK_U_CMD_START_USER_RECOVERY \
+ _IOWR('u', UBLK_CMD_START_USER_RECOVERY, struct ublksrv_ctrl_cmd)
+#define UBLK_U_CMD_END_USER_RECOVERY \
+ _IOWR('u', UBLK_CMD_END_USER_RECOVERY, struct ublksrv_ctrl_cmd)
+#define UBLK_U_CMD_DEL_DEV_ASYNC \
+ _IOR('u', 0x14, struct ublksrv_ctrl_cmd)
+
+#define UBLK_U_IO_FETCH_REQ \
+ _IOWR('u', UBLK_IO_FETCH_REQ, struct ublksrv_io_cmd)
+#define UBLK_U_IO_COMMIT_AND_FETCH_REQ \
+ _IOWR('u', UBLK_IO_COMMIT_AND_FETCH_REQ, struct ublksrv_io_cmd)
+#define UBLK_U_IO_NEED_GET_DATA \
+ _IOWR('u', UBLK_IO_NEED_GET_DATA, struct ublksrv_io_cmd)
+
+#define UBLK_F_URING_CMD_COMP_IN_TASK (1ULL << 1)
+#define UBLK_F_USER_RECOVERY (1ULL << 3)
+#define UBLK_F_UNPRIVILEGED_DEV (1ULL << 5)
+#define UBLK_F_CMD_IOCTL_ENCODE (1ULL << 6)
+#define UBLK_F_USER_COPY (1ULL << 7)
+
+#define UBLK_S_DEV_QUIESCED 2
+#define UBLK_S_DEV_FAIL_IO 3
+
+#define UBLK_IO_RES_NEED_GET_DATA 1
+#define UBLK_IO_RES_ABORT (-ENODEV)
+
+#define UBLKSRV_CMD_BUF_OFFSET 0
+#define UBLKSRV_IO_BUF_OFFSET 0x80000000
+
+#define UBLK_MAX_QUEUE_DEPTH 4096
+
+#define UBLK_IO_BUF_BITS 25
+#define UBLK_IO_BUF_BITS_MASK ((1ULL << UBLK_IO_BUF_BITS) - 1)
+#define UBLK_TAG_OFF UBLK_IO_BUF_BITS
+#define UBLK_TAG_BITS 16
+#define UBLK_QID_OFF (UBLK_TAG_OFF + UBLK_TAG_BITS)
+
+#define UBLK_IO_OP_READ 0
+
+struct ublksrv_ctrl_cmd {
+ __u32 dev_id;
+ __u16 queue_id;
+ __u16 len;
+ __u64 addr;
+ __u64 data[1];
+ __u16 dev_path_len;
+ __u16 pad;
+ __u32 reserved;
+};
+
+struct ublksrv_ctrl_dev_info {
+ __u16 nr_hw_queues;
+ __u16 queue_depth;
+ __u16 state;
+ __u16 pad0;
+ __u32 max_io_buf_bytes;
+ __u32 dev_id;
+ __s32 ublksrv_pid;
+ __u32 pad1;
+ __u64 flags;
+ __u64 ublksrv_flags;
+ __u32 owner_uid;
+ __u32 owner_gid;
+ __u64 reserved1;
+ __u64 reserved2;
+};
+
+struct ublksrv_io_cmd {
+ __u16 q_id;
+ __u16 tag;
+ __s32 result;
+ __u64 addr;
+};
+
+struct ublksrv_io_desc {
+ __u32 op_flags;
+ __u32 nr_sectors;
+ __u64 start_sector;
+ __u64 addr;
+};
+
+#define UBLK_PARAM_TYPE_BASIC (1 << 0)
+
+struct ublk_param_basic {
+#define UBLK_ATTR_READ_ONLY (1 << 0)
+ __u32 attrs;
+ __u8 logical_bs_shift;
+ __u8 physical_bs_shift;
+ __u8 io_opt_shift;
+ __u8 io_min_shift;
+ __u32 max_sectors;
+ __u32 chunk_sectors;
+ __u64 dev_sectors;
+ __u64 virt_boundary_mask;
+};
+
+struct ublk_params {
+ __u32 len;
+ __u32 types;
+ struct ublk_param_basic basic;
+};
+
+
+#ifdef HAVE_LIBURING
+
+#define EROFS_UBLK_DEF_NR_HW_QUEUES 1
+#define EROFS_UBLK_DEF_QUEUE_DEPTH 128
+#define EROFS_UBLK_DEF_MAX_IO_BUF_BYTES (512 * 1024)
+#define EROFS_UBLK_DEF_BLK_BITS 12
+
+#define UBLKSRV_IO_FREE (1U << 0)
+#define UBLKSRV_NEED_COMMIT_RQ_COMP (1U << 2)
+#define UBLKSRV_IO_NEED_GET_DATA (1U << 3)
+
+#define UBLKSRV_QUEUE_STOPPING (1U << 0)
+#define UBLKSRV_QUEUE_IDLE (1U << 1)
+
+struct erofs_ublk_io {
+ unsigned int flags;
+ int result;
+};
+
+struct erofs_ublk_queue {
+ int q_id;
+ int q_depth;
+ struct erofs_ublk_dev *dev;
+ struct ublksrv_io_desc *io_cmd_buf;
+ struct erofs_ublk_io *ios;
+ void *io_buf;
+ size_t io_buf_size;
+ pthread_t thread;
+ unsigned int state;
+ cpu_set_t *cpuset;
+ int idle_ticks;
+ pthread_barrier_t *init_barrier;
+ struct io_uring ring;
+};
+
+struct erofs_ublk_dev {
+ int ctrl_fd;
+ int cdev_fd;
+ struct ublksrv_ctrl_dev_info dev_info;
+ struct ublk_params params;
+ struct erofs_ublk_queue *queues;
+ erofs_ublk_io_handler_t handler;
+ void *handler_ctx;
+ int running;
+ int stop_requested;
+ int stop_efd;
+ int recovering;
+ struct io_uring ctrl_ring;
+ int ctrl_ring_initialized;
+};
+
+#define UBLK_CTRL_DEV "/dev/ublk-control"
+#define UBLK_CDEV_FMT "/dev/ublkc%d"
+
+#define UBLK_IDLE_TIMEOUT_TICKS 200
+
+#define UBLK_MAX_DEVS 128
+static struct erofs_ublk_dev *ublk_devs[UBLK_MAX_DEVS];
+
+static struct erofs_ublk_dev *ublk_get_dev(int dev_id)
+{
+ if (dev_id < 0 || dev_id >= UBLK_MAX_DEVS)
+ return NULL;
+ return ublk_devs[dev_id];
+}
+
+static volatile sig_atomic_t g_sig_dev_id = -1;
+
+static inline __u8 ublksrv_get_op(const struct ublksrv_io_desc *iod)
+{
+ return iod->op_flags & 0xff;
+}
+
+static inline unsigned int ublk_cmd_buf_sz(int q_depth)
+{
+ unsigned int size = q_depth * sizeof(struct ublksrv_io_desc);
+ unsigned int page_size = getpagesize();
+
+ return (size + page_size - 1) & ~(page_size - 1);
+}
+
+static inline void *ublk_get_io_buf(struct erofs_ublk_queue *q, int tag)
+{
+ return (char *)q->io_buf + tag * q->dev->dev_info.max_io_buf_bytes;
+}
+
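+/*
+ * Encode the UBLK_F_USER_COPY file position for /dev/ublkcN: the fixed
+ * UBLKSRV_IO_BUF_OFFSET base plus the (queue, tag, buffer offset) fields,
+ * which the kernel decodes after subtracting the base.
+ */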
+static inline __u64 ublk_pos(__u16 q_id, __u16 tag, __u32 offset)
+{
+ return UBLKSRV_IO_BUF_OFFSET +
+ ((((__u64)q_id) << UBLK_QID_OFF) |
+ (((__u64)tag) << UBLK_TAG_OFF) |
+ ((__u64)(offset & UBLK_IO_BUF_BITS_MASK)));
+}
+
+static void ublk_apply_oom_protection(void)
+{
+ char path[64];
+ int fd;
+
+ snprintf(path, sizeof(path), "/proc/%d/oom_score_adj", getpid());
+ fd = open(path, O_RDWR);
+ if (fd >= 0) {
+ const char *val = "-1000";
+ ssize_t ret = write(fd, val, strlen(val));
+
+ if (ret != (ssize_t)strlen(val))
+ erofs_dbg("failed to set oom_score_adj: %s",
+ strerror(errno));
+ close(fd);
+ }
+}
+
+static void ublk_set_io_flusher(void)
+{
+ if (prctl(PR_SET_IO_FLUSHER, 0, 0, 0, 0) != 0)
+ erofs_dbg("prctl(PR_SET_IO_FLUSHER) failed: %s",
+ strerror(errno));
+}
+
+static int ublk_ctrl_ring_init(struct io_uring *ring)
+{
+ struct io_uring_params p;
+
+ memset(&p, 0, sizeof(p));
+ p.flags = IORING_SETUP_SQE128;
+
+ return io_uring_queue_init_params(4, ring, &p);
+}
+
+static int ublk_ctrl_cmd_ring(struct io_uring *ring, int ctrl_fd,
+ __u32 cmd_op,
+ const struct ublksrv_ctrl_cmd *cmd_data)
+{
+ struct io_uring_sqe *sqe;
+ struct io_uring_cqe *cqe;
+ struct ublksrv_ctrl_cmd *cmd;
+ int ret;
+
+ sqe = io_uring_get_sqe(ring);
+ if (!sqe)
+ return -ENOMEM;
+
+ io_uring_prep_rw(IORING_OP_URING_CMD, sqe, ctrl_fd, NULL, 0, 0);
+ sqe->cmd_op = cmd_op;
+
+ if (cmd_data) {
+ cmd = (struct ublksrv_ctrl_cmd *)sqe->cmd;
+ memcpy(cmd, cmd_data, sizeof(*cmd_data));
+ }
+
+ ret = io_uring_submit(ring);
+ if (ret < 0) {
+ erofs_err("io_uring_submit failed: %s", strerror(-ret));
+ return ret;
+ }
+
+ ret = io_uring_wait_cqe(ring, &cqe);
+ if (ret < 0) {
+ erofs_err("io_uring_wait_cqe failed: %s", strerror(-ret));
+ return ret;
+ }
+
+ ret = cqe->res;
+ io_uring_cqe_seen(ring, cqe);
+ return ret;
+}
+
+static int ublk_ctrl_cmd(int ctrl_fd, __u32 cmd_op,
+ const struct ublksrv_ctrl_cmd *cmd_data)
+{
+ struct io_uring ring;
+ int ret;
+
+ ret = ublk_ctrl_ring_init(&ring);
+ if (ret < 0) {
+ erofs_err("io_uring_queue_init failed: %s", strerror(-ret));
+ return ret;
+ }
+
+ ret = ublk_ctrl_cmd_ring(&ring, ctrl_fd, cmd_op, cmd_data);
+ io_uring_queue_exit(&ring);
+ return ret;
+}
+
+static int ublk_dev_ctrl_cmd(struct erofs_ublk_dev *dev, __u32 cmd_op,
+ const struct ublksrv_ctrl_cmd *cmd_data)
+{
+ return ublk_ctrl_cmd_ring(&dev->ctrl_ring, dev->ctrl_fd,
+ cmd_op, cmd_data);
+}
+
+static int ublk_get_queue_affinity(struct erofs_ublk_dev *dev, int q_id,
+ cpu_set_t *cpuset)
+{
+ struct ublksrv_ctrl_cmd cmd = {0};
+ int ret;
+
+ cmd.dev_id = dev->dev_info.dev_id;
+ cmd.queue_id = q_id;
+ cmd.len = sizeof(cpu_set_t);
+ cmd.addr = (__u64)(uintptr_t)cpuset;
+
+ ret = ublk_dev_ctrl_cmd(dev, UBLK_U_CMD_GET_QUEUE_AFFINITY, &cmd);
+ if (ret < 0)
+ erofs_dbg("GET_QUEUE_AFFINITY failed for q%d: %s",
+ q_id, strerror(-ret));
+ return ret;
+}
+
+static int ublk_add_dev(struct erofs_ublk_dev *dev,
+ const struct erofs_ublk_dev_info *info)
+{
+ struct ublksrv_ctrl_cmd cmd = {0};
+ struct ublksrv_ctrl_dev_info *dev_info = &dev->dev_info;
+ int ret;
+
+ memset(dev_info, 0, sizeof(*dev_info));
+ dev_info->nr_hw_queues = info->nr_hw_queues;
+ dev_info->queue_depth = info->queue_depth;
+ dev_info->max_io_buf_bytes = info->max_io_buf_bytes;
+ dev_info->dev_id = info->dev_id;
+
+ dev_info->flags = UBLK_F_CMD_IOCTL_ENCODE |
+ UBLK_F_URING_CMD_COMP_IN_TASK;
+
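+ /* always request UBLK_F_USER_COPY: I/O data moves via pread/pwrite on /dev/ublkcN */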
+ dev_info->flags |= UBLK_F_USER_COPY;
+
+ if (info->flags & EROFS_UBLK_F_UNPRIVILEGED)
+ dev_info->flags |= UBLK_F_UNPRIVILEGED_DEV;
+ if (info->flags & EROFS_UBLK_F_USER_RECOVERY)
+ dev_info->flags |= UBLK_F_USER_RECOVERY;
+
+ cmd.dev_id = dev_info->dev_id;
+ cmd.queue_id = (__u16)-1;
+ cmd.len = sizeof(*dev_info);
+ cmd.addr = (__u64)(uintptr_t)dev_info;
+
+ ret = ublk_dev_ctrl_cmd(dev, UBLK_U_CMD_ADD_DEV, &cmd);
+ if (ret < 0) {
+ erofs_err("UBLK_CMD_ADD_DEV failed: %s", strerror(-ret));
+ return ret;
+ }
+
+ erofs_info("ublk device %d added (queues=%d, depth=%d, io_buf=%u)",
+ dev_info->dev_id, dev_info->nr_hw_queues,
+ dev_info->queue_depth, dev_info->max_io_buf_bytes);
+ return 0;
+}
+
+static int ublk_del_dev(struct erofs_ublk_dev *dev)
+{
+ struct ublksrv_ctrl_cmd cmd = {0};
+
+ cmd.dev_id = dev->dev_info.dev_id;
+ cmd.queue_id = (__u16)-1;
+
+ return ublk_dev_ctrl_cmd(dev, UBLK_U_CMD_DEL_DEV, &cmd);
+}
+
+static int ublk_set_params(struct erofs_ublk_dev *dev,
+ const struct erofs_ublk_dev_info *info)
+{
+ struct ublksrv_ctrl_cmd cmd = {0};
+ struct ublk_params *params = &dev->params;
+ int ret;
+
+ memset(params, 0, sizeof(*params));
+ params->len = sizeof(*params);
+ params->types = UBLK_PARAM_TYPE_BASIC;
+
+ params->basic.attrs = UBLK_ATTR_READ_ONLY;
+ params->basic.logical_bs_shift = info->blkbits;
+ params->basic.physical_bs_shift = info->blkbits;
+ params->basic.io_opt_shift = info->blkbits;
+ params->basic.io_min_shift = info->blkbits;
+ params->basic.max_sectors = info->max_io_buf_bytes >> 9;
+ params->basic.dev_sectors = info->dev_size >> 9;
+
+ cmd.dev_id = dev->dev_info.dev_id;
+ cmd.queue_id = (__u16)-1;
+ cmd.len = sizeof(*params);
+ cmd.addr = (__u64)(uintptr_t)params;
+
+ ret = ublk_dev_ctrl_cmd(dev, UBLK_U_CMD_SET_PARAMS, &cmd);
+ if (ret < 0)
+ erofs_err("UBLK_CMD_SET_PARAMS failed: %s", strerror(-ret));
+
+ return ret;
+}
+
+static int ublk_start_dev(struct erofs_ublk_dev *dev)
+{
+ struct ublksrv_ctrl_cmd cmd = {0};
+ int ret;
+
+ cmd.dev_id = dev->dev_info.dev_id;
+ cmd.queue_id = (__u16)-1;
+ cmd.data[0] = getpid();
+
+ ret = ublk_dev_ctrl_cmd(dev, UBLK_U_CMD_START_DEV, &cmd);
+ if (ret < 0)
+ erofs_err("UBLK_CMD_START_DEV failed: %s", strerror(-ret));
+ else
+ erofs_info("ublk device /dev/ublkb%d started",
+ dev->dev_info.dev_id);
+
+ return ret;
+}
+
+static int ublk_stop_dev(struct erofs_ublk_dev *dev)
+{
+ struct ublksrv_ctrl_cmd cmd = {0};
+
+ cmd.dev_id = dev->dev_info.dev_id;
+ cmd.queue_id = (__u16)-1;
+
+ return ublk_dev_ctrl_cmd(dev, UBLK_U_CMD_STOP_DEV, &cmd);
+}
+
+static int ublk_start_recovery(struct erofs_ublk_dev *dev)
+{
+ struct ublksrv_ctrl_cmd cmd = {0};
+ int ret;
+
+ cmd.dev_id = dev->dev_info.dev_id;
+ cmd.queue_id = (__u16)-1;
+
+ ret = ublk_dev_ctrl_cmd(dev, UBLK_U_CMD_START_USER_RECOVERY, &cmd);
+ if (ret < 0)
+ erofs_err("START_USER_RECOVERY failed: %s", strerror(-ret));
+ else
+ erofs_info("ublk device %d recovery started",
+ dev->dev_info.dev_id);
+
+ return ret;
+}
+
+static int ublk_end_recovery(struct erofs_ublk_dev *dev)
+{
+ struct ublksrv_ctrl_cmd cmd = {0};
+ int ret;
+
+ cmd.dev_id = dev->dev_info.dev_id;
+ cmd.queue_id = (__u16)-1;
+ cmd.data[0] = getpid();
+
+ ret = ublk_dev_ctrl_cmd(dev, UBLK_U_CMD_END_USER_RECOVERY, &cmd);
+ if (ret < 0)
+ erofs_err("END_USER_RECOVERY failed: %s", strerror(-ret));
+ else
+ erofs_info("ublk device %d recovery completed",
+ dev->dev_info.dev_id);
+
+ return ret;
+}
+
+static int ublk_get_dev_info(struct erofs_ublk_dev *dev, int dev_id)
+{
+ struct ublksrv_ctrl_cmd cmd = {0};
+ int ret;
+
+ cmd.dev_id = dev_id;
+ cmd.queue_id = (__u16)-1;
+ cmd.len = sizeof(dev->dev_info);
+ cmd.addr = (__u64)(uintptr_t)&dev->dev_info;
+
+ ret = ublk_ctrl_cmd(dev->ctrl_fd, UBLK_U_CMD_GET_DEV_INFO, &cmd);
+ if (ret < 0)
+ erofs_err("GET_DEV_INFO failed: %s", strerror(-ret));
+
+ return ret;
+}
+
+static int ublk_get_params(struct erofs_ublk_dev *dev)
+{
+ struct ublksrv_ctrl_cmd cmd = {0};
+ int ret;
+
+ dev->params.len = sizeof(dev->params);
+
+ cmd.dev_id = dev->dev_info.dev_id;
+ cmd.queue_id = (__u16)-1;
+ cmd.len = sizeof(dev->params);
+ cmd.addr = (__u64)(uintptr_t)&dev->params;
+
+ ret = ublk_dev_ctrl_cmd(dev, UBLK_U_CMD_GET_PARAMS, &cmd);
+ if (ret < 0)
+ erofs_err("GET_PARAMS failed: %s", strerror(-ret));
+
+ return ret;
+}
+
+static int ublk_queue_io_cmd(struct erofs_ublk_queue *q, int tag)
+{
+ struct io_uring_sqe *sqe;
+ struct ublksrv_io_cmd *cmd;
+ struct erofs_ublk_io *io = &q->ios[tag];
+ __u32 cmd_op;
+
+ sqe = io_uring_get_sqe(&q->ring);
+ if (!sqe)
+ return -ENOMEM;
+
+ if (io->flags & UBLKSRV_IO_FREE)
+ cmd_op = UBLK_U_IO_FETCH_REQ;
+ else if (io->flags & UBLKSRV_IO_NEED_GET_DATA)
+ cmd_op = UBLK_U_IO_NEED_GET_DATA;
+ else
+ cmd_op = UBLK_U_IO_COMMIT_AND_FETCH_REQ;
+
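+ /* IORING_SETUP_SQE128: each SQE slot is 128 bytes, so clear both halves */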
+ memset(sqe, 0, sizeof(*sqe) * 2);
+
+ sqe->opcode = IORING_OP_URING_CMD;
+ sqe->fd = q->dev->cdev_fd;
+ sqe->cmd_op = cmd_op;
+ sqe->user_data = tag;
+
+ cmd = (struct ublksrv_io_cmd *)sqe->cmd;
+ cmd->q_id = q->q_id;
+ cmd->tag = tag;
+
+ if (cmd_op == UBLK_U_IO_COMMIT_AND_FETCH_REQ)
+ cmd->result = io->result;
+ else
+ cmd->result = 0;
+
+ cmd->addr = 0;
+
+ io->flags = 0;
+
+ return 0;
+}
+
+static int ublk_submit_fetch_commands(struct erofs_ublk_queue *q)
+{
+ int i, ret;
+
+ for (i = 0; i < q->q_depth; i++) {
+ q->ios[i].flags = UBLKSRV_IO_FREE;
+ ret = ublk_queue_io_cmd(q, i);
+ if (ret < 0)
+ return ret;
+ }
+
+ ret = io_uring_submit(&q->ring);
+ if (ret < 0) {
+ erofs_err("io_uring_submit (fetch) failed: %s", strerror(-ret));
+ return ret;
+ }
+
+ return 0;
+}
+
+static int ublk_user_copy_read(struct erofs_ublk_queue *q, int tag, size_t len)
+{
+ __u64 pos = ublk_pos(q->q_id, tag, 0);
+ void *buf = ublk_get_io_buf(q, tag);
+ ssize_t ret;
+
+ ret = pread(q->dev->cdev_fd, buf, len, pos);
+ if (ret < 0) {
+ erofs_err("user_copy pread failed: %s", strerror(errno));
+ return -errno;
+ }
+ if ((size_t)ret != len) {
+ erofs_err("user_copy pread short: %zd/%zu", ret, len);
+ return -EIO;
+ }
+ return 0;
+}
+
+static int ublk_user_copy_write(struct erofs_ublk_queue *q, int tag, size_t len)
+{
+ __u64 pos = ublk_pos(q->q_id, tag, 0);
+ void *buf = ublk_get_io_buf(q, tag);
+ ssize_t ret;
+
+ ret = pwrite(q->dev->cdev_fd, buf, len, pos);
+ if (ret < 0) {
+ erofs_err("user_copy pwrite failed: %s", strerror(errno));
+ return -errno;
+ }
+ if ((size_t)ret != len) {
+ erofs_err("user_copy pwrite short: %zd/%zu", ret, len);
+ return -EIO;
+ }
+ return 0;
+}
+
+static int ublk_handle_io(struct erofs_ublk_queue *q, int tag)
+{
+ struct erofs_ublk_dev *dev = q->dev;
+ const struct ublksrv_io_desc *iod = &q->io_cmd_buf[tag];
+ struct erofs_ublk_io *io = &q->ios[tag];
+ struct erofs_ublk_request req;
+ int ret;
+
+ req.op = ublksrv_get_op(iod);
+ req.start_sector = iod->start_sector;
+ req.nr_sectors = iod->nr_sectors;
+ req.buf = ublk_get_io_buf(q, tag);
+ req.result = 0;
+
+ if (dev->handler) {
+ ret = dev->handler(dev->handler_ctx, &req);
+
+ if (ret < 0) {
+ io->result = ret;
+ } else {
+ size_t io_bytes = req.nr_sectors << 9;
+
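+ /* READ: push the filled buffer back to the kernel via the user-copy interface */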
+ if (req.op == UBLK_IO_OP_READ && io_bytes > 0) {
+ ret = ublk_user_copy_write(q, tag, io_bytes);
+ if (ret < 0)
+ io->result = ret;
+ else
+ io->result = io_bytes;
+ } else {
+ io->result = io_bytes;
+ }
+ }
+ } else {
+ io->result = -EOPNOTSUPP;
+ }
+
+ io->flags = UBLKSRV_NEED_COMMIT_RQ_COMP;
+
+ return 0;
+}
+
+static int ublk_handle_cqe(struct erofs_ublk_queue *q,
+ struct io_uring_cqe *cqe)
+{
+ int tag = cqe->user_data;
+ struct erofs_ublk_io *io = &q->ios[tag];
+ int ret = cqe->res;
+
+ if (ret == UBLK_IO_RES_ABORT)
+ return -ENODEV;
+
+ if (ret == UBLK_IO_RES_NEED_GET_DATA) {
+ const struct ublksrv_io_desc *iod = &q->io_cmd_buf[tag];
+ size_t io_bytes = (size_t)iod->nr_sectors << 9;
+
+ ret = ublk_user_copy_read(q, tag, io_bytes);
+ if (ret < 0) {
+ io->result = ret;
+ io->flags = UBLKSRV_NEED_COMMIT_RQ_COMP;
+ return ublk_queue_io_cmd(q, tag);
+ }
+
+ io->flags = UBLKSRV_IO_NEED_GET_DATA;
+ return ublk_queue_io_cmd(q, tag);
+ }
+
+ if (ret < 0) {
+ erofs_err("queue %d tag %d IO error: %s",
+ q->q_id, tag, strerror(-ret));
+ io->flags = UBLKSRV_IO_FREE;
+ return ublk_queue_io_cmd(q, tag);
+ }
+
+ ublk_handle_io(q, tag);
+
+ return ublk_queue_io_cmd(q, tag);
+}
+
+static void ublk_queue_enter_idle(struct erofs_ublk_queue *q)
+{
+ if (q->state & UBLKSRV_QUEUE_IDLE)
+ return;
+
+ q->state |= UBLKSRV_QUEUE_IDLE;
+
+ if (q->io_buf && q->io_buf_size > 0)
+ madvise(q->io_buf, q->io_buf_size, MADV_DONTNEED);
+
+ erofs_dbg("queue %d entered idle state", q->q_id);
+}
+
+static void ublk_queue_exit_idle(struct erofs_ublk_queue *q)
+{
+ if (!(q->state & UBLKSRV_QUEUE_IDLE))
+ return;
+
+ q->state &= ~UBLKSRV_QUEUE_IDLE;
+ q->idle_ticks = 0;
+ erofs_dbg("queue %d exited idle state", q->q_id);
+}
+
+static void *ublk_queue_thread(void *arg)
+{
+ struct erofs_ublk_queue *q = arg;
+ struct io_uring_cqe *cqe;
+ struct __kernel_timespec ts = {
+ .tv_sec = 0,
+ .tv_nsec = 100000000,
+ };
+ unsigned int head;
+ int ret, nr_events;
+
+ if (q->cpuset)
+ sched_setaffinity(0, sizeof(cpu_set_t), q->cpuset);
+
+ setpriority(PRIO_PROCESS, 0, -20);
+
+ ublk_set_io_flusher();
+
+ ret = ublk_submit_fetch_commands(q);
+ if (ret < 0) {
+ erofs_err("Failed to submit fetch commands: %s",
+ strerror(-ret));
+ if (q->init_barrier)
+ pthread_barrier_wait(q->init_barrier);
+ return NULL;
+ }
+
+ if (q->init_barrier)
+ pthread_barrier_wait(q->init_barrier);
+
+ erofs_info("queue %d thread started (tid=%d)", q->q_id, gettid());
+
+ while (!(q->state & UBLKSRV_QUEUE_STOPPING)) {
+ ret = io_uring_submit_and_wait_timeout(&q->ring, &cqe, 1,
+ &ts, NULL);
+ if (ret == -ETIME) {
+ if (++q->idle_ticks >= UBLK_IDLE_TIMEOUT_TICKS)
+ ublk_queue_enter_idle(q);
+ continue;
+ }
+
+ if (ret < 0) {
+ if (ret == -EINTR)
+ continue;
+ erofs_err("io_uring_submit_and_wait_timeout: %s",
+ strerror(-ret));
+ break;
+ }
+
+ ublk_queue_exit_idle(q);
+
+ nr_events = 0;
+ io_uring_for_each_cqe(&q->ring, head, cqe) {
+ ret = ublk_handle_cqe(q, cqe);
+ nr_events++;
+ if (ret == -ENODEV) {
+ q->state |= UBLKSRV_QUEUE_STOPPING;
+ break;
+ }
+ }
+
+ if (nr_events)
+ io_uring_cq_advance(&q->ring, nr_events);
+
+ io_uring_submit(&q->ring);
+ }
+
+ erofs_info("queue %d thread exiting", q->q_id);
+ __atomic_store_n(&q->dev->stop_requested, 1, __ATOMIC_RELAXED);
+ if (q->dev->stop_efd >= 0) {
+ uint64_t val = 1;
+
+ (void)write(q->dev->stop_efd, &val, sizeof(val));
+ }
+ return NULL;
+}
+
+static int ublk_init_queue(struct erofs_ublk_dev *dev, int q_id)
+{
+ struct erofs_ublk_queue *q = &dev->queues[q_id];
+ struct io_uring_params p;
+ unsigned int cmd_buf_size;
+ int ret;
+
+ q->q_id = q_id;
+ q->q_depth = dev->dev_info.queue_depth;
+ q->dev = dev;
+ q->state = 0;
+ q->idle_ticks = 0;
+
+ q->cpuset = malloc(sizeof(cpu_set_t));
+ if (q->cpuset) {
+ CPU_ZERO(q->cpuset);
+ ret = ublk_get_queue_affinity(dev, q_id, q->cpuset);
+ if (ret < 0) {
+ free(q->cpuset);
+ q->cpuset = NULL;
+ }
+ }
+
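+ /*
+ * Map this queue's ublksrv_io_desc array; the kernel lays queues out at
+ * a fixed stride of UBLK_MAX_QUEUE_DEPTH descriptors from
+ * UBLKSRV_CMD_BUF_OFFSET.
+ */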
+ cmd_buf_size = ublk_cmd_buf_sz(q->q_depth);
+ q->io_cmd_buf = mmap(NULL, cmd_buf_size, PROT_READ,
+ MAP_SHARED | MAP_POPULATE, dev->cdev_fd,
+ UBLKSRV_CMD_BUF_OFFSET +
+ q_id * (UBLK_MAX_QUEUE_DEPTH *
+ sizeof(struct ublksrv_io_desc)));
+ if (q->io_cmd_buf == MAP_FAILED) {
+ erofs_err("mmap io_cmd_buf failed: %s", strerror(errno));
+ ret = -errno;
+ goto err_free_cpuset;
+ }
+
+ q->ios = calloc(q->q_depth, sizeof(struct erofs_ublk_io));
+ if (!q->ios) {
+ ret = -ENOMEM;
+ goto err_unmap_cmd;
+ }
+
+ q->io_buf_size = (size_t)q->q_depth * dev->dev_info.max_io_buf_bytes;
+ q->io_buf = mmap(NULL, q->io_buf_size, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+ if (q->io_buf == MAP_FAILED) {
+ erofs_err("mmap io_buf failed: %s", strerror(errno));
+ ret = -errno;
+ goto err_free_ios;
+ }
+
+ memset(&p, 0, sizeof(p));
+ p.flags = IORING_SETUP_SQE128;
+
+ ret = io_uring_queue_init_params(q->q_depth * 2, &q->ring, &p);
+ if (ret < 0) {
+ erofs_err("io_uring_queue_init failed: %s", strerror(-ret));
+ goto err_unmap_buf;
+ }
+
+ return 0;
+
+err_unmap_buf:
+ munmap(q->io_buf, q->io_buf_size);
+err_free_ios:
+ free(q->ios);
+err_unmap_cmd:
+ munmap(q->io_cmd_buf, cmd_buf_size);
+err_free_cpuset:
+ free(q->cpuset);
+ return ret;
+}
+
+static void ublk_cleanup_queue(struct erofs_ublk_dev *dev, int q_id)
+{
+ struct erofs_ublk_queue *q = &dev->queues[q_id];
+
+ q->state |= UBLKSRV_QUEUE_STOPPING;
+
+ if (q->thread) {
+ pthread_join(q->thread, NULL);
+ q->thread = 0;
+ }
+
+ io_uring_queue_exit(&q->ring);
+
+ if (q->io_buf && q->io_buf != MAP_FAILED)
+ munmap(q->io_buf, q->io_buf_size);
+
+ free(q->ios);
+
+ if (q->io_cmd_buf && q->io_cmd_buf != MAP_FAILED)
+ munmap(q->io_cmd_buf, ublk_cmd_buf_sz(q->q_depth));
+
+ free(q->cpuset);
+}
+
+static int erofs_ublk_stop(int dev_id)
+{
+ struct erofs_ublk_dev *dev = ublk_get_dev(dev_id);
+ int i;
+
+ if (!dev)
+ return -EINVAL;
+
+ __atomic_store_n(&dev->stop_requested, 1, __ATOMIC_RELAXED);
+
+ if (dev->stop_efd >= 0) {
+ uint64_t val = 1;
+
+ if (write(dev->stop_efd, &val, sizeof(val)) != sizeof(val))
+ erofs_dbg("stop_efd write failed");
+ }
+
+ for (i = 0; i < dev->dev_info.nr_hw_queues; i++)
+ dev->queues[i].state |= UBLKSRV_QUEUE_STOPPING;
+
+ if (__atomic_load_n(&dev->running, __ATOMIC_RELAXED))
+ ublk_stop_dev(dev);
+
+ __atomic_store_n(&dev->running, 0, __ATOMIC_RELAXED);
+ return 0;
+}
+
+static void ublk_sig_handler(int sig)
+{
+ if (sig == SIGTERM || sig == SIGINT) {
+ erofs_info("received signal %d, stopping ublk device",
+ sig);
+ if (g_sig_dev_id >= 0)
+ erofs_ublk_stop(g_sig_dev_id);
+ }
+}
+
+static int ublk_install_sig_handler(int dev_id)
+{
+ struct sigaction sa;
+
+ g_sig_dev_id = dev_id;
+
+ memset(&sa, 0, sizeof(sa));
+ sa.sa_handler = ublk_sig_handler;
+ sigemptyset(&sa.sa_mask);
+ sa.sa_flags = 0;
+
+ if (sigaction(SIGTERM, &sa, NULL) < 0) {
+ erofs_err("sigaction(SIGTERM) failed: %s",
+ strerror(errno));
+ return -errno;
+ }
+
+ if (sigaction(SIGINT, &sa, NULL) < 0) {
+ erofs_err("sigaction(SIGINT) failed: %s",
+ strerror(errno));
+ return -errno;
+ }
+ return 0;
+}
+
+int erofs_ublk_init(void)
+{
+ if (access(UBLK_CTRL_DEV, F_OK) != 0)
+ return -EOPNOTSUPP;
+ return 0;
+}
+
+int erofs_ublk_create_dev(const struct erofs_ublk_dev_info *info,
+ erofs_ublk_io_handler_t handler,
+ void *handler_ctx)
+{
+ struct erofs_ublk_dev *dev;
+ struct erofs_ublk_dev_info def_info;
+ char cdev_path[64];
+ int i, ret, dev_id;
+
+ if (!info) {
+ memset(&def_info, 0, sizeof(def_info));
+ def_info.nr_hw_queues = EROFS_UBLK_DEF_NR_HW_QUEUES;
+ def_info.queue_depth = EROFS_UBLK_DEF_QUEUE_DEPTH;
+ def_info.max_io_buf_bytes = EROFS_UBLK_DEF_MAX_IO_BUF_BYTES;
+ def_info.blkbits = EROFS_UBLK_DEF_BLK_BITS;
+ def_info.dev_id = (__u32)-1;
+ info = &def_info;
+ }
+
+ dev = calloc(1, sizeof(*dev));
+ if (!dev)
+ return -ENOMEM;
+
+ dev->handler = handler;
+ dev->handler_ctx = handler_ctx;
+ dev->ctrl_fd = -1;
+ dev->cdev_fd = -1;
+ dev->stop_efd = -1;
+
+ dev->ctrl_fd = open(UBLK_CTRL_DEV, O_RDWR);
+ if (dev->ctrl_fd < 0) {
+ ret = -errno;
+ erofs_err("failed to open %s: %s",
+ UBLK_CTRL_DEV, strerror(errno));
+ goto err_free;
+ }
+
+ ret = ublk_ctrl_ring_init(&dev->ctrl_ring);
+ if (ret < 0) {
+ erofs_err("ctrl ring init failed: %s",
+ strerror(-ret));
+ goto err_close_ctrl;
+ }
+ dev->ctrl_ring_initialized = 1;
+
+ dev->stop_efd = eventfd(0, EFD_CLOEXEC);
+ if (dev->stop_efd < 0)
+ erofs_dbg("stop eventfd creation failed: %s",
+ strerror(errno));
+
+ ret = ublk_add_dev(dev, info);
+ if (ret < 0)
+ goto err_close_ctrl;
+
+ snprintf(cdev_path, sizeof(cdev_path), UBLK_CDEV_FMT,
+ dev->dev_info.dev_id);
+ dev->cdev_fd = open(cdev_path, O_RDWR);
+ if (dev->cdev_fd < 0) {
+ ret = -errno;
+ erofs_err("failed to open %s: %s",
+ cdev_path, strerror(errno));
+ goto err_del_dev;
+ }
+
+ ret = ublk_set_params(dev, info);
+ if (ret < 0)
+ goto err_close_cdev;
+
+ dev->queues = calloc(dev->dev_info.nr_hw_queues,
+ sizeof(struct erofs_ublk_queue));
+ if (!dev->queues) {
+ ret = -ENOMEM;
+ goto err_close_cdev;
+ }
+
+ for (i = 0; i < dev->dev_info.nr_hw_queues; i++) {
+ ret = ublk_init_queue(dev, i);
+ if (ret < 0)
+ goto err_cleanup_queues;
+ }
+
+ dev_id = dev->dev_info.dev_id;
+ if (dev_id < 0 || dev_id >= UBLK_MAX_DEVS) {
+ erofs_err("kernel assigned dev_id %d out of range",
+ dev_id);
+ ret = -ERANGE;
+ goto err_cleanup_queues;
+ }
+ ublk_devs[dev_id] = dev;
+ return dev_id;
+
+err_cleanup_queues:
+ for (i = i - 1; i >= 0; i--)
+ ublk_cleanup_queue(dev, i);
+ free(dev->queues);
+err_close_cdev:
+ close(dev->cdev_fd);
+err_del_dev:
+ ublk_del_dev(dev);
+err_close_ctrl:
+ if (dev->ctrl_ring_initialized)
+ io_uring_queue_exit(&dev->ctrl_ring);
+ if (dev->stop_efd >= 0)
+ close(dev->stop_efd);
+ close(dev->ctrl_fd);
+err_free:
+ free(dev);
+ return ret;
+}
+
+void erofs_ublk_destroy(int dev_id)
+{
+ struct erofs_ublk_dev *dev = ublk_get_dev(dev_id);
+ int i;
+
+ if (!dev)
+ return;
+
+ erofs_ublk_stop(dev_id);
+
+ if (dev->queues) {
+ for (i = 0; i < dev->dev_info.nr_hw_queues; i++)
+ ublk_cleanup_queue(dev, i);
+ free(dev->queues);
+ }
+
+ if (dev->cdev_fd >= 0)
+ close(dev->cdev_fd);
+
+ if (dev->ctrl_fd >= 0) {
+ ublk_del_dev(dev);
+ close(dev->ctrl_fd);
+ }
+
+ if (dev->ctrl_ring_initialized)
+ io_uring_queue_exit(&dev->ctrl_ring);
+
+ if (dev->stop_efd >= 0)
+ close(dev->stop_efd);
+
+ ublk_devs[dev_id] = NULL;
+ free(dev);
+}
+
+int erofs_ublk_start(int dev_id, int ready_fd)
+{
+ struct erofs_ublk_dev *dev = ublk_get_dev(dev_id);
+ pthread_barrier_t init_barrier;
+ int i, ret;
+
+ if (!dev)
+ return -EINVAL;
+
+ ublk_install_sig_handler(dev_id);
+ ublk_apply_oom_protection();
+
+ ret = pthread_barrier_init(&init_barrier,
+ NULL, dev->dev_info.nr_hw_queues + 1);
+ if (ret) {
+ erofs_err("pthread_barrier_init failed: %s", strerror(ret));
+ return -ret;
+ }
+
+ for (i = 0; i < dev->dev_info.nr_hw_queues; i++) {
+ dev->queues[i].init_barrier = &init_barrier;
+ ret = pthread_create(&dev->queues[i].thread, NULL,
+ ublk_queue_thread, &dev->queues[i]);
+ if (ret) {
+ erofs_err("pthread_create failed: %s", strerror(ret));
+ goto err_stop_threads;
+ }
+ }
+
+ pthread_barrier_wait(&init_barrier);
+ pthread_barrier_destroy(&init_barrier);
+ for (i = 0; i < dev->dev_info.nr_hw_queues; i++)
+ dev->queues[i].init_barrier = NULL;
+
+ if (dev->recovering)
+ ret = ublk_end_recovery(dev);
+ else
+ ret = ublk_start_dev(dev);
+ if (ret < 0)
+ goto err_stop_threads;
+
+ __atomic_store_n(&dev->running, 1, __ATOMIC_RELAXED);
+
+ if (!dev->recovering && ready_fd >= 0) {
+ char ready = 0;
+
+ if (write(ready_fd, &ready, 1) != 1)
+ erofs_dbg("ready_fd write failed");
+ close(ready_fd);
+ }
+ erofs_info("ublk device %s successfully",
+ dev->recovering ? "recovery completed" : "started");
+
+ if (dev->stop_efd >= 0) {
+ uint64_t val;
+
+ while (!__atomic_load_n(&dev->stop_requested, __ATOMIC_RELAXED)) {
+ if (read(dev->stop_efd, &val, sizeof(val)) < 0) {
+ if (errno == EINTR)
+ continue;
+ break;
+ }
+ break;
+ }
+ } else {
+ while (!__atomic_load_n(&dev->stop_requested, __ATOMIC_RELAXED))
+ usleep(100000);
+ }
+
+ return 0;
+
+err_stop_threads:
+ pthread_barrier_destroy(&init_barrier);
+ for (i = 0; i < dev->dev_info.nr_hw_queues; i++) {
+ dev->queues[i].init_barrier = NULL;
+ if (dev->queues[i].thread) {
+ dev->queues[i].state |= UBLKSRV_QUEUE_STOPPING;
+ pthread_join(dev->queues[i].thread, NULL);
+ dev->queues[i].thread = 0;
+ }
+ }
+ return ret;
+}
+
+int erofs_ublk_recover_dev(int dev_id,
+ erofs_ublk_io_handler_t handler,
+ void *handler_ctx)
+{
+ struct erofs_ublk_dev *dev;
+ char cdev_path[64];
+ int i, ret;
+
+ if (dev_id < 0)
+ return -EINVAL;
+
+ dev = calloc(1, sizeof(*dev));
+ if (!dev)
+ return -ENOMEM;
+
+ dev->handler = handler;
+ dev->handler_ctx = handler_ctx;
+ dev->ctrl_fd = -1;
+ dev->cdev_fd = -1;
+ dev->stop_efd = -1;
+
+ dev->ctrl_fd = open(UBLK_CTRL_DEV, O_RDWR);
+ if (dev->ctrl_fd < 0) {
+ ret = -errno;
+ erofs_err("failed to open %s: %s",
+ UBLK_CTRL_DEV, strerror(errno));
+ goto err_free;
+ }
+
+ ret = ublk_ctrl_ring_init(&dev->ctrl_ring);
+ if (ret < 0) {
+ erofs_err("ctrl ring init failed: %s",
+ strerror(-ret));
+ goto err_close_ctrl;
+ }
+ dev->ctrl_ring_initialized = 1;
+
+ dev->stop_efd = eventfd(0, EFD_CLOEXEC);
+ if (dev->stop_efd < 0)
+ erofs_dbg("stop eventfd creation failed: %s",
+ strerror(errno));
+
+ ret = ublk_get_dev_info(dev, dev_id);
+ if (ret < 0)
+ goto err_close_ctrl;
+
+ if (!(dev->dev_info.flags & UBLK_F_USER_RECOVERY)) {
+ erofs_err("Device %d does not support user recovery", dev_id);
+ ret = -EOPNOTSUPP;
+ goto err_close_ctrl;
+ }
+
+ if (dev->dev_info.state != UBLK_S_DEV_QUIESCED &&
+ dev->dev_info.state != UBLK_S_DEV_FAIL_IO) {
+ erofs_err("Device %d is not in recoverable state (state=%d)",
+ dev_id, dev->dev_info.state);
+ ret = -EBUSY;
+ goto err_close_ctrl;
+ }
+
+ ret = ublk_get_params(dev);
+ if (ret < 0)
+ goto err_close_ctrl;
+
+ ret = ublk_start_recovery(dev);
+ if (ret < 0)
+ goto err_close_ctrl;
+
+ snprintf(cdev_path, sizeof(cdev_path), UBLK_CDEV_FMT, dev_id);
+ dev->cdev_fd = open(cdev_path, O_RDWR);
+ if (dev->cdev_fd < 0) {
+ ret = -errno;
+ erofs_err("Failed to open %s: %s", cdev_path, strerror(errno));
+ goto err_close_ctrl;
+ }
+
+ dev->queues = calloc(dev->dev_info.nr_hw_queues,
+ sizeof(struct erofs_ublk_queue));
+ if (!dev->queues) {
+ ret = -ENOMEM;
+ goto err_close_cdev;
+ }
+
+ for (i = 0; i < dev->dev_info.nr_hw_queues; i++) {
+ ret = ublk_init_queue(dev, i);
+ if (ret < 0)
+ goto err_cleanup_queues;
+ }
+
+ dev->recovering = 1;
+ ublk_devs[dev_id] = dev;
+ return 0;
+
+err_cleanup_queues:
+ for (i = i - 1; i >= 0; i--)
+ ublk_cleanup_queue(dev, i);
+ free(dev->queues);
+err_close_cdev:
+ close(dev->cdev_fd);
+err_close_ctrl:
+ if (dev->ctrl_ring_initialized)
+ io_uring_queue_exit(&dev->ctrl_ring);
+ if (dev->stop_efd >= 0)
+ close(dev->stop_efd);
+ close(dev->ctrl_fd);
+err_free:
+ free(dev);
+ return ret;
+}
+
+int erofs_ublk_is_recoverable(int dev_id)
+{
+ struct erofs_ublk_dev dev;
+ int ctrl_fd, ret;
+
+ ctrl_fd = open(UBLK_CTRL_DEV, O_RDWR);
+ if (ctrl_fd < 0)
+ return 0;
+
+ memset(&dev, 0, sizeof(dev));
+ dev.ctrl_fd = ctrl_fd;
+
+ ret = ublk_get_dev_info(&dev, dev_id);
+ close(ctrl_fd);
+
+ if (ret < 0)
+ return 0;
+
+ if ((dev.dev_info.flags & UBLK_F_USER_RECOVERY) &&
+ dev.dev_info.state == UBLK_S_DEV_QUIESCED)
+ return 1;
+
+ return 0;
+}
+
+int erofs_ublk_del_dev_by_id(int dev_id)
+{
+ struct ublksrv_ctrl_cmd cmd = {0};
+ int ctrl_fd, ret;
+
+ ctrl_fd = open(UBLK_CTRL_DEV, O_RDWR);
+ if (ctrl_fd < 0)
+ return -errno;
+
+ cmd.dev_id = dev_id;
+ cmd.queue_id = (__u16)-1;
+
+ ret = ublk_ctrl_cmd(ctrl_fd, UBLK_U_CMD_STOP_DEV, &cmd);
+ if (ret < 0 && ret != -ENODEV)
+ erofs_dbg("STOP_DEV %d: %s", dev_id, strerror(-ret));
+
+ ret = ublk_ctrl_cmd(ctrl_fd, UBLK_U_CMD_DEL_DEV_ASYNC, &cmd);
+ close(ctrl_fd);
+ return ret;
+}
+
+#else /* !HAVE_LIBURING */
+
+int erofs_ublk_init(void)
+{
+ return -EOPNOTSUPP;
+}
+
+int erofs_ublk_create_dev(const struct erofs_ublk_dev_info *info,
+ erofs_ublk_io_handler_t handler,
+ void *handler_ctx)
+{
+ (void)info;
+ (void)handler;
+ (void)handler_ctx;
+ return -EOPNOTSUPP;
+}
+
+void erofs_ublk_destroy(int dev_id)
+{
+ (void)dev_id;
+}
+
+int erofs_ublk_start(int dev_id, int ready_fd)
+{
+ (void)dev_id;
+ (void)ready_fd;
+ return -EOPNOTSUPP;
+}
+
+int erofs_ublk_recover_dev(int dev_id,
+ erofs_ublk_io_handler_t handler,
+ void *handler_ctx)
+{
+ (void)dev_id;
+ (void)handler;
+ (void)handler_ctx;
+ return -EOPNOTSUPP;
+}
+
+int erofs_ublk_is_recoverable(int dev_id)
+{
+ (void)dev_id;
+ return 0;
+}
+
+int erofs_ublk_del_dev_by_id(int dev_id)
+{
+ (void)dev_id;
+ return -EOPNOTSUPP;
+}
+
+#endif /* HAVE_LIBURING */
diff --git a/lib/liberofs_ublk.h b/lib/liberofs_ublk.h
new file mode 100644
index 0000000..5aef9c3
--- /dev/null
+++ b/lib/liberofs_ublk.h
@@ -0,0 +1,53 @@
+/* SPDX-License-Identifier: GPL-2.0+ OR Apache-2.0 */
+/*
+ * Copyright (C) 2026 Tencent, Inc.
+ * http://www.tencent.com/
+ */
+#ifndef __EROFS_LIB_LIBEROFS_UBLK_H
+#define __EROFS_LIB_LIBEROFS_UBLK_H
+
+#include "erofs/defs.h"
+
+#define EROFS_UBLK_F_UNPRIVILEGED (1U << 0)
+#define EROFS_UBLK_F_USER_RECOVERY (1U << 1)
+
+#define EROFS_UBLK_OP_READ 0
+
+struct erofs_ublk_dev_info {
+ u16 nr_hw_queues;
+ u16 queue_depth;
+ u32 max_io_buf_bytes;
+ u32 dev_id;
+ u64 dev_size;
+ u8 blkbits;
+ u8 reserved[3];
+ u32 flags;
+};
+
+struct erofs_ublk_request {
+ u8 op;
+ u64 start_sector;
+ u32 nr_sectors;
+ void *buf;
+ int result;
+};
+
+typedef int (*erofs_ublk_io_handler_t)(void *ctx,
+ struct erofs_ublk_request *req);
+
+int erofs_ublk_init(void);
+
+int erofs_ublk_create_dev(const struct erofs_ublk_dev_info *info,
+ erofs_ublk_io_handler_t handler,
+ void *handler_ctx);
+int erofs_ublk_start(int dev_id, int ready_fd);
+void erofs_ublk_destroy(int dev_id);
+
+int erofs_ublk_del_dev_by_id(int dev_id);
+
+int erofs_ublk_recover_dev(int dev_id,
+ erofs_ublk_io_handler_t handler,
+ void *handler_ctx);
+int erofs_ublk_is_recoverable(int dev_id);
+
+#endif
--
2.47.1