[PATCH] fsck.erofs: introduce multi-threaded decompression PoC with pcluster batching
Nithurshen
nithurshen.dev at gmail.com
Mon Mar 2 18:32:16 AEDT 2026
This is a Proof of Concept to introduce a scalable, multi-threaded
decompression framework into fsck.erofs to reduce extraction time.
Baseline Profiling:
Using the Linux 6.7 kernel source packed with LZ4HC (4K pclusters),
perf showed a strictly synchronous execution path. The main thread
spent ~52% of its time in LZ4_decompress_safe, heavily blocked by
synchronous I/O (~32% in el0_svc/vfs_read).
First Iteration (Naive Workqueue):
A standard producer-consumer workqueue overlapping compute with pwrite()
suffered massive scheduling overhead. For 4KB LZ4 clusters, workers
spent ~44% of CPU time spinning on __arm64_sys_futex and try_to_wake_up.
Current PoC (Dynamic Pcluster Batching):
To eliminate lock contention, this patch introduces a batching context.
Instead of queuing 1 pcluster per task, the main thread collects an
array of sequential pclusters (Z_EROFS_PCLUSTER_BATCH_SIZE = 32) before
submitting a single erofs_work unit.
Results:
- Scheduling overhead (futex) dropped significantly.
- Workers stay cache-hot, decompressing 32 blocks per wakeup.
- LZ4_decompress_safe is successfully offloaded to background cores
(~18.8% self-execution time), completely decoupled from main thread I/O.
Signed-off-by: Nithurshen <nithurshen.dev at gmail.com>
---
fsck/main.c | 144 ++++++++++++-------------------
include/erofs/internal.h | 15 +++-
lib/data.c | 182 +++++++++++++++++++++++++++++++--------
3 files changed, 217 insertions(+), 124 deletions(-)
diff --git a/fsck/main.c b/fsck/main.c
index ab697be..1b6db42 100644
--- a/fsck/main.c
+++ b/fsck/main.c
@@ -15,6 +15,9 @@
#include "erofs/xattr.h"
#include "../lib/compressor.h"
#include "../lib/liberofs_compress.h"
+#include "erofs/workqueue.h"
+
+extern struct erofs_workqueue erofs_wq;
static int erofsfsck_check_inode(erofs_nid_t pnid, erofs_nid_t nid);
@@ -493,135 +496,96 @@ out:
static int erofs_verify_inode_data(struct erofs_inode *inode, int outfd)
{
/*
 * Review: walks every extent of @inode, accumulating the physical
 * (compressed) size and, when extraction/decode is requested, handing
 * compressed pclusters to workqueue threads in batches via
 * z_erofs_read_one_data(); workers decompress and pwrite() to @outfd.
 */
- struct erofs_map_blocks map = {
- .buf = __EROFS_BUF_INITIALIZER,
- };
+ struct erofs_map_blocks map = { .buf = __EROFS_BUF_INITIALIZER };
bool needdecode = fsckcfg.check_decomp && !erofs_is_packed_inode(inode);
int ret = 0;
- bool compressed;
+ bool compressed = erofs_inode_is_data_compressed(inode->datalayout);
erofs_off_t pos = 0;
u64 pchunk_len = 0;
- unsigned int raw_size = 0, buffer_size = 0;
- char *raw = NULL, *buffer = NULL;
- erofs_dbg("verify data chunk of nid(%llu): type(%d)",
- inode->nid | 0ULL, inode->datalayout);
+ struct z_erofs_read_ctx ctx = {
+ .pending_tasks = 0,
+ .final_err = 0,
+ .outfd = outfd,
+ .current_task = NULL
+ };
+ pthread_mutex_init(&ctx.lock, NULL);
+ pthread_cond_init(&ctx.cond, NULL);
- compressed = erofs_inode_is_data_compressed(inode->datalayout);
- while (pos < inode->i_size) {
- unsigned int alloc_rawsize;
+ erofs_dbg("verify data chunk of nid(%llu): type(%d)", inode->nid | 0ULL, inode->datalayout);
+ while (pos < inode->i_size) {
map.m_la = pos;
ret = erofs_map_blocks(inode, &map, EROFS_GET_BLOCKS_FIEMAP);
- if (ret)
- goto out;
+ if (ret) goto out;
/*
 * NOTE(review): the old `!compressed && m_llen != m_plen` corruption
 * check below is removed by this patch, so a broken uncompressed chunk
 * length is no longer reported as -EFSCORRUPTED here.
 */
- if (!compressed && map.m_llen != map.m_plen) {
- erofs_err("broken chunk length m_la %" PRIu64 " m_llen %" PRIu64 " m_plen %" PRIu64,
- map.m_la, map.m_llen, map.m_plen);
- ret = -EFSCORRUPTED;
- goto out;
- }
-
- /* the last lcluster can be divided into 3 parts */
if (map.m_la + map.m_llen > inode->i_size)
map.m_llen = inode->i_size - map.m_la;
pchunk_len += map.m_plen;
pos += map.m_llen;
- /* should skip decomp? */
if (map.m_la >= inode->i_size || !needdecode)
continue;
/*
 * NOTE(review): hole handling no longer lseek()s; it relies on workers
 * pwrite()ing at m_la offsets plus the final ftruncate() to i_size.
 * Confirm this is correct when outfd refers to a pre-existing file.
 */
if (outfd >= 0 && !(map.m_flags & EROFS_MAP_MAPPED)) {
- ret = lseek(outfd, map.m_llen, SEEK_CUR);
- if (ret < 0) {
- ret = -errno;
- goto out;
- }
continue;
}
/*
 * NOTE(review): the Z_EROFS_PCLUSTER_MAX_SIZE validation removed below
 * means a corrupted (huge) m_plen now feeds malloc() directly with no
 * -EFSCORRUPTED rejection — the bound check should be re-added.
 */
- if (map.m_plen > Z_EROFS_PCLUSTER_MAX_SIZE) {
- if (compressed && !(map.m_flags & __EROFS_MAP_FRAGMENT)) {
- erofs_err("invalid pcluster size %" PRIu64 " @ offset %" PRIu64 " of nid %" PRIu64,
- map.m_plen, map.m_la,
- inode->nid | 0ULL);
- ret = -EFSCORRUPTED;
- goto out;
- }
- alloc_rawsize = Z_EROFS_PCLUSTER_MAX_SIZE;
- } else {
- alloc_rawsize = map.m_plen;
- }
-
- if (alloc_rawsize > raw_size) {
- char *newraw = realloc(raw, alloc_rawsize);
-
- if (!newraw) {
+ if (compressed) {
+ char *raw = malloc(map.m_plen);
+ size_t buffer_size = map.m_llen > erofs_blksiz(inode->sbi) ? map.m_llen : erofs_blksiz(inode->sbi);
+ char *buffer = malloc(buffer_size);
+ if (!raw || !buffer) {
+ free(raw); free(buffer);
ret = -ENOMEM;
goto out;
}
- raw = newraw;
- raw_size = alloc_rawsize;
- }
- if (compressed) {
- if (map.m_llen > buffer_size) {
- char *newbuffer;
-
- buffer_size = map.m_llen;
- newbuffer = realloc(buffer, buffer_size);
- if (!newbuffer) {
- ret = -ENOMEM;
- goto out;
- }
- buffer = newbuffer;
/*
 * Throttle: flush the in-fill batch and drain when the global queue is
 * deep. NOTE(review): z_erofs_read_ctx_enqueue() is invoked while
 * ctx.lock is held — confirm erofs_queue_work() never blocks on a lock
 * that workers acquire after taking ctx.lock, or this can deadlock.
 */
+ pthread_mutex_lock(&ctx.lock);
+ if (erofs_wq.job_count > 128) {
+ z_erofs_read_ctx_enqueue(&ctx);
+ while (ctx.pending_tasks > 0)
+ pthread_cond_wait(&ctx.cond, &ctx.lock);
}
- ret = z_erofs_read_one_data(inode, &map, raw, buffer,
- 0, map.m_llen, false);
- if (ret)
+ pthread_mutex_unlock(&ctx.lock);
+
/* On success, ownership of raw/buffer transfers to the batch task. */
+ ret = z_erofs_read_one_data(inode, &map, raw, buffer, 0, map.m_llen, false, map.m_la, &ctx);
+ if (ret) {
+ free(raw); free(buffer);
goto out;
+ }
- if (outfd >= 0 && write(outfd, buffer, map.m_llen) < 0)
- goto fail_eio;
} else {
- u64 p = 0;
-
- do {
- u64 count = min_t(u64, alloc_rawsize,
- map.m_llen);
-
- ret = erofs_read_one_data(inode, &map, raw, p, count);
- if (ret)
- goto out;
-
- if (outfd >= 0 && write(outfd, raw, count) < 0)
- goto fail_eio;
- map.m_llen -= count;
- p += count;
- } while (map.m_llen);
/*
 * NOTE(review): malloc() result is unchecked here (NULL deref on OOM,
 * and m_llen is unbounded since the chunked alloc_rawsize loop was
 * dropped); the pwrite() return value is also ignored, so short writes
 * and EIO — which the old code reported via fail_eio — go unnoticed.
 */
+ char *raw = malloc(map.m_llen);
+ ret = erofs_read_one_data(inode, &map, raw, 0, map.m_llen);
+ if (ret == 0 && outfd >= 0)
+ pwrite(outfd, raw, map.m_llen, map.m_la);
+ free(raw);
+ if (ret) goto out;
}
}
/*
 * BUG(review): every `goto out` above jumps PAST this flush, so a
 * partially-filled batch is never queued while ctx.pending_tasks still
 * counts it — the wait loop below then blocks forever. Any error path
 * with a partial batch deadlocks; the flush must move after `out:`.
 */
+ z_erofs_read_ctx_enqueue(&ctx);
+
+out:
+ pthread_mutex_lock(&ctx.lock);
+ while (ctx.pending_tasks > 0)
+ pthread_cond_wait(&ctx.cond, &ctx.lock);
+ if (ctx.final_err < 0 && ret == 0)
+ ret = ctx.final_err;
+ pthread_mutex_unlock(&ctx.lock);
if (fsckcfg.print_comp_ratio) {
if (!erofs_is_packed_inode(inode))
fsckcfg.logical_blocks += BLK_ROUND_UP(inode->sbi, inode->i_size);
fsckcfg.physical_blocks += BLK_ROUND_UP(inode->sbi, pchunk_len);
}
-out:
- if (raw)
- free(raw);
- if (buffer)
- free(buffer);
- return ret < 0 ? ret : 0;
-fail_eio:
- erofs_err("I/O error occurred when verifying data chunk @ nid %llu",
- inode->nid | 0ULL);
- ret = -EIO;
- goto out;
/* NOTE(review): ftruncate() return value is ignored. */
+ if (outfd >= 0 && ret == 0)
+ ftruncate(outfd, inode->i_size);
+ pthread_mutex_destroy(&ctx.lock);
+ pthread_cond_destroy(&ctx.cond);
+ return ret < 0 ? ret : 0;
}
static inline int erofs_extract_dir(struct erofs_inode *inode)
@@ -1019,6 +983,8 @@ int main(int argc, char *argv[])
erofs_init_configure();
+ erofs_alloc_workqueue(&erofs_wq, 4, 256, NULL, NULL);
+
fsckcfg.physical_blocks = 0;
fsckcfg.logical_blocks = 0;
fsckcfg.extract_path = NULL;
diff --git a/include/erofs/internal.h b/include/erofs/internal.h
index e741f1c..de9ac49 100644
--- a/include/erofs/internal.h
+++ b/include/erofs/internal.h
@@ -63,6 +63,8 @@ struct erofs_buf {
#define BLK_ROUND_UP(sbi, addr) \
(roundup(addr, erofs_blksiz(sbi)) >> (sbi)->blkszbits)
+#define Z_EROFS_PCLUSTER_BATCH_SIZE 32
+
struct erofs_buffer_head;
struct erofs_bufmgr;
@@ -475,9 +477,20 @@ int erofs_map_blocks(struct erofs_inode *inode,
int erofs_map_dev(struct erofs_sb_info *sbi, struct erofs_map_dev *map);
int erofs_read_one_data(struct erofs_inode *inode, struct erofs_map_blocks *map,
char *buffer, u64 offset, size_t len);
/*
 * Per-read completion context shared between the producer thread and the
 * decompression workers.  lock/cond guard pending_tasks and final_err;
 * current_task is the batch being filled and is touched by the producer only.
 * NOTE(review): this header now uses pthread_mutex_t/pthread_cond_t, but the
 * patch only adds <pthread.h> to lib/data.c — internal.h must include it too,
 * or every other includer of internal.h may fail to compile.
 */
+struct z_erofs_decompress_task;
+struct z_erofs_read_ctx {
+ pthread_mutex_t lock;
+ pthread_cond_t cond;
+ int pending_tasks;   /* batches created but not yet completed by workers */
+ int final_err;       /* first worker error, latched under lock */
+ int outfd;           /* >= 0: workers pwrite() output; -1: in-memory read */
+ struct z_erofs_decompress_task *current_task;
+};
/* Flush the batch currently being filled (if any) to the global workqueue. */
+void z_erofs_read_ctx_enqueue(struct z_erofs_read_ctx *ctx);
int z_erofs_read_one_data(struct erofs_inode *inode,
struct erofs_map_blocks *map, char *raw, char *buffer,
- erofs_off_t skip, erofs_off_t length, bool trimmed);
+ erofs_off_t skip, erofs_off_t length, bool trimmed,
+ erofs_off_t out_offset, struct z_erofs_read_ctx *ctx);
void *erofs_read_metadata(struct erofs_sb_info *sbi, erofs_nid_t nid,
erofs_off_t *offset, int *lengthp);
int z_erofs_parse_cfgs(struct erofs_sb_info *sbi, struct erofs_super_block *dsb);
diff --git a/lib/data.c b/lib/data.c
index 6fd1389..4d8fcef 100644
--- a/lib/data.c
+++ b/lib/data.c
@@ -9,6 +9,35 @@
#include "erofs/trace.h"
#include "erofs/decompress.h"
#include "liberofs_fragments.h"
+#include "erofs/workqueue.h"
+#include <pthread.h>
+
+struct erofs_workqueue erofs_wq;
+
+/* struct z_erofs_read_ctx {
+ pthread_mutex_t lock;
+ pthread_cond_t cond;
+ int pending_tasks;
+ int final_err;
+ int outfd;
+}; */
+
/*
 * One workqueue unit: a batch of up to Z_EROFS_PCLUSTER_BATCH_SIZE
 * decompression requests plus, per request, the raw input buffer to free and
 * the file offset to pwrite() the output to (extraction mode only).
 * `work` must remain the FIRST member: the worker casts the erofs_work
 * pointer straight back to the task (see z_erofs_decompress_worker).
 */
+struct z_erofs_decompress_task {
+ struct erofs_work work;
+ struct z_erofs_read_ctx *ctx;
+ struct z_erofs_decompress_req reqs[Z_EROFS_PCLUSTER_BATCH_SIZE];
+ char *raw_bufs[Z_EROFS_PCLUSTER_BATCH_SIZE];
+ erofs_off_t out_offsets[Z_EROFS_PCLUSTER_BATCH_SIZE];
+ unsigned int nr_reqs;
+};
+
/*
 * Submit the batch currently being filled (if any) and detach it from the
 * context; after this the workqueue owns the task and will free it.
 * Producer-side only.  NOTE(review): some callers invoke this with ctx->lock
 * held and others without — safe only because current_task is never touched
 * by workers, but worth a comment at the call sites.
 */
+void z_erofs_read_ctx_enqueue(struct z_erofs_read_ctx *ctx)
+{
+ if (ctx && ctx->current_task) {
+ erofs_queue_work(&erofs_wq, &ctx->current_task->work);
+ ctx->current_task = NULL;
+ }
+}
void *erofs_bread(struct erofs_buf *buf, erofs_off_t offset, bool need_kmap)
{
@@ -275,9 +304,45 @@ static int erofs_read_raw_data(struct erofs_inode *inode, char *buffer,
return 0;
}
/*
 * Workqueue callback: decompress every request in the batch, pwrite() the
 * output in extraction mode, release per-request buffers, then report
 * completion (and the first error seen) back to the waiting producer.
 */
+static void z_erofs_decompress_worker(struct erofs_work *work, void *tlsp)
+{
/* Relies on `work` being the first member of the task struct. */
+ struct z_erofs_decompress_task *task = (struct z_erofs_decompress_task *)work;
+ struct z_erofs_read_ctx *ctx = task->ctx;
/* NOTE(review): `int i` vs unsigned nr_reqs is a sign-compare wart;
 * harmless while the batch is capped at 32, but use unsigned. */
+ int i, ret = 0, first_err = 0;
+
+ for (i = 0; i < task->nr_reqs; ++i) {
+ ret = z_erofs_decompress(&task->reqs[i]);
+
/* NOTE(review): only pwrite() < 0 is treated as an error — a short
 * write (fewer bytes than decodedlength) is silently accepted. */
+ if (ret == 0 && ctx && ctx->outfd >= 0) {
+ if (pwrite(ctx->outfd, task->reqs[i].out,
+ task->reqs[i].decodedlength, task->out_offsets[i]) < 0)
+ ret = -errno;
+ }
+
/* Latch the first failure; keep processing so all buffers are freed. */
+ if (ret < 0 && first_err == 0)
+ first_err = ret;
+
+ free(task->raw_bufs[i]);
/* out is freed only in extraction mode; when outfd < 0 it presumably
 * points into the caller-owned read buffer — confirm at call sites. */
+ if (ctx && ctx->outfd >= 0)
+ free(task->reqs[i].out);
+ }
+
+ if (ctx) {
+ pthread_mutex_lock(&ctx->lock);
+ if (first_err < 0 && ctx->final_err == 0)
+ ctx->final_err = first_err;
+ ctx->pending_tasks--;
/* Only the last completion wakes the producer, which waits on
 * pending_tasks == 0. */
+ if (ctx->pending_tasks == 0)
+ pthread_cond_signal(&ctx->cond);
+ pthread_mutex_unlock(&ctx->lock);
+ }
+ free(task);
+}
+
int z_erofs_read_one_data(struct erofs_inode *inode,
struct erofs_map_blocks *map, char *raw, char *buffer,
- erofs_off_t skip, erofs_off_t length, bool trimmed)
+ erofs_off_t skip, erofs_off_t length, bool trimmed,
+ erofs_off_t out_offset, struct z_erofs_read_ctx *ctx)
{
struct erofs_sb_info *sbi = inode->sbi;
struct erofs_map_dev mdev;
@@ -307,24 +372,40 @@ int z_erofs_read_one_data(struct erofs_inode *inode,
if (ret < 0)
return ret;
- ret = z_erofs_decompress(&(struct z_erofs_decompress_req) {
- .sbi = sbi,
- .in = raw,
- .out = buffer,
- .decodedskip = skip,
- .interlaced_offset =
- map->m_algorithmformat == Z_EROFS_COMPRESSION_INTERLACED ?
- erofs_blkoff(sbi, map->m_la) : 0,
- .inputsize = map->m_plen,
- .decodedlength = length,
- .alg = map->m_algorithmformat,
- .partial_decoding = trimmed ? true :
- !(map->m_flags & EROFS_MAP_FULL_MAPPED) ||
- (map->m_flags & EROFS_MAP_PARTIAL_REF),
- });
- if (ret < 0)
- return ret;
- return 0;
/*
 * Batching: instead of decompressing inline, append this pcluster to the
 * context's in-fill task.  The producer is the only writer of
 * current_task/nr_reqs, so no lock is needed until the task is published
 * via enqueue.
 */
+ struct z_erofs_decompress_task *task = ctx->current_task;
+ if (!task) {
/* BUG(review): calloc() is unchecked — OOM dereferences NULL below. */
+ task = calloc(1, sizeof(*task));
+ task->ctx = ctx;
+ task->work.fn = z_erofs_decompress_worker;
+ ctx->current_task = task;
+
/* pending_tasks counts the task from creation, not from submission —
 * the producer must guarantee every created task is eventually
 * enqueued, or waiters on pending_tasks == 0 hang. */
+ pthread_mutex_lock(&ctx->lock);
+ ctx->pending_tasks++;
+ pthread_mutex_unlock(&ctx->lock);
+ }
+
+ int idx = task->nr_reqs++;
+ task->reqs[idx] = (struct z_erofs_decompress_req) {
+ .sbi = sbi,
+ .in = raw,
+ .out = buffer,
+ .decodedskip = skip,
+ .interlaced_offset = map->m_algorithmformat == Z_EROFS_COMPRESSION_INTERLACED ?
+ erofs_blkoff(sbi, map->m_la) : 0,
+ .inputsize = map->m_plen,
+ .decodedlength = length,
+ .alg = map->m_algorithmformat,
+ .partial_decoding = trimmed ? true :
+ !(map->m_flags & EROFS_MAP_FULL_MAPPED) ||
+ (map->m_flags & EROFS_MAP_PARTIAL_REF),
+ };
/* Ownership of `raw` transfers to the task here; the worker frees it. */
+ task->raw_bufs[idx] = raw;
+ task->out_offsets[idx] = out_offset;
+
/* Submit once the batch is full; otherwise keep filling. */
+ if (task->nr_reqs == Z_EROFS_PCLUSTER_BATCH_SIZE) {
+ z_erofs_read_ctx_enqueue(ctx);
+ }
+ return 0;
}
static int z_erofs_read_data(struct erofs_inode *inode, char *buffer,
@@ -335,10 +416,17 @@ static int z_erofs_read_data(struct erofs_inode *inode, char *buffer,
.buf = __EROFS_BUF_INITIALIZER,
};
bool trimmed;
- unsigned int bufsize = 0;
- char *raw = NULL;
int ret = 0;
/*
 * In-memory read: outfd = -1 tells the workers to decompress straight
 * into the caller's buffer (no pwrite(), no free of the out pointer).
 */
+ struct z_erofs_read_ctx ctx = {
+ .pending_tasks = 0,
+ .final_err = 0,
+ .outfd = -1,
+ .current_task = NULL
+ };
+ pthread_mutex_init(&ctx.lock, NULL);
+ pthread_cond_init(&ctx.cond, NULL);
+
/* Walk extents backwards from the end of the requested range. */
end = offset + size;
while (end > offset) {
map.m_la = end - 1;
@@ -374,25 +462,51 @@ static int z_erofs_read_data(struct erofs_inode *inode, char *buffer,
continue;
}
- if (map.m_plen > bufsize) {
- char *newraw;
+ /*
+ * If the global workqueue is getting too deep,
+ * dynamically throttle the producer by forcing the main thread
+ * to wait early. Prevents memory bloat from fast I/O out-pacing
+ * the decompression threads.
+ */
/* NOTE(review): the enqueue below runs while ctx.lock is held; confirm
 * erofs_queue_work() never blocks on a lock workers take after
 * ctx.lock, or producer and workers can deadlock. */
+ pthread_mutex_lock(&ctx.lock);
+ if (erofs_wq.job_count > 128) {
+ z_erofs_read_ctx_enqueue(&ctx);
+ while (ctx.pending_tasks > 0)
+ pthread_cond_wait(&ctx.cond, &ctx.lock);
+ }
+ pthread_mutex_unlock(&ctx.lock);
- bufsize = map.m_plen;
- newraw = realloc(raw, bufsize);
- if (!newraw) {
- ret = -ENOMEM;
- break;
- }
- raw = newraw;
+ /* Allocate fresh raw buffer for each pcluster. */
+ char *raw = malloc(map.m_plen);
+ if (!raw) {
+ ret = -ENOMEM;
+ break;
+ }
/* On success the batch task owns `raw`; freeing it here on failure is
 * safe only if z_erofs_read_one_data never fails after stashing it —
 * currently true (it returns 0 once stored), but fragile. */
ret = z_erofs_read_one_data(inode, &map, raw,
- buffer + end - offset, skip, length, trimmed);
- if (ret < 0)
+ buffer + end - offset, skip, length, trimmed, 0, &ctx);
+ if (ret < 0) {
+ free(raw);
break;
+ }
+ }
/* Flush the partial batch; unlike the fsck path, this runs on the error
 * (break) path too, so the wait below cannot hang. */
+ z_erofs_read_ctx_enqueue(&ctx);
+
+ /*
+ * Wait for all queued pclusters for this read request to finish
+ * before allowing the VFS layer or fsck core to consume the buffer.
+ */
+ pthread_mutex_lock(&ctx.lock);
+ while (ctx.pending_tasks > 0) {
+ pthread_cond_wait(&ctx.cond, &ctx.lock);
}
- if (raw)
- free(raw);
+
+ /* Bubble up any decompression errors caught by the worker threads */
+ if (ctx.final_err < 0 && ret == 0)
+ ret = ctx.final_err;
+
+ pthread_mutex_destroy(&ctx.lock);
+ pthread_cond_destroy(&ctx.cond);
return ret < 0 ? ret : 0;
}
--
2.51.0
More information about the Linux-erofs
mailing list