[PATCH] erofs: fix infinite loop due to a race of filling compressed_bvecs
Gao Xiang
hsiangkao at linux.alibaba.com
Fri Jan 26 16:06:49 AEDT 2024
On 2024/1/26 12:56, Sandeep Dhavale via Linux-erofs wrote:
> On Thu, Jan 25, 2024 at 4:01 AM Gao Xiang <hsiangkao at linux.alibaba.com> wrote:
>>
>> I encountered a race issue after lengthy (~594647 sec) stress tests on
>> a 64k-page arm64 VM with several 4k-block EROFS images. The timing
>> is as follows:
>>
>> z_erofs_try_inplace_io                  z_erofs_fill_bio_vec
>>   cmpxchg(&compressed_bvecs[].page,
>>           NULL, ..)
>>                                           [access bufvec]
>>   compressed_bvecs[] = *bvec;
>>
>> Previously, z_erofs_submit_queue() accessed only bufvec->page, so the
>> other fields in bufvec didn't matter. After subpage block support
>> landed, .offset and .end can be used too, but filling a bufvec isn't
>> an atomic operation, which can cause inconsistency (a reader may see
>> the new .page together with stale .offset and .end values).
>>
>> Let's use a spinlock to make each bufvec update atomic. More
>> specifically, just reuse the existing spinlock `pcl->obj.lockref.lock`,
>> since it's rarely taken (and is held only briefly even when it is) as
>> long as the pcluster has a reference.
>>
>> Fixes: 192351616a9d ("erofs: support I/O submission for sub-page compressed blocks")
>> Signed-off-by: Gao Xiang <hsiangkao at linux.alibaba.com>
>> ---
>> fs/erofs/zdata.c | 74 +++++++++++++++++++++++++-----------------------
>> 1 file changed, 38 insertions(+), 36 deletions(-)
>>
>> diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c
>> index 583c062cd0e4..c1c77166b30f 100644
>> --- a/fs/erofs/zdata.c
>> +++ b/fs/erofs/zdata.c
>> @@ -563,21 +563,19 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe)
>> __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN;
>> unsigned int i;
>>
>> - if (i_blocksize(fe->inode) != PAGE_SIZE)
>> - return;
>> - if (fe->mode < Z_EROFS_PCLUSTER_FOLLOWED)
>> + if (i_blocksize(fe->inode) != PAGE_SIZE ||
>> + fe->mode < Z_EROFS_PCLUSTER_FOLLOWED)
>> return;
>>
>> for (i = 0; i < pclusterpages; ++i) {
>> struct page *page, *newpage;
>> void *t; /* mark pages just found for debugging */
>>
>> - /* the compressed page was loaded before */
>> + /* Inaccurate check w/o locking to avoid unneeded lookups */
>> if (READ_ONCE(pcl->compressed_bvecs[i].page))
>> continue;
>>
>> page = find_get_page(mc, pcl->obj.index + i);
>> -
>> if (page) {
>> t = (void *)((unsigned long)page | 1);
>> newpage = NULL;
>> @@ -597,9 +595,13 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe)
>> set_page_private(newpage, Z_EROFS_PREALLOCATED_PAGE);
>> t = (void *)((unsigned long)newpage | 1);
>> }
>> -
>> - if (!cmpxchg_relaxed(&pcl->compressed_bvecs[i].page, NULL, t))
>> + spin_lock(&pcl->obj.lockref.lock);
>> + if (!pcl->compressed_bvecs[i].page) {
>> + pcl->compressed_bvecs[i].page = t;
>> + spin_unlock(&pcl->obj.lockref.lock);
>> continue;
>> + }
>> + spin_unlock(&pcl->obj.lockref.lock);
>>
>> if (page)
>> put_page(page);
>> @@ -718,31 +720,25 @@ int erofs_init_managed_cache(struct super_block *sb)
>> return 0;
>> }
>>
>> -static bool z_erofs_try_inplace_io(struct z_erofs_decompress_frontend *fe,
>> - struct z_erofs_bvec *bvec)
>> -{
>> - struct z_erofs_pcluster *const pcl = fe->pcl;
>> -
>> - while (fe->icur > 0) {
>> - if (!cmpxchg(&pcl->compressed_bvecs[--fe->icur].page,
>> - NULL, bvec->page)) {
>> - pcl->compressed_bvecs[fe->icur] = *bvec;
>> - return true;
>> - }
>> - }
>> - return false;
>> -}
>> -
>> /* callers must be with pcluster lock held */
>> static int z_erofs_attach_page(struct z_erofs_decompress_frontend *fe,
>> struct z_erofs_bvec *bvec, bool exclusive)
>> {
>> + struct z_erofs_pcluster *pcl = fe->pcl;
>> int ret;
>>
>> if (exclusive) {
>> /* give priority for inplaceio to use file pages first */
>> - if (z_erofs_try_inplace_io(fe, bvec))
>> + spin_lock(&pcl->obj.lockref.lock);
>> + while (fe->icur > 0) {
>> + if (pcl->compressed_bvecs[--fe->icur].page)
>> + continue;
>> + pcl->compressed_bvecs[fe->icur] = *bvec;
>> + spin_unlock(&pcl->obj.lockref.lock);
>> return 0;
>> + }
>> + spin_unlock(&pcl->obj.lockref.lock);
>> +
>> /* otherwise, check if it can be used as a bvpage */
>> if (fe->mode >= Z_EROFS_PCLUSTER_FOLLOWED &&
>> !fe->candidate_bvpage)
>> @@ -1423,23 +1419,26 @@ static void z_erofs_fill_bio_vec(struct bio_vec *bvec,
>> {
>> gfp_t gfp = mapping_gfp_mask(mc);
>> bool tocache = false;
>> - struct z_erofs_bvec *zbv = pcl->compressed_bvecs + nr;
>> + struct z_erofs_bvec zbv;
>> struct address_space *mapping;
>> - struct page *page, *oldpage;
>> + struct page *page;
>> int justfound, bs = i_blocksize(f->inode);
>>
>> /* Except for inplace pages, the entire page can be used for I/Os */
>> bvec->bv_offset = 0;
>> bvec->bv_len = PAGE_SIZE;
>> repeat:
>> - oldpage = READ_ONCE(zbv->page);
>> - if (!oldpage)
>> + spin_lock(&pcl->obj.lockref.lock);
>> + zbv = pcl->compressed_bvecs[nr];
>> + page = zbv.page;
>> + justfound = (unsigned long)page & 1UL;
>> + page = (struct page *)((unsigned long)page & ~1UL);
>> + pcl->compressed_bvecs[nr].page = page;
>> + spin_unlock(&pcl->obj.lockref.lock);
>> + if (!page)
>> goto out_allocpage;
>>
>> - justfound = (unsigned long)oldpage & 1UL;
>> - page = (struct page *)((unsigned long)oldpage & ~1UL);
>> bvec->bv_page = page;
>> -
>> DBG_BUGON(z_erofs_is_shortlived_page(page));
>> /*
>> * Handle preallocated cached pages. We tried to allocate such pages
>> @@ -1448,7 +1447,6 @@ static void z_erofs_fill_bio_vec(struct bio_vec *bvec,
>> */
>> if (page->private == Z_EROFS_PREALLOCATED_PAGE) {
>> set_page_private(page, 0);
>> - WRITE_ONCE(zbv->page, page);
>> tocache = true;
>> goto out_tocache;
>> }
>> @@ -1459,9 +1457,9 @@ static void z_erofs_fill_bio_vec(struct bio_vec *bvec,
>> * therefore it is impossible for `mapping` to be NULL.
>> */
>> if (mapping && mapping != mc) {
>> - if (zbv->offset < 0)
>> - bvec->bv_offset = round_up(-zbv->offset, bs);
>> - bvec->bv_len = round_up(zbv->end, bs) - bvec->bv_offset;
>> + if (zbv.offset < 0)
>> + bvec->bv_offset = round_up(-zbv.offset, bs);
>> + bvec->bv_len = round_up(zbv.end, bs) - bvec->bv_offset;
>> return;
>> }
>>
>> @@ -1471,7 +1469,6 @@ static void z_erofs_fill_bio_vec(struct bio_vec *bvec,
>>
>> /* the cached page is still in managed cache */
>> if (page->mapping == mc) {
>> - WRITE_ONCE(zbv->page, page);
>> /*
>> * The cached page is still available but without a valid
>> * `->private` pcluster hint. Let's reconnect them.
>> @@ -1503,11 +1500,15 @@ static void z_erofs_fill_bio_vec(struct bio_vec *bvec,
>> put_page(page);
>> out_allocpage:
>> page = erofs_allocpage(&f->pagepool, gfp | __GFP_NOFAIL);
>> - if (oldpage != cmpxchg(&zbv->page, oldpage, page)) {
>> + spin_lock(&pcl->obj.lockref.lock);
>> + if (pcl->compressed_bvecs[nr].page) {
>> erofs_pagepool_add(&f->pagepool, page);
>> + spin_unlock(&pcl->obj.lockref.lock);
>> cond_resched();
>> goto repeat;
>> }
>> + pcl->compressed_bvecs[nr].page = page;
>> + spin_unlock(&pcl->obj.lockref.lock);
>> bvec->bv_page = page;
>> out_tocache:
>> if (!tocache || bs != PAGE_SIZE ||
>> @@ -1685,6 +1686,7 @@ static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f,
>>
>> if (cur + bvec.bv_len > end)
>> bvec.bv_len = end - cur;
>> + DBG_BUGON(bvec.bv_len < sb->s_blocksize);
>> if (!bio_add_page(bio, bvec.bv_page, bvec.bv_len,
>> bvec.bv_offset))
>> goto submit_bio_retry;
>> --
>> 2.39.3
>>
>
> LGTM!
>
> Reviewed-by: Sandeep Dhavale <dhavale at google.com>
Thanks for the review :-)
Thanks,
Gao Xiang
More information about the Linux-erofs
mailing list