[PATCH] erofs: fix infinite loop due to a race of filling compressed_bvecs
Sandeep Dhavale
dhavale at google.com
Fri Jan 26 15:56:05 AEDT 2024
On Thu, Jan 25, 2024 at 4:01 AM Gao Xiang <hsiangkao at linux.alibaba.com> wrote:
>
> I encountered a race issue after lengthy (~594647 secs) stress tests
> on a 64k-page arm64 VM with several 4k-block EROFS images.  The
> timing is as follows:
>
> z_erofs_try_inplace_io               z_erofs_fill_bio_vec
>  cmpxchg(&compressed_bvecs[].page,
>          NULL, ..)
>                                      [access bufvec]
>  compressed_bvecs[] = *bvec;
>
> Previously, z_erofs_submit_queue() accessed only bufvec->page, so the
> other fields in bufvec didn't matter.  After subpage block support
> landed, .offset and .end are used too, but filling a bufvec isn't an
> atomic operation, which can cause inconsistency.
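
To spell this out: the cmpxchg() publishes .page first, and the full
struct assignment that fills .offset/.end only happens afterwards, so a
reader that already sees a non-NULL .page can still observe stale
fields.  Roughly (a simplified sketch of the pre-patch pattern, with a
hypothetical `slot` pointing at the compressed_bvecs[] entry, not the
exact erofs code):

	/* writer: z_erofs_try_inplace_io() */
	if (!cmpxchg(&slot->page, NULL, bvec->page))
		*slot = *bvec;		/* .offset/.end land after .page */

	/* concurrent reader: z_erofs_fill_bio_vec() */
	page = READ_ONCE(slot->page);	/* non-NULL already... */
	len = round_up(slot->end, bs);	/* ...but .end may still be stale */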
>
> Let's use a spinlock to make each bufvec update atomic.  More
> specifically, just reuse the existing spinlock `pcl->obj.lockref.lock`,
> since it's rarely contended (and held only briefly when it is) as long
> as the pcluster holds a reference.
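
With the lock, both sides serialize on pcl->obj.lockref.lock and the
whole bvec becomes visible atomically.  The writer-side pattern is
essentially (a sketch of the idea; the real hunks are below):

	spin_lock(&pcl->obj.lockref.lock);
	if (!pcl->compressed_bvecs[i].page)
		pcl->compressed_bvecs[i] = *bvec;  /* all fields at once */
	spin_unlock(&pcl->obj.lockref.lock);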
>
> Fixes: 192351616a9d ("erofs: support I/O submission for sub-page compressed blocks")
> Signed-off-by: Gao Xiang <hsiangkao at linux.alibaba.com>
> ---
> fs/erofs/zdata.c | 74 +++++++++++++++++++++++++-----------------------
> 1 file changed, 38 insertions(+), 36 deletions(-)
>
> diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c
> index 583c062cd0e4..c1c77166b30f 100644
> --- a/fs/erofs/zdata.c
> +++ b/fs/erofs/zdata.c
> @@ -563,21 +563,19 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe)
> __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN;
> unsigned int i;
>
> - if (i_blocksize(fe->inode) != PAGE_SIZE)
> - return;
> - if (fe->mode < Z_EROFS_PCLUSTER_FOLLOWED)
> + if (i_blocksize(fe->inode) != PAGE_SIZE ||
> + fe->mode < Z_EROFS_PCLUSTER_FOLLOWED)
> return;
>
> for (i = 0; i < pclusterpages; ++i) {
> struct page *page, *newpage;
> void *t; /* mark pages just found for debugging */
>
> - /* the compressed page was loaded before */
> + /* Inaccurate check w/o locking to avoid unneeded lookups */
> if (READ_ONCE(pcl->compressed_bvecs[i].page))
> continue;
>
> page = find_get_page(mc, pcl->obj.index + i);
> -
> if (page) {
> t = (void *)((unsigned long)page | 1);
> newpage = NULL;
> @@ -597,9 +595,13 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe)
> set_page_private(newpage, Z_EROFS_PREALLOCATED_PAGE);
> t = (void *)((unsigned long)newpage | 1);
> }
> -
> - if (!cmpxchg_relaxed(&pcl->compressed_bvecs[i].page, NULL, t))
> + spin_lock(&pcl->obj.lockref.lock);
> + if (!pcl->compressed_bvecs[i].page) {
> + pcl->compressed_bvecs[i].page = t;
> + spin_unlock(&pcl->obj.lockref.lock);
> continue;
> + }
> + spin_unlock(&pcl->obj.lockref.lock);
>
> if (page)
> put_page(page);
> @@ -718,31 +720,25 @@ int erofs_init_managed_cache(struct super_block *sb)
> return 0;
> }
>
> -static bool z_erofs_try_inplace_io(struct z_erofs_decompress_frontend *fe,
> - struct z_erofs_bvec *bvec)
> -{
> - struct z_erofs_pcluster *const pcl = fe->pcl;
> -
> - while (fe->icur > 0) {
> - if (!cmpxchg(&pcl->compressed_bvecs[--fe->icur].page,
> - NULL, bvec->page)) {
> - pcl->compressed_bvecs[fe->icur] = *bvec;
> - return true;
> - }
> - }
> - return false;
> -}
> -
> /* callers must be with pcluster lock held */
> static int z_erofs_attach_page(struct z_erofs_decompress_frontend *fe,
> struct z_erofs_bvec *bvec, bool exclusive)
> {
> + struct z_erofs_pcluster *pcl = fe->pcl;
> int ret;
>
> if (exclusive) {
> /* give priority for inplaceio to use file pages first */
> - if (z_erofs_try_inplace_io(fe, bvec))
> + spin_lock(&pcl->obj.lockref.lock);
> + while (fe->icur > 0) {
> + if (pcl->compressed_bvecs[--fe->icur].page)
> + continue;
> + pcl->compressed_bvecs[fe->icur] = *bvec;
> + spin_unlock(&pcl->obj.lockref.lock);
> return 0;
> + }
> + spin_unlock(&pcl->obj.lockref.lock);
> +
> /* otherwise, check if it can be used as a bvpage */
> if (fe->mode >= Z_EROFS_PCLUSTER_FOLLOWED &&
> !fe->candidate_bvpage)
> @@ -1423,23 +1419,26 @@ static void z_erofs_fill_bio_vec(struct bio_vec *bvec,
> {
> gfp_t gfp = mapping_gfp_mask(mc);
> bool tocache = false;
> - struct z_erofs_bvec *zbv = pcl->compressed_bvecs + nr;
> + struct z_erofs_bvec zbv;
> struct address_space *mapping;
> - struct page *page, *oldpage;
> + struct page *page;
> int justfound, bs = i_blocksize(f->inode);
>
> /* Except for inplace pages, the entire page can be used for I/Os */
> bvec->bv_offset = 0;
> bvec->bv_len = PAGE_SIZE;
> repeat:
> - oldpage = READ_ONCE(zbv->page);
> - if (!oldpage)
> + spin_lock(&pcl->obj.lockref.lock);
> + zbv = pcl->compressed_bvecs[nr];
> + page = zbv.page;
> + justfound = (unsigned long)page & 1UL;
> + page = (struct page *)((unsigned long)page & ~1UL);
> + pcl->compressed_bvecs[nr].page = page;
> + spin_unlock(&pcl->obj.lockref.lock);
> + if (!page)
> goto out_allocpage;
>
> - justfound = (unsigned long)oldpage & 1UL;
> - page = (struct page *)((unsigned long)oldpage & ~1UL);
> bvec->bv_page = page;
> -
> DBG_BUGON(z_erofs_is_shortlived_page(page));
> /*
> * Handle preallocated cached pages. We tried to allocate such pages
> @@ -1448,7 +1447,6 @@ static void z_erofs_fill_bio_vec(struct bio_vec *bvec,
> */
> if (page->private == Z_EROFS_PREALLOCATED_PAGE) {
> set_page_private(page, 0);
> - WRITE_ONCE(zbv->page, page);
> tocache = true;
> goto out_tocache;
> }
> @@ -1459,9 +1457,9 @@ static void z_erofs_fill_bio_vec(struct bio_vec *bvec,
> * therefore it is impossible for `mapping` to be NULL.
> */
> if (mapping && mapping != mc) {
> - if (zbv->offset < 0)
> - bvec->bv_offset = round_up(-zbv->offset, bs);
> - bvec->bv_len = round_up(zbv->end, bs) - bvec->bv_offset;
> + if (zbv.offset < 0)
> + bvec->bv_offset = round_up(-zbv.offset, bs);
> + bvec->bv_len = round_up(zbv.end, bs) - bvec->bv_offset;
> return;
> }
>
> @@ -1471,7 +1469,6 @@ static void z_erofs_fill_bio_vec(struct bio_vec *bvec,
>
> /* the cached page is still in managed cache */
> if (page->mapping == mc) {
> - WRITE_ONCE(zbv->page, page);
> /*
> * The cached page is still available but without a valid
> * `->private` pcluster hint. Let's reconnect them.
> @@ -1503,11 +1500,15 @@ static void z_erofs_fill_bio_vec(struct bio_vec *bvec,
> put_page(page);
> out_allocpage:
> page = erofs_allocpage(&f->pagepool, gfp | __GFP_NOFAIL);
> - if (oldpage != cmpxchg(&zbv->page, oldpage, page)) {
> + spin_lock(&pcl->obj.lockref.lock);
> + if (pcl->compressed_bvecs[nr].page) {
> erofs_pagepool_add(&f->pagepool, page);
> + spin_unlock(&pcl->obj.lockref.lock);
> cond_resched();
> goto repeat;
> }
> + pcl->compressed_bvecs[nr].page = page;
> + spin_unlock(&pcl->obj.lockref.lock);
> bvec->bv_page = page;
> out_tocache:
> if (!tocache || bs != PAGE_SIZE ||
> @@ -1685,6 +1686,7 @@ static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f,
>
> if (cur + bvec.bv_len > end)
> bvec.bv_len = end - cur;
> + DBG_BUGON(bvec.bv_len < sb->s_blocksize);
> if (!bio_add_page(bio, bvec.bv_page, bvec.bv_len,
> bvec.bv_offset))
> goto submit_bio_retry;
> --
> 2.39.3
>
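The reader side in z_erofs_fill_bio_vec() pairs with this nicely: it
now snapshots the whole bvec and strips the "justfound" tag bit under
the same lock, i.e. roughly:

	spin_lock(&pcl->obj.lockref.lock);
	zbv = pcl->compressed_bvecs[nr];	/* consistent snapshot */
	page = (struct page *)((unsigned long)zbv.page & ~1UL);
	pcl->compressed_bvecs[nr].page = page;	/* drop the tag bit */
	spin_unlock(&pcl->obj.lockref.lock);

so no path can observe a half-filled bvec anymore.
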
LGTM!
Reviewed-by: Sandeep Dhavale <dhavale at google.com>