[PATCH v2] erofs: support direct IO for ondemand mode
Jingbo Xu
jefflexu at linux.alibaba.com
Mon Jul 29 22:09:44 AEST 2024
On 7/26/24 10:06 AM, Hongbo Li wrote:
> erofs over fscache cannot handle direct read IO. When a file
> is opened with the O_DIRECT flag, -EINVAL is returned. We support
> DIO in erofs over fscache by bypassing the erofs page cache and
> reading the target data into ubuf directly from fscache's file.
>
> The alignment for the buffer memory, offset and size is now
> restricted by erofs, since `i_blocksize` is enough for the
> underlying filesystems.
>
> Signed-off-by: Hongbo Li <lihongbo22 at huawei.com>
>
> ---
> v2:
> - Change the directIO helper name to erofs_fscache_direct_io
> - Add some io interception when begin direct io
> - Optimize the direct io logic in erofs_fscache_data_read_slice
>
> v1:
> - https://lore.kernel.org/linux-erofs/6f1a5c1c-d44d-4561-9377-b935ff4531f2@huawei.com/T/#t
> ---
> fs/erofs/data.c | 3 ++
> fs/erofs/fscache.c | 84 +++++++++++++++++++++++++++++++++++++++++-----
> 2 files changed, 78 insertions(+), 9 deletions(-)
>
> diff --git a/fs/erofs/data.c b/fs/erofs/data.c
> index 8be60797ea2f..dbfafe358de4 100644
> --- a/fs/erofs/data.c
> +++ b/fs/erofs/data.c
> @@ -391,6 +391,9 @@ static ssize_t erofs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
> iov_iter_alignment(to)) & blksize_mask)
> return -EINVAL;
>
> + if (erofs_is_fscache_mode(inode->i_sb))
> + return generic_file_read_iter(iocb, to);
> +
> return iomap_dio_rw(iocb, to, &erofs_iomap_ops,
> NULL, 0, NULL, 0);
> }
> diff --git a/fs/erofs/fscache.c b/fs/erofs/fscache.c
> index fda16eedafb5..e3e6c579eb81 100644
> --- a/fs/erofs/fscache.c
> +++ b/fs/erofs/fscache.c
> @@ -35,6 +35,8 @@ struct erofs_fscache_io {
>
> struct erofs_fscache_rq {
> struct address_space *mapping; /* The mapping being accessed */
> + struct iov_iter *iter; /* Destination for direct io */
> + struct completion done; /* Sync for direct io */
> loff_t start; /* Start position */
> size_t len; /* Length of the request */
> size_t submitted; /* Length of submitted */
> @@ -76,7 +78,11 @@ static void erofs_fscache_req_put(struct erofs_fscache_rq *req)
> {
> if (!refcount_dec_and_test(&req->ref))
> return;
> - erofs_fscache_req_complete(req);
> +
> + if (req->iter)
> + complete(&req->done);
> + else
> + erofs_fscache_req_complete(req);
> kfree(req);
> }
>
> @@ -88,6 +94,7 @@ static struct erofs_fscache_rq *erofs_fscache_req_alloc(struct address_space *ma
> if (!req)
> return NULL;
> req->mapping = mapping;
> + req->iter = NULL;
> req->start = start;
> req->len = len;
> refcount_set(&req->ref, 1);
> @@ -258,6 +265,7 @@ static int erofs_fscache_data_read_slice(struct erofs_fscache_rq *req)
> struct address_space *mapping = req->mapping;
> struct inode *inode = mapping->host;
> struct super_block *sb = inode->i_sb;
> + struct iov_iter *dest_iter = req->iter;
> struct erofs_fscache_io *io;
> struct erofs_map_blocks map;
> struct erofs_map_dev mdev;
> @@ -270,33 +278,45 @@ static int erofs_fscache_data_read_slice(struct erofs_fscache_rq *req)
> if (ret)
> return ret;
>
> + count = req->len - req->submitted;
> if (map.m_flags & EROFS_MAP_META) {
> struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
> struct iov_iter iter;
> - size_t size = map.m_llen;
> + struct iov_iter *iterp;
> + size_t size = min_t(size_t, map.m_llen, count);
> void *src;
> + ssize_t filled = 0;
>
> src = erofs_read_metabuf(&buf, sb, map.m_pa, EROFS_KMAP);
> if (IS_ERR(src))
> return PTR_ERR(src);
>
> iov_iter_xarray(&iter, ITER_DEST, &mapping->i_pages, pos, PAGE_SIZE);
> - if (copy_to_iter(src, size, &iter) != size) {
> + iterp = (dest_iter) ?: &iter;
Could we also reuse erofs_fscache_rq->iter for the non-direct-IO case?
That is, initializing erofs_fscache_rq->iter before calling
erofs_fscache_data_read() just like what erofs_fscache_direct_io() does,
so that we don't need to disturb the iter here inside
erofs_fscache_data_read_slice().
> + if (copy_to_iter(src, size, iterp) != size) {
> erofs_put_metabuf(&buf);
> return -EFAULT;
> }
> - iov_iter_zero(PAGE_SIZE - size, &iter);
> +
> + filled += size;
> + if (!dest_iter) {
> + iov_iter_zero(PAGE_SIZE - size, iterp);
> + filled += (PAGE_SIZE - size);
> + }
> +
Don't we also need to zero the remaining part for direct IO, e.g. when
directly reading the last inlined part of a file with the
EROFS_INODE_FLAT_INLINE layout?
> erofs_put_metabuf(&buf);
> - req->submitted += PAGE_SIZE;
> + req->submitted += filled;
> return 0;
> }
>
> - count = req->len - req->submitted;
> if (!(map.m_flags & EROFS_MAP_MAPPED)) {
> struct iov_iter iter;
>
> - iov_iter_xarray(&iter, ITER_DEST, &mapping->i_pages, pos, count);
> - iov_iter_zero(count, &iter);
> + if (!dest_iter) {
> + iov_iter_xarray(&iter, ITER_DEST, &mapping->i_pages, pos, count);
> + iov_iter_zero(count, &iter);
> + }
> +
Ditto. Don't we need to handle the unmapped case for direct IO, e.g.
when directly reading a sparse file?
> req->submitted += count;
> return 0;
> }
> @@ -315,9 +335,17 @@ static int erofs_fscache_data_read_slice(struct erofs_fscache_rq *req)
> io = erofs_fscache_req_io_alloc(req);
> if (!io)
> return -ENOMEM;
> - iov_iter_xarray(&io->iter, ITER_DEST, &mapping->i_pages, pos, count);
> +
> + if (dest_iter)
> + iov_iter_ubuf(&io->iter, ITER_DEST,
> + dest_iter->ubuf + dest_iter->iov_offset, count);
> + else
> + iov_iter_xarray(&io->iter, ITER_DEST, &mapping->i_pages, pos, count);
> +
> ret = erofs_fscache_read_io_async(mdev.m_fscache->cookie,
> mdev.m_pa + (pos - map.m_la), io);
> + if (dest_iter)
> + iov_iter_advance(dest_iter, io->iter.iov_offset);
> erofs_fscache_req_io_put(io);
>
> req->submitted += count;
> @@ -373,6 +401,43 @@ static void erofs_fscache_readahead(struct readahead_control *rac)
> erofs_fscache_req_put(req);
> }
>
> +static ssize_t erofs_fscache_direct_io(struct kiocb *iocb, struct iov_iter *iter)
> +{
> + struct file *file = iocb->ki_filp;
> + size_t count = iov_iter_count(iter);
> + size_t i_size = i_size_read(file->f_inode);
> + struct erofs_fscache_rq *req;
> + struct completion *ctr;
> + ssize_t rsize;
> + int ret;
> +
> + if (unlikely(iov_iter_rw(iter) == WRITE))
> + return -EROFS;
There is no way we can get here, since .write_iter() is not
implemented at all, right?
> +
> + if (unlikely(iocb->ki_pos >= i_size))
> + return 0;
> +
> + if (count + iocb->ki_pos > i_size)
> + count = i_size - iocb->ki_pos;
> +
> + if (!count)
> + return 0;
> +
> + req = erofs_fscache_req_alloc(file->f_mapping, iocb->ki_pos, count);
> + if (!req)
> + return -ENOMEM;
> +
> + req->iter = iter;
> + init_completion(&req->done);
> + ctr = &req->done;
> + ret = erofs_fscache_data_read(req);
> + rsize = (ret == 0) ? (ssize_t)req->submitted : ret;
> + erofs_fscache_req_put(req);
If we get error in erofs_fscache_data_read(), the above
erofs_fscache_req_put() will put the last reference of erofs_fscache_rq
and free the erofs_fscache_rq structure. Then the following
wait_for_completion() will cause UAF.
> + wait_for_completion(ctr);
> +
> + return rsize;
> +}
> +
> static const struct address_space_operations erofs_fscache_meta_aops = {
> .read_folio = erofs_fscache_meta_read_folio,
> };
> @@ -380,6 +445,7 @@ static const struct address_space_operations erofs_fscache_meta_aops = {
> const struct address_space_operations erofs_fscache_access_aops = {
> .read_folio = erofs_fscache_read_folio,
> .readahead = erofs_fscache_readahead,
> + .direct_IO = erofs_fscache_direct_io,
> };
>
> static void erofs_fscache_domain_put(struct erofs_domain *domain)
--
Thanks,
Jingbo
More information about the Linux-erofs
mailing list