[PATCH] erofs: support direct IO for ondemand mode

Hongbo Li lihongbo22 at huawei.com
Thu Jul 18 17:11:04 AEST 2024



On 2024/7/18 10:40, Gao Xiang wrote:
> Hi Hongbo,
> 
> I'd like to request Jingbo's review too.
> 
> On 2024/7/18 09:05, Hongbo Li wrote:
>> erofs over fscache cannot handle the direct read io. When the file
>> is opened with O_DIRECT flag, -EINVAL will reback. We support the
>> DIO in erofs over fscache by bypassing the erofs page cache and
>> reading target data into ubuf from fscache's file directly.
> 
> Could you give more hints in the commit message on the target user
> of fscache DIO?
> 
To be honest, I haven't come across such containers using direct I/O 
yet. I've just run fio and some other tests for the direct mode in 
containers, and they failed during open. If a traditional container 
start using direct I/O when it's then migrated to the erofs over fscache 
solution (ie. Nydus), it won't run properly. This is because the current 
on-demand mode of erofs does not support direct I/O. Currently, I 
thought there are two approaches to solve this: 1. direct I/O can 
fallback to buffered I/O (simpler but seems non-reasonable); 2. 
implement the direct I/O process like this way.

Thanks,
Hongbo

> For Android use cases, direct I/O support is mainly used for loop
> device direct mode.
> 
>>
>> The alignment for buffer memory, offset and size now is restricted
>> by erofs, since `i_blocksize` is enough for the under filesystems.
>>
>> Signed-off-by: Hongbo Li <lihongbo22 at huawei.com>
>> ---
>>   fs/erofs/data.c    |  3 ++
>>   fs/erofs/fscache.c | 95 +++++++++++++++++++++++++++++++++++++++++++---
>>   2 files changed, 93 insertions(+), 5 deletions(-)
>>
>> diff --git a/fs/erofs/data.c b/fs/erofs/data.c
>> index 8be60797ea2f..dbfafe358de4 100644
>> --- a/fs/erofs/data.c
>> +++ b/fs/erofs/data.c
>> @@ -391,6 +391,9 @@ static ssize_t erofs_file_read_iter(struct kiocb 
>> *iocb, struct iov_iter *to)
>>                iov_iter_alignment(to)) & blksize_mask)
>>               return -EINVAL;
>> +        if (erofs_is_fscache_mode(inode->i_sb))
>> +            return generic_file_read_iter(iocb, to);
>> +
>>           return iomap_dio_rw(iocb, to, &erofs_iomap_ops,
>>                       NULL, 0, NULL, 0);
>>       }
>> diff --git a/fs/erofs/fscache.c b/fs/erofs/fscache.c
>> index fda16eedafb5..f5a09b168539 100644
>> --- a/fs/erofs/fscache.c
>> +++ b/fs/erofs/fscache.c
>> @@ -35,6 +35,8 @@ struct erofs_fscache_io {
>>   struct erofs_fscache_rq {
>>       struct address_space    *mapping;    /* The mapping being 
>> accessed */
>> +    struct iov_iter        *iter;        /* dst buf for direct io */
>> +    struct completion    done;        /* for synced direct io */
>>       loff_t            start;        /* Start position */
>>       size_t            len;        /* Length of the request */
>>       size_t            submitted;    /* Length of submitted */
>> @@ -76,7 +78,11 @@ static void erofs_fscache_req_put(struct 
>> erofs_fscache_rq *req)
>>   {
>>       if (!refcount_dec_and_test(&req->ref))
>>           return;
>> -    erofs_fscache_req_complete(req);
>> +
>> +    if (req->iter)
>> +        complete(&req->done);
>> +    else
>> +        erofs_fscache_req_complete(req);
>>       kfree(req);
>>   }
>> @@ -88,6 +94,7 @@ static struct erofs_fscache_rq 
>> *erofs_fscache_req_alloc(struct address_space *ma
>>       if (!req)
>>           return NULL;
>>       req->mapping = mapping;
>> +    req->iter = NULL;
>>       req->start = start;
>>       req->len = len;
>>       refcount_set(&req->ref, 1);
>> @@ -253,6 +260,55 @@ static int erofs_fscache_meta_read_folio(struct 
>> file *data, struct folio *folio)
>>       return ret;
>>   }
>> +static int erofs_fscache_data_dio_read(struct erofs_fscache_rq *req)
>> +{
>> +    struct address_space *mapping = req->mapping;
>> +    struct inode *inode = mapping->host;
>> +    struct super_block *sb = inode->i_sb;
>> +    struct iov_iter *iter = req->iter;
>> +    struct erofs_fscache_io *io;
>> +    struct erofs_map_blocks map;
>> +    struct erofs_map_dev mdev;
>> +    loff_t pos = req->start + req->submitted;
>> +    size_t count;
>> +    int ret;
>> +
>> +    map.m_la = pos;
>> +    ret = erofs_map_blocks(inode, &map);
>> +    if (ret)
>> +        return ret;
>> +
>> +    count = req->len - req->submitted;
>> +    if (!(map.m_flags & EROFS_MAP_MAPPED)) {
>> +        iov_iter_zero(count, iter);
>> +        req->submitted += count;
>> +        return 0;
>> +    }
>> +
>> +    count = min_t(size_t, map.m_llen - (pos - map.m_la), count);
>> +    DBG_BUGON(!count || count % PAGE_SIZE);
>> +
>> +    mdev = (struct erofs_map_dev) {
>> +        .m_deviceid = map.m_deviceid,
>> +        .m_pa = map.m_pa,
>> +    };
>> +    ret = erofs_map_dev(sb, &mdev);
>> +    if (ret)
>> +        return ret;
>> +
>> +    io = erofs_fscache_req_io_alloc(req);
>> +    if (!io)
>> +        return -ENOMEM;
>> +
>> +    iov_iter_ubuf(&io->iter, ITER_DEST, iter->ubuf + 
>> iter->iov_offset, count);
>> +    ret = erofs_fscache_read_io_async(mdev.m_fscache->cookie, 
>> mdev.m_pa + (pos - map.m_la), io);
>> +    iov_iter_advance(iter, io->iter.iov_offset);
>> +    erofs_fscache_req_io_put(io);
>> +
>> +    req->submitted += count;
>> +    return ret;
>> +}
>> +
>>   static int erofs_fscache_data_read_slice(struct erofs_fscache_rq *req)
>>   {
>>       struct address_space *mapping = req->mapping;
>> @@ -324,12 +380,13 @@ static int erofs_fscache_data_read_slice(struct 
>> erofs_fscache_rq *req)
>>       return ret;
>>   }
>> -static int erofs_fscache_data_read(struct erofs_fscache_rq *req)
>> +static int erofs_fscache_data_read(struct erofs_fscache_rq *req, bool 
>> direct)
>>   {
>>       int ret;
>>       do {
>> -        ret = erofs_fscache_data_read_slice(req);
>> +        ret = (direct) ? erofs_fscache_data_dio_read(req)
>> +                : erofs_fscache_data_read_slice(req);
>>           if (ret)
>>               req->error = ret;
>>       } while (!ret && req->submitted < req->len);
>> @@ -348,7 +405,7 @@ static int erofs_fscache_read_folio(struct file 
>> *file, struct folio *folio)
>>           return -ENOMEM;
>>       }
>> -    ret = erofs_fscache_data_read(req);
>> +    ret = erofs_fscache_data_read(req, false);
>>       erofs_fscache_req_put(req);
>>       return ret;
>>   }
>> @@ -369,8 +426,35 @@ static void erofs_fscache_readahead(struct 
>> readahead_control *rac)
>>       while (readahead_folio(rac))
>>           ;
>> -    erofs_fscache_data_read(req);
>> +    erofs_fscache_data_read(req, false);
>> +    erofs_fscache_req_put(req);
>> +}
>> +
>> +static ssize_t erofs_fscache_directIO(struct kiocb *iocb, struct 
>> iov_iter *iter)
> 
> How about get rid of upper case names, e.g.
> using erofs_fscache_direct_io instead?
> 
ok, I'll refine the name in later submission.
> Thanks,
> Gao Xiang
> 
>> +{
>> +    struct file *file = iocb->ki_filp;
>> +    size_t count = iov_iter_count(iter);
>> +    struct erofs_fscache_rq *req;
>> +    struct completion *ctr;
>> +    ssize_t rsize;
>> +    int ret;
>> +
>> +    if (unlikely(iov_iter_rw(iter) == WRITE))
>> +        return -EROFS;
>> +
>> +    req = erofs_fscache_req_alloc(file->f_mapping, iocb->ki_pos, count);
>> +    if (!req)
>> +        return -ENOMEM;
>> +
>> +    req->iter = iter;
>> +    init_completion(&req->done);
>> +    ctr = &req->done;
>> +    ret = erofs_fscache_data_read(req, true);
>> +    rsize = (ret == 0) ? (ssize_t)req->submitted : ret;
>>       erofs_fscache_req_put(req);
>> +    wait_for_completion(ctr);
>> +
>> +    return rsize;
>>   }
>>   static const struct address_space_operations erofs_fscache_meta_aops 
>> = {
>> @@ -380,6 +464,7 @@ static const struct address_space_operations 
>> erofs_fscache_meta_aops = {
>>   const struct address_space_operations erofs_fscache_access_aops = {
>>       .read_folio = erofs_fscache_read_folio,
>>       .readahead = erofs_fscache_readahead,
>> +    .direct_IO = erofs_fscache_directIO,
>>   };
>>   static void erofs_fscache_domain_put(struct erofs_domain *domain)


More information about the Linux-erofs mailing list