Hi,

On 9/3/2024 4:44 PM, Jingbo Xu wrote:
>
> On 8/31/24 5:37 PM, Hou Tao wrote:
>> From: Hou Tao <hout...@huawei.com>
>>
>> When trying to insert a 10MB kernel module kept in a virtio-fs with cache
>> disabled, the following warning was reported:
>>

SNIP
>>
>> Fixes: a62a8ef9d97d ("virtio-fs: add virtiofs filesystem")
>> Signed-off-by: Hou Tao <hout...@huawei.com>
> Tested-by: Jingbo Xu <jeffl...@linux.alibaba.com>

Thanks for the test.
>
>
>> ---
>>  fs/fuse/file.c      | 62 +++++++++++++++++++++++++++++++--------------
>>  fs/fuse/fuse_i.h    |  6 +++++
>>  fs/fuse/virtio_fs.c |  1 +
>>  3 files changed, 50 insertions(+), 19 deletions(-)
>>
>> diff --git a/fs/fuse/file.c b/fs/fuse/file.c
>> index f39456c65ed7..331208d3e4d1 100644
>> --- a/fs/fuse/file.c
>> +++ b/fs/fuse/file.c
>> @@ -645,7 +645,7 @@ void fuse_read_args_fill(struct fuse_io_args *ia, struct 
>> file *file, loff_t pos,
>>      args->out_args[0].size = count;
>>  }
>>  
>> -

SNIP
>>  static int fuse_get_user_pages(struct fuse_args_pages *ap, struct iov_iter 
>> *ii,
>>                             size_t *nbytesp, int write,
>> -                           unsigned int max_pages)
>> +                           unsigned int max_pages,
>> +                           bool use_pages_for_kvec_io)
>>  {
>> +    bool flush_or_invalidate = false;
>>      size_t nbytes = 0;  /* # bytes already packed in req */
>>      ssize_t ret = 0;
>>  
>> -    /* Special case for kernel I/O: can copy directly into the buffer */
>> +    /* Special case for kernel I/O: can copy directly into the buffer.
>> +     * However if the implementation of fuse_conn requires pages instead of
>> +     * pointer (e.g., virtio-fs), use iov_iter_extract_pages() instead.
>> +     */
>>      if (iov_iter_is_kvec(ii)) {
>> -            unsigned long user_addr = fuse_get_user_addr(ii);
>> -            size_t frag_size = fuse_get_frag_size(ii, *nbytesp);
>> +            void *user_addr = (void *)fuse_get_user_addr(ii);
>>  
>> -            if (write)
>> -                    ap->args.in_args[1].value = (void *) user_addr;
>> -            else
>> -                    ap->args.out_args[0].value = (void *) user_addr;
>> +            if (!use_pages_for_kvec_io) {
>> +                    size_t frag_size = fuse_get_frag_size(ii, *nbytesp);
>>  
>> -            iov_iter_advance(ii, frag_size);
>> -            *nbytesp = frag_size;
>> -            return 0;
>> +                    if (write)
>> +                            ap->args.in_args[1].value = user_addr;
>> +                    else
>> +                            ap->args.out_args[0].value = user_addr;
>> +
>> +                    iov_iter_advance(ii, frag_size);
>> +                    *nbytesp = frag_size;
>> +                    return 0;
>> +            }
>> +
>> +            if (is_vmalloc_addr(user_addr)) {
>> +                    ap->args.vmap_base = user_addr;
>> +                    flush_or_invalidate = true;
> Could we move flush_kernel_vmap_range() upon here, so that
> flush_or_invalidate is not needed anymore and the code looks cleaner?

flush_kernel_vmap_range() needs to know the length of the flushed area,
if moving it here(), the length will be unknown.
>
>> +            }
>>      }
>>  
>>      while (nbytes < *nbytesp && ap->num_pages < max_pages) {
>> @@ -1513,6 +1533,10 @@ static int fuse_get_user_pages(struct fuse_args_pages 
>> *ap, struct iov_iter *ii,
>>                      (PAGE_SIZE - ret) & (PAGE_SIZE - 1);
>>      }
>>  
>> +    if (write && flush_or_invalidate)
>> +            flush_kernel_vmap_range(ap->args.vmap_base, nbytes);
>> +
>> +    ap->args.invalidate_vmap = !write && flush_or_invalidate;
> How about initializing vmap_base only when the data buffer is vmalloced
> and it's a read request?  In this case invalidate_vmap is no longer needed.

You mean using the value of vmap_base to indicate whether invalidation
is needed or not, right ? I prefer to keep it, because the extra
variable invalidate_vmap indicates the required action for the vmap area
and it doesn't increase the size of fuse_args.

>
>>      ap->args.is_pinned = iov_iter_extract_will_pin(ii);
>>      ap->args.user_pages = true;
>>      if (write)
>> @@ -1581,7 +1605,7 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct 
>> iov_iter *iter,
>>              size_t nbytes = min(count, nmax);
>>  
>>              err = fuse_get_user_pages(&ia->ap, iter, &nbytes, write,
>> -                                      max_pages);
>> +                                      max_pages, fc->use_pages_for_kvec_io);
>>              if (err && !nbytes)
>>                      break;
>>  
>> @@ -1595,7 +1619,7 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct 
>> iov_iter *iter,
>>              }
>>  
>>              if (!io->async || nres < 0) {
>> -                    fuse_release_user_pages(&ia->ap, io->should_dirty);
>> +                    fuse_release_user_pages(&ia->ap, nres, 
>> io->should_dirty);
>>                      fuse_io_free(ia);
>>              }
>>              ia = NULL;
>> diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
>> index f23919610313..79add14c363f 100644
>> --- a/fs/fuse/fuse_i.h
>> +++ b/fs/fuse/fuse_i.h
>> @@ -309,9 +309,12 @@ struct fuse_args {
>>      bool may_block:1;
>>      bool is_ext:1;
>>      bool is_pinned:1;
>> +    bool invalidate_vmap:1;
>>      struct fuse_in_arg in_args[3];
>>      struct fuse_arg out_args[2];
>>      void (*end)(struct fuse_mount *fm, struct fuse_args *args, int error);
>> +    /* Used for kvec iter backed by vmalloc address */
>> +    void *vmap_base;
>>  };
>>  
>>  struct fuse_args_pages {
>> @@ -860,6 +863,9 @@ struct fuse_conn {
>>      /** Passthrough support for read/write IO */
>>      unsigned int passthrough:1;
>>  
>> +    /* Use pages instead of pointer for kernel I/O */
>> +    unsigned int use_pages_for_kvec_io:1;
> Maybe we need a better (actually shorter) name for this flag. kvec_pages?

Naming is hard. The name "use_pages_for_kvec_io" is verbose indeed.
kvec_pages is better. Will update it in the next spin.
>
>> +
>>      /** Maximum stack depth for passthrough backing files */
>>      int max_stack_depth;
>>  
>> diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c
>> index dd5260141615..43d66ab5e891 100644
>> --- a/fs/fuse/virtio_fs.c
>> +++ b/fs/fuse/virtio_fs.c
>> @@ -1568,6 +1568,7 @@ static int virtio_fs_get_tree(struct fs_context *fsc)
>>      fc->delete_stale = true;
>>      fc->auto_submounts = true;
>>      fc->sync_fs = true;
>> +    fc->use_pages_for_kvec_io = true;
>>  
>>      /* Tell FUSE to split requests that exceed the virtqueue's size */
>>      fc->max_pages_limit = min_t(unsigned int, fc->max_pages_limit,


Reply via email to