This commit fills in read/write/mmap handling for famfs files. The dev_dax_iomap interface is used - just like xfs in fs-dax mode.
Signed-off-by: John Groves <j...@groves.net> --- fs/fuse/famfs.c | 432 ++++++++++++++++++++++++++++++++++++++++++++++++ fs/fuse/file.c | 14 ++ 2 files changed, 446 insertions(+) diff --git a/fs/fuse/famfs.c b/fs/fuse/famfs.c index 2e182cb7d7c9..8c12e8bd96b2 100644 --- a/fs/fuse/famfs.c +++ b/fs/fuse/famfs.c @@ -603,3 +603,435 @@ famfs_file_init_dax( return rc; } +/********************************************************************* + * iomap_operations + * + * This stuff uses the iomap (dax-related) helpers to resolve file offsets to + * offsets within a dax device. + */ + +static ssize_t famfs_file_invalid(struct inode *inode); + +static int +famfs_meta_to_dax_offset_v2(struct inode *inode, struct iomap *iomap, + loff_t file_offset, off_t len, unsigned int flags) +{ + struct fuse_inode *fi = get_fuse_inode(inode); + struct famfs_file_meta *meta = fi->famfs_meta; + struct fuse_conn *fc = get_fuse_conn(inode); + loff_t local_offset = file_offset; + int i; + + /* This function is only for extent_type INTERLEAVED_EXTENT */ + if (meta->fm_extent_type != INTERLEAVED_EXTENT) { + pr_err("%s: bad extent type\n", __func__); + goto err_out; + } + + if (famfs_file_invalid(inode)) + goto err_out; + + iomap->offset = file_offset; + + for (i = 0; i < meta->fm_niext; i++) { + struct famfs_meta_interleaved_ext *fei = &meta->ie[i]; + u64 chunk_size = fei->fie_chunk_size; + u64 nstrips = fei->fie_nstrips; + u64 ext_size = fei->fie_nbytes; + + ext_size = min_t(u64, ext_size, meta->file_size); + + if (ext_size == 0) { + pr_err("%s: ext_size=%lld file_size=%ld\n", + __func__, fei->fie_nbytes, meta->file_size); + goto err_out; + } + + /* Is the data is in this striped extent? */ + if (local_offset < ext_size) { + u64 chunk_num = local_offset / chunk_size; + u64 chunk_offset = local_offset % chunk_size; + u64 stripe_num = chunk_num / nstrips; + u64 strip_num = chunk_num % nstrips; + u64 chunk_remainder = chunk_size - chunk_offset; + u64 strip_offset = chunk_offset + (stripe_num * chunk_size); + u64 strip_dax_ofs = fei->ie_strips[strip_num].ext_offset; + u64 strip_devidx = fei->ie_strips[strip_num].dev_index; + + if (!fc->dax_devlist->devlist[strip_devidx].valid) { + pr_err("%s: daxdev=%lld invalid\n", __func__, + strip_devidx); + goto err_out; + } + iomap->addr = strip_dax_ofs + strip_offset; + iomap->offset = file_offset; + iomap->length = min_t(loff_t, len, chunk_remainder); + + iomap->dax_dev = fc->dax_devlist->devlist[strip_devidx].devp; + + iomap->type = IOMAP_MAPPED; + iomap->flags = flags; + + return 0; + } + local_offset -= ext_size; /* offset is beyond this striped extent */ + } + + err_out: + pr_err("%s: err_out\n", __func__); + + /* We fell out the end of the extent list. + * Set iomap to zero length in this case, and return 0 + * This just means that the r/w is past EOF + */ + iomap->addr = 0; /* there is no valid dax device offset */ + iomap->offset = file_offset; /* file offset */ + iomap->length = 0; /* this had better result in no access to dax mem */ + iomap->dax_dev = NULL; + iomap->type = IOMAP_MAPPED; + iomap->flags = flags; + + return 0; +} + +/** + * famfs_meta_to_dax_offset() - Resolve (file, offset, len) to (daxdev, offset, len) + * + * This function is called by famfs_iomap_begin() to resolve an offset in a + * file to an offset in a dax device. This is upcalled from dax from calls to + * both * dax_iomap_fault() and dax_iomap_rw(). Dax finishes the job resolving + * a fault to a specific physical page (the fault case) or doing a memcpy + * variant (the rw case) + * + * Pages can be PTE (4k), PMD (2MiB) or (theoretically) PuD (1GiB) + * (these sizes are for X86; may vary on other cpu architectures + * + * @inode: The file where the fault occurred + * @iomap: To be filled in to indicate where to find the right memory, + * relative to a dax device. + * @file_offset: Within the file where the fault occurred (will be page boundary) + * @len: The length of the faulted mapping (will be a page multiple) + * (will be trimmed in *iomap if it's disjoint in the extent list) + * @flags: + * + * Return values: 0. (info is returned in a modified @iomap struct) + */ +static int +famfs_meta_to_dax_offset(struct inode *inode, struct iomap *iomap, + loff_t file_offset, off_t len, unsigned int flags) +{ + struct fuse_inode *fi = get_fuse_inode(inode); + struct famfs_file_meta *meta = fi->famfs_meta; + struct fuse_conn *fc = get_fuse_conn(inode); + loff_t local_offset = file_offset; + int i; + + if (!fc->dax_devlist) { + pr_err("%s: null dax_devlist\n", __func__); + goto err_out; + } + + if (famfs_file_invalid(inode)) + goto err_out; + + if (meta->fm_extent_type == INTERLEAVED_EXTENT) + return famfs_meta_to_dax_offset_v2(inode, iomap, file_offset, + len, flags); + + iomap->offset = file_offset; + + for (i = 0; i < meta->fm_nextents; i++) { + /* TODO: check devindex too */ + loff_t dax_ext_offset = meta->se[i].ext_offset; + loff_t dax_ext_len = meta->se[i].ext_len; + u64 daxdev_idx = meta->se[i].dev_index; + + if ((dax_ext_offset == 0) && + (meta->file_type != FAMFS_SUPERBLOCK)) + pr_warn("%s: zero offset on non-superblock file!!\n", + __func__); + + /* local_offset is the offset minus the size of extents skipped + * so far; If local_offset < dax_ext_len, the data of interest + * starts in this extent + */ + if (local_offset < dax_ext_len) { + loff_t ext_len_remainder = dax_ext_len - local_offset; + + if (!fc->dax_devlist->devlist[daxdev_idx].valid) { + pr_err("%s: daxdev=%lld invalid\n", __func__, + daxdev_idx); + goto err_out; + } + + /* + * OK, we found the file metadata extent where this + * data begins + * @local_offset - The offset within the current + * extent + * @ext_len_remainder - Remaining length of ext after + * skipping local_offset + * Outputs: + * iomap->addr: the offset within the dax device where + * the data starts + * iomap->offset: the file offset + * iomap->length: the valid length resolved here + */ + iomap->addr = dax_ext_offset + local_offset; + iomap->offset = file_offset; + iomap->length = min_t(loff_t, len, ext_len_remainder); + + iomap->dax_dev = fc->dax_devlist->devlist[daxdev_idx].devp; + + iomap->type = IOMAP_MAPPED; + iomap->flags = flags; + return 0; + } + local_offset -= dax_ext_len; /* Get ready for the next extent */ + } + + err_out: + pr_err("%s: err_out\n", __func__); + + /* We fell out the end of the extent list. + * Set iomap to zero length in this case, and return 0 + * This just means that the r/w is past EOF + */ + iomap->addr = 0; /* there is no valid dax device offset */ + iomap->offset = file_offset; /* file offset */ + iomap->length = 0; /* this had better result in no access to dax mem */ + iomap->dax_dev = NULL; + iomap->type = IOMAP_MAPPED; + iomap->flags = flags; + + return 0; +} + +/** + * famfs_iomap_begin() - Handler for iomap_begin upcall from dax + * + * This function is pretty simple because files are + * * never partially allocated + * * never have holes (never sparse) + * * never "allocate on write" + * + * @inode: inode for the file being accessed + * @offset: offset within the file + * @length: Length being accessed at offset + * @flags: + * @iomap: iomap struct to be filled in, resolving (offset, length) to + * (daxdev, offset, len) + * @srcmap: + */ +static int +famfs_iomap_begin(struct inode *inode, loff_t offset, loff_t length, + unsigned int flags, struct iomap *iomap, struct iomap *srcmap) +{ + struct fuse_inode *fi = get_fuse_inode(inode); + struct famfs_file_meta *meta = fi->famfs_meta; + size_t size; + + size = i_size_read(inode); + + WARN_ON(size != meta->file_size); + + return famfs_meta_to_dax_offset(inode, iomap, offset, length, flags); +} + +/* Note: We never need a special set of write_iomap_ops because famfs never + * performs allocation on write. + */ +const struct iomap_ops famfs_iomap_ops = { + .iomap_begin = famfs_iomap_begin, +}; + +/********************************************************************* + * vm_operations + */ +static vm_fault_t +__famfs_filemap_fault(struct vm_fault *vmf, unsigned int pe_size, + bool write_fault) +{ + struct inode *inode = file_inode(vmf->vma->vm_file); + vm_fault_t ret; + pfn_t pfn; + + if (!IS_DAX(file_inode(vmf->vma->vm_file))) { + pr_err("%s: file not marked IS_DAX!!\n", __func__); + return VM_FAULT_SIGBUS; + } + + if (write_fault) { + sb_start_pagefault(inode->i_sb); + file_update_time(vmf->vma->vm_file); + } + + ret = dax_iomap_fault(vmf, pe_size, &pfn, NULL, &famfs_iomap_ops); + if (ret & VM_FAULT_NEEDDSYNC) + ret = dax_finish_sync_fault(vmf, pe_size, pfn); + + if (write_fault) + sb_end_pagefault(inode->i_sb); + + return ret; +} + +static inline bool +famfs_is_write_fault(struct vm_fault *vmf) +{ + return (vmf->flags & FAULT_FLAG_WRITE) && + (vmf->vma->vm_flags & VM_SHARED); +} + +static vm_fault_t +famfs_filemap_fault(struct vm_fault *vmf) +{ + return __famfs_filemap_fault(vmf, 0, famfs_is_write_fault(vmf)); +} + +static vm_fault_t +famfs_filemap_huge_fault(struct vm_fault *vmf, unsigned int pe_size) +{ + return __famfs_filemap_fault(vmf, pe_size, famfs_is_write_fault(vmf)); +} + +static vm_fault_t +famfs_filemap_page_mkwrite(struct vm_fault *vmf) +{ + return __famfs_filemap_fault(vmf, 0, true); +} + +static vm_fault_t +famfs_filemap_pfn_mkwrite(struct vm_fault *vmf) +{ + return __famfs_filemap_fault(vmf, 0, true); +} + +static vm_fault_t +famfs_filemap_map_pages(struct vm_fault *vmf, pgoff_t start_pgoff, + pgoff_t end_pgoff) +{ + return filemap_map_pages(vmf, start_pgoff, end_pgoff); +} + +const struct vm_operations_struct famfs_file_vm_ops = { + .fault = famfs_filemap_fault, + .huge_fault = famfs_filemap_huge_fault, + .map_pages = famfs_filemap_map_pages, + .page_mkwrite = famfs_filemap_page_mkwrite, + .pfn_mkwrite = famfs_filemap_pfn_mkwrite, +}; + +/********************************************************************* + * file_operations + */ + +/* Reject I/O to files that aren't in a valid state */ +static ssize_t +famfs_file_invalid(struct inode *inode) +{ + struct fuse_inode *fi = get_fuse_inode(inode); + struct famfs_file_meta *meta = fi->famfs_meta; + size_t i_size = i_size_read(inode); + + if (!meta) { + pr_debug("%s: un-initialized famfs file\n", __func__); + return -EIO; + } + if (meta->error) { + pr_debug("%s: previously detected metadata errors\n", __func__); + return -EIO; + } + if (i_size != meta->file_size) { + pr_warn("%s: i_size overwritten from %ld to %ld\n", + __func__, meta->file_size, i_size); + meta->error = true; + return -ENXIO; + } + if (!IS_DAX(inode)) { + pr_debug("%s: inode %llx IS_DAX is false\n", __func__, (u64)inode); + return -ENXIO; + } + return 0; +} + +static ssize_t +famfs_rw_prep(struct kiocb *iocb, struct iov_iter *ubuf) +{ + struct inode *inode = iocb->ki_filp->f_mapping->host; + size_t i_size = i_size_read(inode); + size_t count = iov_iter_count(ubuf); + size_t max_count; + ssize_t rc; + + rc = famfs_file_invalid(inode); + if (rc) + return rc; + + max_count = max_t(size_t, 0, i_size - iocb->ki_pos); + + if (count > max_count) + iov_iter_truncate(ubuf, max_count); + + if (!iov_iter_count(ubuf)) + return 0; + + return rc; +} + +ssize_t +famfs_dax_read_iter(struct kiocb *iocb, struct iov_iter *to) +{ + ssize_t rc; + + rc = famfs_rw_prep(iocb, to); + if (rc) + return rc; + + if (!iov_iter_count(to)) + return 0; + + rc = dax_iomap_rw(iocb, to, &famfs_iomap_ops); + + file_accessed(iocb->ki_filp); + return rc; +} + +/** + * famfs_dax_write_iter() + * + * We need our own write-iter in order to prevent append + * + * @iocb: + * @from: iterator describing the user memory source for the write + */ +ssize_t +famfs_dax_write_iter(struct kiocb *iocb, struct iov_iter *from) +{ + ssize_t rc; + + rc = famfs_rw_prep(iocb, from); + if (rc) + return rc; + + if (!iov_iter_count(from)) + return 0; + + return dax_iomap_rw(iocb, from, &famfs_iomap_ops); +} + +int +famfs_file_mmap(struct file *file, struct vm_area_struct *vma) +{ + struct inode *inode = file_inode(file); + ssize_t rc; + + rc = famfs_file_invalid(inode); + if (rc) + return (int)rc; + + file_accessed(file); + vma->vm_ops = &famfs_file_vm_ops; + vm_flags_set(vma, VM_HUGEPAGE); + return 0; +} diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 6f10ae54e710..11201195924d 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1777,6 +1777,8 @@ static ssize_t fuse_file_read_iter(struct kiocb *iocb, struct iov_iter *to) if (FUSE_IS_VIRTIO_DAX(fi)) return fuse_dax_read_iter(iocb, to); + if (fuse_file_famfs(fi)) + return famfs_dax_read_iter(iocb, to); /* FOPEN_DIRECT_IO overrides FOPEN_PASSTHROUGH */ if (ff->open_flags & FOPEN_DIRECT_IO) @@ -1799,6 +1801,8 @@ static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from) if (FUSE_IS_VIRTIO_DAX(fi)) return fuse_dax_write_iter(iocb, from); + if (fuse_file_famfs(fi)) + return famfs_dax_write_iter(iocb, from); /* FOPEN_DIRECT_IO overrides FOPEN_PASSTHROUGH */ if (ff->open_flags & FOPEN_DIRECT_IO) @@ -1814,10 +1818,14 @@ static ssize_t fuse_splice_read(struct file *in, loff_t *ppos, unsigned int flags) { struct fuse_file *ff = in->private_data; + struct inode *inode = file_inode(in); + struct fuse_inode *fi = get_fuse_inode(inode); /* FOPEN_DIRECT_IO overrides FOPEN_PASSTHROUGH */ if (fuse_file_passthrough(ff) && !(ff->open_flags & FOPEN_DIRECT_IO)) return fuse_passthrough_splice_read(in, ppos, pipe, len, flags); + else if (fuse_file_famfs(fi)) + return -EIO; /* direct I/O doesn't make sense in dax_iomap */ else return filemap_splice_read(in, ppos, pipe, len, flags); } @@ -1826,10 +1834,14 @@ static ssize_t fuse_splice_write(struct pipe_inode_info *pipe, struct file *out, loff_t *ppos, size_t len, unsigned int flags) { struct fuse_file *ff = out->private_data; + struct inode *inode = file_inode(out); + struct fuse_inode *fi = get_fuse_inode(inode); /* FOPEN_DIRECT_IO overrides FOPEN_PASSTHROUGH */ if (fuse_file_passthrough(ff) && !(ff->open_flags & FOPEN_DIRECT_IO)) return fuse_passthrough_splice_write(pipe, out, ppos, len, flags); + else if (fuse_file_famfs(fi)) + return -EIO; /* direct I/O doesn't make sense in dax_iomap */ else return iter_file_splice_write(pipe, out, ppos, len, flags); } @@ -2635,6 +2647,8 @@ static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma) /* DAX mmap is superior to direct_io mmap */ if (FUSE_IS_VIRTIO_DAX(fi)) return fuse_dax_mmap(file, vma); + if (fuse_file_famfs(fi)) + return famfs_file_mmap(file, vma); /* * If inode is in passthrough io mode, because it has some file open -- 2.49.0