The commit is pushed to "branch-rh9-5.14.0-427.37.1.vz9.78.x-ovz" and will appear at g...@bitbucket.org:openvz/vzkernel.git after rh9-5.14.0-427.37.1.vz9.78.3 ------> commit ead9b9aec946bcf8f4395f89e82310d180221f0b Author: Alexey Kuznetsov <kuz...@virtuozzo.com> Date: Fri Nov 8 00:01:10 2024 +0800
fs: fuse: kio: resolve race condition fallocate vs shrink It was missed kio/pcs module has warning on out of mutex execution of fallocate request. It is not a bug per se, yet the situation is dubious. The rationale was that logically fallocate(zerorange) is _not_ different of aio direct write of zeros to the same area, and that write used to be executed out of inode_lock. So asserting the lock in fallocate might be a noise. Yet we must comply completely. There is one difference, aio takes dio_write counter, which blocks concurrent truncates until write is completed. This is important mostly for shrinks, as expanding writes are not executed asynchronously, fallocates likewise. So, we take dio_write for async fallocate out of inode_lock, making it 100% equivalent to write. NOTE: the situation with expanding writes/fallocates is complicated. We might even have a zero day bug here, I do not see one now, but this does not mean we did not have there any holes. Just a note. Affects: #VSTOR-94829 https://virtuozzo.atlassian.net/browse/VSTOR-94829 Signed-off-by: Alexey Kuznetsov <kuz...@virtuozzo.com> Feature: vStorage --- fs/fuse/file.c | 11 +++++++++-- fs/fuse/kio/pcs/pcs_fuse_kdirect.c | 21 ++++++++++++++++----- 2 files changed, 25 insertions(+), 7 deletions(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 7bf7c9e45b98..195b77114d02 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -3450,8 +3450,13 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset, if (!(mode & FALLOC_FL_KEEP_SIZE)) set_bit(FUSE_I_SIZE_UNSTABLE, &fi->state); - if (revoke_lock) + if (revoke_lock) { + /* This blocks truncates which might happen after + * inode_lock is released + */ + fuse_write_dio_begin(fi); inode_unlock(inode); + } args.opcode = FUSE_FALLOCATE; args.io_inode = inode; @@ -3460,8 +3465,10 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset, args.in_args[0].size = sizeof(inarg); args.in_args[0].value = &inarg; err = fuse_simple_request(fm, &args); - if (revoke_lock) + if (revoke_lock) { inode_lock(inode); + fuse_write_dio_end(fi); + } if (err == -ENOSYS) { fm->fc->no_fallocate = 1; err = -EOPNOTSUPP; diff --git a/fs/fuse/kio/pcs/pcs_fuse_kdirect.c b/fs/fuse/kio/pcs/pcs_fuse_kdirect.c index 1bd43ecf3b74..5b141fe5bbd4 100644 --- a/fs/fuse/kio/pcs/pcs_fuse_kdirect.c +++ b/fs/fuse/kio/pcs/pcs_fuse_kdirect.c @@ -1019,19 +1019,19 @@ static void pcs_fuse_submit(struct pcs_fuse_cluster *pfc, struct fuse_req *req) break; case FUSE_FALLOCATE: { struct fuse_fallocate_in *inarg = (void*) args->in_args[0].value; + size_t sz = READ_ONCE(di->fileinfo.attr.size); if (pfc->fc->no_fallocate) { req->out.h.error = -EOPNOTSUPP; goto error; } - if (inarg->offset >= di->fileinfo.attr.size) + if (inarg->offset >= sz) inarg->mode &= ~FALLOC_FL_ZERO_RANGE; if (inarg->mode == FALLOC_FL_KEEP_SIZE) break; /* NOPE */ - WARN_ON_ONCE(!inode_is_locked(&fi->inode)); if (inarg->mode & (FALLOC_FL_ZERO_RANGE|FALLOC_FL_PUNCH_HOLE)) { if ((inarg->offset & (PAGE_SIZE - 1)) || (inarg->length & (PAGE_SIZE - 1))) { req->out.h.error = -EINVAL; @@ -1040,10 +1040,21 @@ static void pcs_fuse_submit(struct pcs_fuse_cluster *pfc, struct fuse_req *req) } if (inarg->mode & FALLOC_FL_KEEP_SIZE) { - if (inarg->offset > di->fileinfo.attr.size) + if (inarg->offset > sz) break; /* NOPE */ - if (inarg->offset + inarg->length > di->fileinfo.attr.size) - inarg->length = di->fileinfo.attr.size - inarg->offset; + if (inarg->offset + inarg->length > sz) + inarg->length = sz - inarg->offset; + } else { + /* attr.size can expand, but not shrink. If the file is expanded + * by fallocate inode_lock cannot be dropped. + * + * Interesting new situation to consider: update i_size happens of exit + * from allocating write/falloc and we have a window where attr.size is ahead + * of i_size. It looks safe as allocating call is deemed to be synchronous, + * and as i_size is still not advanced all the following ones are. + */ + WARN_ON_ONCE(inarg->offset + inarg->length > sz && + !inode_is_locked(&fi->inode)); } ret = pcs_fuse_prep_rw(r); _______________________________________________ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel