On Sat, Apr 05, 2025 at 04:52:29PM -0700, Pinku Deb Nath wrote: > Force Unit Access (FUA) is an optimization where a disk write with the > flag set will be persisted to disk immediately instead of potentially > remaining in the disk's write cache. > > This commit addresses the todo task > for using pwritev2() with RWF_DSYNC in the thread pool section of > raw_co_prw(), if pwritev2() with RWF_DSYNC is available in the host, > which is always the case for Linux kernel >= 4.7. > > The intent for FUA is indicated with the BDRV_REQ_FUA flag. > The old code paths are preserved in case BDRV_REQ_FUA is off > or pwritev2() with RWF_DSYNC is not available. > > Support for disk writes with FUA is handled in qemu_pwritev_fua(), > which uses pwritev2() with RWF_DSYNC if available, otherwise falls > back to pwritev2() with no flags followed by flush using > handle_aiocb_flush(). > > If pwritev2() is not implemented, then disk write in the linear FUA path > will fall back to pwrite() + handle_aiocb_flush(). > > Signed-off-by: Pinku Deb Nath <pranto...@gmail.com> > > --- > > v4: > - Add fallback when qemu_pwritev_fua() returns ENOSYS > - Similar fallback was not added for handle_aiocb_rw_vector() > since there is a preadv_present check in handle_aiocb_rw() > > v3: > - Changed signature to add fd, iov, nr_iov > - Return -ENOSYS for non-Linux hosts > > v2: > - Moved handle_aiocb_flush() into qemu_pwritev_fua() > - In handle_aiocb_rw_linear(), iovec with iovcnt=1 is created > based on the assumption that there will be only one buffer > --- > block/file-posix.c | 68 ++++++++++++++++++++++++++++++++++++++-------- > 1 file changed, 56 insertions(+), 12 deletions(-) > > diff --git a/block/file-posix.c b/block/file-posix.c > index 56d1972d15..59bed7866a 100644 > --- a/block/file-posix.c > +++ b/block/file-posix.c > @@ -229,6 +229,7 @@ typedef struct RawPosixAIOData { > unsigned long op; > } zone_mgmt; > }; > + BdrvRequestFlags flags; > } RawPosixAIOData; > > #if defined(__FreeBSD__) || 
defined(__FreeBSD_kernel__) > @@ -1674,6 +1675,20 @@ qemu_pwritev(int fd, const struct iovec *iov, int > nr_iov, off_t offset) > return pwritev(fd, iov, nr_iov, offset); > } > > +static ssize_t > +qemu_pwritev_fua(int fd, struct iovec *iov, int nr_iov, off_t offset, const > RawPosixAIOData *aiocb) > +{ > +#ifdef RWF_DSYNC > + return pwritev2(fd, iov, nr_iov, offset, RWF_DSYNC); > +#else > + ssize_t len = pwritev2(fd, iov, nr_iov, offset, 0);
This will fail to compile on non-Linux OSes that provide preadv(2) (CONFIG_PREADV) because they do not have pwritev2(2). This can be fixed by using pwritev() since the flags aren't needed: ssize_t len = pwritev(fd, iov, nr_iov, offset); > + if (len == 0) { > + len = handle_aiocb_flush(aiocb); > + } > + return len; > +#endif > +} > + > #else > > static bool preadv_present = false; > @@ -1690,6 +1705,11 @@ qemu_pwritev(int fd, const struct iovec *iov, int > nr_iov, off_t offset) > return -ENOSYS; > } > > +static ssize_t > +qemu_pwritev_fua(int fd, struct iovec *iov, int nr_iov, off_t offset, const > RawPosixAIOData *aiocb) > +{ > + return -ENOSYS; > +} > #endif > > static ssize_t handle_aiocb_rw_vector(RawPosixAIOData *aiocb) > @@ -1698,10 +1718,16 @@ static ssize_t handle_aiocb_rw_vector(RawPosixAIOData > *aiocb) > > len = RETRY_ON_EINTR( > (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) ? > - qemu_pwritev(aiocb->aio_fildes, > - aiocb->io.iov, > - aiocb->io.niov, > - aiocb->aio_offset) : > + (aiocb->flags & BDRV_REQ_FUA) ? 
> + qemu_pwritev_fua(aiocb->aio_fildes, > + aiocb->io.iov, > + aiocb->io.niov, > + aiocb->aio_offset, > + aiocb) : > + qemu_pwritev(aiocb->aio_fildes, > + aiocb->io.iov, > + aiocb->io.niov, > + aiocb->aio_offset) : > qemu_preadv(aiocb->aio_fildes, > aiocb->io.iov, > aiocb->io.niov, > @@ -1727,10 +1753,31 @@ static ssize_t handle_aiocb_rw_linear(RawPosixAIOData > *aiocb, char *buf) > > while (offset < aiocb->aio_nbytes) { > if (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) { > - len = pwrite(aiocb->aio_fildes, > - (const char *)buf + offset, > - aiocb->aio_nbytes - offset, > - aiocb->aio_offset + offset); > + if (aiocb->flags & BDRV_REQ_FUA) { > + struct iovec iov = { > + .iov_base = buf + offset, > + .iov_len = aiocb->aio_nbytes - offset, > + }; > + len = qemu_pwritev_fua(aiocb->aio_fildes, > + &iov, > + 1, > + aiocb->aio_offset + offset, > + aiocb); > + if (len == -ENOSYS) { > + len = pwrite(aiocb->aio_fildes, > + (const char *)buf + offset, > + aiocb->aio_nbytes - offset, > + aiocb->aio_offset + offset); > + if (len == 0) { > + len = handle_aiocb_flush(aiocb); > + } > + } > + } else { > + len = pwrite(aiocb->aio_fildes, > + (const char *)buf + offset, > + aiocb->aio_nbytes - offset, > + aiocb->aio_offset + offset); > + } > } else { > len = pread(aiocb->aio_fildes, > buf + offset, > @@ -2539,14 +2586,11 @@ static int coroutine_fn raw_co_prw(BlockDriverState > *bs, int64_t *offset_ptr, > .iov = qiov->iov, > .niov = qiov->niov, > }, > + .flags = flags, > }; > > assert(qiov->size == bytes); > ret = raw_thread_pool_submit(handle_aiocb_rw, &acb); > - if (ret == 0 && (flags & BDRV_REQ_FUA)) { > - /* TODO Use pwritev2() instead if it's available */ > - ret = raw_co_flush_to_disk(bs); > - } > goto out; /* Avoid the compiler err of unused label */ > > out: > -- > 2.43.0 >
signature.asc
Description: PGP signature