On Wed, Apr 02, 2025 at 02:31:01AM -0700, Pinku Deb Nath wrote: > Full Unit Access (FUA) is an optimization where a disk write with the > flag set will be persisted to disk immediately instead of potentially > remaining in the disk's write cache. This commit address the todo task > for using pwritev2() with RWF_DSYNC in the thread pool section of > raw_co_prw(), if pwritev2 with RWF_DSYNC is available in the host, > which is alway for Linux kernel >= 4.7. The intent for FUA is indicated
"alway" -> "always true"? > with the BDRV_REQ_FUA flag. The old code paths are preserved in case > BDRV_REQ_FUA is off or pwritev2() with RWF_DSYNC is not available. > > During testing, I observed that the BDRV_REQ_FUA is always turned on > when blk->enable_write_cache is not set in block/block-backend.c, so > I commented this section off during testing: > https://gitlab.com/qemu-project/qemu/-/blob/master/block/block-backend.c?ref_type=heads#L1432-1434 You can also use the qemu-io "open -t writeback <filename>" command to open the image file with enable_write_cache. > > Signed-off-by: Pinku Deb Nath <pranto...@gmail.com> > --- > block/file-posix.c | 42 ++++++++++++++++++++++++++++++++++-------- > 1 file changed, 34 insertions(+), 8 deletions(-) > > diff --git a/block/file-posix.c b/block/file-posix.c > index 56d1972d15..34de816eab 100644 > --- a/block/file-posix.c > +++ b/block/file-posix.c > @@ -229,6 +229,7 @@ typedef struct RawPosixAIOData { > unsigned long op; > } zone_mgmt; > }; > + BdrvRequestFlags flags; > } RawPosixAIOData; > > #if defined(__FreeBSD__) || defined(__FreeBSD_kernel__) > @@ -1674,6 +1675,16 @@ qemu_pwritev(int fd, const struct iovec *iov, int > nr_iov, off_t offset) > return pwritev(fd, iov, nr_iov, offset); > } > > +static ssize_t > +qemu_pwrite_fua(int fd, const struct iovec *iov, int nr_iov, off_t offset) Please call this function qemu_pwritev_fua() since it takes an iovec array. > +{ > +#ifdef RWF_DSYNC > + return pwritev2(fd, iov, nr_iov, offset, RWF_DSYNC); > +#else > + return pwritev2(fd, iov, nr_iov, offset, 0); In the #else case the function is not doing what its name says - FUA is skipped. Kevin mentioned that the flush could be moved into the handle function. How about calling handle_aiocb_flush() so that qemu_pwrite_fua() has the intended semantics even on hosts without RWF_DSYNC? > +#endif > +} > + > #else > > static bool preadv_present = false; > @@ -1698,10 +1709,15 @@ static ssize_t handle_aiocb_rw_vector(RawPosixAIOData > *aiocb) > > len = RETRY_ON_EINTR( > (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) ? > - qemu_pwritev(aiocb->aio_fildes, > - aiocb->io.iov, > - aiocb->io.niov, > - aiocb->aio_offset) : > + (aiocb->flags & BDRV_REQ_FUA) ? > + qemu_pwrite_fua(aiocb->aio_fildes, > + aiocb->io.iov, > + aiocb->io.niov, > + aiocb->aio_offset) : > + qemu_pwritev(aiocb->aio_fildes, > + aiocb->io.iov, > + aiocb->io.niov, > + aiocb->aio_offset) : > qemu_preadv(aiocb->aio_fildes, > aiocb->io.iov, > aiocb->io.niov, > @@ -1727,10 +1743,17 @@ static ssize_t handle_aiocb_rw_linear(RawPosixAIOData > *aiocb, char *buf) > > while (offset < aiocb->aio_nbytes) { > if (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) { > - len = pwrite(aiocb->aio_fildes, > - (const char *)buf + offset, > - aiocb->aio_nbytes - offset, > - aiocb->aio_offset + offset); > + if (aiocb->flags & BDRV_REQ_FUA) { > + len = qemu_pwrite_fua(aiocb->aio_fildes, > + aiocb->io.iov, > + aiocb->io.niov, > + aiocb->aio_offset); handle_aiocb_rw() can pass in a temporary buffer (instead of iov), so it's incorrect to use aiocb->io.iov here. Instead, create a local struct iov iov = { .iov_base = buf, .iov_len = aiocb->aio_nbytes - offset } here. > + } else { > + len = pwrite(aiocb->aio_fildes, > + (const char *)buf + offset, > + aiocb->aio_nbytes - offset, > + aiocb->aio_offset + offset); > + } > } else { > len = pread(aiocb->aio_fildes, > buf + offset, > @@ -2539,14 +2562,17 @@ static int coroutine_fn raw_co_prw(BlockDriverState > *bs, int64_t *offset_ptr, > .iov = qiov->iov, > .niov = qiov->niov, > }, > + .flags = flags, > }; > > assert(qiov->size == bytes); > ret = raw_thread_pool_submit(handle_aiocb_rw, &acb); > +#ifndef RWD_DSYNC RWF_DSYNC > if (ret == 0 && (flags & BDRV_REQ_FUA)) { > /* TODO Use pwritev2() instead if it's available */ This TODO comment can be deleted now. > ret = raw_co_flush_to_disk(bs); This can be removed if qemu_pwritev_fua() ensures that the flush has occurred. > } > +#endif > goto out; /* Avoid the compiler err of unused label */ > > out: > -- > 2.43.0 >
signature.asc
Description: PGP signature