On Wed, Apr 02, 2025 at 02:31:01AM -0700, Pinku Deb Nath wrote:
> Full Unit Access (FUA) is an optimization where a disk write with the
> flag set will be persisted to disk immediately instead of potentially
> remaining in the disk's write cache. This commit address the todo task
> for using pwritev2() with RWF_DSYNC in the thread pool section of
> raw_co_prw(), if pwritev2 with RWF_DSYNC is available in the host,
> which is alway for Linux kernel >= 4.7. The intent for FUA is indicated

"alway" -> "always true"?

> with the BDRV_REQ_FUA flag. The old code paths are preserved in case
> BDRV_REQ_FUA is off or pwritev2() with RWF_DSYNC is not available.
> 
> During testing, I observed that the BDRV_REQ_FUA is always turned on
> when blk->enable_write_cache is not set in block/block-backend.c, so
> I commented this section off during testing:
> https://gitlab.com/qemu-project/qemu/-/blob/master/block/block-backend.c?ref_type=heads#L1432-1434

You can also use the qemu-io "open -t writeback <filename>" command to
open the image file with enable_write_cache.

> 
> Signed-off-by: Pinku Deb Nath <pranto...@gmail.com>
> ---
>  block/file-posix.c | 42 ++++++++++++++++++++++++++++++++++--------
>  1 file changed, 34 insertions(+), 8 deletions(-)
> 
> diff --git a/block/file-posix.c b/block/file-posix.c
> index 56d1972d15..34de816eab 100644
> --- a/block/file-posix.c
> +++ b/block/file-posix.c
> @@ -229,6 +229,7 @@ typedef struct RawPosixAIOData {
>              unsigned long op;
>          } zone_mgmt;
>      };
> +    BdrvRequestFlags flags;
>  } RawPosixAIOData;
>  
>  #if defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
> @@ -1674,6 +1675,16 @@ qemu_pwritev(int fd, const struct iovec *iov, int 
> nr_iov, off_t offset)
>      return pwritev(fd, iov, nr_iov, offset);
>  }
>  
> +static ssize_t
> +qemu_pwrite_fua(int fd, const struct iovec *iov, int nr_iov, off_t offset)

Please call this function qemu_pwritev_fua() since it takes an iovec
array.

> +{
> +#ifdef RWF_DSYNC
> +    return pwritev2(fd, iov, nr_iov, offset, RWF_DSYNC);
> +#else
> +    return pwritev2(fd, iov, nr_iov, offset, 0);

In the #else case the function is not doing what its name says - FUA is
skipped. Kevin mentioned that the flush could be moved into the handle
function. How about calling handle_aiocb_flush() so that
qemu_pwrite_fua() has the intended semantics even on hosts without
RWF_DSYNC?

> +#endif
> +}
> +
>  #else
>  
>  static bool preadv_present = false;
> @@ -1698,10 +1709,15 @@ static ssize_t handle_aiocb_rw_vector(RawPosixAIOData 
> *aiocb)
>  
>      len = RETRY_ON_EINTR(
>          (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) ?
> -            qemu_pwritev(aiocb->aio_fildes,
> -                           aiocb->io.iov,
> -                           aiocb->io.niov,
> -                           aiocb->aio_offset) :
> +            (aiocb->flags &  BDRV_REQ_FUA) ?
> +                qemu_pwrite_fua(aiocb->aio_fildes,
> +                                aiocb->io.iov,
> +                                aiocb->io.niov,
> +                                aiocb->aio_offset) :
> +                qemu_pwritev(aiocb->aio_fildes,
> +                            aiocb->io.iov,
> +                            aiocb->io.niov,
> +                            aiocb->aio_offset) :
>              qemu_preadv(aiocb->aio_fildes,
>                            aiocb->io.iov,
>                            aiocb->io.niov,
> @@ -1727,10 +1743,17 @@ static ssize_t handle_aiocb_rw_linear(RawPosixAIOData 
> *aiocb, char *buf)
>  
>      while (offset < aiocb->aio_nbytes) {
>          if (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) {
> -            len = pwrite(aiocb->aio_fildes,
> -                         (const char *)buf + offset,
> -                         aiocb->aio_nbytes - offset,
> -                         aiocb->aio_offset + offset);
> +            if (aiocb->flags & BDRV_REQ_FUA) {
> +                len = qemu_pwrite_fua(aiocb->aio_fildes,
> +                                    aiocb->io.iov,
> +                                    aiocb->io.niov,
> +                                    aiocb->aio_offset);

handle_aiocb_rw() can pass in a temporary buffer (instead of iov), so
it's incorrect to use aiocb->io.iov here. Instead, create a local struct
iov iov = { .iov_base = buf, .iov_len = aiocb->aio_nbytes - offset }
here.

> +            } else {
> +                len = pwrite(aiocb->aio_fildes,
> +                            (const char *)buf + offset,
> +                            aiocb->aio_nbytes - offset,
> +                            aiocb->aio_offset + offset);
> +            }
>          } else {
>              len = pread(aiocb->aio_fildes,
>                          buf + offset,
> @@ -2539,14 +2562,17 @@ static int coroutine_fn raw_co_prw(BlockDriverState 
> *bs, int64_t *offset_ptr,
>              .iov            = qiov->iov,
>              .niov           = qiov->niov,
>          },
> +        .flags          = flags,
>      };
>  
>      assert(qiov->size == bytes);
>      ret = raw_thread_pool_submit(handle_aiocb_rw, &acb);
> +#ifndef RWD_DSYNC

RWF_DSYNC

>      if (ret == 0 && (flags & BDRV_REQ_FUA)) {
>          /* TODO Use pwritev2() instead if it's available */

This TODO comment can be deleted now.

>          ret = raw_co_flush_to_disk(bs);

This can be removed if qemu_pwritev_fua() ensures that the flush has
occurred.

>      }
> +#endif
>      goto out; /* Avoid the compiler err of unused label */
>  
>  out:
> -- 
> 2.43.0
> 

Attachment: signature.asc
Description: PGP signature

Reply via email to