On Sat, Apr 05, 2025 at 04:52:29PM -0700, Pinku Deb Nath wrote:
> Force Unit Access (FUA) is an optimization where a disk write with the
> flag set will be persisted to disk immediately instead of potentially
> remaining in the disk's write cache.
> 
> This commit addresses the TODO task
> for using pwritev2() with RWF_DSYNC in the thread pool section of
> raw_co_prw(), if pwritev2() with RWF_DSYNC is available in the host,
> which is always the case for Linux kernel >= 4.7.
> 
> The intent for FUA is indicated with the BDRV_REQ_FUA flag.
> The old code paths are preserved in case BDRV_REQ_FUA is off
> or pwritev2() with RWF_DSYNC is not available.
> 
> Support for disk writes with FUA is handled in qemu_pwritev_fua(),
> which uses pwritev2() with RWF_DSYNC if available, otherwise falls
> back to pwritev2() with no flags followed by flush using
> handle_aiocb_flush().
> 
> If pwritev2() is not implemented, then the disk write in the linear FUA
> path will fall back to pwrite() + handle_aiocb_flush().
> 
> Signed-off-by: Pinku Deb Nath <pranto...@gmail.com>
> 
> ---
> 
> v4:
> - Add fallback when qemu_pwritev_fua() returns ENOSYS
> - Similar fallback was not added for handle_aiocb_rw_vector()
> since there is a preadv_present check in handle_aiocb_rw()
> 
> v3:
> - Changed signature to add fd, iov, nr_iov
> - Return -ENOSYS for non-Linux hosts
> 
> v2:
> - Moved handle_aiocb_flush() into qemu_pwritev_fua()
> - In handle_aiocb_rw_linear(), iovec with iovcnt=1 is created
> based on the assumption that there will be only one buffer
> ---
>  block/file-posix.c | 68 ++++++++++++++++++++++++++++++++++++++--------
>  1 file changed, 56 insertions(+), 12 deletions(-)
> 
> diff --git a/block/file-posix.c b/block/file-posix.c
> index 56d1972d15..59bed7866a 100644
> --- a/block/file-posix.c
> +++ b/block/file-posix.c
> @@ -229,6 +229,7 @@ typedef struct RawPosixAIOData {
>              unsigned long op;
>          } zone_mgmt;
>      };
> +    BdrvRequestFlags flags;
>  } RawPosixAIOData;
>  
>  #if defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
> @@ -1674,6 +1675,20 @@ qemu_pwritev(int fd, const struct iovec *iov, int 
> nr_iov, off_t offset)
>      return pwritev(fd, iov, nr_iov, offset);
>  }
>  
> +static ssize_t
> +qemu_pwritev_fua(int fd, struct iovec *iov, int nr_iov, off_t offset, const 
> RawPosixAIOData *aiocb)
> +{
> +#ifdef RWF_DSYNC
> +    return pwritev2(fd, iov, nr_iov, offset, RWF_DSYNC);
> +#else
> +    ssize_t len = pwritev2(fd, iov, nr_iov, offset, 0);

This will fail to compile on non-Linux OSes that provide preadv(2)
(CONFIG_PREADV) because they do not have pwritev2(2). This can be fixed
by using pwritev() since the flags aren't needed:

  ssize_t len = pwritev(fd, iov, nr_iov, offset);

> +    if (len == 0) {
> +        len = handle_aiocb_flush(aiocb);
> +    }
> +    return len;
> +#endif
> +}
> +
>  #else
>  
>  static bool preadv_present = false;
> @@ -1690,6 +1705,11 @@ qemu_pwritev(int fd, const struct iovec *iov, int 
> nr_iov, off_t offset)
>      return -ENOSYS;
>  }
>  
> +static ssize_t
> +qemu_pwritev_fua(int fd, struct iovec *iov, int nr_iov, off_t offset, const 
> RawPosixAIOData *aiocb)
> +{
> +    return -ENOSYS;
> +}
>  #endif
>  
>  static ssize_t handle_aiocb_rw_vector(RawPosixAIOData *aiocb)
> @@ -1698,10 +1718,16 @@ static ssize_t handle_aiocb_rw_vector(RawPosixAIOData 
> *aiocb)
>  
>      len = RETRY_ON_EINTR(
>          (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) ?
> -            qemu_pwritev(aiocb->aio_fildes,
> -                           aiocb->io.iov,
> -                           aiocb->io.niov,
> -                           aiocb->aio_offset) :
> +            (aiocb->flags &  BDRV_REQ_FUA) ?
> +                qemu_pwritev_fua(aiocb->aio_fildes,
> +                                aiocb->io.iov,
> +                                aiocb->io.niov,
> +                                aiocb->aio_offset,
> +                                aiocb) :
> +                qemu_pwritev(aiocb->aio_fildes,
> +                            aiocb->io.iov,
> +                            aiocb->io.niov,
> +                            aiocb->aio_offset) :
>              qemu_preadv(aiocb->aio_fildes,
>                            aiocb->io.iov,
>                            aiocb->io.niov,
> @@ -1727,10 +1753,31 @@ static ssize_t handle_aiocb_rw_linear(RawPosixAIOData 
> *aiocb, char *buf)
>  
>      while (offset < aiocb->aio_nbytes) {
>          if (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) {
> -            len = pwrite(aiocb->aio_fildes,
> -                         (const char *)buf + offset,
> -                         aiocb->aio_nbytes - offset,
> -                         aiocb->aio_offset + offset);
> +            if (aiocb->flags & BDRV_REQ_FUA) {
> +                struct iovec iov = {
> +                    .iov_base = buf + offset,
> +                    .iov_len = aiocb->aio_nbytes - offset,
> +                };
> +                len = qemu_pwritev_fua(aiocb->aio_fildes,
> +                                    &iov,
> +                                    1,
> +                                    aiocb->aio_offset + offset,
> +                                    aiocb);
> +                if (len == -ENOSYS) {
> +                    len = pwrite(aiocb->aio_fildes,
> +                                (const char *)buf + offset,
> +                                aiocb->aio_nbytes - offset,
> +                                aiocb->aio_offset + offset);
> +                    if (len == 0) {
> +                        len = handle_aiocb_flush(aiocb);
> +                    }
> +                }
> +            } else {
> +                len = pwrite(aiocb->aio_fildes,
> +                            (const char *)buf + offset,
> +                            aiocb->aio_nbytes - offset,
> +                            aiocb->aio_offset + offset);
> +            }
>          } else {
>              len = pread(aiocb->aio_fildes,
>                          buf + offset,
> @@ -2539,14 +2586,11 @@ static int coroutine_fn raw_co_prw(BlockDriverState 
> *bs, int64_t *offset_ptr,
>              .iov            = qiov->iov,
>              .niov           = qiov->niov,
>          },
> +        .flags          = flags,
>      };
>  
>      assert(qiov->size == bytes);
>      ret = raw_thread_pool_submit(handle_aiocb_rw, &acb);
> -    if (ret == 0 && (flags & BDRV_REQ_FUA)) {
> -        /* TODO Use pwritev2() instead if it's available */
> -        ret = raw_co_flush_to_disk(bs);
> -    }
>      goto out; /* Avoid the compiler err of unused label */
>  
>  out:
> -- 
> 2.43.0
> 

Attachment: signature.asc
Description: PGP signature

Reply via email to