Am 07.04.2025 um 17:47 hat Pinku Deb Nath geschrieben:
> Full Unit Access (FUA) is an optimization where a disk write with the
> flag set will be persisted to disk immediately instead of potentially
> remaining in the disk's write cache.
> 
> This commit address the todo task
> for using pwritev2() with RWF_DSYNC in the thread pool section of
> raw_co_prw(), if pwritev2() with RWF_DSYNC is available in the host,
> which is always the case for Linux kernel >= 4.7.
> 
> The intent for FUA is indicated with the BDRV_REQ_FUA flag.
> The old code paths are preserved in case BDRV_REQ_FUA is off
> or pwritev2() with RWF_DSYNC is not available.
> 
> Support for disk writes with FUA is handled in qemu_pwritev_fua(),
> which uses pwritev2() with RWF_DSYNC if available, otherwise falls
> back to pwritev2() with no flags followed by flush using
> handle_aiocb_flush().
> 
> If pwritev2() is not implemented, then disk write in the linear FUA
> will fallback to pwrite() + handle_aiocb_flush().
> 
> Signed-off-by: Pinku Deb Nath <pranto...@gmail.com>
> 
> ---
> 
> v5:
> - Use pwritev for unsupported OSes
> 
> v4:
> - Add fallback when qemu_pwritev_fua() returns ENOSYS
> - Similar fallback was not added for handle_aiocb_rw_vector()
> since there is a preadv_present check in handle_aiocb_rw()
> 
> v3:
> - Changed signature to add fd, iov, nr_iov
> - Return -ENOSYS for non-Linux hosts
> 
> v2:
> - Moved handle_aiocb_flush() into qemu_pwritev_fua()
> - In handle_aiocb_rw_linear(), iovec with iovcnt=1 is created
> based on the assumption that there will be only one buffer
> ---
>  block/file-posix.c | 68 ++++++++++++++++++++++++++++++++++++++--------
>  1 file changed, 56 insertions(+), 12 deletions(-)
> 
> diff --git a/block/file-posix.c b/block/file-posix.c
> index 56d1972d15..380f709917 100644
> --- a/block/file-posix.c
> +++ b/block/file-posix.c
> @@ -229,6 +229,7 @@ typedef struct RawPosixAIOData {
>              unsigned long op;
>          } zone_mgmt;
>      };
> +    BdrvRequestFlags flags;
>  } RawPosixAIOData;
>  
>  #if defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
> @@ -1674,6 +1675,20 @@ qemu_pwritev(int fd, const struct iovec *iov, int 
> nr_iov, off_t offset)
>      return pwritev(fd, iov, nr_iov, offset);
>  }
>  
> +static ssize_t
> +qemu_pwritev_fua(int fd, struct iovec *iov, int nr_iov, off_t offset, 
> RawPosixAIOData *aiocb)
> +{
> +#ifdef RWF_DSYNC
> +    return pwritev2(fd, iov, nr_iov, offset, RWF_DSYNC);

If pwritev2() fails with ENOSYS, we'll disable even pwritev(). I think
we should only try pwritev2(), and if it returns ENOSYS, we can execute
the code below with pwritev() + handle_aiocb_flush(), which would then
be outside the #ifdef.

Similar to preadv_present, we can have a bool pwritev2_present to avoid
trying pwritev2() every time when we already know it's not there.

> +#else
> +    ssize_t len = pwritev(fd, iov, nr_iov, offset);
> +    if (len == 0) {

pwritev() doesn't return 0, but the number of written bytes on success
(you even called the result 'len'!), so with len == 0, we would only
flush if someone tries to write at EOF of a block device or something.

> +        len = handle_aiocb_flush(aiocb);
> +    }
> +    return len;
> +#endif
> +}
> +
>  #else
>  
>  static bool preadv_present = false;
> @@ -1690,6 +1705,11 @@ qemu_pwritev(int fd, const struct iovec *iov, int 
> nr_iov, off_t offset)
>      return -ENOSYS;
>  }
>  
> +static ssize_t
> +qemu_pwritev_fua(int fd, struct iovec *iov, int nr_iov, off_t offset, const 
> RawPosixAIOData *aiocb)
> +{
> +    return -ENOSYS;
> +}

This is inconsistent with the interface of the function above. The
function above returns -1 with errno set on error, this one directly
returns a negative errno code. We need to pick one interface and stick
to it.

The existing definitions for qemu_preadv() are misleading because they
have the same problem - but they are never actually called because
preadv_present is always false. But qemu_pwritev_fua() is called below
in handle_aiocb_rw_linear() without checking preadv_present, so it
doesn't have the same excuse.

If you check a new pwritev2_present in handle_aiocb_rw_linear() and
initialise it as false in this #else block, then what we return here
doesn't matter any more and you can keep it consistent with the other
(misleading) functions.

(I think we should actually replace the existing return -ENOSYS with
g_assert_not_reached(), but that is for a separate patch.)

>  #endif
>  
>  static ssize_t handle_aiocb_rw_vector(RawPosixAIOData *aiocb)
> @@ -1698,10 +1718,16 @@ static ssize_t handle_aiocb_rw_vector(RawPosixAIOData 
> *aiocb)
>  
>      len = RETRY_ON_EINTR(
>          (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) ?
> -            qemu_pwritev(aiocb->aio_fildes,
> -                           aiocb->io.iov,
> -                           aiocb->io.niov,
> -                           aiocb->aio_offset) :
> +            (aiocb->flags &  BDRV_REQ_FUA) ?

Double space.

> +                qemu_pwritev_fua(aiocb->aio_fildes,
> +                                aiocb->io.iov,
> +                                aiocb->io.niov,
> +                                aiocb->aio_offset,
> +                                aiocb) :
> +                qemu_pwritev(aiocb->aio_fildes,
> +                            aiocb->io.iov,
> +                            aiocb->io.niov,
> +                            aiocb->aio_offset) :
>              qemu_preadv(aiocb->aio_fildes,
>                            aiocb->io.iov,
>                            aiocb->io.niov,
> @@ -1727,10 +1753,31 @@ static ssize_t handle_aiocb_rw_linear(RawPosixAIOData 
> *aiocb, char *buf)
>  
>      while (offset < aiocb->aio_nbytes) {
>          if (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) {
> -            len = pwrite(aiocb->aio_fildes,
> -                         (const char *)buf + offset,
> -                         aiocb->aio_nbytes - offset,
> -                         aiocb->aio_offset + offset);
> +            if (aiocb->flags & BDRV_REQ_FUA) {
> +                struct iovec iov = {
> +                    .iov_base = buf + offset,
> +                    .iov_len = aiocb->aio_nbytes - offset,
> +                };
> +                len = qemu_pwritev_fua(aiocb->aio_fildes,
> +                                    &iov,
> +                                    1,
> +                                    aiocb->aio_offset + offset,
> +                                    aiocb);
> +                if (len == -ENOSYS) {

This requires checking errno instead, like other instances in this
function do.

> +                    len = pwrite(aiocb->aio_fildes,
> +                                (const char *)buf + offset,
> +                                aiocb->aio_nbytes - offset,
> +                                aiocb->aio_offset + offset);
> +                    if (len == 0) {

Same problem as above with len == 0.

> +                        len = handle_aiocb_flush(aiocb);
> +                    }
> +                }
> +            } else {
> +                len = pwrite(aiocb->aio_fildes,
> +                            (const char *)buf + offset,
> +                            aiocb->aio_nbytes - offset,
> +                            aiocb->aio_offset + offset);
> +            }
>          } else {
>              len = pread(aiocb->aio_fildes,
>                          buf + offset,
> @@ -2539,14 +2586,11 @@ static int coroutine_fn raw_co_prw(BlockDriverState 
> *bs, int64_t *offset_ptr,
>              .iov            = qiov->iov,
>              .niov           = qiov->niov,
>          },
> +        .flags          = flags,
>      };
>  
>      assert(qiov->size == bytes);
>      ret = raw_thread_pool_submit(handle_aiocb_rw, &acb);
> -    if (ret == 0 && (flags & BDRV_REQ_FUA)) {
> -        /* TODO Use pwritev2() instead if it's available */
> -        ret = raw_co_flush_to_disk(bs);
> -    }
>      goto out; /* Avoid the compiler err of unused label */
>  
>  out:

Kevin


Reply via email to