On Thu, May 29, 2025 at 04:11:21PM -0500, Eric Blake wrote:
> On Wed, May 28, 2025 at 03:09:16PM -0400, Stefan Hajnoczi wrote:
> > AioContext has its own io_uring instance for file descriptor monitoring.
> > The disk I/O io_uring code was developed separately. Originally I
> > thought the characteristics of file descriptor monitoring and disk I/O
> > were too different, requiring separate io_uring instances.
> > 
> > Now it has become clear to me that it's feasible to share a single
> > io_uring instance for file descriptor monitoring and disk I/O. We're not
> > using io_uring's IOPOLL feature or anything else that would require a
> > separate instance.
> > 
> > Unify block/io_uring.c and util/fdmon-io_uring.c using the new
> > aio_add_sqe() API that allows user-defined io_uring sqe submission. Now
> > block/io_uring.c just needs to submit readv/writev/fsync and most of the
> > io_uring-specific logic is handled by fdmon-io_uring.c.
> > 
> > There are two immediate advantages:
> > 1. Fewer system calls. There is no need to monitor the disk I/O io_uring
> >    ring fd from the file descriptor monitoring io_uring instance. Disk
> >    I/O completions are now picked up directly. Also, sqes are
> >    accumulated in the sq ring until the end of the event loop iteration
> >    and there are fewer io_uring_enter(2) syscalls.
> > 2. Less code duplication.
> > 
> > Signed-off-by: Stefan Hajnoczi <stefa...@redhat.com>
> > ---
> 
> Comments below, but looks sane to me.
> 
> Reviewed-by: Eric Blake <ebl...@redhat.com>
> 
> >  include/block/aio.h     |   7 -
> >  include/block/raw-aio.h |   5 -
> >  block/file-posix.c      |  38 ++--
> >  block/io_uring.c        | 489 ++++++++++------------------------------
> >  stubs/io_uring.c        |  32 ---
> >  util/async.c            |  35 ---
> >  util/fdmon-io_uring.c   |   6 +
> >  block/trace-events      |  12 +-
> >  stubs/meson.build       |   3 -
> >  util/trace-events       |   4 +
> >  10 files changed, 139 insertions(+), 492 deletions(-)
> >  delete mode 100644 stubs/io_uring.c
> > 
> > diff --git a/include/block/aio.h b/include/block/aio.h
> > index 95beef28c3..fbb45cca74 100644
> > --- a/include/block/aio.h
> > +++ b/include/block/aio.h
> > @@ -291,8 +291,6 @@ struct AioContext {
> >      struct LinuxAioState *linux_aio;
> >  #endif
> >  #ifdef CONFIG_LINUX_IO_URING
> > -    LuringState *linux_io_uring;
> > -
> >      /* State for file descriptor monitoring using Linux io_uring */
> >      struct io_uring fdmon_io_uring;
> >      AioHandlerSList submit_list;
> > @@ -597,11 +595,6 @@ struct LinuxAioState *aio_setup_linux_aio(AioContext 
> > *ctx, Error **errp);
> >  /* Return the LinuxAioState bound to this AioContext */
> >  struct LinuxAioState *aio_get_linux_aio(AioContext *ctx);
> >  
> > -/* Setup the LuringState bound to this AioContext */
> > -LuringState *aio_setup_linux_io_uring(AioContext *ctx, Error **errp);
> > -
> > -/* Return the LuringState bound to this AioContext */
> > -LuringState *aio_get_linux_io_uring(AioContext *ctx);
> >  /**
> >   * aio_timer_new_with_attrs:
> >   * @ctx: the aio context
> > diff --git a/include/block/raw-aio.h b/include/block/raw-aio.h
> > index 6570244496..30e5fc9a9f 100644
> > --- a/include/block/raw-aio.h
> > +++ b/include/block/raw-aio.h
> > @@ -74,15 +74,10 @@ static inline bool laio_has_fua(void)
> >  #endif
> >  /* io_uring.c - Linux io_uring implementation */
> >  #ifdef CONFIG_LINUX_IO_URING
> > -LuringState *luring_init(Error **errp);
> > -void luring_cleanup(LuringState *s);
> > -
> >  /* luring_co_submit: submit I/O requests in the thread's current 
> > AioContext. */
> >  int coroutine_fn luring_co_submit(BlockDriverState *bs, int fd, uint64_t 
> > offset,
> >                                    QEMUIOVector *qiov, int type,
> >                                    BdrvRequestFlags flags);
> > -void luring_detach_aio_context(LuringState *s, AioContext *old_context);
> > -void luring_attach_aio_context(LuringState *s, AioContext *new_context);
> >  bool luring_has_fua(void);
> >  #else
> >  static inline bool luring_has_fua(void)
> > diff --git a/block/file-posix.c b/block/file-posix.c
> > index 9b5f08ccb2..d1f1fc3a77 100644
> > --- a/block/file-posix.c
> > +++ b/block/file-posix.c
> > @@ -755,14 +755,23 @@ static int raw_open_common(BlockDriverState *bs, 
> > QDict *options,
> >      }
> >  #endif /* !defined(CONFIG_LINUX_AIO) */
> >  
> > -#ifndef CONFIG_LINUX_IO_URING
> >      if (s->use_linux_io_uring) {
> > +#ifdef CONFIG_LINUX_IO_URING
> > +        if (!aio_has_io_uring()) {
> 
> Compared to the old code... [1]
> 
> > +            error_setg(errp, "aio=io_uring was specified, but is not "
> > +                             "available (disabled via io_uring_disabled "
> > +                             "sysctl or blocked by container runtime "
> > +                             "seccomp policy?)");
> > +            ret = -EINVAL;
> > +            goto fail;
> > +        }
> > +#else
> >          error_setg(errp, "aio=io_uring was specified, but is not supported 
> > "
> >                           "in this build.");
> 
> While here, let's get rid of the trailing '.' in the error_setg call.

Sounds good.

> 
> 
> >          ret = -EINVAL;
> >          goto fail;
> > -    }
> >  #endif /* !defined(CONFIG_LINUX_IO_URING) */
> > +    }
> >  
> >      s->has_discard = true;
> >      s->has_write_zeroes = true;
> > @@ -2522,27 +2531,6 @@ static bool bdrv_qiov_is_aligned(BlockDriverState 
> > *bs, QEMUIOVector *qiov)
> >      return true;
> >  }
> >  
> > -#ifdef CONFIG_LINUX_IO_URING
> > -static inline bool raw_check_linux_io_uring(BDRVRawState *s)
> > -{
> > -    Error *local_err = NULL;
> > -    AioContext *ctx;
> > -
> > -    if (!s->use_linux_io_uring) {
> > -        return false;
> > -    }
> > -
> > -    ctx = qemu_get_current_aio_context();
> > -    if (unlikely(!aio_setup_linux_io_uring(ctx, &local_err))) {
> 
> [1]... is there a reason you dropped the unlikely() wrapper?

raw_check_linux_io_uring() was called from the I/O code path where a
branch prediction hint might make sense for performance reasons. The
raw_open_common() you mentioned above is not in the I/O code path.

Also, the new code in raw_open_common() is not the same as
raw_check_linux_io_uring(). aio_setup_linux_io_uring() no longer exists
in the new code. The patch removes this code entirely, it doesn't just
drop unlikely().

> 
> > -        error_reportf_err(local_err, "Unable to use linux io_uring, "
> > -                                     "falling back to thread pool: ");
> > -        s->use_linux_io_uring = false;
> > -        return false;
> > -    }
> > -    return true;
> > -}
> > -#endif
> > -
> >  #ifdef CONFIG_LINUX_AIO
> >  static inline bool raw_check_linux_aio(BDRVRawState *s)
> >  {
> > @@ -2595,7 +2583,7 @@ static int coroutine_fn raw_co_prw(BlockDriverState 
> > *bs, int64_t *offset_ptr,
> >      if (s->needs_alignment && !bdrv_qiov_is_aligned(bs, qiov)) {
> >          type |= QEMU_AIO_MISALIGNED;
> >  #ifdef CONFIG_LINUX_IO_URING
> > -    } else if (raw_check_linux_io_uring(s)) {
> > +    } else if (s->use_linux_io_uring) {
> >          assert(qiov->size == bytes);
> >          ret = luring_co_submit(bs, s->fd, offset, qiov, type, flags);
> >          goto out;
> > @@ -2692,7 +2680,7 @@ static int coroutine_fn 
> > raw_co_flush_to_disk(BlockDriverState *bs)
> >      };
> >  
> >  #ifdef CONFIG_LINUX_IO_URING
> > -    if (raw_check_linux_io_uring(s)) {
> > +    if (s->use_linux_io_uring) {
> >          return luring_co_submit(bs, s->fd, 0, NULL, QEMU_AIO_FLUSH, 0);
> >      }
> >  #endif
> > diff --git a/block/io_uring.c b/block/io_uring.c
> > index dd4f304910..dd930ee57e 100644
> > --- a/block/io_uring.c
> > +++ b/block/io_uring.c
> > @@ -11,28 +11,20 @@
> >  #include "qemu/osdep.h"
> >  #include <liburing.h>
> >  #include "block/aio.h"
> > -#include "qemu/queue.h"
> >  #include "block/block.h"
> >  #include "block/raw-aio.h"
> >  #include "qemu/coroutine.h"
> > -#include "qemu/defer-call.h"
> > -#include "qapi/error.h"
> >  #include "system/block-backend.h"
> >  #include "trace.h"
> >  
> > -/* Only used for assertions.  */
> > -#include "qemu/coroutine_int.h"
> > -
> > -/* io_uring ring size */
> > -#define MAX_ENTRIES 128
> > -
> > -typedef struct LuringAIOCB {
> > +typedef struct {
> >      Coroutine *co;
> > -    struct io_uring_sqe sqeq;
> > -    ssize_t ret;
> >      QEMUIOVector *qiov;
> > -    bool is_read;
> > -    QSIMPLEQ_ENTRY(LuringAIOCB) next;
> > +    uint64_t offset;
> > +    ssize_t ret;
> > +    int type;
> > +    int fd;
> > +    BdrvRequestFlags flags;
> >  
> >      /*
> >       * Buffered reads may require resubmission, see
> > @@ -40,36 +32,51 @@ typedef struct LuringAIOCB {
> >       */
> >      int total_read;
> >      QEMUIOVector resubmit_qiov;
> > -} LuringAIOCB;
> >  
> > -typedef struct LuringQueue {
> > -    unsigned int in_queue;
> > -    unsigned int in_flight;
> > -    bool blocked;
> > -    QSIMPLEQ_HEAD(, LuringAIOCB) submit_queue;
> > -} LuringQueue;
> > +    CqeHandler cqe_handler;
> > +} LuringRequest;
> >  
> > -struct LuringState {
> > -    AioContext *aio_context;
> > -
> > -    struct io_uring ring;
> > -
> > -    /* No locking required, only accessed from AioContext home thread */
> > -    LuringQueue io_q;
> > -
> > -    QEMUBH *completion_bh;
> > -};
> > -
> > -/**
> > - * luring_resubmit:
> > - *
> > - * Resubmit a request by appending it to submit_queue.  The caller must 
> > ensure
> > - * that ioq_submit() is called later so that submit_queue requests are 
> > started.
> > - */
> > -static void luring_resubmit(LuringState *s, LuringAIOCB *luringcb)
> > +static void luring_prep_sqe(struct io_uring_sqe *sqe, void *opaque)
> >  {
> > -    QSIMPLEQ_INSERT_TAIL(&s->io_q.submit_queue, luringcb, next);
> > -    s->io_q.in_queue++;
> > +    LuringRequest *req = opaque;
> > +    QEMUIOVector *qiov = req->qiov;
> > +    uint64_t offset = req->offset;
> > +    int fd = req->fd;
> > +    BdrvRequestFlags flags = req->flags;
> > +
> > +    switch (req->type) {
> > +    case QEMU_AIO_WRITE:
> > +#ifdef HAVE_IO_URING_PREP_WRITEV2
> > +    {
> > +        int luring_flags = (flags & BDRV_REQ_FUA) ? RWF_DSYNC : 0;
> > +        io_uring_prep_writev2(sqe, fd, qiov->iov,
> > +                              qiov->niov, offset, luring_flags);
> > +    }
> > +#else
> > +        assert(flags == 0);
> > +        io_uring_prep_writev(sqe, fd, qiov->iov, qiov->niov, offset);
> 
> Hmm.  'man io_uring_prep_writev2' states:
> 
>        Unless an application explicitly needs to pass in more than  one  
> iovec,
>        it  is  more  efficient  to  use io_uring_prep_write(3) rather than 
> this
>        function, as no state has to be maintained for  a  non-vectored  IO  
> re‐
>        quest.
> 
> Obviously, if we want luring_flags of RWF_DSYNC to be set, we have to
> use the newer interface; but if that flag is not present, should we be
> conditionally falling back to the simpler interface when qiov->niov ==
> 1?

Thanks for the suggestion. I will try it out and benchmark it for v2.

> 
> In fact, even when qiov->niov > 1, can we unvector it ourselves into
> multiple io_uring_prep_write() calls, since the whole point of the
> uring is that we aren't making syscalls, so more ops on the uring
> should still be cheaper?  But that's a question for a followup patch
> (still, given that your series is RFC for performance reasons, it may
> be worth investigating).

That's more complicated. I won't pursue that for the time being.

Attachment: signature.asc
Description: PGP signature

Reply via email to