The io_uring(7) file descriptor monitor cannot enter polling mode because it needs to submit a POLL_ADD SQE every time a file descriptor becomes active. SQEs are only submitted in FDMonOps->wait(), which is only called outside of polling mode.
Fix this using the multi-shot mechanism introduced in Linux 5.13 and liburing 2.1. Stable and enterprise Linux distros ship 5.14+ as of March 2025, so it is safe to require this. Note that fdmon-io_uring is currently not enabled at runtime and is not essential, so QEMU can still be built without it on older hosts. In multi-shot mode, a POLL_ADD SQE remains active until canceled with POLL_REMOVE. This avoids the need to submit a new SQE every time a file descriptor becomes active. When POLL_REMOVE is processed by the host kernel, the multi-shot POLL_ADD operation completes with -ECANCELED. Adjust the code slightly to take this into account. Signed-off-by: Stefan Hajnoczi <stefa...@redhat.com> --- meson.build | 2 +- util/fdmon-io_uring.c | 34 +++++++++++++++++++++------------- 2 files changed, 22 insertions(+), 14 deletions(-) diff --git a/meson.build b/meson.build index 41f68d3806..9f5f31ac46 100644 --- a/meson.build +++ b/meson.build @@ -1144,7 +1144,7 @@ linux_io_uring_test = ''' linux_io_uring = not_found if not get_option('linux_io_uring').auto() or have_block - linux_io_uring = dependency('liburing', version: '>=0.3', + linux_io_uring = dependency('liburing', version: '>=2.1', required: get_option('linux_io_uring'), method: 'pkg-config') if not cc.links(linux_io_uring_test) diff --git a/util/fdmon-io_uring.c b/util/fdmon-io_uring.c index b0d68bdc44..6cd665e565 100644 --- a/util/fdmon-io_uring.c +++ b/util/fdmon-io_uring.c @@ -124,8 +124,7 @@ static AioHandler *dequeue(AioHandlerSList *head, unsigned *flags) /* * Don't clear FDMON_IO_URING_REMOVE. It's sticky so it can serve two * purposes: telling fill_sq_ring() to submit IORING_OP_POLL_REMOVE and - * telling process_cqe() to delete the AioHandler when its - * IORING_OP_POLL_ADD completes. + * telling process_cqe() to ignore IORING_OP_POLL_ADD completions. 
*/ *flags = qatomic_fetch_and(&node->flags, ~(FDMON_IO_URING_PENDING | FDMON_IO_URING_ADD)); @@ -166,12 +165,12 @@ static void fdmon_io_uring_update(AioContext *ctx, } } -static void add_poll_add_sqe(AioContext *ctx, AioHandler *node) +static void add_poll_multishot_sqe(AioContext *ctx, AioHandler *node) { struct io_uring_sqe *sqe = get_sqe(ctx); int events = poll_events_from_pfd(node->pfd.events); - io_uring_prep_poll_add(sqe, node->pfd.fd, events); + io_uring_prep_poll_multishot(sqe, node->pfd.fd, events); io_uring_sqe_set_data(sqe, node); } @@ -213,7 +212,7 @@ static void fill_sq_ring(AioContext *ctx) while ((node = dequeue(&submit_list, &flags))) { /* Order matters, just in case both flags were set */ if (flags & FDMON_IO_URING_ADD) { - add_poll_add_sqe(ctx, node); + add_poll_multishot_sqe(ctx, node); } if (flags & FDMON_IO_URING_REMOVE) { add_poll_remove_sqe(ctx, node); @@ -234,21 +233,30 @@ static bool process_cqe(AioContext *ctx, return false; } + flags = qatomic_read(&node->flags); + /* - * Deletion can only happen when IORING_OP_POLL_ADD completes. If we race - * with enqueue() here then we can safely clear the FDMON_IO_URING_REMOVE - * bit before IORING_OP_POLL_REMOVE is submitted. + * poll_multishot cancelled by poll_remove? Or completed early because fd + * was closed before poll_remove finished? 
*/ - flags = qatomic_fetch_and(&node->flags, ~FDMON_IO_URING_REMOVE); - if (flags & FDMON_IO_URING_REMOVE) { + if (cqe->res == -ECANCELED || cqe->res == -EBADF) { + assert(!(cqe->flags & IORING_CQE_F_MORE)); + assert(flags & FDMON_IO_URING_REMOVE); QLIST_INSERT_HEAD_RCU(&ctx->deleted_aio_handlers, node, node_deleted); return false; } - aio_add_ready_handler(ready_list, node, pfd_events_from_poll(cqe->res)); + /* Ignore if it becomes ready during removal */ + if (flags & FDMON_IO_URING_REMOVE) { + return false; + } - /* IORING_OP_POLL_ADD is one-shot so we must re-arm it */ - add_poll_add_sqe(ctx, node); + /* Multi-shot can stop at any time, so re-arm if necessary */ + if (!(cqe->flags & IORING_CQE_F_MORE)) { + add_poll_multishot_sqe(ctx, node); + } + + aio_add_ready_handler(ready_list, node, pfd_events_from_poll(cqe->res)); return true; } -- 2.49.0