On Mon, May 27, 2019 at 01:33:23PM +0530, Aarushi Mehta wrote: > +static void qemu_luring_process_completions(LuringState *s) > +{ > + struct io_uring_cqe *cqes; > + /* > + * Request completion callbacks can run the nested event loop. > + * Schedule ourselves so the nested event loop will "see" remaining > + * completed requests and process them. Without this, completion > + * callbacks that wait for other requests using a nested event loop > + * would hang forever. > + */ > + qemu_bh_schedule(s->completion_bh); > + > + while (!io_uring_peek_cqe(&s->ring, &cqes)) { > + io_uring_cqe_seen(&s->ring, cqes);
The kernel may overwrite the cqe once we've marked it seen. Therefore the cqe must only be marked seen after the last access to it. This is analogous to a use-after-free bug: we're not allowed to access fields of an object after it has been freed. The place to do so is... > + > + LuringAIOCB *luringcb = io_uring_cqe_get_data(cqes); > + luringcb->ret = io_cqe_ret(cqes); ...here: io_uring_cqe_seen(&s->ring, cqes); cqes = NULL; /* returned to ring, don't access it anymore */ > + if (luringcb->co) { > + /* > + * If the coroutine is already entered it must be in ioq_submit() > + * and will notice luringcb->ret has been filled in when it > + * eventually runs later. Coroutines cannot be entered > recursively > + * so avoid doing that! > + */ > + if (!qemu_coroutine_entered(luringcb->co)) { > + aio_co_wake(luringcb->co); > + } > + } else { > + luringcb->common.cb(luringcb->common.opaque, luringcb->ret); > + qemu_aio_unref(luringcb); > + } > + /* Change counters one-by-one because we can be nested. */ > + s->io_q.in_flight--; This counter must be decremented before invoking luringcb's callback. That way the nested event loop doesn't consider this completed request in flight anymore. > +static void ioq_submit(LuringState *s) > +{ > + int ret; > + LuringAIOCB *luringcb, *luringcb_next; > + > + while(!s->io_q.in_queue) { Should this be while (s->io_q.in_queue > 0)? > + QSIMPLEQ_FOREACH_SAFE(luringcb, &s->io_q.sq_overflow, next, > + luringcb_next) { > + struct io_uring_sqe *sqes = io_uring_get_sqe(&s->ring); > + if (!sqes) { > + break; > + } > + /* Prep sqe for submission */ > + *sqes = luringcb->sqeq; > + io_uring_sqe_set_data(sqes, luringcb); This is unnecessary, the data field has already been set in luring_do_submit() and copied to *sqes in the previous line. > +BlockAIOCB *luring_submit(BlockDriverState *bs, LuringState *s, int fd, > + int64_t sector_num, QEMUIOVector *qiov, BlockCompletionFunc *cb, > + void *opaque, int type) > +{ > + LuringAIOCB *luringcb; > + off_t offset = sector_num * BDRV_SECTOR_SIZE; > + int ret; > + > + luringcb = qemu_aio_get(&luring_aiocb_info, bs, cb, opaque); > + luringcb->ret = -EINPROGRESS; luringcb isn't zeroed by qemu_aio_get(). luringcb->co must be explicitly set to NULL to prevent undefined behavior in qemu_luring_process_completions() (uninitialized memory access). luring->co = NULL; By the way, this bug originates from linux-aio.c. I have sent a patch to fix it there! > + ret = luring_do_submit(fd, luringcb, s, offset, qiov, type); > + if (ret < 0) { > + qemu_aio_unref(luringcb); > + return NULL; > + } > + > + return &luringcb->common; > +} > + > +void luring_detach_aio_context(LuringState *s, AioContext *old_context) > +{ > + aio_set_fd_handler(old_context, s->ring.ring_fd, false, NULL, NULL, NULL, > + &s); > + qemu_bh_delete(s->completion_bh); > + s->aio_context = NULL; > +} > + > +void luring_attach_aio_context(LuringState *s, AioContext *new_context) > +{ > + s->aio_context = new_context; > + s->completion_bh = aio_bh_new(new_context, qemu_luring_completion_bh, s); > + aio_set_fd_handler(s->aio_context, s->ring.ring_fd, false, > + qemu_luring_completion_cb, NULL, NULL, &s); > +} > + > +LuringState *luring_init(Error **errp) > +{ > + int rc; > + LuringState *s; > + s = g_malloc0(sizeof(*s)); > + struct io_uring *ring = &s->ring; > + rc = io_uring_queue_init(MAX_EVENTS, ring, 0); > + if (rc == -1) { > + error_setg_errno(errp, -rc, "failed to init linux io_uring ring"); Why was this changed from error_setg_errno(errp, errno, "failed to init linux io_uring ring") to -rc in v3? rc is -1 here, not an errno value, so the error message will be incorrect.
signature.asc
Description: PGP signature