On 16/11/2016 18:47, Stefan Hajnoczi wrote: > The AioContext event loop uses ppoll(2) or epoll_wait(2) to monitor file > descriptors or until a timer expires. In cases like virtqueues, Linux > AIO, and ThreadPool it is technically possible to wait for events via > polling (i.e. continuously checking for events without blocking). > > Polling can be faster than blocking syscalls because file descriptors, > the process scheduler, and system calls are bypassed. > > The main disadvantage to polling is that it increases CPU utilization. > In classic polling configuration a full host CPU thread might run at > 100% to respond to events as quickly as possible. This patch implements > a timeout so we fall back to blocking syscalls if polling detects no > activity. After the timeout no CPU cycles are wasted on polling until > the next event loop iteration. > > This patch implements an experimental polling mode that can be > controlled with the QEMU_AIO_POLL_MAX_NS=<nanoseconds> environment > variable. The aio_poll() event loop function will attempt to poll > instead of using blocking syscalls. > > The run_poll_handlers_begin() and run_poll_handlers_end() trace events > are added to aid performance analysis and troubleshooting. If you need > to know whether polling mode is being used, trace these events to find > out. > > Signed-off-by: Stefan Hajnoczi <stefa...@redhat.com> > --- > aio-posix.c | 107 > +++++++++++++++++++++++++++++++++++++++++++++++++++- > async.c | 11 +++++- > include/block/aio.h | 3 ++ > trace-events | 4 ++ > 4 files changed, 123 insertions(+), 2 deletions(-)
Nice! > diff --git a/aio-posix.c b/aio-posix.c > index 4379c13..5e5a561 100644 > --- a/aio-posix.c > +++ b/aio-posix.c > @@ -18,6 +18,8 @@ > #include "block/block.h" > #include "qemu/queue.h" > #include "qemu/sockets.h" > +#include "qemu/cutils.h" > +#include "trace.h" > #ifdef CONFIG_EPOLL_CREATE1 > #include <sys/epoll.h> > #endif > @@ -27,12 +29,16 @@ struct AioHandler > GPollFD pfd; > IOHandler *io_read; > IOHandler *io_write; > + AioPollFn *io_poll; > int deleted; > void *opaque; > bool is_external; > QLIST_ENTRY(AioHandler) node; > }; > > +/* How long to poll AioPollHandlers before monitoring file descriptors */ > +static int64_t aio_poll_max_ns; > + > #ifdef CONFIG_EPOLL_CREATE1 > > /* The fd number threashold to switch to epoll */ > @@ -206,11 +212,12 @@ void aio_set_fd_handler(AioContext *ctx, > AioHandler *node; > bool is_new = false; > bool deleted = false; > + int poll_disable_cnt = 0; Could this instead be computed directly as "poll_disable_cnt = !io_poll - !node->io_poll", avoiding the per-branch assignments below? Not the most readable thing, but effective... > node = find_aio_handler(ctx, fd); > > /* Are we deleting the fd handler? 
*/ > - if (!io_read && !io_write) { > + if (!io_read && !io_write && !io_poll) { > if (node == NULL) { > return; > } > @@ -229,6 +236,10 @@ void aio_set_fd_handler(AioContext *ctx, > QLIST_REMOVE(node, node); > deleted = true; > } > + > + if (!node->io_poll) { > + poll_disable_cnt = -1; > + } > } else { > if (node == NULL) { > /* Alloc and insert if it's not already there */ > @@ -238,10 +249,22 @@ void aio_set_fd_handler(AioContext *ctx, > > g_source_add_poll(&ctx->source, &node->pfd); > is_new = true; > + > + if (!io_poll) { > + poll_disable_cnt = 1; > + } > + } else { > + if (!node->io_poll && io_poll) { > + poll_disable_cnt = -1; > + } else if (node->io_poll && !io_poll) { > + poll_disable_cnt = 1; > + } > } > + > /* Update handler with latest information */ > node->io_read = io_read; > node->io_write = io_write; > + node->io_poll = io_poll; > node->opaque = opaque; > node->is_external = is_external; > > @@ -251,6 +274,9 @@ void aio_set_fd_handler(AioContext *ctx, > > aio_epoll_update(ctx, node, is_new); > aio_notify(ctx); > + > + ctx->poll_disable_cnt += poll_disable_cnt; > + > if (deleted) { > g_free(node); > } > @@ -268,6 +294,7 @@ void aio_set_event_notifier(AioContext *ctx, > > bool aio_prepare(AioContext *ctx) > { > + /* TODO run poll handlers? */ > return false; > } > > @@ -402,6 +429,56 @@ static void add_pollfd(AioHandler *node) > npfd++; > } > > +/* run_poll_handlers: > + * @ctx: the AioContext > + * @max_ns: maximum time to poll for, in nanoseconds > + * > + * Polls for a given time. > + * > + * Note that ctx->notify_me must be non-zero so this function can detect > + * aio_notify(). > + * > + * Note that the caller must have incremented ctx->walking_handlers. 
> + * > + * Returns: true if progress was made, false otherwise > + */ > +static bool run_poll_handlers(AioContext *ctx, int64_t max_ns) > +{ > + bool progress = false; > + int64_t end_time; > + > + assert(ctx->notify_me); > + assert(ctx->walking_handlers > 0); > + assert(ctx->poll_disable_cnt == 0); > + > + trace_run_poll_handlers_begin(ctx, max_ns); > + > + end_time = qemu_clock_get_ns(QEMU_CLOCK_REALTIME) + max_ns; > + > + do { > + AioHandler *node; > + > + /* Bail if aio_notify() was called (e.g. BH was scheduled) */ > + if (atomic_read(&ctx->notified)) { > + progress = true; > + break; > + } This can be done in event_notifier_dummy_poll. Paolo > + QLIST_FOREACH(node, &ctx->aio_handlers, node) { > + if (!node->deleted && node->io_poll && > + node->io_poll(node->opaque)) { > + progress = true; > + } > + > + /* Caller handles freeing deleted nodes. Don't do it here. */ > + } > + } while (!progress && qemu_clock_get_ns(QEMU_CLOCK_REALTIME) < end_time); > + > + trace_run_poll_handlers_end(ctx, progress); > + > + return progress; > +} > + > bool aio_poll(AioContext *ctx, bool blocking) > { > AioHandler *node; > @@ -425,6 +502,18 @@ bool aio_poll(AioContext *ctx, bool blocking) > > ctx->walking_handlers++; > > + if (blocking && aio_poll_max_ns && ctx->poll_disable_cnt == 0) { > + /* See qemu_soonest_timeout() uint64_t hack */ > + int64_t max_ns = MIN((uint64_t)aio_compute_timeout(ctx), > + (uint64_t)aio_poll_max_ns); > + > + if (max_ns && run_poll_handlers(ctx, max_ns)) { > + atomic_sub(&ctx->notify_me, 2); > + blocking = false; /* poll again, don't block */ > + progress = true; > + } > + } > + > assert(npfd == 0); > > /* fill pollfds */ > @@ -486,6 +575,22 @@ bool aio_poll(AioContext *ctx, bool blocking) > > void aio_context_setup(AioContext *ctx) > { > + if (!aio_poll_max_ns) { > + int64_t val; > + const char *env_str = getenv("QEMU_AIO_POLL_MAX_NS"); > + > + if (!env_str) { > + env_str = "0"; > + } > + > + if (!qemu_strtoll(env_str, NULL, 10, &val)) { > + 
aio_poll_max_ns = val; > + } else { > + fprintf(stderr, "Unable to parse QEMU_AIO_POLL_MAX_NS " > + "environment variable\n"); > + } > + } > + > #ifdef CONFIG_EPOLL_CREATE1 > assert(!ctx->epollfd); > ctx->epollfd = epoll_create1(EPOLL_CLOEXEC); > diff --git a/async.c b/async.c > index c8fbd63..f5958e8 100644 > --- a/async.c > +++ b/async.c > @@ -349,6 +349,15 @@ static void event_notifier_dummy_cb(EventNotifier *e) > { > } > > +/* Polling mode is only available when all event loop resources have polling > + * functions registered. The polling loop notices aio_notify() by watching > the > + * ctx->notified field so this dummy function doesn't need to do anything. > + */ > +static bool event_notifier_dummy_poll(void *opaque) > +{ > + return false; > +} > + > AioContext *aio_context_new(Error **errp) > { > int ret; > @@ -367,7 +376,7 @@ AioContext *aio_context_new(Error **errp) > false, > (EventNotifierHandler *) > event_notifier_dummy_cb, > - NULL); > + event_notifier_dummy_poll); > #ifdef CONFIG_LINUX_AIO > ctx->linux_aio = NULL; > #endif > diff --git a/include/block/aio.h b/include/block/aio.h > index 1fac404..8aa5219 100644 > --- a/include/block/aio.h > +++ b/include/block/aio.h > @@ -131,6 +131,9 @@ struct AioContext { > > int external_disable_cnt; > > + /* Number of AioHandlers without .io_poll() */ > + int poll_disable_cnt; > + > /* epoll(7) state used when built with CONFIG_EPOLL */ > int epollfd; > bool epoll_enabled; > diff --git a/trace-events b/trace-events > index f74e1d3..7fe3a1b 100644 > --- a/trace-events > +++ b/trace-events > @@ -25,6 +25,10 @@ > # > # The <format-string> should be a sprintf()-compatible format string. 
> > +# aio-posix.c > +run_poll_handlers_begin(void *ctx, int64_t max_ns) "ctx %p max_ns %"PRId64 > +run_poll_handlers_end(void *ctx, bool progress) "ctx %p progress %d" > + > # thread-pool.c > thread_pool_submit(void *pool, void *req, void *opaque) "pool %p req %p > opaque %p" > thread_pool_complete(void *pool, void *req, void *opaque, int ret) "pool %p > req %p opaque %p ret %d" >