A new implementation of qemu_poll_ns based on epoll is introduced here to address the slowness of g_poll and ppoll when the number of fds is high.
On my laptop this would reduce the virtio-blk on top of null-aio device's response time from 32 us to 29 us with few fds (~10), and 48 us to 32 us with more fds (for example when virtio-serial is plugged and ~64 more io handlers are enabled). Signed-off-by: Fam Zheng <f...@redhat.com> --- Makefile.objs | 1 + include/block/aio.h | 16 +++++ include/qemu/main-loop.h | 1 + qemu-epoll.c | 163 +++++++++++++++++++++++++++++++++++++++++++++++ qemu-timer.c | 4 +- tests/Makefile | 2 +- 6 files changed, 185 insertions(+), 2 deletions(-) create mode 100644 qemu-epoll.c diff --git a/Makefile.objs b/Makefile.objs index 97db978..52ee086 100644 --- a/Makefile.objs +++ b/Makefile.objs @@ -9,6 +9,7 @@ util-obj-y = util/ qobject/ qapi/ qapi-types.o qapi-visit.o qapi-event.o block-obj-y = async.o thread-pool.o block-obj-y += nbd.o block.o blockjob.o block-obj-y += main-loop.o iohandler.o qemu-timer.o +block-obj-$(CONFIG_LINUX) += qemu-epoll.o block-obj-$(CONFIG_POSIX) += aio-posix.o block-obj-$(CONFIG_WIN32) += aio-win32.o block-obj-y += block/ diff --git a/include/block/aio.h b/include/block/aio.h index 1562721..b51494a 100644 --- a/include/block/aio.h +++ b/include/block/aio.h @@ -90,6 +90,22 @@ struct AioContext { /* TimerLists for calling timers - one per clock type */ QEMUTimerListGroup tlg; + +#ifdef CONFIG_LINUX + struct EpollState { + /* A copy of last fd array, used to skip epoll_prepare when nothing + * changed. */ + GPollFD *last_fds; + guint last_nfds; + /* An array of fds that failed epoll_ctl and fall back to ppoll. Rare + * case too. */ + GPollFD *g_poll_fds; + guint g_poll_nfds; + int *g_poll_fd_idx; + int epollfd; + } epoll_state; +#endif + }; /* Used internally to synchronize aio_poll against qemu_bh_schedule. 
*/ diff --git a/include/qemu/main-loop.h b/include/qemu/main-loop.h index 62c68c0..82afb4e 100644 --- a/include/qemu/main-loop.h +++ b/include/qemu/main-loop.h @@ -307,5 +307,6 @@ void qemu_iohandler_poll(GArray *pollfds, int rc); QEMUBH *qemu_bh_new(QEMUBHFunc *cb, void *opaque); void qemu_bh_schedule_idle(QEMUBH *bh); +int qemu_epoll(AioContext *ctx, GPollFD *fds, guint nfds, int64_t timeout); #endif diff --git a/qemu-epoll.c b/qemu-epoll.c new file mode 100644 index 0000000..9225003 --- /dev/null +++ b/qemu-epoll.c @@ -0,0 +1,163 @@ +/* + * QEMU Event Loop + * + * Copyright (c) 2014 Red Hat, Inc. + * + * Authors: + * Fam Zheng <f...@redhat.com> + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +#include <sys/epoll.h> +#include "qemu/main-loop.h" + +static bool g_poll_fds_changed(const GPollFD *fds_a, const guint nfds_a, + const GPollFD *fds_b, const guint nfds_b) +{ + int i; + + if (nfds_a != nfds_b) { + return true; + } + if (!!fds_a != !!fds_b) { + return true; + } + for (i = 0; i < nfds_a; i++) { + if (fds_a[i].fd != fds_b[i].fd || + fds_a[i].events != fds_b[i].events) { + return true; + } + } + return false; +} + +static inline int io_condition_from_epoll_events(int e) +{ + return (e & EPOLLIN ? G_IO_IN : 0) | + (e & EPOLLOUT ? G_IO_OUT : 0) | + (e & EPOLLERR ? G_IO_ERR : 0) | + (e & EPOLLHUP ? G_IO_HUP : 0); +} + +static inline void epoll_event_from_g_poll_fd(struct epoll_event *event, + GPollFD *fd) +{ + int e = fd->events; + + event->events = (e & G_IO_IN ? EPOLLIN : 0) | + (e & G_IO_OUT ? EPOLLOUT : 0) | + (e & G_IO_ERR ? EPOLLERR : 0) | + (e & G_IO_HUP ? EPOLLHUP : 0); + event->data.ptr = fd; +} + +static int epoll_prepare(int epollfd, + GPollFD *fds, guint nfds, + GPollFD **g_poll_fds, + guint *g_poll_nfds, + int **g_poll_fd_idx) +{ + int i; + + GPollFD *pfds = NULL; + int npfds = 0; + int *idx = NULL; + + for (i = 0; i < nfds; i++) { + int r; + struct epoll_event event; + epoll_event_from_g_poll_fd(&event, &fds[i]); + + r = epoll_ctl(epollfd, EPOLL_CTL_ADD, fds[i].fd, &event); + if (r) { + /* Some fds may not support epoll, fall back and add them to + * ppoll_fds */ + pfds = g_renew(GPollFD, pfds, npfds + 1); + pfds[npfds] = fds[i]; + idx = g_renew(int, idx, npfds + 1); + idx[npfds] = i; + npfds++; + } + } + + *g_poll_fds = pfds; + *g_poll_nfds = npfds; + *g_poll_fd_idx = idx; + + return epollfd; +} + +int qemu_epoll(AioContext *ctx, GPollFD *fds, guint nfds, int64_t timeout) +{ + struct EpollState *e = &ctx->epoll_state; + + const int max_events = 40; + struct epoll_event events[max_events]; + int ret = 0; + int r, i; + + if (!e->last_fds || g_poll_fds_changed(fds, nfds, + e->last_fds, e->last_nfds)) { + if (e->last_fds) { + 
close(e->epollfd); + } + e->epollfd = epoll_create(1); + if (e->epollfd < 0) { + perror("epoll_create"); + abort(); + } + g_free(e->g_poll_fds); + g_free(e->g_poll_fd_idx); + e->epollfd = epoll_prepare(e->epollfd, fds, nfds, + &e->g_poll_fds, + &e->g_poll_nfds, + &e->g_poll_fd_idx); + g_free(e->last_fds); + e->last_fds = g_memdup(fds, nfds * sizeof(GPollFD)); + e->last_nfds = nfds; + } + if (e->g_poll_nfds) { + ret = g_poll(e->g_poll_fds, e->g_poll_nfds, + qemu_timeout_ns_to_ms(timeout)); + if (ret < 0) { + return ret; + } + /* Sync revents back to original fds */ + for (i = 0; i < ret; i++) { + GPollFD *fd = &fds[e->g_poll_fd_idx[i]]; + assert(fd->fd == e->g_poll_fds[i].fd); + fd->revents = e->g_poll_fds[i].revents; + } + } + + r = epoll_wait(e->epollfd, events, max_events, + qemu_timeout_ns_to_ms(timeout)); + if (r < 0) { + return r; + } + + for (i = 0; i < r; i++) { + GPollFD *gpfd = events[i].data.ptr; + gpfd->revents = io_condition_from_epoll_events(events[i].events); + } + + ret += r; + return ret; +} diff --git a/qemu-timer.c b/qemu-timer.c index 7336b20..635be98 100644 --- a/qemu-timer.c +++ b/qemu-timer.c @@ -309,7 +309,9 @@ int qemu_timeout_ns_to_ms(int64_t ns) */ int qemu_poll_ns(AioContext *ctx, GPollFD *fds, guint nfds, int64_t timeout) { -#ifdef CONFIG_PPOLL +#ifdef CONFIG_LINUX + return qemu_epoll(ctx, fds, nfds, timeout); +#elif CONFIG_PPOLL if (timeout < 0) { return ppoll((struct pollfd *)fds, nfds, NULL, NULL); } else { diff --git a/tests/Makefile b/tests/Makefile index f5de29c..96b9e4a 100644 --- a/tests/Makefile +++ b/tests/Makefile @@ -347,7 +347,7 @@ tests/usb-hcd-ohci-test$(EXESUF): tests/usb-hcd-ohci-test.o tests/usb-hcd-uhci-test$(EXESUF): tests/usb-hcd-uhci-test.o tests/usb-hcd-ehci-test$(EXESUF): tests/usb-hcd-ehci-test.o $(libqos-pc-obj-y) tests/usb-hcd-xhci-test$(EXESUF): tests/usb-hcd-xhci-test.o -tests/vhost-user-test$(EXESUF): tests/vhost-user-test.o qemu-char.o qemu-timer.o $(qtest-obj-y) +tests/vhost-user-test$(EXESUF): 
tests/vhost-user-test.o qemu-char.o qemu-timer.o qemu-epoll.o $(qtest-obj-y) tests/qemu-iotests/socket_scm_helper$(EXESUF): tests/qemu-iotests/socket_scm_helper.o tests/test-qemu-opts$(EXESUF): tests/test-qemu-opts.o libqemuutil.a libqemustub.a -- 1.9.3