The Linux-specific epoll(7) API has constant complexity, whereas ppoll/g_poll has linear complexity in the number of fds.
The event loop is more efficient with epoll, because we only need to poll on a few fds now. Sometimes EPOLL_CTL_ADD returns -1 with errno = EPERM, when the target file descriptor does not support epoll; such fds are always ready for read/write. We mark such fds and always dispatch them. Signed-off-by: Fam Zheng <f...@redhat.com> --- Makefile.objs | 4 +- include/qemu/iohandler.h | 13 +++ iohandler-linux.c | 213 +++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 229 insertions(+), 1 deletion(-) create mode 100644 iohandler-linux.c diff --git a/Makefile.objs b/Makefile.objs index 55dbc36..3244c65 100644 --- a/Makefile.objs +++ b/Makefile.objs @@ -8,7 +8,9 @@ util-obj-y = util/ qobject/ qapi/ qapi-types.o qapi-visit.o qapi-event.o block-obj-y = async.o thread-pool.o block-obj-y += nbd.o block.o blockjob.o -block-obj-y += main-loop.o iohandler.o qemu-timer.o iohandler-posix.o +block-obj-y += main-loop.o iohandler.o qemu-timer.o +block-obj-$(call lnot,$(CONFIG_LINUX)) += iohandler-posix.o +block-obj-$(CONFIG_LINUX) += iohandler-linux.o block-obj-$(CONFIG_POSIX) += aio-posix.o block-obj-$(CONFIG_WIN32) += aio-win32.o block-obj-y += block/ diff --git a/include/qemu/iohandler.h b/include/qemu/iohandler.h index e2af47d..a879796 100644 --- a/include/qemu/iohandler.h +++ b/include/qemu/iohandler.h @@ -27,7 +27,11 @@ #ifndef QEMU_IOHANDLER_H #define QEMU_IOHANDLER_H +#include "config-host.h" #include "qemu/main-loop.h" +#ifdef CONFIG_LINUX +#include <sys/epoll.h> +#endif typedef struct IOHandlerRecord { IOCanReadHandler *fd_read_poll; @@ -39,11 +43,20 @@ typedef struct IOHandlerRecord { bool deleted; GPollFD gpfd; bool attached; +#ifdef CONFIG_LINUX + struct epoll_event epoll_event; + bool fallback; +#endif + } IOHandlerRecord; typedef struct { GSource source; +#ifdef CONFIG_LINUX + GPollFD epollfd; +#endif + QLIST_HEAD(, IOHandlerRecord) io_handlers; } IOHandlerSource; diff --git a/iohandler-linux.c b/iohandler-linux.c new file mode 100644 index 0000000..61f569b --- 
/dev/null +++ b/iohandler-linux.c @@ -0,0 +1,213 @@ +/* + * I/O Handler Linux (epoll) implementation + * + * Copyright (c) 2014 Red Hat, Inc. + * + * Author: Fam Zheng <f...@redhat.com> + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */
+
+#include "config-host.h"
+#include "qemu-common.h"
+#include "qemu/iohandler.h"
+
+/*
+ * Compute the epoll event mask a handler is currently interested in.
+ *
+ * Returns 0 for a record that is marked deleted.  Otherwise the mask holds
+ * EPOLLIN | EPOLLHUP | EPOLLERR when a read handler is set and fd_read_poll
+ * (if present) returns non-zero, plus EPOLLOUT | EPOLLERR when a write
+ * handler is set.  @s is not used at the moment.
+ */
+static int iohandler_get_events(IOHandlerSource *s, IOHandlerRecord *ioh)
+{
+    int events = 0;
+
+    if (!ioh->deleted) {
+        if (ioh->fd_read &&
+            (!ioh->fd_read_poll ||
+             ioh->fd_read_poll(ioh->opaque) != 0)) {
+            events |= EPOLLIN | EPOLLHUP | EPOLLERR;
+        }
+        if (ioh->fd_write) {
+            events |= EPOLLOUT | EPOLLERR;
+        }
+    }
+
+    return events;
+}
+
+/*
+ * GSource prepare callback: bring the epoll interest set in line with the
+ * event mask each handler wants right now.
+ *
+ * Only EPOLLIN and EPOLLOUT are ever requested.  An fd that epoll refuses
+ * with EPERM is flagged as "fallback" and is dispatched unconditionally
+ * later on.  Any other failure to register an fd is fatal.
+ *
+ * Always sets *timeout to -1 and returns false, leaving the readiness
+ * decision to the check callback.
+ */
+static gboolean iohandler_source_prepare(GSource *source, gint *timeout)
+{
+    IOHandlerRecord *ioh;
+    IOHandlerSource *s = (IOHandlerSource *)source;
+    int old_events, new_events, r;
+
+    QLIST_FOREACH(ioh, &s->io_handlers, next) {
+        old_events = ioh->epoll_event.events;
+        new_events = iohandler_get_events(s, ioh) & (EPOLLIN | EPOLLOUT);
+        ioh->epoll_event.events = new_events;
+        ioh->epoll_event.data.ptr = ioh;
+        if (old_events == new_events) {
+            continue;
+        }
+
+        if (!old_events) {
+            r = epoll_ctl(s->epollfd.fd, EPOLL_CTL_ADD, ioh->fd,
+                          &ioh->epoll_event);
+            if (r && errno == EEXIST) {
+                /* Nothing is done when a mask drops to zero (see below), so
+                 * the fd may still be registered from an earlier round.
+                 * Update the existing registration instead of aborting. */
+                r = epoll_ctl(s->epollfd.fd, EPOLL_CTL_MOD, ioh->fd,
+                              &ioh->epoll_event);
+            }
+            if (r) {
+                if (errno == EPERM) {
+                    /* Some fds don't work with epoll, let's mark it as
+                     * always ready. */
+                    ioh->fallback = true;
+                } else {
+                    perror("epoll_ctl add");
+                    abort();
+                }
+            }
+        } else if (new_events) {
+            /* Modify could fail when the fd is not available any more, so
+             * the result is deliberately ignored. */
+            epoll_ctl(s->epollfd.fd, EPOLL_CTL_MOD, ioh->fd,
+                      &ioh->epoll_event);
+        }
+        /* NOTE(review): when the mask drops to zero the fd stays registered
+         * with its previous mask; dispatch filters such events out again
+         * and uses them to reap records marked deleted. */
+    }
+
+    *timeout = -1;
+    return false;
+}
+
+/*
+ * GSource check callback: decide whether dispatch has to run.
+ *
+ * Returns true as soon as one handler matches: a read handler while the
+ * epoll fd reports G_IO_IN/G_IO_HUP/G_IO_ERR and fd_read_poll (if present)
+ * returns non-zero, a write handler while the epoll fd reports
+ * G_IO_OUT/G_IO_ERR, or any handler flagged as fallback.  The revents
+ * tested are those of the epoll fd itself, not of the handler's own fd.
+ */
+static gboolean iohandler_source_check(GSource *source)
+{
+    IOHandlerRecord *ioh;
+    IOHandlerSource *s = (IOHandlerSource *)source;
+    int events = s->epollfd.revents;
+
+    QLIST_FOREACH(ioh, &s->io_handlers, next) {
+        if (ioh->fd_read &&
+            (events & (G_IO_IN | G_IO_HUP | G_IO_ERR)) &&
+            (!ioh->fd_read_poll || ioh->fd_read_poll(ioh->opaque) != 0)) {
+            return true;
+        }
+        if (ioh->fd_write && (events & (G_IO_OUT | G_IO_ERR))) {
+            return true;
+        }
+        if (ioh->fallback) {
+            return true;
+        }
+    }
+    return false;
+}
+
+/*
+ * Run the handlers of the record attached to one epoll event.
+ *
+ * The reported events are filtered through the record's current interest
+ * mask before the read and write handlers are called.  A record that is
+ * marked deleted (before the call or by one of its own handlers) is removed
+ * from the epoll set, unlinked from the list and freed.
+ */
+static inline void iohandler_dispatch_event(IOHandlerSource *s,
+                                            struct epoll_event *ev)
+{
+    IOHandlerRecord *ioh = ev->data.ptr;
+    int revents;
+
+    if (!ioh->deleted) {
+        revents = iohandler_get_events(s, ioh) & ev->events;
+
+        if (ioh->fd_read && (revents & (EPOLLIN | EPOLLHUP | EPOLLERR))) {
+            ioh->fd_read(ioh->opaque);
+        }
+        /* The read handler may have marked the record for deletion; do not
+         * call the write handler of a deleted record. */
+        if (!ioh->deleted && ioh->fd_write &&
+            (revents & (EPOLLOUT | EPOLLERR))) {
+            ioh->fd_write(ioh->opaque);
+        }
+    }
+
+    /* Do this last in case read/write handlers marked it for deletion */
+    if (ioh->deleted) {
+        /* Delete could fail when the fd is not available any more, so the
+         * result is deliberately ignored. */
+        epoll_ctl(s->epollfd.fd, EPOLL_CTL_DEL, ioh->fd,
+                  &ioh->epoll_event);
+        QLIST_REMOVE(ioh, next);
+        g_free(ioh);
+    }
+}
+
+/* Number of events fetched from the epoll fd per epoll_wait() call */
+#define MAX_EVENTS 10
+
+/*
+ * GSource dispatch callback: run the handlers of every ready fd.
+ *
+ * Events are fetched from the epoll fd in batches of MAX_EVENTS without
+ * blocking, until a batch comes back short, empty or with an error.
+ * Afterwards all handlers flagged as fallback are called according to their
+ * current interest mask, since epoll cannot report readiness for them.
+ *
+ * @callback must be NULL.  Always returns true.
+ */
+static gboolean iohandler_source_dispatch(GSource *source,
+                                          GSourceFunc callback,
+                                          gpointer data)
+{
+    IOHandlerSource *s = (IOHandlerSource *)source;
+    struct epoll_event events[MAX_EVENTS];
+    int i, r, revents;
+    IOHandlerRecord *ioh;
+
+    assert(callback == NULL);
+
+    /* NOTE(review): registrations are level-triggered, so this loop relies
+     * on fewer than MAX_EVENTS fds staying ready after their handlers ran;
+     * confirm that this holds for all users. */
+    while (true) {
+        /* Plain epoll_wait(): the signal mask must not be changed here.
+         * epoll_pwait() would install whatever mask it is given for the
+         * duration of the call. */
+        r = epoll_wait(s->epollfd.fd, events, MAX_EVENTS, 0);
+        if (r <= 0) {
+            break;
+        }
+        for (i = 0; i < r; i++) {
+            iohandler_dispatch_event(s, &events[i]);
+        }
+        if (r < MAX_EVENTS) {
+            break;
+        }
+    }
+
+    QLIST_FOREACH(ioh, &s->io_handlers, next) {
+        if (!ioh->fallback) {
+            continue;
+        }
+        revents = iohandler_get_events(s, ioh);
+
+        if (ioh->fd_read && (revents & (EPOLLIN | EPOLLHUP | EPOLLERR))) {
+            ioh->fd_read(ioh->opaque);
+        }
+        /* As above: skip the write handler if the read handler marked the
+         * record for deletion. */
+        if (!ioh->deleted && ioh->fd_write &&
+            (revents & (EPOLLOUT | EPOLLERR))) {
+            ioh->fd_write(ioh->opaque);
+        }
+        /* NOTE(review): fallback records that are marked deleted are never
+         * unlinked or freed in this file. */
+    }
+
+    return true;
+}
+
+/* GSource callbacks of the I/O handler source */
+static GSourceFuncs iohandler_source_funcs = {
+    iohandler_source_prepare,
+    iohandler_source_check,
+    iohandler_source_dispatch,
+    /* finalize */ NULL
+};
+
+/*
+ * Return the I/O handler GSource, creating it on first use.
+ *
+ * On creation the handler list is initialised, an epoll instance is created
+ * and its fd is added to the source's poll set.  The process exits if the
+ * epoll instance cannot be created.
+ */
+GSource *qemu_iohandler_get_source(void)
+{
+    static IOHandlerSource *ioh_source;
+    if (!ioh_source) {
+        int epollfd;
+        GSource *source = g_source_new(&iohandler_source_funcs,
+                                       sizeof(IOHandlerSource));
+        ioh_source = (IOHandlerSource *)source;
+        QLIST_INIT(&ioh_source->io_handlers);
+        epollfd = epoll_create(1);
+        if (epollfd == -1) {
+            perror("epoll_create");
+            exit(1);
+        }
+        ioh_source->epollfd = (GPollFD) {
+            .fd = epollfd,
+            .events = G_IO_IN | G_IO_OUT | G_IO_HUP | G_IO_ERR,
+        };
+        g_source_add_poll(source, &ioh_source->epollfd);
+    }
+    return &ioh_source->source;
+}
-- 
1.9.3