Am 11.03.2019 um 13:35 hat Paolo Bonzini geschrieben: > This backend is faster (100ns vs 150ns per switch on my laptop), but > especially it will be possible to add CET support to it in 4.1. In > the meanwhile, it is nice to have it as an experimental alternative. > > Signed-off-by: Paolo Bonzini <pbonz...@redhat.com>
Creating a qcow2 image fails for me with a General Protection Fault. Looks like this is because of a 'movaps %xmm0,(%rsp)' with an unaligned stack pointer (0x7fffec5f8b78). We need to start with rsp 8 bytes lower to comply with the calling convention. > --- /dev/null > +++ b/util/coroutine-x86.c > @@ -0,0 +1,213 @@ > +/* > + * x86-specific coroutine initialization code > + * > + * Copyright (C) 2006 Anthony Liguori <anth...@codemonkey.ws> > + * Copyright (C) 2011 Kevin Wolf <kw...@redhat.com> > + * Copyright (C) 2019 Paolo Bonzini <pbonz...@redhat.com> > + * > + * This library is free software; you can redistribute it and/or > + * modify it under the terms of the GNU Lesser General Public > + * License as published by the Free Software Foundation; either > + * version 2.0 of the License, or (at your option) any later version. > + * > + * This library is distributed in the hope that it will be useful, > + * but WITHOUT ANY WARRANTY; without even the implied warranty of > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + * Lesser General Public License for more details. > + * > + * You should have received a copy of the GNU Lesser General Public > + * License along with this library; if not, see > <http://www.gnu.org/licenses/>. > + */ > + > +/* XXX Is there a nicer way to disable glibc's stack check for longjmp? */ > +#ifdef _FORTIFY_SOURCE > +#undef _FORTIFY_SOURCE > +#endif You don't use setjmp/longjmp, so is this really necessary? > +#include "qemu/osdep.h" > +#include "qemu-common.h" > +#include "qemu/coroutine_int.h" > + > +#ifdef CONFIG_VALGRIND_H > +#include <valgrind/valgrind.h> > +#endif > + > +#if defined(__SANITIZE_ADDRESS__) || __has_feature(address_sanitizer) > +#ifdef CONFIG_ASAN_IFACE_FIBER > +#define CONFIG_ASAN 1 > +#include <sanitizer/asan_interface.h> > +#endif > +#endif > + > +typedef struct { > + Coroutine base; > + void *stack; > + size_t stack_size; > + void *sp; > + > +#ifdef CONFIG_VALGRIND_H > + unsigned int valgrind_stack_id; > +#endif > +} CoroutineX86; > + > +/** > + * Per-thread coroutine bookkeeping > + */ > +static __thread CoroutineX86 leader; > +static __thread Coroutine *current; > + > +static void finish_switch_fiber(void *fake_stack_save) > +{ > +#ifdef CONFIG_ASAN > + const void *bottom_old; > + size_t size_old; > + > + __sanitizer_finish_switch_fiber(fake_stack_save, &bottom_old, &size_old); > + > + if (!leader.stack) { > + leader.stack = (void *)bottom_old; > + leader.stack_size = size_old; > + } > +#endif > +} > + > +static void start_switch_fiber(void **fake_stack_save, > + const void *bottom, size_t size) > +{ > +#ifdef CONFIG_ASAN > + __sanitizer_start_switch_fiber(fake_stack_save, bottom, size); > +#endif > +} These two functions are duplicated without changes between ucontext and x86, and they aren't really backend-specific. Should they be moved to a place where both backends can share them, like util/qemu-coroutine.c? > +/* On entry to a coroutine, rax is "value" and rsi is the coroutine itself. > */ rax is "action" (not "value"), and the coroutine is rdi (not rsi). > +#define CO_SWITCH(from, to, action, jump) ({ > \ > + int ret = action; > \ > + void *from_ = from; > \ > + void *to_ = to; > \ > + asm volatile( > \ > + ".cfi_remember_state\n" > \ > + "pushq %%rbp\n" /* save scratch register on > source stack */ \ > + ".cfi_adjust_cfa_offset 8\n" > \ > + ".cfi_rel_offset %%rbp, 0\n" > \ > + "call 1f\n" /* switch continues at label 1 > */ \ > + ".cfi_adjust_cfa_offset 8\n" > \ > + "jmp 2f\n" /* switch back continues at > label 2 */ \ > + "1: movq (%%rsp), %%rbp\n" /* save source IP for debugging > */ \ > + "movq %%rsp, %c[sp](%[FROM])\n" /* save source SP */ > \ > + "movq %c[sp](%[TO]), %%rsp\n" /* load destination SP */ > \ > + jump "\n" /* coroutine switch */ > \ > + "2:" > \ > + ".cfi_adjust_cfa_offset -8\n" > \ > + "popq %%rbp\n" > \ > + ".cfi_adjust_cfa_offset -8\n" > \ > + ".cfi_restore_state\n" > \ > + : "+a" (ret), [FROM] "+b" (from_), [TO] "+D" (to_) > \ > + : [sp] "i" (offsetof(CoroutineX86, sp)) > \ > + : "rcx", "rdx", "rsi", "r8", "r9", "r10", "r11", "r12", "r13", > "r14", "r15", \ > + "memory"); > \ > + ret; \ > +}) > + > +static void __attribute__((__used__)) coroutine_trampoline(void *arg) > +{ > + CoroutineX86 *self = arg; > + Coroutine *co = &self->base; > + > + finish_switch_fiber(NULL); > + > + while (true) { > + qemu_coroutine_switch(co, co->caller, COROUTINE_TERMINATE); > + co->entry(co->entry_arg); Okay, inverse order because you have a fake entry on creation below... > + } > +} > + > +Coroutine *qemu_coroutine_new(void) > +{ > + CoroutineX86 *co; > + void *fake_stack_save = NULL; > + > + co = g_malloc0(sizeof(*co)); > + co->stack_size = COROUTINE_STACK_SIZE; > + co->stack = qemu_alloc_stack(&co->stack_size); > + co->sp = co->stack + co->stack_size; > + > +#ifdef CONFIG_VALGRIND_H > + co->valgrind_stack_id = > + VALGRIND_STACK_REGISTER(co->stack, co->stack + co->stack_size); > +#endif > + > + /* Immediately enter the coroutine once to pass it its address as the > argument */ > + co->base.caller = qemu_coroutine_self(); > + start_switch_fiber(&fake_stack_save, co->stack, co->stack_size); > + CO_SWITCH(current, co, 0, "jmp coroutine_trampoline"); > + finish_switch_fiber(fake_stack_save); > + co->base.caller = NULL; ...but why is this necessary? CO_SWITCH() always passes the coroutine in rdi, not just here, so wouldn't the first real call do this, too? Ah, I see, because of the 'jmp coroutine_trampoline'. But the comment is really misleading. Actually, I think the code would become simpler if you just put the address of coroutine_trampoline on the initial stack and then have 'ret' unconditionally (see below for a quick attempt at something to squash in). > + return &co->base; > +} > + > +#ifdef CONFIG_VALGRIND_H > +#if defined(CONFIG_PRAGMA_DIAGNOSTIC_AVAILABLE) && !defined(__clang__) > +/* Work around an unused variable in the valgrind.h macro... */ > +#pragma GCC diagnostic push > +#pragma GCC diagnostic ignored "-Wunused-but-set-variable" > +#endif > +static inline void valgrind_stack_deregister(CoroutineX86 *co) > +{ > + VALGRIND_STACK_DEREGISTER(co->valgrind_stack_id); > +} > +#if defined(CONFIG_PRAGMA_DIAGNOSTIC_AVAILABLE) && !defined(__clang__) > +#pragma GCC diagnostic pop > +#endif > +#endif Another candidate for sharing instead of duplicating? (You could trivially pass the valgrind_stack_id instead of the CoroutineX86 object.) Kevin diff --git a/util/coroutine-x86.c b/util/coroutine-x86.c index b7649e7ae1..ff78c298c9 100644 --- a/util/coroutine-x86.c +++ b/util/coroutine-x86.c @@ -79,7 +79,7 @@ static void start_switch_fiber(void **fake_stack_save, } /* On entry to a coroutine, rax is "value" and rsi is the coroutine itself. */ -#define CO_SWITCH(from, to, action, jump) ({ \ +#define CO_SWITCH(from, to, action) ({ \ int ret = action; \ void *from_ = from; \ void *to_ = to; \ @@ -94,7 +94,7 @@ static void start_switch_fiber(void **fake_stack_save, "1: movq (%%rsp), %%rbp\n" /* save source IP for debugging */ \ "movq %%rsp, %c[sp](%[FROM])\n" /* save source SP */ \ "movq %c[sp](%[TO]), %%rsp\n" /* load destination SP */ \ - jump "\n" /* coroutine switch */ \ + "ret\n" /* coroutine switch */ \ "2:" \ ".cfi_adjust_cfa_offset -8\n" \ "popq %%rbp\n" \ @@ -115,15 +115,14 @@ static void __attribute__((__used__)) coroutine_trampoline(void *arg) finish_switch_fiber(NULL); while (true) { - qemu_coroutine_switch(co, co->caller, COROUTINE_TERMINATE); co->entry(co->entry_arg); + qemu_coroutine_switch(co, co->caller, COROUTINE_TERMINATE); } } Coroutine *qemu_coroutine_new(void) { CoroutineX86 *co; - void *fake_stack_save = NULL; co = g_malloc0(sizeof(*co)); co->stack_size = COROUTINE_STACK_SIZE; @@ -135,12 +134,10 @@ Coroutine *qemu_coroutine_new(void) VALGRIND_STACK_REGISTER(co->stack, co->stack + co->stack_size); #endif - /* Immediately enter the coroutine once to pass it its address as the argument */ - co->base.caller = qemu_coroutine_self(); - start_switch_fiber(&fake_stack_save, co->stack, co->stack_size); - CO_SWITCH(current, co, 0, "jmp coroutine_trampoline"); - finish_switch_fiber(fake_stack_save); - co->base.caller = NULL; + /* Put entry point on the stack; 8 more bytes for the stack alignment + * required by the calling convention. */ + co->sp -= 16; + *(uint64_t*) co->sp = (uint64_t) coroutine_trampoline; return &co->base; } @@ -193,7 +190,7 @@ qemu_coroutine_switch(Coroutine *from_, Coroutine *to_, start_switch_fiber(action == COROUTINE_TERMINATE ? NULL : &fake_stack_save, to->stack, to->stack_size); - action = CO_SWITCH(from, to, action, "ret"); + action = CO_SWITCH(from, to, action); finish_switch_fiber(fake_stack_save); return action;