----- Mail original ----- > De: "Paolo Bonzini" <pbonz...@redhat.com> > À: "Sebastian Tanase" <sebastian.tan...@openwide.fr>, qemu-devel@nongnu.org > Cc: aligu...@amazon.com, afaer...@suse.de, r...@twiddle.net, "peter maydell" > <peter.mayd...@linaro.org>, > mich...@walle.cc, a...@alex.org.uk, stefa...@redhat.com, > lcapitul...@redhat.com, crobi...@redhat.com, > arm...@redhat.com, wenchaoq...@gmail.com, quint...@redhat.com, > kw...@redhat.com, m...@tls.msk.ru, m...@redhat.com > Envoyé: Lundi 30 Juin 2014 18:46:13 > Objet: Re: [RFC PATCH V3 4/6] cpu_exec: Add sleeping algorithm > > Il 30/06/2014 15:59, Sebastian Tanase ha scritto: > > The goal is to sleep qemu whenever the guest clock > > is in advance compared to the host clock (we use > > the monotonic clocks). The amount of time to sleep > > is calculated in the execution loop in cpu_exec. > > > > At first, we tried to approximate at each for loop the real time > > elapsed > > while searching for a TB (generating or retrieving from cache) and > > executing it. We would then approximate the virtual time > > corresponding > > to the number of virtual instructions executed. The difference > > between > > these 2 values would allow us to know if the guest is in advance or > > delayed. > > However, the function used for measuring the real time > > (qemu_clock_get_ns(QEMU_CLOCK_REALTIME)) proved to be very > > expensive. > > We had an added overhead of 13% of the total run time. > > > > Therefore, we modified the algorithm and only take into account the > > difference between the 2 clocks at the begining of the cpu_exec > > function. > > During the for loop we try to reduce the advance of the guest only > > by > > computing the virtual time elapsed and sleeping if necessary. The > > overhead > > is thus reduced to 3%. 
Even though this method still has a > > noticeable > > overhead, it no longer is a bottleneck in trying to achieve a > > better > > guest frequency for which the guest clock is faster than the host > > one. > > > > As for the alignment of the 2 clocks, with the first algorithm > > the guest clock was oscillating between -1 and 1ms compared to the > > host clock. > > Using the second algorithm we notice that the guest is 5ms behind > > the host, which > > is still acceptable for our use case. > > > > The tests were conducted using fio and stress. The host machine is > > an i5 CPU at > > 3.10GHz running Debian Jessie (kernel 3.12). The guest machine is > > an arm versatile-pb > > built with buildroot. > > > > Currently, on our test machine, the lowest icount we can achieve > > that is suitable for > > aligning the 2 clocks is 6. However, we observe that the IO tests > > (using fio) are > > slower than the cpu tests (using stress). > > > > Signed-off-by: Sebastian Tanase <sebastian.tan...@openwide.fr> > > Tested-by: Camille Bégué <camille.be...@openwide.fr> > > --- > > cpu-exec.c | 112 > > +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ > > 1 file changed, 112 insertions(+) > > > > diff --git a/cpu-exec.c b/cpu-exec.c > > index 38e5f02..ac741b7 100644 > > --- a/cpu-exec.c > > +++ b/cpu-exec.c > > @@ -22,6 +22,102 @@ > > #include "tcg.h" > > #include "qemu/atomic.h" > > #include "sysemu/qtest.h" > > +#include "qemu/timer.h" > > + > > +/* Structs and function pointers for delaying the host */ > > +typedef struct SyncClocks SyncClocks; > > +typedef void (*init_delay_func)(SyncClocks *sc, > > + const CPUState *cpu); > > +typedef void (*perform_align_func)(SyncClocks *sc, > > + const CPUState *cpu); > > +struct SyncClocks { > > + int64_t diff_clk; > > + int64_t original_instr_counter; > > + init_delay_func init_delay; > > + perform_align_func perform_align; > > +}; > > I don't remember exactly what I had in mind :) but if I remove these > pointers from 
your patches, the code already looks nice, with no > CONFIG_USER_ONLY except just below here. >
Ok, I will remove the function pointers then :) > > +#if !defined(CONFIG_USER_ONLY) > > +/* Allow the guest to have a max 3ms advance. > > + * The difference between the 2 clocks could therefore > > + * oscillate around 0. > > + */ > > +#define VM_CLOCK_ADVANCE 3000000 > > How did you tune this? > I computed this value based on the tests run on my machine. Of course, this value will be different on a different machine running different tests. > > +static int64_t delay_host(int64_t diff_clk) > > +{ > > + struct timespec sleep_delay, rem_delay; > > + if (diff_clk > VM_CLOCK_ADVANCE) { > > + sleep_delay.tv_sec = diff_clk / 1000000000LL; > > + sleep_delay.tv_nsec = diff_clk % 1000000000LL; > > + if (nanosleep(&sleep_delay, &rem_delay) < 0) { > > + diff_clk -= (sleep_delay.tv_sec - rem_delay.tv_sec) * > > 1000000000LL; > > + diff_clk -= sleep_delay.tv_nsec - rem_delay.tv_nsec; > > I just remembered that nanosleep doesn't exist on Windows. :( The > rem_delay feature of nanosleep is very useful, and I don't think > there > is an equivalent. So for now we shall make this POSIX only. > > Paolo > Should I surround the nanosleep with #ifndef _WIN32 and then add Sleep for the Windows case ? or just leave out Windows ? 
Sebastian > > + } else { > > + diff_clk = 0; > > + } > > + } > > + return diff_clk; > > +} > > + > > +static int64_t instr_to_vtime(int64_t instr_counter, const > > CPUState *cpu) > > +{ > > + int64_t instr_exec_time; > > + instr_exec_time = instr_counter - > > + (cpu->icount_extra + > > + cpu->icount_decr.u16.low); > > + instr_exec_time = instr_exec_time << icount_time_shift; > > + > > + return instr_exec_time; > > +} > > + > > +static void align_clocks(SyncClocks *sc, const CPUState *cpu) > > +{ > > + if (!icount_align_option) { > > + return; > > + } > > + sc->diff_clk += instr_to_vtime(sc->original_instr_counter, > > cpu); > > + sc->original_instr_counter = cpu->icount_extra + > > cpu->icount_decr.u16.low; > > + sc->diff_clk = delay_host(sc->diff_clk); > > +} > > + > > +static void init_delay_params(SyncClocks *sc, > > + const CPUState *cpu) > > +{ > > + static int64_t clocks_offset = -1; > > + int64_t realtime_clock_value, virtual_clock_value; > > + if (!icount_align_option) { > > + return; > > + } > > + /* On x86 target architecture, the PIT reset function (called > > + by qemu_system_reset) will end up calling qemu_clock_warp > > + and then icount_warp_rt changing vm_clock_warp_start from 0 > > (initial > > + value) to -1. This in turn will make us skip the initial > > offset > > + between the real and virtual clocks (initially virtual > > clock is 0). > > + Therefore we impose that the first time we run the cpu > > + the host and virtual clocks should be aligned; we don't > > alter any of > > + the clocks, we just calculate the difference between them. 
> > */ > > + realtime_clock_value = qemu_clock_get_ns(QEMU_CLOCK_REALTIME); > > + virtual_clock_value = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL); > > + if (clocks_offset == -1) { > > + clocks_offset = realtime_clock_value - > > virtual_clock_value; > > + } > > + sc->diff_clk = virtual_clock_value - realtime_clock_value + > > clocks_offset; > > + sc->original_instr_counter = cpu->icount_extra + > > cpu->icount_decr.u16.low; > > +} > > +#else > > +/* We don't use the align feature for User emulation > > + thus we add empty functions which shall be ignored > > + by the compiler */ > > +static void align_clocks(SyncClocks *sc, const CPUState *cpu) > > +{ > > +} > > + > > +static void init_delay_params(SyncClocks *sc, > > + const CPUState *cpu) > > +{ > > +} > > +#endif /* CONFIG USER ONLY */ > > > > void cpu_loop_exit(CPUState *cpu) > > { > > @@ -227,6 +323,11 @@ int cpu_exec(CPUArchState *env) > > TranslationBlock *tb; > > uint8_t *tc_ptr; > > uintptr_t next_tb; > > + /* Delay algorithm */ > > + static SyncClocks sc = { > > + .init_delay = init_delay_params, > > + .perform_align = align_clocks > > + }; > > /* This must be volatile so it is not trashed by longjmp() */ > > volatile bool have_tb_lock = false; > > > > @@ -283,6 +384,11 @@ int cpu_exec(CPUArchState *env) > > #endif > > cpu->exception_index = -1; > > > > + /* Calculate difference between guest clock and host clock. > > + This delay includes the delay of the last cycle, so > > + what we have to do is sleep until it is 0. As for the > > + advance/delay we gain here, we try to fix it next time. */ > > + sc.init_delay(&sc, cpu); > > /* prepare setjmp context for exception handling */ > > for(;;) { > > if (sigsetjmp(cpu->jmp_env, 0) == 0) { > > @@ -672,6 +778,9 @@ int cpu_exec(CPUArchState *env) > > if (insns_left > 0) { > > /* Execute remaining instructions. > > */ > > cpu_exec_nocache(env, insns_left, > > tb); > > + /* Try to align the host and > > virtual clocks > > + if the guest is in advance. 
*/ > > + sc.perform_align(&sc, cpu); > > } > > cpu->exception_index = EXCP_INTERRUPT; > > next_tb = 0; > > @@ -684,6 +793,9 @@ int cpu_exec(CPUArchState *env) > > } > > } > > cpu->current_tb = NULL; > > + /* Try to align the host and virtual clocks > > + if the guest is in advance */ > > + sc.perform_align(&sc, cpu); > > /* reset soft MMU for next block (it can currently > > only be set by a memory fault) */ > > } /* for(;;) */ > > > >