On Mon, Jan 24, 2022 at 10:10:39PM +0800, huang...@chinatelecom.cn wrote: > From: Hyman Huang(黄勇) <huang...@chinatelecom.cn> > > Setup a negative feedback system when vCPU thread > handling KVM_EXIT_DIRTY_RING_FULL exit by introducing > throttle_us_per_full field in struct CPUState. Sleep > throttle_us_per_full microseconds to throttle vCPU > if dirtylimit is enabled. > > Start a thread to track current dirty page rates and > tune the throttle_us_per_full dynamically untill current > dirty page rate reach the quota. > > Introduce the util function in the header for dirtylimit > implementation. > > Signed-off-by: Hyman Huang(黄勇) <huang...@chinatelecom.cn> > --- > accel/kvm/kvm-all.c | 13 ++ > accel/stubs/kvm-stub.c | 5 + > include/hw/core/cpu.h | 6 + > include/sysemu/dirtylimit.h | 16 +++ > include/sysemu/kvm.h | 2 + > softmmu/dirtylimit.c | 308 > ++++++++++++++++++++++++++++++++++++++++++++ > softmmu/trace-events | 8 ++ > 7 files changed, 358 insertions(+) > > diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c > index 1a5f1d1..60f51fd 100644 > --- a/accel/kvm/kvm-all.c > +++ b/accel/kvm/kvm-all.c > @@ -45,6 +45,7 @@ > #include "qemu/guest-random.h" > #include "sysemu/hw_accel.h" > #include "kvm-cpus.h" > +#include "sysemu/dirtylimit.h" > > #include "hw/boards.h" > > @@ -476,6 +477,7 @@ int kvm_init_vcpu(CPUState *cpu, Error **errp) > cpu->kvm_state = s; > cpu->vcpu_dirty = true; > cpu->dirty_pages = 0; > + cpu->throttle_us_per_full = 0; > > mmap_size = kvm_ioctl(s, KVM_GET_VCPU_MMAP_SIZE, 0); > if (mmap_size < 0) { > @@ -1469,6 +1471,11 @@ static void *kvm_dirty_ring_reaper_thread(void *data) > */ > sleep(1); > > + /* keep sleeping in order to not interfere the dirtylimit */ > + if (dirtylimit_in_service()) { > + continue; > + } > + > trace_kvm_dirty_ring_reaper("wakeup"); > r->reaper_state = KVM_DIRTY_RING_REAPER_REAPING; > > @@ -2312,6 +2319,11 @@ bool kvm_dirty_ring_enabled(void) > return kvm_state->kvm_dirty_ring_size ? true : false; > } > > +uint32_t kvm_dirty_ring_size(void) > +{ > + return kvm_state->kvm_dirty_ring_size; > +}
Please consider moving this into a small patch too along with the stub. > + > static int kvm_init(MachineState *ms) > { > MachineClass *mc = MACHINE_GET_CLASS(ms); > @@ -2961,6 +2973,7 @@ int kvm_cpu_exec(CPUState *cpu) > qemu_mutex_lock_iothread(); > kvm_dirty_ring_reap(kvm_state, cpu); > qemu_mutex_unlock_iothread(); > + dirtylimit_vcpu_execute(cpu); > ret = 0; > break; > case KVM_EXIT_SYSTEM_EVENT: > diff --git a/accel/stubs/kvm-stub.c b/accel/stubs/kvm-stub.c > index 5319573..1128cb2 100644 > --- a/accel/stubs/kvm-stub.c > +++ b/accel/stubs/kvm-stub.c > @@ -152,4 +152,9 @@ bool kvm_dirty_ring_enabled(void) > { > return false; > } > + > +uint32_t kvm_dirty_ring_size(void) > +{ > + return 0; > +} > #endif > diff --git a/include/hw/core/cpu.h b/include/hw/core/cpu.h > index 76ab3b8..dbeb31a 100644 > --- a/include/hw/core/cpu.h > +++ b/include/hw/core/cpu.h > @@ -411,6 +411,12 @@ struct CPUState { > */ > bool throttle_thread_scheduled; > > + /* > + * Sleep throttle_us_per_full microseconds once dirty ring is full > + * if dirty page rate limit is enabled. > + */ > + int64_t throttle_us_per_full; > + > bool ignore_memory_transaction_failures; > > /* Used for user-only emulation of prctl(PR_SET_UNALIGN). */ > diff --git a/include/sysemu/dirtylimit.h b/include/sysemu/dirtylimit.h > index da459f0..37e634b 100644 > --- a/include/sysemu/dirtylimit.h > +++ b/include/sysemu/dirtylimit.h > @@ -19,4 +19,20 @@ void vcpu_dirty_rate_stat_start(void); > void vcpu_dirty_rate_stat_stop(void); > void vcpu_dirty_rate_stat_initialize(void); > void vcpu_dirty_rate_stat_finalize(void); > + > +void dirtylimit_state_lock(void); > +void dirtylimit_state_unlock(void); > +void dirtylimit_state_initialize(void); > +void dirtylimit_state_finalize(void); > +void dirtylimit_thread_finalize(void); > +bool dirtylimit_in_service(void); > +bool dirtylimit_vcpu_index_valid(int cpu_index); > +void dirtylimit_start(void); > +void dirtylimit_stop(void); > +void dirtylimit_set_vcpu(int cpu_index, > + uint64_t quota, > + bool enable); > +void dirtylimit_set_all(uint64_t quota, > + bool enable); > +void dirtylimit_vcpu_execute(CPUState *cpu); > #endif > diff --git a/include/sysemu/kvm.h b/include/sysemu/kvm.h > index 6eb39a0..bc3f0b5 100644 > --- a/include/sysemu/kvm.h > +++ b/include/sysemu/kvm.h > @@ -563,4 +563,6 @@ bool kvm_cpu_check_are_resettable(void); > bool kvm_arch_cpu_check_are_resettable(void); > > bool kvm_dirty_ring_enabled(void); > + > +uint32_t kvm_dirty_ring_size(void); > #endif > diff --git a/softmmu/dirtylimit.c b/softmmu/dirtylimit.c > index a10ac6f..cf20020 100644 > --- a/softmmu/dirtylimit.c > +++ b/softmmu/dirtylimit.c > @@ -18,6 +18,32 @@ > #include "sysemu/dirtylimit.h" > #include "exec/memory.h" > #include "hw/boards.h" > +#include "sysemu/kvm.h" > +#include "trace.h" > + > +/* > + * Dirtylimit stop working if dirty page rate error > + * value less than DIRTYLIMIT_TOLERANCE_RANGE > + */ > +#define DIRTYLIMIT_TOLERANCE_RANGE 25 /* MB/s */ > +/* > + * Dirty page rate error greater than > + * DIRTYLIMIT_LINEAR_ADJUSTMENT_WATERMARK should use > + * linear adjustment police. > + */ > +#define DIRTYLIMIT_LINEAR_ADJUSTMENT_WATERMARK 400 /* MB/s */ Is this magic value really useful? Asked since it smells like a workaround to avoid oscillation, at the meantime, AFAICT it's never used, see [1] below. Again, I really hope we use as less magic values as possible. Piling up magic code (without solid proof that it'll help) is scary.. > +/* > + * Plus or minus vcpu sleep time linearly if dirty > + * page rate error value percentage over > + * DIRTYLIMIT_LINEAR_ADJUSTMENT_PCT. > + * Otherwise, plus or minus a fixed vcpu sleep time. > + */ > +#define DIRTYLIMIT_LINEAR_ADJUSTMENT_PCT 50 > +/* > + * Max vcpu sleep time percentage during a cycle > + * composed of dirty ring full and sleep time. > + */ > +#define DIRTYLIMIT_THROTTLE_PCT_MAX 99 > > struct { > VcpuStat stat; > @@ -25,6 +51,31 @@ struct { > QemuThread thread; > } *vcpu_dirty_rate_stat; > > +typedef struct VcpuDirtyLimitState { > + int cpu_index; > + bool enabled; > + /* > + * Quota dirty page rate, unit is MB/s > + * zero if not enabled. > + */ > + uint64_t quota; > +} VcpuDirtyLimitState; > + > +struct { > + VcpuDirtyLimitState *states; > + /* Max cpus number configured by user */ > + int max_cpus; > + /* Number of vcpu under dirtylimit */ > + int limited_nvcpu; > + QEMUBH *bh; > +} *dirtylimit_state; > + > +/* protect dirtylimit_state */ > +static QemuMutex dirtylimit_mutex; > + > +/* dirtylimit thread quit if dirtylimit_quit is true */ > +static bool dirtylimit_quit; > + > static void vcpu_dirty_rate_stat_collect(void) > { > int64_t start_time; > @@ -58,6 +109,7 @@ static void *vcpu_dirty_rate_stat_thread(void *opaque) > > while (qatomic_read(&vcpu_dirty_rate_stat->running)) { > vcpu_dirty_rate_stat_collect(); > + qemu_bh_schedule(dirtylimit_state->bh); When I mentioned "a hook" previously, I meant something like a function pointer. Even simpler, IMHO you could call dirtylimit code directly, assuming this thread only services dirty limit. Or is this BH required for some reason? > } > > /* stop log sync */ > @@ -118,3 +170,259 @@ void vcpu_dirty_rate_stat_finalize(void) > free(vcpu_dirty_rate_stat); > vcpu_dirty_rate_stat = NULL; > } > + > +void dirtylimit_state_lock(void) > +{ > + qemu_mutex_lock(&dirtylimit_mutex); > +} > + > +void dirtylimit_state_unlock(void) > +{ > + qemu_mutex_unlock(&dirtylimit_mutex); > +} > + > +static void > +__attribute__((__constructor__)) dirtylimit_mutex_init(void) > +{ > + qemu_mutex_init(&dirtylimit_mutex); > +} > + > +static inline VcpuDirtyLimitState *dirtylimit_vcpu_get_state(int cpu_index) > +{ > + return &dirtylimit_state->states[cpu_index]; > +} > + > +void dirtylimit_state_initialize(void) > +{ > + MachineState *ms = MACHINE(qdev_get_machine()); > + int max_cpus = ms->smp.max_cpus; > + int i; > + > + dirtylimit_state = g_malloc0(sizeof(*dirtylimit_state)); > + > + dirtylimit_state->states = > + g_malloc0(sizeof(VcpuDirtyLimitState) * max_cpus); > + > + for (i = 0; i < max_cpus; i++) { > + dirtylimit_state->states[i].cpu_index = i; > + } > + > + dirtylimit_state->max_cpus = max_cpus; > + trace_dirtylimit_state_initialize(max_cpus); > +} > + > +void dirtylimit_state_finalize(void) > +{ > + free(dirtylimit_state->states); > + dirtylimit_state->states = NULL; > + > + free(dirtylimit_state); > + dirtylimit_state = NULL; > + > + trace_dirtylimit_state_finalize(); > +} > + > +bool dirtylimit_in_service(void) > +{ > + return !!dirtylimit_state; > +} > + > +bool dirtylimit_vcpu_index_valid(int cpu_index) > +{ > + MachineState *ms = MACHINE(qdev_get_machine()); > + > + return !(cpu_index < 0 || > + cpu_index >= ms->smp.max_cpus); > +} > + > +static inline void dirtylimit_vcpu_set_quota(int cpu_index, > + uint64_t quota, > + bool on) > +{ > + if (on) { > + dirtylimit_state->states[cpu_index].quota = quota; > + if (!dirtylimit_vcpu_get_state(cpu_index)->enabled) { > + dirtylimit_state->limited_nvcpu++; > + } > + } else { > + dirtylimit_state->states[cpu_index].quota = 0; > + if (dirtylimit_state->states[cpu_index].enabled) { > + dirtylimit_state->limited_nvcpu--; > + } > + } > + > + dirtylimit_state->states[cpu_index].enabled = on; > +} > + > +static inline int64_t dirtylimit_dirty_ring_full_time(uint64_t dirtyrate) > +{ > + static uint64_t max_dirtyrate; > + uint32_t dirty_ring_size = kvm_dirty_ring_size(); > + uint64_t dirty_ring_size_meory_MB = > + dirty_ring_size * TARGET_PAGE_SIZE >> 20; > + > + if (max_dirtyrate < dirtyrate) { > + max_dirtyrate = dirtyrate; > + } > + > + return dirty_ring_size_meory_MB * 1000000 / max_dirtyrate; > +} > + > +static inline bool dirtylimit_done(uint64_t quota, > + uint64_t current) > +{ > + uint64_t min, max; > + > + min = MIN(quota, current); > + max = MAX(quota, current); > + > + return ((max - min) <= DIRTYLIMIT_TOLERANCE_RANGE) ? true : false; > +} > + > +static inline bool > +dirtylimit_need_linear_adjustment(uint64_t quota, > + uint64_t current) > +{ > + uint64_t min, max, pct, error; > + > + min = MIN(quota, current); > + max = MAX(quota, current); > + > + error = (max - min); > + > + if (error < DIRTYLIMIT_LINEAR_ADJUSTMENT_WATERMARK) { > + false; [1] I think you wanted to use "return false"... Maybe this also means the macro never helped at all hence it can be dropped. > + } > + > + pct = error * 100 / max; > + > + return pct > DIRTYLIMIT_LINEAR_ADJUSTMENT_PCT; > +} Thanks, -- Peter Xu