dirtylimit: implement virtual CPU throttle

Peter Xu Tue, 08 Feb 2022 02:12:30 -0800

On Mon, Jan 24, 2022 at 10:10:39PM +0800, huang...@chinatelecom.cn wrote:
> From: Hyman Huang(黄勇) <huang...@chinatelecom.cn>
> 
> Setup a negative feedback system when vCPU thread
> handling KVM_EXIT_DIRTY_RING_FULL exit by introducing
> throttle_us_per_full field in struct CPUState. Sleep
> throttle_us_per_full microseconds to throttle vCPU
> if dirtylimit is enabled.
> 
> Start a thread to track current dirty page rates and
> tune the throttle_us_per_full dynamically untill current
> dirty page rate reach the quota.
> 
> Introduce the util function in the header for dirtylimit
> implementation.
> 
> Signed-off-by: Hyman Huang(黄勇) <huang...@chinatelecom.cn>
> ---
>  accel/kvm/kvm-all.c         |  13 ++
>  accel/stubs/kvm-stub.c      |   5 +
>  include/hw/core/cpu.h       |   6 +
>  include/sysemu/dirtylimit.h |  16 +++
>  include/sysemu/kvm.h        |   2 +
>  softmmu/dirtylimit.c        | 308 
> ++++++++++++++++++++++++++++++++++++++++++++
>  softmmu/trace-events        |   8 ++
>  7 files changed, 358 insertions(+)
> 
> diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
> index 1a5f1d1..60f51fd 100644
> --- a/accel/kvm/kvm-all.c
> +++ b/accel/kvm/kvm-all.c
> @@ -45,6 +45,7 @@
>  #include "qemu/guest-random.h"
>  #include "sysemu/hw_accel.h"
>  #include "kvm-cpus.h"
> +#include "sysemu/dirtylimit.h"
>  
>  #include "hw/boards.h"
>  
> @@ -476,6 +477,7 @@ int kvm_init_vcpu(CPUState *cpu, Error **errp)
>      cpu->kvm_state = s;
>      cpu->vcpu_dirty = true;
>      cpu->dirty_pages = 0;
> +    cpu->throttle_us_per_full = 0;
>  
>      mmap_size = kvm_ioctl(s, KVM_GET_VCPU_MMAP_SIZE, 0);
>      if (mmap_size < 0) {
> @@ -1469,6 +1471,11 @@ static void *kvm_dirty_ring_reaper_thread(void *data)
>           */
>          sleep(1);
>  
> +        /* keep sleeping in order to not interfere the dirtylimit */
> +        if (dirtylimit_in_service()) {
> +            continue;
> +        }
> +
>          trace_kvm_dirty_ring_reaper("wakeup");
>          r->reaper_state = KVM_DIRTY_RING_REAPER_REAPING;
>  
> @@ -2312,6 +2319,11 @@ bool kvm_dirty_ring_enabled(void)
>      return kvm_state->kvm_dirty_ring_size ? true : false;
>  }
>  
> +uint32_t kvm_dirty_ring_size(void)
> +{
> +    return kvm_state->kvm_dirty_ring_size;
> +}


Please consider moving this into a small patch too along with the stub.

> +
>  static int kvm_init(MachineState *ms)
>  {
>      MachineClass *mc = MACHINE_GET_CLASS(ms);
> @@ -2961,6 +2973,7 @@ int kvm_cpu_exec(CPUState *cpu)
>              qemu_mutex_lock_iothread();
>              kvm_dirty_ring_reap(kvm_state, cpu);
>              qemu_mutex_unlock_iothread();
> +            dirtylimit_vcpu_execute(cpu);
>              ret = 0;
>              break;
>          case KVM_EXIT_SYSTEM_EVENT:
> diff --git a/accel/stubs/kvm-stub.c b/accel/stubs/kvm-stub.c
> index 5319573..1128cb2 100644
> --- a/accel/stubs/kvm-stub.c
> +++ b/accel/stubs/kvm-stub.c
> @@ -152,4 +152,9 @@ bool kvm_dirty_ring_enabled(void)
>  {
>      return false;
>  }
> +
> +uint32_t kvm_dirty_ring_size(void)
> +{
> +    return 0;
> +}
>  #endif
> diff --git a/include/hw/core/cpu.h b/include/hw/core/cpu.h
> index 76ab3b8..dbeb31a 100644
> --- a/include/hw/core/cpu.h
> +++ b/include/hw/core/cpu.h
> @@ -411,6 +411,12 @@ struct CPUState {
>       */
>      bool throttle_thread_scheduled;
>  
> +    /*
> +     * Sleep throttle_us_per_full microseconds once dirty ring is full
> +     * if dirty page rate limit is enabled.
> +     */
> +    int64_t throttle_us_per_full;
> +
>      bool ignore_memory_transaction_failures;
>  
>      /* Used for user-only emulation of prctl(PR_SET_UNALIGN). */
> diff --git a/include/sysemu/dirtylimit.h b/include/sysemu/dirtylimit.h
> index da459f0..37e634b 100644
> --- a/include/sysemu/dirtylimit.h
> +++ b/include/sysemu/dirtylimit.h
> @@ -19,4 +19,20 @@ void vcpu_dirty_rate_stat_start(void);
>  void vcpu_dirty_rate_stat_stop(void);
>  void vcpu_dirty_rate_stat_initialize(void);
>  void vcpu_dirty_rate_stat_finalize(void);
> +
> +void dirtylimit_state_lock(void);
> +void dirtylimit_state_unlock(void);
> +void dirtylimit_state_initialize(void);
> +void dirtylimit_state_finalize(void);
> +void dirtylimit_thread_finalize(void);
> +bool dirtylimit_in_service(void);
> +bool dirtylimit_vcpu_index_valid(int cpu_index);
> +void dirtylimit_start(void);
> +void dirtylimit_stop(void);
> +void dirtylimit_set_vcpu(int cpu_index,
> +                         uint64_t quota,
> +                         bool enable);
> +void dirtylimit_set_all(uint64_t quota,
> +                        bool enable);
> +void dirtylimit_vcpu_execute(CPUState *cpu);
>  #endif
> diff --git a/include/sysemu/kvm.h b/include/sysemu/kvm.h
> index 6eb39a0..bc3f0b5 100644
> --- a/include/sysemu/kvm.h
> +++ b/include/sysemu/kvm.h
> @@ -563,4 +563,6 @@ bool kvm_cpu_check_are_resettable(void);
>  bool kvm_arch_cpu_check_are_resettable(void);
>  
>  bool kvm_dirty_ring_enabled(void);
> +
> +uint32_t kvm_dirty_ring_size(void);
>  #endif
> diff --git a/softmmu/dirtylimit.c b/softmmu/dirtylimit.c
> index a10ac6f..cf20020 100644
> --- a/softmmu/dirtylimit.c
> +++ b/softmmu/dirtylimit.c
> @@ -18,6 +18,32 @@
>  #include "sysemu/dirtylimit.h"
>  #include "exec/memory.h"
>  #include "hw/boards.h"
> +#include "sysemu/kvm.h"
> +#include "trace.h"
> +
> +/*
> + * Dirtylimit stop working if dirty page rate error
> + * value less than DIRTYLIMIT_TOLERANCE_RANGE
> + */
> +#define DIRTYLIMIT_TOLERANCE_RANGE  25  /* MB/s */
> +/*
> + * Dirty page rate error greater than
> + * DIRTYLIMIT_LINEAR_ADJUSTMENT_WATERMARK should use
> + * linear adjustment police.
> + */
> +#define DIRTYLIMIT_LINEAR_ADJUSTMENT_WATERMARK  400  /* MB/s */

Is this magic value really useful?

Asked since it smells like a workaround to avoid oscillation, at the meantime,
AFAICT it's never used, see [1] below.

Again, I really hope we use as less magic values as possible.  Piling up magic
code (without solid proof that it'll help) is scary..

> +/*
> + * Plus or minus vcpu sleep time linearly if dirty
> + * page rate error value percentage over
> + * DIRTYLIMIT_LINEAR_ADJUSTMENT_PCT.
> + * Otherwise, plus or minus a fixed vcpu sleep time.
> + */
> +#define DIRTYLIMIT_LINEAR_ADJUSTMENT_PCT     50
> +/*
> + * Max vcpu sleep time percentage during a cycle
> + * composed of dirty ring full and sleep time.
> + */
> +#define DIRTYLIMIT_THROTTLE_PCT_MAX 99
>  
>  struct {
>      VcpuStat stat;
> @@ -25,6 +51,31 @@ struct {
>      QemuThread thread;
>  } *vcpu_dirty_rate_stat;
>  
> +typedef struct VcpuDirtyLimitState {
> +    int cpu_index;
> +    bool enabled;
> +    /*
> +     * Quota dirty page rate, unit is MB/s
> +     * zero if not enabled.
> +     */
> +    uint64_t quota;
> +} VcpuDirtyLimitState;
> +
> +struct {
> +    VcpuDirtyLimitState *states;
> +    /* Max cpus number configured by user */
> +    int max_cpus;
> +    /* Number of vcpu under dirtylimit */
> +    int limited_nvcpu;
> +    QEMUBH *bh;
> +} *dirtylimit_state;
> +
> +/* protect dirtylimit_state */
> +static QemuMutex dirtylimit_mutex;
> +
> +/* dirtylimit thread quit if dirtylimit_quit is true */
> +static bool dirtylimit_quit;
> +
>  static void vcpu_dirty_rate_stat_collect(void)
>  {
>      int64_t start_time;
> @@ -58,6 +109,7 @@ static void *vcpu_dirty_rate_stat_thread(void *opaque)
>  
>      while (qatomic_read(&vcpu_dirty_rate_stat->running)) {
>          vcpu_dirty_rate_stat_collect();
> +        qemu_bh_schedule(dirtylimit_state->bh);

When I mentioned "a hook" previously, I meant something like a function
pointer.  Even simpler, IMHO you could call dirtylimit code directly, assuming
this thread only services dirty limit.

Or is this BH required for some reason?

>      }
>  
>      /* stop log sync */
> @@ -118,3 +170,259 @@ void vcpu_dirty_rate_stat_finalize(void)
>      free(vcpu_dirty_rate_stat);
>      vcpu_dirty_rate_stat = NULL;
>  }
> +
> +void dirtylimit_state_lock(void)
> +{
> +    qemu_mutex_lock(&dirtylimit_mutex);
> +}
> +
> +void dirtylimit_state_unlock(void)
> +{
> +    qemu_mutex_unlock(&dirtylimit_mutex);
> +}
> +
> +static void
> +__attribute__((__constructor__)) dirtylimit_mutex_init(void)
> +{
> +    qemu_mutex_init(&dirtylimit_mutex);
> +}
> +
> +static inline VcpuDirtyLimitState *dirtylimit_vcpu_get_state(int cpu_index)
> +{
> +    return &dirtylimit_state->states[cpu_index];
> +}
> +
> +void dirtylimit_state_initialize(void)
> +{
> +    MachineState *ms = MACHINE(qdev_get_machine());
> +    int max_cpus = ms->smp.max_cpus;
> +    int i;
> +
> +    dirtylimit_state = g_malloc0(sizeof(*dirtylimit_state));
> +
> +    dirtylimit_state->states =
> +            g_malloc0(sizeof(VcpuDirtyLimitState) * max_cpus);
> +
> +    for (i = 0; i < max_cpus; i++) {
> +        dirtylimit_state->states[i].cpu_index = i;
> +    }
> +
> +    dirtylimit_state->max_cpus = max_cpus;
> +    trace_dirtylimit_state_initialize(max_cpus);
> +}
> +
> +void dirtylimit_state_finalize(void)
> +{
> +    free(dirtylimit_state->states);
> +    dirtylimit_state->states = NULL;
> +
> +    free(dirtylimit_state);
> +    dirtylimit_state = NULL;
> +
> +    trace_dirtylimit_state_finalize();
> +}
> +
> +bool dirtylimit_in_service(void)
> +{
> +    return !!dirtylimit_state;
> +}
> +
> +bool dirtylimit_vcpu_index_valid(int cpu_index)
> +{
> +    MachineState *ms = MACHINE(qdev_get_machine());
> +
> +    return !(cpu_index < 0 ||
> +             cpu_index >= ms->smp.max_cpus);
> +}
> +
> +static inline void dirtylimit_vcpu_set_quota(int cpu_index,
> +                                             uint64_t quota,
> +                                             bool on)
> +{
> +    if (on) {
> +        dirtylimit_state->states[cpu_index].quota = quota;
> +        if (!dirtylimit_vcpu_get_state(cpu_index)->enabled) {
> +            dirtylimit_state->limited_nvcpu++;
> +        }
> +    } else {
> +        dirtylimit_state->states[cpu_index].quota = 0;
> +        if (dirtylimit_state->states[cpu_index].enabled) {
> +            dirtylimit_state->limited_nvcpu--;
> +        }
> +    }
> +
> +    dirtylimit_state->states[cpu_index].enabled = on;
> +}
> +
> +static inline int64_t dirtylimit_dirty_ring_full_time(uint64_t dirtyrate)
> +{
> +    static uint64_t max_dirtyrate;
> +    uint32_t dirty_ring_size = kvm_dirty_ring_size();
> +    uint64_t dirty_ring_size_meory_MB =
> +        dirty_ring_size * TARGET_PAGE_SIZE >> 20;
> +
> +    if (max_dirtyrate < dirtyrate) {
> +        max_dirtyrate = dirtyrate;
> +    }
> +
> +    return dirty_ring_size_meory_MB * 1000000 / max_dirtyrate;
> +}
> +
> +static inline bool dirtylimit_done(uint64_t quota,
> +                                   uint64_t current)
> +{
> +    uint64_t min, max;
> +
> +    min = MIN(quota, current);
> +    max = MAX(quota, current);
> +
> +    return ((max - min) <= DIRTYLIMIT_TOLERANCE_RANGE) ? true : false;
> +}
> +
> +static inline bool
> +dirtylimit_need_linear_adjustment(uint64_t quota,
> +                                  uint64_t current)
> +{
> +    uint64_t min, max, pct, error;
> +
> +    min = MIN(quota, current);
> +    max = MAX(quota, current);
> +
> +    error = (max - min);
> +
> +    if (error < DIRTYLIMIT_LINEAR_ADJUSTMENT_WATERMARK) {
> +        false;

[1]

I think you wanted to use "return false"... Maybe this also means the macro
never helped at all hence it can be dropped.

> +    }
> +
> +    pct = error * 100 / max;
> +
> +    return pct > DIRTYLIMIT_LINEAR_ADJUSTMENT_PCT;
> +}

Thanks,

-- 
Peter Xu

Re: [PATCH v12 4/5] softmmu/dirtylimit: implement virtual CPU throttle

Reply via email to