Since commit 2cd571245b43 ("sched/fair: Add related data structure for
task based throttle") in v6.18, CFS bandwidth throttling no longer
dequeues a task directly; it queues task_work via TWA_RESUME and sets
TIF_NOTIFY_RESUME, relying on that work running before the task returns
to guest/user mode. The powerpc KVM run loops only checked for reschedule
and signals, never TIF_NOTIFY_RESUME, so the deferred throttle never ran
while a vCPU stayed in the run loop: a CPU-bound guest that rarely exits
to userspace ran far past its cpu.max quota and then appeared frozen for
minutes while the accrued throttle debt was repaid.

Use the generic infrastructure to check for and handle pending work
before transitioning into guest mode, replacing the open-coded
need_resched() and cond_resched() checks in the Book3S HV run loops and
in the common kvmppc_prepare_to_enter() used by the Book3S PR and BookE
run loops. The redundant signal_pending() recheck (and its sigpend label)
in kvmhv_run_single_vcpu() is also dropped, as
xfer_to_guest_mode_work_pending() is a superset of it.

This picks up handling for TIF_NOTIFY_RESUME, which was previously
ignored, meaning task work will now be correctly handled on every
guest re-entry.

In kvmppc_prepare_to_enter() the generic helper accounts the signal exit
(vcpu->stat.signal_exits and KVM_EXIT_INTR) but does not set the exit
type, so kvmppc_set_exit_type(SIGNAL_EXITS) is retained on the signal
path to preserve the E500 CONFIG_KVM_EXIT_TIMING histogram; it is a no-op
otherwise.

Signed-off-by: Vishal Chourasia <[email protected]>
---
 arch/powerpc/kvm/Kconfig     |  1 +
 arch/powerpc/kvm/book3s_hv.c | 64 ++++++++++++++++++++++++++++--------
 arch/powerpc/kvm/powerpc.c   | 34 ++++++++++++++-----
 3 files changed, 77 insertions(+), 22 deletions(-)

diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig
index 9a0d1c1aca6c..b6bc2fc86dca 100644
--- a/arch/powerpc/kvm/Kconfig
+++ b/arch/powerpc/kvm/Kconfig
@@ -22,6 +22,7 @@ config KVM
        select KVM_COMMON
        select KVM_VFIO
        select HAVE_KVM_IRQ_BYPASS
+       select VIRT_XFER_TO_GUEST_WORK
 
 config KVM_BOOK3S_HANDLER
        bool
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 61dbeea317f3..a1b2077561bb 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -3850,10 +3850,20 @@ static noinline void kvmppc_run_core(struct 
kvmppc_vcore *vc)
         * and return without going into the guest(s).
         * If the mmu_ready flag has been cleared, don't go into the
         * guest because that means a HPT resize operation is in progress.
+        *
+        * xfer_to_guest_mode_work_pending() is the IRQs-disabled recheck for
+        * pending guest-mode work (reschedule, signals, and TIF_NOTIFY_RESUME
+        * task_work such as the deferred CFS throttle). It is the pre-POWER9
+        * analog of the final gate in kvmhv_run_single_vcpu(), and a superset
+        * of the old need_resched() check: it catches work that raced in after
+        * the drain in kvmppc_run_vcpu(), so a CPU-bound vCPU is throttled here
+        * instead of running one more guest dispatch past its quota. IRQs are
+        * hard-disabled just above, so the non-__ variant (which asserts that)
+        * is the correct one.
         */
        local_irq_disable();
        hard_irq_disable();
-       if (lazy_irq_pending() || need_resched() ||
+       if (lazy_irq_pending() || xfer_to_guest_mode_work_pending() ||
            recheck_signals_and_mmu(&core_info)) {
                local_irq_enable();
                vc->vcore_state = VCORE_INACTIVE;
@@ -4824,10 +4834,24 @@ static int kvmppc_run_vcpu(struct kvm_vcpu *vcpu)
                vc->runner = vcpu;
                if (n_ceded == vc->n_runnable) {
                        kvmppc_vcore_blocked(vc);
-               } else if (need_resched()) {
+               } else if (__xfer_to_guest_mode_work_pending()) {
                        kvmppc_vcore_preempt(vc);
-                       /* Let something else run */
-                       cond_resched_lock(&vc->lock);
+                       /*
+                        * Let something else run, and run pending guest-mode
+                        * work (reschedule, and TIF_NOTIFY_RESUME task_work 
such
+                        * as the deferred CFS throttle) before we would 
re-enter
+                        * the guest, so a CPU-bound vCPU is actually throttled
+                        * here instead of running past its quota. This is a
+                        * superset of the old need_resched() check. Use the raw
+                        * helper, not the kvm_ wrapper: signals (KVM_EXIT_INTR
+                        * and the signal_exits stat) are accounted by this 
path's
+                        * existing handling below, so going through the wrapper
+                        * here would double-count them. The helper may 
schedule(),
+                        * so the vcore lock is dropped around it.
+                        */
+                       spin_unlock(&vc->lock);
+                       xfer_to_guest_mode_handle_work();
+                       spin_lock(&vc->lock);
                        if (vc->vcore_state == VCORE_PREEMPT)
                                kvmppc_vcore_end_preempt(vc);
                } else {
@@ -4899,8 +4923,21 @@ int kvmhv_run_single_vcpu(struct kvm_vcpu *vcpu, u64 
time_limit,
                }
        }
 
-       if (need_resched())
-               cond_resched();
+       /*
+        * Run pending work before (re-)entering the guest, most importantly
+        * task_work queued via TWA_RESUME (e.g. the deferred CFS bandwidth
+        * throttle, which only sets TIF_NOTIFY_RESUME). Without this a 
CPU-bound
+        * vCPU that keeps returning RESUME_GUEST never reaches an exit-to-user
+        * point, so the throttle is never enforced and the task runs far beyond
+        * its quota. The helper also handles reschedule and signals, replacing
+        * the cond_resched() that was here. It may schedule(), so it runs 
before
+        * preemption and IRQs are disabled, with no vcore/KVM locks held. This
+        * is the per-reentry site shared by the bare-metal and pseries (nested)
+        * paths, so both are covered.
+        */
+       r = kvm_xfer_to_guest_mode_handle_work(vcpu);
+       if (r)  /* -EINTR: signal pending, exit to userspace (KVM_EXIT_INTR) */
+               return r;
 
        kvmppc_update_vpas(vcpu);
 
@@ -4914,9 +4951,14 @@ int kvmhv_run_single_vcpu(struct kvm_vcpu *vcpu, u64 
time_limit,
 
        vcpu->arch.state = KVMPPC_VCPU_RUNNABLE;
 
-       if (signal_pending(current))
-               goto sigpend;
-       if (need_resched() || !kvm->arch.mmu_ready)
+       /*
+        * Final IRQs-disabled check for pending guest-mode work or an MMU that
+        * is not ready. IRQs are disabled here, so bail to the outer loop,
+        * which re-enters and handles the pending work via
+        * kvm_xfer_to_guest_mode_handle_work() above (exiting with -EINTR on a
+        * signal).
+        */
+       if (xfer_to_guest_mode_work_pending() || !kvm->arch.mmu_ready)
                goto out;
 
        vcpu->cpu = pcpu;
@@ -5068,10 +5110,6 @@ int kvmhv_run_single_vcpu(struct kvm_vcpu *vcpu, u64 
time_limit,
 
        return vcpu->arch.ret;
 
- sigpend:
-       vcpu->stat.signal_exits++;
-       run->exit_reason = KVM_EXIT_INTR;
-       vcpu->arch.ret = -EINTR;
  out:
        vcpu->cpu = -1;
        vcpu->arch.thread_cpu = -1;
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 00302399fc37..ff1a9a8de5e0 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -84,20 +84,36 @@ int kvmppc_prepare_to_enter(struct kvm_vcpu *vcpu)
        hard_irq_disable();
 
        while (true) {
-               if (need_resched()) {
+               if (__xfer_to_guest_mode_work_pending()) {
+                       /*
+                        * Handle pending guest-mode work before entering the
+                        * guest: reschedule, signals, and TIF_NOTIFY_RESUME
+                        * task_work such as the deferred CFS bandwidth 
throttle.
+                        * The helper must run with interrupts enabled and may
+                        * schedule(). This is a superset of the open-coded
+                        * need_resched()/signal_pending() checks it replaces. 
On
+                        * a pending signal it returns -EINTR after
+                        * kvm_xfer_to_guest_mode_handle_work() has set
+                        * run->exit_reason (KVM_EXIT_INTR) and bumped
+                        * vcpu->stat.signal_exits, so just return to userspace.
+                        */
                        local_irq_enable();
-                       cond_resched();
+                       r = kvm_xfer_to_guest_mode_handle_work(vcpu);
                        hard_irq_disable();
+                       if (r) {
+                               /*
+                                * -EINTR: the generic helper does not set the
+                                * exit type, so record it here for the E500
+                                * CONFIG_KVM_EXIT_TIMING histogram (a no-op
+                                * otherwise). Only the exit type is set;
+                                * signal_exits was already accounted above.
+                                */
+                               kvmppc_set_exit_type(vcpu, SIGNAL_EXITS);
+                               break;
+                       }
                        continue;
                }
 
-               if (signal_pending(current)) {
-                       kvmppc_account_exit(vcpu, SIGNAL_EXITS);
-                       vcpu->run->exit_reason = KVM_EXIT_INTR;
-                       r = -EINTR;
-                       break;
-               }
-
                vcpu->mode = IN_GUEST_MODE;
 
                /*
-- 
2.54.0


Reply via email to