On Thu, Mar 24, 2022 at 10:04:34AM -0700, Mike Larkin wrote:
> On Mon, Mar 21, 2022 at 11:40:27AM -0400, Dave Voutila wrote:
> >
> > Dave Voutila <[email protected]> writes:
> >
> > > Martin Pieuchot <[email protected]> writes:
> > >
> > >> I see the following in the dmesg:
> > >>
> > >> vcpu_run_vmx: failed vmresume for unknown reason
> > >> vcpu_run_vmx: error code = 5, VMRESUME: non-launched VMCS
> > >
> > > This is due to intel's vmx design. We need some handling to flush vmcs
> > > state on suspend and reload it on resume. This is on my backlog of
> > > things to clean up.
> > >
> >
> > Here's a diff that fixes the issue for me on my x270. I've performed
> > limited testing on my AMD-based X13. Will probably send to tech@ later
> > today for more eyeballs and tests.
> >
> > In short, it adds both a barrier and refcounting to the ioctl
> > handler. On device quiesce, we drain device users from the critical path
> > requiring access to guest state. For every vcpu, we vmclear (if Intel
> > host) where needed.
> >
> > To handle hibernate, we also do a vmm_stop()/vmm_start(), which on Intel
> > means issuing vmxoff/vmxon instructions on each cpu. Normally we only do
> > this during vm creation/termination.
> >
> > On wakeup, we reverse the process and notify any waiting device users
> > blocked by the barrier.
> >
>
> See comments below...
>
> -ml
>
> >
> > diff refs/heads/master refs/heads/vmm-suspend
> > blob - a195b5d247b957c1f5c12cb153ba81ed2810e89a
> > blob + 6a96d7b1297b231b0cffdd1dca933ebdf9ff2a5d
> > --- sys/arch/amd64/amd64/vmm.c
> > +++ sys/arch/amd64/amd64/vmm.c
> > @@ -25,6 +25,7 @@
> >  #include <sys/user.h>
> >  #include <sys/ioctl.h>
> >  #include <sys/queue.h>
> > +#include <sys/refcnt.h>
> >  #include <sys/rwlock.h>
> >  #include <sys/pledge.h>
> >  #include <sys/memrange.h>
> > @@ -88,6 +89,12 @@ SLIST_HEAD(vmlist_head, vm);
> >  struct vmm_softc {
> >     struct device           sc_dev;
> >
> > +   /* Suspend/Resume Synchronization */
> > +   struct refcnt           sc_refcnt;
> > +   volatile unsigned int   sc_status;
> > +#define VMM_SUSPENDED              (unsigned int) 0
> > +#define VMM_ACTIVE         (unsigned int) 1
> > +
> >     /* Capabilities */
> >     uint32_t                nr_vmx_cpus;
> >     uint32_t                nr_svm_cpus;
> > @@ -115,9 +122,11 @@ void vmx_dump_vmcs_field(uint16_t, const char *);
> >  int vmm_enabled(void);
> >  int vmm_probe(struct device *, void *, void *);
> >  void vmm_attach(struct device *, struct device *, void *);
> > +int vmm_activate(struct device *, int);
> >  int vmmopen(dev_t, int, int, struct proc *);
> >  int vmmioctl(dev_t, u_long, caddr_t, int, struct proc *);
> >  int vmmclose(dev_t, int, int, struct proc *);
> > +int vmm_quiesce_vmx(void);
> >  int vmm_start(void);
> >  int vmm_stop(void);
> >  size_t vm_create_check_mem_ranges(struct vm_create_params *);
> > @@ -264,7 +273,7 @@ struct cfdriver vmm_cd = {
> >  };
> >
> >  const struct cfattach vmm_ca = {
> > -   sizeof(struct vmm_softc), vmm_probe, vmm_attach, NULL, NULL
> > +   sizeof(struct vmm_softc), vmm_probe, vmm_attach, NULL, vmm_activate
> >  };
> >
> >  /*
> > @@ -367,6 +376,12 @@ vmm_attach(struct device *parent, struct device *self,
> >     struct cpu_info *ci;
> >     CPU_INFO_ITERATOR cii;
> >
> > +   sc->sc_status = VMM_ACTIVE;
> > +
> > +   /* We're in autoconf, so immediately release our refcnt. */
> > +   refcnt_init(&sc->sc_refcnt);
> > +   refcnt_rele(&sc->sc_refcnt);
> > +
> >     sc->nr_vmx_cpus = 0;
> >     sc->nr_svm_cpus = 0;
> >     sc->nr_rvi_cpus = 0;
> > @@ -441,6 +456,163 @@ vmm_attach(struct device *parent, struct device *self,
> >  }
> >
> >  /*
> > + * vmm_quiesce_vmx
> > + *
> > + * Prepare the host for suspend by flushing all VMCS states.
> > + */
> > +int
> > +vmm_quiesce_vmx(void)
> > +{
> > +   struct vm               *vm;
> > +   struct vcpu             *vcpu;
> > +   int                      err;
> > +
> > +   /*
> > +    * We should be only called from a quiescing device state so we
> > +    * don't expect to sleep here. If we can't get all our locks,
> > +    * something is wrong.
> > +    */
> > +   if ((err = rw_enter(&vmm_softc->vm_lock, RW_WRITE | RW_NOSLEEP)))
> > +           return (err);
> > +
> > +   /* Iterate over each vm... */
> > +   SLIST_FOREACH(vm, &vmm_softc->vm_list, vm_link) {
> > +           if ((err = rw_enter(&vm->vm_vcpu_lock, RW_READ | RW_NOSLEEP)))
> > +                   break;
> > +
> > +           /* Iterate over each vcpu... */
> > +           SLIST_FOREACH(vcpu, &vm->vm_vcpu_list, vc_vcpu_link) {
> > +                   err = rw_enter(&vcpu->vc_lock, RW_WRITE | RW_NOSLEEP);
> > +                   if (err)
> > +                           break;
> > +
> > +                   /* We can skip unlaunched VMCS. Nothing to flush. */
> > +                   if (atomic_load_int(&vcpu->vc_vmx_vmcs_state)
> > +                       != VMCS_LAUNCHED) {
> > +                           DPRINTF("%s: skipping vcpu %d for vm %d\n",
> > +                               __func__, vcpu->vc_id, vm->vm_id);
> > +                           rw_exit_write(&vcpu->vc_lock);
> > +                           continue;
> > +                   }
> > +
> > +                   if (vcpu->vc_last_pcpu != curcpu()) {
> > +                           /* Remote cpu vmclear via ipi. */
> > +                           err = vmx_remote_vmclear(vcpu->vc_last_pcpu,
> > +                               vcpu);
> > +                           if (err)
> > +                                   printf("%s: failed to remote vmclear "
> > +                                       "vcpu %d of vm %d\n", __func__,
> > +                                       vcpu->vc_id, vm->vm_id);
> > +                   } else {
> > +                           /* Local cpu vmclear instruction. */
> > +                           if ((err = vmclear(&vcpu->vc_control_pa)))
> > +                                   printf("%s: failed to locally vmclear "
> > +                                       "vcpu %d of vm %d\n", __func__,
> > +                                       vcpu->vc_id, vm->vm_id);
> > +                           atomic_swap_uint(&vcpu->vc_vmx_vmcs_state,
> > +                               VMCS_CLEARED);
> > +                   }
> > +
> > +                   rw_exit_write(&vcpu->vc_lock);
> > +                   if (err)
> > +                           break;
> > +                   DPRINTF("%s: cleared vcpu %d for vm %d\n", __func__,
> > +                       vcpu->vc_id, vm->vm_id);
> > +           }
> > +           rw_exit_read(&vm->vm_vcpu_lock);
> > +           if (err)
> > +                   break;
> > +   }
> > +   rw_exit_write(&vmm_softc->vm_lock);
> > +
> > +   if (err)
> > +           return (err);
> > +   return (0);
> > +}
> > +
> > +/*
> > + * vmm_activate
> > + */
> > +int
> > +vmm_activate(struct device *self, int act)
> > +{
> > +   struct cpu_info         *ci = curcpu();
> > +   unsigned int             old_state;
> > +
> > +   switch (act) {
> > +   case DVACT_QUIESCE:
> > +           /* Block device users as we're suspending operation. */
> > +           old_state = atomic_cas_uint(&vmm_softc->sc_status, VMM_ACTIVE,
> > +               VMM_SUSPENDED);
> > +           if (old_state != VMM_ACTIVE) {
> > +                   printf("%s: invalid device state on quiesce (%d)\n",
> > +                       __func__, old_state);
> > +                   return (1);
>
> I think returning 1 here will cause the config_suspend_children mechanism
> to try and undo/abort the zzz operation. I recall some instability in that
> area before; it probably makes sense to leave the printf and return 0. You'll
> possibly lose your VMs after resume but at least your machine won't go into
> that lightly tested "unwind on error" path (and likely hang or crash
> elsewhere). There are a few of these return (1) here in this function.
>

Was chatting with Theo about this a bit. All these failure conditions really
can't happen. Well, they *could* happen (like if IPIs started failing) but in
that case we'd be so screwed elsewhere that checking these conditions doesn't
make sense anyway.

So you can probably treat all these as not fail.

-ml

> > +           }
> > +
> > +           /* Wait for any device users to finish. */
> > +           while (refcnt_read(&vmm_softc->sc_refcnt) > 0) {
> > +                   if (tsleep_nsec(&vmm_softc, PPAUSE, "vmm",
> > +                       MSEC_TO_NSEC(1)) != EWOULDBLOCK) {
> > +                           printf("%s: interrupted while draining users\n",
> > +                               __func__);
> > +                           goto quiesce_fail;
> > +                   }
> > +           }
> > +
> > +           /* If we're not in vmm mode, nothing to do. */
> > +           if ((ci->ci_flags & CPUF_VMM) == 0)
> > +                   break;
> > +
> > +           /* Intel systems need extra steps to sync vcpu state. */
> > +           if (vmm_softc->mode == VMM_MODE_EPT ||
> > +               vmm_softc->mode == VMM_MODE_VMX)
> > +                   if (vmm_quiesce_vmx()) {
> > +                           printf("%s: failed to prepare vcpus for "
> > +                               "host suspend\n", __func__);
> > +                           goto quiesce_fail;
> > +                   }
> > +
> > +           /* Stop virtualization mode on all cpus. */
> > +           if (vmm_stop()) {
> > +                   printf("%s: failed to stop vmm mode on quiesce\n",
> > +                       __func__);
> > +                   goto quiesce_fail;
> > +           }
> > +           break;
> > +
> > +   case DVACT_WAKEUP:
> > +           /* Restart virtualization mode on all cpus. */
> > +           if (vmm_softc->vm_ct > 0 && vmm_start()) {
> > +                   printf("%s: failed to restart vmm mode on wakeup\n",
> > +                       __func__);
> > +                   return (1);
>
> Same. Just printf and move on (return 0).
>
> > +           }
> > +
> > +           /* Set the device back to active. */
> > +           old_state = atomic_cas_uint(&vmm_softc->sc_status,
> > +               VMM_SUSPENDED, VMM_ACTIVE);
> > +           if (old_state != VMM_SUSPENDED) {
> > +                   printf("%s: invalid device state on wakeup (%d)\n",
> > +                       __func__, old_state);
> > +                   return (1);
> > +           }
> > +
> > +           /* Notify any waiting device users. */
> > +           wakeup(&vmm_softc);
> > +           break;
> > +   }
> > +
> > +   return (0);
> > +
> > +quiesce_fail:
> > +   /* Something's wrong. Need to try getting back to a good state. */
> > +   atomic_store_int(&vmm_softc->sc_status, VMM_ACTIVE);
> > +   wakeup(&vmm_softc);
> > +   return (1);
>
> Same
>
> > +}
> > +
> > +/*
> >   * vmmopen
> >   *
> >   * Called during open of /dev/vmm.
> > @@ -476,10 +648,24 @@ vmmopen(dev_t dev, int flag, int mode, struct proc *p)
> >  int
> >  vmmioctl(dev_t dev, u_long cmd, caddr_t data, int flag, struct proc *p)
> >  {
> > -   int ret;
> > +   int                     ret;
> >
> >     KERNEL_UNLOCK();
> >
> > +   while (atomic_load_int(&vmm_softc->sc_status) != VMM_ACTIVE) {
> > +           /* Wait for the signal that we're running again. */
> > +           ret = tsleep_nsec(&vmm_softc, PWAIT | PCATCH, "vmm",
> > +               MSEC_TO_NSEC(1));
> > +           if (ret != ERESTART && ret != EINTR && ret != EWOULDBLOCK
> > +               && ret != 0) {
> > +                   printf("%s: unhandled wakeup (%d) for device\n",
> > +                       __func__, ret);
> > +                   ret = EBUSY;
> > +                   goto out;
> > +           }
> > +   }
> > +   refcnt_take(&vmm_softc->sc_refcnt);
> > +
> >     switch (cmd) {
> >     case VMM_IOC_CREATE:
> >             if ((ret = vmm_start()) != 0) {
> > @@ -524,6 +710,8 @@ vmmioctl(dev_t dev, u_long cmd, caddr_t data, int flag
> >             ret = ENOTTY;
> >     }
> >
> > +   refcnt_rele(&vmm_softc->sc_refcnt);
> > +out:
> >     KERNEL_LOCK();
> >
> >     return (ret);

Reply via email to