On Fri, Jul 15, 2022 at 12:27:04PM -0400, Dave Voutila wrote:
> The following diff adds in formalization around mmio assists for nested
> page/ept faults on Intel and AMD vmm(4) hosts. It provides what little
> information is available to userland in terms of either the instruction
> bytes (on AMD) or the instruction length (on Intel).
>
> vmd is updated to intercept these vm exit events, but will currently log
> the issue and cause the vm process to exit. This is the same behavior
> folks experience currently when a guest attempts to read/write guest
> physical addresses in the mmio-reserved ranges, but now with an
> explanation of the reason and current {e,r,}ip value.
>
> This is the foundation I'll build upon while implementing instruction
> decode and emulation support in userland. No noticeable change should
> occur for existing guests that don't trigger mmio assist events.
>
> ok?
>

See below.

-ml

> -dv
>
>
> diff refs/heads/master refs/heads/mmio
> commit - 10e026163f31687dba11fb4655500afb4e616258
> commit + 7d92e26b51c3fd520807dbcd5233f14b76bc611e
> blob - 84da19438b74377276b16b4b4f7db45ae9ec6be2
> blob + a89a4dc0fbe7b31a47390de363109092b76ffa22
> --- sys/arch/amd64/amd64/vmm.c
> +++ sys/arch/amd64/amd64/vmm.c
> @@ -4891,11 +4891,20 @@ vcpu_run_vmx(struct vcpu *vcpu, struct vm_run_params *
>                               vcpu->vc_gueststate.vg_rax =
>                                   vcpu->vc_exit.vei.vei_data;
>                       break;
> +             case VMX_EXIT_EPT_VIOLATION:
> +                     ret = vcpu_writeregs_vmx(vcpu, VM_RWREGS_GPRS, 0,
> +                         &vcpu->vc_exit.vrs);
> +                     if (ret) {
> +                             printf("%s: vm %d vcpu %d failed to update "
> +                                 "registers\n", __func__,
> +                                 vcpu->vc_parent->vm_id, vcpu->vc_id);
> +                             return (EINVAL);
> +                     }
> +                     break;
>               case VM_EXIT_NONE:
>               case VMX_EXIT_HLT:
>               case VMX_EXIT_INT_WINDOW:
>               case VMX_EXIT_EXTINT:
> -             case VMX_EXIT_EPT_VIOLATION:
>               case VMX_EXIT_CPUID:
>               case VMX_EXIT_XSETBV:
>                       break;
> @@ -4927,6 +4936,7 @@ vcpu_run_vmx(struct vcpu *vcpu, struct vm_run_params *
>                       break;
>  #endif /* VMM_DEBUG */
>               }
> +             memset(&vcpu->vc_exit, 0, sizeof(vcpu->vc_exit));
>       }
>
>       setregion(&gdt, ci->ci_gdt, GDT_SIZE - 1);
> @@ -5658,7 +5668,7 @@ vmm_get_guest_memtype(struct vm *vm, paddr_t gpa)
>
>       if (gpa >= VMM_PCI_MMIO_BAR_BASE && gpa <= VMM_PCI_MMIO_BAR_END) {
>               DPRINTF("guest mmio access @ 0x%llx\n", (uint64_t)gpa);
> -             return (VMM_MEM_TYPE_REGULAR);
> +             return (VMM_MEM_TYPE_MMIO);
>       }
>
>       /* XXX Use binary search? */
> @@ -5782,18 +5792,28 @@ int
>  svm_handle_np_fault(struct vcpu *vcpu)
>  {
>       uint64_t gpa;
> -     int gpa_memtype, ret;
> +     int gpa_memtype, ret = 0;
>       struct vmcb *vmcb = (struct vmcb *)vcpu->vc_control_va;
> +     struct vm_exit_eptviolation *vee = &vcpu->vc_exit.vee;
>
> -     ret = 0;
> +     memset(vee, 0, sizeof(*vee));
> +     vee->vee_fault_type = VEE_FAULT_INVALID;
>
>       gpa = vmcb->v_exitinfo2;
>
>       gpa_memtype = vmm_get_guest_memtype(vcpu->vc_parent, gpa);
>       switch (gpa_memtype) {
>       case VMM_MEM_TYPE_REGULAR:
> +             vee->vee_fault_type = VEE_FAULT_HANDLED;
>               ret = svm_fault_page(vcpu, gpa);
>               break;
> +     case VMM_MEM_TYPE_MMIO:
> +             vee->vee_fault_type = VEE_FAULT_MMIO_ASSIST;
> +             vee->vee_insn_len = vmcb->v_n_bytes_fetched;

If you are going to depend on decode assists, we should probably validate
that the CPU has them:

CPUID Fn8000_000A_EDX[7]:
 DecodeAssists Decode assists. Indicates support for the decode assists. See 
“Decode Assists.”

It does look like we sorta assume that we have some of these features (like
support for cleanbits, and even NP itself), so there are probably a few
checks we should do in attach. We have something similar in vmx but it's
done later on vm create IIRC because at the time we wanted to run on
a variety of host CPUs with different capabilities. I don't think that
is important anymore, so maybe a short-circuit check for some of these
required features in vmmattach for both svm and vmx is warranted now.

FWIW I've never seen an svm CPU without these assists (but I bet they exist).

> +             memcpy(&vee->vee_insn_bytes, vmcb->v_guest_ins_bytes,
> +                 sizeof(vee->vee_insn_bytes));
> +             ret = EAGAIN;
> +             break;
>       default:
>               printf("unknown memory type %d for GPA 0x%llx\n",
>                   gpa_memtype, gpa);
> @@ -5862,10 +5882,13 @@ vmx_fault_page(struct vcpu *vcpu, paddr_t gpa)
>  int
>  vmx_handle_np_fault(struct vcpu *vcpu)
>  {
> -     uint64_t gpa;
> -     int gpa_memtype, ret;
> +     uint64_t insn_len = 0, gpa;
> +     int gpa_memtype, ret = 0;
> +     struct vm_exit_eptviolation *vee = &vcpu->vc_exit.vee;
>
> -     ret = 0;
> +     memset(vee, 0, sizeof(*vee));
> +     vee->vee_fault_type = VEE_FAULT_INVALID;
> +
>       if (vmread(VMCS_GUEST_PHYSICAL_ADDRESS, &gpa)) {
>               printf("%s: cannot extract faulting pa\n", __func__);
>               return (EINVAL);
> @@ -5874,8 +5897,21 @@ vmx_handle_np_fault(struct vcpu *vcpu)
>       gpa_memtype = vmm_get_guest_memtype(vcpu->vc_parent, gpa);
>       switch (gpa_memtype) {
>       case VMM_MEM_TYPE_REGULAR:
> +             vee->vee_fault_type = VEE_FAULT_HANDLED;
>               ret = vmx_fault_page(vcpu, gpa);
>               break;
> +     case VMM_MEM_TYPE_MMIO:
> +             vee->vee_fault_type = VEE_FAULT_MMIO_ASSIST;
> +             if (vmread(VMCS_INSTRUCTION_LENGTH, &insn_len) ||
> +                 insn_len == 0 || insn_len > 15) {
> +                     printf("%s: cannot extract instruction length\n",
> +                         __func__);
> +                     ret = EINVAL;
> +             } else {
> +                     vee->vee_insn_len = (uint32_t)insn_len;
> +                     ret = EAGAIN;
> +             }
> +             break;
>       default:
>               printf("unknown memory type %d for GPA 0x%llx\n",
>                   gpa_memtype, gpa);
> @@ -7321,7 +7357,19 @@ vcpu_run_svm(struct vcpu *vcpu, struct vm_run_params *
>                                   vcpu->vc_exit.vei.vei_data;
>                               vmcb->v_rax = vcpu->vc_gueststate.vg_rax;
>                       }
> +                     break;
> +             case SVM_VMEXIT_NPF:
> +                     ret = vcpu_writeregs_svm(vcpu, VM_RWREGS_GPRS,
> +                         &vcpu->vc_exit.vrs);
> +                     if (ret) {
> +                             printf("%s: vm %d vcpu %d failed to update "
> +                                 "registers\n", __func__,
> +                                 vcpu->vc_parent->vm_id, vcpu->vc_id);
> +                             return (EINVAL);
> +                     }
> +                     break;
>               }
> +             memset(&vcpu->vc_exit, 0, sizeof(vcpu->vc_exit));
>       }
>
>       while (ret == 0) {
> blob - 6fbb5e1bdc22666010e1c7484035d5d09cd7955b
> blob + aa1628405717bda486de225055c89c5db91b01c5
> --- sys/arch/amd64/include/vmmvar.h
> +++ sys/arch/amd64/include/vmmvar.h
> @@ -324,7 +324,9 @@ enum {
>  };
>
>  enum {
> +     VEE_FAULT_INVALID,
>       VEE_FAULT_HANDLED,
> +     VEE_FAULT_MMIO_ASSIST,
>       VEE_FAULT_PROTECT,
>  };
>
> @@ -362,6 +366,8 @@ struct vm_exit_inout {
>   */
>  struct vm_exit_eptviolation {
>       uint8_t         vee_fault_type;
> +     uint8_t         vee_insn_len;
> +     uint8_t         vee_insn_bytes[15];

I think you will need a "vee_insn_bytes_valid" flag unless you are planning
on filling out vee_insn_bytes in vmm(4) before returning (in the VMX case).

Otherwise, vmd does not know (or today, care) about what type of CPU it's
running on and there will be no way to know that "vee_insn_len" bytes, on
VMX, should not be interpreted as anything to do with vee_insn_bytes.

Was your plan to do PT walks to fetch arguments (and/or insn bytes)
in vmd or in vmm?

>  };
>
>  /*
> @@ -709,6 +715,7 @@ enum {
>
>  enum {
>       VMM_MEM_TYPE_REGULAR,
> +     VMM_MEM_TYPE_MMIO,
>       VMM_MEM_TYPE_UNKNOWN
>  };
>
> blob - eac6616ea918716c2986bab0a05724b31bcce2ec
> blob + b244d72264ce9694378af83df076231b62801aae
> --- usr.sbin/vmd/vm.c
> +++ usr.sbin/vmd/vm.c
> @@ -1652,26 +1653,36 @@ vcpu_exit_inout(struct vm_run_params *vrp)
>   *
>   * Return values:
>   *  0: no action required
> - *  EAGAIN: a protection fault occured, kill the vm.
> + *  EFAULT: a protection fault occurred, kill the vm.
>   */
>  int
>  vcpu_exit_eptviolation(struct vm_run_params *vrp)
>  {
> +     int ret = 0;
> +     uint8_t fault_type;
>       struct vm_exit *ve = vrp->vrp_exit;
>
> -     /*
> -      * vmd may be exiting to vmd to handle a pending interrupt
> -      * but last exit type may have been VMX_EXIT_EPT_VIOLATION,
> -      * check the fault_type to ensure we really are processing
> -      * a VMX_EXIT_EPT_VIOLATION.
> -      */
> -     if (ve->vee.vee_fault_type == VEE_FAULT_PROTECT) {
> -             log_debug("%s: EPT Violation: rip=0x%llx",
> -                 __progname, vrp->vrp_exit->vrs.vrs_gprs[VCPU_REGS_RIP]);
> -             return (EAGAIN);
> +     fault_type = ve->vee.vee_fault_type;
> +     switch (fault_type) {
> +     case VEE_FAULT_HANDLED:
> +             log_debug("%s: fault already handled", __func__);
> +             break;
> +     case VEE_FAULT_PROTECT:
> +             log_debug("%s: EPT Violation: rip=0x%llx", __progname,
> +                 ve->vrs.vrs_gprs[VCPU_REGS_RIP]);
> +             ret = EFAULT;
> +             break;
> +     case VEE_FAULT_MMIO_ASSIST:
> +             log_warnx("%s: mmio assist required: rip=0x%llx", __progname,
> +                 ve->vrs.vrs_gprs[VCPU_REGS_RIP]);
> +             ret = EFAULT;
> +             break;
> +     default:
> +             fatalx("%s: invalid fault_type %d", __progname, fault_type);
> +             /* UNREACHED */
>       }
>
> -     return (0);
> +     return (ret);
>  }
>
>  /*
> @@ -1704,7 +1715,6 @@ vcpu_exit(struct vm_run_params *vrp)
>       case VMX_EXIT_CPUID:
>       case VMX_EXIT_EXTINT:
>       case SVM_VMEXIT_INTR:
> -     case SVM_VMEXIT_NPF:
>       case SVM_VMEXIT_MSR:
>       case SVM_VMEXIT_CPUID:
>               /*
> @@ -1715,11 +1725,11 @@ vcpu_exit(struct vm_run_params *vrp)
>                * in more vmd log spam).
>                */
>               break;
> +     case SVM_VMEXIT_NPF:
>       case VMX_EXIT_EPT_VIOLATION:
>               ret = vcpu_exit_eptviolation(vrp);
>               if (ret)
>                       return (ret);
> -
>               break;
>       case VMX_EXIT_IO:
>       case SVM_VMEXIT_IOIO:

Reply via email to