> On 15 Jun 2019, at 3:42, Paolo Bonzini <pbonz...@redhat.com> wrote:
>
> From: Liran Alon <liran.a...@oracle.com>
>
> Kernel commit c4f55198c7c2 ("kvm: x86: Introduce KVM_CAP_EXCEPTION_PAYLOAD")
> introduced a new KVM capability which allows userspace to correctly
> distinguish between pending and injected exceptions.
>
> This distinction is important in nested virtualization scenarios,
> because an L2 pending exception can still be intercepted by the L1
> hypervisor while an L2 injected exception cannot.
>
> Furthermore, when QEMU injects an exception, it should specify the
> exception payload (CR2 in case of #PF or DR6 in case of #DB) instead of
> having the payload already delivered in the respective vCPU register.
> This is because if the exception is injected into an L2 guest and
> intercepted by the L1 hypervisor, the payload needs to be reported to
> the L1 intercept (VMExit handler) while the respective vCPU register is
> preserved unchanged.
>
> This commit adds support for QEMU to properly utilise this new KVM
> capability (KVM_CAP_EXCEPTION_PAYLOAD).
>
> Reviewed-by: Nikita Leshenko <nikita.leshche...@oracle.com>
> Signed-off-by: Liran Alon <liran.a...@oracle.com>
> Signed-off-by: Paolo Bonzini <pbonz...@redhat.com>
> ---
> target/i386/cpu.c | 10 ++---
> target/i386/cpu.h | 13 +++++-
> target/i386/hvf/hvf.c | 10 +++--
> target/i386/hvf/x86hvf.c | 4 +-
> target/i386/kvm.c | 95 +++++++++++++++++++++++++++++++++-------
> target/i386/machine.c | 61 +++++++++++++++++++++++++-
> 6 files changed, 163 insertions(+), 30 deletions(-)
>
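For reference (not part of this patch): KVM_CAP_EXCEPTION_PAYLOAD extends
struct kvm_vcpu_events in the kernel UAPI (linux/kvm.h) roughly as sketched
below; these are the fields the QEMU code in this patch reads and writes:

    struct kvm_vcpu_events {
        struct {
            __u8 injected;
            __u8 nr;
            __u8 has_error_code;
            __u8 pending;            /* new: pending, not yet injected */
            __u32 error_code;
        } exception;
        ...
        __u32 flags;                 /* KVM_VCPUEVENT_VALID_PAYLOAD */
        ...
        __u8 exception_has_payload;
        __u64 exception_payload;     /* CR2 for #PF, DR6 for #DB */
    };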
> diff --git a/target/i386/cpu.c b/target/i386/cpu.c
> index c1ab86d63e..4e19969111 100644
> --- a/target/i386/cpu.c
> +++ b/target/i386/cpu.c
> @@ -4777,7 +4777,9 @@ static void x86_cpu_reset(CPUState *s)
> memset(env->mtrr_fixed, 0, sizeof(env->mtrr_fixed));
>
> env->interrupt_injected = -1;
> - env->exception_injected = -1;
> + env->exception_nr = -1;
> + env->exception_pending = 0;
> + env->exception_injected = 0;
Note: In the patch-series I will submit, I will add here:
+ env->exception_has_payload = false;
+ env->exception_payload = 0;
> env->nmi_injected = false;
> #if !defined(CONFIG_USER_ONLY)
> /* We hard-wire the BSP to the first CPU. */
> @@ -5173,12 +5175,6 @@ static int x86_cpu_filter_features(X86CPU *cpu)
> return rv;
> }
>
> -#define IS_INTEL_CPU(env) ((env)->cpuid_vendor1 == CPUID_VENDOR_INTEL_1 && \
> - (env)->cpuid_vendor2 == CPUID_VENDOR_INTEL_2 && \
> - (env)->cpuid_vendor3 == CPUID_VENDOR_INTEL_3)
> -#define IS_AMD_CPU(env) ((env)->cpuid_vendor1 == CPUID_VENDOR_AMD_1 && \
> - (env)->cpuid_vendor2 == CPUID_VENDOR_AMD_2 && \
> - (env)->cpuid_vendor3 == CPUID_VENDOR_AMD_3)
> static void x86_cpu_realizefn(DeviceState *dev, Error **errp)
> {
> CPUState *cs = CPU(dev);
> diff --git a/target/i386/cpu.h b/target/i386/cpu.h
> index bd06523a53..bbeb7a9521 100644
> --- a/target/i386/cpu.h
> +++ b/target/i386/cpu.h
> @@ -729,6 +729,13 @@ typedef uint32_t FeatureWordArray[FEATURE_WORDS];
>
> #define CPUID_VENDOR_HYGON "HygonGenuine"
>
> +#define IS_INTEL_CPU(env) ((env)->cpuid_vendor1 == CPUID_VENDOR_INTEL_1 && \
> + (env)->cpuid_vendor2 == CPUID_VENDOR_INTEL_2 && \
> + (env)->cpuid_vendor3 == CPUID_VENDOR_INTEL_3)
> +#define IS_AMD_CPU(env) ((env)->cpuid_vendor1 == CPUID_VENDOR_AMD_1 && \
> + (env)->cpuid_vendor2 == CPUID_VENDOR_AMD_2 && \
> + (env)->cpuid_vendor3 == CPUID_VENDOR_AMD_3)
> +
> #define CPUID_MWAIT_IBE (1U << 1) /* Interrupts can exit capability */
> #define CPUID_MWAIT_EMX (1U << 0) /* enumeration supported */
>
> @@ -1332,10 +1339,14 @@ typedef struct CPUX86State {
>
> /* For KVM */
> uint32_t mp_state;
> - int32_t exception_injected;
> + int32_t exception_nr;
> int32_t interrupt_injected;
> uint8_t soft_interrupt;
> + uint8_t exception_pending;
> + uint8_t exception_injected;
> uint8_t has_error_code;
> + uint8_t exception_has_payload;
> + uint64_t exception_payload;
> uint32_t ins_len;
> uint32_t sipi_vector;
> bool tsc_valid;
> diff --git a/target/i386/hvf/hvf.c b/target/i386/hvf/hvf.c
> index 2751c8125c..dc4bb63536 100644
> --- a/target/i386/hvf/hvf.c
> +++ b/target/i386/hvf/hvf.c
> @@ -605,7 +605,9 @@ static void hvf_store_events(CPUState *cpu, uint32_t ins_len, uint64_t idtvec_in
> X86CPU *x86_cpu = X86_CPU(cpu);
> CPUX86State *env = &x86_cpu->env;
>
> - env->exception_injected = -1;
> + env->exception_nr = -1;
> + env->exception_pending = 0;
> + env->exception_injected = 0;
> env->interrupt_injected = -1;
> env->nmi_injected = false;
> if (idtvec_info & VMCS_IDT_VEC_VALID) {
> @@ -619,7 +621,8 @@ static void hvf_store_events(CPUState *cpu, uint32_t ins_len, uint64_t idtvec_in
> break;
> case VMCS_IDT_VEC_HWEXCEPTION:
> case VMCS_IDT_VEC_SWEXCEPTION:
> - env->exception_injected = idtvec_info & VMCS_IDT_VEC_VECNUM;
> + env->exception_nr = idtvec_info & VMCS_IDT_VEC_VECNUM;
> + env->exception_injected = 1;
> break;
> case VMCS_IDT_VEC_PRIV_SWEXCEPTION:
> default:
> @@ -912,7 +915,8 @@ int hvf_vcpu_exec(CPUState *cpu)
> macvm_set_rip(cpu, rip + ins_len);
> break;
> case VMX_REASON_VMCALL:
> - env->exception_injected = EXCP0D_GPF;
> + env->exception_nr = EXCP0D_GPF;
> + env->exception_injected = 1;
> env->has_error_code = true;
> env->error_code = 0;
> break;
> diff --git a/target/i386/hvf/x86hvf.c b/target/i386/hvf/x86hvf.c
> index df8e946fbc..e0ea02d631 100644
> --- a/target/i386/hvf/x86hvf.c
> +++ b/target/i386/hvf/x86hvf.c
> @@ -362,8 +362,8 @@ bool hvf_inject_interrupts(CPUState *cpu_state)
> if (env->interrupt_injected != -1) {
> vector = env->interrupt_injected;
> intr_type = VMCS_INTR_T_SWINTR;
> - } else if (env->exception_injected != -1) {
> - vector = env->exception_injected;
> + } else if (env->exception_nr != -1) {
> + vector = env->exception_nr;
> if (vector == EXCP03_INT3 || vector == EXCP04_INTO) {
> intr_type = VMCS_INTR_T_SWEXCEPTION;
> } else {
> diff --git a/target/i386/kvm.c b/target/i386/kvm.c
> index 53f95b02a0..dca76830ec 100644
> --- a/target/i386/kvm.c
> +++ b/target/i386/kvm.c
> @@ -104,6 +104,7 @@ static uint32_t num_architectural_pmu_fixed_counters;
> static int has_xsave;
> static int has_xcrs;
> static int has_pit_state2;
> +static int has_exception_payload;
>
> static bool has_msr_mcg_ext_ctl;
>
> @@ -584,15 +585,51 @@ void kvm_arch_on_sigbus_vcpu(CPUState *c, int code, void *addr)
> /* Hope we are lucky for AO MCE */
> }
>
> +static void kvm_reset_exception(CPUX86State *env)
> +{
> + env->exception_nr = -1;
> + env->exception_pending = 0;
> + env->exception_injected = 0;
Note: In the patch-series I will submit, I will add here:
+ env->exception_has_payload = false;
+ env->exception_payload = 0;
> +}
> +
> +static void kvm_queue_exception(CPUX86State *env,
> + int32_t exception_nr,
> + uint8_t exception_has_payload,
> + uint64_t exception_payload)
> +{
> + assert(env->exception_nr == -1);
> + assert(!env->exception_pending);
> + assert(!env->exception_injected);
Note: In the patch-series I will submit, I will add here:
+ assert(!env->exception_has_payload);
> +
> + env->exception_nr = exception_nr;
> +
> + if (has_exception_payload) {
> + env->exception_pending = 1;
> +
> + env->exception_has_payload = exception_has_payload;
> + env->exception_payload = exception_payload;
> + } else {
> + env->exception_injected = 1;
> +
> + if (exception_has_payload) {
> + if (exception_nr == EXCP01_DB) {
> + env->dr[6] = exception_payload;
> + } else if (exception_nr == EXCP0E_PAGE) {
> + env->cr[2] = exception_payload;
> + }
> + }
Note: In the patch-series I will submit, I have changed this
to verify that #DB & #PF always have env->exception_has_payload set to true
and other cases have it set to false.
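I.e., something like:
+ if (exception_nr == EXCP01_DB) {
+     assert(exception_has_payload);
+     env->dr[6] = exception_payload;
+ } else if (exception_nr == EXCP0E_PAGE) {
+     assert(exception_has_payload);
+     env->cr[2] = exception_payload;
+ } else {
+     assert(!exception_has_payload);
+ }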
> + }
> +}
> +
> static int kvm_inject_mce_oldstyle(X86CPU *cpu)
> {
> CPUX86State *env = &cpu->env;
>
> - if (!kvm_has_vcpu_events() && env->exception_injected == EXCP12_MCHK) {
> + if (!kvm_has_vcpu_events() && env->exception_nr == EXCP12_MCHK) {
> unsigned int bank, bank_num = env->mcg_cap & 0xff;
> struct kvm_x86_mce mce;
>
> - env->exception_injected = -1;
> + kvm_reset_exception(env);
>
> /*
> * There must be at least one bank in use if an MCE is pending.
> @@ -1573,6 +1610,16 @@ int kvm_arch_init(MachineState *ms, KVMState *s)
>
> hv_vpindex_settable = kvm_check_extension(s, KVM_CAP_HYPERV_VP_INDEX);
>
> + has_exception_payload = kvm_check_extension(s, KVM_CAP_EXCEPTION_PAYLOAD);
> + if (has_exception_payload) {
> + ret = kvm_vm_enable_cap(s, KVM_CAP_EXCEPTION_PAYLOAD, 0, true);
> + if (ret < 0) {
> + error_report("kvm: Failed to enable exception payload cap: %s",
> + strerror(-ret));
> + return ret;
> + }
> + }
> +
> ret = kvm_get_supported_msrs(s);
> if (ret < 0) {
> return ret;
> @@ -2877,8 +2924,16 @@ static int kvm_put_vcpu_events(X86CPU *cpu, int level)
> return 0;
> }
>
> - events.exception.injected = (env->exception_injected >= 0);
> - events.exception.nr = env->exception_injected;
> + events.flags = 0;
> +
> + if (has_exception_payload) {
> + events.flags |= KVM_VCPUEVENT_VALID_PAYLOAD;
> + events.exception.pending = env->exception_pending;
> + events.exception_has_payload = env->exception_has_payload;
> + events.exception_payload = env->exception_payload;
> + }
> + events.exception.nr = env->exception_nr;
> + events.exception.injected = env->exception_injected;
> events.exception.has_error_code = env->has_error_code;
> events.exception.error_code = env->error_code;
>
> @@ -2891,7 +2946,6 @@ static int kvm_put_vcpu_events(X86CPU *cpu, int level)
> events.nmi.masked = !!(env->hflags2 & HF2_NMI_MASK);
>
> events.sipi_vector = env->sipi_vector;
> - events.flags = 0;
>
> if (has_msr_smbase) {
> events.smi.smm = !!(env->hflags & HF_SMM_MASK);
> @@ -2941,8 +2995,19 @@ static int kvm_get_vcpu_events(X86CPU *cpu)
> if (ret < 0) {
> return ret;
> }
> - env->exception_injected =
> - events.exception.injected ? events.exception.nr : -1;
> +
> + if (events.flags & KVM_VCPUEVENT_VALID_PAYLOAD) {
> + env->exception_pending = events.exception.pending;
> + env->exception_has_payload = events.exception_has_payload;
> + env->exception_payload = events.exception_payload;
> + } else {
> + env->exception_pending = 0;
> + env->exception_has_payload = false;
> + }
> + env->exception_injected = events.exception.injected;
> + env->exception_nr =
> + (env->exception_pending || env->exception_injected) ?
> + events.exception.nr : -1;
> env->has_error_code = events.exception.has_error_code;
> env->error_code = events.exception.error_code;
>
> @@ -2994,12 +3059,12 @@ static int kvm_guest_debug_workarounds(X86CPU *cpu)
> unsigned long reinject_trap = 0;
>
> if (!kvm_has_vcpu_events()) {
> - if (env->exception_injected == EXCP01_DB) {
> + if (env->exception_nr == EXCP01_DB) {
> reinject_trap = KVM_GUESTDBG_INJECT_DB;
> } else if (env->exception_injected == EXCP03_INT3) {
> reinject_trap = KVM_GUESTDBG_INJECT_BP;
> }
> - env->exception_injected = -1;
> + kvm_reset_exception(env);
> }
>
> /*
> @@ -3320,13 +3385,13 @@ int kvm_arch_process_async_events(CPUState *cs)
>
> kvm_cpu_synchronize_state(cs);
>
> - if (env->exception_injected == EXCP08_DBLE) {
> + if (env->exception_nr == EXCP08_DBLE) {
> /* this means triple fault */
> qemu_system_reset_request(SHUTDOWN_CAUSE_GUEST_RESET);
> cs->exit_request = 1;
> return 0;
> }
> - env->exception_injected = EXCP12_MCHK;
> + kvm_queue_exception(env, EXCP12_MCHK, 0, 0);
> env->has_error_code = 0;
>
> cs->halted = 0;
> @@ -3541,14 +3606,12 @@ static int kvm_handle_debug(X86CPU *cpu,
> }
> if (ret == 0) {
> cpu_synchronize_state(cs);
> - assert(env->exception_injected == -1);
> + assert(env->exception_nr == -1);
>
> /* pass to guest */
> - env->exception_injected = arch_info->exception;
> + kvm_queue_exception(env, arch_info->exception,
> + EXCP01_DB, arch_info->dr6);
There is a bug here I will fix in my patch-series:
Third argument should be (arch_info->exception == EXCP01_DB).
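I.e.:
+ kvm_queue_exception(env, arch_info->exception,
+                     arch_info->exception == EXCP01_DB,
+                     arch_info->dr6);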
> env->has_error_code = 0;
> - if (arch_info->exception == EXCP01_DB) {
> - env->dr[6] = arch_info->dr6;
> - }
> }
>
> return ret;
> diff --git a/target/i386/machine.c b/target/i386/machine.c
> index 225b5d433b..41460be54b 100644
> --- a/target/i386/machine.c
> +++ b/target/i386/machine.c
> @@ -199,6 +199,21 @@ static const VMStateDescription vmstate_fpreg = {
> }
> };
>
> +static bool is_vmx_enabled(CPUX86State *env)
> +{
> + return (IS_INTEL_CPU(env) && (env->cr[4] & CR4_VMXE_MASK));
> +}
> +
> +static bool is_svm_enabled(CPUX86State *env)
> +{
> + return (IS_AMD_CPU(env) && (env->efer & MSR_EFER_SVME));
> +}
> +
> +static bool is_nested_virt_enabled(CPUX86State *env)
> +{
> + return (is_vmx_enabled(env) || is_svm_enabled(env));
> +}
I have since realised that this nested_virt_enabled() function is not enough
to determine whether nested_state needs to be sent.
This is because the vCPU may be running L2 but have momentarily entered SMM
due to an SMI.
In this case, CR4 & MSR_EFER are saved in SMRAM and set to 0 on entry to SMM.
This means that in case (env->hflags & HF_SMM_MASK), we theoretically should
read the saved CR4 & MSR_EFER from SMRAM.
However, because we cannot reference guest memory at this point (not valid in
case we migrate the guest with post-copy), I should change the code to assume
that nested-state is needed whenever (env->hflags & HF_SMM_MASK) is set.
This should break migration backwards-compatibility only in very rare cases,
and therefore I think it should be sufficient.
Any objections to this idea?
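Something like (conservatively treating SMM as requiring nested-state):
+static bool is_nested_virt_enabled(CPUX86State *env)
+{
+    /*
+     * In SMM, CR4 & MSR_EFER are saved in SMRAM and cleared, so they
+     * cannot tell us whether the guest was running L2; conservatively
+     * assume nested-state is needed in that case.
+     */
+    return (env->hflags & HF_SMM_MASK) ||
+           is_vmx_enabled(env) ||
+           is_svm_enabled(env);
+}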
> +
> static int cpu_pre_save(void *opaque)
> {
In my next patch-series I have added logic to cpu_pre_save() that converts a
pending exception to an injected exception in case an exception is pending and
nested-virtualization is not enabled. During this conversion, I also make sure
to apply the exception payload to the respective vCPU register (i.e. CR2 in
case of #PF or DR6 in case of #DB).
I have realised this is required because otherwise the exception-info VMState
subsection will be deemed not needed and the exception payload will be lost.
Something like this:
+ /*
+ * When the vCPU is running L2 and an exception is still pending,
+ * it can potentially be intercepted by the L1 hypervisor,
+ * in contrast to an injected exception, which cannot be
+ * intercepted anymore.
+ *
+ * Furthermore, when an L2 exception is intercepted by the L1
+ * hypervisor, its exception payload (CR2/DR6 on #PF/#DB)
+ * should not be set yet in the respective vCPU register.
+ * Thus, in case an exception is pending, it is
+ * important to save the exception payload separately.
+ *
+ * Therefore, if an exception is not in a pending state
+ * or the guest is not running a nested hypervisor, it is
+ * not important to distinguish between a pending and an
+ * injected exception, and we don't need to store the
+ * exception payload separately.
+ *
+ * In order to preserve better backwards-compatible migration,
+ * convert a pending exception to an injected exception in
+ * case it is not important to distinguish between them
+ * as described above.
+ */
+ if (!is_nested_virt_enabled(env) && env->exception_pending) {
+ env->exception_pending = 0;
+ env->exception_injected = 1;
+
+ if (env->exception_nr == EXCP01_DB) {
+ assert(env->exception_has_payload);
+ env->dr[6] = env->exception_payload;
+ } else if (env->exception_nr == EXCP0E_PAGE) {
+ assert(env->exception_has_payload);
+ env->cr[2] = env->exception_payload;
+ } else {
+ assert(!env->exception_has_payload);
+ }
+ }
+
> X86CPU *cpu = opaque;
> @@ -278,6 +293,23 @@ static int cpu_post_load(void *opaque, int version_id)
> env->hflags &= ~HF_CPL_MASK;
> env->hflags |= (env->segs[R_SS].flags >> DESC_DPL_SHIFT) & HF_CPL_MASK;
>
> + /*
> + * There are cases where we can get a valid exception_nr with both
> + * exception_pending and exception_injected being cleared. This can happen in
> + * one of the following scenarios:
> + * 1) Source is older QEMU without KVM_CAP_EXCEPTION_PAYLOAD support.
> + * 2) Source is running on kernel without KVM_CAP_EXCEPTION_PAYLOAD support.
> + * 3) "cpu/exception_info" subsection not sent because there is no exception
> + * pending or guest wasn't running L2.
> + *
> + * In those cases, we can just deduce that a valid exception_nr means
> + * we can treat the exception as already injected.
> + */
> + if ((env->exception_nr != -1) &&
> + !env->exception_pending && !env->exception_injected) {
> + env->exception_injected = 1;
> + }
> +
> env->fpstt = (env->fpus_vmstate >> 11) & 7;
> env->fpus = env->fpus_vmstate & ~0x3800;
> env->fptag_vmstate ^= 0xff;
> @@ -323,6 +355,32 @@ static bool steal_time_msr_needed(void *opaque)
> return cpu->env.steal_time_msr != 0;
> }
>
> +static bool exception_info_needed(void *opaque)
> +{
> + X86CPU *cpu = opaque;
> +
> + /*
> + * Differentiating between pending and injected exceptions
> + * is only important when running L2.
> + */
> + return (cpu->env.exception_pending &&
> + is_nested_virt_enabled(&cpu->env));
> +}
> +
> +static const VMStateDescription vmstate_exception_info = {
> + .name = "cpu/exception_info",
> + .version_id = 1,
> + .minimum_version_id = 1,
> + .needed = exception_info_needed,
> + .fields = (VMStateField[]) {
> + VMSTATE_UINT8(env.exception_pending, X86CPU),
> + VMSTATE_UINT8(env.exception_injected, X86CPU),
> + VMSTATE_UINT8(env.exception_has_payload, X86CPU),
> + VMSTATE_UINT64(env.exception_payload, X86CPU),
> + VMSTATE_END_OF_LIST()
> + }
> +};
> +
> static const VMStateDescription vmstate_steal_time_msr = {
> .name = "cpu/steal_time_msr",
> .version_id = 1,
> @@ -1035,7 +1093,7 @@ VMStateDescription vmstate_x86_cpu = {
> VMSTATE_INT32(env.interrupt_injected, X86CPU),
> VMSTATE_UINT32(env.mp_state, X86CPU),
> VMSTATE_UINT64(env.tsc, X86CPU),
> - VMSTATE_INT32(env.exception_injected, X86CPU),
> + VMSTATE_INT32(env.exception_nr, X86CPU),
> VMSTATE_UINT8(env.soft_interrupt, X86CPU),
> VMSTATE_UINT8(env.nmi_injected, X86CPU),
> VMSTATE_UINT8(env.nmi_pending, X86CPU),
> @@ -1059,6 +1117,7 @@ VMStateDescription vmstate_x86_cpu = {
> /* The above list is not sorted /wrt version numbers, watch out! */
> },
> .subsections = (const VMStateDescription*[]) {
> + &vmstate_exception_info,
> &vmstate_async_pf_msr,
> &vmstate_pv_eoi_msr,
> &vmstate_steal_time_msr,
> --
> 2.21.0
>
>