> On 15 Jun 2019, at 3:42, Paolo Bonzini <pbonz...@redhat.com> wrote:
>
> From: Liran Alon <liran.a...@oracle.com>
>
> Kernel commit c4f55198c7c2 ("kvm: x86: Introduce KVM_CAP_EXCEPTION_PAYLOAD")
> introduced a new KVM capability which allows userspace to correctly
> distinguish between pending and injected exceptions.
>
> This distinction is important in nested virtualization scenarios,
> because an L2 pending exception can still be intercepted by the L1
> hypervisor while an L2 injected exception cannot.
>
> Furthermore, when QEMU injects an exception, it should specify the
> exception payload (CR2 in case of #PF or DR6 in case of #DB) instead of
> having the payload already delivered in the respective vCPU register.
> This is because if the exception is injected into an L2 guest and
> intercepted by the L1 hypervisor, the payload needs to be reported to
> the L1 intercept (VMExit handler) while the respective vCPU register is
> preserved unchanged.
>
> This commit adds support for QEMU to properly utilise this new KVM
> capability (KVM_CAP_EXCEPTION_PAYLOAD).
>
> Reviewed-by: Nikita Leshenko <nikita.leshche...@oracle.com>
> Signed-off-by: Liran Alon <liran.a...@oracle.com>
> Signed-off-by: Paolo Bonzini <pbonz...@redhat.com>
> ---
> target/i386/cpu.c | 10 ++---
> target/i386/cpu.h | 13 +++++-
> target/i386/hvf/hvf.c | 10 +++--
> target/i386/hvf/x86hvf.c | 4 +-
> target/i386/kvm.c | 95 +++++++++++++++++++++++++++++++++-------
> target/i386/machine.c | 61 +++++++++++++++++++++++++-
> 6 files changed, 163 insertions(+), 30 deletions(-)
>
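For reference (not part of this patch): KVM_CAP_EXCEPTION_PAYLOAD extends
struct kvm_vcpu_events in the kernel UAPI (linux/kvm.h) roughly as sketched
below; these are the fields the QEMU code in this patch reads and writes:

    struct kvm_vcpu_events {
        struct {
            __u8 injected;
            __u8 nr;
            __u8 has_error_code;
            __u8 pending;            /* new: pending, not yet injected */
            __u32 error_code;
        } exception;
        ...
        __u32 flags;                 /* KVM_VCPUEVENT_VALID_PAYLOAD */
        ...
        __u8 exception_has_payload;
        __u64 exception_payload;     /* CR2 for #PF, DR6 for #DB */
    };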
> diff --git a/target/i386/cpu.c b/target/i386/cpu.c
> index c1ab86d63e..4e19969111 100644
> --- a/target/i386/cpu.c
> +++ b/target/i386/cpu.c
> @@ -4777,7 +4777,9 @@ static void x86_cpu_reset(CPUState *s)
> memset(env->mtrr_fixed, 0, sizeof(env->mtrr_fixed));
>
> env->interrupt_injected = -1;
> - env->exception_injected = -1;
> + env->exception_nr = -1;
> + env->exception_pending = 0;
> + env->exception_injected = 0;
Note: In the patch-series I will submit, I will add here:
+ env->exception_has_payload = false;
+ env->exception_payload = 0;
> env->nmi_injected = false;
> #if !defined(CONFIG_USER_ONLY)
> /* We hard-wire the BSP to the first CPU. */
> @@ -5173,12 +5175,6 @@ static int x86_cpu_filter_features(X86CPU *cpu)
> return rv;
> }
>
> -#define IS_INTEL_CPU(env) ((env)->cpuid_vendor1 == CPUID_VENDOR_INTEL_1 && \
> - (env)->cpuid_vendor2 == CPUID_VENDOR_INTEL_2 && \
> - (env)->cpuid_vendor3 == CPUID_VENDOR_INTEL_3)
> -#define IS_AMD_CPU(env) ((env)->cpuid_vendor1 == CPUID_VENDOR_AMD_1 && \
> - (env)->cpuid_vendor2 == CPUID_VENDOR_AMD_2 && \
> - (env)->cpuid_vendor3 == CPUID_VENDOR_AMD_3)
> static void x86_cpu_realizefn(DeviceState *dev, Error **errp)
> {
> CPUState *cs = CPU(dev);
> diff --git a/target/i386/cpu.h b/target/i386/cpu.h
> index bd06523a53..bbeb7a9521 100644
> --- a/target/i386/cpu.h
> +++ b/target/i386/cpu.h
> @@ -729,6 +729,13 @@ typedef uint32_t FeatureWordArray[FEATURE_WORDS];
>
> #define CPUID_VENDOR_HYGON "HygonGenuine"
>
> +#define IS_INTEL_CPU(env) ((env)->cpuid_vendor1 == CPUID_VENDOR_INTEL_1 && \
> + (env)->cpuid_vendor2 == CPUID_VENDOR_INTEL_2 && \
> + (env)->cpuid_vendor3 == CPUID_VENDOR_INTEL_3)
> +#define IS_AMD_CPU(env) ((env)->cpuid_vendor1 == CPUID_VENDOR_AMD_1 && \
> + (env)->cpuid_vendor2 == CPUID_VENDOR_AMD_2 && \
> + (env)->cpuid_vendor3 == CPUID_VENDOR_AMD_3)
> +
> #define CPUID_MWAIT_IBE (1U << 1) /* Interrupts can exit capability */
> #define CPUID_MWAIT_EMX (1U << 0) /* enumeration supported */
>
> @@ -1332,10 +1339,14 @@ typedef struct CPUX86State {
>
> /* For KVM */
> uint32_t mp_state;
> - int32_t exception_injected;
> + int32_t exception_nr;
> int32_t interrupt_injected;
> uint8_t soft_interrupt;
> + uint8_t exception_pending;
> + uint8_t exception_injected;
> uint8_t has_error_code;
> + uint8_t exception_has_payload;
> + uint64_t exception_payload;
> uint32_t ins_len;
> uint32_t sipi_vector;
> bool tsc_valid;
> diff --git a/target/i386/hvf/hvf.c b/target/i386/hvf/hvf.c
> index 2751c8125c..dc4bb63536 100644
> --- a/target/i386/hvf/hvf.c
> +++ b/target/i386/hvf/hvf.c
> @@ -605,7 +605,9 @@ static void hvf_store_events(CPUState *cpu, uint32_t ins_len, uint64_t idtvec_in
> X86CPU *x86_cpu = X86_CPU(cpu);
> CPUX86State *env = &x86_cpu->env;
>
> - env->exception_injected = -1;
> + env->exception_nr = -1;
> + env->exception_pending = 0;
> + env->exception_injected = 0;
> env->interrupt_injected = -1;
> env->nmi_injected = false;
> if (idtvec_info & VMCS_IDT_VEC_VALID) {
> @@ -619,7 +621,8 @@ static void hvf_store_events(CPUState *cpu, uint32_t ins_len, uint64_t idtvec_in
> break;
> case VMCS_IDT_VEC_HWEXCEPTION:
> case VMCS_IDT_VEC_SWEXCEPTION:
> - env->exception_injected = idtvec_info & VMCS_IDT_VEC_VECNUM;
> + env->exception_nr = idtvec_info & VMCS_IDT_VEC_VECNUM;
> + env->exception_injected = 1;
> break;
> case VMCS_IDT_VEC_PRIV_SWEXCEPTION:
> default:
> @@ -912,7 +915,8 @@ int hvf_vcpu_exec(CPUState *cpu)
> macvm_set_rip(cpu, rip + ins_len);
> break;
> case VMX_REASON_VMCALL:
> - env->exception_injected = EXCP0D_GPF;
> + env->exception_nr = EXCP0D_GPF;
> + env->exception_injected = 1;
> env->has_error_code = true;
> env->error_code = 0;
> break;
> diff --git a/target/i386/hvf/x86hvf.c b/target/i386/hvf/x86hvf.c
> index df8e946fbc..e0ea02d631 100644
> --- a/target/i386/hvf/x86hvf.c
> +++ b/target/i386/hvf/x86hvf.c
> @@ -362,8 +362,8 @@ bool hvf_inject_interrupts(CPUState *cpu_state)
> if (env->interrupt_injected != -1) {
> vector = env->interrupt_injected;
> intr_type = VMCS_INTR_T_SWINTR;
> - } else if (env->exception_injected != -1) {
> - vector = env->exception_injected;
> + } else if (env->exception_nr != -1) {
> + vector = env->exception_nr;
> if (vector == EXCP03_INT3 || vector == EXCP04_INTO) {
> intr_type = VMCS_INTR_T_SWEXCEPTION;
> } else {
> diff --git a/target/i386/kvm.c b/target/i386/kvm.c
> index 53f95b02a0..dca76830ec 100644
> --- a/target/i386/kvm.c
> +++ b/target/i386/kvm.c
> @@ -104,6 +104,7 @@ static uint32_t num_architectural_pmu_fixed_counters;
> static int has_xsave;
> static int has_xcrs;
> static int has_pit_state2;
> +static int has_exception_payload;
>
> static bool has_msr_mcg_ext_ctl;
>
> @@ -584,15 +585,51 @@ void kvm_arch_on_sigbus_vcpu(CPUState *c, int code, void *addr)
> /* Hope we are lucky for AO MCE */
> }
>
> +static void kvm_reset_exception(CPUX86State *env)
> +{
> + env->exception_nr = -1;
> + env->exception_pending = 0;
> + env->exception_injected = 0;
Note: In the patch-series I will submit, I will add here:
+ env->exception_has_payload = false;
+ env->exception_payload = 0;
> +}
> +
> +static void kvm_queue_exception(CPUX86State *env,
> + int32_t exception_nr,
> + uint8_t exception_has_payload,
> + uint64_t exception_payload)
> +{
> + assert(env->exception_nr == -1);
> + assert(!env->exception_pending);
> + assert(!env->exception_injected);
Note: In the patch-series I will submit, I will add here:
+ assert(!env->exception_has_payload);
> +
> + env->exception_nr = exception_nr;
> +
> + if (has_exception_payload) {
> + env->exception_pending = 1;
> +
> + env->exception_has_payload = exception_has_payload;
> + env->exception_payload = exception_payload;
> + } else {
> + env->exception_injected = 1;
> +
> + if (exception_has_payload) {
> + if (exception_nr == EXCP01_DB) {
> + env->dr[6] = exception_payload;
> + } else if (exception_nr == EXCP0E_PAGE) {
> + env->cr[2] = exception_payload;
> + }
> + }
Note: In the patch-series I will submit, I have changed this
to verify that #DB & #PF always have env->exception_has_payload set to true
and other cases have it set to false.
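I.e., something like:
+ if (exception_nr == EXCP01_DB) {
+     assert(exception_has_payload);
+     env->dr[6] = exception_payload;
+ } else if (exception_nr == EXCP0E_PAGE) {
+     assert(exception_has_payload);
+     env->cr[2] = exception_payload;
+ } else {
+     assert(!exception_has_payload);
+ }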
> + }
> +}
> +
> static int kvm_inject_mce_oldstyle(X86CPU *cpu)
> {
> CPUX86State *env = &cpu->env;
>
> - if (!kvm_has_vcpu_events() && env->exception_injected == EXCP12_MCHK) {
> + if (!kvm_has_vcpu_events() && env->exception_nr == EXCP12_MCHK) {
> unsigned int bank, bank_num = env->mcg_cap & 0xff;
> struct kvm_x86_mce mce;
>
> - env->exception_injected = -1;
> + kvm_reset_exception(env);
>
> /*
> * There must be at least one bank in use if an MCE is pending.
> @@ -1573,6 +1610,16 @@ int kvm_arch_init(MachineState *ms, KVMState *s)
>
> hv_vpindex_settable = kvm_check_extension(s, KVM_CAP_HYPERV_VP_INDEX);
>
> + has_exception_payload = kvm_check_extension(s, KVM_CAP_EXCEPTION_PAYLOAD);
> + if (has_exception_payload) {
> + ret = kvm_vm_enable_cap(s, KVM_CAP_EXCEPTION_PAYLOAD, 0, true);
> + if (ret < 0) {
> + error_report("kvm: Failed to enable exception payload cap: %s",
> + strerror(-ret));
> + return ret;
> + }
> + }
> +
> ret = kvm_get_supported_msrs(s);
> if (ret < 0) {
> return ret;
> @@ -2877,8 +2924,16 @@ static int kvm_put_vcpu_events(X86CPU *cpu, int level)
> return 0;
> }
>
> - events.exception.injected = (env->exception_injected >= 0);
> - events.exception.nr = env->exception_injected;
> + events.flags = 0;
> +
> + if (has_exception_payload) {
> + events.flags |= KVM_VCPUEVENT_VALID_PAYLOAD;
> + events.exception.pending = env->exception_pending;
> + events.exception_has_payload = env->exception_has_payload;
> + events.exception_payload = env->exception_payload;
> + }
> + events.exception.nr = env->exception_nr;
> + events.exception.injected = env->exception_injected;
> events.exception.has_error_code = env->has_error_code;
> events.exception.error_code = env->error_code;
>
> @@ -2891,7 +2946,6 @@ static int kvm_put_vcpu_events(X86CPU *cpu, int level)
> events.nmi.masked = !!(env->hflags2 & HF2_NMI_MASK);
>
> events.sipi_vector = env->sipi_vector;
> - events.flags = 0;
>
> if (has_msr_smbase) {
> events.smi.smm = !!(env->hflags & HF_SMM_MASK);
> @@ -2941,8 +2995,19 @@ static int kvm_get_vcpu_events(X86CPU *cpu)
> if (ret < 0) {
> return ret;
> }
> - env->exception_injected =
> - events.exception.injected ? events.exception.nr : -1;
> +
> + if (events.flags & KVM_VCPUEVENT_VALID_PAYLOAD) {
> + env->exception_pending = events.exception.pending;
> + env->exception_has_payload = events.exception_has_payload;
> + env->exception_payload = events.exception_payload;
> + } else {
> + env->exception_pending = 0;
> + env->exception_has_payload = false;
> + }
> + env->exception_injected = events.exception.injected;
> + env->exception_nr =
> + (env->exception_pending || env->exception_injected) ?
> + events.exception.nr : -1;
> env->has_error_code = events.exception.has_error_code;
> env->error_code = events.exception.error_code;
>
> @@ -2994,12 +3059,12 @@ static int kvm_guest_debug_workarounds(X86CPU *cpu)
> unsigned long reinject_trap = 0;
>
> if (!kvm_has_vcpu_events()) {
> - if (env->exception_injected == EXCP01_DB) {
> + if (env->exception_nr == EXCP01_DB) {
> reinject_trap = KVM_GUESTDBG_INJECT_DB;
> } else if (env->exception_injected == EXCP03_INT3) {
> reinject_trap = KVM_GUESTDBG_INJECT_BP;
> }
> - env->exception_injected = -1;
> + kvm_reset_exception(env);
> }
>
> /*
> @@ -3320,13 +3385,13 @@ int kvm_arch_process_async_events(CPUState *cs)
>
> kvm_cpu_synchronize_state(cs);
>
> - if (env->exception_injected == EXCP08_DBLE) {
> + if (env->exception_nr == EXCP08_DBLE) {
> /* this means triple fault */
> qemu_system_reset_request(SHUTDOWN_CAUSE_GUEST_RESET);
> cs->exit_request = 1;
> return 0;
> }
> - env->exception_injected = EXCP12_MCHK;
> + kvm_queue_exception(env, EXCP12_MCHK, 0, 0);
> env->has_error_code = 0;
>
> cs->halted = 0;
> @@ -3541,14 +3606,12 @@ static int kvm_handle_debug(X86CPU *cpu,
> }
> if (ret == 0) {
> cpu_synchronize_state(cs);
> - assert(env->exception_injected == -1);
> + assert(env->exception_nr == -1);
>
> /* pass to guest */
> - env->exception_injected = arch_info->exception;
> + kvm_queue_exception(env, arch_info->exception,
> + EXCP01_DB, arch_info->dr6);
There is a bug here I will fix in my patch-series:
Third argument should be (arch_info->exception == EXCP01_DB).
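I.e.:
+ kvm_queue_exception(env, arch_info->exception,
+                     arch_info->exception == EXCP01_DB,
+                     arch_info->dr6);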
> env->has_error_code = 0;
> - if (arch_info->exception == EXCP01_DB) {
> - env->dr[6] = arch_info->dr6;
> - }
> }
>
> return ret;
> diff --git a/target/i386/machine.c b/target/i386/machine.c
> index 225b5d433b..41460be54b 100644
> --- a/target/i386/machine.c
> +++ b/target/i386/machine.c
> @@ -199,6 +199,21 @@ static const VMStateDescription vmstate_fpreg = {
> }
> };
>
> +static bool is_vmx_enabled(CPUX86State *env)
> +{
> + return (IS_INTEL_CPU(env) && (env->cr[4] & CR4_VMXE_MASK));
> +}
> +
> +static bool is_svm_enabled(CPUX86State *env)
> +{
> + return (IS_AMD_CPU(env) && (env->efer & MSR_EFER_SVME));
> +}
> +
> +static bool is_nested_virt_enabled(CPUX86State *env)
> +{
> + return (is_vmx_enabled(env) || is_svm_enabled(env));
> +}
I have since realised that this nested_virt_enabled() function is not enough
to determine whether nested_state needs to be sent.
This is because the vCPU may be running L2 but have momentarily entered SMM
due to an SMI.
In this case, CR4 & MSR_EFER are saved in SMRAM and set to 0 on entry to SMM.
This means that in case (env->hflags & HF_SMM_MASK), we theoretically should
read the saved CR4 & MSR_EFER from SMRAM.
However, because we cannot reference guest memory at this point (not valid in
case we migrate the guest with post-copy), I should change the code to assume
that nested-state is needed whenever (env->hflags & HF_SMM_MASK) is set.
This should break migration backwards-compatibility only in very rare cases,
and therefore I think it should be sufficient.
Any objections to this idea?
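Something like (conservatively treating SMM as requiring nested-state):
+static bool is_nested_virt_enabled(CPUX86State *env)
+{
+    /*
+     * In SMM, CR4 & MSR_EFER are saved in SMRAM and cleared, so they
+     * cannot tell us whether the guest was running L2; conservatively
+     * assume nested-state is needed in that case.
+     */
+    return (env->hflags & HF_SMM_MASK) ||
+           is_vmx_enabled(env) ||
+           is_svm_enabled(env);
+}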
> +
> static int cpu_pre_save(void *opaque)
> {
In my next patch-series I have added logic to cpu_pre_save() that converts a
pending exception to an injected exception in case an exception is pending and
nested-virtualization is not enabled. During this conversion, I also make sure
to apply the exception payload to the respective vCPU register (i.e. CR2 in
case of #PF or DR6 in case of #DB).
I have realised this is required because otherwise the exception-info VMState
subsection will be deemed not needed and the exception payload will be lost.
Something like this:
+ /*
+ * When the vCPU is running L2 and an exception is still pending,
+ * it can potentially be intercepted by the L1 hypervisor,
+ * in contrast to an injected exception, which cannot be
+ * intercepted anymore.
+ *
+ * Furthermore, when an L2 exception is intercepted by the L1
+ * hypervisor, its exception payload (CR2/DR6 on #PF/#DB)
+ * should not be set yet in the respective vCPU register.
+ * Thus, in case an exception is pending, it is
+ * important to save the exception payload separately.
+ *
+ * Therefore, if an exception is not in a pending state
+ * or the guest is not running a nested hypervisor, it is
+ * not important to distinguish between a pending and an
+ * injected exception, and we don't need to store the
+ * exception payload separately.
+ *
+ * In order to preserve better backwards-compatible migration,
+ * convert a pending exception to an injected exception in
+ * case it is not important to distinguish between them
+ * as described above.
+ */
+ if (!is_nested_virt_enabled(env) && env->exception_pending) {
+ env->exception_pending = 0;
+ env->exception_injected = 1;
+
+ if (env->exception_nr == EXCP01_DB) {
+ assert(env->exception_has_payload);
+ env->dr[6] = env->exception_payload;
+ } else if (env->exception_nr == EXCP0E_PAGE) {
+ assert(env->exception_has_payload);
+ env->cr[2] = env->exception_payload;
+ } else {
+ assert(!env->exception_has_payload);
+ }
+ }
+
> X86CPU *cpu = opaque;
> @@ -278,6 +293,23 @@ static int cpu_post_load(void *opaque, int version_id)
> env->hflags &= ~HF_CPL_MASK;
> env->hflags |= (env->segs[R_SS].flags >> DESC_DPL_SHIFT) & HF_CPL_MASK;
>
> + /*
> + * There are cases where we can get a valid exception_nr with both
> + * exception_pending and exception_injected being cleared. This can happen in
> + * one of the following scenarios:
> + * 1) Source is older QEMU without KVM_CAP_EXCEPTION_PAYLOAD support.
> + * 2) Source is running on kernel without KVM_CAP_EXCEPTION_PAYLOAD support.
> + * 3) "cpu/exception_info" subsection not sent because there is no exception
> + * pending or guest wasn't running L2.
> + *
> + * In those cases, we can just deduce that a valid exception_nr means
> + * we can treat the exception as already injected.
> + */
> + if ((env->exception_nr != -1) &&
> + !env->exception_pending && !env->exception_injected) {
> + env->exception_injected = 1;
> + }
> +
> env->fpstt = (env->fpus_vmstate >> 11) & 7;
> env->fpus = env->fpus_vmstate & ~0x3800;
> env->fptag_vmstate ^= 0xff;
> @@ -323,6 +355,32 @@ static bool steal_time_msr_needed(void *opaque)
> return cpu->env.steal_time_msr != 0;
> }
>
> +static bool exception_info_needed(void *opaque)
> +{
> + X86CPU *cpu = opaque;
> +
> + /*
> + * Differentiating between pending and injected exceptions
> + * is only important when running L2.
> + */
> + return (cpu->env.exception_pending &&
> + is_nested_virt_enabled(&cpu->env));
> +}
> +
> +static const VMStateDescription vmstate_exception_info = {
> + .name = "cpu/exception_info",
> + .version_id = 1,
> + .minimum_version_id = 1,
> + .needed = exception_info_needed,
> + .fields = (VMStateField[]) {
> + VMSTATE_UINT8(env.exception_pending, X86CPU),
> + VMSTATE_UINT8(env.exception_injected, X86CPU),
> + VMSTATE_UINT8(env.exception_has_payload, X86CPU),
> + VMSTATE_UINT64(env.exception_payload, X86CPU),
> + VMSTATE_END_OF_LIST()
> + }
> +};
> +
> static const VMStateDescription vmstate_steal_time_msr = {
> .name = "cpu/steal_time_msr",
> .version_id = 1,
> @@ -1035,7 +1093,7 @@ VMStateDescription vmstate_x86_cpu = {
> VMSTATE_INT32(env.interrupt_injected, X86CPU),
> VMSTATE_UINT32(env.mp_state, X86CPU),
> VMSTATE_UINT64(env.tsc, X86CPU),
> - VMSTATE_INT32(env.exception_injected, X86CPU),
> + VMSTATE_INT32(env.exception_nr, X86CPU),
> VMSTATE_UINT8(env.soft_interrupt, X86CPU),
> VMSTATE_UINT8(env.nmi_injected, X86CPU),
> VMSTATE_UINT8(env.nmi_pending, X86CPU),
> @@ -1059,6 +1117,7 @@ VMStateDescription vmstate_x86_cpu = {
> /* The above list is not sorted /wrt version numbers, watch out! */
> },
> .subsections = (const VMStateDescription*[]) {
> + &vmstate_exception_info,
> &vmstate_async_pf_msr,
> &vmstate_pv_eoi_msr,
> &vmstate_steal_time_msr,
> --
> 2.21.0
>
>