QEMU uses kvm_get_msrs() to save the Intel PMU registers from KVM and
kvm_put_msrs() to restore them to KVM. However, there is no equivalent
support for AMD PMU registers. Currently, pmu_version and
num_pmu_gp_counters are initialized based on cpuid(0xa), which does not
apply to AMD processors. For AMD CPUs prior to PerfMonV2, the number of
general-purpose counters is determined by the CPU family and by the
PERFCORE bit in CPUID leaf 0x80000001.
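A minimal sketch of that enumeration rule is below. The helper name
amd_num_gp_counters() is hypothetical and only illustrates the logic of
kvm_init_pmu_info_amd() added by this patch; it is not part of the change
itself:

    /*
     * Illustrative sketch only (not part of this patch): pre-PerfMonV2
     * AMD CPUs expose 4 legacy K7-style counters, or 6 core counters
     * when CPUID 0x80000001 ECX advertises PERFCORE.
     */
    static int amd_num_gp_counters(uint32_t cpuid_80000001_ecx)
    {
        if (cpuid_80000001_ecx & CPUID_EXT3_PERFCORE) {
            return AMD64_NUM_COUNTERS_CORE; /* 6: MSR_F15H_PERF_CTL0/CTR0 */
        }
        return AMD64_NUM_COUNTERS;          /* 4: MSR_K7_EVNTSEL0/PERFCTR0 */
    }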
To address this issue, we need to add support for AMD PMU registers.
Without this support, the following problems can arise:

1. If the VM is reset (e.g., via QEMU system_reset or VM kdump/kexec) while
   running "perf top", the PMU registers are not disabled properly.

2. Despite x86_cpu_reset() resetting many registers to zero, kvm_put_msrs()
   does not handle AMD PMU registers, causing some PMU events to remain
   enabled in KVM.

3. The KVM kvm_pmc_speculative_in_use() function consistently returns true,
   preventing the reclamation of these events. Consequently, the
   kvm_pmc->perf_event remains active.

4. After a reboot, the VM kernel may report the following error:

[ 0.092011] Performance Events: Fam17h+ core perfctr, Broken BIOS detected, complain to your hardware vendor.
[ 0.092023] [Firmware Bug]: the BIOS has corrupted hw-PMU resources (MSR c0010200 is 530076)

5. In the worst case, the active kvm_pmc->perf_event may inject unknown NMIs
   randomly into the VM kernel:

[...] Uhhuh. NMI received for unknown reason 30 on CPU 0.

To resolve these issues, we propose resetting AMD PMU registers during the
VM reset process.

Signed-off-by: Dongli Zhang <dongli.zh...@oracle.com>
---
Changed since v1:
  - Modify "MSR_K7_EVNTSEL0 + 3" and "MSR_K7_PERFCTR0 + 3" by using
    AMD64_NUM_COUNTERS (suggested by Sandipan Das).
  - Use "AMD64_NUM_COUNTERS_CORE * 2 - 1", not "MSR_F15H_PERF_CTL0 + 0xb"
    (suggested by Sandipan Das).
  - Switch back to "-pmu" instead of using a global "pmu-cap-disabled".
  - Don't initialize PMU info if kvm.enable_pmu=N.
Changed since v2:
  - Remove 'static' from host_cpuid_vendorX.
  - Change has_pmu_version to pmu_version.
  - Use object_property_get_int() to get the CPU family.
  - Use cpuid_find_entry() instead of cpu_x86_cpuid().
  - Send an error log when the host and guest are from different vendors.
  - Move "if (!cpu->enable_pmu)" to the beginning of the function. Add
    comments to remind developers.
  - Add support for Zhaoxin. Change is_same_vendor() to
    is_host_compat_vendor().
  - Didn't add Reviewed-by from Sandipan because the change isn't minor.
Changed since v3:
  - Use host_cpu_vendor_fms() from Zhao's patch.
  - Check AMD directly to make the "compat" rule clear.
  - Add a comment to MAX_GP_COUNTERS.
  - Skip PMU info initialization if kvm_pmu_disabled.

 target/i386/cpu.h     |  12 +++
 target/i386/kvm/kvm.c | 175 +++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 183 insertions(+), 4 deletions(-)

diff --git a/target/i386/cpu.h b/target/i386/cpu.h
index 76f24446a5..5d5266f89e 100644
--- a/target/i386/cpu.h
+++ b/target/i386/cpu.h
@@ -490,6 +490,14 @@ typedef enum X86Seg {
 #define MSR_CORE_PERF_GLOBAL_CTRL       0x38f
 #define MSR_CORE_PERF_GLOBAL_OVF_CTRL   0x390
 
+#define MSR_K7_EVNTSEL0                 0xc0010000
+#define MSR_K7_PERFCTR0                 0xc0010004
+#define MSR_F15H_PERF_CTL0              0xc0010200
+#define MSR_F15H_PERF_CTR0              0xc0010201
+
+#define AMD64_NUM_COUNTERS              4
+#define AMD64_NUM_COUNTERS_CORE         6
+
 #define MSR_MC0_CTL                     0x400
 #define MSR_MC0_STATUS                  0x401
 #define MSR_MC0_ADDR                    0x402
@@ -1608,6 +1616,10 @@ typedef struct {
 #endif
 
 #define MAX_FIXED_COUNTERS 3
+/*
+ * This formula is based on Intel's MSR layout. The current size also meets
+ * AMD's needs.
+ */
 #define MAX_GP_COUNTERS    (MSR_IA32_PERF_STATUS - MSR_P6_EVNTSEL0)
 
 #define TARGET_INSN_START_EXTRA_WORDS 1
diff --git a/target/i386/kvm/kvm.c b/target/i386/kvm/kvm.c
index 38cc1a5f43..b8926bd4cb 100644
--- a/target/i386/kvm/kvm.c
+++ b/target/i386/kvm/kvm.c
@@ -2076,7 +2076,7 @@ int kvm_arch_pre_create_vcpu(CPUState *cpu, Error **errp)
     return 0;
 }
 
-static void kvm_init_pmu_info(struct kvm_cpuid2 *cpuid)
+static void kvm_init_pmu_info_intel(struct kvm_cpuid2 *cpuid)
 {
     struct kvm_cpuid_entry2 *c;
 
@@ -2109,6 +2109,96 @@ static void kvm_init_pmu_info(struct kvm_cpuid2 *cpuid)
     }
 }
 
+static void kvm_init_pmu_info_amd(struct kvm_cpuid2 *cpuid, X86CPU *cpu)
+{
+    struct kvm_cpuid_entry2 *c;
+    int64_t family;
+
+    family = object_property_get_int(OBJECT(cpu), "family", NULL);
+    if (family < 0) {
+        return;
+    }
+
+    if (family < 6) {
+        error_report("AMD performance-monitoring is supported from "
+                     "K7 and later");
+        return;
+    }
+
+    pmu_version = 1;
+    num_pmu_gp_counters = AMD64_NUM_COUNTERS;
+
+    c = cpuid_find_entry(cpuid, 0x80000001, 0);
+    if (!c) {
+        return;
+    }
+
+    if (!(c->ecx & CPUID_EXT3_PERFCORE)) {
+        return;
+    }
+
+    num_pmu_gp_counters = AMD64_NUM_COUNTERS_CORE;
+}
+
+static bool is_host_compat_vendor(CPUX86State *env)
+{
+    char host_vendor[CPUID_VENDOR_SZ + 1];
+
+    host_cpu_vendor_fms(host_vendor, NULL, NULL, NULL);
+
+    /*
+     * Intel and Zhaoxin are compatible.
+     */
+    if ((g_str_equal(host_vendor, CPUID_VENDOR_INTEL) ||
+         g_str_equal(host_vendor, CPUID_VENDOR_ZHAOXIN1) ||
+         g_str_equal(host_vendor, CPUID_VENDOR_ZHAOXIN2)) &&
+        (IS_INTEL_CPU(env) || IS_ZHAOXIN_CPU(env))) {
+        return true;
+    }
+
+    return g_str_equal(host_vendor, CPUID_VENDOR_AMD) &&
+           IS_AMD_CPU(env);
+}
+
+static void kvm_init_pmu_info(struct kvm_cpuid2 *cpuid, X86CPU *cpu)
+{
+    CPUX86State *env = &cpu->env;
+
+    /*
+     * The PMU virtualization is disabled by kvm.enable_pmu=N.
+     */
+    if (kvm_pmu_disabled) {
+        return;
+    }
+
+    /*
+     * If KVM_CAP_PMU_CAPABILITY is not supported, there is no way to
+     * disable the AMD PMU virtualization.
+     *
+     * Assume the user is aware of this when !cpu->enable_pmu. AMD PMU
+     * registers are not going to be reset, even though they are still
+     * available to the guest VM.
+     */
+    if (!cpu->enable_pmu) {
+        return;
+    }
+
+    /*
+     * It is not supported to virtualize AMD PMU registers on Intel
+     * processors, nor to virtualize Intel PMU registers on AMD processors.
+     */
+    if (!is_host_compat_vendor(env)) {
+        error_report("host doesn't support requested feature: vPMU");
+        return;
+    }
+
+    if (IS_INTEL_CPU(env) || IS_ZHAOXIN_CPU(env)) {
+        kvm_init_pmu_info_intel(cpuid);
+    } else if (IS_AMD_CPU(env)) {
+        kvm_init_pmu_info_amd(cpuid, cpu);
+    }
+}
+
 int kvm_arch_init_vcpu(CPUState *cs)
 {
     struct {
@@ -2291,7 +2381,7 @@ int kvm_arch_init_vcpu(CPUState *cs)
     cpuid_i = kvm_x86_build_cpuid(env, cpuid_data.entries, cpuid_i);
     cpuid_data.cpuid.nent = cpuid_i;
 
-    kvm_init_pmu_info(&cpuid_data.cpuid);
+    kvm_init_pmu_info(&cpuid_data.cpuid, cpu);
 
     if (((env->cpuid_version >> 8)&0xF) >= 6
         && (env->features[FEAT_1_EDX] & (CPUID_MCE | CPUID_MCA)) ==
@@ -4071,7 +4161,7 @@ static int kvm_put_msrs(X86CPU *cpu, int level)
             kvm_msr_entry_add(cpu, MSR_KVM_POLL_CONTROL, env->poll_control_msr);
         }
 
-        if (pmu_version > 0) {
+        if ((IS_INTEL_CPU(env) || IS_ZHAOXIN_CPU(env)) && pmu_version > 0) {
             if (pmu_version > 1) {
                 /* Stop the counter.  */
                 kvm_msr_entry_add(cpu, MSR_CORE_PERF_FIXED_CTR_CTRL, 0);
@@ -4102,6 +4192,38 @@ static int kvm_put_msrs(X86CPU *cpu, int level)
                                   env->msr_global_ctrl);
             }
         }
+
+        if (IS_AMD_CPU(env) && pmu_version > 0) {
+            uint32_t sel_base = MSR_K7_EVNTSEL0;
+            uint32_t ctr_base = MSR_K7_PERFCTR0;
+            /*
+             * The address of the next selector or counter register is
+             * obtained by incrementing the address of the current selector
+             * or counter register by one.
+             */
+            uint32_t step = 1;
+
+            /*
+             * When PERFCORE is enabled, AMD PMU uses a separate set of
+             * addresses for the selector and counter registers.
+             * Additionally, the address of the next selector or counter
+             * register is determined by incrementing the address of the
+             * current register by two.
+             */
+            if (num_pmu_gp_counters == AMD64_NUM_COUNTERS_CORE) {
+                sel_base = MSR_F15H_PERF_CTL0;
+                ctr_base = MSR_F15H_PERF_CTR0;
+                step = 2;
+            }
+
+            for (i = 0; i < num_pmu_gp_counters; i++) {
+                kvm_msr_entry_add(cpu, ctr_base + i * step,
+                                  env->msr_gp_counters[i]);
+                kvm_msr_entry_add(cpu, sel_base + i * step,
+                                  env->msr_gp_evtsel[i]);
+            }
+        }
+
         /*
          * Hyper-V partition-wide MSRs: to avoid clearing them on cpu hot-add,
          * only sync them to KVM on the first cpu
@@ -4549,7 +4671,8 @@ static int kvm_get_msrs(X86CPU *cpu)
     if (env->features[FEAT_KVM] & CPUID_KVM_POLL_CONTROL) {
         kvm_msr_entry_add(cpu, MSR_KVM_POLL_CONTROL, 1);
     }
-    if (pmu_version > 0) {
+
+    if ((IS_INTEL_CPU(env) || IS_ZHAOXIN_CPU(env)) && pmu_version > 0) {
         if (pmu_version > 1) {
             kvm_msr_entry_add(cpu, MSR_CORE_PERF_FIXED_CTR_CTRL, 0);
             kvm_msr_entry_add(cpu, MSR_CORE_PERF_GLOBAL_CTRL, 0);
@@ -4565,6 +4688,35 @@
         }
     }
 
+    if (IS_AMD_CPU(env) && pmu_version > 0) {
+        uint32_t sel_base = MSR_K7_EVNTSEL0;
+        uint32_t ctr_base = MSR_K7_PERFCTR0;
+        /*
+         * The address of the next selector or counter register is
+         * obtained by incrementing the address of the current selector
+         * or counter register by one.
+         */
+        uint32_t step = 1;
+
+        /*
+         * When PERFCORE is enabled, AMD PMU uses a separate set of
+         * addresses for the selector and counter registers.
+         * Additionally, the address of the next selector or counter
+         * register is determined by incrementing the address of the
+         * current register by two.
+         */
+        if (num_pmu_gp_counters == AMD64_NUM_COUNTERS_CORE) {
+            sel_base = MSR_F15H_PERF_CTL0;
+            ctr_base = MSR_F15H_PERF_CTR0;
+            step = 2;
+        }
+
+        for (i = 0; i < num_pmu_gp_counters; i++) {
+            kvm_msr_entry_add(cpu, ctr_base + i * step, 0);
+            kvm_msr_entry_add(cpu, sel_base + i * step, 0);
+        }
+    }
+
     if (env->mcg_cap) {
         kvm_msr_entry_add(cpu, MSR_MCG_STATUS, 0);
         kvm_msr_entry_add(cpu, MSR_MCG_CTL, 0);
@@ -4876,6 +5028,21 @@ static int kvm_get_msrs(X86CPU *cpu)
         case MSR_P6_EVNTSEL0 ... MSR_P6_EVNTSEL0 + MAX_GP_COUNTERS - 1:
             env->msr_gp_evtsel[index - MSR_P6_EVNTSEL0] = msrs[i].data;
             break;
+        case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL0 + AMD64_NUM_COUNTERS - 1:
+            env->msr_gp_evtsel[index - MSR_K7_EVNTSEL0] = msrs[i].data;
+            break;
+        case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR0 + AMD64_NUM_COUNTERS - 1:
+            env->msr_gp_counters[index - MSR_K7_PERFCTR0] = msrs[i].data;
+            break;
+        case MSR_F15H_PERF_CTL0 ...
+             MSR_F15H_PERF_CTL0 + AMD64_NUM_COUNTERS_CORE * 2 - 1:
+            index = index - MSR_F15H_PERF_CTL0;
+            if (index & 0x1) {
+                env->msr_gp_counters[index / 2] = msrs[i].data;
+            } else {
+                env->msr_gp_evtsel[index / 2] = msrs[i].data;
+            }
+            break;
         case HV_X64_MSR_HYPERCALL:
             env->msr_hv_hypercall = msrs[i].data;
             break;
-- 
2.39.3