On 3/24/2025 11:01 PM, Mingwei Zhang wrote:
> From: Dapeng Mi <dapeng1...@linux.intel.com>
>
> Check if rdpmc can be intercepted for mediated vPMU. Simply speaking,
> if guest own all PMU counters in mediated vPMU, then rdpmc interception
> should be disabled to mitigate the performance impact, otherwise rdpmc
> has to be intercepted to avoid guest obtain host counter's data via
> rdpmc instruction.
>
> Co-developed-by: Mingwei Zhang <mizh...@google.com>
> Signed-off-by: Mingwei Zhang <mizh...@google.com>
> Co-developed-by: Sandipan Das <sandipan....@amd.com>
> Signed-off-by: Sandipan Das <sandipan....@amd.com>
> Signed-off-by: Dapeng Mi <dapeng1...@linux.intel.com>
> ---
>  arch/x86/include/asm/msr-index.h |  1 +
>  arch/x86/kvm/pmu.c               | 34 ++++++++++++++++++++++++++++++++
>  arch/x86/kvm/pmu.h               | 19 ++++++++++++++++++
>  arch/x86/kvm/svm/pmu.c           | 14 ++++++++++++-
>  arch/x86/kvm/vmx/pmu_intel.c     | 18 ++++++++---------
>  5 files changed, 76 insertions(+), 10 deletions(-)
>
> diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
> index ca70846ffd55..337f4b0a2998 100644
> --- a/arch/x86/include/asm/msr-index.h
> +++ b/arch/x86/include/asm/msr-index.h
> @@ -312,6 +312,7 @@
>  #define PERF_CAP_PEBS_FORMAT           0xf00
>  #define PERF_CAP_FW_WRITES             BIT_ULL(13)
>  #define PERF_CAP_PEBS_BASELINE         BIT_ULL(14)
> +#define PERF_CAP_PERF_METRICS          BIT_ULL(15)
>  #define PERF_CAP_PEBS_MASK     (PERF_CAP_PEBS_TRAP | PERF_CAP_ARCH_REG | \
>                                  PERF_CAP_PEBS_FORMAT | PERF_CAP_PEBS_BASELINE)
>
> diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
> index 92c742ead663..6ad71752be4b 100644
> --- a/arch/x86/kvm/pmu.c
> +++ b/arch/x86/kvm/pmu.c
> @@ -604,6 +604,40 @@ int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned idx, u64 *data)
>         return 0;
>  }
>
> +inline bool kvm_rdpmc_in_guest(struct kvm_vcpu *vcpu)
> +{
> +       struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
> +
> +       if (!kvm_mediated_pmu_enabled(vcpu))
> +               return false;
> +
> +       /*
> +        * VMware allows access to these Pseduo-PMCs even when read via RDPMC
> +        * in Ring3 when CR4.PCE=0.
> +        */
> +       if (enable_vmware_backdoor)
> +               return false;
> +
> +       /*
> +        * FIXME: In theory, perf metrics is always combined with fixed
> +        * counter 3. it's fair enough to compare the guest and host
> +        * fixed counter number and don't need to check perf metrics
> +        * explicitly. However kvm_pmu_cap.num_counters_fixed is limited
> +        * KVM_MAX_NR_FIXED_COUNTERS (3) as fixed counter 3 is not
> +        * supported now. perf metrics is still needed to be checked
> +        * explicitly here. Once fixed counter 3 is supported, the perf
> +        * metrics checking can be removed.
> +        */
> +       return pmu->nr_arch_gp_counters == kvm_pmu_cap.num_counters_gp &&
> +              pmu->nr_arch_fixed_counters == kvm_pmu_cap.num_counters_fixed &&
> +              vcpu_has_perf_metrics(vcpu) == kvm_host_has_perf_metrics() &&
> +              pmu->counter_bitmask[KVM_PMC_GP] ==
> +                       (BIT_ULL(kvm_pmu_cap.bit_width_gp) - 1) &&
> +              pmu->counter_bitmask[KVM_PMC_FIXED] ==
> +                       (BIT_ULL(kvm_pmu_cap.bit_width_fixed) - 1);
> +}
> +EXPORT_SYMBOL_GPL(kvm_rdpmc_in_guest);
> +
>  void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu)
>  {
>         if (lapic_in_kernel(vcpu)) {
> diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h
> index e1d0096f249b..509c995b7871 100644
> --- a/arch/x86/kvm/pmu.h
> +++ b/arch/x86/kvm/pmu.h
> @@ -271,6 +271,24 @@ static inline bool pmc_is_globally_enabled(struct kvm_pmc *pmc)
>         return test_bit(pmc->idx, (unsigned long *)&pmu->global_ctrl);
>  }
>
> +static inline u64 vcpu_get_perf_capabilities(struct kvm_vcpu *vcpu)
> +{
> +       if (!guest_cpu_cap_has(vcpu, X86_FEATURE_PDCM))
> +               return 0;
> +
> +       return vcpu->arch.perf_capabilities;
> +}
> +
> +static inline bool vcpu_has_perf_metrics(struct kvm_vcpu *vcpu)
> +{
> +       return !!(vcpu_get_perf_capabilities(vcpu) & PERF_CAP_PERF_METRICS);
> +}
> +
> +static inline bool kvm_host_has_perf_metrics(void)
> +{
> +       return !!(kvm_host.perf_capabilities & PERF_CAP_PERF_METRICS);
> +}
> +
>  void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu);
>  void kvm_pmu_handle_event(struct kvm_vcpu *vcpu);
>  int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned pmc, u64 *data);
> @@ -287,6 +305,7 @@ void kvm_pmu_trigger_event(struct kvm_vcpu *vcpu, u64 eventsel);
>  bool vcpu_pmu_can_enable(struct kvm_vcpu *vcpu);
>
>  bool is_vmware_backdoor_pmc(u32 pmc_idx);
> +bool kvm_rdpmc_in_guest(struct kvm_vcpu *vcpu);
>
>  extern struct kvm_pmu_ops intel_pmu_ops;
>  extern struct kvm_pmu_ops amd_pmu_ops;
> diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c
> index c8b9fd9b5350..153972e944eb 100644
> --- a/arch/x86/kvm/svm/pmu.c
> +++ b/arch/x86/kvm/svm/pmu.c
> @@ -173,7 +173,7 @@ static int amd_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
>         return 1;
>  }
>
> -static void amd_pmu_refresh(struct kvm_vcpu *vcpu)
> +static void __amd_pmu_refresh(struct kvm_vcpu *vcpu)
>  {
>         struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
>         union cpuid_0x80000022_ebx ebx;
> @@ -212,6 +212,18 @@ static void amd_pmu_refresh(struct kvm_vcpu *vcpu)
>         bitmap_set(pmu->all_valid_pmc_idx, 0, pmu->nr_arch_gp_counters);
>  }
>
> +static void amd_pmu_refresh(struct kvm_vcpu *vcpu)
> +{
> +       struct vcpu_svm *svm = to_svm(vcpu);
> +
> +       __amd_pmu_refresh(vcpu);
> +
> +       if (kvm_rdpmc_in_guest(vcpu))
> +               svm_clr_intercept(svm, INTERCEPT_RDPMC);
> +       else
> +               svm_set_intercept(svm, INTERCEPT_RDPMC);
> +}
> +
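As context for what clearing the intercept buys here: with INTERCEPT_RDPMC clear (or CPU_BASED_RDPMC_EXITING clear on the Intel side), the guest reads counters directly with the RDPMC instruction instead of exiting to kvm_pmu_rdpmc(). A minimal, purely illustrative guest-side read could look roughly like the sketch below; the helper and program are made up for illustration and assume the guest kernel has set CR4.PCE so ring-3 RDPMC does not #GP.

#include <stdint.h>
#include <stdio.h>

/*
 * Illustrative only: RDPMC takes the counter selector in ECX (bit 30 set
 * selects a fixed counter on Intel) and returns the value in EDX:EAX.
 */
static inline uint64_t rdpmc(uint32_t ecx)
{
        uint32_t lo, hi;

        asm volatile("rdpmc" : "=a"(lo), "=d"(hi) : "c"(ecx));
        return ((uint64_t)hi << 32) | lo;
}

int main(void)
{
        /* Read general-purpose counter 0; faults if CR4.PCE is clear. */
        printf("GP counter 0: %llu\n", (unsigned long long)rdpmc(0));
        return 0;
}

Whether such a read executes natively or traps is exactly what the amd_pmu_refresh()/intel_pmu_refresh() wrappers in this patch decide per vCPU.
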
After putting kprobes on kvm_pmu_rdpmc(), I noticed that RDPMC instructions were getting intercepted for the secondary vCPUs. This happens because when secondary vCPUs come up, kvm_vcpu_reset() gets called after guest CPUID has been updated. While RDPMC interception is initially disabled in the kvm_pmu_refresh() path, it gets re-enabled in the kvm_vcpu_reset() path as svm_vcpu_reset() calls init_vmcb(). We should consider adding the following change to avoid that.

diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 6f9142063cc4..1c9c183092f3 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -1354,7 +1354,6 @@ static void init_vmcb(struct kvm_vcpu *vcpu)
 	svm_set_intercept(svm, INTERCEPT_SMI);
 	svm_set_intercept(svm, INTERCEPT_SELECTIVE_CR0);
-	svm_set_intercept(svm, INTERCEPT_RDPMC);
 	svm_set_intercept(svm, INTERCEPT_CPUID);
 	svm_set_intercept(svm, INTERCEPT_INVD);
 	svm_set_intercept(svm, INTERCEPT_INVLPG);

>  static void amd_pmu_init(struct kvm_vcpu *vcpu)
>  {
>         struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
> diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c
> index fc017e9a6a0c..2a5f79206b02 100644
> --- a/arch/x86/kvm/vmx/pmu_intel.c
> +++ b/arch/x86/kvm/vmx/pmu_intel.c
> @@ -108,14 +108,6 @@ static struct kvm_pmc *intel_rdpmc_ecx_to_pmc(struct kvm_vcpu *vcpu,
>         return &counters[array_index_nospec(idx, num_counters)];
>  }
>
> -static inline u64 vcpu_get_perf_capabilities(struct kvm_vcpu *vcpu)
> -{
> -       if (!guest_cpu_cap_has(vcpu, X86_FEATURE_PDCM))
> -               return 0;
> -
> -       return vcpu->arch.perf_capabilities;
> -}
> -
>  static inline bool fw_writes_is_enabled(struct kvm_vcpu *vcpu)
>  {
>         return (vcpu_get_perf_capabilities(vcpu) & PERF_CAP_FW_WRITES) != 0;
> @@ -456,7 +448,7 @@ static void intel_pmu_enable_fixed_counter_bits(struct kvm_pmu *pmu, u64 bits)
>                 pmu->fixed_ctr_ctrl_rsvd &= ~intel_fixed_bits_by_idx(i, bits);
>  }
>
> -static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
> +static void __intel_pmu_refresh(struct kvm_vcpu *vcpu)
>  {
>         struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
>         struct lbr_desc *lbr_desc = vcpu_to_lbr_desc(vcpu);
> @@ -564,6 +556,14 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
>         }
>  }
>
> +static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
> +{
> +       __intel_pmu_refresh(vcpu);
> +
> +       exec_controls_changebit(to_vmx(vcpu), CPU_BASED_RDPMC_EXITING,
> +                               !kvm_rdpmc_in_guest(vcpu));
> +}
> +
>  static void intel_pmu_init(struct kvm_vcpu *vcpu)
>  {
>         int i;
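Unrelated to the fix itself, but since interception state came up: when host-side kprobes are not convenient, a rough guest-side heuristic (illustrative sketch only, all names made up, works for either the SVM or the VMX flavor of the toggle above) is to time RDPMC with RDTSC; a handful of cycles per read suggests the native path, while thousands of cycles per read point at RDPMC exits being taken and emulated.

#include <stdint.h>
#include <stdio.h>

/* Read the TSC; result in EDX:EAX. */
static inline uint64_t rdtsc(void)
{
        uint32_t lo, hi;

        asm volatile("rdtsc" : "=a"(lo), "=d"(hi));
        return ((uint64_t)hi << 32) | lo;
}

/* Read the PMU counter selected by ECX; result in EDX:EAX. */
static inline uint64_t rdpmc(uint32_t ecx)
{
        uint32_t lo, hi;

        asm volatile("rdpmc" : "=a"(lo), "=d"(hi) : "c"(ecx));
        return ((uint64_t)hi << 32) | lo;
}

int main(void)
{
        const int iters = 100000;
        volatile uint64_t sink = 0;
        uint64_t start, cycles;

        start = rdtsc();
        for (int i = 0; i < iters; i++)
                sink += rdpmc(0);       /* GP counter 0 */
        cycles = rdtsc() - start;

        (void)sink;
        /* Roughly: tens of cycles per read -> native; thousands -> VM exits. */
        printf("avg cycles per RDPMC: %llu\n",
               (unsigned long long)(cycles / iters));
        return 0;
}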