On Wed, May 20, 2026, David Woodhouse wrote:
> On Fri, 2026-05-15 at 12:19 -0700, Sean Christopherson wrote:
> > diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
> > index b5991d53fc0e..e9e7394140dd 100644
> > --- a/arch/x86/kernel/kvmclock.c
> > +++ b/arch/x86/kernel/kvmclock.c
> > @@ -321,8 +321,8 @@ void __init kvmclock_init(void)
> > flags = pvclock_read_flags(&hv_clock_boot[0].pvti);
> > kvm_sched_clock_init(flags & PVCLOCK_TSC_STABLE_BIT);
> >
> > - x86_platform.calibrate_tsc = kvm_get_tsc_khz;
> > - x86_platform.calibrate_cpu = kvm_get_tsc_khz;
> > + tsc_register_calibration_routines(kvm_get_tsc_khz, kvm_get_tsc_khz);
> > +
> > x86_platform.get_wallclock = kvm_get_wallclock;
> > x86_platform.set_wallclock = kvm_set_wallclock;
> > #ifdef CONFIG_X86_LOCAL_APIC
>
> Can we move those (and maybe everything in the context there too) up
> *before* the check for no-kvmclock at the top of the function?
Oof, I was going to say "no", but disabling kvmclock is exactly the workaround
I've told people to use to get the kernel to use the TSC instead of kvmclock.
> Probably in a separate patch.
Ya. I think this?
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 08ee4bc304c8..92a1ebf31e4d 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -27,6 +27,7 @@ static int kvmclock_vsyscall __initdata = 1;
static int msr_kvm_system_time __ro_after_init;
static int msr_kvm_wall_clock __ro_after_init;
static u64 kvm_sched_clock_offset __ro_after_init;
+static unsigned int kvm_tsc_khz_cpuid __ro_after_init;
static int __init parse_no_kvmclock(char *arg)
{
@@ -207,7 +208,7 @@ static unsigned long kvm_get_tsc_khz(void)
lapic_timer_period = apic_khz * 1000 / HZ;
#endif
- return kvm_para_tsc_khz() ? : pvclock_tsc_khz(this_cpu_pvti());
+ return kvm_tsc_khz_cpuid ? : pvclock_tsc_khz(this_cpu_pvti());
}
static unsigned long kvm_get_cpu_khz(void)
@@ -387,9 +388,39 @@ void __init kvmclock_init(void)
enum tsc_properties tsc_properties = TSC_FREQUENCY_KNOWN;
bool stable = false;
- if (!kvm_para_available() || !kvmclock)
+ if (!kvm_para_available())
return;
+ /*
+ * If the TSC counts at a constant frequency across P/T states, counts
+ * in deep C-states, and the TSC hasn't been marked unstable, treat the
+ * TSC as reliable, as guaranteed by KVM. Note, the TSC unstable check
+ * exists purely to honor the TSC being marked unstable via command
+ * line; any runtime detection of an unstable TSC happens after this.
+ */
+ if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC) &&
+ boot_cpu_has(X86_FEATURE_NONSTOP_TSC) &&
+ !check_tsc_unstable())
+ tsc_properties = TSC_FREQ_KNOWN_AND_RELIABLE;
+
+ kvm_tsc_khz_cpuid = kvm_para_tsc_khz();
+
+ /*
+ * If provided, use the TSC (and APIC bus) frequency provided in KVM's
+ * PV CPUID leaf even if kvmclock itself is disabled via command line.
+ * The PV CPUID information isn't dependent on kvmclock in any way, and
+ * in fact using the precise information is *more* important when the
+ * user has explicitly disabled kvmclock to force the kernel to use the
+ * TSC as its clocksource.
+ */
+ if (!kvmclock) {
+ if (kvm_tsc_khz_cpuid)
+ tsc_register_calibration_routines(kvm_get_tsc_khz,
+ kvm_get_cpu_khz,
+ tsc_properties);
+ return;
+ }
+
if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE2)) {
msr_kvm_system_time = MSR_KVM_SYSTEM_TIME_NEW;
msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK_NEW;
@@ -424,21 +455,14 @@ void __init kvmclock_init(void)
}
/*
- * If the TSC counts at a constant frequency across P/T states, counts
- * in deep C-states, and the TSC hasn't been marked unstable, prefer
- * the TSC over kvmclock for sched_clock and drop kvmclock's rating so
- * that TSC is chosen as the clocksource. Note, the TSC unstable check
- * exists purely to honor the TSC being marked unstable via command
- * line, any runtime detection of an unstable will happen after this.
+ * If the TSC is reliable (see above), prefer the TSC over kvmclock for
+ * sched_clock and drop kvmclock's rating so that TSC is chosen as the
+ * clocksource.
*/
- if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC) &&
- boot_cpu_has(X86_FEATURE_NONSTOP_TSC) &&
- !check_tsc_unstable()) {
+ if (tsc_properties & TSC_RELIABLE)
kvm_clock.rating = 299;
- tsc_properties = TSC_FREQ_KNOWN_AND_RELIABLE;
- } else {
+ else
kvm_sched_clock_init(stable);
- }
tsc_register_calibration_routines(kvm_get_tsc_khz, kvm_get_cpu_khz,
tsc_properties);