Bit rounding isn't the main issue; the difference in IPI delivery to the
cores accumulates because of the ordering. Replacing get_s_time_fixed with
scale_delta in time_calibration_rendezvous_tail should be sufficient. With
that configuration errors no longer accumulate; bit rounding can still
cause a difference from one calibration to the next, but it is much less
significant.
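
For reference, a minimal sketch of what I mean, based on my reading of
time_calibration_rendezvous_tail() in xen/arch/x86/time.c (the exact shape
may differ in your tree); boot_stamp and boot_scale are placeholder names
for whatever fixed reference the real patch would use, not existing Xen
symbols:

```
/*
 * Today the new stamp is derived from the previous, already-truncated one:
 *
 *     c->local_stime = get_s_time_fixed(old_tsc ?: new_tsc);
 *
 * so the low bits dropped by scale_delta() compound at every calibration.
 * Scaling the whole delta from one fixed reference avoids that:
 */
c->local_tsc    = new_tsc;
c->local_stime  = boot_stamp.local_stime +
    scale_delta((old_tsc ?: new_tsc) - boot_stamp.local_tsc, &boot_scale);
c->master_stime = r->master_stime;
```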

On Fri, Jan 9, 2026 at 7:11 PM Anton Markov <[email protected]> wrote:

> You're right. It isn't interrupts inside get_s_time_fixed, but rather a
> cumulative error in the calibration sequence due to integer rounding. I
> added logging of the current local_stime to local_time_calibration and
> noticed that the gap between the cores' timestamps gradually grows. If the
> server has been running for weeks, this could become a very large value.
>
>
> ```
> @@ -1732,6 +1753,8 @@ static void cf_check local_time_calibration(void)
>      if ( boot_cpu_has(X86_FEATURE_CONSTANT_TSC) )
>      {
>          /* Atomically read cpu_calibration struct and write cpu_time struct. */
> +        printk("update stime on time calibrate %d, %lu -> %lu (%lu, %lu)\n", smp_processor_id(), t->stamp.local_stime, c->local_stime,
> +               t->last_seen_ns, t->last_seen_tsc);
>          local_irq_disable();
>          t->stamp = *c;
>          local_irq_enable();
> ```
>
>
> After 2 hours of uptime:
>
>
> ```
> (XEN) update stime on time calibrate 0, 8564145820102 -> 8565145861597 (8565145862216, 0)
> (XEN) update stime on time calibrate 1, 8564145820129 -> 8565145861609 (8565145863957, 0)
> (XEN) update stime on time calibrate 3, 8564145819996 -> 8565145861491 (8565145864800, 0)
> (XEN) update stime on time calibrate 2, 8564145820099 -> 8565145861609 (8565145865372, 0)
>
> 8565145861609 - 8565145861491 = 118 ns; at 3.00 GHz that is ~354 TSC ticks of lag
> ```
>
>
> After 6 hours of uptime:
>
> ```
> (XEN) update stime on time calibrate 0, 22914730829200 -> 22915730869993 (22915730870665, 0)
> (XEN) update stime on time calibrate 1, 22914730829073 -> 22915730869889 (22915730870693, 0)
> (XEN) update stime on time calibrate 2, 22914730829052 -> 22915730869841 (22915730872231, 0)
> (XEN) update stime on time calibrate 3, 22914730828892 -> 22915730869696 (22915730872096, 0)
>
> 22915730869993 - 22915730869696 = 297 ns; at 3.00 GHz that is ~891 TSC ticks of lag
> ```
>
>
> For context, these are my Xen command-line settings:
>
> ```
> ucode=scan dom0_mem=53923M,max:53923M dom0_max_vcpus=4-96 dom0_vcpus_pin=0
> force-ept=1 ept=no-ad,no-pml hap_1gb=0 hap_2mb=0 altp2m=1
> hpet=legacy-replacement smt=1 spec-ctrl=no gnttab_max_frames=512
> cpufreq=xen:performance max_cstate=1 sched=credit sched-gran=cpu apicv=0
> sched_credit_tslice_ms=5 sched_ratelimit_us=500
> ```
>
>
> Processors I tested on:
>
> ```
> Intel(R) Core(TM) i5-3330 CPU @ 3.00GHz
>
> Flags: fpu de tsc msr pae mce cx8 apic sep mca cmov pat clflush acpi mmx
> fxsr sse sse2 ss ht syscall nx rdtscp lm constant_tsc rep_good nopl
> nonstop_tsc cpuid tsc_known_freq pni pclmulqdq monitor est ssse3 cx16
> sse4_1 sse4_2 popcnt aes xsave avx f16c hypervisor lahf_lm cpuid_fault
> fsgsbase erms xsaveopt arch_capabilities
> ```
>
> ```
> Intel(R) Xeon(R) Gold 5318Y CPU @ 2.10GHz x2 (NUMA)
>
> Flags: fpu de tsc msr pae mce cx8 apic sep mca cmov pat clflush acpi mmx
> fxsr sse sse2 ss ht syscall nx rdtscp lm constant_tsc rep_good nopl
> nonstop_tsc cpuid tsc_known_freq pni pclmulqdq monitor est ssse3 fma cx16
> sse4_1 sse4_2 movbe popcnt aes xsave avx f16c rdrand hypervisor lahf_lm abm
> 3dnowprefetch cpuid_fault ibpb fsgsbase bmi1 avx2 bmi2 erms rtm avx512f
> avx512dq rdseed adx avx512ifma clflushopt clwb avx512cd sha_ni avx512bw
> avx512vl xsaveopt xsavec xgetbv1 avx512vbmi avx512_vbmi2 gfni vaes
> vpclmulqdq avx512_vnni avx512_bitalg avx512_vpopcntdq rdpid fsrm md_clear
> arch_capabilities
> ```
>
>
> Next I moved the code to ring 3 (user space) to speed up reproduction:
>
>
> ```
> #include <stdint.h>
> #include <stdio.h>
>
> static __inline__ unsigned long long rdtsc(void)
> {
>     unsigned hi, lo;
>     __asm__ __volatile__ ("rdtsc" : "=a"(lo), "=d"(hi));
>     return ((unsigned long long)lo) | (((unsigned long long)hi) << 32);
> }
>
> #define MILLISECS(_ms) ((int64_t)((_ms) * 1000000ULL))
>
> struct cpu_time_stamp {
>     uint64_t local_tsc;
>     int64_t local_stime;
>     int64_t master_stime;
> };
>
> struct time_scale {
>     int shift;
>     uint32_t mul_frac;
> };
>
> /* Returns (dividend << 32) / divisor. */
> static inline uint32_t div_frac(uint32_t dividend, uint32_t divisor)
> {
>     uint32_t quotient, remainder;
>
>     asm ( "divl %4"
>           : "=a" (quotient), "=d" (remainder)
>           : "0" (0), "1" (dividend), "r" (divisor) );
>
>     return quotient;
> }
>
> void set_time_scale(struct time_scale *ts, uint64_t ticks_per_sec)
> {
>     uint64_t tps64 = ticks_per_sec;
>     uint32_t tps32;
>     int shift = 0;
>
>     while ( tps64 > (MILLISECS(1000)*2) )
>     {
>         tps64 >>= 1;
>         shift--;
>     }
>
>     tps32 = (uint32_t)tps64;
>     while ( tps32 <= (uint32_t)MILLISECS(1000) )
>     {
>         tps32 <<= 1;
>         shift++;
>     }
>
>     ts->mul_frac = div_frac(MILLISECS(1000), tps32);
>     ts->shift = shift;
> }
>
> /* (delta << shift) * mul_frac >> 32; the low bits are truncated. */
> uint64_t scale_delta(uint64_t delta, const struct time_scale *scale)
> {
>     uint64_t product;
>
>     if ( scale->shift < 0 )
>         delta >>= -scale->shift;
>     else
>         delta <<= scale->shift;
>
>     asm ( "mulq %2 ; shrd $32,%1,%0"
>           : "=a" (product), "=d" (delta)
>           : "rm" (delta), "0" ((uint64_t)scale->mul_frac) );
>
>     return product;
> }
>
> #define _TS_MUL_FRAC_IDENTITY 0x80000000UL
>
> static inline struct time_scale scale_reciprocal(struct time_scale scale)
> {
>     struct time_scale reciprocal;
>     uint32_t dividend;
>
>     dividend = _TS_MUL_FRAC_IDENTITY;
>     reciprocal.shift = 1 - scale.shift;
>     while ( dividend >= scale.mul_frac )
>     {
>         dividend >>= 1;
>         reciprocal.shift++;
>     }
>
>     asm ( "divl %4"
>           : "=a" (reciprocal.mul_frac), "=d" (dividend)
>           : "0" (0), "1" (dividend), "r" (scale.mul_frac) );
>
>     return reciprocal;
> }
>
> int64_t get_s_time_fixed(const struct cpu_time_stamp *t,
>                          const struct time_scale *ts, uint64_t at_tsc)
> {
>     uint64_t tsc, delta;
>
>     if ( at_tsc )
>         tsc = at_tsc;
>     else
>         tsc = rdtsc();
>     delta = tsc - t->local_tsc;
>     return t->local_stime + scale_delta(delta, ts);
> }
>
> int main(void)
> {
>     struct cpu_time_stamp ct;
>     struct time_scale ts;
>     struct time_scale back;
>
>     uint64_t start = rdtsc();
>
>     set_time_scale(&ts, 3000000000ULL);
>     back = scale_reciprocal(ts);
>
>     ct.local_tsc = start;
>     ct.local_stime = scale_delta(start, &ts);
>
>     while (1)
>     {
>         /* Rebase the stamp on every iteration, as calibration does. */
>         uint64_t new_tsc = rdtsc();
>         ct.local_stime = get_s_time_fixed(&ct, &ts, new_tsc);
>         ct.local_tsc = new_tsc;
>
>         /* Convert stime back to TSC ticks; the gap to tmp_tsc grows. */
>         uint64_t tmp_tsc = rdtsc();
>         printf("%lu %lu\n", tmp_tsc,
>                scale_delta(get_s_time_fixed(&ct, &ts, tmp_tsc), &back));
>     }
>
>     return 0;
> }
> ```
>
>
> It's enough to run the loop for 10-20 seconds to see the problem:
> scale_delta truncates the least significant bits of each calculation,
> which causes the error to accumulate, at different rates on different
> cores, depending on the low bits of the values at the time of each
> calibration.
>
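> To make the truncation concrete, here is a variant of the loop (my own
> illustration; it reuses set_time_scale() and scale_delta() from the
> program above and assumes a 3.00 GHz clock): chaining one rebase per
> simulated second for two hours ends up several microseconds behind
> scaling the same total delta once.
>
> ```
> /* Illustration: chained per-"calibration" rebasing vs. scaling the
>  * whole TSC delta once.  Each scale_delta() call truncates low bits,
>  * so the chained sum drifts behind the direct conversion. */
> int main(void)
> {
>     struct time_scale ts;
>     uint64_t tsc = 0, chained = 0;
>     unsigned int i;
>
>     set_time_scale(&ts, 3000000000ULL);          /* 3.00 GHz */
>
>     for ( i = 0; i < 7200; i++ )                 /* ~2h of 1s rebases */
>     {
>         chained += scale_delta(3000000000ULL, &ts);   /* 999999999 ns */
>         tsc += 3000000000ULL;
>     }
>
>     printf("chained=%lu direct=%lu diff=%lu\n",
>            chained, scale_delta(tsc, &ts),
>            scale_delta(tsc, &ts) - chained);     /* ~5.5 us after 2h */
>     return 0;
> }
> ```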
>
> Now let's talk about why dwm reacts this way. When a snapshot is restored,
> last_guest_time in hvm_get_guest_time_fixed is set to 0, which doesn't
> prevent time from flowing backwards. This means that cache_tsc_offset in
> hvm_get_guest_tsc_fixed may be configured correctly on one physical core,
> but after being scheduled onto a physical core with a lagging TSC, the
> guest may occasionally see a TSC value lower than one it saw before the
> snapshot was restored. If this happens to the dwm code, it terminates
> with an error.
>
>
> A quick fix for this problem might be to save a per-core last_seen_tsc
> value in the snapshot, and validate against it on restore.
>
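> A minimal sketch of that idea (my own illustration with hypothetical
> names, not existing Xen code): track the highest TSC value each vCPU has
> exposed, save it with the snapshot, and clamp on restore so the guest can
> never read a smaller value than it has already seen:
>
> ```
> /* Hypothetical per-vCPU guard; last_seen_tsc would be saved in the
>  * snapshot and reloaded on restore. */
> struct vcpu_tsc_guard {
>     uint64_t last_seen_tsc;   /* max guest-visible TSC handed out */
> };
>
> static uint64_t guard_guest_tsc(struct vcpu_tsc_guard *g, uint64_t tsc)
> {
>     if ( tsc < g->last_seen_tsc )
>         tsc = g->last_seen_tsc;   /* would go backwards: clamp */
>     else
>         g->last_seen_tsc = tsc;
>     return tsc;
> }
> ```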
>
> The correct solution is to remove the truncation of the least significant
> bits from the calibration sequence.
>
> On Wed, Jan 7, 2026 at 11:02 AM Jan Beulich <[email protected]> wrote:
>
>> On 06.01.2026 21:10, Anton Markov wrote:
>> > Hi, I'm not sure about the other places. In hvm_load_cpu_ctxt
>> > (xen/arch/x86/hvm/hvm.c), it was easy to catch because
>> > process_pending_softirqs is frequently called there, which in turn
>> > processes softirqs from the timer (where the timestamp is updated).
>> > After I fixed sync_tsc in hvm_load_cpu_ctxt, the problem stopped
>> > reproducing under no load. However, when the number of vCPUs is 4 times
>> > greater than the number of CPUs (under heavy load), the problem rarely
>> > reoccurs (mostly during snapshot restores during
>> > process_pending_softirqs calls), and this is no longer a simple case.
>> If
>> > get_s_time_fixed can indeed be interrupted during execution after
>> > rdtsc_ordered, then the current fix is insufficient. It's necessary to
>> > atomically copy "t->stamp" to the stack using local_irq_disable and
>> > local_irq_enable (as in local_time_calibration), and then work with the
>> > copy, confident in its lifetime and immutability. But until
>> > get_s_time_fixed is proven to be interruptible, this is premature, so
>> > your fix is sufficient. I think I need more information and testing to
>> > say more.
>>
>> While the cpu_calibration per-CPU variable is updated from IRQ context,
>> the cpu_time one isn't. Hence t->stamp's contents cannot change behind
>> the back of get_s_time_fixed(). I wonder whether ...
>>
>> > Regarding the other scale_delta calls, if they include values
>> > calculated from externally saved tsc values that could have become
>> > stale during the process_pending_softirqs call, this definitely needs
>> to
>> > be fixed.
>>
>> ... another similar issue (possibly one not included in the set of
>> remarks I have in the patch, as none of those look related to what you
>> describe) might be causing the remaining, more rare problems you say you
>> see. That set of remarks is actually a result of me going over all other
>> scale_delta() calls, but of course I may have got the analysis wrong.
>>
>> As to using 4 times as many vCPU-s as there are pCPU-s (and then heavy
>> load) - while I don't think we have a support statement for such upstream
>> (when probably we should), iirc for our (SUSE's) products we would
>> consider that unsupported. Just fyi.
>>
>> Also, btw, please don't top-post.
>>
>> Jan
>>
>
