Hi Thomas,

On Wed, Jun 20, 2018 at 01:52:10AM +0200, Thomas Gleixner wrote:
> On Fri, 15 Jun 2018, Pavel Tatashin wrote:
>
> > tsc_early_init():
> > Determines offset, shift and multiplier for the early clock based on the
> > TSC frequency.
> >
> > tsc_early_fini()
> > Implement the finish part of early tsc feature, prints message about the
> > offset, which can be useful to find out how much time was spent in post and
> > boot manager (if TSC starts from 0 during boot)
> >
> > sched_clock_early():
> > TSC based implementation of early clock and is called from sched_clock().
> >
> > start_early_clock():
> > Calls tsc_early_init(), and makes sched_clock() to use early boot clock
> >
> > set_final_clock():
> > Sets the final clock which is either platform specific or
> > native_sched_clock(). Also calls tsc_early_fini() if early clock was
> > previously initialized.
> >
> > Call start_early_clock() to start using early boot time stamps
> > functionality on the supported x86 platforms, and call set_final_clock() to
> > finish this feature and switch back to the default clock. The supported x86
> > systems are those where TSC frequency is determined early in boot.
>
> Lots of functions for dubious value.
>
> > +static struct cyc2ns_data cyc2ns_early;
> > +
> > +static u64 sched_clock_early(void)
> > +{
> > +	u64 ns = mul_u64_u32_shr(rdtsc(), cyc2ns_early.cyc2ns_mul,
> > +				 cyc2ns_early.cyc2ns_shift);
> > +	return ns + cyc2ns_early.cyc2ns_offset;
> > +}
> > +
> > +/*
> > + * Initialize clock for early time stamps
> > + */
> > +static void __init tsc_early_init(unsigned int khz)
> > +{
> > +	clocks_calc_mult_shift(&cyc2ns_early.cyc2ns_mul,
> > +			       &cyc2ns_early.cyc2ns_shift,
> > +			       khz, NSEC_PER_MSEC, 0);
> > +	cyc2ns_early.cyc2ns_offset = -sched_clock_early();
> > +}
> > +
> > +/*
> > + * Finish clock for early time stamps, and hand over to permanent clock by
> > + * setting __sched_clock_offset appropriately for continued time keeping.
> > + */
> > +static void __init tsc_early_fini(void)
> > +{
> > +	unsigned long long t;
> > +	unsigned long r;
> > +
> > +	t = -cyc2ns_early.cyc2ns_offset;
> > +	r = do_div(t, NSEC_PER_SEC);
> > +
> > +	__sched_clock_offset = sched_clock_early() - sched_clock();
> > +	pr_info("early sched clock is finished, offset [%lld.%09lds]\n", t, r);
> > +}
> > +
> > +#ifdef CONFIG_PARAVIRT
> > +static inline void __init start_early_clock(void)
> > +{
> > +	tsc_early_init(tsc_khz);
> > +	pv_time_ops.active_sched_clock = sched_clock_early;
> > +}
> > +
> > +static inline void __init set_final_clock(void)
> > +{
> > +	pv_time_ops.active_sched_clock = pv_time_ops.sched_clock;
> > +
> > +	/* We did not have early sched clock if multiplier is 0 */
> > +	if (cyc2ns_early.cyc2ns_mul)
> > +		tsc_early_fini();
> > +}
> > +#else /* CONFIG_PARAVIRT */
> > +/*
> > + * For native clock we use two switches static and dynamic, the static switch is
> > + * initially true, so we check the dynamic switch, which is initially false.
> > + * Later when early clock is disabled, we can alter the static switch in order
> > + * to avoid branch check on every sched_clock() call.
> > + */
> > +static bool __tsc_early;
> > +static DEFINE_STATIC_KEY_TRUE(__tsc_early_static);
> > +
> > +static inline void __init start_early_clock(void)
> > +{
> > +	tsc_early_init(tsc_khz);
> > +	__tsc_early = true;
> > +}
> > +
> > +static inline void __init set_final_clock(void)
> > +{
> > +	__tsc_early = false;
> > +	static_branch_disable(&__tsc_early_static);
> > +
> > +	/* We did not have early sched clock if multiplier is 0 */
> > +	if (cyc2ns_early.cyc2ns_mul)
> > +		tsc_early_fini();
> > +}
> > +#endif /* CONFIG_PARAVIRT */
> > +
> >  /*
> >   * Scheduler clock - returns current time in nanosec units.
> >   */
> > @@ -194,6 +272,13 @@ u64 native_sched_clock(void)
> >  		return cycles_2_ns(tsc_now);
> >  	}
> >
> > +#ifndef CONFIG_PARAVIRT
> > +	if (static_branch_unlikely(&__tsc_early_static)) {
> > +		if (__tsc_early)
> > +			return sched_clock_early();
> > +	}
> > +#endif /* !CONFIG_PARAVIRT */
> > +
>
> This whole function maze plus the ifdeffery which comes with it is really
> horrible and not required. What's wrong with reusing the existing
> functionality?
>
> The patch below (uncompiled and untested) should achieve the same thing
> without all the paravirt muck (which can be easily added w/o all the
> ifdeffery if really required) by just reusing the existing conversion and
> initialization functions.
>
> If I'm not completely mistaken then the second invocation of
> set_cyc2ns_scale() from tsc_init() will also take care of the smooth
> sched_clock() transition from early to final w/o touching the core
> __sched_clock_offset at all. Though my tired brain might trick me.
>
> It might not work as is, but it should not be rocket science to make it do
> so.
>
> Thanks,
>
> 	tglx
>
> 8<----------------------
>
>  tsc.c |   59 ++++++++++++++++++++++++++++++++++++++++++++---------------
>  1 file changed, 44 insertions(+), 15 deletions(-)
>
> --- a/arch/x86/kernel/tsc.c
> +++ b/arch/x86/kernel/tsc.c
> @@ -39,6 +39,9 @@ EXPORT_SYMBOL(tsc_khz);
>  static int __read_mostly tsc_unstable;
>
>  static DEFINE_STATIC_KEY_FALSE(__use_tsc);
> +static DEFINE_STATIC_KEY_TRUE(tsc_early_enabled);

One potential problem may be the several static keys used here
("__use_tsc", "__sched_clock_stable"): they may not be usable very early
in the boot phase, as static_branch_enable() uses paging-related code
while paging is not set up yet. Some details in
https://lkml.org/lkml/2018/6/13/92
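To illustrate the constraint with a minimal kernel-style sketch (the
setup function names below are placeholders, not code from either
patch): a plain bool can be flipped with an ordinary store at any
point, while a static key must be declared with its boot-time value
and may only be switched once code patching is safe:

/* The key is born "true" at compile time, so nothing needs to be
 * enabled while the kernel is still too young for code patching. */
static DEFINE_STATIC_KEY_TRUE(tsc_early_enabled);
static bool tsc_early_sched_clock;

static void __init very_early_setup(void)	/* paging not ready yet */
{
	/* A plain store: always safe, even this early. */
	tsc_early_sched_clock = true;

	/* static_branch_enable()/disable() here would invoke the
	 * jump-label patching machinery, which needs paging. */
}

static void __init late_setup(void)		/* paging is up */
{
	/* Safe now: the early branch in native_sched_clock() is
	 * patched out for the rest of the system's lifetime. */
	static_branch_disable(&tsc_early_enabled);
}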
Thanks,
Feng

> +
> +static bool tsc_early_sched_clock;
>
>  int tsc_clocksource_reliable;
>
> @@ -133,18 +136,12 @@ static inline unsigned long long cycles_
>  	return ns;
>  }
>
> -static void set_cyc2ns_scale(unsigned long khz, int cpu, unsigned long long tsc_now)
> +static void __set_cyc2ns_scale(unsigned long khz, int cpu,
> +			       unsigned long long tsc_now)
>  {
>  	unsigned long long ns_now;
>  	struct cyc2ns_data data;
>  	struct cyc2ns *c2n;
> -	unsigned long flags;
> -
> -	local_irq_save(flags);
> -	sched_clock_idle_sleep_event();
> -
> -	if (!khz)
> -		goto done;
>
>  	ns_now = cycles_2_ns(tsc_now);
>
> @@ -176,22 +173,46 @@ static void set_cyc2ns_scale(unsigned lo
>  	c2n->data[0] = data;
>  	raw_write_seqcount_latch(&c2n->seq);
>  	c2n->data[1] = data;
> +}
> +
> +static void set_cyc2ns_scale(unsigned long khz, int cpu,
> +			     unsigned long long tsc_now)
> +{
> +	unsigned long flags;
> +
> +	local_irq_save(flags);
> +	sched_clock_idle_sleep_event();
> +
> +	if (khz)
> +		__set_cyc2ns_scale(khz, cpu, tsc_now);
>
> -done:
>  	sched_clock_idle_wakeup_event();
>  	local_irq_restore(flags);
>  }
>
> +static void __init sched_clock_early_init(unsigned int khz)
> +{
> +	cyc2ns_init(smp_processor_id());
> +	__set_cyc2ns_scale(khz, smp_processor_id(), rdtsc());
> +	tsc_early_sched_clock = true;
> +}
> +
> +static void __init sched_clock_early_exit(void)
> +{
> +	static_branch_disable(&tsc_early_enabled);
> +}
> +
>  /*
>   * Scheduler clock - returns current time in nanosec units.
>   */
>  u64 native_sched_clock(void)
>  {
> -	if (static_branch_likely(&__use_tsc)) {
> -		u64 tsc_now = rdtsc();
> +	if (static_branch_likely(&__use_tsc))
> +		return cycles_2_ns(rdtsc());
>
> -		/* return the value in ns */
> -		return cycles_2_ns(tsc_now);
> +	if (static_branch_unlikely(&tsc_early_enabled)) {
> +		if (tsc_early_sched_clock)
> +			return cycles_2_ns(rdtsc());
>  	}
>
>  	/*
> @@ -1332,9 +1353,10 @@ void __init tsc_early_delay_calibrate(vo
>  	lpj = tsc_khz * 1000;
>  	do_div(lpj, HZ);
>  	loops_per_jiffy = lpj;
> +	sched_clock_early_init(tsc_khz);
>  }
>
> -void __init tsc_init(void)
> +static void __init __tsc_init(void)
>  {
>  	u64 lpj, cyc;
>  	int cpu;
> @@ -1384,7 +1406,8 @@ void __init tsc_init(void)
>  	 */
>  	cyc = rdtsc();
>  	for_each_possible_cpu(cpu) {
> -		cyc2ns_init(cpu);
> +		if (!tsc_early_sched_clock || cpu != smp_processor_id())
> +			cyc2ns_init(cpu);
>  		set_cyc2ns_scale(tsc_khz, cpu, cyc);
>  	}
>
> @@ -1411,6 +1434,12 @@ void __init tsc_init(void)
>  	detect_art();
>  }
>
> +void __init tsc_init(void)
> +{
> +	__tsc_init();
> +	sched_clock_early_exit();
> +}
> +
>  #ifdef CONFIG_SMP
>  /*
>   * If we have a constant TSC and are using the TSC for the delay loop,
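P.S. As a footnote on the arithmetic both patches share: the cyc2ns
conversion is a fixed-point multiply, ns = (cycles * mult) >> shift,
with mult/shift chosen so that "khz" cycles map to NSEC_PER_MSEC ns.
It can be checked with a small stand-alone user-space sketch; the
frequency is illustrative and the shift is hand-picked here, not what
clocks_calc_mult_shift() would compute:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t khz = 2900000ULL;   /* illustrative: 2.9 GHz TSC, in kHz */
	uint32_t shift = 10;         /* hand-picked; the kernel derives it */

	/* khz cycles elapse per millisecond, i.e. per 10^6 ns */
	uint32_t mult = (uint32_t)((1000000ULL << shift) / khz);

	/* One second worth of cycles; the kernel uses mul_u64_u32_shr()
	 * so the 96-bit intermediate product cannot overflow. */
	uint64_t cycles = 2900000000ULL;
	uint64_t ns = (cycles * mult) >> shift;

	printf("mult=%u shift=%u -> %llu ns (expect ~1000000000)\n",
	       (unsigned)mult, (unsigned)shift, (unsigned long long)ns);
	return 0;
}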