On Tue, Apr 14, 2015 at 09:08:58PM -0000, Thomas Gleixner wrote:
> The evaluation of the next timer in the nohz code is based on jiffies
> while all the tick internals are nano seconds based. We have also to
> convert hrtimer nanoseconds to jiffies in the !highres case. That's
> just wrong and introduces interesting corner cases.
> 
> Turn it around and convert the next timer wheel timer expiry and the
> rcu event to clock monotonic and base all calculations on
> nanoseconds. That identifies the case where no timer is pending
> clearly with an absolute expiry value of KTIME_MAX.
> 
> Makes the code more readable and gets rid of the jiffies magic in the
> nohz code.
> 
> Signed-off-by: Thomas Gleixner <[email protected]>
> Cc: "Paul E. McKenney" <[email protected]>
> Cc: Josh Triplett <[email protected]>
> Cc: Lai Jiangshan <[email protected]>
> Cc: John Stultz <[email protected]>
> Cc: Preeti U Murthy <[email protected]>
> Cc: Marcelo Tosatti <[email protected]>
> Cc: Frederic Weisbecker <[email protected]>

Good stuff!

> ---
>  include/linux/hrtimer.h     |    2 
>  include/linux/rcupdate.h    |    6 +-
>  include/linux/rcutree.h     |    2 
>  include/linux/timer.h       |    7 --
>  kernel/rcu/tree_plugin.h    |   14 +++--

I guess that I had better take a look.  ;-)

Short summary:  No problems, looks like you found all of the
rcu_needs_cpu() functions.   (Yes, there are more of them than I would
have believed possible!)

>  kernel/time/hrtimer.c       |   14 ++---
>  kernel/time/tick-internal.h |    2 
>  kernel/time/tick-sched.c    |  109 
> +++++++++++++++++++-------------------------

I got a build error with CONFIG_NO_HZ_FULL on this one, looks like a
stray time_delta that needs to change to delta.  Builds OK with that
change, and one of the three scenarios runs fine as well (the other two
are running now).  Non-CONFIG_NO_HZ_FULL kernels pass a (very short)
five-minute rcutorture test.  (But note that rcutorture doesn't use
hrtimers directly, changing which is on my list.)

                                                        Thanx, Paul

------------------------------------------------------------------------

diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 3af15c3..753c211 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -653,7 +653,7 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched 
*ts,
 #ifdef CONFIG_NO_HZ_FULL
        /* Limit the tick delta to the maximum scheduler deferment */
        if (!ts->inidle)
-               delta = min(time_delta, scheduler_tick_max_deferment());
+               delta = min(delta, scheduler_tick_max_deferment());
 #endif
 
        /* Calculate the next expiry time */

------------------------------------------------------------------------

>  kernel/time/tick-sched.h    |    2 
>  kernel/time/timer.c         |   71 +++++++++++++---------------
>  kernel/time/timer_list.c    |    4 -
>  11 files changed, 107 insertions(+), 126 deletions(-)
> 
> Index: tip/include/linux/hrtimer.h
> ===================================================================
> --- tip.orig/include/linux/hrtimer.h
> +++ tip/include/linux/hrtimer.h
> @@ -386,7 +386,7 @@ static inline int hrtimer_restart(struct
>  /* Query timers: */
>  extern ktime_t hrtimer_get_remaining(const struct hrtimer *timer);
> 
> -extern ktime_t hrtimer_get_next_event(void);
> +extern u64 hrtimer_get_next_event(void);
> 
>  /*
>   * A timer is active, when it is enqueued into the rbtree or the
> Index: tip/include/linux/rcupdate.h
> ===================================================================
> --- tip.orig/include/linux/rcupdate.h
> +++ tip/include/linux/rcupdate.h
> @@ -44,6 +44,8 @@
>  #include <linux/debugobjects.h>
>  #include <linux/bug.h>
>  #include <linux/compiler.h>
> +#include <linux/ktime.h>
> +
>  #include <asm/barrier.h>
> 
>  extern int rcu_expedited; /* for sysctl */
> @@ -1122,9 +1124,9 @@ static inline notrace void rcu_read_unlo
>       __kfree_rcu(&((ptr)->rcu_head), offsetof(typeof(*(ptr)), rcu_head))
> 
>  #if defined(CONFIG_TINY_RCU) || defined(CONFIG_RCU_NOCB_CPU_ALL)
> -static inline int rcu_needs_cpu(unsigned long *delta_jiffies)
> +static inline int rcu_needs_cpu(u64 basemono, u64 *nextevt)
>  {
> -     *delta_jiffies = ULONG_MAX;
> +     *nextevt = KTIME_MAX;
>       return 0;
>  }
>  #endif /* #if defined(CONFIG_TINY_RCU) || defined(CONFIG_RCU_NOCB_CPU_ALL) */
> Index: tip/include/linux/rcutree.h
> ===================================================================
> --- tip.orig/include/linux/rcutree.h
> +++ tip/include/linux/rcutree.h
> @@ -32,7 +32,7 @@
> 
>  void rcu_note_context_switch(void);
>  #ifndef CONFIG_RCU_NOCB_CPU_ALL
> -int rcu_needs_cpu(unsigned long *delta_jiffies);
> +int rcu_needs_cpu(u64 basem, u64 *nextevt);
>  #endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */
>  void rcu_cpu_stall_reset(void);
> 
> Index: tip/include/linux/timer.h
> ===================================================================
> --- tip.orig/include/linux/timer.h
> +++ tip/include/linux/timer.h
> @@ -188,13 +188,6 @@ extern void set_timer_slack(struct timer
>  #define NEXT_TIMER_MAX_DELTA ((1UL << 30) - 1)
> 
>  /*
> - * Return when the next timer-wheel timeout occurs (in absolute jiffies),
> - * locks the timer base and does the comparison against the given
> - * jiffie.
> - */
> -extern unsigned long get_next_timer_interrupt(unsigned long now);
> -
> -/*
>   * Timer-statistics info:
>   */
>  #ifdef CONFIG_TIMER_STATS
> Index: tip/kernel/rcu/tree_plugin.h
> ===================================================================
> --- tip.orig/kernel/rcu/tree_plugin.h
> +++ tip/kernel/rcu/tree_plugin.h
> @@ -1372,9 +1372,9 @@ static void rcu_prepare_kthreads(int cpu
>   * any flavor of RCU.
>   */
>  #ifndef CONFIG_RCU_NOCB_CPU_ALL
> -int rcu_needs_cpu(unsigned long *delta_jiffies)
> +int rcu_needs_cpu(u64 basemono, u64 *nextevt)
>  {
> -     *delta_jiffies = ULONG_MAX;
> +     *nextevt = KTIME_MAX;

I was going to ask about basemono, but I see it in the next hunk.

>       return rcu_cpu_has_callbacks(NULL);
>  }
>  #endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */
> @@ -1485,16 +1485,17 @@ static bool __maybe_unused rcu_try_advan
>   * The caller must have disabled interrupts.
>   */
>  #ifndef CONFIG_RCU_NOCB_CPU_ALL
> -int rcu_needs_cpu(unsigned long *dj)
> +int rcu_needs_cpu(u64 basemono, u64 *nextevt)
>  {
>       struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
> +     unsigned long dj;
> 
>       /* Snapshot to detect later posting of non-lazy callback. */
>       rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted;
> 
>       /* If no callbacks, RCU doesn't need the CPU. */
>       if (!rcu_cpu_has_callbacks(&rdtp->all_lazy)) {
> -             *dj = ULONG_MAX;
> +             *nextevt = KTIME_MAX;
>               return 0;
>       }
> 
> @@ -1508,11 +1509,12 @@ int rcu_needs_cpu(unsigned long *dj)
> 
>       /* Request timer delay depending on laziness, and round. */
>       if (!rdtp->all_lazy) {
> -             *dj = round_up(rcu_idle_gp_delay + jiffies,
> +             dj = round_up(rcu_idle_gp_delay + jiffies,
>                              rcu_idle_gp_delay) - jiffies;
>       } else {
> -             *dj = round_jiffies(rcu_idle_lazy_gp_delay + jiffies) - jiffies;
> +             dj = round_jiffies(rcu_idle_lazy_gp_delay + jiffies) - jiffies;
>       }
> +     *nextevt = basemono + dj * TICK_NSEC;

The multiply would have been a problem back in the day, but should
be just fine on modern hardware.  I suppose that slow hardware could
compensate by having the scheduling-clock period be an exact power of
two worth of nanoseconds.

>       return 0;
>  }
>  #endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */
> Index: tip/kernel/time/hrtimer.c
> ===================================================================
> --- tip.orig/kernel/time/hrtimer.c
> +++ tip/kernel/time/hrtimer.c
> @@ -1072,26 +1072,22 @@ EXPORT_SYMBOL_GPL(hrtimer_get_remaining)
>  /**
>   * hrtimer_get_next_event - get the time until next expiry event
>   *
> - * Returns the delta to the next expiry event or KTIME_MAX if no timer
> - * is pending.
> + * Returns the next expiry time or KTIME_MAX if no timer is pending.
>   */
> -ktime_t hrtimer_get_next_event(void)
> +u64 hrtimer_get_next_event(void)
>  {
>       struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
> -     ktime_t mindelta = { .tv64 = KTIME_MAX };
> +     u64 expires = KTIME_MAX;
>       unsigned long flags;
> 
>       raw_spin_lock_irqsave(&cpu_base->lock, flags);
> 
>       if (!__hrtimer_hres_active(cpu_base))
> -             mindelta = ktime_sub(__hrtimer_get_next_event(cpu_base),
> -                                  ktime_get());
> +             expires = __hrtimer_get_next_event(cpu_base).tv64;
> 
>       raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
> 
> -     if (mindelta.tv64 < 0)
> -             mindelta.tv64 = 0;
> -     return mindelta;
> +     return expires;
>  }
>  #endif
> 
> Index: tip/kernel/time/tick-internal.h
> ===================================================================
> --- tip.orig/kernel/time/tick-internal.h
> +++ tip/kernel/time/tick-internal.h
> @@ -137,3 +137,5 @@ extern void tick_nohz_init(void);
>  # else
>  static inline void tick_nohz_init(void) { }
>  #endif
> +
> +extern u64 get_next_timer_interrupt(unsigned long basej, u64 basem);
> Index: tip/kernel/time/tick-sched.c
> ===================================================================
> --- tip.orig/kernel/time/tick-sched.c
> +++ tip/kernel/time/tick-sched.c
> @@ -582,39 +582,46 @@ static void tick_nohz_restart(struct tic
>  static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
>                                        ktime_t now, int cpu)
>  {
> -     unsigned long seq, last_jiffies, next_jiffies, delta_jiffies;
> -     ktime_t last_update, expires, ret = { .tv64 = 0 };
> -     unsigned long rcu_delta_jiffies;
>       struct clock_event_device *dev = 
> __this_cpu_read(tick_cpu_device.evtdev);
> -     u64 time_delta;
> -
> -     time_delta = timekeeping_max_deferment();
> +     u64 basemono, next_tick, next_tmr, next_rcu, delta, expires;
> +     unsigned long seq, basejiff;
> +     ktime_t tick;
> 
>       /* Read jiffies and the time when jiffies were updated last */
>       do {
>               seq = read_seqbegin(&jiffies_lock);
> -             last_update = last_jiffies_update;
> -             last_jiffies = jiffies;
> +             basemono = last_jiffies_update.tv64;
> +             basejiff = jiffies;
>       } while (read_seqretry(&jiffies_lock, seq));
> +     ts->last_jiffies = basejiff;
> 
> -     if (rcu_needs_cpu(&rcu_delta_jiffies) ||
> +     if (rcu_needs_cpu(basemono, &next_rcu) ||
>           arch_needs_cpu() || irq_work_needs_cpu()) {
> -             next_jiffies = last_jiffies + 1;
> -             delta_jiffies = 1;
> +             next_tick = basemono + TICK_NSEC;
>       } else {
> -             /* Get the next timer wheel timer */
> -             next_jiffies = get_next_timer_interrupt(last_jiffies);
> -             delta_jiffies = next_jiffies - last_jiffies;
> -             if (rcu_delta_jiffies < delta_jiffies) {
> -                     next_jiffies = last_jiffies + rcu_delta_jiffies;
> -                     delta_jiffies = rcu_delta_jiffies;
> -             }
> +             /*
> +              * Get the next pending timer. If high resolution
> +              * timers are enabled this only takes the timer wheel
> +              * timers into account. If high resolution timers are
> +              * disabled this also looks at the next expiring
> +              * hrtimer.
> +              */
> +             next_tmr = get_next_timer_interrupt(basejiff, basemono);
> +             ts->next_timer = next_tmr;
> +             /* Take the next rcu event into account */
> +             next_tick = next_rcu < next_tmr ? next_rcu : next_tmr;
>       }
> 
> -     if ((long)delta_jiffies <= 1) {
> +     /*
> +      * If the tick is due in the next period, keep it ticking or
> +      * restart it proper.
> +      */
> +     delta = next_tick - basemono;
> +     if (delta <= (u64)TICK_NSEC) {
> +             tick.tv64 = 0;
>               if (!ts->tick_stopped)
>                       goto out;
> -             if (delta_jiffies == 0) {
> +             if (delta == 0) {
>                       /* Tick is stopped, but required now. Enforce it */
>                       tick_nohz_restart(ts, now);
>                       goto out;
> @@ -629,54 +636,39 @@ static ktime_t tick_nohz_stop_sched_tick
>        * do_timer() never invoked. Keep track of the fact that it
>        * was the one which had the do_timer() duty last. If this cpu
>        * is the one which had the do_timer() duty last, we limit the
> -      * sleep time to the timekeeping max_deferement value which we
> -      * retrieved above. Otherwise we can sleep as long as we want.
> +      * sleep time to the timekeeping max_deferement value.
> +      * Otherwise we can sleep as long as we want.
>        */
> +     delta = timekeeping_max_deferment();
>       if (cpu == tick_do_timer_cpu) {
>               tick_do_timer_cpu = TICK_DO_TIMER_NONE;
>               ts->do_timer_last = 1;
>       } else if (tick_do_timer_cpu != TICK_DO_TIMER_NONE) {
> -             time_delta = KTIME_MAX;
> +             delta = KTIME_MAX;
>               ts->do_timer_last = 0;
>       } else if (!ts->do_timer_last) {
> -             time_delta = KTIME_MAX;
> +             delta = KTIME_MAX;
>       }
> 
>  #ifdef CONFIG_NO_HZ_FULL
> +     /* Limit the tick delta to the maximum scheduler deferment */
>       if (!ts->inidle)
> -             time_delta = min(time_delta, scheduler_tick_max_deferment());
> +             delta = min(time_delta, scheduler_tick_max_deferment());

s/time_delta/delta/?

>  #endif
> 
> -     /*
> -      * calculate the expiry time for the next timer wheel
> -      * timer. delta_jiffies >= NEXT_TIMER_MAX_DELTA signals that
> -      * there is no timer pending or at least extremely far into
> -      * the future (12 days for HZ=1000). In this case we set the
> -      * expiry to the end of time.
> -      */
> -     if (likely(delta_jiffies < NEXT_TIMER_MAX_DELTA)) {
> -             /*
> -              * Calculate the time delta for the next timer event.
> -              * If the time delta exceeds the maximum time delta
> -              * permitted by the current clocksource then adjust
> -              * the time delta accordingly to ensure the
> -              * clocksource does not wrap.
> -              */
> -             time_delta = min_t(u64, time_delta,
> -                                tick_period.tv64 * delta_jiffies);
> -     }
> -
> -     if (time_delta < KTIME_MAX)
> -             expires = ktime_add_ns(last_update, time_delta);
> +     /* Calculate the next expiry time */
> +     if (delta < (KTIME_MAX - basemono))
> +             expires = basemono + delta;
>       else
> -             expires.tv64 = KTIME_MAX;
> +             expires = KTIME_MAX;
> +
> +     expires = min_t(u64, expires, next_tick);
> +     tick.tv64 = expires;
> 
>       /* Skip reprogram of event if its not changed */
> -     if (ts->tick_stopped && ktime_equal(expires, dev->next_event))
> +     if (ts->tick_stopped && (expires == dev->next_event.tv64))
>               goto out;
> 
> -     ret = expires;
> -
>       /*
>        * nohz_stop_sched_tick can be called several times before
>        * the nohz_restart_sched_tick is called. This happens when
> @@ -694,26 +686,23 @@ static ktime_t tick_nohz_stop_sched_tick
>       }
> 
>       /*
> -      * If the expiration time == KTIME_MAX, then
> -      * in this case we simply stop the tick timer.
> +      * If the expiration time == KTIME_MAX, then we simply stop
> +      * the tick timer.
>        */
> -     if (unlikely(expires.tv64 == KTIME_MAX)) {
> +     if (unlikely(expires == KTIME_MAX)) {
>               if (ts->nohz_mode == NOHZ_MODE_HIGHRES)
>                       hrtimer_cancel(&ts->sched_timer);
>               goto out;
>       }
> 
>       if (ts->nohz_mode == NOHZ_MODE_HIGHRES)
> -             hrtimer_start(&ts->sched_timer, expires,
> -                           HRTIMER_MODE_ABS_PINNED);
> +             hrtimer_start(&ts->sched_timer, tick, HRTIMER_MODE_ABS_PINNED);
>       else
> -             tick_program_event(expires, 1);
> +             tick_program_event(tick, 1);
>  out:
> -     ts->next_jiffies = next_jiffies;
> -     ts->last_jiffies = last_jiffies;
> +     /* Update the estimated sleep length */
>       ts->sleep_length = ktime_sub(dev->next_event, now);
> -
> -     return ret;
> +     return tick;
>  }
> 
>  static void tick_nohz_full_stop_tick(struct tick_sched *ts)
> Index: tip/kernel/time/tick-sched.h
> ===================================================================
> --- tip.orig/kernel/time/tick-sched.h
> +++ tip/kernel/time/tick-sched.h
> @@ -57,7 +57,7 @@ struct tick_sched {
>       ktime_t                         iowait_sleeptime;
>       ktime_t                         sleep_length;
>       unsigned long                   last_jiffies;
> -     unsigned long                   next_jiffies;
> +     u64                             next_timer;
>       ktime_t                         idle_expires;
>       int                             do_timer_last;
>  };
> Index: tip/kernel/time/timer.c
> ===================================================================
> --- tip.orig/kernel/time/timer.c
> +++ tip/kernel/time/timer.c
> @@ -49,6 +49,8 @@
>  #include <asm/timex.h>
>  #include <asm/io.h>
> 
> +#include "tick-internal.h"
> +
>  #define CREATE_TRACE_POINTS
>  #include <trace/events/timer.h>
> 
> @@ -1311,54 +1313,48 @@ cascade:
>   * Check, if the next hrtimer event is before the next timer wheel
>   * event:
>   */
> -static unsigned long cmp_next_hrtimer_event(unsigned long now,
> -                                         unsigned long expires)
> +static u64 cmp_next_hrtimer_event(u64 basem, u64 expires)
>  {
> -     ktime_t hr_delta = hrtimer_get_next_event();
> -     struct timespec tsdelta;
> -     unsigned long delta;
> -
> -     if (hr_delta.tv64 == KTIME_MAX)
> -             return expires;
> +     u64 nextevt = hrtimer_get_next_event();
> 
>       /*
> -      * Expired timer available, let it expire in the next tick
> +      * If high resolution timers are enabled
> +      * hrtimer_get_next_event() returns KTIME_MAX.
>        */
> -     if (hr_delta.tv64 <= 0)
> -             return now + 1;
> -
> -     tsdelta = ktime_to_timespec(hr_delta);
> -     delta = timespec_to_jiffies(&tsdelta);
> +     if (expires <= nextevt)
> +             return expires;
> 
>       /*
> -      * Limit the delta to the max value, which is checked in
> -      * tick_nohz_stop_sched_tick():
> +      * If the next timer is already expired, return the tick base
> +      * time so the tick is fired immediately.
>        */
> -     if (delta > NEXT_TIMER_MAX_DELTA)
> -             delta = NEXT_TIMER_MAX_DELTA;
> +     if (nextevt <= basem)
> +             return basem;
> 
>       /*
> -      * Take rounding errors in to account and make sure, that it
> -      * expires in the next tick. Otherwise we go into an endless
> -      * ping pong due to tick_nohz_stop_sched_tick() retriggering
> -      * the timer softirq
> +      * Round up to the next jiffie. High resolution timers are
> +      * off, so the hrtimers are expired in the tick and we need to
> +      * make sure that this tick really expires the timer to avoid
> +      * a ping pong of the nohz stop code.
> +      *
> +      * Use DIV_ROUND_UP_ULL to prevent gcc calling __divdi3
>        */
> -     if (delta < 1)
> -             delta = 1;
> -     now += delta;
> -     if (time_before(now, expires))
> -             return now;
> -     return expires;
> +     return DIV_ROUND_UP_ULL(nextevt, TICK_NSEC) * TICK_NSEC;
>  }
> 
>  /**
> - * get_next_timer_interrupt - return the jiffy of the next pending timer
> - * @now: current time (in jiffies)
> + * get_next_timer_interrupt - return the time (clock mono) of the next timer
> + * @basej:   base time jiffies
> + * @basem:   base time clock monotonic
> + *
> + * Returns the tick aligned clock monotonic time of the next pending
> + * timer or KTIME_MAX if no timer is pending.
>   */
> -unsigned long get_next_timer_interrupt(unsigned long now)
> +u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
>  {
>       struct tvec_base *base = __this_cpu_read(tvec_bases);
> -     unsigned long expires = now + NEXT_TIMER_MAX_DELTA;
> +     u64 expires = KTIME_MAX;
> +     unsigned long nextevt;
> 
>       /*
>        * Pretend that there is no timer pending if the cpu is offline.
> @@ -1371,14 +1367,15 @@ unsigned long get_next_timer_interrupt(u
>       if (base->active_timers) {
>               if (time_before_eq(base->next_timer, base->timer_jiffies))
>                       base->next_timer = __next_timer_interrupt(base);
> -             expires = base->next_timer;
> +             nextevt = base->next_timer;
> +             if (time_before_eq(nextevt, basej))
> +                     expires = basem;
> +             else
> +                     expires = basem + (nextevt - basej) * TICK_NSEC;
>       }
>       spin_unlock(&base->lock);
> 
> -     if (time_before_eq(expires, now))
> -             return now;
> -
> -     return cmp_next_hrtimer_event(now, expires);
> +     return cmp_next_hrtimer_event(basem, expires);
>  }
>  #endif
> 
> Index: tip/kernel/time/timer_list.c
> ===================================================================
> --- tip.orig/kernel/time/timer_list.c
> +++ tip/kernel/time/timer_list.c
> @@ -184,7 +184,7 @@ static void print_cpu(struct seq_file *m
>               P_ns(idle_sleeptime);
>               P_ns(iowait_sleeptime);
>               P(last_jiffies);
> -             P(next_jiffies);
> +             P(next_timer);
>               P_ns(idle_expires);
>               SEQ_printf(m, "jiffies: %Lu\n",
>                          (unsigned long long)jiffies);
> @@ -282,7 +282,7 @@ static void timer_list_show_tickdevices_
> 
>  static inline void timer_list_header(struct seq_file *m, u64 now)
>  {
> -     SEQ_printf(m, "Timer List Version: v0.7\n");
> +     SEQ_printf(m, "Timer List Version: v0.8\n");
>       SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES);
>       SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now);
>       SEQ_printf(m, "\n");
> 
> 

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [email protected]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Reply via email to