Jason,

On Thu, 21 Feb 2013, Jason Liu wrote:
> 2013/2/21 Thomas Gleixner <t...@linutronix.de>:
> > Now your explanation makes sense.
> >
> > I have no fast solution for this, but I think that I have an idea how
> > to fix it. Stay tuned.
> 
> Thanks Thomas, wait for your fix. :)

find below a completely untested patch, which should address that issue.

Sigh. Stopping the cpu local timer in deep idle is known to be an
idiotic design decision for 10+ years. I'll never understand why ARM
vendors insist on implementing the same stupidity over and over.

Thanks,

        tglx

Index: linux-2.6/kernel/time/tick-broadcast.c
===================================================================
--- linux-2.6.orig/kernel/time/tick-broadcast.c
+++ linux-2.6/kernel/time/tick-broadcast.c
@@ -397,6 +397,8 @@ int tick_resume_broadcast(void)
 
 /* FIXME: use cpumask_var_t. */
 static DECLARE_BITMAP(tick_broadcast_oneshot_mask, NR_CPUS);
+static DECLARE_BITMAP(tick_broadcast_pending, NR_CPUS);
+static DECLARE_BITMAP(tick_force_broadcast_mask, NR_CPUS);
 
 /*
  * Exposed for debugging: see timer_list.c
@@ -453,12 +455,24 @@ again:
        /* Find all expired events */
        for_each_cpu(cpu, tick_get_broadcast_oneshot_mask()) {
                td = &per_cpu(tick_cpu_device, cpu);
-               if (td->evtdev->next_event.tv64 <= now.tv64)
+               if (td->evtdev->next_event.tv64 <= now.tv64) {
                        cpumask_set_cpu(cpu, to_cpumask(tmpmask));
-               else if (td->evtdev->next_event.tv64 < next_event.tv64)
+                       /*
+                        * Mark the remote cpu in the pending mask, so
+                        * it can avoid reprogramming the cpu local
+                        * timer in tick_broadcast_oneshot_control().
+                        */
+                       set_bit(cpu, tick_broadcast_pending);
+               } else if (td->evtdev->next_event.tv64 < next_event.tv64)
                        next_event.tv64 = td->evtdev->next_event.tv64;
        }
 
+       /* Take care of enforced broadcast requests */
+       for_each_cpu(cpu, to_cpumask(tick_force_broadcast_mask)) {
+               set_bit(cpu, tmpmask);
+               clear_bit(cpu, tick_force_broadcast_mask);
+       }
+
        /*
         * Wakeup the cpus which have an expired event.
         */
@@ -494,6 +508,7 @@ void tick_broadcast_oneshot_control(unsi
        struct clock_event_device *bc, *dev;
        struct tick_device *td;
        unsigned long flags;
+       ktime_t now;
        int cpu;
 
        /*
@@ -518,6 +533,8 @@ void tick_broadcast_oneshot_control(unsi
 
        raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
        if (reason == CLOCK_EVT_NOTIFY_BROADCAST_ENTER) {
+               WARN_ON_ONCE(test_bit(cpu, tick_broadcast_pending));
+               WARN_ON_ONCE(test_bit(cpu, tick_force_broadcast_mask));
                if (!cpumask_test_cpu(cpu, tick_get_broadcast_oneshot_mask())) {
                        cpumask_set_cpu(cpu, tick_get_broadcast_oneshot_mask());
                        clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN);
@@ -529,10 +546,63 @@ void tick_broadcast_oneshot_control(unsi
                        cpumask_clear_cpu(cpu,
                                          tick_get_broadcast_oneshot_mask());
                        clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT);
-                       if (dev->next_event.tv64 != KTIME_MAX)
-                               tick_program_event(dev->next_event, 1);
+                       if (dev->next_event.tv64 == KTIME_MAX)
+                               goto out;
+                       /*
+                        * The cpu handling the broadcast timer marked
+                        * this cpu in the broadcast pending mask and
+                        * fired the broadcast IPI. So we are going to
+                        * handle the expired event anyway via the
+                        * broadcast IPI handler. No need to reprogram
+                        * the timer with an already expired event.
+                        */
+                       if (test_and_clear_bit(cpu, tick_broadcast_pending))
+                               goto out;
+                       /*
+                        * If the pending bit is not set, then we are
+                        * either the CPU handling the broadcast
+                        * interrupt or we got woken by something else.
+                        *
+                        * We are not longer in the broadcast mask, so
+                        * if the cpu local expiry time is already
+                        * reached, we would reprogram the cpu local
+                        * timer with an already expired event.
+                        *
+                        * This can lead to a ping-pong when we return
+                        * to idle and therefor rearm the broadcast
+                        * timer before the cpu local timer was able
+                        * to fire. This happens because the forced
+                        * reprogramming makes sure that the event
+                        * will happen in the future and depending on
+                        * the min_delta setting this might be far
+                        * enough out that the ping-pong starts.
+                        *
+                        * If the cpu local next_event has expired
+                        * then we know that the broadcast timer
+                        * next_event has expired as well and
+                        * broadcast is about to be handled. So we
+                        * avoid reprogramming and enforce that the
+                        * broadcast handler, which did not run yet,
+                        * will invoke the cpu local handler.
+                        *
+                        * We cannot call the handler directly from
+                        * here, because we might be in a NOHZ phase
+                        * and we did not go through the irq_enter()
+                        * nohz fixups.
+                        */
+                       now = ktime_get();
+                       if (dev->next_event.tv64 <= now.tv64) {
+                               set_bit(cpu, tick_force_broadcast_mask);
+                               goto out;
+                       }
+                       /*
+                        * We got woken by something else. Reprogram
+                        * the cpu local timer device.
+                        */
+                       tick_program_event(dev->next_event, 1);
                }
        }
+out:
        raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
 }
 
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Reply via email to