From: Thomas Gleixner <t...@linutronix.de>

Preeti reported a cpu down race with hrtimer based broadcasting:

Assume CPU1 is the CPU which holds the hrtimer broadcasting duty
before it is taken down.

CPU0                            CPU1
cpu_down()
                                takedown_cpu()
                                  disable_interrupts()
cpu_die()
  while (CPU1 != DEAD) {
    msleep(100);
      switch_to_idle()
        stop_cpu_timer()
          schedule_broadcast()
  }

tick_cleanup_dead_cpu()
     take_over_broadcast()      

After CPU1 has disabled interrupts it can no longer handle the
broadcast hrtimer, so CPU0 will be stuck forever.

Doing a "while (CPU1 != DEAD) msleep(100);" periodic poll is silly at
best, but we need to fix that nevertheless.

Split the tick cleanup into two pieces:

1) Shutdown and remove all per cpu clockevent devices from
   takedown_cpu()

   This is done carefully with respect to existing arch code which
   works around the shortcoming of the clockevents core code in
   interesting ways. We really want a separate callback for this to
   clean up the workarounds, but that is outside the scope of this patch.

2) Take over the broadcast duty explicitly before calling cpu_die()

   This is a temporary workaround as well. What we really want is a
   callback in the clockevent device which allows us to do that from
   the dying CPU by pushing the hrtimer onto a different cpu. That
   might involve an IPI and is definitely more complex than this
   immediate fix.

Reported-by: Preeti U Murthy <pre...@linux.vnet.ibm.com>
Signed-off-by: Thomas Gleixner <t...@linutronix.de>
---
 include/linux/tick.h         |    9 +++++----
 kernel/cpu.c                 |    6 +++---
 kernel/time/clockevents.c    |   30 ++++++++++++++++++------------
 kernel/time/tick-broadcast.c |   32 ++++++++++++++++++++++----------
 kernel/time/tick-common.c    |   34 ++++++++++++----------------------
 kernel/time/tick-internal.h  |    6 +++---
 6 files changed, 63 insertions(+), 54 deletions(-)

Index: linux/include/linux/tick.h
===================================================================
--- linux.orig/include/linux/tick.h
+++ linux/include/linux/tick.h
@@ -29,13 +29,12 @@ extern struct tick_device *tick_get_devi
 extern void __init tick_init(void);
 /* Should be core only, but XEN resume magic requires this */
 extern void tick_resume_local(void);
-extern void tick_handover_do_timer(void);
-extern void tick_cleanup_dead_cpu(int cpu);
+/* CPU hotplug */
+extern void tick_shutdown_local(void);
 #else /* CONFIG_GENERIC_CLOCKEVENTS */
 static inline void tick_init(void) { }
 static inline void tick_resume_local(void) { }
-static inline void tick_handover_do_timer(void) { }
-static inline void tick_cleanup_dead_cpu(int cpu) { }
+static inline void tick_shutdown_local(void) { }
 #endif /* !CONFIG_GENERIC_CLOCKEVENTS */
 
 #ifdef CONFIG_TICK_ONESHOT
@@ -66,8 +65,10 @@ static inline void tick_broadcast_contro
 
 #if defined(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) && 
defined(CONFIG_TICK_ONESHOT)
 extern int tick_broadcast_oneshot_control(enum tick_broadcast_state state);
+extern void tick_takeover(int deadcpu);
 #else
 static inline int tick_broadcast_oneshot_control(enum tick_broadcast_state 
state) { return 0; }
+static inline void tick_takeover(int deadcpu) { }
 #endif
 
 static inline void tick_broadcast_enable(void)
Index: linux/kernel/cpu.c
===================================================================
--- linux.orig/kernel/cpu.c
+++ linux/kernel/cpu.c
@@ -349,8 +349,8 @@ static int __ref take_cpu_down(void *_pa
                return err;
 
        cpu_notify(CPU_DYING | param->mod, param->hcpu);
-       /* Give up timekeeping duties */
-       tick_handover_do_timer();
+       /* Shutdown the per cpu tick */
+       tick_shutdown_local();
        /* Park the stopper thread */
        kthread_park(current);
        return 0;
@@ -428,7 +428,7 @@ static int __ref _cpu_down(unsigned int
        __cpu_die(cpu);
 
        /* CPU is completely dead: tell everyone.  Too late to complain. */
-       tick_cleanup_dead_cpu(cpu);
+       tick_takeover(cpu);
        cpu_notify_nofail(CPU_DEAD | mod, hcpu);
 
        check_for_tasks(cpu);
Index: linux/kernel/time/clockevents.c
===================================================================
--- linux.orig/kernel/time/clockevents.c
+++ linux/kernel/time/clockevents.c
@@ -541,26 +541,32 @@ void clockevents_resume(void)
 #endif
 
 #ifdef CONFIG_HOTPLUG_CPU
-/**
- * tick_cleanup_dead_cpu - Cleanup the tick and clockevents of a dead cpu
+/*
+ * Cleanup the clock events devices on the dying cpu. curdev is the
+ * current installed tick device on that cpu
  */
-void tick_cleanup_dead_cpu(int cpu)
+void clockevents_cleanup_dying_cpu(struct clock_event_device *curdev)
 {
        struct clock_event_device *dev, *tmp;
        unsigned long flags;
+       int cpu;
 
        raw_spin_lock_irqsave(&clockevents_lock, flags);
-
-       tick_shutdown(cpu);
-       /*
-        * Unregister the clock event devices which were
-        * released from the users in the notify chain.
-        */
-       list_for_each_entry_safe(dev, tmp, &clockevents_released, list)
-               list_del(&dev->list);
+       if (!curdev)
+               goto cleanup;
        /*
-        * Now check whether the CPU has left unused per cpu devices
+        * We cannot call the set mode function here at the moment
+        * because existing architecture cpu down code shuts down
+        * stuff already and we cannot interfere with that. So we just
+        * set the mode to unused for now.
         */
+       curdev->mode = CLOCK_EVT_MODE_UNUSED;
+       list_del(&curdev->list);
+       module_put(curdev->owner);
+
+cleanup:
+       /* Remove the unused percpu devices from the list */
+       cpu = smp_processor_id();
        list_for_each_entry_safe(dev, tmp, &clockevent_devices, list) {
                if (cpumask_test_cpu(cpu, dev->cpumask) &&
                    cpumask_weight(dev->cpumask) == 1 &&
Index: linux/kernel/time/tick-broadcast.c
===================================================================
--- linux.orig/kernel/time/tick-broadcast.c
+++ linux/kernel/time/tick-broadcast.c
@@ -421,15 +421,17 @@ void tick_set_periodic_handler(struct cl
 
 #ifdef CONFIG_HOTPLUG_CPU
 /*
- * Remove a CPU from broadcasting
+ * Remove a CPU from broadcasting. Called from the dying cpu.
  */
-void tick_shutdown_broadcast(unsigned int cpu)
+void tick_shutdown_broadcast(void)
 {
        struct clock_event_device *bc;
        unsigned long flags;
+       int cpu;
 
        raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
 
+       cpu = smp_processor_id();
        cpumask_clear_cpu(cpu, tick_broadcast_mask);
        cpumask_clear_cpu(cpu, tick_broadcast_on);
 
@@ -906,14 +908,26 @@ void tick_broadcast_switch_to_oneshot(vo
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
-static void broadcast_move_bc(int deadcpu)
+/*
+ * Called from the cpu hotplug code after a cpu is dead. This ensures
+ * that a hrtimer based broadcast device is taken over.
+ *
+ * FIXME: This should go away. We should replace this by a mechanism
+ * which pushes the hrtimer over to a different cpu from
+ * tick_shutdown_broadcast_oneshot()
+ */
+void tick_broadcast_takeover_bc(int deadcpu)
 {
-       struct clock_event_device *bc = tick_broadcast_device.evtdev;
+       struct clock_event_device *bc;
+       unsigned long flags;
 
-       if (!bc || !broadcast_needs_cpu(bc, deadcpu))
-               return;
-       /* This moves the broadcast assignment to this cpu */
-       clockevents_program_event(bc, bc->next_event, 1);
+       raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
+       bc = tick_broadcast_device.evtdev;
+       if (bc && broadcast_needs_cpu(bc, deadcpu)) {
+               /* This moves the broadcast assignment to this cpu */
+               clockevents_program_event(bc, bc->next_event, 1);
+       }
+       raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
 }
 
 /*
@@ -929,8 +943,6 @@ static void tick_shutdown_broadcast_ones
        cpumask_clear_cpu(cpu, tick_broadcast_oneshot_mask);
        cpumask_clear_cpu(cpu, tick_broadcast_pending_mask);
        cpumask_clear_cpu(cpu, tick_broadcast_force_mask);
-
-       broadcast_move_bc(cpu);
 }
 #endif
 
Index: linux/kernel/time/tick-common.c
===================================================================
--- linux.orig/kernel/time/tick-common.c
+++ linux/kernel/time/tick-common.c
@@ -336,10 +336,10 @@ out_bc:
 /*
  * Transfer the do_timer job away from a dying cpu.
  *
- * Called with interrupts disabled. Not locking required. If
- * tick_do_timer_cpu is owned by this cpu, nothing can change it.
+ * No locking required. If tick_do_timer_cpu is owned by this cpu,
+ * nothing can change it.
  */
-void tick_handover_do_timer(void)
+static void tick_handover_do_timer(void)
 {
        if (tick_do_timer_cpu == smp_processor_id()) {
                int cpu = cpumask_first(cpu_online_mask);
@@ -349,32 +349,22 @@ void tick_handover_do_timer(void)
        }
 }
 
-/*
- * Shutdown an event device on a given cpu:
+/**
+ * tick_shutdown_local - Shutdown the tick related functions on a cpu
  *
- * This is called on a life CPU, when a CPU is dead. So we cannot
- * access the hardware device itself.
- * We just set the mode and remove it from the lists.
+ * This is called from the dying cpu.
  */
-void tick_shutdown(unsigned int cpu)
+void tick_shutdown_local(void)
 {
-       struct tick_device *td = &per_cpu(tick_cpu_device, cpu);
-       struct clock_event_device *dev = td->evtdev;
+       struct tick_device *td = this_cpu_ptr(&tick_cpu_device);
 
        /* Remove the CPU from the broadcast machinery */
-       tick_shutdown_broadcast(cpu);
+       tick_shutdown_broadcast();
 
+       clockevents_cleanup_dying_cpu(td->evtdev);
        td->mode = TICKDEV_MODE_PERIODIC;
-       if (dev) {
-               /*
-                * Prevent that the clock events layer tries to call
-                * the set mode function!
-                */
-               dev->mode = CLOCK_EVT_MODE_UNUSED;
-               clockevents_exchange_device(dev, NULL);
-               dev->event_handler = clockevents_handle_noop;
-               td->evtdev = NULL;
-       }
+
+       tick_handover_do_timer();
 }
 #endif
 
Index: linux/kernel/time/tick-internal.h
===================================================================
--- linux.orig/kernel/time/tick-internal.h
+++ linux/kernel/time/tick-internal.h
@@ -20,7 +20,6 @@ extern int tick_do_timer_cpu __read_most
 extern void tick_setup_periodic(struct clock_event_device *dev, int broadcast);
 extern void tick_handle_periodic(struct clock_event_device *dev);
 extern void tick_check_new_device(struct clock_event_device *dev);
-extern void tick_shutdown(unsigned int cpu);
 extern void tick_suspend(void);
 extern void tick_resume(void);
 extern bool tick_check_replacement(struct clock_event_device *curdev,
@@ -38,6 +37,7 @@ extern void clockevents_shutdown(struct
 extern void clockevents_exchange_device(struct clock_event_device *old,
                                        struct clock_event_device *new);
 extern void clockevents_handle_noop(struct clock_event_device *dev);
+extern void clockevents_cleanup_dying_cpu(struct clock_event_device *dev);
 extern int __clockevents_update_freq(struct clock_event_device *dev, u32 freq);
 extern void clockevents_suspend(void);
 extern void clockevents_resume(void);
@@ -82,7 +82,7 @@ static inline int tick_check_oneshot_cha
 extern int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu);
 extern void tick_install_broadcast_device(struct clock_event_device *dev);
 extern int tick_is_broadcast_device(struct clock_event_device *dev);
-extern void tick_shutdown_broadcast(unsigned int cpu);
+extern void tick_shutdown_broadcast(void);
 extern void tick_suspend_broadcast(void);
 extern void tick_resume_broadcast(void);
 extern bool tick_resume_check_broadcast(void);
@@ -96,7 +96,7 @@ static inline void tick_install_broadcas
 static inline int tick_is_broadcast_device(struct clock_event_device *dev) { 
return 0; }
 static inline int tick_device_uses_broadcast(struct clock_event_device *dev, 
int cpu) { return 0; }
 static inline void tick_do_periodic_broadcast(struct clock_event_device *d) { }
-static inline void tick_shutdown_broadcast(unsigned int cpu) { }
+static inline void tick_shutdown_broadcast(void) { }
 static inline void tick_suspend_broadcast(void) { }
 static inline void tick_resume_broadcast(void) { }
 static inline bool tick_resume_check_broadcast(void) { return false; }


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Reply via email to