Currently an hrtimer callback function cannot free its own timer
because __run_hrtimer() still needs to clear HRTIMER_STATE_CALLBACK
after it. Freeing the timer would result in a clear use-after-free.

Solve this by using a scheme similar to regular timers; track the
current running timer in hrtimer_clock_base::running.

Suggested-by: Thomas Gleixner <t...@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <pet...@infradead.org>
---
 include/linux/hrtimer.h |   47 ++++++-----------
 kernel/time/hrtimer.c   |  130 ++++++++++++++++++++++++++++++++++++++----------
 2 files changed, 122 insertions(+), 55 deletions(-)

--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -53,34 +53,27 @@ enum hrtimer_restart {
  *
  * 0x00                inactive
  * 0x01                enqueued into rbtree
- * 0x02                callback function running
- * 0x04                timer is migrated to another cpu
+ * 0x02                timer is migrated to another cpu
  *
- * Special cases:
- * 0x03                callback function running and enqueued
- *             (was requeued on another CPU)
- * 0x05                timer was migrated on CPU hotunplug
+ * The callback state is not part of the timer->state because clearing it would
+ * mean touching the timer after the callback; this makes it impossible to free
+ * the timer from the callback function.
  *
- * The "callback function running and enqueued" status is only possible on
- * SMP. It happens for example when a posix timer expired and the callback
+ * Therefore we track the callback state in:
+ *
+ *     timer->base->cpu_base->running == timer
+ *
+ * On SMP it is possible to have a "callback function running and enqueued"
+ * status. It happens for example when a posix timer expired and the callback
  * queued a signal. Between dropping the lock which protects the posix timer
  * and reacquiring the base lock of the hrtimer, another CPU can deliver the
- * signal and rearm the timer. We have to preserve the callback running state,
- * as otherwise the timer could be removed before the softirq code finishes the
- * the handling of the timer.
- *
- * The HRTIMER_STATE_ENQUEUED bit is always or'ed to the current state
- * to preserve the HRTIMER_STATE_CALLBACK in the above scenario. This
- * also affects HRTIMER_STATE_MIGRATE where the preservation is not
- * necessary. HRTIMER_STATE_MIGRATE is cleared after the timer is
- * enqueued on the new cpu.
+ * signal and rearm the timer.
  *
  * All state transitions are protected by cpu_base->lock.
  */
 #define HRTIMER_STATE_INACTIVE 0x00
 #define HRTIMER_STATE_ENQUEUED 0x01
-#define HRTIMER_STATE_CALLBACK 0x02
-#define HRTIMER_STATE_MIGRATE  0x04
+#define HRTIMER_STATE_MIGRATE  0x02
 
 /**
  * struct hrtimer - the basic hrtimer structure
@@ -167,6 +160,8 @@ enum  hrtimer_base_type {
  * struct hrtimer_cpu_base - the per cpu clock bases
  * @lock:              lock protecting the base and associated clock bases
  *                     and timers
+ * @seq:               seqcount around __run_hrtimer
+ * @running:           pointer to the currently running hrtimer
  * @cpu:               cpu number
  * @active_bases:      Bitfield to mark bases with active timers
  * @clock_was_set_seq: Sequence counter of clock was set events
@@ -188,6 +183,8 @@ enum  hrtimer_base_type {
  */
 struct hrtimer_cpu_base {
        raw_spinlock_t                  lock;
+       seqcount_t                      seq;
+       struct hrtimer                  *running;
        unsigned int                    cpu;
        unsigned int                    active_bases;
        unsigned int                    clock_was_set_seq;
@@ -395,15 +392,7 @@ extern ktime_t hrtimer_get_remaining(con
 
 extern u64 hrtimer_get_next_event(void);
 
-/*
- * A timer is active, when it is enqueued into the rbtree or the
- * callback function is running or it's in the state of being migrated
- * to another cpu.
- */
-static inline int hrtimer_active(const struct hrtimer *timer)
-{
-       return timer->state != HRTIMER_STATE_INACTIVE;
-}
+extern bool hrtimer_active(const struct hrtimer *timer);
 
 /*
  * Helper function to check, whether the timer is on one of the queues
@@ -419,7 +408,7 @@ static inline int hrtimer_is_queued(stru
  */
 static inline int hrtimer_callback_running(struct hrtimer *timer)
 {
-       return timer->state & HRTIMER_STATE_CALLBACK;
+       return timer->base->cpu_base->running == timer;
 }
 
 /* Forward a hrtimer so it expires after now: */
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -67,6 +67,7 @@
 DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
 {
        .lock = __RAW_SPIN_LOCK_UNLOCKED(hrtimer_bases.lock),
+       .seq = SEQCNT_ZERO(hrtimer_bases.seq),
        .clock_base =
        {
                {
@@ -111,6 +112,19 @@ static inline int hrtimer_clockid_to_bas
 #ifdef CONFIG_SMP
 
 /*
+ * We require the migration_base for lock_hrtimer_base()/switch_hrtimer_base()
+ * such that hrtimer_callback_running() can unconditionally dereference
+ * timer->base->cpu_base
+ */
+static struct hrtimer_cpu_base migration_cpu_base = {
+       .seq = SEQCNT_ZERO(migration_cpu_base),
+};
+
+static struct hrtimer_clock_base migration_base = {
+       .cpu_base = &migration_cpu_base,
+};
+
+/*
  * We are using hashed locking: holding per_cpu(hrtimer_bases)[n].lock
  * means that all timers which are tied to this base via timer->base are
  * locked, and the base itself is locked too.
@@ -119,8 +133,8 @@ static inline int hrtimer_clockid_to_bas
  * be found on the lists/queues.
  *
  * When the timer's base is locked, and the timer removed from list, it is
- * possible to set timer->base = NULL and drop the lock: the timer remains
- * locked.
+ * possible to set timer->base = &migration_base and drop the lock: the timer
+ * remains locked.
  */
 static
 struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer,
@@ -130,7 +144,7 @@ struct hrtimer_clock_base *lock_hrtimer_
 
        for (;;) {
                base = timer->base;
-               if (likely(base != NULL)) {
+               if (likely(base != &migration_base)) {
                        raw_spin_lock_irqsave(&base->cpu_base->lock, *flags);
                        if (likely(base == timer->base))
                                return base;
@@ -194,8 +208,8 @@ switch_hrtimer_base(struct hrtimer *time
                if (unlikely(hrtimer_callback_running(timer)))
                        return base;
 
-               /* See the comment in lock_timer_base() */
-               timer->base = NULL;
+               /* See the comment in lock_hrtimer_base() */
+               timer->base = &migration_base;
                raw_spin_unlock(&base->cpu_base->lock);
                raw_spin_lock(&new_base->cpu_base->lock);
 
@@ -840,11 +854,7 @@ static int enqueue_hrtimer(struct hrtime
 
        base->cpu_base->active_bases |= 1 << base->index;
 
-       /*
-        * HRTIMER_STATE_ENQUEUED is or'ed to the current state to preserve the
-        * state of a possibly running callback.
-        */
-       timer->state |= HRTIMER_STATE_ENQUEUED;
+       timer->state = HRTIMER_STATE_ENQUEUED;
 
        return timerqueue_add(&base->active, &timer->node);
 }
@@ -894,7 +904,6 @@ static inline int
 remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base)
 {
        if (hrtimer_is_queued(timer)) {
-               unsigned long state;
                int reprogram;
 
                /*
@@ -908,13 +917,8 @@ remove_hrtimer(struct hrtimer *timer, st
                debug_deactivate(timer);
                timer_stats_hrtimer_clear_start_info(timer);
                reprogram = base->cpu_base == this_cpu_ptr(&hrtimer_bases);
-               /*
-                * We must preserve the CALLBACK state flag here,
-                * otherwise we could move the timer base in
-                * switch_hrtimer_base.
-                */
-               state = timer->state & HRTIMER_STATE_CALLBACK;
-               __remove_hrtimer(timer, base, state, reprogram);
+
+               __remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, reprogram);
                return 1;
        }
        return 0;
@@ -1114,6 +1118,70 @@ void hrtimer_init(struct hrtimer *timer,
 }
 EXPORT_SYMBOL_GPL(hrtimer_init);
 
+/*
+ * A timer is active, when it is enqueued into the rbtree or the
+ * callback function is running or it's in the state of being migrated
+ * to another cpu.
+ */
+bool hrtimer_active(const struct hrtimer *timer)
+{
+       struct hrtimer_cpu_base *cpu_base;
+       unsigned int seq;
+       bool active;
+
+       do {
+               active = false;
+               cpu_base = READ_ONCE(timer->base->cpu_base);
+               seq = raw_read_seqcount(&cpu_base->seq);
+
+               if (timer->state != HRTIMER_STATE_INACTIVE ||
+                   cpu_base->running == timer)
+                       active = true;
+
+       } while (read_seqcount_retry(&cpu_base->seq, seq) ||
+                cpu_base != READ_ONCE(timer->base->cpu_base));
+
+       return active;
+}
+EXPORT_SYMBOL_GPL(hrtimer_active);
+
+/*
+ *     __run_hrtimer()                     hrtimer_active()
+ *
+ * [S] cpu_base->running = timer       [R] timer->base->cpu_base
+ * [S] seq++                           [R] seq
+ *     WMB                                 RMB
+ * [S] timer->state = INACTIVE
+ *                                     [R] timer->state
+ *     fn();                           [R] cpu_base->running
+ *
+ * [S] timer->state = ENQUEUED (optional)
+ *     WMB                                 RMB
+ * [S] seq++                           [R] seq
+ * [S] cpu_base->running = NULL                [R] timer->base->cpu_base
+ *
+ *
+ * The WMBs+seq on the __run_hrtimer() split the thing into 3 distinct sections:
+ *
+ *  - queued:  the timer is queued
+ *  - callback:        the timer is being run
+ *  - post:    the timer is inactive or (re)queued
+ *
+ * On the read side we ensure we observe timer->state and cpu_base->running
+ * from the same section, if anything changed while we looked at it, we retry.
+ * This includes timer->base changing because sequence numbers alone are
+ * insufficient for that.
+ *
+ * Note: this is unusual seqlock usage in that we do not have the odd/even
+ * thing, all we really care about is both reads happening in the same section
+ * of __run_hrtimer().
+ *
+ * Note: both seq increments are not fully store ordered and this is fine, if
+ * for example the first seq increment is observed before the ->running store
+ * the section edge shifts slightly, but its a safe shift because here ->state
+ * is still ENQUEUED.
+ */
+
 static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base,
                          struct hrtimer_clock_base *base,
                          struct hrtimer *timer, ktime_t *now)
@@ -1121,10 +1189,17 @@ static void __run_hrtimer(struct hrtimer
        enum hrtimer_restart (*fn)(struct hrtimer *);
        int restart;
 
-       WARN_ON(!irqs_disabled());
+       lockdep_assert_held(&cpu_base->lock);
 
        debug_deactivate(timer);
-       __remove_hrtimer(timer, base, HRTIMER_STATE_CALLBACK, 0);
+       cpu_base->running = timer;
+
+       /*
+        * separate the ->running assignment from the ->state assignment
+        */
+       raw_write_seqcount_begin(&cpu_base->seq);
+
+       __remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, 0);
        timer_stats_account_hrtimer(timer);
        fn = timer->function;
 
@@ -1140,7 +1215,7 @@ static void __run_hrtimer(struct hrtimer
        raw_spin_lock(&cpu_base->lock);
 
        /*
-        * Note: We clear the CALLBACK bit after enqueue_hrtimer and
+        * Note: We clear the running state after enqueue_hrtimer and
         * we do not reprogramm the event hardware. Happens either in
         * hrtimer_start_range_ns() or in hrtimer_interrupt()
         *
@@ -1152,9 +1227,13 @@ static void __run_hrtimer(struct hrtimer
            !(timer->state & HRTIMER_STATE_ENQUEUED))
                enqueue_hrtimer(timer, base);
 
-       WARN_ON_ONCE(!(timer->state & HRTIMER_STATE_CALLBACK));
+       /*
+        * separate the ->running assignment from the ->state assignment
+        */
+       raw_write_seqcount_end(&cpu_base->seq);
 
-       timer->state &= ~HRTIMER_STATE_CALLBACK;
+       WARN_ON_ONCE(cpu_base->running != timer);
+       cpu_base->running = NULL;
 }
 
 static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now)
@@ -1523,11 +1602,10 @@ static void migrate_hrtimer_list(struct
                 * hrtimer_interrupt after we migrated everything to
                 * sort out already expired timers and reprogram the
                 * event device.
+                *
+                * Sets timer->state = HRTIMER_STATE_ENQUEUED.
                 */
                enqueue_hrtimer(timer, new_base);
-
-               /* Clear the migration state bit */
-               timer->state &= ~HRTIMER_STATE_MIGRATE;
        }
 }
 


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Reply via email to