This message comes too late; /me going to look at the new series :)

05.06.2015, 12:02, "Kirill Tkhai" <tk...@yandex.ru>:
> On Thu, 04/06/2015 at 12:49 +0200, Peter Zijlstra wrote:
>> On Thu, Jun 04, 2015 at 12:07:03PM +0300, Kirill Tkhai wrote:
>>>> --- a/include/linux/hrtimer.h
>>>> +++ b/include/linux/hrtimer.h
>>>> @@ -391,11 +391,25 @@ extern u64 hrtimer_get_next_event(void);
>>>>   * A timer is active, when it is enqueued into the rbtree or the
>>>>   * callback function is running or it's in the state of being migrated
>>>>   * to another cpu.
>>>> + *
>>>> + * See __run_hrtimer().
>>>>   */
>>>> -static inline int hrtimer_active(const struct hrtimer *timer)
>>>> +static inline bool hrtimer_active(const struct hrtimer *timer)
>>>>  {
>>>> -	return timer->state != HRTIMER_STATE_INACTIVE ||
>>>> -		timer->base->running == timer;
>>>> +	if (timer->state != HRTIMER_STATE_INACTIVE)
>>>> +		return true;
>>>> +
>>>> +	smp_rmb(); /* C matches A */
>>>> +
>>>> +	if (timer->base->running == timer)
>>>> +		return true;
>>>> +
>>>> +	smp_rmb(); /* D matches B */
>>>> +
>>>> +	if (timer->state != HRTIMER_STATE_INACTIVE)
>>>> +		return true;
>>>> +
>>>> +	return false;
>>>
>>> This races with two sequential timer handlers. hrtimer_active()
>>> is preemptible everywhere, and there is no guarantee that all three "if"
>>> conditions check the same timer tick.
>>
>> Indeed.
>>
>>> How about transforming hrtimer_bases.lock: raw_spinlock_t -->
>>> seqlock_t?
>>
>> Ingo will like that because it means we already need to touch cpu_base.
>>
>> But I think there's a problem there on timer migration: the timer can
>> migrate between bases while we do the seq read loop, and then you can get
>> false positives on the different seqcount numbers.
>>
>> We could of course do something like the below, but hrtimer_is_active()
>> is turning into quite the monster.
>>
>> Needs more comments at the very least, it's full of trickery.
>
> Yeah, it's safe for now, but it may become difficult to maintain
> in the future, because the barrier logic is not easy to review. But it seems
> we may simplify it a little bit. Please see the comments below.
>
>> ---
>> --- a/include/linux/hrtimer.h
>> +++ b/include/linux/hrtimer.h
>> @@ -59,7 +59,9 @@ enum hrtimer_restart {
>>   * mean touching the timer after the callback, this makes it impossible to free
>>   * the timer from the callback function.
>>   *
>> - * Therefore we track the callback state in timer->base->running == timer.
>> + * Therefore we track the callback state in:
>> + *
>> + *	timer->base->cpu_base->running == timer
>>   *
>>   * On SMP it is possible to have a "callback function running and enqueued"
>>   * status. It happens for example when a posix timer expired and the callback
>> @@ -144,7 +146,6 @@ struct hrtimer_clock_base {
>>  	struct timerqueue_head	active;
>>  	ktime_t			(*get_time)(void);
>>  	ktime_t			offset;
>> -	struct hrtimer		*running;
>>  } __attribute__((__aligned__(HRTIMER_CLOCK_BASE_ALIGN)));
>>
>>  enum hrtimer_base_type {
>> @@ -159,6 +160,8 @@ enum hrtimer_base_type {
>>   * struct hrtimer_cpu_base - the per cpu clock bases
>>   * @lock:		lock protecting the base and associated clock bases
>>   *			and timers
>> + * @seq:		seqcount around __run_hrtimer
>> + * @running:		pointer to the currently running hrtimer
>>   * @cpu:		cpu number
>>   * @active_bases:	Bitfield to mark bases with active timers
>>   * @clock_was_set_seq:	Sequence counter of clock was set events
>> @@ -180,6 +183,8 @@ enum hrtimer_base_type {
>>   */
>>  struct hrtimer_cpu_base {
>>  	raw_spinlock_t			lock;
>> +	seqcount_t			seq;
>> +	struct hrtimer			*running;
>>  	unsigned int			cpu;
>>  	unsigned int			active_bases;
>>  	unsigned int			clock_was_set_seq;
>> @@ -394,8 +399,24 @@ extern u64 hrtimer_get_next_event(void);
>>   */
>>  static inline int hrtimer_active(const struct hrtimer *timer)
>>  {
>> -	return timer->state != HRTIMER_STATE_INACTIVE ||
>> -		timer->base->running == timer;
>> +	struct hrtimer_cpu_base *cpu_base;
>> +	unsigned int seq;
>> +	bool active;
>> +
>> +	do {
>> +		active = false;
>> +		cpu_base = READ_ONCE(timer->base->cpu_base);
>> +		seqcount_lockdep_reader_access(&cpu_base->seq);
>> +		seq = raw_read_seqcount(&cpu_base->seq);
>> +
>> +		if (timer->state != HRTIMER_STATE_INACTIVE ||
>> +		    cpu_base->running == timer)
>> +			active = true;
>> +
>> +	} while (read_seqcount_retry(&cpu_base->seq, seq) ||
>> +		 cpu_base != READ_ONCE(timer->base->cpu_base));
>> +
>> +	return active;
>>  }
>
> This may race with migrate_hrtimer_list(), so it needs the write seqcounter
> too.
>
>>  /*
>> @@ -412,7 +433,7 @@ static inline int hrtimer_is_queued(stru
>>   */
>>  static inline int hrtimer_callback_running(struct hrtimer *timer)
>>  {
>> -	return timer->base->running == timer;
>> +	return timer->base->cpu_base->running == timer;
>>  }
>>
>>  /* Forward a hrtimer so it expires after now: */
>> --- a/kernel/time/hrtimer.c
>> +++ b/kernel/time/hrtimer.c
>> @@ -67,6 +67,7 @@
>>  DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
>>  {
>>  	.lock = __RAW_SPIN_LOCK_UNLOCKED(hrtimer_bases.lock),
>> +	.seq = SEQCNT_ZERO(hrtimer_bases.seq),
>>  	.clock_base =
>>  	{
>>  		{
>> @@ -113,9 +114,15 @@ static inline int hrtimer_clockid_to_bas
>>  /*
>>   * We require the migration_base for lock_hrtimer_base()/switch_hrtimer_base()
>>   * such that hrtimer_callback_running() can unconditionally dereference
>> - * timer->base.
>> + * timer->base->cpu_base
>>   */
>> -static struct hrtimer_clock_base migration_base;
>> +static struct hrtimer_cpu_base migration_cpu_base = {
>> +	.seq = SEQCNT_ZERO(migration_cpu_base),
>> +};
>> +
>> +static struct hrtimer_clock_base migration_base = {
>> +	.cpu_base = &migration_cpu_base,
>> +};
>>
>>  /*
>>   * We are using hashed locking: holding per_cpu(hrtimer_bases)[n].lock
>> @@ -1118,10 +1125,16 @@ static void __run_hrtimer(struct hrtimer
>>  	enum hrtimer_restart (*fn)(struct hrtimer *);
>>  	int restart;
>>
>> -	WARN_ON(!irqs_disabled());
>> +	lockdep_assert_held(&cpu_base->lock);
>>
>>  	debug_deactivate(timer);
>> -	base->running = timer;
>> +	cpu_base->running = timer;
>
> My suggestion is not to use seqcounts around long sections of code, but to
> implement short primitives for changing the timer state and the cpu_base's
> running timer.
> Something like this:
>
> static inline void hrtimer_set_state(struct hrtimer *timer, unsigned long state)
> {
> 	struct hrtimer_cpu_base *cpu_base = timer->base->cpu_base;
>
> 	lockdep_assert_held(&cpu_base->lock);
>
> 	write_seqcount_begin(&cpu_base->seq);
> 	timer->state = state;
> 	write_seqcount_end(&cpu_base->seq);
> }
>
> static inline void cpu_base_set_running(struct hrtimer_cpu_base *cpu_base,
> 					struct hrtimer *timer)
> {
> 	lockdep_assert_held(&cpu_base->lock);
>
> 	write_seqcount_begin(&cpu_base->seq);
> 	cpu_base->running = timer;
> 	write_seqcount_end(&cpu_base->seq);
> }
>
> With these implemented, we have to think less about getting the barrier order
> right, because all changes are made under the seqcount.
>
> static inline int hrtimer_active(const struct hrtimer *timer)
> {
> 	struct hrtimer_cpu_base *cpu_base;
> 	struct hrtimer_clock_base *base;
> 	unsigned int seq;
> 	bool active = false;
>
> 	do {
> 		base = READ_ONCE(timer->base);
> 		if (base == &migration_base) {
> 			active = true;
> 			break;
> 		}
>
> 		cpu_base = base->cpu_base;
> 		seqcount_lockdep_reader_access(&cpu_base->seq);
> 		seq = raw_read_seqcount(&cpu_base->seq);
>
> 		if (timer->state != HRTIMER_STATE_INACTIVE ||
> 		    cpu_base->running == timer) {
> 			active = true;
> 			break;
> 		}
> 	} while (read_seqcount_retry(&cpu_base->seq, seq) ||
> 		 READ_ONCE(timer->base) != base);
>
> 	return active;
> }
>
>> +
>> +	/*
>> +	 * separate the ->running assignment from the ->state assignment
>> +	 */
>> +	write_seqcount_begin(&cpu_base->seq);
>> +
>>  	__remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, 0);
>>  	timer_stats_account_hrtimer(timer);
>>  	fn = timer->function;
>> @@ -1150,8 +1163,13 @@ static void __run_hrtimer(struct hrtimer
>>  	    !(timer->state & HRTIMER_STATE_ENQUEUED))
>>  		enqueue_hrtimer(timer, base);
>>
>> -	WARN_ON_ONCE(base->running != timer);
>> -	base->running = NULL;
>> +	/*
>> +	 * separate the ->running assignment from the ->state assignment
>> +	 */
>> +	write_seqcount_end(&cpu_base->seq);
>> +
>> +	WARN_ON_ONCE(cpu_base->running != timer);
>> +	cpu_base->running = NULL;
>>  }
>>
>>  static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now)
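
To make the migrate_hrtimer_list() point above concrete: any path that changes
timer->state or timer->base behind the lockless reader's back has to sit inside
a write section of the same seqcount that hrtimer_active() samples, otherwise
the reader can return a stale answer without retrying. Below is a rough,
untested sketch of what the write side of a base switch could look like. The
helper name migrate_one_hrtimer() is made up for illustration; __remove_hrtimer(),
enqueue_hrtimer() and migration_base are the ones referenced in the quoted patch,
but their exact signatures and the surrounding hotplug code are assumed here,
not copied from kernel/time/hrtimer.c:

/*
 * Hypothetical sketch only: move one timer from @old_base to @new_base,
 * keeping every ->state/->base change inside a cpu_base seqcount write
 * section so that a concurrent lockless hrtimer_active() either sees a
 * consistent snapshot or retries. Both cpu_base locks are assumed held
 * by the caller, as on the existing migration path.
 */
static void migrate_one_hrtimer(struct hrtimer *timer,
				struct hrtimer_clock_base *old_base,
				struct hrtimer_clock_base *new_base)
{
	struct hrtimer_cpu_base *old_cpu_base = old_base->cpu_base;
	struct hrtimer_cpu_base *new_cpu_base = new_base->cpu_base;

	/* Dequeue under the old base's write section. */
	write_seqcount_begin(&old_cpu_base->seq);
	/* Keep the ENQUEUED bit set so the timer still reads as active. */
	__remove_hrtimer(timer, old_base, HRTIMER_STATE_ENQUEUED, 0);
	/*
	 * Park the timer on migration_base while it is in flight; a
	 * concurrent hrtimer_active() then reports "active" instead of
	 * racing with the half-finished move (cf. the base == &migration_base
	 * check in the hrtimer_active() variant quoted above).
	 */
	timer->base = &migration_base;
	write_seqcount_end(&old_cpu_base->seq);

	/* Re-queue under the new base's write section. */
	write_seqcount_begin(&new_cpu_base->seq);
	timer->base = new_base;
	enqueue_hrtimer(timer, new_base);
	write_seqcount_end(&new_cpu_base->seq);
}

The idea is the same as in the hrtimer_set_state()/cpu_base_set_running()
helpers: the reader never has to reason about barrier pairing. It either
observes a quiet counter around a consistent (state, base, running) snapshot,
or read_seqcount_retry() sends it around the loop again, and a timer parked on
migration_base is simply reported as active.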