On x86, sleeping while on an IST or irq stack has a surprisingly good
chance of working, but it can also fail dramatically.  Add an arch hook
so that schedule() and __might_sleep() can catch sleeping on the wrong
stack.
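
The hook itself is just a "same stack?" test: the current stack pointer
and the per-cpu kernel_stack value must lie within the same
THREAD_SIZE-aligned region.  As a standalone illustration of the mask
arithmetic (userspace sketch, not part of this patch; the THREAD_SIZE
value and the stack base address are made up):

#include <stdbool.h>
#include <stdio.h>

#define THREAD_SIZE 16384UL		/* assumed x86_64 value */

/*
 * Two addresses lie on the same THREAD_SIZE-aligned stack iff they
 * agree in every bit above the offset bits, i.e. iff XORing them
 * leaves only the low log2(THREAD_SIZE) bits set.
 */
static bool same_stack(unsigned long a, unsigned long b)
{
	return ((a ^ b) & ~(THREAD_SIZE - 1)) == 0;
}

int main(void)
{
	unsigned long task = 0xffff880012344000UL;	/* made-up, aligned */

	/* An sp within the task stack: prints 1 */
	printf("%d\n", same_stack(task + 0x1000, task));
	/* An sp on some other (e.g. IST or irq) stack: prints 0 */
	printf("%d\n", same_stack(task + THREAD_SIZE, task));
	return 0;
}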
This will also catch do_exit() from a funny stack, which could leave
an IST stack shifted or an NMI nesting count incremented.

Signed-off-by: Andy Lutomirski <l...@amacapital.net>
---
 arch/x86/Kconfig                   |  1 +
 arch/x86/include/asm/thread_info.h | 17 +++++++++++++++++
 arch/x86/kernel/irq_32.c           | 13 +++----------
 include/linux/thread_info.h        |  7 +++++++
 kernel/Kconfig.locks               |  3 +++
 kernel/sched/core.c                | 14 ++++++++++----
 6 files changed, 41 insertions(+), 14 deletions(-)

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index ded8a6774ac9..a811286636d2 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -137,6 +137,7 @@ config X86
 	select HAVE_ACPI_APEI_NMI if ACPI
 	select ACPI_LEGACY_TABLES_LOOKUP if ACPI
 	select X86_FEATURE_NAMES if PROC_FS
+	select HAVE_ARCH_SCHEDULE_ALLOWED
 
 config INSTRUCTION_DECODER
 	def_bool y
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index 547e344a6dc6..05701f132473 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -170,6 +170,23 @@ static inline struct thread_info *current_thread_info(void)
 	return ti;
 }
 
+static inline unsigned long current_stack_pointer(void)
+{
+	unsigned long sp;
+#ifdef CONFIG_X86_64
+	asm("mov %%rsp,%0" : "=g" (sp));
+#else
+	asm("mov %%esp,%0" : "=g" (sp));
+#endif
+	return sp;
+}
+
+static inline bool arch_schedule_allowed(void)
+{
+	return ((current_stack_pointer() ^ this_cpu_read_stable(kernel_stack))
+		& ~(THREAD_SIZE - 1)) == 0;
+}
+
 #else /* !__ASSEMBLY__ */
 
 /* how to get the thread information struct from ASM */
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index 63ce838e5a54..28d28f5eb8f4 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -69,16 +69,9 @@ static void call_on_stack(void *func, void *stack)
 		     : "memory", "cc", "edx", "ecx", "eax");
 }
 
-/* how to get the current stack pointer from C */
-#define current_stack_pointer ({		\
-	unsigned long sp;			\
-	asm("mov %%esp,%0" : "=g" (sp));	\
-	sp;					\
-})
-
 static inline void *current_stack(void)
 {
-	return (void *)(current_stack_pointer & ~(THREAD_SIZE - 1));
+	return (void *)(current_stack_pointer() & ~(THREAD_SIZE - 1));
 }
 
 static inline int
@@ -103,7 +96,7 @@ execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq)
 
 	/* Save the next esp at the bottom of the stack */
 	prev_esp = (u32 *)irqstk;
-	*prev_esp = current_stack_pointer;
+	*prev_esp = current_stack_pointer();
 
 	if (unlikely(overflow))
 		call_on_stack(print_stack_overflow, isp);
@@ -156,7 +149,7 @@ void do_softirq_own_stack(void)
 
 	/* Push the previous esp onto the stack */
 	prev_esp = (u32 *)irqstk;
-	*prev_esp = current_stack_pointer;
+	*prev_esp = current_stack_pointer();
 
 	call_on_stack(__do_softirq, isp);
 }
diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h
index ff307b548ed3..6deaf7e97009 100644
--- a/include/linux/thread_info.h
+++ b/include/linux/thread_info.h
@@ -145,6 +145,13 @@ static inline bool test_and_clear_restore_sigmask(void)
 #error "no set_restore_sigmask() provided and default one won't work"
 #endif
 
+#ifndef CONFIG_HAVE_ARCH_SCHEDULE_ALLOWED
+static inline bool arch_schedule_allowed(void)
+{
+	return true;
+}
+#endif
+
 #endif /* __KERNEL__ */
 
 #endif /* _LINUX_THREAD_INFO_H */
diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks
index 76768ee812b2..2714dc34695a 100644
--- a/kernel/Kconfig.locks
+++ b/kernel/Kconfig.locks
@@ -237,3 +237,6 @@ config ARCH_USE_QUEUE_RWLOCK
 config QUEUE_RWLOCK
 	def_bool y if ARCH_USE_QUEUE_RWLOCK
 	depends on SMP
+
+config HAVE_ARCH_SCHEDULE_ALLOWED
+	bool
\ No newline at end of file
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 240157c13ddc..e51ab65a9750 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2705,8 +2705,12 @@ static inline void schedule_debug(struct task_struct *prev)
 	 * Test if we are atomic. Since do_exit() needs to call into
 	 * schedule() atomically, we ignore that path. Otherwise whine
 	 * if we are scheduling when we should not.
+	 *
+	 * If architectural conditions for scheduling are not met,
+	 * complain even if we are in do_exit.
 	 */
-	if (unlikely(in_atomic_preempt_off() && prev->state != TASK_DEAD))
+	if (unlikely((in_atomic_preempt_off() && prev->state != TASK_DEAD) ||
+		     !arch_schedule_allowed()))
 		__schedule_bug(prev);
 	rcu_sleep_check();
 
@@ -7200,10 +7204,12 @@ static inline int preempt_count_equals(int preempt_offset)
 void __might_sleep(const char *file, int line, int preempt_offset)
 {
 	static unsigned long prev_jiffy;	/* ratelimiting */
+	bool arch_ok;
 
 	rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */
+	arch_ok = arch_schedule_allowed();
 	if ((preempt_count_equals(preempt_offset) && !irqs_disabled() &&
-	     !is_idle_task(current)) ||
+	     !is_idle_task(current) && arch_ok) ||
 	    system_state != SYSTEM_RUNNING || oops_in_progress)
 		return;
 	if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
@@ -7214,8 +7220,8 @@ void __might_sleep(const char *file, int line, int preempt_offset)
 		"BUG: sleeping function called from invalid context at %s:%d\n",
 			file, line);
 	printk(KERN_ERR
-		"in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
-			in_atomic(), irqs_disabled(),
+		"in_atomic(): %d, irqs_disabled(): %d, arch_schedule_allowed: %d, pid: %d, name: %s\n",
+			in_atomic(), irqs_disabled(), (int)arch_ok,
 			current->pid, current->comm);
 
 	debug_show_held_locks(current);
-- 
1.9.3