This will allow IRQ stacks to nest inside NMIs or similar entries that can happen during IRQ stack setup or teardown.
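For illustration, here is a stand-alone C sketch of the ordering that makes
this NMI-safe (irq_count, irq_stack_ptr, and the stack sizes below are
simplified stand-ins for the real per-cpu state, not the kernel's actual
code; the real implementation is the ENTER_IRQ_STACK / LEAVE_IRQ_STACK asm
macros in the diff):

	#include <assert.h>

	static unsigned int irq_count;		/* per-cpu in the real kernel */
	static char irq_stack[4096];
	static char *const irq_stack_ptr = irq_stack + sizeof(irq_stack) - 64;
	static char *rsp;			/* models the CPU stack pointer */

	static char *enter_irq_stack(void)
	{
		char *old_rsp = rsp;

		if (irq_count == 0)
			rsp = irq_stack_ptr;	/* first entry: switch stacks */
		irq_count++;			/* RSP is already safe here */
		return old_rsp;
	}

	static void leave_irq_stack(char *old_rsp)
	{
		rsp = old_rsp;			/* get off the IRQ stack first */
		irq_count--;
	}

	int main(void)
	{
		char task_stack[256];
		char *outer, *inner;

		rsp = task_stack + sizeof(task_stack);

		outer = enter_irq_stack();	/* outer IRQ: switches stacks */
		inner = enter_irq_stack();	/* nested entry stays put */
		assert(rsp == irq_stack_ptr);
		leave_irq_stack(inner);
		leave_irq_stack(outer);
		assert(irq_count == 0);
		assert(rsp == task_stack + sizeof(task_stack));
		return 0;
	}

Because irq_count is incremented only after the stack switch and decremented
only after the restore, an NMI that lands anywhere in between still observes
a consistent (irq_count, RSP) pair, which is the invariant the macros rely on.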
The Xen code here has a confusing comment.

Signed-off-by: Andy Lutomirski <l...@kernel.org>
---
 arch/x86/entry/entry_64.S    | 72 ++++++++++++++++++++++++++------------------
 arch/x86/kernel/cpu/common.c |  2 +-
 arch/x86/kernel/process_64.c |  4 +++
 3 files changed, 47 insertions(+), 31 deletions(-)

diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index d3033183ed70..5f7df8949fa7 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -491,6 +491,39 @@ ENTRY(irq_entries_start)
 END(irq_entries_start)
 
 /*
+ * Enters the IRQ stack if we're not already using it. NMI-safe. Clobbers
+ * flags and puts old RSP into old_rsp, and leaves all other GPRs alone.
+ * Requires kernel GSBASE.
+ *
+ * The invariant is that, if irq_count != 0, then we're either on the
+ * IRQ stack or an IST stack, even if an NMI interrupts IRQ stack entry
+ * or exit.
+ */
+.macro ENTER_IRQ_STACK old_rsp
+	movq	%rsp, \old_rsp
+	cmpl	$0, PER_CPU_VAR(irq_count)
+	jne	694f
+	movq	PER_CPU_VAR(irq_stack_ptr), %rsp
+	/*
+	 * Right now, we're on the irq stack with irq_count == 0. A nested
+	 * IRQ stack switch could clobber the stack. That's fine: the stack
+	 * is empty.
+	 */
+694:
+	incl	PER_CPU_VAR(irq_count)
+	pushq	\old_rsp
+.endm
+
+/*
+ * Undoes ENTER_IRQ_STACK
+ */
+.macro LEAVE_IRQ_STACK
+	/* We need to be off the IRQ stack before decrementing irq_count. */
+	popq	%rsp
+	decl	PER_CPU_VAR(irq_count)
+.endm
+
+/*
  * Interrupt entry/exit.
  *
  * Interrupt entry points save only callee clobbered registers in fast path.
@@ -518,17 +551,7 @@ END(irq_entries_start)
 #endif
 
 1:
-	/*
-	 * Save previous stack pointer, optionally switch to interrupt stack.
-	 * irq_count is used to check if a CPU is already on an interrupt stack
-	 * or not. While this is essentially redundant with preempt_count it is
-	 * a little cheaper to use a separate counter in the PDA (short of
-	 * moving irq_enter into assembly, which would be too much work)
-	 */
-	movq	%rsp, %rdi
-	incl	PER_CPU_VAR(irq_count)
-	cmovzq	PER_CPU_VAR(irq_stack_ptr), %rsp
-	pushq	%rdi
+	ENTER_IRQ_STACK old_rsp=%rdi
 	/* We entered an interrupt context - irqs are off: */
 	TRACE_IRQS_OFF
 
@@ -548,10 +571,8 @@ common_interrupt:
 ret_from_intr:
 	DISABLE_INTERRUPTS(CLBR_NONE)
 	TRACE_IRQS_OFF
-	decl	PER_CPU_VAR(irq_count)
 
-	/* Restore saved previous stack */
-	popq	%rsp
+	LEAVE_IRQ_STACK
 
 	testb	$3, CS(%rsp)
 	jz	retint_kernel
@@ -863,14 +884,9 @@ bad_gs:
 
 /* Call softirq on interrupt stack. Interrupts are off. */
 ENTRY(do_softirq_own_stack)
-	pushq	%rbp
-	mov	%rsp, %rbp
-	incl	PER_CPU_VAR(irq_count)
-	cmove	PER_CPU_VAR(irq_stack_ptr), %rsp
-	push	%rbp			/* frame pointer backlink */
+	ENTER_IRQ_STACK old_rsp=%r11
 	call	__do_softirq
-	leaveq
-	decl	PER_CPU_VAR(irq_count)
+	LEAVE_IRQ_STACK
 	ret
 END(do_softirq_own_stack)
 
@@ -889,25 +905,21 @@ idtentry xen_hypervisor_callback xen_do_hypervisor_callback has_error_code=0
  * So, on entry to the handler we detect whether we interrupted an
  * existing activation in its critical region -- if so, we pop the current
  * activation and restart the handler using the previous one.
+ *
+ * XXX: I have no idea what this comment is talking about.  --luto
  */
 ENTRY(xen_do_hypervisor_callback)		/* do_hypervisor_callback(struct *pt_regs) */
-
+	ENTER_IRQ_STACK old_rsp=%r11
 /*
  * Since we don't modify %rdi, evtchn_do_upall(struct *pt_regs) will
  * see the correct pointer to the pt_regs
  */
-	movq	%rdi, %rsp			/* we don't return, adjust the stack frame */
-11:	incl	PER_CPU_VAR(irq_count)
-	movq	%rsp, %rbp
-	cmovzq	PER_CPU_VAR(irq_stack_ptr), %rsp
-	pushq	%rbp				/* frame pointer backlink */
 	call	xen_evtchn_do_upcall
-	popq	%rsp
-	decl	PER_CPU_VAR(irq_count)
+	LEAVE_IRQ_STACK
 #ifndef CONFIG_PREEMPT
 	call	xen_maybe_preempt_hcall
 #endif
-	jmp	error_exit
+	ret
 END(xen_do_hypervisor_callback)
 
 /*
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 1c528b06f802..e9968531ce56 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1161,7 +1161,7 @@ EXPORT_PER_CPU_SYMBOL(current_task);
 DEFINE_PER_CPU(char *, irq_stack_ptr) =
 	init_per_cpu_var(irq_stack_union.irq_stack) + IRQ_STACK_SIZE - 64;
 
-DEFINE_PER_CPU(unsigned int, irq_count) __visible = -1;
+DEFINE_PER_CPU(unsigned int, irq_count) __visible;
 
 DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT;
 EXPORT_PER_CPU_SYMBOL(__preempt_count);
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 0831ba3bcf95..65783f6eb22c 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -280,6 +280,10 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 	unsigned fsindex, gsindex;
 	fpu_switch_t fpu_switch;
 
+#ifdef CONFIG_DEBUG_ENTRY
+	WARN_ON(this_cpu_read(irq_count));
+#endif
+
 	fpu_switch = switch_fpu_prepare(prev_fpu, next_fpu, cpu);
 
 	/* We must save %fs and %gs before load_TLS() because
-- 
2.4.3