This will allow IRQ stacks to nest inside NMIs or similar entries
that can happen during IRQ stack setup or teardown.

The Xen code here has a confusing comment.

Signed-off-by: Andy Lutomirski <l...@kernel.org>
---
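Note for reviewers, not part of the commit message: the new macros boil down
to the following C-style sketch.  read_rsp(), write_rsp(), push() and pop()
are made-up stand-ins for the register and stack operations done in assembly;
this_cpu_read()/this_cpu_inc()/this_cpu_dec() are the usual percpu accessors.

        /* ENTER_IRQ_STACK old_rsp=<reg> */
        old_rsp = read_rsp();                      /* movq %rsp, \old_rsp */
        if (this_cpu_read(irq_count) == 0)         /* cmpl $0, ...; jne 694f */
                write_rsp(this_cpu_read(irq_stack_ptr));
        /*
         * An NMI that nests here also sees irq_count == 0 and reloads RSP
         * from irq_stack_ptr.  That's harmless: the IRQ stack is still empty.
         */
        this_cpu_inc(irq_count);                   /* 694: incl irq_count */
        push(old_rsp);                             /* pushq \old_rsp */

        /* LEAVE_IRQ_STACK */
        write_rsp(pop());           /* popq %rsp: get off the IRQ stack first */
        this_cpu_dec(irq_count);    /* ...and only then drop the count */

The invariant this preserves: whenever irq_count != 0, RSP already points at
the IRQ stack (or an IST stack), even if an NMI lands in the middle of entry
or exit.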
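A second note on the common.c hunk, also not part of the patch: the irq_count
initializer changes from -1 to 0 because the convention flips.  The old entry
path incremented first and switched stacks on the resulting zero
(incl + cmovzq, so -1 meant "not on the IRQ stack"), while ENTER_IRQ_STACK
tests irq_count against 0 before incrementing, so the idle value has to be 0.
Using the same made-up write_rsp() stand-in as above:

        /* old: irq_count idles at -1 */
        if (this_cpu_inc_return(irq_count) == 0)        /* incl; cmovzq */
                write_rsp(this_cpu_read(irq_stack_ptr));

        /* new: irq_count idles at 0 */
        if (this_cpu_read(irq_count) == 0)              /* cmpl $0; jne */
                write_rsp(this_cpu_read(irq_stack_ptr));
        this_cpu_inc(irq_count);
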
 arch/x86/entry/entry_64.S    | 72 ++++++++++++++++++++++++++------------------
 arch/x86/kernel/cpu/common.c |  2 +-
 arch/x86/kernel/process_64.c |  4 +++
 3 files changed, 47 insertions(+), 31 deletions(-)

diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index d3033183ed70..5f7df8949fa7 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -491,6 +491,39 @@ ENTRY(irq_entries_start)
 END(irq_entries_start)
 
 /*
+ * Enters the IRQ stack if we're not already using it.  NMI-safe.  Clobbers
+ * flags and puts old RSP into old_rsp, and leaves all other GPRs alone.
+ * Requires kernel GSBASE.
+ *
+ * The invariant is that, if irq_count != 0, then we're either on the
+ * IRQ stack or an IST stack, even if an NMI interrupts IRQ stack entry
+ * or exit.
+ */
+.macro ENTER_IRQ_STACK old_rsp
+       movq    %rsp, \old_rsp
+       cmpl    $0, PER_CPU_VAR(irq_count)
+       jne 694f
+       movq    PER_CPU_VAR(irq_stack_ptr), %rsp
+       /*
+        * Right now, we're on the irq stack with irq_count == 0.  A nested
+        * IRQ stack switch could clobber the stack.  That's fine: the stack
+        * is empty.
+        */
+694:
+       incl    PER_CPU_VAR(irq_count)
+       pushq   \old_rsp
+.endm
+
+/*
+ * Undoes ENTER_IRQ_STACK
+ */
+.macro LEAVE_IRQ_STACK
+       /* We need to be off the IRQ stack before decrementing irq_count. */
+       popq    %rsp
+       decl    PER_CPU_VAR(irq_count)
+.endm
+
+/*
  * Interrupt entry/exit.
  *
  * Interrupt entry points save only callee clobbered registers in fast path.
@@ -518,17 +551,7 @@ END(irq_entries_start)
 #endif
 
 1:
-       /*
-        * Save previous stack pointer, optionally switch to interrupt stack.
-        * irq_count is used to check if a CPU is already on an interrupt stack
-        * or not. While this is essentially redundant with preempt_count it is
-        * a little cheaper to use a separate counter in the PDA (short of
-        * moving irq_enter into assembly, which would be too much work)
-        */
-       movq    %rsp, %rdi
-       incl    PER_CPU_VAR(irq_count)
-       cmovzq  PER_CPU_VAR(irq_stack_ptr), %rsp
-       pushq   %rdi
+       ENTER_IRQ_STACK old_rsp=%rdi
        /* We entered an interrupt context - irqs are off: */
        TRACE_IRQS_OFF
 
@@ -548,10 +571,8 @@ common_interrupt:
 ret_from_intr:
        DISABLE_INTERRUPTS(CLBR_NONE)
        TRACE_IRQS_OFF
-       decl    PER_CPU_VAR(irq_count)
 
-       /* Restore saved previous stack */
-       popq    %rsp
+       LEAVE_IRQ_STACK
 
        testb   $3, CS(%rsp)
        jz      retint_kernel
@@ -863,14 +884,9 @@ bad_gs:
 
 /* Call softirq on interrupt stack. Interrupts are off. */
 ENTRY(do_softirq_own_stack)
-       pushq   %rbp
-       mov     %rsp, %rbp
-       incl    PER_CPU_VAR(irq_count)
-       cmove   PER_CPU_VAR(irq_stack_ptr), %rsp
-       push    %rbp                            /* frame pointer backlink */
+       ENTER_IRQ_STACK old_rsp=%r11
        call    __do_softirq
-       leaveq
-       decl    PER_CPU_VAR(irq_count)
+       LEAVE_IRQ_STACK
        ret
 END(do_softirq_own_stack)
 
@@ -889,25 +905,21 @@ idtentry xen_hypervisor_callback xen_do_hypervisor_callback has_error_code=0
  * So, on entry to the handler we detect whether we interrupted an
  * existing activation in its critical region -- if so, we pop the current
  * activation and restart the handler using the previous one.
+ *
+ * XXX: I have no idea what this comment is talking about.  --luto
  */
 ENTRY(xen_do_hypervisor_callback)              /* do_hypervisor_callback(struct *pt_regs) */
-
+       ENTER_IRQ_STACK old_rsp=%r11
 /*
  * Since we don't modify %rdi, evtchn_do_upall(struct *pt_regs) will
  * see the correct pointer to the pt_regs
  */
-       movq    %rdi, %rsp                      /* we don't return, adjust the stack frame */
-11:    incl    PER_CPU_VAR(irq_count)
-       movq    %rsp, %rbp
-       cmovzq  PER_CPU_VAR(irq_stack_ptr), %rsp
-       pushq   %rbp                            /* frame pointer backlink */
        call    xen_evtchn_do_upcall
-       popq    %rsp
-       decl    PER_CPU_VAR(irq_count)
+       LEAVE_IRQ_STACK
 #ifndef CONFIG_PREEMPT
        call    xen_maybe_preempt_hcall
 #endif
-       jmp     error_exit
+       ret
 END(xen_do_hypervisor_callback)
 
 /*
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 1c528b06f802..e9968531ce56 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1161,7 +1161,7 @@ EXPORT_PER_CPU_SYMBOL(current_task);
 DEFINE_PER_CPU(char *, irq_stack_ptr) =
        init_per_cpu_var(irq_stack_union.irq_stack) + IRQ_STACK_SIZE - 64;
 
-DEFINE_PER_CPU(unsigned int, irq_count) __visible = -1;
+DEFINE_PER_CPU(unsigned int, irq_count) __visible;
 
 DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT;
 EXPORT_PER_CPU_SYMBOL(__preempt_count);
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 0831ba3bcf95..65783f6eb22c 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -280,6 +280,10 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
        unsigned fsindex, gsindex;
        fpu_switch_t fpu_switch;
 
+#ifdef CONFIG_DEBUG_ENTRY
+       WARN_ON(this_cpu_read(irq_count));
+#endif
+
        fpu_switch = switch_fpu_prepare(prev_fpu, next_fpu, cpu);
 
        /* We must save %fs and %gs before load_TLS() because
-- 
2.4.3
