Hi,

This particular crash was hard to diagnose for two reasons:

* The CPU will happily use a userspace RSP in kernel mode.
  The crash comes only later, when we run off the stack.
  By then we have lost the information about where it started.

* The kernel's error handling code is ill-prepared for RSP pointing
  to the user stack, so we take another page fault while trying
  to dump the stack.

I prepared a patch which helps with both problems.

For testing, I inserted an invalid instruction right before SYSRET
to induce a similar bug, and booted resulting kernel in qemu.

Before my patch, double fault output starts like this:

[    0.715216] PANIC: double fault, error_code: 0x0
[    0.716033] CPU: 0 PID: 1 Comm: init Not tainted 4.0.0-rc2+ #7
[    0.716033] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011
[    0.716033] task: ffff880007588000 ti: ffff880007590000 task.ti: 
ffff880007590000
[    0.716033] RIP: 0010:[<ffffffff81017057>]  [<ffffffff81017057>] 
do_error_trap+0x47/0x120
[    0.716033] RSP: 0018:00007ffd89e7ffb8  EFLAGS: 00010006

The key here is that it doesn't show at which RIP we took the first
"bad" exception. The only useful detail visible here is the bad RSP.
"do_error_trap+0x47" is useless.

After the patch, the very moment of "bad" exception is caught:

[    0.666758] Exception on user stack 00007ffc1fd0c388: RSP: 
0018:00007ffc1fd0c3b0  EFLAGS: 00010006
[    0.667285] RIP: 0010:[<ffffffff81793688>]  [<ffffffff81793688>] 
ret_from_sys_call+0x5f/0x67
[    0.667285] PANIC: double fault, error_code: 0xffffffffffffffff
[    0.667285] CPU: 0 PID: 1 Comm: init Not tainted 4.0.0-rc2+ #13
[    0.667285] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011
[    0.667285] task: ffff880007588000 ti: ffff880007590000 task.ti: 
ffff880007590000
[    0.667285] RIP: 0010:[<ffffffff81793688>]  [<ffffffff81793688>] 
ret_from_sys_call+0x5f/0x67
[    0.667285] RSP: 0018:00007ffc1fd0c3b0  EFLAGS: 00010006

The exception happened at "ret_from_sys_call+0x5f".
We also no longer take another page fault;
the output proceeds like this:

...
[    0.667285] RAX: 0000000007a00000 RBX: 00007ffc1fd0c4e0 RCX: 00000000c0000101
[    0.667285] RDX: 00000000ffff8800 RSI: 0000000000005401 RDI: 00007ffc1fd0c388
[    0.667285] RBP: 00007ffc1fd0c570 R08: 0000000000000010 R09: 0000000000000000
[    0.667285] R10: 00007ffc1fd0c650 R11: 0000000000000202 R12: 0000000000000120
[    0.667285] R13: 00000000005f7b78 R14: 0000000000000000 R15: 00000000004c9d44
[    0.667285] FS:  0000000000000000(0000) GS:ffff880007a00000(0000) 
knlGS:0000000000000000
[    0.667285] CS:  0010 DS: 0000 ES: 0000 CR0: 000000008005003b
[    0.667285] CR2: 00000000004ad1e4 CR3: 0000000000101000 CR4: 00000000000007f0
[    0.667285] Stack:
[    0.667285]  0000000000000018 00007ffc1fd0c490 00007ffc1fd0c3d0 
0000000000000000
[    0.667285]  0000000000000000 0000000000000000 00007ffc1fd0c490 
0000000000000000
[    0.667285]  0000000000000000 0000000000000000 0000000000000000 
0000000000000000
[    0.667285] Call Trace:
[    0.667285]  <UNK>
[    0.667285] Code: 8b 44 24 50 48 8b 54 24 60 48 8b 74 24 68 48 8b 7c 24 70 
48 8b 8c 24 80 00 00 00 4c 8b 9c 24 90 00 00 00 48 8b a4 24 98 00 00 00 <0f> 0b 
0f 01 f8 48 0f 07 48 c7 84 24 a0 00 00 00 2b 00 00 00 48
[    0.667285] Kernel panic - not syncing: Machine halted.
[    0.667285] CPU: 0 PID: 1 Comm: init Not tainted 4.0.0-rc2+ #13
[    0.667285] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011
[    0.667285]  ffffffffffffffff ffff880007593e28 ffffffff81789625 
ffff880007588000
[    0.667285]  ffffffff81a3b181 ffff880007593ea8 ffffffff817840aa 
ffff880007590000
[    0.667285]  0000000000000008 ffff880007593eb8 ffff880007593e58 
0000000000000001
[    0.667285] Call Trace:
[    0.667285]  [<ffffffff81789625>] dump_stack+0x4c/0x65
[    0.667285]  [<ffffffff817840aa>] panic+0xc6/0x1ff
[    0.667285]  [<ffffffff81059ee5>] df_debug+0x35/0x40
[    0.667285]  [<ffffffff81017e37>] do_double_fault+0x87/0x100
[    0.667285]  [<ffffffff81017fb7>] do_userpsace_rsp_in_kernel+0x107/0x140
[    0.667285]  [<ffffffff81793688>] ? ret_from_sys_call+0x5f/0x67
[    0.667285]  [<ffffffff81795b49>] userpsace_rsp_in_kernel+0x39/0x40
[    0.667285]  [<ffffffff81793688>] ? ret_from_sys_call+0x5f/0x67
[    0.667285] Kernel Offset: disabled
[    0.667285] Rebooting in 1 seconds..

Takashi, are you willing to reproduce the panic one more time,
with this patch? I would like to see whether oops messages
are more informative with it.



diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h
index 4e49d7d..92a35e6 100644
--- a/arch/x86/include/asm/traps.h
+++ b/arch/x86/include/asm/traps.h
@@ -70,6 +70,7 @@ dotraplinkage void do_segment_not_present(struct pt_regs *, 
long);
 dotraplinkage void do_stack_segment(struct pt_regs *, long);
 #ifdef CONFIG_X86_64
 dotraplinkage void do_double_fault(struct pt_regs *, long);
+dotraplinkage void do_userpsace_rsp_in_kernel(struct pt_regs *regs);
 asmlinkage struct pt_regs *sync_regs(struct pt_regs *);
 #endif
 dotraplinkage void do_general_protection(struct pt_regs *, long);
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 0c91256..fb85c26 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -958,6 +958,12 @@ ENTRY(\sym)
        INTR_FRAME
        .endif

+       testq %rsp,%rsp
+       /* If RSP is positive, we are in kernel but have userspace RSP. */
+       /* This should be impossible... modulo bugs. */
+       /* We corrupted user stack already by storing iret frame there. */
+       jns     userpsace_rsp_in_kernel
+
        ASM_CLAC
        PARAVIRT_ADJUST_EXCEPTION_FRAME

@@ -1635,3 +1641,46 @@ ENTRY(ignore_sysret)
        CFI_ENDPROC
 END(ignore_sysret)

+/*
+ * We reach this place only if we detected a severe bug:
+ * on exception prologue, %rsp is not in kernelspace.
+ * This means that exception was taken while kernel was running with
+ * bogus %rsp, which should never happen.
+ *
+ * We don't know what's going on (it *is* a bug, after all).
+ * GS is also in an unknown state.
+ *
+ * Why do we catch this? Because otherwise we would continue
+ * writing to user stack, eventually taking a page fault which
+ * gets promoted to double-fault. By this time, we'll lose
+ * useful information, such as the source RIP.
+ */
+ENTRY(userpsace_rsp_in_kernel)
+       CFI_STARTPROC
+       /* Save bogus RSP value */
+       movq    %rsp,%rdi
+       /* Switch to kernel GS if necessary */
+       movl    $MSR_GS_BASE,%ecx
+       rdmsr
+       testl   %edx,%edx
+       js      1f      /* negative -> already in kernel */
+       SWAPGS
+1:     /* hopefully PER_CPU_VAR() now works */
+
+       /* Load %rsp with something valid */
+       movq    PER_CPU_VAR(cpu_tss + TSS_sp0),%rsp
+
+       /* Create a semi-bogus iret frame */
+       push    $__KERNEL_DS    /* pt_regs->ss */
+       push    %rdi            /* pt_regs->sp */
+       push    $0              /* pt_regs->flags */
+       push    $__KERNEL_CS    /* pt_regs->cs */
+       push    $0              /* pt_regs->ip */
+       push    $-1             /* pt_regs->orig_ax */
+       ALLOC_PT_GPREGS_ON_STACK
+       call    error_entry     /* fill pt_regs->gpregs */
+       movq    %rsp,%rdi
+       call    do_userpsace_rsp_in_kernel
+       /* does not return */
+       CFI_ENDPROC
+END(userpsace_rsp_in_kernel)
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 081252c..59f7ef0 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -368,6 +368,47 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, 
long error_code)
        for (;;)
                die(str, regs, error_code);
 }
+
+dotraplinkage void do_userpsace_rsp_in_kernel(struct pt_regs *regs)
+{
+       struct {
+               long error_code;
+               long ip;
+               long cs;
+               long flags;
+               long sp;
+               long ss;
+       } iretq_frame;
+       int err;
+       long __user *bogus_sp;
+
+       memset(&iretq_frame, 0xff, sizeof(iretq_frame));
+
+       bogus_sp = (long __user *)regs->sp;
+       /*
+        * In long mode, CPU aligns iret frame's top to 16-byte boundary.
+        * This allows us to determine whether exception word was pushed.
+        */
+       preempt_disable();
+       if (!(regs->sp & 0xf))
+               err = copy_from_user(&iretq_frame, bogus_sp, 6 * sizeof(long));
+       else
+               err = copy_from_user(&iretq_frame.ip, bogus_sp, 5 * 
sizeof(long));
+
+       /* What did this exception push onto the user stack? */
+       printk(KERN_EMERG "Exception on user stack %016lx:"
+               " RSP: %04lx:%016lx  EFLAGS: %08lx\n",
+                       regs->sp,
+                       iretq_frame.ss, iretq_frame.sp, iretq_frame.flags);
+       printk(KERN_EMERG "RIP: %04lx:[<%016lx>] ",
+                       iretq_frame.cs, iretq_frame.ip);
+       printk_address(iretq_frame.ip);
+
+       /* (Ab)use do_double_fault to print the rest */
+       if (!err)
+               memcpy(&regs->ip, &iretq_frame.ip, 5 * sizeof(long));
+       do_double_fault(regs, iretq_frame.error_code);
+}
 #endif

 dotraplinkage void do_bounds(struct pt_regs *regs, long error_code)

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Reply via email to