This could be even faster if it were written in assembler :)  The only reason this carries a Signed-off-by is that I agree to the DCO.  That should not be construed to mean that anyone should apply this patch.  It's an abomination and it will do terrible, terrible things.
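
In case the round trip is hard to follow from the diff, here's a rough sketch of what it sets up.  This is illustration only -- none of the text below is in the patch itself -- and the 0xffffffffff600c00 target is my reading of where the new vsyscall-page trampoline lands, i.e. offset 0xc00 after the .balign 1024:

	/*
	 * Sketch of the doctored return path (illustration only):
	 *
	 * 1. On the way back to 64-bit userspace,
	 *    install_sysret_trampoline() spills three words below the
	 *    128-byte red zone on the user stack:
	 *
	 *        newrsp + 16: saved user RIP
	 *        newrsp +  8: saved user R11
	 *        newrsp +  0: saved user RCX
	 *
	 *    and rewrites pt_regs so that RIP = 0xffffffffff600c00 (the
	 *    new trampoline in the vsyscall page) and RSP = newrsp.
	 *
	 * 2. iret_via_sysret pops that doctored frame and runs SYSRETQ,
	 *    which reloads RIP from %rcx and RFLAGS from %r11, so we land
	 *    on the trampoline with %rsp == newrsp.
	 *
	 * 3. The trampoline then undoes the damage:
	 */
	popq %rcx	/* original user RCX */
	popq %r11	/* original user R11 */
	retq $128	/* back to the original user RIP; the $128 puts
			   %rsp back above the red zone */

The net effect is that the three registers SYSRET clobbers are stashed on and restored from the user stack, so SYSRET's inability to restore RCX and R11 stops mattering.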
It boots, though :)  I haven't tested it beyond that.

Signed-off-by: Andy Lutomirski <l...@amacapital.net>
---
 arch/x86/include/asm/calling.h    | 10 ++++++++++
 arch/x86/kernel/entry_64.S        | 14 ++++++++++++++
 arch/x86/kernel/process_64.c      | 37 +++++++++++++++++++++++++++++++++++++
 arch/x86/kernel/vsyscall_64.c     |  2 +-
 arch/x86/kernel/vsyscall_emu_64.S |  5 +++++
 5 files changed, 67 insertions(+), 1 deletion(-)

diff --git a/arch/x86/include/asm/calling.h b/arch/x86/include/asm/calling.h
index cb4c73b..ead0345 100644
--- a/arch/x86/include/asm/calling.h
+++ b/arch/x86/include/asm/calling.h
@@ -46,7 +46,9 @@ For 32-bit we have the following conventions - kernel is built with
 
 */
 
+#ifdef __ASSEMBLY__
 #include <asm/dwarf2.h>
+#endif
 
 #ifdef CONFIG_X86_64
@@ -85,6 +87,8 @@ For 32-bit we have the following conventions - kernel is built with
 #define ARGOFFSET	R11
 #define SWFRAME		ORIG_RAX
 
+#ifdef __ASSEMBLY__
+
 	.macro SAVE_ARGS addskip=0, save_rcx=1, save_r891011=1
 	subq  $9*8+\addskip, %rsp
 	CFI_ADJUST_CFA_OFFSET	9*8+\addskip
@@ -195,8 +199,12 @@ For 32-bit we have the following conventions - kernel is built with
 	.byte 0xf1
 	.endm
 
+#endif /* __ASSEMBLY__ */
+
 #else /* CONFIG_X86_64 */
 
+#ifdef __ASSEMBLY__
+
 /*
  * For 32bit only simplified versions of SAVE_ALL/RESTORE_ALL. These
  * are different from the entry_32.S versions in not changing the segment
@@ -240,5 +248,7 @@ For 32-bit we have the following conventions - kernel is built with
 	CFI_RESTORE eax
 	.endm
 
+#endif /* __ASSEMBLY__ */
+
 #endif /* CONFIG_X86_64 */
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 1e96c36..7e3eae1 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -1027,6 +1027,9 @@ retint_swapgs:		/* return to user-space */
 	 */
 	DISABLE_INTERRUPTS(CLBR_ANY)
 	TRACE_IRQS_IRETQ
+	call install_sysret_trampoline
+	test %rax,%rax
+	jnz iret_via_sysret
 	SWAPGS
 	jmp restore_args
@@ -1036,6 +1039,7 @@ retint_restore_args:	/* return to kernel space */
 	 * The iretq could re-enable interrupts:
 	 */
 	TRACE_IRQS_IRETQ
+
 restore_args:
 	RESTORE_ARGS 1,8,1
@@ -1043,6 +1047,16 @@ irq_return:
 	INTERRUPT_RETURN
 	_ASM_EXTABLE(irq_return, bad_iret)
 
+iret_via_sysret:
+	SWAPGS
+	RESTORE_ARGS 1,8,1
+	popq %rcx		/* RIP */
+	popq %r11		/* CS */
+	popq %r11		/* RFLAGS */
+	popq %rsp		/* RSP */
+	/* ignore SS */
+	sysretq
+
 #ifdef CONFIG_PARAVIRT
 ENTRY(native_iret)
 	iretq
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 9c0280f..e48aced 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -562,3 +562,40 @@ unsigned long KSTK_ESP(struct task_struct *task)
 	return (test_tsk_thread_flag(task, TIF_IA32)) ?
 			(task_pt_regs(task)->sp) : ((task)->thread.usersp);
 }
+
+#include <asm/calling.h>
+
+unsigned long notrace install_sysret_trampoline(void)
+{
+	unsigned long *here = __builtin_frame_address(0);
+	unsigned long *asmframe = here + 2;
+	unsigned long __user * newrsp;
+
+#define FRAMEVAL(x) asmframe[((x)-ARGOFFSET) / 8]
+	newrsp = (unsigned long __user * __force)(FRAMEVAL(RSP) - 128 - 3*8);
+
+	if (FRAMEVAL(CS) != __USER_CS)
+		return 0;
+
+	/*
+	 * A real implementation would do:
+	 * if (!access_ok(VERIFY_WRITE, newrsp, 3*8))
+	 *	return 0;
+	 */
+
+	if (__put_user(FRAMEVAL(RIP), newrsp + 2))
+		return 0;
+
+	if (__put_user(FRAMEVAL(R11), newrsp + 1))
+		return 0;
+
+	if (__put_user(FRAMEVAL(RCX), newrsp))
+		return 0;
+
+	/* Hi there, optimizer. */
+	ACCESS_ONCE(FRAMEVAL(RIP)) = 0xffffffffff600c00;
+	ACCESS_ONCE(FRAMEVAL(RSP)) = (unsigned long)newrsp;
+	return 1;
+
+#undef FRAMEVAL
+}
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index 8b3b3eb..77a5ef3 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -54,7 +54,7 @@
 
 DEFINE_VVAR(int, vgetcpu_mode);
 
-static enum { EMULATE, NATIVE, NONE } vsyscall_mode = EMULATE;
+static enum { EMULATE, NATIVE, NONE } vsyscall_mode = NATIVE;
 
 static int __init vsyscall_setup(char *str)
 {
diff --git a/arch/x86/kernel/vsyscall_emu_64.S b/arch/x86/kernel/vsyscall_emu_64.S
index c9596a9..a54a780 100644
--- a/arch/x86/kernel/vsyscall_emu_64.S
+++ b/arch/x86/kernel/vsyscall_emu_64.S
@@ -32,6 +32,11 @@ __vsyscall_page:
 	syscall
 	ret
 
+	.balign 1024, 0xcc
+	popq %rcx
+	popq %r11
+	retq $128
+
 	.balign 4096, 0xcc
 	.size __vsyscall_page, 4096
-- 
1.9.0