With PTI, the syscall/interrupt/exception entry code switches the CR3 register in assembly to change the page table. Move the CR3 switch into the C code of the syscall/interrupt/exception entry handlers.
Signed-off-by: Alexandre Chartre <alexandre.char...@oracle.com> --- arch/x86/entry/common.c | 15 ++++++++++++--- arch/x86/entry/entry_64.S | 23 +++++------------------ arch/x86/entry/entry_64_compat.S | 22 ---------------------- arch/x86/include/asm/entry-common.h | 13 +++++++++++++ arch/x86/include/asm/idtentry.h | 25 ++++++++++++++++++++----- arch/x86/kernel/cpu/mce/core.c | 2 ++ arch/x86/kernel/nmi.c | 2 ++ arch/x86/kernel/traps.c | 6 ++++++ arch/x86/mm/fault.c | 9 +++++++-- 9 files changed, 67 insertions(+), 50 deletions(-) diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index 1aba02ecb806..6ef5afc42b82 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -51,6 +51,7 @@ __visible noinstr void return_from_fork(struct pt_regs *regs, regs->ax = 0; } syscall_exit_to_user_mode(regs); + user_pagetable_enter(); } static __always_inline void run_syscall(sys_call_ptr_t sysfunc, @@ -74,6 +75,7 @@ static __always_inline void run_syscall(sys_call_ptr_t sysfunc, #ifdef CONFIG_X86_64 __visible noinstr void do_syscall_64(unsigned long nr, struct pt_regs *regs) { + user_pagetable_exit(); nr = syscall_enter_from_user_mode(regs, nr); instrumentation_begin(); @@ -91,12 +93,14 @@ __visible noinstr void do_syscall_64(unsigned long nr, struct pt_regs *regs) instrumentation_end(); syscall_exit_to_user_mode(regs); + user_pagetable_enter(); } #endif #if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION) static __always_inline unsigned int syscall_32_enter(struct pt_regs *regs) { + user_pagetable_exit(); if (IS_ENABLED(CONFIG_IA32_EMULATION)) current_thread_info()->status |= TS_COMPAT; @@ -131,11 +135,11 @@ __visible noinstr void do_int80_syscall_32(struct pt_regs *regs) do_syscall_32_irqs_on(regs, nr); syscall_exit_to_user_mode(regs); + user_pagetable_enter(); } -static noinstr bool __do_fast_syscall_32(struct pt_regs *regs) +static noinstr bool __do_fast_syscall_32(struct pt_regs *regs, long nr) { - unsigned int nr = syscall_32_enter(regs); int 
res; /* @@ -179,6 +183,9 @@ static noinstr bool __do_fast_syscall_32(struct pt_regs *regs) /* Returns 0 to return using IRET or 1 to return using SYSEXIT/SYSRETL. */ __visible noinstr long do_fast_syscall_32(struct pt_regs *regs) { + unsigned int nr = syscall_32_enter(regs); + bool syscall_done; + /* * Called using the internal vDSO SYSENTER/SYSCALL32 calling * convention. Adjust regs so it looks like we entered using int80. @@ -194,7 +201,9 @@ __visible noinstr long do_fast_syscall_32(struct pt_regs *regs) regs->ip = landing_pad; /* Invoke the syscall. If it failed, keep it simple: use IRET. */ - if (!__do_fast_syscall_32(regs)) + syscall_done = __do_fast_syscall_32(regs, nr); + user_pagetable_enter(); + if (!syscall_done) return 0; #ifdef CONFIG_X86_64 diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 1715bc0cefff..b7d9a019d001 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -98,7 +98,6 @@ SYM_CODE_START(entry_SYSCALL_64) swapgs /* tss.sp2 is scratch space. */ movq %rsp, PER_CPU_VAR(cpu_tss_rw + TSS_sp2) - SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp SYM_INNER_LABEL(entry_SYSCALL_64_safe_stack, SYM_L_GLOBAL) @@ -192,18 +191,14 @@ SYM_INNER_LABEL(entry_SYSCALL_64_after_hwframe, SYM_L_GLOBAL) */ syscall_return_via_sysret: /* rcx and r11 are already restored (see code above) */ - POP_REGS pop_rdi=0 skip_r11rcx=1 + POP_REGS skip_r11rcx=1 /* - * We are on the trampoline stack. All regs except RDI are live. * We are on the trampoline stack. All regs except RSP are live. * We can do future final exit work right here. 
*/ STACKLEAK_ERASE_NOCLOBBER - SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi - - popq %rdi movq RSP-ORIG_RAX(%rsp), %rsp USERGS_SYSRET64 SYM_CODE_END(entry_SYSCALL_64) @@ -321,7 +316,6 @@ SYM_CODE_END(ret_from_fork) swapgs cld FENCE_SWAPGS_USER_ENTRY - SWITCH_TO_KERNEL_CR3 scratch_reg=%rdx movq %rsp, %rdx movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp UNWIND_HINT_IRET_REGS base=%rdx offset=8 @@ -594,19 +588,15 @@ SYM_INNER_LABEL(swapgs_restore_regs_and_return_to_usermode, SYM_L_GLOBAL) ud2 1: #endif - POP_REGS pop_rdi=0 + POP_REGS + addq $8, %rsp /* skip regs->orig_ax */ /* - * We are on the trampoline stack. All regs except RDI are live. + * We are on the trampoline stack. All regs are live. * We can do future final exit work right here. */ STACKLEAK_ERASE_NOCLOBBER - SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi - - /* Restore RDI. */ - popq %rdi - addq $8, %rsp /* skip regs->orig_ax */ SWAPGS INTERRUPT_RETURN @@ -1009,8 +999,6 @@ SYM_CODE_START_LOCAL(error_entry) */ SWAPGS FENCE_SWAPGS_USER_ENTRY - /* We have user CR3. Change to kernel CR3. */ - SWITCH_TO_KERNEL_CR3 scratch_reg=%rax .Lerror_entry_from_usermode_after_swapgs: /* @@ -1069,11 +1057,10 @@ SYM_CODE_START_LOCAL(error_entry) .Lerror_bad_iret: /* * We came from an IRET to user mode, so we have user - * gsbase and CR3. Switch to kernel gsbase and CR3: + * gsbase and CR3. Switch to kernel gsbase. */ SWAPGS FENCE_SWAPGS_USER_ENTRY - SWITCH_TO_KERNEL_CR3 scratch_reg=%rax /* * Pretend that the exception came from user mode: set up pt_regs diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index 541fdaf64045..a6fb5807bf42 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -51,10 +51,6 @@ SYM_CODE_START(entry_SYSENTER_compat) /* Interrupts are off on entry. 
*/ SWAPGS - pushq %rax - SWITCH_TO_KERNEL_CR3 scratch_reg=%rax - popq %rax - movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp /* Construct struct pt_regs on stack */ @@ -204,9 +200,6 @@ SYM_CODE_START(entry_SYSCALL_compat) /* Stash user ESP */ movl %esp, %r8d - /* Use %rsp as scratch reg. User ESP is stashed in r8 */ - SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp - /* Switch to the kernel stack */ movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp @@ -291,18 +284,6 @@ sysret32_from_system_call: * code. We zero R8-R10 to avoid info leaks. */ movq RSP-ORIG_RAX(%rsp), %rsp - - /* - * The original userspace %rsp (RSP-ORIG_RAX(%rsp)) is stored - * on the process stack which is not mapped to userspace and - * not readable after we SWITCH_TO_USER_CR3. Delay the CR3 - * switch until after after the last reference to the process - * stack. - * - * %r8/%r9 are zeroed before the sysret, thus safe to clobber. - */ - SWITCH_TO_USER_CR3_NOSTACK scratch_reg=%r8 scratch_reg2=%r9 - xorl %r8d, %r8d xorl %r9d, %r9d xorl %r10d, %r10d @@ -357,9 +338,6 @@ SYM_CODE_START(entry_INT80_compat) pushq %rax /* pt_regs->orig_ax */ pushq %rdi /* pt_regs->di */ - /* Need to switch before accessing the thread stack. */ - SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi - /* In the Xen PV case we already run on the thread stack. 
*/ ALTERNATIVE "", "jmp .Lint80_keep_stack", X86_FEATURE_XENPV diff --git a/arch/x86/include/asm/entry-common.h b/arch/x86/include/asm/entry-common.h index 46682b1433a4..e01735a181b8 100644 --- a/arch/x86/include/asm/entry-common.h +++ b/arch/x86/include/asm/entry-common.h @@ -193,6 +193,17 @@ static __always_inline void user_pagetable_exit(void) switch_to_kernel_cr3(__native_read_cr3()); } +static __always_inline void user_pagetable_return(struct pt_regs *regs) +{ + if (user_mode(regs)) + user_pagetable_enter(); +} + +static __always_inline void user_pagetable_escape(struct pt_regs *regs) +{ + if (user_mode(regs)) + user_pagetable_exit(); +} #else /* CONFIG_PAGE_TABLE_ISOLATION */ @@ -204,6 +215,8 @@ static __always_inline void restore_cr3(unsigned long cr3) {} static __always_inline void user_pagetable_enter(void) {}; static __always_inline void user_pagetable_exit(void) {}; +static __always_inline void user_pagetable_return(struct pt_regs *regs) {}; +static __always_inline void user_pagetable_escape(struct pt_regs *regs) {}; #endif /* CONFIG_PAGE_TABLE_ISOLATION */ #endif /* MODULE */ diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h index a6725afaaec0..f29bfc0700ff 100644 --- a/arch/x86/include/asm/idtentry.h +++ b/arch/x86/include/asm/idtentry.h @@ -132,12 +132,15 @@ static __always_inline void __##func(struct pt_regs *regs); \ \ __visible noinstr void func(struct pt_regs *regs) \ { \ - irqentry_state_t state = irqentry_enter(regs); \ + irqentry_state_t state; \ \ + user_pagetable_escape(regs); \ + state = irqentry_enter(regs); \ instrumentation_begin(); \ run_idt(__##func, regs); \ instrumentation_end(); \ irqentry_exit(regs, state); \ + user_pagetable_return(regs); \ } \ \ static __always_inline void __##func(struct pt_regs *regs) @@ -179,12 +182,15 @@ static __always_inline void __##func(struct pt_regs *regs, \ __visible noinstr void func(struct pt_regs *regs, \ unsigned long error_code) \ { \ - irqentry_state_t state = 
irqentry_enter(regs); \ + irqentry_state_t state; \ \ + user_pagetable_escape(regs); \ + state = irqentry_enter(regs); \ instrumentation_begin(); \ run_idt_errcode(__##func, regs, error_code); \ instrumentation_end(); \ irqentry_exit(regs, state); \ + user_pagetable_return(regs); \ } \ \ static __always_inline void __##func(struct pt_regs *regs, \ @@ -275,8 +281,10 @@ static __always_inline void __##func(struct pt_regs *regs, u8 vector); \ __visible noinstr void func(struct pt_regs *regs, \ unsigned long error_code) \ { \ - irqentry_state_t state = irqentry_enter(regs); \ + irqentry_state_t state; \ \ + user_pagetable_escape(regs); \ + state = irqentry_enter(regs); \ instrumentation_begin(); \ irq_enter_rcu(); \ kvm_set_cpu_l1tf_flush_l1d(); \ @@ -285,6 +293,7 @@ __visible noinstr void func(struct pt_regs *regs, \ irq_exit_rcu(); \ instrumentation_end(); \ irqentry_exit(regs, state); \ + user_pagetable_return(regs); \ } \ \ static __always_inline void __##func(struct pt_regs *regs, u8 vector) @@ -318,8 +327,10 @@ static void __##func(struct pt_regs *regs); \ \ __visible noinstr void func(struct pt_regs *regs) \ { \ - irqentry_state_t state = irqentry_enter(regs); \ + irqentry_state_t state; \ \ + user_pagetable_escape(regs); \ + state = irqentry_enter(regs); \ instrumentation_begin(); \ irq_enter_rcu(); \ kvm_set_cpu_l1tf_flush_l1d(); \ @@ -327,6 +338,7 @@ __visible noinstr void func(struct pt_regs *regs) \ irq_exit_rcu(); \ instrumentation_end(); \ irqentry_exit(regs, state); \ + user_pagetable_return(regs); \ } \ \ static noinline void __##func(struct pt_regs *regs) @@ -347,8 +359,10 @@ static __always_inline void __##func(struct pt_regs *regs); \ \ __visible noinstr void func(struct pt_regs *regs) \ { \ - irqentry_state_t state = irqentry_enter(regs); \ + irqentry_state_t state; \ \ + user_pagetable_escape(regs); \ + state = irqentry_enter(regs); \ instrumentation_begin(); \ __irq_enter_raw(); \ kvm_set_cpu_l1tf_flush_l1d(); \ @@ -356,6 +370,7 @@ __visible 
noinstr void func(struct pt_regs *regs) \ __irq_exit_raw(); \ instrumentation_end(); \ irqentry_exit(regs, state); \ + user_pagetable_return(regs); \ } \ \ static __always_inline void __##func(struct pt_regs *regs) diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 31ac01c1155d..0203e73711a3 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -2037,9 +2037,11 @@ DEFINE_IDTENTRY_MCE_USER(exc_machine_check) { unsigned long dr7; + user_pagetable_exit(); dr7 = local_db_save(); run_idt(exc_machine_check_user, regs); local_db_restore(dr7); + user_pagetable_enter(); } #else /* 32bit unified entry point */ diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index 523d88c3fea1..f5d0f5d0c626 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -542,8 +542,10 @@ DEFINE_IDTENTRY_NMI(exc_nmi) __visible noinstr void exc_nmi_user(struct pt_regs *regs) { + user_pagetable_exit(); handle_nmi(regs); mds_user_clear_cpu_buffers(); + user_pagetable_enter(); } void stop_nmi(void) diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 14d2d6f15184..76db3d5a2965 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -255,11 +255,13 @@ DEFINE_IDTENTRY_RAW(exc_invalid_op) if (!user_mode(regs) && handle_bug(regs)) return; + user_pagetable_escape(regs); state = irqentry_enter(regs); instrumentation_begin(); run_idt(handle_invalid_op, regs); instrumentation_end(); irqentry_exit(regs, state); + user_pagetable_return(regs); } DEFINE_IDTENTRY(exc_coproc_segment_overrun) @@ -663,11 +665,13 @@ DEFINE_IDTENTRY_RAW(exc_int3) * including NMI. 
*/ if (user_mode(regs)) { + user_pagetable_exit(); irqentry_enter_from_user_mode(regs); instrumentation_begin(); run_idt(do_int3_user, regs); instrumentation_end(); irqentry_exit_to_user_mode(regs); + user_pagetable_enter(); } else { bool irq_state = idtentry_enter_nmi(regs); instrumentation_begin(); @@ -1001,7 +1005,9 @@ DEFINE_IDTENTRY_DEBUG(exc_debug) /* User entry, runs on regular task stack */ DEFINE_IDTENTRY_DEBUG_USER(exc_debug) { + user_pagetable_exit(); run_idt_errcode(exc_debug_user, regs, debug_read_clear_dr6()); + user_pagetable_enter(); } #else /* 32 bit does not have separate entry points. */ diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index b9d03603d95d..9ca79e86d0f0 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -1440,9 +1440,11 @@ handle_page_fault(struct pt_regs *regs, unsigned long error_code, DEFINE_IDTENTRY_RAW_ERRORCODE(exc_page_fault) { - unsigned long address = read_cr2(); + unsigned long address; irqentry_state_t state; + user_pagetable_escape(regs); + address = read_cr2(); prefetchw(&current->mm->mmap_lock); /* @@ -1466,8 +1468,10 @@ DEFINE_IDTENTRY_RAW_ERRORCODE(exc_page_fault) * The async #PF handling code takes care of idtentry handling * itself. */ - if (kvm_handle_async_pf(regs, (u32)address)) + if (kvm_handle_async_pf(regs, (u32)address)) { + user_pagetable_return(regs); return; + } /* * Entry handling for valid #PF from kernel mode is slightly @@ -1486,4 +1490,5 @@ DEFINE_IDTENTRY_RAW_ERRORCODE(exc_page_fault) instrumentation_end(); irqentry_exit(regs, state); + user_pagetable_return(regs); } -- 2.18.4