With the shadow stack and exception handling adjustments in place, we can now activate FRED when appropriate. Note that opt_fred is still disabled by default.
Introduce init_fred() to set up all the MSRs relevant for FRED. FRED uses MSR_STAR (entries from Ring3 only), and MSR_FRED_SSP_SL0 aliases MSR_PL0_SSP when CET-SS is active. Otherwise, they're all new MSRs. With init_fred() existing, load_system_tables() and legacy_syscall_init() should only be used when setting up IDT delivery. Insert ASSERT()s to this effect, and adjust the various *_init() functions to make this property true. Per the documentation, ap_early_traps_init() is responsible for switching off the boot GDT, which needs doing even in FRED mode. Finally, set CR4.FRED in {bsp,ap}_early_traps_init(). Xen can now boot in FRED mode up until starting a PV guest, where it faults because IRET is not permitted to change privilege. Signed-off-by: Andrew Cooper <andrew.coop...@citrix.com> --- CC: Jan Beulich <jbeul...@suse.com> CC: Roger Pau Monné <roger....@citrix.com> In principle we can stop allocating the IDT and TSS for CPUs now, although I want to get shutdown and kexec working before making this optimisation, in case there's something I've overlooked. --- xen/arch/x86/include/asm/current.h | 3 ++ xen/arch/x86/traps-setup.c | 78 +++++++++++++++++++++++++++--- 2 files changed, 75 insertions(+), 6 deletions(-) diff --git a/xen/arch/x86/include/asm/current.h b/xen/arch/x86/include/asm/current.h index 24d7d906a8c6..046740447db0 100644 --- a/xen/arch/x86/include/asm/current.h +++ b/xen/arch/x86/include/asm/current.h @@ -24,6 +24,9 @@ * 2 - NMI IST stack * 1 - #MC IST stack * 0 - IST Shadow Stacks (4x 1k, read-only) + * + * In FRED mode, #DB and NMI do not need special stacks, so their stacks are + * unused. */ /* diff --git a/xen/arch/x86/traps-setup.c b/xen/arch/x86/traps-setup.c index c4825fc1b11a..fdcfc7f5777d 100644 --- a/xen/arch/x86/traps-setup.c +++ b/xen/arch/x86/traps-setup.c @@ -26,6 +26,7 @@ int8_t __ro_after_init opt_fred = 0; /* -1 when supported. 
*/ boolean_param("fred", opt_fred); void nocall entry_PF(void); +void nocall entry_FRED_R3(void); void nocall lstar_enter(void); void nocall cstar_enter(void); @@ -63,6 +64,8 @@ static void load_system_tables(void) .limit = sizeof(bsp_idt) - 1, }; + ASSERT(opt_fred == 0); + /* * Set up the TSS. Warning - may be live, and the NMI/#MC must remain * valid on every instruction boundary. (Note: these are all @@ -197,6 +200,8 @@ static void legacy_syscall_init(void) unsigned char *stub_page; unsigned int offset; + ASSERT(opt_fred == 0); + /* No PV guests? No need to set up SYSCALL/SYSENTER infrastructure. */ if ( !IS_ENABLED(CONFIG_PV) ) return; @@ -274,6 +279,44 @@ static void __init init_ler(void) setup_force_cpu_cap(X86_FEATURE_XEN_LBR); } +/* + * Set up all MSRs relevant for FRED event delivery. + * + * Xen does not use any of the optional config in MSR_FRED_CONFIG, so all that + * is needed is the entrypoint. + * + * Because FRED always provides a good stack, NMI and #DB do not need any + * special treatment. Only #DF needs another stack level, and #MC for the + * off chance that Xen's main stack suffers an uncorrectable error. + * + * FRED reuses MSR_STAR to provide the segment selector values to load on + * entry from Ring3. Entry from Ring0 leaves %cs and %ss unmodified. 
+ */ +static void init_fred(void) +{ + unsigned long stack_top = get_stack_bottom() & ~(STACK_SIZE - 1); + + ASSERT(opt_fred == 1); + + wrmsrns(MSR_STAR, XEN_MSR_STAR); + wrmsrns(MSR_FRED_CONFIG, (unsigned long)entry_FRED_R3); + + wrmsrns(MSR_FRED_RSP_SL0, (unsigned long)(&get_cpu_info()->_fred + 1)); + wrmsrns(MSR_FRED_RSP_SL1, 0); + wrmsrns(MSR_FRED_RSP_SL2, stack_top + (1 + IST_MCE) * PAGE_SIZE); + wrmsrns(MSR_FRED_RSP_SL3, stack_top + (1 + IST_DF) * PAGE_SIZE); + wrmsrns(MSR_FRED_STK_LVLS, ((2UL << (X86_EXC_MC * 2)) | + (3UL << (X86_EXC_DF * 2)))); + + if ( cpu_has_xen_shstk ) + { + wrmsrns(MSR_FRED_SSP_SL0, stack_top + (PRIMARY_SHSTK_SLOT + 1) * PAGE_SIZE); + wrmsrns(MSR_FRED_SSP_SL1, 0); + wrmsrns(MSR_FRED_SSP_SL2, stack_top + (IST_MCE * IST_SHSTK_SIZE)); + wrmsrns(MSR_FRED_SSP_SL3, stack_top + (IST_DF * IST_SHSTK_SIZE)); + } +} + /* * Configure basic exception handling. This is prior to parsing the command * line or configuring a console, and needs to be as simple as possible. @@ -331,15 +374,18 @@ void __init traps_init(void) printk(XENLOG_INFO "Disabling PV32 due to FRED\n"); } #endif + init_fred(); + set_in_cr4(X86_CR4_FRED); + printk("Using FRED event delivery\n"); } else { + load_system_tables(); + printk("Using IDT event delivery\n"); } - load_system_tables(); - init_ler(); /* Cache {,compat_}gdt_l1e now that physically relocation is done. 
*/ @@ -357,8 +403,13 @@ void __init traps_init(void) */ void bsp_traps_reinit(void) { - load_system_tables(); - percpu_traps_init(); + if ( opt_fred ) + init_fred(); + else + { + load_system_tables(); + percpu_traps_init(); + } } /* @@ -367,7 +418,8 @@ void bsp_traps_reinit(void) */ void percpu_traps_init(void) { - legacy_syscall_init(); + if ( !opt_fred ) + legacy_syscall_init(); if ( cpu_has_xen_lbr ) wrmsrl(MSR_IA32_DEBUGCTLMSR, IA32_DEBUGCTLMSR_LBR); @@ -382,7 +434,21 @@ void percpu_traps_init(void) */ void asmlinkage ap_early_traps_init(void) { - load_system_tables(); + if ( opt_fred ) + { + const seg_desc_t *gdt = this_cpu(gdt) - FIRST_RESERVED_GDT_ENTRY; + const struct desc_ptr gdtr = { + .base = (unsigned long)gdt, + .limit = LAST_RESERVED_GDT_BYTE, + }; + + lgdt(&gdtr); + + init_fred(); + write_cr4(read_cr4() | X86_CR4_FRED); + } + else + load_system_tables(); } static void __init __maybe_unused build_assertions(void) -- 2.39.5