When FRED is active, hardware automatically swaps GS when changing privilege, and the SWAPGS instruction is disallowed.
For native OSes using GS as the thread local pointer this is a massive improvement on the pre-FRED architecture, but under Xen it makes handling PV guests more complicated. Specifically, it means that GS_BASE and GS_SHADOW are the opposite way around in FRED mode, as opposed to IDT mode. This leads to the following changes: * In load_segments(), we have to load both GSes. Account for this in the SWAP() condition and avoid the path with SWAPGS. * In save_segments(), we need to read GS_KERN rather than GS_BASE. * In toggle_guest_mode(), we need to emulate SWAPGS. * In do_set_segment_base(), merge the SEGBASE_GS_{USER,KERNEL} cases and take FRED into account when choosing which base to update. SEGBASE_GS_USER_SEL was already an LKGS invocation (decades before FRED) so under FRED needs to be a simple MOV %gs. Simply skip the SWAPGSes. Signed-off-by: Andrew Cooper <andrew.coop...@citrix.com> --- CC: Jan Beulich <jbeul...@suse.com> CC: Roger Pau Monné <roger....@citrix.com> v2: * New I think this functions, but it's not ideal. The conditions are asymmetric and awkward. In principle, MSR_IMM can be as performant as FSGSBASE. They can literally be the same microcode if the microline indexing allows. Otherwise, the FSGSBASE instructions will be more performant than MSR accesses (no need to decode %ecx), even with non-serialising writes (which all FRED hardware should have). However, use of FSGSBASE often comes with SWAPGS and that can't be used under FRED. --- xen/arch/x86/domain.c | 22 +++++++++++++++++----- xen/arch/x86/pv/domain.c | 22 ++++++++++++++++++++-- xen/arch/x86/pv/misc-hypercalls.c | 16 ++++++++++------ 3 files changed, 47 insertions(+), 13 deletions(-) diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c index 8089ff929bf7..64922869a625 100644 --- a/xen/arch/x86/domain.c +++ b/xen/arch/x86/domain.c @@ -1819,9 +1819,10 @@ static void load_segments(struct vcpu *n) /* * Figure out which way around gsb/gss want to be. 
gsb needs to be - * the active context, and gss needs to be the inactive context. + * the active context, and gss needs to be the inactive context, + * unless we're in FRED mode where they're reversed. */ - if ( !(n->arch.flags & TF_kernel_mode) ) + if ( !(n->arch.flags & TF_kernel_mode) ^ opt_fred ) SWAP(gsb, gss); if ( using_svm() && (n->arch.pv.fs | n->arch.pv.gs) <= 3 ) @@ -1842,7 +1843,9 @@ static void load_segments(struct vcpu *n) if ( !fs_gs_done && !compat ) { - if ( read_cr4() & X86_CR4_FSGSBASE ) + unsigned long cr4 = read_cr4(); + + if ( !(cr4 & X86_CR4_FRED) && (cr4 & X86_CR4_FSGSBASE) ) { __wrgsbase(gss); __wrfsbase(n->arch.pv.fs_base); @@ -1959,6 +1962,9 @@ static void load_segments(struct vcpu *n) * Guests however cannot use SWAPGS, so there is no mechanism to modify the * inactive GS base behind Xen's back. Therefore, Xen's copy of the inactive * GS base is still accurate, and doesn't need reading back from hardware. + * + * Under FRED, hardware automatically swaps GS for us, so GS_KERN is the + * active GS from the guest's point of view. 
*/ static void save_segments(struct vcpu *v) { @@ -1974,12 +1980,18 @@ static void save_segments(struct vcpu *v) if ( read_cr4() & X86_CR4_FSGSBASE ) { fs_base = __rdfsbase(); - gs_base = __rdgsbase(); + if ( opt_fred ) + gs_base = rdmsr(MSR_SHADOW_GS_BASE); + else + gs_base = __rdgsbase(); } else { fs_base = rdmsr(MSR_FS_BASE); - gs_base = rdmsr(MSR_GS_BASE); + if ( opt_fred ) + gs_base = rdmsr(MSR_SHADOW_GS_BASE); + else + gs_base = rdmsr(MSR_GS_BASE); } v->arch.pv.fs_base = fs_base; diff --git a/xen/arch/x86/pv/domain.c b/xen/arch/x86/pv/domain.c index 9c4785c187dd..5a7b69da5000 100644 --- a/xen/arch/x86/pv/domain.c +++ b/xen/arch/x86/pv/domain.c @@ -14,9 +14,10 @@ #include <asm/cpufeature.h> #include <asm/fsgsbase.h> #include <asm/invpcid.h> -#include <asm/spec_ctrl.h> #include <asm/pv/domain.h> #include <asm/shadow.h> +#include <asm/spec_ctrl.h> +#include <asm/traps.h> #ifdef CONFIG_PV32 int8_t __read_mostly opt_pv32 = -1; @@ -480,11 +481,28 @@ void toggle_guest_mode(struct vcpu *v) * subsequent context switch won't bother re-reading it. */ gs_base = read_gs_base(); + + /* + * In FRED mode, not only are the two GSes the other way around (i.e. we + * want to read GS_KERN here), the SWAPGS instruction is disallowed so we + * have to emulate it. 
+ */ + if ( opt_fred ) + { + unsigned long gs_kern = rdmsr(MSR_SHADOW_GS_BASE); + + wrmsrns(MSR_SHADOW_GS_BASE, gs_base); + write_gs_base(gs_kern); + + gs_base = gs_kern; + } + else + asm volatile ( "swapgs" ); + if ( v->arch.flags & TF_kernel_mode ) v->arch.pv.gs_base_kernel = gs_base; else v->arch.pv.gs_base_user = gs_base; - asm volatile ( "swapgs" ); _toggle_guest_pt(v); diff --git a/xen/arch/x86/pv/misc-hypercalls.c b/xen/arch/x86/pv/misc-hypercalls.c index 4c2abeb4add8..2c9cf50638db 100644 --- a/xen/arch/x86/pv/misc-hypercalls.c +++ b/xen/arch/x86/pv/misc-hypercalls.c @@ -11,6 +11,7 @@ #include <asm/debugreg.h> #include <asm/fsgsbase.h> +#include <asm/traps.h> long do_set_debugreg(int reg, unsigned long value) { @@ -192,11 +193,12 @@ long do_set_segment_base(unsigned int which, unsigned long base) case SEGBASE_GS_USER: v->arch.pv.gs_base_user = base; - write_gs_shadow(base); - break; - + fallthrough; case SEGBASE_GS_KERNEL: - write_gs_base(base); + if ( (which == SEGBASE_GS_KERNEL) ^ opt_fred ) + write_gs_base(base); + else + write_gs_shadow(base); break; } break; @@ -209,7 +211,8 @@ long do_set_segment_base(unsigned int which, unsigned long base) * We wish to update the user %gs from the GDT/LDT. Currently, the * guest kernel's GS_BASE is in context. */ - asm volatile ( "swapgs" ); + if ( !opt_fred ) + asm volatile ( "swapgs" ); if ( sel > 3 ) /* Fix up RPL for non-NUL selectors. */ @@ -247,7 +250,8 @@ long do_set_segment_base(unsigned int which, unsigned long base) /* Update the cache of the inactive base, as read from the GDT/LDT. */ v->arch.pv.gs_base_user = read_gs_base(); - asm volatile ( safe_swapgs ); + if ( !opt_fred ) + asm volatile ( safe_swapgs ); break; } -- 2.39.5