Most MSR accesses have compile-time constant indexes.  By using the
immediate form when available, the decoder can start issuing uops directly
for the relevant MSR, rather than having to issue uops to implement
"switch (%ecx)".  Modern CPUs have tens of thousands of MSRs, so that's
quite an if/else chain.
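In outline, the dispatch this patch adds looks like the sketch below
(condensed from the msr.h hunk; rdmsr_legacy() is a placeholder name for
the existing %ecx-indexed path, not an identifier in the patch):

    static always_inline uint64_t rdmsr(unsigned int msr)
    {
        if ( __builtin_constant_p(msr) )    /* index known at compile time */
            return __rdmsr_imm(msr);        /* immediate-operand form */

        return rdmsr_legacy(msr);           /* placeholder: the existing
                                               %ecx-indexed implementation */
    }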
Create __{rdmsr,wrmsrns}_imm() helpers and use them from {rdmsr,wrmsrns}()
when the compiler can determine that the MSR index is known at compile
time.

At the instruction level, the combined ABI is awkward.  Explain our choices
in detail.

Signed-off-by: Andrew Cooper <andrew.coop...@citrix.com>
---
CC: Jan Beulich <jbeul...@suse.com>
CC: Roger Pau Monné <roger....@citrix.com>

The expression wrmsrns(MSR_STAR, rdmsr(MSR_STAR)) now yields:

  <test_star>:
      b9 81 00 00 c0          mov    $0xc0000081,%ecx
      0f 32                   rdmsr
      48 c1 e2 20             shl    $0x20,%rdx
      48 09 d0                or     %rdx,%rax
      48 89 c2                mov    %rax,%rdx
      48 c1 ea 20             shr    $0x20,%rdx
      2e 0f 30                cs wrmsr
      e9 a3 84 e8 ff          jmp    ffff82d040204260 <__x86_return_thunk>

which is as good as we can manage.  The alternative form of this looks
like:

  <test_star>:
      b9 81 00 00 c0          mov    $0xc0000081,%ecx
      c4 e7 7b f6 c0 81 00    rdmsr  $0xc0000081,%rax
      00 c0
      2e c4 e7 7a f6 c0 81    cs wrmsrns %rax,$0xc0000081
      00 00 c0
      e9 xx xx xx xx          jmp    ffff82d040204260 <__x86_return_thunk>

Still TBD.  We ought to update the *_safe() forms too.  rdmsr_safe() is
easier because the potential #GP locations line up, but there need to be
two variants because of

v2:
 * Let the compiler do %ecx setup
 * Add RDMSR $imm too
---
 xen/arch/x86/include/asm/alternative.h      |  7 ++
 xen/arch/x86/include/asm/msr.h              | 86 ++++++++++++++++++++-
 xen/include/public/arch-x86/cpufeatureset.h |  1 +
 3 files changed, 92 insertions(+), 2 deletions(-)

diff --git a/xen/arch/x86/include/asm/alternative.h b/xen/arch/x86/include/asm/alternative.h
index 0482bbf7cbf1..fe87b15ec72c 100644
--- a/xen/arch/x86/include/asm/alternative.h
+++ b/xen/arch/x86/include/asm/alternative.h
@@ -151,6 +151,13 @@ extern void alternative_instructions(void);
         ALTERNATIVE(oldinstr, newinstr, feature)                \
         :: input )
 
+#define alternative_input_2(oldinstr, newinstr1, feature1,      \
+                            newinstr2, feature2, input...)      \
+    asm_inline volatile (                                       \
+        ALTERNATIVE_2(oldinstr, newinstr1, feature1,            \
+                      newinstr2, feature2)                      \
+        :: input )
+
 /* Like alternative_input, but with a single output argument */
 #define alternative_io(oldinstr, newinstr, feature, output, input...) \
     asm_inline volatile (                                       \
diff --git a/xen/arch/x86/include/asm/msr.h b/xen/arch/x86/include/asm/msr.h
index 1bd27b989a4d..2ceff6cca8bb 100644
--- a/xen/arch/x86/include/asm/msr.h
+++ b/xen/arch/x86/include/asm/msr.h
@@ -29,10 +29,52 @@
  *     wrmsrl(MSR_FOO, val);
  */
 
-static inline uint64_t rdmsr(unsigned int msr)
+/*
+ * RDMSR with a compile-time constant index, when available.  Falls back to
+ * plain RDMSR.
+ */
+static always_inline uint64_t __rdmsr_imm(uint32_t msr)
+{
+    uint64_t val;
+
+    /*
+     * For best performance, RDMSR $msr, %r64 is recommended.  For
+     * compatibility, we need to fall back to plain RDMSR.
+     *
+     * The combined ABI is awkward, because RDMSR $imm produces an r64,
+     * whereas plain RDMSR produces a split edx:eax pair.
+     *
+     * Always use RDMSR $imm, %rax, because it has the most in common with
+     * the legacy form.  When MSR_IMM isn't available, emit logic to fold
+     * %edx back into %rax.
+     *
+     * Let the compiler do %ecx setup.  This does mean there's a useless
+     * `mov $imm, %ecx` in the instruction stream in the MSR_IMM case, but
+     * it means the compiler can de-duplicate the setup in the common case
+     * of reading and writing the same MSR.
+     */
+    alternative_io(
+        "rdmsr\n\t"
+        "shl $32, %%rdx\n\t"
+        "or %%rdx, %%rax\n\t",
+
+        /* RDMSR $msr, %rax */
+        ".byte 0xc4,0xe7,0x7b,0xf6,0xc0; .long %c[msr]", X86_FEATURE_MSR_IMM,
+
+        "=a" (val),
+
+        [msr] "i" (msr), "c" (msr) : "rdx");
+
+    return val;
+}
+
+static always_inline uint64_t rdmsr(unsigned int msr)
 {
     unsigned long lo, hi;
 
+    if ( __builtin_constant_p(msr) )
+        return __rdmsr_imm(msr);
+
     asm volatile ( "rdmsr" : "=a" (lo), "=d" (hi) : "c" (msr) );
@@ -55,11 +97,51 @@ static inline void wrmsr(unsigned int msr, uint64_t val)
 }
 #define wrmsrl(msr, val) wrmsr(msr, val)
 
+/*
+ * Non-serialising WRMSR with a compile-time constant index, when available.
+ * Falls back to plain WRMSRNS, or to a serialising WRMSR.
+ */
+static always_inline void __wrmsrns_imm(uint32_t msr, uint64_t val)
+{
+    /*
+     * For best performance, WRMSRNS %r64, $msr is recommended.  For
+     * compatibility, we need to fall back to plain WRMSRNS, or to WRMSR.
+     *
+     * The combined ABI is awkward, because WRMSRNS $imm takes a single
+     * r64, whereas WRMSR{,NS} takes a split edx:eax pair.
+     *
+     * Always use WRMSRNS %rax, $imm, because it has the most in common
+     * with the legacy forms.  When MSR_IMM isn't available, emit setup
+     * logic for %edx.
+     *
+     * Let the compiler do %ecx setup.  This does mean there's a useless
+     * `mov $imm, %ecx` in the instruction stream in the MSR_IMM case, but
+     * it means the compiler can de-duplicate the setup in the common case
+     * of reading and writing the same MSR.
+     */
+    alternative_input_2(
+        "mov %%rax, %%rdx\n\t"
+        "shr $32, %%rdx\n\t"
+        ".byte 0x2e; wrmsr",
+
+        /* CS WRMSRNS %rax, $msr */
+        ".byte 0x2e,0xc4,0xe7,0x7a,0xf6,0xc0; .long %c[msr]", X86_FEATURE_MSR_IMM,
+
+        "mov %%rax, %%rdx\n\t"
+        "shr $32, %%rdx\n\t"
+        ".byte 0x0f,0x01,0xc6", X86_FEATURE_WRMSRNS,
+
+        [msr] "i" (msr), "a" (val), "c" (msr) : "rdx");
+}
+
 /* Non-serialising WRMSR, when available.  Falls back to a serialising WRMSR. */
-static inline void wrmsrns(uint32_t msr, uint64_t val)
+static always_inline void wrmsrns(uint32_t msr, uint64_t val)
 {
     uint32_t lo = val, hi = val >> 32;
 
+    if ( __builtin_constant_p(msr) )
+        return __wrmsrns_imm(msr, val);
+
     /*
      * WRMSR is 2 bytes.  WRMSRNS is 3 bytes.  Pad WRMSR with a redundant CS
      * prefix to avoid a trailing NOP.
diff --git a/xen/include/public/arch-x86/cpufeatureset.h b/xen/include/public/arch-x86/cpufeatureset.h
index f7312e0b04e7..990b1d13f301 100644
--- a/xen/include/public/arch-x86/cpufeatureset.h
+++ b/xen/include/public/arch-x86/cpufeatureset.h
@@ -349,6 +349,7 @@ XEN_CPUFEATURE(MCDT_NO,           13*32+ 5) /*A  MCDT_NO */
 XEN_CPUFEATURE(UC_LOCK_DIS,       13*32+ 6) /*   UC-lock disable */
 
 /* Intel-defined CPU features, CPUID level 0x00000007:1.ecx, word 14 */
+XEN_CPUFEATURE(MSR_IMM,           14*32+ 5) /*   {RD,WR}MSR $imm32 */
 
 /* Intel-defined CPU features, CPUID level 0x00000007:1.edx, word 15 */
 XEN_CPUFEATURE(AVX_VNNI_INT8,     15*32+ 4) /*A  AVX-VNNI-INT8 Instructions */
-- 
2.39.5
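
For reference, the <test_star> dumps quoted in the notes above correspond
to a caller of this shape (illustrative; MSR_STAR is 0xc0000081):

    static void test_star(void)
    {
        /* Read-modify-write of the same MSR, both indexes constant. */
        wrmsrns(MSR_STAR, rdmsr(MSR_STAR));
    }

With X86_FEATURE_MSR_IMM, the alternatives patch both accesses to the
immediate forms in the second dump; otherwise the shl/or fold and mov/shr
split from the first dump are used.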