This is an RFC in which I want to bring up an option for reducing the generated code size by 6 bytes on average for *every* memory access inside the guest. This applies to nearly every guest target on an x86 host. Other hosts may benefit too, but I didn't check.
In include/exec/cpu-defs.h the maximum number of MMU modes is defined as: #define NB_MMU_MODES 16 Most targets use fewer MMU indexes and define them similar to this: #define MMU_KERNEL_IDX 0 #define MMU_USER_IDX 1 #define MMU_PHYS_IDX 2 or, e.g. on ppc, by a function which gives some number in the range 0-7: static inline int cpu_mmu_index(CPUPPCState *env, bool ifetch) { #ifdef CONFIG_USER_ONLY return MMU_USER_IDX; /* MMU_USER_IDX is 0 */ #else return (env->hflags >> (ifetch ? HFLAGS_IMMU_IDX : HFLAGS_DMMU_IDX)) & 7; #endif } For every memory access in the guest, TCG generates a CPU TLB lookup in the fast path, e.g. for an x86 guest on an x86 host (only relevant part shown below): IN: 0x000ebdf5: 8b 04 24 movl (%esp), %eax OUT: ... 0x003619: 48 23 bd 10 ff ff ff andq -0xf0(%rbp), %rdi 0x003620: 48 03 bd 18 ff ff ff addq -0xe8(%rbp), %rdi ... As can be seen, the TLB mask entry is accessed with a negative offset. By re-defining the MMU indices to become 15,14,13 instead: #define MMU_KERNEL_IDX (NB_MMU_MODES - 1) #define MMU_USER_IDX (NB_MMU_MODES - 2) #define MMU_PHYS_IDX (NB_MMU_MODES - 3) the magnitude of the (negative) offset is smaller, so it fits into a signed 8-bit displacement instead of a 32-bit one, and the x86-64 tcg will generate a 4-byte (instead of 7-byte) instruction: OUT: ... 0x003499: 48 23 7d c0 andq -0x40(%rbp), %rdi 0x00349d: 48 03 7d c8 addq -0x38(%rbp), %rdi So, every memory access in the guest saves 6 bytes (=2 * 3 bytes) of instruction code in the fast path. Looking at the instruction address offsets (0x003619 vs. 0x003499), this has already saved 0x180 (384 decimal) bytes by that point. The first instruction was at offset 0x000000, so an overall instruction size reduction of ~3% can be seen. To reproduce I used this command: ./qemu-system-x86_64 -cdrom ./debian-12.1.0-amd64-netinst.iso \ -boot d -nographic -d in_asm,out_asm Do we want to enable such a performance optimization? 
If so, I see two possibilities: a) Re-define NB_MMU_MODES per target, with NB_MMU_MODES becoming the highest MMU index needed for that target. This will probably give another small performance improvement as the flush loop will become shorter. b) Increase the MMU index per target as shown in the patch below. The patch below covers x86, ppc, alpha, hppa. For arm, I believe it's sufficient to change ARM_MMU_IDX_M_PRIV=>0xf, ARM_MMU_IDX_M_NEGPRI=>0xe, and ARM_MMU_IDX_M_S=>0xd. Opinions? Helge diff --git a/target/i386/cpu.h b/target/i386/cpu.h index e0771a1043..d4aa6e7bee 100644 --- a/target/i386/cpu.h +++ b/target/i386/cpu.h @@ -2251,11 +2251,11 @@ uint64_t cpu_get_tsc(CPUX86State *env); #define cpu_list x86_cpu_list /* MMU modes definitions */ -#define MMU_KSMAP_IDX 0 -#define MMU_USER_IDX 1 -#define MMU_KNOSMAP_IDX 2 -#define MMU_NESTED_IDX 3 -#define MMU_PHYS_IDX 4 +#define MMU_KSMAP_IDX (NB_MMU_MODES - 1) +#define MMU_USER_IDX (NB_MMU_MODES - 2) +#define MMU_KNOSMAP_IDX (NB_MMU_MODES - 3) +#define MMU_NESTED_IDX (NB_MMU_MODES - 4) +#define MMU_PHYS_IDX (NB_MMU_MODES - 5) static inline int cpu_mmu_index(CPUX86State *env, bool ifetch) { diff --git a/target/ppc/cpu.h b/target/ppc/cpu.h index 25fac9577a..a2a56781eb 100644 --- a/target/ppc/cpu.h +++ b/target/ppc/cpu.h @@ -1474,13 +1474,14 @@ int ppc_dcr_write(ppc_dcr_t *dcr_env, int dcrn, uint32_t val); #define cpu_list ppc_cpu_list /* MMU modes definitions */ -#define MMU_USER_IDX 0 +#define MMU_USER_IDX (NB_MMU_MODES - 1) static inline int cpu_mmu_index(CPUPPCState *env, bool ifetch) { #ifdef CONFIG_USER_ONLY return MMU_USER_IDX; #else - return (env->hflags >> (ifetch ? HFLAGS_IMMU_IDX : HFLAGS_DMMU_IDX)) & 7; + return NB_MMU_MODES - 2 + - ((env->hflags >> (ifetch ? 
HFLAGS_IMMU_IDX : HFLAGS_DMMU_IDX)) & 7); #endif } diff --git a/target/alpha/cpu.h b/target/alpha/cpu.h index 13306665af..f25cf33e25 100644 --- a/target/alpha/cpu.h +++ b/target/alpha/cpu.h @@ -194,9 +194,9 @@ enum { PALcode cheats and uses the KSEG mapping for its code+data rather than physical addresses. */ -#define MMU_KERNEL_IDX 0 -#define MMU_USER_IDX 1 -#define MMU_PHYS_IDX 2 +#define MMU_KERNEL_IDX (NB_MMU_MODES - 1) +#define MMU_USER_IDX (NB_MMU_MODES - 2) +#define MMU_PHYS_IDX (NB_MMU_MODES - 3) typedef struct CPUArchState { uint64_t ir[31]; diff --git a/target/hppa/cpu.h b/target/hppa/cpu.h index 75c5c0ccf7..1c09602d0b 100644 --- a/target/hppa/cpu.h +++ b/target/hppa/cpu.h @@ -30,9 +30,9 @@ basis. It's probably easier to fall back to a strong memory model. */ #define TCG_GUEST_DEFAULT_MO TCG_MO_ALL -#define MMU_KERNEL_IDX 0 -#define MMU_USER_IDX 3 -#define MMU_PHYS_IDX 4 +#define MMU_KERNEL_IDX (NB_MMU_MODES - 1) +#define MMU_USER_IDX (NB_MMU_MODES - 2) +#define MMU_PHYS_IDX (NB_MMU_MODES - 3) #define TARGET_INSN_START_EXTRA_WORDS 1 /* Hardware exceptions, interrupts, faults, and traps. */