[RFC][PATCH] Reduce generated code by 3% by increasing MMU indices

Helge Deller Sat, 05 Aug 2023 09:47:53 -0700

This is a RFC, in which I want to bring up an option on how to reduce
the generated code size by in average 6 bytes for *every* memory-access
inside the guest. This applies to nearly every guest target on a
x86 host. Other hosts may benefit too, but I didn't checked.

In include/exec/cpu-defs.h the maximum number of MMU modes is
defined as:
#define NB_MMU_MODES 16

Most targets use less MMU indexes and define them similiar to this:
#define MMU_KERNEL_IDX   0
#define MMU_USER_IDX     1
#define MMU_PHYS_IDX     2

or, e.g. on ppc, by a function which gives some number in the range 0-7:

static inline int cpu_mmu_index(CPUPPCState *env, bool ifetch)
{
#ifdef CONFIG_USER_ONLY
    return MMU_USER_IDX;  /* MMU_USER_IDX is 0 */
#else
    return (env->hflags >> (ifetch ? HFLAGS_IMMU_IDX : HFLAGS_DMMU_IDX)) & 7;
#endif
}

When looking at the generated code for every memory-access in the guest,
the tcg generates a CPU TLB lookup in the fast path, e.g. for a
x86 guest on x86 host (only relevant part shown below):

IN:
0x000ebdf5:  8b 04 24                 movl     (%esp), %eax

OUT:
...
0x003619:  48 23 bd 10 ff ff ff     andq     -0xf0(%rbp), %rdi
0x003620:  48 03 bd 18 ff ff ff     addq     -0xe8(%rbp), %rdi
...

As can be seen, the TLB mask entry is accessed with a negative offset.

By re-defining the MMU indices to become 15,14,13 instead:

#define MMU_KERNEL_IDX   (NB_MMU_MODES - 1)
#define MMU_USER_IDX     (NB_MMU_MODES - 2)
#define MMU_PHYS_IDX     (NB_MMU_MODES - 3)

the (negative) offset is smaller, and the x86-64 tcg will generate
a 4-byte (instead of 7-byte) instruction:
OUT:
...
0x003499:  48 23 7d c0              andq     -0x40(%rbp), %rdi
0x00349d:  48 03 7d c8              addq     -0x38(%rbp), %rdi

So, every memory acces in the guest saves 6 bytes (=2 * 3 bytes)
of instruction code in the fast path.

Looking at the instruction address offsets (0x003619 vs. 0x003499)
it already saved 0x180 (384 decimal) bytes.
The first instruction was at offset 0x000000, so an overall
instruction size reduction of ~3% can be seen.

To reproduce I used this command:
./qemu-system-x86_64 -cdrom ./debian-12.1.0-amd64-netinst.iso \
        -boot d -nographic -d in_asm,out_asm


Do we want to enable such an performance optimization?
If so, I see two possibilities:

a) Re-define NB_MMU_MODES per target, with NB_MMU_MODES becoming
the highest MMU index needed for that target. This will probably
give another small performance improvement as the flush loop will
become shorter.

b) Increase the MMU index per target as shown in the patch below.
The patch below covers x86, ppc, alpha, hppa.
For arm, I believe it's sufficient to change ARM_MMU_IDX_M_PRIV=>0xf,
ARM_MMU_IDX_M_NEGPRI=0xe, and ARM_MMU_IDX_M_S=>0xd.

Opinions?

Helge



diff --git a/target/i386/cpu.h b/target/i386/cpu.h
index e0771a1043..d4aa6e7bee 100644
--- a/target/i386/cpu.h
+++ b/target/i386/cpu.h
@@ -2251,11 +2251,11 @@ uint64_t cpu_get_tsc(CPUX86State *env);
 #define cpu_list x86_cpu_list

 /* MMU modes definitions */
-#define MMU_KSMAP_IDX   0
-#define MMU_USER_IDX    1
-#define MMU_KNOSMAP_IDX 2
-#define MMU_NESTED_IDX  3
-#define MMU_PHYS_IDX    4
+#define MMU_KSMAP_IDX   (NB_MMU_MODES - 1)
+#define MMU_USER_IDX    (NB_MMU_MODES - 2)
+#define MMU_KNOSMAP_IDX (NB_MMU_MODES - 3)
+#define MMU_NESTED_IDX  (NB_MMU_MODES - 4)
+#define MMU_PHYS_IDX    (NB_MMU_MODES - 5)

 static inline int cpu_mmu_index(CPUX86State *env, bool ifetch)
 {
diff --git a/target/ppc/cpu.h b/target/ppc/cpu.h
index 25fac9577a..a2a56781eb 100644
--- a/target/ppc/cpu.h
+++ b/target/ppc/cpu.h
@@ -1474,13 +1474,14 @@ int ppc_dcr_write(ppc_dcr_t *dcr_env, int dcrn, 
uint32_t val);
 #define cpu_list ppc_cpu_list

 /* MMU modes definitions */
-#define MMU_USER_IDX 0
+#define MMU_USER_IDX (NB_MMU_MODES - 1)
 static inline int cpu_mmu_index(CPUPPCState *env, bool ifetch)
 {
 #ifdef CONFIG_USER_ONLY
     return MMU_USER_IDX;
 #else
-    return (env->hflags >> (ifetch ? HFLAGS_IMMU_IDX : HFLAGS_DMMU_IDX)) & 7;
+    return NB_MMU_MODES - 2
+           - ((env->hflags >> (ifetch ? HFLAGS_IMMU_IDX : HFLAGS_DMMU_IDX)) & 
7);
 #endif
 }

diff --git a/target/alpha/cpu.h b/target/alpha/cpu.h
index 13306665af..f25cf33e25 100644
--- a/target/alpha/cpu.h
+++ b/target/alpha/cpu.h
@@ -194,9 +194,9 @@ enum {
    PALcode cheats and uses the KSEG mapping for its code+data rather than
    physical addresses.  */

-#define MMU_KERNEL_IDX   0
-#define MMU_USER_IDX     1
-#define MMU_PHYS_IDX     2
+#define MMU_KERNEL_IDX   (NB_MMU_MODES - 1)
+#define MMU_USER_IDX     (NB_MMU_MODES - 2)
+#define MMU_PHYS_IDX     (NB_MMU_MODES - 3)

 typedef struct CPUArchState {
     uint64_t ir[31];
diff --git a/target/hppa/cpu.h b/target/hppa/cpu.h
index 75c5c0ccf7..1c09602d0b 100644
--- a/target/hppa/cpu.h
+++ b/target/hppa/cpu.h
@@ -30,9 +30,9 @@
    basis.  It's probably easier to fall back to a strong memory model.  */
 #define TCG_GUEST_DEFAULT_MO        TCG_MO_ALL

-#define MMU_KERNEL_IDX   0
-#define MMU_USER_IDX     3
-#define MMU_PHYS_IDX     4
+#define MMU_KERNEL_IDX   (NB_MMU_MODES - 1)
+#define MMU_USER_IDX     (NB_MMU_MODES - 2)
+#define MMU_PHYS_IDX     (NB_MMU_MODES - 3)
 #define TARGET_INSN_START_EXTRA_WORDS 1

 /* Hardware exceptions, interrupts, faults, and traps.  */

[RFC][PATCH] Reduce generated code by 3% by increasing MMU indices

Reply via email to