Sorry for the long absence; this patch submission had slipped my mind until now. I have rebased the patch onto the latest QEMU git repository and re-measured the performance improvement data. Please review when you have time.
Xin

On Mon, Aug 4, 2014 at 8:35 PM, Xin Tong <trent.t...@gmail.com> wrote:
> QEMU system-mode page table walks are expensive. Measured by running
> qemu-system-x86_64 in system mode under Intel PIN, a TLB miss followed
> by a walk of the 4-level page tables of a guest Linux OS takes ~450 x86
> instructions on average.
>
> The QEMU system-mode TLB is implemented as a directly-mapped hash
> table, a structure that suffers from conflict misses. Increasing the
> associativity of the TLB is not necessarily the answer to conflict
> misses, since all the ways may have to be searched serially.
>
> A victim TLB holds translations evicted from the primary TLB upon
> replacement. It sits between the main TLB and its refill path and has
> greater associativity (fully associative in this patch). Looking up the
> victim TLB takes longer than an inline TLB hit, but it is still much
> cheaper than a full page table walk. The memory translation path
> changes as follows:
>
> Before the victim TLB:
> 1. Inline TLB lookup.
> 2. Exit the code cache on a TLB miss.
> 3. Check for unaligned and I/O accesses.
> 4. TLB refill.
> 5. Do the memory access.
> 6. Return to the code cache.
>
> After the victim TLB:
> 1. Inline TLB lookup.
> 2. Exit the code cache on a TLB miss.
> 3. Check for unaligned and I/O accesses.
> 4. Victim TLB lookup.
> 5. On a victim TLB miss, TLB refill.
> 6. Do the memory access.
> 7. Return to the code cache.
>
> The advantage is that the victim TLB adds associativity to the
> directly-mapped TLB, and thus potentially saves page table walks, while
> still keeping the time needed to flush it within reasonable limits.
> However, placing the victim TLB in front of the refill path lengthens
> that path, because the victim TLB is consulted before every TLB refill.
> The performance results show that the pros outweigh the cons.
>
> Performance results taken on the SPECINT2006 train data sets, a kernel
> boot and the qemu configure script, measured on an Intel(R) Xeon(R) CPU
> E5620 @ 2.40GHz Linux machine, are shown in the Google Doc linked
> below:
>
> https://docs.google.com/spreadsheets/d/1eiItzekZwNQOal_h-5iJmC4tMDi051m9qidi5_nwvH4/edit?usp=sharing
>
> In summary, the victim TLB improves the performance of
> qemu-system-x86_64 by 11% on average across SPECINT2006, the kernel
> boot and the qemu configure script, with the largest single improvement
> being 26% on 456.hmmer. The victim TLB does not cause a performance
> degradation in any of the measured benchmarks. Furthermore, the
> implementation is architecture independent and is expected to benefit
> the other target architectures in QEMU as well.
>
> Although there are measurement fluctuations, the performance
> improvement is significant and well outside the range of measurement
> noise.
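
To make the new path above concrete for reviewers, here is a small,
self-contained C sketch of the idea. This is illustrative only, not QEMU
code: tlb_entry, lookup_translation, install_translation, flush_all,
TLB_SIZE and VTLB_SIZE are made-up names and sizes for the example; the
patch itself works on CPUTLBEntry, env->tlb_table[], env->tlb_v_table[]
and the VICTIM_TLB_HIT() macro shown in the diff below.

/* Illustrative sketch only -- simplified stand-ins for the QEMU structures. */
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

#define PAGE_BITS 12
#define PAGE_MASK (~(((uint64_t)1 << PAGE_BITS) - 1))
#define TLB_SIZE  256                /* directly-mapped primary TLB */
#define VTLB_SIZE 8                  /* fully associative victim TLB */

typedef struct {
    uint64_t vpage;                  /* tag: guest virtual page, -1 when invalid */
    uint64_t addend;                 /* host address minus guest address */
} tlb_entry;

static tlb_entry tlb[TLB_SIZE];
static tlb_entry vtlb[VTLB_SIZE];
static unsigned  vtlb_next;          /* round-robin slot, like env->vtlb_index */

/* Invalidate everything, as tlb_flush() does with memset(..., -1, ...). */
static void flush_all(void)
{
    memset(tlb, -1, sizeof(tlb));
    memset(vtlb, -1, sizeof(vtlb));
    vtlb_next = 0;
}

/* Lookup path (steps 1 and 4 of the new path).  A victim hit swaps the
 * primary and victim entries so the hot translation moves back into the
 * directly-mapped TLB, which is what VICTIM_TLB_HIT() does below. */
static bool lookup_translation(uint64_t vaddr, uint64_t *addend)
{
    uint64_t vpage = vaddr & PAGE_MASK;
    size_t index = (vaddr >> PAGE_BITS) & (TLB_SIZE - 1);

    if (tlb[index].vpage == vpage) {            /* primary TLB hit */
        *addend = tlb[index].addend;
        return true;
    }
    for (int v = VTLB_SIZE - 1; v >= 0; --v) {  /* victim TLB lookup */
        if (vtlb[v].vpage == vpage) {
            tlb_entry tmp = tlb[index];         /* swap primary <-> victim */
            tlb[index] = vtlb[v];
            vtlb[v] = tmp;
            *addend = tlb[index].addend;
            return true;
        }
    }
    return false;                               /* miss: caller must refill */
}

/* Refill path (step 5 of the new path).  The entry displaced from the
 * primary TLB is saved in a round-robin victim slot instead of being
 * discarded, mirroring the new code in tlb_set_page(). */
static void install_translation(uint64_t vaddr, uint64_t addend)
{
    size_t index = (vaddr >> PAGE_BITS) & (TLB_SIZE - 1);
    unsigned vidx = vtlb_next++ % VTLB_SIZE;

    vtlb[vidx] = tlb[index];                    /* evict into the victim TLB */
    tlb[index].vpage = vaddr & PAGE_MASK;       /* then refill the primary slot */
    tlb[index].addend = addend;
}

On a victim hit the extra cost is a linear scan of the 8 victim entries
plus two structure swaps, which is still far cheaper than the ~450
instruction page table walk quoted above.
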
>
> Signed-off-by: Xin Tong <trent.t...@gmail.com>
>
> ---
>  cputlb.c                | 31 ++++++++++++++++++++++++++++++-
>  include/exec/cpu-defs.h |  9 +++++++--
>  softmmu_template.h      | 43 +++++++++++++++++++++++++++++++++++++++----
>  3 files changed, 76 insertions(+), 7 deletions(-)
>
> diff --git a/cputlb.c b/cputlb.c
> index afd3705..a55518a 100644
> --- a/cputlb.c
> +++ b/cputlb.c
> @@ -60,8 +60,10 @@ void tlb_flush(CPUState *cpu, int flush_global)
>      cpu->current_tb = NULL;
>
>      memset(env->tlb_table, -1, sizeof(env->tlb_table));
> +    memset(env->tlb_v_table, -1, sizeof(env->tlb_v_table));
>      memset(cpu->tb_jmp_cache, 0, sizeof(cpu->tb_jmp_cache));
>
> +    env->vtlb_index = 0;
>      env->tlb_flush_addr = -1;
>      env->tlb_flush_mask = 0;
>      tlb_flush_count++;
> @@ -108,6 +110,14 @@ void tlb_flush_page(CPUState *cpu, target_ulong addr)
>          tlb_flush_entry(&env->tlb_table[mmu_idx][i], addr);
>      }
>
> +    /* check whether there are entries that need to be flushed in the vtlb */
> +    for (mmu_idx = 0; mmu_idx < NB_MMU_MODES; mmu_idx++) {
> +        int k;
> +        for (k = 0; k < CPU_VTLB_SIZE; k++) {
> +            tlb_flush_entry(&env->tlb_v_table[mmu_idx][k], addr);
> +        }
> +    }
> +
>      tb_flush_jmp_cache(cpu, addr);
>  }
>
> @@ -172,6 +182,11 @@ void cpu_tlb_reset_dirty_all(ram_addr_t start1, ram_addr_t length)
>                  tlb_reset_dirty_range(&env->tlb_table[mmu_idx][i],
>                                        start1, length);
>              }
> +
> +            for (i = 0; i < CPU_VTLB_SIZE; i++) {
> +                tlb_reset_dirty_range(&env->tlb_v_table[mmu_idx][i],
> +                                      start1, length);
> +            }
>          }
>      }
>  }
> @@ -195,6 +210,13 @@ void tlb_set_dirty(CPUArchState *env, target_ulong vaddr)
>      for (mmu_idx = 0; mmu_idx < NB_MMU_MODES; mmu_idx++) {
>          tlb_set_dirty1(&env->tlb_table[mmu_idx][i], vaddr);
>      }
> +
> +    for (mmu_idx = 0; mmu_idx < NB_MMU_MODES; mmu_idx++) {
> +        int k;
> +        for (k = 0; k < CPU_VTLB_SIZE; k++) {
> +            tlb_set_dirty1(&env->tlb_v_table[mmu_idx][k], vaddr);
> +        }
> +    }
>  }
>
>  /* Our TLB does not support large pages, so remember the area covered by
> @@ -235,6 +257,7 @@ void tlb_set_page(CPUState *cpu, target_ulong vaddr,
>      uintptr_t addend;
>      CPUTLBEntry *te;
>      hwaddr iotlb, xlat, sz;
> +    unsigned vidx = env->vtlb_index++ % CPU_VTLB_SIZE;
>
>      assert(size >= TARGET_PAGE_SIZE);
>      if (size != TARGET_PAGE_SIZE) {
> @@ -267,8 +290,14 @@ void tlb_set_page(CPUState *cpu, target_ulong vaddr,
>                                              prot, &address);
>
>      index = (vaddr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1);
> -    env->iotlb[mmu_idx][index] = iotlb - vaddr;
>      te = &env->tlb_table[mmu_idx][index];
> +
> +    /* do not discard the translation in te, evict it into a victim tlb */
> +    env->tlb_v_table[mmu_idx][vidx] = *te;
> +    env->iotlb_v[mmu_idx][vidx] = env->iotlb[mmu_idx][index];
> +
> +    /* refill the tlb */
> +    env->iotlb[mmu_idx][index] = iotlb - vaddr;
>      te->addend = addend - vaddr;
>      if (prot & PAGE_READ) {
>          te->addr_read = address;
> diff --git a/include/exec/cpu-defs.h b/include/exec/cpu-defs.h
> index 2dd6206..0ca6f0b 100644
> --- a/include/exec/cpu-defs.h
> +++ b/include/exec/cpu-defs.h
> @@ -71,6 +71,8 @@ typedef uint64_t target_ulong;
>  #if !defined(CONFIG_USER_ONLY)
>  #define CPU_TLB_BITS 8
>  #define CPU_TLB_SIZE (1 << CPU_TLB_BITS)
> +/* use a fully associative victim tlb of 8 entries */
> +#define CPU_VTLB_SIZE 8
>
>  #if HOST_LONG_BITS == 32 && TARGET_LONG_BITS == 32
>  #define CPU_TLB_ENTRY_BITS 4
> @@ -103,9 +105,12 @@ QEMU_BUILD_BUG_ON(sizeof(CPUTLBEntry) != (1 << CPU_TLB_ENTRY_BITS));
>  #define CPU_COMMON_TLB                                                  \
>      /* The meaning of the MMU modes is defined in the target code. */   \
>      CPUTLBEntry tlb_table[NB_MMU_MODES][CPU_TLB_SIZE];                  \
> -    hwaddr iotlb[NB_MMU_MODES][CPU_TLB_SIZE];                           \
> +    CPUTLBEntry tlb_v_table[NB_MMU_MODES][CPU_VTLB_SIZE];               \
> +    hwaddr iotlb[NB_MMU_MODES][CPU_TLB_SIZE];                           \
> +    hwaddr iotlb_v[NB_MMU_MODES][CPU_VTLB_SIZE];                        \
>      target_ulong tlb_flush_addr;                                        \
> -    target_ulong tlb_flush_mask;
> +    target_ulong tlb_flush_mask;                                        \
> +    target_ulong vtlb_index;                                            \
>
>  #else
>
> diff --git a/softmmu_template.h b/softmmu_template.h
> index 5a07f99..88e3390 100644
> --- a/softmmu_template.h
> +++ b/softmmu_template.h
> @@ -116,6 +116,31 @@
>  # define helper_te_st_name  helper_le_st_name
>  #endif
>
> +/* macro to check the victim tlb */
> +#define VICTIM_TLB_HIT(ty)                                                    \
> +({                                                                            \
> +    /* we are about to do a page table walk. our last hope is the             \
> +     * victim tlb. try to refill from the victim tlb before walking the       \
> +     * page table. */                                                         \
> +    int vidx;                                                                 \
> +    hwaddr tmpiotlb;                                                          \
> +    CPUTLBEntry tmptlb;                                                       \
> +    for (vidx = CPU_VTLB_SIZE-1; vidx >= 0; --vidx) {                         \
> +        if (env->tlb_v_table[mmu_idx][vidx].ty == (addr & TARGET_PAGE_MASK)) {\
> +            /* found entry in victim tlb, swap tlb and iotlb */               \
> +            tmptlb = env->tlb_table[mmu_idx][index];                          \
> +            env->tlb_table[mmu_idx][index] = env->tlb_v_table[mmu_idx][vidx]; \
> +            env->tlb_v_table[mmu_idx][vidx] = tmptlb;                         \
> +            tmpiotlb = env->iotlb[mmu_idx][index];                            \
> +            env->iotlb[mmu_idx][index] = env->iotlb_v[mmu_idx][vidx];         \
> +            env->iotlb_v[mmu_idx][vidx] = tmpiotlb;                           \
> +            break;                                                            \
> +        }                                                                     \
> +    }                                                                         \
> +    /* return true when there is a vtlb hit, i.e. vidx >=0 */                 \
> +    vidx >= 0;                                                                \
> +})
> +
>  #ifndef SOFTMMU_CODE_ACCESS
>  static inline DATA_TYPE glue(io_read, SUFFIX)(CPUArchState *env,
>                                                hwaddr physaddr,
> @@ -161,7 +186,10 @@ WORD_TYPE helper_le_ld_name(CPUArchState *env, target_ulong addr, int mmu_idx,
>                                   mmu_idx, retaddr);
>          }
>  #endif
> -        tlb_fill(ENV_GET_CPU(env), addr, READ_ACCESS_TYPE, mmu_idx, retaddr);
> +        if (!VICTIM_TLB_HIT(ADDR_READ)) {
> +            tlb_fill(ENV_GET_CPU(env), addr, READ_ACCESS_TYPE,
> +                     mmu_idx, retaddr);
> +        }
>          tlb_addr = env->tlb_table[mmu_idx][index].ADDR_READ;
>      }
>
> @@ -246,7 +274,10 @@ WORD_TYPE helper_be_ld_name(CPUArchState *env, target_ulong addr, int mmu_idx,
>                                   mmu_idx, retaddr);
>          }
>  #endif
> -        tlb_fill(ENV_GET_CPU(env), addr, READ_ACCESS_TYPE, mmu_idx, retaddr);
> +        if (!VICTIM_TLB_HIT(ADDR_READ)) {
> +            tlb_fill(ENV_GET_CPU(env), addr, READ_ACCESS_TYPE,
> +                     mmu_idx, retaddr);
> +        }
>          tlb_addr = env->tlb_table[mmu_idx][index].ADDR_READ;
>      }
>
> @@ -368,7 +399,9 @@ void helper_le_st_name(CPUArchState *env, target_ulong addr, DATA_TYPE val,
>              cpu_unaligned_access(ENV_GET_CPU(env), addr, 1, mmu_idx, retaddr);
>          }
>  #endif
> -        tlb_fill(ENV_GET_CPU(env), addr, 1, mmu_idx, retaddr);
> +        if (!VICTIM_TLB_HIT(addr_write)) {
> +            tlb_fill(ENV_GET_CPU(env), addr, 1, mmu_idx, retaddr);
> +        }
>          tlb_addr = env->tlb_table[mmu_idx][index].addr_write;
>      }
>
> @@ -444,7 +477,9 @@ void helper_be_st_name(CPUArchState *env, target_ulong addr, DATA_TYPE val,
>              cpu_unaligned_access(ENV_GET_CPU(env), addr, 1, mmu_idx, retaddr);
>          }
>  #endif
> -        tlb_fill(ENV_GET_CPU(env), addr, 1, mmu_idx, retaddr);
> +        if (!VICTIM_TLB_HIT(addr_write)) {
> +            tlb_fill(ENV_GET_CPU(env), addr, 1, mmu_idx, retaddr);
> +        }
>          tlb_addr = env->tlb_table[mmu_idx][index].addr_write;
>      }
>
> --
> 1.9.1
>