Sorry for the long absence; this patch submission had slipped my mind until now. I have rebased the patch onto the latest QEMU git repository and re-measured the performance improvement data. Please review when you have time.
Xin

On Mon, Aug 4, 2014 at 8:35 PM, Xin Tong <trent.t...@gmail.com> wrote:
> QEMU system-mode page table walks are expensive. Measured by running
> qemu-system-x86_64 in system mode under Intel PIN, a TLB miss followed
> by a walk of the 4-level page tables of a guest Linux OS takes ~450 x86
> instructions on average.
>
> The QEMU system-mode TLB is implemented as a directly-mapped hash
> table, a structure that suffers from conflict misses. Increasing the
> associativity of the TLB is not necessarily the answer to conflict
> misses, since all the ways may have to be searched serially.
>
> A victim TLB holds translations evicted from the primary TLB upon
> replacement. It sits between the main TLB and its refill path and has
> greater associativity (fully associative in this patch). Looking up the
> victim TLB takes longer than an inline TLB hit, but it is still much
> cheaper than a full page table walk. The memory translation path
> changes as follows:
>
> Before the victim TLB:
> 1. Inline TLB lookup.
> 2. Exit the code cache on a TLB miss.
> 3. Check for unaligned and I/O accesses.
> 4. TLB refill.
> 5. Do the memory access.
> 6. Return to the code cache.
>
> After the victim TLB:
> 1. Inline TLB lookup.
> 2. Exit the code cache on a TLB miss.
> 3. Check for unaligned and I/O accesses.
> 4. Victim TLB lookup.
> 5. On a victim TLB miss, TLB refill.
> 6. Do the memory access.
> 7. Return to the code cache.
>
> The advantage is that the victim TLB adds associativity to the
> directly-mapped TLB, and thus potentially saves page table walks, while
> still keeping the time needed to flush it within reasonable limits.
> However, placing the victim TLB in front of the refill path lengthens
> that path, because the victim TLB is consulted before every TLB refill.
> The performance results show that the pros outweigh the cons.
>
> Performance results taken on the SPECINT2006 train data sets, a kernel
> boot and the qemu configure script, measured on an Intel(R) Xeon(R) CPU
> E5620 @ 2.40GHz Linux machine, are shown in the Google Doc linked
> below:
>
> https://docs.google.com/spreadsheets/d/1eiItzekZwNQOal_h-5iJmC4tMDi051m9qidi5_nwvH4/edit?usp=sharing
>
> In summary, the victim TLB improves the performance of
> qemu-system-x86_64 by 11% on average across SPECINT2006, the kernel
> boot and the qemu configure script, with the largest single improvement
> being 26% on 456.hmmer. The victim TLB does not cause a performance
> degradation in any of the measured benchmarks. Furthermore, the
> implementation is architecture independent and is expected to benefit
> the other target architectures in QEMU as well.
>
> Although there are measurement fluctuations, the performance
> improvement is significant and well outside the range of measurement
> noise.
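
To make the new path above concrete for reviewers, here is a small,
self-contained C sketch of the idea. This is illustrative only, not QEMU
code: tlb_entry, lookup_translation, install_translation, flush_all,
TLB_SIZE and VTLB_SIZE are made-up names and sizes for the example; the
patch itself works on CPUTLBEntry, env->tlb_table[], env->tlb_v_table[]
and the VICTIM_TLB_HIT() macro shown in the diff below.

/* Illustrative sketch only -- simplified stand-ins for the QEMU structures. */
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

#define PAGE_BITS 12
#define PAGE_MASK (~(((uint64_t)1 << PAGE_BITS) - 1))
#define TLB_SIZE  256                /* directly-mapped primary TLB */
#define VTLB_SIZE 8                  /* fully associative victim TLB */

typedef struct {
    uint64_t vpage;                  /* tag: guest virtual page, -1 when invalid */
    uint64_t addend;                 /* host address minus guest address */
} tlb_entry;

static tlb_entry tlb[TLB_SIZE];
static tlb_entry vtlb[VTLB_SIZE];
static unsigned  vtlb_next;          /* round-robin slot, like env->vtlb_index */

/* Invalidate everything, as tlb_flush() does with memset(..., -1, ...). */
static void flush_all(void)
{
    memset(tlb, -1, sizeof(tlb));
    memset(vtlb, -1, sizeof(vtlb));
    vtlb_next = 0;
}

/* Lookup path (steps 1 and 4 of the new path).  A victim hit swaps the
 * primary and victim entries so the hot translation moves back into the
 * directly-mapped TLB, which is what VICTIM_TLB_HIT() does below. */
static bool lookup_translation(uint64_t vaddr, uint64_t *addend)
{
    uint64_t vpage = vaddr & PAGE_MASK;
    size_t index = (vaddr >> PAGE_BITS) & (TLB_SIZE - 1);

    if (tlb[index].vpage == vpage) {            /* primary TLB hit */
        *addend = tlb[index].addend;
        return true;
    }
    for (int v = VTLB_SIZE - 1; v >= 0; --v) {  /* victim TLB lookup */
        if (vtlb[v].vpage == vpage) {
            tlb_entry tmp = tlb[index];         /* swap primary <-> victim */
            tlb[index] = vtlb[v];
            vtlb[v] = tmp;
            *addend = tlb[index].addend;
            return true;
        }
    }
    return false;                               /* miss: caller must refill */
}

/* Refill path (step 5 of the new path).  The entry displaced from the
 * primary TLB is saved in a round-robin victim slot instead of being
 * discarded, mirroring the new code in tlb_set_page(). */
static void install_translation(uint64_t vaddr, uint64_t addend)
{
    size_t index = (vaddr >> PAGE_BITS) & (TLB_SIZE - 1);
    unsigned vidx = vtlb_next++ % VTLB_SIZE;

    vtlb[vidx] = tlb[index];                    /* evict into the victim TLB */
    tlb[index].vpage = vaddr & PAGE_MASK;       /* then refill the primary slot */
    tlb[index].addend = addend;
}

On a victim hit the extra cost is a linear scan of the 8 victim entries
plus two structure swaps, which is still far cheaper than the ~450
instruction page table walk quoted above.
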
>
> Signed-off-by: Xin Tong <trent.t...@gmail.com>
>
> ---
>  cputlb.c                | 31 ++++++++++++++++++++++++++++++-
>  include/exec/cpu-defs.h |  9 +++++++--
>  softmmu_template.h      | 43 +++++++++++++++++++++++++++++++++++++++----
>  3 files changed, 76 insertions(+), 7 deletions(-)
>
> diff --git a/cputlb.c b/cputlb.c
> index afd3705..a55518a 100644
> --- a/cputlb.c
> +++ b/cputlb.c
> @@ -60,8 +60,10 @@ void tlb_flush(CPUState *cpu, int flush_global)
>      cpu->current_tb = NULL;
>
>      memset(env->tlb_table, -1, sizeof(env->tlb_table));
> +    memset(env->tlb_v_table, -1, sizeof(env->tlb_v_table));
>      memset(cpu->tb_jmp_cache, 0, sizeof(cpu->tb_jmp_cache));
>
> +    env->vtlb_index = 0;
>      env->tlb_flush_addr = -1;
>      env->tlb_flush_mask = 0;
>      tlb_flush_count++;
> @@ -108,6 +110,14 @@ void tlb_flush_page(CPUState *cpu, target_ulong addr)
>          tlb_flush_entry(&env->tlb_table[mmu_idx][i], addr);
>      }
>
> +    /* check whether there are entries that need to be flushed in the vtlb */
> +    for (mmu_idx = 0; mmu_idx < NB_MMU_MODES; mmu_idx++) {
> +        int k;
> +        for (k = 0; k < CPU_VTLB_SIZE; k++) {
> +            tlb_flush_entry(&env->tlb_v_table[mmu_idx][k], addr);
> +        }
> +    }
> +
>      tb_flush_jmp_cache(cpu, addr);
>  }
>
> @@ -172,6 +182,11 @@ void cpu_tlb_reset_dirty_all(ram_addr_t start1, ram_addr_t length)
>                  tlb_reset_dirty_range(&env->tlb_table[mmu_idx][i],
>                                        start1, length);
>              }
> +
> +            for (i = 0; i < CPU_VTLB_SIZE; i++) {
> +                tlb_reset_dirty_range(&env->tlb_v_table[mmu_idx][i],
> +                                      start1, length);
> +            }
>          }
>      }
>  }
> @@ -195,6 +210,13 @@ void tlb_set_dirty(CPUArchState *env, target_ulong vaddr)
>      for (mmu_idx = 0; mmu_idx < NB_MMU_MODES; mmu_idx++) {
>          tlb_set_dirty1(&env->tlb_table[mmu_idx][i], vaddr);
>      }
> +
> +    for (mmu_idx = 0; mmu_idx < NB_MMU_MODES; mmu_idx++) {
> +        int k;
> +        for (k = 0; k < CPU_VTLB_SIZE; k++) {
> +            tlb_set_dirty1(&env->tlb_v_table[mmu_idx][k], vaddr);
> +        }
> +    }
>  }
>
>  /* Our TLB does not support large pages, so remember the area covered by
> @@ -235,6 +257,7 @@ void tlb_set_page(CPUState *cpu, target_ulong vaddr,
>      uintptr_t addend;
>      CPUTLBEntry *te;
>      hwaddr iotlb, xlat, sz;
> +    unsigned vidx = env->vtlb_index++ % CPU_VTLB_SIZE;
>
>      assert(size >= TARGET_PAGE_SIZE);
>      if (size != TARGET_PAGE_SIZE) {
> @@ -267,8 +290,14 @@ void tlb_set_page(CPUState *cpu, target_ulong vaddr,
>                                              prot, &address);
>
>      index = (vaddr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1);
> -    env->iotlb[mmu_idx][index] = iotlb - vaddr;
>      te = &env->tlb_table[mmu_idx][index];
> +
> +    /* do not discard the translation in te, evict it into a victim tlb */
> +    env->tlb_v_table[mmu_idx][vidx] = *te;
> +    env->iotlb_v[mmu_idx][vidx] = env->iotlb[mmu_idx][index];
> +
> +    /* refill the tlb */
> +    env->iotlb[mmu_idx][index] = iotlb - vaddr;
>      te->addend = addend - vaddr;
>      if (prot & PAGE_READ) {
>          te->addr_read = address;
> diff --git a/include/exec/cpu-defs.h b/include/exec/cpu-defs.h
> index 2dd6206..0ca6f0b 100644
> --- a/include/exec/cpu-defs.h
> +++ b/include/exec/cpu-defs.h
> @@ -71,6 +71,8 @@ typedef uint64_t target_ulong;
>  #if !defined(CONFIG_USER_ONLY)
>  #define CPU_TLB_BITS 8
>  #define CPU_TLB_SIZE (1 << CPU_TLB_BITS)
> +/* use a fully associative victim tlb of 8 entries */
> +#define CPU_VTLB_SIZE 8
>
>  #if HOST_LONG_BITS == 32 && TARGET_LONG_BITS == 32
>  #define CPU_TLB_ENTRY_BITS 4
> @@ -103,9 +105,12 @@ QEMU_BUILD_BUG_ON(sizeof(CPUTLBEntry) != (1 << CPU_TLB_ENTRY_BITS));
>  #define CPU_COMMON_TLB                                                  \
>      /* The meaning of the MMU modes is defined in the target code. */   \
>      CPUTLBEntry tlb_table[NB_MMU_MODES][CPU_TLB_SIZE];                  \
> -    hwaddr iotlb[NB_MMU_MODES][CPU_TLB_SIZE];                           \
> +    CPUTLBEntry tlb_v_table[NB_MMU_MODES][CPU_VTLB_SIZE];               \
> +    hwaddr iotlb[NB_MMU_MODES][CPU_TLB_SIZE];                           \
> +    hwaddr iotlb_v[NB_MMU_MODES][CPU_VTLB_SIZE];                        \
>      target_ulong tlb_flush_addr;                                        \
> -    target_ulong tlb_flush_mask;
> +    target_ulong tlb_flush_mask;                                        \
> +    target_ulong vtlb_index;                                            \
>
>  #else
>
> diff --git a/softmmu_template.h b/softmmu_template.h
> index 5a07f99..88e3390 100644
> --- a/softmmu_template.h
> +++ b/softmmu_template.h
> @@ -116,6 +116,31 @@
>  # define helper_te_st_name  helper_le_st_name
>  #endif
>
> +/* macro to check the victim tlb */
> +#define VICTIM_TLB_HIT(ty)                                                    \
> +({                                                                            \
> +    /* we are about to do a page table walk. our last hope is the             \
> +     * victim tlb. try to refill from the victim tlb before walking the       \
> +     * page table. */                                                         \
> +    int vidx;                                                                 \
> +    hwaddr tmpiotlb;                                                          \
> +    CPUTLBEntry tmptlb;                                                       \
> +    for (vidx = CPU_VTLB_SIZE-1; vidx >= 0; --vidx) {                         \
> +        if (env->tlb_v_table[mmu_idx][vidx].ty == (addr & TARGET_PAGE_MASK)) {\
> +            /* found entry in victim tlb, swap tlb and iotlb */               \
> +            tmptlb = env->tlb_table[mmu_idx][index];                          \
> +            env->tlb_table[mmu_idx][index] = env->tlb_v_table[mmu_idx][vidx]; \
> +            env->tlb_v_table[mmu_idx][vidx] = tmptlb;                         \
> +            tmpiotlb = env->iotlb[mmu_idx][index];                            \
> +            env->iotlb[mmu_idx][index] = env->iotlb_v[mmu_idx][vidx];         \
> +            env->iotlb_v[mmu_idx][vidx] = tmpiotlb;                           \
> +            break;                                                            \
> +        }                                                                     \
> +    }                                                                         \
> +    /* return true when there is a vtlb hit, i.e. vidx >=0 */                 \
> +    vidx >= 0;                                                                \
> +})
> +
>  #ifndef SOFTMMU_CODE_ACCESS
>  static inline DATA_TYPE glue(io_read, SUFFIX)(CPUArchState *env,
>                                                hwaddr physaddr,
> @@ -161,7 +186,10 @@ WORD_TYPE helper_le_ld_name(CPUArchState *env, target_ulong addr, int mmu_idx,
>                                   mmu_idx, retaddr);
>          }
>  #endif
> -        tlb_fill(ENV_GET_CPU(env), addr, READ_ACCESS_TYPE, mmu_idx, retaddr);
> +        if (!VICTIM_TLB_HIT(ADDR_READ)) {
> +            tlb_fill(ENV_GET_CPU(env), addr, READ_ACCESS_TYPE,
> +                     mmu_idx, retaddr);
> +        }
>          tlb_addr = env->tlb_table[mmu_idx][index].ADDR_READ;
>      }
>
> @@ -246,7 +274,10 @@ WORD_TYPE helper_be_ld_name(CPUArchState *env, target_ulong addr, int mmu_idx,
>                                   mmu_idx, retaddr);
>          }
>  #endif
> -        tlb_fill(ENV_GET_CPU(env), addr, READ_ACCESS_TYPE, mmu_idx, retaddr);
> +        if (!VICTIM_TLB_HIT(ADDR_READ)) {
> +            tlb_fill(ENV_GET_CPU(env), addr, READ_ACCESS_TYPE,
> +                     mmu_idx, retaddr);
> +        }
>          tlb_addr = env->tlb_table[mmu_idx][index].ADDR_READ;
>      }
>
> @@ -368,7 +399,9 @@ void helper_le_st_name(CPUArchState *env, target_ulong addr, DATA_TYPE val,
>              cpu_unaligned_access(ENV_GET_CPU(env), addr, 1, mmu_idx, retaddr);
>          }
>  #endif
> -        tlb_fill(ENV_GET_CPU(env), addr, 1, mmu_idx, retaddr);
> +        if (!VICTIM_TLB_HIT(addr_write)) {
> +            tlb_fill(ENV_GET_CPU(env), addr, 1, mmu_idx, retaddr);
> +        }
>          tlb_addr = env->tlb_table[mmu_idx][index].addr_write;
>      }
>
> @@ -444,7 +477,9 @@ void helper_be_st_name(CPUArchState *env, target_ulong addr, DATA_TYPE val,
>              cpu_unaligned_access(ENV_GET_CPU(env), addr, 1, mmu_idx, retaddr);
>          }
>  #endif
> -        tlb_fill(ENV_GET_CPU(env), addr, 1, mmu_idx, retaddr);
> +        if (!VICTIM_TLB_HIT(addr_write)) {
> +            tlb_fill(ENV_GET_CPU(env), addr, 1, mmu_idx, retaddr);
> +        }
>          tlb_addr = env->tlb_table[mmu_idx][index].addr_write;
>      }
>
> --
> 1.9.1
>