Could someone please review the patch when you have time?

Thanks a lot,
Xin


On Mon, Aug 4, 2014 at 8:37 PM, Xin Tong <trent.t...@gmail.com> wrote:

> Sorry for the absence; the patch slipped my mind after submission. I have
> rebased it onto the latest QEMU git repository and re-collected the
> performance improvement data. Please review it when you have time.
>
> Xin
>
>
>
> On Mon, Aug 4, 2014 at 8:35 PM, Xin Tong <trent.t...@gmail.com> wrote:
>
>> QEMU system mode page table walks are expensive. Measured by running
>> qemu-system-x86_64 under Intel PIN, a TLB miss followed by a walk of the
>> 4-level page tables of a guest Linux OS takes ~450 x86 instructions on
>> average.
>>
>> The QEMU system mode TLB is implemented as a direct-mapped hash table.
>> This structure suffers from conflict misses. Increasing the associativity
>> of the TLB itself may not be a good fix for conflict misses, since all
>> the ways may have to be searched serially on every lookup.
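>>
>> As a purely illustrative aside (not part of the patch), the direct-mapped
>> index is derived from the virtual page number, so any two pages whose
>> virtual page numbers agree in the low CPU_TLB_BITS bits map to the same
>> slot and evict each other:
>>
>>     /* hypothetical helper, shown only to mirror the index computation
>>        that cputlb.c uses when refilling an entry */
>>     static inline int tlb_hash(target_ulong vaddr)
>>     {
>>         return (vaddr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1);
>>     }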
>>
>> A victim TLB is a small TLB that holds translations evicted from the
>> primary TLB upon replacement. The victim TLB sits between the main TLB
>> and its refill path and has higher associativity (fully associative in
>> this patch). A victim TLB lookup takes longer than the primary lookup,
>> but it is still likely cheaper than a full page table walk. The memory
>> translation path changes as follows (a sketch of the resulting slow path
>> follows the two lists below):
>>
>> Before Victim TLB:
>> 1. Inline TLB lookup.
>> 2. Exit code cache on TLB miss.
>> 3. Check for unaligned and I/O accesses.
>> 4. TLB refill.
>> 5. Do the memory access.
>> 6. Return to code cache.
>>
>> After Victim TLB:
>> 1. Inline TLB lookup.
>> 2. Exit code cache on TLB miss.
>> 3. Check for unaligned and I/O accesses.
>> 4. Victim TLB lookup.
>> 5. If the victim TLB misses, TLB refill.
>> 6. Do the memory access.
>> 7. Return to code cache.
>>
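>> The patched slow path in softmmu_template.h then looks roughly like the
>> condensed sketch below (unaligned and I/O access handling, which the full
>> diff keeps, is omitted here):
>>
>>     tlb_addr = env->tlb_table[mmu_idx][index].ADDR_READ;
>>     if ((addr & TARGET_PAGE_MASK)
>>         != (tlb_addr & (TARGET_PAGE_MASK | TLB_INVALID_MASK))) {
>>         /* primary TLB miss: try the victim TLB before walking the page table */
>>         if (!VICTIM_TLB_HIT(ADDR_READ)) {
>>             tlb_fill(ENV_GET_CPU(env), addr, READ_ACCESS_TYPE, mmu_idx, retaddr);
>>         }
>>         tlb_addr = env->tlb_table[mmu_idx][index].ADDR_READ;
>>     }
>>     /* ... perform the load through the (now valid) primary TLB entry ... */
>>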
>> The advantage is that the victim TLB adds associativity to the
>> direct-mapped TLB, and thus potentially avoids page table walks, while
>> keeping the time needed to flush the TLB within reasonable limits.
>> However, placing the victim TLB in front of the refill path lengthens
>> the refill path, since the victim TLB is consulted before every TLB
>> refill. The performance results demonstrate that the pros outweigh the
>> cons.
>>
>> Performance results for the SPECINT2006 train datasets, a kernel boot,
>> and the QEMU configure script, measured on an Intel(R) Xeon(R) CPU E5620
>> @ 2.40GHz Linux machine, are shown in the Google Doc linked below.
>>
>>
>> https://docs.google.com/spreadsheets/d/1eiItzekZwNQOal_h-5iJmC4tMDi051m9qidi5_nwvH4/edit?usp=sharing
>>
>> In summary, the victim TLB improves the performance of qemu-system-x86_64
>> by 11% on average across SPECINT2006, the kernel boot, and the QEMU
>> configure script, with the largest improvement being 26% on 456.hmmer.
>> The victim TLB does not cause a performance regression in any of the
>> measured benchmarks. Furthermore, the implementation is architecture
>> independent and is expected to benefit the other architectures in QEMU
>> as well.
>>
>>
>> Although there is some run-to-run fluctuation in the measurements, the
>> performance improvement is significant and well outside the noise.
>>
>> Signed-off-by: Xin Tong <trent.t...@gmail.com>
>>
>>
>>
>> ---
>>  cputlb.c                | 31 ++++++++++++++++++++++++++++++-
>>  include/exec/cpu-defs.h |  9 +++++++--
>>  softmmu_template.h      | 43 +++++++++++++++++++++++++++++++++++++++----
>>  3 files changed, 76 insertions(+), 7 deletions(-)
>>
>> diff --git a/cputlb.c b/cputlb.c
>> index afd3705..a55518a 100644
>> --- a/cputlb.c
>> +++ b/cputlb.c
>> @@ -60,8 +60,10 @@ void tlb_flush(CPUState *cpu, int flush_global)
>>      cpu->current_tb = NULL;
>>
>>      memset(env->tlb_table, -1, sizeof(env->tlb_table));
>> +    memset(env->tlb_v_table, -1, sizeof(env->tlb_v_table));
>>      memset(cpu->tb_jmp_cache, 0, sizeof(cpu->tb_jmp_cache));
>>
>> +    env->vtlb_index = 0;
>>      env->tlb_flush_addr = -1;
>>      env->tlb_flush_mask = 0;
>>      tlb_flush_count++;
>> @@ -108,6 +110,14 @@ void tlb_flush_page(CPUState *cpu, target_ulong addr)
>>          tlb_flush_entry(&env->tlb_table[mmu_idx][i], addr);
>>      }
>>
>> +    /* check whether there are entries that need to be flushed in the vtlb */
>> +    for (mmu_idx = 0; mmu_idx < NB_MMU_MODES; mmu_idx++) {
>> +        int k;
>> +        for (k = 0; k < CPU_VTLB_SIZE; k++) {
>> +            tlb_flush_entry(&env->tlb_v_table[mmu_idx][k], addr);
>> +        }
>> +    }
>> +
>>      tb_flush_jmp_cache(cpu, addr);
>>  }
>>
>> @@ -172,6 +182,11 @@ void cpu_tlb_reset_dirty_all(ram_addr_t start1, ram_addr_t length)
>>                  tlb_reset_dirty_range(&env->tlb_table[mmu_idx][i],
>>                                        start1, length);
>>              }
>> +
>> +            for (i = 0; i < CPU_VTLB_SIZE; i++) {
>> +                tlb_reset_dirty_range(&env->tlb_v_table[mmu_idx][i],
>> +                                      start1, length);
>> +            }
>>          }
>>      }
>>  }
>> @@ -195,6 +210,13 @@ void tlb_set_dirty(CPUArchState *env, target_ulong vaddr)
>>      for (mmu_idx = 0; mmu_idx < NB_MMU_MODES; mmu_idx++) {
>>          tlb_set_dirty1(&env->tlb_table[mmu_idx][i], vaddr);
>>      }
>> +
>> +    for (mmu_idx = 0; mmu_idx < NB_MMU_MODES; mmu_idx++) {
>> +        int k;
>> +        for (k = 0; k < CPU_VTLB_SIZE; k++) {
>> +            tlb_set_dirty1(&env->tlb_v_table[mmu_idx][k], vaddr);
>> +        }
>> +    }
>>  }
>>
>>  /* Our TLB does not support large pages, so remember the area covered by
>> @@ -235,6 +257,7 @@ void tlb_set_page(CPUState *cpu, target_ulong vaddr,
>>      uintptr_t addend;
>>      CPUTLBEntry *te;
>>      hwaddr iotlb, xlat, sz;
>> +    unsigned vidx = env->vtlb_index++ % CPU_VTLB_SIZE;
>>
>>      assert(size >= TARGET_PAGE_SIZE);
>>      if (size != TARGET_PAGE_SIZE) {
>> @@ -267,8 +290,14 @@ void tlb_set_page(CPUState *cpu, target_ulong vaddr,
>>                                              prot, &address);
>>
>>      index = (vaddr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1);
>> -    env->iotlb[mmu_idx][index] = iotlb - vaddr;
>>      te = &env->tlb_table[mmu_idx][index];
>> +
>> +    /* do not discard the translation in te, evict it into a victim tlb */
>> +    env->tlb_v_table[mmu_idx][vidx] = *te;
>> +    env->iotlb_v[mmu_idx][vidx] = env->iotlb[mmu_idx][index];
>> +
>> +    /* refill the tlb */
>> +    env->iotlb[mmu_idx][index] = iotlb - vaddr;
>>      te->addend = addend - vaddr;
>>      if (prot & PAGE_READ) {
>>          te->addr_read = address;
>> diff --git a/include/exec/cpu-defs.h b/include/exec/cpu-defs.h
>> index 2dd6206..0ca6f0b 100644
>> --- a/include/exec/cpu-defs.h
>> +++ b/include/exec/cpu-defs.h
>> @@ -71,6 +71,8 @@ typedef uint64_t target_ulong;
>>  #if !defined(CONFIG_USER_ONLY)
>>  #define CPU_TLB_BITS 8
>>  #define CPU_TLB_SIZE (1 << CPU_TLB_BITS)
>> +/* use a fully associative victim tlb of 8 entries */
>> +#define CPU_VTLB_SIZE 8
>>
>>  #if HOST_LONG_BITS == 32 && TARGET_LONG_BITS == 32
>>  #define CPU_TLB_ENTRY_BITS 4
>> @@ -103,9 +105,12 @@ QEMU_BUILD_BUG_ON(sizeof(CPUTLBEntry) != (1 << CPU_TLB_ENTRY_BITS));
>>  #define CPU_COMMON_TLB \
>>      /* The meaning of the MMU modes is defined in the target code. */   \
>>      CPUTLBEntry tlb_table[NB_MMU_MODES][CPU_TLB_SIZE];                  \
>> -    hwaddr iotlb[NB_MMU_MODES][CPU_TLB_SIZE];               \
>> +    CPUTLBEntry tlb_v_table[NB_MMU_MODES][CPU_VTLB_SIZE];               \
>> +    hwaddr iotlb[NB_MMU_MODES][CPU_TLB_SIZE];                           \
>> +    hwaddr iotlb_v[NB_MMU_MODES][CPU_VTLB_SIZE];                        \
>>      target_ulong tlb_flush_addr;                                        \
>> -    target_ulong tlb_flush_mask;
>> +    target_ulong tlb_flush_mask;                                        \
>> +    target_ulong vtlb_index;                                            \
>>
>>  #else
>>
>> diff --git a/softmmu_template.h b/softmmu_template.h
>> index 5a07f99..88e3390 100644
>> --- a/softmmu_template.h
>> +++ b/softmmu_template.h
>> @@ -116,6 +116,31 @@
>>  # define helper_te_st_name  helper_le_st_name
>>  #endif
>>
>> +/* macro to check the victim tlb */
>> +#define VICTIM_TLB_HIT(ty)                                                    \
>> +({                                                                            \
>> +    /* we are about to do a page table walk. our last hope is the             \
>> +     * victim tlb. try to refill from the victim tlb before walking the       \
>> +     * page table. */                                                         \
>> +    int vidx;                                                                 \
>> +    hwaddr tmpiotlb;                                                          \
>> +    CPUTLBEntry tmptlb;                                                       \
>> +    for (vidx = CPU_VTLB_SIZE-1; vidx >= 0; --vidx) {                         \
>> +        if (env->tlb_v_table[mmu_idx][vidx].ty == (addr & TARGET_PAGE_MASK)) {\
>> +            /* found entry in victim tlb, swap tlb and iotlb */               \
>> +            tmptlb = env->tlb_table[mmu_idx][index];                          \
>> +            env->tlb_table[mmu_idx][index] = env->tlb_v_table[mmu_idx][vidx]; \
>> +            env->tlb_v_table[mmu_idx][vidx] = tmptlb;                         \
>> +            tmpiotlb = env->iotlb[mmu_idx][index];                            \
>> +            env->iotlb[mmu_idx][index] = env->iotlb_v[mmu_idx][vidx];         \
>> +            env->iotlb_v[mmu_idx][vidx] = tmpiotlb;                           \
>> +            break;                                                            \
>> +        }                                                                     \
>> +    }                                                                         \
>> +    /* return true when there is a vtlb hit, i.e. vidx >=0 */                 \
>> +    vidx >= 0;                                                                \
>> +})
>> +
>>  #ifndef SOFTMMU_CODE_ACCESS
>>  static inline DATA_TYPE glue(io_read, SUFFIX)(CPUArchState *env,
>>                                                hwaddr physaddr,
>> @@ -161,7 +186,10 @@ WORD_TYPE helper_le_ld_name(CPUArchState *env, target_ulong addr, int mmu_idx,
>>                                   mmu_idx, retaddr);
>>          }
>>  #endif
>> -        tlb_fill(ENV_GET_CPU(env), addr, READ_ACCESS_TYPE, mmu_idx, retaddr);
>> +        if (!VICTIM_TLB_HIT(ADDR_READ)) {
>> +            tlb_fill(ENV_GET_CPU(env), addr, READ_ACCESS_TYPE,
>> +                     mmu_idx, retaddr);
>> +        }
>>          tlb_addr = env->tlb_table[mmu_idx][index].ADDR_READ;
>>      }
>>
>> @@ -246,7 +274,10 @@ WORD_TYPE helper_be_ld_name(CPUArchState *env, target_ulong addr, int mmu_idx,
>>                                   mmu_idx, retaddr);
>>          }
>>  #endif
>> -        tlb_fill(ENV_GET_CPU(env), addr, READ_ACCESS_TYPE, mmu_idx, retaddr);
>> +        if (!VICTIM_TLB_HIT(ADDR_READ)) {
>> +            tlb_fill(ENV_GET_CPU(env), addr, READ_ACCESS_TYPE,
>> +                     mmu_idx, retaddr);
>> +        }
>>          tlb_addr = env->tlb_table[mmu_idx][index].ADDR_READ;
>>      }
>>
>> @@ -368,7 +399,9 @@ void helper_le_st_name(CPUArchState *env, target_ulong addr, DATA_TYPE val,
>>              cpu_unaligned_access(ENV_GET_CPU(env), addr, 1, mmu_idx, retaddr);
>>          }
>>  #endif
>> -        tlb_fill(ENV_GET_CPU(env), addr, 1, mmu_idx, retaddr);
>> +        if (!VICTIM_TLB_HIT(addr_write)) {
>> +            tlb_fill(ENV_GET_CPU(env), addr, 1, mmu_idx, retaddr);
>> +        }
>>          tlb_addr = env->tlb_table[mmu_idx][index].addr_write;
>>      }
>>
>> @@ -444,7 +477,9 @@ void helper_be_st_name(CPUArchState *env, target_ulong addr, DATA_TYPE val,
>>              cpu_unaligned_access(ENV_GET_CPU(env), addr, 1, mmu_idx, retaddr);
>>          }
>>  #endif
>> -        tlb_fill(ENV_GET_CPU(env), addr, 1, mmu_idx, retaddr);
>> +        if (!VICTIM_TLB_HIT(addr_write)) {
>> +            tlb_fill(ENV_GET_CPU(env), addr, 1, mmu_idx, retaddr);
>> +        }
>>          tlb_addr = env->tlb_table[mmu_idx][index].addr_write;
>>      }
>>
>> --
>> 1.9.1
>>
>>
>
