On 12/23/2016 10:51 AM, Lluís Vilanova wrote:
>> On 12/22/2016 10:35 AM, Lluís Vilanova wrote:
>>> To handle both issues, this series replicates the shared physical TB cache,
>>> creating a separate physical TB cache for every combination of event states
>>> (those with the 'vcpu' and 'tcg' properties). Then, all vCPUs tracing the 
>>> same
>>> events will use the same physical TB cache.
> 
>> Why do we need to "split the physical TB cache" as opposed to simply 
>> including
>> the trace state into the TB hash function?
> 
> Mmmm, that's an interesting alternative I did not consider. Are you aiming at
> minimizing the changes, or do you also think it would be more efficient?

I suspect that it will be more efficient.

> The dynamic tracing state would then be an arbitrarily long bitmap (defined by
> the number of events with the 'vcpu' property), so I'm not sure how to fit it
> into the hashing function with minimal collisions (the bitmap is now limited 
> to
> an unsigned long to use it as an index to the TB cache "matrix").

You could consider that index a unique identifier for the tracing state, and
then only compare and hash that integer.

> The other drawback I see is that then it would also take longer to compute the
> hashing function, instead of the simpler array indexing. As a benefit, 
> workloads
> with a high frequency of TB-flushing operations might be a bit faster (there
> would be a single QHT).

I don't see adding one more integer to the hashing function to be significant
at all.  Certainly not the 15% that you describe in your cover letter.

> If someone can provide me the code for the modified hash lookup function to
> account for the trace dstate bitmap contents, I will integrate it and measure 
> if
> there is any important change in performance.

Something like the following should do it.  There are two /* cpu->??? */
markers that would need to be filled in.

If you can reduce the tracing identifier to 8 bits, that would be excellent.
I've been wanting to make some other changes to TB hashing, and that would fit
in well with a second "flags" value.


r~
diff --git a/cpu-exec.c b/cpu-exec.c
index 4188fed..3b54317 100644
--- a/cpu-exec.c
+++ b/cpu-exec.c
@@ -261,6 +261,7 @@ struct tb_desc {
     CPUArchState *env;
     tb_page_addr_t phys_page1;
     uint32_t flags;
+    uint32_t traceid;
 };
 
 static bool tb_cmp(const void *p, const void *d)
@@ -272,6 +273,7 @@ static bool tb_cmp(const void *p, const void *d)
         tb->page_addr[0] == desc->phys_page1 &&
         tb->cs_base == desc->cs_base &&
         tb->flags == desc->flags &&
+        tb->traceid == desc->traceid &&
         !atomic_read(&tb->invalid)) {
         /* check next page if needed */
         if (tb->page_addr[1] == -1) {
@@ -293,7 +295,8 @@ static bool tb_cmp(const void *p, const void *d)
 static TranslationBlock *tb_htable_lookup(CPUState *cpu,
                                           target_ulong pc,
                                           target_ulong cs_base,
-                                          uint32_t flags)
+                                          uint32_t flags,
+                                          uint32_t traceid)
 {
     tb_page_addr_t phys_pc;
     struct tb_desc desc;
@@ -302,10 +305,11 @@ static TranslationBlock *tb_htable_lookup(CPUState *cpu,
     desc.env = (CPUArchState *)cpu->env_ptr;
     desc.cs_base = cs_base;
     desc.flags = flags;
+    desc.traceid = traceid;
     desc.pc = pc;
     phys_pc = get_page_addr_code(desc.env, pc);
     desc.phys_page1 = phys_pc & TARGET_PAGE_MASK;
-    h = tb_hash_func(phys_pc, pc, flags);
+    h = tb_hash_func(phys_pc, pc, flags, traceid);
     return qht_lookup(&tcg_ctx.tb_ctx.htable, tb_cmp, &desc, h);
 }
 
@@ -317,6 +321,7 @@ static inline TranslationBlock *tb_find(CPUState *cpu,
     TranslationBlock *tb;
     target_ulong cs_base, pc;
     uint32_t flags;
+    uint32_t traceid = 0 /* cpu->??? */;
     bool have_tb_lock = false;
 
     /* we record a subset of the CPU state. It will
@@ -325,8 +330,8 @@ static inline TranslationBlock *tb_find(CPUState *cpu,
     cpu_get_tb_cpu_state(env, &pc, &cs_base, &flags);
     tb = atomic_rcu_read(&cpu->tb_jmp_cache[tb_jmp_cache_hash_func(pc)]);
     if (unlikely(!tb || tb->pc != pc || tb->cs_base != cs_base ||
-                 tb->flags != flags)) {
-        tb = tb_htable_lookup(cpu, pc, cs_base, flags);
+                 tb->flags != flags || tb->traceid != traceid)) {
+        tb = tb_htable_lookup(cpu, pc, cs_base, flags, traceid);
         if (!tb) {
 
             /* mmap_lock is needed by tb_gen_code, and mmap_lock must be
@@ -340,7 +345,7 @@ static inline TranslationBlock *tb_find(CPUState *cpu,
             /* There's a chance that our desired tb has been translated while
              * taking the locks so we check again inside the lock.
              */
-            tb = tb_htable_lookup(cpu, pc, cs_base, flags);
+            tb = tb_htable_lookup(cpu, pc, cs_base, flags, traceid);
             if (!tb) {
                 /* if no translated code available, then translate it now */
                 tb = tb_gen_code(cpu, pc, cs_base, flags, 0);
diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h
index a8c13ce..40e867d 100644
--- a/include/exec/exec-all.h
+++ b/include/exec/exec-all.h
@@ -215,6 +215,7 @@ struct TranslationBlock {
 #define CF_IGNORE_ICOUNT 0x40000 /* Do not generate icount code */
 
     uint16_t invalid;
+    uint16_t traceid; /* identifier for tracing state */
 
     void *tc_ptr;    /* pointer to the translated code */
     uint8_t *tc_search;  /* pointer to search data */
diff --git a/include/exec/tb-hash-xx.h b/include/exec/tb-hash-xx.h
index 2c40b5c..7873740 100644
--- a/include/exec/tb-hash-xx.h
+++ b/include/exec/tb-hash-xx.h
@@ -49,7 +49,7 @@
  * contiguous in memory.
  */
 static inline
-uint32_t tb_hash_func5(uint64_t a0, uint64_t b0, uint32_t e)
+uint32_t tb_hash_func6(uint64_t a0, uint64_t b0, uint32_t e, uint32_t f)
 {
     uint32_t v1 = TB_HASH_XX_SEED + PRIME32_1 + PRIME32_2;
     uint32_t v2 = TB_HASH_XX_SEED + PRIME32_2;
@@ -83,6 +83,9 @@ uint32_t tb_hash_func5(uint64_t a0, uint64_t b0, uint32_t e)
     h32 += e * PRIME32_3;
     h32  = rol32(h32, 17) * PRIME32_4;
 
+    h32 += f * PRIME32_3;
+    h32  = rol32(h32, 17) * PRIME32_4;
+
     h32 ^= h32 >> 15;
     h32 *= PRIME32_2;
     h32 ^= h32 >> 13;
diff --git a/include/exec/tb-hash.h b/include/exec/tb-hash.h
index 2c27490..5664851 100644
--- a/include/exec/tb-hash.h
+++ b/include/exec/tb-hash.h
@@ -46,9 +46,10 @@ static inline unsigned int 
tb_jmp_cache_hash_func(target_ulong pc)
 }
 
 static inline
-uint32_t tb_hash_func(tb_page_addr_t phys_pc, target_ulong pc, uint32_t flags)
+uint32_t tb_hash_func(tb_page_addr_t phys_pc, target_ulong pc,
+                      uint32_t flags, uint32_t traceid)
 {
-    return tb_hash_func5(phys_pc, pc, flags);
+    return tb_hash_func6(phys_pc, pc, flags, traceid);
 }
 
 #endif
diff --git a/tests/qht-bench.c b/tests/qht-bench.c
index 2afa09d..11c1cec 100644
--- a/tests/qht-bench.c
+++ b/tests/qht-bench.c
@@ -103,7 +103,7 @@ static bool is_equal(const void *obj, const void *userp)
 
 static inline uint32_t h(unsigned long v)
 {
-    return tb_hash_func5(v, 0, 0);
+    return tb_hash_func6(v, 0, 0, 0);
 }
 
 /*
diff --git a/translate-all.c b/translate-all.c
index 3dd9214..ee86822 100644
--- a/translate-all.c
+++ b/translate-all.c
@@ -1110,7 +1110,7 @@ void tb_phys_invalidate(TranslationBlock *tb, 
tb_page_addr_t page_addr)
 
     /* remove the TB from the hash list */
     phys_pc = tb->page_addr[0] + (tb->pc & ~TARGET_PAGE_MASK);
-    h = tb_hash_func(phys_pc, tb->pc, tb->flags);
+    h = tb_hash_func(phys_pc, tb->pc, tb->flags, tb->traceid);
     qht_remove(&tcg_ctx.tb_ctx.htable, tb, h);
 
     /* remove the TB from the page list */
@@ -1255,7 +1255,7 @@ static void tb_link_page(TranslationBlock *tb, 
tb_page_addr_t phys_pc,
     }
 
     /* add in the hash table */
-    h = tb_hash_func(phys_pc, tb->pc, tb->flags);
+    h = tb_hash_func(phys_pc, tb->pc, tb->flags, tb->traceid);
     qht_insert(&tcg_ctx.tb_ctx.htable, tb, h);
 
 #ifdef DEBUG_TB_CHECK
@@ -1298,6 +1298,7 @@ TranslationBlock *tb_gen_code(CPUState *cpu,
     tb->cs_base = cs_base;
     tb->flags = flags;
     tb->cflags = cflags;
+    tb->traceid = 0 /* cpu->??? */;
 
 #ifdef CONFIG_PROFILER
     tcg_ctx.tb_count1++; /* includes aborted translations because of

Reply via email to