On Tue, Apr 05, 2016 at 14:08:13 -0700, Richard Henderson wrote:
> But the point is that we can do better than dropping data into memory.
> Particularly for those hosts that do not support unaligned data, such as you
> created with the packed structure.
If we made sure the fields in the struct were in the right order (larger
fields first), this shouldn't be an issue.

Anyway I took your proposal and implemented the patch below. FWIW I cannot
measure a perf. difference between this and the packed struct for
arm-softmmu (i.e. 16 bytes) on an x86_64 host.

How does the appended look?

Thanks,

		E.

commit af92a0690f49172621cd8b80759e3ca567d43567
Author: Emilio G. Cota <c...@braap.org>
Date:   Tue Apr 5 18:06:21 2016 -0400

    rth

    Signed-off-by: Emilio G. Cota <c...@braap.org>

diff --git a/include/exec/tb-hash.h b/include/exec/tb-hash.h
index 6b97a7c..349a856 100644
--- a/include/exec/tb-hash.h
+++ b/include/exec/tb-hash.h
@@ -45,19 +45,124 @@ static inline unsigned int tb_jmp_cache_hash_func(target_ulong pc)
             | (tmp & TB_JMP_ADDR_MASK));
 }
 
-static inline
-uint32_t tb_hash_func(tb_page_addr_t phys_pc, target_ulong pc, int flags)
+static inline uint32_t h32_finish(uint32_t h32)
 {
-    struct {
-        tb_page_addr_t phys_pc;
-        target_ulong pc;
-        int flags;
-    } QEMU_PACKED k;
-
-    k.phys_pc = phys_pc;
-    k.pc = pc;
-    k.flags = flags;
-    return qemu_xxh32((uint32_t *)&k, sizeof(k) / sizeof(uint32_t), 1);
+    h32 ^= h32 >> 15;
+    h32 *= PRIME32_2;
+    h32 ^= h32 >> 13;
+    h32 *= PRIME32_3;
+    h32 ^= h32 >> 16;
+
+    return h32;
+}
+
+static inline uint32_t tb_hash_func3(uint32_t a, uint32_t b, uint32_t c, int seed)
+{
+    uint32_t h32 = seed + PRIME32_5;
+
+    h32 += 12;
+
+    h32 += a * PRIME32_3;
+    h32 = XXH_rotl32(h32, 17) * PRIME32_4;
+
+    h32 += b * PRIME32_3;
+    h32 = XXH_rotl32(h32, 17) * PRIME32_4;
+
+    h32 += c * PRIME32_3;
+    h32 = XXH_rotl32(h32, 17) * PRIME32_4;
+
+    return h32_finish(h32);
+}
+
+static inline uint32_t tb_hash_func4(uint64_t a0, uint32_t c, uint32_t d, int seed)
+{
+    uint32_t v1 = seed + PRIME32_1 + PRIME32_2;
+    uint32_t v2 = seed + PRIME32_2;
+    uint32_t v3 = seed + 0;
+    uint32_t v4 = seed - PRIME32_1;
+    uint32_t a = a0 >> 31 >> 1;
+    uint32_t b = a0;
+    uint32_t h32;
+
+    v1 += a * PRIME32_2;
+    v1 = XXH_rotl32(v1, 13);
+    v1 *= PRIME32_1;
+
+    v2 += b * PRIME32_2;
+    v2 = XXH_rotl32(v2, 13);
+    v2 *= PRIME32_1;
+
+    v3 += c * PRIME32_2;
+    v3 = XXH_rotl32(v3, 13);
+    v3 *= PRIME32_1;
+
+    v4 += d * PRIME32_2;
+    v4 = XXH_rotl32(v4, 13);
+    v4 *= PRIME32_1;
+
+    h32 = XXH_rotl32(v1, 1) + XXH_rotl32(v2, 7) + XXH_rotl32(v3, 12) +
+          XXH_rotl32(v4, 18);
+    h32 += 16;
+
+    return h32_finish(h32);
+}
+
+static inline uint32_t tb_hash_func5(uint64_t a0, uint64_t b0, uint32_t e, int seed)
+{
+    uint32_t v1 = seed + PRIME32_1 + PRIME32_2;
+    uint32_t v2 = seed + PRIME32_2;
+    uint32_t v3 = seed + 0;
+    uint32_t v4 = seed - PRIME32_1;
+    uint32_t a = a0 >> 31 >> 1;
+    uint32_t b = a0;
+    uint32_t c = b0 >> 31 >> 1;
+    uint32_t d = b0;
+    uint32_t h32;
+
+    v1 += a * PRIME32_2;
+    v1 = XXH_rotl32(v1, 13);
+    v1 *= PRIME32_1;
+
+    v2 += b * PRIME32_2;
+    v2 = XXH_rotl32(v2, 13);
+    v2 *= PRIME32_1;
+
+    v3 += c * PRIME32_2;
+    v3 = XXH_rotl32(v3, 13);
+    v3 *= PRIME32_1;
+
+    v4 += d * PRIME32_2;
+    v4 = XXH_rotl32(v4, 13);
+    v4 *= PRIME32_1;
+
+    h32 = XXH_rotl32(v1, 1) + XXH_rotl32(v2, 7) + XXH_rotl32(v3, 12) +
+          XXH_rotl32(v4, 18);
+    h32 += 20;
+
+    h32 += e * PRIME32_3;
+    h32 = XXH_rotl32(h32, 17) * PRIME32_4;
+
+    return h32_finish(h32);
+}
+
+static __attribute__((noinline))
+unsigned tb_hash_func(tb_page_addr_t phys_pc, target_ulong pc, int flags)
+{
+#if TARGET_LONG_BITS == 64
+
+    if (sizeof(phys_pc) == sizeof(pc)) {
+        return tb_hash_func5(phys_pc, pc, flags, 1);
+    }
+    return tb_hash_func4(pc, phys_pc, flags, 1);
+
+#else /* 32-bit target */
+
+    if (sizeof(phys_pc) > sizeof(pc)) {
+        return tb_hash_func4(phys_pc, pc, flags, 1);
+    }
+    return tb_hash_func3(pc, phys_pc, flags, 1);
+
+#endif /* TARGET_LONG_BITS */
 }
 
 #endif
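
P.S. To spell out what I meant by "larger fields first": with one 64-bit
field and two 32-bit fields, declaring the 64-bit member first keeps every
field naturally aligned and leaves no interior padding, so QEMU_PACKED (and
the unaligned accesses it can trigger on some hosts) wouldn't be needed.
Illustration only, not part of the patch (field names are made up):

    struct {
        uint64_t wide;    /* 8 bytes, offset 0  */
        uint32_t narrow;  /* 4 bytes, offset 8  */
        int flags;        /* 4 bytes, offset 12 */
    } k;                  /* sizeof(k) == 16, no holes */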