On 06/04/2016 02:52, Emilio G. Cota wrote:
> +static inline uint32_t tb_hash_func5(uint64_t a0, uint64_t b0, uint32_t e,
> +                                     int seed)
> +{
> +    uint32_t v1 = seed + PRIME32_1 + PRIME32_2;
> +    uint32_t v2 = seed + PRIME32_2;
> +    uint32_t v3 = seed + 0;
> +    uint32_t v4 = seed - PRIME32_1;
> +    uint32_t a = a0 >> 31 >> 1;
> +    uint32_t b = a0;
> +    uint32_t c = b0 >> 31 >> 1;
> +    uint32_t d = b0;
> +    uint32_t h32;
> +
> +    v1 += a * PRIME32_2;
> +    v1 = XXH_rotl32(v1, 13);
> +    v1 *= PRIME32_1;
> +
> +    v2 += b * PRIME32_2;
> +    v2 = XXH_rotl32(v2, 13);
> +    v2 *= PRIME32_1;
> +
> +    v3 += c * PRIME32_2;
> +    v3 = XXH_rotl32(v3, 13);
> +    v3 *= PRIME32_1;
> +
> +    v4 += d * PRIME32_2;
> +    v4 = XXH_rotl32(v4, 13);
> +    v4 *= PRIME32_1;
> +
> +    h32 = XXH_rotl32(v1, 1) + XXH_rotl32(v2, 7) + XXH_rotl32(v3, 12) +
> +          XXH_rotl32(v4, 18);
> +    h32 += 20;
> +
> +    h32 += e * PRIME32_3;
> +    h32 = XXH_rotl32(h32, 17) * PRIME32_4;
> +
> +    return h32_finish(h32);
> +}
> +
> +static __attribute__((noinline))
> +unsigned tb_hash_func(tb_page_addr_t phys_pc, target_ulong pc, int flags)
> +{
> +#if TARGET_LONG_BITS == 64
> +
> +    if (sizeof(phys_pc) == sizeof(pc)) {
> +        return tb_hash_func5(phys_pc, pc, flags, 1);
> +    }

I would keep just this version and unconditionally zero-extend to 64 bits.
The compiler is able to detect that the high 32 bits are zero, drop the
more expensive multiplications and constant-fold everything.

For example, if you write

    unsigned tb_hash_func(uint32_t phys_pc, uint32_t pc, int flags)
    {
        return tb_hash_func5(phys_pc, pc, flags, 1);
    }

and check the optimized code with -fdump-tree-optimized, you'll see that
the rotated v1, the rotated v3 and the 20 merge into a single constant,
1733907856.
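For reference, a self-contained sketch of that folding (not part of the
quoted patch: it copies only the v1 and v3 lane updates from tb_hash_func5
above, hard-codes seed == 1 as in the call, and assumes the reference
xxHash PRIME32_* constants):

    /* Standalone demonstration, not from the patch: when the high 32 bits
     * of a0 and b0 are known to be zero, the v1 and v3 lanes of
     * tb_hash_func5 depend only on the seed and fold to a compile-time
     * constant.  PRIME32_1/PRIME32_2 are the xxHash reference primes.
     */
    #include <stdint.h>
    #include <stdio.h>

    #define PRIME32_1 2654435761U
    #define PRIME32_2 2246822519U

    static inline uint32_t XXH_rotl32(uint32_t x, int r)
    {
        return (x << r) | (x >> (32 - r));
    }

    int main(void)
    {
        const uint32_t seed = 1;  /* as in tb_hash_func5(phys_pc, pc, flags, 1) */
        const uint32_t a = 0;     /* a0 >> 31 >> 1, zero for a zero-extended 32-bit phys_pc */
        const uint32_t c = 0;     /* b0 >> 31 >> 1, zero for a zero-extended 32-bit pc */

        uint32_t v1 = seed + PRIME32_1 + PRIME32_2;
        uint32_t v3 = seed + 0;

        v1 += a * PRIME32_2;      /* a == 0, so this lane is seed-only */
        v1 = XXH_rotl32(v1, 13);
        v1 *= PRIME32_1;

        v3 += c * PRIME32_2;      /* likewise for the v3 lane */
        v3 = XXH_rotl32(v3, 13);
        v3 *= PRIME32_1;

        /* The "rotated v1 + rotated v3 + 20" part of h32: every operand is
         * a constant, so the compiler can evaluate it at build time.
         */
        printf("%u\n", XXH_rotl32(v1, 1) + XXH_rotl32(v3, 12) + 20);
        return 0;
    }

Built with -O2, the printf argument folds to a single constant and the
program prints 1733907856, the value visible in the -fdump-tree-optimized
dump mentioned above.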
Thanks,

Paolo