On 04/05/2016 12:40 PM, Emilio G. Cota wrote:
On Tue, Apr 05, 2016 at 09:07:57 -0700, Richard Henderson wrote:
On 04/05/2016 08:48 AM, Paolo Bonzini wrote:
I think it's fine to use the struct. The exact size of the struct
varies from 3 to 5 32-bit words, so it's hard to write nice
size-dependent code for the hash.
I don't think it is. We have 3 integers. It is trivial to create a simple
function of 2 multiplies, two adds, and a remainder.
Take the primes from the xxhash.h, for example:
(phys_pc * PRIME32_2 + pc * PRIME32_3 + flags)
% PRIME32_1
& (CODE_GEN_PHYS_HASH_SIZE - 1)
Obviously, some bucket measurements should be taken, but I can well imagine
that this might perform just as well as the fully generic hasher.
That function doesn't perform well: 25.06s vs. 21.18s with xxh32.
Sure, that's just what came off the top of my head.
But the point is that we can do better than dropping data into memory.
Particularly for those hosts that do not support unaligned data, such as you
created with the packed structure.
* inline:
00000000001a6800 <tb_hash_func>:
1a6800: 48 83 ec 28 sub $0x28,%rsp
1a6804: 69 cf 77 ca eb 85 imul $0x85ebca77,%edi,%ecx
1a680a: 48 89 3c 24 mov %rdi,(%rsp)
1a680e: 48 c1 ef 20 shr $0x20,%rdi
1a6812: 69 ff 77 ca eb 85 imul $0x85ebca77,%edi,%edi
1a6818: 48 89 74 24 08 mov %rsi,0x8(%rsp)
1a681d: 64 48 8b 04 25 28 00 mov %fs:0x28,%rax
1a6824: 00 00
1a6826: 48 89 44 24 18 mov %rax,0x18(%rsp)
1a682b: 31 c0 xor %eax,%eax
1a682d: 81 c1 29 44 23 24 add $0x24234429,%ecx
1a6833: 69 c6 77 ca eb 85 imul $0x85ebca77,%esi,%eax
1a6839: 48 c1 ee 20 shr $0x20,%rsi
1a683d: 81 ef 88 35 14 7a sub $0x7a143588,%edi
1a6843: 69 f6 77 ca eb 85 imul $0x85ebca77,%esi,%esi
1a6849: c1 c9 13 ror $0x13,%ecx
1a684c: c1 cf 13 ror $0x13,%edi
1a684f: 83 c0 01 add $0x1,%eax
1a6852: 69 c9 b1 79 37 9e imul $0x9e3779b1,%ecx,%ecx
1a6858: c1 c8 13 ror $0x13,%eax
1a685b: 81 c6 50 86 c8 61 add $0x61c88650,%esi
1a6861: 69 ff b1 79 37 9e imul $0x9e3779b1,%edi,%edi
1a6867: c1 ce 13 ror $0x13,%esi
1a686a: c1 c9 1f ror $0x1f,%ecx
1a686d: 69 c0 b1 79 37 9e imul $0x9e3779b1,%eax,%eax
1a6873: c1 cf 19 ror $0x19,%edi
1a6876: 69 f6 b1 79 37 9e imul $0x9e3779b1,%esi,%esi
1a687c: 8d 7c 39 14 lea 0x14(%rcx,%rdi,1),%edi
1a6880: c1 c8 14 ror $0x14,%eax
1a6883: 69 d2 3d ae b2 c2 imul $0xc2b2ae3d,%edx,%edx
1a6889: 01 f8 add %edi,%eax
1a688b: c1 ce 0e ror $0xe,%esi
1a688e: 01 c6 add %eax,%esi
1a6890: 01 f2 add %esi,%edx
1a6892: c1 ca 0f ror $0xf,%edx
1a6895: 69 d2 2f eb d4 27 imul $0x27d4eb2f,%edx,%edx
1a689b: 89 d0 mov %edx,%eax
1a689d: c1 e8 0f shr $0xf,%eax
1a68a0: 31 d0 xor %edx,%eax
1a68a2: 69 d0 77 ca eb 85 imul $0x85ebca77,%eax,%edx
1a68a8: 89 d0 mov %edx,%eax
1a68aa: c1 e8 0d shr $0xd,%eax
1a68ad: 31 d0 xor %edx,%eax
1a68af: 69 d0 3d ae b2 c2 imul $0xc2b2ae3d,%eax,%edx
1a68b5: 89 d0 mov %edx,%eax
1a68b7: c1 e8 10 shr $0x10,%eax
1a68ba: 31 d0 xor %edx,%eax
1a68bc: 48 8b 54 24 18 mov 0x18(%rsp),%rdx
1a68c1: 64 48 33 14 25 28 00 xor %fs:0x28,%rdx
1a68c8: 00 00
1a68ca: 75 05 jne 1a68d1 <tb_hash_func+0xd1>
1a68cc: 48 83 c4 28 add $0x28,%rsp
1a68d0: c3 retq
If I inline everything explicitly, we do even better.
r~
0000000000000000 <tb_hash_func>:
0: 44 69 c6 77 ca eb 85 imul $0x85ebca77,%esi,%r8d
7: 48 c1 ee 20 shr $0x20,%rsi
b: 69 cf 77 ca eb 85 imul $0x85ebca77,%edi,%ecx
11: 48 c1 ef 20 shr $0x20,%rdi
15: 41 83 c0 01 add $0x1,%r8d
19: 41 c1 c0 0d rol $0xd,%r8d
1d: 81 c1 29 44 23 24 add $0x24234429,%ecx
23: 69 c6 77 ca eb 85 imul $0x85ebca77,%esi,%eax
29: c1 c1 0d rol $0xd,%ecx
2c: 41 69 f0 b1 79 37 9e imul $0x9e3779b1,%r8d,%esi
33: 05 50 86 c8 61 add $0x61c88650,%eax
38: 69 ff 77 ca eb 85 imul $0x85ebca77,%edi,%edi
3e: c1 c6 0c rol $0xc,%esi
41: c1 c0 0d rol $0xd,%eax
44: 69 d2 3d ae b2 c2 imul $0xc2b2ae3d,%edx,%edx
4a: 81 ef 88 35 14 7a sub $0x7a143588,%edi
50: 69 c9 b1 79 37 9e imul $0x9e3779b1,%ecx,%ecx
56: 8d 54 16 14 lea 0x14(%rsi,%rdx,1),%edx
5a: c1 c7 0d rol $0xd,%edi
5d: 69 c0 b1 79 37 9e imul $0x9e3779b1,%eax,%eax
63: d1 c1 rol %ecx
65: 01 d1 add %edx,%ecx
67: c1 c8 0e ror $0xe,%eax
6a: 69 d7 b1 79 37 9e imul $0x9e3779b1,%edi,%edx
70: 01 c8 add %ecx,%eax
72: c1 c2 07 rol $0x7,%edx
75: 01 d0 add %edx,%eax
77: c1 c8 0f ror $0xf,%eax
7a: 69 c0 2f eb d4 27 imul $0x27d4eb2f,%eax,%eax
80: 89 c2 mov %eax,%edx
82: c1 ea 0f shr $0xf,%edx
85: 31 d0 xor %edx,%eax
87: 69 c0 77 ca eb 85 imul $0x85ebca77,%eax,%eax
8d: 89 c2 mov %eax,%edx
8f: c1 ea 0d shr $0xd,%edx
92: 31 d0 xor %edx,%eax
94: 69 c0 3d ae b2 c2 imul $0xc2b2ae3d,%eax,%eax
9a: 89 c2 mov %eax,%edx
9c: c1 ea 10 shr $0x10,%edx
9f: 31 d0 xor %edx,%eax
a1: c3 retq
#define PRIME32_1 2654435761U
#define PRIME32_2 2246822519U
#define PRIME32_3 3266489917U
#define PRIME32_4 668265263U
#define PRIME32_5 374761393U
#define XXH_rotl32(x, r) ((x << r) | (x >> (32 - r)))
unsigned tb_hash_func(unsigned long phys_pc, unsigned long pc, unsigned flags)
{
unsigned h32, seed = 1;
unsigned w0, w1, w2, w3, w4;
unsigned v1, v2, v3, v4;
w0 = phys_pc;
w1 = phys_pc >> 31 >> 1;
w2 = pc;
w3 = pc >> 31 >> 1;
w4 = flags;
v1 = seed + PRIME32_1 + PRIME32_2;
v2 = seed + PRIME32_2;
v3 = seed + 0;
v4 = seed - PRIME32_1;
v1 += w0 * PRIME32_2;
v1 = XXH_rotl32(v1, 13);
v1 *= PRIME32_1;
v2 += w1 * PRIME32_2;
v2 = XXH_rotl32(v2, 13);
v2 *= PRIME32_1;
v3 += w2 * PRIME32_2;
v3 = XXH_rotl32(v3, 13);
v3 *= PRIME32_1;
v4 += w3 * PRIME32_2;
v4 = XXH_rotl32(v4, 13);
v4 *= PRIME32_1;
h32 = XXH_rotl32(v1, 1) + XXH_rotl32(v2, 7)
+ XXH_rotl32(v3, 12) + XXH_rotl32(v4, 18);
h32 += 20;
h32 += w4 * PRIME32_3;
h32 = XXH_rotl32(h32, 17) * PRIME32_4;
h32 ^= h32 >> 15;
h32 *= PRIME32_2;
h32 ^= h32 >> 13;
h32 *= PRIME32_3;
h32 ^= h32 >> 16;
return h32;
}