There are probably better ways to do this.

Results from oprofile on a shader-db run:

mesa_hash_data     3.11 ---> 3.12
hash_table_insert  2.52 ---> 2.50
hash_table_search  2.64 ---> 2.59
set_add            1.74 ---> 1.72
set_search         2.08 ---> 2.09
runtime            160  ---> 164
---
 src/mesa/x86/common_x86.c          |  4 +++
 src/mesa/x86/common_x86_features.h |  8 +++++
 src/util/crc32c_hw.h               | 67 ++++++++++++++++++++++++++++++++++++++
 src/util/hash_table.c              | 17 +++++++++-
 4 files changed, 95 insertions(+), 1 deletion(-)
 create mode 100644 src/util/crc32c_hw.h
diff --git a/src/mesa/x86/common_x86.c b/src/mesa/x86/common_x86.c index 25f5c40..de4defa 100644 --- a/src/mesa/x86/common_x86.c +++ b/src/mesa/x86/common_x86.c @@ -266,6 +266,8 @@ _mesa_get_x86_features(void) _mesa_x86_cpu_features |= X86_FEATURE_XMM2; if (cpu_features_ecx & X86_CPU_SSE4_1) _mesa_x86_cpu_features |= X86_FEATURE_SSE4_1; + if (cpu_features_ecx & X86_CPU_SSE4_2) + _mesa_x86_cpu_features |= X86_FEATURE_SSE4_2; #endif /* query extended cpu features */ @@ -354,6 +356,8 @@ _mesa_get_x86_features(void) if (ecx & bit_SSE4_1) _mesa_x86_cpu_features |= X86_FEATURE_SSE4_1; + if (ecx & X86_CPU_SSE4_2) + _mesa_x86_cpu_features |= X86_FEATURE_SSE4_2; } #endif /* USE_X86_64_ASM */ diff --git a/src/mesa/x86/common_x86_features.h b/src/mesa/x86/common_x86_features.h index 65634aa..c205844 100644 --- a/src/mesa/x86/common_x86_features.h +++ b/src/mesa/x86/common_x86_features.h @@ -44,6 +44,7 @@ #define X86_FEATURE_3DNOWEXT (1<<7) #define X86_FEATURE_3DNOW (1<<8) #define X86_FEATURE_SSE4_1 (1<<9) +#define X86_FEATURE_SSE4_2 (1<<10) /* standard X86 CPU features */ #define X86_CPU_FPU (1<<0) @@ -53,6 +54,7 @@ #define X86_CPU_XMM2 (1<<26) /* ECX. */ #define X86_CPU_SSE4_1 (1<<19) +#define X86_CPU_SSE4_2 (1<<20) /* extended X86 CPU features */ #define X86_CPUEXT_MMX_EXT (1<<22) @@ -93,5 +95,11 @@ #define cpu_has_sse4_1 (_mesa_x86_cpu_features & X86_FEATURE_SSE4_1) #endif +#ifdef __SSE4_2__ /* predefined by the compiler when SSE4.2 code generation is enabled (e.g. -msse4.2): no run-time test needed */ +#define cpu_has_sse4_2 1 +#else /* otherwise test the bit recorded in _mesa_x86_cpu_features */ +#define cpu_has_sse4_2 (_mesa_x86_cpu_features & X86_FEATURE_SSE4_2) +#endif + #endif diff --git a/src/util/crc32c_hw.h b/src/util/crc32c_hw.h new file mode 100644 index 0000000..ef8c903 --- /dev/null +++ b/src/util/crc32c_hw.h @@ -0,0 +1,67 @@ +/* Compile with gcc -O3 -msse4.2 ... 
*/ +#include <stddef.h> +#include <stdint.h> +#ifdef __SSE4_2__ +#include <nmmintrin.h> + +// Byte-boundary alignment issues +#define ALIGN_SIZE 0x08UL +#define ALIGN_MASK (ALIGN_SIZE - 1) +#define CALC_CRC(op, crc, type, buf, len) \ + do { \ + for (; (len) >= sizeof (type); (len) -= sizeof(type), (buf) += sizeof (type)) { \ + (crc) = op((crc), *(const type *) (buf)); \ + } \ + } while(0) + + +/* Compute the CRC-32C of len bytes at input using the SSE4.2 CRC32 instruction. crc is the starting value; it is inverted on entry and the result is inverted again before it is returned. */ +/* for better parallelization with bigger buffers see + http://www.drdobbs.com/parallel/fast-parallelized-crc-computation-using/229401411 */ +static inline uint32_t crc32c_hw(const void *input, size_t len, uint32_t crc) +{ + const char* buf = (const char*)input; + + // Invert the initial CRC (XOR with 0xFFFFFFFF) + crc ^= 0xFFFFFFFF; + + // Consume single bytes until buf is ALIGN_SIZE (8-byte) aligned + for (; (len > 0) && ((uintptr_t)buf & ALIGN_MASK); len--, buf++) { + crc = _mm_crc32_u8(crc, *buf); + } + + // Bulk of the data, widest chunks first; each narrower pass mops up the remainder +#ifdef __x86_64__ + CALC_CRC(_mm_crc32_u64, crc, uint64_t, buf, len); +#endif + CALC_CRC(_mm_crc32_u32, crc, uint32_t, buf, len); + CALC_CRC(_mm_crc32_u16, crc, uint16_t, buf, len); + CALC_CRC(_mm_crc32_u8, crc, uint8_t, buf, len); + + // Final inversion of the crc + return (crc ^ 0xFFFFFFFF); +} +/* Like crc32c_hw() but seeded directly, with no initial or final inversion; despite the name the result is the 32-bit CRC-32C state widened to uint64_t. */ +static inline uint64_t crc64c_hw(const void *input, size_t len, uint32_t seed) +{ + const char* buf = (const char*)input; + uint64_t crc = (uint64_t)seed; + + // Consume single bytes until buf is ALIGN_SIZE (8-byte) aligned + for (; (len > 0) && ((uintptr_t)buf & ALIGN_MASK); len--, buf++) { + crc = _mm_crc32_u8(crc, *buf); + } + + // Bulk of the data, widest chunks first; each narrower pass mops up the remainder +#ifdef __x86_64__ + CALC_CRC(_mm_crc32_u64, crc, uint64_t, buf, len); +#endif + CALC_CRC(_mm_crc32_u32, crc, uint32_t, buf, len); + CALC_CRC(_mm_crc32_u16, crc, uint16_t, buf, len); + CALC_CRC(_mm_crc32_u8, crc, uint8_t, buf, len); + + // No final inversion: return the raw CRC state + return crc; +} + +#endif diff --git a/src/util/hash_table.c b/src/util/hash_table.c index f2b8cf6..755b6a5 100644 --- a/src/util/hash_table.c +++ 
b/src/util/hash_table.c @@ -47,6 +47,8 @@ #include "hash_table.h" #include "ralloc.h" #include "macros.h" +#include "x86/common_x86_asm.h" +#include "crc32c_hw.h" static const uint32_t deleted_key_value; @@ -423,7 +425,6 @@ _mesa_hash_table_random_entry(struct hash_table *ht, return NULL; } - /** * Quick FNV-1a hash implementation based on: * http://www.isthe.com/chongo/tech/comp/fnv/ @@ -436,7 +437,12 @@ _mesa_hash_table_random_entry(struct hash_table *ht, uint32_t _mesa_hash_data(const void *data, size_t size) { +#ifdef __SSE4_2__ /* compiler-defined in SSE4.2 builds; crc32c_hw() is only declared then */ + if (cpu_has_sse4_2) + return crc32c_hw(data, size, _mesa_fnv32_1a_offset_bias); +#endif return murmur3_32(data, size, _mesa_fnv32_1a_offset_bias); + } /** FNV-1a string hash implementation */ @@ -446,6 +452,15 @@ _mesa_hash_string(const char *key) uint32_t hash = _mesa_fnv32_1a_offset_bias; while (*key != 0) { +#ifdef __SSE4_2__ /* same guard as crc32c_hw.h */ + if (cpu_has_sse4_2) { + while (*key != 0) { + hash = crc32c_hw(key, sizeof(*key), hash); + key++; + } + return hash; + } +#endif hash = _mesa_murmur3_32(hash, *key); key++; } -- 2.2.1 _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev