From: Byron Marohn <byron.mar...@intel.com>

In lookup bulk function, the signatures of all entries
are compared against the signature of the key that is being looked up.
Now that all the signatures are together, they can be compared
with vector instructions (SSE, AVX2), achieving higher lookup performance.

Also, entries per bucket are increased to 8 when using processors
with AVX2, as 256 bits can be compared at once, which is the size of
8x32-bit signatures.

Signed-off-by: Byron Marohn <byron.marohn at intel.com>
Signed-off-by: Saikrishna Edupuganti <saikrishna.edupuganti at intel.com>
Signed-off-by: Pablo de Lara <pablo.de.lara.guarch at intel.com>
---
 lib/librte_hash/rte_cuckoo_hash.c | 41 ++++++++++++++++++++++++++++++++++-----
 lib/librte_hash/rte_cuckoo_hash.h |  4 ++++
 2 files changed, 40 insertions(+), 5 deletions(-)

diff --git a/lib/librte_hash/rte_cuckoo_hash.c 
b/lib/librte_hash/rte_cuckoo_hash.c
index 9d507b6..98713d3 100644
--- a/lib/librte_hash/rte_cuckoo_hash.c
+++ b/lib/librte_hash/rte_cuckoo_hash.c
@@ -939,6 +939,38 @@ lookup_stage1(unsigned idx, hash_sig_t *prim_hash, 
hash_sig_t *sec_hash,
        rte_prefetch0(*secondary_bkt);
 }

+static inline void
+compare_signatures(unsigned *prim_hash_matches, unsigned *sec_hash_matches,
+                               const struct rte_hash_bucket *prim_bkt,
+                               const struct rte_hash_bucket *sec_bkt,
+                               hash_sig_t prim_hash, hash_sig_t sec_hash)
+{
+/* 8 entries per bucket */
+#if defined(__AVX2__)
+       *prim_hash_matches |= _mm256_movemask_ps((__m256)_mm256_cmpeq_epi32(
+                       _mm256_load_si256((__m256i const 
*)prim_bkt->sig_current),
+                       _mm256_set1_epi32(prim_hash)));
+       *sec_hash_matches |= _mm256_movemask_ps((__m256)_mm256_cmpeq_epi32(
+                       _mm256_load_si256((__m256i const 
*)sec_bkt->sig_current),
+                       _mm256_set1_epi32(sec_hash)));
+/* 4 entries per bucket */
+#elif defined(__SSE2__)
+       *prim_hash_matches |= _mm_movemask_ps((__m128)_mm_cmpeq_epi16(
+                       _mm_load_si128((__m128i const *)prim_bkt->sig_current),
+                       _mm_set1_epi32(prim_hash)));
+       *sec_hash_matches |= _mm_movemask_ps((__m128)_mm_cmpeq_epi16(
+                       _mm_load_si128((__m128i const *)sec_bkt->sig_current),
+                       _mm_set1_epi32(sec_hash)));
+#else
+       unsigned i;
+
+       for (i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
+               *prim_hash_matches |= ((prim_hash == prim_bkt->sig_current[i]) 
<< i);
+               *sec_hash_matches |= ((sec_hash == sec_bkt->sig_current[i]) << 
i);
+       }
+#endif
+}
+
 /*
  * Lookup bulk stage 2:  Search for match hashes in primary/secondary locations
  * and prefetch first key slot
@@ -951,15 +983,14 @@ lookup_stage2(unsigned idx, hash_sig_t prim_hash, 
hash_sig_t sec_hash,
                uint64_t *extra_hits_mask, const void *keys,
                const struct rte_hash *h)
 {
-       unsigned prim_hash_matches, sec_hash_matches, key_idx, i;
+       unsigned prim_hash_matches, sec_hash_matches, key_idx;
        unsigned total_hash_matches;

        prim_hash_matches = 1 << RTE_HASH_BUCKET_ENTRIES;
        sec_hash_matches = 1 << RTE_HASH_BUCKET_ENTRIES;
-       for (i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
-               prim_hash_matches |= ((prim_hash == prim_bkt->sig_current[i]) 
<< i);
-               sec_hash_matches |= ((sec_hash == sec_bkt->sig_current[i]) << 
i);
-       }
+
+       compare_signatures(&prim_hash_matches, &sec_hash_matches, prim_bkt,
+                                               sec_bkt, prim_hash, sec_hash);

        key_idx = prim_bkt->key_idx[__builtin_ctzl(prim_hash_matches)];
        if (key_idx == 0)
diff --git a/lib/librte_hash/rte_cuckoo_hash.h 
b/lib/librte_hash/rte_cuckoo_hash.h
index fe0654f..eb57d7e 100644
--- a/lib/librte_hash/rte_cuckoo_hash.h
+++ b/lib/librte_hash/rte_cuckoo_hash.h
@@ -130,7 +130,11 @@ enum add_key_case {
 };

 /** Number of items per bucket. */
+#if defined(__AVX2__)
+#define RTE_HASH_BUCKET_ENTRIES                8
+#else
 #define RTE_HASH_BUCKET_ENTRIES                4
+#endif

 #define NULL_SIGNATURE                 0

-- 
2.7.4

Reply via email to