Current hitmask includes padding due to Intel's SIMD
implementation detail. This patch allows non Intel SIMD
implementations to benefit from a dense hitmask.
In addition, the new dense hitmask interweave the primary
and secondary matches which allow a better cache usage and
enable future improvements for the SIMD implementations
The default non SIMD path now use this dense mask.

Signed-off-by: Yoan Picchi <yoan.pic...@arm.com>
Reviewed-by: Ruifeng Wang <ruifeng.w...@arm.com>
Reviewed-by: Nathan Brown <nathan.br...@arm.com>
---
 lib/hash/compare_signatures_arm_pvt.h     |  47 ++++----
 lib/hash/compare_signatures_generic_pvt.h |  31 +++---
 lib/hash/compare_signatures_x86_pvt.h     |   9 +-
 lib/hash/rte_cuckoo_hash.c                | 124 ++++++++++++++++------
 4 files changed, 145 insertions(+), 66 deletions(-)

diff --git a/lib/hash/compare_signatures_arm_pvt.h 
b/lib/hash/compare_signatures_arm_pvt.h
index 74b3286c95..0fc657c49b 100644
--- a/lib/hash/compare_signatures_arm_pvt.h
+++ b/lib/hash/compare_signatures_arm_pvt.h
@@ -6,48 +6,57 @@
 #ifndef _COMPARE_SIGNATURE_ARM_PVT_H_
 #define _COMPARE_SIGNATURE_ARM_PVT_H_
 
+/*
+ * Arm's version uses a densely packed hitmask buffer:
+ * Every bit is in use.
+ */
+
 #include <inttypes.h>
 #include <rte_common.h>
 #include <rte_vect.h>
 
 #include "rte_cuckoo_hash.h"
 
+#define DENSE_HASH_BULK_LOOKUP 1
+
 static inline void
-compare_signatures(uint32_t *prim_hash_matches, uint32_t *sec_hash_matches,
-                       const struct rte_hash_bucket *prim_bkt,
-                       const struct rte_hash_bucket *sec_bkt,
+compare_signatures_dense(uint16_t *hitmask_buffer,
+                       const uint16_t *prim_bucket_sigs,
+                       const uint16_t *sec_bucket_sigs,
                        uint16_t sig,
                        enum rte_hash_sig_compare_function sig_cmp_fn)
 {
-       unsigned int i;
 
-       /* For match mask the first bit of every two bits indicates the match */
+       static_assert(sizeof(*hitmask_buffer) >= 2 * (RTE_HASH_BUCKET_ENTRIES / 
8),
+               "hitmask_buffer must be wide enough to fit a dense hitmask");
+
+       /* For match mask every bits indicates the match */
        switch (sig_cmp_fn) {
 #if defined(__ARM_NEON) && RTE_HASH_BUCKET_ENTRIES <= 8
        case RTE_HASH_COMPARE_NEON: {
                uint16x8_t vmat, vsig, x;
-               int16x8_t shift = {-15, -13, -11, -9, -7, -5, -3, -1};
+               int16x8_t shift = {0, 1, 2, 3, 4, 5, 6, 7};
+               uint16_t low, high;
 
                vsig = vld1q_dup_u16((uint16_t const *)&sig);
                /* Compare all signatures in the primary bucket */
-               vmat = vceqq_u16(vsig,
-                       vld1q_u16((uint16_t const *)prim_bkt->sig_current));
-               x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x8000)), shift);
-               *prim_hash_matches = (uint32_t)(vaddvq_u16(x));
+               vmat = vceqq_u16(vsig, vld1q_u16((uint16_t const 
*)prim_bucket_sigs));
+               x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x0001)), shift);
+               low = (uint16_t)(vaddvq_u16(x));
                /* Compare all signatures in the secondary bucket */
-               vmat = vceqq_u16(vsig,
-                       vld1q_u16((uint16_t const *)sec_bkt->sig_current));
-               x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x8000)), shift);
-               *sec_hash_matches = (uint32_t)(vaddvq_u16(x));
+               vmat = vceqq_u16(vsig, vld1q_u16((uint16_t const 
*)sec_bucket_sigs));
+               x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x0001)), shift);
+               high = (uint16_t)(vaddvq_u16(x));
+               *hitmask_buffer = low | high << RTE_HASH_BUCKET_ENTRIES;
+
                }
                break;
 #endif
        default:
-               for (i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
-                       *prim_hash_matches |=
-                               ((sig == prim_bkt->sig_current[i]) << (i << 1));
-                       *sec_hash_matches |=
-                               ((sig == sec_bkt->sig_current[i]) << (i << 1));
+               for (unsigned int i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
+                       *hitmask_buffer |= (sig == prim_bucket_sigs[i]) << i;
+                       *hitmask_buffer |=
+                               ((sig == sec_bucket_sigs[i]) << i) << 
RTE_HASH_BUCKET_ENTRIES;
                }
        }
 }
diff --git a/lib/hash/compare_signatures_generic_pvt.h 
b/lib/hash/compare_signatures_generic_pvt.h
index 43587adcef..1d065d4c28 100644
--- a/lib/hash/compare_signatures_generic_pvt.h
+++ b/lib/hash/compare_signatures_generic_pvt.h
@@ -6,27 +6,34 @@
 #ifndef _COMPARE_SIGNATURE_GENERIC_PVT_H_
 #define _COMPARE_SIGNATURE_GENERIC_PVT_H_
 
+/*
+ * The generic version could use either a dense or sparsely packed hitmask 
buffer,
+ * but the dense one is slightly faster.
+ */
+
 #include <inttypes.h>
 #include <rte_common.h>
 #include <rte_vect.h>
 
 #include "rte_cuckoo_hash.h"
 
+#define DENSE_HASH_BULK_LOOKUP 1
+
 static inline void
-compare_signatures(uint32_t *prim_hash_matches, uint32_t *sec_hash_matches,
-                       const struct rte_hash_bucket *prim_bkt,
-                       const struct rte_hash_bucket *sec_bkt,
+compare_signatures_dense(uint16_t *hitmask_buffer,
+                       const uint16_t *prim_bucket_sigs,
+                       const uint16_t *sec_bucket_sigs,
                        uint16_t sig,
-                       enum rte_hash_sig_compare_function sig_cmp_fn)
+                       __rte_unused enum rte_hash_sig_compare_function 
sig_cmp_fn)
 {
-       unsigned int i;
-
-       /* For match mask the first bit of every two bits indicates the match */
-       for (i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
-               *prim_hash_matches |=
-                       ((sig == prim_bkt->sig_current[i]) << (i << 1));
-               *sec_hash_matches |=
-                       ((sig == sec_bkt->sig_current[i]) << (i << 1));
+
+       static_assert(sizeof(*hitmask_buffer) >= 2 * (RTE_HASH_BUCKET_ENTRIES / 
8),
+                       "hitmask_buffer must be wide enough to fit a dense 
hitmask");
+
+       /* For match mask every bits indicates the match */
+       for (unsigned int i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
+               *hitmask_buffer |= (sig == prim_bucket_sigs[i]) << i;
+               *hitmask_buffer |= ((sig == sec_bucket_sigs[i]) << i) << 
RTE_HASH_BUCKET_ENTRIES;
        }
 }
 
diff --git a/lib/hash/compare_signatures_x86_pvt.h 
b/lib/hash/compare_signatures_x86_pvt.h
index f77b37f1cd..03e9c44e53 100644
--- a/lib/hash/compare_signatures_x86_pvt.h
+++ b/lib/hash/compare_signatures_x86_pvt.h
@@ -6,14 +6,21 @@
 #ifndef _COMPARE_SIGNATURE_X86_PVT_H_
 #define _COMPARE_SIGNATURE_X86_PVT_H_
 
+/*
+ * x86's version uses a sparsely packed hitmask buffer:
+ * Every other bit is padding.
+ */
+
 #include <inttypes.h>
 #include <rte_common.h>
 #include <rte_vect.h>
 
 #include "rte_cuckoo_hash.h"
 
+#define DENSE_HASH_BULK_LOOKUP 0
+
 static inline void
-compare_signatures(uint32_t *prim_hash_matches, uint32_t *sec_hash_matches,
+compare_signatures_sparse(uint32_t *prim_hash_matches, uint32_t 
*sec_hash_matches,
                        const struct rte_hash_bucket *prim_bkt,
                        const struct rte_hash_bucket *sec_bkt,
                        uint16_t sig,
diff --git a/lib/hash/rte_cuckoo_hash.c b/lib/hash/rte_cuckoo_hash.c
index 739f7927b8..187918a05a 100644
--- a/lib/hash/rte_cuckoo_hash.c
+++ b/lib/hash/rte_cuckoo_hash.c
@@ -1908,22 +1908,41 @@ __bulk_lookup_l(const struct rte_hash *h, const void 
**keys,
        uint64_t hits = 0;
        int32_t i;
        int32_t ret;
-       uint32_t prim_hitmask[RTE_HASH_LOOKUP_BULK_MAX] = {0};
-       uint32_t sec_hitmask[RTE_HASH_LOOKUP_BULK_MAX] = {0};
        struct rte_hash_bucket *cur_bkt, *next_bkt;
 
+#if DENSE_HASH_BULK_LOOKUP
+       const int hitmask_padding = 0;
+       uint16_t hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
+#else
+       const int hitmask_padding = 1;
+       uint32_t prim_hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
+       uint32_t sec_hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
+#endif
+
        __hash_rw_reader_lock(h);
 
        /* Compare signatures and prefetch key slot of first hit */
        for (i = 0; i < num_keys; i++) {
-               compare_signatures(&prim_hitmask[i], &sec_hitmask[i],
+#if DENSE_HASH_BULK_LOOKUP
+               uint16_t *hitmask = &hitmask_buffer[i];
+               compare_signatures_dense(hitmask,
+                       primary_bkt[i]->sig_current,
+                       secondary_bkt[i]->sig_current,
+                       sig[i], h->sig_cmp_fn);
+               const unsigned int prim_hitmask = *(uint8_t *)(hitmask);
+               const unsigned int sec_hitmask = *((uint8_t *)(hitmask)+1);
+#else
+               compare_signatures_sparse(&prim_hitmask_buffer[i], 
&sec_hitmask_buffer[i],
                        primary_bkt[i], secondary_bkt[i],
                        sig[i], h->sig_cmp_fn);
+               const unsigned int prim_hitmask = prim_hitmask_buffer[i];
+               const unsigned int sec_hitmask = sec_hitmask_buffer[i];
+#endif
 
-               if (prim_hitmask[i]) {
+               if (prim_hitmask) {
                        uint32_t first_hit =
-                                       rte_ctz32(prim_hitmask[i])
-                                       >> 1;
+                                       rte_ctz32(prim_hitmask)
+                                       >> hitmask_padding;
                        uint32_t key_idx =
                                primary_bkt[i]->key_idx[first_hit];
                        const struct rte_hash_key *key_slot =
@@ -1934,10 +1953,10 @@ __bulk_lookup_l(const struct rte_hash *h, const void 
**keys,
                        continue;
                }
 
-               if (sec_hitmask[i]) {
+               if (sec_hitmask) {
                        uint32_t first_hit =
-                                       rte_ctz32(sec_hitmask[i])
-                                       >> 1;
+                                       rte_ctz32(sec_hitmask)
+                                       >> hitmask_padding;
                        uint32_t key_idx =
                                secondary_bkt[i]->key_idx[first_hit];
                        const struct rte_hash_key *key_slot =
@@ -1951,10 +1970,18 @@ __bulk_lookup_l(const struct rte_hash *h, const void 
**keys,
        /* Compare keys, first hits in primary first */
        for (i = 0; i < num_keys; i++) {
                positions[i] = -ENOENT;
-               while (prim_hitmask[i]) {
+#if DENSE_HASH_BULK_LOOKUP
+               uint16_t *hitmask = &hitmask_buffer[i];
+               unsigned int prim_hitmask = *(uint8_t *)(hitmask);
+               unsigned int sec_hitmask = *((uint8_t *)(hitmask)+1);
+#else
+               unsigned int prim_hitmask = prim_hitmask_buffer[i];
+               unsigned int sec_hitmask = sec_hitmask_buffer[i];
+#endif
+               while (prim_hitmask) {
                        uint32_t hit_index =
-                                       rte_ctz32(prim_hitmask[i])
-                                       >> 1;
+                                       rte_ctz32(prim_hitmask)
+                                       >> hitmask_padding;
                        uint32_t key_idx =
                                primary_bkt[i]->key_idx[hit_index];
                        const struct rte_hash_key *key_slot =
@@ -1976,13 +2003,13 @@ __bulk_lookup_l(const struct rte_hash *h, const void 
**keys,
                                positions[i] = key_idx - 1;
                                goto next_key;
                        }
-                       prim_hitmask[i] &= ~(3ULL << (hit_index << 1));
+                       prim_hitmask &= ~(1 << (hit_index << hitmask_padding));
                }
 
-               while (sec_hitmask[i]) {
+               while (sec_hitmask) {
                        uint32_t hit_index =
-                                       rte_ctz32(sec_hitmask[i])
-                                       >> 1;
+                                       rte_ctz32(sec_hitmask)
+                                       >> hitmask_padding;
                        uint32_t key_idx =
                                secondary_bkt[i]->key_idx[hit_index];
                        const struct rte_hash_key *key_slot =
@@ -2005,7 +2032,7 @@ __bulk_lookup_l(const struct rte_hash *h, const void 
**keys,
                                positions[i] = key_idx - 1;
                                goto next_key;
                        }
-                       sec_hitmask[i] &= ~(3ULL << (hit_index << 1));
+                       sec_hitmask &= ~(1 << (hit_index << hitmask_padding));
                }
 next_key:
                continue;
@@ -2055,11 +2082,20 @@ __bulk_lookup_lf(const struct rte_hash *h, const void 
**keys,
        uint64_t hits = 0;
        int32_t i;
        int32_t ret;
-       uint32_t prim_hitmask[RTE_HASH_LOOKUP_BULK_MAX] = {0};
-       uint32_t sec_hitmask[RTE_HASH_LOOKUP_BULK_MAX] = {0};
        struct rte_hash_bucket *cur_bkt, *next_bkt;
        uint32_t cnt_b, cnt_a;
 
+#if DENSE_HASH_BULK_LOOKUP
+       const int hitmask_padding = 0;
+       uint16_t hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
+       static_assert(sizeof(*hitmask_buffer)*8/2 == RTE_HASH_BUCKET_ENTRIES,
+       "The hitmask must be exactly wide enough to accept the whole hitmask 
chen it is dense");
+#else
+       const int hitmask_padding = 1;
+       uint32_t prim_hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
+       uint32_t sec_hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
+#endif
+
        for (i = 0; i < num_keys; i++)
                positions[i] = -ENOENT;
 
@@ -2073,14 +2109,26 @@ __bulk_lookup_lf(const struct rte_hash *h, const void 
**keys,
 
                /* Compare signatures and prefetch key slot of first hit */
                for (i = 0; i < num_keys; i++) {
-                       compare_signatures(&prim_hitmask[i], &sec_hitmask[i],
+#if DENSE_HASH_BULK_LOOKUP
+                       uint16_t *hitmask = &hitmask_buffer[i];
+                       compare_signatures_dense(hitmask,
+                               primary_bkt[i]->sig_current,
+                               secondary_bkt[i]->sig_current,
+                               sig[i], h->sig_cmp_fn);
+                       const unsigned int prim_hitmask = *(uint8_t *)(hitmask);
+                       const unsigned int sec_hitmask = *((uint8_t 
*)(hitmask)+1);
+#else
+                       compare_signatures_sparse(&prim_hitmask_buffer[i], 
&sec_hitmask_buffer[i],
                                primary_bkt[i], secondary_bkt[i],
                                sig[i], h->sig_cmp_fn);
+                       const unsigned int prim_hitmask = 
prim_hitmask_buffer[i];
+                       const unsigned int sec_hitmask = sec_hitmask_buffer[i];
+#endif
 
-                       if (prim_hitmask[i]) {
+                       if (prim_hitmask) {
                                uint32_t first_hit =
-                                               rte_ctz32(prim_hitmask[i])
-                                               >> 1;
+                                               rte_ctz32(prim_hitmask)
+                                               >> hitmask_padding;
                                uint32_t key_idx =
                                        primary_bkt[i]->key_idx[first_hit];
                                const struct rte_hash_key *key_slot =
@@ -2091,10 +2139,10 @@ __bulk_lookup_lf(const struct rte_hash *h, const void 
**keys,
                                continue;
                        }
 
-                       if (sec_hitmask[i]) {
+                       if (sec_hitmask) {
                                uint32_t first_hit =
-                                               rte_ctz32(sec_hitmask[i])
-                                               >> 1;
+                                               rte_ctz32(sec_hitmask)
+                                               >> hitmask_padding;
                                uint32_t key_idx =
                                        secondary_bkt[i]->key_idx[first_hit];
                                const struct rte_hash_key *key_slot =
@@ -2107,10 +2155,18 @@ __bulk_lookup_lf(const struct rte_hash *h, const void 
**keys,
 
                /* Compare keys, first hits in primary first */
                for (i = 0; i < num_keys; i++) {
-                       while (prim_hitmask[i]) {
+#if DENSE_HASH_BULK_LOOKUP
+                       uint16_t *hitmask = &hitmask_buffer[i];
+                       unsigned int prim_hitmask = *(uint8_t *)(hitmask);
+                       unsigned int sec_hitmask = *((uint8_t *)(hitmask)+1);
+#else
+                       unsigned int prim_hitmask = prim_hitmask_buffer[i];
+                       unsigned int sec_hitmask = sec_hitmask_buffer[i];
+#endif
+                       while (prim_hitmask) {
                                uint32_t hit_index =
-                                               rte_ctz32(prim_hitmask[i])
-                                               >> 1;
+                                               rte_ctz32(prim_hitmask)
+                                               >> hitmask_padding;
                                uint32_t key_idx =
                                rte_atomic_load_explicit(
                                        &primary_bkt[i]->key_idx[hit_index],
@@ -2136,13 +2192,13 @@ __bulk_lookup_lf(const struct rte_hash *h, const void 
**keys,
                                        positions[i] = key_idx - 1;
                                        goto next_key;
                                }
-                               prim_hitmask[i] &= ~(3ULL << (hit_index << 1));
+                               prim_hitmask &= ~(1 << (hit_index << 
hitmask_padding));
                        }
 
-                       while (sec_hitmask[i]) {
+                       while (sec_hitmask) {
                                uint32_t hit_index =
-                                               rte_ctz32(sec_hitmask[i])
-                                               >> 1;
+                                               rte_ctz32(sec_hitmask)
+                                               >> hitmask_padding;
                                uint32_t key_idx =
                                rte_atomic_load_explicit(
                                        &secondary_bkt[i]->key_idx[hit_index],
@@ -2169,7 +2225,7 @@ __bulk_lookup_lf(const struct rte_hash *h, const void 
**keys,
                                        positions[i] = key_idx - 1;
                                        goto next_key;
                                }
-                               sec_hitmask[i] &= ~(3ULL << (hit_index << 1));
+                               sec_hitmask &= ~(1 << (hit_index << 
hitmask_padding));
                        }
 next_key:
                        continue;
-- 
2.34.1

Reply via email to