The current hitmask includes padding due to an implementation detail
of Intel's SIMD code path. This patch allows non-Intel SIMD
implementations to benefit from a dense hitmask.

Signed-off-by: Yoan Picchi <yoan.pic...@arm.com>
---
 .mailmap                   |   2 +
 lib/hash/rte_cuckoo_hash.c | 118 ++++++++++++++++++++++++++-----------
 2 files changed, 86 insertions(+), 34 deletions(-)

diff --git a/.mailmap b/.mailmap
index 3f5bab26a8..b9c49aa7f6 100644
--- a/.mailmap
+++ b/.mailmap
@@ -485,6 +485,7 @@ Hari Kumar Vemula <hari.kumarx.vem...@intel.com>
 Harini Ramakrishnan <harini.ramakrish...@microsoft.com>
 Hariprasad Govindharajan <hariprasad.govindhara...@intel.com>
 Harish Patil <harish.pa...@cavium.com> <harish.pa...@qlogic.com>
+Harjot Singh <harjot.si...@arm.com>
 Harman Kalra <hka...@marvell.com>
 Harneet Singh <harneet.si...@intel.com>
 Harold Huang <baymaxhu...@gmail.com>
@@ -1602,6 +1603,7 @@ Yixue Wang <yixue.w...@intel.com>
 Yi Yang <yangy...@inspur.com> <yi.y.y...@intel.com>
 Yi Zhang <zhang.y...@zte.com.cn>
 Yoann Desmouceaux <ydesm...@cisco.com>
+Yoan Picchi <yoan.pic...@arm.com>
 Yogesh Jangra <yogesh.jan...@intel.com>
 Yogev Chaimovich <yo...@cgstowernetworks.com>
 Yongjie Gu <yongjiex...@intel.com>
diff --git a/lib/hash/rte_cuckoo_hash.c b/lib/hash/rte_cuckoo_hash.c
index 19b23f2a97..2aa96eb862 100644
--- a/lib/hash/rte_cuckoo_hash.c
+++ b/lib/hash/rte_cuckoo_hash.c
@@ -1850,8 +1850,50 @@ rte_hash_free_key_with_position(const struct rte_hash *h,
 
 }
 
+#if defined(__ARM_NEON)
+
+static inline void
+compare_signatures_dense(uint32_t *prim_hash_matches, uint32_t 
*sec_hash_matches,
+                       const struct rte_hash_bucket *prim_bkt,
+                       const struct rte_hash_bucket *sec_bkt,
+                       uint16_t sig,
+                       enum rte_hash_sig_compare_function sig_cmp_fn)
+{
+       unsigned int i;
+
+       /*
+        * Compare sig with every signature of the primary and secondary
+        * buckets and return the results as dense match masks: bit i of
+        * *prim_hash_matches (resp. *sec_hash_matches) is set when
+        * prim_bkt->sig_current[i] (resp. sec_bkt->sig_current[i]) equals
+        * sig. There are no padding bits between entries.
+        *
+        * The NEON case overwrites both output masks, whereas the default
+        * (scalar) case ORs into them and therefore keeps any bit that is
+        * already set on entry.
+        * NOTE(review): the scalar case presumably relies on the caller
+        * passing zeroed masks - confirm at every call site.
+        */
+       switch (sig_cmp_fn) {
+       case RTE_HASH_COMPARE_NEON: {
+               uint16x8_t vmat, vsig, x;
+               int16x8_t shift = {0, 1, 2, 3, 4, 5, 6, 7};
+
+               vsig = vld1q_dup_u16((uint16_t const *)&sig);
+               /*
+                * Compare all signatures in the primary bucket: each
+                * matching lane is reduced to its lowest bit, shifted left
+                * by its lane index, and the lanes are then summed into a
+                * single mask (every lane contributes a distinct bit).
+                * NOTE(review): eight 16-bit signatures are loaded at once;
+                * assumes sig_current holds at least eight entries - confirm.
+                */
+               vmat = vceqq_u16(vsig,
+                       vld1q_u16((uint16_t const *)prim_bkt->sig_current));
+               x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x0001)), shift);
+               *prim_hash_matches = (uint32_t)(vaddvq_u16(x));
+               /*
+                * Compare all signatures in the secondary bucket, using the
+                * same reduce / shift / sum sequence as for the primary one.
+                */
+               vmat = vceqq_u16(vsig,
+                       vld1q_u16((uint16_t const *)sec_bkt->sig_current));
+               x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x0001)), shift);
+               *sec_hash_matches = (uint32_t)(vaddvq_u16(x));
+               }
+               break;
+       default:
+               for (i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
+                       *prim_hash_matches |=
+                               ((sig == prim_bkt->sig_current[i]) << i);
+                       *sec_hash_matches |=
+                               ((sig == sec_bkt->sig_current[i]) << i);
+               }
+       }
+}
+
+#else
+
 static inline void
-compare_signatures(uint32_t *prim_hash_matches, uint32_t *sec_hash_matches,
+compare_signatures_sparse(uint32_t *prim_hash_matches, uint32_t 
*sec_hash_matches,
                        const struct rte_hash_bucket *prim_bkt,
                        const struct rte_hash_bucket *sec_bkt,
                        uint16_t sig,
@@ -1878,25 +1920,7 @@ compare_signatures(uint32_t *prim_hash_matches, uint32_t *sec_hash_matches,
                /* Extract the even-index bits only */
                *sec_hash_matches &= 0x5555;
                break;
-#elif defined(__ARM_NEON)
-       case RTE_HASH_COMPARE_NEON: {
-               uint16x8_t vmat, vsig, x;
-               int16x8_t shift = {-15, -13, -11, -9, -7, -5, -3, -1};
-
-               vsig = vld1q_dup_u16((uint16_t const *)&sig);
-               /* Compare all signatures in the primary bucket */
-               vmat = vceqq_u16(vsig,
-                       vld1q_u16((uint16_t const *)prim_bkt->sig_current));
-               x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x8000)), shift);
-               *prim_hash_matches = (uint32_t)(vaddvq_u16(x));
-               /* Compare all signatures in the secondary bucket */
-               vmat = vceqq_u16(vsig,
-                       vld1q_u16((uint16_t const *)sec_bkt->sig_current));
-               x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x8000)), shift);
-               *sec_hash_matches = (uint32_t)(vaddvq_u16(x));
-               }
-               break;
-#endif
+#endif /* defined(__SSE2__) */
        default:
                for (i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
                        *prim_hash_matches |=
@@ -1907,6 +1931,8 @@ compare_signatures(uint32_t *prim_hash_matches, uint32_t *sec_hash_matches,
        }
 }
 
+#endif /* defined(__ARM_NEON) */
+
 static inline void
 __bulk_lookup_l(const struct rte_hash *h, const void **keys,
                const struct rte_hash_bucket **primary_bkt,
@@ -1921,18 +1947,30 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
        uint32_t sec_hitmask[RTE_HASH_LOOKUP_BULK_MAX] = {0};
        struct rte_hash_bucket *cur_bkt, *next_bkt;
 
+#if defined(__ARM_NEON)
+       const int hitmask_padding = 0;
+#else
+       const int hitmask_padding = 1;
+#endif
+
        __hash_rw_reader_lock(h);
 
        /* Compare signatures and prefetch key slot of first hit */
        for (i = 0; i < num_keys; i++) {
-               compare_signatures(&prim_hitmask[i], &sec_hitmask[i],
+#if defined(__ARM_NEON)
+               compare_signatures_dense(&prim_hitmask[i], &sec_hitmask[i],
+                       primary_bkt[i], secondary_bkt[i],
+                       sig[i], h->sig_cmp_fn);
+#else
+               compare_signatures_sparse(&prim_hitmask[i], &sec_hitmask[i],
                        primary_bkt[i], secondary_bkt[i],
                        sig[i], h->sig_cmp_fn);
+#endif
 
                if (prim_hitmask[i]) {
                        uint32_t first_hit =
                                        __builtin_ctzl(prim_hitmask[i])
-                                       >> 1;
+                                       >> hitmask_padding;
                        uint32_t key_idx =
                                primary_bkt[i]->key_idx[first_hit];
                        const struct rte_hash_key *key_slot =
@@ -1946,7 +1984,7 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
                if (sec_hitmask[i]) {
                        uint32_t first_hit =
                                        __builtin_ctzl(sec_hitmask[i])
-                                       >> 1;
+                                       >> hitmask_padding;
                        uint32_t key_idx =
                                secondary_bkt[i]->key_idx[first_hit];
                        const struct rte_hash_key *key_slot =
@@ -1963,7 +2001,7 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
                while (prim_hitmask[i]) {
                        uint32_t hit_index =
                                        __builtin_ctzl(prim_hitmask[i])
-                                       >> 1;
+                                       >> hitmask_padding;
                        uint32_t key_idx =
                                primary_bkt[i]->key_idx[hit_index];
                        const struct rte_hash_key *key_slot =
@@ -1985,13 +2023,13 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
                                positions[i] = key_idx - 1;
                                goto next_key;
                        }
-                       prim_hitmask[i] &= ~(3ULL << (hit_index << 1));
+                       prim_hitmask[i] &= ~(1 << (hit_index << 
hitmask_padding));
                }
 
                while (sec_hitmask[i]) {
                        uint32_t hit_index =
                                        __builtin_ctzl(sec_hitmask[i])
-                                       >> 1;
+                                       >> hitmask_padding;
                        uint32_t key_idx =
                                secondary_bkt[i]->key_idx[hit_index];
                        const struct rte_hash_key *key_slot =
@@ -2014,7 +2052,7 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
                                positions[i] = key_idx - 1;
                                goto next_key;
                        }
-                       sec_hitmask[i] &= ~(3ULL << (hit_index << 1));
+                       sec_hitmask[i] &= ~(1 << (hit_index << 
hitmask_padding));
                }
 next_key:
                continue;
@@ -2069,6 +2107,12 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
        struct rte_hash_bucket *cur_bkt, *next_bkt;
        uint32_t cnt_b, cnt_a;
 
+#if defined(__ARM_NEON)
+       const int hitmask_padding = 0;
+#else
+       const int hitmask_padding = 1;
+#endif
+
        for (i = 0; i < num_keys; i++)
                positions[i] = -ENOENT;
 
@@ -2082,14 +2126,20 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
 
                /* Compare signatures and prefetch key slot of first hit */
                for (i = 0; i < num_keys; i++) {
-                       compare_signatures(&prim_hitmask[i], &sec_hitmask[i],
+#if defined(__ARM_NEON)
+                       compare_signatures_dense(&prim_hitmask[i], 
&sec_hitmask[i],
                                primary_bkt[i], secondary_bkt[i],
                                sig[i], h->sig_cmp_fn);
+#else
+                       compare_signatures_sparse(&prim_hitmask[i], 
&sec_hitmask[i],
+                               primary_bkt[i], secondary_bkt[i],
+                               sig[i], h->sig_cmp_fn);
+#endif
 
                        if (prim_hitmask[i]) {
                                uint32_t first_hit =
                                                __builtin_ctzl(prim_hitmask[i])
-                                               >> 1;
+                                               >> hitmask_padding;
                                uint32_t key_idx =
                                        primary_bkt[i]->key_idx[first_hit];
                                const struct rte_hash_key *key_slot =
@@ -2103,7 +2153,7 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
                        if (sec_hitmask[i]) {
                                uint32_t first_hit =
                                                __builtin_ctzl(sec_hitmask[i])
-                                               >> 1;
+                                               >> hitmask_padding;
                                uint32_t key_idx =
                                        secondary_bkt[i]->key_idx[first_hit];
                                const struct rte_hash_key *key_slot =
@@ -2119,7 +2169,7 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
                        while (prim_hitmask[i]) {
                                uint32_t hit_index =
                                                __builtin_ctzl(prim_hitmask[i])
-                                               >> 1;
+                                               >> hitmask_padding;
                                uint32_t key_idx =
                                __atomic_load_n(
                                        &primary_bkt[i]->key_idx[hit_index],
@@ -2145,13 +2195,13 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
                                        positions[i] = key_idx - 1;
                                        goto next_key;
                                }
-                               prim_hitmask[i] &= ~(3ULL << (hit_index << 1));
+                               prim_hitmask[i] &= ~(1 << (hit_index << 
hitmask_padding));
                        }
 
                        while (sec_hitmask[i]) {
                                uint32_t hit_index =
                                                __builtin_ctzl(sec_hitmask[i])
-                                               >> 1;
+                                               >> hitmask_padding;
                                uint32_t key_idx =
                                __atomic_load_n(
                                        &secondary_bkt[i]->key_idx[hit_index],
@@ -2178,7 +2228,7 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
                                        positions[i] = key_idx - 1;
                                        goto next_key;
                                }
-                               sec_hitmask[i] &= ~(3ULL << (hit_index << 1));
+                               sec_hitmask[i] &= ~(1 << (hit_index << 
hitmask_padding));
                        }
 next_key:
                        continue;
-- 
2.25.1

Reply via email to