On 3/4/24 13:35, Konstantin Ananyev wrote:


- Implemented SVE code for comparing signatures in bulk lookup.
- Added Defines in code for SVE code support.
- Optimise NEON code
- New SVE code is ~5% slower than optimized NEON for N2 processor.

Signed-off-by: Yoan Picchi <yoan.pic...@arm.com>
Signed-off-by: Harjot Singh <harjot.si...@arm.com>
Reviewed-by: Nathan Brown <nathan.br...@arm.com>
Reviewed-by: Ruifeng Wang <ruifeng.w...@arm.com>
---
   lib/hash/rte_cuckoo_hash.c | 196 ++++++++++++++++++++++++++++---------
   lib/hash/rte_cuckoo_hash.h |   1 +
   2 files changed, 151 insertions(+), 46 deletions(-)

diff --git a/lib/hash/rte_cuckoo_hash.c b/lib/hash/rte_cuckoo_hash.c
index a07dd3a28d..231d6d6ded 100644
--- a/lib/hash/rte_cuckoo_hash.c
+++ b/lib/hash/rte_cuckoo_hash.c
@@ -442,8 +442,11 @@ rte_hash_create(const struct rte_hash_parameters *params)
                h->sig_cmp_fn = RTE_HASH_COMPARE_SSE;
        else
   #elif defined(RTE_ARCH_ARM64)
-       if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_NEON))
+       if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_NEON)) {
                h->sig_cmp_fn = RTE_HASH_COMPARE_NEON;
+               if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_SVE))
+                       h->sig_cmp_fn = RTE_HASH_COMPARE_SVE;
+       }
        else
   #endif
                h->sig_cmp_fn = RTE_HASH_COMPARE_SCALAR;
@@ -1860,37 +1863,103 @@ rte_hash_free_key_with_position(const struct rte_hash 
*h,
   #if defined(__ARM_NEON)

   static inline void
-compare_signatures_dense(uint32_t *prim_hash_matches, uint32_t 
*sec_hash_matches,
-                       const struct rte_hash_bucket *prim_bkt,
-                       const struct rte_hash_bucket *sec_bkt,
+compare_signatures_dense(uint16_t *hitmask_buffer,
+                       const uint16_t *prim_bucket_sigs,
+                       const uint16_t *sec_bucket_sigs,
                        uint16_t sig,
                        enum rte_hash_sig_compare_function sig_cmp_fn)
   {
        unsigned int i;

+       static_assert(sizeof(*hitmask_buffer) >= 2*(RTE_HASH_BUCKET_ENTRIES/8),
+       "The hitmask must be at least wide enough to accept the whole hitmask if it 
is dense");
+
        /* For match mask every bits indicates the match */
        switch (sig_cmp_fn) {

Can I ask to move arch specific comparison code into some arch-specific headers 
or so?
It is getting really hard to read and understand the generic code with all 
these ifdefs and arch specific instructions...


Hi, apologies for the long delay in response.

I can easily enough move the compare_signatures into an arm/x86
directory, and have a default version in the code.

Yes, that's what I thought about.
The problem would be for bulk lookup. The function is already duplicated
   2 times (the l and lf versions). If I remove the #ifdefs, I'll need to
duplicate them again into 4 nearly identical versions (dense and
sparse). The only other option I see would be some preprocessor macro
to patch the function, but that looks even dirtier to me.

Not sure I understood you here: from looking at the code I don't see any
arch specific ifdefs in bulk_lookup() routines.
What am I missing here?

Most if not all of those #if are architecture specific. For instance:
#if defined(__ARM_NEON)
#if defined(RTE_HAS_SVE_ACLE)

The main reason there's some #if in bulk lookup is to handle whether the function runs with a dense hitmask or a sparse hitmask. x86 only supports the sparse hitmask version (1 bit data, 1 bit padding) but arm supports the dense hitmask (every bit counts). The latter ends up being faster. Splitting bulk_lookup into its sparse and dense variants would be a lot of code duplication that I'd prefer to avoid.

What I might be able to do would be to move compare_signatures into some arch-specific version. The functions are different enough that it wouldn't be too much of a code duplication. I'd argue though that the #ifdef for NEON and SSE were already there and I only added the SVE variant.


I think duplicating the code would be bad, but I can do it if you want.
Unless you have a better solution?

+#if RTE_HASH_BUCKET_ENTRIES <= 8
        case RTE_HASH_COMPARE_NEON: {
-               uint16x8_t vmat, x;
+               uint16x8_t vmat, hit1, hit2;
                const uint16x8_t mask = {0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 
0x80};
                const uint16x8_t vsig = vld1q_dup_u16((uint16_t const *)&sig);

                /* Compare all signatures in the primary bucket */
-               vmat = vceqq_u16(vsig, vld1q_u16((uint16_t const 
*)prim_bkt->sig_current));
-               x = vandq_u16(vmat, mask);
-               *prim_hash_matches = (uint32_t)(vaddvq_u16(x));
+               vmat = vceqq_u16(vsig, vld1q_u16(prim_bucket_sigs));
+               hit1 = vandq_u16(vmat, mask);
+
                /* Compare all signatures in the secondary bucket */
-               vmat = vceqq_u16(vsig, vld1q_u16((uint16_t const 
*)sec_bkt->sig_current));
-               x = vandq_u16(vmat, mask);
-               *sec_hash_matches = (uint32_t)(vaddvq_u16(x));
+               vmat = vceqq_u16(vsig, vld1q_u16(sec_bucket_sigs));
+               hit2 = vandq_u16(vmat, mask);
+
+               hit2 = vshlq_n_u16(hit2, RTE_HASH_BUCKET_ENTRIES);
+               hit2 = vorrq_u16(hit1, hit2);
+               *hitmask_buffer = vaddvq_u16(hit2);
+               }
+               break;
+#endif
+#if defined(RTE_HAS_SVE_ACLE)
+       case RTE_HASH_COMPARE_SVE: {
+               svuint16_t vsign, shift, sv_matches;
+               svbool_t pred, match, bucket_wide_pred;
+               int i = 0;
+               uint64_t vl = svcnth();
+
+               vsign = svdup_u16(sig);
+               shift = svindex_u16(0, 1);
+
+               if (vl >= 2 * RTE_HASH_BUCKET_ENTRIES && RTE_HASH_BUCKET_ENTRIES 
<= 8) {
+                       svuint16_t primary_array_vect, secondary_array_vect;
+                       bucket_wide_pred = svwhilelt_b16(0, 
RTE_HASH_BUCKET_ENTRIES);
+                       primary_array_vect = svld1_u16(bucket_wide_pred, 
prim_bucket_sigs);
+                       secondary_array_vect = svld1_u16(bucket_wide_pred, 
sec_bucket_sigs);
+
+                       /* We merged the two vectors so we can do both 
comparison at once */
+                       primary_array_vect = svsplice_u16(bucket_wide_pred,
+                               primary_array_vect,
+                               secondary_array_vect);
+                       pred = svwhilelt_b16(0, 2*RTE_HASH_BUCKET_ENTRIES);
+
+                       /* Compare all signatures in the buckets */
+                       match = svcmpeq_u16(pred, vsign, primary_array_vect);
+                       if (svptest_any(svptrue_b16(), match)) {
+                               sv_matches = svdup_u16(1);
+                               sv_matches = svlsl_u16_z(match, sv_matches, 
shift);
+                               *hitmask_buffer = svorv_u16(svptrue_b16(), 
sv_matches);
+                       }
+               } else {
+                       do {
+                               pred = svwhilelt_b16(i, 
RTE_HASH_BUCKET_ENTRIES);
+                               uint16_t lower_half = 0;
+                               uint16_t upper_half = 0;
+                               /* Compare all signatures in the primary bucket 
*/
+                               match = svcmpeq_u16(pred, vsign, svld1_u16(pred,
+                                                       &prim_bucket_sigs[i]));
+                               if (svptest_any(svptrue_b16(), match)) {
+                                       sv_matches = svdup_u16(1);
+                                       sv_matches = svlsl_u16_z(match, 
sv_matches, shift);
+                                       lower_half = svorv_u16(svptrue_b16(), 
sv_matches);
+                               }
+                               /* Compare all signatures in the secondary 
bucket */
+                               match = svcmpeq_u16(pred, vsign, svld1_u16(pred,
+                                                       &sec_bucket_sigs[i]));
+                               if (svptest_any(svptrue_b16(), match)) {
+                                       sv_matches = svdup_u16(1);
+                                       sv_matches = svlsl_u16_z(match, 
sv_matches, shift);
+                                       upper_half = svorv_u16(svptrue_b16(), 
sv_matches)
+                                               << RTE_HASH_BUCKET_ENTRIES;
+                               }
+                               hitmask_buffer[i/8] = upper_half | lower_half;
+                               i += vl;
+                       } while (i < RTE_HASH_BUCKET_ENTRIES);
+               }
                }
                break;
+#endif
        default:
                for (i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
-                       *prim_hash_matches |=
-                               ((sig == prim_bkt->sig_current[i]) << i);
-                       *sec_hash_matches |=
-                               ((sig == sec_bkt->sig_current[i]) << i);
+                       *hitmask_buffer |=
+                               ((sig == prim_bucket_sigs[i]) << i);
+                       *hitmask_buffer |=
+                               ((sig == sec_bucket_sigs[i]) << i) << 
RTE_HASH_BUCKET_ENTRIES;
                }
        }
   }
@@ -1908,7 +1977,7 @@ compare_signatures_sparse(uint32_t *prim_hash_matches, 
uint32_t *sec_hash_matche

        /* For match mask the first bit of every two bits indicates the match */
        switch (sig_cmp_fn) {
-#if defined(__SSE2__)
+#if defined(__SSE2__) && RTE_HASH_BUCKET_ENTRIES <= 8
        case RTE_HASH_COMPARE_SSE:
                /* Compare all signatures in the bucket */
                *prim_hash_matches = _mm_movemask_epi8(_mm_cmpeq_epi16(
@@ -1948,14 +2017,18 @@ __bulk_lookup_l(const struct rte_hash *h, const void 
**keys,
        uint64_t hits = 0;
        int32_t i;
        int32_t ret;
-       uint32_t prim_hitmask[RTE_HASH_LOOKUP_BULK_MAX] = {0};
-       uint32_t sec_hitmask[RTE_HASH_LOOKUP_BULK_MAX] = {0};
        struct rte_hash_bucket *cur_bkt, *next_bkt;

   #if defined(__ARM_NEON)
        const int hitmask_padding = 0;
+       uint16_t hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
+
+       static_assert(sizeof(*hitmask_buffer)*8/2 == RTE_HASH_BUCKET_ENTRIES,
+       "The hitmask must be exactly wide enough to accept the whole hitmask when it 
is dense");
   #else
        const int hitmask_padding = 1;
+       uint32_t prim_hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
+       uint32_t sec_hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
   #endif

        __hash_rw_reader_lock(h);
@@ -1963,18 +2036,24 @@ __bulk_lookup_l(const struct rte_hash *h, const void 
**keys,
        /* Compare signatures and prefetch key slot of first hit */
        for (i = 0; i < num_keys; i++) {
   #if defined(__ARM_NEON)
-               compare_signatures_dense(&prim_hitmask[i], &sec_hitmask[i],
-                       primary_bkt[i], secondary_bkt[i],
+               uint16_t *hitmask = &hitmask_buffer[i];
+               compare_signatures_dense(hitmask,
+                       primary_bkt[i]->sig_current,
+                       secondary_bkt[i]->sig_current,
                        sig[i], h->sig_cmp_fn);
+               const unsigned int prim_hitmask = *(uint8_t *)(hitmask);
+               const unsigned int sec_hitmask = *((uint8_t *)(hitmask)+1);
   #else
-               compare_signatures_sparse(&prim_hitmask[i], &sec_hitmask[i],
+               compare_signatures_sparse(&prim_hitmask_buffer[i], 
&sec_hitmask_buffer[i],
                        primary_bkt[i], secondary_bkt[i],
                        sig[i], h->sig_cmp_fn);
+               const unsigned int prim_hitmask = prim_hitmask_buffer[i];
+               const unsigned int sec_hitmask = sec_hitmask_buffer[i];
   #endif

-               if (prim_hitmask[i]) {
+               if (prim_hitmask) {
                        uint32_t first_hit =
-                                       rte_ctz32(prim_hitmask[i])
+                                       rte_ctz32(prim_hitmask)
                                        >> hitmask_padding;
                        uint32_t key_idx =
                                primary_bkt[i]->key_idx[first_hit];
@@ -1986,9 +2065,9 @@ __bulk_lookup_l(const struct rte_hash *h, const void 
**keys,
                        continue;
                }

-               if (sec_hitmask[i]) {
+               if (sec_hitmask) {
                        uint32_t first_hit =
-                                       rte_ctz32(sec_hitmask[i])
+                                       rte_ctz32(sec_hitmask)
                                        >> hitmask_padding;
                        uint32_t key_idx =
                                secondary_bkt[i]->key_idx[first_hit];
@@ -2003,9 +2082,17 @@ __bulk_lookup_l(const struct rte_hash *h, const void 
**keys,
        /* Compare keys, first hits in primary first */
        for (i = 0; i < num_keys; i++) {
                positions[i] = -ENOENT;
-               while (prim_hitmask[i]) {
+#if defined(__ARM_NEON)
+               uint16_t *hitmask = &hitmask_buffer[i];
+               unsigned int prim_hitmask = *(uint8_t *)(hitmask);
+               unsigned int sec_hitmask = *((uint8_t *)(hitmask)+1);
+#else
+               unsigned int prim_hitmask = prim_hitmask_buffer[i];
+               unsigned int sec_hitmask = sec_hitmask_buffer[i];
+#endif
+               while (prim_hitmask) {
                        uint32_t hit_index =
-                                       rte_ctz32(prim_hitmask[i])
+                                       rte_ctz32(prim_hitmask)
                                        >> hitmask_padding;
                        uint32_t key_idx =
                                primary_bkt[i]->key_idx[hit_index];
@@ -2028,12 +2115,12 @@ __bulk_lookup_l(const struct rte_hash *h, const void 
**keys,
                                positions[i] = key_idx - 1;
                                goto next_key;
                        }
-                       prim_hitmask[i] &= ~(1 << (hit_index << 
hitmask_padding));
+                       prim_hitmask &= ~(1 << (hit_index << hitmask_padding));
                }

-               while (sec_hitmask[i]) {
+               while (sec_hitmask) {
                        uint32_t hit_index =
-                                       rte_ctz32(sec_hitmask[i])
+                                       rte_ctz32(sec_hitmask)
                                        >> hitmask_padding;
                        uint32_t key_idx =
                                secondary_bkt[i]->key_idx[hit_index];
@@ -2057,7 +2144,7 @@ __bulk_lookup_l(const struct rte_hash *h, const void 
**keys,
                                positions[i] = key_idx - 1;
                                goto next_key;
                        }
-                       sec_hitmask[i] &= ~(1 << (hit_index << 
hitmask_padding));
+                       sec_hitmask &= ~(1 << (hit_index << hitmask_padding));
                }
   next_key:
                continue;
@@ -2107,15 +2194,18 @@ __bulk_lookup_lf(const struct rte_hash *h, const void 
**keys,
        uint64_t hits = 0;
        int32_t i;
        int32_t ret;
-       uint32_t prim_hitmask[RTE_HASH_LOOKUP_BULK_MAX] = {0};
-       uint32_t sec_hitmask[RTE_HASH_LOOKUP_BULK_MAX] = {0};
        struct rte_hash_bucket *cur_bkt, *next_bkt;
        uint32_t cnt_b, cnt_a;

   #if defined(__ARM_NEON)
        const int hitmask_padding = 0;
+       uint16_t hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
+       static_assert(sizeof(*hitmask_buffer)*8/2 == RTE_HASH_BUCKET_ENTRIES,
+       "The hitmask must be exactly wide enough to accept the whole hitmask when it 
is dense");
   #else
        const int hitmask_padding = 1;
+       uint32_t prim_hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
+       uint32_t sec_hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
   #endif

        for (i = 0; i < num_keys; i++)
@@ -2132,18 +2222,24 @@ __bulk_lookup_lf(const struct rte_hash *h, const void 
**keys,
                /* Compare signatures and prefetch key slot of first hit */
                for (i = 0; i < num_keys; i++) {
   #if defined(__ARM_NEON)
-                       compare_signatures_dense(&prim_hitmask[i], 
&sec_hitmask[i],
-                               primary_bkt[i], secondary_bkt[i],
+                       uint16_t *hitmask = &hitmask_buffer[i];
+                       compare_signatures_dense(hitmask,
+                               primary_bkt[i]->sig_current,
+                               secondary_bkt[i]->sig_current,
                                sig[i], h->sig_cmp_fn);
+                       const unsigned int prim_hitmask = *(uint8_t *)(hitmask);
+                       const unsigned int sec_hitmask = *((uint8_t 
*)(hitmask)+1);
   #else
-                       compare_signatures_sparse(&prim_hitmask[i], 
&sec_hitmask[i],
+                       compare_signatures_sparse(&prim_hitmask_buffer[i], 
&sec_hitmask_buffer[i],
                                primary_bkt[i], secondary_bkt[i],
                                sig[i], h->sig_cmp_fn);
+                       const unsigned int prim_hitmask = 
prim_hitmask_buffer[i];
+                       const unsigned int sec_hitmask = sec_hitmask_buffer[i];
   #endif

-                       if (prim_hitmask[i]) {
+                       if (prim_hitmask) {
                                uint32_t first_hit =
-                                               rte_ctz32(prim_hitmask[i])
+                                               rte_ctz32(prim_hitmask)
                                                >> hitmask_padding;
                                uint32_t key_idx =
                                        primary_bkt[i]->key_idx[first_hit];
@@ -2155,9 +2251,9 @@ __bulk_lookup_lf(const struct rte_hash *h, const void 
**keys,
                                continue;
                        }

-                       if (sec_hitmask[i]) {
+                       if (sec_hitmask) {
                                uint32_t first_hit =
-                                               rte_ctz32(sec_hitmask[i])
+                                               rte_ctz32(sec_hitmask)
                                                >> hitmask_padding;
                                uint32_t key_idx =
                                        secondary_bkt[i]->key_idx[first_hit];
@@ -2171,9 +2267,17 @@ __bulk_lookup_lf(const struct rte_hash *h, const void 
**keys,

                /* Compare keys, first hits in primary first */
                for (i = 0; i < num_keys; i++) {
-                       while (prim_hitmask[i]) {
+#if defined(__ARM_NEON)
+                       uint16_t *hitmask = &hitmask_buffer[i];
+                       unsigned int prim_hitmask = *(uint8_t *)(hitmask);
+                       unsigned int sec_hitmask = *((uint8_t *)(hitmask)+1);
+#else
+                       unsigned int prim_hitmask = prim_hitmask_buffer[i];
+                       unsigned int sec_hitmask = sec_hitmask_buffer[i];
+#endif
+                       while (prim_hitmask) {
                                uint32_t hit_index =
-                                               rte_ctz32(prim_hitmask[i])
+                                               rte_ctz32(prim_hitmask)
                                                >> hitmask_padding;
                                uint32_t key_idx =
                                rte_atomic_load_explicit(
@@ -2200,12 +2304,12 @@ __bulk_lookup_lf(const struct rte_hash *h, const void 
**keys,
                                        positions[i] = key_idx - 1;
                                        goto next_key;
                                }
-                               prim_hitmask[i] &= ~(1 << (hit_index << 
hitmask_padding));
+                               prim_hitmask &= ~(1 << (hit_index << 
hitmask_padding));
                        }

-                       while (sec_hitmask[i]) {
+                       while (sec_hitmask) {
                                uint32_t hit_index =
-                                               rte_ctz32(sec_hitmask[i])
+                                               rte_ctz32(sec_hitmask)
                                                >> hitmask_padding;
                                uint32_t key_idx =
                                rte_atomic_load_explicit(
@@ -2233,7 +2337,7 @@ __bulk_lookup_lf(const struct rte_hash *h, const void 
**keys,
                                        positions[i] = key_idx - 1;
                                        goto next_key;
                                }
-                               sec_hitmask[i] &= ~(1 << (hit_index << 
hitmask_padding));
+                               sec_hitmask &= ~(1 << (hit_index << 
hitmask_padding));
                        }
   next_key:
                        continue;
diff --git a/lib/hash/rte_cuckoo_hash.h b/lib/hash/rte_cuckoo_hash.h
index 8ea793c66e..ed18e1f41e 100644
--- a/lib/hash/rte_cuckoo_hash.h
+++ b/lib/hash/rte_cuckoo_hash.h
@@ -137,6 +137,7 @@ enum rte_hash_sig_compare_function {
        RTE_HASH_COMPARE_SCALAR = 0,
        RTE_HASH_COMPARE_SSE,
        RTE_HASH_COMPARE_NEON,
+       RTE_HASH_COMPARE_SVE,
        RTE_HASH_COMPARE_NUM
   };

--
2.34.1



Reply via email to