From: Pavan Nikhilesh <pbhagavat...@marvell.com>

Improve reassembly lookup performance by using NEON intrinsics for
key validation.
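The NEON path probes both candidate buckets four entries at a time:
a single vld1q_u64() of &key.id_key_len loads an entry's <id, key_len>
word together with its start timestamp, and one pass over the loaded
vectors yields the key-match candidates, the empty slots
(key_len == 0) and the stale entries (start + max_cycles < tms).
Each 32x4 comparison result is compressed into a 64-bit scalar mask
with the vshrn narrowing trick, roughly as in the sketch below
(illustrative helper, not taken verbatim from the diff):

    #include <arm_neon.h>
    #include <stdint.h>

    /* Keep the top 16 bits of each 32-bit lane: lane i of an
     * all-ones comparison result lands at bits [16*i, 16*i + 15],
     * so testing the 0x8000 bit of each 16-bit group and doing
     * __builtin_ctzll(mask) >> 4 recovers the matching lane index.
     */
    static inline uint64_t
    cmp_to_mask(uint32x4_t cmp)
    {
        return vget_lane_u64(
            vreinterpret_u64_u16(vshrn_n_u32(cmp, 16)), 0);
    }

Entries the vector loop cannot pair up are handled by a scalar tail,
and rte_ip_frag_table_create() selects the lookup function at runtime
from NEON availability and rte_vect_get_max_simd_bitwidth().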
Signed-off-by: Pavan Nikhilesh <pbhagavat...@marvell.com>
---
 lib/ip_frag/ip_frag_internal.c   | 224 +++++++++++++++++++++++++------
 lib/ip_frag/ip_reassembly.h      |   6 +
 lib/ip_frag/rte_ip_frag_common.c |  10 ++
 3 files changed, 196 insertions(+), 44 deletions(-)

diff --git a/lib/ip_frag/ip_frag_internal.c b/lib/ip_frag/ip_frag_internal.c
index 7cbef647df..de78a0ed8f 100644
--- a/lib/ip_frag/ip_frag_internal.c
+++ b/lib/ip_frag/ip_frag_internal.c
@@ -4,8 +4,9 @@
 
 #include <stddef.h>
 
-#include <rte_jhash.h>
 #include <rte_hash_crc.h>
+#include <rte_jhash.h>
+#include <rte_vect.h>
 
 #include "ip_frag_common.h"
 
@@ -280,10 +281,166 @@ ip_frag_find(struct rte_ip_frag_tbl *tbl, struct rte_ip_frag_death_row *dr,
 	return pkt;
 }
 
-struct ip_frag_pkt *
-ip_frag_lookup(struct rte_ip_frag_tbl *tbl,
-	const struct ip_frag_key *key, uint64_t tms,
-	struct ip_frag_pkt **free, struct ip_frag_pkt **stale)
+static inline void
+ip_frag_dbg(struct rte_ip_frag_tbl *tbl, struct ip_frag_pkt *p,
+	    uint32_t list_idx, uint32_t list_cnt)
+{
+	RTE_SET_USED(tbl);
+	RTE_SET_USED(list_idx);
+	RTE_SET_USED(list_cnt);
+	if (p->key.key_len == IPV4_KEYLEN)
+		IP_FRAG_LOG(DEBUG,
+			    "%s:%d:\n"
+			    "tbl: %p, max_entries: %u, use_entries: %u\n"
+			    "ipv4_frag_pkt line0: %p, index: %u from %u\n"
+			    "key: <%" PRIx64 ", %#x>, start: %" PRIu64 "\n",
+			    __func__, __LINE__, tbl, tbl->max_entries,
+			    tbl->use_entries, p, list_idx, list_cnt,
+			    p->key.src_dst[0], p->key.id, p->start);
+	else
+		IP_FRAG_LOG(DEBUG,
+			    "%s:%d:\n"
+			    "tbl: %p, max_entries: %u, use_entries: %u\n"
+			    "ipv6_frag_pkt line0: %p, index: %u from %u\n"
+			    "key: <" IPv6_KEY_BYTES_FMT
+			    ", %#x>, start: %" PRIu64 "\n",
+			    __func__, __LINE__, tbl, tbl->max_entries,
+			    tbl->use_entries, p, list_idx, list_cnt,
+			    IPv6_KEY_BYTES(p->key.src_dst), p->key.id,
+			    p->start);
+}
+
+#if defined(RTE_ARCH_ARM64)
+static inline struct ip_frag_pkt *
+ip_frag_lookup_neon(struct rte_ip_frag_tbl *tbl, const struct ip_frag_key *key, uint64_t tms,
+		    struct ip_frag_pkt **free, struct ip_frag_pkt **stale)
+{
+	struct ip_frag_pkt *empty, *old;
+	struct ip_frag_pkt *p1, *p2;
+	uint32_t assoc, sig1, sig2;
+	uint64_t max_cycles;
+
+	empty = NULL;
+	old = NULL;
+
+	max_cycles = tbl->max_cycles;
+	assoc = tbl->bucket_entries;
+
+	if (tbl->last != NULL && ip_frag_key_cmp(key, &tbl->last->key) == 0)
+		return tbl->last;
+
+	/* different hashing methods for IPv4 and IPv6 */
+	if (key->key_len == IPV4_KEYLEN)
+		ipv4_frag_hash(key, &sig1, &sig2);
+	else
+		ipv6_frag_hash(key, &sig1, &sig2);
+
+	p1 = IP_FRAG_TBL_POS(tbl, sig1);
+	p2 = IP_FRAG_TBL_POS(tbl, sig2);
+
+	uint64x2_t key0, key1, key2, key3;
+	uint64_t vmask, zmask, ts_mask;
+	uint64x2_t ts0, ts1;
+	uint32x4_t nz_key;
+	uint8_t idx;
+	/* Bucket entries are always power of 2. */
+	rte_prefetch0(&p1[0].key);
+	rte_prefetch0(&p1[1].key);
+	rte_prefetch0(&p2[0].key);
+	rte_prefetch0(&p2[1].key);
+
+	while (assoc > 1) {
+		if (assoc > 2) {
+			rte_prefetch0(&p1[2].key);
+			rte_prefetch0(&p1[3].key);
+			rte_prefetch0(&p2[2].key);
+			rte_prefetch0(&p2[3].key);
+		}
+		struct ip_frag_pkt *p[] = {&p1[0], &p2[0], &p1[1], &p2[1]};
+		key0 = vld1q_u64(&p[0]->key.id_key_len);
+		key1 = vld1q_u64(&p[1]->key.id_key_len);
+		key2 = vld1q_u64(&p[2]->key.id_key_len);
+		key3 = vld1q_u64(&p[3]->key.id_key_len);
+
+		nz_key = vsetq_lane_u32(vgetq_lane_u32(vreinterpretq_u32_u64(key0), 1), nz_key, 0);
+		nz_key = vsetq_lane_u32(vgetq_lane_u32(vreinterpretq_u32_u64(key1), 1), nz_key, 1);
+		nz_key = vsetq_lane_u32(vgetq_lane_u32(vreinterpretq_u32_u64(key2), 1), nz_key, 2);
+		nz_key = vsetq_lane_u32(vgetq_lane_u32(vreinterpretq_u32_u64(key3), 1), nz_key, 3);
+
+		nz_key = vceqzq_u32(nz_key);
+		zmask = vget_lane_u64(vreinterpret_u64_u16(vshrn_n_u32(nz_key, 16)), 0);
+		vmask = ~zmask;
+
+		vmask &= 0x8000800080008000;
+		for (; vmask > 0; vmask &= vmask - 1) {
+			idx = __builtin_ctzll(vmask) >> 4;
+			if (ip_frag_key_cmp(key, &p[idx]->key) == 0)
+				return p[idx];
+		}
+
+		vmask = ~zmask;
+		if (zmask && empty == NULL) {
+			zmask &= 0x8000800080008000;
+			idx = __builtin_ctzll(zmask) >> 4;
+			empty = p[idx];
+		}
+
+		if (vmask && old == NULL) {
+			const uint64x2_t max_cyc = vdupq_n_u64(max_cycles);
+			const uint64x2_t cur_cyc = vdupq_n_u64(tms);
+
+			ts0 = vsetq_lane_u64(vgetq_lane_u64(key0, 1), ts0, 0);
+			ts0 = vsetq_lane_u64(vgetq_lane_u64(key1, 1), ts0, 1);
+			ts1 = vsetq_lane_u64(vgetq_lane_u64(key2, 1), ts1, 0);
+			ts1 = vsetq_lane_u64(vgetq_lane_u64(key3, 1), ts1, 1);
+
+			ts0 = vcgtq_u64(cur_cyc, vaddq_u64(ts0, max_cyc));
+			ts1 = vcgtq_u64(cur_cyc, vaddq_u64(ts1, max_cyc));
+
+			ts_mask = vget_lane_u64(vreinterpret_u64_u16(vshrn_n_u32(
+						vuzp1q_u32(vreinterpretq_u32_u64(ts0),
+							   vreinterpretq_u32_u64(ts1)),
+						16)),
+					0);
+			vmask &= 0x8000800080008000;
+			ts_mask &= vmask;
+			if (ts_mask) {
+				idx = __builtin_ctzll(ts_mask) >> 4;
+				old = p[idx];
+			}
+		}
+		p1 += 2;
+		p2 += 2;
+		assoc -= 2;
+	}
+	while (assoc) {
+		if (ip_frag_key_cmp(key, &p1->key) == 0)
+			return p1;
+		else if (ip_frag_key_is_empty(&p1->key))
+			empty = (empty == NULL) ? p1 : empty;
+		else if (max_cycles + p1->start < tms)
+			old = (old == NULL) ? p1 : old;
+
+		if (ip_frag_key_cmp(key, &p2->key) == 0)
+			return p2;
+		else if (ip_frag_key_is_empty(&p2->key))
+			empty = (empty == NULL) ? p2 : empty;
+		else if (max_cycles + p2->start < tms)
+			old = (old == NULL) ? p2 : old;
+		p1++;
+		p2++;
+		assoc--;
+	}
+
+	*free = empty;
+	*stale = old;
+	return NULL;
+}
+#endif
+
+static struct ip_frag_pkt *
+ip_frag_lookup_scalar(struct rte_ip_frag_tbl *tbl, const struct ip_frag_key *key, uint64_t tms,
+		      struct ip_frag_pkt **free, struct ip_frag_pkt **stale)
 {
 	struct ip_frag_pkt *p1, *p2;
 	struct ip_frag_pkt *empty, *old;
@@ -309,25 +466,7 @@ ip_frag_lookup(struct rte_ip_frag_tbl *tbl,
 	p2 = IP_FRAG_TBL_POS(tbl, sig2);
 
 	for (i = 0; i != assoc; i++) {
-		if (p1->key.key_len == IPV4_KEYLEN)
-			IP_FRAG_LOG(DEBUG, "%s:%d:\n"
-				"tbl: %p, max_entries: %u, use_entries: %u\n"
-				"ipv4_frag_pkt line0: %p, index: %u from %u\n"
-				"key: <%" PRIx64 ", %#x>, start: %" PRIu64 "\n",
-				__func__, __LINE__,
-				tbl, tbl->max_entries, tbl->use_entries,
-				p1, i, assoc,
-				p1[i].key.src_dst[0], p1[i].key.id, p1[i].start);
-		else
-			IP_FRAG_LOG(DEBUG, "%s:%d:\n"
-				"tbl: %p, max_entries: %u, use_entries: %u\n"
-				"ipv6_frag_pkt line0: %p, index: %u from %u\n"
-				"key: <" IPv6_KEY_BYTES_FMT ", %#x>, start: %" PRIu64 "\n",
-				__func__, __LINE__,
-				tbl, tbl->max_entries, tbl->use_entries,
-				p1, i, assoc,
-				IPv6_KEY_BYTES(p1[i].key.src_dst), p1[i].key.id, p1[i].start);
-
+		ip_frag_dbg(tbl, &p1[i], i, assoc);
 		if (ip_frag_key_cmp(key, &p1[i].key) == 0)
 			return p1 + i;
 		else if (ip_frag_key_is_empty(&p1[i].key))
@@ -335,29 +474,11 @@ ip_frag_lookup(struct rte_ip_frag_tbl *tbl,
 		else if (max_cycles + p1[i].start < tms)
 			old = (old == NULL) ? (p1 + i) : old;
 
-		if (p2->key.key_len == IPV4_KEYLEN)
-			IP_FRAG_LOG(DEBUG, "%s:%d:\n"
-				"tbl: %p, max_entries: %u, use_entries: %u\n"
-				"ipv4_frag_pkt line1: %p, index: %u from %u\n"
-				"key: <%" PRIx64 ", %#x>, start: %" PRIu64 "\n",
-				__func__, __LINE__,
-				tbl, tbl->max_entries, tbl->use_entries,
-				p2, i, assoc,
-				p2[i].key.src_dst[0], p2[i].key.id, p2[i].start);
-		else
-			IP_FRAG_LOG(DEBUG, "%s:%d:\n"
-				"tbl: %p, max_entries: %u, use_entries: %u\n"
-				"ipv6_frag_pkt line1: %p, index: %u from %u\n"
-				"key: <" IPv6_KEY_BYTES_FMT ", %#x>, start: %" PRIu64 "\n",
-				__func__, __LINE__,
-				tbl, tbl->max_entries, tbl->use_entries,
-				p2, i, assoc,
-				IPv6_KEY_BYTES(p2[i].key.src_dst), p2[i].key.id, p2[i].start);
-
+		ip_frag_dbg(tbl, &p2[i], i, assoc);
 		if (ip_frag_key_cmp(key, &p2[i].key) == 0)
 			return p2 + i;
 		else if (ip_frag_key_is_empty(&p2[i].key))
-			empty = (empty == NULL) ?( p2 + i) : empty;
+			empty = (empty == NULL) ? (p2 + i) : empty;
 		else if (max_cycles + p2[i].start < tms)
 			old = (old == NULL) ? (p2 + i) : old;
 	}
@@ -366,3 +487,18 @@ ip_frag_lookup(struct rte_ip_frag_tbl *tbl,
 	*stale = old;
 	return NULL;
 }
+
+struct ip_frag_pkt *
+ip_frag_lookup(struct rte_ip_frag_tbl *tbl, const struct ip_frag_key *key, uint64_t tms,
+	       struct ip_frag_pkt **free, struct ip_frag_pkt **stale)
+{
+	switch (tbl->lookup_fn) {
+#if defined(RTE_ARCH_ARM64)
+	case REASSEMBLY_LOOKUP_NEON:
+		return ip_frag_lookup_neon(tbl, key, tms, free, stale);
+#endif
+	case REASSEMBLY_LOOKUP_SCALAR:
+	default:
+		return ip_frag_lookup_scalar(tbl, key, tms, free, stale);
+	}
+}
diff --git a/lib/ip_frag/ip_reassembly.h b/lib/ip_frag/ip_reassembly.h
index ef9d8c0d75..049437ae32 100644
--- a/lib/ip_frag/ip_reassembly.h
+++ b/lib/ip_frag/ip_reassembly.h
@@ -12,6 +12,11 @@
 
 #include <rte_ip_frag.h>
 
+enum ip_frag_lookup_func {
+	REASSEMBLY_LOOKUP_SCALAR = 0,
+	REASSEMBLY_LOOKUP_NEON,
+};
+
 enum {
 	IP_LAST_FRAG_IDX,    /* index of last fragment */
 	IP_FIRST_FRAG_IDX,   /* index of first fragment */
@@ -83,6 +88,7 @@ struct rte_ip_frag_tbl {
 	struct ip_frag_pkt *last;     /* last used entry. */
 	struct ip_pkt_list lru;       /* LRU list for table entries. */
 	struct ip_frag_tbl_stat stat; /* statistics counters. */
+	enum ip_frag_lookup_func lookup_fn; /* hash table lookup function. */
 	__extension__ struct ip_frag_pkt pkt[]; /* hash table. */
 };
 
diff --git a/lib/ip_frag/rte_ip_frag_common.c b/lib/ip_frag/rte_ip_frag_common.c
index c1de2e81b6..ef3c104e45 100644
--- a/lib/ip_frag/rte_ip_frag_common.c
+++ b/lib/ip_frag/rte_ip_frag_common.c
@@ -5,7 +5,9 @@
 #include <stddef.h>
 #include <stdio.h>
 
+#include <rte_cpuflags.h>
 #include <rte_log.h>
+#include <rte_vect.h>
 
 #include "ip_frag_common.h"
 
@@ -75,6 +77,14 @@ rte_ip_frag_table_create(uint32_t bucket_num, uint32_t bucket_entries,
 	tbl->bucket_entries = bucket_entries;
 	tbl->entry_mask = (tbl->nb_entries - 1) & ~(tbl->bucket_entries - 1);
 
+#if defined(RTE_ARCH_ARM64)
+	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_NEON) &&
+	    rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_128)
+		tbl->lookup_fn = REASSEMBLY_LOOKUP_NEON;
+	else
+#endif
+		tbl->lookup_fn = REASSEMBLY_LOOKUP_SCALAR;
+
 	TAILQ_INIT(&(tbl->lru));
 	return tbl;
 }
-- 
2.25.1