Sketching algorithm provide high-fidelity approximate measurements and appears as a promising alternative to traditional approaches such as packet sampling.
NitroSketch [1] is a software sketching framework that optimizes performance, provides accuracy guarantees, and supports a variety of sketches. This commit adds a new data structure called sketch into membership library. This new data structure is an efficient way to profile the traffic for heavy hitters. Also use min-heap structure to maintain the top-k flow keys. [1] Zaoxing Liu, Ran Ben-Basat, Gil Einziger, Yaron Kassner, Vladimir Braverman, Roy Friedman, Vyas Sekar, "NitroSketch: Robust and General Sketch-based Monitoring in Software Switches", in ACM SIGCOMM 2019. https://dl.acm.org/doi/pdf/10.1145/3341302.3342076 Signed-off-by: Alan Liu <zaoxing...@gmail.com> Signed-off-by: Yipeng Wang <yipeng1.w...@intel.com> Signed-off-by: Leyi Rong <leyi.r...@intel.com> --- lib/member/meson.build | 49 ++- lib/member/rte_member.c | 75 ++++ lib/member/rte_member.h | 154 ++++++- lib/member/rte_member_heap.h | 424 ++++++++++++++++++ lib/member/rte_member_sketch.c | 594 ++++++++++++++++++++++++++ lib/member/rte_member_sketch.h | 97 +++++ lib/member/rte_member_sketch_avx512.c | 70 +++ lib/member/rte_member_sketch_avx512.h | 35 ++ lib/member/rte_xxh64_avx512.h | 117 +++++ lib/member/version.map | 9 + 10 files changed, 1619 insertions(+), 5 deletions(-) create mode 100644 lib/member/rte_member_heap.h create mode 100644 lib/member/rte_member_sketch.c create mode 100644 lib/member/rte_member_sketch.h create mode 100644 lib/member/rte_member_sketch_avx512.c create mode 100644 lib/member/rte_member_sketch_avx512.h create mode 100644 lib/member/rte_xxh64_avx512.h diff --git a/lib/member/meson.build b/lib/member/meson.build index e06fddc240..8bd0af492c 100644 --- a/lib/member/meson.build +++ b/lib/member/meson.build @@ -7,6 +7,51 @@ if is_windows subdir_done() endif -sources = files('rte_member.c', 'rte_member_ht.c', 'rte_member_vbf.c') headers = files('rte_member.h') -deps += ['hash'] + +sources = files( + 'rte_member.c', + 'rte_member_ht.c', + 'rte_member_vbf.c', + 'rte_member_sketch.c' +) + +deps += ['hash', 'ring'] + +# compile AVX512 version if: +if dpdk_conf.has('RTE_ARCH_X86_64') and binutils_ok + # compile AVX512 version if either: + # a. we have AVX512 supported in minimum instruction set + # baseline + # b. it's not minimum instruction set, but supported by + # compiler + # + # in former case, just add avx512 C file to files list + # in latter case, compile c file to static lib, using correct + # compiler flags, and then have the .o file from static lib + # linked into main lib. + + #check if all required flags already enabled + sketch_avx512_flags = ['__AVX512F__', '__AVX512DQ__', '__AVX512IFMA__'] + + sketch_avx512_on = true + foreach f:sketch_avx512_flags + if cc.get_define(f, args: machine_args) == '' + sketch_avx512_on = false + endif + endforeach + + if sketch_avx512_on == true + cflags += ['-DCC_AVX512_SUPPORT'] + sources += files('rte_member_sketch_avx512.c') + elif cc.has_multi_arguments('-mavx512f', '-mavx512dq', '-mavx512ifma') + sketch_avx512_tmp = static_library('sketch_avx512_tmp', + 'rte_member_sketch_avx512.c', + include_directories: includes, + dependencies: [static_rte_eal, static_rte_hash], + c_args: cflags + + ['-mavx512f', '-mavx512dq', '-mavx512ifma']) + objs += sketch_avx512_tmp.extract_objects('rte_member_sketch_avx512.c') + cflags += ['-DCC_AVX512_SUPPORT'] + endif +endif diff --git a/lib/member/rte_member.c b/lib/member/rte_member.c index 7e1632e6b5..8f859f7fbd 100644 --- a/lib/member/rte_member.c +++ b/lib/member/rte_member.c @@ -9,10 +9,12 @@ #include <rte_malloc.h> #include <rte_errno.h> #include <rte_tailq.h> +#include <rte_ring_elem.h> #include "rte_member.h" #include "rte_member_ht.h" #include "rte_member_vbf.h" +#include "rte_member_sketch.h" TAILQ_HEAD(rte_member_list, rte_tailq_entry); static struct rte_tailq_elem rte_member_tailq = { @@ -72,6 +74,9 @@ rte_member_free(struct rte_member_setsum *setsum) case RTE_MEMBER_TYPE_VBF: rte_member_free_vbf(setsum); break; + case RTE_MEMBER_TYPE_SKETCH: + rte_member_free_sketch(setsum); + break; default: break; } @@ -86,6 +91,8 @@ rte_member_create(const struct rte_member_parameters *params) struct rte_member_list *member_list; struct rte_member_setsum *setsum; int ret; + char ring_name[RTE_RING_NAMESIZE]; + struct rte_ring *sketch_key_ring = NULL; if (params == NULL) { rte_errno = EINVAL; @@ -100,6 +107,16 @@ rte_member_create(const struct rte_member_parameters *params) return NULL; } + if (params->type == RTE_MEMBER_TYPE_SKETCH) { + snprintf(ring_name, sizeof(ring_name), "SK_%s", params->name); + sketch_key_ring = rte_ring_create_elem(ring_name, sizeof(uint32_t), + rte_align32pow2(params->top_k), params->socket_id, 0); + if (sketch_key_ring == NULL) { + RTE_MEMBER_LOG(ERR, "Sketch Ring Memory allocation failed\n"); + return NULL; + } + } + member_list = RTE_TAILQ_CAST(rte_member_tailq.head, rte_member_list); rte_mcfg_tailq_write_lock(); @@ -145,6 +162,9 @@ rte_member_create(const struct rte_member_parameters *params) case RTE_MEMBER_TYPE_VBF: ret = rte_member_create_vbf(setsum, params); break; + case RTE_MEMBER_TYPE_SKETCH: + ret = rte_member_create_sketch(setsum, params, sketch_key_ring); + break; default: goto error_unlock_exit; } @@ -162,6 +182,7 @@ rte_member_create(const struct rte_member_parameters *params) error_unlock_exit: rte_free(te); rte_free(setsum); + rte_ring_free(sketch_key_ring); rte_mcfg_tailq_write_unlock(); return NULL; } @@ -178,6 +199,23 @@ rte_member_add(const struct rte_member_setsum *setsum, const void *key, return rte_member_add_ht(setsum, key, set_id); case RTE_MEMBER_TYPE_VBF: return rte_member_add_vbf(setsum, key, set_id); + case RTE_MEMBER_TYPE_SKETCH: + return rte_member_add_sketch(setsum, key, set_id); + default: + return -EINVAL; + } +} + +int +rte_member_add_byte_count(const struct rte_member_setsum *setsum, + const void *key, uint32_t byte_count) +{ + if (setsum == NULL || key == NULL || byte_count == 0) + return -EINVAL; + + switch (setsum->type) { + case RTE_MEMBER_TYPE_SKETCH: + return rte_member_add_sketch_byte_count(setsum, key, byte_count); default: return -EINVAL; } @@ -195,6 +233,8 @@ rte_member_lookup(const struct rte_member_setsum *setsum, const void *key, return rte_member_lookup_ht(setsum, key, set_id); case RTE_MEMBER_TYPE_VBF: return rte_member_lookup_vbf(setsum, key, set_id); + case RTE_MEMBER_TYPE_SKETCH: + return rte_member_lookup_sketch(setsum, key, set_id); default: return -EINVAL; } @@ -261,6 +301,36 @@ rte_member_lookup_multi_bulk(const struct rte_member_setsum *setsum, } } +int +rte_member_query_count(const struct rte_member_setsum *setsum, + const void *key, uint64_t *output) +{ + if (setsum == NULL || key == NULL || output == NULL) + return -EINVAL; + + switch (setsum->type) { + case RTE_MEMBER_TYPE_SKETCH: + return rte_member_query_sketch(setsum, key, output); + default: + return -EINVAL; + } +} + +int +rte_member_report_heavyhitter(const struct rte_member_setsum *setsum, + void **key, uint64_t *count) +{ + if (setsum == NULL || key == NULL || count == NULL) + return -EINVAL; + + switch (setsum->type) { + case RTE_MEMBER_TYPE_SKETCH: + return rte_member_report_heavyhitter_sketch(setsum, key, count); + default: + return -EINVAL; + } +} + int rte_member_delete(const struct rte_member_setsum *setsum, const void *key, member_set_t set_id) @@ -272,6 +342,8 @@ rte_member_delete(const struct rte_member_setsum *setsum, const void *key, case RTE_MEMBER_TYPE_HT: return rte_member_delete_ht(setsum, key, set_id); /* current vBF implementation does not support delete function */ + case RTE_MEMBER_TYPE_SKETCH: + return rte_member_delete_sketch(setsum, key); case RTE_MEMBER_TYPE_VBF: default: return -EINVAL; @@ -290,6 +362,9 @@ rte_member_reset(const struct rte_member_setsum *setsum) case RTE_MEMBER_TYPE_VBF: rte_member_reset_vbf(setsum); return; + case RTE_MEMBER_TYPE_SKETCH: + rte_member_reset_sketch(setsum); + return; default: return; } diff --git a/lib/member/rte_member.h b/lib/member/rte_member.h index 2611015771..178efeed33 100644 --- a/lib/member/rte_member.h +++ b/lib/member/rte_member.h @@ -39,6 +39,18 @@ * | | | not overwrite | | * | | | existing key. | | * +----------+---------------------+----------------+-------------------------+ + * +==========+=============================+ + * | type | sketch | + * +==========+=============================+ + * |structure | counting bloom filter array | + * +----------+-----------------------------+ + * |set id | 1: heavy set, 0: light set | + * | | | + * +----------+-----------------------------+ + * |usages & | count size of a flow, | + * |properties| used for heavy hitter | + * | | detection. | + * +----------+-----------------------------+ * --> */ @@ -50,6 +62,8 @@ extern "C" { #endif #include <stdint.h> +#include <stdbool.h> +#include <inttypes.h> #include <rte_common.h> @@ -65,6 +79,20 @@ typedef uint16_t member_set_t; #define RTE_MEMBER_BUCKET_ENTRIES 16 /** Maximum number of characters in setsum name. */ #define RTE_MEMBER_NAMESIZE 32 +/** Max value of the random number */ +#define RTE_RAND_MAX ~0LLU +/** + * As packets skipped in the sampling-based algorithm, the accounting + * results accuracy is not guaranteed in the start stage. There should + * be a "convergence time" to achieve the accuracy after receiving enough + * packets. + * For sketch, use the flag if prefer always bounded mode, which only + * starts sampling after receiving enough packets to keep the results + * accuracy always bounded. + */ +#define RTE_MEMBER_SKETCH_ALWAYS_BOUNDED 0x01 +/** For sketch, use the flag if to count packet size instead of packet count */ +#define RTE_MEMBER_SKETCH_COUNT_BYTE 0x02 /** @internal Hash function used by membership library. */ #if defined(RTE_ARCH_X86) || defined(__ARM_FEATURE_CRC32) @@ -104,6 +132,7 @@ struct rte_member_parameters; enum rte_member_setsum_type { RTE_MEMBER_TYPE_HT = 0, /**< Hash table based set summary. */ RTE_MEMBER_TYPE_VBF, /**< Vector of bloom filters. */ + RTE_MEMBER_TYPE_SKETCH, RTE_MEMBER_NUM_TYPE }; @@ -114,6 +143,19 @@ enum rte_member_sig_compare_function { RTE_MEMBER_COMPARE_NUM }; +/* sketch update function with different implementations. */ +typedef void (*sketch_update_fn_t)(const struct rte_member_setsum *ss, + const void *key, + uint32_t count); + +/* sketch lookup function with different implementations. */ +typedef uint64_t (*sketch_lookup_fn_t)(const struct rte_member_setsum *ss, + const void *key); + +/* sketch delete function with different implementations. */ +typedef void (*sketch_delete_fn_t)(const struct rte_member_setsum *ss, + const void *key); + /** @internal setsummary structure. */ struct rte_member_setsum { enum rte_member_setsum_type type; /* Type of the set summary. */ @@ -134,6 +176,21 @@ struct rte_member_setsum { uint32_t bit_mask; /* Bit mask to get bit location in bf. */ uint32_t num_hashes; /* Number of hash values to index bf. */ + /* Parameters for sketch */ + float error_rate; + float sample_rate; + uint32_t num_col; + uint32_t num_row; + int always_bounded; + double converge_thresh; + uint32_t topk; + uint32_t count_byte; + uint64_t *hash_seeds; + sketch_update_fn_t sketch_update; /* Pointer to the sketch update function */ + sketch_lookup_fn_t sketch_lookup; /* Pointer to the sketch lookup function */ + sketch_delete_fn_t sketch_delete; /* Pointer to the sketch delete function */ + + void *runtime_var; uint32_t mul_shift; /* vbf internal variable used during bit test. */ uint32_t div_shift; /* vbf internal variable used during bit test. */ @@ -143,6 +200,9 @@ struct rte_member_setsum { /* Second cache line should start here. */ uint32_t socket_id; /* NUMA Socket ID for memory. */ char name[RTE_MEMBER_NAMESIZE]; /* Name of this set summary. */ +#ifdef RTE_ARCH_X86 + bool use_avx512; +#endif } __rte_cache_aligned; /** @@ -261,8 +321,33 @@ struct rte_member_parameters { */ uint32_t sec_hash_seed; + /** + * For count(min) sketch data structure, error rate defines the accuracy + * required by the user. Higher accuracy leads to more memory usage, but + * the flow size is estimated more accurately. + */ + float error_rate; + + /** + * Sampling rate means the internal sample rate of the rows of the count + * min sketches. Lower sampling rate can reduce CPU overhead, but the + * data structure will require more time to converge statistically. + */ + float sample_rate; + + /** + * How many top heavy hitter to be reported. The library will internally + * keep the keys of heavy hitters for final report. + */ + uint32_t top_k; + + /** + * Extra flags that may passed in by user + */ + uint32_t extra_flag; + int socket_id; /**< NUMA Socket ID for memory. */ -}; +} __rte_cache_aligned; /** * @warning @@ -418,7 +503,7 @@ rte_member_lookup_multi_bulk(const struct rte_member_setsum *setsum, * RTE_MEMBER_NO_MATCH by default is set as 0. * For HT mode, the set_id has range as [1, 0x7FFF], MSB is reserved. * For vBF mode the set id is limited by the num_set parameter when create - * the set-summary. + * the set-summary. For sketch mode, this id is ignored. * @return * HT (cache mode) and vBF should never fail unless the set_id is not in the * valid range. In such case -EINVAL is returned. @@ -429,12 +514,75 @@ rte_member_lookup_multi_bulk(const struct rte_member_setsum *setsum, * Return 0 for HT (cache mode) if the add does not cause * eviction, return 1 otherwise. Return 0 for non-cache mode if success, * -ENOSPC for full, and 1 if cuckoo eviction happens. - * Always returns 0 for vBF mode. + * Always returns 0 for vBF mode and sketch. */ int rte_member_add(const struct rte_member_setsum *setsum, const void *key, member_set_t set_id); +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * Add the packet byte size into the sketch. + * + * @param setsum + * Pointer of a set-summary. + * @param key + * Pointer of the key to be added. + * @param byte_count + * Add the byte count of the packet into the sketch. + * @return + * Return -EINVAL for invalid parameters, otherwise return 0. + */ +__rte_experimental +int +rte_member_add_byte_count(const struct rte_member_setsum *setsum, + const void *key, uint32_t byte_count); + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * Query packet count for a certain flow-key. + * + * @param setsum + * Pointer of a set-summary. + * @param key + * Pointer of the key to be added. + * @param count + * The output packet count or byte count. + * @return + * Return -EINVAL for invalid parameters. + */ +__rte_experimental +int +rte_member_query_count(const struct rte_member_setsum *setsum, + const void *key, uint64_t *count); + + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * Report heavyhitter flow-keys into set-summary (SS). + * + * @param setsum + * Pointer of a set-summary. + * @param keys + * Pointer of the output top-k key array. + * @param counts + * Pointer of the output packet count or byte count array of the top-k keys. + * @return + * Return -EINVAL for invalid parameters. Return a positive integer indicate + * how many heavy hitters are reported. + */ +__rte_experimental +int +rte_member_report_heavyhitter(const struct rte_member_setsum *setsum, + void **keys, uint64_t *counts); + + /** * @warning * @b EXPERIMENTAL: this API may change without prior notice diff --git a/lib/member/rte_member_heap.h b/lib/member/rte_member_heap.h new file mode 100644 index 0000000000..3ced34160a --- /dev/null +++ b/lib/member/rte_member_heap.h @@ -0,0 +1,424 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2020 Intel Corporation + * Copyright(c) 2020, Alan Liu <zaoxing...@gmail.com> + */ + +#ifndef _RTE_MEMBER_HEAP_H_ +#define _RTE_MEMBER_HEAP_H_ + +#include <rte_ring_elem.h> +#include "rte_member.h" + +#define LCHILD(x) (2 * x + 1) +#define RCHILD(x) (2 * x + 2) +#define PARENT(x) ((x - 1) / 2) + +#define HASH_BKT_SIZE 16 +#define HASH_HP_MULTI 4 +#define HASH_RESIZE_MULTI 2 + +struct hash_bkt { + uint16_t sig[HASH_BKT_SIZE]; + uint16_t idx[HASH_BKT_SIZE]; +}; + +struct hash { + uint16_t bkt_cnt; + uint16_t num_item; + uint32_t seed; + struct hash_bkt buckets[0]; +}; + +struct node { + void *key; + uint64_t count; +}; + +struct minheap { + uint32_t key_len; + uint32_t size; + uint32_t socket; + struct hash *hashtable; + struct node *elem; +}; + +static int +hash_table_insert(const void *key, int value, int key_len, struct hash *table) +{ + uint32_t hash = MEMBER_HASH_FUNC(key, key_len, table->seed); + uint16_t idx = hash % table->bkt_cnt; + uint16_t sig = hash >> 16; + int i; + + for (i = 0; i < HASH_BKT_SIZE; i++) { + if (table->buckets[idx].idx[i] == 0) { + table->buckets[idx].idx[i] = value; + table->buckets[idx].sig[i] = sig; + table->num_item++; + return 0; + } + } + + return -ENOMEM; +} + +static int +hash_table_update(const void *key, int old_value, int value, int key_len, struct hash *table) +{ + uint32_t hash = MEMBER_HASH_FUNC(key, key_len, table->seed); + uint16_t idx = hash % table->bkt_cnt; + uint16_t sig = hash >> 16; + int i; + + for (i = 0; i < HASH_BKT_SIZE; i++) { + if (table->buckets[idx].sig[i] == sig && table->buckets[idx].idx[i] == old_value) { + table->buckets[idx].idx[i] = value; + return 0; + } + } + + return -1; +} + +static int +hash_table_del(const void *key, uint16_t value, int key_len, struct hash *table) +{ + uint32_t hash = MEMBER_HASH_FUNC(key, key_len, table->seed); + uint16_t idx = hash % table->bkt_cnt; + uint16_t sig = hash >> 16; + int i; + + for (i = 0; i < HASH_BKT_SIZE; i++) { + if (table->buckets[idx].sig[i] == sig && table->buckets[idx].idx[i] == value) { + table->buckets[idx].idx[i] = 0; + table->num_item--; + return 0; + } + } + + return -1; +} + +static int +hash_table_lookup(const void *key, int key_len, struct minheap *hp) +{ + struct hash *table = hp->hashtable; + uint32_t hash = MEMBER_HASH_FUNC(key, key_len, table->seed); + uint16_t idx = hash % table->bkt_cnt; + uint16_t sig = hash >> 16; + int i; + + for (i = 0; i < HASH_BKT_SIZE; i++) { + if (table->buckets[idx].sig[i] == sig && table->buckets[idx].idx[i] != 0) { + uint32_t hp_idx = table->buckets[idx].idx[i] - 1; + + if (memcmp(hp->elem[hp_idx].key, key, hp->key_len) == 0) + return hp_idx; + } + } + + return -ENOENT; /* key doesn't exist */ +} + +static int +resize_hash_table(struct minheap *hp) +{ + uint32_t i; + uint32_t new_bkt_cnt; + + while (1) { + new_bkt_cnt = hp->hashtable->bkt_cnt * HASH_RESIZE_MULTI; + + RTE_MEMBER_LOG(ERR, "Sketch Minheap HT load factor is [%f]\n", + hp->hashtable->num_item / ((float)hp->hashtable->bkt_cnt * HASH_BKT_SIZE)); + RTE_MEMBER_LOG(ERR, "Sketch Minheap HT resize happen!\n"); + rte_free(hp->hashtable); + hp->hashtable = rte_zmalloc_socket(NULL, sizeof(struct hash) + + new_bkt_cnt * sizeof(struct hash_bkt), + RTE_CACHE_LINE_SIZE, hp->socket); + + if (hp->hashtable == NULL) { + RTE_MEMBER_LOG(ERR, "Sketch Minheap HT allocation failed\n"); + return -ENOMEM; + } + + hp->hashtable->bkt_cnt = new_bkt_cnt; + + for (i = 0; i < hp->size; ++i) { + if (hash_table_insert(hp->elem[i].key, + i + 1, hp->key_len, hp->hashtable) < 0) { + RTE_MEMBER_LOG(ERR, + "Sketch Minheap HT resize insert fail!\n"); + break; + } + } + if (i == hp->size) + break; + } + + return 0; +} + +/* find the item in the given minheap */ +static int +rte_member_minheap_find(struct minheap *hp, const void *key) +{ + int idx = hash_table_lookup(key, hp->key_len, hp); + return idx; +} + +static int +rte_member_minheap_init(struct minheap *heap, int size, + uint32_t socket, uint32_t seed) +{ + heap->elem = rte_zmalloc_socket(NULL, sizeof(struct node) * size, + RTE_CACHE_LINE_SIZE, socket); + if (heap->elem == NULL) { + RTE_MEMBER_LOG(ERR, "Sketch Minheap elem allocation failed\n"); + return -ENOMEM; + } + + uint32_t hash_bkt_cnt = rte_align32pow2(size * HASH_HP_MULTI) / HASH_BKT_SIZE; + + if (hash_bkt_cnt == 0) + hash_bkt_cnt = 1; + + heap->hashtable = rte_zmalloc_socket(NULL, sizeof(struct hash) + + hash_bkt_cnt * sizeof(struct hash_bkt), + RTE_CACHE_LINE_SIZE, socket); + + if (heap->hashtable == NULL) { + RTE_MEMBER_LOG(ERR, "Sketch Minheap HT allocation failed\n"); + rte_free(heap->elem); + return -ENOMEM; + } + + heap->hashtable->seed = seed; + heap->hashtable->bkt_cnt = hash_bkt_cnt; + heap->socket = socket; + + return 0; +} + +/* swap the minheap nodes */ +static __rte_always_inline void +rte_member_heap_swap(struct node *n1, struct node *n2) +{ + struct node temp = *n1; + *n1 = *n2; + *n2 = temp; +} + +/* heapify function */ +static void +rte_member_heapify(struct minheap *hp, uint32_t idx, bool update_hash) +{ + uint32_t smallest; + + if (LCHILD(idx) < hp->size && + hp->elem[LCHILD(idx)].count < hp->elem[idx].count) + smallest = LCHILD(idx); + else + smallest = idx; + + if (RCHILD(idx) < hp->size && + hp->elem[RCHILD(idx)].count < hp->elem[smallest].count) + smallest = RCHILD(idx); + + if (smallest != idx) { + rte_member_heap_swap(&(hp->elem[idx]), &(hp->elem[smallest])); + + if (update_hash) { + if (hash_table_update(hp->elem[smallest].key, idx + 1, smallest + 1, + hp->key_len, hp->hashtable) < 0) { + RTE_MEMBER_LOG(ERR, "Minheap Hash Table update failed\n"); + return; + } + + if (hash_table_update(hp->elem[idx].key, smallest + 1, idx + 1, + hp->key_len, hp->hashtable) < 0) { + RTE_MEMBER_LOG(ERR, "Minheap Hash Table update failed\n"); + return; + } + } + rte_member_heapify(hp, smallest, update_hash); + } +} + +/* insert a node into the minheap */ +static int +rte_member_minheap_insert_node(struct minheap *hp, const void *key, + int counter, void *key_slot, + struct rte_ring *free_key_slot) +{ + struct node nd; + uint32_t slot_id; + + if (rte_ring_sc_dequeue_elem(free_key_slot, &slot_id, sizeof(uint32_t)) != 0) { + RTE_MEMBER_LOG(ERR, "Minheap get empty keyslot failed\n"); + return -1; + } + + nd.count = counter; + nd.key = RTE_PTR_ADD(key_slot, slot_id * hp->key_len); + + memcpy(nd.key, key, hp->key_len); + + uint32_t i = (hp->size)++; + + while (i && nd.count < hp->elem[PARENT(i)].count) { + hp->elem[i] = hp->elem[PARENT(i)]; + if (hash_table_update(hp->elem[i].key, PARENT(i) + 1, i + 1, + hp->key_len, hp->hashtable) < 0) { + RTE_MEMBER_LOG(ERR, "Minheap Hash Table update failed\n"); + return -1; + } + i = PARENT(i); + } + hp->elem[i] = nd; + + if (hash_table_insert(key, i + 1, hp->key_len, hp->hashtable) < 0) { + if (resize_hash_table(hp) < 0) { + RTE_MEMBER_LOG(ERR, "Minheap Hash Table resize failed\n"); + return -1; + } + } + + return 0; +} + +/* delete a key from the minheap */ +static int +rte_member_minheap_delete_node(struct minheap *hp, const void *key, + void *key_slot, struct rte_ring *free_key_slot) +{ + int idx = rte_member_minheap_find(hp, key); + uint32_t offset = RTE_PTR_DIFF(hp->elem[idx].key, key_slot) / hp->key_len; + + if (hash_table_del(key, idx + 1, hp->key_len, hp->hashtable) < 0) { + RTE_MEMBER_LOG(ERR, "Minheap Hash Table delete failed\n"); + return -1; + } + + rte_ring_sp_enqueue_elem(free_key_slot, &offset, sizeof(uint32_t)); + + if (idx == (int)(hp->size - 1)) { + hp->size--; + return 0; + } + + hp->elem[idx] = hp->elem[hp->size - 1]; + + if (hash_table_update(hp->elem[idx].key, hp->size, idx + 1, + hp->key_len, hp->hashtable) < 0) { + RTE_MEMBER_LOG(ERR, "Minheap Hash Table update failed\n"); + return -1; + } + hp->size--; + rte_member_heapify(hp, idx, true); + + return 0; +} + +/* replace a min node with a new key. */ +static int +rte_member_minheap_replace_node(struct minheap *hp, + const void *new_key, + int new_counter) +{ + struct node nd; + void *recycle_key = NULL; + + recycle_key = hp->elem[0].key; + + if (hash_table_del(recycle_key, 1, hp->key_len, hp->hashtable) < 0) { + RTE_MEMBER_LOG(ERR, "Minheap Hash Table delete failed\n"); + return -1; + } + + hp->elem[0] = hp->elem[hp->size - 1]; + + if (hash_table_update(hp->elem[0].key, hp->size, 1, + hp->key_len, hp->hashtable) < 0) { + RTE_MEMBER_LOG(ERR, "Minheap Hash Table update failed\n"); + return -1; + } + hp->size--; + + rte_member_heapify(hp, 0, true); + + nd.count = new_counter; + nd.key = recycle_key; + + memcpy(nd.key, new_key, hp->key_len); + + uint32_t i = (hp->size)++; + + while (i && nd.count < hp->elem[PARENT(i)].count) { + hp->elem[i] = hp->elem[PARENT(i)]; + if (hash_table_update(hp->elem[i].key, PARENT(i) + 1, i + 1, + hp->key_len, hp->hashtable) < 0) { + RTE_MEMBER_LOG(ERR, "Minheap Hash Table update failed\n"); + return -1; + } + i = PARENT(i); + } + + hp->elem[i] = nd; + + if (hash_table_insert(new_key, i + 1, hp->key_len, hp->hashtable) < 0) { + RTE_MEMBER_LOG(ERR, "Minheap Hash Table replace insert failed\n"); + if (resize_hash_table(hp) < 0) { + RTE_MEMBER_LOG(ERR, "Minheap Hash Table replace resize failed\n"); + return -1; + } + } + + return 0; +} + +/* sort the heap into a descending array */ +static void +rte_member_heapsort(struct minheap *hp, struct node *result_array) +{ + struct minheap new_hp; + + /* build a new heap for using the given array */ + new_hp.size = hp->size; + new_hp.key_len = hp->key_len; + new_hp.elem = result_array; + memcpy(result_array, hp->elem, hp->size * sizeof(struct node)); + + /* sort the new heap */ + while (new_hp.size > 1) { + rte_member_heap_swap(&(new_hp.elem[0]), &(new_hp.elem[new_hp.size - 1])); + new_hp.size--; + rte_member_heapify(&new_hp, 0, false); + } +} + +static void +rte_member_minheap_free(struct minheap *hp) +{ + if (hp == NULL) + return; + + rte_free(hp->elem); + rte_free(hp->hashtable); +} + +static void +rte_member_minheap_reset(struct minheap *hp) +{ + if (hp == NULL) + return; + + memset(hp->elem, 0, sizeof(struct node) * hp->size); + hp->size = 0; + + memset((char *)hp->hashtable + sizeof(struct hash), 0, + hp->hashtable->bkt_cnt * sizeof(struct hash_bkt)); + hp->hashtable->num_item = 0; +} + +#endif /* _RTE_MEMBER_HEAP_H_ */ diff --git a/lib/member/rte_member_sketch.c b/lib/member/rte_member_sketch.c new file mode 100644 index 0000000000..524ba77620 --- /dev/null +++ b/lib/member/rte_member_sketch.c @@ -0,0 +1,594 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2020 Intel Corporation + * Copyright(c) 2020, Alan Liu <zaoxing...@gmail.com> + */ + +#include <math.h> +#include <string.h> + +#include <rte_malloc.h> +#include <rte_memory.h> +#include <rte_errno.h> +#include <rte_log.h> +#include <rte_random.h> +#include <rte_prefetch.h> +#include <rte_ring_elem.h> + +#include "rte_member.h" +#include "rte_member_sketch.h" +#include "rte_member_heap.h" + +#ifdef CC_AVX512_SUPPORT +#include "rte_member_sketch_avx512.h" +#endif /* CC_AVX512_SUPPORT */ + +struct sketch_runtime { + uint64_t pkt_cnt; + uint32_t until_next; + int converged; + struct minheap heap; + struct node *report_array; + void *key_slots; + struct rte_ring *free_key_slots; +} __rte_cache_aligned; + +/* + * Geometric sampling to calculate how many packets needs to be + * skipped until next update. This method can mitigate the CPU + * overheads compared with coin-toss sampling. + */ +static uint32_t +draw_geometric(const struct rte_member_setsum *ss) +{ + double rand = 1; + + if (ss->sample_rate == 1) + return 1; + + while (rand == 1 || rand == 0) + rand = (double) rte_rand() / (double)(RTE_RAND_MAX); + + return (uint32_t)ceil(log(1 - rand) / log(1 - ss->sample_rate)); +} + +static void +isort(uint64_t *array, int n) +{ + int i; + + for (i = 1; i < n; i++) { + uint64_t t = array[i]; + int j; + + for (j = i - 1; j >= 0; j--) { + if (t < array[j]) + array[j + 1] = array[j]; + else + break; + } + array[j + 1] = t; + } +} + +static __rte_always_inline void +swap(uint64_t *a, uint64_t *b) +{ + uint64_t tmp = *a; + *a = *b; + *b = tmp; +} + +static uint64_t +medianof5(uint64_t a, uint64_t b, uint64_t c, uint64_t d, uint64_t e) +{ + if (a > b) + swap(&a, &b); + if (c > d) + swap(&c, &d); + if (a > c) { + if (d > e) + swap(&c, &e); + else { + swap(&c, &d); + swap(&d, &e); + } + } else { + if (b > e) + swap(&a, &e); + else { + swap(&a, &b); + swap(&b, &e); + } + } + + if (a > c) + return a > d ? d : a; + else + return b > c ? c : b; +} + +int +rte_member_create_sketch(struct rte_member_setsum *ss, + const struct rte_member_parameters *params, + struct rte_ring *ring) +{ + struct sketch_runtime *runtime; + uint32_t num_col; + uint32_t i; + + if (params->sample_rate == 0 || params->sample_rate > 1) { + rte_errno = EINVAL; + RTE_MEMBER_LOG(ERR, + "Membership Sketch created with invalid parameters\n"); + return -EINVAL; + } + + if (params->extra_flag & RTE_MEMBER_SKETCH_COUNT_BYTE) + ss->count_byte = 1; + +#ifdef RTE_ARCH_X86 + if (ss->count_byte == 1 && + rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_512 && + rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) == 1 && + rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512IFMA) == 1) { +#ifdef CC_AVX512_SUPPORT + ss->use_avx512 = true; +#else + ss->use_avx512 = false; +#endif + } + + if (ss->use_avx512 == true) { +#ifdef CC_AVX512_SUPPORT + ss->num_row = NUM_ROW_VEC; + RTE_MEMBER_LOG(NOTICE, + "Membership Sketch AVX512 update/lookup/delete ops is selected\n"); + ss->sketch_update = sketch_update_avx512; + ss->sketch_lookup = sketch_lookup_avx512; + ss->sketch_delete = sketch_delete_avx512; +#endif + } else +#endif + { + ss->num_row = NUM_ROW_SCALAR; + RTE_MEMBER_LOG(NOTICE, + "Membership Sketch SCALAR update/lookup/delete ops is selected\n"); + ss->sketch_update = sketch_update_scalar; + ss->sketch_lookup = sketch_lookup_scalar; + ss->sketch_delete = sketch_delete_scalar; + } + + ss->socket_id = params->socket_id; + + if (ss->count_byte == 0) + num_col = 4.0 / params->error_rate / params->sample_rate; +#ifdef RTE_ARCH_X86 + else if (ss->use_avx512 == true) + num_col = rte_align32pow2(4.0 / params->error_rate); +#endif + else + num_col = 4.0 / params->error_rate; + + ss->table = rte_zmalloc_socket(NULL, + sizeof(uint64_t) * num_col * ss->num_row, + RTE_CACHE_LINE_SIZE, ss->socket_id); + if (ss->table == NULL) { + RTE_MEMBER_LOG(ERR, "Sketch Table memory allocation failed\n"); + return -ENOMEM; + } + + ss->hash_seeds = rte_zmalloc_socket(NULL, sizeof(uint64_t) * ss->num_row, + RTE_CACHE_LINE_SIZE, ss->socket_id); + if (ss->hash_seeds == NULL) { + RTE_MEMBER_LOG(ERR, "Sketch Hashseeds memory allocation failed\n"); + return -ENOMEM; + } + + ss->runtime_var = rte_zmalloc_socket(NULL, sizeof(struct sketch_runtime), + RTE_CACHE_LINE_SIZE, ss->socket_id); + if (ss->runtime_var == NULL) { + RTE_MEMBER_LOG(ERR, "Sketch Runtime memory allocation failed\n"); + rte_free(ss); + return -ENOMEM; + } + runtime = ss->runtime_var; + + ss->num_col = num_col; + ss->sample_rate = params->sample_rate; + ss->prim_hash_seed = params->prim_hash_seed; + ss->sec_hash_seed = params->sec_hash_seed; + ss->error_rate = params->error_rate; + ss->topk = params->top_k; + ss->key_len = params->key_len; + runtime->heap.key_len = ss->key_len; + + runtime->key_slots = rte_zmalloc_socket(NULL, ss->key_len * ss->topk, + RTE_CACHE_LINE_SIZE, ss->socket_id); + if (runtime->key_slots == NULL) { + RTE_MEMBER_LOG(ERR, "Sketch Key Slots allocation failed\n"); + goto error; + } + + runtime->free_key_slots = ring; + for (i = 0; i < ss->topk; i++) + rte_ring_sp_enqueue_elem(runtime->free_key_slots, + &i, sizeof(uint32_t)); + + if (rte_member_minheap_init(&(runtime->heap), params->top_k, + ss->socket_id, params->prim_hash_seed) < 0) { + RTE_MEMBER_LOG(ERR, "Sketch Minheap allocation failed\n"); + goto error_runtime; + } + + runtime->report_array = rte_zmalloc_socket(NULL, sizeof(struct node) * ss->topk, + RTE_CACHE_LINE_SIZE, ss->socket_id); + if (runtime->report_array == NULL) { + RTE_MEMBER_LOG(ERR, "Sketch Runtime Report Array allocation failed\n"); + goto error_runtime; + } + + rte_srand(ss->prim_hash_seed); + for (i = 0; i < ss->num_row; i++) + ss->hash_seeds[i] = rte_rand(); + + if (params->extra_flag & RTE_MEMBER_SKETCH_ALWAYS_BOUNDED) + ss->always_bounded = 1; + + if (ss->always_bounded) { + double delta = 1.0 / (pow(2, ss->num_row)); + + ss->converge_thresh = 10 * pow(ss->error_rate, -2.0) * sqrt(log(1 / delta)); + } + + RTE_MEMBER_LOG(DEBUG, "Sketch created, " + "the total memory required is %u Bytes\n", ss->num_col * ss->num_row * 8); + + return 0; + +error_runtime: + rte_member_minheap_free(&runtime->heap); + rte_ring_free(runtime->free_key_slots); + rte_free(runtime->key_slots); +error: + rte_free(runtime); + rte_free(ss); + + return -ENOMEM; +} + +uint64_t +sketch_lookup_scalar(const struct rte_member_setsum *ss, const void *key) +{ + uint64_t *count_array = ss->table; + uint32_t col[ss->num_row]; + uint64_t count_row[ss->num_row]; + uint32_t cur_row; + uint64_t count; + + for (cur_row = 0; cur_row < ss->num_row; cur_row++) { + col[cur_row] = MEMBER_HASH_FUNC(key, ss->key_len, + ss->hash_seeds[cur_row]) % ss->num_col; + + rte_prefetch0(&count_array[cur_row * ss->num_col + col[cur_row]]); + } + + /* if sample rate is 1, it is a regular count-min, we report the min */ + if (ss->sample_rate == 1 || ss->count_byte == 1) + return count_min(ss, col); + + memset(count_row, 0, sizeof(uint64_t) * ss->num_row); + + /* otherwise we report the median number */ + for (cur_row = 0; cur_row < ss->num_row; cur_row++) + count_row[cur_row] = count_array[cur_row * ss->num_col + col[cur_row]]; + + if (ss->num_row == 5) + return medianof5(count_row[0], count_row[1], + count_row[2], count_row[3], count_row[4]); + + isort(count_row, ss->num_row); + + if (ss->num_row % 2 == 0) { + count = (count_row[ss->num_row / 2] + count_row[ss->num_row / 2 - 1]) / 2; + return count; + } + /* ss->num_row % 2 != 0 */ + count = count_row[ss->num_row / 2]; + + return count; +} + +void +sketch_delete_scalar(const struct rte_member_setsum *ss, const void *key) +{ + uint32_t col[ss->num_row]; + uint64_t *count_array = ss->table; + uint32_t cur_row; + + for (cur_row = 0; cur_row < ss->num_row; cur_row++) { + col[cur_row] = MEMBER_HASH_FUNC(key, ss->key_len, + ss->hash_seeds[cur_row]) % ss->num_col; + + /* set corresponding counter to 0 */ + count_array[cur_row * ss->num_col + col[cur_row]] = 0; + } +} + +int +rte_member_query_sketch(const struct rte_member_setsum *ss, + const void *key, + uint64_t *output) +{ + uint64_t count = ss->sketch_lookup(ss, key); + *output = count; + + return 0; +} + +void +rte_member_update_heap(const struct rte_member_setsum *ss) +{ + uint32_t i; + struct sketch_runtime *runtime_var = ss->runtime_var; + + for (i = 0; i < runtime_var->heap.size; i++) { + uint64_t count = ss->sketch_lookup(ss, runtime_var->heap.elem[i].key); + + runtime_var->heap.elem[i].count = count; + } +} + +int +rte_member_report_heavyhitter_sketch(const struct rte_member_setsum *setsum, + void **key, + uint64_t *count) +{ + uint32_t i; + struct sketch_runtime *runtime_var = setsum->runtime_var; + + rte_member_update_heap(setsum); + rte_member_heapsort(&(runtime_var->heap), runtime_var->report_array); + + for (i = 0; i < runtime_var->heap.size; i++) { + key[i] = runtime_var->report_array[i].key; + count[i] = runtime_var->report_array[i].count; + } + + return runtime_var->heap.size; +} + +int +rte_member_lookup_sketch(const struct rte_member_setsum *ss, + const void *key, member_set_t *set_id) +{ + uint64_t count = ss->sketch_lookup(ss, key); + struct sketch_runtime *runtime_var = ss->runtime_var; + + if (runtime_var->heap.size > 0 && count >= runtime_var->heap.elem[0].count) + *set_id = 1; + else + *set_id = 0; + + if (count == 0) + return 0; + else + return 1; +} + +static void +should_converge(const struct rte_member_setsum *ss) +{ + struct sketch_runtime *runtime_var = ss->runtime_var; + + /* For count min sketch - L1 norm */ + if (runtime_var->pkt_cnt > ss->converge_thresh) { + runtime_var->converged = 1; + RTE_MEMBER_LOG(DEBUG, "Sketch converged, begin sampling " + "from key count %"PRIu64"\n", + runtime_var->pkt_cnt); + } +} + +static void +sketch_update_row(const struct rte_member_setsum *ss, const void *key, + uint32_t count, uint32_t cur_row) +{ + uint64_t *count_array = ss->table; + uint32_t col = MEMBER_HASH_FUNC(key, ss->key_len, + ss->hash_seeds[cur_row]) % ss->num_col; + + /* sketch counter update */ + count_array[cur_row * ss->num_col + col] += + ceil(count / (ss->sample_rate)); +} + +void +sketch_update_scalar(const struct rte_member_setsum *ss, + const void *key, + uint32_t count) +{ + uint64_t *count_array = ss->table; + uint32_t col; + uint32_t cur_row; + + for (cur_row = 0; cur_row < ss->num_row; cur_row++) { + col = MEMBER_HASH_FUNC(key, ss->key_len, + ss->hash_seeds[cur_row]) % ss->num_col; + count_array[cur_row * ss->num_col + col] += count; + } +} + +static void +heap_update(const struct rte_member_setsum *ss, const void *key) +{ + struct sketch_runtime *runtime_var = ss->runtime_var; + uint64_t key_cnt = 0; + int found; + + /* We also update the heap for this key */ + key_cnt = ss->sketch_lookup(ss, key); + if (key_cnt > runtime_var->heap.elem[0].count) { + found = rte_member_minheap_find(&runtime_var->heap, key); + /* the key is found in the top-k heap */ + if (found >= 0) { + if (runtime_var->heap.elem[found].count < key_cnt) + rte_member_heapify(&runtime_var->heap, found, true); + + runtime_var->heap.elem[found].count = key_cnt; + } else if (runtime_var->heap.size < ss->topk) { + rte_member_minheap_insert_node(&runtime_var->heap, key, + key_cnt, runtime_var->key_slots, runtime_var->free_key_slots); + } else { + rte_member_minheap_replace_node(&runtime_var->heap, key, key_cnt); + } + } else if (runtime_var->heap.size < ss->topk) { + found = rte_member_minheap_find(&runtime_var->heap, key); + if (found >= 0) { + if (runtime_var->heap.elem[found].count < key_cnt) + rte_member_heapify(&runtime_var->heap, found, true); + + runtime_var->heap.elem[found].count = key_cnt; + } else + rte_member_minheap_insert_node(&runtime_var->heap, key, + key_cnt, runtime_var->key_slots, runtime_var->free_key_slots); + } +} + +/* + * Add a single packet into the sketch. + * Sketch value is meatured by packet numbers in this mode. + */ +int +rte_member_add_sketch(const struct rte_member_setsum *ss, + const void *key, + __rte_unused member_set_t set_id) +{ + uint32_t cur_row; + struct sketch_runtime *runtime_var = ss->runtime_var; + uint32_t *until_next = &(runtime_var->until_next); + + /* + * If sketch is measured by byte count, + * the rte_member_add_sketch_byte_count routine should be used. + */ + if (ss->count_byte == 1) { + RTE_MEMBER_LOG(ERR, "Sketch is Byte Mode, " + "should use rte_member_add_byte_count()!\n"); + return -EINVAL; + } + + if (ss->sample_rate == 1) { + ss->sketch_update(ss, key, 1); + heap_update(ss, key); + return 0; + } + + /* convergence stage if it's needed */ + if (ss->always_bounded && !runtime_var->converged) { + ss->sketch_update(ss, key, 1); + + if (!((++runtime_var->pkt_cnt) & (INTERVAL - 1))) + should_converge(ss); + + heap_update(ss, key); + return 0; + } + + /* should we skip this packet */ + if (*until_next >= ss->num_row) { + *until_next -= ss->num_row; + return 0; + } + cur_row = *until_next; + do { + sketch_update_row(ss, key, 1, cur_row); + *until_next = draw_geometric(ss); + if (cur_row + *until_next >= ss->num_row) + break; + cur_row += *until_next; + } while (1); + + *until_next -= (ss->num_row - cur_row); + + heap_update(ss, key); + + return 0; +} + +/* + * Add the byte count of the packet into the sketch. + * Sketch value is meatured by byte count numbers in this mode. + */ +int +rte_member_add_sketch_byte_count(const struct rte_member_setsum *ss, + const void *key, + uint32_t byte_count) +{ + struct sketch_runtime *runtime_var = ss->runtime_var; + uint32_t *until_next = &(runtime_var->until_next); + + /* should not call this API if not in count byte mode */ + if (ss->count_byte == 0) { + RTE_MEMBER_LOG(ERR, "Sketch is Pkt Mode, " + "should use rte_member_add()!\n"); + return -EINVAL; + } + + /* there's specific optimization for the sketch update */ + ss->sketch_update(ss, key, byte_count); + + if (*until_next != 0) { + *until_next = *until_next - 1; + return 0; + } + + *until_next = draw_geometric(ss) - 1; + + heap_update(ss, key); + + return 0; +} + +int +rte_member_delete_sketch(const struct rte_member_setsum *ss, + const void *key) +{ + struct sketch_runtime *runtime_var = ss->runtime_var; + int found; + + found = rte_member_minheap_find(&runtime_var->heap, key); + if (found < 0) + return -1; + + ss->sketch_delete(ss, key); + + return rte_member_minheap_delete_node + (&runtime_var->heap, key, runtime_var->key_slots, runtime_var->free_key_slots); +} + +void +rte_member_free_sketch(struct rte_member_setsum *ss) +{ + struct sketch_runtime *runtime_var = ss->runtime_var; + + rte_free(ss->table); + rte_member_minheap_free(&runtime_var->heap); + rte_free(runtime_var->key_slots); + rte_ring_free(runtime_var->free_key_slots); + rte_free(runtime_var); +} + +void +rte_member_reset_sketch(const struct rte_member_setsum *ss) +{ + struct sketch_runtime *runtime_var = ss->runtime_var; + uint64_t *sketch = ss->table; + uint32_t i; + + memset(sketch, 0, sizeof(uint64_t) * ss->num_col * ss->num_row); + rte_member_minheap_reset(&runtime_var->heap); + rte_ring_reset(runtime_var->free_key_slots); + + for (i = 0; i < ss->topk; i++) + rte_ring_sp_enqueue_elem(runtime_var->free_key_slots, &i, sizeof(uint32_t)); +} diff --git a/lib/member/rte_member_sketch.h b/lib/member/rte_member_sketch.h new file mode 100644 index 0000000000..219323008b --- /dev/null +++ b/lib/member/rte_member_sketch.h @@ -0,0 +1,97 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2017 Intel Corporation + */ + +#ifndef _RTE_MEMBER_SKETCH_H_ +#define _RTE_MEMBER_SKETCH_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include <rte_vect.h> +#include <rte_ring_elem.h> + +#define NUM_ROW_SCALAR 5 +#define INTERVAL (1 << 15) + +#if !RTE_IS_POWER_OF_2(INTERVAL) +#error sketch INTERVAL macro must be a power of 2 +#endif + +int +rte_member_create_sketch(struct rte_member_setsum *ss, + const struct rte_member_parameters *params, + struct rte_ring *r); + +int +rte_member_lookup_sketch(const struct rte_member_setsum *setsum, + const void *key, member_set_t *set_id); + +int +rte_member_add_sketch(const struct rte_member_setsum *setsum, + const void *key, + member_set_t set_id); + +int +rte_member_add_sketch_byte_count(const struct rte_member_setsum *ss, + const void *key, uint32_t byte_count); + +void +sketch_update_scalar(const struct rte_member_setsum *ss, + const void *key, + uint32_t count); + +uint64_t +sketch_lookup_scalar(const struct rte_member_setsum *ss, + const void *key); + +void +sketch_delete_scalar(const struct rte_member_setsum *ss, + const void *key); + +int +rte_member_delete_sketch(const struct rte_member_setsum *setsum, + const void *key); + +int +rte_member_query_sketch(const struct rte_member_setsum *setsum, + const void *key, uint64_t *output); + +void +rte_member_free_sketch(struct rte_member_setsum *ss); + +void +rte_member_reset_sketch(const struct rte_member_setsum *setsum); + +int +rte_member_report_heavyhitter_sketch(const struct rte_member_setsum *setsum, + void **key, uint64_t *count); + +void +rte_member_update_heap(const struct rte_member_setsum *ss); + +static __rte_always_inline uint64_t +count_min(const struct rte_member_setsum *ss, const uint32_t *hash_results) +{ + uint64_t *count_array = ss->table; + uint64_t count; + uint32_t cur_row; + uint64_t min = UINT64_MAX; + + for (cur_row = 0; cur_row < ss->num_row; cur_row++) { + uint64_t cnt = count_array[cur_row * ss->num_col + hash_results[cur_row]]; + + if (cnt < min) + min = cnt; + } + count = min; + + return count; +} + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_MEMBER_SKETCH_H_ */ diff --git a/lib/member/rte_member_sketch_avx512.c b/lib/member/rte_member_sketch_avx512.c new file mode 100644 index 0000000000..288e37a446 --- /dev/null +++ b/lib/member/rte_member_sketch_avx512.c @@ -0,0 +1,70 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2020 Intel Corporation + */ + +#include "rte_xxh64_avx512.h" +#include "rte_member_sketch_avx512.h" + +__rte_always_inline void +sketch_update_avx512(const struct rte_member_setsum *ss, + const void *key, + uint32_t count) +{ + uint64_t *count_array = ss->table; + uint32_t num_col = ss->num_col; + uint32_t key_len = ss->key_len; + __m256i v_row_base; + __m256i v_hash_result; + __m512i current_sketch; + __m512i updated_sketch; + __m512i v_count; + + const __m256i v_idx = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); + const __m256i v_col = _mm256_set1_epi32(num_col); + + /* compute the hash result parallelly */ + v_hash_result = rte_xxh64_sketch_avx512 + (key, key_len, *(__m512i *)ss->hash_seeds, num_col); + v_row_base = _mm256_mullo_epi32(v_idx, v_col); + v_hash_result = _mm256_add_epi32(v_row_base, v_hash_result); + + current_sketch = + _mm512_i32gather_epi64(v_hash_result, count_array, 8); + v_count = _mm512_set1_epi64(count); + updated_sketch = _mm512_add_epi64(current_sketch, v_count); + _mm512_i32scatter_epi64 + ((void *)count_array, v_hash_result, updated_sketch, 8); +} + +uint64_t +sketch_lookup_avx512(const struct rte_member_setsum *ss, const void *key) +{ + uint32_t col[ss->num_row]; + + /* currently only for sketch byte count mode */ + __m256i v_hash_result = rte_xxh64_sketch_avx512 + (key, ss->key_len, *(__m512i *)ss->hash_seeds, ss->num_col); + _mm256_storeu_si256((__m256i *)col, v_hash_result); + + return count_min(ss, col); +} + +void +sketch_delete_avx512(const struct rte_member_setsum *ss, const void *key) +{ + uint32_t col[ss->num_row]; + uint64_t *count_array = ss->table; + uint64_t min = UINT64_MAX; + uint32_t cur_row; + + __m256i v_hash_result = rte_xxh64_sketch_avx512 + (key, ss->key_len, *(__m512i *)ss->hash_seeds, + RTE_ALIGN_FLOOR(ss->num_col, 32)); + _mm256_storeu_si256((__m256i *)col, v_hash_result); + + min = count_min(ss, col); + + /* subtract the min value from all the counters */ + for (cur_row = 0; cur_row < ss->num_row; cur_row++) + count_array[cur_row * ss->num_col + col[cur_row]] -= min; +} diff --git a/lib/member/rte_member_sketch_avx512.h b/lib/member/rte_member_sketch_avx512.h new file mode 100644 index 0000000000..68105cd895 --- /dev/null +++ b/lib/member/rte_member_sketch_avx512.h @@ -0,0 +1,35 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2020 Intel Corporation + */ + +#ifndef _RTE_MEMBER_SKETCH_AVX512_H_ +#define _RTE_MEMBER_SKETCH_AVX512_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include <rte_vect.h> +#include "rte_member.h" +#include "rte_member_sketch.h" + +#define NUM_ROW_VEC 8 + +void +sketch_update_avx512(const struct rte_member_setsum *ss, + const void *key, + uint32_t count); + +uint64_t +sketch_lookup_avx512(const struct rte_member_setsum *ss, + const void *key); + +void +sketch_delete_avx512(const struct rte_member_setsum *ss, + const void *key); + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_MEMBER_SKETCH_AVX512_H_ */ diff --git a/lib/member/rte_xxh64_avx512.h b/lib/member/rte_xxh64_avx512.h new file mode 100644 index 0000000000..50ca1b52c7 --- /dev/null +++ b/lib/member/rte_xxh64_avx512.h @@ -0,0 +1,117 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2020 Intel Corporation + */ + +#ifndef _RTE_XXH64_AVX512_H_ +#define _RTE_XXH64_AVX512_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include <rte_common.h> +#include <immintrin.h> + +/* 0b1001111000110111011110011011000110000101111010111100101010000111 */ +static const uint64_t PRIME64_1 = 0x9E3779B185EBCA87ULL; +/* 0b1100001010110010101011100011110100100111110101001110101101001111 */ +static const uint64_t PRIME64_2 = 0xC2B2AE3D27D4EB4FULL; +/* 0b0001011001010110011001111011000110011110001101110111100111111001 */ +static const uint64_t PRIME64_3 = 0x165667B19E3779F9ULL; +/* 0b1000010111101011110010100111011111000010101100101010111001100011 */ +static const uint64_t PRIME64_4 = 0x85EBCA77C2B2AE63ULL; +/* 0b0010011111010100111010110010111100010110010101100110011111000101 */ +static const uint64_t PRIME64_5 = 0x27D4EB2F165667C5ULL; + +static __rte_always_inline __m512i +xxh64_round_avx512(__m512i hash, __m512i input) +{ + hash = _mm512_madd52lo_epu64(hash, + input, + _mm512_set1_epi64(PRIME64_2)); + + hash = _mm512_rol_epi64(hash, 31); + + return hash; +} + +static __rte_always_inline __m512i +xxh64_fmix_avx512(__m512i hash) +{ + hash = _mm512_xor_si512(hash, _mm512_srli_epi64(hash, 33)); + + return hash; +} + +static __rte_always_inline __m256i +rte_xxh64_sketch_avx512(const void *key, uint32_t key_len, + __m512i v_seed, uint32_t modulo) +{ + __m512i v_prime64_5, v_hash; + size_t remaining = key_len; + size_t offset = 0; + __m512i input; + + v_prime64_5 = _mm512_set1_epi64(PRIME64_5); + v_hash = _mm512_add_epi64 + (_mm512_add_epi64(v_seed, v_prime64_5), + _mm512_set1_epi64(key_len)); + + while (remaining >= 8) { + input = _mm512_set1_epi64(*(uint64_t *)RTE_PTR_ADD(key, offset)); + v_hash = _mm512_xor_epi64(v_hash, + xxh64_round_avx512(_mm512_setzero_si512(), input)); + v_hash = _mm512_madd52lo_epu64(_mm512_set1_epi64(PRIME64_4), + v_hash, + _mm512_set1_epi64(PRIME64_1)); + + remaining -= 8; + offset += 8; + } + + if (remaining >= 4) { + input = _mm512_set1_epi64 + (*(uint32_t *)RTE_PTR_ADD(key, offset)); + v_hash = _mm512_xor_epi64(v_hash, + _mm512_mullo_epi64(input, + _mm512_set1_epi64(PRIME64_1))); + v_hash = _mm512_madd52lo_epu64 + (_mm512_set1_epi64(PRIME64_3), + _mm512_rol_epi64(v_hash, 23), + _mm512_set1_epi64(PRIME64_2)); + + offset += 4; + remaining -= 4; + } + + while (remaining != 0) { + input = _mm512_set1_epi64 + (*(uint8_t *)RTE_PTR_ADD(key, offset)); + v_hash = _mm512_xor_epi64(v_hash, + _mm512_mullo_epi64(input, + _mm512_set1_epi64(PRIME64_5))); + v_hash = _mm512_mullo_epi64 + (_mm512_rol_epi64(v_hash, 11), + _mm512_set1_epi64(PRIME64_1)); + offset++; + remaining--; + } + + v_hash = xxh64_fmix_avx512(v_hash); + + /* + * theoritically, such modular operations can be replaced by + * _mm512_rem_epi64(), but seems it depends on the compiler's + * implementation. so here is the limitation that the modulo + * value should be power of 2. + */ + __m512i v_hash_remainder = _mm512_set1_epi64((modulo - 1)); + + return _mm512_cvtepi64_epi32(_mm512_and_si512(v_hash, v_hash_remainder)); +} + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_XXH64_AVX512_H_ */ diff --git a/lib/member/version.map b/lib/member/version.map index 19469c6aba..35199270ff 100644 --- a/lib/member/version.map +++ b/lib/member/version.map @@ -14,3 +14,12 @@ DPDK_23 { local: *; }; + +EXPERIMENTAL { + global: + + # added in 22.11 + rte_member_add_byte_count; + rte_member_query_count; + rte_member_report_heavyhitter; +}; -- 2.25.1