Sketching algorithm provide high-fidelity approximate measurements and
appears as a promising alternative to traditional approaches such as
packet sampling.

NitroSketch [1] is a software sketching framework that optimizes
performance, provides accuracy guarantees, and supports a variety of
sketches.

This commit adds a new data structure called sketch into
membership library. This new data structure is an efficient
way to profile the traffic for heavy hitters. Also use min-heap
structure to maintain the top-k flow keys.

[1] Zaoxing Liu, Ran Ben-Basat, Gil Einziger, Yaron Kassner, Vladimir
Braverman, Roy Friedman, Vyas Sekar, "NitroSketch: Robust and General
Sketch-based Monitoring in Software Switches", in ACM SIGCOMM 2019.
https://dl.acm.org/doi/pdf/10.1145/3341302.3342076

Signed-off-by: Alan Liu <zaoxing...@gmail.com>
Signed-off-by: Yipeng Wang <yipeng1.w...@intel.com>
Signed-off-by: Leyi Rong <leyi.r...@intel.com>
---
 lib/member/meson.build                |  49 ++-
 lib/member/rte_member.c               |  75 ++++
 lib/member/rte_member.h               | 154 ++++++-
 lib/member/rte_member_heap.h          | 424 ++++++++++++++++++
 lib/member/rte_member_sketch.c        | 594 ++++++++++++++++++++++++++
 lib/member/rte_member_sketch.h        |  97 +++++
 lib/member/rte_member_sketch_avx512.c |  70 +++
 lib/member/rte_member_sketch_avx512.h |  35 ++
 lib/member/rte_xxh64_avx512.h         | 117 +++++
 lib/member/version.map                |   9 +
 10 files changed, 1619 insertions(+), 5 deletions(-)
 create mode 100644 lib/member/rte_member_heap.h
 create mode 100644 lib/member/rte_member_sketch.c
 create mode 100644 lib/member/rte_member_sketch.h
 create mode 100644 lib/member/rte_member_sketch_avx512.c
 create mode 100644 lib/member/rte_member_sketch_avx512.h
 create mode 100644 lib/member/rte_xxh64_avx512.h

diff --git a/lib/member/meson.build b/lib/member/meson.build
index e06fddc240..8bd0af492c 100644
--- a/lib/member/meson.build
+++ b/lib/member/meson.build
@@ -7,6 +7,51 @@ if is_windows
     subdir_done()
 endif
 
-sources = files('rte_member.c', 'rte_member_ht.c', 'rte_member_vbf.c')
 headers = files('rte_member.h')
-deps += ['hash']
+
+sources = files(
+    'rte_member.c',
+    'rte_member_ht.c',
+    'rte_member_vbf.c',
+    'rte_member_sketch.c'
+)
+
+deps += ['hash', 'ring']
+
+# compile AVX512 version if:
+if dpdk_conf.has('RTE_ARCH_X86_64') and binutils_ok
+    # compile AVX512 version if either:
+    # a. we have AVX512 supported in minimum instruction set
+    #    baseline
+    # b. it's not minimum instruction set, but supported by
+    #    compiler
+    #
+    # in former case, just add avx512 C file to files list
+    # in latter case, compile c file to static lib, using correct
+    # compiler flags, and then have the .o file from static lib
+    # linked into main lib.
+
+    #check if all required flags already enabled
+    sketch_avx512_flags = ['__AVX512F__', '__AVX512DQ__', '__AVX512IFMA__']
+
+    sketch_avx512_on = true
+    foreach f:sketch_avx512_flags
+        if cc.get_define(f, args: machine_args) == ''
+            sketch_avx512_on = false
+        endif
+    endforeach
+
+    if sketch_avx512_on == true
+        cflags += ['-DCC_AVX512_SUPPORT']
+        sources += files('rte_member_sketch_avx512.c')
+    elif cc.has_multi_arguments('-mavx512f', '-mavx512dq', '-mavx512ifma')
+        sketch_avx512_tmp = static_library('sketch_avx512_tmp',
+            'rte_member_sketch_avx512.c',
+            include_directories: includes,
+            dependencies: [static_rte_eal, static_rte_hash],
+            c_args: cflags +
+                ['-mavx512f', '-mavx512dq', '-mavx512ifma'])
+        objs += sketch_avx512_tmp.extract_objects('rte_member_sketch_avx512.c')
+        cflags += ['-DCC_AVX512_SUPPORT']
+    endif
+endif
diff --git a/lib/member/rte_member.c b/lib/member/rte_member.c
index 7e1632e6b5..8f859f7fbd 100644
--- a/lib/member/rte_member.c
+++ b/lib/member/rte_member.c
@@ -9,10 +9,12 @@
 #include <rte_malloc.h>
 #include <rte_errno.h>
 #include <rte_tailq.h>
+#include <rte_ring_elem.h>
 
 #include "rte_member.h"
 #include "rte_member_ht.h"
 #include "rte_member_vbf.h"
+#include "rte_member_sketch.h"
 
 TAILQ_HEAD(rte_member_list, rte_tailq_entry);
 static struct rte_tailq_elem rte_member_tailq = {
@@ -72,6 +74,9 @@ rte_member_free(struct rte_member_setsum *setsum)
        case RTE_MEMBER_TYPE_VBF:
                rte_member_free_vbf(setsum);
                break;
+       case RTE_MEMBER_TYPE_SKETCH:
+               rte_member_free_sketch(setsum);
+               break;
        default:
                break;
        }
@@ -86,6 +91,8 @@ rte_member_create(const struct rte_member_parameters *params)
        struct rte_member_list *member_list;
        struct rte_member_setsum *setsum;
        int ret;
+       char ring_name[RTE_RING_NAMESIZE];
+       struct rte_ring *sketch_key_ring = NULL;
 
        if (params == NULL) {
                rte_errno = EINVAL;
@@ -100,6 +107,16 @@ rte_member_create(const struct rte_member_parameters 
*params)
                return NULL;
        }
 
+       if (params->type == RTE_MEMBER_TYPE_SKETCH) {
+               snprintf(ring_name, sizeof(ring_name), "SK_%s", params->name);
+               sketch_key_ring = rte_ring_create_elem(ring_name, 
sizeof(uint32_t),
+                               rte_align32pow2(params->top_k), 
params->socket_id, 0);
+               if (sketch_key_ring == NULL) {
+                       RTE_MEMBER_LOG(ERR, "Sketch Ring Memory allocation 
failed\n");
+                       return NULL;
+               }
+       }
+
        member_list = RTE_TAILQ_CAST(rte_member_tailq.head, rte_member_list);
 
        rte_mcfg_tailq_write_lock();
@@ -145,6 +162,9 @@ rte_member_create(const struct rte_member_parameters 
*params)
        case RTE_MEMBER_TYPE_VBF:
                ret = rte_member_create_vbf(setsum, params);
                break;
+       case RTE_MEMBER_TYPE_SKETCH:
+               ret = rte_member_create_sketch(setsum, params, sketch_key_ring);
+               break;
        default:
                goto error_unlock_exit;
        }
@@ -162,6 +182,7 @@ rte_member_create(const struct rte_member_parameters 
*params)
 error_unlock_exit:
        rte_free(te);
        rte_free(setsum);
+       rte_ring_free(sketch_key_ring);
        rte_mcfg_tailq_write_unlock();
        return NULL;
 }
@@ -178,6 +199,23 @@ rte_member_add(const struct rte_member_setsum *setsum, 
const void *key,
                return rte_member_add_ht(setsum, key, set_id);
        case RTE_MEMBER_TYPE_VBF:
                return rte_member_add_vbf(setsum, key, set_id);
+       case RTE_MEMBER_TYPE_SKETCH:
+               return rte_member_add_sketch(setsum, key, set_id);
+       default:
+               return -EINVAL;
+       }
+}
+
+int
+rte_member_add_byte_count(const struct rte_member_setsum *setsum,
+                         const void *key, uint32_t byte_count)
+{
+       if (setsum == NULL || key == NULL || byte_count == 0)
+               return -EINVAL;
+
+       switch (setsum->type) {
+       case RTE_MEMBER_TYPE_SKETCH:
+               return rte_member_add_sketch_byte_count(setsum, key, 
byte_count);
        default:
                return -EINVAL;
        }
@@ -195,6 +233,8 @@ rte_member_lookup(const struct rte_member_setsum *setsum, 
const void *key,
                return rte_member_lookup_ht(setsum, key, set_id);
        case RTE_MEMBER_TYPE_VBF:
                return rte_member_lookup_vbf(setsum, key, set_id);
+       case RTE_MEMBER_TYPE_SKETCH:
+               return rte_member_lookup_sketch(setsum, key, set_id);
        default:
                return -EINVAL;
        }
@@ -261,6 +301,36 @@ rte_member_lookup_multi_bulk(const struct 
rte_member_setsum *setsum,
        }
 }
 
+int
+rte_member_query_count(const struct rte_member_setsum *setsum,
+                      const void *key, uint64_t *output)
+{
+       if (setsum == NULL || key == NULL || output == NULL)
+               return -EINVAL;
+
+       switch (setsum->type) {
+       case RTE_MEMBER_TYPE_SKETCH:
+               return rte_member_query_sketch(setsum, key, output);
+       default:
+               return -EINVAL;
+       }
+}
+
+int
+rte_member_report_heavyhitter(const struct rte_member_setsum *setsum,
+                               void **key, uint64_t *count)
+{
+       if (setsum == NULL || key == NULL || count == NULL)
+               return -EINVAL;
+
+       switch (setsum->type) {
+       case RTE_MEMBER_TYPE_SKETCH:
+               return rte_member_report_heavyhitter_sketch(setsum, key, count);
+       default:
+               return -EINVAL;
+       }
+}
+
 int
 rte_member_delete(const struct rte_member_setsum *setsum, const void *key,
                        member_set_t set_id)
@@ -272,6 +342,8 @@ rte_member_delete(const struct rte_member_setsum *setsum, 
const void *key,
        case RTE_MEMBER_TYPE_HT:
                return rte_member_delete_ht(setsum, key, set_id);
        /* current vBF implementation does not support delete function */
+       case RTE_MEMBER_TYPE_SKETCH:
+               return rte_member_delete_sketch(setsum, key);
        case RTE_MEMBER_TYPE_VBF:
        default:
                return -EINVAL;
@@ -290,6 +362,9 @@ rte_member_reset(const struct rte_member_setsum *setsum)
        case RTE_MEMBER_TYPE_VBF:
                rte_member_reset_vbf(setsum);
                return;
+       case RTE_MEMBER_TYPE_SKETCH:
+               rte_member_reset_sketch(setsum);
+               return;
        default:
                return;
        }
diff --git a/lib/member/rte_member.h b/lib/member/rte_member.h
index 2611015771..178efeed33 100644
--- a/lib/member/rte_member.h
+++ b/lib/member/rte_member.h
@@ -39,6 +39,18 @@
  * |          |                     | not overwrite  |                         
|
  * |          |                     | existing key.  |                         
|
  * 
+----------+---------------------+----------------+-------------------------+
+ * +==========+=============================+
+ * |   type   |      sketch                 |
+ * +==========+=============================+
+ * |structure | counting bloom filter array |
+ * +----------+-----------------------------+
+ * |set id    | 1: heavy set, 0: light set  |
+ * |          |                             |
+ * +----------+-----------------------------+
+ * |usages &  | count size of a flow,       |
+ * |properties| used for heavy hitter       |
+ * |          | detection.                  |
+ * +----------+-----------------------------+
  * -->
  */
 
@@ -50,6 +62,8 @@ extern "C" {
 #endif
 
 #include <stdint.h>
+#include <stdbool.h>
+#include <inttypes.h>
 
 #include <rte_common.h>
 
@@ -65,6 +79,20 @@ typedef uint16_t member_set_t;
 #define RTE_MEMBER_BUCKET_ENTRIES 16
 /** Maximum number of characters in setsum name. */
 #define RTE_MEMBER_NAMESIZE 32
+/** Max value of the random number */
+#define RTE_RAND_MAX      ~0LLU
+/**
+ * As packets skipped in the sampling-based algorithm, the accounting
+ * results accuracy is not guaranteed in the start stage. There should
+ * be a "convergence time" to achieve the accuracy after receiving enough
+ * packets.
+ * For sketch, use the flag if prefer always bounded mode, which only
+ * starts sampling after receiving enough packets to keep the results
+ * accuracy always bounded.
+ */
+#define RTE_MEMBER_SKETCH_ALWAYS_BOUNDED 0x01
+/** For sketch, use the flag if to count packet size instead of packet count */
+#define RTE_MEMBER_SKETCH_COUNT_BYTE 0x02
 
 /** @internal Hash function used by membership library. */
 #if defined(RTE_ARCH_X86) || defined(__ARM_FEATURE_CRC32)
@@ -104,6 +132,7 @@ struct rte_member_parameters;
 enum rte_member_setsum_type {
        RTE_MEMBER_TYPE_HT = 0,  /**< Hash table based set summary. */
        RTE_MEMBER_TYPE_VBF,     /**< Vector of bloom filters. */
+       RTE_MEMBER_TYPE_SKETCH,
        RTE_MEMBER_NUM_TYPE
 };
 
@@ -114,6 +143,19 @@ enum rte_member_sig_compare_function {
        RTE_MEMBER_COMPARE_NUM
 };
 
+/* sketch update function with different implementations. */
+typedef void (*sketch_update_fn_t)(const struct rte_member_setsum *ss,
+                                  const void *key,
+                                  uint32_t count);
+
+/* sketch lookup function with different implementations. */
+typedef uint64_t (*sketch_lookup_fn_t)(const struct rte_member_setsum *ss,
+                                      const void *key);
+
+/* sketch delete function with different implementations. */
+typedef void (*sketch_delete_fn_t)(const struct rte_member_setsum *ss,
+                                  const void *key);
+
 /** @internal setsummary structure. */
 struct rte_member_setsum {
        enum rte_member_setsum_type type; /* Type of the set summary. */
@@ -134,6 +176,21 @@ struct rte_member_setsum {
        uint32_t bit_mask;      /* Bit mask to get bit location in bf. */
        uint32_t num_hashes;    /* Number of hash values to index bf. */
 
+       /* Parameters for sketch */
+       float error_rate;
+       float sample_rate;
+       uint32_t num_col;
+       uint32_t num_row;
+       int always_bounded;
+       double converge_thresh;
+       uint32_t topk;
+       uint32_t count_byte;
+       uint64_t *hash_seeds;
+       sketch_update_fn_t sketch_update; /* Pointer to the sketch update 
function */
+       sketch_lookup_fn_t sketch_lookup; /* Pointer to the sketch lookup 
function */
+       sketch_delete_fn_t sketch_delete; /* Pointer to the sketch delete 
function */
+
+       void *runtime_var;
        uint32_t mul_shift;  /* vbf internal variable used during bit test. */
        uint32_t div_shift;  /* vbf internal variable used during bit test. */
 
@@ -143,6 +200,9 @@ struct rte_member_setsum {
        /* Second cache line should start here. */
        uint32_t socket_id;          /* NUMA Socket ID for memory. */
        char name[RTE_MEMBER_NAMESIZE]; /* Name of this set summary. */
+#ifdef RTE_ARCH_X86
+       bool use_avx512;
+#endif
 } __rte_cache_aligned;
 
 /**
@@ -261,8 +321,33 @@ struct rte_member_parameters {
         */
        uint32_t sec_hash_seed;
 
+       /**
+        * For count(min) sketch data structure, error rate defines the accuracy
+        * required by the user. Higher accuracy leads to more memory usage, but
+        * the flow size is estimated more accurately.
+        */
+       float error_rate;
+
+       /**
+        * Sampling rate means the internal sample rate of the rows of the count
+        * min sketches. Lower sampling rate can reduce CPU overhead, but the
+        * data structure will require more time to converge statistically.
+        */
+       float sample_rate;
+
+       /**
+        * How many top heavy hitter to be reported. The library will internally
+        * keep the keys of heavy hitters for final report.
+        */
+       uint32_t top_k;
+
+       /**
+        * Extra flags that may passed in by user
+        */
+       uint32_t extra_flag;
+
        int socket_id;                  /**< NUMA Socket ID for memory. */
-};
+} __rte_cache_aligned;
 
 /**
  * @warning
@@ -418,7 +503,7 @@ rte_member_lookup_multi_bulk(const struct rte_member_setsum 
*setsum,
  *   RTE_MEMBER_NO_MATCH by default is set as 0.
  *   For HT mode, the set_id has range as [1, 0x7FFF], MSB is reserved.
  *   For vBF mode the set id is limited by the num_set parameter when create
- *   the set-summary.
+ *   the set-summary. For sketch mode, this id is ignored.
  * @return
  *   HT (cache mode) and vBF should never fail unless the set_id is not in the
  *   valid range. In such case -EINVAL is returned.
@@ -429,12 +514,75 @@ rte_member_lookup_multi_bulk(const struct 
rte_member_setsum *setsum,
  *   Return 0 for HT (cache mode) if the add does not cause
  *   eviction, return 1 otherwise. Return 0 for non-cache mode if success,
  *   -ENOSPC for full, and 1 if cuckoo eviction happens.
- *   Always returns 0 for vBF mode.
+ *   Always returns 0 for vBF mode and sketch.
  */
 int
 rte_member_add(const struct rte_member_setsum *setsum, const void *key,
                        member_set_t set_id);
 
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Add the packet byte size into the sketch.
+ *
+ * @param setsum
+ *   Pointer of a set-summary.
+ * @param key
+ *   Pointer of the key to be added.
+ * @param byte_count
+ *   Add the byte count of the packet into the sketch.
+ * @return
+ * Return -EINVAL for invalid parameters, otherwise return 0.
+ */
+__rte_experimental
+int
+rte_member_add_byte_count(const struct rte_member_setsum *setsum,
+                         const void *key, uint32_t byte_count);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Query packet count for a certain flow-key.
+ *
+ * @param setsum
+ *   Pointer of a set-summary.
+ * @param key
+ *   Pointer of the key to be added.
+ * @param count
+ *   The output packet count or byte count.
+ * @return
+ *   Return -EINVAL for invalid parameters.
+ */
+__rte_experimental
+int
+rte_member_query_count(const struct rte_member_setsum *setsum,
+                      const void *key, uint64_t *count);
+
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Report heavyhitter flow-keys into set-summary (SS).
+ *
+ * @param setsum
+ *   Pointer of a set-summary.
+ * @param keys
+ *   Pointer of the output top-k key array.
+ * @param counts
+ *   Pointer of the output packet count or byte count array of the top-k keys.
+ * @return
+ *   Return -EINVAL for invalid parameters. Return a positive integer indicate
+ *   how many heavy hitters are reported.
+ */
+__rte_experimental
+int
+rte_member_report_heavyhitter(const struct rte_member_setsum *setsum,
+                             void **keys, uint64_t *counts);
+
+
 /**
  * @warning
  * @b EXPERIMENTAL: this API may change without prior notice
diff --git a/lib/member/rte_member_heap.h b/lib/member/rte_member_heap.h
new file mode 100644
index 0000000000..3ced34160a
--- /dev/null
+++ b/lib/member/rte_member_heap.h
@@ -0,0 +1,424 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2020 Intel Corporation
+ * Copyright(c) 2020, Alan Liu <zaoxing...@gmail.com>
+ */
+
+#ifndef _RTE_MEMBER_HEAP_H_
+#define _RTE_MEMBER_HEAP_H_
+
+#include <rte_ring_elem.h>
+#include "rte_member.h"
+
+#define LCHILD(x) (2 * x + 1)
+#define RCHILD(x) (2 * x + 2)
+#define PARENT(x) ((x - 1) / 2)
+
+#define HASH_BKT_SIZE 16
+#define HASH_HP_MULTI 4
+#define HASH_RESIZE_MULTI 2
+
+struct hash_bkt {
+       uint16_t sig[HASH_BKT_SIZE];
+       uint16_t idx[HASH_BKT_SIZE];
+};
+
+struct hash {
+       uint16_t bkt_cnt;
+       uint16_t num_item;
+       uint32_t seed;
+       struct hash_bkt buckets[0];
+};
+
+struct node {
+       void *key;
+       uint64_t count;
+};
+
+struct minheap {
+       uint32_t key_len;
+       uint32_t size;
+       uint32_t socket;
+       struct hash *hashtable;
+       struct node *elem;
+};
+
+static int
+hash_table_insert(const void *key, int value, int key_len, struct hash *table)
+{
+       uint32_t hash = MEMBER_HASH_FUNC(key, key_len, table->seed);
+       uint16_t idx = hash % table->bkt_cnt;
+       uint16_t sig = hash >> 16;
+       int i;
+
+       for (i = 0; i < HASH_BKT_SIZE; i++) {
+               if (table->buckets[idx].idx[i] == 0) {
+                       table->buckets[idx].idx[i] = value;
+                       table->buckets[idx].sig[i] = sig;
+                       table->num_item++;
+                       return 0;
+               }
+       }
+
+       return -ENOMEM;
+}
+
+static int
+hash_table_update(const void *key, int old_value, int value, int key_len, 
struct hash *table)
+{
+       uint32_t hash = MEMBER_HASH_FUNC(key, key_len, table->seed);
+       uint16_t idx = hash % table->bkt_cnt;
+       uint16_t sig = hash >> 16;
+       int i;
+
+       for (i = 0; i < HASH_BKT_SIZE; i++) {
+               if (table->buckets[idx].sig[i] == sig && 
table->buckets[idx].idx[i] == old_value) {
+                       table->buckets[idx].idx[i] = value;
+                       return 0;
+               }
+       }
+
+       return -1;
+}
+
+static int
+hash_table_del(const void *key, uint16_t value, int key_len, struct hash 
*table)
+{
+       uint32_t hash = MEMBER_HASH_FUNC(key, key_len, table->seed);
+       uint16_t idx = hash % table->bkt_cnt;
+       uint16_t sig = hash >> 16;
+       int i;
+
+       for (i = 0; i < HASH_BKT_SIZE; i++) {
+               if (table->buckets[idx].sig[i] == sig && 
table->buckets[idx].idx[i] == value) {
+                       table->buckets[idx].idx[i] = 0;
+                       table->num_item--;
+                       return 0;
+               }
+       }
+
+       return -1;
+}
+
+static int
+hash_table_lookup(const void *key, int key_len, struct minheap *hp)
+{
+       struct hash *table = hp->hashtable;
+       uint32_t hash = MEMBER_HASH_FUNC(key, key_len, table->seed);
+       uint16_t idx = hash % table->bkt_cnt;
+       uint16_t sig = hash >> 16;
+       int i;
+
+       for (i = 0; i < HASH_BKT_SIZE; i++) {
+               if (table->buckets[idx].sig[i] == sig && 
table->buckets[idx].idx[i] != 0) {
+                       uint32_t hp_idx = table->buckets[idx].idx[i] - 1;
+
+                       if (memcmp(hp->elem[hp_idx].key, key, hp->key_len) == 0)
+                               return hp_idx;
+               }
+       }
+
+       return -ENOENT; /* key doesn't exist */
+}
+
+static int
+resize_hash_table(struct minheap *hp)
+{
+       uint32_t i;
+       uint32_t new_bkt_cnt;
+
+       while (1) {
+               new_bkt_cnt = hp->hashtable->bkt_cnt * HASH_RESIZE_MULTI;
+
+               RTE_MEMBER_LOG(ERR, "Sketch Minheap HT load factor is [%f]\n",
+                       hp->hashtable->num_item / 
((float)hp->hashtable->bkt_cnt * HASH_BKT_SIZE));
+               RTE_MEMBER_LOG(ERR, "Sketch Minheap HT resize happen!\n");
+               rte_free(hp->hashtable);
+               hp->hashtable = rte_zmalloc_socket(NULL, sizeof(struct hash) +
+                                               new_bkt_cnt * sizeof(struct 
hash_bkt),
+                                               RTE_CACHE_LINE_SIZE, 
hp->socket);
+
+               if (hp->hashtable == NULL) {
+                       RTE_MEMBER_LOG(ERR, "Sketch Minheap HT allocation 
failed\n");
+                       return -ENOMEM;
+               }
+
+               hp->hashtable->bkt_cnt = new_bkt_cnt;
+
+               for (i = 0; i < hp->size; ++i) {
+                       if (hash_table_insert(hp->elem[i].key,
+                               i + 1, hp->key_len, hp->hashtable) < 0) {
+                               RTE_MEMBER_LOG(ERR,
+                                       "Sketch Minheap HT resize insert 
fail!\n");
+                               break;
+                       }
+               }
+               if (i == hp->size)
+                       break;
+       }
+
+       return 0;
+}
+
+/* find the item in the given minheap */
+static int
+rte_member_minheap_find(struct minheap *hp, const void *key)
+{
+       int idx = hash_table_lookup(key, hp->key_len, hp);
+       return idx;
+}
+
+static int
+rte_member_minheap_init(struct minheap *heap, int size,
+                       uint32_t socket, uint32_t seed)
+{
+       heap->elem = rte_zmalloc_socket(NULL, sizeof(struct node) * size,
+                               RTE_CACHE_LINE_SIZE, socket);
+       if (heap->elem == NULL) {
+               RTE_MEMBER_LOG(ERR, "Sketch Minheap elem allocation failed\n");
+               return -ENOMEM;
+       }
+
+       uint32_t hash_bkt_cnt = rte_align32pow2(size * HASH_HP_MULTI) / 
HASH_BKT_SIZE;
+
+       if (hash_bkt_cnt == 0)
+               hash_bkt_cnt = 1;
+
+       heap->hashtable = rte_zmalloc_socket(NULL, sizeof(struct hash) +
+                                       hash_bkt_cnt * sizeof(struct hash_bkt),
+                                       RTE_CACHE_LINE_SIZE, socket);
+
+       if (heap->hashtable == NULL) {
+               RTE_MEMBER_LOG(ERR, "Sketch Minheap HT allocation failed\n");
+               rte_free(heap->elem);
+               return -ENOMEM;
+       }
+
+       heap->hashtable->seed = seed;
+       heap->hashtable->bkt_cnt = hash_bkt_cnt;
+       heap->socket = socket;
+
+       return 0;
+}
+
+/* swap the minheap nodes */
+static __rte_always_inline void
+rte_member_heap_swap(struct node *n1, struct node *n2)
+{
+       struct node temp = *n1;
+       *n1 = *n2;
+       *n2 = temp;
+}
+
+/* heapify function */
+static void
+rte_member_heapify(struct minheap *hp, uint32_t idx, bool update_hash)
+{
+       uint32_t smallest;
+
+       if (LCHILD(idx) < hp->size &&
+                       hp->elem[LCHILD(idx)].count < hp->elem[idx].count)
+               smallest = LCHILD(idx);
+       else
+               smallest = idx;
+
+       if (RCHILD(idx) < hp->size &&
+                       hp->elem[RCHILD(idx)].count < hp->elem[smallest].count)
+               smallest = RCHILD(idx);
+
+       if (smallest != idx) {
+               rte_member_heap_swap(&(hp->elem[idx]), &(hp->elem[smallest]));
+
+               if (update_hash) {
+                       if (hash_table_update(hp->elem[smallest].key, idx + 1, 
smallest + 1,
+                                       hp->key_len, hp->hashtable) < 0) {
+                               RTE_MEMBER_LOG(ERR, "Minheap Hash Table update 
failed\n");
+                               return;
+                       }
+
+                       if (hash_table_update(hp->elem[idx].key, smallest + 1, 
idx + 1,
+                                       hp->key_len, hp->hashtable) < 0) {
+                               RTE_MEMBER_LOG(ERR, "Minheap Hash Table update 
failed\n");
+                               return;
+                       }
+               }
+               rte_member_heapify(hp, smallest, update_hash);
+       }
+}
+
+/* insert a node into the minheap */
+static int
+rte_member_minheap_insert_node(struct minheap *hp, const void *key,
+                              int counter, void *key_slot,
+                              struct rte_ring *free_key_slot)
+{
+       struct node nd;
+       uint32_t slot_id;
+
+       if (rte_ring_sc_dequeue_elem(free_key_slot, &slot_id, sizeof(uint32_t)) 
!= 0) {
+               RTE_MEMBER_LOG(ERR, "Minheap get empty keyslot failed\n");
+               return -1;
+       }
+
+       nd.count = counter;
+       nd.key = RTE_PTR_ADD(key_slot, slot_id * hp->key_len);
+
+       memcpy(nd.key, key, hp->key_len);
+
+       uint32_t i = (hp->size)++;
+
+       while (i && nd.count < hp->elem[PARENT(i)].count) {
+               hp->elem[i] = hp->elem[PARENT(i)];
+               if (hash_table_update(hp->elem[i].key, PARENT(i) + 1, i + 1,
+                               hp->key_len, hp->hashtable) < 0) {
+                       RTE_MEMBER_LOG(ERR, "Minheap Hash Table update 
failed\n");
+                       return -1;
+               }
+               i = PARENT(i);
+       }
+       hp->elem[i] = nd;
+
+       if (hash_table_insert(key, i + 1, hp->key_len, hp->hashtable) < 0) {
+               if (resize_hash_table(hp) < 0) {
+                       RTE_MEMBER_LOG(ERR, "Minheap Hash Table resize 
failed\n");
+                       return -1;
+               }
+       }
+
+       return 0;
+}
+
+/* delete a key from the minheap */
+static int
+rte_member_minheap_delete_node(struct minheap *hp, const void *key,
+                              void *key_slot, struct rte_ring *free_key_slot)
+{
+       int idx = rte_member_minheap_find(hp, key);
+       uint32_t offset = RTE_PTR_DIFF(hp->elem[idx].key, key_slot) / 
hp->key_len;
+
+       if (hash_table_del(key, idx + 1, hp->key_len, hp->hashtable) < 0) {
+               RTE_MEMBER_LOG(ERR, "Minheap Hash Table delete failed\n");
+               return -1;
+       }
+
+       rte_ring_sp_enqueue_elem(free_key_slot, &offset, sizeof(uint32_t));
+
+       if (idx == (int)(hp->size - 1)) {
+               hp->size--;
+               return 0;
+       }
+
+       hp->elem[idx] = hp->elem[hp->size - 1];
+
+       if (hash_table_update(hp->elem[idx].key, hp->size, idx + 1,
+                               hp->key_len, hp->hashtable) < 0) {
+               RTE_MEMBER_LOG(ERR, "Minheap Hash Table update failed\n");
+               return -1;
+       }
+       hp->size--;
+       rte_member_heapify(hp, idx, true);
+
+       return 0;
+}
+
+/* replace a min node with a new key. */
+static int
+rte_member_minheap_replace_node(struct minheap *hp,
+                               const void *new_key,
+                               int new_counter)
+{
+       struct node nd;
+       void *recycle_key = NULL;
+
+       recycle_key = hp->elem[0].key;
+
+       if (hash_table_del(recycle_key, 1, hp->key_len, hp->hashtable) < 0) {
+               RTE_MEMBER_LOG(ERR, "Minheap Hash Table delete failed\n");
+               return -1;
+       }
+
+       hp->elem[0] = hp->elem[hp->size - 1];
+
+       if (hash_table_update(hp->elem[0].key, hp->size, 1,
+                               hp->key_len, hp->hashtable) < 0) {
+               RTE_MEMBER_LOG(ERR, "Minheap Hash Table update failed\n");
+               return -1;
+       }
+       hp->size--;
+
+       rte_member_heapify(hp, 0, true);
+
+       nd.count = new_counter;
+       nd.key = recycle_key;
+
+       memcpy(nd.key, new_key, hp->key_len);
+
+       uint32_t i = (hp->size)++;
+
+       while (i && nd.count < hp->elem[PARENT(i)].count) {
+               hp->elem[i] = hp->elem[PARENT(i)];
+               if (hash_table_update(hp->elem[i].key, PARENT(i) + 1, i + 1,
+                               hp->key_len, hp->hashtable) < 0) {
+                       RTE_MEMBER_LOG(ERR, "Minheap Hash Table update 
failed\n");
+                       return -1;
+               }
+               i = PARENT(i);
+       }
+
+       hp->elem[i] = nd;
+
+       if (hash_table_insert(new_key, i + 1, hp->key_len, hp->hashtable) < 0) {
+               RTE_MEMBER_LOG(ERR, "Minheap Hash Table replace insert 
failed\n");
+               if (resize_hash_table(hp) < 0) {
+                       RTE_MEMBER_LOG(ERR, "Minheap Hash Table replace resize 
failed\n");
+                       return -1;
+               }
+       }
+
+       return 0;
+}
+
+/* sort the heap into a descending array */
+static void
+rte_member_heapsort(struct minheap *hp, struct node *result_array)
+{
+       struct minheap new_hp;
+
+       /* build a new heap for using the given array */
+       new_hp.size = hp->size;
+       new_hp.key_len = hp->key_len;
+       new_hp.elem = result_array;
+       memcpy(result_array, hp->elem, hp->size * sizeof(struct node));
+
+       /* sort the new heap */
+       while (new_hp.size > 1) {
+               rte_member_heap_swap(&(new_hp.elem[0]), 
&(new_hp.elem[new_hp.size - 1]));
+               new_hp.size--;
+               rte_member_heapify(&new_hp, 0, false);
+       }
+}
+
+static void
+rte_member_minheap_free(struct minheap *hp)
+{
+       if (hp == NULL)
+               return;
+
+       rte_free(hp->elem);
+       rte_free(hp->hashtable);
+}
+
+static void
+rte_member_minheap_reset(struct minheap *hp)
+{
+       if (hp == NULL)
+               return;
+
+       memset(hp->elem, 0, sizeof(struct node) * hp->size);
+       hp->size = 0;
+
+       memset((char *)hp->hashtable + sizeof(struct hash), 0,
+                       hp->hashtable->bkt_cnt * sizeof(struct hash_bkt));
+       hp->hashtable->num_item = 0;
+}
+
+#endif /* _RTE_MEMBER_HEAP_H_ */
diff --git a/lib/member/rte_member_sketch.c b/lib/member/rte_member_sketch.c
new file mode 100644
index 0000000000..524ba77620
--- /dev/null
+++ b/lib/member/rte_member_sketch.c
@@ -0,0 +1,594 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2020 Intel Corporation
+ * Copyright(c) 2020, Alan Liu <zaoxing...@gmail.com>
+ */
+
+#include <math.h>
+#include <string.h>
+
+#include <rte_malloc.h>
+#include <rte_memory.h>
+#include <rte_errno.h>
+#include <rte_log.h>
+#include <rte_random.h>
+#include <rte_prefetch.h>
+#include <rte_ring_elem.h>
+
+#include "rte_member.h"
+#include "rte_member_sketch.h"
+#include "rte_member_heap.h"
+
+#ifdef CC_AVX512_SUPPORT
+#include "rte_member_sketch_avx512.h"
+#endif /* CC_AVX512_SUPPORT */
+
+struct sketch_runtime {
+       uint64_t pkt_cnt;
+       uint32_t until_next;
+       int converged;
+       struct minheap heap;
+       struct node *report_array;
+       void *key_slots;
+       struct rte_ring *free_key_slots;
+} __rte_cache_aligned;
+
+/*
+ * Geometric sampling to calculate how many packets needs to be
+ * skipped until next update. This method can mitigate the CPU
+ * overheads compared with coin-toss sampling.
+ */
+static uint32_t
+draw_geometric(const struct rte_member_setsum *ss)
+{
+       double rand = 1;
+
+       if (ss->sample_rate == 1)
+               return 1;
+
+       while (rand == 1 || rand == 0)
+               rand = (double) rte_rand() / (double)(RTE_RAND_MAX);
+
+       return (uint32_t)ceil(log(1 - rand) / log(1 - ss->sample_rate));
+}
+
+static void
+isort(uint64_t *array, int n)
+{
+       int i;
+
+       for (i = 1; i < n; i++) {
+               uint64_t t = array[i];
+               int j;
+
+               for (j = i - 1; j >= 0; j--) {
+                       if (t < array[j])
+                               array[j + 1] = array[j];
+                       else
+                               break;
+               }
+               array[j + 1] = t;
+       }
+}
+
+static __rte_always_inline void
+swap(uint64_t *a, uint64_t *b)
+{
+       uint64_t tmp = *a;
+       *a = *b;
+       *b = tmp;
+}
+
+static uint64_t
+medianof5(uint64_t a, uint64_t b, uint64_t c, uint64_t d, uint64_t e)
+{
+       if (a > b)
+               swap(&a, &b);
+       if (c > d)
+               swap(&c, &d);
+       if (a > c) {
+               if (d > e)
+                       swap(&c, &e);
+               else {
+                       swap(&c, &d);
+                       swap(&d, &e);
+               }
+       } else {
+               if (b > e)
+                       swap(&a, &e);
+               else {
+                       swap(&a, &b);
+                       swap(&b, &e);
+               }
+       }
+
+       if (a > c)
+               return a > d ? d : a;
+       else
+               return b > c ? c : b;
+}
+
+int
+rte_member_create_sketch(struct rte_member_setsum *ss,
+                        const struct rte_member_parameters *params,
+                        struct rte_ring *ring)
+{
+       struct sketch_runtime *runtime;
+       uint32_t num_col;
+       uint32_t i;
+
+       if (params->sample_rate == 0 || params->sample_rate > 1) {
+               rte_errno = EINVAL;
+               RTE_MEMBER_LOG(ERR,
+                       "Membership Sketch created with invalid parameters\n");
+               return -EINVAL;
+       }
+
+       if (params->extra_flag & RTE_MEMBER_SKETCH_COUNT_BYTE)
+               ss->count_byte = 1;
+
+#ifdef RTE_ARCH_X86
+       if (ss->count_byte == 1 &&
+               rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_512 &&
+               rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) == 1 &&
+               rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512IFMA) == 1) {
+#ifdef CC_AVX512_SUPPORT
+               ss->use_avx512 = true;
+#else
+               ss->use_avx512 = false;
+#endif
+       }
+
+       if (ss->use_avx512 == true) {
+#ifdef CC_AVX512_SUPPORT
+               ss->num_row = NUM_ROW_VEC;
+               RTE_MEMBER_LOG(NOTICE,
+                       "Membership Sketch AVX512 update/lookup/delete ops is 
selected\n");
+               ss->sketch_update = sketch_update_avx512;
+               ss->sketch_lookup = sketch_lookup_avx512;
+               ss->sketch_delete = sketch_delete_avx512;
+#endif
+       } else
+#endif
+       {
+               ss->num_row = NUM_ROW_SCALAR;
+               RTE_MEMBER_LOG(NOTICE,
+                       "Membership Sketch SCALAR update/lookup/delete ops is 
selected\n");
+               ss->sketch_update = sketch_update_scalar;
+               ss->sketch_lookup = sketch_lookup_scalar;
+               ss->sketch_delete = sketch_delete_scalar;
+       }
+
+       ss->socket_id = params->socket_id;
+
+       if (ss->count_byte == 0)
+               num_col = 4.0 / params->error_rate / params->sample_rate;
+#ifdef RTE_ARCH_X86
+       else if (ss->use_avx512 == true)
+               num_col = rte_align32pow2(4.0 / params->error_rate);
+#endif
+       else
+               num_col = 4.0 / params->error_rate;
+
+       ss->table = rte_zmalloc_socket(NULL,
+                       sizeof(uint64_t) * num_col * ss->num_row,
+                       RTE_CACHE_LINE_SIZE, ss->socket_id);
+       if (ss->table == NULL) {
+               RTE_MEMBER_LOG(ERR, "Sketch Table memory allocation failed\n");
+               return -ENOMEM;
+       }
+
+       ss->hash_seeds = rte_zmalloc_socket(NULL, sizeof(uint64_t) * 
ss->num_row,
+                       RTE_CACHE_LINE_SIZE, ss->socket_id);
+       if (ss->hash_seeds == NULL) {
+               RTE_MEMBER_LOG(ERR, "Sketch Hashseeds memory allocation 
failed\n");
+               return -ENOMEM;
+       }
+
+       ss->runtime_var = rte_zmalloc_socket(NULL, sizeof(struct 
sketch_runtime),
+                                       RTE_CACHE_LINE_SIZE, ss->socket_id);
+       if (ss->runtime_var == NULL) {
+               RTE_MEMBER_LOG(ERR, "Sketch Runtime memory allocation 
failed\n");
+               rte_free(ss);
+               return -ENOMEM;
+       }
+       runtime = ss->runtime_var;
+
+       ss->num_col = num_col;
+       ss->sample_rate = params->sample_rate;
+       ss->prim_hash_seed = params->prim_hash_seed;
+       ss->sec_hash_seed = params->sec_hash_seed;
+       ss->error_rate = params->error_rate;
+       ss->topk = params->top_k;
+       ss->key_len = params->key_len;
+       runtime->heap.key_len = ss->key_len;
+
+       runtime->key_slots = rte_zmalloc_socket(NULL, ss->key_len * ss->topk,
+                                       RTE_CACHE_LINE_SIZE, ss->socket_id);
+       if (runtime->key_slots == NULL) {
+               RTE_MEMBER_LOG(ERR, "Sketch Key Slots allocation failed\n");
+               goto error;
+       }
+
+       runtime->free_key_slots = ring;
+       for (i = 0; i < ss->topk; i++)
+               rte_ring_sp_enqueue_elem(runtime->free_key_slots,
+                                       &i, sizeof(uint32_t));
+
+       if (rte_member_minheap_init(&(runtime->heap), params->top_k,
+                       ss->socket_id, params->prim_hash_seed) < 0) {
+               RTE_MEMBER_LOG(ERR, "Sketch Minheap allocation failed\n");
+               goto error_runtime;
+       }
+
+       runtime->report_array = rte_zmalloc_socket(NULL, sizeof(struct node) * 
ss->topk,
+                                       RTE_CACHE_LINE_SIZE, ss->socket_id);
+       if (runtime->report_array == NULL) {
+               RTE_MEMBER_LOG(ERR, "Sketch Runtime Report Array allocation 
failed\n");
+               goto error_runtime;
+       }
+
+       rte_srand(ss->prim_hash_seed);
+       for (i = 0; i < ss->num_row; i++)
+               ss->hash_seeds[i] = rte_rand();
+
+       if (params->extra_flag & RTE_MEMBER_SKETCH_ALWAYS_BOUNDED)
+               ss->always_bounded = 1;
+
+       if (ss->always_bounded) {
+               double delta = 1.0 / (pow(2, ss->num_row));
+
+               ss->converge_thresh = 10 * pow(ss->error_rate, -2.0) * 
sqrt(log(1 / delta));
+       }
+
+       RTE_MEMBER_LOG(DEBUG, "Sketch created, "
+               "the total memory required is %u Bytes\n",  ss->num_col * 
ss->num_row * 8);
+
+       return 0;
+
+error_runtime:
+       rte_member_minheap_free(&runtime->heap);
+       rte_ring_free(runtime->free_key_slots);
+       rte_free(runtime->key_slots);
+error:
+       rte_free(runtime);
+       rte_free(ss);
+
+       return -ENOMEM;
+}
+
+uint64_t
+sketch_lookup_scalar(const struct rte_member_setsum *ss, const void *key)
+{
+       uint64_t *count_array = ss->table;
+       uint32_t col[ss->num_row];
+       uint64_t count_row[ss->num_row];
+       uint32_t cur_row;
+       uint64_t count;
+
+       for (cur_row = 0; cur_row < ss->num_row; cur_row++) {
+               col[cur_row] = MEMBER_HASH_FUNC(key, ss->key_len,
+                       ss->hash_seeds[cur_row]) % ss->num_col;
+
+               rte_prefetch0(&count_array[cur_row * ss->num_col + 
col[cur_row]]);
+       }
+
+       /* if sample rate is 1, it is a regular count-min, we report the min */
+       if (ss->sample_rate == 1 || ss->count_byte == 1)
+               return count_min(ss, col);
+
+       memset(count_row, 0, sizeof(uint64_t) * ss->num_row);
+
+       /* otherwise we report the median number */
+       for (cur_row = 0; cur_row < ss->num_row; cur_row++)
+               count_row[cur_row] = count_array[cur_row * ss->num_col + 
col[cur_row]];
+
+       if (ss->num_row == 5)
+               return medianof5(count_row[0], count_row[1],
+                               count_row[2], count_row[3], count_row[4]);
+
+       isort(count_row, ss->num_row);
+
+       if (ss->num_row % 2 == 0) {
+               count = (count_row[ss->num_row / 2] + count_row[ss->num_row / 2 
- 1]) / 2;
+               return count;
+       }
+       /* ss->num_row % 2 != 0 */
+       count = count_row[ss->num_row / 2];
+
+       return count;
+}
+
+void
+sketch_delete_scalar(const struct rte_member_setsum *ss, const void *key)
+{
+       uint32_t col[ss->num_row];
+       uint64_t *count_array = ss->table;
+       uint32_t cur_row;
+
+       for (cur_row = 0; cur_row < ss->num_row; cur_row++) {
+               col[cur_row] = MEMBER_HASH_FUNC(key, ss->key_len,
+                       ss->hash_seeds[cur_row]) % ss->num_col;
+
+               /* set corresponding counter to 0 */
+               count_array[cur_row * ss->num_col + col[cur_row]] = 0;
+       }
+}
+
+int
+rte_member_query_sketch(const struct rte_member_setsum *ss,
+                       const void *key,
+                       uint64_t *output)
+{
+       uint64_t count = ss->sketch_lookup(ss, key);
+       *output = count;
+
+       return 0;
+}
+
+void
+rte_member_update_heap(const struct rte_member_setsum *ss)
+{
+       uint32_t i;
+       struct sketch_runtime *runtime_var = ss->runtime_var;
+
+       for (i = 0; i < runtime_var->heap.size; i++) {
+               uint64_t count = ss->sketch_lookup(ss, 
runtime_var->heap.elem[i].key);
+
+               runtime_var->heap.elem[i].count = count;
+       }
+}
+
+int
+rte_member_report_heavyhitter_sketch(const struct rte_member_setsum *setsum,
+                                    void **key,
+                                    uint64_t *count)
+{
+       uint32_t i;
+       struct sketch_runtime *runtime_var = setsum->runtime_var;
+
+       rte_member_update_heap(setsum);
+       rte_member_heapsort(&(runtime_var->heap), runtime_var->report_array);
+
+       for (i = 0; i < runtime_var->heap.size; i++) {
+               key[i] = runtime_var->report_array[i].key;
+               count[i] = runtime_var->report_array[i].count;
+       }
+
+       return runtime_var->heap.size;
+}
+
+int
+rte_member_lookup_sketch(const struct rte_member_setsum *ss,
+                        const void *key, member_set_t *set_id)
+{
+       uint64_t count = ss->sketch_lookup(ss, key);
+       struct sketch_runtime *runtime_var = ss->runtime_var;
+
+       if (runtime_var->heap.size > 0 && count >= 
runtime_var->heap.elem[0].count)
+               *set_id = 1;
+       else
+               *set_id = 0;
+
+       if (count == 0)
+               return 0;
+       else
+               return 1;
+}
+
+static void
+should_converge(const struct rte_member_setsum *ss)
+{
+       struct sketch_runtime *runtime_var = ss->runtime_var;
+
+       /* For count min sketch - L1 norm */
+       if (runtime_var->pkt_cnt > ss->converge_thresh) {
+               runtime_var->converged = 1;
+               RTE_MEMBER_LOG(DEBUG, "Sketch converged, begin sampling "
+                                       "from key count %"PRIu64"\n",
+                                       runtime_var->pkt_cnt);
+       }
+}
+
+static void
+sketch_update_row(const struct rte_member_setsum *ss, const void *key,
+                 uint32_t count, uint32_t cur_row)
+{
+       uint64_t *count_array = ss->table;
+       uint32_t col = MEMBER_HASH_FUNC(key, ss->key_len,
+                       ss->hash_seeds[cur_row]) % ss->num_col;
+
+       /* sketch counter update */
+       count_array[cur_row * ss->num_col + col] +=
+                       ceil(count / (ss->sample_rate));
+}
+
+void
+sketch_update_scalar(const struct rte_member_setsum *ss,
+                    const void *key,
+                    uint32_t count)
+{
+       uint64_t *count_array = ss->table;
+       uint32_t col;
+       uint32_t cur_row;
+
+       for (cur_row = 0; cur_row < ss->num_row; cur_row++) {
+               col = MEMBER_HASH_FUNC(key, ss->key_len,
+                               ss->hash_seeds[cur_row]) % ss->num_col;
+               count_array[cur_row * ss->num_col + col] += count;
+       }
+}
+
+static void
+heap_update(const struct rte_member_setsum *ss, const void *key)
+{
+       struct sketch_runtime *runtime_var = ss->runtime_var;
+       uint64_t key_cnt = 0;
+       int found;
+
+       /* We also update the heap for this key */
+       key_cnt = ss->sketch_lookup(ss, key);
+       if (key_cnt > runtime_var->heap.elem[0].count) {
+               found = rte_member_minheap_find(&runtime_var->heap, key);
+               /* the key is found in the top-k heap */
+               if (found >= 0) {
+                       if (runtime_var->heap.elem[found].count < key_cnt)
+                               rte_member_heapify(&runtime_var->heap, found, 
true);
+
+                       runtime_var->heap.elem[found].count = key_cnt;
+               } else if (runtime_var->heap.size < ss->topk) {
+                       rte_member_minheap_insert_node(&runtime_var->heap, key,
+                               key_cnt, runtime_var->key_slots, 
runtime_var->free_key_slots);
+               } else {
+                       rte_member_minheap_replace_node(&runtime_var->heap, 
key, key_cnt);
+               }
+       } else if (runtime_var->heap.size < ss->topk) {
+               found = rte_member_minheap_find(&runtime_var->heap, key);
+               if (found >= 0) {
+                       if (runtime_var->heap.elem[found].count < key_cnt)
+                               rte_member_heapify(&runtime_var->heap, found, 
true);
+
+                       runtime_var->heap.elem[found].count = key_cnt;
+               } else
+                       rte_member_minheap_insert_node(&runtime_var->heap, key,
+                               key_cnt, runtime_var->key_slots, 
runtime_var->free_key_slots);
+       }
+}
+
+/*
+ * Add a single packet into the sketch.
+ * Sketch value is meatured by packet numbers in this mode.
+ */
+int
+rte_member_add_sketch(const struct rte_member_setsum *ss,
+                     const void *key,
+                     __rte_unused member_set_t set_id)
+{
+       uint32_t cur_row;
+       struct sketch_runtime *runtime_var = ss->runtime_var;
+       uint32_t *until_next = &(runtime_var->until_next);
+
+       /*
+        * If sketch is measured by byte count,
+        * the rte_member_add_sketch_byte_count routine should be used.
+        */
+       if (ss->count_byte == 1) {
+               RTE_MEMBER_LOG(ERR, "Sketch is Byte Mode, "
+                       "should use rte_member_add_byte_count()!\n");
+               return -EINVAL;
+       }
+
+       if (ss->sample_rate == 1) {
+               ss->sketch_update(ss, key, 1);
+               heap_update(ss, key);
+               return 0;
+       }
+
+       /* convergence stage if it's needed */
+       if (ss->always_bounded && !runtime_var->converged) {
+               ss->sketch_update(ss, key, 1);
+
+               if (!((++runtime_var->pkt_cnt) & (INTERVAL - 1)))
+                       should_converge(ss);
+
+               heap_update(ss, key);
+               return 0;
+       }
+
+       /* should we skip this packet */
+       if (*until_next >= ss->num_row) {
+               *until_next -= ss->num_row;
+               return 0;
+       }
+       cur_row = *until_next;
+       do {
+               sketch_update_row(ss, key, 1, cur_row);
+               *until_next = draw_geometric(ss);
+               if (cur_row + *until_next >= ss->num_row)
+                       break;
+               cur_row += *until_next;
+       } while (1);
+
+       *until_next -= (ss->num_row - cur_row);
+
+       heap_update(ss, key);
+
+       return 0;
+}
+
+/*
+ * Add the byte count of the packet into the sketch.
+ * Sketch value is meatured by byte count numbers in this mode.
+ */
+int
+rte_member_add_sketch_byte_count(const struct rte_member_setsum *ss,
+                                const void *key,
+                                uint32_t byte_count)
+{
+       struct sketch_runtime *runtime_var = ss->runtime_var;
+       uint32_t *until_next = &(runtime_var->until_next);
+
+       /* should not call this API if not in count byte mode */
+       if (ss->count_byte == 0) {
+               RTE_MEMBER_LOG(ERR, "Sketch is Pkt Mode, "
+                       "should use rte_member_add()!\n");
+               return -EINVAL;
+       }
+
+       /* there's specific optimization for the sketch update */
+       ss->sketch_update(ss, key, byte_count);
+
+       if (*until_next != 0) {
+               *until_next = *until_next - 1;
+               return 0;
+       }
+
+       *until_next = draw_geometric(ss) - 1;
+
+       heap_update(ss, key);
+
+       return 0;
+}
+
+int
+rte_member_delete_sketch(const struct rte_member_setsum *ss,
+                        const void *key)
+{
+       struct sketch_runtime *runtime_var = ss->runtime_var;
+       int found;
+
+       found = rte_member_minheap_find(&runtime_var->heap, key);
+       if (found < 0)
+               return -1;
+
+       ss->sketch_delete(ss, key);
+
+       return rte_member_minheap_delete_node
+               (&runtime_var->heap, key, runtime_var->key_slots, 
runtime_var->free_key_slots);
+}
+
+void
+rte_member_free_sketch(struct rte_member_setsum *ss)
+{
+       struct sketch_runtime *runtime_var = ss->runtime_var;
+
+       rte_free(ss->table);
+       rte_member_minheap_free(&runtime_var->heap);
+       rte_free(runtime_var->key_slots);
+       rte_ring_free(runtime_var->free_key_slots);
+       rte_free(runtime_var);
+}
+
+void
+rte_member_reset_sketch(const struct rte_member_setsum *ss)
+{
+       struct sketch_runtime *runtime_var = ss->runtime_var;
+       uint64_t *sketch = ss->table;
+       uint32_t i;
+
+       memset(sketch, 0, sizeof(uint64_t) * ss->num_col * ss->num_row);
+       rte_member_minheap_reset(&runtime_var->heap);
+       rte_ring_reset(runtime_var->free_key_slots);
+
+       for (i = 0; i < ss->topk; i++)
+               rte_ring_sp_enqueue_elem(runtime_var->free_key_slots, &i, 
sizeof(uint32_t));
+}
diff --git a/lib/member/rte_member_sketch.h b/lib/member/rte_member_sketch.h
new file mode 100644
index 0000000000..219323008b
--- /dev/null
+++ b/lib/member/rte_member_sketch.h
@@ -0,0 +1,97 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2017 Intel Corporation
+ */
+
+#ifndef _RTE_MEMBER_SKETCH_H_
+#define _RTE_MEMBER_SKETCH_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <rte_vect.h>
+#include <rte_ring_elem.h>
+
+#define NUM_ROW_SCALAR 5
+#define INTERVAL (1 << 15)
+
+#if !RTE_IS_POWER_OF_2(INTERVAL)
+#error sketch INTERVAL macro must be a power of 2
+#endif
+
+int
+rte_member_create_sketch(struct rte_member_setsum *ss,
+                        const struct rte_member_parameters *params,
+                        struct rte_ring *r);
+
+int
+rte_member_lookup_sketch(const struct rte_member_setsum *setsum,
+                        const void *key, member_set_t *set_id);
+
+int
+rte_member_add_sketch(const struct rte_member_setsum *setsum,
+                     const void *key,
+                     member_set_t set_id);
+
+int
+rte_member_add_sketch_byte_count(const struct rte_member_setsum *ss,
+                                const void *key, uint32_t byte_count);
+
+void
+sketch_update_scalar(const struct rte_member_setsum *ss,
+                    const void *key,
+                    uint32_t count);
+
+uint64_t
+sketch_lookup_scalar(const struct rte_member_setsum *ss,
+                    const void *key);
+
+void
+sketch_delete_scalar(const struct rte_member_setsum *ss,
+                    const void *key);
+
+int
+rte_member_delete_sketch(const struct rte_member_setsum *setsum,
+                        const void *key);
+
+int
+rte_member_query_sketch(const struct rte_member_setsum *setsum,
+                       const void *key, uint64_t *output);
+
+void
+rte_member_free_sketch(struct rte_member_setsum *ss);
+
+void
+rte_member_reset_sketch(const struct rte_member_setsum *setsum);
+
+int
+rte_member_report_heavyhitter_sketch(const struct rte_member_setsum *setsum,
+                                    void **key, uint64_t *count);
+
+void
+rte_member_update_heap(const struct rte_member_setsum *ss);
+
+static __rte_always_inline uint64_t
+count_min(const struct rte_member_setsum *ss, const uint32_t *hash_results)
+{
+       uint64_t *count_array = ss->table;
+       uint64_t count;
+       uint32_t cur_row;
+       uint64_t min = UINT64_MAX;
+
+       for (cur_row = 0; cur_row < ss->num_row; cur_row++) {
+               uint64_t cnt = count_array[cur_row * ss->num_col + 
hash_results[cur_row]];
+
+               if (cnt < min)
+                       min = cnt;
+       }
+       count = min;
+
+       return count;
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RTE_MEMBER_SKETCH_H_ */
diff --git a/lib/member/rte_member_sketch_avx512.c 
b/lib/member/rte_member_sketch_avx512.c
new file mode 100644
index 0000000000..288e37a446
--- /dev/null
+++ b/lib/member/rte_member_sketch_avx512.c
@@ -0,0 +1,70 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2020 Intel Corporation
+ */
+
+#include "rte_xxh64_avx512.h"
+#include "rte_member_sketch_avx512.h"
+
+__rte_always_inline void
+sketch_update_avx512(const struct rte_member_setsum *ss,
+                    const void *key,
+                    uint32_t count)
+{
+       uint64_t *count_array = ss->table;
+       uint32_t num_col = ss->num_col;
+       uint32_t key_len = ss->key_len;
+       __m256i v_row_base;
+       __m256i v_hash_result;
+       __m512i current_sketch;
+       __m512i updated_sketch;
+       __m512i v_count;
+
+       const __m256i v_idx = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
+       const __m256i v_col = _mm256_set1_epi32(num_col);
+
+       /* compute the hash result parallelly */
+       v_hash_result = rte_xxh64_sketch_avx512
+               (key, key_len, *(__m512i *)ss->hash_seeds, num_col);
+       v_row_base = _mm256_mullo_epi32(v_idx, v_col);
+       v_hash_result = _mm256_add_epi32(v_row_base, v_hash_result);
+
+       current_sketch =
+               _mm512_i32gather_epi64(v_hash_result, count_array, 8);
+       v_count = _mm512_set1_epi64(count);
+       updated_sketch = _mm512_add_epi64(current_sketch, v_count);
+       _mm512_i32scatter_epi64
+               ((void *)count_array, v_hash_result, updated_sketch, 8);
+}
+
+uint64_t
+sketch_lookup_avx512(const struct rte_member_setsum *ss, const void *key)
+{
+       uint32_t col[ss->num_row];
+
+       /* currently only for sketch byte count mode */
+       __m256i v_hash_result = rte_xxh64_sketch_avx512
+               (key, ss->key_len, *(__m512i *)ss->hash_seeds, ss->num_col);
+       _mm256_storeu_si256((__m256i *)col, v_hash_result);
+
+       return count_min(ss, col);
+}
+
+void
+sketch_delete_avx512(const struct rte_member_setsum *ss, const void *key)
+{
+       uint32_t col[ss->num_row];
+       uint64_t *count_array = ss->table;
+       uint64_t min = UINT64_MAX;
+       uint32_t cur_row;
+
+       __m256i v_hash_result = rte_xxh64_sketch_avx512
+               (key, ss->key_len, *(__m512i *)ss->hash_seeds,
+                RTE_ALIGN_FLOOR(ss->num_col, 32));
+       _mm256_storeu_si256((__m256i *)col, v_hash_result);
+
+       min = count_min(ss, col);
+
+       /* subtract the min value from all the counters */
+       for (cur_row = 0; cur_row < ss->num_row; cur_row++)
+               count_array[cur_row * ss->num_col + col[cur_row]] -= min;
+}
diff --git a/lib/member/rte_member_sketch_avx512.h 
b/lib/member/rte_member_sketch_avx512.h
new file mode 100644
index 0000000000..68105cd895
--- /dev/null
+++ b/lib/member/rte_member_sketch_avx512.h
@@ -0,0 +1,35 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2020 Intel Corporation
+ */
+
+#ifndef _RTE_MEMBER_SKETCH_AVX512_H_
+#define _RTE_MEMBER_SKETCH_AVX512_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <rte_vect.h>
+#include "rte_member.h"
+#include "rte_member_sketch.h"
+
+#define NUM_ROW_VEC 8
+
+void
+sketch_update_avx512(const struct rte_member_setsum *ss,
+                    const void *key,
+                    uint32_t count);
+
+uint64_t
+sketch_lookup_avx512(const struct rte_member_setsum *ss,
+                    const void *key);
+
+void
+sketch_delete_avx512(const struct rte_member_setsum *ss,
+                    const void *key);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RTE_MEMBER_SKETCH_AVX512_H_ */
diff --git a/lib/member/rte_xxh64_avx512.h b/lib/member/rte_xxh64_avx512.h
new file mode 100644
index 0000000000..50ca1b52c7
--- /dev/null
+++ b/lib/member/rte_xxh64_avx512.h
@@ -0,0 +1,117 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2020 Intel Corporation
+ */
+
+#ifndef _RTE_XXH64_AVX512_H_
+#define _RTE_XXH64_AVX512_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <rte_common.h>
+#include <immintrin.h>
+
+/* 0b1001111000110111011110011011000110000101111010111100101010000111 */
+static const uint64_t PRIME64_1 = 0x9E3779B185EBCA87ULL;
+/* 0b1100001010110010101011100011110100100111110101001110101101001111 */
+static const uint64_t PRIME64_2 = 0xC2B2AE3D27D4EB4FULL;
+/* 0b0001011001010110011001111011000110011110001101110111100111111001 */
+static const uint64_t PRIME64_3 = 0x165667B19E3779F9ULL;
+/* 0b1000010111101011110010100111011111000010101100101010111001100011 */
+static const uint64_t PRIME64_4 = 0x85EBCA77C2B2AE63ULL;
+/* 0b0010011111010100111010110010111100010110010101100110011111000101 */
+static const uint64_t PRIME64_5 = 0x27D4EB2F165667C5ULL;
+
+static __rte_always_inline  __m512i
+xxh64_round_avx512(__m512i hash, __m512i input)
+{
+       hash = _mm512_madd52lo_epu64(hash,
+                       input,
+                       _mm512_set1_epi64(PRIME64_2));
+
+       hash = _mm512_rol_epi64(hash, 31);
+
+       return hash;
+}
+
+static __rte_always_inline  __m512i
+xxh64_fmix_avx512(__m512i hash)
+{
+       hash = _mm512_xor_si512(hash, _mm512_srli_epi64(hash, 33));
+
+       return hash;
+}
+
+static __rte_always_inline __m256i
+rte_xxh64_sketch_avx512(const void *key, uint32_t key_len,
+                       __m512i v_seed, uint32_t modulo)
+{
+       __m512i v_prime64_5, v_hash;
+       size_t remaining = key_len;
+       size_t offset = 0;
+       __m512i input;
+
+       v_prime64_5 = _mm512_set1_epi64(PRIME64_5);
+       v_hash = _mm512_add_epi64
+                       (_mm512_add_epi64(v_seed, v_prime64_5),
+                        _mm512_set1_epi64(key_len));
+
+       while (remaining >= 8) {
+               input = _mm512_set1_epi64(*(uint64_t *)RTE_PTR_ADD(key, 
offset));
+               v_hash = _mm512_xor_epi64(v_hash,
+                               xxh64_round_avx512(_mm512_setzero_si512(), 
input));
+               v_hash = _mm512_madd52lo_epu64(_mm512_set1_epi64(PRIME64_4),
+                               v_hash,
+                               _mm512_set1_epi64(PRIME64_1));
+
+               remaining -= 8;
+               offset += 8;
+       }
+
+       if (remaining >= 4) {
+               input = _mm512_set1_epi64
+                       (*(uint32_t *)RTE_PTR_ADD(key, offset));
+               v_hash = _mm512_xor_epi64(v_hash,
+                       _mm512_mullo_epi64(input,
+                               _mm512_set1_epi64(PRIME64_1)));
+               v_hash = _mm512_madd52lo_epu64
+                               (_mm512_set1_epi64(PRIME64_3),
+                               _mm512_rol_epi64(v_hash, 23),
+                               _mm512_set1_epi64(PRIME64_2));
+
+               offset += 4;
+               remaining -= 4;
+       }
+
+       while (remaining != 0) {
+               input = _mm512_set1_epi64
+                       (*(uint8_t *)RTE_PTR_ADD(key, offset));
+               v_hash = _mm512_xor_epi64(v_hash,
+                       _mm512_mullo_epi64(input,
+                               _mm512_set1_epi64(PRIME64_5)));
+               v_hash = _mm512_mullo_epi64
+                       (_mm512_rol_epi64(v_hash, 11),
+                       _mm512_set1_epi64(PRIME64_1));
+               offset++;
+               remaining--;
+       }
+
+       v_hash = xxh64_fmix_avx512(v_hash);
+
+       /*
+        * theoritically, such modular operations can be replaced by
+        * _mm512_rem_epi64(), but seems it depends on the compiler's
+        * implementation. so here is the limitation that the modulo
+        * value should be power of 2.
+        */
+       __m512i v_hash_remainder = _mm512_set1_epi64((modulo - 1));
+
+       return _mm512_cvtepi64_epi32(_mm512_and_si512(v_hash, 
v_hash_remainder));
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RTE_XXH64_AVX512_H_ */
diff --git a/lib/member/version.map b/lib/member/version.map
index 19469c6aba..35199270ff 100644
--- a/lib/member/version.map
+++ b/lib/member/version.map
@@ -14,3 +14,12 @@ DPDK_23 {
 
        local: *;
 };
+
+EXPERIMENTAL {
+       global:
+
+       # added in 22.11
+       rte_member_add_byte_count;
+       rte_member_query_count;
+       rte_member_report_heavyhitter;
+};
-- 
2.25.1

Reply via email to