This patch implements rte_memcmp using AVX/SSE intrinsics and uses
librte_hash as the first consumer of the new function.

Tested with the GCC (4.8.2) and Clang (3.4-1) compilers; with both, the tests
show better performance than memcmp on an Intel(R) Core(TM) i7-4790 CPU
@ 3.60GHz running Ubuntu 14.04 x86_64.

Changes in v3:
Implement complete memcmp functionality.
Implement functional and performance tests and add it to
"make test" infrastructure code.

Changes in v2:
Modified code to support only up to 64 bytes, as that is the maximum number
of bytes used by hash for comparison.

Changes in v1:
Initial changes implementing memcmp for up to 128 bytes.

Signed-off-by: Ravi Kerur <rkerur at gmail.com>
---
 app/test/Makefile                                  |   5 +-
 app/test/autotest_data.py                          |  19 +
 app/test/test_hash_perf.c                          |  36 +-
 app/test/test_memcmp.c                             | 229 ++++++
 app/test/test_memcmp_perf.c                        | 339 ++++++++
 .../common/include/arch/ppc_64/rte_memcmp.h        |  62 ++
 .../common/include/arch/x86/rte_memcmp.h           | 900 +++++++++++++++++++++
 lib/librte_eal/common/include/generic/rte_memcmp.h | 175 ++++
 lib/librte_hash/rte_hash.c                         |  59 +-
 9 files changed, 1789 insertions(+), 35 deletions(-)
 create mode 100644 app/test/test_memcmp.c
 create mode 100644 app/test/test_memcmp_perf.c
 create mode 100644 lib/librte_eal/common/include/arch/ppc_64/rte_memcmp.h
 create mode 100644 lib/librte_eal/common/include/arch/x86/rte_memcmp.h
 create mode 100644 lib/librte_eal/common/include/generic/rte_memcmp.h

diff --git a/app/test/Makefile b/app/test/Makefile
index 4aca77c..957e4f1 100644
--- a/app/test/Makefile
+++ b/app/test/Makefile
@@ -81,6 +81,9 @@ SRCS-y += test_logs.c
 SRCS-y += test_memcpy.c
 SRCS-y += test_memcpy_perf.c

+SRCS-y += test_memcmp.c
+SRCS-y += test_memcmp_perf.c
+
 SRCS-$(CONFIG_RTE_LIBRTE_HASH) += test_hash.c
 SRCS-$(CONFIG_RTE_LIBRTE_HASH) += test_hash_perf.c

@@ -150,7 +153,7 @@ CFLAGS_test_kni.o += -Wno-deprecated-declarations
 endif
 CFLAGS += -D_GNU_SOURCE

-# Disable VTA for memcpy test
+# Disable VTA for memcpy tests
 ifeq ($(CC), gcc)
 ifeq ($(shell test $(GCC_VERSION) -ge 44 && echo 1), 1)
 CFLAGS_test_memcpy.o += -fno-var-tracking-assignments
diff --git a/app/test/autotest_data.py b/app/test/autotest_data.py
index 618a946..e07f087 100644
--- a/app/test/autotest_data.py
+++ b/app/test/autotest_data.py
@@ -187,6 +187,12 @@ parallel_test_group_list = [
                 "Report" :     None,
                },
                {
+                "Name" :       "Memcmp autotest",
+                "Command" :    "memcmp_autotest",
+                "Func" :       default_autotest,
+                "Report" :     None,
+               },
+               {
                 "Name" :       "Memzone autotest",
                 "Command" :    "memzone_autotest",
                 "Func" :       default_autotest,
@@ -399,6 +405,19 @@ non_parallel_test_group_list = [
        ]
 },
 {
+       "Prefix":       "memcmp_perf",
+       "Memory" :      all_sockets(512),
+       "Tests" :
+       [
+               {
+                "Name" :       "Memcmp performance autotest",
+                "Command" :    "memcmp_perf_autotest",
+                "Func" :       default_autotest,
+                "Report" :     None,
+               },
+       ]
+},
+{
        "Prefix":       "hash_perf",
        "Memory" :      all_sockets(512),
        "Tests" :       
diff --git a/app/test/test_hash_perf.c b/app/test/test_hash_perf.c
index 6eabb21..6887629 100644
--- a/app/test/test_hash_perf.c
+++ b/app/test/test_hash_perf.c
@@ -440,7 +440,7 @@ run_single_tbl_perf_test(const struct rte_hash *h, 
hash_operation func,
                uint32_t *invalid_pos_count)
 {
        uint64_t begin, end, ticks = 0;
-       uint8_t *key = NULL;
+       uint8_t * volatile key = NULL;
        uint32_t *bucket_occupancies = NULL;
        uint32_t num_buckets, i, j;
        int32_t pos;
@@ -547,30 +547,30 @@ run_tbl_perf_test(struct tbl_perf_test_params *params)
        case ADD_UPDATE:
                num_iterations = params->num_iterations;
                params->num_iterations = params->entries;
-               run_single_tbl_perf_test(handle, rte_hash_add_key, params,
-                               &avg_occupancy, &invalid_pos);
-               params->num_iterations = num_iterations;
                ticks = run_single_tbl_perf_test(handle, rte_hash_add_key,
                                params, &avg_occupancy, &invalid_pos);
+               params->num_iterations = num_iterations;
+               ticks += run_single_tbl_perf_test(handle, rte_hash_add_key,
+                               params, &avg_occupancy, &invalid_pos);
                break;
        case DELETE:
                num_iterations = params->num_iterations;
                params->num_iterations = params->entries;
-               run_single_tbl_perf_test(handle, rte_hash_add_key, params,
-                               &avg_occupancy, &invalid_pos);
+               ticks = run_single_tbl_perf_test(handle, rte_hash_add_key,
+                               params, &avg_occupancy, &invalid_pos);

                params->num_iterations = num_iterations;
-               ticks = run_single_tbl_perf_test(handle, rte_hash_del_key,
+               ticks += run_single_tbl_perf_test(handle, rte_hash_del_key,
                                params, &avg_occupancy, &invalid_pos);
                break;
        case LOOKUP:
                num_iterations = params->num_iterations;
                params->num_iterations = params->entries;
-               run_single_tbl_perf_test(handle, rte_hash_add_key, params,
-                               &avg_occupancy, &invalid_pos);
+               ticks = run_single_tbl_perf_test(handle, rte_hash_add_key,
+                               params, &avg_occupancy, &invalid_pos);

                params->num_iterations = num_iterations;
-               ticks = run_single_tbl_perf_test(handle, rte_hash_lookup,
+               ticks += run_single_tbl_perf_test(handle, rte_hash_lookup,
                                params, &avg_occupancy, &invalid_pos);
                break;
        default: return -1;
@@ -623,10 +623,15 @@ static int run_all_tbl_perf_tests(void)
 static void run_hash_func_test(rte_hash_function f, uint32_t init_val,
                uint32_t key_len)
 {
-       static uint8_t key[RTE_HASH_KEY_LENGTH_MAX];
+       static uint8_t * volatile key;
        uint64_t ticks = 0, start, end;
        unsigned i, j;

+       key = rte_zmalloc("func hash key",
+                         key_len * sizeof(uint8_t), 16);
+       if (key == NULL)
+               return;
+
        for (i = 0; i < HASHTEST_ITERATIONS; i++) {

                for (j = 0; j < key_len; j++)
@@ -638,8 +643,11 @@ static void run_hash_func_test(rte_hash_function f, 
uint32_t init_val,
                ticks += end - start;
        }

-       printf("%-12s, %-18u, %-13u, %.02f\n", get_hash_name(f), (unsigned) 
key_len,
-                       (unsigned) init_val, (double)ticks / 
HASHTEST_ITERATIONS);
+       rte_free(key);
+
+       printf("%-12s, %-18u, %-13u, %.02f\n",
+               get_hash_name(f), (unsigned) key_len, (unsigned) init_val,
+               (double)ticks / HASHTEST_ITERATIONS);
 }

 /*
@@ -687,7 +695,7 @@ fbk_hash_perf_test(void)
                .socket_id = rte_socket_id(),
        };
        struct rte_fbk_hash_table *handle = NULL;
-       uint32_t *keys = NULL;
+       uint32_t * volatile keys = NULL;
        unsigned indexes[TEST_SIZE];
        uint64_t lookup_time = 0;
        unsigned added = 0;
diff --git a/app/test/test_memcmp.c b/app/test/test_memcmp.c
new file mode 100644
index 0000000..7d9c85f
--- /dev/null
+++ b/app/test/test_memcmp.c
@@ -0,0 +1,229 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <stdio.h>
+#include <stdint.h>
+#include <string.h>
+#include <stdlib.h>
+#include <stdarg.h>
+#include <errno.h>
+#include <sys/queue.h>
+
+#include <rte_common.h>
+#include <rte_malloc.h>
+#include <rte_cycles.h>
+#include <rte_random.h>
+#include <rte_memory.h>
+#include <rte_eal.h>
+#include <rte_memcmp.h>
+
+#include "test.h"
+
+/*******************************************************************************
+ * Memcmp functional test configuration section.
+ *
+ * The array below lists the buffer lengths, in bytes, that are exercised.
+ * Every entry is run through the equal, greater-than and less-than tests
+ * driven from test_memcmp().
+ */
+static size_t memcmp_sizes[] = {
+       1, 7, 8, 9, 15, 16, 17, 31, 32, 33, 63, 64, 65, 127, 128, 129, 255,
+       256, 257, 320, 384, 511, 512, 513, 1023, 1024, 1025, 1518, 1522, 1600,
+       2048, 3072, 4096, 5120, 6144, 7168, 8192, 16384
+};

+/******************************************************************************/

+#define RTE_MEMCMP_LENGTH_MAX 16384 /* largest entry of memcmp_sizes[] */
+
+/*
+ * Run one equality check of rte_memcmp() on 'len' bytes.
+ *
+ * Two separately allocated buffers are filled with the same random bytes and
+ * compared, so the result cannot be satisfied merely by both arguments being
+ * the same pointer.
+ *
+ * Returns the rte_memcmp() result (0 when the buffers compare equal), or -1
+ * if a buffer could not be allocated.
+ */
+static int run_memcmp_eq_func_test(uint32_t len)
+{
+       int rc;
+       uint32_t i;
+       uint8_t * volatile key_1 = NULL;
+       uint8_t * volatile key_2 = NULL;
+
+       key_1 = rte_zmalloc("memcmp key_1", len * sizeof(uint8_t), 16);
+       if (key_1 == NULL)
+               return -1;
+
+       key_2 = rte_zmalloc("memcmp key_2", len * sizeof(uint8_t), 16);
+       if (key_2 == NULL) {
+               /* Do not leak the first buffer on a partial failure */
+               rte_free(key_1);
+               return -1;
+       }
+
+       for (i = 0; i < len; i++)
+               key_1[i] = (uint8_t) rte_rand();
+
+       /* Give the second buffer identical contents at a different address */
+       memcpy(key_2, key_1, len);
+
+       rc = rte_memcmp(key_1, key_2, len);
+       rte_free(key_1);
+       rte_free(key_2);
+
+       return rc;
+}
+
+/*
+ * Run the equality test for every length listed in memcmp_sizes[].
+ *
+ * Returns 0 when every length compares equal, 1 on the first failure.
+ */
+static int run_memcmp_eq_func_tests(void)
+{
+       unsigned i;
+
+       for (i = 0;
+            i < sizeof(memcmp_sizes) / sizeof(memcmp_sizes[0]);
+            i++) {
+               if (run_memcmp_eq_func_test(memcmp_sizes[i])) {
+                       /* size_t is unsigned, so it is printed with %zu */
+                       printf("Comparing equal %zu bytes failed\n",
+                              memcmp_sizes[i]);
+                       return 1;
+               }
+       }
+       printf("RTE memcmp for equality successful\n");
+       return 0;
+}
+
+/*
+ * Run one less-than check of rte_memcmp() on 'len' bytes.
+ *
+ * key_1 is filled with its own byte index (truncated to 8 bits) and key_2
+ * with the constant 2, so the buffers first differ at offset 0 (0 versus 2)
+ * and key_1 is expected to compare as less than key_2.
+ *
+ * Returns the rte_memcmp() result, which is negative on success. If a buffer
+ * cannot be allocated a positive value is returned, so that the caller's
+ * "result < 0" check reports the failure instead of counting it as a pass.
+ */
+static int run_memcmp_lt_func_test(uint32_t len)
+{
+       int rc;
+       uint32_t i;
+       uint8_t * volatile key_1 = NULL;
+       uint8_t * volatile key_2 = NULL;
+
+       key_1 = rte_zmalloc("memcmp key_1", len * sizeof(uint8_t), 16);
+       if (key_1 == NULL)
+               return 1;
+
+       key_2 = rte_zmalloc("memcmp key_2", len * sizeof(uint8_t), 16);
+       if (key_2 == NULL) {
+               /* Do not leak the first buffer on a partial failure */
+               rte_free(key_1);
+               return 1;
+       }
+
+       for (i = 0; i < len; i++)
+               key_1[i] = i;
+
+       for (i = 0; i < len; i++)
+               key_2[i] = 2;
+
+       rc = rte_memcmp(key_1, key_2, len);
+       rte_free(key_1);
+       rte_free(key_2);
+
+       return rc;
+}
+
+/*
+ * Run the less-than test for every length listed in memcmp_sizes[].
+ *
+ * Returns 0 when every length yields a negative comparison result, 1 on the
+ * first length that does not.
+ */
+static int run_memcmp_lt_func_tests(void)
+{
+       unsigned i;
+
+       for (i = 0;
+            i < sizeof(memcmp_sizes) / sizeof(memcmp_sizes[0]);
+            i++) {
+               if (!(run_memcmp_lt_func_test(memcmp_sizes[i]) < 0)) {
+                       /* size_t is unsigned, so it is printed with %zu */
+                       printf("Comparing less than for %zu bytes failed\n",
+                              memcmp_sizes[i]);
+                       return 1;
+               }
+       }
+       printf("RTE memcmp for less than successful\n");
+       return 0;
+}
+
+/*
+ * Run one greater-than check of rte_memcmp() on 'len' bytes.
+ *
+ * key_1 is filled with the constant 2 and key_2 with its own byte index
+ * (truncated to 8 bits), so the buffers first differ at offset 0 (2 versus 0)
+ * and key_1 is expected to compare as greater than key_2.
+ *
+ * Returns the rte_memcmp() result, which is positive on success, or -1 if a
+ * buffer could not be allocated (which the caller's "result > 0" check
+ * reports as a failure).
+ */
+static int run_memcmp_gt_func_test(uint32_t len)
+{
+       int rc;
+       uint32_t i;
+       uint8_t * volatile key_1 = NULL;
+       uint8_t * volatile key_2 = NULL;
+
+       key_1 = rte_zmalloc("memcmp key_1", len * sizeof(uint8_t), 16);
+       if (key_1 == NULL)
+               return -1;
+
+       key_2 = rte_zmalloc("memcmp key_2", len * sizeof(uint8_t), 16);
+       if (key_2 == NULL) {
+               /* Do not leak the first buffer on a partial failure */
+               rte_free(key_1);
+               return -1;
+       }
+
+       for (i = 0; i < len; i++)
+               key_1[i] = 2;
+
+       for (i = 0; i < len; i++)
+               key_2[i] = i;
+
+       rc = rte_memcmp(key_1, key_2, len);
+       rte_free(key_1);
+       rte_free(key_2);
+
+       return rc;
+}
+
+/*
+ * Run the greater-than test for every length listed in memcmp_sizes[].
+ *
+ * Returns 0 when every length yields a positive comparison result, 1 on the
+ * first length that does not.
+ */
+static int run_memcmp_gt_func_tests(void)
+{
+       unsigned i;
+
+       for (i = 0;
+            i < sizeof(memcmp_sizes) / sizeof(memcmp_sizes[0]);
+            i++) {
+               if (!(run_memcmp_gt_func_test(memcmp_sizes[i]) > 0)) {
+                       /* size_t is unsigned, so it is printed with %zu */
+                       printf("Comparing greater than for %zu bytes failed\n",
+                              memcmp_sizes[i]);
+                       return 1;
+               }
+       }
+       printf("RTE memcmp for greater than successful\n");
+       return 0;
+}
+
+/*
+ * Entry point of the memcmp functional autotest.
+ *
+ * Runs the equal, greater-than and less-than test groups in that order and
+ * stops at the first group that fails.
+ *
+ * Returns 0 when all groups pass, -1 otherwise.
+ */
+static int
+test_memcmp(void)
+{
+       /* || short-circuits, so later groups are skipped after a failure */
+       int failed = run_memcmp_eq_func_tests() ||
+                    run_memcmp_gt_func_tests() ||
+                    run_memcmp_lt_func_tests();
+
+       return failed ? -1 : 0;
+}
+
+static struct test_command memcmp_cmd = {
+       .command = "memcmp_autotest", /* matches autotest_data.py "Command" */
+       .callback = test_memcmp, /* runs the eq, gt and lt test groups */
+};
+REGISTER_TEST_COMMAND(memcmp_cmd);
diff --git a/app/test/test_memcmp_perf.c b/app/test/test_memcmp_perf.c
new file mode 100644
index 0000000..8b7a0c4
--- /dev/null
+++ b/app/test/test_memcmp_perf.c
@@ -0,0 +1,339 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <stdio.h>
+#include <stdint.h>
+#include <string.h>
+#include <stdlib.h>
+#include <stdarg.h>
+#include <errno.h>
+#include <sys/queue.h>
+#include <sys/times.h>
+
+#include <rte_common.h>
+#include <rte_malloc.h>
+#include <rte_cycles.h>
+#include <rte_random.h>
+#include <rte_memory.h>
+#include <rte_memcmp.h>
+
+#include "test.h"
+
+/*******************************************************************************
+ * Memcmp function performance test configuration section. Each performance
+ * test will be performed MEMCMP_ITERATIONS times.
+ *
+ * The two arrays below control which buffer lengths are measured:
+ * memcmp_sizes[] for the equal tests and memcmp_lt_gt_sizes[] for the
+ * less-than and greater-than tests.
+ */
+/* Parenthesised so the macro expands safely inside larger expressions. */
+#define MEMCMP_ITERATIONS (500 * 500 * 500)

+static size_t memcmp_sizes[] = {
+       2, 5, 8, 9, 15, 16, 17, 31, 32, 33, 63, 64, 65, 127, 128,
+       129, 191, 192, 193, 255, 256, 257, 319, 320, 321, 383, 384, 385, 447,
+       448, 449, 511, 512, 513, 767, 768, 769, 1023, 1024, 1025, 1522, 1536,
+       1600, 2048, 2560, 3072, 3584, 4096, 4608, 5632, 6144, 6656, 7168, 7680,
+       8192, 16384
+};

+static size_t memcmp_lt_gt_sizes[] = {
+       16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192};
+
+/******************************************************************************/
+
+/*
+ * Time 'iterations' comparisons of a 'len'-byte buffer against itself.
+ *
+ * func_type selects the function under test: 1 for rte_memcmp(), any other
+ * value for libc memcmp(). The average number of TSC ticks per call is
+ * printed.
+ *
+ * Returns 0 when every comparison reported equality, 1 if any comparison
+ * returned non-zero, and -1 if the buffer could not be allocated.
+ */
+static int
+run_single_memcmp_eq_perf_test(uint32_t len, int func_type,
+                                       uint64_t iterations)
+{
+       /* Cycle counts stay in integers; only the final average is a double */
+       uint64_t begin, cycles;
+       uint64_t i, j;
+       int mismatch = 0;
+       uint8_t * volatile key = NULL;
+
+       key = rte_zmalloc("memcmp key", len * sizeof(uint8_t), 16);
+       if (key == NULL)
+               return -1;
+
+       /* Prepare inputs for the current iteration */
+       for (j = 0; j < len; j++)
+               key[j] = j / 64;
+
+       begin = rte_rdtsc();
+
+       /*
+        * Perform operation, and measure time it takes. Every result is
+        * folded into 'mismatch' so that each call's return value is used.
+        */
+       for (i = 0; i < iterations; i++) {
+
+               if (func_type == 1)
+                       mismatch |= (rte_memcmp(key, key, len) != 0);
+               else
+                       mismatch |= (memcmp(key, key, len) != 0);
+       }
+
+       cycles = rte_rdtsc() - begin;
+
+       printf(" *** %10u, %10.4f ***\n", (unsigned)len,
+              (double)cycles / iterations);
+
+       rte_free(key);
+
+       return mismatch;
+}
+
+/*
+ * Measure equal-buffer comparisons for every length in memcmp_sizes[],
+ * first with func_type 1 (rte_memcmp) and then with func_type 2 (memcmp).
+ *
+ * Returns 0 on success, -1 as soon as one measurement returns non-zero.
+ */
+static int run_all_memcmp_eq_perf_tests(void)
+{
+       static const char * const banners[] = {
+               " *** RTE memcmp equal performance test results ***\n",
+               " *** memcmp equal performance test results ***\n",
+       };
+       const unsigned num_sizes =
+               sizeof(memcmp_sizes) / sizeof(memcmp_sizes[0]);
+       unsigned pass, idx;
+
+       /* pass 0 uses func_type 1, pass 1 uses func_type 2 */
+       for (pass = 0; pass < 2; pass++) {
+               printf("%s", banners[pass]);
+               printf(" *** Length (bytes), Ticks/Op. ***\n");
+
+               /* Loop through every combination of test parameters */
+               for (idx = 0; idx < num_sizes; idx++) {
+                       if (run_single_memcmp_eq_perf_test(memcmp_sizes[idx],
+                                       pass + 1, MEMCMP_ITERATIONS) != 0)
+                               return -1;
+               }
+       }
+       return 0;
+}
+
+/*
+ * Time 'iterations' less-than comparisons of two 'len'-byte buffers.
+ *
+ * Both buffers are filled with 1; key_2 then gets a 2 at offset len / 2, so
+ * key_1 is expected to compare as less than key_2. func_type selects the
+ * function under test: 1 for rte_memcmp(), any other value for libc
+ * memcmp(). The average number of TSC ticks per call is printed.
+ *
+ * Returns 0 on success, -1 if allocation fails or if any comparison does not
+ * return a negative value. Both buffers are released on every path.
+ */
+static int
+run_single_memcmp_lt_perf_test(uint32_t len, int func_type,
+                                       uint64_t iterations)
+{
+       /* Cycle counts stay in integers; only the final average is a double */
+       uint64_t begin, cycles;
+       uint64_t i, j;
+       int rc = -1;
+       uint8_t * volatile key_1 = NULL;
+       uint8_t * volatile key_2 = NULL;
+
+       key_1 = rte_zmalloc("memcmp key_1", len * sizeof(uint8_t), 16);
+       if (key_1 == NULL)
+               return -1;
+
+       key_2 = rte_zmalloc("memcmp key_2", len * sizeof(uint8_t), 16);
+       if (key_2 == NULL)
+               goto out;
+
+       /* Prepare inputs for the current iteration */
+       for (j = 0; j < len; j++)
+               key_1[j] = 1;
+
+       for (j = 0; j < len; j++)
+               key_2[j] = 1;
+
+       key_2[len / 2] = 2;
+
+       begin = rte_rdtsc();
+
+       /* Perform operation, and measure time it takes */
+       for (i = 0; i < iterations; i++) {
+
+               if (func_type == 1) {
+                       if (!(rte_memcmp(key_1, key_2, len) < 0))
+                               goto out;
+               } else {
+                       if (!(memcmp(key_1, key_2, len) < 0))
+                               goto out;
+               }
+       }
+
+       cycles = rte_rdtsc() - begin;
+
+       printf(" *** %10u, %10.4f ***\n", (unsigned)len,
+              (double)cycles / iterations);
+
+       rc = 0;
+
+out:
+       /* rte_free() accepts NULL, so a failed key_2 allocation is fine here */
+       rte_free(key_1);
+       rte_free(key_2);
+
+       return rc;
+}
+
+/*
+ * Measure less-than comparisons for every length in memcmp_lt_gt_sizes[],
+ * first with func_type 1 (rte_memcmp) and then with func_type 2 (memcmp).
+ *
+ * Returns 0 on success, -1 as soon as one measurement returns non-zero.
+ */
+static int run_all_memcmp_lt_perf_tests(void)
+{
+       static const char * const banners[] = {
+               " *** RTE memcmp less than performance test results ***\n",
+               " *** memcmp less than performance test results ***\n",
+       };
+       const unsigned num_sizes =
+               sizeof(memcmp_lt_gt_sizes) / sizeof(memcmp_lt_gt_sizes[0]);
+       unsigned pass, idx;
+
+       /* pass 0 uses func_type 1, pass 1 uses func_type 2 */
+       for (pass = 0; pass < 2; pass++) {
+               printf("%s", banners[pass]);
+               printf(" *** Length (bytes), Ticks/Op. ***\n");
+
+               /* Loop through every combination of test parameters */
+               for (idx = 0; idx < num_sizes; idx++) {
+                       if (run_single_memcmp_lt_perf_test(
+                                       memcmp_lt_gt_sizes[idx],
+                                       pass + 1, MEMCMP_ITERATIONS) != 0)
+                               return -1;
+               }
+       }
+       return 0;
+}
+
+/*
+ * Time 'iterations' greater-than comparisons of two 'len'-byte buffers.
+ *
+ * Both buffers are filled with 1; key_1 then gets a 2 at offset len / 2, so
+ * key_1 is expected to compare as greater than key_2. func_type selects the
+ * function under test: 1 for rte_memcmp(), any other value for libc
+ * memcmp(). The average number of TSC ticks per call is printed.
+ *
+ * Returns 0 on success, -1 if allocation fails or if any comparison does not
+ * return a positive value. Both buffers are released on every path.
+ */
+static int
+run_single_memcmp_gt_perf_test(uint32_t len, int func_type,
+                                       uint64_t iterations)
+{
+       /* Cycle counts stay in integers; only the final average is a double */
+       uint64_t begin, cycles;
+       uint64_t i, j;
+       int rc = -1;
+       uint8_t * volatile key_1 = NULL;
+       uint8_t * volatile key_2 = NULL;
+
+       key_1 = rte_zmalloc("memcmp key_1", len * sizeof(uint8_t), 16);
+       if (key_1 == NULL)
+               return -1;
+
+       key_2 = rte_zmalloc("memcmp key_2", len * sizeof(uint8_t), 16);
+       if (key_2 == NULL)
+               goto out;
+
+       /* Prepare inputs for the current iteration */
+       for (j = 0; j < len; j++)
+               key_1[j] = 1;
+       key_1[len / 2] = 2;
+
+       for (j = 0; j < len; j++)
+               key_2[j] = 1;
+
+       begin = rte_rdtsc();
+
+       /* Perform operation, and measure time it takes */
+       for (i = 0; i < iterations; i++) {
+
+               if (func_type == 1) {
+                       if (!(rte_memcmp(key_1, key_2, len) > 0))
+                               goto out;
+               } else {
+                       if (!(memcmp(key_1, key_2, len) > 0))
+                               goto out;
+               }
+       }
+
+       cycles = rte_rdtsc() - begin;
+
+       printf(" *** %10u, %10.4f ***\n", (unsigned)len,
+              (double)cycles / iterations);
+
+       rc = 0;
+
+out:
+       /* rte_free() accepts NULL, so a failed key_2 allocation is fine here */
+       rte_free(key_1);
+       rte_free(key_2);
+
+       return rc;
+}
+
+/*
+ * Measure greater-than comparisons for every length in memcmp_lt_gt_sizes[],
+ * first with func_type 1 (rte_memcmp) and then with func_type 2 (memcmp).
+ *
+ * Returns 0 on success, -1 as soon as one measurement returns non-zero.
+ */
+static int run_all_memcmp_gt_perf_tests(void)
+{
+       static const char * const banners[] = {
+               " *** RTE memcmp greater than performance test results ***\n",
+               " *** memcmp greater than performance test results ***\n",
+       };
+       const unsigned num_sizes =
+               sizeof(memcmp_lt_gt_sizes) / sizeof(memcmp_lt_gt_sizes[0]);
+       unsigned pass, idx;
+
+       /* pass 0 uses func_type 1, pass 1 uses func_type 2 */
+       for (pass = 0; pass < 2; pass++) {
+               printf("%s", banners[pass]);
+               printf(" *** Length (bytes), Ticks/Op. ***\n");
+
+               /* Loop through every combination of test parameters */
+               for (idx = 0; idx < num_sizes; idx++) {
+                       if (run_single_memcmp_gt_perf_test(
+                                       memcmp_lt_gt_sizes[idx],
+                                       pass + 1, MEMCMP_ITERATIONS) != 0)
+                               return -1;
+               }
+       }
+       return 0;
+}
+
+/*
+ * Entry point of the memcmp performance autotest.
+ *
+ * Runs the equal, less-than and greater-than measurement groups in that
+ * order and stops at the first group that fails.
+ *
+ * Returns 0 when all groups succeed, -1 otherwise.
+ */
+static int
+test_memcmp_perf(void)
+{
+       /* || short-circuits, so later groups are skipped after a failure */
+       int failed = run_all_memcmp_eq_perf_tests() != 0 ||
+                    run_all_memcmp_lt_perf_tests() != 0 ||
+                    run_all_memcmp_gt_perf_tests() != 0;
+
+       return failed ? -1 : 0;
+}
+
+static struct test_command memcmp_perf_cmd = {
+       .command = "memcmp_perf_autotest", /* matches autotest_data.py "Command" */
+       .callback = test_memcmp_perf, /* runs the eq, lt and gt perf groups */
+};
+REGISTER_TEST_COMMAND(memcmp_perf_cmd);
diff --git a/lib/librte_eal/common/include/arch/ppc_64/rte_memcmp.h 
b/lib/librte_eal/common/include/arch/ppc_64/rte_memcmp.h
new file mode 100644
index 0000000..6e54f3b
--- /dev/null
+++ b/lib/librte_eal/common/include/arch/ppc_64/rte_memcmp.h
@@ -0,0 +1,62 @@
+/*
+ *   BSD LICENSE
+ *
+ *   Copyright (C) IBM Corporation 2015.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of IBM Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef _RTE_MEMCMP_PPC_64_H_
+#define _RTE_MEMCMP_PPC_64_H_
+
+#include <stdint.h>
+#include <string.h>
+/* To include altivec.h, the GCC version must be >= 4.8 */
+#include <altivec.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "generic/rte_memcmp.h"
+
+/*
+ * Compare the first n bytes of dst and src.
+ *
+ * When n is a compile-time constant the call goes straight to memcmp();
+ * otherwise it goes through rte_memcmp_func(). Both paths yield the
+ * memcmp() result: negative, zero or positive.
+ */
+#define rte_memcmp(dst, src, n)              \
+       ({ (__builtin_constant_p(n)) ?       \
+       memcmp((dst), (src), (n)) :          \
+       rte_memcmp_func((dst), (src), (n)); })
+
+/*
+ * Out-of-line-capable wrapper around memcmp() used by rte_memcmp() when n is
+ * not a compile-time constant.
+ *
+ * Returns int rather than a boolean so that the sign of the memcmp() result
+ * (less than / equal / greater than) is preserved. Neither buffer is
+ * written, so both pointers are const.
+ */
+static inline int
+rte_memcmp_func(const void *dst, const void *src, size_t n)
+{
+       return memcmp(dst, src, n);
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RTE_MEMCMP_PPC_64_H_ */
diff --git a/lib/librte_eal/common/include/arch/x86/rte_memcmp.h 
b/lib/librte_eal/common/include/arch/x86/rte_memcmp.h
new file mode 100644
index 0000000..085dfb2
--- /dev/null
+++ b/lib/librte_eal/common/include/arch/x86/rte_memcmp.h
@@ -0,0 +1,900 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2010-2015 Intel Corporation. All rights reserved.
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _RTE_MEMCMP_X86_64_H_
+#define _RTE_MEMCMP_X86_64_H_
+
+/**
+ * @file
+ *
+ * Functions for SSE/AVX/AVX2 implementation of memcmp().
+ */
+
+#include <stdio.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <rte_vect.h>
+#include <rte_branch_prediction.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * Compare bytes between two locations. The locations must not overlap.
+ *
+ * @param src_1
+ *   Pointer to the first source of the data.
+ * @param src_2
+ *   Pointer to the second source of the data.
+ * @param n
+ *   Number of bytes to compare.
+ * @return
+ *   zero if src_1 equal src_2
+ *   -ve if src_1 less than src_2
+ *   +ve if src_1 greater than src_2
+ */
+static inline int
+rte_memcmp(const void *src_1, const void *src_2,
+               size_t n) __attribute__((always_inline));
+
+/**
+ * Find the lowest bit position at which two 32-bit masks differ.
+ *
+ * @param x
+ *   First mask.
+ * @param y
+ *   Second mask.
+ * @return
+ *   Index (0..31) of the least significant bit that differs between x and y,
+ *   or -1 if the two masks are identical.
+ */
+static inline int
+rte_cmpffd (uint32_t x, uint32_t y)
+{
+       /*
+        * Keep everything unsigned: shifting a signed 1 left by 31 is
+        * undefined behaviour, and bit 31 is a valid answer here.
+        */
+       const uint32_t diff = x ^ y;
+       int i;
+
+       for (i = 0; i < 32; i++)
+               if (diff & (UINT32_C(1) << i))
+                       return i;
+       return -1;
+}
+
+/**
+ * Scan two byte buffers and report the first mismatch.
+ *
+ * @param x
+ *   First buffer.
+ * @param y
+ *   Second buffer.
+ * @param n
+ *   Number of bytes to examine.
+ * @return
+ *   x[i] - y[i] for the first index i at which the buffers differ,
+ *   or zero if the first n bytes are identical.
+ */
+static inline int
+rte_cmpffdb (const uint8_t *x, const uint8_t *y, size_t n)
+{
+       const uint8_t *const end = x + n;
+
+       while (x != end) {
+               if (*x != *y)
+                       return *x - *y;
+               x++;
+               y++;
+       }
+
+       return 0;
+}
+
+/**
+ * Compare 16 bytes between two locations.
+ * Locations should not overlap.
+ *
+ * @return
+ *   zero if the 16 bytes are equal, otherwise the difference between the
+ *   first pair of bytes that differ (src_1 byte minus src_2 byte).
+ */
+static inline int
+rte_cmp16(const void *src_1, const void *src_2)
+{
+       const __m128i lhs = _mm_lddqu_si128((const __m128i *)src_1);
+       const __m128i rhs = _mm_lddqu_si128((const __m128i *)src_2);
+       const __m128i delta = _mm_xor_si128(lhs, rhs);
+
+       /* delta is all-zero exactly when the two blocks are identical. */
+       if (unlikely(!_mm_testz_si128(delta, delta))) {
+               /* Start with the low 8 bytes of each block ... */
+               uint64_t lhs_half = _mm_extract_epi64(lhs, 0);
+               uint64_t rhs_half = _mm_extract_epi64(rhs, 0);
+
+               /* ... and move to the high 8 bytes if the low ones match. */
+               if (lhs_half == rhs_half) {
+                       lhs_half = _mm_extract_epi64(lhs, 1);
+                       rhs_half = _mm_extract_epi64(rhs, 1);
+               }
+
+               return rte_cmpffdb((const uint8_t *)&lhs_half,
+                               (const uint8_t *)&rhs_half, 8);
+       }
+
+       return 0;
+}
+
+/**
+ * Compare 0 to 15 bytes between two locations.
+ * Locations should not overlap.
+ *
+ * Only the low four bits of n are examined: an 8-, 4-, 2- and 1-byte chunk
+ * is compared for each bit that is set, in that order. Chunks are loaded
+ * with memcpy() so that unaligned buffers and strict-aliasing rules are
+ * respected.
+ *
+ * @param src_1u
+ *   Pointer to the first source of the data.
+ * @param src_2u
+ *   Pointer to the second source of the data.
+ * @param n
+ *   Number of bytes to compare (callers pass a value below 16).
+ * @return
+ *   zero if the compared bytes are equal, otherwise the difference between
+ *   the first pair of bytes that differ (src_1u byte minus src_2u byte).
+ */
+static inline int
+rte_memcmp_regular(const uint8_t *src_1u, const uint8_t *src_2u, size_t n)
+{
+       if (n & 0x08) {
+               uint64_t lhs, rhs;
+
+               memcpy(&lhs, src_1u, sizeof(lhs));
+               memcpy(&rhs, src_2u, sizeof(rhs));
+               if (lhs != rhs)
+                       return rte_cmpffdb(src_1u, src_2u, 8);
+
+               src_1u += 8;
+               src_2u += 8;
+       }
+
+       if (n & 0x04) {
+               uint32_t lhs, rhs;
+
+               memcpy(&lhs, src_1u, sizeof(lhs));
+               memcpy(&rhs, src_2u, sizeof(rhs));
+               if (lhs != rhs)
+                       return rte_cmpffdb(src_1u, src_2u, 4);
+
+               src_1u += 4;
+               src_2u += 4;
+       }
+
+       if (n & 0x02) {
+               uint16_t lhs, rhs;
+
+               memcpy(&lhs, src_1u, sizeof(lhs));
+               memcpy(&rhs, src_2u, sizeof(rhs));
+               if (lhs != rhs)
+                       return rte_cmpffdb(src_1u, src_2u, 2);
+
+               src_1u += 2;
+               src_2u += 2;
+       }
+
+       /* Last odd byte: the difference is zero when the bytes match. */
+       if (n & 0x01)
+               return src_1u[0] - src_2u[0];
+
+       return 0;
+}
+
+/**
+ * AVX2 implementation below
+ */
+#ifdef RTE_MACHINE_CPUFLAG_AVX2
+
+/**
+ * Compare 32 bytes between two locations.
+ * Locations should not overlap.
+ *
+ * The 32 bytes are handled as two 16-byte blocks loaded with unaligned
+ * 128-bit loads.
+ *
+ * @param src_1
+ *   Pointer to the first source of the data.
+ * @param src_2
+ *   Pointer to the second source of the data.
+ * @return
+ *   zero if the 32 bytes are equal, otherwise the difference between the
+ *   first pair of bytes that differ (src_1 byte minus src_2 byte).
+ */
+static inline int
+rte_cmp32(const void *src_1, const void *src_2)
+{
+       const __m128i* src1 = (const __m128i*)src_1;
+       const __m128i* src2 = (const __m128i*)src_2;
+       const uint8_t *s1, *s2;
+
+       __m128i mm11 = _mm_lddqu_si128(src1);
+       __m128i mm12 = _mm_lddqu_si128(src1 + 1);
+       __m128i mm21 = _mm_lddqu_si128(src2);
+       __m128i mm22 = _mm_lddqu_si128(src2 + 1);
+
+       /* XOR of each block pair; OR-ing them is non-zero iff anything differs. */
+       __m128i mm1 = _mm_xor_si128(mm11, mm21);
+       __m128i mm2 = _mm_xor_si128(mm12, mm22);
+       __m128i mm = _mm_or_si128(mm1, mm2);
+
+       if (unlikely(!_mm_testz_si128(mm, mm))) {
+
+               /*
+                * Find out which of the two 16-byte blocks differs. If the
+                * first block is identical, continue with the second one and
+                * point s1/s2 at its bytes.
+                */
+               if (_mm_testz_si128(mm1, mm1)) {
+                       mm11 = mm12;
+                       mm21 = mm22;
+                       mm1 = mm2;
+                       s1 = (const uint8_t *)(src1 + 1);
+                       s2 = (const uint8_t *)(src2 + 1);
+               } else {
+                       s1 = (const uint8_t *)src1;
+                       s2 = (const uint8_t *)src2;
+               }
+
+               /*
+                * Produce the comparison result. For every byte lane, mm_cmp
+                * flags "first > second" and mm_rcmp flags "second > first";
+                * both are XOR-ed with the same difference vector, so after
+                * movemask the two bit masks disagree exactly in the lanes
+                * whose bytes differ.
+                */
+               __m128i mm_cmp = _mm_cmpgt_epi8(mm11, mm21);
+               __m128i mm_rcmp = _mm_cmpgt_epi8(mm21, mm11);
+               mm_cmp = _mm_xor_si128(mm1, mm_cmp);
+               mm_rcmp = _mm_xor_si128(mm1, mm_rcmp);
+
+               uint32_t cmp = _mm_movemask_epi8(mm_cmp);
+               uint32_t rcmp = _mm_movemask_epi8(mm_rcmp);
+
+               /* Lowest disagreeing bit == index of the first differing byte. */
+               int cmp_b = rte_cmpffd(cmp, rcmp);
+
+               int ret = (cmp_b == -1) ? 0 : (s1[cmp_b] - s2[cmp_b]);
+               return ret;
+       }
+
+       return 0;
+}
+
+/**
+ * Compare 48 bytes between two locations.
+ * Locations should not overlap.
+ *
+ * The first 32 bytes are checked with rte_cmp32(), the remaining 16 with
+ * rte_cmp16(); the first non-zero result is returned.
+ */
+static inline int
+rte_cmp48(const void *src_1, const void *src_2)
+{
+       const uint8_t *lhs = (const uint8_t *)src_1;
+       const uint8_t *rhs = (const uint8_t *)src_2;
+       const int head = rte_cmp32(lhs, rhs);
+
+       if (unlikely(head != 0))
+               return head;
+
+       return rte_cmp16(lhs + 32, rhs + 32);
+}
+
+/**
+ * Compare 64 bytes between two locations.
+ * Locations should not overlap.
+ *
+ * The 64 bytes are handled as two 32-byte blocks loaded with unaligned
+ * 256-bit loads.
+ *
+ * @param src_1
+ *   Pointer to the first source of the data.
+ * @param src_2
+ *   Pointer to the second source of the data.
+ * @return
+ *   zero if the 64 bytes are equal, otherwise the difference between the
+ *   first pair of bytes that differ (src_1 byte minus src_2 byte).
+ */
+static inline int
+rte_cmp64 (const void* src_1, const void* src_2)
+{
+       const __m256i* src1 = (const __m256i*)src_1;
+       const __m256i* src2 = (const __m256i*)src_2;
+       const uint8_t *s1, *s2;
+
+       __m256i mm11 = _mm256_lddqu_si256(src1);
+       __m256i mm12 = _mm256_lddqu_si256(src1 + 1);
+       __m256i mm21 = _mm256_lddqu_si256(src2);
+       __m256i mm22 = _mm256_lddqu_si256(src2 + 1);
+
+       /* XOR of each block pair; OR-ing them is non-zero iff anything differs. */
+       __m256i mm1 = _mm256_xor_si256(mm11, mm21);
+       __m256i mm2 = _mm256_xor_si256(mm12, mm22);
+       __m256i mm = _mm256_or_si256(mm1, mm2);
+
+       if (unlikely(!_mm256_testz_si256(mm, mm))) {
+               /*
+                * Find out which of the two 32-byte blocks differs. If the
+                * first block is identical, continue with the second one and
+                * point s1/s2 at its bytes.
+                */
+               if (_mm256_testz_si256(mm1, mm1)) {
+                       mm11 = mm12;
+                       mm21 = mm22;
+                       mm1 = mm2;
+                       s1 = (const uint8_t *)(src1 + 1);
+                       s2 = (const uint8_t *)(src2 + 1);
+               } else {
+                       s1 = (const uint8_t *)src1;
+                       s2 = (const uint8_t *)src2;
+               }
+
+               /*
+                * Produce the comparison result. For every byte lane, mm_cmp
+                * flags "first > second" and mm_rcmp flags "second > first";
+                * both are XOR-ed with the same difference vector, so after
+                * movemask the two 32-bit masks disagree exactly in the lanes
+                * whose bytes differ.
+                */
+               __m256i mm_cmp = _mm256_cmpgt_epi8(mm11, mm21);
+               __m256i mm_rcmp = _mm256_cmpgt_epi8(mm21, mm11);
+               mm_cmp = _mm256_xor_si256(mm1, mm_cmp);
+               mm_rcmp = _mm256_xor_si256(mm1, mm_rcmp);
+
+               uint32_t cmp = _mm256_movemask_epi8(mm_cmp);
+               uint32_t rcmp = _mm256_movemask_epi8(mm_rcmp);
+
+               /* Lowest disagreeing bit == index of the first differing byte. */
+               int cmp_b = rte_cmpffd(cmp, rcmp);
+
+               int ret = (cmp_b == -1) ? 0 : (s1[cmp_b] - s2[cmp_b]);
+               return ret;
+       }
+
+       return 0;
+}
+
+/**
+ * Compare 128 bytes between two locations.
+ * Locations should not overlap.
+ *
+ * The data is checked as two consecutive 64-byte blocks; the first
+ * non-zero rte_cmp64() result is returned.
+ */
+static inline int
+rte_cmp128(const void *src_1, const void *src_2)
+{
+       const uint8_t *lhs = (const uint8_t *)src_1;
+       const uint8_t *rhs = (const uint8_t *)src_2;
+       const int head = rte_cmp64(lhs, rhs);
+
+       if (unlikely(head != 0))
+               return head;
+
+       return rte_cmp64(lhs + 64, rhs + 64);
+}
+
+/**
+ * Compare 256 bytes between two locations.
+ * Locations should not overlap.
+ *
+ * The data is checked as four consecutive 64-byte blocks; the first
+ * non-zero rte_cmp64() result is returned.
+ */
+static inline int
+rte_cmp256(const void *src_1, const void *src_2)
+{
+       const uint8_t *lhs = (const uint8_t *)src_1;
+       const uint8_t *rhs = (const uint8_t *)src_2;
+       size_t off;
+
+       for (off = 0; off < 256; off += 64) {
+               const int diff = rte_cmp64(lhs + off, rhs + off);
+
+               if (unlikely(diff != 0))
+                       return diff;
+       }
+
+       return 0;
+}
+
+/**
+ * Compare bytes between two locations. The locations must not overlap.
+ *
+ * Sizes below 16 bytes go through rte_memcmp_regular(). Larger sizes are
+ * split into fixed-size vector compares; a trailing partial chunk is handled
+ * by re-comparing the last 16 or 32 bytes of the buffers, which may overlap
+ * bytes that were already found equal.
+ *
+ * @param _src_1
+ *   Pointer to the first source of the data.
+ * @param _src_2
+ *   Pointer to the second source of the data.
+ * @param n
+ *   Number of bytes to compare.
+ * @return
+ *   zero if src_1 equal src_2
+ *   -ve if src_1 less than src_2
+ *   +ve if src_1 greater than src_2
+ */
+static inline int
+rte_memcmp(const void *_src_1, const void *_src_2, size_t n)
+{
+       const uint8_t *src_1 = (const uint8_t *)_src_1;
+       const uint8_t *src_2 = (const uint8_t *)_src_2;
+       int ret = 0;
+
+       if (n < 16)
+               return rte_memcmp_regular(src_1, src_2, n);
+
+       /* 16 <= n <= 32: first 16 bytes, then the last 16 bytes. */
+       if (n <= 32) {
+               ret = rte_cmp16(src_1, src_2);
+               if (unlikely(ret != 0))
+                       return ret;
+
+               return rte_cmp16(src_1 - 16 + n, src_2 - 16 + n);
+       }
+
+       /* 32 < n <= 48: first 32 bytes, then the last 16 bytes. */
+       if (n <= 48) {
+               ret = rte_cmp32(src_1, src_2);
+               if (unlikely(ret != 0))
+                       return ret;
+
+               return rte_cmp16(src_1 - 16 + n, src_2 - 16 + n);
+       }
+
+       /* 48 < n <= 64: bytes 0..47, then the last 16 bytes. */
+       if (n <= 64) {
+               ret = rte_cmp32(src_1, src_2);
+               if (unlikely(ret != 0))
+                       return ret;
+
+               ret = rte_cmp16(src_1 + 32, src_2 + 32);
+
+               if (unlikely(ret != 0))
+                       return ret;
+
+               return rte_cmp16(src_1 - 16 + n, src_2 - 16 + n);
+       }
+
+       /* 64 < n <= 96: bytes 0..79, then the last 16 bytes. */
+       if (n <= 96) {
+               ret = rte_cmp64(src_1, src_2);
+               if (unlikely(ret != 0))
+                       return ret;
+
+               ret = rte_cmp16(src_1 + 64, src_2 + 64);
+               if (unlikely(ret != 0))
+                       return ret;
+
+               return rte_cmp16(src_1 - 16 + n, src_2 - 16 + n);
+       }
+
+       /* 96 < n <= 128: bytes 0..95, then the last 32 bytes. */
+       if (n <= 128) {
+               ret = rte_cmp64(src_1, src_2);
+               if (unlikely(ret != 0))
+                       return ret;
+
+               ret = rte_cmp32(src_1 + 64, src_2 + 64);
+               if (unlikely(ret != 0))
+                       return ret;
+
+               /*
+                * Do not compare a fixed 16-byte block at offset 96 here:
+                * it would read past the end of the buffers whenever
+                * n < 112. The last 32 bytes start at or before offset 96
+                * and end exactly at n.
+                */
+               return rte_cmp32(src_1 - 32 + n, src_2 - 32 + n);
+       }
+
+CMP_BLOCK_LESS_THAN_512:
+       if (n <= 512) {
+               if (n >= 256) {
+                       ret = rte_cmp256(src_1, src_2);
+                       if (unlikely(ret != 0))
+                               return ret;
+                       src_1 = src_1 + 256;
+                       src_2 = src_2 + 256;
+                       n -= 256;
+               }
+               if (n >= 128) {
+                       ret = rte_cmp128(src_1, src_2);
+                       if (unlikely(ret != 0))
+                               return ret;
+                       src_1 = src_1 + 128;
+                       src_2 = src_2 + 128;
+                       n -= 128;
+               }
+               if (n >= 64) {
+                       n -= 64;
+                       ret = rte_cmp64(src_1, src_2);
+                       if (unlikely(ret != 0))
+                               return ret;
+                       src_1 = src_1 + 64;
+                       src_2 = src_2 + 64;
+               }
+               if (n > 32) {
+                       ret = rte_cmp32(src_1, src_2);
+                       if (unlikely(ret != 0))
+                               return ret;
+                       ret = rte_cmp32(src_1 - 32 + n, src_2 - 32 + n);
+                       return ret;
+               }
+               /*
+                * 1..32 bytes left: compare the 32 bytes that end at the
+                * end of the buffers. At least 64 bytes have already been
+                * consumed on every path that reaches this point, so
+                * stepping back before src_1/src_2 stays inside the buffers.
+                */
+               if (n > 0)
+                       ret = rte_cmp32(src_1 - 32 + n, src_2 - 32 + n);
+
+               return ret;
+       }
+
+       /* More than 512 bytes: consume 512 per iteration, then do the tail. */
+       while (n > 512) {
+               ret = rte_cmp256(src_1 + 0 * 256, src_2 + 0 * 256);
+               if (unlikely(ret != 0))
+                       return ret;
+
+               ret = rte_cmp256(src_1 + 1 * 256, src_2 + 1 * 256);
+               if (unlikely(ret != 0))
+                       return ret;
+
+               src_1 = src_1 + 512;
+               src_2 = src_2 + 512;
+               n -= 512;
+       }
+       goto CMP_BLOCK_LESS_THAN_512;
+}
+
+#else /* RTE_MACHINE_CPUFLAG_AVX2 */
+
+/**
+ * Compare 32 bytes between two locations.
+ * Locations should not overlap.
+ *
+ * The data is checked as two consecutive 16-byte blocks; the first
+ * non-zero rte_cmp16() result is returned.
+ */
+static inline int
+rte_cmp32(const void *src_1, const void *src_2)
+{
+       const uint8_t *lhs = (const uint8_t *)src_1;
+       const uint8_t *rhs = (const uint8_t *)src_2;
+       const int head = rte_cmp16(lhs, rhs);
+
+       if (unlikely(head != 0))
+               return head;
+
+       return rte_cmp16(lhs + 16, rhs + 16);
+}
+
+/**
+ * Compare 48 bytes between two locations.
+ * Locations should not overlap.
+ *
+ * The data is checked as three consecutive 16-byte blocks; the first
+ * non-zero rte_cmp16() result is returned.
+ */
+static inline int
+rte_cmp48(const void *src_1, const void *src_2)
+{
+       const uint8_t *lhs = (const uint8_t *)src_1;
+       const uint8_t *rhs = (const uint8_t *)src_2;
+       size_t off;
+
+       for (off = 0; off < 48; off += 16) {
+               const int diff = rte_cmp16(lhs + off, rhs + off);
+
+               if (unlikely(diff != 0))
+                       return diff;
+       }
+
+       return 0;
+}
+
+/**
+ * Compare 64 bytes between two locations.
+ * Locations should not overlap.
+ *
+ * The data is checked as four consecutive 16-byte blocks; the first
+ * non-zero rte_cmp16() result is returned.
+ */
+static inline int
+rte_cmp64(const void *src_1, const void *src_2)
+{
+       const uint8_t *lhs = (const uint8_t *)src_1;
+       const uint8_t *rhs = (const uint8_t *)src_2;
+       size_t off;
+
+       for (off = 0; off < 64; off += 16) {
+               const int diff = rte_cmp16(lhs + off, rhs + off);
+
+               if (unlikely(diff != 0))
+                       return diff;
+       }
+
+       return 0;
+}
+
+/**
+ * Compare 128 bytes between two locations.
+ * Locations should not overlap.
+ *
+ * The data is checked as eight consecutive 16-byte blocks; the first
+ * non-zero rte_cmp16() result is returned.
+ */
+static inline int
+rte_cmp128(const void *src_1, const void *src_2)
+{
+       const uint8_t *lhs = (const uint8_t *)src_1;
+       const uint8_t *rhs = (const uint8_t *)src_2;
+       size_t off;
+
+       for (off = 0; off < 128; off += 16) {
+               const int diff = rte_cmp16(lhs + off, rhs + off);
+
+               if (unlikely(diff != 0))
+                       return diff;
+       }
+
+       return 0;
+}
+
+/**
+ * Compare 256 bytes between two locations.
+ * Locations should not overlap.
+ *
+ * The data is checked as sixteen consecutive 16-byte blocks; the first
+ * non-zero rte_cmp16() result is returned.
+ */
+static inline int
+rte_cmp256(const void *src_1, const void *src_2)
+{
+       const uint8_t *lhs = (const uint8_t *)src_1;
+       const uint8_t *rhs = (const uint8_t *)src_2;
+       size_t off;
+
+       for (off = 0; off < 256; off += 16) {
+               const int diff = rte_cmp16(lhs + off, rhs + off);
+
+               if (unlikely(diff != 0))
+                       return diff;
+       }
+
+       return 0;
+}
+
+/**
+ * Compare bytes between two locations. The locations must not overlap.
+ *
+ * Sizes below 16 bytes go through rte_memcmp_regular(). Larger sizes are
+ * split into fixed-size compares built on rte_cmp16(); a trailing partial
+ * chunk is handled by re-comparing the last 16 bytes of the buffers, which
+ * may overlap bytes that were already found equal.
+ *
+ * @param _src_1
+ *   Pointer to the first source of the data.
+ * @param _src_2
+ *   Pointer to the second source of the data.
+ * @param n
+ *   Number of bytes to compare.
+ * @return
+ *   zero if src_1 equal src_2
+ *   -ve if src_1 less than src_2
+ *   +ve if src_1 greater than src_2
+ */
+static inline int
+rte_memcmp(const void *_src_1, const void *_src_2, size_t n)
+{
+       const uint8_t *src_1 = (const uint8_t *)_src_1;
+       const uint8_t *src_2 = (const uint8_t *)_src_2;
+       int ret = 0;
+
+       if (n < 16)
+               return rte_memcmp_regular(src_1, src_2, n);
+
+       /* 16 <= n <= 32: first 16 bytes, then the last 16 bytes. */
+       if (n <= 32) {
+               ret = rte_cmp16(src_1, src_2);
+               if (unlikely(ret != 0))
+                       return ret;
+
+               return rte_cmp16(src_1 - 16 + n, src_2 - 16 + n);
+       }
+
+       /* 32 < n <= 48: first 32 bytes, then the last 16 bytes. */
+       if (n <= 48) {
+               ret = rte_cmp32(src_1, src_2);
+               if (unlikely(ret != 0))
+                       return ret;
+
+               return rte_cmp16(src_1 - 16 + n, src_2 - 16 + n);
+       }
+
+       /* 48 < n <= 64: bytes 0..47, then the last 16 bytes. */
+       if (n <= 64) {
+               ret = rte_cmp32(src_1, src_2);
+               if (unlikely(ret != 0))
+                       return ret;
+
+               ret = rte_cmp16(src_1 + 32, src_2 + 32);
+
+               if (unlikely(ret != 0))
+                       return ret;
+
+               return rte_cmp16(src_1 - 16 + n, src_2 - 16 + n);
+       }
+
+       /* 64 < n <= 96: bytes 0..79, then the last 16 bytes. */
+       if (n <= 96) {
+               ret = rte_cmp64(src_1, src_2);
+               if (unlikely(ret != 0))
+                       return ret;
+
+               ret = rte_cmp16(src_1 + 64, src_2 + 64);
+               if (unlikely(ret != 0))
+                       return ret;
+
+               return rte_cmp16(src_1 - 16 + n, src_2 - 16 + n);
+       }
+
+       /*
+        * 96 < n <= 128: jump into the 64/32/16-byte tail of the block
+        * below, skipping the 256- and 128-byte steps.
+        */
+       if (n <= 128)
+               goto CMP_BLOCK_LESS_THAN_128;
+
+       if (n <= 512) {
+               if (n >= 256) {
+                       ret = rte_cmp256(src_1, src_2);
+                       if (unlikely(ret != 0))
+                               return ret;
+
+                       src_1 = src_1 + 256;
+                       src_2 = src_2 + 256;
+                       n -= 256;
+               }
+
+/* Also entered from the loop at the bottom once fewer than 256 bytes remain. */
+CMP_BLOCK_LESS_THAN_256:
+               if (n >= 128) {
+                       ret = rte_cmp128(src_1, src_2);
+                       if (unlikely(ret != 0))
+                               return ret;
+
+                       src_1 = src_1 + 128;
+                       src_2 = src_2 + 128;
+                       n -= 128;
+               }
+
+CMP_BLOCK_LESS_THAN_128:
+               if (n >= 64) {
+                       ret = rte_cmp64(src_1, src_2);
+                       if (unlikely(ret != 0))
+                               return ret;
+
+                       src_1 = src_1 + 64;
+                       src_2 = src_2 + 64;
+                       n -= 64;
+               }
+
+               if (n >= 32) {
+                       ret = rte_cmp32(src_1, src_2);
+                       if (unlikely(ret != 0))
+                               return ret;
+                       src_1 = src_1 + 32;
+                       src_2 = src_2 + 32;
+                       n -= 32;
+               }
+               /* 17..31 bytes left: next 16 bytes, then the last 16 bytes. */
+               if (n > 16) {
+                       ret = rte_cmp16(src_1, src_2);
+                       if (unlikely(ret != 0))
+                               return ret;
+                       ret = rte_cmp16(src_1 - 16 + n, src_2 - 16 + n);
+                       return ret;
+               }
+               /*
+                * 1..16 bytes left: compare the 16 bytes that end at the end
+                * of the buffers. Every path that reaches this point has
+                * already advanced the pointers by at least 64 bytes, so
+                * stepping back before src_1/src_2 stays inside the buffers.
+                */
+               if (n > 0)
+                       ret = rte_cmp16(src_1 - 16 + n, src_2 - 16 + n);
+
+               return ret;
+       }
+
+       /* More than 512 bytes: consume 256 per iteration until n < 256. */
+       for (; n >= 256; n -= 256) {
+               ret = rte_cmp256(src_1, src_2);
+               if (unlikely(ret != 0))
+                       return ret;
+
+               src_1 = src_1 + 256;
+               src_2 = src_2 + 256;
+       }
+
+       /* Finish the remaining 0..255 bytes with the shared tail code. */
+       goto CMP_BLOCK_LESS_THAN_256;
+}
+
+#endif /* RTE_MACHINE_CPUFLAG_AVX2 */
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RTE_MEMCMP_X86_64_H_ */
diff --git a/lib/librte_eal/common/include/generic/rte_memcmp.h b/lib/librte_eal/common/include/generic/rte_memcmp.h
new file mode 100644
index 0000000..5e68036
--- /dev/null
+++ b/lib/librte_eal/common/include/generic/rte_memcmp.h
@@ -0,0 +1,175 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2010-2015 Intel Corporation. All rights reserved.
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _RTE_MEMCMP_H_
+#define _RTE_MEMCMP_H_
+
+/**
+ * @file
+ *
+ * Functions for vectorised implementation of memcmp().
+ */
+
+/**
+ * Find the first different bit for comparison.
+ *
+ * @param x
+ *   First 32-bit value.
+ * @param y
+ *   Second 32-bit value.
+ */
+static inline int
+rte_cmpffd (uint32_t x, uint32_t y);
+
+/**
+ * Find the first different byte for comparison.
+ *
+ * @param x
+ *   Pointer to the first buffer.
+ * @param y
+ *   Pointer to the second buffer.
+ * @param n
+ *   Number of bytes to examine.
+ */
+static inline int
+rte_cmpffdb (const uint8_t *x, const uint8_t *y, size_t n);
+
+/**
+ * Compare 16 bytes between two locations using optimised
+ * instructions. The locations should not overlap.
+ *
+ * @param src_1
+ *   Pointer to the first source of the data.
+ * @param src_2
+ *   Pointer to the second source of the data.
+ * @return
+ *   zero if src_1 equal src_2
+ *   -ve if src_1 less than src_2
+ *   +ve if src_1 greater than src_2
+ */
+static inline int
+rte_cmp16(const void *src_1, const void *src_2);
+
+/**
+ * Compare 32 bytes between two locations using optimised
+ * instructions. The locations should not overlap.
+ *
+ * @param src_1
+ *   Pointer to the first source of the data.
+ * @param src_2
+ *   Pointer to the second source of the data.
+ * @return
+ *   zero if src_1 equal src_2
+ *   -ve if src_1 less than src_2
+ *   +ve if src_1 greater than src_2
+ */
+static inline int
+rte_cmp32(const void *src_1, const void *src_2);
+
+/**
+ * Compare 64 bytes between two locations using optimised
+ * instructions. The locations should not overlap.
+ *
+ * @param src_1
+ *   Pointer to the first source of the data.
+ * @param src_2
+ *   Pointer to the second source of the data.
+ * @return
+ *   zero if src_1 equal src_2
+ *   -ve if src_1 less than src_2
+ *   +ve if src_1 greater than src_2
+ */
+static inline int
+rte_cmp64(const void *src_1, const void *src_2);
+
+/**
+ * Compare 48 bytes between two locations using optimised
+ * instructions. The locations should not overlap.
+ *
+ * @param src_1
+ *   Pointer to the first source of the data.
+ * @param src_2
+ *   Pointer to the second source of the data.
+ * @return
+ *   zero if src_1 equal src_2
+ *   -ve if src_1 less than src_2
+ *   +ve if src_1 greater than src_2
+ */
+static inline int
+rte_cmp48(const void *src_1, const void *src_2);
+
+/**
+ * Compare 128 bytes between two locations using
+ * optimised instructions. The locations should not overlap.
+ *
+ * @param src_1
+ *   Pointer to the first source of the data.
+ * @param src_2
+ *   Pointer to the second source of the data.
+ * @return
+ *   zero if src_1 equal src_2
+ *   -ve if src_1 less than src_2
+ *   +ve if src_1 greater than src_2
+ */
+static inline int
+rte_cmp128(const void *src_1, const void *src_2);
+
+/**
+ * Compare 256 bytes between two locations using
+ * optimised instructions. The locations should not overlap.
+ *
+ * @param src_1
+ *   Pointer to the first source of the data.
+ * @param src_2
+ *   Pointer to the second source of the data.
+ * @return
+ *   zero if src_1 equal src_2
+ *   -ve if src_1 less than src_2
+ *   +ve if src_1 greater than src_2
+ */
+static inline int
+rte_cmp256(const void *src_1, const void *src_2);
+
+#ifdef __DOXYGEN__
+
+/**
+ * Compare bytes between two locations. The locations must not overlap.
+ *
+ * @note Depending on the architecture this may be implemented as a macro,
+ * so its address should not be taken and care is needed as parameter
+ * expressions may be evaluated multiple times.
+ *
+ * @param src_1
+ *   Pointer to the first source of the data.
+ * @param src_2
+ *   Pointer to the second source of the data.
+ * @param n
+ *   Number of bytes to compare.
+ * @return
+ *   zero if src_1 equal src_2
+ *   -ve if src_1 less than src_2
+ *   +ve if src_1 greater than src_2
+ */
+static int
+rte_memcmp(const void *src_1, const void *src_2, size_t n);
+
+#endif /* __DOXYGEN__ */
+
+/*
+ * memcmp()-based helper called by the rte_memcmp() macro on architectures
+ * that implement rte_memcmp() as a macro. Returns the memcmp()-style
+ * result: zero, negative or positive.
+ */
+static inline int
+rte_memcmp_func(void *dst, const void *src, size_t n)
+       __attribute__((always_inline));
+
+#endif /* _RTE_MEMCMP_H_ */
diff --git a/lib/librte_hash/rte_hash.c b/lib/librte_hash/rte_hash.c
index 9245716..075da62 100644
--- a/lib/librte_hash/rte_hash.c
+++ b/lib/librte_hash/rte_hash.c
@@ -42,6 +42,7 @@
 #include <rte_memory.h>         /* for definition of RTE_CACHE_LINE_SIZE */
 #include <rte_log.h>
 #include <rte_memcpy.h>
+#include <rte_memcmp.h>
 #include <rte_prefetch.h>
 #include <rte_branch_prediction.h>
 #include <rte_memzone.h>
@@ -299,6 +300,7 @@ __rte_hash_add_key_with_hash(const struct rte_hash *h,
        uint8_t *key_bucket;
        uint32_t bucket_index, i;
        int32_t pos;
+       const void * volatile key_1 = key;

        /* Get the hash signature and bucket index */
        sig |= h->sig_msb;
@@ -308,10 +310,13 @@ __rte_hash_add_key_with_hash(const struct rte_hash *h,

        /* Check if key is already present in the hash */
        for (i = 0; i < h->bucket_entries; i++) {
-               if ((sig == sig_bucket[i]) &&
-                   likely(memcmp(key, get_key_from_bucket(h, key_bucket, i),
-                                 h->key_len) == 0)) {
-                       return bucket_index * h->bucket_entries + i;
+               if (sig == sig_bucket[i]) {
+
+                       const void * volatile key_2 =
+                               get_key_from_bucket(h, key_bucket, i);
+
+                       if (likely(rte_memcmp(key_1, key_2, h->key_len) == 0))
+                               return bucket_index * h->bucket_entries + i;
                }
        }

@@ -350,6 +355,8 @@ __rte_hash_del_key_with_hash(const struct rte_hash *h,
        uint8_t *key_bucket;
        uint32_t bucket_index, i;

+       const void * volatile key_1 = key;
+
        /* Get the hash signature and bucket index */
        sig = sig | h->sig_msb;
        bucket_index = sig & h->bucket_bitmask;
@@ -358,11 +365,14 @@ __rte_hash_del_key_with_hash(const struct rte_hash *h,

        /* Check if key is already present in the hash */
        for (i = 0; i < h->bucket_entries; i++) {
-               if ((sig == sig_bucket[i]) &&
-                   likely(memcmp(key, get_key_from_bucket(h, key_bucket, i),
-                                 h->key_len) == 0)) {
-                       sig_bucket[i] = NULL_SIGNATURE;
-                       return bucket_index * h->bucket_entries + i;
+               if (sig == sig_bucket[i]) {
+                       const void * volatile key_2 =
+                               get_key_from_bucket(h, key_bucket, i);
+
+                       if (likely(rte_memcmp(key_1, key_2, h->key_len) == 0)) {
+                               sig_bucket[i] = NULL_SIGNATURE;
+                               return bucket_index * h->bucket_entries + i;
+                       }
                }
        }

@@ -392,6 +402,8 @@ __rte_hash_lookup_with_hash(const struct rte_hash *h,
        uint8_t *key_bucket;
        uint32_t bucket_index, i;

+       const void * volatile key_1 = key;
+
        /* Get the hash signature and bucket index */
        sig |= h->sig_msb;
        bucket_index = sig & h->bucket_bitmask;
@@ -400,10 +412,13 @@ __rte_hash_lookup_with_hash(const struct rte_hash *h,

        /* Check if key is already present in the hash */
        for (i = 0; i < h->bucket_entries; i++) {
-               if ((sig == sig_bucket[i]) &&
-                   likely(memcmp(key, get_key_from_bucket(h, key_bucket, i),
-                                 h->key_len) == 0)) {
-                       return bucket_index * h->bucket_entries + i;
+               if (sig == sig_bucket[i]) {
+
+                       const void * volatile key_2 =
+                               get_key_from_bucket(h, key_bucket, i);
+
+                       if (likely(rte_memcmp(key_1, key_2, h->key_len) == 0))
+                               return bucket_index * h->bucket_entries + i;
                }
        }

@@ -456,13 +471,17 @@ rte_hash_lookup_bulk(const struct rte_hash *h, const void **keys,
                positions[i] = -ENOENT;

                for (j = 0; j < h->bucket_entries; j++) {
-                       if ((sigs[i] == sig_bucket[j]) &&
-                           likely(memcmp(keys[i],
-                                         get_key_from_bucket(h, key_bucket, j),
-                                         h->key_len) == 0)) {
-                               positions[i] = bucket_index *
-                                       h->bucket_entries + j;
-                               break;
+                       if (sigs[i] == sig_bucket[j]) {
+
+                               const void * volatile key_1 = keys[i];
+                               const void * volatile key_2 =
+                                       get_key_from_bucket(h, key_bucket, j);
+                               if (likely(rte_memcmp(key_1, key_2,
+                                                       h->key_len) == 0)) {
+                                       positions[i] = bucket_index *
+                                                       h->bucket_entries + j;
+                                       break;
+                               }
                        }
                }
        }
-- 
1.9.1

Reply via email to