Adding Paul for feedback

> -----Original Message-----
> From: Ruifeng Wang <ruifeng.w...@arm.com>
> Sent: Friday, September 6, 2019 4:46 AM
> To: bruce.richard...@intel.com; vladimir.medved...@intel.com;
> olivier.m...@6wind.com
> Cc: dev@dpdk.org; step...@networkplumber.org;
> konstantin.anan...@intel.com; Gavin Hu (Arm Technology China)
> <gavin...@arm.com>; Honnappa Nagarahalli
> <honnappa.nagaraha...@arm.com>; Dharmik Thakkar
> <dharmik.thak...@arm.com>; nd <n...@arm.com>
> Subject: [PATCH v2 6/6] test/lpm: add RCU integration performance tests
> 
> From: Honnappa Nagarahalli <honnappa.nagaraha...@arm.com>
> 
> Add performance tests for RCU integration. The performance difference with
> and without RCU integration is very small (~1% to ~2%) on both Arm and x86
> platforms.
> 
> Signed-off-by: Honnappa Nagarahalli <honnappa.nagaraha...@arm.com>
> Reviewed-by: Gavin Hu <gavin...@arm.com>
> Reviewed-by: Ruifeng Wang <ruifeng.w...@arm.com>
> ---
>  app/test/test_lpm_perf.c | 274 ++++++++++++++++++++++++++++++++++++++-
>  1 file changed, 271 insertions(+), 3 deletions(-)
> 
> diff --git a/app/test/test_lpm_perf.c b/app/test/test_lpm_perf.c index
> a2578fe90..475e5d488 100644
> --- a/app/test/test_lpm_perf.c
> +++ b/app/test/test_lpm_perf.c
> @@ -1,5 +1,6 @@
>  /* SPDX-License-Identifier: BSD-3-Clause
>   * Copyright(c) 2010-2014 Intel Corporation
> + * Copyright(c) 2019 Arm Limited
>   */
> 
>  #include <stdio.h>
> @@ -10,12 +11,23 @@
>  #include <rte_cycles.h>
>  #include <rte_random.h>
>  #include <rte_branch_prediction.h>
> +#include <rte_malloc.h>
>  #include <rte_ip.h>
>  #include <rte_lpm.h>
> +#include <rte_rcu_qsbr.h>
> 
>  #include "test.h"
>  #include "test_xmmt_ops.h"
> 
> +struct rte_lpm *lpm;
> +static struct rte_rcu_qsbr *rv;
> +static volatile uint8_t writer_done;
> +static volatile uint32_t thr_id;
> +/* Report quiescent state interval every 8192 lookups. Larger critical
> + * sections in reader will result in writer polling multiple times.
> + */
> +#define QSBR_REPORTING_INTERVAL 8192
> +
>  #define TEST_LPM_ASSERT(cond) do {                                           
>  \
>       if (!(cond)) {                                                        \
>               printf("Error at line %d: \n", __LINE__);                     \
> @@ -24,6 +36,7 @@
>  } while(0)
> 
>  #define ITERATIONS (1 << 10)
> +#define RCU_ITERATIONS 10
>  #define BATCH_SIZE (1 << 12)
>  #define BULK_SIZE 32
> 
> @@ -35,9 +48,13 @@ struct route_rule {
>  };
> 
>  struct route_rule large_route_table[MAX_RULE_NUM];
> +/* Route table for routes with depth > 24 */
> +struct route_rule large_ldepth_route_table[MAX_RULE_NUM];
> 
>  static uint32_t num_route_entries;
> +static uint32_t num_ldepth_route_entries;
>  #define NUM_ROUTE_ENTRIES num_route_entries
> +#define NUM_LDEPTH_ROUTE_ENTRIES num_ldepth_route_entries
> 
>  enum {
>       IP_CLASS_A,
> @@ -191,7 +208,7 @@ static void generate_random_rule_prefix(uint32_t
> ip_class, uint8_t depth)
>       uint32_t ip_head_mask;
>       uint32_t rule_num;
>       uint32_t k;
> -     struct route_rule *ptr_rule;
> +     struct route_rule *ptr_rule, *ptr_ldepth_rule;
> 
>       if (ip_class == IP_CLASS_A) {        /* IP Address class A */
>               fixed_bit_num = IP_HEAD_BIT_NUM_A;
> @@ -236,10 +253,20 @@ static void generate_random_rule_prefix(uint32_t
> ip_class, uint8_t depth)
>        */
>       start = lrand48() & mask;
>       ptr_rule = &large_route_table[num_route_entries];
> +     ptr_ldepth_rule = &large_ldepth_route_table[num_ldepth_route_entries];
>       for (k = 0; k < rule_num; k++) {
>               ptr_rule->ip = (start << (RTE_LPM_MAX_DEPTH - depth))
>                       | ip_head_mask;
>               ptr_rule->depth = depth;
> +             /* If the depth of the route is more than 24, store it
> +              * in another table as well.
> +              */
> +             if (depth > 24) {
> +                     ptr_ldepth_rule->ip = ptr_rule->ip;
> +                     ptr_ldepth_rule->depth = ptr_rule->depth;
> +                     ptr_ldepth_rule++;
> +                     num_ldepth_route_entries++;
> +             }
>               ptr_rule++;
>               start = (start + step) & mask;
>       }
> @@ -273,6 +300,7 @@ static void generate_large_route_rule_table(void)
>       uint8_t  depth;
> 
>       num_route_entries = 0;
> +     num_ldepth_route_entries = 0;
>       memset(large_route_table, 0, sizeof(large_route_table));
> 
>       for (ip_class = IP_CLASS_A; ip_class <= IP_CLASS_C; ip_class++) { @@ -
> 316,10 +344,248 @@ print_route_distribution(const struct route_rule *table,
> uint32_t n)
>       printf("\n");
>  }
> 
> +/* Check condition and return an error if true. */
> +static uint16_t enabled_core_ids[RTE_MAX_LCORE];
> +static unsigned int num_cores;
> +
> +/* Simple way to allocate thread ids in 0 to RTE_MAX_LCORE space */
> +static inline uint32_t
> +alloc_thread_id(void)
> +{
> +     uint32_t tmp_thr_id;
> +
> +     tmp_thr_id = __atomic_fetch_add(&thr_id, 1, __ATOMIC_RELAXED);
> +     if (tmp_thr_id >= RTE_MAX_LCORE)
> +             printf("Invalid thread id %u\n", tmp_thr_id);
> +
> +     return tmp_thr_id;
> +}
> +
> +/*
> + * Reader thread using rte_lpm data structure without RCU.
> + */
> +static int
> +test_lpm_reader(__attribute__((unused)) void *arg)
> +{
> +     int i;
> +     uint32_t ip_batch[QSBR_REPORTING_INTERVAL];
> +     uint32_t next_hop_return = 0;
> +
> +     do {
> +             for (i = 0; i < QSBR_REPORTING_INTERVAL; i++)
> +                     ip_batch[i] = rte_rand();
> +
> +             for (i = 0; i < QSBR_REPORTING_INTERVAL; i++)
> +                     rte_lpm_lookup(lpm, ip_batch[i], &next_hop_return);
> +
> +     } while (!writer_done);
> +
> +     return 0;
> +}
> +
> +/*
> + * Reader thread using rte_lpm data structure with RCU.
> + */
> +static int
> +test_lpm_rcu_qsbr_reader(__attribute__((unused)) void *arg)
> +{
> +     int i;
> +     uint32_t thread_id = alloc_thread_id();
> +     uint32_t ip_batch[QSBR_REPORTING_INTERVAL];
> +     uint32_t next_hop_return = 0;
> +
> +     /* Register this thread to report quiescent state */
> +     rte_rcu_qsbr_thread_register(rv, thread_id);
> +     rte_rcu_qsbr_thread_online(rv, thread_id);
> +
> +     do {
> +             for (i = 0; i < QSBR_REPORTING_INTERVAL; i++)
> +                     ip_batch[i] = rte_rand();
> +
> +             for (i = 0; i < QSBR_REPORTING_INTERVAL; i++)
> +                     rte_lpm_lookup(lpm, ip_batch[i], &next_hop_return);
> +
> +             /* Update quiescent state */
> +             rte_rcu_qsbr_quiescent(rv, thread_id);
> +     } while (!writer_done);
> +
> +     rte_rcu_qsbr_thread_offline(rv, thread_id);
> +     rte_rcu_qsbr_thread_unregister(rv, thread_id);
> +
> +     return 0;
> +}
> +
> +/*
> + * Functional test:
> + * Single writer, Single QS variable, Single QSBR query,
> + * Non-blocking rcu_qsbr_check
> + */
> +static int
> +test_lpm_rcu_perf(void)
> +{
> +     struct rte_lpm_config config;
> +     uint64_t begin, total_cycles;
> +     size_t sz;
> +     unsigned int i, j;
> +     uint16_t core_id;
> +     uint32_t next_hop_add = 0xAA;
> +
> +     if (rte_lcore_count() < 2) {
> +             printf("Not enough cores for lpm_rcu_perf_autotest, expecting at least 2\n");
> +             return TEST_SKIPPED;
> +     }
> +
> +     num_cores = 0;
> +     RTE_LCORE_FOREACH_SLAVE(core_id) {
> +             enabled_core_ids[num_cores] = core_id;
> +             num_cores++;
> +     }
> +
> +     printf("\nPerf test: 1 writer, %d readers, RCU integration enabled\n",
> +             num_cores);
> +
> +     /* Create LPM table */
> +     config.max_rules = NUM_LDEPTH_ROUTE_ENTRIES;
> +     config.number_tbl8s = NUM_LDEPTH_ROUTE_ENTRIES;
> +     config.flags = 0;
> +     lpm = rte_lpm_create(__func__, SOCKET_ID_ANY, &config);
> +     TEST_LPM_ASSERT(lpm != NULL);
> +
> +     /* Init RCU variable */
> +     sz = rte_rcu_qsbr_get_memsize(num_cores);
> +     rv = (struct rte_rcu_qsbr *)rte_zmalloc("rcu0", sz,
> +                                             RTE_CACHE_LINE_SIZE);
> +     rte_rcu_qsbr_init(rv, num_cores);
> +
> +     /* Assign the RCU variable to LPM */
> +     if (rte_lpm_rcu_qsbr_add(lpm, rv) != 0) {
> +             printf("RCU variable assignment failed\n");
> +             goto error;
> +     }
> +
> +     writer_done = 0;
> +     __atomic_store_n(&thr_id, 0, __ATOMIC_SEQ_CST);
> +
> +     /* Launch reader threads */
> +     for (i = 0; i < num_cores; i++)
> +             rte_eal_remote_launch(test_lpm_rcu_qsbr_reader, NULL,
> +                                     enabled_core_ids[i]);
> +
> +     /* Measure add/delete. */
> +     begin = rte_rdtsc_precise();
> +     for (i = 0; i < RCU_ITERATIONS; i++) {
> +             /* Add all the entries */
> +             for (j = 0; j < NUM_LDEPTH_ROUTE_ENTRIES; j++)
> +                     if (rte_lpm_add(lpm, large_ldepth_route_table[j].ip,
> +                                     large_ldepth_route_table[j].depth,
> +                                     next_hop_add) != 0) {
> +                             printf("Failed to add iteration %d, route# %d\n",
> +                                     i, j);
> +                             goto error;
> +                     }
> +
> +             /* Delete all the entries */
> +             for (j = 0; j < NUM_LDEPTH_ROUTE_ENTRIES; j++)
> +                     if (rte_lpm_delete(lpm, large_ldepth_route_table[j].ip,
> +                             large_ldepth_route_table[j].depth) != 0) {
> +                             printf("Failed to delete iteration %d, route# %d\n",
> +                                     i, j);
> +                             goto error;
> +                     }
> +     }
> +     total_cycles = rte_rdtsc_precise() - begin;
> +
> +     printf("Total LPM Adds: %d\n", ITERATIONS * NUM_LDEPTH_ROUTE_ENTRIES);
> +     printf("Total LPM Deletes: %d\n",
> +             ITERATIONS * NUM_LDEPTH_ROUTE_ENTRIES);
> +     printf("Average LPM Add/Del: %g cycles\n",
> +             (double)total_cycles / (NUM_LDEPTH_ROUTE_ENTRIES * ITERATIONS));
> +
> +     writer_done = 1;
> +     /* Wait and check return value from reader threads */
> +     for (i = 0; i < num_cores; i++)
> +             if (rte_eal_wait_lcore(enabled_core_ids[i]) < 0)
> +                     goto error;
> +
> +     rte_lpm_free(lpm);
> +     rte_free(rv);
> +     lpm = NULL;
> +     rv = NULL;
> +
> +     /* Test without RCU integration */
> +     printf("\nPerf test: 1 writer, %d readers, RCU integration disabled\n",
> +             num_cores);
> +
> +     /* Create LPM table */
> +     config.max_rules = NUM_LDEPTH_ROUTE_ENTRIES;
> +     config.number_tbl8s = NUM_LDEPTH_ROUTE_ENTRIES;
> +     config.flags = 0;
> +     lpm = rte_lpm_create(__func__, SOCKET_ID_ANY, &config);
> +     TEST_LPM_ASSERT(lpm != NULL);
> +
> +     writer_done = 0;
> +     __atomic_store_n(&thr_id, 0, __ATOMIC_SEQ_CST);
> +
> +     /* Launch reader threads */
> +     for (i = 0; i < num_cores; i++)
> +             rte_eal_remote_launch(test_lpm_reader, NULL,
> +                                     enabled_core_ids[i]);
> +
> +     /* Measure add/delete. */
> +     begin = rte_rdtsc_precise();
> +     for (i = 0; i < RCU_ITERATIONS; i++) {
> +             /* Add all the entries */
> +             for (j = 0; j < NUM_LDEPTH_ROUTE_ENTRIES; j++)
> +                     if (rte_lpm_add(lpm, large_ldepth_route_table[j].ip,
> +                                     large_ldepth_route_table[j].depth,
> +                                     next_hop_add) != 0) {
> +                             printf("Failed to add iteration %d, route# %d\n",
> +                                     i, j);
> +                             goto error;
> +                     }
> +
> +             /* Delete all the entries */
> +             for (j = 0; j < NUM_LDEPTH_ROUTE_ENTRIES; j++)
> +                     if (rte_lpm_delete(lpm, large_ldepth_route_table[j].ip,
> +                             large_ldepth_route_table[j].depth) != 0) {
> +                             printf("Failed to delete iteration %d, route# %d\n",
> +                                     i, j);
> +                             goto error;
> +                     }
> +     }
> +     total_cycles = rte_rdtsc_precise() - begin;
> +
> +     printf("Total LPM Adds: %d\n", ITERATIONS * NUM_LDEPTH_ROUTE_ENTRIES);
> +     printf("Total LPM Deletes: %d\n",
> +             ITERATIONS * NUM_LDEPTH_ROUTE_ENTRIES);
> +     printf("Average LPM Add/Del: %g cycles\n",
> +             (double)total_cycles / (NUM_LDEPTH_ROUTE_ENTRIES * ITERATIONS));
> +
> +     writer_done = 1;
> +     /* Wait and check return value from reader threads */
> +     for (i = 0; i < num_cores; i++)
> +             if (rte_eal_wait_lcore(enabled_core_ids[i]) < 0)
> +                     printf("Warning: lcore %u not finished.\n",
> +                             enabled_core_ids[i]);
> +
> +     rte_lpm_free(lpm);
> +
> +     return 0;
> +
> +error:
> +     writer_done = 1;
> +     /* Wait until all readers have exited */
> +     rte_eal_mp_wait_lcore();
> +
> +     rte_lpm_free(lpm);
> +     rte_free(rv);
> +
> +     return -1;
> +}
> +
>  static int
>  test_lpm_perf(void)
>  {
> -     struct rte_lpm *lpm = NULL;
>       struct rte_lpm_config config;
> 
>       config.max_rules = 2000000;
> @@ -343,7 +609,7 @@ test_lpm_perf(void)
>       lpm = rte_lpm_create(__func__, SOCKET_ID_ANY, &config);
>       TEST_LPM_ASSERT(lpm != NULL);
> 
> -     /* Measue add. */
> +     /* Measure add. */
>       begin = rte_rdtsc();
> 
>       for (i = 0; i < NUM_ROUTE_ENTRIES; i++) { @@ -478,6 +744,8 @@
> test_lpm_perf(void)
>       rte_lpm_delete_all(lpm);
>       rte_lpm_free(lpm);
> 
> +     test_lpm_rcu_perf();
> +
>       return 0;
>  }
> 
> --
> 2.17.1

Reply via email to