lpm: avoid code duplication in rcu qsbr perf

Honnappa Nagarahalli Tue, 03 Nov 2020 06:52:50 -0800

<snip>

> >>>>
> >>>> Avoid code duplication by combining single and multi threaded tests
> >>>>
> >>>> Signed-off-by: Dharmik Thakkar <dharmik.thak...@arm.com>
> >>>> Reviewed-by: Ruifeng Wang <ruifeng.w...@arm.com>
> >>>> ---
> >>>> app/test/test_lpm_perf.c | 362
> >>>> ++++++++++-----------------------------
> >>>> 1 file changed, 91 insertions(+), 271 deletions(-)
> >>>>
> >>>> diff --git a/app/test/test_lpm_perf.c b/app/test/test_lpm_perf.c
> >>>> index
> >>>> 224c92fa3d65..229c835c23f7 100644
> >>>> --- a/app/test/test_lpm_perf.c
> >>>> +++ b/app/test/test_lpm_perf.c
> >>>> @@ -67,6 +67,12 @@ enum {
> >>>> IP_CLASS_C
> >>>> };
> >>>>
> >>>> +enum {
> >>>> +SINGLE_WRITER = 1,
> >>>> +MULTI_WRITER_1,
> >>>> +MULTI_WRITER_2
> >>>> +};
> >>> Do we need this? Can we use the number of cores instead?
> >>>
> >>
> >> There are 3 combinations of writes (adds/deletes):
> >> 1. Write all the entries - in case of a single writer
> >> 2. Write half of the entries - in case of multiple writers
> >> 3. Write the remaining half of the entries - in case of multiple writers
> >>
> >> So, I think this is required.
> > IMO, this is not scalable. Essentially, we need 2 parameters to divide the
> > routes among the writer threads: 1) the total number of writers, and
> > 2) the core ID in the linear space.
> > Creating a structure with these 2 and passing that to the writer thread
> > would be better and scalable.
> 
> Yes, agreed, this is only applicable for 2 writers. Currently, the multi-writer
> test is limited to a maximum of 2 writers.
> To support more writers, we need something like this (which I
> believe is in line with your suggestion):
> 1. Calculate what each writer will write:
>    single_insert = TOTAL_WRITES / num_writers
> 2. Pass the core ID in linear space as an argument to the writer function:
>    pos_core
> 3. Calculate si and ei in the writer function:
>    si = pos_core * single_insert; ei = si + single_insert
> 
> I can update the patch to enable more than 2 writers.
> Do you also suggest we expand the scope of the test to test with more than
> 2 writers?
> This will increase the time for which the test is running (which currently is
> significant even with 2 writers).
Agreed: no to increasing the number of writers, but yes to making the code
more generic.


> 
> >
> >>
> >>>> +
> >>>> /* struct route_rule_count defines the total number of rules in
> >>>> following a/b/c
> >>>> * each item in a[]/b[]/c[] is the number of common IP address class
> >>>> A/B/C, not
> >>>> * including the ones for private local network.
> >>>> @@ -430,11 +436,16 @@ test_lpm_rcu_qsbr_writer(void *arg)  {
> >> unsigned
> >>>> int i, j, si, ei; uint64_t begin, total_cycles; -uint8_t core_id =
> >>>> (uint8_t)((uintptr_t)arg);
> >>>> +uint8_t writer_id = (uint8_t)((uintptr_t)arg);
> >>>> uint32_t next_hop_add = 0xAA;
> >>>>
> >>>> -/* 2 writer threads are used */
> >>>> -if (core_id % 2 == 0) {
> >>>> +/* Single writer (writer_id = 1) */ if (writer_id ==
> >>>> +SINGLE_WRITER) { si = 0; ei = NUM_LDEPTH_ROUTE_ENTRIES; }
> >>>> +/* 2 Writers (writer_id = 2/3)*/
> >>>> +else if (writer_id == MULTI_WRITER_1) {
> >>>> si = 0;
> >>>> ei = NUM_LDEPTH_ROUTE_ENTRIES / 2;
> >>>> } else {
> >>>> @@ -447,29 +458,35 @@ test_lpm_rcu_qsbr_writer(void *arg) for (i =
> >>>> 0; i < RCU_ITERATIONS; i++) {
> >>>> /* Add all the entries */
> >>>> for (j = si; j < ei; j++) {
> >>>> -pthread_mutex_lock(&lpm_mutex);
> >>>> +if (writer_id != SINGLE_WRITER)
> >>>> +pthread_mutex_lock(&lpm_mutex);
> >>>> if (rte_lpm_add(lpm, large_ldepth_route_table[j].ip,
> >>>> large_ldepth_route_table[j].depth,
> >>>> next_hop_add) != 0) {
> >>>> printf("Failed to add iteration %d, route# %d\n", i, j);
> >>>> -pthread_mutex_unlock(&lpm_mutex);
> >>>> +if (writer_id != SINGLE_WRITER)
> >>>> +
> >>>> pthread_mutex_unlock(&lpm_mutex);
> >>>> return -1;
> >>>> }
> >>>> -pthread_mutex_unlock(&lpm_mutex);
> >>>> +if (writer_id != SINGLE_WRITER)
> >>>> +pthread_mutex_unlock(&lpm_mutex);
> >>>> }
> >>>>
> >>>> /* Delete all the entries */
> >>>> for (j = si; j < ei; j++) {
> >>>> -pthread_mutex_lock(&lpm_mutex);
> >>>> +if (writer_id != SINGLE_WRITER)
> >>>> +pthread_mutex_lock(&lpm_mutex);
> >>>> if (rte_lpm_delete(lpm,
> >>>> large_ldepth_route_table[j].ip,
> >>>> large_ldepth_route_table[j].depth) != 0) { printf("Failed to delete
> >>>> iteration %d, route# %d\n", i, j);
> >>>> -pthread_mutex_unlock(&lpm_mutex);
> >>>> +if (writer_id != SINGLE_WRITER)
> >>>> +
> >>>> pthread_mutex_unlock(&lpm_mutex);
> >>>> return -1;
> >>>> }
> >>>> -pthread_mutex_unlock(&lpm_mutex);
> >>>> +if (writer_id != SINGLE_WRITER)
> >>>> +pthread_mutex_unlock(&lpm_mutex);
> >>>> }
> >>>> }
> >>>>
> >>>> @@ -482,16 +499,17 @@ test_lpm_rcu_qsbr_writer(void *arg)
> >>>>
> >>>> /*
> >>>> * Functional test:
> >>>> - * 2 writers, rest are readers
> >>>> + * 1/2 writers, rest are readers
> >>>> */
> >>>> static int
> >>>> -test_lpm_rcu_perf_multi_writer(void)
> >>>> +test_lpm_rcu_perf_multi_writer(uint8_t use_rcu)
> >>>> {
> >>>> struct rte_lpm_config config;
> >>>> size_t sz;
> >>>> -unsigned int i;
> >>>> +unsigned int i, j;
> >>>> uint16_t core_id;
> >>>> struct rte_lpm_rcu_config rcu_cfg = {0};
> >>>> +int (*reader_f)(void *arg) = NULL;
> >>>>
> >>>> if (rte_lcore_count() < 3) {
> >>>> printf("Not enough cores for lpm_rcu_perf_autotest, expecting at
> >>>> least 3\n"); @@ -504,273 +522,76 @@
> >>>> test_lpm_rcu_perf_multi_writer(void)
> >>>> num_cores++;
> >>>> }
> >>>>
> >>>> -printf("\nPerf test: 2 writers, %d readers, RCU integration
> >>>> enabled\n", -num_cores - 2);
> >>>> -
> >>>> -/* Create LPM table */
> >>>> -config.max_rules = NUM_LDEPTH_ROUTE_ENTRIES; -
> >> config.number_tbl8s =
> >>>> NUM_LDEPTH_ROUTE_ENTRIES; -config.flags = 0; -lpm =
> >>>> rte_lpm_create(__func__, SOCKET_ID_ANY, &config);
> >>>> -TEST_LPM_ASSERT(lpm != NULL);
> >>>> -
> >>>> -/* Init RCU variable */
> >>>> -sz = rte_rcu_qsbr_get_memsize(num_cores);
> >>>> -rv = (struct rte_rcu_qsbr *)rte_zmalloc("rcu0", sz,
> >>>> -RTE_CACHE_LINE_SIZE); -rte_rcu_qsbr_init(rv, num_cores);
> >>>> -
> >>>> -rcu_cfg.v = rv;
> >>>> -/* Assign the RCU variable to LPM */ -if
> >>>> (rte_lpm_rcu_qsbr_add(lpm,
> >>>> &rcu_cfg) != 0) { -printf("RCU variable assignment failed\n");
> >>>> -goto error; -}
> >>>> -
> >>>> -writer_done = 0;
> >>>> -__atomic_store_n(&gwrite_cycles, 0, __ATOMIC_RELAXED);
> >>>> -
> >>>> -__atomic_store_n(&thr_id, 0, __ATOMIC_SEQ_CST);
> >>>> -
> >>>> -/* Launch reader threads */
> >>>> -for (i = 2; i < num_cores; i++)
> >>>> -rte_eal_remote_launch(test_lpm_rcu_qsbr_reader, NULL,
> >>>> -enabled_core_ids[i]);
> >>>> -
> >>>> -/* Launch writer threads */
> >>>> -for (i = 0; i < 2; i++)
> >>>> -rte_eal_remote_launch(test_lpm_rcu_qsbr_writer,
> >>>> -(void *)(uintptr_t)i,
> >>>> -enabled_core_ids[i]);
> >>>> -
> >>>> -/* Wait for writer threads */
> >>>> -for (i = 0; i < 2; i++)
> >>>> -if (rte_eal_wait_lcore(enabled_core_ids[i]) < 0) -goto error;
> >>>> -
> >>>> -printf("Total LPM Adds: %d\n", TOTAL_WRITES); -printf("Total LPM
> >>>> Deletes: %d\n", TOTAL_WRITES); -printf("Average LPM Add/Del:
> >>>> %"PRIu64" cycles\n", -__atomic_load_n(&gwrite_cycles,
> >>>> __ATOMIC_RELAXED) -/ TOTAL_WRITES);
> >>>> -
> >>>> -writer_done = 1;
> >>>> -/* Wait until all readers have exited */ -for (i = 2; i <
> >>>> num_cores;
> >>>> i++) -rte_eal_wait_lcore(enabled_core_ids[i]);
> >>>> -
> >>>> -rte_lpm_free(lpm);
> >>>> -rte_free(rv);
> >>>> -lpm = NULL;
> >>>> -rv = NULL;
> >>>> -
> >>>> -/* Test without RCU integration */ -printf("\nPerf test: 2
> >>>> writers, %d readers, RCU integration disabled\n", -num_cores - 2);
> >>>> -
> >>>> -/* Create LPM table */
> >>>> -config.max_rules = NUM_LDEPTH_ROUTE_ENTRIES; -
> >> config.number_tbl8s =
> >>>> NUM_LDEPTH_ROUTE_ENTRIES; -config.flags = 0; -lpm =
> >>>> rte_lpm_create(__func__, SOCKET_ID_ANY, &config);
> >>>> -TEST_LPM_ASSERT(lpm != NULL);
> >>>> -
> >>>> -writer_done = 0;
> >>>> -__atomic_store_n(&gwrite_cycles, 0, __ATOMIC_RELAXED);
> >>>> -__atomic_store_n(&thr_id, 0, __ATOMIC_SEQ_CST);
> >>>> -
> >>>> -/* Launch reader threads */
> >>>> -for (i = 2; i < num_cores; i++)
> >>>> -rte_eal_remote_launch(test_lpm_reader, NULL,
> >>>> -enabled_core_ids[i]);
> >>>> -
> >>>> -/* Launch writer threads */
> >>>> -for (i = 0; i < 2; i++)
> >>>> -rte_eal_remote_launch(test_lpm_rcu_qsbr_writer,
> >>>> -(void *)(uintptr_t)i,
> >>>> -enabled_core_ids[i]);
> >>>> -
> >>>> -/* Wait for writer threads */
> >>>> -for (i = 0; i < 2; i++)
> >>>> -if (rte_eal_wait_lcore(enabled_core_ids[i]) < 0) -goto error;
> >>>> -
> >>>> -printf("Total LPM Adds: %d\n", TOTAL_WRITES); -printf("Total LPM
> >>>> Deletes: %d\n", TOTAL_WRITES); -printf("Average LPM Add/Del:
> >>>> %"PRIu64" cycles\n", -__atomic_load_n(&gwrite_cycles,
> >>>> __ATOMIC_RELAXED) -/ TOTAL_WRITES);
> >>>> -
> >>>> -writer_done = 1;
> >>>> -/* Wait until all readers have exited */ -for (i = 2; i <
> >>>> num_cores;
> >>>> i++) -rte_eal_wait_lcore(enabled_core_ids[i]);
> >>>> -
> >>>> -rte_lpm_free(lpm);
> >>>> -
> >>>> -return 0;
> >>>> -
> >>>> -error:
> >>>> -writer_done = 1;
> >>>> -/* Wait until all readers have exited */ -rte_eal_mp_wait_lcore();
> >>>> -
> >>>> -rte_lpm_free(lpm);
> >>>> -rte_free(rv);
> >>>> -
> >>>> -return -1;
> >>>> -}
> >>>> -
> >>>> -/*
> >>>> - * Functional test:
> >>>> - * Single writer, rest are readers
> >>>> - */
> >>>> -static int
> >>>> -test_lpm_rcu_perf(void)
> >>>> -{
> >>>> -struct rte_lpm_config config;
> >>>> -uint64_t begin, total_cycles;
> >>>> -size_t sz;
> >>>> -unsigned int i, j;
> >>>> -uint16_t core_id;
> >>>> -uint32_t next_hop_add = 0xAA;
> >>>> -struct rte_lpm_rcu_config rcu_cfg = {0};
> >>>> -
> >>>> -if (rte_lcore_count() < 2) {
> >>>> -printf("Not enough cores for lpm_rcu_perf_autotest, expecting at
> >>>> least 2\n"); -return TEST_SKIPPED; -}
> >>>> -
> >>>> -num_cores = 0;
> >>>> -RTE_LCORE_FOREACH_WORKER(core_id) { -
> enabled_core_ids[num_cores] =
> >>>> core_id; -num_cores++; -}
> >>>> -
> >>>> -printf("\nPerf test: 1 writer, %d readers, RCU integration
> >>>> enabled\n", -num_cores);
> >>>> -
> >>>> -/* Create LPM table */
> >>>> -config.max_rules = NUM_LDEPTH_ROUTE_ENTRIES; -
> >> config.number_tbl8s =
> >>>> NUM_LDEPTH_ROUTE_ENTRIES; -config.flags = 0; -lpm =
> >>>> rte_lpm_create(__func__, SOCKET_ID_ANY, &config);
> >>>> -TEST_LPM_ASSERT(lpm != NULL);
> >>>> -
> >>>> -/* Init RCU variable */
> >>>> -sz = rte_rcu_qsbr_get_memsize(num_cores);
> >>>> -rv = (struct rte_rcu_qsbr *)rte_zmalloc("rcu0", sz,
> >>>> -RTE_CACHE_LINE_SIZE); -rte_rcu_qsbr_init(rv, num_cores);
> >>>> -
> >>>> -rcu_cfg.v = rv;
> >>>> -/* Assign the RCU variable to LPM */ -if
> >>>> (rte_lpm_rcu_qsbr_add(lpm,
> >>>> &rcu_cfg) != 0) { -printf("RCU variable assignment failed\n");
> >>>> -goto error; -}
> >>>> -
> >>>> -writer_done = 0;
> >>>> -__atomic_store_n(&thr_id, 0, __ATOMIC_SEQ_CST);
> >>>> -
> >>>> -/* Launch reader threads */
> >>>> -for (i = 0; i < num_cores; i++)
> >>>> -rte_eal_remote_launch(test_lpm_rcu_qsbr_reader, NULL,
> >>>> -enabled_core_ids[i]);
> >>>> -
> >>>> -/* Measure add/delete. */
> >>>> -begin = rte_rdtsc_precise();
> >>>> -for (i = 0; i < RCU_ITERATIONS; i++) {
> >>>> -/* Add all the entries */
> >>>> -for (j = 0; j < NUM_LDEPTH_ROUTE_ENTRIES; j++) -if
> >>>> (rte_lpm_add(lpm, large_ldepth_route_table[j].ip,
> >>>> -large_ldepth_route_table[j].depth,
> >>>> -next_hop_add) != 0) {
> >>>> -printf("Failed to add iteration %d, route# %d\n", -i, j);
> >>>> +for (j = 1; j < 3; j++) {
> >>>> +if (use_rcu)
> >>>> +printf("\nPerf test: %d writer(s), %d reader(s),"
> >>>> +       " RCU integration enabled\n", j, num_cores - j); else
> >>>> +printf("\nPerf test: %d writer(s), %d reader(s),"
> >>>> +       " RCU integration disabled\n", j, num_cores - j);
> >>>> +
> >>>> +/* Create LPM table */
> >>>> +config.max_rules = NUM_LDEPTH_ROUTE_ENTRIES;
> >> config.number_tbl8s =
> >>>> +NUM_LDEPTH_ROUTE_ENTRIES; config.flags = 0; lpm =
> >>>> +rte_lpm_create(__func__, SOCKET_ID_ANY, &config);
> >>>> +TEST_LPM_ASSERT(lpm != NULL);
> >>>> +
> >>>> +/* Init RCU variable */
> >>>> +if (use_rcu) {
> >>>> +sz = rte_rcu_qsbr_get_memsize(num_cores);
> >>>> +rv = (struct rte_rcu_qsbr *)rte_zmalloc("rcu0", sz,
> >>>> +
> >>>> RTE_CACHE_LINE_SIZE);
> >>>> +rte_rcu_qsbr_init(rv, num_cores);
> >>>> +
> >>>> +rcu_cfg.v = rv;
> >>>> +/* Assign the RCU variable to LPM */ if (rte_lpm_rcu_qsbr_add(lpm,
> >>>> +&rcu_cfg) != 0) { printf("RCU variable assignment failed\n");
> >>>> goto error;
> >>>> }
> >>>>
> >>>> -/* Delete all the entries */
> >>>> -for (j = 0; j < NUM_LDEPTH_ROUTE_ENTRIES; j++) -if
> >>>> (rte_lpm_delete(lpm, large_ldepth_route_table[j].ip,
> >>>> -large_ldepth_route_table[j].depth) != 0) { -printf("Failed to
> >>>> delete iteration %d, route# %d\n", -i, j); -goto error; -} -}
> >>>> -total_cycles = rte_rdtsc_precise() - begin;
> >>>> +reader_f = test_lpm_rcu_qsbr_reader; } else reader_f =
> >>>> +test_lpm_reader;
> >>>>
> >>>> -printf("Total LPM Adds: %d\n", TOTAL_WRITES); -printf("Total LPM
> >>>> Deletes: %d\n", TOTAL_WRITES); -printf("Average LPM Add/Del: %g
> >>>> cycles\n", -(double)total_cycles / TOTAL_WRITES);
> >>>> +writer_done = 0;
> >>>> +__atomic_store_n(&gwrite_cycles, 0, __ATOMIC_RELAXED);
> >>>>
> >>>> -writer_done = 1;
> >>>> -/* Wait until all readers have exited */ -for (i = 0; i <
> >>>> num_cores;
> >>>> i++) -if (rte_eal_wait_lcore(enabled_core_ids[i]);
> >>>> -
> >>>> -rte_lpm_free(lpm);
> >>>> -rte_free(rv);
> >>>> -lpm = NULL;
> >>>> -rv = NULL;
> >>>> -
> >>>> -/* Test without RCU integration */ -printf("\nPerf test: 1 writer,
> >>>> %d readers, RCU integration disabled\n", -num_cores);
> >>>> -
> >>>> -/* Create LPM table */
> >>>> -config.max_rules = NUM_LDEPTH_ROUTE_ENTRIES; -
> >> config.number_tbl8s =
> >>>> NUM_LDEPTH_ROUTE_ENTRIES; -config.flags = 0; -lpm =
> >>>> rte_lpm_create(__func__, SOCKET_ID_ANY, &config);
> >>>> -TEST_LPM_ASSERT(lpm != NULL);
> >>>> +__atomic_store_n(&thr_id, 0, __ATOMIC_SEQ_CST);
> >>>>
> >>>> -writer_done = 0;
> >>>> -__atomic_store_n(&thr_id, 0, __ATOMIC_SEQ_CST);
> >>>> +/* Launch reader threads */
> >>>> +for (i = j; i < num_cores; i++)
> >>>> +rte_eal_remote_launch(reader_f, NULL, enabled_core_ids[i]);
> >>>>
> >>>> -/* Launch reader threads */
> >>>> -for (i = 0; i < num_cores; i++)
> >>>> -rte_eal_remote_launch(test_lpm_reader, NULL,
> >>>> -enabled_core_ids[i]);
> >>>> +/* Launch writer threads */
> >>>> +for (i = 0; i < j; i++)
> >>>> +rte_eal_remote_launch(test_lpm_rcu_qsbr_writer,
> >>>> +(void *)(uintptr_t)(i + j),
> >>> This can be just 'j'?
> >>>
> >>>> +enabled_core_ids[i]);
> >>>>
> >>>> -/* Measure add/delete. */
> >>>> -begin = rte_rdtsc_precise();
> >>>> -for (i = 0; i < RCU_ITERATIONS; i++) {
> >>>> -/* Add all the entries */
> >>>> -for (j = 0; j < NUM_LDEPTH_ROUTE_ENTRIES; j++) -if
> >>>> (rte_lpm_add(lpm, large_ldepth_route_table[j].ip,
> >>>> -large_ldepth_route_table[j].depth,
> >>>> -next_hop_add) != 0) {
> >>>> -printf("Failed to add iteration %d, route# %d\n", -i, j);
> >>>> +/* Wait for writer threads */
> >>>> +for (i = 0; i < j; i++)
> >>>> +if (rte_eal_wait_lcore(enabled_core_ids[i]) < 0)
> >>>> goto error;
> >>>> -}
> >>>>
> >>>> -/* Delete all the entries */
> >>>> -for (j = 0; j < NUM_LDEPTH_ROUTE_ENTRIES; j++) -if
> >>>> (rte_lpm_delete(lpm, large_ldepth_route_table[j].ip,
> >>>> -large_ldepth_route_table[j].depth) != 0) { -printf("Failed to
> >>>> delete iteration %d, route# %d\n", -i, j); -goto error; -}
> >>>> +printf("Total LPM Adds: %d\n", TOTAL_WRITES); printf("Total LPM
> >>>> +Deletes: %d\n", TOTAL_WRITES); printf("Average LPM Add/Del:
> >>>> +%"PRIu64" cycles\n", __atomic_load_n(&gwrite_cycles,
> >>>> __ATOMIC_RELAXED)
> >>>> +/ TOTAL_WRITES);
> >>>> +
> >>>> +writer_done = 1;
> >>>> +/* Wait until all readers have exited */ for (i = j; i <
> >>>> +num_cores; i++) rte_eal_wait_lcore(enabled_core_ids[i]);
> >>>> +
> >>>> +rte_lpm_free(lpm);
> >>>> +rte_free(rv);
> >>>> +lpm = NULL;
> >>>> +rv = NULL;
> >>>> }
> >>>> -total_cycles = rte_rdtsc_precise() - begin;
> >>>> -
> >>>> -printf("Total LPM Adds: %d\n", TOTAL_WRITES); -printf("Total LPM
> >>>> Deletes: %d\n", TOTAL_WRITES); -printf("Average LPM Add/Del: %g
> >>>> cycles\n", -(double)total_cycles / TOTAL_WRITES);
> >>>> -
> >>>> -writer_done = 1;
> >>>> -/* Wait until all readers have exited */ -for (i = 0; i <
> >>>> num_cores; i++) -rte_eal_wait_lcore(enabled_core_ids[i]);
> >>>> -
> >>>> -rte_lpm_free(lpm);
> >>>>
> >>>> return 0;
> >>>>
> >>>> @@ -946,9 +767,8 @@ test_lpm_perf(void) rte_lpm_delete_all(lpm);
> >>>> rte_lpm_free(lpm);
> >>>>
> >>>> -test_lpm_rcu_perf();
> >>>> -
> >>>> -test_lpm_rcu_perf_multi_writer();
> >>>> +test_lpm_rcu_perf_multi_writer(0);
> >>>> +test_lpm_rcu_perf_multi_writer(1);
> >>>>
> >>>> return 0;
> >>>> }
> >>>> --
> >>>> 2.17.1
>

Re: [dpdk-dev] [PATCH v2 4/4] test/lpm: avoid code duplication in rcu qsbr perf

Reply via email to