Phil Yang <phil.y...@arm.com> writes:

> Hi Aaron,
>
> It seems Travis CI does not catch this timeout issue.

That's probably because we use -t 3 to multiply the timeout.  We
needed to do that because a few of the tests were on the cusp of
failure in the Travis environment, since it's a bit lower-powered.  On
brawnier systems they probably aren't an issue.
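
For reference, -t is meson's standard timeout multiplier, so the CI
run is something along the lines of (the exact wrapper script may
differ):

$ meson test -C build --suite DPDK:fast-tests -t 3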

> But local test runs record these two cases as TIMEOUT under the default
> timeout configuration.
>
> 1. The test results on an x86 server (72 lcores online @ 2.60 GHz) are:
> a. Default setting:
> $ sudo meson test -C build_test --suite DPDK:fast-tests
>    3/92 DPDK:fast-tests / atomic_autotest                TIMEOUT        10.32s
> 39/92 DPDK:fast-tests / mcslock_autotest               TIMEOUT        10.32s
>
> b. Extend timeout:
> $ sudo meson test -C build_test --suite DPDK:fast-tests -t 30
>    3/92 DPDK:fast-tests / atomic_autotest                OK            173.01s
> 39/92 DPDK:fast-tests / mcslock_autotest               OK             12.19s
>
> 2. The test results on an aarch64 server (56 lcores online @ 2.0 GHz) are:
> a. Default setting:
> $ sudo meson test -C build_test --suite DPDK:fast-tests
>    3/92 DPDK:fast-tests / atomic_autotest       TIMEOUT 10.47 s
> 39/92 DPDK:fast-tests / mcslock_autotest      TIMEOUT 10.47 s
>
> b. Extend timeout:
> $ sudo meson test -C build_test --suite DPDK:fast-tests -t 60
>    3/92 DPDK:fast-tests / atomic_autotest       OK      431.89 s
> 39/92 DPDK:fast-tests / mcslock_autotest      OK      21.25 s

In all these runs the mcslock_autotest is really not too far over the
limit, but the atomic_autotest looks really long no matter what: with
the ~10s default, it would need a timeout multiplier over 17 on the
x86 box and over 43 on the aarch64 one just to pass.
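
If we ever want to keep a cheap functional pass in fast-tests next to
the perf variant, one option would be to parameterize the iteration
count rather than duplicate the loop.  A minimal sketch, assuming an
invented SMOKE_LOOP knob and params struct (neither exists in the
tree):

/* Hypothetical sketch: share one loop between a quick functional
 * pass and the full perf run by passing the iteration count in.
 * SMOKE_LOOP and struct loop_params are invented for illustration.
 */
#include <stdint.h>

#define PERF_LOOP	1000000	/* full run, MAX_LOOP in the patch */
#define SMOKE_LOOP	10000	/* invented: quick fast-tests pass */

struct loop_params {
	int use_lock;		/* take the MCS lock around the body? */
	uint64_t loops;		/* PERF_LOOP or SMOKE_LOOP */
};

static int
load_loop_fn(void *func_param)
{
	const struct loop_params *p = func_param;
	volatile uint64_t lcount = 0;

	while (lcount < p->loops) {
		/* rte_mcslock_lock()/unlock() would wrap the body here
		 * when p->use_lock is set, as in the patch */
		lcount++;
	}
	return 0;
}

That keeps the timing logic in one place and would let a fast-tests
entry finish well inside the default timeout.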

> So I think these two patches can resolve this issue. I'd love to hear your 
> thoughts.

In general, I'm always happy to see perf tests moved where they
belong, under the perf area.  I don't have much of an opinion on this
specific patch, though.
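
One side note: once it's listed in perf_test_names, the test should
still be runnable on demand via the perf suite; if I have the suite
name right, something like:

$ meson test -C build --suite DPDK:perf-tests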

> Thanks,
> Phil
>
> <snip>
>
>> Subject: [dpdk-dev] [PATCH 1/2] test/mcslock: move performance test to
>> perf tests
>> 
>> The MCS lock performance test takes more than 10 seconds and can lead
>> to meson test timeouts on some platforms. Move the performance test
>> into perf tests.
>> 
>> Signed-off-by: Phil Yang <phil.y...@arm.com>
>> Reviewed-by: Gavin Hu <gavin...@arm.com>
>> ---
>>  MAINTAINERS                  |   1 +
>>  app/test/Makefile            |   1 +
>>  app/test/autotest_data.py    |   6 +++
>>  app/test/meson.build         |   2 +
>>  app/test/test_mcslock.c      |  88 -------------------------------
>>  app/test/test_mcslock_perf.c | 121 +++++++++++++++++++++++++++++++++++++++++++
>>  6 files changed, 131 insertions(+), 88 deletions(-)
>>  create mode 100644 app/test/test_mcslock_perf.c
>> 
>> diff --git a/MAINTAINERS b/MAINTAINERS
>> index db235c2..411bdeb 100644
>> --- a/MAINTAINERS
>> +++ b/MAINTAINERS
>> @@ -247,6 +247,7 @@ MCSlock - EXPERIMENTAL
>>  M: Phil Yang <phil.y...@arm.com>
>>  F: lib/librte_eal/common/include/generic/rte_mcslock.h
>>  F: app/test/test_mcslock.c
>> +F: app/test/test_mcslock_perf.c
>> 
>>  Ticketlock
>>  M: Joyce Kong <joyce.k...@arm.com>
>> diff --git a/app/test/Makefile b/app/test/Makefile
>> index 1f080d1..97de3ac 100644
>> --- a/app/test/Makefile
>> +++ b/app/test/Makefile
>> @@ -65,6 +65,7 @@ SRCS-y += test_barrier.c
>>  SRCS-y += test_malloc.c
>>  SRCS-y += test_cycles.c
>>  SRCS-y += test_mcslock.c
>> +SRCS-y += test_mcslock_perf.c
>>  SRCS-y += test_spinlock.c
>>  SRCS-y += test_ticketlock.c
>>  SRCS-y += test_memory.c
>> diff --git a/app/test/autotest_data.py b/app/test/autotest_data.py
>> index 7b1d013..2a4619d 100644
>> --- a/app/test/autotest_data.py
>> +++ b/app/test/autotest_data.py
>> @@ -784,6 +784,12 @@
>>          "Func":    default_autotest,
>>          "Report":  None,
>>      },
>> +    {
>> +        "Name":    "MCS Lock performance autotest",
>> +        "Command": "mcslock_perf_autotest",
>> +        "Func":    default_autotest,
>> +        "Report":  None,
>> +    },
>>      #
>>      # Please always make sure that ring_perf is the last test!
>>      #
>> diff --git a/app/test/meson.build b/app/test/meson.build
>> index 0a2ce71..335a869 100644
>> --- a/app/test/meson.build
>> +++ b/app/test/meson.build
>> @@ -82,6 +82,7 @@ test_sources = files('commands.c',
>>      'test_meter.c',
>>      'test_metrics.c',
>>      'test_mcslock.c',
>> +    'test_mcslock_perf.c',
>>      'test_mp_secondary.c',
>>      'test_per_lcore.c',
>>      'test_pmd_perf.c',
>> @@ -270,6 +271,7 @@ perf_test_names = [
>>          'rand_perf_autotest',
>>          'hash_readwrite_perf_autotest',
>>          'hash_readwrite_lf_perf_autotest',
>> +        'mcslock_perf_autotest',
>>  ]
>> 
>>  driver_test_names = [
>> diff --git a/app/test/test_mcslock.c b/app/test/test_mcslock.c
>> index e9359df..15f9751 100644
>> --- a/app/test/test_mcslock.c
>> +++ b/app/test/test_mcslock.c
>> @@ -32,23 +32,16 @@
>>   *
>>   *   - The function takes the global lock, display something, then releases
>>   *     the global lock on each core.
>> - *
>> - * - A load test is carried out, with all cores attempting to lock a single
>> - *   lock multiple times.
>>   */
>> 
>>  RTE_DEFINE_PER_LCORE(rte_mcslock_t, _ml_me);
>>  RTE_DEFINE_PER_LCORE(rte_mcslock_t, _ml_try_me);
>> -RTE_DEFINE_PER_LCORE(rte_mcslock_t, _ml_perf_me);
>> 
>>  rte_mcslock_t *p_ml;
>>  rte_mcslock_t *p_ml_try;
>> -rte_mcslock_t *p_ml_perf;
>> 
>>  static unsigned int count;
>> 
>> -static rte_atomic32_t synchro;
>> -
>>  static int
>>  test_mcslock_per_core(__attribute__((unused)) void *arg)
>>  {
>> @@ -63,85 +56,8 @@ test_mcslock_per_core(__attribute__((unused)) void
>> *arg)
>>      return 0;
>>  }
>> 
>> -static uint64_t time_count[RTE_MAX_LCORE] = {0};
>> -
>>  #define MAX_LOOP 1000000
>> 
>> -static int
>> -load_loop_fn(void *func_param)
>> -{
>> -    uint64_t time_diff = 0, begin;
>> -    uint64_t hz = rte_get_timer_hz();
>> -    volatile uint64_t lcount = 0;
>> -    const int use_lock = *(int *)func_param;
>> -    const unsigned int lcore = rte_lcore_id();
>> -
>> -    /**< Per core me node. */
>> -    rte_mcslock_t ml_perf_me = RTE_PER_LCORE(_ml_perf_me);
>> -
>> -    /* wait synchro */
>> -    while (rte_atomic32_read(&synchro) == 0)
>> -            ;
>> -
>> -    begin = rte_get_timer_cycles();
>> -    while (lcount < MAX_LOOP) {
>> -            if (use_lock)
>> -                    rte_mcslock_lock(&p_ml_perf, &ml_perf_me);
>> -
>> -            lcount++;
>> -            if (use_lock)
>> -                    rte_mcslock_unlock(&p_ml_perf, &ml_perf_me);
>> -    }
>> -    time_diff = rte_get_timer_cycles() - begin;
>> -    time_count[lcore] = time_diff * 1000000 / hz;
>> -    return 0;
>> -}
>> -
>> -static int
>> -test_mcslock_perf(void)
>> -{
>> -    unsigned int i;
>> -    uint64_t total = 0;
>> -    int lock = 0;
>> -    const unsigned int lcore = rte_lcore_id();
>> -
>> -    printf("\nTest with no lock on single core...\n");
>> -    rte_atomic32_set(&synchro, 1);
>> -    load_loop_fn(&lock);
>> -    printf("Core [%u] Cost Time = %"PRIu64" us\n",
>> -                    lcore, time_count[lcore]);
>> -    memset(time_count, 0, sizeof(time_count));
>> -
>> -    printf("\nTest with lock on single core...\n");
>> -    lock = 1;
>> -    rte_atomic32_set(&synchro, 1);
>> -    load_loop_fn(&lock);
>> -    printf("Core [%u] Cost Time = %"PRIu64" us\n",
>> -                    lcore, time_count[lcore]);
>> -    memset(time_count, 0, sizeof(time_count));
>> -
>> -    printf("\nTest with lock on %u cores...\n", (rte_lcore_count()));
>> -
>> -    rte_atomic32_set(&synchro, 0);
>> -    rte_eal_mp_remote_launch(load_loop_fn, &lock, SKIP_MASTER);
>> -
>> -    /* start synchro and launch test on master */
>> -    rte_atomic32_set(&synchro, 1);
>> -    load_loop_fn(&lock);
>> -
>> -    rte_eal_mp_wait_lcore();
>> -
>> -    RTE_LCORE_FOREACH(i) {
>> -            printf("Core [%u] Cost Time = %"PRIu64" us\n",
>> -                            i, time_count[i]);
>> -            total += time_count[i];
>> -    }
>> -
>> -    printf("Total Cost Time = %"PRIu64" us\n", total);
>> -
>> -    return 0;
>> -}
>> -
>>  /*
>>   * Use rte_mcslock_trylock() to trylock a mcs lock object,
>>   * If it could not lock the object successfully, it would
>> @@ -240,10 +156,6 @@ test_mcslock(void)
>>              ret = -1;
>>      rte_mcslock_unlock(&p_ml, &ml_me);
>> 
>> -    /* mcs lock perf test */
>> -    if (test_mcslock_perf() < 0)
>> -            return -1;
>> -
>>      return ret;
>>  }
>> 
>> diff --git a/app/test/test_mcslock_perf.c b/app/test/test_mcslock_perf.c
>> new file mode 100644
>> index 0000000..6948344
>> --- /dev/null
>> +++ b/app/test/test_mcslock_perf.c
>> @@ -0,0 +1,121 @@
>> +/* SPDX-License-Identifier: BSD-3-Clause
>> + * Copyright(c) 2019 Arm Limited
>> + */
>> +
>> +#include <stdio.h>
>> +#include <stdint.h>
>> +#include <inttypes.h>
>> +#include <string.h>
>> +#include <unistd.h>
>> +#include <sys/queue.h>
>> +
>> +#include <rte_common.h>
>> +#include <rte_memory.h>
>> +#include <rte_per_lcore.h>
>> +#include <rte_launch.h>
>> +#include <rte_eal.h>
>> +#include <rte_lcore.h>
>> +#include <rte_cycles.h>
>> +#include <rte_mcslock.h>
>> +#include <rte_atomic.h>
>> +
>> +#include "test.h"
>> +
>> +/*
>> + * RTE MCS lock perf test
>> + * ======================
>> + *
>> + * These tests are derived from spin lock perf test cases.
>> + *
>> + * - A load test is carried out, with all cores attempting to lock a single
>> + *   lock multiple times.
>> + */
>> +
>> +RTE_DEFINE_PER_LCORE(rte_mcslock_t, _ml_perf_me);
>> +rte_mcslock_t *p_ml_perf;
>> +
>> +static rte_atomic32_t synchro;
>> +static uint64_t time_count[RTE_MAX_LCORE] = {0};
>> +
>> +#define MAX_LOOP 1000000
>> +
>> +static int
>> +load_loop_fn(void *func_param)
>> +{
>> +    uint64_t time_diff = 0, begin;
>> +    uint64_t hz = rte_get_timer_hz();
>> +    volatile uint64_t lcount = 0;
>> +    const int use_lock = *(int *)func_param;
>> +    const unsigned int lcore = rte_lcore_id();
>> +
>> +    /**< Per core me node. */
>> +    rte_mcslock_t ml_perf_me = RTE_PER_LCORE(_ml_perf_me);
>> +
>> +    /* wait synchro */
>> +    while (rte_atomic32_read(&synchro) == 0)
>> +            ;
>> +
>> +    begin = rte_get_timer_cycles();
>> +    while (lcount < MAX_LOOP) {
>> +            if (use_lock)
>> +                    rte_mcslock_lock(&p_ml_perf, &ml_perf_me);
>> +
>> +            lcount++;
>> +            if (use_lock)
>> +                    rte_mcslock_unlock(&p_ml_perf, &ml_perf_me);
>> +    }
>> +    time_diff = rte_get_timer_cycles() - begin;
>> +    time_count[lcore] = time_diff * 1000000 / hz;
>> +    return 0;
>> +}
>> +
>> +/*
>> + * Measure the lock/unlock loop cost with no lock, with the lock on
>> + * a single core, and with all cores contending on the same lock.
>> + */
>> +static int
>> +test_mcslock_perf(void)
>> +{
>> +    unsigned int i;
>> +    uint64_t total = 0;
>> +    int lock = 0;
>> +    const unsigned int lcore = rte_lcore_id();
>> +
>> +    printf("\nTest with no lock on single core...\n");
>> +    rte_atomic32_set(&synchro, 1);
>> +    load_loop_fn(&lock);
>> +    printf("Core [%u] Cost Time = %"PRIu64" us\n",
>> +                    lcore, time_count[lcore]);
>> +    memset(time_count, 0, sizeof(time_count));
>> +
>> +    printf("\nTest with lock on single core...\n");
>> +    lock = 1;
>> +    rte_atomic32_set(&synchro, 1);
>> +    load_loop_fn(&lock);
>> +    printf("Core [%u] Cost Time = %"PRIu64" us\n",
>> +                    lcore, time_count[lcore]);
>> +    memset(time_count, 0, sizeof(time_count));
>> +
>> +    printf("\nTest with lock on %u cores...\n", (rte_lcore_count()));
>> +
>> +    rte_atomic32_set(&synchro, 0);
>> +    rte_eal_mp_remote_launch(load_loop_fn, &lock, SKIP_MASTER);
>> +
>> +    /* start synchro and launch test on master */
>> +    rte_atomic32_set(&synchro, 1);
>> +    load_loop_fn(&lock);
>> +
>> +    rte_eal_mp_wait_lcore();
>> +
>> +    RTE_LCORE_FOREACH(i) {
>> +            printf("Core [%u] Cost Time = %"PRIu64" us\n",
>> +                            i, time_count[i]);
>> +            total += time_count[i];
>> +    }
>> +
>> +    printf("Total Cost Time = %"PRIu64" us\n", total);
>> +
>> +    return 0;
>> +}
>> +
>> +REGISTER_TEST_COMMAND(mcslock_perf_autotest, test_mcslock_perf);
>> --
>> 2.7.4
