> >>
> >> This is especially useful in cases where CPU cores are isolated, i.e.
> >> run dedicated tasks. In such cases one cannot use the standard perf
> >> utility without sacrificing latency and performance.
> >>
> >> Signed-off-by: Tomasz Duszynski <tduszyn...@marvell.com>
> >> Acked-by: Morten Brørup <m...@smartsharesystems.com>
> >> ---
> >>  MAINTAINERS                            |   5 +
> >>  app/test/meson.build                   |   2 +
> >>  app/test/test_pmu.c                    |  62 ++++
> >>  doc/api/doxy-api-index.md              |   3 +-
> >>  doc/api/doxy-api.conf.in               |   1 +
> >>  doc/guides/prog_guide/profile_app.rst  |  12 +
> >>  doc/guides/rel_notes/release_23_03.rst |   7 +
> >>  lib/meson.build                        |   1 +
> >>  lib/pmu/meson.build                    |  13 +
> >>  lib/pmu/pmu_private.h                  |  32 ++
> >>  lib/pmu/rte_pmu.c                      | 460 +++++++++++++++++++++++++
> >>  lib/pmu/rte_pmu.h                      | 212 ++++++++++++
> >>  lib/pmu/version.map                    |  15 +
> >>  13 files changed, 824 insertions(+), 1 deletion(-)
> >>  create mode 100644 app/test/test_pmu.c
> >>  create mode 100644 lib/pmu/meson.build
> >>  create mode 100644 lib/pmu/pmu_private.h
> >>  create mode 100644 lib/pmu/rte_pmu.c
> >>  create mode 100644 lib/pmu/rte_pmu.h
> >>  create mode 100644 lib/pmu/version.map
> >>
> >> diff --git a/MAINTAINERS b/MAINTAINERS
> >> index 3495946d0f..d37f242120 100644
> >> --- a/MAINTAINERS
> >> +++ b/MAINTAINERS
> >> @@ -1697,6 +1697,11 @@ M: Nithin Dabilpuram <ndabilpu...@marvell.com>
> >>  M: Pavan Nikhilesh <pbhagavat...@marvell.com>
> >>  F: lib/node/
> >>
> >> +PMU - EXPERIMENTAL
> >> +M: Tomasz Duszynski <tduszyn...@marvell.com>
> >> +F: lib/pmu/
> >> +F: app/test/test_pmu*
> >> +
> >>
> >>  Test Applications
> >>  -----------------
> >> diff --git a/app/test/meson.build b/app/test/meson.build
> >> index f34d19e3c3..6b61b7fc32 100644
> >> --- a/app/test/meson.build
> >> +++ b/app/test/meson.build
> >> @@ -111,6 +111,7 @@ test_sources = files(
> >>          'test_reciprocal_division_perf.c',
> >>          'test_red.c',
> >>          'test_pie.c',
> >> +        'test_pmu.c',
> >>          'test_reorder.c',
> >>          'test_rib.c',
> >>          'test_rib6.c',
> >> @@ -239,6 +240,7 @@ fast_tests = [
> >>          ['kni_autotest', false, true],
> >>          ['kvargs_autotest', true, true],
> >>          ['member_autotest', true, true],
> >> +        ['pmu_autotest', true, true],
> >>          ['power_cpufreq_autotest', false, true],
> >>          ['power_autotest', true, true],
> >>          ['power_kvm_vm_autotest', false, true],
> >> diff --git a/app/test/test_pmu.c b/app/test/test_pmu.c
> >> new file mode 100644
> >> index 0000000000..c257638e8b
> >> --- /dev/null
> >> +++ b/app/test/test_pmu.c
> >> @@ -0,0 +1,62 @@
> >> +/* SPDX-License-Identifier: BSD-3-Clause
> >> + * Copyright(C) 2023 Marvell International Ltd.
> >> + */
> >> +
> >> +#include "test.h"
> >> +
> >> +#ifndef RTE_EXEC_ENV_LINUX
> >> +
> >> +static int
> >> +test_pmu(void)
> >> +{
> >> +	printf("pmu_autotest only supported on Linux, skipping test\n");
> >> +	return TEST_SKIPPED;
> >> +}
> >> +
> >> +#else
> >> +
> >> +#include <rte_pmu.h>
> >> +
> >> +static int
> >> +test_pmu_read(void)
> >> +{
> >> +	const char *name = NULL;
> >> +	int tries = 10, event;
> >> +	uint64_t val = 0;
> >> +
> >> +	if (name == NULL) {
> >> +		printf("PMU not supported on this arch\n");
> >> +		return TEST_SKIPPED;
> >> +	}
> >> +
> >> +	if (rte_pmu_init() < 0)
> >> +		return TEST_SKIPPED;
> >> +
> >> +	event = rte_pmu_add_event(name);
> >> +	while (tries--)
> >> +		val += rte_pmu_read(event);
> >> +
> >> +	rte_pmu_fini();
> >> +
> >> +	return val ? TEST_SUCCESS : TEST_FAILED;
> >> +}
> >> +
> >> +static struct unit_test_suite pmu_tests = {
> >> +	.suite_name = "pmu autotest",
> >> +	.setup = NULL,
> >> +	.teardown = NULL,
> >> +	.unit_test_cases = {
> >> +		TEST_CASE(test_pmu_read),
> >> +		TEST_CASES_END()
> >> +	}
> >> +};
> >> +
> >> +static int
> >> +test_pmu(void)
> >> +{
> >> +	return unit_test_suite_runner(&pmu_tests);
> >> +}
> >> +
> >> +#endif /* RTE_EXEC_ENV_LINUX */
> >> +
> >> +REGISTER_TEST_COMMAND(pmu_autotest, test_pmu);
> >> diff --git a/doc/api/doxy-api-index.md b/doc/api/doxy-api-index.md
> >> index 2deec7ea19..a8e04a195d 100644
> >> --- a/doc/api/doxy-api-index.md
> >> +++ b/doc/api/doxy-api-index.md
> >> @@ -223,7 +223,8 @@ The public API headers are grouped by topics:
> >>    [log](@ref rte_log.h),
> >>    [errno](@ref rte_errno.h),
> >>    [trace](@ref rte_trace.h),
> >> -  [trace_point](@ref rte_trace_point.h)
> >> +  [trace_point](@ref rte_trace_point.h),
> >> +  [pmu](@ref rte_pmu.h)
> >>
> >>  - **misc**:
> >>    [EAL config](@ref rte_eal.h),
> >> diff --git a/doc/api/doxy-api.conf.in b/doc/api/doxy-api.conf.in
> >> index e859426099..350b5a8c94 100644
> >> --- a/doc/api/doxy-api.conf.in
> >> +++ b/doc/api/doxy-api.conf.in
> >> @@ -63,6 +63,7 @@ INPUT                   = @TOPDIR@/doc/api/doxy-api-index.md \
> >>                            @TOPDIR@/lib/pci \
> >>                            @TOPDIR@/lib/pdump \
> >>                            @TOPDIR@/lib/pipeline \
> >> +                          @TOPDIR@/lib/pmu \
> >>                            @TOPDIR@/lib/port \
> >>                            @TOPDIR@/lib/power \
> >>                            @TOPDIR@/lib/rawdev \
> >> diff --git a/doc/guides/prog_guide/profile_app.rst b/doc/guides/prog_guide/profile_app.rst
> >> index 14292d4c25..89e38cd301 100644
> >> --- a/doc/guides/prog_guide/profile_app.rst
> >> +++ b/doc/guides/prog_guide/profile_app.rst
> >> @@ -7,6 +7,18 @@ Profile Your Application
> >>  The following sections describe methods of profiling DPDK applications on
> >>  different architectures.
> >>
> >> +Performance counter based profiling
> >> +-----------------------------------
> >> +
> >> +Most architectures provide a performance monitoring unit (PMU).
> >> +Such a unit offers programmable counters that monitor specific events.
> >> +
> >> +Different tools can gather that information, perf being a prime example.
> >> +However, in scenarios where CPU cores are isolated and run
> >> +dedicated tasks, interrupting those tasks with perf may be undesirable.
> >> +
> >> +In such cases, an application can use the PMU library to read such events via ``rte_pmu_read()``.
> >> +
> >>
> >>  Profiling on x86
> >>  ----------------
> >> diff --git a/doc/guides/rel_notes/release_23_03.rst b/doc/guides/rel_notes/release_23_03.rst
> >> index ab998a5357..20622efe58 100644
> >> --- a/doc/guides/rel_notes/release_23_03.rst
> >> +++ b/doc/guides/rel_notes/release_23_03.rst
> >> @@ -147,6 +147,13 @@ New Features
> >>  * Added support to capture packets at each graph node with packet metadata and
> >>    node name.
> >>
> >> +* **Added PMU library.**
> >> +
> >> +  Added a new performance monitoring unit (PMU) library which allows
> >> +  applications to perform self-monitoring activities without depending on external utilities like perf.
> >> +  After integration with :doc:`../prog_guide/trace_lib`, data gathered
> >> +  from hardware counters can be stored in CTF format for further analysis.
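To double-check I read the docs above correctly: the intended end-to-end
usage is something like below, right? (untested sketch; "cpu_cycles" and
do_work() are placeholders, real event names come from
/sys/bus/event_source/devices/<pmu>/events)

	#include <inttypes.h>
	#include <stdio.h>

	#include <rte_pmu.h>

	static void
	do_work(void)
	{
		/* application specific work to be measured */
	}

	static int
	profile_work(void)
	{
		uint64_t before, after;
		int event;

		if (rte_pmu_init() < 0)
			return -1;

		/* name must match a file under the PMU events sysfs directory */
		event = rte_pmu_add_event("cpu_cycles");
		if (event < 0) {
			rte_pmu_fini();
			return event;
		}

		before = rte_pmu_read(event);
		do_work();
		after = rte_pmu_read(event);

		printf("event delta: %" PRIu64 "\n", after - before);

		rte_pmu_fini();

		return 0;
	}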
> >> + > >> > >> Removed Items > >> ------------- > >> diff --git a/lib/meson.build b/lib/meson.build index > >> 450c061d2b..8a42d45d20 100644 > >> --- a/lib/meson.build > >> +++ b/lib/meson.build > >> @@ -11,6 +11,7 @@ > >> libraries = [ > >> 'kvargs', # eal depends on kvargs > >> 'telemetry', # basic info querying > >> + 'pmu', > >> 'eal', # everything depends on eal > >> 'ring', > >> 'rcu', # rcu depends on ring diff --git > >> a/lib/pmu/meson.build b/lib/pmu/meson.build new file mode 100644 index > >> 0000000000..a4160b494e > >> --- /dev/null > >> +++ b/lib/pmu/meson.build > >> @@ -0,0 +1,13 @@ > >> +# SPDX-License-Identifier: BSD-3-Clause # Copyright(C) 2023 Marvell > >> +International Ltd. > >> + > >> +if not is_linux > >> + build = false > >> + reason = 'only supported on Linux' > >> + subdir_done() > >> +endif > >> + > >> +includes = [global_inc] > >> + > >> +sources = files('rte_pmu.c') > >> +headers = files('rte_pmu.h') > >> diff --git a/lib/pmu/pmu_private.h b/lib/pmu/pmu_private.h new file > >> mode 100644 index 0000000000..b9f8c1ddc8 > >> --- /dev/null > >> +++ b/lib/pmu/pmu_private.h > >> @@ -0,0 +1,32 @@ > >> +/* SPDX-License-Identifier: BSD-3-Clause > >> + * Copyright(c) 2023 Marvell > >> + */ > >> + > >> +#ifndef _PMU_PRIVATE_H_ > >> +#define _PMU_PRIVATE_H_ > >> + > >> +/** > >> + * Architecture specific PMU init callback. > >> + * > >> + * @return > >> + * 0 in case of success, negative value otherwise. > >> + */ > >> +int > >> +pmu_arch_init(void); > >> + > >> +/** > >> + * Architecture specific PMU cleanup callback. > >> + */ > >> +void > >> +pmu_arch_fini(void); > >> + > >> +/** > >> + * Apply architecture specific settings to config before passing it to > >> syscall. > >> + * > >> + * @param config > >> + * Architecture specific event configuration. Consult kernel sources > >> for available options. > >> + */ > >> +void > >> +pmu_arch_fixup_config(uint64_t config[3]); > >> + > >> +#endif /* _PMU_PRIVATE_H_ */ > >> diff --git a/lib/pmu/rte_pmu.c b/lib/pmu/rte_pmu.c new file mode > >> 100644 index 0000000000..950f999cb7 > >> --- /dev/null > >> +++ b/lib/pmu/rte_pmu.c > >> @@ -0,0 +1,460 @@ > >> +/* SPDX-License-Identifier: BSD-3-Clause > >> + * Copyright(C) 2023 Marvell International Ltd. > >> + */ > >> + > >> +#include <ctype.h> > >> +#include <dirent.h> > >> +#include <errno.h> > >> +#include <regex.h> > >> +#include <stdlib.h> > >> +#include <string.h> > >> +#include <sys/ioctl.h> > >> +#include <sys/mman.h> > >> +#include <sys/queue.h> > >> +#include <sys/syscall.h> > >> +#include <unistd.h> > >> + > >> +#include <rte_atomic.h> > >> +#include <rte_per_lcore.h> > >> +#include <rte_pmu.h> > >> +#include <rte_spinlock.h> > >> +#include <rte_tailq.h> > >> + > >> +#include "pmu_private.h" > >> + > >> +#define EVENT_SOURCE_DEVICES_PATH "/sys/bus/event_source/devices" > > > > > >I suppose that pass (as the whole implementation) is linux specific? > >If so, wouldn't it make sense to have it under linux subdir? > > > > There are not any plans to support that elsewhere currently so flat > directory structure is good enough. > > >> + > >> +#define GENMASK_ULL(h, l) ((~0ULL - (1ULL << (l)) + 1) & (~0ULL >> > >> +((64 - 1 - (h))))) #define FIELD_PREP(m, v) (((uint64_t)(v) << > >> +(__builtin_ffsll(m) - 1)) & (m)) > >> + > >> +RTE_DEFINE_PER_LCORE(struct rte_pmu_event_group, _event_group); > >> +struct rte_pmu rte_pmu; > > > >Do we really need struct declaration here? > > > > What’s the problem with this placement precisely?
Not a big deal, but it seems excessive to me. As I understand it, you already
have an include just above for the whole .h that contains the definition of
that struct anyway.

> >
> >
> >> +/*
> >> + * Following __rte_weak functions provide default no-ops. Architectures
> >> + * should override them if necessary.
> >> + */
> >> +
> >> +int
> >> +__rte_weak pmu_arch_init(void)
> >> +{
> >> +	return 0;
> >> +}
> >> +
> >> +void
> >> +__rte_weak pmu_arch_fini(void)
> >> +{
> >> +}
> >> +
> >> +void
> >> +__rte_weak pmu_arch_fixup_config(uint64_t __rte_unused config[3])
> >> +{
> >> +}
> >> +
> >> +static int
> >> +get_term_format(const char *name, int *num, uint64_t *mask)
> >> +{
> >> +	char path[PATH_MAX];
> >> +	char *config = NULL;
> >> +	int high, low, ret;
> >> +	FILE *fp;
> >> +
> >> +	*num = *mask = 0;
> >> +	snprintf(path, sizeof(path), EVENT_SOURCE_DEVICES_PATH "/%s/format/%s", rte_pmu.name, name);
> >> +	fp = fopen(path, "r");
> >> +	if (fp == NULL)
> >> +		return -errno;
> >> +
> >> +	errno = 0;
> >> +	ret = fscanf(fp, "%m[^:]:%d-%d", &config, &low, &high);
> >> +	if (ret < 2) {
> >> +		ret = -ENODATA;
> >> +		goto out;
> >> +	}
> >> +	if (errno) {
> >> +		ret = -errno;
> >> +		goto out;
> >> +	}
> >> +
> >> +	if (ret == 2)
> >> +		high = low;
> >> +
> >> +	*mask = GENMASK_ULL(high, low);
> >> +	/* Last digit should be [012]. If last digit is missing 0 is implied. */
> >> +	*num = config[strlen(config) - 1];
> >> +	*num = isdigit(*num) ? *num - '0' : 0;
> >> +
> >> +	ret = 0;
> >> +out:
> >> +	free(config);
> >> +	fclose(fp);
> >> +
> >> +	return ret;
> >> +}
> >> +
> >> +static int
> >> +parse_event(char *buf, uint64_t config[3])
> >> +{
> >> +	char *token, *term;
> >> +	int num, ret, val;
> >> +	uint64_t mask;
> >> +
> >> +	config[0] = config[1] = config[2] = 0;
> >> +
> >> +	token = strtok(buf, ",");
> >> +	while (token) {
> >> +		errno = 0;
> >> +		/* <term>=<value> */
> >> +		ret = sscanf(token, "%m[^=]=%i", &term, &val);
> >> +		if (ret < 1)
> >> +			return -ENODATA;
> >> +		if (errno)
> >> +			return -errno;
> >> +		if (ret == 1)
> >> +			val = 1;
> >> +
> >> +		ret = get_term_format(term, &num, &mask);
> >> +		free(term);
> >> +		if (ret)
> >> +			return ret;
> >> +
> >> +		config[num] |= FIELD_PREP(mask, val);
> >> +		token = strtok(NULL, ",");
> >> +	}
> >> +
> >> +	return 0;
> >> +}
> >> +
> >> +static int
> >> +get_event_config(const char *name, uint64_t config[3])
> >> +{
> >> +	char path[PATH_MAX], buf[BUFSIZ];
> >> +	FILE *fp;
> >> +	int ret;
> >> +
> >> +	snprintf(path, sizeof(path), EVENT_SOURCE_DEVICES_PATH "/%s/events/%s", rte_pmu.name, name);
> >> +	fp = fopen(path, "r");
> >> +	if (fp == NULL)
> >> +		return -errno;
> >> +
> >> +	ret = fread(buf, 1, sizeof(buf), fp);
> >> +	if (ret == 0) {
> >> +		fclose(fp);
> >> +
> >> +		return -EINVAL;
> >> +	}
> >> +	fclose(fp);
> >> +	buf[ret] = '\0';
> >> +
> >> +	return parse_event(buf, config);
> >> +}
> >> +
> >> +static int
> >> +do_perf_event_open(uint64_t config[3], int group_fd)
> >> +{
> >> +	struct perf_event_attr attr = {
> >> +		.size = sizeof(struct perf_event_attr),
> >> +		.type = PERF_TYPE_RAW,
> >> +		.exclude_kernel = 1,
> >> +		.exclude_hv = 1,
> >> +		.disabled = 1,
> >> +	};
> >> +
> >> +	pmu_arch_fixup_config(config);
> >> +
> >> +	attr.config = config[0];
> >> +	attr.config1 = config[1];
> >> +	attr.config2 = config[2];
> >> +
> >> +	return syscall(SYS_perf_event_open, &attr, 0, -1, group_fd, 0);
> >> +}
> >> +
> >> +static int
> >> +open_events(struct rte_pmu_event_group *group)
> >> +{
> >> +	struct rte_pmu_event *event;
> >> +	uint64_t config[3];
> >> +	int num = 0, ret;
> >> +
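A side note for other readers before the group setup below: fds[0] = -1
follows the perf_event_open(2) grouping convention, i.e. the first event is
opened with group_fd = -1 and its fd becomes the group leader, while each
subsequent event passes the leader's fd. Roughly (illustration only, mirrors
what do_perf_event_open() above does):

	#include <linux/perf_event.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	static void
	open_group_pair(struct perf_event_attr *attr)
	{
		/* group_fd = -1 creates a new group, fd becomes the leader */
		int leader = syscall(SYS_perf_event_open, attr, 0, -1, -1, 0);
		/* subsequent events pass the leader fd and join its group */
		int member = syscall(SYS_perf_event_open, attr, 0, -1, leader, 0);

		(void)member;
	}

This is also why PERF_EVENT_IOC_ENABLE with PERF_IOC_FLAG_GROUP on the
leader later controls all group members at once.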
> >> + /* group leader gets created first, with fd = -1 */ > >> + group->fds[0] = -1; > >> + > >> + TAILQ_FOREACH(event, &rte_pmu.event_list, next) { > >> + ret = get_event_config(event->name, config); > >> + if (ret) > >> + continue; > >> + > >> + ret = do_perf_event_open(config, group->fds[0]); > >> + if (ret == -1) { > >> + ret = -errno; > >> + goto out; > >> + } > >> + > >> + group->fds[event->index] = ret; > >> + num++; > >> + } > >> + > >> + return 0; > >> +out: > >> + for (--num; num >= 0; num--) { > >> + close(group->fds[num]); > >> + group->fds[num] = -1; > >> + } > >> + > >> + > >> + return ret; > >> +} > >> + > >> +static int > >> +mmap_events(struct rte_pmu_event_group *group) { > >> + long page_size = sysconf(_SC_PAGE_SIZE); > >> + unsigned int i; > >> + void *addr; > >> + int ret; > >> + > >> + for (i = 0; i < rte_pmu.num_group_events; i++) { > >> + addr = mmap(0, page_size, PROT_READ, MAP_SHARED, group->fds[i], > >> 0); > >> + if (addr == MAP_FAILED) { > >> + ret = -errno; > >> + goto out; > >> + } > >> + > >> + group->mmap_pages[i] = addr; > >> + if (!group->mmap_pages[i]->cap_user_rdpmc) { > >> + ret = -EPERM; > >> + goto out; > >> + } > >> + } > >> + > >> + return 0; > >> +out: > >> + for (; i; i--) { > >> + munmap(group->mmap_pages[i - 1], page_size); > >> + group->mmap_pages[i - 1] = NULL; > >> + } > >> + > >> + return ret; > >> +} > >> + > >> +static void > >> +cleanup_events(struct rte_pmu_event_group *group) { > >> + unsigned int i; > >> + > >> + if (group->fds[0] != -1) > >> + ioctl(group->fds[0], PERF_EVENT_IOC_DISABLE, > >> PERF_IOC_FLAG_GROUP); > >> + > >> + for (i = 0; i < rte_pmu.num_group_events; i++) { > >> + if (group->mmap_pages[i]) { > >> + munmap(group->mmap_pages[i], sysconf(_SC_PAGE_SIZE)); > >> + group->mmap_pages[i] = NULL; > >> + } > >> + > >> + if (group->fds[i] != -1) { > >> + close(group->fds[i]); > >> + group->fds[i] = -1; > >> + } > >> + } > >> + > >> + group->enabled = false; > >> +} > >> + > >> +int > >> +__rte_pmu_enable_group(void) > >> +{ > >> + struct rte_pmu_event_group *group = &RTE_PER_LCORE(_event_group); > >> + int ret; > >> + > >> + if (rte_pmu.num_group_events == 0) > >> + return -ENODEV; > >> + > >> + ret = open_events(group); > >> + if (ret) > >> + goto out; > >> + > >> + ret = mmap_events(group); > >> + if (ret) > >> + goto out; > >> + > >> + if (ioctl(group->fds[0], PERF_EVENT_IOC_RESET, PERF_IOC_FLAG_GROUP) == > >> -1) { > >> + ret = -errno; > >> + goto out; > >> + } > >> + > >> + if (ioctl(group->fds[0], PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP) == > >> -1) { > >> + ret = -errno; > >> + goto out; > >> + } > >> + > >> + rte_spinlock_lock(&rte_pmu.lock); > >> + TAILQ_INSERT_TAIL(&rte_pmu.event_group_list, group, next); > >> + rte_spinlock_unlock(&rte_pmu.lock); > >> + group->enabled = true; > >> + > >> + return 0; > >> + > >> +out: > >> + cleanup_events(group); > >> + > >> + return ret; > >> +} > >> + > >> +static int > >> +scan_pmus(void) > >> +{ > >> + char path[PATH_MAX]; > >> + struct dirent *dent; > >> + const char *name; > >> + DIR *dirp; > >> + > >> + dirp = opendir(EVENT_SOURCE_DEVICES_PATH); > >> + if (dirp == NULL) > >> + return -errno; > >> + > >> + while ((dent = readdir(dirp))) { > >> + name = dent->d_name; > >> + if (name[0] == '.') > >> + continue; > >> + > >> + /* sysfs entry should either contain cpus or be a cpu */ > >> + if (!strcmp(name, "cpu")) > >> + break; > >> + > >> + snprintf(path, sizeof(path), EVENT_SOURCE_DEVICES_PATH > >> "/%s/cpus", name); > >> + if (access(path, F_OK) == 0) > >> + break; > >> + 
} > >> + > >> + if (dent) { > >> + rte_pmu.name = strdup(name); > >> + if (rte_pmu.name == NULL) { > >> + closedir(dirp); > >> + > >> + return -ENOMEM; > >> + } > >> + } > >> + > >> + closedir(dirp); > >> + > >> + return rte_pmu.name ? 0 : -ENODEV; > >> +} > >> + > >> +static struct rte_pmu_event * > >> +new_event(const char *name) > >> +{ > >> + struct rte_pmu_event *event; > >> + > >> + event = calloc(1, sizeof(*event)); > >> + if (event == NULL) > >> + goto out; > >> + > >> + event->name = strdup(name); > >> + if (event->name == NULL) { > >> + free(event); > >> + event = NULL; > >> + } > >> + > >> +out: > >> + return event; > >> +} > >> + > >> +static void > >> +free_event(struct rte_pmu_event *event) { > >> + free(event->name); > >> + free(event); > >> +} > >> + > >> +int > >> +rte_pmu_add_event(const char *name) > >> +{ > >> + struct rte_pmu_event *event; > >> + char path[PATH_MAX]; > >> + > >> + if (rte_pmu.name == NULL) > >> + return -ENODEV; > >> + > >> + if (rte_pmu.num_group_events + 1 >= MAX_NUM_GROUP_EVENTS) > >> + return -ENOSPC; > >> + > >> + snprintf(path, sizeof(path), EVENT_SOURCE_DEVICES_PATH "/%s/events/%s", > >> rte_pmu.name, name); > >> + if (access(path, R_OK)) > >> + return -ENODEV; > >> + > >> + TAILQ_FOREACH(event, &rte_pmu.event_list, next) { > >> + if (!strcmp(event->name, name)) > >> + return event->index; > >> + continue; > >> + } > >> + > >> + event = new_event(name); > >> + if (event == NULL) > >> + return -ENOMEM; > >> + > >> + event->index = rte_pmu.num_group_events++; > >> + TAILQ_INSERT_TAIL(&rte_pmu.event_list, event, next); > >> + > >> + return event->index; > >> +} > >> + > >> +int > >> +rte_pmu_init(void) > >> +{ > >> + int ret; > >> + > >> + /* Allow calling init from multiple contexts within a single thread. > >> This simplifies > >> + * resource management a bit e.g in case fast-path tracepoint has > >> already been enabled > >> + * via command line but application doesn't care enough and performs > >> init/fini again. 
> >> + */ > >> + if (rte_pmu.initialized != 0) { > >> + rte_pmu.initialized++; > >> + return 0; > >> + } > >> + > >> + ret = scan_pmus(); > >> + if (ret) > >> + goto out; > >> + > >> + ret = pmu_arch_init(); > >> + if (ret) > >> + goto out; > >> + > >> + TAILQ_INIT(&rte_pmu.event_list); > >> + TAILQ_INIT(&rte_pmu.event_group_list); > >> + rte_spinlock_init(&rte_pmu.lock); > >> + rte_pmu.initialized = 1; > >> + > >> + return 0; > >> +out: > >> + free(rte_pmu.name); > >> + rte_pmu.name = NULL; > >> + > >> + return ret; > >> +} > >> + > >> +void > >> +rte_pmu_fini(void) > >> +{ > >> + struct rte_pmu_event_group *group, *tmp_group; > >> + struct rte_pmu_event *event, *tmp_event; > >> + > >> + /* cleanup once init count drops to zero */ > >> + if (rte_pmu.initialized == 0 || --rte_pmu.initialized != 0) > >> + return; > >> + > >> + RTE_TAILQ_FOREACH_SAFE(event, &rte_pmu.event_list, next, tmp_event) { > >> + TAILQ_REMOVE(&rte_pmu.event_list, event, next); > >> + free_event(event); > >> + } > >> + > >> + RTE_TAILQ_FOREACH_SAFE(group, &rte_pmu.event_group_list, next, > >> tmp_group) { > >> + TAILQ_REMOVE(&rte_pmu.event_group_list, group, next); > >> + cleanup_events(group); > >> + } > >> + > >> + pmu_arch_fini(); > >> + free(rte_pmu.name); > >> + rte_pmu.name = NULL; > >> + rte_pmu.num_group_events = 0; > >> +} > >> diff --git a/lib/pmu/rte_pmu.h b/lib/pmu/rte_pmu.h new file mode > >> 100644 index 0000000000..6b664c3336 > >> --- /dev/null > >> +++ b/lib/pmu/rte_pmu.h > >> @@ -0,0 +1,212 @@ > >> +/* SPDX-License-Identifier: BSD-3-Clause > >> + * Copyright(c) 2023 Marvell > >> + */ > >> + > >> +#ifndef _RTE_PMU_H_ > >> +#define _RTE_PMU_H_ > >> + > >> +/** > >> + * @file > >> + * > >> + * PMU event tracing operations > >> + * > >> + * This file defines generic API and types necessary to setup PMU and > >> + * read selected counters in runtime. > >> + */ > >> + > >> +#ifdef __cplusplus > >> +extern "C" { > >> +#endif > >> + > >> +#include <linux/perf_event.h> > >> + > >> +#include <rte_atomic.h> > >> +#include <rte_branch_prediction.h> > >> +#include <rte_common.h> > >> +#include <rte_compat.h> > >> +#include <rte_spinlock.h> > >> + > >> +/** Maximum number of events in a group */ #define > >> +MAX_NUM_GROUP_EVENTS 8 > >> + > >> +/** > >> + * A structure describing a group of events. > >> + */ > >> +struct rte_pmu_event_group { > >> + struct perf_event_mmap_page *mmap_pages[MAX_NUM_GROUP_EVENTS]; /**< > >> array of user pages */ > >> + int fds[MAX_NUM_GROUP_EVENTS]; /**< array of event descriptors */ > >> + bool enabled; /**< true if group was enabled on particular lcore */ > >> + TAILQ_ENTRY(rte_pmu_event_group) next; /**< list entry */ } > >> +__rte_cache_aligned; > >> + > >> +/** > >> + * A structure describing an event. > >> + */ > >> +struct rte_pmu_event { > >> + char *name; /**< name of an event */ > >> + unsigned int index; /**< event index into fds/mmap_pages */ > >> + TAILQ_ENTRY(rte_pmu_event) next; /**< list entry */ }; > >> + > >> +/** > >> + * A PMU state container. 
> >> + */ > >> +struct rte_pmu { > >> + char *name; /**< name of core PMU listed under > >> /sys/bus/event_source/devices */ > >> + rte_spinlock_t lock; /**< serialize access to event group list */ > >> + TAILQ_HEAD(, rte_pmu_event_group) event_group_list; /**< list of event > >> groups */ > >> + unsigned int num_group_events; /**< number of events in a group */ > >> + TAILQ_HEAD(, rte_pmu_event) event_list; /**< list of matching events */ > >> + unsigned int initialized; /**< initialization counter */ }; > >> + > >> +/** lcore event group */ > >> +RTE_DECLARE_PER_LCORE(struct rte_pmu_event_group, _event_group); > >> + > >> +/** PMU state container */ > >> +extern struct rte_pmu rte_pmu; > >> + > >> +/** Each architecture supporting PMU needs to provide its own version > >> +*/ #ifndef rte_pmu_pmc_read #define rte_pmu_pmc_read(index) ({ 0; }) > >> +#endif > >> + > >> +/** > >> + * @warning > >> + * @b EXPERIMENTAL: this API may change without prior notice > >> + * > >> + * Read PMU counter. > >> + * > >> + * @warning This should be not called directly. > >> + * > >> + * @param pc > >> + * Pointer to the mmapped user page. > >> + * @return > >> + * Counter value read from hardware. > >> + */ > >> +static __rte_always_inline uint64_t > >> +__rte_pmu_read_userpage(struct perf_event_mmap_page *pc) { > >> + uint64_t width, offset; > >> + uint32_t seq, index; > >> + int64_t pmc; > >> + > >> + for (;;) { > >> + seq = pc->lock; > >> + rte_compiler_barrier(); > > > >Are you sure that compiler_barrier() is enough here? > >On some archs CPU itself has freedom to re-order reads. > >Or I am missing something obvious here? > > > > It's a matter of not keeping old stuff cached in registers > and making sure that we have two reads of lock. CPU reordering > won't do any harm here. Sorry, I didn't get you here: Suppose CPU will re-order reads and will read lock *after* index or offset value. Wouldn't it mean that in that case index and/or offset can contain old/invalid values? > > >> + index = pc->index; > >> + offset = pc->offset; > >> + width = pc->pmc_width; > >> + > >> + /* index set to 0 means that particular counter cannot be used > >> */ > >> + if (likely(pc->cap_user_rdpmc && index)) { > >> + pmc = rte_pmu_pmc_read(index - 1); > >> + pmc <<= 64 - width; > >> + pmc >>= 64 - width; > >> + offset += pmc; > >> + } > >> + > >> + rte_compiler_barrier(); > >> + > >> + if (likely(pc->lock == seq)) > >> + return offset; > >> + } > >> + > >> + return 0; > >> +} > >> + > >> +/** > >> + * @warning > >> + * @b EXPERIMENTAL: this API may change without prior notice > >> + * > >> + * Enable group of events on the calling lcore. > >> + * > >> + * @warning This should be not called directly. > >> + * > >> + * @return > >> + * 0 in case of success, negative value otherwise. > >> + */ > >> +__rte_experimental > >> +int > >> +__rte_pmu_enable_group(void); > >> + > >> +/** > >> + * @warning > >> + * @b EXPERIMENTAL: this API may change without prior notice > >> + * > >> + * Initialize PMU library. > >> + * > >> + * @warning This should be not called directly. > >> + * > >> + * @return > >> + * 0 in case of success, negative value otherwise. > >> + */ > >> +__rte_experimental > >> +int > >> +rte_pmu_init(void); > >> + > >> +/** > >> + * @warning > >> + * @b EXPERIMENTAL: this API may change without prior notice > >> + * > >> + * Finalize PMU library. This should be called after PMU counters are no > >> longer being read. 
> >> + */ > >> +__rte_experimental > >> +void > >> +rte_pmu_fini(void); > >> + > >> +/** > >> + * @warning > >> + * @b EXPERIMENTAL: this API may change without prior notice > >> + * > >> + * Add event to the group of enabled events. > >> + * > >> + * @param name > >> + * Name of an event listed under > >> /sys/bus/event_source/devices/pmu/events. > >> + * @return > >> + * Event index in case of success, negative value otherwise. > >> + */ > >> +__rte_experimental > >> +int > >> +rte_pmu_add_event(const char *name); > >> + > >> +/** > >> + * @warning > >> + * @b EXPERIMENTAL: this API may change without prior notice > >> + * > >> + * Read hardware counter configured to count occurrences of an event. > >> + * > >> + * @param index > >> + * Index of an event to be read. > >> + * @return > >> + * Event value read from register. In case of errors or lack of support > >> + * 0 is returned. In other words, stream of zeros in a trace file > >> + * indicates problem with reading particular PMU event register. > >> + */ > >> +__rte_experimental > >> +static __rte_always_inline uint64_t > >> +rte_pmu_read(unsigned int index) > >> +{ > >> + struct rte_pmu_event_group *group = &RTE_PER_LCORE(_event_group); > >> + int ret; > >> + > >> + if (unlikely(!rte_pmu.initialized)) > >> + return 0; > >> + > >> + if (unlikely(!group->enabled)) { > >> + ret = __rte_pmu_enable_group(); > >> + if (ret) > >> + return 0; > >> + } > >> + > >> + if (unlikely(index >= rte_pmu.num_group_events)) > >> + return 0; > >> + > >> + return __rte_pmu_read_userpage(group->mmap_pages[index]); > >> +} > >> + > >> +#ifdef __cplusplus > >> +} > >> +#endif > >> + > >> +#endif /* _RTE_PMU_H_ */ > >> diff --git a/lib/pmu/version.map b/lib/pmu/version.map new file mode > >> 100644 index 0000000000..39a4f279c1 > >> --- /dev/null > >> +++ b/lib/pmu/version.map > >> @@ -0,0 +1,15 @@ > >> +DPDK_23 { > >> + local: *; > >> +}; > >> + > >> +EXPERIMENTAL { > >> + global: > >> + > >> + __rte_pmu_enable_group; > >> + per_lcore__event_group; > >> + rte_pmu; > >> + rte_pmu_add_event; > >> + rte_pmu_fini; > >> + rte_pmu_init; > >> + rte_pmu_read; > >> +};
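Coming back to the read ordering question above, to put my concern in code:
on weakly ordered CPUs I would expect the usual seqcount read pattern with
explicit read barriers, something along these lines (sketch only; rdpmc and
width handling as in __rte_pmu_read_userpage()):

	do {
		seq = pc->lock;
		rte_smp_rmb();	/* lock must be read before index/offset */
		index = pc->index;
		offset = pc->offset;
		/* ... read the counter and adjust offset here ... */
		rte_smp_rmb();	/* data reads must finish before re-reading lock */
	} while (pc->lock != seq);

On x86 rte_smp_rmb() is just a compiler barrier anyway, so the question is
mainly about armv8 and other weakly ordered machines.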