> Add support for programming PMU counters and reading their values
> in runtime bypassing kernel completely.
> 
> This is especially useful in cases where CPU cores are isolated
> i.e run dedicated tasks. In such cases one cannot use standard
> perf utility without sacrificing latency and performance.

LGTM in general, just a few questions and nits - the majority about docs/comments.

> Signed-off-by: Tomasz Duszynski <tduszyn...@marvell.com>
> ---
>  MAINTAINERS                            |   5 +
>  app/test/meson.build                   |   1 +
>  app/test/test_pmu.c                    |  49 +++
>  doc/api/doxy-api-index.md              |   3 +-
>  doc/api/doxy-api.conf.in               |   1 +
>  doc/guides/prog_guide/profile_app.rst  |  29 ++
>  doc/guides/rel_notes/release_24_11.rst |   7 +
>  lib/eal/meson.build                    |   3 +
>  lib/meson.build                        |   1 +
>  lib/pmu/meson.build                    |  13 +
>  lib/pmu/pmu_private.h                  |  32 ++
>  lib/pmu/rte_pmu.c                      | 473 +++++++++++++++++++++++++
>  lib/pmu/rte_pmu.h                      | 205 +++++++++++
>  lib/pmu/version.map                    |  13 +
>  14 files changed, 834 insertions(+), 1 deletion(-)
>  create mode 100644 app/test/test_pmu.c
>  create mode 100644 lib/pmu/meson.build
>  create mode 100644 lib/pmu/pmu_private.h
>  create mode 100644 lib/pmu/rte_pmu.c
>  create mode 100644 lib/pmu/rte_pmu.h
>  create mode 100644 lib/pmu/version.map
> 
> diff --git a/MAINTAINERS b/MAINTAINERS
> index cd78bc7db1..077efe41cf 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -1816,6 +1816,11 @@ M: Nithin Dabilpuram <ndabilpu...@marvell.com>
>  M: Pavan Nikhilesh <pbhagavat...@marvell.com>
>  F: lib/node/
> 
> +PMU - EXPERIMENTAL
> +M: Tomasz Duszynski <tduszyn...@marvell.com>
> +F: lib/pmu/
> +F: app/test/test_pmu*
> +
> 
>  Test Applications
>  -----------------
> diff --git a/app/test/meson.build b/app/test/meson.build
> index 0f7e11969a..5f1622ecab 100644
> --- a/app/test/meson.build
> +++ b/app/test/meson.build
> @@ -141,6 +141,7 @@ source_file_deps = {
>      'test_pmd_perf.c': ['ethdev', 'net'] + packet_burst_generator_deps,
>      'test_pmd_ring.c': ['net_ring', 'ethdev', 'bus_vdev'],
>      'test_pmd_ring_perf.c': ['ethdev', 'net_ring', 'bus_vdev'],
> +    'test_pmu.c': ['pmu'],
>      'test_power.c': ['power'],
>      'test_power_cpufreq.c': ['power'],
>      'test_power_intel_uncore.c': ['power'],
> diff --git a/app/test/test_pmu.c b/app/test/test_pmu.c
> new file mode 100644
> index 0000000000..464e5068ec
> --- /dev/null
> +++ b/app/test/test_pmu.c
> @@ -0,0 +1,49 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(C) 2024 Marvell International Ltd.
> + */
> +
> +#include <rte_pmu.h>
> +
> +#include "test.h"
> +
> +static int
> +test_pmu_read(void)
> +{
> +     const char *name = NULL;
> +     int tries = 10, event;
> +     uint64_t val = 0;
> +
> +     if (name == NULL) {
> +             printf("PMU not supported on this arch\n");
> +             return TEST_SKIPPED;
> +     }
> +
> +     if (rte_pmu_init() < 0)
> +             return TEST_FAILED;
> +
> +     event = rte_pmu_add_event(name);
> +     while (tries--)
> +             val += rte_pmu_read(event);
> +
> +     rte_pmu_fini();
> +
> +     return val ? TEST_SUCCESS : TEST_FAILED;
> +}
> +
> +static struct unit_test_suite pmu_tests = {
> +     .suite_name = "pmu autotest",
> +     .setup = NULL,
> +     .teardown = NULL,
> +     .unit_test_cases = {
> +             TEST_CASE(test_pmu_read),
> +             TEST_CASES_END()
> +     }
> +};
> +
> +static int
> +test_pmu(void)
> +{
> +     return unit_test_suite_runner(&pmu_tests);
> +}
> +
> +REGISTER_FAST_TEST(pmu_autotest, true, true, test_pmu);
> diff --git a/doc/api/doxy-api-index.md b/doc/api/doxy-api-index.md
> index 266c8b90dc..3e6020f367 100644
> --- a/doc/api/doxy-api-index.md
> +++ b/doc/api/doxy-api-index.md
> @@ -240,7 +240,8 @@ The public API headers are grouped by topics:
>    [log](@ref rte_log.h),
>    [errno](@ref rte_errno.h),
>    [trace](@ref rte_trace.h),
> -  [trace_point](@ref rte_trace_point.h)
> +  [trace_point](@ref rte_trace_point.h),
> +  [pmu](@ref rte_pmu.h)
> 
>  - **misc**:
>    [EAL config](@ref rte_eal.h),
> diff --git a/doc/api/doxy-api.conf.in b/doc/api/doxy-api.conf.in
> index c94f02d411..f4b382e073 100644
> --- a/doc/api/doxy-api.conf.in
> +++ b/doc/api/doxy-api.conf.in
> @@ -69,6 +69,7 @@ INPUT                   = 
> @TOPDIR@/doc/api/doxy-api-index.md \
>                            @TOPDIR@/lib/pdcp \
>                            @TOPDIR@/lib/pdump \
>                            @TOPDIR@/lib/pipeline \
> +                          @TOPDIR@/lib/pmu \
>                            @TOPDIR@/lib/port \
>                            @TOPDIR@/lib/power \
>                            @TOPDIR@/lib/ptr_compress \
> diff --git a/doc/guides/prog_guide/profile_app.rst 
> b/doc/guides/prog_guide/profile_app.rst
> index a6b5fb4d5e..854c515a61 100644
> --- a/doc/guides/prog_guide/profile_app.rst
> +++ b/doc/guides/prog_guide/profile_app.rst
> @@ -7,6 +7,35 @@ Profile Your Application
>  The following sections describe methods of profiling DPDK applications on
>  different architectures.
> 
> +Performance counter based profiling
> +-----------------------------------
> +
> +Majority of architectures support some performance monitoring unit (PMU).
> +Such unit provides programmable counters that monitor specific events.
> +
> +Different tools gather that information, like for example perf.
> +However, in some scenarios when CPU cores are isolated and run
> +dedicated tasks interrupting those tasks with perf may be undesirable.
> +
> +In such cases, an application can use the PMU library to read such events 
> via ``rte_pmu_read()``.
> +
> +By default, userspace applications are not allowed to access PMU internals. 
> That can be changed
> +by setting ``/sys/kernel/perf_event_paranoid`` to 2 (that should be a 
> default value anyway) and
> +adding ``CAP_PERFMON`` capability to DPDK application. Please refer to
> +``Documentation/admin-guide/perf-security.rst`` under Linux sources for more 
> information. Fairly
> +recent kernel, i.e >= 5.9, is advised too.
> +
> +As of now implementation imposes certain limitations:
> +
> +* Management APIs that normally return a non-negative value will return 
> error (``-ENOTSUP``) while
> +  ``rte_pmu_read()`` will return ``UINT64_MAX`` if running under unsupported 
> operating system.
> +
> +* Only EAL lcores are supported
> +
> +* EAL lcores must not share a cpu
> +
> +* Each EAL lcore measures same group of events
> +
> 
>  Profiling on x86
>  ----------------
> diff --git a/doc/guides/rel_notes/release_24_11.rst 
> b/doc/guides/rel_notes/release_24_11.rst
> index fa4822d928..d34ecb55e0 100644
> --- a/doc/guides/rel_notes/release_24_11.rst
> +++ b/doc/guides/rel_notes/release_24_11.rst
> @@ -247,6 +247,13 @@ New Features
>    Added ability for node to advertise and update multiple xstat counters,
>    that can be retrieved using ``rte_graph_cluster_stats_get``.
> 
> +* **Added PMU library.**
> +
> +  Added a new performance monitoring unit (PMU) library which allows 
> applications
> +  to perform self monitoring activities without depending on external 
> utilities like perf.
> +  After integration with :doc:`../prog_guide/trace_lib` data gathered from 
> hardware counters
> +  can be stored in CTF format for further analysis.
> +
> 
>  Removed Items
>  -------------
> diff --git a/lib/eal/meson.build b/lib/eal/meson.build
> index e1d6c4cf17..1349624653 100644
> --- a/lib/eal/meson.build
> +++ b/lib/eal/meson.build
> @@ -18,6 +18,9 @@ deps += ['log', 'kvargs']
>  if not is_windows
>      deps += ['telemetry']
>  endif
> +if dpdk_conf.has('RTE_LIB_PMU')
> +    deps += ['pmu']
> +endif
>  if dpdk_conf.has('RTE_USE_LIBBSD')
>      ext_deps += libbsd
>  endif
> diff --git a/lib/meson.build b/lib/meson.build
> index ce92cb5537..968ad29e8d 100644
> --- a/lib/meson.build
> +++ b/lib/meson.build
> @@ -13,6 +13,7 @@ libraries = [
>          'kvargs', # eal depends on kvargs
>          'argparse',
>          'telemetry', # basic info querying
> +        'pmu',
>          'eal', # everything depends on eal
>          'ptr_compress',
>          'ring',
> diff --git a/lib/pmu/meson.build b/lib/pmu/meson.build
> new file mode 100644
> index 0000000000..f173b6f55c
> --- /dev/null
> +++ b/lib/pmu/meson.build
> @@ -0,0 +1,13 @@
> +# SPDX-License-Identifier: BSD-3-Clause
> +# Copyright(C) 2024 Marvell International Ltd.
> +
> +if not is_linux
> +    build = false
> +    reason = 'only supported on Linux'
> +    subdir_done()
> +endif
> +
> +headers = files('rte_pmu.h')
> +sources = files('rte_pmu.c')
> +
> +deps += ['log']
> diff --git a/lib/pmu/pmu_private.h b/lib/pmu/pmu_private.h
> new file mode 100644
> index 0000000000..d2b15615bf
> --- /dev/null
> +++ b/lib/pmu/pmu_private.h
> @@ -0,0 +1,32 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(c) 2024 Marvell
> + */
> +
> +#ifndef _PMU_PRIVATE_H_
> +#define _PMU_PRIVATE_H_
> +
> +/**
> + * Architecture specific PMU init callback.
> + *
> + * @return
> + *   0 in case of success, negative value otherwise.
> + */
> +int
> +pmu_arch_init(void);
> +
> +/**
> + * Architecture specific PMU cleanup callback.
> + */
> +void
> +pmu_arch_fini(void);
> +
> +/**
> + * Apply architecture specific settings to config before passing it to 
> syscall.
> + *
> + * @param config
> + *   Architecture specific event configuration. Consult kernel sources for 
> available options.
> + */
> +void
> +pmu_arch_fixup_config(uint64_t config[3]);
> +
> +#endif /* _PMU_PRIVATE_H_ */
> diff --git a/lib/pmu/rte_pmu.c b/lib/pmu/rte_pmu.c
> new file mode 100644
> index 0000000000..dd57961627
> --- /dev/null
> +++ b/lib/pmu/rte_pmu.c
> @@ -0,0 +1,473 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(C) 2024 Marvell International Ltd.
> + */
> +
> +#include <ctype.h>
> +#include <dirent.h>
> +#include <errno.h>
> +#include <regex.h>
> +#include <stdlib.h>
> +#include <string.h>
> +#include <sys/ioctl.h>
> +#include <sys/mman.h>
> +#include <sys/queue.h>
> +#include <sys/syscall.h>
> +#include <unistd.h>
> +
> +#include <rte_atomic.h>
> +#include <rte_bitops.h>
> +#include <rte_lcore.h>
> +#include <rte_log.h>
> +#include <rte_per_lcore.h>
> +#include <rte_pmu.h>
> +#include <rte_tailq.h>
> +
> +#include "pmu_private.h"
> +
> +#define EVENT_SOURCE_DEVICES_PATH "/sys/bus/event_source/devices"
> +
> +#define FIELD_PREP(m, v) (((uint64_t)(v) << (__builtin_ffsll(m) - 1)) & (m))
> +
> +RTE_LOG_REGISTER_DEFAULT(rte_pmu_logtype, INFO)
> +#define RTE_LOGTYPE_PMU rte_pmu_logtype
> +
> +#define PMU_LOG(level, ...) \
> +     RTE_LOG_LINE(level, PMU, ## __VA_ARGS__)
> +
> +/* A structure describing an event */
> +struct rte_pmu_event {
> +     char *name;
> +     unsigned int index;
> +     TAILQ_ENTRY(rte_pmu_event) next;
> +};
> +
> +struct rte_pmu rte_pmu;
> +
> +/*
> + * Following __rte_weak functions provide default no-op. Architectures 
> should override them if
> + * necessary.
> + */
> +
> +int
> +__rte_weak pmu_arch_init(void)
> +{
> +     return 0;
> +}
> +
> +void
> +__rte_weak pmu_arch_fini(void)
> +{
> +}
> +
> +void
> +__rte_weak pmu_arch_fixup_config(uint64_t __rte_unused config[3])
> +{
> +}
> +
> +static int
> +get_term_format(const char *name, int *num, uint64_t *mask)
> +{
> +     char path[PATH_MAX];
> +     char *config = NULL;
> +     int high, low, ret;
> +     FILE *fp;
> +
> +     *num = *mask = 0;
> +     snprintf(path, sizeof(path), EVENT_SOURCE_DEVICES_PATH "/%s/format/%s", 
> rte_pmu.name, name);
> +     fp = fopen(path, "r");
> +     if (fp == NULL)
> +             return -errno;
> +
> +     errno = 0;
> +     ret = fscanf(fp, "%m[^:]:%d-%d", &config, &low, &high);
> +     if (ret < 2) {
> +             ret = -ENODATA;
> +             goto out;
> +     }
> +     if (errno) {
> +             ret = -errno;
> +             goto out;
> +     }
> +
> +     if (ret == 2)
> +             high = low;
> +
> +     *mask = RTE_GENMASK64(high, low);
> +     /* Last digit should be [012]. If last digit is missing 0 is implied. */
> +     *num = config[strlen(config) - 1];
> +     *num = isdigit(*num) ? *num - '0' : 0;
> +
> +     ret = 0;
> +out:
> +     free(config);
> +     fclose(fp);
> +
> +     return ret;
> +}
> +
> +static int
> +parse_event(char *buf, uint64_t config[3])
> +{
> +     char *token, *term;
> +     int num, ret, val;
> +     uint64_t mask;
> +
> +     config[0] = config[1] = config[2] = 0;
> +
> +     token = strtok(buf, ",");
> +     while (token) {
> +             errno = 0;
> +             /* <term>=<value> */
> +             ret = sscanf(token, "%m[^=]=%i", &term, &val);
> +             if (ret < 1)
> +                     return -ENODATA;
> +             if (errno)
> +                     return -errno;
> +             if (ret == 1)
> +                     val = 1;
> +
> +             ret = get_term_format(term, &num, &mask);
> +             free(term);
> +             if (ret)
> +                     return ret;
> +
> +             config[num] |= FIELD_PREP(mask, val);
> +             token = strtok(NULL, ",");
> +     }
> +
> +     return 0;
> +}
> +
> +static int
> +get_event_config(const char *name, uint64_t config[3])
> +{
> +     char path[PATH_MAX], buf[BUFSIZ];
> +     FILE *fp;
> +     int ret;
> +
> +     snprintf(path, sizeof(path), EVENT_SOURCE_DEVICES_PATH "/%s/events/%s", 
> rte_pmu.name, name);
> +     fp = fopen(path, "r");
> +     if (fp == NULL)
> +             return -errno;
> +
> +     ret = fread(buf, 1, sizeof(buf), fp);
> +     if (ret == 0) {
> +             fclose(fp);
> +
> +             return -EINVAL;
> +     }
> +     fclose(fp);
> +     buf[ret] = '\0';
> +
> +     return parse_event(buf, config);
> +}
> +
> +static int
> +do_perf_event_open(uint64_t config[3], int group_fd)
> +{
> +     struct perf_event_attr attr = {
> +             .size = sizeof(struct perf_event_attr),
> +             .type = PERF_TYPE_RAW,
> +             .exclude_kernel = 1,
> +             .exclude_hv = 1,
> +             .disabled = 1,
> +             .pinned = group_fd == -1,
> +     };
> +
> +     pmu_arch_fixup_config(config);
> +
> +     attr.config = config[0];
> +     attr.config1 = config[1];
> +     attr.config2 = config[2];
> +
> +     return syscall(SYS_perf_event_open, &attr, 0, -1, group_fd, 0);
> +}
> +
> +static int
> +open_events(struct rte_pmu_event_group *group)
> +{
> +     struct rte_pmu_event *event;
> +     uint64_t config[3];
> +     int num = 0, ret;
> +
> +     /* group leader gets created first, with fd = -1 */
> +     group->fds[0] = -1;
> +
> +     TAILQ_FOREACH(event, &rte_pmu.event_list, next) {
> +             ret = get_event_config(event->name, config);
> +             if (ret)
> +                     continue;
> +
> +             ret = do_perf_event_open(config, group->fds[0]);
> +             if (ret == -1) {
> +                     ret = -errno;
> +                     goto out;
> +             }
> +
> +             group->fds[event->index] = ret;
> +             num++;
> +     }
> +
> +     return 0;
> +out:
> +     for (--num; num >= 0; num--) {
> +             close(group->fds[num]);
> +             group->fds[num] = -1;
> +     }
> +
> +     return ret;
> +}
> +
> +static int
> +mmap_events(struct rte_pmu_event_group *group)
> +{
> +     long page_size = sysconf(_SC_PAGE_SIZE);
> +     unsigned int i;
> +     void *addr;
> +     int ret;
> +
> +     for (i = 0; i < rte_pmu.num_group_events; i++) {
> +             addr = mmap(0, page_size, PROT_READ, MAP_SHARED, group->fds[i], 
> 0);
> +             if (addr == MAP_FAILED) {
> +                     ret = -errno;
> +                     goto out;
> +             }
> +
> +             group->mmap_pages[i] = addr;
> +     }
> +
> +     return 0;
> +out:
> +     for (; i; i--) {
> +             munmap(group->mmap_pages[i - 1], page_size);
> +             group->mmap_pages[i - 1] = NULL;
> +     }
> +
> +     return ret;
> +}
> +
> +static void
> +cleanup_events(struct rte_pmu_event_group *group)
> +{
> +     unsigned int i;
> +
> +     if (group->fds[0] != -1)
> +             ioctl(group->fds[0], PERF_EVENT_IOC_DISABLE, 
> PERF_IOC_FLAG_GROUP);
> +
> +     for (i = 0; i < rte_pmu.num_group_events; i++) {
> +             if (group->mmap_pages[i]) {
> +                     munmap(group->mmap_pages[i], sysconf(_SC_PAGE_SIZE));
> +                     group->mmap_pages[i] = NULL;
> +             }
> +
> +             if (group->fds[i] != -1) {
> +                     close(group->fds[i]);
> +                     group->fds[i] = -1;
> +             }
> +     }
> +
> +     group->enabled = false;
> +}
> +
> +int
> +__rte_pmu_enable_group(struct rte_pmu_event_group *group)
> +{
> +     int ret;
> +
> +     if (rte_pmu.num_group_events == 0)
> +             return -ENODEV;
> +
> +     ret = open_events(group);
> +     if (ret)
> +             goto out;
> +
> +     ret = mmap_events(group);
> +     if (ret)
> +             goto out;
> +
> +     if (ioctl(group->fds[0], PERF_EVENT_IOC_RESET, PERF_IOC_FLAG_GROUP) == 
> -1) {
> +             ret = -errno;
> +             goto out;
> +     }
> +
> +     if (ioctl(group->fds[0], PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP) == 
> -1) {
> +             ret = -errno;
> +             goto out;
> +     }
> +
> +     group->enabled = true;
> +
> +     return 0;
> +out:
> +     cleanup_events(group);
> +
> +     return ret;
> +}
> +
> +static int
> +scan_pmus(void)
> +{
> +     char path[PATH_MAX];
> +     struct dirent *dent;
> +     const char *name;
> +     DIR *dirp;
> +
> +     dirp = opendir(EVENT_SOURCE_DEVICES_PATH);
> +     if (dirp == NULL)
> +             return -errno;
> +
> +     while ((dent = readdir(dirp))) {
> +             name = dent->d_name;
> +             if (name[0] == '.')
> +                     continue;
> +
> +             /* sysfs entry should either contain cpus or be a cpu */
> +             if (!strcmp(name, "cpu"))
> +                     break;
> +
> +             snprintf(path, sizeof(path), EVENT_SOURCE_DEVICES_PATH 
> "/%s/cpus", name);
> +             if (access(path, F_OK) == 0)
> +                     break;
> +     }
> +
> +     if (dent) {
> +             rte_pmu.name = strdup(name);
> +             if (rte_pmu.name == NULL) {
> +                     closedir(dirp);
> +
> +                     return -ENOMEM;
> +             }
> +     }
> +
> +     closedir(dirp);
> +
> +     return rte_pmu.name ? 0 : -ENODEV;
> +}
> +
> +static struct rte_pmu_event *
> +new_event(const char *name)
> +{
> +     struct rte_pmu_event *event;
> +
> +     event = calloc(1, sizeof(*event));
> +     if (event == NULL)
> +             goto out;
> +
> +     event->name = strdup(name);
> +     if (event->name == NULL) {
> +             free(event);
> +             event = NULL;
> +     }
> +
> +out:
> +     return event;
> +}
> +
> +static void
> +free_event(struct rte_pmu_event *event)
> +{
> +     free(event->name);
> +     free(event);
> +}
> +
> +int
> +rte_pmu_add_event(const char *name)
> +{
> +     struct rte_pmu_event *event;
> +     char path[PATH_MAX];
> +
> +     if (!rte_pmu.initialized) {
> +             PMU_LOG(ERR, "PMU is not initialized");
> +             return -ENODEV;
> +     }
> +
> +     if (rte_pmu.num_group_events + 1 >= RTE_MAX_NUM_GROUP_EVENTS) {
> +             PMU_LOG(ERR, "Excessive number of events in a group (%d > %d)",
> +                     rte_pmu.num_group_events, RTE_MAX_NUM_GROUP_EVENTS);
> +             return -ENOSPC;
> +     }
> +
> +     snprintf(path, sizeof(path), EVENT_SOURCE_DEVICES_PATH "/%s/events/%s", 
> rte_pmu.name, name);
> +     if (access(path, R_OK)) {
> +             PMU_LOG(ERR, "Cannot access %s", path);
> +             return -ENODEV;
> +     }
> +
> +     TAILQ_FOREACH(event, &rte_pmu.event_list, next) {
> +             if (strcmp(event->name, name))
> +                     continue;
> +
> +             return event->index;
> +     }
> +
> +     event = new_event(name);
> +     if (event == NULL) {
> +             PMU_LOG(ERR, "Failed to create event %s", name);
> +             return -ENOMEM;
> +     }
> +
> +     event->index = rte_pmu.num_group_events++;
> +     TAILQ_INSERT_TAIL(&rte_pmu.event_list, event, next);
> +
> +     return event->index;
> +}
> +
> +int
> +rte_pmu_init(void)
> +{
> +     int ret;
> +
> +     if (rte_pmu.initialized)
> +             return 0;
> +
> +     ret = scan_pmus();
> +     if (ret) {
> +             PMU_LOG(ERR, "Failed to scan for event sources");
> +             goto out;
> +     }
> +
> +     ret = pmu_arch_init();
> +     if (ret) {
> +             PMU_LOG(ERR, "Failed to setup arch internals");
> +             goto out;
> +     }
> +
> +     TAILQ_INIT(&rte_pmu.event_list);
> +     rte_pmu.initialized = 1;
> +out:
> +     if (ret) {
> +             free(rte_pmu.name);
> +             rte_pmu.name = NULL;
> +     }
> +
> +     return ret;
> +}
> +
> +void
> +rte_pmu_fini(void)
> +{
> +     struct rte_pmu_event *event, *tmp_event;
> +     struct rte_pmu_event_group *group;
> +     unsigned int i;
> +
> +     if (!rte_pmu.initialized)
> +             return;
> +
> +     RTE_TAILQ_FOREACH_SAFE(event, &rte_pmu.event_list, next, tmp_event) {
> +             TAILQ_REMOVE(&rte_pmu.event_list, event, next);
> +             free_event(event);
> +     }
> +
> +     for (i = 0; i < RTE_DIM(rte_pmu.event_groups); i++) {
> +             group = &rte_pmu.event_groups[i];
> +             if (!group->enabled)
> +                     continue;
> +
> +             cleanup_events(group);
> +     }
> +
> +     pmu_arch_fini();
> +     free(rte_pmu.name);
> +     rte_pmu.name = NULL;
> +     rte_pmu.num_group_events = 0;
> +}
> diff --git a/lib/pmu/rte_pmu.h b/lib/pmu/rte_pmu.h
> new file mode 100644
> index 0000000000..7d10a0dc56
> --- /dev/null
> +++ b/lib/pmu/rte_pmu.h
> @@ -0,0 +1,205 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(c) 2024 Marvell
> + */
> +
> +#ifndef _RTE_PMU_H_
> +#define _RTE_PMU_H_
> +
> +/**
> + * @file
> + *
> + * PMU event tracing operations
> + *
> + * This file defines generic API and types necessary to setup PMU and
> + * read selected counters in runtime. Exported APIs are generally not 
> MT-safe.
> + * One exception is rte_pmu_read() which can be called concurrently once
> + * everything has been setup.
> + */
> +

From reading the code I can see that the PMU API is not MT-safe.
The only function that can run in parallel is rte_pmu_read(), correct?
All other combinations - say, rte_pmu_read() and rte_pmu_add_event() running
concurrently - are not possible, right?
If so, then it is probably worth articulating that explicitly in the public API
comments; after all, an extra comment doesn't hurt.

> +#ifdef __cplusplus
> +extern "C" {
> +#endif
> +
> +#include <errno.h>
> +#include <linux/perf_event.h>
> +
> +#include <rte_atomic.h>
> +#include <rte_branch_prediction.h>
> +#include <rte_common.h>
> +#include <rte_compat.h>
> +#include <rte_lcore.h>
> +
> +/** Maximum number of events in a group */
> +#define RTE_MAX_NUM_GROUP_EVENTS 8
> +
> +/**
> + * A structure describing a group of events.
> + */
> +struct __rte_cache_aligned rte_pmu_event_group {
> +     /** array of user pages */
> +     struct perf_event_mmap_page *mmap_pages[RTE_MAX_NUM_GROUP_EVENTS];
> +     int fds[RTE_MAX_NUM_GROUP_EVENTS]; /**< array of event descriptors */
> +     TAILQ_ENTRY(rte_pmu_event_group) next; /**< list entry */
> +     bool enabled; /**< true if group was enabled on particular lcore */
> +};
> +
> +/**
> + * A PMU state container.
> + */
> +struct rte_pmu {
> +     struct rte_pmu_event_group event_groups[RTE_MAX_LCORE]; /**< event 
> groups */
> +     unsigned int num_group_events; /**< number of events in a group */
> +     unsigned int initialized; /**< initialization counter */
> +     char *name; /**< name of core PMU listed under 
> /sys/bus/event_source/devices */
> +     TAILQ_HEAD(, rte_pmu_event) event_list; /**< list of matching events */
> +};
> +
> +/** PMU state container */
> +extern struct rte_pmu rte_pmu;
> +
> +/** Each architecture supporting PMU needs to provide its own version */
> +#ifndef rte_pmu_pmc_read
> +#define rte_pmu_pmc_read(index) ({ (void)(index); 0; })
> +#endif
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice
> + *
> + * Read PMU counter.
> + *
> + * @warning This should not be called directly.
> + *
> + * @param pc
> + *   Pointer to the mmapped user page.
> + * @return
> + *   Counter value read from hardware.
> + */
> +__rte_experimental
> +static __rte_always_inline uint64_t
> +__rte_pmu_read_userpage(struct perf_event_mmap_page *pc)
> +{
> +#define __RTE_PMU_READ_ONCE(x) (*(const volatile typeof(x) *)&(x))
> +     uint64_t width, offset;
> +     uint32_t seq, index;
> +     int64_t pmc;
> +
> +     for (;;) {
> +             seq = __RTE_PMU_READ_ONCE(pc->lock);
> +             rte_compiler_barrier();
> +             index = __RTE_PMU_READ_ONCE(pc->index);
> +             offset = __RTE_PMU_READ_ONCE(pc->offset);
> +             width = __RTE_PMU_READ_ONCE(pc->pmc_width);
> +
> +             /* index set to 0 means that particular counter cannot be used 
> */
> +             if (likely(pc->cap_user_rdpmc && index)) {
> +                     pmc = rte_pmu_pmc_read(index - 1);
> +                     pmc <<= 64 - width;
> +                     pmc >>= 64 - width;
> +                     offset += pmc;
> +             }
> +
> +             rte_compiler_barrier();
> +
> +             if (likely(__RTE_PMU_READ_ONCE(pc->lock) == seq))
> +                     return offset;
> +     }
> +
> +     return 0;
> +}
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice
> + *
> + * Enable group of events on the calling lcore.
> + *
> + * @warning This should not be called directly.
> + *
> + * @param group
> + *   Pointer to the group which will be enabled.
> + * @return
> + *   0 in case of success, negative value otherwise.
> + */
> +__rte_experimental
> +int
> +__rte_pmu_enable_group(struct rte_pmu_event_group *group);
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice
> + *
> + * Initialize PMU library.
> + *
> + * @return
> + *   0 in case of success, negative value otherwise.
> + */
> +__rte_experimental
> +int
> +rte_pmu_init(void);
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice
> + *
> + * Finalize PMU library.
> + */
> +__rte_experimental
> +void
> +rte_pmu_fini(void);
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice
> + *
> + * Add event to the group of enabled events.
> + *
> + * @param name
> + *   Name of an event listed under 
> /sys/bus/event_source/devices/<pmu>/events.
> + * @return
> + *   Event index in case of success, negative value otherwise.
> + */
> +__rte_experimental
> +int
> +rte_pmu_add_event(const char *name);
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice
> + *
> + * Read hardware counter configured to count occurrences of an event.
> + *
> + * @param index
> + *   Index of an event to be read.
> + * @return
> + *   Event value read from register. In case of errors or lack of support
> + *   0 is returned. In other words, stream of zeros in a trace file
> + *   indicates problem with reading particular PMU event register.
> + */

As I remember, that function is expected to be called from a thread
bound to one particular CPU.
If that is still the case - please state it explicitly in the comments above.

> +__rte_experimental
> +static __rte_always_inline uint64_t
> +rte_pmu_read(unsigned int index)
> +{
> +     struct rte_pmu_event_group *group = 
> &rte_pmu.event_groups[rte_lcore_id()];

I think we had better check the value returned by rte_lcore_id() here
before using it as an index.
 
> +     int ret;
> +
> +     if (unlikely(!rte_pmu.initialized))
> +             return 0;
> +
> +     if (unlikely(!group->enabled)) {
> +             ret = __rte_pmu_enable_group(group);
> +             if (ret)
> +                     return 0;
> +     }
> +
> +     if (unlikely(index >= rte_pmu.num_group_events))
> +             return 0;
> +
> +     return __rte_pmu_read_userpage(group->mmap_pages[index]);
> +}
> +
> +#ifdef __cplusplus
> +}
> +#endif
> +
> +#endif /* _RTE_PMU_H_ */
> diff --git a/lib/pmu/version.map b/lib/pmu/version.map
> new file mode 100644
> index 0000000000..d0f907d13d
> --- /dev/null
> +++ b/lib/pmu/version.map
> @@ -0,0 +1,13 @@
> +EXPERIMENTAL {
> +     global:
> +
> +     # added in 24.11
> +     __rte_pmu_enable_group;
> +     rte_pmu;
> +     rte_pmu_add_event;
> +     rte_pmu_fini;
> +     rte_pmu_init;
> +     rte_pmu_read;
> +
> +     local: *;
> +};
> --
> 2.34.1

Reply via email to