Add support for programming PMU counters and reading their values
at runtime, bypassing the kernel completely.
This is especially useful in cases where CPU cores are isolated,
i.e. run dedicated tasks. In such cases one cannot use the standard
perf utility without sacrificing latency and performance.
Signed-off-by: Tomasz Duszynski <tduszyn...@marvell.com>
Acked-by: Morten Brørup <m...@smartsharesystems.com>
A few more comments/questions below.
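For context, a minimal usage sketch of the proposed API (the event name is
illustrative and PMU-specific, it is not mandated by the patch):

  #include <rte_pmu.h>

  int index;
  uint64_t val;

  /* scan sysfs for a core PMU and prepare internal state */
  if (rte_pmu_init() < 0)
          return -1;

  /* name must exist under /sys/bus/event_source/devices/<pmu>/events */
  index = rte_pmu_add_event("cpu_cycles");
  if (index < 0)
          return -1;

  /* first read on an lcore lazily opens, mmaps and enables the group */
  val = rte_pmu_read(index);

  rte_pmu_fini();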
diff --git a/lib/pmu/rte_pmu.c b/lib/pmu/rte_pmu.c
new file mode 100644
index 0000000000..950f999cb7
--- /dev/null
+++ b/lib/pmu/rte_pmu.c
@@ -0,0 +1,460 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(C) 2023 Marvell International Ltd.
+ */
+
+#include <ctype.h>
+#include <dirent.h>
+#include <errno.h>
+#include <regex.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <sys/queue.h>
+#include <sys/syscall.h>
+#include <unistd.h>
+
+#include <rte_atomic.h>
+#include <rte_per_lcore.h>
+#include <rte_pmu.h>
+#include <rte_spinlock.h>
+#include <rte_tailq.h>
+
+#include "pmu_private.h"
+
+#define EVENT_SOURCE_DEVICES_PATH "/sys/bus/event_source/devices"
+
+#define GENMASK_ULL(h, l) ((~0ULL - (1ULL << (l)) + 1) & (~0ULL >> ((64 - 1 - (h)))))
+#define FIELD_PREP(m, v) (((uint64_t)(v) << (__builtin_ffsll(m) - 1)) & (m))
+
+RTE_DEFINE_PER_LCORE(struct rte_pmu_event_group, _event_group);
+struct rte_pmu rte_pmu;
+
+/*
+ * Following __rte_weak functions provide default no-ops. Architectures
+ * should override them if necessary.
+ */
+
+int
+__rte_weak pmu_arch_init(void)
+{
+ return 0;
+}
+
+void
+__rte_weak pmu_arch_fini(void)
+{
+}
+
+void
+__rte_weak pmu_arch_fixup_config(uint64_t __rte_unused config[3])
+{
+}
+
+static int
+get_term_format(const char *name, int *num, uint64_t *mask)
+{
+ char path[PATH_MAX];
+ char *config = NULL;
+ int high, low, ret;
+ FILE *fp;
+
+ *num = *mask = 0;
+ snprintf(path, sizeof(path), EVENT_SOURCE_DEVICES_PATH "/%s/format/%s", rte_pmu.name, name);
+ fp = fopen(path, "r");
+ if (fp == NULL)
+ return -errno;
+
+ errno = 0;
+ ret = fscanf(fp, "%m[^:]:%d-%d", &config, &low, &high);
+ if (ret < 2) {
+ ret = -ENODATA;
+ goto out;
+ }
+ if (errno) {
+ ret = -errno;
+ goto out;
+ }
+
+ if (ret == 2)
+ high = low;
+
+ *mask = GENMASK_ULL(high, low);
+ /* Last digit should be [012]. If the last digit is missing, 0 is implied. */
+ *num = config[strlen(config) - 1];
+ *num = isdigit(*num) ? *num - '0' : 0;
+
+ ret = 0;
+out:
+ free(config);
+ fclose(fp);
+
+ return ret;
+}
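For reference (not part of the patch), a format file holds a single line
describing where a term lives in the config words, e.g. on x86:

  $ cat /sys/bus/event_source/devices/cpu/format/umask
  config:8-15

which this function parses into num = 0 and mask = GENMASK_ULL(15, 8).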
+
+static int
+parse_event(char *buf, uint64_t config[3])
+{
+ char *token, *term;
+ int num, ret, val;
+ uint64_t mask;
+
+ config[0] = config[1] = config[2] = 0;
+
+ token = strtok(buf, ",");
+ while (token) {
+ errno = 0;
+ /* <term>=<value> */
+ ret = sscanf(token, "%m[^=]=%i", &term, &val);
+ if (ret < 1)
+ return -ENODATA;
+ if (errno)
+ return -errno;
+ if (ret == 1)
+ val = 1;
+
+ ret = get_term_format(term, &num, &mask);
+ free(term);
+ if (ret)
+ return ret;
+
+ config[num] |= FIELD_PREP(mask, val);
+ token = strtok(NULL, ",");
+ }
+
+ return 0;
+}
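Likewise, an events file holds a comma-separated list of such terms, e.g. an
x86 entry:

  $ cat /sys/bus/event_source/devices/cpu/events/cache-misses
  event=0x2e,umask=0x41

which parse_event() folds into config[0] using FIELD_PREP().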
+
+static int
+get_event_config(const char *name, uint64_t config[3])
+{
+ char path[PATH_MAX], buf[BUFSIZ];
+ FILE *fp;
+ int ret;
+
+ snprintf(path, sizeof(path), EVENT_SOURCE_DEVICES_PATH "/%s/events/%s", rte_pmu.name, name);
+ fp = fopen(path, "r");
+ if (fp == NULL)
+ return -errno;
+
+ ret = fread(buf, 1, sizeof(buf) - 1, fp);
+ if (ret == 0) {
+ fclose(fp);
+
+ return -EINVAL;
+ }
+ fclose(fp);
+ buf[ret] = '\0';
+
+ return parse_event(buf, config);
+}
+
+static int
+do_perf_event_open(uint64_t config[3], int group_fd)
+{
+ struct perf_event_attr attr = {
+ .size = sizeof(struct perf_event_attr),
+ .type = PERF_TYPE_RAW,
+ .exclude_kernel = 1,
+ .exclude_hv = 1,
+ .disabled = 1,
+ };
+
+ pmu_arch_fixup_config(config);
+
+ attr.config = config[0];
+ attr.config1 = config[1];
+ attr.config2 = config[2];
+
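+ /* pid = 0, cpu = -1: measure the calling thread on whatever cpu it runs on */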
+ return syscall(SYS_perf_event_open, &attr, 0, -1, group_fd, 0);
+}
+
+static int
+open_events(struct rte_pmu_event_group *group)
+{
+ struct rte_pmu_event *event;
+ uint64_t config[3];
+ int num = 0, ret;
+
+ /* group leader gets created first, with fd = -1 */
+ group->fds[0] = -1;
+
+ TAILQ_FOREACH(event, &rte_pmu.event_list, next) {
+ ret = get_event_config(event->name, config);
+ if (ret) {
+ /* fail hard; skipping would leave fds[event->index] unset */
+ goto out;
+ }
+
+ ret = do_perf_event_open(config, group->fds[0]);
+ if (ret == -1) {
+ ret = -errno;
+ goto out;
+ }
+
+ group->fds[event->index] = ret;
+ num++;
+ }
+
+ return 0;
+out:
+ for (--num; num >= 0; num--) {
+ close(group->fds[num]);
+ group->fds[num] = -1;
+ }
+
+ return ret;
+}
+
+static int
+mmap_events(struct rte_pmu_event_group *group)
+{
+ long page_size = sysconf(_SC_PAGE_SIZE);
+ unsigned int i;
+ void *addr;
+ int ret;
+
+ for (i = 0; i < rte_pmu.num_group_events; i++) {
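+ /* map the perf metadata page; it exposes the state (index, offset, width) needed for user-space reads */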
+ addr = mmap(0, page_size, PROT_READ, MAP_SHARED, group->fds[i], 0);
+ if (addr == MAP_FAILED) {
+ ret = -errno;
+ goto out;
+ }
+
+ group->mmap_pages[i] = addr;
+ if (!group->mmap_pages[i]->cap_user_rdpmc) {
+ /* bump i so the unwind below unmaps this page too */
+ i++;
+ ret = -EPERM;
+ goto out;
+ }
+ }
+
+ return 0;
+out:
+ for (; i; i--) {
+ munmap(group->mmap_pages[i - 1], page_size);
+ group->mmap_pages[i - 1] = NULL;
+ }
+
+ return ret;
+}
+
+static void
+cleanup_events(struct rte_pmu_event_group *group)
+{
+ unsigned int i;
+
+ if (group->fds[0] != -1)
+ ioctl(group->fds[0], PERF_EVENT_IOC_DISABLE, PERF_IOC_FLAG_GROUP);
+
+ for (i = 0; i < rte_pmu.num_group_events; i++) {
+ if (group->mmap_pages[i]) {
+ munmap(group->mmap_pages[i], sysconf(_SC_PAGE_SIZE));
+ group->mmap_pages[i] = NULL;
+ }
+
+ if (group->fds[i] != -1) {
+ close(group->fds[i]);
+ group->fds[i] = -1;
+ }
+ }
+
+ group->enabled = false;
+}
+
+int
+__rte_pmu_enable_group(void)
+{
+ struct rte_pmu_event_group *group = &RTE_PER_LCORE(_event_group);
+ int ret;
+
+ if (rte_pmu.num_group_events == 0)
+ return -ENODEV;
+
+ ret = open_events(group);
+ if (ret)
+ goto out;
+
+ ret = mmap_events(group);
+ if (ret)
+ goto out;
+
+ if (ioctl(group->fds[0], PERF_EVENT_IOC_RESET, PERF_IOC_FLAG_GROUP) == -1) {
+ ret = -errno;
+ goto out;
+ }
+
+ if (ioctl(group->fds[0], PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP) == -1) {
+ ret = -errno;
+ goto out;
+ }
+
+ rte_spinlock_lock(&rte_pmu.lock);
+ TAILQ_INSERT_TAIL(&rte_pmu.event_group_list, group, next);
Hmm.. so we insert a pointer to a TLS variable into the global list?
Wonder what would happen if that thread gets terminated?
Can memory from its TLS block get re-used (by another thread or for other purposes)?
+ rte_spinlock_unlock(&rte_pmu.lock);
+ group->enabled = true;
+
+ return 0;
+
+out:
+ cleanup_events(group);
+
+ return ret;
+}
+
+static int
+scan_pmus(void)
+{
+ char path[PATH_MAX];
+ struct dirent *dent;
+ const char *name;
+ DIR *dirp;
+
+ dirp = opendir(EVENT_SOURCE_DEVICES_PATH);
+ if (dirp == NULL)
+ return -errno;
+
+ while ((dent = readdir(dirp))) {
+ name = dent->d_name;
+ if (name[0] == '.')
+ continue;
+
+ /* sysfs entry should either be named "cpu" or contain a cpus file */
+ if (!strcmp(name, "cpu"))
+ break;
+
+ snprintf(path, sizeof(path), EVENT_SOURCE_DEVICES_PATH "/%s/cpus", name);
+ if (access(path, F_OK) == 0)
+ break;
+ }
+
+ if (dent) {
+ rte_pmu.name = strdup(name);
+ if (rte_pmu.name == NULL) {
+ closedir(dirp);
+
+ return -ENOMEM;
+ }
+ }
+
+ closedir(dirp);
+
+ return rte_pmu.name ? 0 : -ENODEV;
+}
+
+static struct rte_pmu_event *
+new_event(const char *name)
+{
+ struct rte_pmu_event *event;
+
+ event = calloc(1, sizeof(*event));
+ if (event == NULL)
+ goto out;
+
+ event->name = strdup(name);
+ if (event->name == NULL) {
+ free(event);
+ event = NULL;
+ }
+
+out:
+ return event;
+}
+
+static void
+free_event(struct rte_pmu_event *event)
+{
+ free(event->name);
+ free(event);
+}
+
+int
+rte_pmu_add_event(const char *name)
+{
+ struct rte_pmu_event *event;
+ char path[PATH_MAX];
+
+ if (rte_pmu.name == NULL)
+ return -ENODEV;
+
+ if (rte_pmu.num_group_events >= MAX_NUM_GROUP_EVENTS)
+ return -ENOSPC;
+
+ snprintf(path, sizeof(path), EVENT_SOURCE_DEVICES_PATH "/%s/events/%s", rte_pmu.name, name);
+ if (access(path, R_OK))
+ return -ENODEV;
+
+ TAILQ_FOREACH(event, &rte_pmu.event_list, next) {
+ if (!strcmp(event->name, name))
+ return event->index;
+ }
+
+ event = new_event(name);
+ if (event == NULL)
+ return -ENOMEM;
+
+ event->index = rte_pmu.num_group_events++;
+ TAILQ_INSERT_TAIL(&rte_pmu.event_list, event, next);
+
+ return event->index;
+}
+
+int
+rte_pmu_init(void)
+{
+ int ret;
+
+ /* Allow calling init from multiple contexts within a single thread. This
+ * simplifies resource management a bit, e.g. in case a fast-path tracepoint
+ * has already been enabled via the command line but the application performs
+ * init/fini again anyway.
+ */
+ if (rte_pmu.initialized != 0) {
+ rte_pmu.initialized++;
+ return 0;
+ }
+
+ ret = scan_pmus();
+ if (ret)
+ goto out;
+
+ ret = pmu_arch_init();
+ if (ret)
+ goto out;
+
+ TAILQ_INIT(&rte_pmu.event_list);
+ TAILQ_INIT(&rte_pmu.event_group_list);
+ rte_spinlock_init(&rte_pmu.lock);
+ rte_pmu.initialized = 1;
+
+ return 0;
+out:
+ free(rte_pmu.name);
+ rte_pmu.name = NULL;
+
+ return ret;
+}
+
+void
+rte_pmu_fini(void)
+{
+ struct rte_pmu_event_group *group, *tmp_group;
+ struct rte_pmu_event *event, *tmp_event;
+
+ /* cleanup once init count drops to zero */
+ if (rte_pmu.initialized == 0 || --rte_pmu.initialized != 0)
+ return;
+
+ RTE_TAILQ_FOREACH_SAFE(event, &rte_pmu.event_list, next, tmp_event) {
+ TAILQ_REMOVE(&rte_pmu.event_list, event, next);
+ free_event(event);
+ }
+
+ RTE_TAILQ_FOREACH_SAFE(group, &rte_pmu.event_group_list, next, tmp_group) {
+ TAILQ_REMOVE(&rte_pmu.event_group_list, group, next);
+ cleanup_events(group);
+ }
+
+ pmu_arch_fini();
+ free(rte_pmu.name);
+ rte_pmu.name = NULL;
+ rte_pmu.num_group_events = 0;
+}
diff --git a/lib/pmu/rte_pmu.h b/lib/pmu/rte_pmu.h
new file mode 100644
index 0000000000..6b664c3336
--- /dev/null
+++ b/lib/pmu/rte_pmu.h
@@ -0,0 +1,212 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2023 Marvell
+ */
+
+#ifndef _RTE_PMU_H_
+#define _RTE_PMU_H_
+
+/**
+ * @file
+ *
+ * PMU event tracing operations
+ *
+ * This file defines the generic API and types necessary to set up the PMU
+ * and read selected counters at runtime.
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <linux/perf_event.h>
+
+#include <rte_atomic.h>
+#include <rte_branch_prediction.h>
+#include <rte_common.h>
+#include <rte_compat.h>
+#include <rte_spinlock.h>
+
+/** Maximum number of events in a group */
+#define MAX_NUM_GROUP_EVENTS 8
Forgot the RTE_ prefix.
In fact, do you really need the number of events in a group to be hard-coded?
Couldn't mmap_pages[] and fds[] be allocated dynamically by enable_group()?
+
+/**
+ * A structure describing a group of events.
+ */
+struct rte_pmu_event_group {
+ struct perf_event_mmap_page *mmap_pages[MAX_NUM_GROUP_EVENTS]; /**< array of user pages */
+ int fds[MAX_NUM_GROUP_EVENTS]; /**< array of event descriptors */
+ bool enabled; /**< true if group was enabled on particular lcore */
+ TAILQ_ENTRY(rte_pmu_event_group) next; /**< list entry */
+} __rte_cache_aligned;
+
Even if we decide to keep rte_pmu_read() as static inline (still not sure it
is a good idea), why do these two structs below (rte_pmu_event and rte_pmu)
have to be public?
I think both can be safely moved out of the public headers.
+/**
+ * A structure describing an event.
+ */
+struct rte_pmu_event {
+ char *name; /**< name of an event */
+ unsigned int index; /**< event index into fds/mmap_pages */
+ TAILQ_ENTRY(rte_pmu_event) next; /**< list entry */
+};
+
+/**
+ * A PMU state container.
+ */
+struct rte_pmu {
+ char *name; /**< name of core PMU listed under /sys/bus/event_source/devices */
+ rte_spinlock_t lock; /**< serialize access to event group list */
+ TAILQ_HEAD(, rte_pmu_event_group) event_group_list; /**< list of event groups */
+ unsigned int num_group_events; /**< number of events in a group */
+ TAILQ_HEAD(, rte_pmu_event) event_list; /**< list of matching events */
+ unsigned int initialized; /**< initialization counter */
+};
+
+/** lcore event group */
+RTE_DECLARE_PER_LCORE(struct rte_pmu_event_group, _event_group);
+
+/** PMU state container */
+extern struct rte_pmu rte_pmu;
+
+/** Each architecture supporting PMU needs to provide its own version */
+#ifndef rte_pmu_pmc_read
+#define rte_pmu_pmc_read(index) ({ 0; })
+#endif
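As an aside, an arch-specific override is expected to look roughly like the
following x86 sketch based on rdpmc (the real per-arch headers are outside
this patch and may differ):

  static __rte_always_inline uint64_t
  rte_pmu_pmc_read(int index)
  {
          uint64_t low, high;

          /* rdpmc loads the counter selected by ECX into EDX:EAX */
          asm volatile("rdpmc" : "=a"(low), "=d"(high) : "c"(index));

          return low | (high << 32);
  }
  #define rte_pmu_pmc_read rte_pmu_pmc_read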
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Read PMU counter.
+ *
+ * @warning This should not be called directly.
+ *
+ * @param pc
+ * Pointer to the mmapped user page.
+ * @return
+ * Counter value read from hardware.
+ */
+static __rte_always_inline uint64_t
+__rte_pmu_read_userpage(struct perf_event_mmap_page *pc)
+{
+ uint64_t width, offset;
+ uint32_t seq, index;
+ int64_t pmc;
+
+ for (;;) {
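+ /* pc->lock is a seqcount; if it changes while we read, retry */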
+ seq = pc->lock;
+ rte_compiler_barrier();
+ index = pc->index;
+ offset = pc->offset;
+ width = pc->pmc_width;
+
+ /* index set to 0 means that particular counter cannot be used */
+ if (likely(pc->cap_user_rdpmc && index)) {
In mmap_events() you return EPERM if cap_user_rdpmc is not enabled.
Do you need another check here? Or can this capability be disabled by the
kernel at run-time?
+ pmc = rte_pmu_pmc_read(index - 1);
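+ /* left/right shift pair sign-extends the width-bit value to 64 bits */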
+ pmc <<= 64 - width;
+ pmc >>= 64 - width;
+ offset += pmc;
+ }
+
+ rte_compiler_barrier();
+
+ if (likely(pc->lock == seq))
+ return offset;
+ }
+
+ return 0;
+}
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Enable group of events on the calling lcore.
+ *
+ * @warning This should not be called directly.
__rte_internal ?
+ *
+ * @return
+ * 0 in case of success, negative value otherwise.
+ */
+__rte_experimental
+int
+__rte_pmu_enable_group(void);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Initialize PMU library.
+ *
+ * @warning This should not be called directly.
Hmm.. then who should call it?
If it is not supposed to be called directly, why declare it here?
+ *
+ * @return
+ * 0 in case of success, negative value otherwise.
+ */
Probably worth mentioning that this function is not MT-safe.
Same for _fini_ and add_event.
Also worth mentioning that the control-path functions (init/fini/add_event)
and the data-path function (pmu_read) can't be called concurrently.
+__rte_experimental
+int
+rte_pmu_init(void);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Finalize PMU library. This should be called after PMU counters are no longer being read.
+ */
+__rte_experimental
+void
+rte_pmu_fini(void);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Add event to the group of enabled events.
+ *
+ * @param name
+ * Name of an event listed under /sys/bus/event_source/devices/pmu/events.
+ * @return
+ * Event index in case of success, negative value otherwise.
+ */
+__rte_experimental
+int
+rte_pmu_add_event(const char *name);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Read hardware counter configured to count occurrences of an event.
+ *
+ * @param index
+ * Index of an event to be read.
+ * @return
+ * Event value read from the register. In case of errors or lack of support,
+ * 0 is returned. In other words, a stream of zeros in a trace file
+ * indicates a problem with reading a particular PMU event register.
+ */
+__rte_experimental
+static __rte_always_inline uint64_t
+rte_pmu_read(unsigned int index)
+{
+ struct rte_pmu_event_group *group = &RTE_PER_LCORE(_event_group);
+ int ret;
+
+ if (unlikely(!rte_pmu.initialized))
+ return 0;
+
+ if (unlikely(!group->enabled)) {
+ ret = __rte_pmu_enable_group();
+ if (ret)
+ return 0;
+ }
+
+ if (unlikely(index >= rte_pmu.num_group_events))
+ return 0;
+
+ return __rte_pmu_read_userpage(group->mmap_pages[index]);
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RTE_PMU_H_ */
diff --git a/lib/pmu/version.map b/lib/pmu/version.map
new file mode 100644
index 0000000000..39a4f279c1
--- /dev/null
+++ b/lib/pmu/version.map
@@ -0,0 +1,15 @@
+DPDK_23 {
+ local: *;
+};
+
+EXPERIMENTAL {
+ global:
+
+ __rte_pmu_enable_group;
+ per_lcore__event_group;
+ rte_pmu;
+ rte_pmu_add_event;
+ rte_pmu_fini;
+ rte_pmu_init;
+ rte_pmu_read;
+};