> From: Tomasz Duszynski [mailto:tduszyn...@marvell.com] > Sent: Tuesday, 13 December 2022 11.44 > > Add support for programming PMU counters and reading their values > in runtime bypassing kernel completely. > > This is especially useful in cases where CPU cores are isolated > (nohz_full) i.e run dedicated tasks. In such cases one cannot use > standard perf utility without sacrificing latency and performance. > > Signed-off-by: Tomasz Duszynski <tduszyn...@marvell.com> > ---
> +++ b/lib/eal/common/rte_pmu.c > @@ -0,0 +1,456 @@ > +/* SPDX-License-Identifier: BSD-3-Clause > + * Copyright(C) 2022 Marvell International Ltd. > + */ > + > +#include <ctype.h> > +#include <dirent.h> > +#include <errno.h> > +#include <regex.h> > +#include <stdlib.h> > +#include <string.h> > +#include <sys/ioctl.h> > +#include <sys/mman.h> > +#include <sys/queue.h> > +#include <sys/syscall.h> > +#include <unistd.h> > + > +#include <rte_eal_paging.h> > +#include <rte_pmu.h> > +#include <rte_tailq.h> > + > +#include "pmu_private.h" > + > +#define EVENT_SOURCE_DEVICES_PATH "/sys/bus/event_source/devices" > + > +#ifndef GENMASK_ULL > +#define GENMASK_ULL(h, l) ((~0ULL - (1ULL << (l)) + 1) & (~0ULL >> > ((64 - 1 - (h))))) > +#endif > + > +#ifndef FIELD_PREP > +#define FIELD_PREP(m, v) (((uint64_t)(v) << (__builtin_ffsll(m) - 1)) > & (m)) > +#endif > + > +struct rte_pmu *rte_pmu; > + > +/* > + * Following __rte_weak functions provide default no-op. Architectures > should override them if > + * necessary. > + */ > + > +int > +__rte_weak pmu_arch_init(void) > +{ > + return 0; > +} > + > +void > +__rte_weak pmu_arch_fini(void) > +{ > +} > + > +void > +__rte_weak pmu_arch_fixup_config(uint64_t config[3]) > +{ > + RTE_SET_USED(config); > +} > + > +static int > +get_term_format(const char *name, int *num, uint64_t *mask) > +{ > + char *config = NULL; > + char path[PATH_MAX]; > + int high, low, ret; > + FILE *fp; > + > + /* quiesce -Wmaybe-uninitialized warning */ > + *num = 0; > + *mask = 0; > + > + snprintf(path, sizeof(path), EVENT_SOURCE_DEVICES_PATH > "/%s/format/%s", rte_pmu->name, name); > + fp = fopen(path, "r"); > + if (!fp) > + return -errno; > + > + errno = 0; > + ret = fscanf(fp, "%m[^:]:%d-%d", &config, &low, &high); > + if (ret < 2) { > + ret = -ENODATA; > + goto out; > + } > + if (errno) { > + ret = -errno; > + goto out; > + } > + > + if (ret == 2) > + high = low; > + > + *mask = GENMASK_ULL(high, low); > + /* Last digit should be [012]. 
If last digit is missing 0 is > implied. */ > + *num = config[strlen(config) - 1]; > + *num = isdigit(*num) ? *num - '0' : 0; > + > + ret = 0; > +out: > + free(config); > + fclose(fp); > + > + return ret; > +} > + > +static int > +parse_event(char *buf, uint64_t config[3]) > +{ > + char *token, *term; > + int num, ret, val; > + uint64_t mask; > + > + config[0] = config[1] = config[2] = 0; > + > + token = strtok(buf, ","); > + while (token) { > + errno = 0; > + /* <term>=<value> */ > + ret = sscanf(token, "%m[^=]=%i", &term, &val); > + if (ret < 1) > + return -ENODATA; > + if (errno) > + return -errno; > + if (ret == 1) > + val = 1; > + > + ret = get_term_format(term, &num, &mask); > + free(term); > + if (ret) > + return ret; > + > + config[num] |= FIELD_PREP(mask, val); > + token = strtok(NULL, ","); > + } > + > + return 0; > +} > + > +static int > +get_event_config(const char *name, uint64_t config[3]) > +{ > + char path[PATH_MAX], buf[BUFSIZ]; > + FILE *fp; > + int ret; > + > + snprintf(path, sizeof(path), EVENT_SOURCE_DEVICES_PATH > "/%s/events/%s", rte_pmu->name, name); > + fp = fopen(path, "r"); > + if (!fp) > + return -errno; > + > + ret = fread(buf, 1, sizeof(buf), fp); > + if (ret == 0) { > + fclose(fp); > + > + return -EINVAL; > + } > + fclose(fp); > + buf[ret] = '\0'; > + > + return parse_event(buf, config); > +} > + > +static int > +do_perf_event_open(uint64_t config[3], int lcore_id, int group_fd) > +{ > + struct perf_event_attr attr = { > + .size = sizeof(struct perf_event_attr), > + .type = PERF_TYPE_RAW, > + .exclude_kernel = 1, > + .exclude_hv = 1, > + .disabled = 1, > + }; > + > + pmu_arch_fixup_config(config); > + > + attr.config = config[0]; > + attr.config1 = config[1]; > + attr.config2 = config[2]; > + > + return syscall(SYS_perf_event_open, &attr, rte_gettid(), > rte_lcore_to_cpu_id(lcore_id), > + group_fd, 0); > +} > + > +static int > +open_events(int lcore_id) > +{ > + struct rte_pmu_event_group *group = &rte_pmu->group[lcore_id]; > + struct 
rte_pmu_event *event; > + uint64_t config[3]; > + int num = 0, ret; > + > + /* group leader gets created first, with fd = -1 */ > + group->fds[0] = -1; > + > + TAILQ_FOREACH(event, &rte_pmu->event_list, next) { > + ret = get_event_config(event->name, config); > + if (ret) { > + RTE_LOG(ERR, EAL, "failed to get %s event config\n", > event->name); > + continue; > + } > + > + ret = do_perf_event_open(config, lcore_id, group->fds[0]); > + if (ret == -1) { > + if (errno == EOPNOTSUPP) > + RTE_LOG(ERR, EAL, "64 bit counters not > supported\n"); > + > + ret = -errno; > + goto out; > + } > + > + group->fds[event->index] = ret; > + num++; > + } > + > + return 0; > +out: > + for (--num; num >= 0; num--) { > + close(group->fds[num]); > + group->fds[num] = -1; > + } > + > + > + return ret; > +} > + > +static int > +mmap_events(int lcore_id) > +{ > + struct rte_pmu_event_group *group = &rte_pmu->group[lcore_id]; > + void *addr; > + int ret, i; > + > + for (i = 0; i < rte_pmu->num_group_events; i++) { > + addr = mmap(0, rte_mem_page_size(), PROT_READ, MAP_SHARED, > group->fds[i], 0); > + if (addr == MAP_FAILED) { > + ret = -errno; > + goto out; > + } > + > + group->mmap_pages[i] = addr; > + } > + > + return 0; > +out: > + for (; i; i--) { > + munmap(group->mmap_pages[i - 1], rte_mem_page_size()); > + group->mmap_pages[i - 1] = NULL; > + } > + > + return ret; > +} > + > +static void > +cleanup_events(int lcore_id) > +{ > + struct rte_pmu_event_group *group = &rte_pmu->group[lcore_id]; > + int i; > + > + if (!group->fds) > + return; > + > + if (group->fds[0] != -1) > + ioctl(group->fds[0], PERF_EVENT_IOC_DISABLE, > PERF_IOC_FLAG_GROUP); > + > + for (i = 0; i < rte_pmu->num_group_events; i++) { > + if (group->mmap_pages[i]) { > + munmap(group->mmap_pages[i], rte_mem_page_size()); > + group->mmap_pages[i] = NULL; > + } > + > + if (group->fds[i] != -1) { > + close(group->fds[i]); > + group->fds[i] = -1; > + } > + } > + > + free(group->mmap_pages); > + free(group->fds); > + > + 
group->mmap_pages = NULL; > + group->fds = NULL; > + group->enabled = false; > +} > + > +int __rte_noinline > +rte_pmu_enable_group(int lcore_id) > +{ > + struct rte_pmu_event_group *group = &rte_pmu->group[lcore_id]; > + int ret; > + > + if (rte_pmu->num_group_events == 0) { > + RTE_LOG(DEBUG, EAL, "no matching PMU events\n"); > + > + return 0; > + } > + > + group->fds = calloc(rte_pmu->num_group_events, sizeof(*group- > >fds)); > + if (!group->fds) { > + RTE_LOG(ERR, EAL, "failed to alloc descriptor memory\n"); > + > + return -ENOMEM; > + } > + > + group->mmap_pages = calloc(rte_pmu->num_group_events, > sizeof(*group->mmap_pages)); > + if (!group->mmap_pages) { > + RTE_LOG(ERR, EAL, "failed to alloc userpage memory\n"); > + > + ret = -ENOMEM; > + goto out; > + } > + > + ret = open_events(lcore_id); > + if (ret) { > + RTE_LOG(ERR, EAL, "failed to open events on lcore-worker- > %d\n", lcore_id); > + goto out; > + } > + > + ret = mmap_events(lcore_id); > + if (ret) { > + RTE_LOG(ERR, EAL, "failed to map events on lcore-worker- > %d\n", lcore_id); > + goto out; > + } > + > + if (ioctl(group->fds[0], PERF_EVENT_IOC_ENABLE, > PERF_IOC_FLAG_GROUP) == -1) { > + RTE_LOG(ERR, EAL, "failed to enable events on lcore-worker- > %d\n", lcore_id); > + > + ret = -errno; > + goto out; > + } > + > + return 0; > + > +out: > + cleanup_events(lcore_id); > + > + return ret; > +} > + > +static int > +scan_pmus(void) > +{ > + char path[PATH_MAX]; > + struct dirent *dent; > + const char *name; > + DIR *dirp; > + > + dirp = opendir(EVENT_SOURCE_DEVICES_PATH); > + if (!dirp) > + return -errno; > + > + while ((dent = readdir(dirp))) { > + name = dent->d_name; > + if (name[0] == '.') > + continue; > + > + /* sysfs entry should either contain cpus or be a cpu */ > + if (!strcmp(name, "cpu")) > + break; > + > + snprintf(path, sizeof(path), EVENT_SOURCE_DEVICES_PATH > "/%s/cpus", name); > + if (access(path, F_OK) == 0) > + break; > + } > + > + closedir(dirp); > + > + if (dent) { > + 
rte_pmu->name = strdup(name); > + if (!rte_pmu->name) > + return -ENOMEM; > + } > + > + return rte_pmu->name ? 0 : -ENODEV; > +} > + > +int > +rte_pmu_add_event(const char *name) > +{ > + struct rte_pmu_event *event; > + char path[PATH_MAX]; > + > + snprintf(path, sizeof(path), EVENT_SOURCE_DEVICES_PATH > "/%s/events/%s", rte_pmu->name, name); > + if (access(path, R_OK)) > + return -ENODEV; > + > + TAILQ_FOREACH(event, &rte_pmu->event_list, next) { > + if (!strcmp(event->name, name)) > + return event->index; > + continue; > + } > + > + event = calloc(1, sizeof(*event)); > + if (!event) > + return -ENOMEM; > + > + event->name = strdup(name); > + if (!event->name) { > + free(event); > + > + return -ENOMEM; > + } > + > + event->index = rte_pmu->num_group_events++; > + TAILQ_INSERT_TAIL(&rte_pmu->event_list, event, next); > + > + RTE_LOG(DEBUG, EAL, "%s even added at index %d\n", name, event- > >index); > + > + return event->index; > +} > + > +void > +eal_pmu_init(void) > +{ > + int ret; > + > + rte_pmu = calloc(1, sizeof(*rte_pmu)); > + if (!rte_pmu) { > + RTE_LOG(ERR, EAL, "failed to alloc PMU\n"); > + > + return; > + } > + > + TAILQ_INIT(&rte_pmu->event_list); > + > + ret = scan_pmus(); > + if (ret) { > + RTE_LOG(ERR, EAL, "failed to find core pmu\n"); > + goto out; > + } > + > + ret = pmu_arch_init(); > + if (ret) { > + RTE_LOG(ERR, EAL, "failed to setup arch for PMU\n"); > + goto out; > + } > + > + return; > +out: > + free(rte_pmu->name); > + free(rte_pmu); > +} > + > +void > +eal_pmu_fini(void) > +{ > + struct rte_pmu_event *event, *tmp; > + int lcore_id; > + > + RTE_TAILQ_FOREACH_SAFE(event, &rte_pmu->event_list, next, tmp) { > + TAILQ_REMOVE(&rte_pmu->event_list, event, next); > + free(event->name); > + free(event); > + } > + > + RTE_LCORE_FOREACH_WORKER(lcore_id) > + cleanup_events(lcore_id); > + > + pmu_arch_fini(); > + free(rte_pmu->name); > + free(rte_pmu); > +} > diff --git a/lib/eal/include/meson.build b/lib/eal/include/meson.build > index 
cfcd40aaed..3bf830adee 100644 > --- a/lib/eal/include/meson.build > +++ b/lib/eal/include/meson.build > @@ -36,6 +36,7 @@ headers += files( > 'rte_pci_dev_features.h', > 'rte_per_lcore.h', > 'rte_pflock.h', > + 'rte_pmu.h', > 'rte_random.h', > 'rte_reciprocal.h', > 'rte_seqcount.h', > diff --git a/lib/eal/include/rte_pmu.h b/lib/eal/include/rte_pmu.h > new file mode 100644 > index 0000000000..e4b4f6b052 > --- /dev/null > +++ b/lib/eal/include/rte_pmu.h > @@ -0,0 +1,204 @@ > +/* SPDX-License-Identifier: BSD-3-Clause > + * Copyright(c) 2022 Marvell > + */ > + > +#ifndef _RTE_PMU_H_ > +#define _RTE_PMU_H_ > + > +#ifdef __cplusplus > +extern "C" { > +#endif > + > +#include <rte_common.h> > +#include <rte_compat.h> > + > +#ifdef RTE_EXEC_ENV_LINUX > + > +#include <linux/perf_event.h> > + > +#include <rte_atomic.h> > +#include <rte_branch_prediction.h> > +#include <rte_lcore.h> > +#include <rte_log.h> > + > +/** > + * @file > + * > + * PMU event tracing operations > + * > + * This file defines generic API and types necessary to setup PMU and > + * read selected counters in runtime. > + */ > + > +/** > + * A structure describing a group of events. > + */ > +struct rte_pmu_event_group { > + int *fds; /**< array of event descriptors */ > + void **mmap_pages; /**< array of pointers to mmapped > perf_event_attr structures */ There seems to be a lot of indirection involved here. Why are these arrays not statically sized, instead of dynamically allocated? Also, what is the reason for hiding the type struct perf_event_mmap_page **mmap_pages opaque by using void **mmap_pages instead? > + bool enabled; /**< true if group was enabled on particular lcore > */ > +}; > + > +/** > + * A structure describing an event. > + */ > +struct rte_pmu_event { > + char *name; /** name of an event */ > + int index; /** event index into fds/mmap_pages */ > + TAILQ_ENTRY(rte_pmu_event) next; /** list entry */ > +}; > + > +/** > + * A PMU state container. 
> + */ > +struct rte_pmu { > + char *name; /** name of core PMU listed under > /sys/bus/event_source/devices */ > + struct rte_pmu_event_group group[RTE_MAX_LCORE]; /**< per lcore > event group data */ > + int num_group_events; /**< number of events in a group */ > + TAILQ_HEAD(, rte_pmu_event) event_list; /**< list of matching > events */ > +}; > + > +/** Pointer to the PMU state container */ > +extern struct rte_pmu *rte_pmu; Again, why not just extern struct rte_pmu, instead of dynamic allocation? > + > +/** Each architecture supporting PMU needs to provide its own version > */ > +#ifndef rte_pmu_pmc_read > +#define rte_pmu_pmc_read(index) ({ 0; }) > +#endif > + > +/** > + * @internal > + * > + * Read PMU counter. > + * > + * @param pc > + * Pointer to the mmapped user page. > + * @return > + * Counter value read from hardware. > + */ > +__rte_internal > +static __rte_always_inline uint64_t > +rte_pmu_read_userpage(struct perf_event_mmap_page *pc) > +{ > + uint64_t offset, width, pmc = 0; > + uint32_t seq, index; > + int tries = 100; > + > + for (;;) { > + seq = pc->lock; > + rte_compiler_barrier(); > + index = pc->index; > + offset = pc->offset; > + width = pc->pmc_width; > + > + if (likely(pc->cap_user_rdpmc && index)) { > + pmc = rte_pmu_pmc_read(index - 1); > + pmc <<= 64 - width; > + pmc >>= 64 - width; > + } > + > + rte_compiler_barrier(); > + > + if (likely(pc->lock == seq)) > + return pmc + offset; > + > + if (--tries == 0) { > + RTE_LOG(DEBUG, EAL, "failed to get > perf_event_mmap_page lock\n"); > + break; > + } > + } > + > + return 0; > +} > + > +/** > + * @internal > + * > + * Enable group of events for a given lcore. > + * > + * @param lcore_id > + * The identifier of the lcore. > + * @return > + * 0 in case of success, negative value otherwise. 
> + */ > +__rte_internal > +int > +rte_pmu_enable_group(int lcore_id); > + > +/** > + * @warning > + * @b EXPERIMENTAL: this API may change without prior notice > + * > + * Add event to the group of enabled events. > + * > + * @param name > + * Name of an event listed under > /sys/bus/event_source/devices/pmu/events. > + * @return > + * Event index in case of success, negative value otherwise. > + */ > +__rte_experimental > +int > +rte_pmu_add_event(const char *name); > + > +/** > + * @warning > + * @b EXPERIMENTAL: this API may change without prior notice > + * > + * Read hardware counter configured to count occurrences of an event. > + * > + * @param index > + * Index of an event to be read. > + * @return > + * Event value read from register. In case of errors or lack of > support > + * 0 is returned. In other words, stream of zeros in a trace file > + * indicates problem with reading particular PMU event register. > + */ > +__rte_experimental > +static __rte_always_inline uint64_t > +rte_pmu_read(int index) > +{ > + int lcore_id = rte_lcore_id(); > + struct rte_pmu_event_group *group; > + int ret; > + > + if (!rte_pmu) > + return 0; > + > + group = &rte_pmu->group[lcore_id]; > + if (!group->enabled) { > + ret = rte_pmu_enable_group(lcore_id); > + if (ret) > + return 0; > + > + group->enabled = true; > + } Why is the group not enabled in the setup function, rte_pmu_add_event(), instead of here, in the hot path? > + > + if (index < 0 || index >= rte_pmu->num_group_events) > + return 0; > + > + return rte_pmu_read_userpage((struct perf_event_mmap_page > *)group->mmap_pages[index]); Using fixed size arrays instead of multiple indirections via pointers is faster. It could be: return rte_pmu_read_userpage((struct perf_event_mmap_page *)rte_pmu.group[lcore_id].mmap_pages[index]); With or without suggested performance improvements... Series-acked-by: Morten Brørup <m...@smartsharesystems.com>