Hi Ruifeng, >-----Original Message----- >From: Ruifeng Wang <ruifeng.w...@arm.com> >Sent: Monday, January 9, 2023 8:37 AM >To: Tomasz Duszynski <tduszyn...@marvell.com>; dev@dpdk.org >Cc: tho...@monjalon.net; Jerin Jacob Kollanukkaran <jer...@marvell.com>; >m...@smartsharesystems.com; >zhou...@loongson.cn; nd <n...@arm.com> >Subject: [EXT] RE: [PATCH v4 1/4] eal: add generic support for reading PMU >events > >External Email > >---------------------------------------------------------------------- >> -----Original Message----- >> From: Tomasz Duszynski <tduszyn...@marvell.com> >> Sent: Tuesday, December 13, 2022 6:44 PM >> To: dev@dpdk.org >> Cc: tho...@monjalon.net; jer...@marvell.com; m...@smartsharesystems.com; >> zhou...@loongson.cn; Tomasz Duszynski <tduszyn...@marvell.com> >> Subject: [PATCH v4 1/4] eal: add generic support for reading PMU >> events >> >> Add support for programming PMU counters and reading their values in >> runtime bypassing kernel completely. >> >> This is especially useful in cases where CPU cores are isolated >> (nohz_full) i.e run dedicated tasks. In such cases one cannot use >> standard perf utility without sacrificing latency and performance. >> >> Signed-off-by: Tomasz Duszynski <tduszyn...@marvell.com> >> --- >> app/test/meson.build | 1 + >> app/test/test_pmu.c | 41 +++ >> doc/guides/prog_guide/profile_app.rst | 8 + >> lib/eal/common/meson.build | 3 + >> lib/eal/common/pmu_private.h | 41 +++ >> lib/eal/common/rte_pmu.c | 456 ++++++++++++++++++++++++++ >> lib/eal/include/meson.build | 1 + >> lib/eal/include/rte_pmu.h | 204 ++++++++++++ >> lib/eal/linux/eal.c | 4 + >> lib/eal/version.map | 6 + >> 10 files changed, 765 insertions(+) >> create mode 100644 app/test/test_pmu.c create mode 100644 >> lib/eal/common/pmu_private.h create mode 100644 >> lib/eal/common/rte_pmu.c create mode 100644 lib/eal/include/rte_pmu.h >> ><snip> >> diff --git a/lib/eal/common/rte_pmu.c b/lib/eal/common/rte_pmu.c new >> file mode 100644 index 0000000000..049fe19fe3 >> --- /dev/null >> +++ b/lib/eal/common/rte_pmu.c >> @@ -0,0 +1,456 @@ >> +/* SPDX-License-Identifier: BSD-3-Clause >> + * Copyright(C) 2022 Marvell International Ltd. >> + */ >> + >> +#include <ctype.h> >> +#include <dirent.h> >> +#include <errno.h> >> +#include <regex.h> >> +#include <stdlib.h> >> +#include <string.h> >> +#include <sys/ioctl.h> >> +#include <sys/mman.h> >> +#include <sys/queue.h> >> +#include <sys/syscall.h> >> +#include <unistd.h> >> + >> +#include <rte_eal_paging.h> >> +#include <rte_pmu.h> >> +#include <rte_tailq.h> >> + >> +#include "pmu_private.h" >> + >> +#define EVENT_SOURCE_DEVICES_PATH "/sys/bus/event_source/devices" >> + >> +#ifndef GENMASK_ULL >> +#define GENMASK_ULL(h, l) ((~0ULL - (1ULL << (l)) + 1) & (~0ULL >> >> +((64 >> +- 1 - (h))))) #endif >> + >> +#ifndef FIELD_PREP >> +#define FIELD_PREP(m, v) (((uint64_t)(v) << (__builtin_ffsll(m) - 1)) >> +& >> +(m)) #endif >> + >> +struct rte_pmu *rte_pmu; >> + >> +/* >> + * Following __rte_weak functions provide default no-op. >> +Architectures should override them if >> + * necessary. >> + */ >> + >> +int >> +__rte_weak pmu_arch_init(void) >> +{ >> + return 0; >> +} >> + >> +void >> +__rte_weak pmu_arch_fini(void) >> +{ >> +} >> + >> +void >> +__rte_weak pmu_arch_fixup_config(uint64_t config[3]) { >> + RTE_SET_USED(config); >> +} >> + >> +static int >> +get_term_format(const char *name, int *num, uint64_t *mask) { >> + char *config = NULL; >> + char path[PATH_MAX]; >> + int high, low, ret; >> + FILE *fp; >> + >> + /* quiesce -Wmaybe-uninitialized warning */ >> + *num = 0; >> + *mask = 0; >> + >> + snprintf(path, sizeof(path), EVENT_SOURCE_DEVICES_PATH >> +"/%s/format/%s", rte_pmu->name, >> name); >> + fp = fopen(path, "r"); >> + if (!fp) >> + return -errno; >> + >> + errno = 0; >> + ret = fscanf(fp, "%m[^:]:%d-%d", &config, &low, &high); >> + if (ret < 2) { >> + ret = -ENODATA; >> + goto out; >> + } >> + if (errno) { >> + ret = -errno; >> + goto out; >> + } >> + >> + if (ret == 2) >> + high = low; >> + >> + *mask = GENMASK_ULL(high, low); >> + /* Last digit should be [012]. If last digit is missing 0 is implied. */ >> + *num = config[strlen(config) - 1]; >> + *num = isdigit(*num) ? *num - '0' : 0; >> + >> + ret = 0; >> +out: >> + free(config); >> + fclose(fp); >> + >> + return ret; >> +} >> + >> +static int >> +parse_event(char *buf, uint64_t config[3]) { >> + char *token, *term; >> + int num, ret, val; >> + uint64_t mask; >> + >> + config[0] = config[1] = config[2] = 0; >> + >> + token = strtok(buf, ","); >> + while (token) { >> + errno = 0; >> + /* <term>=<value> */ >> + ret = sscanf(token, "%m[^=]=%i", &term, &val); >> + if (ret < 1) >> + return -ENODATA; >> + if (errno) >> + return -errno; >> + if (ret == 1) >> + val = 1; >> + >> + ret = get_term_format(term, &num, &mask); >> + free(term); >> + if (ret) >> + return ret; >> + >> + config[num] |= FIELD_PREP(mask, val); >> + token = strtok(NULL, ","); >> + } >> + >> + return 0; >> +} >> + >> +static int >> +get_event_config(const char *name, uint64_t config[3]) { >> + char path[PATH_MAX], buf[BUFSIZ]; >> + FILE *fp; >> + int ret; >> + >> + snprintf(path, sizeof(path), EVENT_SOURCE_DEVICES_PATH >> +"/%s/events/%s", rte_pmu->name, >> name); >> + fp = fopen(path, "r"); >> + if (!fp) >> + return -errno; >> + >> + ret = fread(buf, 1, sizeof(buf), fp); >> + if (ret == 0) { >> + fclose(fp); >> + >> + return -EINVAL; >> + } >> + fclose(fp); >> + buf[ret] = '\0'; >> + >> + return parse_event(buf, config); >> +} >> + >> +static int >> +do_perf_event_open(uint64_t config[3], int lcore_id, int group_fd) { >> + struct perf_event_attr attr = { >> + .size = sizeof(struct perf_event_attr), >> + .type = PERF_TYPE_RAW, >> + .exclude_kernel = 1, >> + .exclude_hv = 1, >> + .disabled = 1, >> + }; >> + >> + pmu_arch_fixup_config(config); >> + >> + attr.config = config[0]; >> + attr.config1 = config[1]; >> + attr.config2 = config[2]; >> + >> + return syscall(SYS_perf_event_open, &attr, rte_gettid(), > >Looks like using '0' instead of rte_gettid() takes the same effect. A small >optimization. > >> rte_lcore_to_cpu_id(lcore_id), >> + group_fd, 0); >> +} >> + >> +static int >> +open_events(int lcore_id) >> +{ >> + struct rte_pmu_event_group *group = &rte_pmu->group[lcore_id]; >> + struct rte_pmu_event *event; >> + uint64_t config[3]; >> + int num = 0, ret; >> + >> + /* group leader gets created first, with fd = -1 */ >> + group->fds[0] = -1; >> + >> + TAILQ_FOREACH(event, &rte_pmu->event_list, next) { >> + ret = get_event_config(event->name, config); >> + if (ret) { >> + RTE_LOG(ERR, EAL, "failed to get %s event config\n", >> event->name); >> + continue; >> + } >> + >> + ret = do_perf_event_open(config, lcore_id, group->fds[0]); >> + if (ret == -1) { >> + if (errno == EOPNOTSUPP) >> + RTE_LOG(ERR, EAL, "64 bit counters not >> supported\n"); >> + >> + ret = -errno; >> + goto out; >> + } >> + >> + group->fds[event->index] = ret; >> + num++; >> + } >> + >> + return 0; >> +out: >> + for (--num; num >= 0; num--) { >> + close(group->fds[num]); >> + group->fds[num] = -1; >> + } >> + >> + >> + return ret; >> +} >> + >> +static int >> +mmap_events(int lcore_id) >> +{ >> + struct rte_pmu_event_group *group = &rte_pmu->group[lcore_id]; >> + void *addr; >> + int ret, i; >> + >> + for (i = 0; i < rte_pmu->num_group_events; i++) { >> + addr = mmap(0, rte_mem_page_size(), PROT_READ, MAP_SHARED, >> group->fds[i], 0); >> + if (addr == MAP_FAILED) { >> + ret = -errno; >> + goto out; >> + } >> + >> + group->mmap_pages[i] = addr; >> + } >> + >> + return 0; >> +out: >> + for (; i; i--) { >> + munmap(group->mmap_pages[i - 1], rte_mem_page_size()); >> + group->mmap_pages[i - 1] = NULL; >> + } >> + >> + return ret; >> +} >> + >> +static void >> +cleanup_events(int lcore_id) >> +{ >> + struct rte_pmu_event_group *group = &rte_pmu->group[lcore_id]; >> + int i; >> + >> + if (!group->fds) >> + return; >> + >> + if (group->fds[0] != -1) >> + ioctl(group->fds[0], PERF_EVENT_IOC_DISABLE, >> PERF_IOC_FLAG_GROUP); >> + >> + for (i = 0; i < rte_pmu->num_group_events; i++) { >> + if (group->mmap_pages[i]) { >> + munmap(group->mmap_pages[i], rte_mem_page_size()); >> + group->mmap_pages[i] = NULL; >> + } >> + >> + if (group->fds[i] != -1) { >> + close(group->fds[i]); >> + group->fds[i] = -1; >> + } >> + } >> + >> + free(group->mmap_pages); >> + free(group->fds); >> + >> + group->mmap_pages = NULL; >> + group->fds = NULL; >> + group->enabled = false; >> +} >> + >> +int __rte_noinline >> +rte_pmu_enable_group(int lcore_id) >> +{ >> + struct rte_pmu_event_group *group = &rte_pmu->group[lcore_id]; >> + int ret; >> + >> + if (rte_pmu->num_group_events == 0) { >> + RTE_LOG(DEBUG, EAL, "no matching PMU events\n"); >> + >> + return 0; >> + } >> + >> + group->fds = calloc(rte_pmu->num_group_events, sizeof(*group->fds)); >> + if (!group->fds) { >> + RTE_LOG(ERR, EAL, "failed to alloc descriptor memory\n"); >> + >> + return -ENOMEM; >> + } >> + >> + group->mmap_pages = calloc(rte_pmu->num_group_events, >> sizeof(*group->mmap_pages)); >> + if (!group->mmap_pages) { >> + RTE_LOG(ERR, EAL, "failed to alloc userpage memory\n"); >> + >> + ret = -ENOMEM; >> + goto out; >> + } >> + >> + ret = open_events(lcore_id); >> + if (ret) { >> + RTE_LOG(ERR, EAL, "failed to open events on lcore-worker-%d\n", >> lcore_id); >> + goto out; >> + } >> + >> + ret = mmap_events(lcore_id); >> + if (ret) { >> + RTE_LOG(ERR, EAL, "failed to map events on lcore-worker-%d\n", >> lcore_id); >> + goto out; >> + } >> + >> + if (ioctl(group->fds[0], PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP) == >> -1) { >> + RTE_LOG(ERR, EAL, "failed to enable events on >> lcore-worker-%d\n", >> +lcore_id); >> + >> + ret = -errno; >> + goto out; >> + } >> + >> + return 0; >> + >> +out: >> + cleanup_events(lcore_id); >> + >> + return ret; >> +} >> + >> +static int >> +scan_pmus(void) >> +{ >> + char path[PATH_MAX]; >> + struct dirent *dent; >> + const char *name; >> + DIR *dirp; >> + >> + dirp = opendir(EVENT_SOURCE_DEVICES_PATH); >> + if (!dirp) >> + return -errno; >> + >> + while ((dent = readdir(dirp))) { >> + name = dent->d_name; >> + if (name[0] == '.') >> + continue; >> + >> + /* sysfs entry should either contain cpus or be a cpu */ >> + if (!strcmp(name, "cpu")) >> + break; >> + >> + snprintf(path, sizeof(path), EVENT_SOURCE_DEVICES_PATH >> "/%s/cpus", name); >> + if (access(path, F_OK) == 0) >> + break; >> + } >> + >> + closedir(dirp); >> + >> + if (dent) { >> + rte_pmu->name = strdup(name); >> + if (!rte_pmu->name) >> + return -ENOMEM; >> + } >> + >> + return rte_pmu->name ? 0 : -ENODEV; >> +} >> + >> +int >> +rte_pmu_add_event(const char *name) >> +{ >> + struct rte_pmu_event *event; >> + char path[PATH_MAX]; >> + >> + snprintf(path, sizeof(path), EVENT_SOURCE_DEVICES_PATH >> +"/%s/events/%s", rte_pmu->name, >> name); > >Better to check if rte_pmu is available. >See below. > >> + if (access(path, R_OK)) >> + return -ENODEV; >> + >> + TAILQ_FOREACH(event, &rte_pmu->event_list, next) { >> + if (!strcmp(event->name, name)) >> + return event->index; >> + continue; >> + } >> + >> + event = calloc(1, sizeof(*event)); >> + if (!event) >> + return -ENOMEM; >> + >> + event->name = strdup(name); >> + if (!event->name) { >> + free(event); >> + >> + return -ENOMEM; >> + } >> + >> + event->index = rte_pmu->num_group_events++; >> + TAILQ_INSERT_TAIL(&rte_pmu->event_list, event, next); >> + >> + RTE_LOG(DEBUG, EAL, "%s even added at index %d\n", name, >> +event->index); >> + >> + return event->index; >> +} >> + >> +void >> +eal_pmu_init(void) >> +{ >> + int ret; >> + >> + rte_pmu = calloc(1, sizeof(*rte_pmu)); >> + if (!rte_pmu) { >> + RTE_LOG(ERR, EAL, "failed to alloc PMU\n"); >> + >> + return; >> + } >> + >> + TAILQ_INIT(&rte_pmu->event_list); >> + >> + ret = scan_pmus(); >> + if (ret) { >> + RTE_LOG(ERR, EAL, "failed to find core pmu\n"); >> + goto out; >> + } >> + >> + ret = pmu_arch_init(); >> + if (ret) { >> + RTE_LOG(ERR, EAL, "failed to setup arch for PMU\n"); >> + goto out; >> + } >> + >> + return; >> +out: >> + free(rte_pmu->name); >> + free(rte_pmu); > >Set rte_pmu to NULL to prevent unintentional use? >
Next series will take use of global pmu instance so this will no longer be required though your suggestions may be applied to other pointers around. >> +} >> + >> +void >> +eal_pmu_fini(void) >> +{ >> + struct rte_pmu_event *event, *tmp; >> + int lcore_id; >> + >> + RTE_TAILQ_FOREACH_SAFE(event, &rte_pmu->event_list, next, tmp) { > >rte_pmu can be unavailable if init fails. Better to check before accessing. > Yep. >> + TAILQ_REMOVE(&rte_pmu->event_list, event, next); >> + free(event->name); >> + free(event); >> + } >> + >> + RTE_LCORE_FOREACH_WORKER(lcore_id) >> + cleanup_events(lcore_id); >> + >> + pmu_arch_fini(); >> + free(rte_pmu->name); >> + free(rte_pmu); >> +} ><snip>