> -----Original Message-----
> From: Tomasz Duszynski <tduszyn...@marvell.com>
> Sent: Tuesday, December 13, 2022 6:44 PM
> To: dev@dpdk.org
> Cc: tho...@monjalon.net; jer...@marvell.com; m...@smartsharesystems.com; 
> zhou...@loongson.cn;
> Tomasz Duszynski <tduszyn...@marvell.com>
> Subject: [PATCH v4 1/4] eal: add generic support for reading PMU events
> 
> Add support for programming PMU counters and reading their values in runtime 
> bypassing
> kernel completely.
> 
> This is especially useful in cases where CPU cores are isolated
> (nohz_full) i.e run dedicated tasks. In such cases one cannot use standard 
> perf utility
> without sacrificing latency and performance.
> 
> Signed-off-by: Tomasz Duszynski <tduszyn...@marvell.com>
> ---
>  app/test/meson.build                  |   1 +
>  app/test/test_pmu.c                   |  41 +++
>  doc/guides/prog_guide/profile_app.rst |   8 +
>  lib/eal/common/meson.build            |   3 +
>  lib/eal/common/pmu_private.h          |  41 +++
>  lib/eal/common/rte_pmu.c              | 456 ++++++++++++++++++++++++++
>  lib/eal/include/meson.build           |   1 +
>  lib/eal/include/rte_pmu.h             | 204 ++++++++++++
>  lib/eal/linux/eal.c                   |   4 +
>  lib/eal/version.map                   |   6 +
>  10 files changed, 765 insertions(+)
>  create mode 100644 app/test/test_pmu.c
>  create mode 100644 lib/eal/common/pmu_private.h  create mode 100644
> lib/eal/common/rte_pmu.c  create mode 100644 lib/eal/include/rte_pmu.h
> 
<snip>
> diff --git a/lib/eal/common/rte_pmu.c b/lib/eal/common/rte_pmu.c new file 
> mode 100644
> index 0000000000..049fe19fe3
> --- /dev/null
> +++ b/lib/eal/common/rte_pmu.c
> @@ -0,0 +1,456 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(C) 2022 Marvell International Ltd.
> + */
> +
> +#include <ctype.h>
> +#include <dirent.h>
> +#include <errno.h>
> +#include <regex.h>
> +#include <stdlib.h>
> +#include <string.h>
> +#include <sys/ioctl.h>
> +#include <sys/mman.h>
> +#include <sys/queue.h>
> +#include <sys/syscall.h>
> +#include <unistd.h>
> +
> +#include <rte_eal_paging.h>
> +#include <rte_pmu.h>
> +#include <rte_tailq.h>
> +
> +#include "pmu_private.h"
> +
> +#define EVENT_SOURCE_DEVICES_PATH "/sys/bus/event_source/devices"
> +
> +#ifndef GENMASK_ULL
> +#define GENMASK_ULL(h, l) ((~0ULL - (1ULL << (l)) + 1) & (~0ULL >> ((64
> +- 1 - (h))))) #endif
> +
> +#ifndef FIELD_PREP
> +#define FIELD_PREP(m, v) (((uint64_t)(v) << (__builtin_ffsll(m) - 1)) &
> +(m)) #endif
> +
> +struct rte_pmu *rte_pmu;
> +
> +/*
> + * Following __rte_weak functions provide default no-op. Architectures
> +should override them if
> + * necessary.
> + */
> +
> +int
> +__rte_weak pmu_arch_init(void)
> +{
> +     return 0;
> +}
> +
> +void
> +__rte_weak pmu_arch_fini(void)
> +{
> +}
> +
> +void
> +__rte_weak pmu_arch_fixup_config(uint64_t config[3]) {
> +     RTE_SET_USED(config);
> +}
> +
> +static int
> +get_term_format(const char *name, int *num, uint64_t *mask) {
> +     char *config = NULL;
> +     char path[PATH_MAX];
> +     int high, low, ret;
> +     FILE *fp;
> +
> +     /* quiesce -Wmaybe-uninitialized warning */
> +     *num = 0;
> +     *mask = 0;
> +
> +     snprintf(path, sizeof(path), EVENT_SOURCE_DEVICES_PATH "/%s/format/%s", 
> rte_pmu->name,
> name);
> +     fp = fopen(path, "r");
> +     if (!fp)
> +             return -errno;
> +
> +     errno = 0;
> +     ret = fscanf(fp, "%m[^:]:%d-%d", &config, &low, &high);
> +     if (ret < 2) {
> +             ret = -ENODATA;
> +             goto out;
> +     }
> +     if (errno) {
> +             ret = -errno;
> +             goto out;
> +     }
> +
> +     if (ret == 2)
> +             high = low;
> +
> +     *mask = GENMASK_ULL(high, low);
> +     /* Last digit should be [012]. If last digit is missing 0 is implied. */
> +     *num = config[strlen(config) - 1];
> +     *num = isdigit(*num) ? *num - '0' : 0;
> +
> +     ret = 0;
> +out:
> +     free(config);
> +     fclose(fp);
> +
> +     return ret;
> +}
> +
> +static int
> +parse_event(char *buf, uint64_t config[3]) {
> +     char *token, *term;
> +     int num, ret, val;
> +     uint64_t mask;
> +
> +     config[0] = config[1] = config[2] = 0;
> +
> +     token = strtok(buf, ",");
> +     while (token) {
> +             errno = 0;
> +             /* <term>=<value> */
> +             ret = sscanf(token, "%m[^=]=%i", &term, &val);
> +             if (ret < 1)
> +                     return -ENODATA;
> +             if (errno)
> +                     return -errno;
> +             if (ret == 1)
> +                     val = 1;
> +
> +             ret = get_term_format(term, &num, &mask);
> +             free(term);
> +             if (ret)
> +                     return ret;
> +
> +             config[num] |= FIELD_PREP(mask, val);
> +             token = strtok(NULL, ",");
> +     }
> +
> +     return 0;
> +}
> +
> +static int
> +get_event_config(const char *name, uint64_t config[3]) {
> +     char path[PATH_MAX], buf[BUFSIZ];
> +     FILE *fp;
> +     int ret;
> +
> +     snprintf(path, sizeof(path), EVENT_SOURCE_DEVICES_PATH "/%s/events/%s", 
> rte_pmu->name,
> name);
> +     fp = fopen(path, "r");
> +     if (!fp)
> +             return -errno;
> +
> +     ret = fread(buf, 1, sizeof(buf), fp);
> +     if (ret == 0) {
> +             fclose(fp);
> +
> +             return -EINVAL;
> +     }
> +     fclose(fp);
> +     buf[ret] = '\0';
> +
> +     return parse_event(buf, config);
> +}
> +
> +static int
> +do_perf_event_open(uint64_t config[3], int lcore_id, int group_fd) {
> +     struct perf_event_attr attr = {
> +             .size = sizeof(struct perf_event_attr),
> +             .type = PERF_TYPE_RAW,
> +             .exclude_kernel = 1,
> +             .exclude_hv = 1,
> +             .disabled = 1,
> +     };
> +
> +     pmu_arch_fixup_config(config);
> +
> +     attr.config = config[0];
> +     attr.config1 = config[1];
> +     attr.config2 = config[2];
> +
> +     return syscall(SYS_perf_event_open, &attr, rte_gettid(),

Looks like using '0' instead of rte_gettid() takes the same effect. A small 
optimization.

> rte_lcore_to_cpu_id(lcore_id),
> +                    group_fd, 0);
> +}
> +
> +static int
> +open_events(int lcore_id)
> +{
> +     struct rte_pmu_event_group *group = &rte_pmu->group[lcore_id];
> +     struct rte_pmu_event *event;
> +     uint64_t config[3];
> +     int num = 0, ret;
> +
> +     /* group leader gets created first, with fd = -1 */
> +     group->fds[0] = -1;
> +
> +     TAILQ_FOREACH(event, &rte_pmu->event_list, next) {
> +             ret = get_event_config(event->name, config);
> +             if (ret) {
> +                     RTE_LOG(ERR, EAL, "failed to get %s event config\n", 
> event->name);
> +                     continue;
> +             }
> +
> +             ret = do_perf_event_open(config, lcore_id, group->fds[0]);
> +             if (ret == -1) {
> +                     if (errno == EOPNOTSUPP)
> +                             RTE_LOG(ERR, EAL, "64 bit counters not 
> supported\n");
> +
> +                     ret = -errno;
> +                     goto out;
> +             }
> +
> +             group->fds[event->index] = ret;
> +             num++;
> +     }
> +
> +     return 0;
> +out:
> +     for (--num; num >= 0; num--) {
> +             close(group->fds[num]);
> +             group->fds[num] = -1;
> +     }
> +
> +
> +     return ret;
> +}
> +
> +static int
> +mmap_events(int lcore_id)
> +{
> +     struct rte_pmu_event_group *group = &rte_pmu->group[lcore_id];
> +     void *addr;
> +     int ret, i;
> +
> +     for (i = 0; i < rte_pmu->num_group_events; i++) {
> +             addr = mmap(0, rte_mem_page_size(), PROT_READ, MAP_SHARED, 
> group->fds[i], 0);
> +             if (addr == MAP_FAILED) {
> +                     ret = -errno;
> +                     goto out;
> +             }
> +
> +             group->mmap_pages[i] = addr;
> +     }
> +
> +     return 0;
> +out:
> +     for (; i; i--) {
> +             munmap(group->mmap_pages[i - 1], rte_mem_page_size());
> +             group->mmap_pages[i - 1] = NULL;
> +     }
> +
> +     return ret;
> +}
> +
> +static void
> +cleanup_events(int lcore_id)
> +{
> +     struct rte_pmu_event_group *group = &rte_pmu->group[lcore_id];
> +     int i;
> +
> +     if (!group->fds)
> +             return;
> +
> +     if (group->fds[0] != -1)
> +             ioctl(group->fds[0], PERF_EVENT_IOC_DISABLE, 
> PERF_IOC_FLAG_GROUP);
> +
> +     for (i = 0; i < rte_pmu->num_group_events; i++) {
> +             if (group->mmap_pages[i]) {
> +                     munmap(group->mmap_pages[i], rte_mem_page_size());
> +                     group->mmap_pages[i] = NULL;
> +             }
> +
> +             if (group->fds[i] != -1) {
> +                     close(group->fds[i]);
> +                     group->fds[i] = -1;
> +             }
> +     }
> +
> +     free(group->mmap_pages);
> +     free(group->fds);
> +
> +     group->mmap_pages = NULL;
> +     group->fds = NULL;
> +     group->enabled = false;
> +}
> +
> +int __rte_noinline
> +rte_pmu_enable_group(int lcore_id)
> +{
> +     struct rte_pmu_event_group *group = &rte_pmu->group[lcore_id];
> +     int ret;
> +
> +     if (rte_pmu->num_group_events == 0) {
> +             RTE_LOG(DEBUG, EAL, "no matching PMU events\n");
> +
> +             return 0;
> +     }
> +
> +     group->fds = calloc(rte_pmu->num_group_events, sizeof(*group->fds));
> +     if (!group->fds) {
> +             RTE_LOG(ERR, EAL, "failed to alloc descriptor memory\n");
> +
> +             return -ENOMEM;
> +     }
> +
> +     group->mmap_pages = calloc(rte_pmu->num_group_events, 
> sizeof(*group->mmap_pages));
> +     if (!group->mmap_pages) {
> +             RTE_LOG(ERR, EAL, "failed to alloc userpage memory\n");
> +
> +             ret = -ENOMEM;
> +             goto out;
> +     }
> +
> +     ret = open_events(lcore_id);
> +     if (ret) {
> +             RTE_LOG(ERR, EAL, "failed to open events on lcore-worker-%d\n", 
> lcore_id);
> +             goto out;
> +     }
> +
> +     ret = mmap_events(lcore_id);
> +     if (ret) {
> +             RTE_LOG(ERR, EAL, "failed to map events on lcore-worker-%d\n", 
> lcore_id);
> +             goto out;
> +     }
> +
> +     if (ioctl(group->fds[0], PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP) == 
> -1) {
> +             RTE_LOG(ERR, EAL, "failed to enable events on 
> lcore-worker-%d\n",
> +lcore_id);
> +
> +             ret = -errno;
> +             goto out;
> +     }
> +
> +     return 0;
> +
> +out:
> +     cleanup_events(lcore_id);
> +
> +     return ret;
> +}
> +
> +static int
> +scan_pmus(void)
> +{
> +     char path[PATH_MAX];
> +     struct dirent *dent;
> +     const char *name;
> +     DIR *dirp;
> +
> +     dirp = opendir(EVENT_SOURCE_DEVICES_PATH);
> +     if (!dirp)
> +             return -errno;
> +
> +     while ((dent = readdir(dirp))) {
> +             name = dent->d_name;
> +             if (name[0] == '.')
> +                     continue;
> +
> +             /* sysfs entry should either contain cpus or be a cpu */
> +             if (!strcmp(name, "cpu"))
> +                     break;
> +
> +             snprintf(path, sizeof(path), EVENT_SOURCE_DEVICES_PATH 
> "/%s/cpus", name);
> +             if (access(path, F_OK) == 0)
> +                     break;
> +     }
> +
> +     closedir(dirp);
> +
> +     if (dent) {
> +             rte_pmu->name = strdup(name);
> +             if (!rte_pmu->name)
> +                     return -ENOMEM;
> +     }
> +
> +     return rte_pmu->name ? 0 : -ENODEV;
> +}
> +
> +int
> +rte_pmu_add_event(const char *name)
> +{
> +     struct rte_pmu_event *event;
> +     char path[PATH_MAX];
> +
> +     snprintf(path, sizeof(path), EVENT_SOURCE_DEVICES_PATH "/%s/events/%s", 
> rte_pmu->name,
> name);

Better to check if rte_pmu is available.
See below.

> +     if (access(path, R_OK))
> +             return -ENODEV;
> +
> +     TAILQ_FOREACH(event, &rte_pmu->event_list, next) {
> +             if (!strcmp(event->name, name))
> +                     return event->index;
> +             continue;
> +     }
> +
> +     event = calloc(1, sizeof(*event));
> +     if (!event)
> +             return -ENOMEM;
> +
> +     event->name = strdup(name);
> +     if (!event->name) {
> +             free(event);
> +
> +             return -ENOMEM;
> +     }
> +
> +     event->index = rte_pmu->num_group_events++;
> +     TAILQ_INSERT_TAIL(&rte_pmu->event_list, event, next);
> +
> +     RTE_LOG(DEBUG, EAL, "%s even added at index %d\n", name,
> +event->index);
> +
> +     return event->index;
> +}
> +
> +void
> +eal_pmu_init(void)
> +{
> +     int ret;
> +
> +     rte_pmu = calloc(1, sizeof(*rte_pmu));
> +     if (!rte_pmu) {
> +             RTE_LOG(ERR, EAL, "failed to alloc PMU\n");
> +
> +             return;
> +     }
> +
> +     TAILQ_INIT(&rte_pmu->event_list);
> +
> +     ret = scan_pmus();
> +     if (ret) {
> +             RTE_LOG(ERR, EAL, "failed to find core pmu\n");
> +             goto out;
> +     }
> +
> +     ret = pmu_arch_init();
> +     if (ret) {
> +             RTE_LOG(ERR, EAL, "failed to setup arch for PMU\n");
> +             goto out;
> +     }
> +
> +     return;
> +out:
> +     free(rte_pmu->name);
> +     free(rte_pmu);

Set rte_pmu to NULL to prevent unintentional use?

> +}
> +
> +void
> +eal_pmu_fini(void)
> +{
> +     struct rte_pmu_event *event, *tmp;
> +     int lcore_id;
> +
> +     RTE_TAILQ_FOREACH_SAFE(event, &rte_pmu->event_list, next, tmp) {

rte_pmu can be unavailable if init fails. Better to check before accessing.

> +             TAILQ_REMOVE(&rte_pmu->event_list, event, next);
> +             free(event->name);
> +             free(event);
> +     }
> +
> +     RTE_LCORE_FOREACH_WORKER(lcore_id)
> +             cleanup_events(lcore_id);
> +
> +     pmu_arch_fini();
> +     free(rte_pmu->name);
> +     free(rte_pmu);
> +}
<snip>

Reply via email to