>-----Original Message-----
>From: Konstantin Ananyev <konstantin.anan...@huawei.com>
>Sent: Monday, February 27, 2023 9:53 PM
>To: Tomasz Duszynski <tduszyn...@marvell.com>; Konstantin Ananyev 
><konstantin.v.anan...@yandex.ru>;
>dev@dpdk.org
>Subject: RE: [EXT] Re: [PATCH v11 1/4] lib: add generic support for reading PMU events
>
>
>
>> >> Add support for programming PMU counters and reading their values
>> >> at runtime, bypassing the kernel completely.
>> >>
>> >> This is especially useful in cases where CPU cores are isolated, i.e.
>> >> run dedicated tasks. In such cases one cannot use the standard perf
>> >> utility without sacrificing latency and performance.
>> >>
>> >> Signed-off-by: Tomasz Duszynski <tduszyn...@marvell.com>
>> >> Acked-by: Morten Brørup <m...@smartsharesystems.com>
>> >
>> >Few more comments/questions below.
>> >
>> >
>> >> diff --git a/lib/pmu/rte_pmu.c b/lib/pmu/rte_pmu.c
>> >> new file mode 100644
>> >> index 0000000000..950f999cb7
>> >> --- /dev/null
>> >> +++ b/lib/pmu/rte_pmu.c
>> >> @@ -0,0 +1,460 @@
>> >> +/* SPDX-License-Identifier: BSD-3-Clause
>> >> + * Copyright(C) 2023 Marvell International Ltd.
>> >> + */
>> >> +
>> >> +#include <ctype.h>
>> >> +#include <dirent.h>
>> >> +#include <errno.h>
>> >> +#include <regex.h>
>> >> +#include <stdlib.h>
>> >> +#include <string.h>
>> >> +#include <sys/ioctl.h>
>> >> +#include <sys/mman.h>
>> >> +#include <sys/queue.h>
>> >> +#include <sys/syscall.h>
>> >> +#include <unistd.h>
>> >> +
>> >> +#include <rte_atomic.h>
>> >> +#include <rte_per_lcore.h>
>> >> +#include <rte_pmu.h>
>> >> +#include <rte_spinlock.h>
>> >> +#include <rte_tailq.h>
>> >> +
>> >> +#include "pmu_private.h"
>> >> +
>> >> +#define EVENT_SOURCE_DEVICES_PATH "/sys/bus/event_source/devices"
>> >> +
>> >> +#define GENMASK_ULL(h, l) ((~0ULL - (1ULL << (l)) + 1) & (~0ULL >> ((64 - 1 - (h)))))
>> >> +#define FIELD_PREP(m, v) (((uint64_t)(v) << (__builtin_ffsll(m) - 1)) & (m))
>> >> +
>> >> +RTE_DEFINE_PER_LCORE(struct rte_pmu_event_group, _event_group);
>> >> +struct rte_pmu rte_pmu;
>> >> +
>> >> +/*
>> >> + * Following __rte_weak functions provide default no-op. Architectures should override
>> >> + * them if necessary.
>> >> + */
>> >> +
>> >> +int
>> >> +__rte_weak pmu_arch_init(void)
>> >> +{
>> >> + return 0;
>> >> +}
>> >> +
>> >> +void
>> >> +__rte_weak pmu_arch_fini(void)
>> >> +{
>> >> +}
>> >> +
>> >> +void
>> >> +__rte_weak pmu_arch_fixup_config(uint64_t __rte_unused config[3])
>> >> +{ }
>> >> +
>> >> +static int
>> >> +get_term_format(const char *name, int *num, uint64_t *mask)
>> >> +{
>> >> + char path[PATH_MAX];
>> >> + char *config = NULL;
>> >> + int high, low, ret;
>> >> + FILE *fp;
>> >> +
>> >> + *num = *mask = 0;
>> >> + snprintf(path, sizeof(path), EVENT_SOURCE_DEVICES_PATH "/%s/format/%s", 
>> >> rte_pmu.name,
>name);
>> >> + fp = fopen(path, "r");
>> >> + if (fp == NULL)
>> >> +         return -errno;
>> >> +
>> >> + errno = 0;
>> >> + ret = fscanf(fp, "%m[^:]:%d-%d", &config, &low, &high);
>> >> + if (ret < 2) {
>> >> +         ret = -ENODATA;
>> >> +         goto out;
>> >> + }
>> >> + if (errno) {
>> >> +         ret = -errno;
>> >> +         goto out;
>> >> + }
>> >> +
>> >> + if (ret == 2)
>> >> +         high = low;
>> >> +
>> >> + *mask = GENMASK_ULL(high, low);
>> >> + /* Last digit should be [012]. If last digit is missing 0 is implied. */
>> >> + *num = config[strlen(config) - 1];
>> >> + *num = isdigit(*num) ? *num - '0' : 0;
>> >> +
>> >> + ret = 0;
>> >> +out:
>> >> + free(config);
>> >> + fclose(fp);
>> >> +
>> >> + return ret;
>> >> +}
>> >> +
>> >> +static int
>> >> +parse_event(char *buf, uint64_t config[3])
>> >> +{
>> >> + char *token, *term;
>> >> + int num, ret, val;
>> >> + uint64_t mask;
>> >> +
>> >> + config[0] = config[1] = config[2] = 0;
>> >> +
>> >> + token = strtok(buf, ",");
>> >> + while (token) {
>> >> +         errno = 0;
>> >> +         /* <term>=<value> */
>> >> +         ret = sscanf(token, "%m[^=]=%i", &term, &val);
>> >> +         if (ret < 1)
>> >> +                 return -ENODATA;
>> >> +         if (errno)
>> >> +                 return -errno;
>> >> +         if (ret == 1)
>> >> +                 val = 1;
>> >> +
>> >> +         ret = get_term_format(term, &num, &mask);
>> >> +         free(term);
>> >> +         if (ret)
>> >> +                 return ret;
>> >> +
>> >> +         config[num] |= FIELD_PREP(mask, val);
>> >> +         token = strtok(NULL, ",");
>> >> + }
>> >> +
>> >> + return 0;
>> >> +}
>> >> +
>> >> +static int
>> >> +get_event_config(const char *name, uint64_t config[3])
>> >> +{
>> >> + char path[PATH_MAX], buf[BUFSIZ];
>> >> + FILE *fp;
>> >> + int ret;
>> >> +
>> >> + snprintf(path, sizeof(path), EVENT_SOURCE_DEVICES_PATH "/%s/events/%s", 
>> >> rte_pmu.name,
>name);
>> >> + fp = fopen(path, "r");
>> >> + if (fp == NULL)
>> >> +         return -errno;
>> >> +
>> >> + ret = fread(buf, 1, sizeof(buf), fp);
>> >> + if (ret == 0) {
>> >> +         fclose(fp);
>> >> +
>> >> +         return -EINVAL;
>> >> + }
>> >> + fclose(fp);
>> >> + buf[ret] = '\0';
>> >> +
>> >> + return parse_event(buf, config);
>> >> +}
>> >> +
>> >> +static int
>> >> +do_perf_event_open(uint64_t config[3], int group_fd)
>> >> +{
>> >> + struct perf_event_attr attr = {
>> >> +         .size = sizeof(struct perf_event_attr),
>> >> +         .type = PERF_TYPE_RAW,
>> >> +         .exclude_kernel = 1,
>> >> +         .exclude_hv = 1,
>> >> +         .disabled = 1,
>> >> + };
>> >> +
>> >> + pmu_arch_fixup_config(config);
>> >> +
>> >> + attr.config = config[0];
>> >> + attr.config1 = config[1];
>> >> + attr.config2 = config[2];
>> >> +
>> >> + return syscall(SYS_perf_event_open, &attr, 0, -1, group_fd, 0);
>> >> +}
>> >> +
>> >> +static int
>> >> +open_events(struct rte_pmu_event_group *group)
>> >> +{
>> >> + struct rte_pmu_event *event;
>> >> + uint64_t config[3];
>> >> + int num = 0, ret;
>> >> +
>> >> + /* group leader gets created first, with fd = -1 */
>> >> + group->fds[0] = -1;
>> >> +
>> >> + TAILQ_FOREACH(event, &rte_pmu.event_list, next) {
>> >> +         ret = get_event_config(event->name, config);
>> >> +         if (ret)
>> >> +                 continue;
>> >> +
>> >> +         ret = do_perf_event_open(config, group->fds[0]);
>> >> +         if (ret == -1) {
>> >> +                 ret = -errno;
>> >> +                 goto out;
>> >> +         }
>> >> +
>> >> +         group->fds[event->index] = ret;
>> >> +         num++;
>> >> + }
>> >> +
>> >> + return 0;
>> >> +out:
>> >> + for (--num; num >= 0; num--) {
>> >> +         close(group->fds[num]);
>> >> +         group->fds[num] = -1;
>> >> + }
>> >> +
>> >> +
>> >> + return ret;
>> >> +}
>> >> +
>> >> +static int
>> >> +mmap_events(struct rte_pmu_event_group *group)
>> >> +{
>> >> + long page_size = sysconf(_SC_PAGE_SIZE);
>> >> + unsigned int i;
>> >> + void *addr;
>> >> + int ret;
>> >> +
>> >> + for (i = 0; i < rte_pmu.num_group_events; i++) {
>> >> +         addr = mmap(0, page_size, PROT_READ, MAP_SHARED, group->fds[i], 0);
>> >> +         if (addr == MAP_FAILED) {
>> >> +                 ret = -errno;
>> >> +                 goto out;
>> >> +         }
>> >> +
>> >> +         group->mmap_pages[i] = addr;
>> >> +         if (!group->mmap_pages[i]->cap_user_rdpmc) {
>> >> +                 ret = -EPERM;
>> >> +                 goto out;
>> >> +         }
>> >> + }
>> >> +
>> >> + return 0;
>> >> +out:
>> >> + for (; i; i--) {
>> >> +         munmap(group->mmap_pages[i - 1], page_size);
>> >> +         group->mmap_pages[i - 1] = NULL;
>> >> + }
>> >> +
>> >> + return ret;
>> >> +}
>> >> +
>> >> +static void
>> >> +cleanup_events(struct rte_pmu_event_group *group)
>> >> +{
>> >> + unsigned int i;
>> >> +
>> >> + if (group->fds[0] != -1)
>> >> +         ioctl(group->fds[0], PERF_EVENT_IOC_DISABLE, PERF_IOC_FLAG_GROUP);
>> >> +
>> >> + for (i = 0; i < rte_pmu.num_group_events; i++) {
>> >> +         if (group->mmap_pages[i]) {
>> >> +                 munmap(group->mmap_pages[i], sysconf(_SC_PAGE_SIZE));
>> >> +                 group->mmap_pages[i] = NULL;
>> >> +         }
>> >> +
>> >> +         if (group->fds[i] != -1) {
>> >> +                 close(group->fds[i]);
>> >> +                 group->fds[i] = -1;
>> >> +         }
>> >> + }
>> >> +
>> >> + group->enabled = false;
>> >> +}
>> >> +
>> >> +int
>> >> +__rte_pmu_enable_group(void)
>> >> +{
>> >> + struct rte_pmu_event_group *group = &RTE_PER_LCORE(_event_group);
>> >> + int ret;
>> >> +
>> >> + if (rte_pmu.num_group_events == 0)
>> >> +         return -ENODEV;
>> >> +
>> >> + ret = open_events(group);
>> >> + if (ret)
>> >> +         goto out;
>> >> +
>> >> + ret = mmap_events(group);
>> >> + if (ret)
>> >> +         goto out;
>> >> +
>> >> + if (ioctl(group->fds[0], PERF_EVENT_IOC_RESET, PERF_IOC_FLAG_GROUP) == -1) {
>> >> +         ret = -errno;
>> >> +         goto out;
>> >> + }
>> >> +
>> >> + if (ioctl(group->fds[0], PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP) == -1) {
>> >> +         ret = -errno;
>> >> +         goto out;
>> >> + }
>> >> +
>> >> + rte_spinlock_lock(&rte_pmu.lock);
>> >> + TAILQ_INSERT_TAIL(&rte_pmu.event_group_list, group, next);
>> >
>> >Hmm.. so we insert pointer to TLS variable into the global list?
>> >Wonder what would happen if that thread get terminated?
>>
>> Nothing special. Any pointers to that thread's thread-local data are
>> invalidated.
>>
>> >Can memory from its TLS block get re-used (by other thread or for other 
>> >purposes)?
>> >
>>
>> Why would any other thread reuse that?
>> Eventually main thread will need that data to do the cleanup.
>
>I understand that main thread would need to access that data.
>I am not sure that it would be able to.
>Imagine thread calls rte_pmu_read(...) and then terminates, while program 
>continues to run.
>As I understand address of its RTE_PER_LCORE(_event_group) will still remain in
>rte_pmu.event_group_list, even if it is probably not valid any more.
>

Okay, got your point. In DPDK that will not happen - we do not spawn/kill lcores
at runtime. In other scenarios this approach will not work because once a thread
terminates its per-thread data becomes invalid.
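
To illustrate that concern in a non-DPDK scenario, a rough sketch (hypothetical
thread function, not part of this patch):

#include <rte_common.h>
#include <rte_pmu.h>

/* Hypothetical short-lived thread, shown only to illustrate the lifetime issue. */
static void *
short_lived_worker(void *arg)
{
        RTE_SET_USED(arg);

        /* The first read on this thread lazily enables the group and inserts
         * the address of its RTE_PER_LCORE(_event_group) into
         * rte_pmu.event_group_list.
         */
        (void)rte_pmu_read(0);

        /* When this thread exits its TLS block is released, yet the global
         * list still holds the stale pointer, which rte_pmu_fini() would
         * later dereference during cleanup.
         */
        return NULL;
}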

>> >
>> >> + rte_spinlock_unlock(&rte_pmu.lock);
>> >> + group->enabled = true;
>> >> +
>> >> + return 0;
>> >> +
>> >> +out:
>> >> + cleanup_events(group);
>> >> +
>> >> + return ret;
>> >> +}
>> >> +
>> >> +static int
>> >> +scan_pmus(void)
>> >> +{
>> >> + char path[PATH_MAX];
>> >> + struct dirent *dent;
>> >> + const char *name;
>> >> + DIR *dirp;
>> >> +
>> >> + dirp = opendir(EVENT_SOURCE_DEVICES_PATH);
>> >> + if (dirp == NULL)
>> >> +         return -errno;
>> >> +
>> >> + while ((dent = readdir(dirp))) {
>> >> +         name = dent->d_name;
>> >> +         if (name[0] == '.')
>> >> +                 continue;
>> >> +
>> >> +         /* sysfs entry should either contain cpus or be a cpu */
>> >> +         if (!strcmp(name, "cpu"))
>> >> +                 break;
>> >> +
>> >> +         snprintf(path, sizeof(path), EVENT_SOURCE_DEVICES_PATH "/%s/cpus", name);
>> >> +         if (access(path, F_OK) == 0)
>> >> +                 break;
>> >> + }
>> >> +
>> >> + if (dent) {
>> >> +         rte_pmu.name = strdup(name);
>> >> +         if (rte_pmu.name == NULL) {
>> >> +                 closedir(dirp);
>> >> +
>> >> +                 return -ENOMEM;
>> >> +         }
>> >> + }
>> >> +
>> >> + closedir(dirp);
>> >> +
>> >> + return rte_pmu.name ? 0 : -ENODEV;
>> >> +}
>> >> +
>> >> +static struct rte_pmu_event *
>> >> +new_event(const char *name)
>> >> +{
>> >> + struct rte_pmu_event *event;
>> >> +
>> >> + event = calloc(1, sizeof(*event));
>> >> + if (event == NULL)
>> >> +         goto out;
>> >> +
>> >> + event->name = strdup(name);
>> >> + if (event->name == NULL) {
>> >> +         free(event);
>> >> +         event = NULL;
>> >> + }
>> >> +
>> >> +out:
>> >> + return event;
>> >> +}
>> >> +
>> >> +static void
>> >> +free_event(struct rte_pmu_event *event)
>> >> +{
>> >> + free(event->name);
>> >> + free(event);
>> >> +}
>> >> +
>> >> +int
>> >> +rte_pmu_add_event(const char *name)
>> >> +{
>> >> + struct rte_pmu_event *event;
>> >> + char path[PATH_MAX];
>> >> +
>> >> + if (rte_pmu.name == NULL)
>> >> +         return -ENODEV;
>> >> +
>> >> + if (rte_pmu.num_group_events + 1 >= MAX_NUM_GROUP_EVENTS)
>> >> +         return -ENOSPC;
>> >> +
>> >> + snprintf(path, sizeof(path), EVENT_SOURCE_DEVICES_PATH "/%s/events/%s", 
>> >> rte_pmu.name,
>name);
>> >> + if (access(path, R_OK))
>> >> +         return -ENODEV;
>> >> +
>> >> + TAILQ_FOREACH(event, &rte_pmu.event_list, next) {
>> >> +         if (!strcmp(event->name, name))
>> >> +                 return event->index;
>> >> +         continue;
>> >> + }
>> >> +
>> >> + event = new_event(name);
>> >> + if (event == NULL)
>> >> +         return -ENOMEM;
>> >> +
>> >> + event->index = rte_pmu.num_group_events++;
>> >> + TAILQ_INSERT_TAIL(&rte_pmu.event_list, event, next);
>> >> +
>> >> + return event->index;
>> >> +}
>> >> +
>> >> +int
>> >> +rte_pmu_init(void)
>> >> +{
>> >> + int ret;
>> >> +
>> >> + /* Allow calling init from multiple contexts within a single thread. This simplifies
>> >> +  * resource management a bit e.g in case fast-path tracepoint has already been enabled
>> >> +  * via command line but application doesn't care enough and performs init/fini again.
>> >> +  */
>> >> + if (rte_pmu.initialized != 0) {
>> >> +         rte_pmu.initialized++;
>> >> +         return 0;
>> >> + }
>> >> +
>> >> + ret = scan_pmus();
>> >> + if (ret)
>> >> +         goto out;
>> >> +
>> >> + ret = pmu_arch_init();
>> >> + if (ret)
>> >> +         goto out;
>> >> +
>> >> + TAILQ_INIT(&rte_pmu.event_list);
>> >> + TAILQ_INIT(&rte_pmu.event_group_list);
>> >> + rte_spinlock_init(&rte_pmu.lock);
>> >> + rte_pmu.initialized = 1;
>> >> +
>> >> + return 0;
>> >> +out:
>> >> + free(rte_pmu.name);
>> >> + rte_pmu.name = NULL;
>> >> +
>> >> + return ret;
>> >> +}
>> >> +
>> >> +void
>> >> +rte_pmu_fini(void)
>> >> +{
>> >> + struct rte_pmu_event_group *group, *tmp_group;
>> >> + struct rte_pmu_event *event, *tmp_event;
>> >> +
>> >> + /* cleanup once init count drops to zero */
>> >> + if (rte_pmu.initialized == 0 || --rte_pmu.initialized != 0)
>> >> +         return;
>> >> +
>> >> + RTE_TAILQ_FOREACH_SAFE(event, &rte_pmu.event_list, next, tmp_event) {
>> >> +         TAILQ_REMOVE(&rte_pmu.event_list, event, next);
>> >> +         free_event(event);
>> >> + }
>> >> +
>> >> + RTE_TAILQ_FOREACH_SAFE(group, &rte_pmu.event_group_list, next, tmp_group) {
>> >> +         TAILQ_REMOVE(&rte_pmu.event_group_list, group, next);
>> >> +         cleanup_events(group);
>> >> + }
>> >> +
>> >> + pmu_arch_fini();
>> >> + free(rte_pmu.name);
>> >> + rte_pmu.name = NULL;
>> >> + rte_pmu.num_group_events = 0;
>> >> +}
>> >> diff --git a/lib/pmu/rte_pmu.h b/lib/pmu/rte_pmu.h
>> >> new file mode 100644
>> >> index 0000000000..6b664c3336
>> >> --- /dev/null
>> >> +++ b/lib/pmu/rte_pmu.h
>> >> @@ -0,0 +1,212 @@
>> >> +/* SPDX-License-Identifier: BSD-3-Clause
>> >> + * Copyright(c) 2023 Marvell
>> >> + */
>> >> +
>> >> +#ifndef _RTE_PMU_H_
>> >> +#define _RTE_PMU_H_
>> >> +
>> >> +/**
>> >> + * @file
>> >> + *
>> >> + * PMU event tracing operations
>> >> + *
>> >> + * This file defines generic API and types necessary to setup PMU and
>> >> + * read selected counters in runtime.
>> >> + */
>> >> +
>> >> +#ifdef __cplusplus
>> >> +extern "C" {
>> >> +#endif
>> >> +
>> >> +#include <linux/perf_event.h>
>> >> +
>> >> +#include <rte_atomic.h>
>> >> +#include <rte_branch_prediction.h>
>> >> +#include <rte_common.h>
>> >> +#include <rte_compat.h>
>> >> +#include <rte_spinlock.h>
>> >> +
>> >> +/** Maximum number of events in a group */
>> >> +#define MAX_NUM_GROUP_EVENTS 8
>> >
>> >forgot RTE_ prefix.
>> >In fact, do you really need number of events in group to be hard-coded?
>> >Couldn't mmap_pages[] and fds[] be allocated dynamically by enable_group()?
>> >
>>
>> 8 is a reasonable number I think. x86/ARM actually have fewer than that
>> (was it something like 4?).
>> Moreover events are scheduled as a group so there must be enough hw
>> counters available for that to succeed. So this number should cover current 
>> needs.
>
>If you think 8 will be enough to cover all possible future cases - I am ok 
>either way.
>Still need RTE_ prefix for it.
>

Okay that can be added. 

>> >> +
>> >> +/**
>> >> + * A structure describing a group of events.
>> >> + */
>> >> +struct rte_pmu_event_group {
>> >> + struct perf_event_mmap_page *mmap_pages[MAX_NUM_GROUP_EVENTS]; /**< array of user pages */
>> >> + int fds[MAX_NUM_GROUP_EVENTS]; /**< array of event descriptors */
>> >> + bool enabled; /**< true if group was enabled on particular lcore */
>> >> + TAILQ_ENTRY(rte_pmu_event_group) next; /**< list entry */
>> >> +} __rte_cache_aligned;
>> >> +
>> >
>> >Even if we'd decide to keep rte_pmu_read() as static inline (still
>> >not sure it is a good idea),
>>
>> We want to save as many cpu cycles as we possibly can and inlining
>> does help in that matter.
>
>Ok, so asking same question from different thread: how many cycles it will 
>save?
>What is the difference in terms of performance when you have this function 
>inlined vs not inlined?
>

On an x86 setup which is not under load, with no cpusets configured, etc., *just* not
inlining rte_pmu_read() decreases performance by roughly 24% (44 vs 58 cpu cycles).
At least that is what trace_perf_autotest reports.
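
For reference, numbers of that sort can be reproduced with something along these
lines (a minimal sketch, not the actual trace_perf_autotest code):

#include <inttypes.h>
#include <stdio.h>

#include <rte_cycles.h>
#include <rte_pmu.h>

#define N_READS 1000000

/* Hypothetical helper: average cost of rte_pmu_read() in TSC cycles. */
static void
measure_pmu_read(unsigned int idx)
{
        uint64_t start, end, sum = 0;
        unsigned int i;

        start = rte_rdtsc_precise();
        for (i = 0; i < N_READS; i++)
                sum += rte_pmu_read(idx); /* accumulate so the loop is not optimized out */
        end = rte_rdtsc_precise();

        printf("avg cycles per read: %" PRIu64 " (sum %" PRIu64 ")\n",
               (end - start) / N_READS, sum);
}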


>> >why these two struct below (rte_pmu_event and rte_pmu) have to be public?
>> >I think both can be safely moved away from public headers.
>> >
>>
>> struct rte_pmu_event can be hidden I guess.
>> struct rte_pmu is used in this header hence cannot be moved elsewhere.
>
>Not sure why?
>Is that because you use it inside rte_pmu_read()?
>But that check I think can be safely moved into __rte_pmu_enable_group() or 
>probably even into
>rte_pmu_add_event().

No, we should not do that. Otherwise we'd need to make a function call. Even though
the check would happen early on, the function prologue/epilogue would still be
executed. This takes cycles.
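
Just for context, the fast path is roughly this (simplified sketch of the inlined
reader; the early checks on rte_pmu are the ones discussed above):

static __rte_always_inline uint64_t
rte_pmu_read(unsigned int index)
{
        struct rte_pmu_event_group *group = &RTE_PER_LCORE(_event_group);
        int ret;

        /* Early checks happen inline - no call, no prologue/epilogue. */
        if (unlikely(!rte_pmu.initialized))
                return 0;

        /* Group is set up lazily on the first read done by this lcore. */
        if (unlikely(!group->enabled)) {
                ret = __rte_pmu_enable_group();
                if (ret)
                        return 0;
        }

        if (unlikely(index >= rte_pmu.num_group_events))
                return 0;

        return __rte_pmu_read_userpage(group->mmap_pages[index]);
}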

>
>> >
>> >> +/**
>> >> + * A structure describing an event.
>> >> + */
>> >> +struct rte_pmu_event {
>> >> + char *name; /**< name of an event */
>> >> + unsigned int index; /**< event index into fds/mmap_pages */
>> >> + TAILQ_ENTRY(rte_pmu_event) next; /**< list entry */
>> >> +};
>> >
>> >> +
>> >> +/**
>> >> + * A PMU state container.
>> >> + */
>> >> +struct rte_pmu {
>> >> + char *name; /**< name of core PMU listed under /sys/bus/event_source/devices */
>> >> + rte_spinlock_t lock; /**< serialize access to event group list */
>> >> + TAILQ_HEAD(, rte_pmu_event_group) event_group_list; /**< list of event groups */
>> >> + unsigned int num_group_events; /**< number of events in a group */
>> >> + TAILQ_HEAD(, rte_pmu_event) event_list; /**< list of matching events */
>> >> + unsigned int initialized; /**< initialization counter */
>> >> +};
>> >> +
>> >> +/** lcore event group */
>> >> +RTE_DECLARE_PER_LCORE(struct rte_pmu_event_group, _event_group);
>> >> +
>> >> +/** PMU state container */
>> >> +extern struct rte_pmu rte_pmu;
>> >> +
>> >> +/** Each architecture supporting PMU needs to provide its own version */
>> >> +#ifndef rte_pmu_pmc_read
>> >> +#define rte_pmu_pmc_read(index) ({ 0; })
>> >> +#endif
>> >> +
>> >> +/**
>> >> + * @warning
>> >> + * @b EXPERIMENTAL: this API may change without prior notice
>> >> + *
>> >> + * Read PMU counter.
>> >> + *
>> >> + * @warning This should be not called directly.
>> >> + *
>> >> + * @param pc
>> >> + *   Pointer to the mmapped user page.
>> >> + * @return
>> >> + *   Counter value read from hardware.
>> >> + */
>> >> +static __rte_always_inline uint64_t
>> >> +__rte_pmu_read_userpage(struct perf_event_mmap_page *pc)
>> >> +{
>> >> + uint64_t width, offset;
>> >> + uint32_t seq, index;
>> >> + int64_t pmc;
>> >> +
>> >> + for (;;) {
>> >> +         seq = pc->lock;
>> >> +         rte_compiler_barrier();
>> >> +         index = pc->index;
>> >> +         offset = pc->offset;
>> >> +         width = pc->pmc_width;
>> >> +
>> >> +         /* index set to 0 means that particular counter cannot be used */
>> >> +         if (likely(pc->cap_user_rdpmc && index)) {
>> >
>> >In mmap_events() you return EPERM if cap_user_rdpmc is not enabled.
>> >Do you need another check here? Or this capability can be disabled by
>> >kernel at run-time?
>> >
>>
>> That extra check in mmap_events() may be removed actually. Some archs
>> allow disabling rdpmc reads (I think on x86 one can do that) so this
>> check needs to stay.
>>
>> >
>> >> +                 pmc = rte_pmu_pmc_read(index - 1);
>> >> +                 pmc <<= 64 - width;
>> >> +                 pmc >>= 64 - width;
>> >> +                 offset += pmc;
>> >> +         }
>> >> +
>> >> +         rte_compiler_barrier();
>> >> +
>> >> +         if (likely(pc->lock == seq))
>> >> +                 return offset;
>> >> + }
>> >> +
>> >> + return 0;
>> >> +}
>> >> +
>> >> +/**
>> >> + * @warning
>> >> + * @b EXPERIMENTAL: this API may change without prior notice
>> >> + *
>> >> + * Enable group of events on the calling lcore.
>> >> + *
>> >> + * @warning This should be not called directly.
>> >
>> >__rte_internal ?
>> >
>>
>> No, this cannot be internal because that would make functions calling it
>> internal as well, hence apps won't be able to use them. This has
>> already been brought up by one of the reviewers.
>
>Ok, then we probably can mark it with ' @internal' tag in formal comments?
>

I added a warning not to call that directly. Since the function is not internal (in
DPDK parlance) per se, I don't think we should add to the confusion with that extra
tag.

>>
>> >> + *
>> >> + * @return
>> >> + *   0 in case of success, negative value otherwise.
>> >> + */
>> >> +__rte_experimental
>> >> +int
>> >> +__rte_pmu_enable_group(void);
>> >> +
>> >> +/**
>> >> + * @warning
>> >> + * @b EXPERIMENTAL: this API may change without prior notice
>> >> + *
>> >> + * Initialize PMU library.
>> >> + *
>> >> + * @warning This should be not called directly.
>> >
>> >Hmm.. then who should call it?
>> >If it not supposed to be called directly, why to declare it here?
>> >
>>
>> This is inlined and has one caller i.e rte_pmu_read().
>
>I thought we are talking here about rte_pmu_init().
>I don't see where it is inlined and still not clear why it can't be called 
>directly.
>

No, this cannot be called by init because groups are configured at runtime. That is
why __rte_pmu_enable_group() is called once from rte_pmu_read().

*Other* code should not call it directly. And yes, it is not inlined - my mistake.

>> >> + *
>> >> + * @return
>> >> + *   0 in case of success, negative value otherwise.
>> >> + */
>> >
>> >Probably worth to mention that this function is not MT safe.
>> >Same for _fini_ and add_event.
>> >Also worth to mention that all control-path functions
>> >(init/fini/add_event) and data-path (pmu_read) can't be called concurrently.
>> >
>>
>> Yes, they are meant to be called from the main thread.
>
>Ok, then please add that into formal API comments.
>
>> >> +__rte_experimental
>> >> +int
>> >> +rte_pmu_init(void);
>> >> +
