> From: Tomasz Duszynski [mailto:tduszyn...@marvell.com]
> Sent: Wednesday, 11 January 2023 00.47
>
> Add support for programming PMU counters and reading their values
> in runtime bypassing kernel completely.
>
> This is especially useful in cases where CPU cores are isolated
> (nohz_full) i.e run dedicated tasks. In such cases one cannot use
> standard perf utility without sacrificing latency and performance.
>
> Signed-off-by: Tomasz Duszynski <tduszyn...@marvell.com>
> ---
[...]

> +static int
> +do_perf_event_open(uint64_t config[3], unsigned int lcore_id, int group_fd)
> +{
> +	struct perf_event_attr attr = {
> +		.size = sizeof(struct perf_event_attr),
> +		.type = PERF_TYPE_RAW,
> +		.exclude_kernel = 1,
> +		.exclude_hv = 1,
> +		.disabled = 1,
> +	};
> +
> +	pmu_arch_fixup_config(config);
> +
> +	attr.config = config[0];
> +	attr.config1 = config[1];
> +	attr.config2 = config[2];
> +
> +	return syscall(SYS_perf_event_open, &attr, 0, rte_lcore_to_cpu_id(lcore_id), group_fd, 0);
> +}

If SYS_perf_event_open() must be called from the worker thread itself,
then lcore_id must not be passed as a parameter to do_perf_event_open().
Otherwise, I would expect to be able to call do_perf_event_open() from
the main thread and pass any lcore_id of a worker thread.

This comment applies to all functions that must be called from the
worker thread itself. It also applies to the functions that call such
functions.

[...]

> +/**
> + * A structure describing a group of events.
> + */
> +struct rte_pmu_event_group {
> +	int fds[MAX_NUM_GROUP_EVENTS]; /**< array of event descriptors */
> +	struct perf_event_mmap_page *mmap_pages[MAX_NUM_GROUP_EVENTS]; /**< array of user pages */
> +	bool enabled; /**< true if group was enabled on particular lcore */
> +};
> +
> +/**
> + * A structure describing an event.
> + */
> +struct rte_pmu_event {
> +	char *name; /** name of an event */
> +	unsigned int index; /** event index into fds/mmap_pages */
> +	TAILQ_ENTRY(rte_pmu_event) next; /** list entry */
> +};

Move the "enabled" field up, making it the first field in the
rte_pmu_event_group structure. This might reduce the number of
instructions required to check (!group->enabled) in rte_pmu_read().

Also, each instance of the structure is used individually per lcore, so
the structure should be cache line aligned to avoid unnecessarily
crossing cache lines.

I.e.:

struct rte_pmu_event_group {
	bool enabled; /**< true if group was enabled on particular lcore */
	int fds[MAX_NUM_GROUP_EVENTS]; /**< array of event descriptors */
	struct perf_event_mmap_page *mmap_pages[MAX_NUM_GROUP_EVENTS]; /**< array of user pages */
} __rte_cache_aligned;

> +
> +/**
> + * A PMU state container.
> + */
> +struct rte_pmu {
> +	char *name; /** name of core PMU listed under /sys/bus/event_source/devices */
> +	struct rte_pmu_event_group group[RTE_MAX_LCORE]; /**< per lcore event group data */
> +	unsigned int num_group_events; /**< number of events in a group */
> +	TAILQ_HEAD(, rte_pmu_event) event_list; /**< list of matching events */
> +};
> +
> +/** Pointer to the PMU state container */
> +extern struct rte_pmu rte_pmu;

Just "The PMU state container". It is not a pointer anymore. :-)

[...]

> +/**
> + * @internal
> + *
> + * Read PMU counter.
> + *
> + * @param pc
> + *   Pointer to the mmapped user page.
> + * @return
> + *   Counter value read from hardware.
> + */
> +__rte_internal
> +static __rte_always_inline uint64_t
> +rte_pmu_read_userpage(struct perf_event_mmap_page *pc)
> +{
> +	uint64_t width, offset;
> +	uint32_t seq, index;
> +	int64_t pmc;
> +
> +	for (;;) {
> +		seq = pc->lock;
> +		rte_compiler_barrier();
> +		index = pc->index;
> +		offset = pc->offset;
> +		width = pc->pmc_width;
> +

Please add a comment here about the special meaning of index == 0.
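For example, something roughly along these lines (the wording is only a
suggestion; my understanding of the perf user page ABI is that a zero
index means the event is not currently scheduled on a hardware counter,
so it cannot be read with rdpmc):

	/*
	 * The kernel sets index to 0 when the event is not currently
	 * scheduled on a hardware counter; rdpmc cannot be used then,
	 * and only the kernel maintained offset is returned.
	 */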
> +		if (likely(pc->cap_user_rdpmc && index)) {
> +			pmc = rte_pmu_pmc_read(index - 1);
> +			pmc <<= 64 - width;
> +			pmc >>= 64 - width;
> +			offset += pmc;
> +		}
> +
> +		rte_compiler_barrier();
> +
> +		if (likely(pc->lock == seq))
> +			return offset;
> +	}
> +
> +	return 0;
> +}

[...]

> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice
> + *
> + * Read hardware counter configured to count occurrences of an event.
> + *
> + * @param index
> + *   Index of an event to be read.
> + * @return
> + *   Event value read from register. In case of errors or lack of support
> + *   0 is returned. In other words, stream of zeros in a trace file
> + *   indicates problem with reading particular PMU event register.
> + */
> +__rte_experimental
> +static __rte_always_inline uint64_t
> +rte_pmu_read(unsigned int index)
> +{
> +	struct rte_pmu_event_group *group;
> +	int ret, lcore_id = rte_lcore_id();
> +
> +	group = &rte_pmu.group[lcore_id];
> +	if (unlikely(!group->enabled)) {
> +		ret = rte_pmu_enable_group(lcore_id);
> +		if (ret)
> +			return 0;
> +
> +		group->enabled = true;

Group->enabled should be set inside rte_pmu_enable_group(), not here.
A rough sketch of what I mean follows after the quoted code.

> +	}
> +
> +	if (unlikely(index >= rte_pmu.num_group_events))
> +		return 0;
> +
> +	return rte_pmu_read_userpage(group->mmap_pages[index]);
> +}
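To illustrate the comment above, combined with the earlier comment about
not passing lcore_id to functions that must run on the worker thread
anyway, here is a rough sketch of what I have in mind. It is not a
drop-in replacement; it assumes a rte_pmu_enable_group() variant that
takes no lcore_id, calls rte_lcore_id() internally, and sets
group->enabled on success:

__rte_experimental
static __rte_always_inline uint64_t
rte_pmu_read(unsigned int index)
{
	struct rte_pmu_event_group *group = &rte_pmu.group[rte_lcore_id()];

	if (unlikely(!group->enabled)) {
		/* Assumed to mark the group as enabled on success. */
		if (rte_pmu_enable_group() != 0)
			return 0;
	}

	if (unlikely(index >= rte_pmu.num_group_events))
		return 0;

	return rte_pmu_read_userpage(group->mmap_pages[index]);
}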