> >> >> diff --git a/lib/pmu/rte_pmu.h b/lib/pmu/rte_pmu.h
> >> >> new file mode 100644
> >> >> index 0000000000..6b664c3336
> >> >> --- /dev/null
> >> >> +++ b/lib/pmu/rte_pmu.h
> >> >> @@ -0,0 +1,212 @@
> >> >> +/* SPDX-License-Identifier: BSD-3-Clause
> >> >> + * Copyright(c) 2023 Marvell
> >> >> + */
> >> >> +
> >> >> +#ifndef _RTE_PMU_H_
> >> >> +#define _RTE_PMU_H_
> >> >> +
> >> >> +/**
> >> >> + * @file
> >> >> + *
> >> >> + * PMU event tracing operations
> >> >> + *
> >> >> + * This file defines generic API and types necessary to setup PMU and
> >> >> + * read selected counters in runtime.
> >> >> + */
> >> >> +
> >> >> +#ifdef __cplusplus
> >> >> +extern "C" {
> >> >> +#endif
> >> >> +
> >> >> +#include <linux/perf_event.h>
> >> >> +
> >> >> +#include <rte_atomic.h>
> >> >> +#include <rte_branch_prediction.h>
> >> >> +#include <rte_common.h>
> >> >> +#include <rte_compat.h>
> >> >> +#include <rte_spinlock.h>
> >> >> +
> >> >> +/** Maximum number of events in a group */
> >> >> +#define MAX_NUM_GROUP_EVENTS 8
> >> >> +
> >> >> +/**
> >> >> + * A structure describing a group of events.
> >> >> + */
> >> >> +struct rte_pmu_event_group {
> >> >> +	struct perf_event_mmap_page *mmap_pages[MAX_NUM_GROUP_EVENTS]; /**< array of user pages */
> >> >> +	int fds[MAX_NUM_GROUP_EVENTS]; /**< array of event descriptors */
> >> >> +	bool enabled; /**< true if group was enabled on particular lcore */
> >> >> +	TAILQ_ENTRY(rte_pmu_event_group) next; /**< list entry */
> >> >> +} __rte_cache_aligned;
> >> >> +
> >> >> +/**
> >> >> + * A structure describing an event.
> >> >> + */
> >> >> +struct rte_pmu_event {
> >> >> +	char *name; /**< name of an event */
> >> >> +	unsigned int index; /**< event index into fds/mmap_pages */
> >> >> +	TAILQ_ENTRY(rte_pmu_event) next; /**< list entry */
> >> >> +};
> >> >> +
> >> >> +/**
> >> >> + * A PMU state container.
> >> >> + */
> >> >> +struct rte_pmu {
> >> >> +	char *name; /**< name of core PMU listed under /sys/bus/event_source/devices */
> >> >> +	rte_spinlock_t lock; /**< serialize access to event group list */
> >> >> +	TAILQ_HEAD(, rte_pmu_event_group) event_group_list; /**< list of event groups */
> >> >> +	unsigned int num_group_events; /**< number of events in a group */
> >> >> +	TAILQ_HEAD(, rte_pmu_event) event_list; /**< list of matching events */
> >> >> +	unsigned int initialized; /**< initialization counter */
> >> >> +};
> >> >> +
> >> >> +/** lcore event group */
> >> >> +RTE_DECLARE_PER_LCORE(struct rte_pmu_event_group, _event_group);
> >> >> +
> >> >> +/** PMU state container */
> >> >> +extern struct rte_pmu rte_pmu;
> >> >> +
> >> >> +/** Each architecture supporting PMU needs to provide its own version */
> >> >> +#ifndef rte_pmu_pmc_read
> >> >> +#define rte_pmu_pmc_read(index) ({ 0; })
> >> >> +#endif
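For reference, an arch-specific override of the stub above could look roughly like the x86 sketch below (illustration only, not part of the patch; __rte_x86_rdpmc is a placeholder name, the same includes as in this header are assumed, and it relies on the kernel permitting user-space rdpmc, i.e. cap_user_rdpmc being set):

/* Hypothetical x86 override of the generic rte_pmu_pmc_read() stub (sketch only). */
static __rte_always_inline uint64_t
__rte_x86_rdpmc(unsigned int index)
{
	uint64_t low, high;

	/* RDPMC: counter index goes in ECX, result comes back in EDX:EAX */
	asm volatile("rdpmc" : "=a" (low), "=d" (high) : "c" (index));

	return low | (high << 32);
}
#define rte_pmu_pmc_read(index) __rte_x86_rdpmc(index)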
> >> >> +
> >> >> +/**
> >> >> + * @warning
> >> >> + * @b EXPERIMENTAL: this API may change without prior notice
> >> >> + *
> >> >> + * Read PMU counter.
> >> >> + *
> >> >> + * @warning This should not be called directly.
> >> >> + *
> >> >> + * @param pc
> >> >> + *   Pointer to the mmapped user page.
> >> >> + * @return
> >> >> + *   Counter value read from hardware.
> >> >> + */
> >> >> +static __rte_always_inline uint64_t
> >> >> +__rte_pmu_read_userpage(struct perf_event_mmap_page *pc)
> >> >> +{
> >> >> +	uint64_t width, offset;
> >> >> +	uint32_t seq, index;
> >> >> +	int64_t pmc;
> >> >> +
> >> >> +	for (;;) {
> >> >> +		seq = pc->lock;
> >> >> +		rte_compiler_barrier();
> >> >
> >> >Are you sure that compiler_barrier() is enough here?
> >> >On some archs CPU itself has freedom to re-order reads.
> >> >Or I am missing something obvious here?
> >> >
> >>
> >> It's a matter of not keeping old stuff cached in registers and making
> >> sure that we have two reads of lock. CPU reordering won't do any harm
> >> here.
> >
> >Sorry, I didn't get you here:
> >Suppose CPU will re-order reads and will read lock *after* index or offset
> >value.
> >Wouldn't it mean that in that case index and/or offset can contain
> >old/invalid values?
> >
>
> This number is just an indicator whether kernel did change something or not.

You are talking about pc->lock, right?
Yes, I do understand that it is a sort of seqlock.
That's why I am puzzled why we do not care about possible cpu read-reordering.
Manual for perf_event_open() also has a code snippet with compiler barrier only...
> If cpu reordering will come into play then this will not change anything from
> pov of this loop.
> All we want is fresh data when needed and no involvement of compiler when it
> comes to reordering code.

Ok, can you explain to me why the following could not happen:
T0:
pc->lock==0; pc->index==I1; pc->offset==O1;
T1:
cpu #0 reads pmu (due to cpu read reorder, we get the index value before the lock):
   index = pc->index;   // index==I1
T2:
cpu #1, kernel perf_event_update_userpage:
   pc->lock++;          // pc->lock==1
   pc->index = I2;
   pc->offset = O2;
   ...
   pc->lock++;          // pc->lock==2
T3:
cpu #0 continues reading pmu:
   seq = pc->lock;      // seq==2
   offset = pc->offset; // offset==O2
   ....
   pmc = rte_pmu_pmc_read(index - 1);  // note that we read at I1, not I2
   offset += pmc;       // offset == O2 + pmc_read(I1 - 1)
   if (pc->lock == seq) // they are equal, return
       return offset;

Or it can happen, but for some reason we don't care much?
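To make the concern concrete: a seqcount-style reader usually brackets the data reads with read barriers so they cannot drift outside the two reads of the sequence word. Below is a sketch of such a variant (illustration only, not a proposal for the patch; the function name and the use of rte_smp_rmb() are my assumptions, same context as the header above):

/* Hypothetical variant of the userpage read loop with CPU read barriers (sketch only). */
static __rte_always_inline uint64_t
__rte_pmu_read_userpage_rmb(struct perf_event_mmap_page *pc)
{
	uint64_t width, offset;
	uint32_t seq, index;
	int64_t pmc;

	for (;;) {
		seq = pc->lock;
		rte_smp_rmb(); /* data reads below must not be hoisted above the seq read */

		index = pc->index;
		offset = pc->offset;
		width = pc->pmc_width;

		if (likely(pc->cap_user_rdpmc && index)) {
			pmc = rte_pmu_pmc_read(index - 1);
			pmc <<= 64 - width;
			pmc >>= 64 - width;
			offset += pmc;
		}

		rte_smp_rmb(); /* data reads above must complete before re-checking seq */
		if (likely(pc->lock == seq))
			return offset;
	}
}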
> >>
> >> >> +		index = pc->index;
> >> >> +		offset = pc->offset;
> >> >> +		width = pc->pmc_width;
> >> >> +
> >> >> +		/* index set to 0 means that particular counter cannot be used */
> >> >> +		if (likely(pc->cap_user_rdpmc && index)) {
> >> >> +			pmc = rte_pmu_pmc_read(index - 1);
> >> >> +			pmc <<= 64 - width;
> >> >> +			pmc >>= 64 - width;
> >> >> +			offset += pmc;
> >> >> +		}
> >> >> +
> >> >> +		rte_compiler_barrier();
> >> >> +
> >> >> +		if (likely(pc->lock == seq))
> >> >> +			return offset;
> >> >> +	}
> >> >> +
> >> >> +	return 0;
> >> >> +}
> >> >> +
> >> >> +/**
> >> >> + * @warning
> >> >> + * @b EXPERIMENTAL: this API may change without prior notice
> >> >> + *
> >> >> + * Enable group of events on the calling lcore.
> >> >> + *
> >> >> + * @warning This should not be called directly.
> >> >> + *
> >> >> + * @return
> >> >> + *   0 in case of success, negative value otherwise.
> >> >> + */
> >> >> +__rte_experimental
> >> >> +int
> >> >> +__rte_pmu_enable_group(void);
> >> >> +
> >> >> +/**
> >> >> + * @warning
> >> >> + * @b EXPERIMENTAL: this API may change without prior notice
> >> >> + *
> >> >> + * Initialize PMU library.
> >> >> + *
> >> >> + * @warning This should not be called directly.
> >> >> + *
> >> >> + * @return
> >> >> + *   0 in case of success, negative value otherwise.
> >> >> + */
> >> >> +__rte_experimental
> >> >> +int
> >> >> +rte_pmu_init(void);
> >> >> +
> >> >> +/**
> >> >> + * @warning
> >> >> + * @b EXPERIMENTAL: this API may change without prior notice
> >> >> + *
> >> >> + * Finalize PMU library. This should be called after PMU counters are no longer being read.
> >> >> + */
> >> >> +__rte_experimental
> >> >> +void
> >> >> +rte_pmu_fini(void);
> >> >> +
> >> >> +/**
> >> >> + * @warning
> >> >> + * @b EXPERIMENTAL: this API may change without prior notice
> >> >> + *
> >> >> + * Add event to the group of enabled events.
> >> >> + *
> >> >> + * @param name
> >> >> + *   Name of an event listed under /sys/bus/event_source/devices/pmu/events.
> >> >> + * @return
> >> >> + *   Event index in case of success, negative value otherwise.
> >> >> + */
> >> >> +__rte_experimental
> >> >> +int
> >> >> +rte_pmu_add_event(const char *name);
> >> >> +
> >> >> +/**
> >> >> + * @warning
> >> >> + * @b EXPERIMENTAL: this API may change without prior notice
> >> >> + *
> >> >> + * Read hardware counter configured to count occurrences of an event.
> >> >> + *
> >> >> + * @param index
> >> >> + *   Index of an event to be read.
> >> >> + * @return
> >> >> + *   Event value read from register. In case of errors or lack of support
> >> >> + *   0 is returned. In other words, stream of zeros in a trace file
> >> >> + *   indicates problem with reading particular PMU event register.
> >> >> + */

Another question - do we really need to have __rte_pmu_read_userpage() and
rte_pmu_read() as static inline functions in a public header?
As I understand, because of that we also have to make the 'struct rte_pmu_*'
definitions public too.

> >> >> +__rte_experimental
> >> >> +static __rte_always_inline uint64_t
> >> >> +rte_pmu_read(unsigned int index)
> >> >> +{
> >> >> +	struct rte_pmu_event_group *group = &RTE_PER_LCORE(_event_group);
> >> >> +	int ret;
> >> >> +
> >> >> +	if (unlikely(!rte_pmu.initialized))
> >> >> +		return 0;
> >> >> +
> >> >> +	if (unlikely(!group->enabled)) {
> >> >> +		ret = __rte_pmu_enable_group();
> >> >> +		if (ret)
> >> >> +			return 0;
> >> >> +	}
> >> >> +
> >> >> +	if (unlikely(index >= rte_pmu.num_group_events))
> >> >> +		return 0;
> >> >> +
> >> >> +	return __rte_pmu_read_userpage(group->mmap_pages[index]);
> >> >> +}
> >> >> +
> >> >> +#ifdef __cplusplus
> >> >> +}
> >> >> +#endif
> >> >> +
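For context on the inlining question, a minimal usage sketch as I read the header (the wrapper function and the event name are placeholders; real event names live under /sys/bus/event_source/devices/<pmu>/events):

#include <stdint.h>
#include <rte_pmu.h>

static int
pmu_example(void)
{
	int idx;
	uint64_t val;

	if (rte_pmu_init() < 0)
		return -1;

	/* "cpu_cycles" is only an example event name */
	idx = rte_pmu_add_event("cpu_cycles");
	if (idx < 0) {
		rte_pmu_fini();
		return -1;
	}

	/* fast path: the per-lcore group is lazily enabled on the first read */
	val = rte_pmu_read(idx);
	(void)val;

	rte_pmu_fini();
	return 0;
}

Since rte_pmu_read() sits on the fast path, keeping it static inline is presumably the motivation for exposing the structures in the public header; a non-inline accessor would hide them at the cost of a function call per read.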