> From: Tomasz Duszynski [mailto:tduszyn...@marvell.com]
> Sent: Wednesday, 11 January 2023 00.47
>
> Add support for programming PMU counters and reading their values
> in runtime bypassing kernel completely.
>
> This is especially useful in cases where CPU cores are isolated
> (nohz_full) i.e run dedicated tasks. In such cases one cannot use
> standard perf utility without sacrificing latency and performance.
>
> Signed-off-by: Tomasz Duszynski <tduszyn...@marvell.com>
> ---
[...]

> +static int
> +do_perf_event_open(uint64_t config[3], unsigned int lcore_id, int group_fd)
> +{
> +	struct perf_event_attr attr = {
> +		.size = sizeof(struct perf_event_attr),
> +		.type = PERF_TYPE_RAW,
> +		.exclude_kernel = 1,
> +		.exclude_hv = 1,
> +		.disabled = 1,
> +	};
> +
> +	pmu_arch_fixup_config(config);
> +
> +	attr.config = config[0];
> +	attr.config1 = config[1];
> +	attr.config2 = config[2];
> +
> +	return syscall(SYS_perf_event_open, &attr, 0, rte_lcore_to_cpu_id(lcore_id), group_fd, 0);
> +}

If SYS_perf_event_open() must be called from the worker thread itself,
then lcore_id must not be passed as a parameter to do_perf_event_open().
Otherwise, I would expect to be able to call do_perf_event_open() from
the main thread and pass any lcore_id of a worker thread.

This comment applies to all functions that must be called from the
worker thread itself. It also applies to the functions that call such
functions.

[...]

> +/**
> + * A structure describing a group of events.
> + */
> +struct rte_pmu_event_group {
> +	int fds[MAX_NUM_GROUP_EVENTS]; /**< array of event descriptors */
> +	struct perf_event_mmap_page *mmap_pages[MAX_NUM_GROUP_EVENTS]; /**< array of user pages */
> +	bool enabled; /**< true if group was enabled on particular lcore */
> +};
> +
> +/**
> + * A structure describing an event.
> + */
> +struct rte_pmu_event {
> +	char *name; /** name of an event */
> +	unsigned int index; /** event index into fds/mmap_pages */
> +	TAILQ_ENTRY(rte_pmu_event) next; /** list entry */
> +};

Move the "enabled" field up, making it the first field in the
rte_pmu_event_group structure. This might reduce the number of
instructions required to check (!group->enabled) in rte_pmu_read().

Also, each instance of the structure is used individually per lcore, so
the structure should be cache line aligned to avoid unnecessarily
crossing cache lines.

I.e.:

struct rte_pmu_event_group {
	bool enabled; /**< true if group was enabled on particular lcore */
	int fds[MAX_NUM_GROUP_EVENTS]; /**< array of event descriptors */
	struct perf_event_mmap_page *mmap_pages[MAX_NUM_GROUP_EVENTS]; /**< array of user pages */
} __rte_cache_aligned;

> +
> +/**
> + * A PMU state container.
> + */
> +struct rte_pmu {
> +	char *name; /** name of core PMU listed under /sys/bus/event_source/devices */
> +	struct rte_pmu_event_group group[RTE_MAX_LCORE]; /**< per lcore event group data */
> +	unsigned int num_group_events; /**< number of events in a group */
> +	TAILQ_HEAD(, rte_pmu_event) event_list; /**< list of matching events */
> +};
> +
> +/** Pointer to the PMU state container */
> +extern struct rte_pmu rte_pmu;

Just "The PMU state container". It is not a pointer anymore. :-)

[...]

> +/**
> + * @internal
> + *
> + * Read PMU counter.
> + *
> + * @param pc
> + *   Pointer to the mmapped user page.
> + * @return
> + *   Counter value read from hardware.
> + */
> +__rte_internal
> +static __rte_always_inline uint64_t
> +rte_pmu_read_userpage(struct perf_event_mmap_page *pc)
> +{
> +	uint64_t width, offset;
> +	uint32_t seq, index;
> +	int64_t pmc;
> +
> +	for (;;) {
> +		seq = pc->lock;
> +		rte_compiler_barrier();
> +		index = pc->index;
> +		offset = pc->offset;
> +		width = pc->pmc_width;
> +

Please add a comment here about the special meaning of index == 0.
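For example, something roughly along these lines (the wording is only a
suggestion; my understanding of the perf user page ABI is that a zero
index means the event is not currently scheduled on a hardware counter,
so it cannot be read with rdpmc):

	/*
	 * The kernel sets index to 0 when the event is not currently
	 * scheduled on a hardware counter; rdpmc cannot be used then,
	 * and only the kernel maintained offset is returned.
	 */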
> +		if (likely(pc->cap_user_rdpmc && index)) {
> +			pmc = rte_pmu_pmc_read(index - 1);
> +			pmc <<= 64 - width;
> +			pmc >>= 64 - width;
> +			offset += pmc;
> +		}
> +
> +		rte_compiler_barrier();
> +
> +		if (likely(pc->lock == seq))
> +			return offset;
> +	}
> +
> +	return 0;
> +}

[...]

> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice
> + *
> + * Read hardware counter configured to count occurrences of an event.
> + *
> + * @param index
> + *   Index of an event to be read.
> + * @return
> + *   Event value read from register. In case of errors or lack of support
> + *   0 is returned. In other words, stream of zeros in a trace file
> + *   indicates problem with reading particular PMU event register.
> + */
> +__rte_experimental
> +static __rte_always_inline uint64_t
> +rte_pmu_read(unsigned int index)
> +{
> +	struct rte_pmu_event_group *group;
> +	int ret, lcore_id = rte_lcore_id();
> +
> +	group = &rte_pmu.group[lcore_id];
> +	if (unlikely(!group->enabled)) {
> +		ret = rte_pmu_enable_group(lcore_id);
> +		if (ret)
> +			return 0;
> +
> +		group->enabled = true;

Group->enabled should be set inside rte_pmu_enable_group(), not here.
A rough sketch of what I mean follows after the quoted code.

> +	}
> +
> +	if (unlikely(index >= rte_pmu.num_group_events))
> +		return 0;
> +
> +	return rte_pmu_read_userpage(group->mmap_pages[index]);
> +}
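To illustrate the comment above, combined with the earlier comment about
not passing lcore_id to functions that must run on the worker thread
anyway, here is a rough sketch of what I have in mind. It is not a
drop-in replacement; it assumes a rte_pmu_enable_group() variant that
takes no lcore_id, calls rte_lcore_id() internally, and sets
group->enabled on success:

__rte_experimental
static __rte_always_inline uint64_t
rte_pmu_read(unsigned int index)
{
	struct rte_pmu_event_group *group = &rte_pmu.group[rte_lcore_id()];

	if (unlikely(!group->enabled)) {
		/* Assumed to mark the group as enabled on success. */
		if (rte_pmu_enable_group() != 0)
			return 0;
	}

	if (unlikely(index >= rte_pmu.num_group_events))
		return 0;

	return rte_pmu_read_userpage(group->mmap_pages[index]);
}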