>-----Original Message-----
>From: Konstantin Ananyev <konstantin.anan...@huawei.com>
>Sent: Monday, February 20, 2023 3:31 PM
>To: Tomasz Duszynski <tduszyn...@marvell.com>; Konstantin Ananyev <konstantin.v.anan...@yandex.ru>; dev@dpdk.org
>Subject: RE: [EXT] Re: [PATCH v11 1/4] lib: add generic support for reading PMU events
>
>> >> >> diff --git a/lib/pmu/rte_pmu.h b/lib/pmu/rte_pmu.h
>> >> >> new file mode 100644
>> >> >> index 0000000000..6b664c3336
>> >> >> --- /dev/null
>> >> >> +++ b/lib/pmu/rte_pmu.h
>> >> >> @@ -0,0 +1,212 @@
>> >> >> +/* SPDX-License-Identifier: BSD-3-Clause
>> >> >> + * Copyright(c) 2023 Marvell
>> >> >> + */
>> >> >> +
>> >> >> +#ifndef _RTE_PMU_H_
>> >> >> +#define _RTE_PMU_H_
>> >> >> +
>> >> >> +/**
>> >> >> + * @file
>> >> >> + *
>> >> >> + * PMU event tracing operations
>> >> >> + *
>> >> >> + * This file defines the generic API and types necessary to set up the PMU
>> >> >> + * and read selected counters at runtime.
>> >> >> + */
>> >> >> +
>> >> >> +#ifdef __cplusplus
>> >> >> +extern "C" {
>> >> >> +#endif
>> >> >> +
>> >> >> +#include <linux/perf_event.h>
>> >> >> +
>> >> >> +#include <rte_atomic.h>
>> >> >> +#include <rte_branch_prediction.h>
>> >> >> +#include <rte_common.h>
>> >> >> +#include <rte_compat.h>
>> >> >> +#include <rte_spinlock.h>
>> >> >> +
>> >> >> +/** Maximum number of events in a group */
>> >> >> +#define MAX_NUM_GROUP_EVENTS 8
>> >> >> +
>> >> >> +/**
>> >> >> + * A structure describing a group of events.
>> >> >> + */
>> >> >> +struct rte_pmu_event_group {
>> >> >> +      struct perf_event_mmap_page *mmap_pages[MAX_NUM_GROUP_EVENTS]; /**< array of user pages */
>> >> >> +      int fds[MAX_NUM_GROUP_EVENTS]; /**< array of event descriptors */
>> >> >> +      bool enabled; /**< true if group was enabled on particular lcore */
>> >> >> +      TAILQ_ENTRY(rte_pmu_event_group) next; /**< list entry */
>> >> >> +} __rte_cache_aligned;
>> >> >> +
>> >> >> +/**
>> >> >> + * A structure describing an event.
>> >> >> + */
>> >> >> +struct rte_pmu_event {
>> >> >> +      char *name; /**< name of an event */
>> >> >> +      unsigned int index; /**< event index into fds/mmap_pages */
>> >> >> +      TAILQ_ENTRY(rte_pmu_event) next; /**< list entry */
>> >> >> +};
>> >> >> +
>> >> >> +/**
>> >> >> + * A PMU state container.
>> >> >> + */
>> >> >> +struct rte_pmu {
>> >> >> +      char *name; /**< name of core PMU listed under /sys/bus/event_source/devices */
>> >> >> +      rte_spinlock_t lock; /**< serialize access to event group list */
>> >> >> +      TAILQ_HEAD(, rte_pmu_event_group) event_group_list; /**< list of event groups */
>> >> >> +      unsigned int num_group_events; /**< number of events in a group */
>> >> >> +      TAILQ_HEAD(, rte_pmu_event) event_list; /**< list of matching events */
>> >> >> +      unsigned int initialized; /**< initialization counter */
>> >> >> +};
>> >> >> +
>> >> >> +/** lcore event group */
>> >> >> +RTE_DECLARE_PER_LCORE(struct rte_pmu_event_group, _event_group);
>> >> >> +
>> >> >> +/** PMU state container */
>> >> >> +extern struct rte_pmu rte_pmu;
>> >> >> +
>> >> >> +/** Each architecture supporting PMU needs to provide its own version */
>> >> >> +#ifndef rte_pmu_pmc_read
>> >> >> +#define rte_pmu_pmc_read(index) ({ 0; })
>> >> >> +#endif
>> >> >> +
>> >> >> +/**
>> >> >> + * @warning
>> >> >> + * @b EXPERIMENTAL: this API may change without prior notice
>> >> >> + *
>> >> >> + * Read PMU counter.
>> >> >> + *
>> >> >> + * @warning This should not be called directly.
>> >> >> + *
>> >> >> + * @param pc
>> >> >> + *   Pointer to the mmapped user page.
>> >> >> + * @return
>> >> >> + *   Counter value read from hardware.
>> >> >> + */
>> >> >> +static __rte_always_inline uint64_t
>> >> >> +__rte_pmu_read_userpage(struct perf_event_mmap_page *pc)
>> >> >> +{
>> >> >> +      uint64_t width, offset;
>> >> >> +      uint32_t seq, index;
>> >> >> +      int64_t pmc;
>> >> >> +
>> >> >> +      for (;;) {
>> >> >> +              seq = pc->lock;
>> >> >> +              rte_compiler_barrier();
>> >> >
>> >> >Are you sure that compiler_barrier() is enough here?
>> >> >On some archs the CPU itself has the freedom to re-order reads.
>> >> >Or am I missing something obvious here?
>> >> >
>> >>
>> >> It's a matter of not keeping old values cached in registers and
>> >> making sure that we do two reads of the lock. CPU reordering won't do
>> >> any harm here.
>> >
>> >Sorry, I didn't get you here:
>> >Suppose the CPU re-orders reads and reads the lock *after* the index or
>> >offset value.
>> >Wouldn't that mean that index and/or offset can contain old/invalid values?
>> >
>>
>> This number is just an indicator of whether the kernel changed something or not.
>
>You are talking about pc->lock, right?
>Yes, I do understand that it is a sort of seqlock.
>That's why I am puzzled why we do not care about possible cpu read-reordering.
>The manual for perf_event_open() also has a code snippet with a compiler
>barrier only...
>
>> If CPU reordering comes into play then it will not change anything from
>> the POV of this loop.
>> All we want is fresh data when needed and no involvement of the compiler
>> when it comes to reordering code.
>
>Ok, could you explain to me why the following could not happen:
>T0:
>      pc->lock==0; pc->index==I1; pc->offset==O1;
>T1:
>      cpu #0 reads pmu (due to cpu read reorder, we get the index value
>      before the lock):
>      index=pc->index;  //index==I1;
>T2:
>      cpu #1 kernel perf_event_update_userpage():
>      pc->lock++; // pc->lock==1
>      pc->index=I2;
>      pc->offset=O2;
>      ...
>      pc->lock++; //pc->lock==2
>T3:
>      cpu #0 continues with the pmu read:
>      seq=pc->lock; //seq == 2
>      offset=pc->offset; // offset == O2
>      ....
>      pmc = rte_pmu_pmc_read(index - 1);  // Note that we read at I1, not I2
>      offset += pmc; //offset == O2 + rte_pmu_pmc_read(I1-1);
>      if (pc->lock == seq) // they are equal, return
>            return offset;
>
>Or, it can happen, but for some reason we don't care much?
>

This code does self-monitoring and the user page (the whole group, actually) is
per thread running on the current cpu. That is, the kernel updates the page on
the same CPU that later reads it, so cross-CPU read reordering does not enter
the picture. Hence I am not sure what you are trying to prove with that
example.
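
For reference, the read sequence follows the same seqcount pattern as the
snippet in the perf_event_open(2) man page. A minimal sketch of that pattern
(simplified, not verbatim from the patch; read_pmc() is a placeholder for the
arch-specific counter read, e.g. rdpmc on x86):

#include <stdint.h>
#include <linux/perf_event.h>
#include <rte_atomic.h> /* rte_compiler_barrier() */

static uint64_t
sketch_read_counter(struct perf_event_mmap_page *pc,
                    uint64_t (*read_pmc)(uint32_t idx))
{
        uint64_t offset;
        uint32_t seq, index;
        int64_t pmc;

        do {
                seq = pc->lock;         /* first read of the sequence counter */
                rte_compiler_barrier(); /* keep the reads below from being hoisted/cached */
                index = pc->index;
                offset = pc->offset;
                if (pc->cap_user_rdpmc && index != 0) {
                        pmc = read_pmc(index - 1);
                        /* mask/sign-extend to pmc_width bits, as in the patch */
                        pmc <<= 64 - pc->pmc_width;
                        pmc >>= 64 - pc->pmc_width;
                        offset += pmc;
                }
                rte_compiler_barrier(); /* order the re-read of the counter below */
        } while (pc->lock != seq);      /* kernel updated the page - retry */

        return offset;
}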

>> >>
>> >> >> +              index = pc->index;
>> >> >> +              offset = pc->offset;
>> >> >> +              width = pc->pmc_width;
>> >> >> +
>> >> >> +              /* index set to 0 means that particular counter cannot be used */
>> >> >> +              if (likely(pc->cap_user_rdpmc && index)) {
>> >> >> +                      pmc = rte_pmu_pmc_read(index - 1);
>> >> >> +                      pmc <<= 64 - width;
>> >> >> +                      pmc >>= 64 - width;
>> >> >> +                      offset += pmc;
>> >> >> +              }
>> >> >> +
>> >> >> +              rte_compiler_barrier();
>> >> >> +
>> >> >> +              if (likely(pc->lock == seq))
>> >> >> +                      return offset;
>> >> >> +      }
>> >> >> +
>> >> >> +      return 0;
>> >> >> +}
>> >> >> +
>> >> >> +/**
>> >> >> + * @warning
>> >> >> + * @b EXPERIMENTAL: this API may change without prior notice
>> >> >> + *
>> >> >> + * Enable group of events on the calling lcore.
>> >> >> + *
>> >> >> + * @warning This should not be called directly.
>> >> >> + *
>> >> >> + * @return
>> >> >> + *   0 in case of success, negative value otherwise.
>> >> >> + */
>> >> >> +__rte_experimental
>> >> >> +int
>> >> >> +__rte_pmu_enable_group(void);
>> >> >> +
>> >> >> +/**
>> >> >> + * @warning
>> >> >> + * @b EXPERIMENTAL: this API may change without prior notice
>> >> >> + *
>> >> >> + * Initialize PMU library.
>> >> >> + *
>> >> >> + * @warning This should not be called directly.
>> >> >> + *
>> >> >> + * @return
>> >> >> + *   0 in case of success, negative value otherwise.
>> >> >> + */
>> >> >> +__rte_experimental
>> >> >> +int
>> >> >> +rte_pmu_init(void);
>> >> >> +
>> >> >> +/**
>> >> >> + * @warning
>> >> >> + * @b EXPERIMENTAL: this API may change without prior notice
>> >> >> + *
>> >> >> + * Finalize PMU library. This should be called after PMU counters are
>> >> >> + * no longer being read.
>> >> >> + */
>> >> >> +__rte_experimental
>> >> >> +void
>> >> >> +rte_pmu_fini(void);
>> >> >> +
>> >> >> +/**
>> >> >> + * @warning
>> >> >> + * @b EXPERIMENTAL: this API may change without prior notice
>> >> >> + *
>> >> >> + * Add event to the group of enabled events.
>> >> >> + *
>> >> >> + * @param name
>> >> >> + *   Name of an event listed under /sys/bus/event_source/devices/pmu/events.
>> >> >> + * @return
>> >> >> + *   Event index in case of success, negative value otherwise.
>> >> >> + */
>> >> >> +__rte_experimental
>> >> >> +int
>> >> >> +rte_pmu_add_event(const char *name);
>> >> >> +
>> >> >> +/**
>> >> >> + * @warning
>> >> >> + * @b EXPERIMENTAL: this API may change without prior notice
>> >> >> + *
>> >> >> + * Read hardware counter configured to count occurrences of an event.
>> >> >> + *
>> >> >> + * @param index
>> >> >> + *   Index of an event to be read.
>> >> >> + * @return
>> >> >> + *   Event value read from register. In case of errors or lack of support
>> >> >> + *   0 is returned. In other words, a stream of zeros in a trace file
>> >> >> + *   indicates a problem with reading a particular PMU event register.
>> >> >> + */
>
>Another question - do we really need to have __rte_pmu_read_userpage() and
>rte_pmu_read() as static inline functions in a public header?
>As I understand, because of that we also have to make the 'struct rte_pmu_*'
>definitions public.
>
>> >> >> +__rte_experimental
>> >> >> +static __rte_always_inline uint64_t
>> >> >> +rte_pmu_read(unsigned int index)
>> >> >> +{
>> >> >> +      struct rte_pmu_event_group *group = &RTE_PER_LCORE(_event_group);
>> >> >> +      int ret;
>> >> >> +
>> >> >> +      if (unlikely(!rte_pmu.initialized))
>> >> >> +              return 0;
>> >> >> +
>> >> >> +      if (unlikely(!group->enabled)) {
>> >> >> +              ret = __rte_pmu_enable_group();
>> >> >> +              if (ret)
>> >> >> +                      return 0;
>> >> >> +      }
>> >> >> +
>> >> >> +      if (unlikely(index >= rte_pmu.num_group_events))
>> >> >> +              return 0;
>> >> >> +
>> >> >> +      return __rte_pmu_read_userpage(group->mmap_pages[index]);
>> >> >> +}
>> >> >> +
>> >> >> +#ifdef __cplusplus
>> >> >> +}
>> >> >> +#endif
>> >> >> +
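
To make the intended flow clear, typical usage of the API above would be along
these lines (an illustrative sketch based on the declarations in this header;
"cpu-cycles" is just an example event name and depends on the platform):

#include <inttypes.h>
#include <stdio.h>
#include <rte_pmu.h>

static int
count_cycles_once(void)
{
        int index;
        uint64_t val;

        if (rte_pmu_init() < 0)
                return -1;

        /* name must exist under /sys/bus/event_source/devices/<pmu>/events */
        index = rte_pmu_add_event("cpu-cycles");
        if (index < 0) {
                rte_pmu_fini();
                return -1;
        }

        /* fast-path read; 0 indicates an error or lack of support */
        val = rte_pmu_read((unsigned int)index);
        printf("cpu-cycles: %" PRIu64 "\n", val);

        rte_pmu_fini();
        return 0;
}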
