On Wed, Oct 10, 2018 at 12:45:59PM +0200, Peter Zijlstra wrote:
> Hi all,
> 
> There have been various issues and limitations with the way perf uses
> (task) contexts to track events. Most notable is the single hardware PMU
> task context, which has resulted in a number of yucky things (both
> proposed and merged).
> 
> Notably:
> 
>  - HW breakpoint PMU
>  - ARM big.little PMU
>  - Intel Branch Monitoring PMU
> 
> Since we now track the events in RB trees, we can 'simply' add a pmu
> order to them and have them grouped that way, reducing to a single
> context. Of course, reality never quite works out that simple, and below
> ends up adding an intermediate data structure to bridge the context ->
> pmu mapping.
> 
> Something a little like:
> 
>               ,------------------------[1:n]---------------------.
>               V                                                  V
>     perf_event_context <-[1:n]-> perf_event_pmu_context <--- perf_event
>               ^                      ^     |                     |
>               `--------[1:n]---------'     `-[n:1]-> pmu <-[1:n]-'
> 
> This patch builds (provided you disable CGROUP_PERF), boots and survives
> perf-top without the machine catching fire.
> 
> There's still a fair bit of loose ends (look for XXX), but I think this
> is the direction we should be going.

I think this is the right direction, as this is roughly what I suggested
before the RB-tree stuff. ;)

> Comments?

Vague things inline below.

> +/*
> + *           ,------------------------[1:n]---------------------.
> + *           V                                                  V
> + * perf_event_context <-[1:n]-> perf_event_pmu_context <--- perf_event
> + *           ^                      ^     |                     |
> + *           `--------[1:n]---------'     `-[n:1]-> pmu <-[1:n]-'
> + *
> + *
> + * XXX destroy epc when empty
> + *   refcount, !rcu
> + *
> + * XXX epc locking
> + *
> + *   event->pmu_ctx          ctx->mutex && inactive
> + *   ctx->pmu_ctx_list               ctx->mutex && ctx->lock
> + *
> + */
> +struct perf_event_pmu_context {
> +     struct pmu                      *pmu;
> +     struct perf_event_context       *ctx;
> +
> +     struct list_head                pmu_ctx_entry;
> +
> +     struct list_head                pinned_active;
> +     struct list_head                flexible_active;
> +
> +     unsigned int                    embedded : 1;

Is this just for lifetime management (i.e. not attempting to free the
embedded epc)?

Do we need a flag? Can't we have the pmu hold a ref on its embedded epc,
and init that at pmu init time?

> +
> +     unsigned int                    nr_events;
> +     unsigned int                    nr_active;
> +
> +     atomic_t                        refcount; /* event <-> epc */
> +
> +     void                            *task_ctx_data; /* pmu specific data */
> +};
>  
>  struct perf_event_groups {
>       struct rb_root  tree;
> @@ -710,7 +749,6 @@ struct perf_event_groups {
>   * Used as a container for task events and CPU events as well:
>   */
>  struct perf_event_context {
> -     struct pmu                      *pmu;
>       /*
>        * Protect the states of the events in the list,
>        * nr_active, and the list:
> @@ -723,20 +761,21 @@ struct perf_event_context {
>        */
>       struct mutex                    mutex;
>  
> -     struct list_head                active_ctx_list;
> +     struct list_head                pmu_ctx_list;
> +
>       struct perf_event_groups        pinned_groups;
>       struct perf_event_groups        flexible_groups;
>       struct list_head                event_list;

I think that the groups lists and event list should be in the
perf_event_pmu_context.

That would make scheduling and rotating events a per-pmu thing, as we
want, without complicating the RB tree logic or requiring additional
hooks.

That may make the move_group case more complicated, though.

... and maybe I've missed some other headache with that?

>  
> -     struct list_head                pinned_active;
> -     struct list_head                flexible_active;
> -
>       int                             nr_events;
>       int                             nr_active;
>       int                             is_active;
> +
> +     int                             nr_task_data;
>       int                             nr_stat;
>       int                             nr_freq;
>       int                             rotate_disable;

Likewise these all seem to be PMU-specific (though I guess we care about
them in the ctx-switch fast paths?).

> +
>       atomic_t                        refcount;
>       struct task_struct              *task;
>  
> @@ -757,7 +796,6 @@ struct perf_event_context {
>  #ifdef CONFIG_CGROUP_PERF
>       int                             nr_cgroups;      /* cgroup evts */
>  #endif
> -     void                            *task_ctx_data; /* pmu specific data */
>       struct rcu_head                 rcu_head;
>  };

[...]

> @@ -1528,6 +1498,11 @@ perf_event_groups_less(struct perf_event
>       if (left->cpu > right->cpu)
>               return false;
>  
> +     if (left->pmu_ctx->pmu < right->pmu_ctx->pmu)
> +             return true;
> +     if (left->pmu_ctx->pmu > right->pmu_ctx->pmu)
> +             return false;
> +
>       if (left->group_index < right->group_index)
>               return true;
>       if (left->group_index > right->group_index)
> @@ -1610,7 +1585,7 @@ del_event_from_groups(struct perf_event
>   * Get the leftmost event in the @cpu subtree.
>   */
>  static struct perf_event *
> -perf_event_groups_first(struct perf_event_groups *groups, int cpu)
> +perf_event_groups_first(struct perf_event_groups *groups, int cpu, struct 
> pmu *pmu)
>  {
>       struct perf_event *node_event = NULL, *match = NULL;
>       struct rb_node *node = groups->tree.rb_node;
> @@ -1623,8 +1598,19 @@ perf_event_groups_first(struct perf_even
>               } else if (cpu > node_event->cpu) {
>                       node = node->rb_right;
>               } else {
> -                     match = node_event;
> -                     node = node->rb_left;
> +                     if (pmu) {
> +                             if (pmu < node_event->pmu_ctx->pmu) {
> +                                     node = node->rb_left;
> +                             } else if (pmu > node_event->pmu_ctx->pmu) {
> +                                     node = node->rb_right;
> +                             } else  {
> +                                     match = node_event;
> +                                     node = node->rb_left;
> +                             }
> +                     } else {
> +                             match = node_event;
> +                             node = node->rb_left;
> +                     }
>               }
>       }
>  
> @@ -1635,13 +1621,17 @@ perf_event_groups_first(struct perf_even
>   * Like rb_entry_next_safe() for the @cpu subtree.
>   */
>  static struct perf_event *
> -perf_event_groups_next(struct perf_event *event)
> +perf_event_groups_next(struct perf_event *event, struct pmu *pmu)
>  {
>       struct perf_event *next;
>  
>       next = rb_entry_safe(rb_next(&event->group_node), typeof(*event), 
> group_node);
> -     if (next && next->cpu == event->cpu)
> +     if (next && next->cpu == event->cpu) {
> +             if (pmu && next->pmu_ctx->pmu != pmu)
> +                     return NULL;
> +
>               return next;
> +     }
>  
>       return NULL;
>  }

This would be much nicer with a per-pmu event_list.

[...]

> +     // XXX premature; what if this is allowed, but we get moved to a PMU
> +     // that doesn't have this.
>       if (is_sampling_event(event)) {
>               if (event->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT) {
>                       err = -EOPNOTSUPP;

Ugh, could that happen for SW events moved into a HW context?

Thanks,
Mark.

Reply via email to