On 23.08.2017 14:54, Alexander Shishkin wrote:
> Alexey Budankov <alexey.budan...@linux.intel.com> writes:
> 
>> This patch implements mux switch that triggers skipping to the 
>> current CPU's events list at multiplexing hrtimer interrupt 
>> handler as well as adoption of the switch in the existing 
>> implementation.
>>
>> perf_event_groups_iterate_cpu() API is introduced to implement 
>> iteration thru the certain CPU groups list skipping groups 
> 
> "through"
> 
>> allocated for the other CPUs.
>>
>> Signed-off-by: Alexey Budankov <alexey.budan...@linux.intel.com>
>> ---
>>  kernel/events/core.c | 193 ++++++++++++++++++++++++++++++++++++---------------
>>  1 file changed, 137 insertions(+), 56 deletions(-)
>>
>> diff --git a/kernel/events/core.c b/kernel/events/core.c
>> index 08ccfb2..aeb0f81 100644
>> --- a/kernel/events/core.c
>> +++ b/kernel/events/core.c
>> @@ -556,11 +556,11 @@ void perf_sample_event_took(u64 sample_len_ns)
>>  static atomic64_t perf_event_id;
>>  
>>  static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
>> -                          enum event_type_t event_type);
>> +                          enum event_type_t event_type, int mux);
>>  
>>  static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
>>                           enum event_type_t event_type,
>> -                         struct task_struct *task);
>> +                         struct task_struct *task, int mux);
>>  
>>  static void update_context_time(struct perf_event_context *ctx);
>>  static u64 perf_event_time(struct perf_event *event);
>> @@ -702,6 +702,7 @@ static void perf_cgroup_switch(struct task_struct *task, int mode)
>>      struct perf_cpu_context *cpuctx;
>>      struct list_head *list;
>>      unsigned long flags;
>> +    int mux = 0;
>>  
>>      /*
>>       * Disable interrupts and preemption to avoid this CPU's
>> @@ -717,7 +718,7 @@ static void perf_cgroup_switch(struct task_struct *task, int mode)
>>              perf_pmu_disable(cpuctx->ctx.pmu);
>>  
>>              if (mode & PERF_CGROUP_SWOUT) {
>> -                    cpu_ctx_sched_out(cpuctx, EVENT_ALL);
>> +                    cpu_ctx_sched_out(cpuctx, EVENT_ALL, mux);
>>                      /*
>>                       * must not be done before ctxswout due
>>                       * to event_filter_match() in event_sched_out()
>> @@ -736,7 +737,7 @@ static void perf_cgroup_switch(struct task_struct *task, int mode)
>>                       */
>>                      cpuctx->cgrp = perf_cgroup_from_task(task,
>>                                                           &cpuctx->ctx);
>> -                    cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
>> +                    cpu_ctx_sched_in(cpuctx, EVENT_ALL, task, mux);
> 
> 'mux' is always zero in this function, isn't it?
> 
>>              }
>>              perf_pmu_enable(cpuctx->ctx.pmu);
>>              perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
>> @@ -1613,8 +1614,16 @@ perf_event_groups_rotate(struct rb_root *groups, int cpu)
>>   */
>>  #define perf_event_groups_for_each(event, iter, tree, node, list, link) \
>>         for (iter = rb_first(tree); iter; iter = rb_next(iter))      \
>> -            list_for_each_entry(event, &(rb_entry(iter,             \
>> -                    typeof(*event), node)->list), link)
>> +            list_for_each_entry(event, &(rb_entry(iter,             \
>> +                            typeof(*event), node)->list), link)
> 
> Is this an indentation change? What is it doing here?
> 
>> +
>> +/*
>> + * Iterate event groups related to specific cpu.
>> + */
>> +#define perf_event_groups_for_each_cpu(event, cpu, tree, list, link)        \
>> +            list = perf_event_groups_get_list(tree, cpu);           \
>> +            if (list)                                               \
>> +                    list_for_each_entry(event, list, link)
> 
> ..or not, if there's no list.
> 
>>  
>>  /*
>>   * Add a event from the lists for its context.
>> @@ -2397,36 +2406,38 @@ static void add_event_to_ctx(struct perf_event *event,
>>  
>>  static void ctx_sched_out(struct perf_event_context *ctx,
>>                        struct perf_cpu_context *cpuctx,
>> -                      enum event_type_t event_type);
>> +                      enum event_type_t event_type, int mux);
>>  static void
>>  ctx_sched_in(struct perf_event_context *ctx,
>>           struct perf_cpu_context *cpuctx,
>>           enum event_type_t event_type,
>> -         struct task_struct *task);
>> +         struct task_struct *task, int mux);
>>  
>>  static void task_ctx_sched_out(struct perf_cpu_context *cpuctx,
>>                             struct perf_event_context *ctx,
>>                             enum event_type_t event_type)
>>  {
>> +    int mux = 0;
>> +
>>      if (!cpuctx->task_ctx)
>>              return;
>>  
>>      if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
>>              return;
>>  
>> -    ctx_sched_out(ctx, cpuctx, event_type);
>> +    ctx_sched_out(ctx, cpuctx, event_type, mux);
> 
> Just use 0.

Well, I intentionally gave this (admittedly ugly) switch a name via a local 
variable to add clarity to the code, so I simply wonder: why do you prefer 
the unnamed variant?
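
For illustration, the two call-site styles under discussion look like this
(a minimal sketch, not a hunk from the patch):

	/* named switch, as in the patch: the call site says what the flag means */
	int mux = 0;	/* not the multiplexing hrtimer path */

	ctx_sched_out(ctx, cpuctx, EVENT_TIME, mux);

	/* unnamed variant, as suggested in review */
	ctx_sched_out(ctx, cpuctx, EVENT_TIME, 0);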

> 
>>  }
>>  
>>  static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
>>                              struct perf_event_context *ctx,
>> -                            struct task_struct *task)
>> +                            struct task_struct *task, int mux)
>>  {
>> -    cpu_ctx_sched_in(cpuctx, EVENT_PINNED, task);
>> +    cpu_ctx_sched_in(cpuctx, EVENT_PINNED, task, mux);
>>      if (ctx)
>> -            ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task);
>> -    cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task);
>> +            ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task, mux);
>> +    cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task, mux);
>>      if (ctx)
>> -            ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task);
>> +            ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task, mux);
>>  }
>>  
>>  /*
>> @@ -2450,6 +2461,7 @@ static void ctx_resched(struct perf_cpu_context *cpuctx,
>>  {
>>      enum event_type_t ctx_event_type = event_type & EVENT_ALL;
>>      bool cpu_event = !!(event_type & EVENT_CPU);
>> +    int mux = 0;
>>  
>>      /*
>>       * If pinned groups are involved, flexible groups also need to be
>> @@ -2470,11 +2482,11 @@ static void ctx_resched(struct perf_cpu_context *cpuctx,
>>       *  - otherwise, do nothing more.
>>       */
>>      if (cpu_event)
>> -            cpu_ctx_sched_out(cpuctx, ctx_event_type);
>> +            cpu_ctx_sched_out(cpuctx, ctx_event_type, mux);
>>      else if (ctx_event_type & EVENT_PINNED)
>> -            cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
>> +            cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE, mux);
>>  
>> -    perf_event_sched_in(cpuctx, task_ctx, current);
>> +    perf_event_sched_in(cpuctx, task_ctx, current, mux);
> 
> Also mux==0 in all cases in this function.
> 
>>      perf_pmu_enable(cpuctx->ctx.pmu);
>>  }
>>  
>> @@ -2491,7 +2503,7 @@ static int  __perf_install_in_context(void *info)
>>      struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
>>      struct perf_event_context *task_ctx = cpuctx->task_ctx;
>>      bool reprogram = true;
>> -    int ret = 0;
>> +    int ret = 0, mux =0;
>>  
>>      raw_spin_lock(&cpuctx->ctx.lock);
>>      if (ctx->task) {
>> @@ -2518,7 +2530,7 @@ static int  __perf_install_in_context(void *info)
>>      }
>>  
>>      if (reprogram) {
>> -            ctx_sched_out(ctx, cpuctx, EVENT_TIME);
>> +            ctx_sched_out(ctx, cpuctx, EVENT_TIME, mux);
>>              add_event_to_ctx(event, ctx);
>>              ctx_resched(cpuctx, task_ctx, get_event_type(event));
>>      } else {
>> @@ -2655,13 +2667,14 @@ static void __perf_event_enable(struct perf_event *event,
>>  {
>>      struct perf_event *leader = event->group_leader;
>>      struct perf_event_context *task_ctx;
>> +    int mux = 0;
>>  
>>      if (event->state >= PERF_EVENT_STATE_INACTIVE ||
>>          event->state <= PERF_EVENT_STATE_ERROR)
>>              return;
>>  
>>      if (ctx->is_active)
>> -            ctx_sched_out(ctx, cpuctx, EVENT_TIME);
>> +            ctx_sched_out(ctx, cpuctx, EVENT_TIME, mux);
>>  
>>      __perf_event_mark_enabled(event);
>>  
>> @@ -2671,7 +2684,7 @@ static void __perf_event_enable(struct perf_event *event,
>>      if (!event_filter_match(event)) {
>>              if (is_cgroup_event(event))
>>                      perf_cgroup_defer_enabled(event);
>> -            ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
>> +            ctx_sched_in(ctx, cpuctx, EVENT_TIME, current, mux);
>>              return;
>>      }
>>  
>> @@ -2680,7 +2693,7 @@ static void __perf_event_enable(struct perf_event *event,
>>       * then don't put it on unless the group is on.
>>       */
>>      if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) {
>> -            ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
>> +            ctx_sched_in(ctx, cpuctx, EVENT_TIME, current, mux);
> 
> And here.
> 
>>              return;
>>      }
>>  
>> @@ -2876,11 +2889,13 @@ EXPORT_SYMBOL_GPL(perf_event_refresh);
>>  
>>  static void ctx_sched_out(struct perf_event_context *ctx,
>>                        struct perf_cpu_context *cpuctx,
>> -                      enum event_type_t event_type)
>> +                      enum event_type_t event_type, int mux)
>>  {
>>      int is_active = ctx->is_active;
>> +    struct list_head *group_list;
>>      struct perf_event *event;
>>      struct rb_node *node;
>> +    int sw = -1, cpu = smp_processor_id();
> 
> Same thing seems to be happening with 'sw'.
> 
>>      lockdep_assert_held(&ctx->lock);
>>  
>>      if (likely(!ctx->nr_events)) {
>> @@ -2926,17 +2941,47 @@ static void ctx_sched_out(struct perf_event_context *ctx,
>>  
>>      perf_pmu_disable(ctx->pmu);
>>  
>> -    if (is_active & EVENT_PINNED)
>> -            perf_event_groups_for_each(event, node,
>> -                            &ctx->pinned_groups, group_node,
>> -                            group_list, group_entry)
>> -                    group_sched_out(event, cpuctx, ctx);
>> +    if (is_active & EVENT_PINNED) {
>> +            if (mux) {
> 
> So it's 'rotate', really.

Which 'rotate' do you mean? If you mean the local variable in 
perf_rotate_context(), then yes - it can be passed into this call from 
there as the mux value, but logically the two are still different.
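
To make the distinction concrete, here is a minimal sketch (hypothetical
helper names, simplified from the hunks quoted further below) of how the
two values relate:

	/* multiplexing hrtimer path: restrict iteration to this CPU's groups */
	static void rotate_path(struct perf_cpu_context *cpuctx)
	{
		int mux = 1;

		cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE, mux);
	}

	/* any other scheduling path: walk the whole group tree */
	static void other_path(struct perf_cpu_context *cpuctx)
	{
		int mux = 0;

		cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE, mux);
	}

'rotate' only records whether perf_rotate_context() has anything to
rotate, while 'mux' tells the scheduling code which iteration mode to use.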

> 
>> +                    perf_event_groups_for_each_cpu(event, cpu,
>> +                                    &ctx->pinned_groups,
>> +                                    group_list, group_entry) {
>> +                                    group_sched_out(event, cpuctx, ctx);
>> +                    }
>> +                    perf_event_groups_for_each_cpu(event, sw,
>> +                                    &ctx->pinned_groups,
>> +                                    group_list, group_entry) {
>> +                                    group_sched_out(event, cpuctx, ctx);
>> +                    }
>> +            } else {
>> +                    perf_event_groups_for_each(event, node,
>> +                                    &ctx->pinned_groups, group_node,
>> +                                    group_list, group_entry) {
>> +                                    group_sched_out(event, cpuctx, ctx);
>> +                    }
>> +            }
>> +    }
>>  
>> -    if (is_active & EVENT_FLEXIBLE)
>> -            perf_event_groups_for_each(event, node,
>> -                            &ctx->flexible_groups, group_node,
>> -                            group_list, group_entry)
>> -                    group_sched_out(event, cpuctx, ctx);
>> +    if (is_active & EVENT_FLEXIBLE) {
>> +            if (mux) {
>> +                    perf_event_groups_for_each_cpu(event, cpu,
>> +                                    &ctx->flexible_groups,
>> +                                    group_list, group_entry) {
>> +                                    group_sched_out(event, cpuctx, ctx);
>> +                    }
>> +                    perf_event_groups_for_each_cpu(event, sw,
>> +                                    &ctx->flexible_groups,
>> +                                    group_list, group_entry) {
>> +                                    group_sched_out(event, cpuctx, ctx);
>> +                    }
>> +            } else {
>> +                    perf_event_groups_for_each(event, node,
>> +                                    &ctx->flexible_groups, group_node,
>> +                                    group_list, group_entry) {
>> +                                    group_sched_out(event, cpuctx, ctx);
>> +                    }
>> +            }
>> +    }
>>  
>>      perf_pmu_enable(ctx->pmu);
>>  }
>> @@ -3225,9 +3270,9 @@ void __perf_event_task_sched_out(struct task_struct *task,
>>   * Called with IRQs disabled
>>   */
>>  static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
>> -                          enum event_type_t event_type)
>> +                          enum event_type_t event_type, int mux)
>>  {
>> -    ctx_sched_out(&cpuctx->ctx, cpuctx, event_type);
>> +    ctx_sched_out(&cpuctx->ctx, cpuctx, event_type, mux);
>>  }
>>  
>>  static void
>> @@ -3287,11 +3332,13 @@ static void
>>  ctx_sched_in(struct perf_event_context *ctx,
>>           struct perf_cpu_context *cpuctx,
>>           enum event_type_t event_type,
>> -         struct task_struct *task)
>> +         struct task_struct *task, int mux)
>>  {
>>      int is_active = ctx->is_active;
>> +    struct list_head *group_list;
>>      struct perf_event *event;
>>      struct rb_node *node;
>> +    int sw = -1, cpu = smp_processor_id();
>>  
>>      lockdep_assert_held(&ctx->lock);
>>  
>> @@ -3319,35 +3366,69 @@ ctx_sched_in(struct perf_event_context *ctx,
>>       * First go through the list and put on any pinned groups
>>       * in order to give them the best chance of going on.
>>       */
>> -    if (is_active & EVENT_PINNED)
>> -            perf_event_groups_for_each(event, node,
>> -                            &ctx->pinned_groups, group_node,
>> -                            group_list, group_entry)
>> -                    ctx_pinned_sched_in(event, cpuctx, ctx);
>> +    if (is_active & EVENT_PINNED) {
>> +            if (mux) {
>> +                    perf_event_groups_for_each_cpu(event, sw,
>> +                                    &ctx->pinned_groups,
>> +                                    group_list, group_entry) {
>> +                                    ctx_pinned_sched_in(event, cpuctx, ctx);
>> +                    }
>> +                    perf_event_groups_for_each_cpu(event, cpu,
>> +                                    &ctx->pinned_groups,
>> +                                    group_list, group_entry) {
>> +                                    ctx_pinned_sched_in(event, cpuctx, ctx);
>> +                    }
>> +            } else {
>> +                    perf_event_groups_for_each(event, node,
>> +                                    &ctx->pinned_groups, group_node,
>> +                                    group_list, group_entry) {
>> +                                    ctx_pinned_sched_in(event, cpuctx, ctx);
>> +                    }
>> +            }
>> +    }
>>  
>>      /* Then walk through the lower prio flexible groups */
>>      if (is_active & EVENT_FLEXIBLE) {
>>              int can_add_hw = 1;
>> -            perf_event_groups_for_each(event, node,
>> -                            &ctx->flexible_groups, group_node,
>> -                            group_list, group_entry)
>> -                    ctx_flexible_sched_in(event, cpuctx, ctx, &can_add_hw);
>> +            if (mux) {
>> +                    perf_event_groups_for_each_cpu(event, sw,
>> +                                    &ctx->flexible_groups,
>> +                                    group_list, group_entry) {
>> +                                    ctx_flexible_sched_in(event, cpuctx,
>> +                                                    ctx, &can_add_hw);
>> +                    }
>> +                    can_add_hw = 1;
>> +                    perf_event_groups_for_each_cpu(event, cpu,
>> +                                    &ctx->flexible_groups,
>> +                                    group_list, group_entry) {
>> +                                    ctx_flexible_sched_in(event, cpuctx,
>> +                                                    ctx, &can_add_hw);
>> +                    }
>> +            } else {
>> +                    perf_event_groups_for_each(event, node,
>> +                                    &ctx->flexible_groups, group_node,
>> +                                    group_list, group_entry) {
>> +                                    ctx_flexible_sched_in(event, cpuctx,
>> +                                                    ctx, &can_add_hw);
>> +                    }
>> +            }
>>      }
>>  }
>>  
>>  static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
>>                           enum event_type_t event_type,
>> -                         struct task_struct *task)
>> +                         struct task_struct *task, int mux)
>>  {
>>      struct perf_event_context *ctx = &cpuctx->ctx;
>>  
>> -    ctx_sched_in(ctx, cpuctx, event_type, task);
>> +    ctx_sched_in(ctx, cpuctx, event_type, task, mux);
>>  }
>>  
>>  static void perf_event_context_sched_in(struct perf_event_context *ctx,
>>                                      struct task_struct *task)
>>  {
>>      struct perf_cpu_context *cpuctx;
>> +    int mux = 0;
>>  
>>      cpuctx = __get_cpu_context(ctx);
>>      if (cpuctx->task_ctx == ctx)
>> @@ -3371,8 +3452,8 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
>>       * events, no need to flip the cpuctx's events around.
>>       */
>>      if (!RB_EMPTY_ROOT(&ctx->pinned_groups))
>> -            cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
>> -    perf_event_sched_in(cpuctx, ctx, task);
>> +            cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE, mux);
>> +    perf_event_sched_in(cpuctx, ctx, task, mux);
>>      perf_pmu_enable(ctx->pmu);
>>  
>>  unlock:
>> @@ -3618,7 +3699,7 @@ static void rotate_ctx(struct perf_event_context *ctx)
>>  static int perf_rotate_context(struct perf_cpu_context *cpuctx)
>>  {
>>      struct perf_event_context *ctx = NULL;
>> -    int rotate = 0;
>> +    int rotate = 0, mux = 1;
>>  
>>      if (cpuctx->ctx.nr_events) {
>>              if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active)
>> @@ -3637,15 +3718,15 @@ static int perf_rotate_context(struct perf_cpu_context *cpuctx)
>>      perf_ctx_lock(cpuctx, cpuctx->task_ctx);
>>      perf_pmu_disable(cpuctx->ctx.pmu);
>>  
>> -    cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
>> +    cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE, mux);
> 
> It's '1'.
> 
> 
