Hello,

On Mon, Mar 24, 2025 at 05:30:44PM +0000, Mingwei Zhang wrote:
> From: Kan Liang <kan.li...@linux.intel.com>
> 
> Current perf doesn't explicitly schedule out all exclude_guest events
> while the guest is running. There is no problem with the current
> emulated vPMU, because perf owns all the PMU counters. It can mask the
> counter which is assigned to an exclude_guest event when a guest is
> running (Intel way), or set the corresponding HOSTONLY bit in eventsel
> (AMD way). The counter doesn't count when a guest is running.
> 
> However, neither way works with the introduced passthrough vPMU.
> A guest owns all the PMU counters when it's running. The host should not
> mask any counters. The counter may be used by the guest. The eventsel
> may be overwritten.
> 
> Perf should explicitly schedule out all exclude_guest events to release
> the PMU resources when entering a guest, and resume the counting when
> exiting the guest.
> 
> It's possible that an exclude_guest event is created when a guest is
> running. The new event should not be scheduled in either.
> 
> The ctx time is shared among different PMUs. The time cannot be stopped
> when a guest is running. It is required to calculate the time for events
> from other PMUs, e.g., uncore events. Add timeguest to track the guest
> run time. For an exclude_guest event, the elapsed time equals
> the ctx time - guest time.
> Cgroup has dedicated times. Use the same method to deduct the guest time
> from the cgroup time as well.
> 
> Co-developed-by: Peter Zijlstra (Intel) <pet...@infradead.org>
> Signed-off-by: Peter Zijlstra (Intel) <pet...@infradead.org>
> Signed-off-by: Kan Liang <kan.li...@linux.intel.com>
> Signed-off-by: Mingwei Zhang <mizh...@google.com>
> ---
>  include/linux/perf_event.h |   6 ++
>  kernel/events/core.c       | 209 +++++++++++++++++++++++++++++--------
>  2 files changed, 169 insertions(+), 46 deletions(-)
> 
> diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
> index a2fd1bdc955c..7bda1e20be12 100644
> --- a/include/linux/perf_event.h
> +++ b/include/linux/perf_event.h
> @@ -999,6 +999,11 @@ struct perf_event_context {
>  	 */
>  	struct perf_time_ctx		time;
>  
> +	/*
> +	 * Context clock, runs when in the guest mode.
> +	 */
> +	struct perf_time_ctx		timeguest;

Why not make it an array as you use it later?
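
Something like the below is what I had in mind - just a sketch to show
the idea, reusing the T_TOTAL/T_GUEST indices you add later in this
patch (untested, and other users of ctx->time would need updating too):

	/*
	 * Context clock: [T_TOTAL] runs whenever the context is active,
	 * [T_GUEST] only runs while a guest is running, so the time seen
	 * by an exclude_guest event is time[T_TOTAL] - time[T_GUEST].
	 */
	struct perf_time_ctx		time[2];

Then __perf_event_time_ctx() and friends could take ctx->time directly
and the static_assert()/offsetof() trick below wouldn't be needed.
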
> +
>  	/*
>  	 * These fields let us detect when two contexts have both
>  	 * been cloned (inherited) from a common ancestor.
>  	 */
> @@ -1089,6 +1094,7 @@ struct bpf_perf_event_data_kern {
>   */
>  struct perf_cgroup_info {
>  	struct perf_time_ctx		time;
> +	struct perf_time_ctx		timeguest;
>  	int				active;
>  };
>  
> diff --git a/kernel/events/core.c b/kernel/events/core.c
> index e38c8b5e8086..7a2115b2c5c1 100644
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -163,7 +163,8 @@ enum event_type_t {
>  	/* see ctx_resched() for details */
>  	EVENT_CPU	= 0x10,
>  	EVENT_CGROUP	= 0x20,
> -	EVENT_FLAGS	= EVENT_CGROUP,
> +	EVENT_GUEST	= 0x40,

It's not clear to me if this flag is for events to include guests or
exclude them.  Can you please add a comment?

Thanks,
Namhyung


> +	EVENT_FLAGS	= EVENT_CGROUP | EVENT_GUEST,
>  	/* compound helpers */
>  	EVENT_ALL	= EVENT_FLEXIBLE | EVENT_PINNED,
>  	EVENT_TIME_FROZEN = EVENT_TIME | EVENT_FROZEN,
> @@ -435,6 +436,7 @@ static atomic_t nr_include_guest_events __read_mostly;
>  
>  static atomic_t nr_mediated_pmu_vms;
>  static DEFINE_MUTEX(perf_mediated_pmu_mutex);
> +static DEFINE_PER_CPU(bool, perf_in_guest);
>  
>  /* !exclude_guest event of PMU with PERF_PMU_CAP_MEDIATED_VPMU */
>  static inline bool is_include_guest_event(struct perf_event *event)
> @@ -738,6 +740,9 @@ static bool perf_skip_pmu_ctx(struct
> perf_event_pmu_context *pmu_ctx,
>  {
>  	if ((event_type & EVENT_CGROUP) && !pmu_ctx->nr_cgroups)
>  		return true;
> +	if ((event_type & EVENT_GUEST) &&
> +	    !(pmu_ctx->pmu->capabilities & PERF_PMU_CAP_MEDIATED_VPMU))
> +		return true;
>  	return false;
>  }
>  
> @@ -788,6 +793,39 @@ static inline void update_perf_time_ctx(struct
> perf_time_ctx *time, u64 now, boo
>  	WRITE_ONCE(time->offset, time->time - time->stamp);
>  }
>  
> +static_assert(offsetof(struct perf_event_context, timeguest) -
> +	      offsetof(struct perf_event_context, time) ==
> +	      sizeof(struct perf_time_ctx));
> +
> +#define T_TOTAL	0
> +#define T_GUEST	1
> +
> +static inline u64 __perf_event_time_ctx(struct perf_event *event,
> +					struct perf_time_ctx *times)
> +{
> +	u64 time = times[T_TOTAL].time;
> +
> +	if (event->attr.exclude_guest)
> +		time -= times[T_GUEST].time;
> +
> +	return time;
> +}
> +
> +static inline u64 __perf_event_time_ctx_now(struct perf_event *event,
> +					    struct perf_time_ctx *times,
> +					    u64 now)
> +{
> +	if (event->attr.exclude_guest && __this_cpu_read(perf_in_guest)) {
> +		/*
> +		 * (now + times[total].offset) - (now + times[guest].offset) :=
> +		 * times[total].offset - times[guest].offset
> +		 */
> +		return READ_ONCE(times[T_TOTAL].offset) -
> READ_ONCE(times[T_GUEST].offset);
> +	}
> +
> +	return now + READ_ONCE(times[T_TOTAL].offset);
> +}
> +
>  #ifdef CONFIG_CGROUP_PERF
>  
>  static inline bool
> @@ -824,12 +862,16 @@ static inline int is_cgroup_event(struct perf_event
> *event)
>  	return event->cgrp != NULL;
>  }
>  
> +static_assert(offsetof(struct perf_cgroup_info, timeguest) -
> +	      offsetof(struct perf_cgroup_info, time) ==
> +	      sizeof(struct perf_time_ctx));
> +
>  static inline u64 perf_cgroup_event_time(struct perf_event *event)
>  {
>  	struct perf_cgroup_info *t;
>  
>  	t = per_cpu_ptr(event->cgrp->info, event->cpu);
> -	return t->time.time;
> +	return __perf_event_time_ctx(event, &t->time);
>  }
>  
>  static inline u64 perf_cgroup_event_time_now(struct perf_event *event, u64
> now)
> @@ -838,9 +880,21 @@ static inline u64 perf_cgroup_event_time_now(struct
> perf_event *event, u64 now)
>  
>  	t = per_cpu_ptr(event->cgrp->info, event->cpu);
>  	if (!__load_acquire(&t->active))
> -		return t->time.time;
> -	now += READ_ONCE(t->time.offset);
> -	return now;
> +		return __perf_event_time_ctx(event, &t->time);
> +
> +	return __perf_event_time_ctx_now(event, &t->time, now);
> +}
> +
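
Just to double check my understanding of the arithmetic here, with some
made-up numbers for an exclude_guest event (assuming a mediated-vPMU
PMU):

	t->time.time      = 100    total cgroup time accumulated so far
	t->timeguest.time =  30    part of it accumulated in guest mode
	perf_cgroup_event_time()   = 100 - 30 = 70

and while perf_in_guest is set, __perf_event_time_ctx_now() returns
offset[T_TOTAL] - offset[T_GUEST], which doesn't depend on 'now', so
the exclude_guest event's time reads as frozen until the guest exits.
That matches the intent described in the changelog, AFAICS.
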
> +static inline void __update_cgrp_guest_time(struct perf_cgroup_info *info,
> u64 now, bool adv)
> +{
> +	update_perf_time_ctx(&info->timeguest, now, adv);
> +}
> +
> +static inline void update_cgrp_time(struct perf_cgroup_info *info, u64 now)
> +{
> +	update_perf_time_ctx(&info->time, now, true);
> +	if (__this_cpu_read(perf_in_guest))
> +		__update_cgrp_guest_time(info, now, true);
>  }
>  
>  static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context
> *cpuctx, bool final)
> @@ -856,7 +910,7 @@ static inline void update_cgrp_time_from_cpuctx(struct
> perf_cpu_context *cpuctx,
>  		cgrp = container_of(css, struct perf_cgroup, css);
>  		info = this_cpu_ptr(cgrp->info);
>  
> -		update_perf_time_ctx(&info->time, now, true);
> +		update_cgrp_time(info, now);
>  		if (final)
>  			__store_release(&info->active, 0);
>  	}
> @@ -879,11 +933,11 @@ static inline void update_cgrp_time_from_event(struct
> perf_event *event)
>  	 * Do not update time when cgroup is not active
>  	 */
>  	if (info->active)
> -		update_perf_time_ctx(&info->time, perf_clock(), true);
> +		update_cgrp_time(info, perf_clock());
>  }
>  
>  static inline void
> -perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx)
> +perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx, bool guest)
>  {
>  	struct perf_event_context *ctx = &cpuctx->ctx;
>  	struct perf_cgroup *cgrp = cpuctx->cgrp;
> @@ -903,8 +957,12 @@ perf_cgroup_set_timestamp(struct perf_cpu_context
> *cpuctx)
>  	for (css = &cgrp->css; css; css = css->parent) {
>  		cgrp = container_of(css, struct perf_cgroup, css);
>  		info = this_cpu_ptr(cgrp->info);
> -		update_perf_time_ctx(&info->time, ctx->time.stamp, false);
> -		__store_release(&info->active, 1);
> +		if (guest) {
> +			__update_cgrp_guest_time(info, ctx->time.stamp, false);
> +		} else {
> +			update_perf_time_ctx(&info->time, ctx->time.stamp,
> false);
> +			__store_release(&info->active, 1);
> +		}
>  	}
>  }
>  
> @@ -1104,7 +1162,7 @@ static inline int perf_cgroup_connect(pid_t pid, struct
> perf_event *event,
>  }
>  
>  static inline void
> -perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx)
> +perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx, bool guest)
>  {
>  }
>  
> @@ -1514,16 +1572,24 @@ static void perf_unpin_context(struct
> perf_event_context *ctx)
>   */
>  static void __update_context_time(struct perf_event_context *ctx, bool adv)
>  {
> -	u64 now = perf_clock();
> +	lockdep_assert_held(&ctx->lock);
> +
> +	update_perf_time_ctx(&ctx->time, perf_clock(), adv);
> +}
>  
> +static void __update_context_guest_time(struct perf_event_context *ctx, bool
> adv)
> +{
>  	lockdep_assert_held(&ctx->lock);
>  
> -	update_perf_time_ctx(&ctx->time, now, adv);
> +	/* must be called after __update_context_time(); */
> +	update_perf_time_ctx(&ctx->timeguest, ctx->time.stamp, adv);
>  }
>  
>  static void update_context_time(struct perf_event_context *ctx)
>  {
>  	__update_context_time(ctx, true);
> +	if (__this_cpu_read(perf_in_guest))
> +		__update_context_guest_time(ctx, true);
>  }
>  
>  static u64 perf_event_time(struct perf_event *event)
> @@ -1536,7 +1602,7 @@ static u64 perf_event_time(struct perf_event *event)
>  	if (is_cgroup_event(event))
>  		return perf_cgroup_event_time(event);
>  
> -	return ctx->time.time;
> +	return __perf_event_time_ctx(event, &ctx->time);
>  }
>  
>  static u64 perf_event_time_now(struct perf_event
> *event, u64 now)
> @@ -1550,10 +1616,9 @@ static u64 perf_event_time_now(struct perf_event
> *event, u64 now)
>  		return perf_cgroup_event_time_now(event, now);
>  
>  	if (!(__load_acquire(&ctx->is_active) & EVENT_TIME))
> -		return ctx->time.time;
> +		return __perf_event_time_ctx(event, &ctx->time);
>  
> -	now += READ_ONCE(ctx->time.offset);
> -	return now;
> +	return __perf_event_time_ctx_now(event, &ctx->time, now);
>  }
>  
>  static enum event_type_t get_event_type(struct perf_event *event)
> @@ -2384,20 +2449,23 @@ group_sched_out(struct perf_event *group_event,
> struct perf_event_context *ctx)
>  }
>  
>  static inline void
> -__ctx_time_update(struct perf_cpu_context *cpuctx, struct perf_event_context
> *ctx, bool final)
> +__ctx_time_update(struct perf_cpu_context *cpuctx, struct perf_event_context
> *ctx,
> +		  bool final, enum event_type_t event_type)
>  {
>  	if (ctx->is_active & EVENT_TIME) {
>  		if (ctx->is_active & EVENT_FROZEN)
>  			return;
> +
>  		update_context_time(ctx);
> -		update_cgrp_time_from_cpuctx(cpuctx, final);
> +		/* vPMU should not stop time */
> +		update_cgrp_time_from_cpuctx(cpuctx, !(event_type &
> EVENT_GUEST) && final);
>  	}
>  }
>  
>  static inline void
>  ctx_time_update(struct perf_cpu_context *cpuctx, struct perf_event_context
> *ctx)
>  {
> -	__ctx_time_update(cpuctx, ctx, false);
> +	__ctx_time_update(cpuctx, ctx, false, 0);
>  }
>  
>  /*
> @@ -3405,7 +3473,7 @@ ctx_sched_out(struct perf_event_context *ctx, struct
> pmu *pmu, enum event_type_t
>  	 *
>  	 * would only update time for the pinned events.
>  	 */
> -	__ctx_time_update(cpuctx, ctx, ctx == &cpuctx->ctx);
> +	__ctx_time_update(cpuctx, ctx, ctx == &cpuctx->ctx, event_type);
>  
>  	/*
>  	 * CPU-release for the below ->is_active store,
> @@ -3431,7 +3499,18 @@ ctx_sched_out(struct perf_event_context *ctx, struct
> pmu *pmu, enum event_type_t
>  		cpuctx->task_ctx = NULL;
>  	}
>  
> -	is_active ^= ctx->is_active; /* changed bits */
> +	if (event_type & EVENT_GUEST) {
> +		/*
> +		 * Schedule out all exclude_guest events of PMU
> +		 * with PERF_PMU_CAP_MEDIATED_VPMU.
> +		 */
> +		is_active = EVENT_ALL;
> +		__update_context_guest_time(ctx, false);
> +		perf_cgroup_set_timestamp(cpuctx, true);
> +		barrier();
> +	} else {
> +		is_active ^= ctx->is_active; /* changed bits */
> +	}
>  
>  	for_each_epc(pmu_ctx, ctx, pmu, event_type)
>  		__pmu_ctx_sched_out(pmu_ctx, is_active);
> @@ -3926,10 +4005,15 @@ static inline void group_update_userpage(struct
> perf_event *group_event)
>  		event_update_userpage(event);
>  }
>  
> +struct merge_sched_data {
> +	int can_add_hw;
> +	enum event_type_t event_type;
> +};
> +
>  static int merge_sched_in(struct perf_event *event, void *data)
>  {
>  	struct perf_event_context *ctx = event->ctx;
> -	int *can_add_hw = data;
> +	struct merge_sched_data *msd = data;
>  
>  	if (event->state <= PERF_EVENT_STATE_OFF)
>  		return 0;
> @@ -3937,13 +4021,22 @@ static int merge_sched_in(struct perf_event *event,
> void *data)
>  	if (!event_filter_match(event))
>  		return 0;
>  
> -	if (group_can_go_on(event, *can_add_hw)) {
> +	/*
> +	 * Don't schedule in any host events from PMU with
> +	 * PERF_PMU_CAP_MEDIATED_VPMU, while a guest is running.
> +	 */
> +	if (__this_cpu_read(perf_in_guest) &&
> +	    event->pmu_ctx->pmu->capabilities & PERF_PMU_CAP_MEDIATED_VPMU &&
> +	    !(msd->event_type & EVENT_GUEST))
> +		return 0;
> +
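
Not a big deal, but this condition is fairly dense in the middle of
merge_sched_in().  Would a small helper be nicer?  Something like the
below - completely untested, and the name is just a placeholder:

	static inline bool perf_skip_sched_in_for_guest(struct perf_event *event,
							enum event_type_t event_type)
	{
		/* keep host-only events of a mediated-vPMU PMU off while in a guest */
		return __this_cpu_read(perf_in_guest) &&
		       (event->pmu_ctx->pmu->capabilities & PERF_PMU_CAP_MEDIATED_VPMU) &&
		       !(event_type & EVENT_GUEST);
	}

so the caller is just:

	if (perf_skip_sched_in_for_guest(event, msd->event_type))
		return 0;
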
> +	if (group_can_go_on(event, msd->can_add_hw)) {
>  		if (!group_sched_in(event, ctx))
>  			list_add_tail(&event->active_list,
> get_event_list(event));
>  	}
>  
>  	if (event->state == PERF_EVENT_STATE_INACTIVE) {
> -		*can_add_hw = 0;
> +		msd->can_add_hw = 0;
>  		if (event->attr.pinned) {
>  			perf_cgroup_event_disable(event, ctx);
>  			perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
> @@ -3962,11 +4055,15 @@ static int merge_sched_in(struct perf_event *event,
> void *data)
>  
>  static void pmu_groups_sched_in(struct perf_event_context *ctx,
>  				struct perf_event_groups *groups,
> -				struct pmu *pmu)
> +				struct pmu *pmu,
> +				enum event_type_t event_type)
>  {
> -	int can_add_hw = 1;
> +	struct merge_sched_data msd = {
> +		.can_add_hw = 1,
> +		.event_type = event_type,
> +	};
>  	visit_groups_merge(ctx, groups, smp_processor_id(), pmu,
> -			   merge_sched_in, &can_add_hw);
> +			   merge_sched_in, &msd);
>  }
>  
>  static void __pmu_ctx_sched_in(struct perf_event_pmu_context *pmu_ctx,
> @@ -3975,9 +4072,9 @@ static void __pmu_ctx_sched_in(struct
> perf_event_pmu_context *pmu_ctx,
>  	struct perf_event_context *ctx = pmu_ctx->ctx;
>  
>  	if (event_type & EVENT_PINNED)
> -		pmu_groups_sched_in(ctx, &ctx->pinned_groups, pmu_ctx->pmu);
> +		pmu_groups_sched_in(ctx, &ctx->pinned_groups, pmu_ctx->pmu,
> event_type);
>  	if (event_type & EVENT_FLEXIBLE)
> -		pmu_groups_sched_in(ctx, &ctx->flexible_groups, pmu_ctx->pmu);
> +		pmu_groups_sched_in(ctx, &ctx->flexible_groups, pmu_ctx->pmu,
> event_type);
>  }
>  
>  static void
> @@ -3994,9 +4091,11 @@ ctx_sched_in(struct perf_event_context *ctx, struct
> pmu *pmu, enum event_type_t
>  		return;
>  
>  	if (!(is_active & EVENT_TIME)) {
> +		/* EVENT_TIME should be active while the guest runs */
> +		WARN_ON_ONCE(event_type & EVENT_GUEST);
>  		/* start ctx time */
>  		__update_context_time(ctx, false);
> -		perf_cgroup_set_timestamp(cpuctx);
> +		perf_cgroup_set_timestamp(cpuctx, false);
>  		/*
>  		 * CPU-release for the below ->is_active store,
>  		 * see __load_acquire() in perf_event_time_now()
> @@ -4012,7 +4111,23 @@ ctx_sched_in(struct perf_event_context *ctx, struct
> pmu *pmu, enum event_type_t
>  		WARN_ON_ONCE(cpuctx->task_ctx != ctx);
>  	}
>  
> -	is_active ^= ctx->is_active; /* changed bits */
> +	if (event_type & EVENT_GUEST) {
> +		/*
> +		 * Schedule in the required exclude_guest events of PMU
> +		 * with PERF_PMU_CAP_MEDIATED_VPMU.
> +		 */
> +		is_active = event_type & EVENT_ALL;
> +
> +		/*
> +		 * Update ctx time to set the new start time for
> +		 * the exclude_guest events.
> +		 */
> +		update_context_time(ctx);
> +		update_cgrp_time_from_cpuctx(cpuctx, false);
> +		barrier();
> +	} else {
> +		is_active ^= ctx->is_active; /* changed bits */
> +	}
>  
>  	/*
>  	 * First go through the list and put on any pinned groups
>  	 */
>  	if (is_active & EVENT_PINNED) {
>  		for_each_epc(pmu_ctx, ctx, pmu, event_type)
> -			__pmu_ctx_sched_in(pmu_ctx, EVENT_PINNED);
> +			__pmu_ctx_sched_in(pmu_ctx, EVENT_PINNED | (event_type
> & EVENT_GUEST));
>  	}
>  
>  	/* Then walk through the lower prio flexible groups */
>  	if (is_active & EVENT_FLEXIBLE) {
>  		for_each_epc(pmu_ctx, ctx, pmu, event_type)
> -			__pmu_ctx_sched_in(pmu_ctx, EVENT_FLEXIBLE);
> +			__pmu_ctx_sched_in(pmu_ctx, EVENT_FLEXIBLE |
> (event_type & EVENT_GUEST));
>  	}
>  }
>  
> @@ -6285,23 +6400,25 @@ void perf_event_update_userpage(struct perf_event
> *event)
>  	if (!rb)
>  		goto unlock;
>  
> -	/*
> -	 * compute total_time_enabled, total_time_running
> -	 * based on snapshot values taken when the event
> -	 * was last scheduled in.
> -	 *
> -	 * we cannot simply called update_context_time()
> -	 * because of locking issue as we can be called in
> -	 * NMI context
> -	 */
> -	calc_timer_values(event, &now, &enabled, &running);
> -
> -	userpg = rb->user_page;
>  	/*
>  	 * Disable preemption to guarantee consistent time stamps are stored to
>  	 * the user page.
>  	 */
>  	preempt_disable();
> +
> +	/*
> +	 * compute total_time_enabled, total_time_running
> +	 * based on snapshot values taken when the event
> +	 * was last scheduled in.
> +	 *
> +	 * we cannot simply called update_context_time()
> +	 * because of locking issue as we can be called in
> +	 * NMI context
> +	 */
> +	calc_timer_values(event, &now, &enabled, &running);
> +
> +	userpg = rb->user_page;
> +
>  	++userpg->lock;
>  	barrier();
>  	userpg->index = perf_event_index(event);
> -- 
> 2.49.0.395.g12beb8f557-goog
> 
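
One more thing, mostly to confirm my understanding of how this is meant
to be driven.  I'm assuming a later patch in the series adds the actual
VM-entry/exit hooks; the names below are made up, and locking plus the
task context are left out, so this is only a sketch of the flow I have
in mind:

	/* hypothetical hook, called before handing the counters to the guest */
	void perf_guest_enter(void)
	{
		struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);

		/* stop exclude_guest events of mediated-vPMU PMUs, start timeguest */
		ctx_sched_out(&cpuctx->ctx, NULL, EVENT_GUEST);
		__this_cpu_write(perf_in_guest, true);
	}

	/* hypothetical hook, called after the guest gives the counters back */
	void perf_guest_exit(void)
	{
		struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);

		/* accounts the guest period into timeguest, then resumes host events */
		ctx_sched_in(&cpuctx->ctx, NULL, EVENT_ALL | EVENT_GUEST);
		__this_cpu_write(perf_in_guest, false);
	}

Is that roughly the intended usage, and is the ordering of the
perf_in_guest update on the exit side right?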