On Mon, Nov 18, 2024 at 05:11:11PM +0530, Kajol Jain wrote: > To support performance measurement for KVM on PowerVM(KoP) > feature, PowerVM hypervisor has added couple of new software > counters in Virtual Process Area(VPA) of the partition. > > Commit e1f288d2f9c69 ("KVM: PPC: Book3S HV nestedv2: Add > support for reading VPA counters for pseries guests") > have updated the paca fields with corresponding changes. > > Proposed perf interface is to expose these new software > counters for monitoring of context switch latencies and > runtime aggregate. Perf interface driver is called > "vpa_pmu" and it has dependency on KVM and perf, hence > added new config called "VPA_PMU" which depends on > "CONFIG_KVM_BOOK3S_64_HV" and "CONFIG_HV_PERF_CTRS". > Since, kvm and kvm_host are currently compiled as built-in > modules, this perf interface takes the same path and > registered as a module. > > vpa_pmu perf interface needs access to some of the kvm > functions and structures like kvmhv_get_l2_counters_status(), > hence kvm_book3s_64.h and kvm_ppc.h are included. > Below are the events added to monitor KoP: > > vpa_pmu/l1_to_l2_lat/ > vpa_pmu/l2_to_l1_lat/ > vpa_pmu/l2_runtime_agg/ > > and vpa_pmu driver supports only per-cpu monitoring with this patch. > Example usage: > > [command]# perf stat -e vpa_pmu/l1_to_l2_lat/ -a -I 1000 > 1.001017682 727,200 vpa_pmu/l1_to_l2_lat/ > 2.003540491 1,118,824 vpa_pmu/l1_to_l2_lat/ > 3.005699458 1,919,726 vpa_pmu/l1_to_l2_lat/ > 4.007827011 2,364,630 vpa_pmu/l1_to_l2_lat/ > > Signed-off-by: Kajol Jain <kj...@linux.ibm.com> > Co-developed-by: Madhavan Srinivasan <ma...@linux.ibm.com> > Signed-off-by: Madhavan Srinivasan <ma...@linux.ibm.com> > --- > Changelog: > > v1 -> v2 > - Rebase the patch on top of kvm typo fix patch: > > https://github.com/linuxppc/linux/commit/590d2f9347f7974d7954400e5d937672fd844a8b > - Fix the config check reported by kernel test robot: > https://lore.kernel.org/oe-kbuild-all/202411171117.eq9jtacb-...@intel.com/ > > arch/powerpc/include/asm/kvm_book3s_64.h | 3 + > arch/powerpc/kvm/book3s_hv.c | 19 +++ > arch/powerpc/perf/Makefile | 2 + > arch/powerpc/perf/vpa-pmu.c | 197 +++++++++++++++++++++++ > arch/powerpc/platforms/pseries/Kconfig | 14 ++ > 5 files changed, 235 insertions(+) > create mode 100644 arch/powerpc/perf/vpa-pmu.c > > diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h > b/arch/powerpc/include/asm/kvm_book3s_64.h > index 11065313d4c1..f620e3126d68 100644 > --- a/arch/powerpc/include/asm/kvm_book3s_64.h > +++ b/arch/powerpc/include/asm/kvm_book3s_64.h > @@ -688,6 +688,9 @@ int kvmhv_counters_tracepoint_regfunc(void); > void kvmhv_counters_tracepoint_unregfunc(void); > int kvmhv_get_l2_counters_status(void); > void kvmhv_set_l2_counters_status(int cpu, bool status); > +u64 kvmhv_get_l1_to_l2_cs_time(void); > +u64 kvmhv_get_l2_to_l1_cs_time(void); > +u64 kvmhv_get_l2_runtime_agg(void); > > #endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */ > > diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c > index d575f7c7ab38..e618533dfc4b 100644 > --- a/arch/powerpc/kvm/book3s_hv.c > +++ b/arch/powerpc/kvm/book3s_hv.c > @@ -4153,6 +4153,7 @@ void kvmhv_set_l2_counters_status(int cpu, bool status) > else > lppaca_of(cpu).l2_counters_enable = 0; > } > +EXPORT_SYMBOL(kvmhv_set_l2_counters_status); > > int kvmhv_counters_tracepoint_regfunc(void) > { > @@ -4192,6 +4193,24 @@ static void do_trace_nested_cs_time(struct kvm_vcpu > *vcpu) > *l2_runtime_agg_ptr = l2_runtime_ns; > } > > +u64 kvmhv_get_l1_to_l2_cs_time(void) > +{ > + return tb_to_ns(be64_to_cpu(get_lppaca()->l1_to_l2_cs_tb)); > +} > +EXPORT_SYMBOL(kvmhv_get_l1_to_l2_cs_time); > + > +u64 kvmhv_get_l2_to_l1_cs_time(void) > +{ > + return tb_to_ns(be64_to_cpu(get_lppaca()->l2_to_l1_cs_tb)); > +} > +EXPORT_SYMBOL(kvmhv_get_l2_to_l1_cs_time); > + > +u64 kvmhv_get_l2_runtime_agg(void) > +{ > + return tb_to_ns(be64_to_cpu(get_lppaca()->l2_runtime_tb)); > +} > +EXPORT_SYMBOL(kvmhv_get_l2_runtime_agg); > + > #else > int kvmhv_get_l2_counters_status(void) > { > diff --git a/arch/powerpc/perf/Makefile b/arch/powerpc/perf/Makefile > index 4f53d0b97539..ac2cf58d62db 100644 > --- a/arch/powerpc/perf/Makefile > +++ b/arch/powerpc/perf/Makefile > @@ -16,6 +16,8 @@ obj-$(CONFIG_FSL_EMB_PERF_EVENT_E500) += e500-pmu.o > e6500-pmu.o > > obj-$(CONFIG_HV_PERF_CTRS) += hv-24x7.o hv-gpci.o hv-common.o > > +obj-$(CONFIG_VPA_PMU) += vpa-pmu.o
Do we need a new config option for this? I see you need 2 dependencies for vpa_pmu, so can we do the following instead? obj-$(CONFIG_KVM_BOOK3S_64_HV) += vpa-pmu.o and then in the init func of vpa_pmu, we can add the check for HV_PERF_CTRS static int __init pseries_vpa_pmu_init(void) { <snip> if (!firmware_has_feature(FW_FEATURE_LPAR) || is_kvm_guest() || !IS_ENABLED(CONFIG_HV_PERF_CTRS)) return -ENODEV; Thoughts? > + > obj-$(CONFIG_PPC_8xx) += 8xx-pmu.o > > obj-$(CONFIG_PPC64) += $(obj64-y) > diff --git a/arch/powerpc/perf/vpa-pmu.c b/arch/powerpc/perf/vpa-pmu.c > new file mode 100644 > index 000000000000..2c785eee2f71 > --- /dev/null > +++ b/arch/powerpc/perf/vpa-pmu.c > @@ -0,0 +1,197 @@ > +// SPDX-License-Identifier: GPL-2.0-or-later > +/* > + * Performance monitoring support for Virtual Processor Area(VPA) based > counters > + * > + * Copyright (C) 2024 IBM Corporation > + */ > +#define pr_fmt(fmt) "vpa_pmu: " fmt > + > +#include <linux/module.h> > +#include <linux/perf_event.h> > +#include <asm/kvm_ppc.h> > +#include <asm/kvm_book3s_64.h> > + > +#define MODULE_VERS "1.0" > +#define MODULE_NAME "pseries_vpa_pmu" > + > +#define EVENT(_name, _code) enum{_name = _code} > + > +#define VPA_PMU_EVENT_VAR(_id) event_attr_##_id > +#define VPA_PMU_EVENT_PTR(_id) (&event_attr_##_id.attr.attr) > + > +static ssize_t vpa_pmu_events_sysfs_show(struct device *dev, > + struct device_attribute *attr, char *page) > +{ > + struct perf_pmu_events_attr *pmu_attr; > + > + pmu_attr = container_of(attr, struct perf_pmu_events_attr, attr); > + > + return sprintf(page, "event=0x%02llx\n", pmu_attr->id); > +} > + > +#define VPA_PMU_EVENT_ATTR(_name, _id) \ > + PMU_EVENT_ATTR(_name, VPA_PMU_EVENT_VAR(_id), _id, \ > + vpa_pmu_events_sysfs_show) > + > +EVENT(L1_TO_L2_CS_LAT, 0x1); > +EVENT(L2_TO_L1_CS_LAT, 0x2); > +EVENT(L2_RUNTIME_AGG, 0x3); > + > +VPA_PMU_EVENT_ATTR(l1_to_l2_lat, L1_TO_L2_CS_LAT); > +VPA_PMU_EVENT_ATTR(l2_to_l1_lat, L2_TO_L1_CS_LAT); > +VPA_PMU_EVENT_ATTR(l2_runtime_agg, L2_RUNTIME_AGG); > + > +static struct attribute *vpa_pmu_events_attr[] = { > + VPA_PMU_EVENT_PTR(L1_TO_L2_CS_LAT), > + VPA_PMU_EVENT_PTR(L2_TO_L1_CS_LAT), > + VPA_PMU_EVENT_PTR(L2_RUNTIME_AGG), > + NULL > +}; > + > +static const struct attribute_group vpa_pmu_events_group = { > + .name = "events", > + .attrs = vpa_pmu_events_attr, > +}; > + > +PMU_FORMAT_ATTR(event, "config:0-31"); > +static struct attribute *vpa_pmu_format_attr[] = { > + &format_attr_event.attr, > + NULL, > +}; > + > +static struct attribute_group vpa_pmu_format_group = { > + .name = "format", > + .attrs = vpa_pmu_format_attr, > +}; > + > +static const struct attribute_group *vpa_pmu_attr_groups[] = { > + &vpa_pmu_events_group, > + &vpa_pmu_format_group, > + NULL > +}; > + > +static int vpa_pmu_event_init(struct perf_event *event) > +{ > + if (event->attr.type != event->pmu->type) > + return -ENOENT; > + > + /* it does not support event sampling mode */ > + if (is_sampling_event(event)) > + return -EOPNOTSUPP; > + > + /* no branch sampling */ > + if (has_branch_stack(event)) > + return -EOPNOTSUPP; > + > + /* Invalid event code */ > + if ((event->attr.config <= 0) || (event->attr.config > 3)) > + return -EINVAL; > + > + return 0; > +} > + > +static unsigned long get_counter_data(struct perf_event *event) > +{ > + unsigned int config = event->attr.config; > + u64 data; > + > + switch (config) { > + case L1_TO_L2_CS_LAT: > + data = kvmhv_get_l1_to_l2_cs_time(); > + break; > + case L2_TO_L1_CS_LAT: > + data = kvmhv_get_l2_to_l1_cs_time(); > + break; > + case L2_RUNTIME_AGG: > + data = kvmhv_get_l2_runtime_agg(); > + break; > + default: > + data = 0; > + break; > + } > + > + return data; > +} > + > +static int vpa_pmu_add(struct perf_event *event, int flags) > +{ > + u64 data; > + > + kvmhv_set_l2_counters_status( > + smp_processor_id(), true); > + > + data = get_counter_data(event); > + local64_set(&event->hw.prev_count, data); > + > + return 0; > +} > + > +static void vpa_pmu_read(struct perf_event *event) > +{ > + u64 prev_data, new_data, final_data; > + > + prev_data = local64_read(&event->hw.prev_count); > + new_data = get_counter_data(event); > + final_data = new_data - prev_data; > + > + local64_add(final_data, &event->count); > +} > + > +static void vpa_pmu_del(struct perf_event *event, int flags) > +{ > + vpa_pmu_read(event); > + > + /* > + * Disable vpa counter accumulation > + */ > + kvmhv_set_l2_counters_status( > + smp_processor_id(), false); > +} > + This won't work well with the kvm_hv:kvmppc_vcpu_stats tracepoint. Consider the below scenario: 1. I start recording data with the kvmppc_vcpu_stats tracepoint (with trace-cmd) and while the data is being recorded, I start using the vpa_pmu driver as well. 2. I now stop the vpa_pmu driver. This disables the l2_counters_enable counter in lppaca. (The VPA flag can be disabled for 1 or more cpus depending on the pid/tid options specified) 3. When I hit ctrl^c on the trace-cmd capture, I see partial data with 'trace-cmd report'. To get around this, we can use the fact that the 'l2_counters_enable' flag in lppaca supports all non-zero values. So we can see if we can use that as a counter instead of just using 0/1 to enable/disable recording of data. This may need some brainstorming to get right. I see the patches have been pulled in already, but I think we need to consider the above points. Thanks, Gautam