On 2025-02-20 10:36 a.m., Lucas De Marchi wrote:
> On some boots the read of MSR_PP1_ENERGY_STATUS msr returns 0, causing
> perf_msr_probe() to make the power/events/energy-gpu event non-visible.
> When that happens, the msr always read 0 until the graphics module (i915
> for Meteor Lake, xe for Lunar Lake) is loaded. Then it starts returning
> something different and re-loading the rapl module "fixes" it.
> 
> This is tested on the following platforms with the fail rates before
> this patch:
> 
>       Alder Lake S    0/20
>       Arrow Lake H    0/20
>       Lunar Lake M    8/20
>       Meteor Lake U   6/20
>       Raptor Lake P   4/20
>       Raptor Lake S   0/20
> 
> For those platforms failing, use a separate msr list with .no_check
> set so it doesn't check the runtime value to create the event - it will
> just return 0 until the i915/xe module initializes the GPU.
> 
> The issue https://github.com/ulissesf/qmassa/issues/4 is workarounded by
> reading the MSR directly since it works after xe is loaded, but the
> issue with not having the perf event is still there.
> 
> Closes: https://github.com/ulissesf/qmassa/issues/4
> Closes: https://gitlab.freedesktop.org/drm/xe/kernel/-/issues/4241
> Signed-off-by: Lucas De Marchi <lucas.demar...@intel.com
> ---
> 
> Maybe a clearer alternative is to just move all the platforms after
> RAPTORLAKE with a gpu to use the new msr list.
> 
>  arch/x86/events/rapl.c | 26 ++++++++++++++++++++++----
>  1 file changed, 22 insertions(+), 4 deletions(-)
> 
> diff --git a/arch/x86/events/rapl.c b/arch/x86/events/rapl.c
> index 4952faf03e82d..18e324b8fa82c 100644
> --- a/arch/x86/events/rapl.c
> +++ b/arch/x86/events/rapl.c
> @@ -588,6 +588,14 @@ static struct perf_msr intel_rapl_spr_msrs[] = {
>       [PERF_RAPL_PSYS] = { MSR_PLATFORM_ENERGY_STATUS, 
> &rapl_events_psys_group,  test_msr, true, RAPL_MSR_MASK },
>  };
>  
> +static struct perf_msr intel_rapl_mtl_msrs[] = {
> +     [PERF_RAPL_PP0]  = { MSR_PP0_ENERGY_STATUS,      
> &rapl_events_cores_group, test_msr, false, RAPL_MSR_MASK },
> +     [PERF_RAPL_PKG]  = { MSR_PKG_ENERGY_STATUS,      
> &rapl_events_pkg_group,   test_msr, false, RAPL_MSR_MASK },
> +     [PERF_RAPL_RAM]  = { MSR_DRAM_ENERGY_STATUS,     
> &rapl_events_ram_group,   test_msr, false, RAPL_MSR_MASK },
> +     [PERF_RAPL_PP1]  = { MSR_PP1_ENERGY_STATUS,      
> &rapl_events_gpu_group,   test_msr, true,  RAPL_MSR_MASK },
> +     [PERF_RAPL_PSYS] = { MSR_PLATFORM_ENERGY_STATUS, 
> &rapl_events_psys_group,  test_msr, false, RAPL_MSR_MASK },
> +};
> +
>  /*
>   * Force to PERF_RAPL_PKG_EVENTS_MAX size due to:
>   * - perf_msr_probe(PERF_RAPL_PKG_EVENTS_MAX)
> @@ -826,6 +834,16 @@ static struct rapl_model model_spr = {
>       .rapl_pkg_msrs  = intel_rapl_spr_msrs,
>  };
>  
> +static struct rapl_model model_rpl = {
> +     .pkg_events     = BIT(PERF_RAPL_PP0) |
> +                       BIT(PERF_RAPL_PKG) |
> +                       BIT(PERF_RAPL_RAM) |
> +                       BIT(PERF_RAPL_PP1) |
> +                       BIT(PERF_RAPL_PSYS),
> +     .msr_power_unit = MSR_RAPL_POWER_UNIT,
> +     .rapl_pkg_msrs  = intel_rapl_mtl_msrs,

It's better to make the name consistent, e.g., intel_rapl_rpl_msrs.

Thanks,
Kan
> +};
> +
>  static struct rapl_model model_amd_hygon = {
>       .pkg_events     = BIT(PERF_RAPL_PKG),
>       .core_events    = BIT(PERF_RAPL_CORE),
> @@ -873,13 +891,13 @@ static const struct x86_cpu_id rapl_model_match[] 
> __initconst = {
>       X86_MATCH_VFM(INTEL_SAPPHIRERAPIDS_X,   &model_spr),
>       X86_MATCH_VFM(INTEL_EMERALDRAPIDS_X,    &model_spr),
>       X86_MATCH_VFM(INTEL_RAPTORLAKE,         &model_skl),
> -     X86_MATCH_VFM(INTEL_RAPTORLAKE_P,       &model_skl),
> +     X86_MATCH_VFM(INTEL_RAPTORLAKE_P,       &model_rpl),
>       X86_MATCH_VFM(INTEL_RAPTORLAKE_S,       &model_skl),
> -     X86_MATCH_VFM(INTEL_METEORLAKE,         &model_skl),
> -     X86_MATCH_VFM(INTEL_METEORLAKE_L,       &model_skl),
> +     X86_MATCH_VFM(INTEL_METEORLAKE,         &model_rpl),
> +     X86_MATCH_VFM(INTEL_METEORLAKE_L,       &model_rpl),
>       X86_MATCH_VFM(INTEL_ARROWLAKE_H,        &model_skl),
>       X86_MATCH_VFM(INTEL_ARROWLAKE,          &model_skl),
> -     X86_MATCH_VFM(INTEL_LUNARLAKE_M,        &model_skl),
> +     X86_MATCH_VFM(INTEL_LUNARLAKE_M,        &model_rpl),
>       {},
>  };
>  MODULE_DEVICE_TABLE(x86cpu, rapl_model_match);

Reply via email to