On Mon, Feb 02, 2026 at 12:13:59PM +0530, Riana Tauro wrote:
> Initialize DRM RAS in hw error init. Map the UAPI error severities
> with the hardware error severities and refactor file.
> 
> Signed-off-by: Riana Tauro <[email protected]>
> ---
>  drivers/gpu/drm/xe/xe_drm_ras_types.h |  8 ++++
>  drivers/gpu/drm/xe/xe_hw_error.c      | 68 ++++++++++++++++-----------
>  2 files changed, 48 insertions(+), 28 deletions(-)
> 
> diff --git a/drivers/gpu/drm/xe/xe_drm_ras_types.h 
> b/drivers/gpu/drm/xe/xe_drm_ras_types.h
> index 0ac4ae324f37..beed48811d6a 100644
> --- a/drivers/gpu/drm/xe/xe_drm_ras_types.h
> +++ b/drivers/gpu/drm/xe/xe_drm_ras_types.h
> @@ -11,6 +11,14 @@
>  
>  struct drm_ras_node;
>  
> +/* Error categories reported by hardware */
> +enum hardware_error {
> +     HARDWARE_ERROR_CORRECTABLE = 0,
> +     HARDWARE_ERROR_NONFATAL = 1,
> +     HARDWARE_ERROR_FATAL = 2,

I'd align "= x" using tabs for readability.

> +     HARDWARE_ERROR_MAX,

Guaranteed last member, so redundant comma.

> +};

Also, just curious. Are these expected to be reused anywhere?
If not, they're probably better off in the .c file.

...

> @@ -86,8 +78,8 @@ static void csc_hw_error_handler(struct xe_tile *tile, 
> const enum hardware_error
>               fw_err = xe_mmio_read32(mmio, HEC_UNCORR_FW_ERR_DW0(base));
>               for_each_set_bit(err_bit, &fw_err, HEC_UNCORR_FW_ERR_BITS) {
>                       drm_err_ratelimited(&xe->drm, HW_ERR
> -                                         "%s: HEC Uncorrected FW %s error 
> reported, bit[%d] is set\n",
> -                                          hw_err_str, 
> hec_uncorrected_fw_errors[err_bit],
> +                                         "HEC FW %s error reported, bit[%d] 
> is set\n",
> +                                          hec_uncorrected_fw_errors[err_bit],

So we're dropping severity_str here? Did I miss something?

>                                            err_bit);

...

> +static int hw_error_info_init(struct xe_device *xe)
> +{
> +     int ret;
> +
> +     if (xe->info.platform != XE_PVC)
> +             return 0;
> +
> +     ret = xe_drm_ras_allocate_nodes(xe);

Why not just

        return xe_drm_ras_allocate_nodes();

Tidy? ;)

> +     if (ret)
> +             return ret;
> +
> +     return 0;
> +}
> +
>  /*
>   * Process hardware errors during boot
>   */
> @@ -172,11 +179,16 @@ static void process_hw_errors(struct xe_device *xe)
>  void xe_hw_error_init(struct xe_device *xe)
>  {
>       struct xe_tile *tile = xe_device_get_root_tile(xe);
> +     int ret;
>  
>       if (!IS_DGFX(xe) || IS_SRIOV_VF(xe))
>               return;
>  
>       INIT_WORK(&tile->csc_hw_error_work, csc_hw_error_work);
>  
> +     ret = hw_error_info_init(xe);
> +     if (ret)
> +             drm_warn(&xe->drm, "Failed to allocate DRM RAS nodes\n");

This is less likely due to any hardware limitation, so I think
drm_err() would be more appropriate here.

Raag

> +
>       process_hw_errors(xe);
>  }
> -- 
> 2.47.1
> 

Reply via email to