On Mon, Feb 02, 2026 at 12:13:59PM +0530, Riana Tauro wrote:
> Initialize DRM RAS in hw error init. Map the UAPI error severities
> with the hardware error severities and refactor file.
>
> Signed-off-by: Riana Tauro <[email protected]>
> ---
> drivers/gpu/drm/xe/xe_drm_ras_types.h | 8 ++++
> drivers/gpu/drm/xe/xe_hw_error.c | 68 ++++++++++++++++-----------
> 2 files changed, 48 insertions(+), 28 deletions(-)
>
> diff --git a/drivers/gpu/drm/xe/xe_drm_ras_types.h
> b/drivers/gpu/drm/xe/xe_drm_ras_types.h
> index 0ac4ae324f37..beed48811d6a 100644
> --- a/drivers/gpu/drm/xe/xe_drm_ras_types.h
> +++ b/drivers/gpu/drm/xe/xe_drm_ras_types.h
> @@ -11,6 +11,14 @@
>
> struct drm_ras_node;
>
> +/* Error categories reported by hardware */
> +enum hardware_error {
> + HARDWARE_ERROR_CORRECTABLE = 0,
> + HARDWARE_ERROR_NONFATAL = 1,
> + HARDWARE_ERROR_FATAL = 2,
I'd align "= x" using tabs for readability.
> + HARDWARE_ERROR_MAX,
Guaranteed last member, so redundant comma.
> +};
Also, just curious. Are these expected to be reused anywhere?
If not, they're probably better off in the .c file.
...
> @@ -86,8 +78,8 @@ static void csc_hw_error_handler(struct xe_tile *tile,
> const enum hardware_error
> fw_err = xe_mmio_read32(mmio, HEC_UNCORR_FW_ERR_DW0(base));
> for_each_set_bit(err_bit, &fw_err, HEC_UNCORR_FW_ERR_BITS) {
> drm_err_ratelimited(&xe->drm, HW_ERR
> - "%s: HEC Uncorrected FW %s error
> reported, bit[%d] is set\n",
> - hw_err_str,
> hec_uncorrected_fw_errors[err_bit],
> + "HEC FW %s error reported, bit[%d]
> is set\n",
> + hec_uncorrected_fw_errors[err_bit],
So we're dropping severity_str here? Did I miss something?
> err_bit);
...
> +static int hw_error_info_init(struct xe_device *xe)
> +{
> + int ret;
> +
> + if (xe->info.platform != XE_PVC)
> + return 0;
> +
> + ret = xe_drm_ras_allocate_nodes(xe);
Why not just
return xe_drm_ras_allocate_nodes();
Tidy? ;)
> + if (ret)
> + return ret;
> +
> + return 0;
> +}
> +
> /*
> * Process hardware errors during boot
> */
> @@ -172,11 +179,16 @@ static void process_hw_errors(struct xe_device *xe)
> void xe_hw_error_init(struct xe_device *xe)
> {
> struct xe_tile *tile = xe_device_get_root_tile(xe);
> + int ret;
>
> if (!IS_DGFX(xe) || IS_SRIOV_VF(xe))
> return;
>
> INIT_WORK(&tile->csc_hw_error_work, csc_hw_error_work);
>
> + ret = hw_error_info_init(xe);
> + if (ret)
> + drm_warn(&xe->drm, "Failed to allocate DRM RAS nodes\n");
This is less likely due to any hardware limitation, so I think
drm_err() would be more appropriate here.
Raag
> +
> process_hw_errors(xe);
> }
> --
> 2.47.1
>