Initialize DRM RAS in hw error init. Map the hardware error severities to the UAPI error severities and refactor the file.
Signed-off-by: Riana Tauro <[email protected]> --- drivers/gpu/drm/xe/xe_drm_ras_types.h | 8 ++++ drivers/gpu/drm/xe/xe_hw_error.c | 68 ++++++++++++++++----------- 2 files changed, 48 insertions(+), 28 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_drm_ras_types.h b/drivers/gpu/drm/xe/xe_drm_ras_types.h index 0ac4ae324f37..beed48811d6a 100644 --- a/drivers/gpu/drm/xe/xe_drm_ras_types.h +++ b/drivers/gpu/drm/xe/xe_drm_ras_types.h @@ -11,6 +11,14 @@ struct drm_ras_node; +/* Error categories reported by hardware */ +enum hardware_error { + HARDWARE_ERROR_CORRECTABLE = 0, + HARDWARE_ERROR_NONFATAL = 1, + HARDWARE_ERROR_FATAL = 2, + HARDWARE_ERROR_MAX, +}; + /** * struct xe_drm_ras_counter - XE RAS counter * diff --git a/drivers/gpu/drm/xe/xe_hw_error.c b/drivers/gpu/drm/xe/xe_hw_error.c index 8c65291f36fc..2019aaaa1ebe 100644 --- a/drivers/gpu/drm/xe/xe_hw_error.c +++ b/drivers/gpu/drm/xe/xe_hw_error.c @@ -10,20 +10,16 @@ #include "regs/xe_irq_regs.h" #include "xe_device.h" +#include "xe_drm_ras.h" #include "xe_hw_error.h" #include "xe_mmio.h" #include "xe_survivability_mode.h" #define HEC_UNCORR_FW_ERR_BITS 4 + extern struct fault_attr inject_csc_hw_error; -/* Error categories reported by hardware */ -enum hardware_error { - HARDWARE_ERROR_CORRECTABLE = 0, - HARDWARE_ERROR_NONFATAL = 1, - HARDWARE_ERROR_FATAL = 2, - HARDWARE_ERROR_MAX, -}; +static const char * const error_severity[] = DRM_XE_RAS_ERROR_SEVERITY_NAMES; static const char * const hec_uncorrected_fw_errors[] = { "Fatal", @@ -32,23 +28,18 @@ static const char * const hec_uncorrected_fw_errors[] = { "Data Corruption" }; -static const char *hw_error_to_str(const enum hardware_error hw_err) +static bool fault_inject_csc_hw_error(void) { - switch (hw_err) { - case HARDWARE_ERROR_CORRECTABLE: - return "CORRECTABLE"; - case HARDWARE_ERROR_NONFATAL: - return "NONFATAL"; - case HARDWARE_ERROR_FATAL: - return "FATAL"; - default: - return "UNKNOWN"; - } + return IS_ENABLED(CONFIG_DEBUG_FS) && 
should_fail(&inject_csc_hw_error, 1); } -static bool fault_inject_csc_hw_error(void) +static enum drm_xe_ras_error_severity hw_err_to_severity(enum hardware_error hw_err) { - return IS_ENABLED(CONFIG_DEBUG_FS) && should_fail(&inject_csc_hw_error, 1); + if (hw_err == HARDWARE_ERROR_CORRECTABLE) + return DRM_XE_RAS_ERR_SEV_CORRECTABLE; + + /* Uncorrectable errors comprise of both fatal and non-fatal errors */ + return DRM_XE_RAS_ERR_SEV_UNCORRECTABLE; } static void csc_hw_error_work(struct work_struct *work) @@ -64,7 +55,8 @@ static void csc_hw_error_work(struct work_struct *work) static void csc_hw_error_handler(struct xe_tile *tile, const enum hardware_error hw_err) { - const char *hw_err_str = hw_error_to_str(hw_err); + const enum drm_xe_ras_error_severity severity = hw_err_to_severity(hw_err); + const char *severity_str = error_severity[severity]; struct xe_device *xe = tile_to_xe(tile); struct xe_mmio *mmio = &tile->mmio; u32 base, err_bit, err_src; @@ -77,8 +69,8 @@ static void csc_hw_error_handler(struct xe_tile *tile, const enum hardware_error lockdep_assert_held(&xe->irq.lock); err_src = xe_mmio_read32(mmio, HEC_UNCORR_ERR_STATUS(base)); if (!err_src) { - drm_err_ratelimited(&xe->drm, HW_ERR "Tile%d reported HEC_ERR_STATUS_%s blank\n", - tile->id, hw_err_str); + drm_err_ratelimited(&xe->drm, HW_ERR "Tile%d reported %s HEC_ERR_STATUS register blank\n", + tile->id, severity_str); return; } @@ -86,8 +78,8 @@ static void csc_hw_error_handler(struct xe_tile *tile, const enum hardware_error fw_err = xe_mmio_read32(mmio, HEC_UNCORR_FW_ERR_DW0(base)); for_each_set_bit(err_bit, &fw_err, HEC_UNCORR_FW_ERR_BITS) { drm_err_ratelimited(&xe->drm, HW_ERR - "%s: HEC Uncorrected FW %s error reported, bit[%d] is set\n", - hw_err_str, hec_uncorrected_fw_errors[err_bit], + "HEC FW %s error reported, bit[%d] is set\n", + hec_uncorrected_fw_errors[err_bit], err_bit); schedule_work(&tile->csc_hw_error_work); @@ -99,7 +91,8 @@ static void csc_hw_error_handler(struct xe_tile *tile, 
const enum hardware_error static void hw_error_source_handler(struct xe_tile *tile, const enum hardware_error hw_err) { - const char *hw_err_str = hw_error_to_str(hw_err); + const enum drm_xe_ras_error_severity severity = hw_err_to_severity(hw_err); + const char *severity_str = error_severity[severity]; struct xe_device *xe = tile_to_xe(tile); unsigned long flags; u32 err_src; @@ -110,8 +103,8 @@ static void hw_error_source_handler(struct xe_tile *tile, const enum hardware_er spin_lock_irqsave(&xe->irq.lock, flags); err_src = xe_mmio_read32(&tile->mmio, DEV_ERR_STAT_REG(hw_err)); if (!err_src) { - drm_err_ratelimited(&xe->drm, HW_ERR "Tile%d reported DEV_ERR_STAT_%s blank!\n", - tile->id, hw_err_str); + drm_err_ratelimited(&xe->drm, HW_ERR "Tile%d reported %s DEV_ERR_STAT register blank!\n", + tile->id, severity_str); goto unlock; } @@ -146,6 +139,20 @@ void xe_hw_error_irq_handler(struct xe_tile *tile, const u32 master_ctl) hw_error_source_handler(tile, hw_err); } +static int hw_error_info_init(struct xe_device *xe) +{ + int ret; + + if (xe->info.platform != XE_PVC) + return 0; + + ret = xe_drm_ras_allocate_nodes(xe); + if (ret) + return ret; + + return 0; +} + /* * Process hardware errors during boot */ @@ -172,11 +179,16 @@ static void process_hw_errors(struct xe_device *xe) void xe_hw_error_init(struct xe_device *xe) { struct xe_tile *tile = xe_device_get_root_tile(xe); + int ret; if (!IS_DGFX(xe) || IS_SRIOV_VF(xe)) return; INIT_WORK(&tile->csc_hw_error_work, csc_hw_error_work); + ret = hw_error_info_init(xe); + if (ret) + drm_warn(&xe->drm, "Failed to allocate DRM RAS nodes\n"); + process_hw_errors(xe); } -- 2.47.1
