mmea error status in aldebaran

Hawking Zhang Fri, 16 Apr 2021 02:45:35 -0700

In aldebaran, driver only needs to harvest SDP
RdRspStatus, WrRspStatus and first parity error
on RdRsp data. Check error type before harvest
error information.


Signed-off-by: Hawking Zhang <hawking.zh...@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c | 21 ++++++++++++---------
 drivers/gpu/drm/amd/amdgpu/mmhub_v1_7.c | 11 +++++++----
 2 files changed, 19 insertions(+), 13 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c
index 9ca76a3ac38c..91427543aabe 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c
@@ -808,7 +808,7 @@ static struct gfx_v9_4_2_utc_block gfx_v9_4_2_utc_blocks[] 
= {
          REG_SET_FIELD(0, ATC_L2_CACHE_4K_DSM_CNTL, WRITE_COUNTERS, 1) },
 };
 
-static const struct soc15_reg_entry gfx_v9_4_2_rdrsp_status_regs =
+static const struct soc15_reg_entry gfx_v9_4_2_ea_err_status_regs =
        { SOC15_REG_ENTRY(GC, 0, regGCEA_ERR_STATUS), 0, 1, 16 };
 
 static int gfx_v9_4_2_get_reg_error_count(struct amdgpu_device *adev,
@@ -1040,11 +1040,11 @@ static void gfx_v9_4_2_reset_ea_err_status(struct 
amdgpu_device *adev)
        uint32_t i, j;
 
        mutex_lock(&adev->grbm_idx_mutex);
-       for (i = 0; i < gfx_v9_4_2_rdrsp_status_regs.se_num; i++) {
-               for (j = 0; j < gfx_v9_4_2_rdrsp_status_regs.instance;
+       for (i = 0; i < gfx_v9_4_2_ea_err_status_regs.se_num; i++) {
+               for (j = 0; j < gfx_v9_4_2_ea_err_status_regs.instance;
                     j++) {
                        gfx_v9_4_2_select_se_sh(adev, i, 0, j);
-                       
WREG32(SOC15_REG_ENTRY_OFFSET(gfx_v9_4_2_rdrsp_status_regs), 0x10);
+                       
WREG32(SOC15_REG_ENTRY_OFFSET(gfx_v9_4_2_ea_err_status_regs), 0x10);
                }
        }
        gfx_v9_4_2_select_se_sh(adev, 0xffffffff, 0xffffffff, 0xffffffff);
@@ -1089,17 +1089,20 @@ static void gfx_v9_4_2_query_ea_err_status(struct 
amdgpu_device *adev)
 
        mutex_lock(&adev->grbm_idx_mutex);
 
-       for (i = 0; i < gfx_v9_4_2_rdrsp_status_regs.se_num; i++) {
-               for (j = 0; j < gfx_v9_4_2_rdrsp_status_regs.instance;
+       for (i = 0; i < gfx_v9_4_2_ea_err_status_regs.se_num; i++) {
+               for (j = 0; j < gfx_v9_4_2_ea_err_status_regs.instance;
                     j++) {
                        gfx_v9_4_2_select_se_sh(adev, i, 0, j);
                        reg_value = RREG32(SOC15_REG_ENTRY_OFFSET(
-                               gfx_v9_4_2_rdrsp_status_regs));
-                       if (reg_value)
+                               gfx_v9_4_2_ea_err_status_regs));
+                       if (REG_GET_FIELD(reg_value, GCEA_ERR_STATUS, 
SDP_RDRSP_STATUS) ||
+                           REG_GET_FIELD(reg_value, GCEA_ERR_STATUS, 
SDP_WRRSP_STATUS) ||
+                           REG_GET_FIELD(reg_value, GCEA_ERR_STATUS, 
SDP_RDRSP_DATAPARITY_ERROR)) {
                                dev_warn(adev->dev, "GCEA err detected at 
instance: %d, status: 0x%x!\n",
                                                j, reg_value);
+                       }
                        /* clear after read */
-                       
WREG32(SOC15_REG_ENTRY_OFFSET(gfx_v9_4_2_rdrsp_status_regs), 0x10);
+                       
WREG32(SOC15_REG_ENTRY_OFFSET(gfx_v9_4_2_ea_err_status_regs), 0x10);
                }
        }
 
diff --git a/drivers/gpu/drm/amd/amdgpu/mmhub_v1_7.c 
b/drivers/gpu/drm/amd/amdgpu/mmhub_v1_7.c
index d0f41346ea0c..cc69c434d0de 100644
--- a/drivers/gpu/drm/amd/amdgpu/mmhub_v1_7.c
+++ b/drivers/gpu/drm/amd/amdgpu/mmhub_v1_7.c
@@ -1286,7 +1286,7 @@ static void mmhub_v1_7_reset_ras_error_count(struct 
amdgpu_device *adev)
        }
 }
 
-static const struct soc15_reg_entry mmhub_v1_7_err_status_regs[] = {
+static const struct soc15_reg_entry mmhub_v1_7_ea_err_status_regs[] = {
        { SOC15_REG_ENTRY(MMHUB, 0, regMMEA0_ERR_STATUS), 0, 0, 0 },
        { SOC15_REG_ENTRY(MMHUB, 0, regMMEA1_ERR_STATUS), 0, 0, 0 },
        { SOC15_REG_ENTRY(MMHUB, 0, regMMEA2_ERR_STATUS), 0, 0, 0 },
@@ -1303,12 +1303,15 @@ static void mmhub_v1_7_query_ras_error_status(struct 
amdgpu_device *adev)
        if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__MMHUB))
                return;
 
-       for (i = 0; i < ARRAY_SIZE(mmhub_v1_7_err_status_regs); i++) {
+       for (i = 0; i < ARRAY_SIZE(mmhub_v1_7_ea_err_status_regs); i++) {
                reg_value =
-                       
RREG32(SOC15_REG_ENTRY_OFFSET(mmhub_v1_7_err_status_regs[i]));
-               if (reg_value)
+                       
RREG32(SOC15_REG_ENTRY_OFFSET(mmhub_v1_7_ea_err_status_regs[i]));
+               if (REG_GET_FIELD(reg_value, MMEA0_ERR_STATUS, 
SDP_RDRSP_STATUS) ||
+                   REG_GET_FIELD(reg_value, MMEA0_ERR_STATUS, 
SDP_WRRSP_STATUS) ||
+                   REG_GET_FIELD(reg_value, MMEA0_ERR_STATUS, 
SDP_RDRSP_DATAPARITY_ERROR)) {
                        dev_warn(adev->dev, "MMHUB EA err detected at instance: 
%d, status: 0x%x!\n",
                                        i, reg_value);
+               }
        }
 }
 
-- 
2.17.1

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

[PATCH 2/2] drm/amdgpu: only harvest gcea/mmea error status in aldebaran

Reply via email to