An SDP RdRspStatus/WrRspStatus error, or the first parity error on
RdRsp data, can cause a system fatal error on Arcturus.
The GPU will be frozen in such a case.

The driver needs to harvest this error information before
resetting the GPU. Check the error type to avoid harvesting normal
gcea/mmea information.

Signed-off-by: Hawking Zhang <hawking.zh...@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/gfx_v9_4.c            | 16 +++++++++++-----
 drivers/gpu/drm/amd/amdgpu/mmhub_v9_4.c          |  8 +++++++-
 .../amd/include/asic_reg/gc/gc_9_4_1_sh_mask.h   | 16 ++++++++++++++++
 3 files changed, 34 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4.c
index 830080ff90d8..b4789dfc2bb9 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4.c
@@ -994,7 +994,7 @@ static int gfx_v9_4_ras_error_inject(struct amdgpu_device 
*adev,
        return ret;
 }
 
-static const struct soc15_reg_entry gfx_v9_4_rdrsp_status_regs =
+static const struct soc15_reg_entry gfx_v9_4_ea_err_status_regs =
        { SOC15_REG_ENTRY(GC, 0, mmGCEA_ERR_STATUS), 0, 1, 32 };
 
 static void gfx_v9_4_query_ras_error_status(struct amdgpu_device *adev)
@@ -1007,15 +1007,21 @@ static void gfx_v9_4_query_ras_error_status(struct 
amdgpu_device *adev)
 
        mutex_lock(&adev->grbm_idx_mutex);
 
-       for (i = 0; i < gfx_v9_4_rdrsp_status_regs.se_num; i++) {
-               for (j = 0; j < gfx_v9_4_rdrsp_status_regs.instance;
+       for (i = 0; i < gfx_v9_4_ea_err_status_regs.se_num; i++) {
+               for (j = 0; j < gfx_v9_4_ea_err_status_regs.instance;
                     j++) {
                        gfx_v9_4_select_se_sh(adev, i, 0, j);
                        reg_value = RREG32(SOC15_REG_ENTRY_OFFSET(
-                               gfx_v9_4_rdrsp_status_regs));
-                       if (reg_value)
+                               gfx_v9_4_ea_err_status_regs));
+                       if (REG_GET_FIELD(reg_value, GCEA_ERR_STATUS, 
SDP_RDRSP_STATUS) ||
+                           REG_GET_FIELD(reg_value, GCEA_ERR_STATUS, 
SDP_WRRSP_STATUS) ||
+                           REG_GET_FIELD(reg_value, GCEA_ERR_STATUS, 
SDP_RDRSP_DATAPARITY_ERROR)) {
+                               /* SDP read/write error/parity error in 
FUE_IS_FATAL mode
+                                * can cause system fatal error in arcturus. 
Harvest the error
+                                * status before GPU reset */
                                dev_warn(adev->dev, "GCEA err detected at 
instance: %d, status: 0x%x!\n",
                                                j, reg_value);
+                       }
                }
        }
 
diff --git a/drivers/gpu/drm/amd/amdgpu/mmhub_v9_4.c 
b/drivers/gpu/drm/amd/amdgpu/mmhub_v9_4.c
index 4e2c5f117cef..d28df47cea91 100644
--- a/drivers/gpu/drm/amd/amdgpu/mmhub_v9_4.c
+++ b/drivers/gpu/drm/amd/amdgpu/mmhub_v9_4.c
@@ -1644,9 +1644,15 @@ static void mmhub_v9_4_query_ras_error_status(struct 
amdgpu_device *adev)
        for (i = 0; i < ARRAY_SIZE(mmhub_v9_4_err_status_regs); i++) {
                reg_value =
                        
RREG32(SOC15_REG_ENTRY_OFFSET(mmhub_v9_4_err_status_regs[i]));
-               if (reg_value)
+               if (REG_GET_FIELD(reg_value, MMEA0_ERR_STATUS, 
SDP_RDRSP_STATUS) ||
+                   REG_GET_FIELD(reg_value, MMEA0_ERR_STATUS, 
SDP_WRRSP_STATUS) ||
+                   REG_GET_FIELD(reg_value, MMEA0_ERR_STATUS, 
SDP_RDRSP_DATAPARITY_ERROR)) {
+                       /* SDP read/write error/parity error in FUE_IS_FATAL 
mode
+                        * can cause system fatal error in arcturus. Harvest 
the error
+                        * status before GPU reset */
                        dev_warn(adev->dev, "MMHUB EA err detected at instance: 
%d, status: 0x%x!\n",
                                        i, reg_value);
+               }
        }
 }
 
diff --git a/drivers/gpu/drm/amd/include/asic_reg/gc/gc_9_4_1_sh_mask.h 
b/drivers/gpu/drm/amd/include/asic_reg/gc/gc_9_4_1_sh_mask.h
index 4089cfa081f5..849450caca15 100644
--- a/drivers/gpu/drm/amd/include/asic_reg/gc/gc_9_4_1_sh_mask.h
+++ b/drivers/gpu/drm/amd/include/asic_reg/gc/gc_9_4_1_sh_mask.h
@@ -617,6 +617,22 @@
 #define GCEA_EDC_CNT3__MAM_A3MEM_SEC_COUNT_MASK                                
                               0x30000000L
 #define GCEA_EDC_CNT3__MAM_A3MEM_DED_COUNT_MASK                                
                               0xC0000000L
 
+//GCEA_ERR_STATUS
+#define GCEA_ERR_STATUS__SDP_RDRSP_STATUS__SHIFT                               
                               0x0
+#define GCEA_ERR_STATUS__SDP_WRRSP_STATUS__SHIFT                               
                               0x4
+#define GCEA_ERR_STATUS__SDP_RDRSP_DATASTATUS__SHIFT                           
                               0x8
+#define GCEA_ERR_STATUS__SDP_RDRSP_DATAPARITY_ERROR__SHIFT                     
                               0xa
+#define GCEA_ERR_STATUS__CLEAR_ERROR_STATUS__SHIFT                             
                               0xb
+#define GCEA_ERR_STATUS__BUSY_ON_ERROR__SHIFT                                  
                               0xc
+#define GCEA_ERR_STATUS__FUE_FLAG__SHIFT                                       
                               0xd
+#define GCEA_ERR_STATUS__SDP_RDRSP_STATUS_MASK                                 
                               0x0000000FL
+#define GCEA_ERR_STATUS__SDP_WRRSP_STATUS_MASK                                 
                               0x000000F0L
+#define GCEA_ERR_STATUS__SDP_RDRSP_DATASTATUS_MASK                             
                               0x00000300L
+#define GCEA_ERR_STATUS__SDP_RDRSP_DATAPARITY_ERROR_MASK                       
                               0x00000400L
+#define GCEA_ERR_STATUS__CLEAR_ERROR_STATUS_MASK                               
                               0x00000800L
+#define GCEA_ERR_STATUS__BUSY_ON_ERROR_MASK                                    
                               0x00001000L
+#define GCEA_ERR_STATUS__FUE_FLAG_MASK                                         
                               0x00002000L
+
 // addressBlock: gc_gfxudec
 //GRBM_GFX_INDEX
 #define GRBM_GFX_INDEX__INSTANCE_INDEX__SHIFT                                  
                               0x0
-- 
2.17.1

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

Reply via email to