AMDGPU_RAS_GPU_ERR_BOOT_STATUS field is no longer valid.
The polling sequence is also simplifed according to
the latest firmware change.

Signed-off-by: Hawking Zhang <hawking.zh...@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 99 +++++++++++--------------
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h |  4 +-
 2 files changed, 46 insertions(+), 57 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index eedf2b613ac2..2c338d39cd45 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -4416,64 +4416,74 @@ int amdgpu_ras_error_statistic_de_count(struct 
ras_err_data *err_data,
 #define mmMP0_SMN_C2PMSG_92    0x1609C
 #define mmMP0_SMN_C2PMSG_126   0x160BE
 static void amdgpu_ras_boot_time_error_reporting(struct amdgpu_device *adev,
-                                                u32 instance, u32 boot_error)
+                                                u32 instance)
 {
        u32 socket_id, aid_id, hbm_id;
-       u32 reg_data;
+       u32 fw_status;
+       u32 boot_error;
        u64 reg_addr;
 
-       socket_id = AMDGPU_RAS_GPU_ERR_SOCKET_ID(boot_error);
-       aid_id = AMDGPU_RAS_GPU_ERR_AID_ID(boot_error);
-       hbm_id = AMDGPU_RAS_GPU_ERR_HBM_ID(boot_error);
-
        /* The pattern for smn addressing in other SOC could be different from
         * the one for aqua_vanjaram. We should revisit the code if the pattern
         * is changed. In such case, replace the aqua_vanjaram implementation
         * with more common helper */
        reg_addr = (mmMP0_SMN_C2PMSG_92 << 2) +
                   aqua_vanjaram_encode_ext_smn_addressing(instance);
+       fw_status = amdgpu_device_indirect_rreg_ext(adev, reg_addr);
+
+       reg_addr = (mmMP0_SMN_C2PMSG_126 << 2) +
+                  aqua_vanjaram_encode_ext_smn_addressing(instance);
+       boot_error = amdgpu_device_indirect_rreg_ext(adev, reg_addr);
 
-       reg_data = amdgpu_device_indirect_rreg_ext(adev, reg_addr);
-       dev_err(adev->dev, "socket: %d, aid: %d, firmware boot failed, fw 
status is 0x%x\n",
-               socket_id, aid_id, reg_data);
+       socket_id = AMDGPU_RAS_GPU_ERR_SOCKET_ID(boot_error);
+       aid_id = AMDGPU_RAS_GPU_ERR_AID_ID(boot_error);
+       hbm_id = AMDGPU_RAS_GPU_ERR_HBM_ID(boot_error);
 
        if (AMDGPU_RAS_GPU_ERR_MEM_TRAINING(boot_error))
-               dev_info(adev->dev, "socket: %d, aid: %d, hbm: %d, memory 
training failed\n",
-                        socket_id, aid_id, hbm_id);
+               dev_info(adev->dev,
+                        "socket: %d, aid: %d, hbm: %d, fw_status: 0x%x, memory 
training failed\n",
+                        socket_id, aid_id, hbm_id, fw_status);
 
        if (AMDGPU_RAS_GPU_ERR_FW_LOAD(boot_error))
-               dev_info(adev->dev, "socket: %d, aid: %d, firmware load failed 
at boot time\n",
-                        socket_id, aid_id);
+               dev_info(adev->dev,
+                        "socket: %d, aid: %d, fw_status: 0x%x, firmware load 
failed at boot time\n",
+                        socket_id, aid_id, fw_status);
 
        if (AMDGPU_RAS_GPU_ERR_WAFL_LINK_TRAINING(boot_error))
-               dev_info(adev->dev, "socket: %d, aid: %d, wafl link training 
failed\n",
-                        socket_id, aid_id);
+               dev_info(adev->dev,
+                        "socket: %d, aid: %d, fw_status: 0x%x, wafl link 
training failed\n",
+                        socket_id, aid_id, fw_status);
 
        if (AMDGPU_RAS_GPU_ERR_XGMI_LINK_TRAINING(boot_error))
-               dev_info(adev->dev, "socket: %d, aid: %d, xgmi link training 
failed\n",
-                        socket_id, aid_id);
+               dev_info(adev->dev,
+                        "socket: %d, aid: %d, fw_status: 0x%x, xgmi link 
training failed\n",
+                        socket_id, aid_id, fw_status);
 
        if (AMDGPU_RAS_GPU_ERR_USR_CP_LINK_TRAINING(boot_error))
-               dev_info(adev->dev, "socket: %d, aid: %d, usr cp link training 
failed\n",
-                        socket_id, aid_id);
+               dev_info(adev->dev,
+                        "socket: %d, aid: %d, fw_status: 0x%x, usr cp link 
training failed\n",
+                        socket_id, aid_id, fw_status);
 
        if (AMDGPU_RAS_GPU_ERR_USR_DP_LINK_TRAINING(boot_error))
-               dev_info(adev->dev, "socket: %d, aid: %d, usr dp link training 
failed\n",
-                        socket_id, aid_id);
+               dev_info(adev->dev,
+                        "socket: %d, aid: %d, fw_status: 0x%x, usr dp link 
training failed\n",
+                        socket_id, aid_id, fw_status);
 
        if (AMDGPU_RAS_GPU_ERR_HBM_MEM_TEST(boot_error))
-               dev_info(adev->dev, "socket: %d, aid: %d, hbm: %d, hbm memory 
test failed\n",
-                        socket_id, aid_id, hbm_id);
+               dev_info(adev->dev,
+                        "socket: %d, aid: %d, hbm: %d, fw_status: 0x%x, hbm 
memory test failed\n",
+                        socket_id, aid_id, hbm_id, fw_status);
 
        if (AMDGPU_RAS_GPU_ERR_HBM_BIST_TEST(boot_error))
-               dev_info(adev->dev, "socket: %d, aid: %d, hbm: %d, hbm bist 
test failed\n",
-                        socket_id, aid_id, hbm_id);
+               dev_info(adev->dev,
+                        "socket: %d, aid: %d, hbm: %d, fw_status: 0x%x, hbm 
bist test failed\n",
+                        socket_id, aid_id, hbm_id, fw_status);
 }
 
-static int amdgpu_ras_wait_for_boot_complete(struct amdgpu_device *adev,
-                                            u32 instance, u32 *boot_error)
+static bool amdgpu_ras_boot_error_detected(struct amdgpu_device *adev,
+                                          u32 instance)
 {
-       u32 reg_addr;
+       u64 reg_addr;
        u32 reg_data;
        int retry_loop;
 
@@ -4482,41 +4492,22 @@ static int amdgpu_ras_wait_for_boot_complete(struct 
amdgpu_device *adev,
 
        for (retry_loop = 0; retry_loop < AMDGPU_RAS_BOOT_STATUS_POLLING_LIMIT; 
retry_loop++) {
                reg_data = amdgpu_device_indirect_rreg_ext(adev, reg_addr);
-               if ((reg_data & AMDGPU_RAS_BOOT_STATUS_MASK) == 
AMDGPU_RAS_BOOT_STEADY_STATUS) {
-                       *boot_error = AMDGPU_RAS_BOOT_SUCEESS;
-                       return 0;
-               }
-               msleep(1);
-       }
-
-       /* The pattern for smn addressing in other SOC could be different from
-        * the one for aqua_vanjaram. We should revisit the code if the pattern
-        * is changed. In such case, replace the aqua_vanjaram implementation
-        * with more common helper */
-       reg_addr = (mmMP0_SMN_C2PMSG_126 << 2) +
-                  aqua_vanjaram_encode_ext_smn_addressing(instance);
-
-       for (retry_loop = 0; retry_loop < AMDGPU_RAS_BOOT_STATUS_POLLING_LIMIT; 
retry_loop++) {
-               reg_data = amdgpu_device_indirect_rreg_ext(adev, reg_addr);
-               if (AMDGPU_RAS_GPU_ERR_BOOT_STATUS(reg_data)) {
-                       *boot_error = reg_data;
-                       return 0;
-               }
-               msleep(1);
+               if ((reg_data & AMDGPU_RAS_BOOT_STATUS_MASK) == 
AMDGPU_RAS_BOOT_STEADY_STATUS)
+                       return false;
+               else
+                       msleep(1);
        }
 
-       *boot_error = reg_data;
-       return -ETIME;
+       return true;
 }
 
 void amdgpu_ras_query_boot_status(struct amdgpu_device *adev, u32 
num_instances)
 {
-       u32 boot_error = 0;
        u32 i;
 
        for (i = 0; i < num_instances; i++) {
-               if (amdgpu_ras_wait_for_boot_complete(adev, i, &boot_error))
-                       amdgpu_ras_boot_time_error_reporting(adev, i, 
boot_error);
+               if (amdgpu_ras_boot_error_detected(adev, i))
+                       amdgpu_ras_boot_time_error_reporting(adev, i);
        }
 }
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index 7021c4a66fb5..d0a125743d3a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -47,12 +47,10 @@ struct amdgpu_iv_entry;
 #define AMDGPU_RAS_GPU_ERR_SOCKET_ID(x)                        
AMDGPU_GET_REG_FIELD(x, 10, 8)
 #define AMDGPU_RAS_GPU_ERR_AID_ID(x)                   AMDGPU_GET_REG_FIELD(x, 
12, 11)
 #define AMDGPU_RAS_GPU_ERR_HBM_ID(x)                   AMDGPU_GET_REG_FIELD(x, 
14, 13)
-#define AMDGPU_RAS_GPU_ERR_BOOT_STATUS(x)              AMDGPU_GET_REG_FIELD(x, 
31, 31)
 
-#define AMDGPU_RAS_BOOT_STATUS_POLLING_LIMIT   1000
+#define AMDGPU_RAS_BOOT_STATUS_POLLING_LIMIT   100
 #define AMDGPU_RAS_BOOT_STEADY_STATUS          0xBA
 #define AMDGPU_RAS_BOOT_STATUS_MASK            0xFF
-#define AMDGPU_RAS_BOOT_SUCEESS                        0x80000000
 
 #define AMDGPU_RAS_FLAG_INIT_BY_VBIOS          (0x1 << 0)
 /* position of instance value in sub_block_index of
-- 
2.17.1

Reply via email to