[AMD Official Use Only - AMD Internal Distribution Only]

Reviewed-by: Hawking Zhang <hawking.zh...@amd.com>

Regards,
Hawking
-----Original Message-----
From: Liu, Xiang(Dean) <xiang....@amd.com>
Sent: Thursday, March 20, 2025 17:39
To: amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking <hawking.zh...@amd.com>; Wang, Yang(Kevin) 
<kevinyang.w...@amd.com>; Zhou1, Tao <tao.zh...@amd.com>; Chai, Thomas 
<yipeng.c...@amd.com>; Liu, Xiang(Dean) <xiang....@amd.com>
Subject: [PATCH v2] drm/amdgpu: Decode deferred error type in gfx aca bank 
parser

When an uncorrected error is injected while a background workload is running, 
the deferred errors among the uncorrected errors need to be identified by 
checking the deferred and poison bits of the status register.

v2: refine checking for deferred error
v2: log possible DEs among CEs
v2: generate CPER records for DEs among UEs

Signed-off-by: Xiang Liu <xiang....@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c | 25 +++++++++++++++++++++++--  
drivers/gpu/drm/amd/amdgpu/amdgpu_aca.h | 16 +++++++++++-----  
drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c |  8 ++++----
 3 files changed, 38 insertions(+), 11 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c
index ffd4c64e123c..dc47f5fd4ea1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c
@@ -391,6 +391,7 @@ static void aca_banks_generate_cper(struct amdgpu_device 
*adev,  {
        struct aca_bank_node *node;
        struct aca_bank *bank;
+       int r;

        if (!adev->cper.enabled)
                return;
@@ -402,11 +403,27 @@ static void aca_banks_generate_cper(struct amdgpu_device 
*adev,

        /* UEs must be encoded into separate CPER entries */
        if (type == ACA_SMU_TYPE_UE) {
+               struct aca_banks de_banks;
+
+               aca_banks_init(&de_banks);
                list_for_each_entry(node, &banks->list, node) {
                        bank = &node->bank;
-                       if (amdgpu_cper_generate_ue_record(adev, bank))
-                               dev_warn(adev->dev, "fail to generate ue cper 
records\n");
+                       if (bank->aca_err_type == ACA_ERROR_TYPE_DEFERRED) {
+                               r = aca_banks_add_bank(&de_banks, bank);
+                               if (r)
+                                       dev_warn(adev->dev, "fail to add de 
banks, ret = %d\n", r);
+                       } else {
+                               if (amdgpu_cper_generate_ue_record(adev, bank))
+                                       dev_warn(adev->dev, "fail to generate 
ue cper records\n");
+                       }
+               }
+
+               if (!list_empty(&de_banks.list)) {
+                       if (amdgpu_cper_generate_ce_records(adev, &de_banks, 
de_banks.nr_banks))
+                               dev_warn(adev->dev, "fail to generate de cper 
records\n");
                }
+
+               aca_banks_release(&de_banks);
        } else {
                /*
                 * SMU_TYPE_CE banks are combined into 1 CPER entries, @@ 
-541,6 +558,10 @@ static int __aca_get_error_data(struct amdgpu_device *adev, 
struct aca_handle *h
        if (ret)
                return ret;

+       /* DEs may contain in CEs or UEs */
+       if (type != ACA_ERROR_TYPE_DEFERRED)
+               aca_log_aca_error(handle, ACA_ERROR_TYPE_DEFERRED, err_data);
+
        return aca_log_aca_error(handle, type, err_data);  }

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.h
index 6f62e5d80ed6..6b180f1b33fd 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.h
@@ -76,11 +76,17 @@ struct ras_query_context;
 #define mmSMNAID_XCD1_MCA_SMU          0x38430400      /* SMN AID XCD1 */
 #define mmSMNXCD_XCD0_MCA_SMU          0x40430400      /* SMN XCD XCD0 */

-#define ACA_BANK_ERR_CE_DE_DECODE(bank)                                  \
-       ((ACA_REG__STATUS__POISON((bank)->regs[ACA_REG_IDX_STATUS]) ||   \
-         ACA_REG__STATUS__DEFERRED((bank)->regs[ACA_REG_IDX_STATUS])) ? \
-               ACA_ERROR_TYPE_DEFERRED :                                \
-               ACA_ERROR_TYPE_CE)
+#define ACA_BANK_ERR_IS_DEFFERED(bank)                                \
+       (ACA_REG__STATUS__POISON((bank)->regs[ACA_REG_IDX_STATUS]) || \
+        ACA_REG__STATUS__DEFERRED((bank)->regs[ACA_REG_IDX_STATUS]))
+
+#define ACA_BANK_ERR_CE_DE_DECODE(bank)                             \
+       (ACA_BANK_ERR_IS_DEFFERED(bank) ? ACA_ERROR_TYPE_DEFERRED : \
+                                         ACA_ERROR_TYPE_CE)
+
+#define ACA_BANK_ERR_UE_DE_DECODE(bank)                             \
+       (ACA_BANK_ERR_IS_DEFFERED(bank) ? ACA_ERROR_TYPE_DEFERRED : \
+                                         ACA_ERROR_TYPE_UE)

 enum aca_reg_idx {
        ACA_REG_IDX_CTL                 = 0,
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
index c0de682b7774..e62072779ae1 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
@@ -876,7 +876,7 @@ static int gfx_v9_4_3_aca_bank_parser(struct aca_handle 
*handle,
                                      void *data)
 {
        struct aca_bank_info info;
-       u64 misc0;
+       u64 misc0, status;
        u32 instlo;
        int ret;

@@ -890,12 +890,12 @@ static int gfx_v9_4_3_aca_bank_parser(struct aca_handle 
*handle,
        info.die_id = instlo == mmSMNAID_XCD0_MCA_SMU ? 0 : 1;

        misc0 = bank->regs[ACA_REG_IDX_MISC0];
+       status = bank->regs[ACA_REG_IDX_STATUS];

        switch (type) {
        case ACA_SMU_TYPE_UE:
-               bank->aca_err_type = ACA_ERROR_TYPE_UE;
-               ret = aca_error_cache_log_bank_error(handle, &info,
-                                                    ACA_ERROR_TYPE_UE, 1ULL);
+               bank->aca_err_type = ACA_BANK_ERR_UE_DE_DECODE(bank);
+               ret = aca_error_cache_log_bank_error(handle, &info,
+bank->aca_err_type, 1ULL);
                break;
        case ACA_SMU_TYPE_CE:
                bank->aca_err_type = ACA_BANK_ERR_CE_DE_DECODE(bank);
--
2.34.1

Reply via email to