When an uncorrected error is injected while a background workload is
running, the deferred errors among the uncorrected errors need to be
identified by checking the deferred and poison bits of the status register.

v2: refine checking for deferred error
v2: log possible DEs among CEs
v2: generate CPER records for DEs among UEs

Signed-off-by: Xiang Liu <xiang....@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c | 25 +++++++++++++++++++++++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_aca.h | 16 +++++++++++-----
 drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c |  8 ++++----
 3 files changed, 38 insertions(+), 11 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c
index ffd4c64e123c..dc47f5fd4ea1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c
@@ -391,6 +391,7 @@ static void aca_banks_generate_cper(struct amdgpu_device 
*adev,
 {
        struct aca_bank_node *node;
        struct aca_bank *bank;
+       int r;
 
        if (!adev->cper.enabled)
                return;
@@ -402,11 +403,27 @@ static void aca_banks_generate_cper(struct amdgpu_device 
*adev,
 
        /* UEs must be encoded into separate CPER entries */
        if (type == ACA_SMU_TYPE_UE) {
+               struct aca_banks de_banks;
+
+               aca_banks_init(&de_banks);
                list_for_each_entry(node, &banks->list, node) {
                        bank = &node->bank;
-                       if (amdgpu_cper_generate_ue_record(adev, bank))
-                               dev_warn(adev->dev, "fail to generate ue cper 
records\n");
+                       if (bank->aca_err_type == ACA_ERROR_TYPE_DEFERRED) {
+                               r = aca_banks_add_bank(&de_banks, bank);
+                               if (r)
+                                       dev_warn(adev->dev, "fail to add de 
banks, ret = %d\n", r);
+                       } else {
+                               if (amdgpu_cper_generate_ue_record(adev, bank))
+                                       dev_warn(adev->dev, "fail to generate 
ue cper records\n");
+                       }
+               }
+
+               if (!list_empty(&de_banks.list)) {
+                       if (amdgpu_cper_generate_ce_records(adev, &de_banks, 
de_banks.nr_banks))
+                               dev_warn(adev->dev, "fail to generate de cper 
records\n");
                }
+
+               aca_banks_release(&de_banks);
        } else {
                /*
                 * SMU_TYPE_CE banks are combined into 1 CPER entries,
@@ -541,6 +558,10 @@ static int __aca_get_error_data(struct amdgpu_device 
*adev, struct aca_handle *h
        if (ret)
                return ret;
 
+       /* DEs may contain in CEs or UEs */
+       if (type != ACA_ERROR_TYPE_DEFERRED)
+               aca_log_aca_error(handle, ACA_ERROR_TYPE_DEFERRED, err_data);
+
        return aca_log_aca_error(handle, type, err_data);
 }
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.h
index 6f62e5d80ed6..6b180f1b33fd 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.h
@@ -76,11 +76,17 @@ struct ras_query_context;
 #define mmSMNAID_XCD1_MCA_SMU          0x38430400      /* SMN AID XCD1 */
 #define mmSMNXCD_XCD0_MCA_SMU          0x40430400      /* SMN XCD XCD0 */
 
-#define ACA_BANK_ERR_CE_DE_DECODE(bank)                                  \
-       ((ACA_REG__STATUS__POISON((bank)->regs[ACA_REG_IDX_STATUS]) ||   \
-         ACA_REG__STATUS__DEFERRED((bank)->regs[ACA_REG_IDX_STATUS])) ? \
-               ACA_ERROR_TYPE_DEFERRED :                                \
-               ACA_ERROR_TYPE_CE)
+#define ACA_BANK_ERR_IS_DEFFERED(bank)                                \
+       (ACA_REG__STATUS__POISON((bank)->regs[ACA_REG_IDX_STATUS]) || \
+        ACA_REG__STATUS__DEFERRED((bank)->regs[ACA_REG_IDX_STATUS]))
+
+#define ACA_BANK_ERR_CE_DE_DECODE(bank)                             \
+       (ACA_BANK_ERR_IS_DEFFERED(bank) ? ACA_ERROR_TYPE_DEFERRED : \
+                                         ACA_ERROR_TYPE_CE)
+
+#define ACA_BANK_ERR_UE_DE_DECODE(bank)                             \
+       (ACA_BANK_ERR_IS_DEFFERED(bank) ? ACA_ERROR_TYPE_DEFERRED : \
+                                         ACA_ERROR_TYPE_UE)
 
 enum aca_reg_idx {
        ACA_REG_IDX_CTL                 = 0,
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
index c0de682b7774..e62072779ae1 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
@@ -876,7 +876,7 @@ static int gfx_v9_4_3_aca_bank_parser(struct aca_handle 
*handle,
                                      void *data)
 {
        struct aca_bank_info info;
-       u64 misc0;
+       u64 misc0, status;
        u32 instlo;
        int ret;
 
@@ -890,12 +890,12 @@ static int gfx_v9_4_3_aca_bank_parser(struct aca_handle 
*handle,
        info.die_id = instlo == mmSMNAID_XCD0_MCA_SMU ? 0 : 1;
 
        misc0 = bank->regs[ACA_REG_IDX_MISC0];
+       status = bank->regs[ACA_REG_IDX_STATUS];
 
        switch (type) {
        case ACA_SMU_TYPE_UE:
-               bank->aca_err_type = ACA_ERROR_TYPE_UE;
-               ret = aca_error_cache_log_bank_error(handle, &info,
-                                                    ACA_ERROR_TYPE_UE, 1ULL);
+               bank->aca_err_type = ACA_BANK_ERR_UE_DE_DECODE(bank);
+               ret = aca_error_cache_log_bank_error(handle, &info, 
bank->aca_err_type, 1ULL);
                break;
        case ACA_SMU_TYPE_CE:
                bank->aca_err_type = ACA_BANK_ERR_CE_DE_DECODE(bank);
-- 
2.34.1

Reply via email to