default:
+               dev_warn(adev->dev,
+                        "UMC address to Physical address translation is not 
supported\n");
+               return NOTIFY_DONE;

Before returning, maybe it's necessary to free err_data.err_addr?

Regards,
Guchun

-----Original Message-----
From: amd-gfx <amd-gfx-boun...@lists.freedesktop.org> On Behalf Of Hawking Zhang
Sent: Friday, October 14, 2022 2:19 PM
To: amd-gfx@lists.freedesktop.org; Zhou1, Tao <tao.zh...@amd.com>; Yang, 
Stanley <stanley.y...@amd.com>
Cc: Russell, Kent <kent.russ...@amd.com>; Zhang, Hawking <hawking.zh...@amd.com>
Subject: [PATCH] drm/amdgpu: move convert_error_address out of umc_ras

The RAS error address translation algorithm is common across dGPU and A + A
platforms as long as the SOC integrates the same generation of UMC IP.

UMC RAS is managed by x86 MCA on the A + A platform, so umc_ras in the GPU
driver is not initialized at all there. In such a case, any umc_ras callback
implemented for the dGPU config shouldn't be invoked from an A + A specific
callback.

The change moves convert_error_address out of the dGPU umc_ras structure and
shares it between the A + A and dGPU configs.

Signed-off-by: Hawking Zhang <hawking.zh...@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 15 +++++++++++----
 drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h |  3 ---
 drivers/gpu/drm/amd/amdgpu/umc_v6_7.c   |  7 +++----
 drivers/gpu/drm/amd/amdgpu/umc_v6_7.h   |  4 +++-
 4 files changed, 17 insertions(+), 12 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 75f1402101f4..ff92ea99d513 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -36,6 +36,7 @@
 #include "ivsrcid/nbio/irqsrcs_nbif_7_4.h"
 #include "atom.h"
 #include "amdgpu_reset.h"
+#include "umc_v6_7.h"
 
 #ifdef CONFIG_X86_MCE_AMD
 #include <asm/mce.h>
@@ -2885,10 +2886,16 @@ static int amdgpu_bad_page_notifier(struct 
notifier_block *nb,
        /*
         * Translate UMC channel address to Physical address
         */
-       if (adev->umc.ras &&
-           adev->umc.ras->convert_ras_error_address)
-               adev->umc.ras->convert_ras_error_address(adev,
-                       &err_data, m->addr, ch_inst, umc_inst);
+       switch (adev->ip_versions[UMC_HWIP][0]) {
+       case IP_VERSION(6, 7, 0):
+               umc_v6_7_convert_error_address(adev,
+                               &err_data, m->addr, ch_inst, umc_inst);
+               break;
+       default:
+               dev_warn(adev->dev,
+                        "UMC address to Physical address translation is not 
supported\n");
+               return NOTIFY_DONE;
+       }
 
        if (amdgpu_bad_page_threshold != 0) {
                amdgpu_ras_add_bad_pages(adev, err_data.err_addr, diff --git 
a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
index e46439274f3a..3629d8f292ef 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
@@ -51,9 +51,6 @@ struct amdgpu_umc_ras {
        struct amdgpu_ras_block_object ras_block;
        void (*err_cnt_init)(struct amdgpu_device *adev);
        bool (*query_ras_poison_mode)(struct amdgpu_device *adev);
-       void (*convert_ras_error_address)(struct amdgpu_device *adev,
-                               struct ras_err_data *err_data, uint64_t 
err_addr,
-                               uint32_t ch_inst, uint32_t umc_inst);
        void (*ecc_info_query_ras_error_count)(struct amdgpu_device *adev,
                                      void *ras_error_status);
        void (*ecc_info_query_ras_error_address)(struct amdgpu_device *adev, 
diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v6_7.c 
b/drivers/gpu/drm/amd/amdgpu/umc_v6_7.c
index 5d5d031c9e7d..72fd963f178b 100644
--- a/drivers/gpu/drm/amd/amdgpu/umc_v6_7.c
+++ b/drivers/gpu/drm/amd/amdgpu/umc_v6_7.c
@@ -187,9 +187,9 @@ static void umc_v6_7_ecc_info_query_ras_error_count(struct 
amdgpu_device *adev,
        }
 }
 
-static void umc_v6_7_convert_error_address(struct amdgpu_device *adev,
-                                       struct ras_err_data *err_data, uint64_t 
err_addr,
-                                       uint32_t ch_inst, uint32_t umc_inst)
+void umc_v6_7_convert_error_address(struct amdgpu_device *adev,
+                                   struct ras_err_data *err_data, uint64_t 
err_addr,
+                                   uint32_t ch_inst, uint32_t umc_inst)
 {
        uint32_t channel_index;
        uint64_t soc_pa, retired_page, column; @@ -553,5 +553,4 @@ struct 
amdgpu_umc_ras umc_v6_7_ras = {
        .query_ras_poison_mode = umc_v6_7_query_ras_poison_mode,
        .ecc_info_query_ras_error_count = 
umc_v6_7_ecc_info_query_ras_error_count,
        .ecc_info_query_ras_error_address = 
umc_v6_7_ecc_info_query_ras_error_address,
-       .convert_ras_error_address = umc_v6_7_convert_error_address,
 };
diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v6_7.h 
b/drivers/gpu/drm/amd/amdgpu/umc_v6_7.h
index fe41ed2f5945..105245d5b6e5 100644
--- a/drivers/gpu/drm/amd/amdgpu/umc_v6_7.h
+++ b/drivers/gpu/drm/amd/amdgpu/umc_v6_7.h
@@ -71,5 +71,7 @@ extern const uint32_t
        
umc_v6_7_channel_idx_tbl_second[UMC_V6_7_UMC_INSTANCE_NUM][UMC_V6_7_CHANNEL_INSTANCE_NUM];
 extern const uint32_t
        
umc_v6_7_channel_idx_tbl_first[UMC_V6_7_UMC_INSTANCE_NUM][UMC_V6_7_CHANNEL_INSTANCE_NUM];
-
+void umc_v6_7_convert_error_address(struct amdgpu_device *adev,
+                                    struct ras_err_data *err_data, uint64_t 
err_addr,
+                                    uint32_t ch_inst, uint32_t 
+umc_inst);
 #endif
--
2.17.1

Reply via email to