[AMD Official Use Only - AMD Internal Distribution Only]

Reviewed-by: Hawking Zhang <hawking.zh...@amd.com>

When a dynamic GECC platform is detected and default memory ECC is disabled,
let's add a kernel message reminding users to explicitly set amdgpu_ras_enable=1
before loading the driver to enable GECC if needed.

Regards,
Hawking

-----Original Message-----
From: amd-gfx <amd-gfx-boun...@lists.freedesktop.org> On Behalf Of Candice Li
Sent: Wednesday, February 12, 2025 12:23
To: amd-gfx@lists.freedesktop.org
Cc: Li, Candice <candice...@amd.com>
Subject: [PATCH] drm/amdgpu: Optimize the enablement of GECC

Enable GECC only when the default memory ECC mode is enabled or the module
parameter amdgpu_ras_enable is set to 1.

Signed-off-by: Candice Li <candice...@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h           |  1 +
 .../gpu/drm/amd/amdgpu/amdgpu_atomfirmware.c  | 18 +++++++----
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c       | 31 ++++++++++---------
 3 files changed, 29 insertions(+), 21 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index c03a586eb5a26f..7f84cc66a19b34 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -1167,6 +1167,7 @@ struct amdgpu_device {
        struct ratelimit_state          throttling_logging_rs;
        uint32_t                        ras_hw_enabled;
        uint32_t                        ras_enabled;
+       bool                            ras_default_ecc_enabled;

        bool                            no_hw_access;
        struct pci_saved_state          *pci_state;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_atomfirmware.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_atomfirmware.c
index f873dd3cae1606..eb015bdda8a749 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_atomfirmware.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_atomfirmware.c
@@ -549,9 +549,10 @@ bool amdgpu_atomfirmware_mem_ecc_supported(struct 
amdgpu_device *adev)
        u16 data_offset, size;
        union umc_info *umc_info;
        u8 frev, crev;
-       bool ecc_default_enabled = false;
+       bool mem_ecc_enabled = false;
        u8 umc_config;
        u32 umc_config1;
+       adev->ras_default_ecc_enabled = false;

        index = 
get_index_into_master_table(atom_master_list_of_data_tables_v2_1,
                        umc_info);
@@ -563,20 +564,22 @@ bool amdgpu_atomfirmware_mem_ecc_supported(struct 
amdgpu_device *adev)
                        switch (crev) {
                        case 1:
                                umc_config = 
le32_to_cpu(umc_info->v31.umc_config);
-                               ecc_default_enabled =
+                               mem_ecc_enabled =
                                        (umc_config & 
UMC_CONFIG__DEFAULT_MEM_ECC_ENABLE) ? true : false;
                                break;
                        case 2:
                                umc_config = 
le32_to_cpu(umc_info->v32.umc_config);
-                               ecc_default_enabled =
+                               mem_ecc_enabled =
                                        (umc_config & 
UMC_CONFIG__DEFAULT_MEM_ECC_ENABLE) ? true : false;
                                break;
                        case 3:
                                umc_config = 
le32_to_cpu(umc_info->v33.umc_config);
                                umc_config1 = 
le32_to_cpu(umc_info->v33.umc_config1);
-                               ecc_default_enabled =
+                               mem_ecc_enabled =
                                        ((umc_config & 
UMC_CONFIG__DEFAULT_MEM_ECC_ENABLE) ||
                                         (umc_config1 & 
UMC_CONFIG1__ENABLE_ECC_CAPABLE)) ? true : false;
+                               adev->ras_default_ecc_enabled =
+                                       (umc_config & 
UMC_CONFIG__DEFAULT_MEM_ECC_ENABLE) ? true : false;
                                break;
                        default:
                                /* unsupported crev */
@@ -585,9 +588,12 @@ bool amdgpu_atomfirmware_mem_ecc_supported(struct 
amdgpu_device *adev)
                } else if (frev == 4) {
                        switch (crev) {
                        case 0:
+                               umc_config = 
le32_to_cpu(umc_info->v40.umc_config);
                                umc_config1 = 
le32_to_cpu(umc_info->v40.umc_config1);
-                               ecc_default_enabled =
+                               mem_ecc_enabled =
                                        (umc_config1 & 
UMC_CONFIG1__ENABLE_ECC_CAPABLE) ? true : false;
+                               adev->ras_default_ecc_enabled =
+                                       (umc_config & 
UMC_CONFIG__DEFAULT_MEM_ECC_ENABLE) ? true : false;
                                break;
                        default:
                                /* unsupported crev */
@@ -599,7 +605,7 @@ bool amdgpu_atomfirmware_mem_ecc_supported(struct 
amdgpu_device *adev)
                }
        }

-       return ecc_default_enabled;
+       return mem_ecc_enabled;
 }

 /*
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
index 196b8dbffc2e28..06f6bbdc7f5e9a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
@@ -1794,7 +1794,22 @@ int psp_ras_initialize(struct psp_context *psp)
                if (ret)
                        dev_warn(adev->dev, "PSP get boot config failed\n");

-               if (!amdgpu_ras_is_supported(psp->adev, AMDGPU_RAS_BLOCK__UMC)) 
{
+               if ((adev->ras_default_ecc_enabled || amdgpu_ras_enable == 1) &&
+                   amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__UMC)) {
+                       if (boot_cfg == 1) {
+                               dev_info(adev->dev, "GECC is enabled\n");
+                       } else {
+                               /* enable GECC in next boot cycle if it is 
disabled
+                                * in boot config, or force enable GECC if 
failed to
+                                * get boot configuration
+                                */
+                               ret = psp_boot_config_set(adev, 
BOOT_CONFIG_GECC);
+                               if (ret)
+                                       dev_warn(adev->dev, "PSP set boot 
config failed\n");
+                               else
+                                       dev_warn(adev->dev, "GECC will be 
enabled in next boot cycle\n");
+                       }
+               } else {
                        if (!boot_cfg) {
                                dev_info(adev->dev, "GECC is disabled\n");
                        } else {
@@ -1809,20 +1824,6 @@ int psp_ras_initialize(struct psp_context *psp)
                                else
                                        dev_warn(adev->dev, "GECC will be 
disabled in next boot cycle if set amdgpu_ras_enable and/or amdgpu_ras_mask to 
0x0\n");
                        }
-               } else {
-                       if (boot_cfg == 1) {
-                               dev_info(adev->dev, "GECC is enabled\n");
-                       } else {
-                               /* enable GECC in next boot cycle if it is 
disabled
-                                * in boot config, or force enable GECC if 
failed to
-                                * get boot configuration
-                                */
-                               ret = psp_boot_config_set(adev, 
BOOT_CONFIG_GECC);
-                               if (ret)
-                                       dev_warn(adev->dev, "PSP set boot 
config failed\n");
-                               else
-                                       dev_warn(adev->dev, "GECC will be 
enabled in next boot cycle\n");
-                       }
                }
        }

--
2.25.1

Reply via email to