[AMD Official Use Only - AMD Internal Distribution Only]

Reviewed-by: Hawking Zhang <hawking.zh...@amd.com>

Regards,
Hawking
-----Original Message-----
From: Liu, Xiang(Dean) <xiang....@amd.com>
Sent: Friday, June 6, 2025 12:11
To: amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking <hawking.zh...@amd.com>; Liu, Xiang(Dean) <xiang....@amd.com>
Subject: [PATCH] drm/amdgpu: Add debug mask to disable CE logs

Add a debug mask bit to disable kernel logs of RAS correctable errors (CE),
including both ACA and CE error counter kernel messages.

Signed-off-by: Xiang Liu <xiang....@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h     | 1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c | 5 +++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 6 ++++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 3 +++
 4 files changed, 15 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 9f7284ab5ab6..d0982dbce31a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -1309,6 +1309,7 @@ struct amdgpu_device {
        bool                            debug_exp_resets;
        bool                            debug_disable_gpu_ring_reset;
        bool                            debug_vm_userptr;
+       bool                            debug_disable_ce_logs;

        /* Protection for the following isolation structure */
        struct mutex                    enforce_isolation_mutex;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c
index 3835f2592914..cbc40cad581b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c
@@ -115,6 +115,11 @@ static void aca_smu_bank_dump(struct amdgpu_device *adev, int idx, int total, st
        u64 event_id = qctx ? qctx->evid.event_id : RAS_EVENT_INVALID_ID;
        int i;

+       if (adev->debug_disable_ce_logs &&
+           bank->smu_err_type == ACA_SMU_TYPE_CE &&
+           !ACA_BANK_ERR_IS_DEFFERED(bank))
+               return;
+
        RAS_EVENT_LOG(adev, event_id, HW_ERR "Accelerator Check Architecture events logged\n");
        /* plus 1 for output format, e.g: ACA[08/08]: xxxx */
        for (i = 0; i < ARRAY_SIZE(aca_regs); i++)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index 7249f815feaa..fbd0e3a030de 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -144,6 +144,7 @@ enum AMDGPU_DEBUG_MASK {
        AMDGPU_DEBUG_DISABLE_GPU_RING_RESET = BIT(6),
        AMDGPU_DEBUG_SMU_POOL = BIT(7),
        AMDGPU_DEBUG_VM_USERPTR = BIT(8),
+       AMDGPU_DEBUG_DISABLE_RAS_CE_LOG = BIT(9)
 };

 unsigned int amdgpu_vram_limit = UINT_MAX;
@@ -2302,6 +2303,11 @@ static void amdgpu_init_debug_options(struct amdgpu_device *adev)
                pr_info("debug: VM mode debug for userptr is enabled\n");
                adev->debug_vm_userptr = true;
        }
+
+       if (amdgpu_debug_mask & AMDGPU_DEBUG_DISABLE_RAS_CE_LOG) {
+               pr_info("debug: disable kernel logs of correctalbe errors\n");
+               adev->debug_disable_ce_logs = true;
+       }
 }

 static unsigned long amdgpu_fix_asic_type(struct pci_dev *pdev, unsigned long flags)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 6565dc7ff9cd..505850f415ef 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1107,6 +1107,9 @@ static void amdgpu_ras_error_print_error_data(struct amdgpu_device *adev,
                                              err_info->de_count, blk_name);
                        }
                } else {
+                       if (adev->debug_disable_ce_logs)
+                               return;
+
                        for_each_ras_error(err_node, err_data) {
                                err_info = &err_node->err_info;
                                mcm_info = &err_info->mcm_info;
--
2.34.1

Reply via email to