amdgpu: Change the timing of doing coredump

Trigger.Huang Thu, 15 Aug 2024 04:39:06 -0700

From: Trigger Huang <trigger.hu...@amd.com>

Do the coredump immediately after a job timeout to get a closer
representation of GPU's error status. For other code paths that
need to do the coredump, keep the original logic unchanged, except:
1,All the coredump operations will be under the control of parameter
amdgpu_gpu_coredump
2, Do the ip dump and register to dev codedump system together.


Signed-off-by: Trigger Huang <trigger.hu...@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 16 +++-------------
 drivers/gpu/drm/amd/amdgpu/amdgpu_job.c    | 10 ++++++++++
 2 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 69d6a5b7ca34..a8eb76d82921 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -5341,15 +5341,9 @@ int amdgpu_device_pre_asic_reset(struct amdgpu_device 
*adev,
                        }
                }
 
-               if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags)) {
-                       dev_info(tmp_adev->dev, "Dumping IP State\n");
-                       /* Trigger ip dump before we reset the asic */
-                       for (i = 0; i < tmp_adev->num_ip_blocks; i++)
-                               if 
(tmp_adev->ip_blocks[i].version->funcs->dump_ip_state)
-                                       tmp_adev->ip_blocks[i].version->funcs
-                                               ->dump_ip_state((void 
*)tmp_adev);
-                       dev_info(tmp_adev->dev, "Dumping IP State Completed\n");
-               }
+               if (amdgpu_gpu_coredump &&
+                   (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags)))
+                       amdgpu_device_gpu_coredump_single(tmp_adev, job);
 
                if (need_full_reset)
                        r = amdgpu_device_ip_suspend(adev);
@@ -5444,10 +5438,6 @@ int amdgpu_do_asic_reset(struct list_head 
*device_list_handle,
                                        goto out;
 
                                vram_lost = 
amdgpu_device_check_vram_lost(tmp_adev);
-
-                               if (!test_bit(AMDGPU_SKIP_COREDUMP, 
&reset_context->flags))
-                                       amdgpu_coredump(tmp_adev, vram_lost, 
reset_context);
-
                                if (vram_lost) {
                                        DRM_INFO("VRAM is lost due to GPU 
reset!\n");
                                        amdgpu_inc_vram_lost(tmp_adev);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
index c6a1783fc9ef..63869820c334 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
@@ -48,6 +48,12 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct 
drm_sched_job *s_job)
                return DRM_GPU_SCHED_STAT_ENODEV;
        }
 
+       /*
+        * Do the coredump immediately after a job timeout to get a closer
+        * representation of GPU's error status.
+        */
+       if (amdgpu_gpu_coredump)
+               amdgpu_device_gpu_coredump(adev, job);
 
        adev->job_hang = true;
 
@@ -101,6 +107,10 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct 
drm_sched_job *s_job)
                reset_context.src = AMDGPU_RESET_SRC_JOB;
                clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
 
+               /* To avoid a double coredump, as we have already done it */
+               if (amdgpu_gpu_coredump)
+                       set_bit(AMDGPU_SKIP_COREDUMP, &reset_context.flags);
+
                r = amdgpu_device_gpu_recover(ring->adev, job, &reset_context);
                if (r)
                        dev_err(adev->dev, "GPU Recovery Failed: %d\n", r);
-- 
2.34.1

[PATCH 3/3] drm/amdgpu: Change the timing of doing coredump

Reply via email to