When debugging VMC page faults and ring hangs, detailed process
information is quite helpful, especially when the issue can only be
reproduced after running for a very long time. With this information,
running only the specific sub-testcase may be enough to reproduce the
issue, which can save a lot of debugging time.

With this patch, the process information is printed in a form similar to the following.
        When a VMC page fault occurs:
[  142.978417] amdgpu 0000:00:08.0: [gfxhub] VMC page fault (src_id:0 ring:171 
vmid:2 pasid:32769),
[  142.978542] amdgpu 0000:00:08.0: for process ocltst pid 1354 thread ocltst 
pid 1354, args:./ocltst -m oclperf.so -t OCLPerfDeviceEnqueueEvent,
[  142.978693] amdgpu 0000:00:08.0:   in page starting at address 
0x0000000000000000 from 27

        When a ring hang occurs:
[ 1740.047122] [drm:amdgpu_job_timedout [amdgpu]] *ERROR* ring comp_1.0.0 
timeout, signaled seq=91571, emitted seq=91572
[ 1740.050167] [drm:amdgpu_job_timedout [amdgpu]] *ERROR* The job's process 
information is as below:
[ 1740.053160] [drm:amdgpu_job_timedout [amdgpu]] *ERROR* Process SDMA pid 2098 
thread SDMA pid 2098, cmd line:SDMA --mode goldimage_compare --offscreen 
--n-swapchain-images 3 --gpu 0 --frontend test_executor --n-test-threads 4

Signed-off-by: Trigger Huang <trigger.hu...@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c  |  3 ---
 drivers/gpu/drm/amd/amdgpu/amdgpu_job.c |  8 ++++++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c  | 11 +++++++++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h  |  1 +
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c   | 11 ++++++-----
 5 files changed, 26 insertions(+), 8 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
index 1c49b82..1a2d0c9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
@@ -235,9 +235,6 @@ static int amdgpu_cs_parser_init(struct amdgpu_cs_parser 
*p, union drm_amdgpu_cs
                p->job->uf_addr = uf_offset;
        kfree(chunk_array);
 
-       /* Use this opportunity to fill in task info for the vm */
-       amdgpu_vm_set_task_info(vm);
-
        return 0;
 
 free_all_kdata:
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
index e0af44f..c75ecb3 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
@@ -43,6 +43,14 @@ static void amdgpu_job_timedout(struct drm_sched_job *s_job)
                  job->base.sched->name, atomic_read(&ring->fence_drv.last_seq),
                  ring->fence_drv.sync_seq);
 
+       if (job->vm) {
+               struct amdgpu_task_info *ti = &(job->vm->task_info);
+
+               DRM_ERROR("The job's process information is as below:\n");
+               DRM_ERROR("Process %s, thread %s, cmd line:%s\n",
+                       ti->process_name, ti->task_name, ti->cmd_line);
+       }
+
        if (amdgpu_device_should_recover_gpu(ring->adev))
                amdgpu_device_gpu_recover(ring->adev, job);
 }
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
index e73d152..24f3cbd 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -30,6 +30,7 @@
 #include <linux/idr.h>
 #include <drm/drmP.h>
 #include <drm/amdgpu_drm.h>
+#include <linux/string_helpers.h>
 #include "amdgpu.h"
 #include "amdgpu_trace.h"
 #include "amdgpu_amdkfd.h"
@@ -3045,6 +3046,8 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct 
amdgpu_vm *vm,
                        goto error_free_root;
 
                vm->pasid = pasid;
+
+               amdgpu_vm_set_task_info(vm);
        }
 
        vm->fault_hash = init_fault_hash();
@@ -3223,6 +3226,9 @@ void amdgpu_vm_fini(struct amdgpu_device *adev, struct 
amdgpu_vm *vm)
                spin_lock_irqsave(&adev->vm_manager.pasid_lock, flags);
                idr_remove(&adev->vm_manager.pasid_idr, vm->pasid);
                spin_unlock_irqrestore(&adev->vm_manager.pasid_lock, flags);
+
+               kfree(vm->task_info.cmd_line);
+               vm->task_info.cmd_line = NULL;
        }
 
        kfree(vm->fault_hash);
@@ -3391,6 +3397,11 @@ void amdgpu_vm_set_task_info(struct amdgpu_vm *vm)
                        vm->task_info.tgid = current->group_leader->pid;
                        get_task_comm(vm->task_info.process_name, 
current->group_leader);
                }
+
+               vm->task_info.cmd_line =
+                               kstrdup_quotable_cmdline(current, GFP_KERNEL);
+               if (!vm->task_info.cmd_line)
+                       DRM_DEBUG_DRIVER("Failed to get cmdline!\n");
        }
 }
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
index e8dcfd5..9fab787 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
@@ -175,6 +175,7 @@ struct amdgpu_vm_pte_funcs {
 struct amdgpu_task_info {
        char    process_name[TASK_COMM_LEN];
        char    task_name[TASK_COMM_LEN];
+       char    *cmd_line;
        pid_t   pid;
        pid_t   tgid;
 };
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c 
b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
index bacdaef..c3e3558 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -325,11 +325,12 @@ static int gmc_v9_0_process_interrupt(struct 
amdgpu_device *adev,
                amdgpu_vm_get_task_info(adev, entry->pasid, &task_info);
 
                dev_err(adev->dev,
-                       "[%s] VMC page fault (src_id:%u ring:%u vmid:%u 
pasid:%u, for process %s pid %d thread %s pid %d)\n",
-                       entry->vmid_src ? "mmhub" : "gfxhub",
-                       entry->src_id, entry->ring_id, entry->vmid,
-                       entry->pasid, task_info.process_name, task_info.tgid,
-                       task_info.task_name, task_info.pid);
+                       "[%s] VMC page fault (src_id:%u ring:%u vmid:%u 
pasid:%u), ",
+                       entry->vmid_src ? "mmhub" : "gfxhub",  entry->src_id,
+                       entry->ring_id, entry->vmid, entry->pasid);
+               dev_err(adev->dev, "for process %s pid %d thread %s pid %d, 
args:%s,",
+                       task_info.process_name, task_info.tgid,
+                       task_info.task_name, task_info.pid, task_info.cmd_line);
                dev_err(adev->dev, "  in page starting at address 0x%016llx 
from %d\n",
                        addr, entry->client_id);
                if (!amdgpu_sriov_vf(adev))
-- 
2.7.4

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

Reply via email to