Walk the page table for the faulting address and dump the PDEs and PTEs
at all levels. Also flag discrepancies where a PDE points to a different
address than the next-level PDB or PTB BO.

Signed-off-by: Felix Kuehling <felix.kuehl...@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c |  5 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h |  2 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c  | 74 ++++++++++++++++++++++++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h  |  7 ++-
 drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c   |  5 +-
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c   |  5 +-
 6 files changed, 89 insertions(+), 9 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
index bbbf069efb77..78440748c87f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
@@ -1505,9 +1505,8 @@ static bool amdgpu_ttm_bo_eviction_valuable(struct 
ttm_buffer_object *bo,
  * This is used to access VRAM that backs a buffer object via MMIO
  * access for debugging purposes.
  */
-static int amdgpu_ttm_access_memory(struct ttm_buffer_object *bo,
-                                   unsigned long offset,
-                                   void *buf, int len, int write)
+int amdgpu_ttm_access_memory(struct ttm_buffer_object *bo, unsigned long 
offset,
+                            void *buf, int len, int write)
 {
        struct amdgpu_bo *abo = ttm_to_amdgpu_bo(bo);
        struct amdgpu_device *adev = amdgpu_ttm_adev(abo->tbo.bdev);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
index bccb8c49e597..cffbafffa9d7 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
@@ -83,6 +83,8 @@ void amdgpu_ttm_fini(struct amdgpu_device *adev);
 void amdgpu_ttm_set_buffer_funcs_status(struct amdgpu_device *adev,
                                        bool enable);
 
+int amdgpu_ttm_access_memory(struct ttm_buffer_object *bo, unsigned long 
offset,
+                            void *buf, int len, int write);
 int amdgpu_copy_buffer(struct amdgpu_ring *ring, uint64_t src_offset,
                       uint64_t dst_offset, uint32_t byte_count,
                       struct reservation_object *resv,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
index 1951f2abbdbc..1551c1252a6b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -544,6 +544,73 @@ static void amdgpu_vm_pt_next_dfs(struct amdgpu_device 
*adev,
             amdgpu_vm_pt_continue_dfs((start), (entry));                       
\
             (entry) = (cursor).entry, amdgpu_vm_pt_next_dfs((adev), &(cursor)))
 
+/**
+ * amdgpu_vm_dump_pte - dump PTEs along a page table walk
+ *
+ * @adev: amdgpu device pointer
+ * @vm: VM address space
+ * @addr: virtual address
+ *
+ * Walks the page table of @vm at @addr and prints the PDE/PTE of each level
+ * on one line. "!" marks a level whose BO address (or absence) disagrees
+ * with the previous level PDE; an unreadable entry counts as invalid.
+ */
+void amdgpu_vm_dump_pte(struct amdgpu_device *adev, struct amdgpu_vm *vm,
+                       uint64_t addr)
+{
+       static const char *level[4] = {"PDE2", "PDE1", "PDE0", "PTE"};
+       struct amdgpu_bo *root = vm->root.base.bo;
+       /* The first level is checked against the root BO's own address */
+       uint64_t last_addr = root ?
+               root->tbo.offset - adev->gmc.vram_start : 0;
+       struct amdgpu_vm_pt_cursor cursor;
+       char buf[128] = "";
+       int i = 0;
+
+       amdgpu_vm_pt_start(adev, vm, addr >> PAGE_SHIFT, &cursor);
+
+       while (cursor.level < ARRAY_SIZE(level)) {
+               struct amdgpu_bo *bo = cursor.entry->base.bo;
+               unsigned int mask, shift, idx;
+               uint64_t bo_addr, pte;
+               int r = 0;
+
+               mask = amdgpu_vm_entries_mask(adev, cursor.level);
+               shift = amdgpu_vm_level_shift(adev, cursor.level);
+               idx = (cursor.pfn >> shift) & mask;
+
+               /* "!" marks a mismatch with the previous level PDE */
+               bo_addr = bo ? bo->tbo.offset - adev->gmc.vram_start : 0;
+               i += scnprintf(buf + i, sizeof(buf) - i, "%s%s[%u]: ",
+                              bo_addr != last_addr ? "!" : "",
+                              level[cursor.level], idx);
+               last_addr = 0;
+               if (bo)
+                       r = amdgpu_ttm_access_memory(&bo->tbo,
+                                                    idx * sizeof(pte),
+                                                    &pte, sizeof(pte), false);
+               if (!bo) {
+                       i += scnprintf(buf + i, sizeof(buf) - i, "no bo ");
+               } else if (r != (int)sizeof(pte)) {
+                       /* Short or failed read: pte holds no usable value. */
+                       i += scnprintf(buf + i, sizeof(buf) - i,
+                                      "read failed (%d) ", r);
+               } else {
+                       i += scnprintf(buf + i, sizeof(buf) - i,
+                                      "0x%llx ", pte);
+                       if (pte & AMDGPU_PTE_VALID)
+                               last_addr = pte & 0xffffffff000;
+               }
+
+               /* No sub-entries below this level: the walk ends here. */
+               if (!cursor.entry->entries)
+                       break;
+               cursor.entry = &cursor.entry->entries[idx];
+               ++cursor.level;
+       }
+       dev_err(adev->dev, "%s\n", buf);
+}
+
 /**
  * amdgpu_vm_get_pd_bo - add the VM PD to a validation list
  *
@@ -3081,8 +3148,9 @@ int amdgpu_vm_ioctl(struct drm_device *dev, void *data, 
struct drm_file *filp)
  * @pasid: PASID identifier for VM
  * @task_info: task_info to fill.
  */
-void amdgpu_vm_get_task_info(struct amdgpu_device *adev, unsigned int pasid,
-                        struct amdgpu_task_info *task_info)
+struct amdgpu_vm *amdgpu_vm_get_task_info(struct amdgpu_device *adev,
+                                         unsigned int pasid,
+                                         struct amdgpu_task_info *task_info)
 {
        struct amdgpu_vm *vm;
        unsigned long flags;
@@ -3094,6 +3162,8 @@ void amdgpu_vm_get_task_info(struct amdgpu_device *adev, 
unsigned int pasid,
                *task_info = vm->task_info;
 
        spin_unlock_irqrestore(&adev->vm_manager.pasid_lock, flags);
+
+       return vm;
 }
 
 /**
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
index 489a162ca620..6a8b833d180e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
@@ -348,6 +348,8 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct 
amdgpu_vm *vm,
 int amdgpu_vm_make_compute(struct amdgpu_device *adev, struct amdgpu_vm *vm, 
unsigned int pasid);
 void amdgpu_vm_release_compute(struct amdgpu_device *adev, struct amdgpu_vm 
*vm);
 void amdgpu_vm_fini(struct amdgpu_device *adev, struct amdgpu_vm *vm);
+void amdgpu_vm_dump_pte(struct amdgpu_device *adev, struct amdgpu_vm *vm,
+                       uint64_t addr);
 void amdgpu_vm_get_pd_bo(struct amdgpu_vm *vm,
                         struct list_head *validated,
                         struct amdgpu_bo_list_entry *entry);
@@ -401,8 +403,9 @@ bool amdgpu_vm_need_pipeline_sync(struct amdgpu_ring *ring,
                                  struct amdgpu_job *job);
 void amdgpu_vm_check_compute_bug(struct amdgpu_device *adev);
 
-void amdgpu_vm_get_task_info(struct amdgpu_device *adev, unsigned int pasid,
-                            struct amdgpu_task_info *task_info);
+struct amdgpu_vm *amdgpu_vm_get_task_info(struct amdgpu_device *adev,
+                                         unsigned int pasid,
+                                         struct amdgpu_task_info *task_info);
 
 void amdgpu_vm_set_task_info(struct amdgpu_vm *vm);
 
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c 
b/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
index 8bf2ba310fd9..1a5582405a85 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
@@ -1448,9 +1448,10 @@ static int gmc_v8_0_process_interrupt(struct 
amdgpu_device *adev,
 
        if (printk_ratelimit()) {
                struct amdgpu_task_info task_info;
+               struct amdgpu_vm *vm;
 
                memset(&task_info, 0, sizeof(struct amdgpu_task_info));
-               amdgpu_vm_get_task_info(adev, entry->pasid, &task_info);
+               vm = amdgpu_vm_get_task_info(adev, entry->pasid, &task_info);
 
                dev_err(adev->dev, "GPU fault detected: %d 0x%08x for process 
%s pid %d thread %s pid %d\n",
                        entry->src_id, entry->src_data[0], 
task_info.process_name,
@@ -1461,6 +1462,8 @@ static int gmc_v8_0_process_interrupt(struct 
amdgpu_device *adev,
                        status);
                gmc_v8_0_vm_decode_fault(adev, status, addr, mc_client,
                                         entry->pasid);
+               if (vm)
+                       amdgpu_vm_dump_pte(adev, vm, addr);
        }
 
        vmid = REG_GET_FIELD(status, VM_CONTEXT1_PROTECTION_FAULT_STATUS,
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c 
b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
index bd5d36944481..f27e88af4016 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -331,9 +331,10 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device 
*adev,
 
        if (printk_ratelimit()) {
                struct amdgpu_task_info task_info;
+               struct amdgpu_vm *vm;
 
                memset(&task_info, 0, sizeof(struct amdgpu_task_info));
-               amdgpu_vm_get_task_info(adev, entry->pasid, &task_info);
+               vm = amdgpu_vm_get_task_info(adev, entry->pasid, &task_info);
 
                dev_err(adev->dev,
                        "[%s] %s page fault (src_id:%u ring:%u vmid:%u "
@@ -349,6 +350,8 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device 
*adev,
                        dev_err(adev->dev,
                                "VM_L2_PROTECTION_FAULT_STATUS:0x%08X\n",
                                status);
+               if (vm)
+                       amdgpu_vm_dump_pte(adev, vm, addr);
        }
 
        return 0;
-- 
2.17.1

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

Reply via email to