If vm_context_cntl sets xnack on, then a GPU vm fault has the retry_fault
bit set, but the driver selects the xnack on or off path depending on the
per-process xnack setting, which is also used to set qpd mem_config xnack
on or off if KFD_SUPPORT_XNACK_PER_PROCESS.

If the process is xnack on, then the GPU page fault shows the retry page
fault message; otherwise it shows the no-retry page fault message. This
avoids misleading output when debugging application page fault issues.

The process lookup from pasid is done inside the retry fault handler
svm_range_restore_pages, so add an xnack_on parameter to pass the process
xnack setting back to amdgpu_vm_handle_fault and then to the gmc interrupt
handler, which uses it to show the vm fault message.

Signed-off-by: Philip Yang <[email protected]>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 5 +++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h | 2 +-
 drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 2 +-
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c  | 7 ++++---
 drivers/gpu/drm/amd/amdkfd/kfd_svm.c   | 4 +++-
 drivers/gpu/drm/amd/amdkfd/kfd_svm.h   | 2 +-
 6 files changed, 13 insertions(+), 9 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
index 64ddc87f7fb6..58f7ab193027 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -2757,13 +2757,14 @@ int amdgpu_vm_ioctl(struct drm_device *dev, void *data, 
struct drm_file *filp)
  *           GFX 9.4.3.
  * @addr: Address of the fault
  * @write_fault: true is write fault, false is read fault
+ * @xnack_on: return value, true if the process sets xnack on
  *
  * Try to gracefully handle a VM fault. Return true if the fault was handled 
and
  * shouldn't be reported any more.
  */
 bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, u32 pasid,
                            u32 vmid, u32 node_id, uint64_t addr,
-                           bool write_fault)
+                           bool write_fault, bool *xnack_on)
 {
        bool is_compute_context = false;
        struct amdgpu_bo *root;
@@ -2788,7 +2789,7 @@ bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, 
u32 pasid,
        addr /= AMDGPU_GPU_PAGE_SIZE;
 
        if (is_compute_context && !svm_range_restore_pages(adev, pasid, vmid,
-           node_id, addr, write_fault)) {
+           node_id, addr, write_fault, xnack_on)) {
                amdgpu_bo_unref(&root);
                return true;
        }
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
index bc71b44387b2..7f364f0b9a60 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
@@ -549,7 +549,7 @@ void amdgpu_vm_put_task_info(struct amdgpu_task_info 
*task_info);
 
 bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, u32 pasid,
                            u32 vmid, u32 node_id, uint64_t addr,
-                           bool write_fault);
+                           bool write_fault, bool *xnack_on);
 
 void amdgpu_vm_set_task_info(struct amdgpu_vm *vm);
 
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c 
b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
index d933e19e0cf5..2f0752376236 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
@@ -132,7 +132,7 @@ static int gmc_v10_0_process_interrupt(struct amdgpu_device 
*adev,
                /* Try to handle the recoverable page faults by filling page
                 * tables
                 */
-               if (amdgpu_vm_handle_fault(adev, entry->pasid, 0, 0, addr, 
write_fault))
+               if (amdgpu_vm_handle_fault(adev, entry->pasid, 0, 0, addr, 
write_fault, NULL))
                        return 1;
        }
 
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c 
b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
index 671a6766df5b..3db0f2304b6a 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -558,6 +558,7 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device 
*adev,
        uint32_t cam_index = 0;
        int ret, xcc_id = 0;
        uint32_t node_id;
+       bool xnack_on = false;
 
        node_id = entry->node_id;
 
@@ -595,7 +596,7 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device 
*adev,
                        cam_index = entry->src_data[2] & 0x3ff;
 
                        ret = amdgpu_vm_handle_fault(adev, entry->pasid, 
entry->vmid, node_id,
-                                                    addr, write_fault);
+                                                    addr, write_fault, 
&xnack_on);
                        WDOORBELL32(adev->irq.retry_cam_doorbell_index, 
cam_index);
                        if (ret)
                                return 1;
@@ -618,7 +619,7 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device 
*adev,
                         * tables
                         */
                        if (amdgpu_vm_handle_fault(adev, entry->pasid, 
entry->vmid, node_id,
-                                                  addr, write_fault))
+                                                  addr, write_fault, 
&xnack_on))
                                return 1;
                }
        }
@@ -628,7 +629,7 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device 
*adev,
 
        dev_err(adev->dev,
                "[%s] %s page fault (src_id:%u ring:%u vmid:%u pasid:%u)\n", 
hub_name,
-               retry_fault ? "retry" : "no-retry",
+               (retry_fault && xnack_on) ? "retry" : "no-retry",
                entry->src_id, entry->ring_id, entry->vmid, entry->pasid);
 
        task_info = amdgpu_vm_get_task_info_pasid(adev, entry->pasid);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
index 234ea0fbfa0c..9d44a52bc4b2 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
@@ -2887,7 +2887,7 @@ svm_fault_allowed(struct vm_area_struct *vma, bool 
write_fault)
 int
 svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid,
                        uint32_t vmid, uint32_t node_id,
-                       uint64_t addr, bool write_fault)
+                       uint64_t addr, bool write_fault, bool *xnack_on)
 {
        unsigned long start, last, size;
        struct mm_struct *mm = NULL;
@@ -2923,6 +2923,8 @@ svm_range_restore_pages(struct amdgpu_device *adev, 
unsigned int pasid,
                goto out;
        }
 
+       if (xnack_on)
+               *xnack_on = p->xnack_enabled;
        if (!p->xnack_enabled) {
                pr_debug("XNACK not enabled for pasid 0x%x\n", pasid);
                r = -EFAULT;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.h 
b/drivers/gpu/drm/amd/amdkfd/kfd_svm.h
index 22f22b06a2f4..402f6fbb6452 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.h
@@ -182,7 +182,7 @@ int svm_range_vram_node_new(struct kfd_node *node, struct 
svm_range *prange,
 void svm_range_vram_node_free(struct svm_range *prange);
 int svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid,
                            uint32_t vmid, uint32_t node_id, uint64_t addr,
-                           bool write_fault);
+                           bool write_fault, bool *xnack_on);
 int svm_range_schedule_evict_svm_bo(struct amdgpu_amdkfd_fence *fence);
 void svm_range_add_list_work(struct svm_range_list *svms,
                             struct svm_range *prange, struct mm_struct *mm,
-- 
2.43.2

Reply via email to