During Mode 1 reset, the ASIC undergoes a reset cycle and becomes temporarily
inaccessible via PCIe. Any attempt to access framebuffer or MMIO registers
during this window can result in uncompleted PCIe transactions, leading to NMI
panics or system hangs.

To prevent this, unmap all of the applications' mappings of the framebuffer
and doorbell BARs before Mode 1 reset. Also prevent new mappings from being
established during the reset process.

Signed-off-by: Yifan Zhang <[email protected]>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h      |  1 +
 .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c    |  9 +++++++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c      |  6 ++++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c         | 17 +++++++++++++++--
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c        | 10 ++++++++++
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h           |  3 +++
 6 files changed, 44 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
index cdbab7f8cee8..2b4108f83f48 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
@@ -358,6 +358,7 @@ int amdgpu_amdkfd_reserve_mem_limit(struct amdgpu_device 
*adev,
                uint64_t size, u32 alloc_flag, int8_t xcp_id);
 void amdgpu_amdkfd_unreserve_mem_limit(struct amdgpu_device *adev,
                uint64_t size, u32 alloc_flag, int8_t xcp_id);
+void amdgpu_amdkfd_clear_kfd_mapping(struct amdgpu_device *adev);
 
 u64 amdgpu_amdkfd_xcp_memory_size(struct amdgpu_device *adev, int xcp_id);
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index 768998c82b43..0de0b998eba7 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -139,6 +139,15 @@ void amdgpu_amdkfd_reserve_system_mem(uint64_t size)
        kfd_mem_limit.system_mem_used += size;
 }
 
+void amdgpu_amdkfd_clear_kfd_mapping(struct amdgpu_device *adev)
+{
+       if (adev->kfd.dev && adev->kfd.dev->inode &&
+               adev->kfd.dev->inode->i_mapping)
+               unmap_mapping_range(adev->kfd.dev->inode->i_mapping,
+                       KFD_MMAP_TYPE_DOORBELL, 
kfd_doorbell_process_slice(adev->kfd.dev), 1);
+}
+
+
 /* Estimate page table size needed to represent a given memory size
  *
  * With 4KB pages, we need one 8 byte PTE for each 4KB of memory
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index b2deb6a74eb2..c95e45b8eca4 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -5721,6 +5721,12 @@ int amdgpu_device_mode1_reset(struct amdgpu_device *adev)
        /* disable BM */
        pci_clear_master(adev->pdev);
 
+       /* unmap all the mappings of doorbell and framebuffer to prevent
+        * user space from accessing them
+        */
+       unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1);
+       amdgpu_amdkfd_clear_kfd_mapping(adev);
+
        if (amdgpu_dpm_is_mode1_reset_supported(adev)) {
                dev_info(adev->dev, "GPU smu mode1 reset\n");
                ret = amdgpu_dpm_mode1_reset(adev);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
index 5f9fa2140f09..c0b789fa92dc 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
@@ -44,6 +44,7 @@
 #include "amdgpu_hmm.h"
 #include "amdgpu_xgmi.h"
 #include "amdgpu_vm.h"
+#include "amdgpu_reset.h"
 
 static int
 amdgpu_gem_add_input_fence(struct drm_file *filp,
@@ -115,13 +116,21 @@ amdgpu_gem_update_timeline_node(struct drm_file *filp,
 static vm_fault_t amdgpu_gem_fault(struct vm_fault *vmf)
 {
        struct ttm_buffer_object *bo = vmf->vma->vm_private_data;
+       struct amdgpu_device *adev = amdgpu_ttm_adev(bo->bdev);
        struct drm_device *ddev = bo->base.dev;
        vm_fault_t ret;
        int idx;
 
+       /* Prevent new mappings from coming in during reset */
+
+       if (!down_read_trylock(&adev->reset_domain->sem))
+               return VM_FAULT_RETRY;
+
        ret = ttm_bo_vm_reserve(bo, vmf);
-       if (ret)
+       if (ret) {
+               up_read(&adev->reset_domain->sem);
                return ret;
+       }
 
        if (drm_dev_enter(ddev, &idx)) {
                ret = amdgpu_bo_fault_reserve_notify(bo);
@@ -137,11 +146,15 @@ static vm_fault_t amdgpu_gem_fault(struct vm_fault *vmf)
        } else {
                ret = ttm_bo_vm_dummy_page(vmf, vmf->vma->vm_page_prot);
        }
-       if (ret == VM_FAULT_RETRY && !(vmf->flags & FAULT_FLAG_RETRY_NOWAIT))
+       if (ret == VM_FAULT_RETRY && !(vmf->flags & FAULT_FLAG_RETRY_NOWAIT)) {
+               up_read(&adev->reset_domain->sem);
                return ret;
+       }
 
 unlock:
        dma_resv_unlock(bo->base.resv);
+       up_read(&adev->reset_domain->sem);
+
        return ret;
 }
 
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index 88621cb7d409..3caf8da3cd71 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -127,11 +127,21 @@ void kfd_chardev_exit(void)
 static int kfd_open(struct inode *inode, struct file *filep)
 {
        struct kfd_process *process;
+       struct kfd_node *node;
        bool is_32bit_user_mode;
+       int i;
 
        if (iminor(inode) != 0)
                return -ENODEV;
 
+       /* Save inode in kfd_dev for unmap_mapping_range */
+       for (i = 0; kfd_topology_enum_kfd_devices(i, &node) == 0; i++) {
+               if (node && node->kfd && !node->kfd->inode) {
+                       node->kfd->inode = inode;
+                       break;
+               }
+       }
+
        is_32bit_user_mode = in_compat_syscall();
 
        if (is_32bit_user_mode) {
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h 
b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index 9849b54f54ba..bb6959e83b60 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -374,6 +374,9 @@ struct kfd_dev {
 
        struct workqueue_struct *ih_wq;
 
+       /* kfd inode */
+       struct inode *inode;
+
        /* Kernel doorbells for KFD device */
        struct amdgpu_bo *doorbells;
 
-- 
2.43.0

Reply via email to