Am 10.09.2018 um 11:34 schrieb Emily Deng:
It will ramdomly have the dead lock issue when test TDR:
1. amdgpu_device_handle_vram_lost gets the lock shadow_list_lock
2. amdgpu_bo_create locked the bo's resv lock
3. amdgpu_bo_create_shadow is waiting for the shadow_list_lock
4. amdgpu_device_recover_vram_from_shadow is waiting for the bo's resv
lock.
v2:
Make a local copy of the list
Signed-off-by: Emily Deng <emily.d...@amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 12 +++++++++++-
drivers/gpu/drm/amd/amdgpu/amdgpu_object.h | 1 +
2 files changed, 12 insertions(+), 1 deletion(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index acfc63e..2b9f597 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3006,6 +3006,9 @@ static int amdgpu_device_handle_vram_lost(struct
amdgpu_device *adev)
long r = 1;
int i = 0;
long tmo;
+ struct list_head local_shadow_list;
+
+ INIT_LIST_HEAD(&local_shadow_list);
if (amdgpu_sriov_runtime(adev))
tmo = msecs_to_jiffies(8000);
@@ -3013,8 +3016,15 @@ static int amdgpu_device_handle_vram_lost(struct
amdgpu_device *adev)
tmo = msecs_to_jiffies(100);
DRM_INFO("recover vram bo from shadow start\n");
+
mutex_lock(&adev->shadow_list_lock);
list_for_each_entry_safe(bo, tmp, &adev->shadow_list, shadow_list) {
+ amdgpu_bo_ref(bo);
+ list_add_tail(&bo->copy_shadow_list, &local_shadow_list);
+ }
Please don't add an extra copy_shadow_list field to amdgpu_bo.
Instead just use bo->shadow list for this. When you hold a reference to
the BO it should not be removed from the shadow list.
Additional to that you can just use list_splice_init() to move the whole
shadow list to your local list.
Christian.
+ mutex_unlock(&adev->shadow_list_lock);
+
+ list_for_each_entry_safe(bo, tmp, &local_shadow_list, copy_shadow_list)
{
next = NULL;
amdgpu_device_recover_vram_from_shadow(adev, ring, bo, &next);
if (fence) {
@@ -3033,8 +3043,8 @@ static int amdgpu_device_handle_vram_lost(struct
amdgpu_device *adev)
dma_fence_put(fence);
fence = next;
+ amdgpu_bo_unref(&bo);
}
- mutex_unlock(&adev->shadow_list_lock);
if (fence) {
r = dma_fence_wait_timeout(fence, false, tmo);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
index 907fdf4..cfee16c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
@@ -103,6 +103,7 @@ struct amdgpu_bo {
struct list_head mn_list;
struct list_head shadow_list;
};
+ struct list_head copy_shadow_list;
struct kgd_mem *kfd_bo;
};
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx