The VM might already be freed when amdgpu_vm_tlb_seq_cb() is called.
We see the call trace below.

Fix it by keeping the last flush fence around and waiting for it to
signal in amdgpu_vm_fini() before the VM is torn down.

BUG kmalloc-4k (Not tainted): Poison overwritten

0xffff9c88630414e8-0xffff9c88630414e8 @offset=5352. First byte 0x6c instead of 0x6b
Allocated in amdgpu_driver_open_kms+0x9d/0x360 [amdgpu] age=44 cpu=0 pid=2343
 __slab_alloc.isra.0+0x4f/0x90
 kmem_cache_alloc_trace+0x6b8/0x7a0
 amdgpu_driver_open_kms+0x9d/0x360 [amdgpu]
 drm_file_alloc+0x222/0x3e0 [drm]
 drm_open+0x11d/0x410 [drm]
 drm_stub_open+0xdc/0x230 [drm]
 chrdev_open+0xa5/0x1e0
 do_dentry_open+0x16c/0x3c0
 vfs_open+0x2d/0x30
 path_openat+0x70a/0xa90
 do_filp_open+0xb2/0x120
 do_sys_openat2+0x245/0x330
 do_sys_open+0x46/0x80
 __x64_sys_openat+0x20/0x30
 do_syscall_64+0x38/0xc0
 entry_SYSCALL_64_after_hwframe+0x44/0xae
Freed in amdgpu_driver_postclose_kms+0x3e9/0x550 [amdgpu] age=22 cpu=1 pid=2485
 kfree+0x4a2/0x580
 amdgpu_driver_postclose_kms+0x3e9/0x550 [amdgpu]
 drm_file_free+0x24e/0x3c0 [drm]
 drm_close_helper.isra.0+0x90/0xb0 [drm]
 drm_release+0x97/0x1a0 [drm]
 __fput+0xb6/0x280
 ____fput+0xe/0x10
 task_work_run+0x64/0xb0
 do_exit+0x406/0xcf0
 do_group_exit+0x50/0xc0
 __x64_sys_exit_group+0x18/0x20
 do_syscall_64+0x38/0xc0
 entry_SYSCALL_64_after_hwframe+0x44/0xae

Suggested-by: Christian König <christian.koe...@amd.com>
Signed-off-by: xinhui pan <xinhui....@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 22 +++++++++++++++++++---
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h |  1 +
 2 files changed, 20 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
index 645ce28277c2..e2486e95ca69 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -932,9 +932,12 @@ int amdgpu_vm_update_range(struct amdgpu_device *adev, struct amdgpu_vm *vm,
 
        if (flush_tlb || params.table_freed) {
                tlb_cb->vm = vm;
-               if (!fence || !*fence ||
-                   dma_fence_add_callback(*fence, &tlb_cb->cb,
-                                          amdgpu_vm_tlb_seq_cb))
+               if (fence && *fence &&
+                   !dma_fence_add_callback(*fence, &tlb_cb->cb,
+                                          amdgpu_vm_tlb_seq_cb)) {
+                       dma_fence_put(vm->last_delayed_tlb_flush);
+                       vm->last_delayed_tlb_flush = dma_fence_get(*fence);
+               } else
                        amdgpu_vm_tlb_seq_cb(NULL, &tlb_cb->cb);
                tlb_cb = NULL;
        }
@@ -2258,6 +2261,19 @@ void amdgpu_vm_fini(struct amdgpu_device *adev, struct amdgpu_vm *vm)
        dma_fence_wait(vm->last_unlocked, false);
        dma_fence_put(vm->last_unlocked);
 
+       if (vm->last_delayed_tlb_flush) {
+               /* Wait until fence is signaled.
+                * But must double check to make sure fence cb is called.
+                * As dma_fence_default_wait checks DMA_FENCE_FLAG_SIGNALED_BIT without
+                * holding fence lock(the first test_bit).
+                * So call dma_fence_get_status which will hold the fence lock.
+                * Then we can make sure fence cb has been called.
+                */
+               (void)dma_fence_wait(vm->last_delayed_tlb_flush, false);
+               (void)dma_fence_get_status(vm->last_delayed_tlb_flush);
+               dma_fence_put(vm->last_delayed_tlb_flush);
+       }
+
        list_for_each_entry_safe(mapping, tmp, &vm->freed, list) {
                if (mapping->flags & AMDGPU_PTE_PRT && prt_fini_needed) {
                        amdgpu_vm_prt_fini(adev, vm);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
index 1a814fbffff8..c1a48f5c1019 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
@@ -286,6 +286,7 @@ struct amdgpu_vm {
 
        /* Last finished delayed update */
        atomic64_t              tlb_seq;
+       struct dma_fence        *last_delayed_tlb_flush;
 
        /* Last unlocked submission to the scheduler entities */
        struct dma_fence        *last_unlocked;
-- 
2.25.1

Reply via email to