Am 2022-06-10 um 00:04 schrieb Ramesh Errabolu:
Following error conditions are fixed:
   Unpin MMIO and DOORBELL BOs only after map count goes to zero
   Remove BO from validate list of a KFD process in a safe manner
   Print a warning message if unreserving GPUVMs encounters an error

Signed-off-by: Ramesh Errabolu <ramesh.errab...@amd.com>
---
  .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c  | 42 +++++++++++--------
  1 file changed, 25 insertions(+), 17 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index a1de900ba677..ee48e6591f99 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -1013,14 +1013,22 @@ static void add_kgd_mem_to_kfd_bo_list(struct kgd_mem 
*mem,
        mutex_unlock(&process_info->lock);
  }
-static void remove_kgd_mem_from_kfd_bo_list(struct kgd_mem *mem,
+/**
+ * remove_kgd_mem_from_validate_list() - Remove BO from process's validate 
list,
+ * in an idempotent manner, so that restore worker can't access it anymore
+ * @mem: BO's membership handle in validate list
+ * @process_info: KFD process handle to which BO belongs
+ *
+ * Return: void

I don't think you need to state a void return explicitly. [+David], since you were looking into KFD documentation and kernel-doc comments lately, do you have any feedback on the kernel-doc syntax?

Other than that, this patch is

Reviewed-by: Felix Kuehling <felix.kuehl...@amd.com>


+ */
+static void remove_kgd_mem_from_validate_list(struct kgd_mem *mem,
                struct amdkfd_process_info *process_info)
  {
        struct ttm_validate_buffer *bo_list_entry;
bo_list_entry = &mem->validate_list;
        mutex_lock(&process_info->lock);
-       list_del(&bo_list_entry->head);
+       list_del_init(&bo_list_entry->head);
        mutex_unlock(&process_info->lock);
  }
@@ -1796,7 +1804,7 @@ int amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu( allocate_init_user_pages_failed:
  err_pin_bo:
-       remove_kgd_mem_from_kfd_bo_list(*mem, avm->process_info);
+       remove_kgd_mem_from_validate_list(*mem, avm->process_info);
        drm_vma_node_revoke(&gobj->vma_node, drm_priv);
  err_node_allow:
        /* Don't unreserve system mem limit twice */
@@ -1825,20 +1833,12 @@ int amdgpu_amdkfd_gpuvm_free_memory_of_gpu(
        unsigned long bo_size = mem->bo->tbo.base.size;
        struct kfd_mem_attachment *entry, *tmp;
        struct bo_vm_reservation_context ctx;
-       struct ttm_validate_buffer *bo_list_entry;
        unsigned int mapped_to_gpu_memory;
        int ret;
        bool is_imported = false;
mutex_lock(&mem->lock); - /* Unpin MMIO/DOORBELL BO's that were pinned during allocation */
-       if (mem->alloc_flags &
-           (KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL |
-            KFD_IOC_ALLOC_MEM_FLAGS_MMIO_REMAP)) {
-               amdgpu_amdkfd_gpuvm_unpin_bo(mem->bo);
-       }
-
        mapped_to_gpu_memory = mem->mapped_to_gpu_memory;
        is_imported = mem->is_imported;
        mutex_unlock(&mem->lock);
@@ -1853,10 +1853,7 @@ int amdgpu_amdkfd_gpuvm_free_memory_of_gpu(
        }
/* Make sure restore workers don't access the BO any more */
-       bo_list_entry = &mem->validate_list;
-       mutex_lock(&process_info->lock);
-       list_del(&bo_list_entry->head);
-       mutex_unlock(&process_info->lock);
+       remove_kgd_mem_from_validate_list(mem, process_info);
/* No more MMU notifiers */
        amdgpu_mn_unregister(mem->bo);
@@ -1878,7 +1875,18 @@ int amdgpu_amdkfd_gpuvm_free_memory_of_gpu(
        list_for_each_entry_safe(entry, tmp, &mem->attachments, list)
                kfd_mem_detach(entry);
+ /* Return success even in case of error */
        ret = unreserve_bo_and_vms(&ctx, false, false);
+       if (unlikely(ret)) {
+               WARN_ONCE(ret, "Error in unreserving BO and associated VMs");
+               ret = 0;
+       }
+
+       /* Unpin MMIO/DOORBELL BO's that were pinned during allocation */
+       if (mem->alloc_flags &
+           (KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL |
+            KFD_IOC_ALLOC_MEM_FLAGS_MMIO_REMAP))
+               amdgpu_amdkfd_gpuvm_unpin_bo(mem->bo);
/* Free the sync object */
        amdgpu_sync_free(&mem->sync);
@@ -2814,7 +2822,7 @@ int amdgpu_amdkfd_add_gws_to_process(void *info, void 
*gws, struct kgd_mem **mem
  bo_reservation_failure:
        mutex_unlock(&(*mem)->process_info->lock);
        amdgpu_sync_free(&(*mem)->sync);
-       remove_kgd_mem_from_kfd_bo_list(*mem, process_info);
+       remove_kgd_mem_from_validate_list(*mem, process_info);
        amdgpu_bo_unref(&gws_bo);
        mutex_destroy(&(*mem)->lock);
        kfree(*mem);
@@ -2832,7 +2840,7 @@ int amdgpu_amdkfd_remove_gws_from_process(void *info, 
void *mem)
        /* Remove BO from process's validate list so restore worker won't touch
         * it anymore
         */
-       remove_kgd_mem_from_kfd_bo_list(kgd_mem, process_info);
+       remove_kgd_mem_from_validate_list(kgd_mem, process_info);
ret = amdgpu_bo_reserve(gws_bo, false);
        if (unlikely(ret)) {

Reply via email to