Am 2023-03-08 um 16:37 schrieb Chia-I Wu:
kgd_mem should be accessed with p->mutex locked, or it could have been
freed by kfd_ioctl_free_memory_of_gpu.

Thank you for the patch. It's not just about accessing kgd_mem with p->mutex held. It's also about holding the mutex continuously. I'd update the description to be more explicit about the invariant being broken here:

kgd_mem pointers returned by kfd_process_device_translate_handle are only guaranteed to be valid while p->mutex is held. As soon as the mutex is unlocked, another thread can free the BO.

I can update the description and submit the patch.

Reviewed-by: Felix Kuehling <felix.kuehl...@amd.com>

Regards,
  Felix



Signed-off-by: Chia-I Wu <olva...@gmail.com>
---
  drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 16 ++++++++++------
  1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index 6d291aa6386bd..3c630114210d6 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -1293,14 +1293,14 @@ static int kfd_ioctl_map_memory_to_gpu(struct file *filep,
                args->n_success = i+1;
        }
- mutex_unlock(&p->mutex);
-
        err = amdgpu_amdkfd_gpuvm_sync_memory(dev->adev, (struct kgd_mem *) mem, true);
        if (err) {
                pr_debug("Sync memory failed, wait interrupted by user signal\n");
                goto sync_memory_failed;
        }
+ mutex_unlock(&p->mutex);
+
        /* Flush TLBs after waiting for the page table updates to complete */
        for (i = 0; i < args->n_devices; i++) {
                peer_pdd = kfd_process_device_data_by_id(p, devices_arr[i]);
@@ -1316,9 +1316,9 @@ static int kfd_ioctl_map_memory_to_gpu(struct file *filep,
  bind_process_to_device_failed:
  get_mem_obj_from_handle_failed:
  map_memory_to_gpu_failed:
+sync_memory_failed:
        mutex_unlock(&p->mutex);
  copy_from_user_failed:
-sync_memory_failed:
        kfree(devices_arr);
return err;
@@ -1332,6 +1332,7 @@ static int kfd_ioctl_unmap_memory_from_gpu(struct file *filep,
        void *mem;
        long err = 0;
        uint32_t *devices_arr = NULL, i;
+       bool flush_tlb;
if (!args->n_devices) {
                pr_debug("Device IDs array empty\n");
@@ -1384,16 +1385,19 @@ static int kfd_ioctl_unmap_memory_from_gpu(struct file *filep,
                }
                args->n_success = i+1;
        }
-       mutex_unlock(&p->mutex);
- if (kfd_flush_tlb_after_unmap(pdd->dev)) {
+       flush_tlb = kfd_flush_tlb_after_unmap(pdd->dev);
+       if (flush_tlb) {
                err = amdgpu_amdkfd_gpuvm_sync_memory(pdd->dev->adev,
                                (struct kgd_mem *) mem, true);
                if (err) {
                        pr_debug("Sync memory failed, wait interrupted by user signal\n");
                        goto sync_memory_failed;
                }
+       }
+       mutex_unlock(&p->mutex);
+ if (flush_tlb) {
                /* Flush TLBs after waiting for the page table updates to complete */
                for (i = 0; i < args->n_devices; i++) {
                        peer_pdd = kfd_process_device_data_by_id(p, devices_arr[i]);
@@ -1409,9 +1413,9 @@ static int kfd_ioctl_unmap_memory_from_gpu(struct file 
*filep,
  bind_process_to_device_failed:
  get_mem_obj_from_handle_failed:
  unmap_memory_from_gpu_failed:
+sync_memory_failed:
        mutex_unlock(&p->mutex);
  copy_from_user_failed:
-sync_memory_failed:
        kfree(devices_arr);
        return err;
  }

Reply via email to