amdkfd: Trigger segfault for early userptr unmmapping

Felix Kuehling Thu, 24 Apr 2025 06:56:32 -0700


On 2025-04-24 0:54, Shane Xiao wrote:
> If applications unmap the memory before destroying the userptr, it needs
> trigger a segfault to notify user space to correct the free sequence in
> VM debug mode.
>
> v2: Send GPU access fault to user space


See some comments inline.


>
> Signed-off-by: Shane Xiao <shane.x...@amd.com>
> ---
>  .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c  | 14 +++++++++++++
>  drivers/gpu/drm/amd/amdkfd/kfd_events.c       | 20 +++++++++++++++++++
>  drivers/gpu/drm/amd/amdkfd/kfd_priv.h         |  3 +++
>  3 files changed, 37 insertions(+)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> index d2ec4130a316..876e9df34adf 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> @@ -2496,6 +2496,7 @@ static int update_invalid_user_pages(struct 
> amdkfd_process_info *process_info,
>       struct ttm_operation_ctx ctx = { false, false };
>       uint32_t invalid;
>       int ret = 0;
> +     uint64_t userptr = 0;
>  
>       mutex_lock(&process_info->notifier_lock);
>  
> @@ -2559,6 +2560,19 @@ static int update_invalid_user_pages(struct 
> amdkfd_process_info *process_info,
>                       if (ret != -EFAULT)
>                               return ret;
>  
> +                     /* If applications unmap memory before destroying the 
> userptr
> +                      * from the KFD, trigger a segmentation fault in VM 
> debug mode.
> +                      */
> +                     if (amdgpu_ttm_adev(bo->tbo.bdev)->debug_vm_userptr) {
> +                             amdgpu_ttm_tt_get_userptr(&bo->tbo, userptr);

I think you meant &userptr. But this will get you the CPU address of the 
userptr, not the GPU address. For reporting a GPU fault it would make more 
sense to use the GPU address in mem->va.


> +                             pr_err("User space unmap memory before 
> destroying a userptr that refers to it\n");
> +                             pr_err("The unmap userptr address is 0x%llx\n", 
> userptr);
> +
> +                             // Send GPU VM fault to user space
> +                             
> kfd_signal_vm_fault_event_with_userptr(kfd_lookup_process_by_pid(process_info->pid),
> +                                             
> amdgpu_ttm_adev(bo->tbo.bdev)->kfd.dev, userptr);
> +                     }
> +
>                       ret = 0;
>               }
>  
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.c 
> b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
> index fecdb6794075..89943d2146a4 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_events.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
> @@ -1177,6 +1177,26 @@ void kfd_signal_hw_exception_event(u32 pasid)
>       kfd_unref_process(p);
>  }
>  
> +void kfd_signal_vm_fault_event_with_userptr(struct kfd_process *p,
> +             struct kfd_dev *dev , uint64_t userptr)

dev seems to be unused. You can remove that parameter.

Regards,
  Felix


> +{
> +     struct kfd_process_device *pdd;
> +     struct kfd_hsa_memory_exception_data exception_data;
> +     int i;
> +
> +     memset(&exception_data, 0, sizeof(exception_data));
> +     exception_data.va = userptr;
> +     exception_data.failure.NotPresent = 1;
> +
> +     // Send VM seg fault to all kfd process device
> +     for (i = 0; i < p->n_pdds; i++) {
> +             pdd = p->pdds[i];
> +             exception_data.gpu_id = pdd->user_gpu_id;
> +             kfd_evict_process_device(pdd);
> +             kfd_signal_vm_fault_event(pdd, NULL, &exception_data);
> +     }
> +}
> +
>  void kfd_signal_vm_fault_event(struct kfd_process_device *pdd,
>                               struct kfd_vm_fault_info *info,
>                               struct kfd_hsa_memory_exception_data *data)
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h 
> b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> index f6aedf69c644..34f47dc1cbbd 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> @@ -1507,6 +1507,9 @@ int kfd_event_create(struct file *devkfd, struct 
> kfd_process *p,
>  int kfd_get_num_events(struct kfd_process *p);
>  int kfd_event_destroy(struct kfd_process *p, uint32_t event_id);
>  
> +void kfd_signal_vm_fault_event_with_userptr(struct kfd_process *p,
> +                             struct kfd_dev *dev , uint64_t userptr);
> +
>  void kfd_signal_vm_fault_event(struct kfd_process_device *pdd,
>                               struct kfd_vm_fault_info *info,
>                               struct kfd_hsa_memory_exception_data *data);

Re: [PATCH 2/2] amd/amdkfd: Trigger segfault for early userptr unmmapping

Reply via email to