[Public] Ping
> -----Original Message----- > From: Xiao, Shane <shane.x...@amd.com> > Sent: Friday, April 25, 2025 11:35 AM > To: amd-gfx@lists.freedesktop.org; Kuehling, Felix <felix.kuehl...@amd.com>; > Koenig, Christian <christian.koe...@amd.com>; Yang, Philip > <philip.y...@amd.com> > Cc: Xiao, Shane <shane.x...@amd.com> > Subject: [PATCH 2/2] amd/amdkfd: Trigger segfault for early userptr > unmmapping > > If applications unmap the memory before destroying the userptr, it needs > trigger a segfault to notify user space to correct the free sequence in VM > debug > mode. > > v2: Send GPU access fault to user space > v3: Report gpu address to user space, remove unnecessary params > > Signed-off-by: Shane Xiao <shane.x...@amd.com> > --- > .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 15 +++++++++++++++ > drivers/gpu/drm/amd/amdkfd/kfd_events.c | 19 +++++++++++++++++++ > drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 2 ++ > 3 files changed, 36 insertions(+) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c > index d2ec4130a316..61a698056fb8 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c > @@ -2496,6 +2496,7 @@ static int update_invalid_user_pages(struct > amdkfd_process_info *process_info, > struct ttm_operation_ctx ctx = { false, false }; > uint32_t invalid; > int ret = 0; > + uint64_t userptr = 0; > > mutex_lock(&process_info->notifier_lock); > > @@ -2559,6 +2560,20 @@ static int update_invalid_user_pages(struct > amdkfd_process_info *process_info, > if (ret != -EFAULT) > return ret; > > + /* If applications unmap memory before destroying the > userptr > + * from the KFD, trigger a segmentation fault in VM > debug mode. > + */ > + if (amdgpu_ttm_adev(bo->tbo.bdev)- > >debug_vm_userptr) { > + amdgpu_ttm_tt_get_userptr(&bo->tbo, > &userptr); > + pr_err("User space unmap memory before > destroying a userptr that refers to it\n"); > + pr_err("The unmap userptr cpu address is > 0x%llx, gpu address is 0x%llx\n", > + userptr, mem- > >va); > + > + // Send GPU VM fault to user space > + > kfd_signal_vm_fault_event_with_userptr(kfd_lookup_process_by_pid(pr > ocess_info->pid), > + mem->va); > + } > + > ret = 0; > } > > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.c > b/drivers/gpu/drm/amd/amdkfd/kfd_events.c > index fecdb6794075..e54e708ed82d 100644 > --- a/drivers/gpu/drm/amd/amdkfd/kfd_events.c > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.c > @@ -1177,6 +1177,25 @@ void kfd_signal_hw_exception_event(u32 pasid) > kfd_unref_process(p); > } > > +void kfd_signal_vm_fault_event_with_userptr(struct kfd_process *p, > +uint64_t gpu_va) { > + struct kfd_process_device *pdd; > + struct kfd_hsa_memory_exception_data exception_data; > + int i; > + > + memset(&exception_data, 0, sizeof(exception_data)); > + exception_data.va = gpu_va; > + exception_data.failure.NotPresent = 1; > + > + // Send VM seg fault to all kfd process device > + for (i = 0; i < p->n_pdds; i++) { > + pdd = p->pdds[i]; > + exception_data.gpu_id = pdd->user_gpu_id; > + kfd_evict_process_device(pdd); > + kfd_signal_vm_fault_event(pdd, NULL, &exception_data); > + } > +} > + > void kfd_signal_vm_fault_event(struct kfd_process_device *pdd, > struct kfd_vm_fault_info *info, > struct kfd_hsa_memory_exception_data *data) > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h > b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h > index f6aedf69c644..8703be8077b0 100644 > --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h > @@ -1507,6 +1507,8 @@ int kfd_event_create(struct file *devkfd, struct > kfd_process *p, int kfd_get_num_events(struct kfd_process *p); int > kfd_event_destroy(struct kfd_process *p, uint32_t event_id); > > +void kfd_signal_vm_fault_event_with_userptr(struct kfd_process *p, > +uint64_t gpu_va); > + > void kfd_signal_vm_fault_event(struct kfd_process_device *pdd, > struct kfd_vm_fault_info *info, > struct kfd_hsa_memory_exception_data > *data); > -- > 2.25.1