[Public]
Regards,
Prike
> -----Original Message-----
> From: amd-gfx <[email protected]> On Behalf Of Christian
> König
> Sent: Thursday, August 28, 2025 11:02 PM
> To: Khatri, Sunil <[email protected]>
> Cc: [email protected]; Deucher, Alexander
> <[email protected]>
> Subject: [PATCH 1/3] drm/amdgpu: fix userq VM validation v3
>
> That was actually complete nonsense and not validating the BOs at all. The
> code
> just cleared all VM areas where it couldn't grab the lock for a BO.
>
> Try to fix this. Only compile tested at the moment.
>
> v2: fix fence slot reservation as well as pointed out by Sunil.
> also validate PDs, PTs, per VM BOs and update PDEs
> v3: grab the status_lock while working with the done list.
>
> Signed-off-by: Christian König <[email protected]>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c | 136 ++++++++++------------
> drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 35 ++++++
> drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h | 2 +
> 3 files changed, 97 insertions(+), 76 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
> index 424831997cb1..abc2f96bea76 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
> @@ -545,108 +545,92 @@ amdgpu_userq_restore_all(struct amdgpu_userq_mgr
> *uq_mgr)
> return ret;
> }
>
> -static int
> -amdgpu_userq_validate_vm_bo(void *_unused, struct amdgpu_bo *bo)
> +static int amdgpu_userq_validate_vm(void *param, struct amdgpu_bo *bo)
> {
> struct ttm_operation_ctx ctx = { false, false };
> - int ret;
>
> amdgpu_bo_placement_from_domain(bo, bo->allowed_domains);
> -
> - ret = ttm_bo_validate(&bo->tbo, &bo->placement, &ctx);
> - if (ret)
> - DRM_ERROR("Fail to validate\n");
> -
> - return ret;
> + return ttm_bo_validate(&bo->tbo, &bo->placement, &ctx);
> }
>
> static int
> -amdgpu_userq_validate_bos(struct amdgpu_userq_mgr *uq_mgr)
> +amdgpu_userq_validate_bos(struct amdgpu_device *adev, struct drm_exec *exec,
> + struct amdgpu_vm *vm)
> {
> - struct amdgpu_fpriv *fpriv = uq_mgr_to_fpriv(uq_mgr);
> - struct amdgpu_vm *vm = &fpriv->vm;
> - struct amdgpu_device *adev = uq_mgr->adev;
> + struct ttm_operation_ctx ctx = { false, false };
> struct amdgpu_bo_va *bo_va;
> - struct ww_acquire_ctx *ticket;
> - struct drm_exec exec;
> struct amdgpu_bo *bo;
> - struct dma_resv *resv;
> - bool clear, unlock;
> - int ret = 0;
> -
> - drm_exec_init(&exec, DRM_EXEC_IGNORE_DUPLICATES, 0);
> - drm_exec_until_all_locked(&exec) {
> - ret = amdgpu_vm_lock_pd(vm, &exec, 2);
> - drm_exec_retry_on_contention(&exec);
> - if (unlikely(ret)) {
> - drm_file_err(uq_mgr->file, "Failed to lock PD\n");
> - goto unlock_all;
> - }
> -
> - /* Lock the done list */
> - list_for_each_entry(bo_va, &vm->done, base.vm_status) {
> - bo = bo_va->base.bo;
> - if (!bo)
> - continue;
> -
> - ret = drm_exec_lock_obj(&exec, &bo->tbo.base);
> - drm_exec_retry_on_contention(&exec);
> - if (unlikely(ret))
> - goto unlock_all;
> - }
> - }
> + int ret;
>
> spin_lock(&vm->status_lock);
> - while (!list_empty(&vm->moved)) {
> - bo_va = list_first_entry(&vm->moved, struct amdgpu_bo_va,
> + while (!list_empty(&vm->invalidated)) {
> + bo_va = list_first_entry(&vm->invalidated,
> + struct amdgpu_bo_va,
> base.vm_status);
> spin_unlock(&vm->status_lock);
>
> - /* Per VM BOs never need to bo cleared in the page tables */
> + bo = bo_va->base.bo;
> + ret = drm_exec_prepare_obj(exec, &bo->tbo.base, 2);
> + if (unlikely(ret))
> + return ret;
> +
> + amdgpu_bo_placement_from_domain(bo, bo->allowed_domains);
> + ret = ttm_bo_validate(&bo->tbo, &bo->placement, &ctx);
> + if (ret)
> + return ret;
> +
> + /* This moves the bo_va to the done list */
> ret = amdgpu_vm_bo_update(adev, bo_va, false);
> if (ret)
> - goto unlock_all;
> + return ret;
> +
> spin_lock(&vm->status_lock);
> }
> + spin_unlock(&vm->status_lock);
>
> - ticket = &exec.ticket;
> - while (!list_empty(&vm->invalidated)) {
> - bo_va = list_first_entry(&vm->invalidated, struct amdgpu_bo_va,
> - base.vm_status);
> - resv = bo_va->base.bo->tbo.base.resv;
> - spin_unlock(&vm->status_lock);
> + return 0;
> +}
>
> - bo = bo_va->base.bo;
> - ret = amdgpu_userq_validate_vm_bo(NULL, bo);
> - if (ret) {
> - drm_file_err(uq_mgr->file, "Failed to validate BO\n");
> - goto unlock_all;
> - }
> +static int
> +amdgpu_userq_update_vm(struct amdgpu_userq_mgr *uq_mgr) {
> + struct amdgpu_fpriv *fpriv = uq_mgr_to_fpriv(uq_mgr);
> + struct amdgpu_device *adev = uq_mgr->adev;
> + struct amdgpu_vm *vm = &fpriv->vm;
> + struct drm_exec exec;
> + int ret;
>
> - /* Try to reserve the BO to avoid clearing its ptes */
> - if (!adev->debug_vm && dma_resv_trylock(resv)) {
> - clear = false;
> - unlock = true;
> - /* The caller is already holding the reservation lock */
> - } else if (dma_resv_locking_ctx(resv) == ticket) {
> - clear = false;
> - unlock = false;
> - /* Somebody else is using the BO right now */
> - } else {
> - clear = true;
> - unlock = false;
> - }
> + drm_exec_init(&exec, DRM_EXEC_IGNORE_DUPLICATES, 0);
> + drm_exec_until_all_locked(&exec) {
> + ret = amdgpu_vm_lock_pd(vm, &exec, 1);
> + drm_exec_retry_on_contention(&exec);
> + if (unlikely(ret))
> + goto unlock_all;
>
> - ret = amdgpu_vm_bo_update(adev, bo_va, clear);
> + ret = amdgpu_vm_lock_done(vm, &exec, 1);
> + drm_exec_retry_on_contention(&exec);
> + if (unlikely(ret))
> + goto unlock_all;
>
> - if (unlock)
> - dma_resv_unlock(resv);
> - if (ret)
> + ret = amdgpu_vm_validate(adev, vm, NULL,
> + amdgpu_userq_validate_vm,
> + NULL);
> + if (unlikely(ret))
> goto unlock_all;
>
> - spin_lock(&vm->status_lock);
> + ret = amdgpu_userq_validate_bos(adev, &exec, vm);
> + drm_exec_retry_on_contention(&exec);
> + if (unlikely(ret))
> + goto unlock_all;
> }
> - spin_unlock(&vm->status_lock);
> +
> + ret = amdgpu_vm_handle_moved(adev, vm, NULL);
> + if (ret)
> + goto unlock_all;
> +
> + ret = amdgpu_vm_update_pdes(adev, vm, false);
> + if (ret)
> + goto unlock_all;
Do we need to create a sync point for syncing the VM PD/PT/PTE updates, and
ensure all the updates are done before attaching the eviction fence?
> ret = amdgpu_eviction_fence_replace_fence(&fpriv->evf_mgr, &exec);
> if (ret)
> @@ -667,7 +651,7 @@ static void amdgpu_userq_restore_worker(struct
> work_struct *work)
>
> mutex_lock(&uq_mgr->userq_mutex);
>
> - ret = amdgpu_userq_validate_bos(uq_mgr);
> + ret = amdgpu_userq_update_vm(uq_mgr);
> if (ret) {
> drm_file_err(uq_mgr->file, "Failed to validate BOs to
> restore\n");
> goto unlock;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> index bf42246a3db2..16451c9bbe1f 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> @@ -484,6 +484,41 @@ int amdgpu_vm_lock_pd(struct amdgpu_vm *vm, struct
> drm_exec *exec,
> 2 + num_fences);
> }
>
> +/**
> + * amdgpu_vm_lock_done - lock all BOs on the done list
> + * @exec: drm execution context
> + * @num_fences: number of extra fences to reserve
> + *
> + * Lock the BOs on the done list in the DRM execution context.
> + */
> +int amdgpu_vm_lock_done(struct amdgpu_vm *vm, struct drm_exec *exec,
> + unsigned int num_fences)
> +{
> + struct list_head *prev = &vm->done;
> + struct amdgpu_bo_va *bo_va;
> + struct amdgpu_bo *bo;
> + int ret;
> +
> + /* We can only trust prev->next while holding the lock */
> + spin_lock(&vm->status_lock);
> + while (!list_is_head(prev->next, &vm->done)) {
Could we use list_for_each_entry_safe() here to simplify the done-list walk
loop?
> + bo_va = list_entry(prev->next, typeof(*bo_va), base.vm_status);
> + spin_unlock(&vm->status_lock);
> +
> + bo = bo_va->base.bo;
> + if (bo) {
> + ret = drm_exec_prepare_obj(exec, &bo->tbo.base, 1);
> + if (unlikely(ret))
> + return ret;
> + }
> + spin_lock(&vm->status_lock);
> + prev = prev->next;
> + }
> + spin_unlock(&vm->status_lock);
> +
> + return 0;
> +}
> +
> /**
> * amdgpu_vm_move_to_lru_tail - move all BOs to the end of LRU
> *
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
> index f9549f6b3d1f..0e3884dfdb6d 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
> @@ -492,6 +492,8 @@ int amdgpu_vm_make_compute(struct amdgpu_device
> *adev, struct amdgpu_vm *vm); void amdgpu_vm_fini(struct amdgpu_device *adev,
> struct amdgpu_vm *vm); int amdgpu_vm_lock_pd(struct amdgpu_vm *vm, struct
> drm_exec *exec,
> unsigned int num_fences);
> +int amdgpu_vm_lock_done(struct amdgpu_vm *vm, struct drm_exec *exec,
> + unsigned int num_fences);
> bool amdgpu_vm_ready(struct amdgpu_vm *vm); uint64_t
> amdgpu_vm_generation(struct amdgpu_device *adev, struct amdgpu_vm *vm); int
> amdgpu_vm_validate(struct amdgpu_device *adev, struct amdgpu_vm *vm,
> --
> 2.43.0