Am 05.09.23 um 17:36 schrieb Shashank Sharma:
It has been observed that task_info struct makes it difficult to
handle amdgpu_vm during a GPU reset, due to it's parameters like
task_name and process name.

This patch:
- removes task_info struct from amdgpu_vm and moves it into vm_mgr
   as an Xarray.
- creates a PID vs task_info mapping to index task_info from this
   Xarray (similar to how it is being done for PASID to vm indexing).

That's a good start, but the fundamental problem is that the task info needs to exceed the existence of the task which it describes.

In other words we can have a GPU reset for a task long terminated.

I would rather say we make the task_info a separate reference counted object and each job/hw fence gets a reference to it.

I would also completely remove it from the VM structure and code since this isn't realted in any way. Rather put it into fpriv instead.

Regards,
Christian.

- adds additional changes to support these changes.

Cc: Christian Koenig <christian.koe...@amd.com>
Cc: Alex Deucher <alexander.deuc...@amd.com>
Signed-off-by: Shashank Sharma <shashank.sha...@amd.com>
---
  .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c  |  2 +-
  drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c        |  2 +-
  drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c   |  5 +-
  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c    |  5 +-
  drivers/gpu/drm/amd/amdgpu/amdgpu_job.c       |  2 +-
  drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c       |  4 +-
  drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c        | 64 ++++++++++++++++---
  drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h        | 12 ++--
  drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c     |  4 +-
  9 files changed, 77 insertions(+), 23 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index d34c3ef8f3ed..8568ced570c8 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -1539,7 +1539,7 @@ int amdgpu_amdkfd_gpuvm_acquire_process_vm(struct 
amdgpu_device *adev,
        if (ret)
                return ret;
- amdgpu_vm_set_task_info(avm);
+       amdgpu_vm_set_task_info(adev, avm);
return 0;
  }
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
index fb78a8f47587..24b9a841db54 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
@@ -316,7 +316,7 @@ static int amdgpu_cs_pass1(struct amdgpu_cs_parser *p,
        kvfree(chunk_array);
/* Use this opportunity to fill in task info for the vm */
-       amdgpu_vm_set_task_info(vm);
+       amdgpu_vm_set_task_info(p->adev, vm);
return 0; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
index 56e89e76ff17..cd8aef1135e8 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
@@ -1762,9 +1762,10 @@ static int amdgpu_debugfs_vm_info_show(struct seq_file 
*m, void *unused)
        list_for_each_entry(file, &dev->filelist, lhead) {
                struct amdgpu_fpriv *fpriv = file->driver_priv;
                struct amdgpu_vm *vm = &fpriv->vm;
+               struct amdgpu_task_info ti = {0, };
- seq_printf(m, "pid:%d\tProcess:%s ----------\n",
-                               vm->task_info.pid, vm->task_info.process_name);
+               amdgpu_vm_get_task_info(adev, vm->pasid, &ti);
+               seq_printf(m, "pid:%d\tProcess:%s ----------\n", ti.pid, 
ti.process_name);
                r = amdgpu_bo_reserve(vm->root.bo, true);
                if (r)
                        break;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 6238701cde23..44fb16eba749 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -5192,8 +5192,9 @@ int amdgpu_do_asic_reset(struct list_head 
*device_list_handle,
                                memset(&tmp_adev->reset_task_info, 0,
                                                
sizeof(tmp_adev->reset_task_info));
                                if (reset_context->job && 
reset_context->job->vm)
-                                       tmp_adev->reset_task_info =
-                                               
reset_context->job->vm->task_info;
+                                       amdgpu_vm_get_task_info(tmp_adev,
+                                                           
reset_context->job->vm->pasid,
+                                                               
&tmp_adev->reset_task_info);
                                amdgpu_reset_capture_coredumpm(tmp_adev);
  #endif
                                if (vram_lost) {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
index 78476bc75b4e..3985b9b10b46 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
@@ -58,7 +58,7 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct 
drm_sched_job *s_job)
                goto exit;
        }
- amdgpu_vm_get_task_info(ring->adev, job->pasid, &ti);
+       amdgpu_vm_get_task_info(ring->adev, job->vm->pasid, &ti);
        DRM_ERROR("ring %s timeout, signaled seq=%u, emitted seq=%u\n",
                  job->base.sched->name, atomic_read(&ring->fence_drv.last_seq),
                  ring->fence_drv.sync_seq);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
index 12414a713256..cb3bde5ce682 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
@@ -1329,8 +1329,10 @@ void amdgpu_driver_postclose_kms(struct drm_device *dev,
        amdgpu_ctx_mgr_fini(&fpriv->ctx_mgr);
        amdgpu_vm_fini(adev, &fpriv->vm);
- if (pasid)
+       if (pasid) {
+               amdgpu_vm_free_task_info(adev, fpriv->vm.pid);
                amdgpu_pasid_free_delayed(pd->tbo.base.resv, pasid);
+       }
        amdgpu_bo_unref(&pd);
idr_for_each_entry(&fpriv->bo_list_handles, list, handle)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
index ec1ec08d4058..95bea7690bb5 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -161,7 +161,6 @@ int amdgpu_vm_set_pasid(struct amdgpu_device *adev, struct 
amdgpu_vm *vm,
                vm->pasid = pasid;
        }
-
        return 0;
  }
@@ -2439,6 +2438,7 @@ void amdgpu_vm_manager_init(struct amdgpu_device *adev)
  #endif
xa_init_flags(&adev->vm_manager.pasids, XA_FLAGS_LOCK_IRQ);
+       xa_init_flags(&adev->vm_manager.tasks, XA_FLAGS_LOCK_IRQ);
  }
/**
@@ -2453,6 +2453,9 @@ void amdgpu_vm_manager_fini(struct amdgpu_device *adev)
        WARN_ON(!xa_empty(&adev->vm_manager.pasids));
        xa_destroy(&adev->vm_manager.pasids);
+ WARN_ON(!xa_empty(&adev->vm_manager.tasks));
+       xa_destroy(&adev->vm_manager.tasks);
+
        amdgpu_vmid_mgr_fini(adev);
  }
@@ -2498,6 +2501,20 @@ int amdgpu_vm_ioctl(struct drm_device *dev, void *data, struct drm_file *filp)
        return 0;
  }
+/**
+ * amdgpu_vm_free_task_info - Finds and frees task_info ptr
+ *
+ * @adev: drm device pointer
+ * @pid: pid of the process in vm
+ */
+void amdgpu_vm_free_task_info(struct amdgpu_device *adev, u32 pid)
+{
+       struct amdgpu_task_info *task_info;
+
+       task_info = xa_load(&adev->vm_manager.tasks, pid);
+       kfree(task_info);
+}
+
  /**
   * amdgpu_vm_get_task_info - Extracts task info for a PASID.
   *
@@ -2508,14 +2525,18 @@ int amdgpu_vm_ioctl(struct drm_device *dev, void *data, 
struct drm_file *filp)
  void amdgpu_vm_get_task_info(struct amdgpu_device *adev, u32 pasid,
                         struct amdgpu_task_info *task_info)
  {
+       struct amdgpu_task_info *ti;
        struct amdgpu_vm *vm;
        unsigned long flags;
xa_lock_irqsave(&adev->vm_manager.pasids, flags); vm = xa_load(&adev->vm_manager.pasids, pasid);
-       if (vm)
-               *task_info = vm->task_info;
+       if (vm) {
+               ti = xa_load(&adev->vm_manager.tasks, vm->pid);
+               if (ti)
+                       *task_info = *ti;
+       }
xa_unlock_irqrestore(&adev->vm_manager.pasids, flags);
  }
@@ -2523,21 +2544,44 @@ void amdgpu_vm_get_task_info(struct amdgpu_device 
*adev, u32 pasid,
  /**
   * amdgpu_vm_set_task_info - Sets VMs task info.
   *
+ * @adev: drm device pointer
   * @vm: vm for which to set the info
   */
-void amdgpu_vm_set_task_info(struct amdgpu_vm *vm)
+void amdgpu_vm_set_task_info(struct amdgpu_device *adev, struct amdgpu_vm *vm)
  {
-       if (vm->task_info.pid)
+       struct amdgpu_task_info *task_info, *old;
+
+       if (!vm->pasid)
                return;
- vm->task_info.pid = current->pid;
-       get_task_comm(vm->task_info.task_name, current);
+       if (vm->pid == current->pid)
+               return;
- if (current->group_leader->mm != current->mm)
+       task_info = kzalloc(sizeof(struct amdgpu_task_info), GFP_KERNEL);
+       if (!task_info) {
+               DRM_ERROR("OOM while setting task info\n");
                return;
+       }
+
+       task_info->pid = current->pid;
+       get_task_comm(task_info->task_name, current);
+
+       if (current->group_leader->mm == current->mm) {
+               task_info->tgid = current->group_leader->pid;
+               get_task_comm(task_info->process_name, current->group_leader);
+       }
+
+       /* Replace and free if there is an existing task_info entry */
+       old = xa_store_irq(&adev->vm_manager.tasks, task_info->pid, task_info, 
GFP_KERNEL);
+       if (xa_err(old) < 0) {
+               DRM_WARN("Failed to update task info, pid=0x%x pasid=0x%x\n",
+                         task_info->pid, vm->pasid);
+               kfree(task_info);
+               return;
+       }
- vm->task_info.tgid = current->group_leader->pid;
-       get_task_comm(vm->task_info.process_name, current->group_leader);
+       kfree(old);
+       vm->pid = task_info->pid;
  }
/**
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
index ffac7413c657..33f333d864b1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
@@ -302,6 +302,7 @@ struct amdgpu_vm {
        struct dma_fence        *last_unlocked;
unsigned int pasid;
+       unsigned int            pid;
        bool                    reserved_vmid[AMDGPU_MAX_VMHUBS];
/* Flag to indicate if VM tables are updated by CPU or GPU (SDMA) */
@@ -325,9 +326,6 @@ struct amdgpu_vm {
        /* Valid while the PD is reserved or fenced */
        uint64_t                pd_phys_addr;
- /* Some basic info about the task */
-       struct amdgpu_task_info task_info;
-
        /* Store positions of group of BOs */
        struct ttm_lru_bulk_move lru_bulk_move;
        /* Flag to indicate if VM is used for compute */
@@ -374,6 +372,11 @@ struct amdgpu_vm_manager {
         * look up VM of a page fault
         */
        struct xarray                           pasids;
+
+       /* PID to task_info mapping, will be used in reset context to
+        * look up PID/TID of a reset
+        */
+       struct xarray                           tasks;
  };
struct amdgpu_bo_va_mapping;
@@ -465,7 +468,8 @@ bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, u32 
pasid,
                            u32 vmid, u32 node_id, uint64_t addr,
                            bool write_fault);
-void amdgpu_vm_set_task_info(struct amdgpu_vm *vm);
+void amdgpu_vm_set_task_info(struct amdgpu_device *adev, struct amdgpu_vm *vm);
+void amdgpu_vm_free_task_info(struct amdgpu_device *adev, u32 pid);
void amdgpu_vm_move_to_lru_tail(struct amdgpu_device *adev,
                                struct amdgpu_vm *vm);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
index 5431332bbdb8..3704556210d8 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
@@ -992,16 +992,18 @@ int amdgpu_vm_ptes_update(struct amdgpu_vm_update_params 
*params,
                        uint64_t upd_end = min(entry_end, frag_end);
                        unsigned int nptes = (upd_end - frag_start) >> shift;
                        uint64_t upd_flags = flags | AMDGPU_PTE_FRAG(frag);
+                       struct amdgpu_task_info ti = {0, };
/* This can happen when we set higher level PDs to
                         * silent to stop fault floods.
                         */
                        nptes = max(nptes, 1u);
+                       amdgpu_vm_get_task_info(adev, vm->pasid, &ti);
trace_amdgpu_vm_update_ptes(params, frag_start, upd_end,
                                                    min(nptes, 32u), dst, incr,
                                                    upd_flags,
-                                                   vm->task_info.tgid,
+                                                   ti.tgid,
                                                    
vm->immediate.fence_context);
                        amdgpu_vm_pte_update_flags(params, to_amdgpu_bo_vm(pt),
                                                   cursor.level, pe_start, dst,

Reply via email to