[AMD Official Use Only - AMD Internal Distribution Only] Good catch . Thanks . I will sent out another review for that .
Regards Shaoyun.liu From: Yang, Philip <philip.y...@amd.com> Sent: Thursday, October 17, 2024 3:47 PM To: Liu, Shaoyun <shaoyun....@amd.com>; amd-gfx@lists.freedesktop.org Subject: Re: [PATCH] drm/amd/amdkfd: add/remove kfd queues on start/stop KFD scheduling On 2024-10-17 12:12, Shaoyun Liu wrote: From: shaoyunl <shaoyun....@amd.com><mailto:shaoyun....@amd.com> Add back kfd queues in start scheduling that originally been removed on stop scheduling. Signed-off-by: Shaoyun Liu <shaoyun....@amd.com><mailto:shaoyun....@amd.com> --- .../drm/amd/amdkfd/kfd_device_queue_manager.c | 40 +++++++++++++++++-- 1 file changed, 37 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c index b2b16a812e73..542363b4712e 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c @@ -204,6 +204,8 @@ static int add_queue_mes(struct device_queue_manager *dqm, struct queue *q, if (!down_read_trylock(&adev->reset_domain->sem)) return -EIO; + if (!dqm->sched_running || dqm->sched_halt) { up_read(&adev->reset_domain->sem); + return 0; } memset(&queue_input, 0x0, sizeof(struct mes_add_queue_input)); queue_input.process_id = qpd->pqm->process->pasid; @@ -272,6 +274,8 @@ static int remove_queue_mes(struct device_queue_manager *dqm, struct queue *q, if (!down_read_trylock(&adev->reset_domain->sem)) return -EIO; + if (!dqm->sched_running || dqm->sched_halt) { up_read(&adev->reset_domain->sem); + return 0; } It is simpler to move sched_halt/running check outside reset sem lock, but not sure if it is safe. Regards, Philip memset(&queue_input, 0x0, sizeof(struct mes_remove_queue_input)); queue_input.doorbell_offset = q->properties.doorbell_off; @@ -292,7 +296,7 @@ static int remove_queue_mes(struct device_queue_manager *dqm, struct queue *q, return r; } -static int remove_all_queues_mes(struct device_queue_manager *dqm) +static int remove_all_kfd_queues_mes(struct device_queue_manager *dqm) { struct device_process_node *cur; struct device *dev = dqm->dev->adev->dev; @@ -319,6 +323,33 @@ static int remove_all_queues_mes(struct device_queue_manager *dqm) return retval; } +static int add_all_kfd_queues_mes(struct device_queue_manager *dqm) +{ + struct device_process_node *cur; + struct device *dev = dqm->dev->adev->dev; + struct qcm_process_device *qpd; + struct queue *q; + int retval = 0; + + list_for_each_entry(cur, &dqm->queues, list) { + qpd = cur->qpd; + list_for_each_entry(q, &qpd->queues_list, list) { + if (!q->properties.is_active) + continue; + retval = add_queue_mes(dqm, q, qpd); + if (retval) { + dev_err(dev, "%s: Failed to add queue %d for dev %d", + __func__, + q->properties.queue_id, + dqm->dev->id); + return retval; + } + } + } + + return retval; +} + static int suspend_all_queues_mes(struct device_queue_manager *dqm) { struct amdgpu_device *adev = (struct amdgpu_device *)dqm->dev->adev; @@ -1742,7 +1773,7 @@ static int halt_cpsch(struct device_queue_manager *dqm) KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0, USE_DEFAULT_GRACE_PERIOD, false); else - ret = remove_all_queues_mes(dqm); + ret = remove_all_kfd_queues_mes(dqm); } dqm->sched_halt = true; dqm_unlock(dqm); @@ -1768,6 +1799,9 @@ static int unhalt_cpsch(struct device_queue_manager *dqm) ret = execute_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0, USE_DEFAULT_GRACE_PERIOD); + else + ret = add_all_kfd_queues_mes(dqm); + dqm_unlock(dqm); return ret; @@ -1867,7 +1901,7 @@ static int stop_cpsch(struct device_queue_manager *dqm) if (!dqm->dev->kfd->shared_resources.enable_mes) unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0, USE_DEFAULT_GRACE_PERIOD, false); else - remove_all_queues_mes(dqm); + remove_all_kfd_queues_mes(dqm); dqm->sched_running = false;