On GC 12.x dual-pipe MES, kernel compute queues (KCQs) may be registered on the KIQ pipe while suspend_all/resume_all is driven from the sched pipe. That can leave queue ownership inconsistent across suspend/resume.
Add an explicit KCQ migration flow in amdgpu_mes_suspend()/resume():
- unmap KCQs from their original pipe/queue
- remap them to sched pipe temporary slots before suspend_all
- restore original pipe/queue ownership after resume_all

Track per-ring migration state (valid bit, temporary sched queue id,
and original pipe/queue), and add rollback handling for partial
failures. If map-to-sched fails for the current ring, restore it
immediately before rolling back earlier migrated rings, so the current
ring is not left unmapped. If seeding the sched-pipe occupancy map fails
on a later XCC, roll back the XCCs that were already migrated instead of
returning early.

Signed-off-by: Jesse Zhang <[email protected]>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c       | 332 +++++++++++++++++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h       |   7 +-
 drivers/gpu/drm/amd/amdgpu/mes_v12_1.c        |   2 +-
 .../drm/amd/amdkfd/kfd_device_queue_manager.c |   2 +-
 4 files changed, 338 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
index 0d4c77c1b4b5..fe179641e6aa 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
@@ -114,6 +114,9 @@ int amdgpu_mes_init(struct amdgpu_device *adev)
 				adev->gfx.disable_kq ? 0 : adev->gfx.num_compute_rings);
 
 	adev->mes.adev = adev;
+	adev->mes.kcq_migrated_to_sched_pipe = false;
+	memset(adev->mes.kcq_sched_migration_valid, 0,
+	       sizeof(adev->mes.kcq_sched_migration_valid));
 
 	ida_init(&adev->mes.doorbell_ida);
 	spin_lock_init(&adev->mes.queue_id_lock);
@@ -281,7 +284,301 @@ void amdgpu_mes_fini(struct amdgpu_device *adev)
 	mutex_destroy(&adev->mes.mutex_hidden);
 }
 
-int amdgpu_mes_suspend(struct amdgpu_device *adev)
+/* Map @ring as a legacy queue at (@pipe_id, @queue_id) on @xcc_id, under the MES lock. */
+static int amdgpu_mes_map_legacy_queue_on_pipe(struct amdgpu_device *adev,
+					       struct amdgpu_ring *ring,
+					       u32 pipe_id,
+					       u32 queue_id,
+					       u32 xcc_id)
+{
+	struct mes_map_legacy_queue_input queue_input;
+	int r;
+
+	memset(&queue_input, 0, sizeof(queue_input));
+
+	queue_input.xcc_id = xcc_id;
+	queue_input.queue_type = ring->funcs->type;
+	queue_input.doorbell_offset = ring->doorbell_index;
+	queue_input.pipe_id = pipe_id;
+	queue_input.queue_id = queue_id;
+	queue_input.mqd_addr = amdgpu_bo_gpu_offset(ring->mqd_obj);
+	queue_input.wptr_addr = ring->wptr_gpu_addr;
+
+	amdgpu_mes_lock(&adev->mes);
+	r = adev->mes.funcs->map_legacy_queue(&adev->mes, &queue_input);
+	amdgpu_mes_unlock(&adev->mes);
+
+	return r;
+}
+
+/* Unmap @ring from (@pipe_id, @queue_id) on @xcc_id with RESET_QUEUES, under the MES lock. */
+static int amdgpu_mes_unmap_legacy_queue_on_pipe(struct amdgpu_device *adev,
+						 struct amdgpu_ring *ring,
+						 u32 pipe_id,
+						 u32 queue_id,
+						 u32 xcc_id)
+{
+	struct mes_unmap_legacy_queue_input queue_input;
+	int r;
+
+	memset(&queue_input, 0, sizeof(queue_input));
+
+	queue_input.xcc_id = xcc_id;
+	queue_input.action = RESET_QUEUES;
+	queue_input.queue_type = ring->funcs->type;
+	queue_input.doorbell_offset = ring->doorbell_index;
+	queue_input.pipe_id = pipe_id;
+	queue_input.queue_id = queue_id;
+
+	amdgpu_mes_lock(&adev->mes);
+	r = adev->mes.funcs->unmap_legacy_queue(&adev->mes, &queue_input);
+	amdgpu_mes_unlock(&adev->mes);
+
+	return r;
+}
+
+/*
+ * Move every KCQ that is not on AMDGPU_MES_SCHED_PIPE to a free sched-pipe
+ * slot, recording the original pipe/queue so it can be restored later.
+ * On any failure, rings migrated so far are moved back; returns 0 or a
+ * negative errno.
+ */
+static int amdgpu_mes_migrate_kcq_to_sched_pipe(struct amdgpu_device *adev)
+{
+	int num_xcc, xcc_id, i, j, r, ret = 0;
+	int rollback_xcc = -1, rollback_i = -1;
+	u32 max_sched_slots;
+
+	num_xcc = adev->gfx.xcc_mask ? NUM_XCC(adev->gfx.xcc_mask) : 1;
+	max_sched_slots = min_t(u32, AMDGPU_MAX_COMPUTE_QUEUES,
+				adev->gfx.mec.num_queue_per_pipe ?
+				adev->gfx.mec.num_queue_per_pipe : AMDGPU_MAX_COMPUTE_QUEUES);
+	memset(adev->mes.kcq_sched_migration_valid, 0,
+	       sizeof(adev->mes.kcq_sched_migration_valid));
+
+	for (xcc_id = 0; xcc_id < num_xcc; xcc_id++) {
+		bool sched_slot_used[AMDGPU_MAX_COMPUTE_QUEUES] = { false };
+
+		/* Seed pipe0 occupancy from kernel compute rings in this XCC. */
+
+		for (i = 0; i < adev->gfx.num_compute_rings; i++) {
+			j = i + xcc_id * adev->gfx.num_compute_rings;
+			if (adev->gfx.compute_ring[j].pipe != AMDGPU_MES_SCHED_PIPE)
+				continue;
+
+			if (adev->gfx.compute_ring[j].queue >= max_sched_slots) {
+				/* XCCs migrated earlier must be moved back. */
+				ret = -EINVAL;
+				rollback_xcc = xcc_id;
+				rollback_i = -1;
+				goto rollback;
+			}
+
+			sched_slot_used[adev->gfx.compute_ring[j].queue] = true;
+		}
+
+		for (i = 0; i < adev->gfx.num_compute_rings; i++) {
+			struct amdgpu_ring *ring;
+			u32 orig_pipe;
+			u32 orig_queue;
+			u32 sched_queue;
+
+			j = i + xcc_id * adev->gfx.num_compute_rings;
+			ring = &adev->gfx.compute_ring[j];
+			orig_pipe = ring->pipe;
+			orig_queue = ring->queue;
+
+			if (orig_pipe == AMDGPU_MES_SCHED_PIPE)
+				continue;
+
+			/* Prefer the original queue id; otherwise pick the first free sched slot. */
+			sched_queue = ring->queue;
+			if (sched_queue >= max_sched_slots || sched_slot_used[sched_queue]) {
+				for (sched_queue = 0; sched_queue < max_sched_slots; sched_queue++)
+					if (!sched_slot_used[sched_queue])
+						break;
+
+				if (sched_queue >= max_sched_slots) {
+					ret = -ENOSPC;
+					rollback_xcc = xcc_id;
+					rollback_i = i - 1;
+					goto rollback;
+				}
+			}
+
+			adev->mes.kcq_migration_orig_pipe[j] = orig_pipe;
+			adev->mes.kcq_migration_orig_queue[j] = orig_queue;
+
+			r = amdgpu_mes_unmap_legacy_queue_on_pipe(adev, ring,
+								  orig_pipe, orig_queue, xcc_id);
+			if (r) {
+				ret = r;
+				rollback_xcc = xcc_id;
+				rollback_i = i - 1;
+				goto rollback;
+			}
+
+			r = amdgpu_mes_map_legacy_queue_on_pipe(adev, ring,
+								AMDGPU_MES_SCHED_PIPE,
+								sched_queue,
+								xcc_id);
+			if (r) {
+				/* Restore the current ring before rolling back earlier migrated rings. */
+				int rr;
+
+				rr = amdgpu_mes_map_legacy_queue_on_pipe(adev, ring,
+									 orig_pipe,
+									 orig_queue,
+									 xcc_id);
+				if (rr)
+					dev_err(adev->dev,
+						"failed to restore current KCQ during migrate rollback xcc:%d queue:%u pipe:%u\n",
+						xcc_id, orig_queue, orig_pipe);
+
+				ret = rr ? rr : r;
+				rollback_xcc = xcc_id;
+				rollback_i = i - 1;
+				goto rollback;
+			}
+
+			adev->mes.kcq_sched_migration_queue_id[j] = sched_queue;
+			adev->mes.kcq_sched_migration_valid[j] = true;
+			ring->pipe = AMDGPU_MES_SCHED_PIPE;
+			ring->queue = sched_queue;
+			sched_slot_used[sched_queue] = true;
+		}
+	}
+
+	return 0;
+
+rollback:
+	/* Roll back only rings that were recorded as migrated. */
+	for (xcc_id = rollback_xcc; xcc_id >= 0; xcc_id--) {
+		int start_i = (xcc_id == rollback_xcc) ? rollback_i :
+			(adev->gfx.num_compute_rings - 1);
+
+		for (; start_i >= 0; start_i--) {
+			struct amdgpu_ring *ring;
+			u32 orig_pipe;
+			u32 orig_queue;
+			u32 sched_queue;
+			int rr;
+
+			j = start_i + xcc_id * adev->gfx.num_compute_rings;
+			ring = &adev->gfx.compute_ring[j];
+
+			if (!adev->mes.kcq_sched_migration_valid[j])
+				continue;
+
+			sched_queue = adev->mes.kcq_sched_migration_queue_id[j];
+			orig_pipe = adev->mes.kcq_migration_orig_pipe[j];
+			orig_queue = adev->mes.kcq_migration_orig_queue[j];
+
+			rr = amdgpu_mes_unmap_legacy_queue_on_pipe(adev, ring,
+								   AMDGPU_MES_SCHED_PIPE,
+								   sched_queue,
+								   xcc_id);
+			if (rr) {
+				if (!ret)
+					ret = rr;
+				/*
+				 * Skip map to avoid double-mapping if unmap from
+				 * sched pipe failed. Leave valid flag set so a
+				 * subsequent restore attempt can retry.
+				 */
+				continue;
+			}
+
+			rr = amdgpu_mes_map_legacy_queue_on_pipe(adev, ring,
+								 orig_pipe,
+								 orig_queue,
+								 xcc_id);
+			if (rr) {
+				if (!ret)
+					ret = rr;
+			} else {
+				ring->pipe = orig_pipe;
+				ring->queue = orig_queue;
+				adev->mes.kcq_sched_migration_valid[j] = false;
+			}
+		}
+	}
+
+	if (!ret)
+		memset(adev->mes.kcq_sched_migration_valid, 0,
+		       sizeof(adev->mes.kcq_sched_migration_valid));
+
+	return ret;
+}
+
+/*
+ * Move every ring flagged in kcq_sched_migration_valid[] back to its recorded
+ * pipe/queue. Keeps going after a failure and returns the first error.
+ */
+static int amdgpu_mes_restore_kcq_pipe_ownership(struct amdgpu_device *adev)
+{
+	int num_xcc, xcc_id, i, j, r, ret = 0;
+
+	num_xcc = adev->gfx.xcc_mask ? NUM_XCC(adev->gfx.xcc_mask) : 1;
+
+	for (xcc_id = 0; xcc_id < num_xcc; xcc_id++) {
+		for (i = 0; i < adev->gfx.num_compute_rings; i++) {
+			struct amdgpu_ring *ring;
+			u32 orig_pipe;
+			u32 orig_queue;
+			u32 sched_queue;
+
+			j = i + xcc_id * adev->gfx.num_compute_rings;
+			ring = &adev->gfx.compute_ring[j];
+
+			if (!adev->mes.kcq_sched_migration_valid[j])
+				continue;
+
+			sched_queue = adev->mes.kcq_sched_migration_queue_id[j];
+			orig_pipe = adev->mes.kcq_migration_orig_pipe[j];
+			orig_queue = adev->mes.kcq_migration_orig_queue[j];
+
+			if (ring->pipe != AMDGPU_MES_SCHED_PIPE ||
+			    ring->queue != sched_queue) {
+				if (!ret)
+					ret = -EINVAL;
+				continue;
+			}
+
+			/* Move the queue from temporary pipe0 slot back to its original location. */
+			r = amdgpu_mes_unmap_legacy_queue_on_pipe(adev, ring,
+								  AMDGPU_MES_SCHED_PIPE,
+								  sched_queue,
+								  xcc_id);
+			if (r && !ret)
+				ret = r;
+			if (r)
+				continue;
+
+			r = amdgpu_mes_map_legacy_queue_on_pipe(adev, ring,
+								orig_pipe,
+								orig_queue,
+								xcc_id);
+			if (r && !ret)
+				ret = r;
+
+			if (!r) {
+				ring->pipe = orig_pipe;
+				ring->queue = orig_queue;
+				adev->mes.kcq_sched_migration_valid[j] = false;
+			}
+		}
+	}
+
+	if (!ret)
+		memset(adev->mes.kcq_sched_migration_valid, 0,
+		       sizeof(adev->mes.kcq_sched_migration_valid));
+
+	return ret;
+}
+
+int amdgpu_mes_suspend(struct amdgpu_device *adev,
+		       bool migrate_kcq_to_sched_pipe)
 {
 	struct mes_suspend_gang_input input;
 	int r;
@@ -289,6 +586,19 @@ int amdgpu_mes_suspend(struct amdgpu_device *adev)
 	if (!amdgpu_mes_suspend_resume_all_supported(adev))
 		return 0;
 
+	/*
+	 * Explicitly migrate KCQ ownership to scheduler pipe before suspend_all.
+	 * This keeps firmware restore path consistent without changing steady-state
+	 * legacy queue map/unmap semantics.
+	 */
+	if (adev->mes.enable_legacy_queue_map &&
+	    migrate_kcq_to_sched_pipe) {
+		r = amdgpu_mes_migrate_kcq_to_sched_pipe(adev);
+		if (r)
+			return r;
+		adev->mes.kcq_migrated_to_sched_pipe = true;
+	}
+
 	memset(&input, 0x0, sizeof(struct mes_suspend_gang_input));
 	input.suspend_all_gangs = 1;
 
@@ -299,9 +609,18 @@ int amdgpu_mes_suspend(struct amdgpu_device *adev)
 	amdgpu_mes_lock(&adev->mes);
 	r = adev->mes.funcs->suspend_gang(&adev->mes, &input);
 	amdgpu_mes_unlock(&adev->mes);
-	if (r)
+	if (r) {
 		dev_err(adev->dev, "failed to suspend all gangs");
 
+		if (adev->mes.kcq_migrated_to_sched_pipe) {
+			if (amdgpu_mes_restore_kcq_pipe_ownership(adev))
+				dev_err(adev->dev,
+					"failed to restore KCQ ownership after suspend failure\n");
+			else
+				adev->mes.kcq_migrated_to_sched_pipe = false;
+		}
+	}
+
 	return r;
 }
 
@@ -326,6 +645,15 @@ int amdgpu_mes_resume(struct amdgpu_device *adev)
 	if (r)
 		dev_err(adev->dev, "failed to resume all gangs");
 
+	if (!r && adev->mes.kcq_migrated_to_sched_pipe) {
+		r = amdgpu_mes_restore_kcq_pipe_ownership(adev);
+		if (r)
+			dev_err(adev->dev,
+				"failed to restore KCQ ownership after resume\n");
+		else
+			adev->mes.kcq_migrated_to_sched_pipe = false;
+	}
+
 	return r;
 }
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
index f80e3aca9c78..fa8ef0ed63db 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
@@ -85,6 +85,11 @@ struct amdgpu_mes {
 	uint32_t			kiq_version;
 	uint32_t			fw_version[AMDGPU_MAX_MES_PIPES];
 	bool				enable_legacy_queue_map;
+	bool				kcq_migrated_to_sched_pipe;
+	u32				kcq_sched_migration_queue_id[AMDGPU_MAX_COMPUTE_RINGS * AMDGPU_MAX_GC_INSTANCES];
+	u32				kcq_migration_orig_pipe[AMDGPU_MAX_COMPUTE_RINGS * AMDGPU_MAX_GC_INSTANCES];
+	u32				kcq_migration_orig_queue[AMDGPU_MAX_COMPUTE_RINGS * AMDGPU_MAX_GC_INSTANCES];
+	bool				kcq_sched_migration_valid[AMDGPU_MAX_COMPUTE_RINGS * AMDGPU_MAX_GC_INSTANCES];
 
 	uint32_t			total_max_queue;
 	uint32_t			max_doorbell_slices;
@@ -429,7 +434,7 @@ int amdgpu_mes_init_microcode(struct amdgpu_device *adev, int pipe);
 int amdgpu_mes_init(struct amdgpu_device *adev);
 void amdgpu_mes_fini(struct amdgpu_device *adev);
 
-int amdgpu_mes_suspend(struct amdgpu_device *adev);
+int amdgpu_mes_suspend(struct amdgpu_device *adev, bool migrate_kcq_to_sched_pipe);
 int amdgpu_mes_resume(struct amdgpu_device *adev);
 
 int amdgpu_mes_map_legacy_queue(struct amdgpu_device *adev,
diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v12_1.c b/drivers/gpu/drm/amd/amdgpu/mes_v12_1.c
index 0e9089544769..177a78d9aa68 100644
--- a/drivers/gpu/drm/amd/amdgpu/mes_v12_1.c
+++ b/drivers/gpu/drm/amd/amdgpu/mes_v12_1.c
@@ -1899,7 +1899,7 @@ static int mes_v12_1_suspend(struct amdgpu_ip_block *ip_block)
 {
 	int r;
 
-	r = amdgpu_mes_suspend(ip_block->adev);
+	r = amdgpu_mes_suspend(ip_block->adev, false);
 	if (r)
 		return r;
 
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index 18bc5ba25f8f..e5167cbb77f8 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -367,7 +367,7 @@ static int suspend_all_queues_mes(struct device_queue_manager *dqm)
 	if (!down_read_trylock(&adev->reset_domain->sem))
 		return -EIO;
 
-	r = amdgpu_mes_suspend(adev);
+	r = amdgpu_mes_suspend(adev, false);
 	up_read(&adev->reset_domain->sem);
 
 	if (r) {
-- 
2.49.0
