On 2025-07-22 13:47, David Yat Sin wrote:
GPUs with multi-xcc have multiple MQDs per queue. This patch saves and
restores all the MQDs within the partition.
Signed-off-by: David Yat Sin <david.yat...@amd.com>
---
.../drm/amd/amdkfd/kfd_device_queue_manager.c | 2 +-
.../gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c | 61 ++++++++++++++++---
.../amd/amdkfd/kfd_process_queue_manager.c | 20 ++++--
3 files changed, 67 insertions(+), 16 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index 2d91027e2a74..6c5c7c1bf5ed 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -2725,7 +2725,7 @@ static void get_queue_checkpoint_info(struct
device_queue_manager *dqm,
dqm_lock(dqm);
mqd_mgr = dqm->mqd_mgrs[mqd_type];
- *mqd_size = mqd_mgr->mqd_size;
+ *mqd_size = mqd_mgr->mqd_size * NUM_XCC(mqd_mgr->dev->xcc_mask);
*ctl_stack_size = 0;
if (q->properties.type == KFD_QUEUE_TYPE_COMPUTE && mqd_mgr->get_checkpoint_info)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c
b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c
index 97933d2a3803..f2dee320fada 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c
@@ -373,7 +373,7 @@ static void get_checkpoint_info(struct mqd_manager *mm,
void *mqd, u32 *ctl_stac
{
struct v9_mqd *m = get_mqd(mqd);
- *ctl_stack_size = m->cp_hqd_cntl_stack_size;
+ *ctl_stack_size = m->cp_hqd_cntl_stack_size *
NUM_XCC(mm->dev->xcc_mask);
}
static void checkpoint_mqd(struct mqd_manager *mm, void *mqd, void *mqd_dst, void *ctl_stack_dst)
@@ -388,6 +388,24 @@ static void checkpoint_mqd(struct mqd_manager *mm, void
*mqd, void *mqd_dst, voi
memcpy(ctl_stack_dst, ctl_stack, m->cp_hqd_cntl_stack_size);
}
+static void checkpoint_mqd_v9_4_3(struct mqd_manager *mm,
+ void *mqd,
+ void *mqd_dst,
+ void
*ctl_stack_dst)
+{
+ struct v9_mqd *m;
+ int xcc;
+ uint64_t size = get_mqd(mqd)->cp_mqd_stride_size;
+
+ for (xcc = 0; xcc < NUM_XCC(mm->dev->xcc_mask); xcc++) {
+ m = get_mqd(mqd + size * xcc);
+
+ checkpoint_mqd(mm, m,
+ (uint8_t *)mqd_dst + sizeof(*m) * xcc,
+ (uint8_t *)ctl_stack_dst +
m->cp_hqd_cntl_stack_size * xcc);
+ }
+}
+
static void restore_mqd(struct mqd_manager *mm, void **mqd,
struct kfd_mem_obj *mqd_mem_obj, uint64_t *gart_addr,
struct queue_properties *qp,
@@ -764,13 +782,35 @@ static void restore_mqd_v9_4_3(struct mqd_manager *mm,
void **mqd,
const void *mqd_src,
const void *ctl_stack_src, u32 ctl_stack_size)
{
- restore_mqd(mm, mqd, mqd_mem_obj, gart_addr, qp, mqd_src,
ctl_stack_src, ctl_stack_size);
- if (amdgpu_sriov_multi_vf_mode(mm->dev->adev)) {
- struct v9_mqd *m;
+ struct kfd_mem_obj xcc_mqd_mem_obj;
+ u32 mqd_ctl_stack_size;
+ struct v9_mqd *m;
+ u32 num_xcc;
+ int xcc;
- m = (struct v9_mqd *) mqd_mem_obj->cpu_ptr;
- m->cp_hqd_pq_doorbell_control |= 1 <<
-
CP_HQD_PQ_DOORBELL_CONTROL__DOORBELL_MODE__SHIFT;
Is this special handling for SRIOV no longer needed? Maybe it's enough
that it's handled during queue initialization. Then checkpoint and
restore just preserves the updated doorbell mode. Other than that, this
patch looks good to me.
Reviewed-by: Felix Kuehling <felix.kuehl...@amd.com>
+ uint64_t offset = mm->mqd_stride(mm, qp);
+
+ mm->dev->dqm->current_logical_xcc_start++;
+
+ num_xcc = NUM_XCC(mm->dev->xcc_mask);
+ mqd_ctl_stack_size = ctl_stack_size / num_xcc;
+
+ memset(&xcc_mqd_mem_obj, 0x0, sizeof(struct kfd_mem_obj));
+
+ /* Set the MQD pointer and gart address to XCC0 MQD */
+ *mqd = mqd_mem_obj->cpu_ptr;
+ if (gart_addr)
+ *gart_addr = mqd_mem_obj->gpu_addr;
+
+ for (xcc = 0; xcc < num_xcc; xcc++) {
+ get_xcc_mqd(mqd_mem_obj, &xcc_mqd_mem_obj, offset * xcc);
+ restore_mqd(mm, (void **)&m,
+ &xcc_mqd_mem_obj,
+ NULL,
+ qp,
+ (uint8_t *)mqd_src + xcc * sizeof(*m),
+ (uint8_t *)ctl_stack_src + xcc *
mqd_ctl_stack_size,
+ mqd_ctl_stack_size);
}
}
static int destroy_mqd_v9_4_3(struct mqd_manager *mm, void *mqd,
@@ -906,7 +946,6 @@ struct mqd_manager *mqd_manager_init_v9(enum KFD_MQD_TYPE
type,
mqd->free_mqd = kfd_free_mqd_cp;
mqd->is_occupied = kfd_is_occupied_cp;
mqd->get_checkpoint_info = get_checkpoint_info;
- mqd->checkpoint_mqd = checkpoint_mqd;
mqd->mqd_size = sizeof(struct v9_mqd);
mqd->mqd_stride = mqd_stride_v9;
#if defined(CONFIG_DEBUG_FS)
@@ -918,16 +957,18 @@ struct mqd_manager *mqd_manager_init_v9(enum KFD_MQD_TYPE
type,
mqd->init_mqd = init_mqd_v9_4_3;
mqd->load_mqd = load_mqd_v9_4_3;
mqd->update_mqd = update_mqd_v9_4_3;
- mqd->restore_mqd = restore_mqd_v9_4_3;
mqd->destroy_mqd = destroy_mqd_v9_4_3;
mqd->get_wave_state = get_wave_state_v9_4_3;
+ mqd->checkpoint_mqd = checkpoint_mqd_v9_4_3;
+ mqd->restore_mqd = restore_mqd_v9_4_3;
} else {
mqd->init_mqd = init_mqd;
mqd->load_mqd = load_mqd;
mqd->update_mqd = update_mqd;
- mqd->restore_mqd = restore_mqd;
mqd->destroy_mqd = kfd_destroy_mqd_cp;
mqd->get_wave_state = get_wave_state;
+ mqd->checkpoint_mqd = checkpoint_mqd;
+ mqd->restore_mqd = restore_mqd;
}
break;
case KFD_MQD_TYPE_HIQ:
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
index fe4c48930aad..bae200035ff4 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
@@ -922,7 +922,10 @@ static int criu_checkpoint_queues_device(struct
kfd_process_device *pdd,
q_data = (struct kfd_criu_queue_priv_data *)q_private_data;
- /* data stored in this order: priv_data, mqd, ctl_stack */
+ /*
+ * data stored in this order:
+ * priv_data, mqd[xcc0], mqd[xcc1],..., ctl_stack[xcc0],
ctl_stack[xcc1]...
+ */
q_data->mqd_size = mqd_size;
q_data->ctl_stack_size = ctl_stack_size;
@@ -971,7 +974,7 @@ int kfd_criu_checkpoint_queues(struct kfd_process *p,
}
static void set_queue_properties_from_criu(struct queue_properties *qp,
- struct kfd_criu_queue_priv_data
*q_data)
+ struct kfd_criu_queue_priv_data
*q_data, uint32_t num_xcc)
{
qp->is_interop = false;
qp->queue_percent = q_data->q_percent;
@@ -985,7 +988,11 @@ static void set_queue_properties_from_criu(struct
queue_properties *qp,
qp->eop_ring_buffer_size = q_data->eop_ring_buffer_size;
qp->ctx_save_restore_area_address =
q_data->ctx_save_restore_area_address;
qp->ctx_save_restore_area_size = q_data->ctx_save_restore_area_size;
- qp->ctl_stack_size = q_data->ctl_stack_size;
+ if (q_data->type == KFD_QUEUE_TYPE_COMPUTE)
+ qp->ctl_stack_size = q_data->ctl_stack_size / num_xcc;
+ else
+ qp->ctl_stack_size = q_data->ctl_stack_size;
+
qp->type = q_data->type;
qp->format = q_data->format;
}
@@ -1045,12 +1052,15 @@ int kfd_criu_restore_queue(struct kfd_process *p,
goto exit;
}
- /* data stored in this order: mqd, ctl_stack */
+ /*
+ * data stored in this order:
+ * mqd[xcc0], mqd[xcc1],..., ctl_stack[xcc0], ctl_stack[xcc1]...
+ */
mqd = q_extra_data;
ctl_stack = mqd + q_data->mqd_size;
memset(&qp, 0, sizeof(qp));
- set_queue_properties_from_criu(&qp, q_data);
+ set_queue_properties_from_criu(&qp, q_data,
NUM_XCC(pdd->dev->adev->gfx.xcc_mask));
print_queue_properties(&qp);