On Fri, Mar 20, 2026 at 4:09 PM Amber Lin <[email protected]> wrote:
>
> Add detect_and_reset_hung_queues to user mode compute queues on GC 12.1.
>
> Signed-off-by: Amber Lin <[email protected]>
Reviewed-by: Alex Deucher <[email protected]> > --- > drivers/gpu/drm/amd/amdgpu/mes_v12_1.c | 35 +++++++++++++++++++++++++- > 1 file changed, 34 insertions(+), 1 deletion(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v12_1.c > b/drivers/gpu/drm/amd/amdgpu/mes_v12_1.c > index 7aea3a50e712..ac9e26b8bb52 100644 > --- a/drivers/gpu/drm/amd/amdgpu/mes_v12_1.c > +++ b/drivers/gpu/drm/amd/amdgpu/mes_v12_1.c > @@ -46,6 +46,8 @@ static int mes_v12_1_kiq_hw_fini(struct amdgpu_device > *adev, uint32_t xcc_id); > static int mes_v12_1_self_test(struct amdgpu_device *adev, int xcc_id); > > #define MES_EOP_SIZE 2048 > +#define MES12_HUNG_DB_OFFSET_ARRAY_SIZE 8 /* [0:3] = db offset [4:7] hqd > info */ > +#define MES12_HUNG_HQD_INFO_OFFSET 4 > > #define regCP_HQD_IB_CONTROL_MES_12_1_DEFAULT 0x100000 > #define XCC_MID_MASK 0x41000000 > @@ -229,7 +231,7 @@ static int > mes_v12_1_submit_pkt_and_poll_completion(struct amdgpu_mes *mes, > xcc_id, pipe, x_pkt->header.opcode); > > r = amdgpu_fence_wait_polling(ring, seq, timeout); > - if (r < 1 || !*status_ptr) { > + if (r < 1 || !lower_32_bits(*status_ptr)) { > if (misc_op_str) > dev_err(adev->dev, > "MES(%d, %d) failed to respond to msg=%s > (%s)\n", > @@ -858,6 +860,33 @@ static int mes_v12_1_reset_legacy_queue(struct > amdgpu_mes *mes, > } > #endif > > +static int mes_v12_1_detect_and_reset_hung_queues(struct amdgpu_mes *mes, > + struct > mes_detect_and_reset_queue_input *input) > +{ > + union MESAPI__RESET mes_reset_queue_pkt; > + > + memset(&mes_reset_queue_pkt, 0, sizeof(mes_reset_queue_pkt)); > + > + mes_reset_queue_pkt.header.type = MES_API_TYPE_SCHEDULER; > + mes_reset_queue_pkt.header.opcode = MES_SCH_API_RESET; > + mes_reset_queue_pkt.header.dwsize = API_FRAME_SIZE_IN_DWORDS; > + > + mes_reset_queue_pkt.queue_type = > + convert_to_mes_queue_type(input->queue_type); > + mes_reset_queue_pkt.doorbell_offset_addr = > + mes->hung_queue_db_array_gpu_addr[0]; > + > + if (input->detect_only) > + 
mes_reset_queue_pkt.hang_detect_only = 1; > + else > + mes_reset_queue_pkt.hang_detect_then_reset = 1; > + > + return mes_v12_1_submit_pkt_and_poll_completion(mes, > + input->xcc_id, AMDGPU_MES_SCHED_PIPE, > + &mes_reset_queue_pkt, sizeof(mes_reset_queue_pkt), > + offsetof(union MESAPI__RESET, api_status)); > +} > + > static int mes_v12_inv_tlb_convert_hub_id(uint8_t id) > { > /* > @@ -915,6 +944,7 @@ static const struct amdgpu_mes_funcs mes_v12_1_funcs = { > .resume_gang = mes_v12_1_resume_gang, > .misc_op = mes_v12_1_misc_op, > .reset_hw_queue = mes_v12_1_reset_hw_queue, > + .detect_and_reset_hung_queues = > mes_v12_1_detect_and_reset_hung_queues, > .invalidate_tlbs_pasid = mes_v12_1_inv_tlbs_pasid, > }; > > @@ -1931,6 +1961,9 @@ static int mes_v12_1_early_init(struct amdgpu_ip_block > *ip_block) > struct amdgpu_device *adev = ip_block->adev; > int pipe, r; > > + adev->mes.hung_queue_db_array_size = MES12_HUNG_DB_OFFSET_ARRAY_SIZE; > + adev->mes.hung_queue_hqd_info_offset = MES12_HUNG_HQD_INFO_OFFSET; > + > for (pipe = 0; pipe < AMDGPU_MAX_MES_PIPES; pipe++) { > r = amdgpu_mes_init_microcode(adev, pipe); > if (r) > -- > 2.43.0 >
