On Fri, Mar 20, 2026 at 4:09 PM Amber Lin <[email protected]> wrote:
>
> Add detect_and_reset_hung_queues support for user mode compute queues on GC 12.1.
>
> Signed-off-by: Amber Lin <[email protected]>

Reviewed-by: Alex Deucher <[email protected]>

> ---
>  drivers/gpu/drm/amd/amdgpu/mes_v12_1.c | 35 +++++++++++++++++++++++++-
>  1 file changed, 34 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v12_1.c 
> b/drivers/gpu/drm/amd/amdgpu/mes_v12_1.c
> index 7aea3a50e712..ac9e26b8bb52 100644
> --- a/drivers/gpu/drm/amd/amdgpu/mes_v12_1.c
> +++ b/drivers/gpu/drm/amd/amdgpu/mes_v12_1.c
> @@ -46,6 +46,8 @@ static int mes_v12_1_kiq_hw_fini(struct amdgpu_device 
> *adev, uint32_t xcc_id);
>  static int mes_v12_1_self_test(struct amdgpu_device *adev, int xcc_id);
>
>  #define MES_EOP_SIZE   2048
> +#define MES12_HUNG_DB_OFFSET_ARRAY_SIZE 8 /* [0:3] = db offset [4:7] hqd 
> info */
> +#define MES12_HUNG_HQD_INFO_OFFSET      4
>
>  #define regCP_HQD_IB_CONTROL_MES_12_1_DEFAULT 0x100000
>  #define XCC_MID_MASK 0x41000000
> @@ -229,7 +231,7 @@ static int 
> mes_v12_1_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
>                         xcc_id, pipe, x_pkt->header.opcode);
>
>         r = amdgpu_fence_wait_polling(ring, seq, timeout);
> -       if (r < 1 || !*status_ptr) {
> +       if (r < 1 || !lower_32_bits(*status_ptr)) {
>                 if (misc_op_str)
>                         dev_err(adev->dev,
>                                 "MES(%d, %d) failed to respond to msg=%s 
> (%s)\n",
> @@ -858,6 +860,33 @@ static int mes_v12_1_reset_legacy_queue(struct 
> amdgpu_mes *mes,
>  }
>  #endif
>
> +static int mes_v12_1_detect_and_reset_hung_queues(struct amdgpu_mes *mes,
> +                                                 struct 
> mes_detect_and_reset_queue_input *input)
> +{
> +       union MESAPI__RESET mes_reset_queue_pkt;
> +
> +       memset(&mes_reset_queue_pkt, 0, sizeof(mes_reset_queue_pkt));
> +
> +       mes_reset_queue_pkt.header.type = MES_API_TYPE_SCHEDULER;
> +       mes_reset_queue_pkt.header.opcode = MES_SCH_API_RESET;
> +       mes_reset_queue_pkt.header.dwsize = API_FRAME_SIZE_IN_DWORDS;
> +
> +       mes_reset_queue_pkt.queue_type =
> +               convert_to_mes_queue_type(input->queue_type);
> +       mes_reset_queue_pkt.doorbell_offset_addr =
> +               mes->hung_queue_db_array_gpu_addr[0];
> +
> +       if (input->detect_only)
> +               mes_reset_queue_pkt.hang_detect_only = 1;
> +       else
> +               mes_reset_queue_pkt.hang_detect_then_reset = 1;
> +
> +       return mes_v12_1_submit_pkt_and_poll_completion(mes,
> +                       input->xcc_id, AMDGPU_MES_SCHED_PIPE,
> +                       &mes_reset_queue_pkt, sizeof(mes_reset_queue_pkt),
> +                       offsetof(union MESAPI__RESET, api_status));
> +}
> +
>  static int mes_v12_inv_tlb_convert_hub_id(uint8_t id)
>  {
>         /*
> @@ -915,6 +944,7 @@ static const struct amdgpu_mes_funcs mes_v12_1_funcs = {
>         .resume_gang = mes_v12_1_resume_gang,
>         .misc_op = mes_v12_1_misc_op,
>         .reset_hw_queue = mes_v12_1_reset_hw_queue,
> +       .detect_and_reset_hung_queues = 
> mes_v12_1_detect_and_reset_hung_queues,
>         .invalidate_tlbs_pasid = mes_v12_1_inv_tlbs_pasid,
>  };
>
> @@ -1931,6 +1961,9 @@ static int mes_v12_1_early_init(struct amdgpu_ip_block 
> *ip_block)
>         struct amdgpu_device *adev = ip_block->adev;
>         int pipe, r;
>
> +       adev->mes.hung_queue_db_array_size = MES12_HUNG_DB_OFFSET_ARRAY_SIZE;
> +       adev->mes.hung_queue_hqd_info_offset = MES12_HUNG_HQD_INFO_OFFSET;
> +
>         for (pipe = 0; pipe < AMDGPU_MAX_MES_PIPES; pipe++) {
>                 r = amdgpu_mes_init_microcode(adev, pipe);
>                 if (r)
> --
> 2.43.0
>

Reply via email to