On 2024-07-26 11:30, Jonathan Kim wrote:
> To allow ROCm GDB to handle reset queues, raise an EC_QUEUE_RESET
> exception so that the debugger can subscribe to and query this
> exception.
> 
> Reset queues should still be considered suspendable, reported with the
> status flag KFD_DBG_QUEUE_RESET_MASK. However, they should not be
> resumable, since user space can no longer access a queue once it has
> been reset.
> 
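To make the suspend/resume asymmetry concrete, here is a minimal user-space
sketch of how a debugger might interpret the per-queue status bits returned
by a suspend call. The helper itself is hypothetical and not part of this
patch; only the three mask macros come from the uapi header.

#include <stdbool.h>
#include <stdint.h>
#include <linux/kfd_ioctl.h>

/* Hypothetical helper: classify one queue id returned by a suspend call. */
static bool queue_may_be_resumed(uint32_t queue_id)
{
	if (queue_id & KFD_DBG_QUEUE_INVALID_MASK)
		return false;	/* id was not suspended by this call */
	if (queue_id & KFD_DBG_QUEUE_RESET_MASK)
		return false;	/* suspended, but reset: never resume */
	if (queue_id & KFD_DBG_QUEUE_ERROR_MASK)
		return false;	/* suspend failed for this queue */
	return true;		/* eligible for a later resume */
}
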
> v2: move per-queue reset flag to this patch
> rebase based on patch 1 changes
> 
> Signed-off-by: Jonathan Kim <jonathan....@amd.com>
> ---
>  .../drm/amd/amdkfd/kfd_device_queue_manager.c | 31 ++++++++++++++++---
>  drivers/gpu/drm/amd/amdkfd/kfd_priv.h         |  1 +
>  include/uapi/linux/kfd_ioctl.h                |  4 +++
>  3 files changed, 31 insertions(+), 5 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> index e335703eff84..cb7b5bbf5c40 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> @@ -164,6 +164,10 @@ static void kfd_hws_hang(struct device_queue_manager *dqm)
>                       struct kfd_process_device *pdd = qpd_to_pdd(qpd);
>  
>                       pdd->has_reset_queue = true;
> +                     q->properties.is_reset = true;
> +                     kfd_dbg_ev_raise(KFD_EC_MASK(EC_QUEUE_RESET),
> +                                      q->process, q->device, q->doorbell_id,
> +                                      false, NULL, 0);
>               }
>       }
>  
> @@ -986,7 +990,7 @@ static int suspend_single_queue(struct device_queue_manager *dqm,
>  {
>       bool is_new;
>  
> -     if (q->properties.is_suspended)
> +     if (q->properties.is_suspended || q->properties.is_reset)
>               return 0;
>  
>       pr_debug("Suspending PASID %u queue [%i]\n",
> @@ -1007,6 +1011,9 @@ static int suspend_single_queue(struct device_queue_manager *dqm,
>               if (dqm->dev->kfd->shared_resources.enable_mes) {
>                       int r = remove_queue_mes(dqm, q, &pdd->qpd);
>  
> +                     if (q->properties.is_reset)
> +                             return 0;
> +
>                       if (r)
>                               return r;
>               }
> @@ -1967,10 +1974,14 @@ static void set_queue_as_reset(struct device_queue_manager *dqm, struct queue *q
>              q->properties.queue_id, q->process->pasid);
>  
>       pdd->has_reset_queue = true;
> +     q->properties.is_reset = true;
>       if (q->properties.is_active) {
>               q->properties.is_active = false;
>               decrement_queue_count(dqm, qpd, q);
>       }
> +
> +     kfd_dbg_ev_raise(KFD_EC_MASK(EC_QUEUE_RESET), q->process, q->device,
> +                      q->doorbell_id, false, NULL, 0);
>  }
>  
>  static int detect_queue_hang(struct device_queue_manager *dqm)
> @@ -3037,7 +3048,8 @@ int resume_queues(struct kfd_process *p,
>                                               queue_ids[q_idx] &=
>                                                       ~KFD_DBG_QUEUE_INVALID_MASK;
>                                       } else {
> -                                             queue_ids[q_idx] |=
> +                                             queue_ids[q_idx] |= q->properties.is_reset ?
> +                                                     KFD_DBG_QUEUE_RESET_MASK :
>                                                       KFD_DBG_QUEUE_ERROR_MASK;
>                                               break;
>                                       }
> @@ -3072,7 +3084,7 @@ int resume_queues(struct kfd_process *p,
>                                                       queue_ids);
>  
>                                       /* mask queue as error on resume fail */
> -                                     if (q_idx != QUEUE_NOT_FOUND)
> +                                     if (q_idx != QUEUE_NOT_FOUND && !q->properties.is_reset)
>                                               queue_ids[q_idx] |=
>                                                       KFD_DBG_QUEUE_ERROR_MASK;
>                               }
> @@ -3119,6 +3131,7 @@ int suspend_queues(struct kfd_process *p,
>               struct qcm_process_device *qpd = &pdd->qpd;
>               struct queue *q;
>               int r, per_device_suspended = 0;
> +             bool has_queue_reset_fail = false;
>  
>               mutex_lock(&p->event_mutex);
>               dqm_lock(dqm);
> @@ -3135,6 +3148,9 @@ int suspend_queues(struct kfd_process *p,
>  
>                               if (!err) {
>                                       queue_ids[q_idx] &= ~KFD_DBG_QUEUE_INVALID_MASK;
> +                                     if (q->properties.is_reset)
> +                                             queue_ids[q_idx] |= KFD_DBG_QUEUE_RESET_MASK;
> +
>                                       if (exception_clear_mask && is_mes)
>                                               q->properties.exception_status &=
>                                                       ~exception_clear_mask;
> @@ -3176,13 +3192,18 @@ int suspend_queues(struct kfd_process *p,
>                               continue;
>  
>                       /* mask queue as error on suspend fail */
> -                     if (r)
> +                     if (r && !q->properties.is_reset) {
> +                             has_queue_reset_fail = true;
>                               queue_ids[q_idx] |= KFD_DBG_QUEUE_ERROR_MASK;
> -                     else if (exception_clear_mask)
> +                     } else if (exception_clear_mask) {
>                               q->properties.exception_status &=
>                                                       ~exception_clear_mask;
> +                     }
>               }
>  
> +             if (!has_queue_reset_fail)
> +                     total_suspended += per_device_suspended;
> +
>               dqm_unlock(dqm);
>               mutex_unlock(&p->event_mutex);
>               amdgpu_device_flush_hdp(dqm->dev->adev, NULL);
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> index 892a85408c09..192e3102c152 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> @@ -504,6 +504,7 @@ struct queue_properties {
>       bool is_being_destroyed;
>       bool is_active;
>       bool is_gws;
> +     bool is_reset;
>       uint32_t pm4_target_xcc;
>       bool is_dbg_wa;
>       bool is_user_cu_masked;
> diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h
> index 285a36601dc9..4713f9a6796e 100644
> --- a/include/uapi/linux/kfd_ioctl.h
> +++ b/include/uapi/linux/kfd_ioctl.h
> @@ -875,6 +875,7 @@ enum kfd_dbg_trap_exception_code {
>       EC_QUEUE_PACKET_DISPATCH_WORK_GROUP_SIZE_INVALID = 21,
>       EC_QUEUE_PACKET_DISPATCH_REGISTER_INVALID = 22,
>       EC_QUEUE_PACKET_VENDOR_UNSUPPORTED = 23,
> +     EC_QUEUE_RESET = 29,
>       EC_QUEUE_PREEMPTION_ERROR = 30,

Do we really need to distinguish between preemption errors and resets? I mean, 
both are caused by preemption errors. The only difference is that in the 
"reset" case we succeeded in resetting only the affected queues.

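For reference, the distinction would surface to the debugger roughly as in the
sketch below. The handler name is made up; KFD_EC_MASK and the two exception
codes are from the uapi header, with EC_QUEUE_RESET added by this patch.

#include <stdint.h>
#include <stdio.h>
#include <linux/kfd_ioctl.h>

/* Hypothetical debugger-side dispatch on a queue's exception status mask. */
static void handle_queue_exception(uint32_t queue_id, uint64_t status)
{
	if (status & KFD_EC_MASK(EC_QUEUE_RESET))
		/* The queue is gone: drop cached state, never resume it. */
		printf("queue %u was reset\n", queue_id);
	else if (status & KFD_EC_MASK(EC_QUEUE_PREEMPTION_ERROR))
		/* The queue hung and was not recovered per queue. */
		printf("queue %u hit a preemption error\n", queue_id);
}

With a single code, both branches collapse and the debugger cannot tell from
the exception alone whether the queue survived as a per-queue reset.
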
>       EC_QUEUE_NEW = 31,
>       /* per device */
> @@ -907,6 +908,7 @@ enum kfd_dbg_trap_exception_code {
>                                KFD_EC_MASK(EC_QUEUE_PACKET_DISPATCH_WORK_GROUP_SIZE_INVALID) |        \
>                                KFD_EC_MASK(EC_QUEUE_PACKET_DISPATCH_REGISTER_INVALID) |       \
>                                KFD_EC_MASK(EC_QUEUE_PACKET_VENDOR_UNSUPPORTED)        |       \
> +                              KFD_EC_MASK(EC_QUEUE_RESET)    |       \
>                                KFD_EC_MASK(EC_QUEUE_PREEMPTION_ERROR) |       \
>                                KFD_EC_MASK(EC_QUEUE_NEW))
>  #define KFD_EC_MASK_DEVICE   (KFD_EC_MASK(EC_DEVICE_QUEUE_DELETE) |          \
> @@ -997,8 +999,10 @@ struct kfd_queue_snapshot_entry {
>  };
>  
>  /* Queue status return for suspend/resume */
> +#define KFD_DBG_QUEUE_RESET_BIT              29
>  #define KFD_DBG_QUEUE_ERROR_BIT              30
>  #define KFD_DBG_QUEUE_INVALID_BIT    31
> +#define KFD_DBG_QUEUE_RESET_MASK     (1 << KFD_DBG_QUEUE_RESET_BIT)

Same as above. If the distinction is not necessary, we don't need to change the 
user mode API. Just report queue resets as generic preemption errors.
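
Concretely, collapsing the two would reduce the suspend-status check sketched
earlier to the existing bits only. Again a hypothetical helper, not proposed
code:

#include <stdbool.h>
#include <stdint.h>
#include <linux/kfd_ioctl.h>

/* With no reset bit, user space sees a single failure class. */
static bool queue_usable(uint32_t queue_id)
{
	return !(queue_id & (KFD_DBG_QUEUE_INVALID_MASK |
			     KFD_DBG_QUEUE_ERROR_MASK));
}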

Regards,
  Felix

>  #define KFD_DBG_QUEUE_ERROR_MASK     (1 << KFD_DBG_QUEUE_ERROR_BIT)
>  #define KFD_DBG_QUEUE_INVALID_MASK   (1 << KFD_DBG_QUEUE_INVALID_BIT)
>  
