To allow ROCm GDB to handle reset queues, raise an
EC_QUEUE_RESET exception so that the debugger can subscribe to and
query this exception.

Reset queues should still be considered suspendable, with a status
flag of KFD_DBG_QUEUE_RESET_MASK.
However, they should not be resumable, since user space will no longer
be able to access reset queues.

Signed-off-by: Jonathan Kim <jonathan....@amd.com>
---
 .../drm/amd/amdkfd/kfd_device_queue_manager.c | 30 +++++++++++++++----
 include/uapi/linux/kfd_ioctl.h                |  4 +++
 2 files changed, 29 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index 037b75a64e66..8c41806df39e 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -163,6 +163,10 @@ static void kfd_hws_hang(struct device_queue_manager *dqm)
                list_for_each_entry(q, &qpd->queues_list, list) {
                        q->properties.is_reset = true;
                        q->process->has_reset_queue = true;
+
+                       kfd_dbg_ev_raise(KFD_EC_MASK(EC_QUEUE_RESET),
+                                        q->process, q->device, q->doorbell_id,
+                                        false, NULL, 0);
                }
        }
 
@@ -987,7 +991,7 @@ static int suspend_single_queue(struct device_queue_manager 
*dqm,
 {
        bool is_new;
 
-       if (q->properties.is_suspended)
+       if (q->properties.is_suspended || q->properties.is_reset)
                return 0;
 
        pr_debug("Suspending PASID %u queue [%i]\n",
@@ -1008,6 +1012,9 @@ static int suspend_single_queue(struct 
device_queue_manager *dqm,
                if (dqm->dev->kfd->shared_resources.enable_mes) {
                        int r = remove_queue_mes(dqm, q, &pdd->qpd);
 
+                       if (q->properties.is_reset)
+                               return 0;
+
                        if (r)
                                return r;
                }
@@ -1971,6 +1978,9 @@ static void set_queue_as_reset(struct 
device_queue_manager *dqm, struct queue *q
                q->properties.is_active = false;
                decrement_queue_count(dqm, qpd, q);
        }
+
+       kfd_dbg_ev_raise(KFD_EC_MASK(EC_QUEUE_RESET), q->process, q->device,
+                        q->doorbell_id, false, NULL, 0);
 }
 
 static int detect_queue_hang(struct device_queue_manager *dqm)
@@ -3036,7 +3046,8 @@ int resume_queues(struct kfd_process *p,
                                                queue_ids[q_idx] &=
                                                        
~KFD_DBG_QUEUE_INVALID_MASK;
                                        } else {
-                                               queue_ids[q_idx] |=
+                                               queue_ids[q_idx] |= 
q->properties.is_reset ?
+                                                       
KFD_DBG_QUEUE_RESET_MASK :
                                                        
KFD_DBG_QUEUE_ERROR_MASK;
                                                break;
                                        }
@@ -3071,7 +3082,7 @@ int resume_queues(struct kfd_process *p,
                                                        queue_ids);
 
                                        /* mask queue as error on resume fail */
-                                       if (q_idx != QUEUE_NOT_FOUND)
+                                       if (q_idx != QUEUE_NOT_FOUND && 
!q->properties.is_reset)
                                                queue_ids[q_idx] |=
                                                        
KFD_DBG_QUEUE_ERROR_MASK;
                                }
@@ -3118,6 +3129,7 @@ int suspend_queues(struct kfd_process *p,
                struct qcm_process_device *qpd = &pdd->qpd;
                struct queue *q;
                int r, per_device_suspended = 0;
+               bool has_queue_reset_fail = false;
 
                mutex_lock(&p->event_mutex);
                dqm_lock(dqm);
@@ -3134,6 +3146,9 @@ int suspend_queues(struct kfd_process *p,
 
                                if (!err) {
                                        queue_ids[q_idx] &= 
~KFD_DBG_QUEUE_INVALID_MASK;
+                                       if (q->properties.is_reset)
+                                               queue_ids[q_idx] |= 
KFD_DBG_QUEUE_RESET_MASK;
+
                                        if (exception_clear_mask && is_mes)
                                                q->properties.exception_status 
&=
                                                        ~exception_clear_mask;
@@ -3175,13 +3190,18 @@ int suspend_queues(struct kfd_process *p,
                                continue;
 
                        /* mask queue as error on suspend fail */
-                       if (r)
+                       if (r && !q->properties.is_reset) {
+                               has_queue_reset_fail = true;
                                queue_ids[q_idx] |= KFD_DBG_QUEUE_ERROR_MASK;
-                       else if (exception_clear_mask)
+                       } else if (exception_clear_mask) {
                                q->properties.exception_status &=
                                                        ~exception_clear_mask;
+                       }
                }
 
+               if (!has_queue_reset_fail)
+                       total_suspended += per_device_suspended;
+
                dqm_unlock(dqm);
                mutex_unlock(&p->event_mutex);
                amdgpu_device_flush_hdp(dqm->dev->adev, NULL);
diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h
index 285a36601dc9..4713f9a6796e 100644
--- a/include/uapi/linux/kfd_ioctl.h
+++ b/include/uapi/linux/kfd_ioctl.h
@@ -875,6 +875,7 @@ enum kfd_dbg_trap_exception_code {
        EC_QUEUE_PACKET_DISPATCH_WORK_GROUP_SIZE_INVALID = 21,
        EC_QUEUE_PACKET_DISPATCH_REGISTER_INVALID = 22,
        EC_QUEUE_PACKET_VENDOR_UNSUPPORTED = 23,
+       EC_QUEUE_RESET = 29,
        EC_QUEUE_PREEMPTION_ERROR = 30,
        EC_QUEUE_NEW = 31,
        /* per device */
@@ -907,6 +908,7 @@ enum kfd_dbg_trap_exception_code {
                                 
KFD_EC_MASK(EC_QUEUE_PACKET_DISPATCH_WORK_GROUP_SIZE_INVALID) |        \
                                 
KFD_EC_MASK(EC_QUEUE_PACKET_DISPATCH_REGISTER_INVALID) |       \
                                 
KFD_EC_MASK(EC_QUEUE_PACKET_VENDOR_UNSUPPORTED)        |       \
+                                KFD_EC_MASK(EC_QUEUE_RESET)    |       \
                                 KFD_EC_MASK(EC_QUEUE_PREEMPTION_ERROR) |       
\
                                 KFD_EC_MASK(EC_QUEUE_NEW))
 #define KFD_EC_MASK_DEVICE     (KFD_EC_MASK(EC_DEVICE_QUEUE_DELETE) |          
\
@@ -997,8 +999,10 @@ struct kfd_queue_snapshot_entry {
 };
 
 /* Queue status return for suspend/resume */
+#define KFD_DBG_QUEUE_RESET_BIT                29
 #define KFD_DBG_QUEUE_ERROR_BIT                30
 #define KFD_DBG_QUEUE_INVALID_BIT      31
+#define KFD_DBG_QUEUE_RESET_MASK       (1 << KFD_DBG_QUEUE_RESET_BIT)
 #define KFD_DBG_QUEUE_ERROR_MASK       (1 << KFD_DBG_QUEUE_ERROR_BIT)
 #define KFD_DBG_QUEUE_INVALID_MASK     (1 << KFD_DBG_QUEUE_INVALID_BIT)
 
-- 
2.34.1

Reply via email to