In order to allow ROCm GDB to handle reset queues, raise an
EC_QUEUE_RESET exception so that the debugger can subscribe to and
query this exception.

Reset queues should still be considered suspendable, with a status
flag of KFD_DBG_QUEUE_RESET_MASK.
However, they should not be resumable since user space will no longer
be able to access reset queues.

v2: move per-queue reset flag to this patch
rebase based on patch 1 changes

Signed-off-by: Jonathan Kim <jonathan....@amd.com>
---
 .../drm/amd/amdkfd/kfd_device_queue_manager.c | 31 ++++++++++++++++---
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h         |  1 +
 include/uapi/linux/kfd_ioctl.h                |  4 +++
 3 files changed, 31 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index e335703eff84..cb7b5bbf5c40 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -164,6 +164,10 @@ static void kfd_hws_hang(struct device_queue_manager *dqm)
                        struct kfd_process_device *pdd = qpd_to_pdd(qpd);
 
                        pdd->has_reset_queue = true;
+                       q->properties.is_reset = true;
+                       kfd_dbg_ev_raise(KFD_EC_MASK(EC_QUEUE_RESET),
+                                        q->process, q->device, q->doorbell_id,
+                                        false, NULL, 0);
                }
        }
 
@@ -986,7 +990,7 @@ static int suspend_single_queue(struct device_queue_manager 
*dqm,
 {
        bool is_new;
 
-       if (q->properties.is_suspended)
+       if (q->properties.is_suspended || q->properties.is_reset)
                return 0;
 
        pr_debug("Suspending PASID %u queue [%i]\n",
@@ -1007,6 +1011,9 @@ static int suspend_single_queue(struct 
device_queue_manager *dqm,
                if (dqm->dev->kfd->shared_resources.enable_mes) {
                        int r = remove_queue_mes(dqm, q, &pdd->qpd);
 
+                       if (q->properties.is_reset)
+                               return 0;
+
                        if (r)
                                return r;
                }
@@ -1967,10 +1974,14 @@ static void set_queue_as_reset(struct 
device_queue_manager *dqm, struct queue *q
               q->properties.queue_id, q->process->pasid);
 
        pdd->has_reset_queue = true;
+       q->properties.is_reset = true;
        if (q->properties.is_active) {
                q->properties.is_active = false;
                decrement_queue_count(dqm, qpd, q);
        }
+
+       kfd_dbg_ev_raise(KFD_EC_MASK(EC_QUEUE_RESET), q->process, q->device,
+                        q->doorbell_id, false, NULL, 0);
 }
 
 static int detect_queue_hang(struct device_queue_manager *dqm)
@@ -3037,7 +3048,8 @@ int resume_queues(struct kfd_process *p,
                                                queue_ids[q_idx] &=
                                                        
~KFD_DBG_QUEUE_INVALID_MASK;
                                        } else {
-                                               queue_ids[q_idx] |=
+                                               queue_ids[q_idx] |= 
q->properties.is_reset ?
+                                                       
KFD_DBG_QUEUE_RESET_MASK :
                                                        
KFD_DBG_QUEUE_ERROR_MASK;
                                                break;
                                        }
@@ -3072,7 +3084,7 @@ int resume_queues(struct kfd_process *p,
                                                        queue_ids);
 
                                        /* mask queue as error on resume fail */
-                                       if (q_idx != QUEUE_NOT_FOUND)
+                                       if (q_idx != QUEUE_NOT_FOUND && 
!q->properties.is_reset)
                                                queue_ids[q_idx] |=
                                                        
KFD_DBG_QUEUE_ERROR_MASK;
                                }
@@ -3119,6 +3131,7 @@ int suspend_queues(struct kfd_process *p,
                struct qcm_process_device *qpd = &pdd->qpd;
                struct queue *q;
                int r, per_device_suspended = 0;
+               bool has_queue_reset_fail = false;
 
                mutex_lock(&p->event_mutex);
                dqm_lock(dqm);
@@ -3135,6 +3148,9 @@ int suspend_queues(struct kfd_process *p,
 
                                if (!err) {
                                        queue_ids[q_idx] &= 
~KFD_DBG_QUEUE_INVALID_MASK;
+                                       if (q->properties.is_reset)
+                                               queue_ids[q_idx] |= 
KFD_DBG_QUEUE_RESET_MASK;
+
                                        if (exception_clear_mask && is_mes)
                                                q->properties.exception_status 
&=
                                                        ~exception_clear_mask;
@@ -3176,13 +3192,18 @@ int suspend_queues(struct kfd_process *p,
                                continue;
 
                        /* mask queue as error on suspend fail */
-                       if (r)
+                       if (r && !q->properties.is_reset) {
+                               has_queue_reset_fail = true;
                                queue_ids[q_idx] |= KFD_DBG_QUEUE_ERROR_MASK;
-                       else if (exception_clear_mask)
+                       } else if (exception_clear_mask) {
                                q->properties.exception_status &=
                                                        ~exception_clear_mask;
+                       }
                }
 
+               if (!has_queue_reset_fail)
+                       total_suspended += per_device_suspended;
+
                dqm_unlock(dqm);
                mutex_unlock(&p->event_mutex);
                amdgpu_device_flush_hdp(dqm->dev->adev, NULL);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h 
b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index 892a85408c09..192e3102c152 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -504,6 +504,7 @@ struct queue_properties {
        bool is_being_destroyed;
        bool is_active;
        bool is_gws;
+       bool is_reset;
        uint32_t pm4_target_xcc;
        bool is_dbg_wa;
        bool is_user_cu_masked;
diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h
index 285a36601dc9..4713f9a6796e 100644
--- a/include/uapi/linux/kfd_ioctl.h
+++ b/include/uapi/linux/kfd_ioctl.h
@@ -875,6 +875,7 @@ enum kfd_dbg_trap_exception_code {
        EC_QUEUE_PACKET_DISPATCH_WORK_GROUP_SIZE_INVALID = 21,
        EC_QUEUE_PACKET_DISPATCH_REGISTER_INVALID = 22,
        EC_QUEUE_PACKET_VENDOR_UNSUPPORTED = 23,
+       EC_QUEUE_RESET = 29,
        EC_QUEUE_PREEMPTION_ERROR = 30,
        EC_QUEUE_NEW = 31,
        /* per device */
@@ -907,6 +908,7 @@ enum kfd_dbg_trap_exception_code {
                                 
KFD_EC_MASK(EC_QUEUE_PACKET_DISPATCH_WORK_GROUP_SIZE_INVALID) |        \
                                 
KFD_EC_MASK(EC_QUEUE_PACKET_DISPATCH_REGISTER_INVALID) |       \
                                 
KFD_EC_MASK(EC_QUEUE_PACKET_VENDOR_UNSUPPORTED)        |       \
+                                KFD_EC_MASK(EC_QUEUE_RESET)    |       \
                                 KFD_EC_MASK(EC_QUEUE_PREEMPTION_ERROR) |       
\
                                 KFD_EC_MASK(EC_QUEUE_NEW))
 #define KFD_EC_MASK_DEVICE     (KFD_EC_MASK(EC_DEVICE_QUEUE_DELETE) |          
\
@@ -997,8 +999,10 @@ struct kfd_queue_snapshot_entry {
 };
 
 /* Queue status return for suspend/resume */
+#define KFD_DBG_QUEUE_RESET_BIT                29
 #define KFD_DBG_QUEUE_ERROR_BIT                30
 #define KFD_DBG_QUEUE_INVALID_BIT      31
+#define KFD_DBG_QUEUE_RESET_MASK       (1 << KFD_DBG_QUEUE_RESET_BIT)
 #define KFD_DBG_QUEUE_ERROR_MASK       (1 << KFD_DBG_QUEUE_ERROR_BIT)
 #define KFD_DBG_QUEUE_INVALID_MASK     (1 << KFD_DBG_QUEUE_INVALID_BIT)
 
-- 
2.34.1

Reply via email to