Am 09.05.24 um 22:41 schrieb Ori Messinger:
This patch adds 'ring hang' events to the driver.
This is done by adding a 'reset_ring_hang' bool variable to the
struct 'amdgpu_reset_context' in the amdgpu_reset.h file.
The purpose of this 'reset_ring_hang' variable is to mark resets
caused by a ring hang: whenever a GPU reset is initiated due to a
ring hang, 'reset_ring_hang' should be set to 'true'.

This 'amdgpu_reset_context' struct is now also passed
through all relevant functions, and another event type,
"KFD_SMI_EVENT_RING_HANG", is added to the kfd_smi_event enum.

To repeat myself on the newest version of the patch: this whole approach is a clear NAK.

Driver hangs, including their cause, are exposed through udev.

And in general exposing driver telemetry through the SMI interface in KFD is a pretty clear no-go.

Regards,
Christian.


Signed-off-by: Ori Messinger <ori.messin...@amd.com>
Change-Id: I6af3022eb1b4514201c9430d635ff87f167ad6f7
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c  |  6 ++++--
  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h  |  9 ++++++---
  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c  |  2 +-
  drivers/gpu/drm/amd/amdgpu/amdgpu_job.c     |  3 +++
  drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h   |  2 ++
  drivers/gpu/drm/amd/amdkfd/kfd_device.c     |  7 ++++---
  drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c |  7 ++++++-
  drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h |  5 ++++-
  include/uapi/linux/kfd_ioctl.h              | 15 ++++++++-------
  9 files changed, 38 insertions(+), 18 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
index e3738d417245..ac0ee4322555 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
@@ -133,6 +133,8 @@ static void amdgpu_amdkfd_reset_work(struct work_struct 
*work)
reset_context.method = AMD_RESET_METHOD_NONE;
        reset_context.reset_req_dev = adev;
+       reset_context.reset_ring_hang = true;
+       strscpy(reset_context.reset_cause, "scheduler_hang", 
sizeof(reset_context.reset_cause));
        clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
amdgpu_device_gpu_recover(adev, NULL, &reset_context);
@@ -261,12 +263,12 @@ int amdgpu_amdkfd_resume(struct amdgpu_device *adev, bool 
run_pm)
        return r;
  }
-int amdgpu_amdkfd_pre_reset(struct amdgpu_device *adev)
+int amdgpu_amdkfd_pre_reset(struct amdgpu_device *adev, struct 
amdgpu_reset_context *reset_context)
  {
        int r = 0;
if (adev->kfd.dev)
-               r = kgd2kfd_pre_reset(adev->kfd.dev);
+               r = kgd2kfd_pre_reset(adev->kfd.dev, reset_context);
return r;
  }
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
index 1de021ebdd46..c9030d8b8308 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
@@ -47,6 +47,7 @@ enum TLB_FLUSH_TYPE {
  };
struct amdgpu_device;
+struct amdgpu_reset_context;
enum kfd_mem_attachment_type {
        KFD_MEM_ATT_SHARED,     /* Share kgd_mem->bo or another attachment's */
@@ -170,7 +171,8 @@ bool amdgpu_amdkfd_have_atomics_support(struct 
amdgpu_device *adev);
bool amdgpu_amdkfd_is_kfd_vmid(struct amdgpu_device *adev, u32 vmid); -int amdgpu_amdkfd_pre_reset(struct amdgpu_device *adev);
+int amdgpu_amdkfd_pre_reset(struct amdgpu_device *adev,
+                           struct amdgpu_reset_context *reset_context);
int amdgpu_amdkfd_post_reset(struct amdgpu_device *adev); @@ -416,7 +418,8 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd,
  void kgd2kfd_device_exit(struct kfd_dev *kfd);
  void kgd2kfd_suspend(struct kfd_dev *kfd, bool run_pm);
  int kgd2kfd_resume(struct kfd_dev *kfd, bool run_pm);
-int kgd2kfd_pre_reset(struct kfd_dev *kfd);
+int kgd2kfd_pre_reset(struct kfd_dev *kfd,
+                     struct amdgpu_reset_context *reset_context);
  int kgd2kfd_post_reset(struct kfd_dev *kfd);
  void kgd2kfd_interrupt(struct kfd_dev *kfd, const void *ih_ring_entry);
  void kgd2kfd_set_sram_ecc_flag(struct kfd_dev *kfd);
@@ -459,7 +462,7 @@ static inline int kgd2kfd_resume(struct kfd_dev *kfd, bool 
run_pm)
        return 0;
  }
-static inline int kgd2kfd_pre_reset(struct kfd_dev *kfd)
+static inline int kgd2kfd_pre_reset(struct kfd_dev *kfd, struct 
amdgpu_reset_context *reset_context)
  {
        return 0;
  }
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 77f6fd50002a..f9fa784f36f4 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -5772,7 +5772,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
cancel_delayed_work_sync(&tmp_adev->delayed_init_work); - amdgpu_amdkfd_pre_reset(tmp_adev);
+               amdgpu_amdkfd_pre_reset(tmp_adev, reset_context);
/*
                 * Mark these ASICs to be reseted as untracked first
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
index e4742b65032d..abcae2e7ed0f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
@@ -77,6 +77,9 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct 
drm_sched_job *s_job)
reset_context.method = AMD_RESET_METHOD_NONE;
                reset_context.reset_req_dev = adev;
+               reset_context.reset_ring_hang = true;
+               strscpy(reset_context.reset_cause, "SDMA_hang", 
sizeof(reset_context.reset_cause));
+               DRM_ERROR("Reset cause: SDMA ring hang\n");
                clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
r = amdgpu_device_gpu_recover(ring->adev, job, &reset_context);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
index 5a9cc043b858..757284ab36e0 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
@@ -43,6 +43,8 @@ struct amdgpu_reset_context {
        struct amdgpu_hive_info *hive;
        struct list_head *reset_device_list;
        unsigned long flags;
+       bool reset_ring_hang;
+       char reset_cause[64];
  };
struct amdgpu_reset_handler {
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
index 6b15e55811b6..88171f24496b 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
@@ -35,6 +35,7 @@
  #include "kfd_migrate.h"
  #include "amdgpu.h"
  #include "amdgpu_xcp.h"
+#include "amdgpu_reset.h"
#define MQD_SIZE_ALIGNED 768 @@ -931,7 +932,7 @@ void kgd2kfd_device_exit(struct kfd_dev *kfd)
        kfree(kfd);
  }
-int kgd2kfd_pre_reset(struct kfd_dev *kfd)
+int kgd2kfd_pre_reset(struct kfd_dev *kfd, struct amdgpu_reset_context 
*reset_context)
  {
        struct kfd_node *node;
        int i;
@@ -941,7 +942,7 @@ int kgd2kfd_pre_reset(struct kfd_dev *kfd)
for (i = 0; i < kfd->num_nodes; i++) {
                node = kfd->nodes[i];
-               kfd_smi_event_update_gpu_reset(node, false);
+               kfd_smi_event_update_gpu_reset(node, false, reset_context);
                node->dqm->ops.pre_reset(node->dqm);
        }
@@ -981,7 +982,7 @@ int kgd2kfd_post_reset(struct kfd_dev *kfd)
        for (i = 0; i < kfd->num_nodes; i++) {
                node = kfd->nodes[i];
                atomic_set(&node->sram_ecc_flag, 0);
-               kfd_smi_event_update_gpu_reset(node, true);
+               kfd_smi_event_update_gpu_reset(node, true, NULL);
        }
return 0;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
index 06ac835190f9..3ffe4b61fe4f 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
@@ -29,6 +29,7 @@
  #include "amdgpu_vm.h"
  #include "kfd_priv.h"
  #include "kfd_smi_events.h"
+#include "amdgpu_reset.h"
struct kfd_smi_client {
        struct list_head list;
@@ -215,7 +216,8 @@ static void kfd_smi_event_add(pid_t pid, struct kfd_node 
*dev,
        add_event_to_kfifo(pid, dev, event, fifo_in, len);
  }
-void kfd_smi_event_update_gpu_reset(struct kfd_node *dev, bool post_reset)
+void kfd_smi_event_update_gpu_reset(struct kfd_node *dev, bool post_reset,
+                                   struct amdgpu_reset_context *reset_context)
  {
        unsigned int event;
@@ -224,6 +226,9 @@ void kfd_smi_event_update_gpu_reset(struct kfd_node *dev, bool post_reset)
        } else {
                event = KFD_SMI_EVENT_GPU_PRE_RESET;
                ++(dev->reset_seq_num);
+               if (reset_context && reset_context->reset_ring_hang)
+                       kfd_smi_event_add(0, dev, KFD_SMI_EVENT_RING_HANG, 
"%s\n",
+                                         reset_context->reset_cause);
        }
        kfd_smi_event_add(0, dev, event, "%x\n", dev->reset_seq_num);
  }
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h 
b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
index fa95c2dfd587..85010b8307f8 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
@@ -24,11 +24,14 @@
  #ifndef KFD_SMI_EVENTS_H_INCLUDED
  #define KFD_SMI_EVENTS_H_INCLUDED
+struct amdgpu_reset_context;
+
  int kfd_smi_event_open(struct kfd_node *dev, uint32_t *fd);
  void kfd_smi_event_update_vmfault(struct kfd_node *dev, uint16_t pasid);
  void kfd_smi_event_update_thermal_throttling(struct kfd_node *dev,
                                             uint64_t throttle_bitmask);
-void kfd_smi_event_update_gpu_reset(struct kfd_node *dev, bool post_reset);
+void kfd_smi_event_update_gpu_reset(struct kfd_node *dev, bool post_reset,
+                                   struct amdgpu_reset_context *reset_context);
  void kfd_smi_event_page_fault_start(struct kfd_node *node, pid_t pid,
                                    unsigned long address, bool write_fault,
                                    ktime_t ts);
diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h
index 285a36601dc9..7c94d2c7da13 100644
--- a/include/uapi/linux/kfd_ioctl.h
+++ b/include/uapi/linux/kfd_ioctl.h
@@ -519,13 +519,14 @@ enum kfd_smi_event {
        KFD_SMI_EVENT_THERMAL_THROTTLE = 2,
        KFD_SMI_EVENT_GPU_PRE_RESET = 3,
        KFD_SMI_EVENT_GPU_POST_RESET = 4,
-       KFD_SMI_EVENT_MIGRATE_START = 5,
-       KFD_SMI_EVENT_MIGRATE_END = 6,
-       KFD_SMI_EVENT_PAGE_FAULT_START = 7,
-       KFD_SMI_EVENT_PAGE_FAULT_END = 8,
-       KFD_SMI_EVENT_QUEUE_EVICTION = 9,
-       KFD_SMI_EVENT_QUEUE_RESTORE = 10,
-       KFD_SMI_EVENT_UNMAP_FROM_GPU = 11,
+       KFD_SMI_EVENT_RING_HANG = 5,
+       KFD_SMI_EVENT_MIGRATE_START = 6,
+       KFD_SMI_EVENT_MIGRATE_END = 7,
+       KFD_SMI_EVENT_PAGE_FAULT_START = 8,
+       KFD_SMI_EVENT_PAGE_FAULT_END = 9,
+       KFD_SMI_EVENT_QUEUE_EVICTION = 10,
+       KFD_SMI_EVENT_QUEUE_RESTORE = 11,
+       KFD_SMI_EVENT_UNMAP_FROM_GPU = 12,
/*
         * max event number, as a flag bit to get events from all processes,

Reply via email to