[PATCH] drm/amdgpu: fix queue reset issue by mmio

2024-09-04 Thread jesse.zh...@amd.com
Initialize the queue type before resetting the queue using mmio.

Signed-off-by: Jesse Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
index f7d5d4f08a53..10b61ff63802 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
@@ -832,6 +832,7 @@ int amdgpu_mes_reset_hw_queue_mmio(struct amdgpu_device 
*adev, int queue_type,
struct mes_reset_queue_input queue_input;
int r;
 
+   queue_input.queue_type = queue_type;
queue_input.use_mmio = true;
queue_input.me_id = me_id;
queue_input.pipe_id = pipe_id;
-- 
2.25.1



[PATCH] drm/amdkfd: clean up code for interrupt v10

2024-09-05 Thread jesse.zh...@amd.com
Variable hub_inst is unused.
Related to the commit "bde7ae79ca40":
"drm/amdkfd: Drop poison hanlding from gfx v10"

Signed-off-by: Jesse Zhang 
---
 drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c | 13 -
 1 file changed, 13 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c
index bb8cbfc39b90..2db824e87c8a 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c
@@ -308,21 +308,8 @@ static void event_interrupt_wq_v10(struct kfd_node *dev,
uint16_t ring_id = SOC15_RING_ID_FROM_IH_ENTRY(ih_ring_entry);
uint32_t node_id = SOC15_NODEID_FROM_IH_ENTRY(ih_ring_entry);
uint32_t vmid_type = 
SOC15_VMID_TYPE_FROM_IH_ENTRY(ih_ring_entry);
-   int hub_inst = 0;
struct kfd_hsa_memory_exception_data exception_data;
 
-   /* gfxhub */
-   if (!vmid_type && dev->adev->gfx.funcs->ih_node_to_logical_xcc) 
{
-   hub_inst = 
dev->adev->gfx.funcs->ih_node_to_logical_xcc(dev->adev,
-   node_id);
-   if (hub_inst < 0)
-   hub_inst = 0;
-   }
-
-   /* mmhub */
-   if (vmid_type && client_id == SOC15_IH_CLIENTID_VMC)
-   hub_inst = node_id / 4;
-
info.vmid = vmid;
info.mc_id = client_id;
info.page_addr = ih_ring_entry[4] |
-- 
2.25.1



[PATCH] drm/amdkfd: Fix resource leak in criu restore queue

2024-09-08 Thread jesse.zh...@amd.com
To avoid memory leaks, release q_extra_data when exiting the restore queue.
v2: Correct the proto (Alex)

Signed-off-by: Jesse Zhang 
---
 drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
index 20ea745729ee..b439d4d0bd84 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
@@ -1046,6 +1046,7 @@ int kfd_criu_restore_queue(struct kfd_process *p,
pr_debug("Queue id %d was restored successfully\n", queue_id);
 
kfree(q_data);
+   kfree(q_extra_data);
 
return ret;
 }
-- 
2.25.1



[PATCH] drm/amdgpu: add the command AMDGPU_INFO_QUEUE_RESET to query queue reset

2024-10-18 Thread jesse.zh...@amd.com
Not all ASICs support the queue reset feature.
Therefore, userspace can query this feature
via AMDGPU_INFO_QUEUE_RESET before validating a queue reset.

Signed-off-by: Jesse Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c | 27 +
 include/uapi/drm/amdgpu_drm.h   |  2 ++
 2 files changed, 29 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
index b53c35992152..87dee858fb4c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
@@ -577,6 +577,7 @@ int amdgpu_info_ioctl(struct drm_device *dev, void *data, 
struct drm_file *filp)
uint64_t ui64 = 0;
int i, found, ret;
int ui32_size = sizeof(ui32);
+   bool queue_reset;
 
if (!info->return_size || !info->return_pointer)
return -EINVAL;
@@ -1282,6 +1283,32 @@ int amdgpu_info_ioctl(struct drm_device *dev, void 
*data, struct drm_file *filp)
return copy_to_user(out, &gpuvm_fault,
min((size_t)size, sizeof(gpuvm_fault))) ? 
-EFAULT : 0;
}
+   case AMDGPU_INFO_QUEUE_RESET: {
+   fpriv = (struct amdgpu_fpriv *)filp->driver_priv;
+   type = amdgpu_ip_get_block_type(adev, info->query_hw_ip.type);
+   ip_block = amdgpu_device_ip_get_ip_block(adev, type);
+
+   if (!ip_block || !ip_block->status.valid)
+   return -EINVAL;
+
+   switch (info->query_hw_ip.type) {
+   case AMDGPU_HW_IP_GFX:
+   queue_reset = adev->gfx.gfx_ring[0].funcs->reset ? true 
: false;
+   break;
+   case AMDGPU_HW_IP_COMPUTE:
+   queue_reset = adev->gfx.compute_ring[0].funcs->reset ? 
true : false;
+   break;
+   case AMDGPU_HW_IP_DMA:
+   queue_reset = adev->sdma.instance[0].ring.funcs->reset 
? true : false;
+   break;
+   case AMDGPU_HW_IP_UVD_ENC:
+   case AMDGPU_HW_IP_VCN_DEC:
+   default:
+   queue_reset = false;
+   }
+
+   return copy_to_user(out, &queue_reset, min(size, 4u)) ? -EFAULT 
: 0;
+   }
default:
DRM_DEBUG_KMS("Invalid request %d\n", info->query);
return -EINVAL;
diff --git a/include/uapi/drm/amdgpu_drm.h b/include/uapi/drm/amdgpu_drm.h
index d9bff1c3b326..3b17d82fd1ee 100644
--- a/include/uapi/drm/amdgpu_drm.h
+++ b/include/uapi/drm/amdgpu_drm.h
@@ -1052,6 +1052,8 @@ struct drm_amdgpu_cs_chunk_cp_gfx_shadow {
 #define AMDGPU_INFO_MAX_IBS0x22
 /* query last page fault info */
 #define AMDGPU_INFO_GPUVM_FAULT0x23
+/* query queue reset */
+#define AMDGPU_INFO_QUEUE_RESET0x24
 
 #define AMDGPU_INFO_MMR_SE_INDEX_SHIFT 0
 #define AMDGPU_INFO_MMR_SE_INDEX_MASK  0xff
-- 
2.25.1



[PATCH 1/2] drm/amdgpu: add amdgpu_gfx_sched_mask and amdgpu_compute_sched_mask debugfs

2024-10-17 Thread jesse.zh...@amd.com
compute/gfx may have multiple rings on some hardware.
In some cases, userspace wants to run jobs on a specific ring for validation 
purposes.
This debugfs entry helps to disable or enable submitting jobs to a specific 
ring.
This entry is populated only if there are two or more rings in the
gfx/compute ip.

Signed-off-by: Jesse Zhang 
Suggested-by: Alex Deucher 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c |   2 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 142 
 drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h |   2 +
 3 files changed, 146 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
index 37d8657f0776..6e3f657cab9c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
@@ -2096,6 +2096,8 @@ int amdgpu_debugfs_init(struct amdgpu_device *adev)
amdgpu_debugfs_umsch_fwlog_init(adev, &adev->umsch_mm);
 
amdgpu_debugfs_jpeg_sched_mask_init(adev);
+   amdgpu_debugfs_gfx_sched_mask_init(adev);
+   amdgpu_debugfs_compute_sched_mask_init(adev);
 
amdgpu_ras_debugfs_create_all(adev);
amdgpu_rap_debugfs_init(adev);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
index b6acbe923b6b..29997c9f68b6 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
@@ -1868,3 +1868,145 @@ void amdgpu_gfx_enforce_isolation_ring_end_use(struct 
amdgpu_ring *ring)
}
mutex_unlock(&adev->enforce_isolation_mutex);
 }
+
+/*
+ * debugfs to enable/disable gfx job submission to a specific ring.
+ */
+#if defined(CONFIG_DEBUG_FS)
+static int amdgpu_debugfs_gfx_sched_mask_set(void *data, u64 val)
+{
+   struct amdgpu_device *adev = (struct amdgpu_device *)data;
+   u32 i;
+   u64 mask = 0;
+   struct amdgpu_ring *ring;
+
+   if (!adev)
+   return -ENODEV;
+
+   mask = (1 << adev->gfx.num_gfx_rings) - 1;
+   if ((val & mask) == 0)
+   return -EINVAL;
+
+   for (i = 0; i < adev->gfx.num_gfx_rings; ++i) {
+   ring = &adev->gfx.gfx_ring[i];
+   if (val & (1 << i))
+   ring->sched.ready = true;
+   else
+   ring->sched.ready = false;
+   }
+   /* publish sched.ready flag update effective immediately across smp */
+   smp_rmb();
+   return 0;
+}
+
+static int amdgpu_debugfs_gfx_sched_mask_get(void *data, u64 *val)
+{
+   struct amdgpu_device *adev = (struct amdgpu_device *)data;
+   u32 i;
+   u64 mask = 0;
+   struct amdgpu_ring *ring;
+
+   if (!adev)
+   return -ENODEV;
+   for (i = 0; i < adev->gfx.num_gfx_rings; ++i) {
+   ring = &adev->gfx.gfx_ring[i];
+   if (ring->sched.ready)
+   mask |= 1 << i;
+   }
+
+   *val = mask;
+   return 0;
+}
+
+DEFINE_DEBUGFS_ATTRIBUTE(amdgpu_debugfs_gfx_sched_mask_fops,
+amdgpu_debugfs_gfx_sched_mask_get,
+amdgpu_debugfs_gfx_sched_mask_set, "%llx\n");
+
+#endif
+
+void amdgpu_debugfs_gfx_sched_mask_init(struct amdgpu_device *adev)
+{
+#if defined(CONFIG_DEBUG_FS)
+   struct drm_minor *minor = adev_to_drm(adev)->primary;
+   struct dentry *root = minor->debugfs_root;
+   char name[32];
+
+   if (!(adev->gfx.num_gfx_rings > 1))
+   return;
+   sprintf(name, "amdgpu_gfx_sched_mask");
+   debugfs_create_file(name, 0600, root, adev,
+   &amdgpu_debugfs_gfx_sched_mask_fops);
+#endif
+}
+
+/*
+ * debugfs to enable/disable compute job submission to a specific ring.
+ */
+#if defined(CONFIG_DEBUG_FS)
+static int amdgpu_debugfs_compute_sched_mask_set(void *data, u64 val)
+{
+   struct amdgpu_device *adev = (struct amdgpu_device *)data;
+   u32 i;
+   u64 mask = 0;
+   struct amdgpu_ring *ring;
+
+   if (!adev)
+   return -ENODEV;
+
+   mask = (1 << adev->gfx.num_compute_rings) - 1;
+   if ((val & mask) == 0)
+   return -EINVAL;
+
+   for (i = 0; i < adev->gfx.num_compute_rings; ++i) {
+   ring = &adev->gfx.compute_ring[i];
+   if (val & (1 << i))
+   ring->sched.ready = true;
+   else
+   ring->sched.ready = false;
+   }
+
+   /* publish sched.ready flag update effective immediately across smp */
+   smp_rmb();
+   return 0;
+}
+
+static int amdgpu_debugfs_compute_sched_mask_get(void *data, u64 *val)
+{
+   struct amdgpu_device *adev = (struct amdgpu_device *)data;
+   u32 i;
+   u64 mask = 0;
+   struct amdgpu_ring *ring;
+
+   if (!adev)
+   return -ENODEV;
+   for (i = 0; i < adev->gfx.num_compute_rings; ++i) {
+   ring = &adev->gfx.compute_ring[i];
+   if (ring-

[PATCH 2/2] drm/amdgpu: add amdgpu_sdma_sched_mask debugfs

2024-10-17 Thread jesse.zh...@amd.com
Userspace wants to run jobs on a specific sdma ring for verification purposes.
This debugfs entry helps to disable or enable submitting jobs to a specific 
ring.
This entry is populated only if there are two or more instances in the
sdma ip.

Signed-off-by: Jesse Zhang 
Suggested-by: Alex Deucher 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c |  1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c| 71 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h|  2 +-
 3 files changed, 73 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
index 6e3f657cab9c..c446bfccea59 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
@@ -2098,6 +2098,7 @@ int amdgpu_debugfs_init(struct amdgpu_device *adev)
amdgpu_debugfs_jpeg_sched_mask_init(adev);
amdgpu_debugfs_gfx_sched_mask_init(adev);
amdgpu_debugfs_compute_sched_mask_init(adev);
+   amdgpu_debugfs_sdma_sched_mask_init(adev);
 
amdgpu_ras_debugfs_create_all(adev);
amdgpu_rap_debugfs_init(adev);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
index 183a976ba29d..cacb12a4fa9e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
@@ -343,3 +343,74 @@ int amdgpu_sdma_ras_sw_init(struct amdgpu_device *adev)
 
return 0;
 }
+
+/*
+ * debugfs to enable/disable sdma job submission to a specific ring.
+ */
+#if defined(CONFIG_DEBUG_FS)
+static int amdgpu_debugfs_sdma_sched_mask_set(void *data, u64 val)
+{
+   struct amdgpu_device *adev = (struct amdgpu_device *)data;
+   u32 i;
+   u64 mask = 0;
+   struct amdgpu_ring *ring;
+
+   if (!adev)
+   return -ENODEV;
+
+   mask = (1 << adev->sdma.num_instances) - 1;
+   if ((val & mask) == 0)
+   return -EINVAL;
+
+   for (i = 0; i < adev->sdma.num_instances; ++i) {
+   ring = &adev->sdma.instance[i].ring;
+   if (val & (1 << i))
+   ring->sched.ready = true;
+   else
+   ring->sched.ready = false;
+   }
+   /* publish sched.ready flag update effective immediately across smp */
+   smp_rmb();
+   return 0;
+}
+
+static int amdgpu_debugfs_sdma_sched_mask_get(void *data, u64 *val)
+{
+   struct amdgpu_device *adev = (struct amdgpu_device *)data;
+   u32 i;
+   u64 mask = 0;
+   struct amdgpu_ring *ring;
+
+   if (!adev)
+   return -ENODEV;
+   for (i = 0; i < adev->sdma.num_instances; ++i) {
+   ring = &adev->sdma.instance[i].ring;
+   if (ring->sched.ready)
+   mask |= 1 << i;
+   }
+
+   *val = mask;
+   return 0;
+}
+
+DEFINE_DEBUGFS_ATTRIBUTE(amdgpu_debugfs_sdma_sched_mask_fops,
+amdgpu_debugfs_sdma_sched_mask_get,
+amdgpu_debugfs_sdma_sched_mask_set, "%llx\n");
+
+#endif
+
+void amdgpu_debugfs_sdma_sched_mask_init(struct amdgpu_device *adev)
+{
+#if defined(CONFIG_DEBUG_FS)
+   struct drm_minor *minor = adev_to_drm(adev)->primary;
+   struct dentry *root = minor->debugfs_root;
+   char name[32];
+
+   if (!(adev->sdma.num_instances > 1))
+   return;
+   sprintf(name, "amdgpu_sdma_sched_mask");
+   debugfs_create_file(name, 0600, root, adev,
+   &amdgpu_debugfs_sdma_sched_mask_fops);
+#endif
+}
+
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h
index 087ce0f6fa07..a37fcd9bb981 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h
@@ -175,5 +175,5 @@ int amdgpu_sdma_init_microcode(struct amdgpu_device *adev, 
u32 instance,
 void amdgpu_sdma_destroy_inst_ctx(struct amdgpu_device *adev,
 bool duplicate);
 int amdgpu_sdma_ras_sw_init(struct amdgpu_device *adev);
-
+void amdgpu_debugfs_sdma_sched_mask_init(struct amdgpu_device *adev);
 #endif
-- 
2.25.1



[PATCH 4/5 V2] drm/amdgpu: Add sysfs interface for vpe reset mask

2024-10-22 Thread jesse.zh...@amd.com
Add the sysfs interface for vpe:
vpe_reset_mask

The interface is read-only and show the resets supported by the IP.
For example, full adapter reset (mode1/mode2/BACO/etc),
soft reset, queue reset, and pipe reset.

V2: the sysfs node returns a text string instead of some flags (Christian)

Signed-off-by: Jesse Zhang 
Suggested-by: Alex Deucher 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_vpe.c | 53 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_vpe.h |  2 +
 2 files changed, 55 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vpe.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vpe.c
index 6d96e1f21e20..eae94fdb126e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vpe.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vpe.c
@@ -377,6 +377,10 @@ static int vpe_sw_init(struct amdgpu_ip_block *ip_block)
ret = vpe_init_microcode(vpe);
if (ret)
goto out;
+
+   ret = amdgpu_vpe_sysfs_reset_mask_init(adev);
+   if (ret)
+   goto out;
 out:
return ret;
 }
@@ -389,6 +393,7 @@ static int vpe_sw_fini(struct amdgpu_ip_block *ip_block)
release_firmware(vpe->fw);
vpe->fw = NULL;
 
+   amdgpu_vpe_sysfs_reset_mask_fini(adev);
vpe_ring_fini(vpe);
 
amdgpu_bo_free_kernel(&adev->vpe.cmdbuf_obj,
@@ -865,6 +870,54 @@ static void vpe_ring_end_use(struct amdgpu_ring *ring)
schedule_delayed_work(&adev->vpe.idle_work, VPE_IDLE_TIMEOUT);
 }
 
+static ssize_t amdgpu_get_vpe_reset_mask(struct device *dev,
+   struct device_attribute *attr,
+   char *buf)
+{
+   struct drm_device *ddev = dev_get_drvdata(dev);
+   struct amdgpu_device *adev = drm_to_adev(ddev);
+   ssize_t size = 0;
+   struct amdgpu_ring *ring = &adev->vpe.ring;
+
+   if (!adev || !ring)
+   return -ENODEV;
+
+   if (amdgpu_device_should_recover_gpu(adev))
+   size += sysfs_emit_at(buf, size, "full ");
+
+   if (amdgpu_gpu_recovery && unlikely(!adev->debug_disable_soft_recovery)
+   && !amdgpu_sriov_vf(adev) && ring->funcs->soft_recovery)
+   size += sysfs_emit_at(buf, size, "soft ");
+
+   if (amdgpu_gpu_recovery && ring->funcs->reset)
+   size += sysfs_emit_at(buf, size, "queue ");
+
+   size += sysfs_emit_at(buf, size, "\n");
+   return size;
+}
+
+static DEVICE_ATTR(vpe_reset_mask, 0444,
+  amdgpu_get_vpe_reset_mask, NULL);
+
+int amdgpu_vpe_sysfs_reset_mask_init(struct amdgpu_device *adev)
+{
+   int r = 0;
+
+   if (adev->vpe.num_instances) {
+   r = device_create_file(adev->dev, &dev_attr_vpe_reset_mask);
+   if (r)
+   return r;
+   }
+
+   return r;
+}
+
+void amdgpu_vpe_sysfs_reset_mask_fini(struct amdgpu_device *adev)
+{
+   if (adev->vpe.num_instances)
+   device_remove_file(adev->dev, &dev_attr_vpe_reset_mask);
+}
+
 static const struct amdgpu_ring_funcs vpe_ring_funcs = {
.type = AMDGPU_RING_TYPE_VPE,
.align_mask = 0xf,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vpe.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vpe.h
index 231d86d0953e..015ba7aaf0ca 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vpe.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vpe.h
@@ -86,6 +86,8 @@ int amdgpu_vpe_init_microcode(struct amdgpu_vpe *vpe);
 int amdgpu_vpe_ring_init(struct amdgpu_vpe *vpe);
 int amdgpu_vpe_ring_fini(struct amdgpu_vpe *vpe);
 int amdgpu_vpe_configure_dpm(struct amdgpu_vpe *vpe);
+void amdgpu_vpe_sysfs_reset_mask_fini(struct amdgpu_device *adev);
+int amdgpu_vpe_sysfs_reset_mask_init(struct amdgpu_device *adev);
 
 #define vpe_ring_init(vpe) ((vpe)->funcs->ring_init ? 
(vpe)->funcs->ring_init((vpe)) : 0)
 #define vpe_ring_start(vpe) ((vpe)->funcs->ring_start ? 
(vpe)->funcs->ring_start((vpe)) : 0)
-- 
2.25.1



[PATCH 1/5 V2] drm/amdgpu: Add sysfs interface for gc reset mask

2024-10-22 Thread jesse.zh...@amd.com
Add two sysfs interfaces for gfx and compute:
gfx_reset_mask
compute_reset_mask

These interfaces are read-only and show the resets supported by the IP.
For example, full adapter reset (mode1/mode2/BACO/etc),
soft reset, queue reset, and pipe reset.

V2: the sysfs node returns a text string instead of some flags (Christian)

Signed-off-by: Jesse Zhang 
Suggested-by: Alex Deucher 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 122 
 drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h |   2 +
 drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c  |   6 ++
 drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c  |   5 +
 drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c  |   5 +
 drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c   |   5 +
 drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c |   5 +
 7 files changed, 150 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
index e96984c53e72..10d55755ee88 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
@@ -1588,6 +1588,94 @@ static ssize_t amdgpu_gfx_set_enforce_isolation(struct 
device *dev,
return count;
 }
 
+static ssize_t amdgpu_gfx_get_gfx_reset_mask(struct device *dev,
+   struct device_attribute *attr,
+   char *buf)
+{
+   struct drm_device *ddev = dev_get_drvdata(dev);
+   struct amdgpu_device *adev = drm_to_adev(ddev);
+   ssize_t size = 0;
+   struct amdgpu_ring *ring = &adev->gfx.gfx_ring[0];
+
+   if (!adev || !ring)
+   return -ENODEV;
+
+   if (amdgpu_device_should_recover_gpu(adev))
+   size += sysfs_emit_at(buf, size, "full ");
+
+   if (amdgpu_gpu_recovery && unlikely(!adev->debug_disable_soft_recovery)
+   && !amdgpu_sriov_vf(adev) && ring->funcs->soft_recovery)
+   size += sysfs_emit_at(buf, size, "soft ");
+
+   if (amdgpu_gpu_recovery && ring->funcs->reset) {
+switch (amdgpu_ip_version(adev, GC_HWIP, 0)) {
+case IP_VERSION(9, 2, 2): //raven2
+case IP_VERSION(9, 3, 0): //renoir
+case IP_VERSION(9, 4, 0): //vega20
+case IP_VERSION(10, 1, 0): //navi10
+case IP_VERSION(10, 1, 1): //navi12
+case IP_VERSION(10, 1, 2): //navi13
+/* Skip flag setting because some cases
+ * are not supported by current firmware.
+ */
+break;
+
+default:
+   size += sysfs_emit_at(buf, size, "queue ");
+break;
+   }
+}
+
+   size += sysfs_emit_at(buf, size, "\n");
+   return size;
+}
+
+static ssize_t amdgpu_gfx_get_compute_reset_mask(struct device *dev,
+   struct device_attribute *attr,
+   char *buf)
+{
+   struct drm_device *ddev = dev_get_drvdata(dev);
+   struct amdgpu_device *adev = drm_to_adev(ddev);
+   ssize_t size = 0;
+   struct amdgpu_ring *ring = &adev->gfx.compute_ring[0];
+
+   if (!adev || !ring)
+   return -ENODEV;
+
+   if (amdgpu_device_should_recover_gpu(adev))
+   size += sysfs_emit_at(buf, size, "full ");
+
+   if (amdgpu_gpu_recovery && unlikely(!adev->debug_disable_soft_recovery)
+   && !amdgpu_sriov_vf(adev) && ring->funcs->soft_recovery)
+   size += sysfs_emit_at(buf, size, "soft ");
+
+   if (amdgpu_gpu_recovery && ring->funcs->reset) {
+switch (amdgpu_ip_version(adev, GC_HWIP, 0)) {
+case IP_VERSION(9, 2, 2): //raven2
+case IP_VERSION(9, 3, 0): //renoir
+case IP_VERSION(9, 4, 0): //vega20
+case IP_VERSION(10, 1, 0): //navi10
+case IP_VERSION(10, 1, 1): //navi12
+case IP_VERSION(10, 1, 2): //navi13
+/* Skip flag setting because some test cases
+ * are not supported by current firmware.
+ */
+break;
+
+default:
+   size += sysfs_emit_at(buf, size, "queue ");
+break;
+   }
+}
+
+   if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) &&
+   adev->gfx.mec_fw_version >= 0x009b)
+   size += sysfs_emit_at(buf, size, "pipe ");
+
+   size += sysfs_emit_at(buf, size, "\n");
+   return size;
+}
+
 static DEVICE_ATTR(run_cleaner_shader, 0200,
   NULL, amdgpu_gfx_set_run_cleaner_shader);
 
@@ -1602,6 +1690,12 @@ static DEVICE_ATTR(current_compute_partition, 0644,
 static DEVICE_ATTR(available_compute_partition, 0444,
   amdgpu_gfx_get_available_compute_partition, NULL);
 
+

[PATCH 2/5 V2] drm/amdgpu: Add sysfs interface for sdma reset mask

2024-10-22 Thread jesse.zh...@amd.com
Add the sysfs interface for sdma:
sdma_reset_mask

The interface is read-only and show the resets supported by the IP.
For example, full adapter reset (mode1/mode2/BACO/etc),
soft reset, queue reset, and pipe reset.

V2: the sysfs node returns a text string instead of some flags (Christian)

Signed-off-by: Jesse Zhang 
Suggested-by: Alex Deucher 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c | 48 
 drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h |  2 +
 drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c   |  5 +++
 drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c   |  5 +++
 drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c   |  5 +++
 drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c |  5 +++
 drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c   |  5 +++
 drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c   |  5 +++
 drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c   |  5 +++
 drivers/gpu/drm/amd/amdgpu/sdma_v7_0.c   |  5 +++
 10 files changed, 90 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
index 183a976ba29d..f20b7285f5fd 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
@@ -343,3 +343,51 @@ int amdgpu_sdma_ras_sw_init(struct amdgpu_device *adev)
 
return 0;
 }
+
+static ssize_t amdgpu_get_sdma_reset_mask(struct device *dev,
+   struct device_attribute *attr,
+   char *buf)
+{
+   struct drm_device *ddev = dev_get_drvdata(dev);
+   struct amdgpu_device *adev = drm_to_adev(ddev);
+   ssize_t size = 0;
+   struct amdgpu_ring *ring = &adev->sdma.instance[0].ring;
+
+   if (!adev || !ring)
+   return -ENODEV;
+
+   if (amdgpu_device_should_recover_gpu(adev))
+   size += sysfs_emit_at(buf, size, "full ");
+
+   if (amdgpu_gpu_recovery && unlikely(!adev->debug_disable_soft_recovery)
+   && !amdgpu_sriov_vf(adev) && ring->funcs->soft_recovery)
+   size += sysfs_emit_at(buf, size, "soft ");
+
+   if (amdgpu_gpu_recovery && ring->funcs->reset)
+   size += sysfs_emit_at(buf, size, "queue ");
+
+   size += sysfs_emit_at(buf, size, "\n");
+   return size;
+}
+
+static DEVICE_ATTR(sdma_reset_mask, 0444,
+  amdgpu_get_sdma_reset_mask, NULL);
+
+int amdgpu_sdma_sysfs_reset_mask_init(struct amdgpu_device *adev)
+{
+   int r = 0;
+
+   if (adev->sdma.num_instances) {
+   r = device_create_file(adev->dev, &dev_attr_sdma_reset_mask);
+   if (r)
+   return r;
+   }
+
+   return r;
+}
+
+void amdgpu_sdma_sysfs_reset_mask_fini(struct amdgpu_device *adev)
+{
+   if (adev->sdma.num_instances)
+   device_remove_file(adev->dev, &dev_attr_sdma_reset_mask);
+}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h
index 087ce0f6fa07..3058548d0733 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h
@@ -175,5 +175,7 @@ int amdgpu_sdma_init_microcode(struct amdgpu_device *adev, 
u32 instance,
 void amdgpu_sdma_destroy_inst_ctx(struct amdgpu_device *adev,
 bool duplicate);
 int amdgpu_sdma_ras_sw_init(struct amdgpu_device *adev);
+int amdgpu_sdma_sysfs_reset_mask_init(struct amdgpu_device *adev);
+void amdgpu_sdma_sysfs_reset_mask_fini(struct amdgpu_device *adev);
 
 #endif
diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c 
b/drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c
index 10fd772cb80f..bd04310cb2b1 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c
@@ -863,6 +863,10 @@ static int sdma_v2_4_sw_init(struct amdgpu_ip_block 
*ip_block)
return r;
}
 
+   r = amdgpu_sdma_sysfs_reset_mask_init(adev);
+   if (r)
+   return r;
+
return r;
 }
 
@@ -874,6 +878,7 @@ static int sdma_v2_4_sw_fini(struct amdgpu_ip_block 
*ip_block)
for (i = 0; i < adev->sdma.num_instances; i++)
amdgpu_ring_fini(&adev->sdma.instance[i].ring);
 
+   amdgpu_sdma_sysfs_reset_mask_fini(adev);
sdma_v2_4_free_microcode(adev);
return 0;
 }
diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c 
b/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c
index 69fba087e09c..1fcf7e977143 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c
@@ -1149,6 +1149,10 @@ static int sdma_v3_0_sw_init(struct amdgpu_ip_block 
*ip_block)
return r;
}
 
+   r = amdgpu_sdma_sysfs_reset_mask_init(adev);
+   if (r)
+   return r;
+
return r;
 }
 
@@ -1160,6 +1164,7 @@ static int sdma_v3_0_sw_fini(struct amdgpu_ip_block 
*ip_block)
for (i = 0; i < adev->sdma.num_instances; i++)
amdgpu_ring_fini(&adev->sdma.instance[i].ring);
 
+   amdgpu_sdma_sysfs_reset_mask_fini(adev);
sdma_v3_0_free_m

[PATCH 3/5 V2] drm/amdgpu: Add sysfs interface for vcn reset mask

2024-10-22 Thread jesse.zh...@amd.com
Add the sysfs interface for vcn:
vcn_reset_mask

The interface is read-only and show the resets supported by the IP.
For example, full adapter reset (mode1/mode2/BACO/etc),
soft reset, queue reset, and pipe reset.

V2: the sysfs node returns a text string instead of some flags (Christian)

Signed-off-by: Jesse Zhang 
Suggested-by: Alex Deucher 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c | 48 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h |  2 ++
 drivers/gpu/drm/amd/amdgpu/vcn_v1_0.c   |  6 
 drivers/gpu/drm/amd/amdgpu/vcn_v2_0.c   |  4 +++
 drivers/gpu/drm/amd/amdgpu/vcn_v3_0.c   |  5 +++
 drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c   |  5 +++
 drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c |  4 +++
 drivers/gpu/drm/amd/amdgpu/vcn_v4_0_5.c |  5 +++
 drivers/gpu/drm/amd/amdgpu/vcn_v5_0_0.c |  6 
 9 files changed, 85 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
index 43f44cc201cb..7ae52c7026d3 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
@@ -1277,3 +1277,51 @@ int amdgpu_vcn_psp_update_sram(struct amdgpu_device 
*adev, int inst_idx,
 
return psp_execute_ip_fw_load(&adev->psp, &ucode);
 }
+
+static ssize_t amdgpu_get_vcn_reset_mask(struct device *dev,
+   struct device_attribute *attr,
+   char *buf)
+{
+   struct drm_device *ddev = dev_get_drvdata(dev);
+   struct amdgpu_device *adev = drm_to_adev(ddev);
+   ssize_t size = 0;
+   struct amdgpu_ring *ring = &adev->vcn.inst[0].ring_enc[0];
+
+   if (!adev || !ring)
+   return -ENODEV;
+
+   if (amdgpu_device_should_recover_gpu(adev))
+   size += sysfs_emit_at(buf, size, "full ");
+
+   if (amdgpu_gpu_recovery && unlikely(!adev->debug_disable_soft_recovery)
+   && !amdgpu_sriov_vf(adev) && ring->funcs->soft_recovery)
+   size += sysfs_emit_at(buf, size, "soft ");
+
+   if (amdgpu_gpu_recovery && ring->funcs->reset)
+   size += sysfs_emit_at(buf, size, "queue ");
+
+   size += sysfs_emit_at(buf, size, "\n");
+   return size;
+}
+
+static DEVICE_ATTR(vcn_reset_mask, 0444,
+  amdgpu_get_vcn_reset_mask, NULL);
+
+int amdgpu_vcn_sysfs_reset_mask_init(struct amdgpu_device *adev)
+{
+   int r = 0;
+
+   if (adev->vcn.num_vcn_inst) {
+   r = device_create_file(adev->dev, &dev_attr_vcn_reset_mask);
+   if (r)
+   return r;
+   }
+
+   return r;
+}
+
+void amdgpu_vcn_sysfs_reset_mask_fini(struct amdgpu_device *adev)
+{
+   if (adev->vcn.num_vcn_inst)
+   device_remove_file(adev->dev, &dev_attr_vcn_reset_mask);
+}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
index 2a1f3dbb14d3..d52c3f752c06 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
@@ -518,5 +518,7 @@ int amdgpu_vcn_ras_sw_init(struct amdgpu_device *adev);
 
 int amdgpu_vcn_psp_update_sram(struct amdgpu_device *adev, int inst_idx,
   enum AMDGPU_UCODE_ID ucode_id);
+int amdgpu_vcn_sysfs_reset_mask_init(struct amdgpu_device *adev);
+void amdgpu_vcn_sysfs_reset_mask_fini(struct amdgpu_device *adev);
 
 #endif
diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v1_0.c 
b/drivers/gpu/drm/amd/amdgpu/vcn_v1_0.c
index 129c759772c2..5d03d4c0a273 100644
--- a/drivers/gpu/drm/amd/amdgpu/vcn_v1_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/vcn_v1_0.c
@@ -207,6 +207,11 @@ static int vcn_v1_0_sw_init(struct amdgpu_ip_block 
*ip_block)
} else {
adev->vcn.ip_dump = ptr;
}
+
+   r = amdgpu_vcn_sysfs_reset_mask_init(adev);
+   if (r)
+   return r;
+
return r;
 }
 
@@ -228,6 +233,7 @@ static int vcn_v1_0_sw_fini(struct amdgpu_ip_block 
*ip_block)
 
jpeg_v1_0_sw_fini(ip_block);
 
+   amdgpu_vcn_sysfs_reset_mask_fini(adev);
r = amdgpu_vcn_sw_fini(adev);
 
kfree(adev->vcn.ip_dump);
diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v2_0.c 
b/drivers/gpu/drm/amd/amdgpu/vcn_v2_0.c
index 19bbd49f760e..6a7a55bc96bc 100644
--- a/drivers/gpu/drm/amd/amdgpu/vcn_v2_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/vcn_v2_0.c
@@ -231,6 +231,9 @@ static int vcn_v2_0_sw_init(struct amdgpu_ip_block 
*ip_block)
adev->vcn.ip_dump = ptr;
}
 
+   r = amdgpu_vcn_sysfs_reset_mask_init(adev);
+   if (r)
+   return r;
return 0;
 }
 
@@ -258,6 +261,7 @@ static int vcn_v2_0_sw_fini(struct amdgpu_ip_block 
*ip_block)
if (r)
return r;
 
+   amdgpu_vcn_sysfs_reset_mask_fini(adev);
r = amdgpu_vcn_sw_fini(adev);
 
kfree(adev->vcn.ip_dump);
diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v3_0.c 
b/drivers/gpu/drm/amd/amdgpu/vcn_v3_0.c
index 13632c22d2f9..7c5838060bae 1006

[PATCH 5/5 V2] drm/amdgpu: Add sysfs interface for jpeg reset mask

2024-10-22 Thread jesse.zh...@amd.com
Add the sysfs interface for jpeg:
jpeg_reset_mask

The interface is read-only and show the resets supported by the IP.
For example, full adapter reset (mode1/mode2/BACO/etc),
soft reset, queue reset, and pipe reset.

V2: the sysfs node returns a text string instead of some flags (Christian)

Signed-off-by: Jesse Zhang 
Suggested-by: Alex Deucher 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.c | 48 
 drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.h |  2 +
 drivers/gpu/drm/amd/amdgpu/jpeg_v4_0.c   |  5 +++
 drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c |  5 +++
 drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_5.c |  5 +++
 drivers/gpu/drm/amd/amdgpu/jpeg_v5_0_0.c |  4 ++
 6 files changed, 69 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.c
index 95e2796919fc..eda71e9b1199 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.c
@@ -415,3 +415,51 @@ void amdgpu_debugfs_jpeg_sched_mask_init(struct 
amdgpu_device *adev)
&amdgpu_debugfs_jpeg_sched_mask_fops);
 #endif
 }
+
+static ssize_t amdgpu_get_jpeg_reset_mask(struct device *dev,
+   struct device_attribute *attr,
+   char *buf)
+{
+   struct drm_device *ddev = dev_get_drvdata(dev);
+   struct amdgpu_device *adev = drm_to_adev(ddev);
+   ssize_t size = 0;
+   struct amdgpu_ring *ring = adev->jpeg.inst[0].ring_dec;
+
+   if (!adev || !ring)
+   return -ENODEV;
+
+   if (amdgpu_device_should_recover_gpu(adev))
+   size += sysfs_emit_at(buf, size, "full ");
+
+   if (amdgpu_gpu_recovery && unlikely(!adev->debug_disable_soft_recovery)
+   && !amdgpu_sriov_vf(adev) && ring->funcs->soft_recovery)
+   size += sysfs_emit_at(buf, size, "soft ");
+
+   if (amdgpu_gpu_recovery && ring->funcs->reset)
+   size += sysfs_emit_at(buf, size, "queue ");
+
+   size += sysfs_emit_at(buf, size, "\n");
+   return size;
+}
+
+static DEVICE_ATTR(jpeg_reset_mask, 0444,
+  amdgpu_get_jpeg_reset_mask, NULL);
+
+int amdgpu_jpeg_sysfs_reset_mask_init(struct amdgpu_device *adev)
+{
+   int r = 0;
+
+   if (adev->jpeg.num_jpeg_inst) {
+   r = device_create_file(adev->dev, &dev_attr_jpeg_reset_mask);
+   if (r)
+   return r;
+   }
+
+   return r;
+}
+
+void amdgpu_jpeg_sysfs_reset_mask_fini(struct amdgpu_device *adev)
+{
+   if (adev->jpeg.num_jpeg_inst)
+   device_remove_file(adev->dev, &dev_attr_jpeg_reset_mask);
+}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.h
index 819dc7a0af99..f5e1c98a4764 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.h
@@ -150,5 +150,7 @@ int amdgpu_jpeg_ras_sw_init(struct amdgpu_device *adev);
 int amdgpu_jpeg_psp_update_sram(struct amdgpu_device *adev, int inst_idx,
   enum AMDGPU_UCODE_ID ucode_id);
 void amdgpu_debugfs_jpeg_sched_mask_init(struct amdgpu_device *adev);
+int amdgpu_jpeg_sysfs_reset_mask_init(struct amdgpu_device *adev);
+void amdgpu_jpeg_sysfs_reset_mask_fini(struct amdgpu_device *adev);
 
 #endif /*__AMDGPU_JPEG_H__*/
diff --git a/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0.c 
b/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0.c
index 20e1fe89c463..c0b86a742a94 100644
--- a/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0.c
@@ -124,6 +124,10 @@ static int jpeg_v4_0_sw_init(struct amdgpu_ip_block 
*ip_block)
if (r)
return r;
 
+   r = amdgpu_jpeg_sysfs_reset_mask_init(adev);
+   if (r)
+   return r;
+
return 0;
 }
 
@@ -143,6 +147,7 @@ static int jpeg_v4_0_sw_fini(struct amdgpu_ip_block 
*ip_block)
if (r)
return r;
 
+   amdgpu_jpeg_sysfs_reset_mask_fini(adev);
r = amdgpu_jpeg_sw_fini(adev);
 
return r;
diff --git a/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c 
b/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c
index 2a53537db135..2a25f7fffbd4 100644
--- a/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c
+++ b/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c
@@ -159,6 +159,10 @@ static int jpeg_v4_0_3_sw_init(struct amdgpu_ip_block 
*ip_block)
}
}
 
+   r = amdgpu_jpeg_sysfs_reset_mask_init(adev);
+   if (r)
+   return r;
+
return 0;
 }
 
@@ -178,6 +182,7 @@ static int jpeg_v4_0_3_sw_fini(struct amdgpu_ip_block 
*ip_block)
if (r)
return r;
 
+   amdgpu_jpeg_sysfs_reset_mask_fini(adev);
r = amdgpu_jpeg_sw_fini(adev);
 
return r;
diff --git a/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_5.c 
b/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_5.c
index ef2d4237925b..0e06c477f6b4 100644
--- a/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_5.c
+++ b/drivers/gpu/d

[PATCH 4/5] drm/amdgpu: Add sysfs interface for vpe reset mask

2024-10-22 Thread jesse.zh...@amd.com
From: "jesse.zh...@amd.com" 

Add the sysfs interface for vpe:
vpe_reset_mask

The interface is read-only and show the resets supported by the IP.
For example, full adapter reset (mode1/mode2/BACO/etc),
soft reset, queue reset, and pipe reset.

Signed-off-by: Jesse Zhang 
Suggested-by: Alex Deucher 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_vpe.c | 53 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_vpe.h |  2 +
 2 files changed, 55 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vpe.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vpe.c
index 6d96e1f21e20..7df8ed6be2fd 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vpe.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vpe.c
@@ -377,6 +377,10 @@ static int vpe_sw_init(struct amdgpu_ip_block *ip_block)
ret = vpe_init_microcode(vpe);
if (ret)
goto out;
+
+   ret = amdgpu_vpe_sysfs_reset_mask_init(adev);
+   if (ret)
+   goto out;
 out:
return ret;
 }
@@ -389,6 +393,7 @@ static int vpe_sw_fini(struct amdgpu_ip_block *ip_block)
release_firmware(vpe->fw);
vpe->fw = NULL;
 
+   amdgpu_vpe_sysfs_reset_mask_fini(adev);
vpe_ring_fini(vpe);
 
amdgpu_bo_free_kernel(&adev->vpe.cmdbuf_obj,
@@ -865,6 +870,54 @@ static void vpe_ring_end_use(struct amdgpu_ring *ring)
schedule_delayed_work(&adev->vpe.idle_work, VPE_IDLE_TIMEOUT);
 }
 
+static ssize_t amdgpu_get_vpe_reset_mask(struct device *dev,
+   struct device_attribute *attr,
+   char *buf)
+{
+   struct drm_device *ddev = dev_get_drvdata(dev);
+   struct amdgpu_device *adev = drm_to_adev(ddev);
+   ssize_t size = 0;
+   struct amdgpu_ring *ring = &adev->vpe.ring;
+
+   if (!adev || !ring)
+   return -ENODEV;
+
+   if (amdgpu_device_should_recover_gpu(adev))
+   size |= AMDGPU_RESET_TYPE_FULL;
+
+   if (amdgpu_gpu_recovery && unlikely(!adev->debug_disable_soft_recovery)
+   && !amdgpu_sriov_vf(adev) && ring->funcs->soft_recovery)
+   size |= AMDGPU_RESET_TYPE_SOFT_RESET;
+
+   if (amdgpu_gpu_recovery && ring->funcs->reset)
+   size |= AMDGPU_RESET_TYPE_PER_QUEUE;
+
+   size = sysfs_emit_at(buf, 0, "%lu\n", size);
+   return size;
+}
+
+static DEVICE_ATTR(vpe_reset_mask, 0444,
+  amdgpu_get_vpe_reset_mask, NULL);
+
+int amdgpu_vpe_sysfs_reset_mask_init(struct amdgpu_device *adev)
+{
+   int r = 0;
+
+   if (adev->vpe.num_instances) {
+   r = device_create_file(adev->dev, &dev_attr_vpe_reset_mask);
+   if (r)
+   return r;
+   }
+
+   return r;
+}
+
+void amdgpu_vpe_sysfs_reset_mask_fini(struct amdgpu_device *adev)
+{
+   if (adev->vpe.num_instances)
+   device_remove_file(adev->dev, &dev_attr_vpe_reset_mask);
+}
+
 static const struct amdgpu_ring_funcs vpe_ring_funcs = {
.type = AMDGPU_RING_TYPE_VPE,
.align_mask = 0xf,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vpe.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vpe.h
index 231d86d0953e..015ba7aaf0ca 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vpe.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vpe.h
@@ -86,6 +86,8 @@ int amdgpu_vpe_init_microcode(struct amdgpu_vpe *vpe);
 int amdgpu_vpe_ring_init(struct amdgpu_vpe *vpe);
 int amdgpu_vpe_ring_fini(struct amdgpu_vpe *vpe);
 int amdgpu_vpe_configure_dpm(struct amdgpu_vpe *vpe);
+void amdgpu_vpe_sysfs_reset_mask_fini(struct amdgpu_device *adev);
+int amdgpu_vpe_sysfs_reset_mask_init(struct amdgpu_device *adev);
 
 #define vpe_ring_init(vpe) ((vpe)->funcs->ring_init ? 
(vpe)->funcs->ring_init((vpe)) : 0)
 #define vpe_ring_start(vpe) ((vpe)->funcs->ring_start ? 
(vpe)->funcs->ring_start((vpe)) : 0)
-- 
2.25.1



[PATCH 2/5] drm/amdgpu: Add sysfs interface for sdma reset mask

2024-10-22 Thread jesse.zh...@amd.com
Add the sysfs interface for sdma:
sdma_reset_mask

The interface is read-only and show the resets supported by the IP.
For example, full adapter reset (mode1/mode2/BACO/etc),
soft reset, queue reset, and pipe reset.

Signed-off-by: Jesse Zhang 
Suggested-by: Alex Deucher 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c | 49 
 drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h |  2 +
 drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c   |  5 +++
 drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c   |  5 +++
 drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c   |  3 ++
 drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c |  3 ++
 drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c   |  3 ++
 drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c   |  2 +
 drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c   |  3 ++
 drivers/gpu/drm/amd/amdgpu/sdma_v7_0.c   |  3 ++
 10 files changed, 78 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
index 183a976ba29d..90156669ac66 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
@@ -343,3 +343,52 @@ int amdgpu_sdma_ras_sw_init(struct amdgpu_device *adev)
 
return 0;
 }
+
+static ssize_t amdgpu_get_sdma_reset_mask(struct device *dev,
+   struct device_attribute *attr,
+   char *buf)
+{
+   struct drm_device *ddev = dev_get_drvdata(dev);
+   struct amdgpu_device *adev = drm_to_adev(ddev);
+   ssize_t size = 0;
+   struct amdgpu_ring *ring = &adev->sdma.instance[0].ring;
+
+   if (!adev || !ring)
+   return -ENODEV;
+
+   if (amdgpu_device_should_recover_gpu(adev))
+   size |= AMDGPU_RESET_TYPE_FULL;
+
+   if (amdgpu_gpu_recovery && unlikely(!adev->debug_disable_soft_recovery)
+   && !amdgpu_sriov_vf(adev) && ring->funcs->soft_recovery)
+   size |= AMDGPU_RESET_TYPE_SOFT_RESET;
+
+   if (amdgpu_gpu_recovery && ring->funcs->reset)
+   size |= AMDGPU_RESET_TYPE_PER_QUEUE;
+
+   size = sysfs_emit_at(buf, 0, "%lu\n", size);
+   return size;
+}
+
+static DEVICE_ATTR(sdma_reset_mask, 0444,
+  amdgpu_get_sdma_reset_mask, NULL);
+
+int amdgpu_sdma_sysfs_reset_mask_init(struct amdgpu_device *adev)
+{
+   int r = 0;
+
+   if (adev->sdma.num_instances) {
+   r = device_create_file(adev->dev, &dev_attr_sdma_reset_mask);
+   if (r)
+   return r;
+   }
+
+   return r;
+}
+
+void amdgpu_sdma_sysfs_reset_mask_fini(struct amdgpu_device *adev)
+{
+   if (adev->sdma.num_instances)
+   device_remove_file(adev->dev, &dev_attr_sdma_reset_mask);
+}
+
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h
index 087ce0f6fa07..3058548d0733 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h
@@ -175,5 +175,7 @@ int amdgpu_sdma_init_microcode(struct amdgpu_device *adev, 
u32 instance,
 void amdgpu_sdma_destroy_inst_ctx(struct amdgpu_device *adev,
 bool duplicate);
 int amdgpu_sdma_ras_sw_init(struct amdgpu_device *adev);
+int amdgpu_sdma_sysfs_reset_mask_init(struct amdgpu_device *adev);
+void amdgpu_sdma_sysfs_reset_mask_fini(struct amdgpu_device *adev);
 
 #endif
diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c 
b/drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c
index 10fd772cb80f..bd04310cb2b1 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c
@@ -863,6 +863,10 @@ static int sdma_v2_4_sw_init(struct amdgpu_ip_block 
*ip_block)
return r;
}
 
+   r = amdgpu_sdma_sysfs_reset_mask_init(adev);
+   if (r)
+   return r;
+
return r;
 }
 
@@ -874,6 +878,7 @@ static int sdma_v2_4_sw_fini(struct amdgpu_ip_block 
*ip_block)
for (i = 0; i < adev->sdma.num_instances; i++)
amdgpu_ring_fini(&adev->sdma.instance[i].ring);
 
+   amdgpu_sdma_sysfs_reset_mask_fini(adev);
sdma_v2_4_free_microcode(adev);
return 0;
 }
diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c 
b/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c
index 69fba087e09c..9d9dc29132e3 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c
@@ -1149,6 +1149,10 @@ static int sdma_v3_0_sw_init(struct amdgpu_ip_block 
*ip_block)
return r;
}
 
+   r = amdgpu_sdma_sysfs_reset_mask_init(adev);
+   if (r)
+   return r;
+
return r;
 }
 
@@ -1160,6 +1164,7 @@ static int sdma_v3_0_sw_fini(struct amdgpu_ip_block 
*ip_block)
for (i = 0; i < adev->sdma.num_instances; i++)
amdgpu_ring_fini(&adev->sdma.instance[i].ring);
 
+   amdgpu_sdma_sysfs_reset_mask_fini(adev);
sdma_v3_0_free_microcode(adev);
return 0;
 }
diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c 
b/driver

[PATCH 1/5] drm/amdgpu: Add sysfs interface for gc reset mask

2024-10-22 Thread jesse.zh...@amd.com
Add two sysfs interfaces for gfx and compute:
gfx_reset_mask
compute_reset_mask

These interfaces are read-only and show the resets supported by the IP.
For example, full adapter reset (mode1/mode2/BACO/etc),
soft reset, queue reset, and pipe reset.

Signed-off-by: Jesse Zhang 
Suggested-by: Alex Deucher 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h |   6 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 122 
 drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h |   2 +
 drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c  |   6 ++
 drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c  |   5 +
 drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c  |   5 +
 drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c   |   5 +
 drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c |   5 +
 8 files changed, 156 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 48c9b9b06905..0dd475c30267 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -300,6 +300,12 @@ extern int amdgpu_wbrf;
 #define AMDGPU_RESET_VCE   (1 << 13)
 #define AMDGPU_RESET_VCE1  (1 << 14)
 
+/* reset mask */
+#define AMDGPU_RESET_TYPE_FULL (1 << 0) /* full adapter reset, 
mode1/mode2/BACO/etc. */
+#define AMDGPU_RESET_TYPE_SOFT_RESET (1 << 1) /* IP level soft reset */
+#define AMDGPU_RESET_TYPE_PER_QUEUE (1 << 2) /* per queue */
+#define AMDGPU_RESET_TYPE_PER_PIPE (1 << 3) /* per pipe */
+
 /* max cursor sizes (in pixels) */
 #define CIK_CURSOR_WIDTH 128
 #define CIK_CURSOR_HEIGHT 128
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
index e96984c53e72..b4706355ece8 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
@@ -1588,6 +1588,94 @@ static ssize_t amdgpu_gfx_set_enforce_isolation(struct 
device *dev,
return count;
 }
 
+static ssize_t amdgpu_gfx_get_gfx_reset_mask(struct device *dev,
+   struct device_attribute *attr,
+   char *buf)
+{
+   struct drm_device *ddev = dev_get_drvdata(dev);
+   struct amdgpu_device *adev = drm_to_adev(ddev);
+   ssize_t size = 0;
+   struct amdgpu_ring *ring = &adev->gfx.gfx_ring[0];
+
+   if (!adev || !ring)
+   return -ENODEV;
+
+   if (amdgpu_device_should_recover_gpu(adev))
+   size |= AMDGPU_RESET_TYPE_FULL;
+
+   if (amdgpu_gpu_recovery && unlikely(!adev->debug_disable_soft_recovery)
+   && !amdgpu_sriov_vf(adev) && ring->funcs->soft_recovery)
+   size |= AMDGPU_RESET_TYPE_SOFT_RESET;
+
+   if (amdgpu_gpu_recovery && ring->funcs->reset) {
+switch (amdgpu_ip_version(adev, GC_HWIP, 0)) {
+case IP_VERSION(9, 2, 2): //raven2
+case IP_VERSION(9, 3, 0): //renoir
+case IP_VERSION(9, 4, 0): //vega20
+case IP_VERSION(10, 1, 0): //navi10
+case IP_VERSION(10, 1, 1): //navi14
+case IP_VERSION(10, 1, 2): //navi12
+/* Skip flag setting because some cases
+ * are not supported by current firmware.
+ */
+break;
+
+default:
+size |= AMDGPU_RESET_TYPE_PER_QUEUE;
+break;
+   }
+}
+
+   size = sysfs_emit_at(buf, 0, "%lu\n", size);
+   return size;
+}
+
+static ssize_t amdgpu_gfx_get_compute_reset_mask(struct device *dev,
+   struct device_attribute *attr,
+   char *buf)
+{
+   struct drm_device *ddev = dev_get_drvdata(dev);
+   struct amdgpu_device *adev = drm_to_adev(ddev);
+   ssize_t size = 0;
+   struct amdgpu_ring *ring = &adev->gfx.compute_ring[0];
+
+   if (!adev || !ring)
+   return -ENODEV;
+
+   if (amdgpu_device_should_recover_gpu(adev))
+   size |= AMDGPU_RESET_TYPE_FULL;
+
+   if (amdgpu_gpu_recovery && unlikely(!adev->debug_disable_soft_recovery)
+   && !amdgpu_sriov_vf(adev) && ring->funcs->soft_recovery)
+   size |= AMDGPU_RESET_TYPE_SOFT_RESET;
+
+   if (amdgpu_gpu_recovery && ring->funcs->reset) {
+switch (amdgpu_ip_version(adev, GC_HWIP, 0)) {
+case IP_VERSION(9, 2, 2): //raven2
+case IP_VERSION(9, 3, 0): //renoir
+case IP_VERSION(9, 4, 0): //vega20
+case IP_VERSION(10, 1, 0): //navi10
+case IP_VERSION(10, 1, 1): //navi14
+case IP_VERSION(10, 1, 2): //navi12
+/* Skip flag setting because some test cases
+ * are not supported by current firmware.
+ */
+break;
+
+default:
+ 

[PATCH 3/5] drm/amdgpu: Add sysfs interface for vcn reset mask

2024-10-22 Thread jesse.zh...@amd.com
From: "jesse.zh...@amd.com" 

Add the sysfs interface for vcn:
vcn_reset_mask

The interface is read-only and show the resets supported by the IP.
For example, full adapter reset (mode1/mode2/BACO/etc),
soft reset, queue reset, and pipe reset.

Signed-off-by: Jesse Zhang 
Suggested-by: Alex Deucher 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c | 49 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h |  2 +
 drivers/gpu/drm/amd/amdgpu/vcn_v1_0.c   |  6 +++
 drivers/gpu/drm/amd/amdgpu/vcn_v2_0.c   |  4 ++
 drivers/gpu/drm/amd/amdgpu/vcn_v3_0.c   |  5 +++
 drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c   |  5 +++
 drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c |  4 ++
 drivers/gpu/drm/amd/amdgpu/vcn_v4_0_5.c |  5 +++
 drivers/gpu/drm/amd/amdgpu/vcn_v5_0_0.c |  6 +++
 9 files changed, 86 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
index 43f44cc201cb..409ad6185cf3 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
@@ -1277,3 +1277,52 @@ int amdgpu_vcn_psp_update_sram(struct amdgpu_device 
*adev, int inst_idx,
 
return psp_execute_ip_fw_load(&adev->psp, &ucode);
 }
+
+static ssize_t amdgpu_get_vcn_reset_mask(struct device *dev,
+   struct device_attribute *attr,
+   char *buf)
+{
+   struct drm_device *ddev = dev_get_drvdata(dev);
+   struct amdgpu_device *adev = drm_to_adev(ddev);
+   ssize_t size = 0;
+   struct amdgpu_ring *ring = &adev->vcn.inst[0].ring_enc[0];
+
+   if (!adev || !ring)
+   return -ENODEV;
+
+   if (amdgpu_device_should_recover_gpu(adev))
+   size |= AMDGPU_RESET_TYPE_FULL;
+
+   if (amdgpu_gpu_recovery && unlikely(!adev->debug_disable_soft_recovery)
+   && !amdgpu_sriov_vf(adev) && ring->funcs->soft_recovery)
+   size |= AMDGPU_RESET_TYPE_SOFT_RESET;
+
+   if (amdgpu_gpu_recovery && ring->funcs->reset)
+   size |= AMDGPU_RESET_TYPE_PER_QUEUE;
+
+   size = sysfs_emit_at(buf, 0, "%lu\n", size);
+   return size;
+}
+
+static DEVICE_ATTR(vcn_reset_mask, 0444,
+  amdgpu_get_vcn_reset_mask, NULL);
+
+int amdgpu_vcn_sysfs_reset_mask_init(struct amdgpu_device *adev)
+{
+   int r = 0;
+
+   if (adev->vcn.num_vcn_inst) {
+   r = device_create_file(adev->dev, &dev_attr_vcn_reset_mask);
+   if (r)
+   return r;
+   }
+
+   return r;
+}
+
+void amdgpu_vcn_sysfs_reset_mask_fini(struct amdgpu_device *adev)
+{
+   if (adev->vcn.num_vcn_inst)
+   device_remove_file(adev->dev, &dev_attr_vcn_reset_mask);
+}
+
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
index 2a1f3dbb14d3..d52c3f752c06 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
@@ -518,5 +518,7 @@ int amdgpu_vcn_ras_sw_init(struct amdgpu_device *adev);
 
 int amdgpu_vcn_psp_update_sram(struct amdgpu_device *adev, int inst_idx,
   enum AMDGPU_UCODE_ID ucode_id);
+int amdgpu_vcn_sysfs_reset_mask_init(struct amdgpu_device *adev);
+void amdgpu_vcn_sysfs_reset_mask_fini(struct amdgpu_device *adev);
 
 #endif
diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v1_0.c 
b/drivers/gpu/drm/amd/amdgpu/vcn_v1_0.c
index 129c759772c2..5d03d4c0a273 100644
--- a/drivers/gpu/drm/amd/amdgpu/vcn_v1_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/vcn_v1_0.c
@@ -207,6 +207,11 @@ static int vcn_v1_0_sw_init(struct amdgpu_ip_block 
*ip_block)
} else {
adev->vcn.ip_dump = ptr;
}
+
+   r = amdgpu_vcn_sysfs_reset_mask_init(adev);
+   if (r)
+   return r;
+
return r;
 }
 
@@ -228,6 +233,7 @@ static int vcn_v1_0_sw_fini(struct amdgpu_ip_block 
*ip_block)
 
jpeg_v1_0_sw_fini(ip_block);
 
+   amdgpu_vcn_sysfs_reset_mask_fini(adev);
r = amdgpu_vcn_sw_fini(adev);
 
kfree(adev->vcn.ip_dump);
diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v2_0.c 
b/drivers/gpu/drm/amd/amdgpu/vcn_v2_0.c
index 19bbd49f760e..6a7a55bc96bc 100644
--- a/drivers/gpu/drm/amd/amdgpu/vcn_v2_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/vcn_v2_0.c
@@ -231,6 +231,9 @@ static int vcn_v2_0_sw_init(struct amdgpu_ip_block 
*ip_block)
adev->vcn.ip_dump = ptr;
}
 
+   r = amdgpu_vcn_sysfs_reset_mask_init(adev);
+   if (r)
+   return r;
return 0;
 }
 
@@ -258,6 +261,7 @@ static int vcn_v2_0_sw_fini(struct amdgpu_ip_block 
*ip_block)
if (r)
return r;
 
+   amdgpu_vcn_sysfs_reset_mask_fini(adev);
r = amdgpu_vcn_sw_fini(adev);
 
kfree(adev->vcn.ip_dump);
diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v3_0.c 
b/drivers/gpu/drm/amd/am

[PATCH 5/5] drm/amdgpu: Add sysfs interface for jpeg reset mask

2024-10-22 Thread jesse.zh...@amd.com
From: "jesse.zh...@amd.com" 

Add the sysfs interface for jpeg:
jpeg_reset_mask

The interface is read-only and show the resets supported by the IP.
For example, full adapter reset (mode1/mode2/BACO/etc),
soft reset, queue reset, and pipe reset.

Signed-off-by: Jesse Zhang 
Suggested-by: Alex Deucher 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.c | 48 
 drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.h |  2 +
 drivers/gpu/drm/amd/amdgpu/jpeg_v4_0.c   |  5 +++
 drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c |  5 +++
 drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_5.c |  5 +++
 drivers/gpu/drm/amd/amdgpu/jpeg_v5_0_0.c |  4 ++
 6 files changed, 69 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.c
index 95e2796919fc..34742232af47 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.c
@@ -415,3 +415,51 @@ void amdgpu_debugfs_jpeg_sched_mask_init(struct 
amdgpu_device *adev)
&amdgpu_debugfs_jpeg_sched_mask_fops);
 #endif
 }
+
+static ssize_t amdgpu_get_jpeg_reset_mask(struct device *dev,
+   struct device_attribute *attr,
+   char *buf)
+{
+   struct drm_device *ddev = dev_get_drvdata(dev);
+   struct amdgpu_device *adev = drm_to_adev(ddev);
+   ssize_t size = 0;
+   struct amdgpu_ring *ring = adev->jpeg.inst[0].ring_dec;
+
+   if (!adev || !ring)
+   return -ENODEV;
+
+   if (amdgpu_device_should_recover_gpu(adev))
+   size |= AMDGPU_RESET_TYPE_FULL;
+
+   if (amdgpu_gpu_recovery && unlikely(!adev->debug_disable_soft_recovery)
+   && !amdgpu_sriov_vf(adev) && ring->funcs->soft_recovery)
+   size |= AMDGPU_RESET_TYPE_SOFT_RESET;
+
+   if (amdgpu_gpu_recovery && ring->funcs->reset)
+   size |= AMDGPU_RESET_TYPE_PER_QUEUE;
+
+   size = sysfs_emit_at(buf, 0, "%lu\n", size);
+   return size;
+}
+
+static DEVICE_ATTR(jpeg_reset_mask, 0444,
+  amdgpu_get_jpeg_reset_mask, NULL);
+
+int amdgpu_jpeg_sysfs_reset_mask_init(struct amdgpu_device *adev)
+{
+   int r = 0;
+
+   if (adev->jpeg.num_jpeg_inst) {
+   r = device_create_file(adev->dev, &dev_attr_jpeg_reset_mask);
+   if (r)
+   return r;
+   }
+
+   return r;
+}
+
+void amdgpu_jpeg_sysfs_reset_mask_fini(struct amdgpu_device *adev)
+{
+   if (adev->jpeg.num_jpeg_inst)
+   device_remove_file(adev->dev, &dev_attr_jpeg_reset_mask);
+}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.h
index 819dc7a0af99..f5e1c98a4764 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.h
@@ -150,5 +150,7 @@ int amdgpu_jpeg_ras_sw_init(struct amdgpu_device *adev);
 int amdgpu_jpeg_psp_update_sram(struct amdgpu_device *adev, int inst_idx,
   enum AMDGPU_UCODE_ID ucode_id);
 void amdgpu_debugfs_jpeg_sched_mask_init(struct amdgpu_device *adev);
+int amdgpu_jpeg_sysfs_reset_mask_init(struct amdgpu_device *adev);
+void amdgpu_jpeg_sysfs_reset_mask_fini(struct amdgpu_device *adev);
 
 #endif /*__AMDGPU_JPEG_H__*/
diff --git a/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0.c 
b/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0.c
index 20e1fe89c463..c0b86a742a94 100644
--- a/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0.c
@@ -124,6 +124,10 @@ static int jpeg_v4_0_sw_init(struct amdgpu_ip_block 
*ip_block)
if (r)
return r;
 
+   r = amdgpu_jpeg_sysfs_reset_mask_init(adev);
+   if (r)
+   return r;
+
return 0;
 }
 
@@ -143,6 +147,7 @@ static int jpeg_v4_0_sw_fini(struct amdgpu_ip_block 
*ip_block)
if (r)
return r;
 
+   amdgpu_jpeg_sysfs_reset_mask_fini(adev);
r = amdgpu_jpeg_sw_fini(adev);
 
return r;
diff --git a/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c 
b/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c
index 2a53537db135..2a25f7fffbd4 100644
--- a/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c
+++ b/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c
@@ -159,6 +159,10 @@ static int jpeg_v4_0_3_sw_init(struct amdgpu_ip_block 
*ip_block)
}
}
 
+   r = amdgpu_jpeg_sysfs_reset_mask_init(adev);
+   if (r)
+   return r;
+
return 0;
 }
 
@@ -178,6 +182,7 @@ static int jpeg_v4_0_3_sw_fini(struct amdgpu_ip_block 
*ip_block)
if (r)
return r;
 
+   amdgpu_jpeg_sysfs_reset_mask_fini(adev);
r = amdgpu_jpeg_sw_fini(adev);
 
return r;
diff --git a/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_5.c 
b/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_5.c
index ef2d4237925b..0e06c477f6b4 100644
--- a/drivers/gpu/drm/amd/amdgp

[PATCH V3 1/5] drm/amdgpu: Add sysfs interface for gc reset mask

2024-10-24 Thread jesse.zh...@amd.com
Add two sysfs interfaces for gfx and compute:
gfx_reset_mask
compute_reset_mask

These interfaces are read-only and show the resets supported by the IP.
For example, full adapter reset (mode1/mode2/BACO/etc),
soft reset, queue reset, and pipe reset.

V2: the sysfs node returns a text string instead of some flags (Christian)
v3: add a generic helper which takes the ring as parameter
and print the strings in the order they are applied (Christian)

check amdgpu_gpu_recovery  before creating sysfs file itself,
and initialize supported_reset_types in IP version files (Lijo)

Signed-off-by: Jesse Zhang 
Suggested-by: Alex Deucher 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h|  8 +++
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 37 
 drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c| 66 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h|  4 ++
 drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c |  6 ++
 drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c | 14 +
 drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c| 12 
 7 files changed, 147 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 48c9b9b06905..aea1031d7b84 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -300,6 +300,12 @@ extern int amdgpu_wbrf;
 #define AMDGPU_RESET_VCE   (1 << 13)
 #define AMDGPU_RESET_VCE1  (1 << 14)
 
+/* reset mask */
+#define AMDGPU_RESET_TYPE_FULL (1 << 0) /* full adapter reset, 
mode1/mode2/BACO/etc. */
+#define AMDGPU_RESET_TYPE_SOFT_RESET (1 << 1) /* IP level soft reset */
+#define AMDGPU_RESET_TYPE_PER_QUEUE (1 << 2) /* per queue */
+#define AMDGPU_RESET_TYPE_PER_PIPE (1 << 3) /* per pipe */
+
 /* max cursor sizes (in pixels) */
 #define CIK_CURSOR_WIDTH 128
 #define CIK_CURSOR_HEIGHT 128
@@ -1466,6 +1472,8 @@ struct dma_fence *amdgpu_device_get_gang(struct 
amdgpu_device *adev);
 struct dma_fence *amdgpu_device_switch_gang(struct amdgpu_device *adev,
struct dma_fence *gang);
 bool amdgpu_device_has_display_hardware(struct amdgpu_device *adev);
+ssize_t amdgpu_get_soft_full_reset_mask(struct amdgpu_ring *ring);
+ssize_t amdgpu_show_reset_mask(char *buf, uint32_t supported_reset);
 
 /* atpx handler */
 #if defined(CONFIG_VGA_SWITCHEROO)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index ef715b2bbcdb..cd1e3f018893 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -6684,3 +6684,40 @@ uint32_t amdgpu_device_wait_on_rreg(struct amdgpu_device 
*adev,
}
return ret;
 }
+
+ssize_t amdgpu_get_soft_full_reset_mask(struct amdgpu_ring *ring)
+{
+   ssize_t size = 0;
+
+   if (!ring)
+   return size;
+
+   if (amdgpu_device_should_recover_gpu(ring->adev))
+   size |= AMDGPU_RESET_TYPE_FULL;
+
+   if (unlikely(!ring->adev->debug_disable_soft_recovery) &&
+   !amdgpu_sriov_vf(ring->adev) && ring->funcs->soft_recovery)
+   size |= AMDGPU_RESET_TYPE_SOFT_RESET;
+
+   return size;
+}
+
+ssize_t amdgpu_show_reset_mask(char *buf, uint32_t supported_reset)
+{
+   ssize_t size = 0;
+
+   if (supported_reset & AMDGPU_RESET_TYPE_SOFT_RESET)
+   size += sysfs_emit_at(buf, size, "soft ");
+
+   if (supported_reset & AMDGPU_RESET_TYPE_PER_QUEUE)
+   size += sysfs_emit_at(buf, size, "queue ");
+
+   if (supported_reset & AMDGPU_RESET_TYPE_PER_PIPE)
+   size += sysfs_emit_at(buf, size, "pipe ");
+
+   if (supported_reset & AMDGPU_RESET_TYPE_FULL)
+   size += sysfs_emit_at(buf, size, "full ");
+
+   size += sysfs_emit_at(buf, size, "\n");
+   return size;
+}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
index e96984c53e72..6de1f3bf6863 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
@@ -1588,6 +1588,32 @@ static ssize_t amdgpu_gfx_set_enforce_isolation(struct 
device *dev,
return count;
 }
 
+static ssize_t amdgpu_gfx_get_gfx_reset_mask(struct device *dev,
+   struct device_attribute *attr,
+   char *buf)
+{
+   struct drm_device *ddev = dev_get_drvdata(dev);
+   struct amdgpu_device *adev = drm_to_adev(ddev);
+
+   if (!adev)
+   return -ENODEV;
+
+   return amdgpu_show_reset_mask(buf, adev->gfx.gfx_supported_reset);
+}
+
+static ssize_t amdgpu_gfx_get_compute_reset_mask(struct device *dev,
+   struct device_attribute *attr,
+   char *buf)
+{
+   struct drm_device *ddev = dev_get_drvdata(dev);
+   struct amdgpu_device *adev = drm_to_adev(ddev);
+
+   if (!adev)
+   

[PATCH 1/5 V4 1/5] drm/amdgpu: Add sysfs interface for gc reset mask

2024-10-29 Thread jesse.zh...@amd.com
From: "jesse.zh...@amd.com" 

Add two sysfs interfaces for gfx and compute:
gfx_reset_mask
compute_reset_mask

These interfaces are read-only and show the resets supported by the IP.
For example, full adapter reset (mode1/mode2/BACO/etc),
soft reset, queue reset, and pipe reset.

V2: the sysfs node returns a text string instead of some flags (Christian)
v3: add a generic helper which takes the ring as parameter
and print the strings in the order they are applied (Christian)

check amdgpu_gpu_recovery  before creating sysfs file itself,
and initialize supported_reset_types in IP version files (Lijo)
v4: Fixing uninitialized variables (Tim)

Signed-off-by: Jesse Zhang 
Suggested-by: Alex Deucher 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h|  8 +++
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 37 
 drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c| 66 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h|  4 ++
 drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c |  9 +++
 drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c | 23 
 drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c | 10 
 drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c  | 10 
 drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c| 17 ++
 9 files changed, 184 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 48c9b9b06905..aea1031d7b84 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -300,6 +300,12 @@ extern int amdgpu_wbrf;
 #define AMDGPU_RESET_VCE   (1 << 13)
 #define AMDGPU_RESET_VCE1  (1 << 14)
 
+/* reset mask */
+#define AMDGPU_RESET_TYPE_FULL (1 << 0) /* full adapter reset, 
mode1/mode2/BACO/etc. */
+#define AMDGPU_RESET_TYPE_SOFT_RESET (1 << 1) /* IP level soft reset */
+#define AMDGPU_RESET_TYPE_PER_QUEUE (1 << 2) /* per queue */
+#define AMDGPU_RESET_TYPE_PER_PIPE (1 << 3) /* per pipe */
+
 /* max cursor sizes (in pixels) */
 #define CIK_CURSOR_WIDTH 128
 #define CIK_CURSOR_HEIGHT 128
@@ -1466,6 +1472,8 @@ struct dma_fence *amdgpu_device_get_gang(struct 
amdgpu_device *adev);
 struct dma_fence *amdgpu_device_switch_gang(struct amdgpu_device *adev,
struct dma_fence *gang);
 bool amdgpu_device_has_display_hardware(struct amdgpu_device *adev);
+ssize_t amdgpu_get_soft_full_reset_mask(struct amdgpu_ring *ring);
+ssize_t amdgpu_show_reset_mask(char *buf, uint32_t supported_reset);
 
 /* atpx handler */
 #if defined(CONFIG_VGA_SWITCHEROO)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index ef715b2bbcdb..cd1e3f018893 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -6684,3 +6684,40 @@ uint32_t amdgpu_device_wait_on_rreg(struct amdgpu_device 
*adev,
}
return ret;
 }
+
+ssize_t amdgpu_get_soft_full_reset_mask(struct amdgpu_ring *ring)
+{
+   ssize_t size = 0;
+
+   if (!ring)
+   return size;
+
+   if (amdgpu_device_should_recover_gpu(ring->adev))
+   size |= AMDGPU_RESET_TYPE_FULL;
+
+   if (unlikely(!ring->adev->debug_disable_soft_recovery) &&
+   !amdgpu_sriov_vf(ring->adev) && ring->funcs->soft_recovery)
+   size |= AMDGPU_RESET_TYPE_SOFT_RESET;
+
+   return size;
+}
+
+ssize_t amdgpu_show_reset_mask(char *buf, uint32_t supported_reset)
+{
+   ssize_t size = 0;
+
+   if (supported_reset & AMDGPU_RESET_TYPE_SOFT_RESET)
+   size += sysfs_emit_at(buf, size, "soft ");
+
+   if (supported_reset & AMDGPU_RESET_TYPE_PER_QUEUE)
+   size += sysfs_emit_at(buf, size, "queue ");
+
+   if (supported_reset & AMDGPU_RESET_TYPE_PER_PIPE)
+   size += sysfs_emit_at(buf, size, "pipe ");
+
+   if (supported_reset & AMDGPU_RESET_TYPE_FULL)
+   size += sysfs_emit_at(buf, size, "full ");
+
+   size += sysfs_emit_at(buf, size, "\n");
+   return size;
+}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
index e96984c53e72..6de1f3bf6863 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
@@ -1588,6 +1588,32 @@ static ssize_t amdgpu_gfx_set_enforce_isolation(struct 
device *dev,
return count;
 }
 
+static ssize_t amdgpu_gfx_get_gfx_reset_mask(struct device *dev,
+   struct device_attribute *attr,
+   char *buf)
+{
+   struct drm_device *ddev = dev_get_drvdata(dev);
+   struct amdgpu_device *adev = drm_to_adev(ddev);
+
+   if (!adev)
+   return -ENODEV;
+
+   return amdgpu_show_reset_mask(buf, adev->gfx.gfx_supported_reset);
+}

[PATCH 2/5 V4] drm/amdgpu: Add sysfs interface for sdma reset mask

2024-10-29 Thread jesse.zh...@amd.com
From: "jesse.zh...@amd.com" 

Add the sysfs interface for sdma:
sdma_reset_mask

The interface is read-only and show the resets supported by the IP.
For example, full adapter reset (mode1/mode2/BACO/etc),
soft reset, queue reset, and pipe reset.

V2: the sysfs node returns a text string instead of some flags (Christian)
v3: add a generic helper which takes the ring as parameter
   and print the strings in the order they are applied (Christian)

   check amdgpu_gpu_recovery  before creating sysfs file itself,
   and initialize supported_reset_types in IP version files (Lijo)

Signed-off-by: Jesse Zhang 
Suggested-by: Alex Deucher 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c | 41 
 drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h |  3 ++
 drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c |  9 ++
 drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c   | 18 +++
 drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c   | 23 +
 drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c   | 17 ++
 6 files changed, 111 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
index 183a976ba29d..7edcd989afce 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
@@ -343,3 +343,44 @@ int amdgpu_sdma_ras_sw_init(struct amdgpu_device *adev)
 
return 0;
 }
+
+static ssize_t amdgpu_get_sdma_reset_mask(struct device *dev,
+   struct device_attribute *attr,
+   char *buf)
+{
+   struct drm_device *ddev = dev_get_drvdata(dev);
+   struct amdgpu_device *adev = drm_to_adev(ddev);
+
+   if (!adev)
+   return -ENODEV;
+
+   return amdgpu_show_reset_mask(buf, adev->sdma.supported_reset);
+}
+
+static DEVICE_ATTR(sdma_reset_mask, 0444,
+  amdgpu_get_sdma_reset_mask, NULL);
+
+int amdgpu_sdma_sysfs_reset_mask_init(struct amdgpu_device *adev)
+{
+   int r = 0;
+
+   if (!amdgpu_gpu_recovery)
+   return r;
+
+   if (adev->sdma.num_instances) {
+   r = device_create_file(adev->dev, &dev_attr_sdma_reset_mask);
+   if (r)
+   return r;
+   }
+
+   return r;
+}
+
+void amdgpu_sdma_sysfs_reset_mask_fini(struct amdgpu_device *adev)
+{
+   if (!amdgpu_gpu_recovery)
+   return;
+
+   if (adev->sdma.num_instances)
+   device_remove_file(adev->dev, &dev_attr_sdma_reset_mask);
+}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h
index 087ce0f6fa07..7ce613de7ee0 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h
@@ -116,6 +116,7 @@ struct amdgpu_sdma {
struct ras_common_if*ras_if;
struct amdgpu_sdma_ras  *ras;
uint32_t*ip_dump;
+   uint32_tsupported_reset;
 };
 
 /*
@@ -175,5 +176,7 @@ int amdgpu_sdma_init_microcode(struct amdgpu_device *adev, 
u32 instance,
 void amdgpu_sdma_destroy_inst_ctx(struct amdgpu_device *adev,
 bool duplicate);
 int amdgpu_sdma_ras_sw_init(struct amdgpu_device *adev);
+int amdgpu_sdma_sysfs_reset_mask_init(struct amdgpu_device *adev);
+void amdgpu_sdma_sysfs_reset_mask_fini(struct amdgpu_device *adev);
 
 #endif
diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c 
b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
index 9c7cea0890c9..a38553f38fdc 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
@@ -1430,6 +1430,10 @@ static int sdma_v4_4_2_sw_init(struct amdgpu_ip_block 
*ip_block)
}
}
 
+   /* TODO: Add queue reset mask when FW fully supports it */
+   adev->sdma.supported_reset =
+   amdgpu_get_soft_full_reset_mask(&adev->sdma.instance[0].ring);
+
if (amdgpu_sdma_ras_sw_init(adev)) {
dev_err(adev->dev, "fail to initialize sdma ras block\n");
return -EINVAL;
@@ -1442,6 +1446,10 @@ static int sdma_v4_4_2_sw_init(struct amdgpu_ip_block 
*ip_block)
else
DRM_ERROR("Failed to allocated memory for SDMA IP Dump\n");
 
+   r = amdgpu_sdma_sysfs_reset_mask_init(adev);
+   if (r)
+   return r;
+
return r;
 }
 
@@ -1456,6 +1464,7 @@ static int sdma_v4_4_2_sw_fini(struct amdgpu_ip_block 
*ip_block)
amdgpu_ring_fini(&adev->sdma.instance[i].page);
}
 
+   amdgpu_sdma_sysfs_reset_mask_fini(adev);
if (amdgpu_ip_version(adev, SDMA0_HWIP, 0) == IP_VERSION(4, 4, 2) ||
amdgpu_ip_version(adev, SDMA0_HWIP, 0) == IP_VERSION(4, 4, 5))
amdgpu_sdma_destroy_inst_ctx(adev, true);
diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c 
b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
index 6a675daf5620..40d0e8e9132c 100644
--- 

[PATCH V4 3/5] drm/amdgpu: Add sysfs interface for vcn reset mask

2024-10-28 Thread jesse.zh...@amd.com
Add the sysfs interface for vcn:
vcn_reset_mask

The interface is read-only and show the resets supported by the IP.
For example, full adapter reset (mode1/mode2/BACO/etc),
soft reset, queue reset, and pipe reset.

V2: the sysfs node returns a text string instead of some flags (Christian)
v3: add a generic helper which takes the ring as parameter
and print the strings in the order they are applied (Christian)

check amdgpu_gpu_recovery  before creating sysfs file itself,
and initialize supported_reset_types in IP version files (Lijo)

Signed-off-by: Jesse Zhang 
Suggested-by: Alex Deucher 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c | 35 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h |  4 +++
 drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c   |  9 +++
 drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c |  8 ++
 drivers/gpu/drm/amd/amdgpu/vcn_v5_0_0.c |  9 +++
 5 files changed, 65 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
index 43f44cc201cb..9bbae298189a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
@@ -1277,3 +1277,38 @@ int amdgpu_vcn_psp_update_sram(struct amdgpu_device 
*adev, int inst_idx,
 
return psp_execute_ip_fw_load(&adev->psp, &ucode);
 }
+
+static ssize_t amdgpu_get_vcn_reset_mask(struct device *dev,
+   struct device_attribute *attr,
+   char *buf)
+{
+   struct drm_device *ddev = dev_get_drvdata(dev);
+   struct amdgpu_device *adev = drm_to_adev(ddev);
+
+   if (!adev)
+   return -ENODEV;
+
+   return amdgpu_show_reset_mask(buf, adev->vcn.supported_reset);
+}
+
+static DEVICE_ATTR(vcn_reset_mask, 0444,
+  amdgpu_get_vcn_reset_mask, NULL);
+
+int amdgpu_vcn_sysfs_reset_mask_init(struct amdgpu_device *adev)
+{
+   int r = 0;
+
+   if (adev->vcn.num_vcn_inst) {
+   r = device_create_file(adev->dev, &dev_attr_vcn_reset_mask);
+   if (r)
+   return r;
+   }
+
+   return r;
+}
+
+void amdgpu_vcn_sysfs_reset_mask_fini(struct amdgpu_device *adev)
+{
+   if (adev->vcn.num_vcn_inst)
+   device_remove_file(adev->dev, &dev_attr_vcn_reset_mask);
+}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
index 2a1f3dbb14d3..904336ff0b39 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
@@ -333,6 +333,8 @@ struct amdgpu_vcn {
 
/* IP reg dump */
uint32_t*ip_dump;
+
+   uint32_tsupported_reset;
 };
 
 struct amdgpu_fw_shared_rb_ptrs_struct {
@@ -518,5 +520,7 @@ int amdgpu_vcn_ras_sw_init(struct amdgpu_device *adev);
 
 int amdgpu_vcn_psp_update_sram(struct amdgpu_device *adev, int inst_idx,
   enum AMDGPU_UCODE_ID ucode_id);
+int amdgpu_vcn_sysfs_reset_mask_init(struct amdgpu_device *adev);
+void amdgpu_vcn_sysfs_reset_mask_fini(struct amdgpu_device *adev);
 
 #endif
diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c 
b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c
index e7b7a8150ea7..4c8046f5b209 100644
--- a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c
@@ -225,6 +225,10 @@ static int vcn_v4_0_sw_init(struct amdgpu_ip_block 
*ip_block)
vcn_v4_0_fw_shared_init(adev, i);
}
 
+   /* TODO: Add queue reset mask when FW fully supports it */
+   adev->vcn.supported_reset =
+   
amdgpu_get_soft_full_reset_mask(&adev->vcn.inst[0].ring_enc[0]);
+
if (amdgpu_sriov_vf(adev)) {
r = amdgpu_virt_alloc_mm_table(adev);
if (r)
@@ -247,6 +251,10 @@ static int vcn_v4_0_sw_init(struct amdgpu_ip_block 
*ip_block)
adev->vcn.ip_dump = ptr;
}
 
+   r = amdgpu_vcn_sysfs_reset_mask_init(adev);
+   if (r)
+   return r;
+
return 0;
 }
 
@@ -284,6 +292,7 @@ static int vcn_v4_0_sw_fini(struct amdgpu_ip_block 
*ip_block)
if (r)
return r;
 
+   amdgpu_vcn_sysfs_reset_mask_fini(adev);
r = amdgpu_vcn_sw_fini(adev);
 
kfree(adev->vcn.ip_dump);
diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c 
b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c
index 6dcae398b2dc..3031ae57a37a 100644
--- a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c
+++ b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c
@@ -180,6 +180,10 @@ static int vcn_v4_0_3_sw_init(struct amdgpu_ip_block 
*ip_block)
amdgpu_vcn_fwlog_init(&adev->vcn.inst[i]);
}
 
+   /* TODO: Add queue reset mask when FW fully supports it */
+   adev->vcn.supported_reset =
+   
amdgpu_get_soft_full_reset_mask(&adev->vcn.inst[0].ring_enc[0]);
+
if (am

[PATCH V4 5/5] drm/amdgpu: Add sysfs interface for jpeg reset mask

2024-10-28 Thread jesse.zh...@amd.com
Add the sysfs interface for jpeg:
jpeg_reset_mask

The interface is read-only and show the resets supported by the IP.
For example, full adapter reset (mode1/mode2/BACO/etc),
soft reset, queue reset, and pipe reset.

V2: the sysfs node returns a text string instead of some flags (Christian)
v3: add a generic helper which takes the ring as parameter
and print the strings in the order they are applied (Christian)

check amdgpu_gpu_recovery  before creating sysfs file itself,
and initialize supported_reset_types in IP version files (Lijo)

Signed-off-by: Jesse Zhang 
Suggested-by: Alex Deucher 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.c | 35 
 drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.h |  3 ++
 drivers/gpu/drm/amd/amdgpu/jpeg_v4_0.c   |  7 +
 drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c |  8 ++
 drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_5.c |  8 ++
 drivers/gpu/drm/amd/amdgpu/jpeg_v5_0_0.c |  7 +
 6 files changed, 68 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.c
index 95e2796919fc..f971ffdffce9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.c
@@ -415,3 +415,38 @@ void amdgpu_debugfs_jpeg_sched_mask_init(struct 
amdgpu_device *adev)
&amdgpu_debugfs_jpeg_sched_mask_fops);
 #endif
 }
+
+static ssize_t amdgpu_get_jpeg_reset_mask(struct device *dev,
+   struct device_attribute *attr,
+   char *buf)
+{
+   struct drm_device *ddev = dev_get_drvdata(dev);
+   struct amdgpu_device *adev = drm_to_adev(ddev);
+
+   if (!adev)
+   return -ENODEV;
+
+   return amdgpu_show_reset_mask(buf, adev->jpeg.supported_reset);
+}
+
+static DEVICE_ATTR(jpeg_reset_mask, 0444,
+  amdgpu_get_jpeg_reset_mask, NULL);
+
+int amdgpu_jpeg_sysfs_reset_mask_init(struct amdgpu_device *adev)
+{
+   int r = 0;
+
+   if (adev->jpeg.num_jpeg_inst) {
+   r = device_create_file(adev->dev, &dev_attr_jpeg_reset_mask);
+   if (r)
+   return r;
+   }
+
+   return r;
+}
+
+void amdgpu_jpeg_sysfs_reset_mask_fini(struct amdgpu_device *adev)
+{
+   if (adev->jpeg.num_jpeg_inst)
+   device_remove_file(adev->dev, &dev_attr_jpeg_reset_mask);
+}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.h
index 819dc7a0af99..3eb4a4653fce 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.h
@@ -128,6 +128,7 @@ struct amdgpu_jpeg {
uint16_t inst_mask;
uint8_t num_inst_per_aid;
boolindirect_sram;
+   uint32_t supported_reset;
 };
 
 int amdgpu_jpeg_sw_init(struct amdgpu_device *adev);
@@ -150,5 +151,7 @@ int amdgpu_jpeg_ras_sw_init(struct amdgpu_device *adev);
 int amdgpu_jpeg_psp_update_sram(struct amdgpu_device *adev, int inst_idx,
   enum AMDGPU_UCODE_ID ucode_id);
 void amdgpu_debugfs_jpeg_sched_mask_init(struct amdgpu_device *adev);
+int amdgpu_jpeg_sysfs_reset_mask_init(struct amdgpu_device *adev);
+void amdgpu_jpeg_sysfs_reset_mask_fini(struct amdgpu_device *adev);
 
 #endif /*__AMDGPU_JPEG_H__*/
diff --git a/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0.c 
b/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0.c
index 20e1fe89c463..c675d6619f9a 100644
--- a/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0.c
@@ -121,6 +121,12 @@ static int jpeg_v4_0_sw_init(struct amdgpu_ip_block 
*ip_block)
adev->jpeg.inst->external.jpeg_pitch[0] = SOC15_REG_OFFSET(JPEG, 0, 
regUVD_JPEG_PITCH);
 
r = amdgpu_jpeg_ras_sw_init(adev);
+   if (r)
+   return r;
+   /* TODO: Add queue reset mask when FW fully supports it */
+   adev->jpeg.supported_reset =
+   
amdgpu_get_soft_full_reset_mask(&adev->jpeg.inst[0].ring_dec[0]);
+   r = amdgpu_jpeg_sysfs_reset_mask_init(adev);
if (r)
return r;
 
@@ -143,6 +149,7 @@ static int jpeg_v4_0_sw_fini(struct amdgpu_ip_block 
*ip_block)
if (r)
return r;
 
+   amdgpu_jpeg_sysfs_reset_mask_fini(adev);
r = amdgpu_jpeg_sw_fini(adev);
 
return r;
diff --git a/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c 
b/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c
index 2a53537db135..a785c970a908 100644
--- a/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c
+++ b/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c
@@ -159,6 +159,13 @@ static int jpeg_v4_0_3_sw_init(struct amdgpu_ip_block 
*ip_block)
}
}
 
+   /* TODO: Add queue reset mask when FW fully supports it */
+   adev->jpeg.supported_reset =
+   
amdgpu_get_soft_full_reset_mask(&adev->jpeg.inst[0].ring_dec[0]);
+   r = amdgpu_jpeg_sysfs_reset_mask_init(adev);
+   if (r)
+   return r;
+
return 0;
 

[PATCH V4 4/5] drm/amdgpu: Add sysfs interface for vpe reset mask

2024-10-28 Thread jesse.zh...@amd.com
Add the sysfs interface for vpe:
vpe_reset_mask

The interface is read-only and show the resets supported by the IP.
For example, full adapter reset (mode1/mode2/BACO/etc),
soft reset, queue reset, and pipe reset.

V2: the sysfs node returns a text string instead of some flags (Christian)
v3: add a generic helper which takes the ring as parameter
and print the strings in the order they are applied (Christian)

check amdgpu_gpu_recovery  before creating sysfs file itself,
and initialize supported_reset_types in IP version files (Lijo)

Signed-off-by: Jesse Zhang 
Suggested-by: Alex Deucher 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_vpe.c | 43 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_vpe.h |  3 ++
 2 files changed, 46 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vpe.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vpe.c
index 6d96e1f21e20..85d1013bba9c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vpe.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vpe.c
@@ -377,6 +377,13 @@ static int vpe_sw_init(struct amdgpu_ip_block *ip_block)
ret = vpe_init_microcode(vpe);
if (ret)
goto out;
+
+   /* TODO: Add queue reset mask when FW fully supports it */
+   adev->vpe.supported_reset =
+amdgpu_get_soft_full_reset_mask(&adev->vpe.ring);
+   ret = amdgpu_vpe_sysfs_reset_mask_init(adev);
+   if (ret)
+   goto out;
 out:
return ret;
 }
@@ -389,6 +396,7 @@ static int vpe_sw_fini(struct amdgpu_ip_block *ip_block)
release_firmware(vpe->fw);
vpe->fw = NULL;
 
+   amdgpu_vpe_sysfs_reset_mask_fini(adev);
vpe_ring_fini(vpe);
 
amdgpu_bo_free_kernel(&adev->vpe.cmdbuf_obj,
@@ -865,6 +873,41 @@ static void vpe_ring_end_use(struct amdgpu_ring *ring)
schedule_delayed_work(&adev->vpe.idle_work, VPE_IDLE_TIMEOUT);
 }
 
+static ssize_t amdgpu_get_vpe_reset_mask(struct device *dev,
+   struct device_attribute *attr,
+   char *buf)
+{
+   struct drm_device *ddev = dev_get_drvdata(dev);
+   struct amdgpu_device *adev = drm_to_adev(ddev);
+
+   if (!adev)
+   return -ENODEV;
+
+   return amdgpu_show_reset_mask(buf, adev->vpe.supported_reset);
+}
+
+static DEVICE_ATTR(vpe_reset_mask, 0444,
+  amdgpu_get_vpe_reset_mask, NULL);
+
+int amdgpu_vpe_sysfs_reset_mask_init(struct amdgpu_device *adev)
+{
+   int r = 0;
+
+   if (adev->vpe.num_instances) {
+   r = device_create_file(adev->dev, &dev_attr_vpe_reset_mask);
+   if (r)
+   return r;
+   }
+
+   return r;
+}
+
+void amdgpu_vpe_sysfs_reset_mask_fini(struct amdgpu_device *adev)
+{
+   if (adev->vpe.num_instances)
+   device_remove_file(adev->dev, &dev_attr_vpe_reset_mask);
+}
+
 static const struct amdgpu_ring_funcs vpe_ring_funcs = {
.type = AMDGPU_RING_TYPE_VPE,
.align_mask = 0xf,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vpe.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vpe.h
index 231d86d0953e..695da740a97e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vpe.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vpe.h
@@ -79,6 +79,7 @@ struct amdgpu_vpe {
 
uint32_tnum_instances;
boolcollaborate_mode;
+   uint32_tsupported_reset;
 };
 
 int amdgpu_vpe_psp_update_sram(struct amdgpu_device *adev);
@@ -86,6 +87,8 @@ int amdgpu_vpe_init_microcode(struct amdgpu_vpe *vpe);
 int amdgpu_vpe_ring_init(struct amdgpu_vpe *vpe);
 int amdgpu_vpe_ring_fini(struct amdgpu_vpe *vpe);
 int amdgpu_vpe_configure_dpm(struct amdgpu_vpe *vpe);
+void amdgpu_vpe_sysfs_reset_mask_fini(struct amdgpu_device *adev);
+int amdgpu_vpe_sysfs_reset_mask_init(struct amdgpu_device *adev);
 
 #define vpe_ring_init(vpe) ((vpe)->funcs->ring_init ? 
(vpe)->funcs->ring_init((vpe)) : 0)
 #define vpe_ring_start(vpe) ((vpe)->funcs->ring_start ? 
(vpe)->funcs->ring_start((vpe)) : 0)
-- 
2.25.1



[PATCH V4 2/5] drm/amdgpu: Add sysfs interface for sdma reset mask

2024-10-28 Thread jesse.zh...@amd.com
Add the sysfs interface for sdma:
sdma_reset_mask

The interface is read-only and show the resets supported by the IP.
For example, full adapter reset (mode1/mode2/BACO/etc),
soft reset, queue reset, and pipe reset.

V2: the sysfs node returns a text string instead of some flags (Christian)
v3: add a generic helper which takes the ring as parameter
   and print the strings in the order they are applied (Christian)

   check amdgpu_gpu_recovery  before creating sysfs file itself,
   and initialize supported_reset_types in IP version files (Lijo)

Signed-off-by: Jesse Zhang 
Suggested-by: Alex Deucher 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c | 41 
 drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h |  3 ++
 drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c |  9 ++
 drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c   | 18 +++
 drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c   | 23 +
 drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c   | 17 ++
 6 files changed, 111 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
index 183a976ba29d..7edcd989afce 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
@@ -343,3 +343,44 @@ int amdgpu_sdma_ras_sw_init(struct amdgpu_device *adev)
 
return 0;
 }
+
+static ssize_t amdgpu_get_sdma_reset_mask(struct device *dev,
+   struct device_attribute *attr,
+   char *buf)
+{
+   struct drm_device *ddev = dev_get_drvdata(dev);
+   struct amdgpu_device *adev = drm_to_adev(ddev);
+
+   if (!adev)
+   return -ENODEV;
+
+   return amdgpu_show_reset_mask(buf, adev->sdma.supported_reset);
+}
+
+static DEVICE_ATTR(sdma_reset_mask, 0444,
+  amdgpu_get_sdma_reset_mask, NULL);
+
+int amdgpu_sdma_sysfs_reset_mask_init(struct amdgpu_device *adev)
+{
+   int r = 0;
+
+   if (!amdgpu_gpu_recovery)
+   return r;
+
+   if (adev->sdma.num_instances) {
+   r = device_create_file(adev->dev, &dev_attr_sdma_reset_mask);
+   if (r)
+   return r;
+   }
+
+   return r;
+}
+
+void amdgpu_sdma_sysfs_reset_mask_fini(struct amdgpu_device *adev)
+{
+   if (!amdgpu_gpu_recovery)
+   return;
+
+   if (adev->sdma.num_instances)
+   device_remove_file(adev->dev, &dev_attr_sdma_reset_mask);
+}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h
index 087ce0f6fa07..7ce613de7ee0 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h
@@ -116,6 +116,7 @@ struct amdgpu_sdma {
struct ras_common_if*ras_if;
struct amdgpu_sdma_ras  *ras;
uint32_t*ip_dump;
+   uint32_tsupported_reset;
 };
 
 /*
@@ -175,5 +176,7 @@ int amdgpu_sdma_init_microcode(struct amdgpu_device *adev, 
u32 instance,
 void amdgpu_sdma_destroy_inst_ctx(struct amdgpu_device *adev,
 bool duplicate);
 int amdgpu_sdma_ras_sw_init(struct amdgpu_device *adev);
+int amdgpu_sdma_sysfs_reset_mask_init(struct amdgpu_device *adev);
+void amdgpu_sdma_sysfs_reset_mask_fini(struct amdgpu_device *adev);
 
 #endif
diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c 
b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
index 9c7cea0890c9..d73932728b93 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
@@ -1430,6 +1430,10 @@ static int sdma_v4_4_2_sw_init(struct amdgpu_ip_block 
*ip_block)
}
}
 
+   /* TODO: Add queue reset mask when FW fully supports it */
+   adev->sdma.supported_reset |=
+   amdgpu_get_soft_full_reset_mask(&adev->sdma.instance[0].ring);
+
if (amdgpu_sdma_ras_sw_init(adev)) {
dev_err(adev->dev, "fail to initialize sdma ras block\n");
return -EINVAL;
@@ -1442,6 +1446,10 @@ static int sdma_v4_4_2_sw_init(struct amdgpu_ip_block 
*ip_block)
else
DRM_ERROR("Failed to allocated memory for SDMA IP Dump\n");
 
+   r = amdgpu_sdma_sysfs_reset_mask_init(adev);
+   if (r)
+   return r;
+
return r;
 }
 
@@ -1456,6 +1464,7 @@ static int sdma_v4_4_2_sw_fini(struct amdgpu_ip_block 
*ip_block)
amdgpu_ring_fini(&adev->sdma.instance[i].page);
}
 
+   amdgpu_sdma_sysfs_reset_mask_fini(adev);
if (amdgpu_ip_version(adev, SDMA0_HWIP, 0) == IP_VERSION(4, 4, 2) ||
amdgpu_ip_version(adev, SDMA0_HWIP, 0) == IP_VERSION(4, 4, 5))
amdgpu_sdma_destroy_inst_ctx(adev, true);
diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c 
b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
index 6a675daf5620..4b71f93fc1c5 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
@@ -1452,6 +1452,19 @@ 

[PATCH V4 1/5] drm/amdgpu: Add sysfs interface for gc reset mask

2024-10-28 Thread jesse.zh...@amd.com
Add two sysfs interfaces for gfx and compute:
gfx_reset_mask
compute_reset_mask

These interfaces are read-only and show the resets supported by the IP.
For example, full adapter reset (mode1/mode2/BACO/etc),
soft reset, queue reset, and pipe reset.

V2: the sysfs node returns a text string instead of some flags (Christian)
v3: add a generic helper which takes the ring as parameter
and print the strings in the order they are applied (Christian)

check amdgpu_gpu_recovery  before creating sysfs file itself,
and initialize supported_reset_types in IP version files (Lijo)
v4: Fixing uninitialized variables (Tim)

Signed-off-by: Jesse Zhang 
Suggested-by: Alex Deucher 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h|  8 +++
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 37 
 drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c| 66 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h|  4 ++
 drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c |  9 +++
 drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c | 23 
 drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c | 10 
 drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c  | 10 
 drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c| 17 ++
 9 files changed, 184 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 48c9b9b06905..aea1031d7b84 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -300,6 +300,12 @@ extern int amdgpu_wbrf;
 #define AMDGPU_RESET_VCE   (1 << 13)
 #define AMDGPU_RESET_VCE1  (1 << 14)
 
+/* reset mask */
+#define AMDGPU_RESET_TYPE_FULL (1 << 0) /* full adapter reset, 
mode1/mode2/BACO/etc. */
+#define AMDGPU_RESET_TYPE_SOFT_RESET (1 << 1) /* IP level soft reset */
+#define AMDGPU_RESET_TYPE_PER_QUEUE (1 << 2) /* per queue */
+#define AMDGPU_RESET_TYPE_PER_PIPE (1 << 3) /* per pipe */
+
 /* max cursor sizes (in pixels) */
 #define CIK_CURSOR_WIDTH 128
 #define CIK_CURSOR_HEIGHT 128
@@ -1466,6 +1472,8 @@ struct dma_fence *amdgpu_device_get_gang(struct 
amdgpu_device *adev);
 struct dma_fence *amdgpu_device_switch_gang(struct amdgpu_device *adev,
struct dma_fence *gang);
 bool amdgpu_device_has_display_hardware(struct amdgpu_device *adev);
+ssize_t amdgpu_get_soft_full_reset_mask(struct amdgpu_ring *ring);
+ssize_t amdgpu_show_reset_mask(char *buf, uint32_t supported_reset);
 
 /* atpx handler */
 #if defined(CONFIG_VGA_SWITCHEROO)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index ef715b2bbcdb..cd1e3f018893 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -6684,3 +6684,40 @@ uint32_t amdgpu_device_wait_on_rreg(struct amdgpu_device 
*adev,
}
return ret;
 }
+
+ssize_t amdgpu_get_soft_full_reset_mask(struct amdgpu_ring *ring)
+{
+   ssize_t size = 0;
+
+   if (!ring)
+   return size;
+
+   if (amdgpu_device_should_recover_gpu(ring->adev))
+   size |= AMDGPU_RESET_TYPE_FULL;
+
+   if (unlikely(!ring->adev->debug_disable_soft_recovery) &&
+   !amdgpu_sriov_vf(ring->adev) && ring->funcs->soft_recovery)
+   size |= AMDGPU_RESET_TYPE_SOFT_RESET;
+
+   return size;
+}
+
+ssize_t amdgpu_show_reset_mask(char *buf, uint32_t supported_reset)
+{
+   ssize_t size = 0;
+
+   if (supported_reset & AMDGPU_RESET_TYPE_SOFT_RESET)
+   size += sysfs_emit_at(buf, size, "soft ");
+
+   if (supported_reset & AMDGPU_RESET_TYPE_PER_QUEUE)
+   size += sysfs_emit_at(buf, size, "queue ");
+
+   if (supported_reset & AMDGPU_RESET_TYPE_PER_PIPE)
+   size += sysfs_emit_at(buf, size, "pipe ");
+
+   if (supported_reset & AMDGPU_RESET_TYPE_FULL)
+   size += sysfs_emit_at(buf, size, "full ");
+
+   size += sysfs_emit_at(buf, size, "\n");
+   return size;
+}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
index e96984c53e72..6de1f3bf6863 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
@@ -1588,6 +1588,32 @@ static ssize_t amdgpu_gfx_set_enforce_isolation(struct 
device *dev,
return count;
 }
 
+static ssize_t amdgpu_gfx_get_gfx_reset_mask(struct device *dev,
+   struct device_attribute *attr,
+   char *buf)
+{
+   struct drm_device *ddev = dev_get_drvdata(dev);
+   struct amdgpu_device *adev = drm_to_adev(ddev);
+
+   if (!adev)
+   return -ENODEV;
+
+   return amdgpu_show_reset_mask(buf, adev->gfx.gfx_supported_reset);
+}
+
+static ssize_t amdgpu_gfx_get_compute_reset_mask(struct device *dev,
+   struct device_attribute *attr,
+   

[PATCH] drm/amdgpu: fix double free vcn ip_dump

2024-11-10 Thread jesse.zh...@amd.com
[   90.441868] [ cut here ]
[   90.441873] kernel BUG at mm/slub.c:553!
[   90.441885] Oops: invalid opcode:  [#1] PREEMPT SMP NOPTI
[   90.441892] CPU: 0 PID: 1523 Comm: amd_pci_unplug Tainted: GE
  6.10.0+ #47
[   90.441900] Hardware name: AMD Splinter/Splinter-PHX2, BIOS TS41102C_925 
01/05/2024
[   90.441907] RIP: 0010:__slab_free+0x1ce/0x320
[   90.441916] Code: f7 c3 00 02 00 00 0f 84 6c ff ff ff fb 0f 1f 44 00 00 e9 
61 ff ff ff 41 f7 46 08 87 04 00 00 0f 85 d6 fe ff ff e9 ca fe ff ff <0f> 0b 49 
3b 5c 24 28 75 bd 48 8b 44 24 28 49 89 4c 24 28 ba 01 00
[   90.441927] RSP: 0018:b9c801cefcb0 EFLAGS: 00010246
[   90.441934] RAX: 8cdb481dcf00 RBX: 00200012 RCX: 8cdb481dce00
[   90.441940] RDX: 8cdb481dce00 RSI: e3f904207700 RDI: 8cdb40042a00
[   90.441945] RBP: b9c801cefd50 R08: 0001 R09: c149f632
[   90.441950] R10: 8cdb481dce00 R11: 8ce26e621e18 R12: e3f904207700
[   90.441956] R13: 8cdb481dce00 R14: 8cdb40042a00 R15: 8cdb481dce00
[   90.441962] FS:  7f0a4f3fec40() GS:8ce26e60() 
knlGS:
[   90.441968] CS:  0010 DS:  ES:  CR0: 80050033
[   90.441974] CR2: 55bf74ba8930 CR3: 000108f48000 CR4: 00750ef0
[   90.441979] PKRU: 5554
[   90.441983] Call Trace:
[   90.441987]  
[   90.441991]  ? show_regs+0x6b/0x80
[   90.441999]  ? __die_body+0x24/0x70
[   90.442005]  ? die+0x42/0x70
[   90.442011]  ? do_trap+0xda/0xf0
[   90.442018]  ? do_error_trap+0x76/0xa0
[   90.442023]  ? __slab_free+0x1ce/0x320
[   90.442030]  ? exc_invalid_op+0x57/0x80
[   90.442036]  ? __slab_free+0x1ce/0x320
[   90.442042]  ? asm_exc_invalid_op+0x1f/0x30
[   90.442053]  ? vcn_v4_0_sw_fini+0xc2/0x110 [amdgpu]
[   90.442308]  ? __slab_free+0x1ce/0x320
[   90.442316]  ? release_firmware.part.0+0x2e/0x50
[   90.442323]  ? srso_alias_return_thunk+0x5/0xfbef5
[   90.442332]  ? srso_alias_return_thunk+0x5/0xfbef5
[   90.442338]  ? vcn_v4_0_sw_fini+0xc2/0x110 [amdgpu]
[   90.442496]  kfree+0x23e/0x2f0
[   90.442502]  vcn_v4_0_sw_fini+0xc2/0x110 [amdgpu]
[   90.442653]  amdgpu_device_fini_sw+0x133/0x700 [amdgpu]
[   90.442835]  amdgpu_driver_release_kms+0x1a/0x30 [amdgpu]
[   90.442981]  drm_dev_release+0x2d/0x50 [drm]
[   90.443003]  drm_minor_release+0x3d/0x60 [drm]
[   90.443026]  drm_release+0x90/0xd0 [drm]
[   90.443052]  __fput+0xfa/0x2f0
[   90.443059]  __fput_sync+0x1e/0x30
[   90.443064]  __x64_sys_close+0x42/0x90
[   90.443071]  x64_sys_call+0x18f6/0x20d0
[   90.443076]  do_syscall_64+0x6f/0x110
[   90.443083]  ? do_syscall_64+0x7b/0x110
[   90.443089]  entry_SYSCALL_64_after_hwframe+0x76/0x7e
[   90.443096] RIP: 0033:0x7f0a51514f67
[   90.443102] Code: ff e8 0d 16 02 00 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 00 
f3 0f 1e fa 64 8b 04 25 18 00 00 00 85 c0 75 10 b8 03 00 00 00 0f 05 <48> 3d 00 
f0 ff ff 77 41 c3 48 83 ec 18 89 7c 24 0c e8 73 ba f7 ff
[   90.443113] RSP: 002b:7ffec29b16e8 EFLAGS: 0246 ORIG_RAX: 
0003
[   90.443121] RAX: ffda RBX:  RCX: 7f0a51514f67
[   90.443127] RDX: 0001 RSI: 7f0a51776485 RDI: 0003
[   90.443132] RBP: 7ffec29b1770 R08: 55f942381170 R09: 
[   90.443138] R10:  R11: 0246 R12: 
[   90.443143] R13: 55f90a2a41c0 R14:  R15: 0001
[   90.443152]  

Set ip_dump to null after releasing vcn ip_dump.

Signed-off-by: Jesse Zhang 
---
 drivers/gpu/drm/amd/amdgpu/vcn_v2_0.c   | 5 -
 drivers/gpu/drm/amd/amdgpu/vcn_v3_0.c   | 5 -
 drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c   | 5 -
 drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c | 5 -
 drivers/gpu/drm/amd/amdgpu/vcn_v4_0_5.c | 5 -
 drivers/gpu/drm/amd/amdgpu/vcn_v5_0_0.c | 5 -
 6 files changed, 24 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v2_0.c 
b/drivers/gpu/drm/amd/amdgpu/vcn_v2_0.c
index a327c3bf84f2..b23b55539b43 100644
--- a/drivers/gpu/drm/amd/amdgpu/vcn_v2_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/vcn_v2_0.c
@@ -263,7 +263,10 @@ static int vcn_v2_0_sw_fini(struct amdgpu_ip_block 
*ip_block)
 
r = amdgpu_vcn_sw_fini(adev, inst);
 
-   kfree(adev->vcn.ip_dump);
+   if (adev->vcn.ip_dump) {
+   kfree(adev->vcn.ip_dump);
+   adev->vcn.ip_dump = NULL;
+   }
 
return r;
 }
diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v3_0.c 
b/drivers/gpu/drm/amd/amdgpu/vcn_v3_0.c
index b78c6da0a3cd..df3855e7b5c1 100644
--- a/drivers/gpu/drm/amd/amdgpu/vcn_v3_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/vcn_v3_0.c
@@ -331,7 +331,10 @@ static int vcn_v3_0_sw_fini(struct amdgpu_ip_block 
*ip_block)
 
r = amdgpu_vcn_sw_fini(adev, inst);
 
-   kfree(adev->vcn.ip_dump);
+   if (adev->vcn.ip_dump) {
+   kfree(adev->vcn.ip_dump);
+   adev->vcn.ip_dump = NULL;
+   }
return r;
 }
 
diff --git a/

[PATCH V3 2/5] drm/amdgpu: Add sysfs interface for sdma reset mask

2024-10-25 Thread jesse.zh...@amd.com
From: "jesse.zh...@amd.com" 

Add the sysfs interface for sdma:
sdma_reset_mask

The interface is read-only and show the resets supported by the IP.
For example, full adapter reset (mode1/mode2/BACO/etc),
soft reset, queue reset, and pipe reset.

V2: the sysfs node returns a text string instead of some flags (Christian)
v3: add a generic helper which takes the ring as parameter
   and print the strings in the order they are applied (Christian)

   check amdgpu_gpu_recovery  before creating sysfs file itself,
   and initialize supported_reset_types in IP version files (Lijo)

Signed-off-by: Jesse Zhang 
Suggested-by: Alex Deucher 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c | 41 
 drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h |  3 ++
 drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c   | 18 +++
 drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c   | 23 +
 drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c   | 17 ++
 5 files changed, 102 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
index 183a976ba29d..7edcd989afce 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
@@ -343,3 +343,44 @@ int amdgpu_sdma_ras_sw_init(struct amdgpu_device *adev)
 
return 0;
 }
+
+static ssize_t amdgpu_get_sdma_reset_mask(struct device *dev,
+   struct device_attribute *attr,
+   char *buf)
+{
+   struct drm_device *ddev = dev_get_drvdata(dev);
+   struct amdgpu_device *adev = drm_to_adev(ddev);
+
+   if (!adev)
+   return -ENODEV;
+
+   return amdgpu_show_reset_mask(buf, adev->sdma.supported_reset);
+}
+
+static DEVICE_ATTR(sdma_reset_mask, 0444,
+  amdgpu_get_sdma_reset_mask, NULL);
+
+int amdgpu_sdma_sysfs_reset_mask_init(struct amdgpu_device *adev)
+{
+   int r = 0;
+
+   if (!amdgpu_gpu_recovery)
+   return r;
+
+   if (adev->sdma.num_instances) {
+   r = device_create_file(adev->dev, &dev_attr_sdma_reset_mask);
+   if (r)
+   return r;
+   }
+
+   return r;
+}
+
+void amdgpu_sdma_sysfs_reset_mask_fini(struct amdgpu_device *adev)
+{
+   if (!amdgpu_gpu_recovery)
+   return;
+
+   if (adev->sdma.num_instances)
+   device_remove_file(adev->dev, &dev_attr_sdma_reset_mask);
+}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h
index 087ce0f6fa07..7ce613de7ee0 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h
@@ -116,6 +116,7 @@ struct amdgpu_sdma {
struct ras_common_if*ras_if;
struct amdgpu_sdma_ras  *ras;
uint32_t*ip_dump;
+   uint32_tsupported_reset;
 };
 
 /*
@@ -175,5 +176,7 @@ int amdgpu_sdma_init_microcode(struct amdgpu_device *adev, 
u32 instance,
 void amdgpu_sdma_destroy_inst_ctx(struct amdgpu_device *adev,
 bool duplicate);
 int amdgpu_sdma_ras_sw_init(struct amdgpu_device *adev);
+int amdgpu_sdma_sysfs_reset_mask_init(struct amdgpu_device *adev);
+void amdgpu_sdma_sysfs_reset_mask_fini(struct amdgpu_device *adev);
 
 #endif
diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c 
b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
index 6a675daf5620..728643efe203 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
@@ -1452,6 +1452,19 @@ static int sdma_v5_0_sw_init(struct amdgpu_ip_block 
*ip_block)
return r;
}
 
+   switch (amdgpu_ip_version(adev, SDMA0_HWIP, 0)) {
+   case IP_VERSION(5, 0, 0):
+   case IP_VERSION(5, 0, 2):
+   case IP_VERSION(5, 0, 5):
+   if (adev->sdma.instance[i].fw_version >= 35)
+   adev->sdma.supported_reset = 
AMDGPU_RESET_TYPE_PER_QUEUE;
+   break;
+   default:
+   break;
+   }
+   adev->sdma.supported_reset |=
+   amdgpu_get_soft_full_reset_mask(&adev->sdma.instance[0].ring);
+
/* Allocate memory for SDMA IP Dump buffer */
ptr = kcalloc(adev->sdma.num_instances * reg_count, sizeof(uint32_t), 
GFP_KERNEL);
if (ptr)
@@ -1459,6 +1472,10 @@ static int sdma_v5_0_sw_init(struct amdgpu_ip_block 
*ip_block)
else
DRM_ERROR("Failed to allocated memory for SDMA IP Dump\n");
 
+   r = amdgpu_sdma_sysfs_reset_mask_init(adev);
+   if (r)
+   return r;
+
return r;
 }
 
@@ -1470,6 +1487,7 @@ static int sdma_v5_0_sw_fini(struct amdgpu_ip_block 
*ip_block)
for (i = 0; i < adev->sdma.num_instances; i++)
amdgpu_ring_fini(&adev->sdma.instance[i].ring);
 
+   amdgpu_sdma_sysfs_reset_mask_fini(adev);
amdgpu_sdma_destroy_inst_ctx(adev,

[PATCH V3 3/5] drm/amdgpu: Add sysfs interface for vcn reset mask

2024-10-24 Thread jesse.zh...@amd.com
From: "jesse.zh...@amd.com" 

Add the sysfs interface for vcn:
vcn_reset_mask

The interface is read-only and show the resets supported by the IP.
For example, full adapter reset (mode1/mode2/BACO/etc),
soft reset, queue reset, and pipe reset.

V2: the sysfs node returns a text string instead of some flags (Christian)
v3: add a generic helper which takes the ring as parameter
and print the strings in the order they are applied (Christian)

check amdgpu_gpu_recovery  before creating sysfs file itself,
and initialize supported_reset_types in IP version files (Lijo)

Signed-off-by: Jesse Zhang 
Suggested-by: Alex Deucher 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c | 35 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h |  4 +++
 drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c   |  9 +++
 drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c |  8 ++
 drivers/gpu/drm/amd/amdgpu/vcn_v5_0_0.c |  9 +++
 5 files changed, 65 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
index 43f44cc201cb..9bbae298189a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
@@ -1277,3 +1277,38 @@ int amdgpu_vcn_psp_update_sram(struct amdgpu_device 
*adev, int inst_idx,
 
return psp_execute_ip_fw_load(&adev->psp, &ucode);
 }
+
+static ssize_t amdgpu_get_vcn_reset_mask(struct device *dev,
+   struct device_attribute *attr,
+   char *buf)
+{
+   struct drm_device *ddev = dev_get_drvdata(dev);
+   struct amdgpu_device *adev = drm_to_adev(ddev);
+
+   if (!adev)
+   return -ENODEV;
+
+   return amdgpu_show_reset_mask(buf, adev->vcn.supported_reset);
+}
+
+static DEVICE_ATTR(vcn_reset_mask, 0444,
+  amdgpu_get_vcn_reset_mask, NULL);
+
+int amdgpu_vcn_sysfs_reset_mask_init(struct amdgpu_device *adev)
+{
+   int r = 0;
+
+   if (adev->vcn.num_vcn_inst) {
+   r = device_create_file(adev->dev, &dev_attr_vcn_reset_mask);
+   if (r)
+   return r;
+   }
+
+   return r;
+}
+
+void amdgpu_vcn_sysfs_reset_mask_fini(struct amdgpu_device *adev)
+{
+   if (adev->vcn.num_vcn_inst)
+   device_remove_file(adev->dev, &dev_attr_vcn_reset_mask);
+}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
index 2a1f3dbb14d3..904336ff0b39 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
@@ -333,6 +333,8 @@ struct amdgpu_vcn {
 
/* IP reg dump */
uint32_t*ip_dump;
+
+   uint32_tsupported_reset;
 };
 
 struct amdgpu_fw_shared_rb_ptrs_struct {
@@ -518,5 +520,7 @@ int amdgpu_vcn_ras_sw_init(struct amdgpu_device *adev);
 
 int amdgpu_vcn_psp_update_sram(struct amdgpu_device *adev, int inst_idx,
   enum AMDGPU_UCODE_ID ucode_id);
+int amdgpu_vcn_sysfs_reset_mask_init(struct amdgpu_device *adev);
+void amdgpu_vcn_sysfs_reset_mask_fini(struct amdgpu_device *adev);
 
 #endif
diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c 
b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c
index e7b7a8150ea7..9c84cfe9ea43 100644
--- a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c
@@ -225,6 +225,10 @@ static int vcn_v4_0_sw_init(struct amdgpu_ip_block 
*ip_block)
vcn_v4_0_fw_shared_init(adev, i);
}
 
+   /* TODO: Check the version that supports queue reset */
+   adev->sdma.supported_reset |=
+   
amdgpu_get_soft_full_reset_mask(&adev->vcn.inst[0].ring_enc[0]);
+
if (amdgpu_sriov_vf(adev)) {
r = amdgpu_virt_alloc_mm_table(adev);
if (r)
@@ -247,6 +251,10 @@ static int vcn_v4_0_sw_init(struct amdgpu_ip_block 
*ip_block)
adev->vcn.ip_dump = ptr;
}
 
+   r = amdgpu_vcn_sysfs_reset_mask_init(adev);
+   if (r)
+   return r;
+
return 0;
 }
 
@@ -284,6 +292,7 @@ static int vcn_v4_0_sw_fini(struct amdgpu_ip_block 
*ip_block)
if (r)
return r;
 
+   amdgpu_vcn_sysfs_reset_mask_fini(adev);
r = amdgpu_vcn_sw_fini(adev);
 
kfree(adev->vcn.ip_dump);
diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c 
b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c
index 6dcae398b2dc..1887a15b7d69 100644
--- a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c
+++ b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c
@@ -180,6 +180,10 @@ static int vcn_v4_0_3_sw_init(struct amdgpu_ip_block 
*ip_block)
amdgpu_vcn_fwlog_init(&adev->vcn.inst[i]);
}
 
+   /* TODO: Check the version that supports queue reset ? */
+   adev->sdma.supported_reset |=
+   

[PATCH V3 4/5] drm/amdgpu: Add sysfs interface for vpe reset mask

2024-10-24 Thread jesse.zh...@amd.com
From: "jesse.zh...@amd.com" 

Add the sysfs interface for vpe:
vpe_reset_mask

The interface is read-only and show the resets supported by the IP.
For example, full adapter reset (mode1/mode2/BACO/etc),
soft reset, queue reset, and pipe reset.

V2: the sysfs node returns a text string instead of some flags (Christian)
v3: add a generic helper which takes the ring as parameter
and print the strings in the order they are applied (Christian)

check amdgpu_gpu_recovery  before creating sysfs file itself,
and initialize supported_reset_types in IP version files (Lijo)

Signed-off-by: Jesse Zhang 
Suggested-by: Alex Deucher 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_vpe.c | 43 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_vpe.h |  3 ++
 2 files changed, 46 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vpe.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vpe.c
index 6d96e1f21e20..44213634e236 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vpe.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vpe.c
@@ -377,6 +377,13 @@ static int vpe_sw_init(struct amdgpu_ip_block *ip_block)
ret = vpe_init_microcode(vpe);
if (ret)
goto out;
+
+   /* TODO: Check the version that supports queue reset */
+   adev->vpe.supported_reset |=
+amdgpu_get_soft_full_reset_mask(&adev->vpe.ring);
+   ret = amdgpu_vpe_sysfs_reset_mask_init(adev);
+   if (ret)
+   goto out;
 out:
return ret;
 }
@@ -389,6 +396,7 @@ static int vpe_sw_fini(struct amdgpu_ip_block *ip_block)
release_firmware(vpe->fw);
vpe->fw = NULL;
 
+   amdgpu_vpe_sysfs_reset_mask_fini(adev);
vpe_ring_fini(vpe);
 
amdgpu_bo_free_kernel(&adev->vpe.cmdbuf_obj,
@@ -865,6 +873,41 @@ static void vpe_ring_end_use(struct amdgpu_ring *ring)
schedule_delayed_work(&adev->vpe.idle_work, VPE_IDLE_TIMEOUT);
 }
 
+static ssize_t amdgpu_get_vpe_reset_mask(struct device *dev,
+   struct device_attribute *attr,
+   char *buf)
+{
+   struct drm_device *ddev = dev_get_drvdata(dev);
+   struct amdgpu_device *adev = drm_to_adev(ddev);
+
+   if (!adev)
+   return -ENODEV;
+
+   return amdgpu_show_reset_mask(buf, adev->vpe.supported_reset);
+}
+
+static DEVICE_ATTR(vpe_reset_mask, 0444,
+  amdgpu_get_vpe_reset_mask, NULL);
+
+int amdgpu_vpe_sysfs_reset_mask_init(struct amdgpu_device *adev)
+{
+   int r = 0;
+
+   if (adev->vpe.num_instances) {
+   r = device_create_file(adev->dev, &dev_attr_vpe_reset_mask);
+   if (r)
+   return r;
+   }
+
+   return r;
+}
+
+void amdgpu_vpe_sysfs_reset_mask_fini(struct amdgpu_device *adev)
+{
+   if (adev->vpe.num_instances)
+   device_remove_file(adev->dev, &dev_attr_vpe_reset_mask);
+}
+
 static const struct amdgpu_ring_funcs vpe_ring_funcs = {
.type = AMDGPU_RING_TYPE_VPE,
.align_mask = 0xf,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vpe.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vpe.h
index 231d86d0953e..695da740a97e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vpe.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vpe.h
@@ -79,6 +79,7 @@ struct amdgpu_vpe {
 
uint32_tnum_instances;
boolcollaborate_mode;
+   uint32_tsupported_reset;
 };
 
 int amdgpu_vpe_psp_update_sram(struct amdgpu_device *adev);
@@ -86,6 +87,8 @@ int amdgpu_vpe_init_microcode(struct amdgpu_vpe *vpe);
 int amdgpu_vpe_ring_init(struct amdgpu_vpe *vpe);
 int amdgpu_vpe_ring_fini(struct amdgpu_vpe *vpe);
 int amdgpu_vpe_configure_dpm(struct amdgpu_vpe *vpe);
+void amdgpu_vpe_sysfs_reset_mask_fini(struct amdgpu_device *adev);
+int amdgpu_vpe_sysfs_reset_mask_init(struct amdgpu_device *adev);
 
 #define vpe_ring_init(vpe) ((vpe)->funcs->ring_init ? 
(vpe)->funcs->ring_init((vpe)) : 0)
 #define vpe_ring_start(vpe) ((vpe)->funcs->ring_start ? 
(vpe)->funcs->ring_start((vpe)) : 0)
-- 
2.25.1



[PATCH] drm/amdgpu: fix vcn reset sysfs warning

2024-11-12 Thread jesse.zh...@amd.com
sysfs: cannot create duplicate filename 
'/devices/pci:00/:00:01.1/:01:00.0/:02:00.0/:03:00.0/vcn_reset_mask'
[  562.443738] CPU: 13 PID: 4888 Comm: modprobe Tainted: GE  
6.10.0+ #51
[  562.443740] Hardware name: AMD Splinter/Splinter-RPL, BIOS VS2683299N.FD 
05/10/2023
[  562.443741] Call Trace:
[  562.443743]  
[  562.443746]  dump_stack_lvl+0x70/0x90
[  562.443751]  dump_stack+0x14/0x20
[  562.443753]  sysfs_warn_dup+0x60/0x80
[  562.443757]  sysfs_add_file_mode_ns+0x126/0x130
[  562.443760]  sysfs_create_file_ns+0x68/0xa0
[  562.443762]  device_create_file+0x46/0x90
[  562.443766]  amdgpu_vcn_sysfs_reset_mask_init+0x1c/0x30 [amdgpu]
[  562.443991]  vcn_v4_0_3_sw_init+0x270/0x3e0 [amdgpu]
[  562.444120]  amdgpu_device_init+0x1a0e/0x35a0 [amdgpu]
[  562.444227]  ? srso_alias_return_thunk+0x5/0xfbef5
[  562.444230]  ? pci_read_config_word+0x2d/0x50
[  562.444235]  amdgpu_driver_load_kms+0x1e/0xc0 [amdgpu]
[  562.444340]  amdgpu_pci_probe+0x1c3/0x660 [amdgpu]
[  562.51]  local_pci_probe+0x4c/0xb0

For multiple vcn instances, to avoid creating reset sysfs multiple times,
add the instance parameter in reset mask init.

V2: create one sysfs file per instance.  E.g.,
vcn_reset_mask, vcn1_reset_mask, vcn2_reset_mask, etc. (Alex)

Signed-off-by: Jesse Zhang 
Suggested-by: Alex Deucher 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c | 92 +++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h |  6 +-
 drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c   | 14 ++--
 drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c |  8 +--
 drivers/gpu/drm/amd/amdgpu/vcn_v5_0_0.c |  8 +--
 5 files changed, 104 insertions(+), 24 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
index 25f490ad3a85..e3eab01ea38d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
@@ -1289,32 +1289,114 @@ static ssize_t amdgpu_get_vcn_reset_mask(struct device 
*dev,
if (!adev)
return -ENODEV;
 
-   return amdgpu_show_reset_mask(buf, adev->vcn.supported_reset);
+   return amdgpu_show_reset_mask(buf, adev->vcn.inst[0].supported_reset);
+}
+
+static ssize_t amdgpu_get_vcn1_reset_mask(struct device *dev,
+   struct device_attribute *attr,
+   char *buf)
+{
+   struct drm_device *ddev = dev_get_drvdata(dev);
+   struct amdgpu_device *adev = drm_to_adev(ddev);
+
+   if (!adev)
+   return -ENODEV;
+
+   return amdgpu_show_reset_mask(buf, adev->vcn.inst[1].supported_reset);
+}
+
+static ssize_t amdgpu_get_vcn2_reset_mask(struct device *dev,
+   struct device_attribute *attr,
+   char *buf)
+{
+   struct drm_device *ddev = dev_get_drvdata(dev);
+   struct amdgpu_device *adev = drm_to_adev(ddev);
+
+   if (!adev)
+   return -ENODEV;
+
+   return amdgpu_show_reset_mask(buf, adev->vcn.inst[2].supported_reset);
+}
+
+static ssize_t amdgpu_get_vcn3_reset_mask(struct device *dev,
+   struct device_attribute *attr,
+   char *buf)
+{
+   struct drm_device *ddev = dev_get_drvdata(dev);
+   struct amdgpu_device *adev = drm_to_adev(ddev);
+
+   if (!adev)
+   return -ENODEV;
+
+   return amdgpu_show_reset_mask(buf, adev->vcn.inst[3].supported_reset);
 }
 
 static DEVICE_ATTR(vcn_reset_mask, 0444,
   amdgpu_get_vcn_reset_mask, NULL);
 
-int amdgpu_vcn_sysfs_reset_mask_init(struct amdgpu_device *adev)
+static DEVICE_ATTR(vcn1_reset_mask, 0444,
+  amdgpu_get_vcn1_reset_mask, NULL);
+
+static DEVICE_ATTR(vcn2_reset_mask, 0444,
+  amdgpu_get_vcn2_reset_mask, NULL);
+
+static DEVICE_ATTR(vcn3_reset_mask, 0444,
+  amdgpu_get_vcn3_reset_mask, NULL);
+
+int amdgpu_vcn_sysfs_reset_mask_init(struct amdgpu_device *adev, int inst)
 {
int r = 0;
 
-   if (adev->vcn.num_vcn_inst) {
+   switch (inst) {
+   case 0:
r = device_create_file(adev->dev, &dev_attr_vcn_reset_mask);
if (r)
return r;
+   break;
+   case 1:
+   r = device_create_file(adev->dev, &dev_attr_vcn1_reset_mask);
+   if (r)
+   return r;
+   break;
+   case 2:
+   r = device_create_file(adev->dev, &dev_attr_vcn2_reset_mask);
+   if (r)
+   return r;
+   break;
+   case 3:
+   r = device_create_file(adev->dev, &dev_attr_vcn3_reset_mask);
+   if (r)
+   return r;
+   break;
+   default:
+   break;
}
 
return r;
 }
 
-void amdgpu_vcn_sysfs_reset_mask_fini(struct

[PATCH V2] drm/amdgpu: fix vcn reset sysfs warning

2024-11-12 Thread jesse.zh...@amd.com
From: "jesse.zh...@amd.com" 

sysfs: cannot create duplicate filename 
'/devices/pci:00/:00:01.1/:01:00.0/:02:00.0/:03:00.0/vcn_reset_mask'
[  562.443738] CPU: 13 PID: 4888 Comm: modprobe Tainted: GE  
6.10.0+ #51
[  562.443740] Hardware name: AMD Splinter/Splinter-RPL, BIOS VS2683299N.FD 
05/10/2023
[  562.443741] Call Trace:
[  562.443743]  
[  562.443746]  dump_stack_lvl+0x70/0x90
[  562.443751]  dump_stack+0x14/0x20
[  562.443753]  sysfs_warn_dup+0x60/0x80
[  562.443757]  sysfs_add_file_mode_ns+0x126/0x130
[  562.443760]  sysfs_create_file_ns+0x68/0xa0
[  562.443762]  device_create_file+0x46/0x90
[  562.443766]  amdgpu_vcn_sysfs_reset_mask_init+0x1c/0x30 [amdgpu]
[  562.443991]  vcn_v4_0_3_sw_init+0x270/0x3e0 [amdgpu]
[  562.444120]  amdgpu_device_init+0x1a0e/0x35a0 [amdgpu]
[  562.444227]  ? srso_alias_return_thunk+0x5/0xfbef5
[  562.444230]  ? pci_read_config_word+0x2d/0x50
[  562.444235]  amdgpu_driver_load_kms+0x1e/0xc0 [amdgpu]
[  562.444340]  amdgpu_pci_probe+0x1c3/0x660 [amdgpu]
[  562.51]  local_pci_probe+0x4c/0xb0

For multiple vcn instances, to avoid creating reset sysfs multiple times,
add the instance parameter in reset mask init.

Signed-off-by: Jesse Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c |  8 
 drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h |  4 ++--
 drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c   | 10 --
 drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c |  4 ++--
 drivers/gpu/drm/amd/amdgpu/vcn_v5_0_0.c |  4 ++--
 5 files changed, 14 insertions(+), 16 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
index 25f490ad3a85..1d4eda649845 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
@@ -1295,11 +1295,11 @@ static ssize_t amdgpu_get_vcn_reset_mask(struct device 
*dev,
 static DEVICE_ATTR(vcn_reset_mask, 0444,
   amdgpu_get_vcn_reset_mask, NULL);
 
-int amdgpu_vcn_sysfs_reset_mask_init(struct amdgpu_device *adev)
+int amdgpu_vcn_sysfs_reset_mask_init(struct amdgpu_device *adev, int inst)
 {
int r = 0;
 
-   if (adev->vcn.num_vcn_inst) {
+   if (inst == 0) {
r = device_create_file(adev->dev, &dev_attr_vcn_reset_mask);
if (r)
return r;
@@ -1308,12 +1308,12 @@ int amdgpu_vcn_sysfs_reset_mask_init(struct 
amdgpu_device *adev)
return r;
 }
 
-void amdgpu_vcn_sysfs_reset_mask_fini(struct amdgpu_device *adev)
+void amdgpu_vcn_sysfs_reset_mask_fini(struct amdgpu_device *adev, int inst)
 {
int idx;
 
if (drm_dev_enter(adev_to_drm(adev), &idx)) {
-   if (adev->vcn.num_vcn_inst)
+   if (inst == 0)
device_remove_file(adev->dev, &dev_attr_vcn_reset_mask);
drm_dev_exit(idx);
}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
index 7ff4ae2a0432..9b10044c61a3 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
@@ -519,7 +519,7 @@ int amdgpu_vcn_ras_sw_init(struct amdgpu_device *adev);
 int amdgpu_vcn_psp_update_sram(struct amdgpu_device *adev, int inst_idx,
   enum AMDGPU_UCODE_ID ucode_id);
 int amdgpu_vcn_save_vcpu_bo(struct amdgpu_device *adev, int inst);
-int amdgpu_vcn_sysfs_reset_mask_init(struct amdgpu_device *adev);
-void amdgpu_vcn_sysfs_reset_mask_fini(struct amdgpu_device *adev);
+int amdgpu_vcn_sysfs_reset_mask_init(struct amdgpu_device *adev, int inst);
+void amdgpu_vcn_sysfs_reset_mask_fini(struct amdgpu_device *adev, int inst);
 
 #endif
diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c 
b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c
index 59f83409d323..109b27904984 100644
--- a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c
@@ -250,11 +250,9 @@ static int vcn_v4_0_sw_init(struct amdgpu_ip_block 
*ip_block)
ip_block->ip_dump = ptr;
}
 
-   if (inst == 0) {
-   r = amdgpu_vcn_sysfs_reset_mask_init(adev);
-   if (r)
-   return r;
-   }
+   r = amdgpu_vcn_sysfs_reset_mask_init(adev, inst);
+   if (r)
+   return r;
 
return 0;
 }
@@ -292,7 +290,7 @@ static int vcn_v4_0_sw_fini(struct amdgpu_ip_block 
*ip_block)
if (r)
return r;
 
-   amdgpu_vcn_sysfs_reset_mask_fini(adev);
+   amdgpu_vcn_sysfs_reset_mask_fini(adev, inst);
r = amdgpu_vcn_sw_fini(adev, inst);
 
kfree(ip_block->ip_dump);
diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c 
b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c
index e9b869f373c9..ef3dfd44a022 100644
--- a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c
+++ b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c
@@ -217,7 +217,7 @@ static int vcn_v4_0_3_sw_init(struct amdgpu_ip_block 
*ip_block)

[PATCH 2/2] drm/amdgpu: fix vcn sw init failed

2024-11-12 Thread jesse.zh...@amd.com
[ 2875.870277] [drm:amdgpu_device_init [amdgpu]] *ERROR* sw_init of IP block 
 failed -22
[ 2875.880494] amdgpu :01:00.0: amdgpu: amdgpu_device_ip_init failed
[ 2875.887689] amdgpu :01:00.0: amdgpu: Fatal error during GPU init
[ 2875.894791] amdgpu :01:00.0: amdgpu: amdgpu: finishing device.

Add irqs with different IRQ source pointer for vcn0 and vcn1.

Signed-off-by: Jesse Zhang 
---
 drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c | 19 +--
 1 file changed, 13 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c 
b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c
index ef3dfd44a022..82b90f1e6f33 100644
--- a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c
+++ b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c
@@ -83,6 +83,10 @@ static const struct amdgpu_hwip_reg_entry 
vcn_reg_list_4_0_3[] = {
 
 #define NORMALIZE_VCN_REG_OFFSET(offset) \
(offset & 0x1)
+static int amdgpu_ih_clientid_vcns[] = {
+   SOC15_IH_CLIENTID_VCN,
+   SOC15_IH_CLIENTID_VCN1
+};
 
 static int vcn_v4_0_3_start_sriov(struct amdgpu_device *adev);
 static void vcn_v4_0_3_set_unified_ring_funcs(struct amdgpu_device *adev, int 
inst);
@@ -150,9 +154,9 @@ static int vcn_v4_0_3_sw_init(struct amdgpu_ip_block 
*ip_block)
if (r)
return r;
 
-   /* VCN DEC TRAP */
-   r = amdgpu_irq_add_id(adev, SOC15_IH_CLIENTID_VCN,
-   VCN_4_0__SRCID__UVD_ENC_GENERAL_PURPOSE, &adev->vcn.inst->irq);
+   /* VCN UNIFIED TRAP */
+   r = amdgpu_irq_add_id(adev, amdgpu_ih_clientid_vcns[inst],
+   VCN_4_0__SRCID__UVD_ENC_GENERAL_PURPOSE, 
&adev->vcn.inst[inst].irq);
if (r)
return r;
 
@@ -174,7 +178,7 @@ static int vcn_v4_0_3_sw_init(struct amdgpu_ip_block 
*ip_block)
 
ring->vm_hub = AMDGPU_MMHUB0(adev->vcn.inst[inst].aid_id);
sprintf(ring->name, "vcn_unified_%d", adev->vcn.inst[inst].aid_id);
-   r = amdgpu_ring_init(adev, ring, 512, &adev->vcn.inst->irq, 0,
+   r = amdgpu_ring_init(adev, ring, 512, &adev->vcn.inst[inst].irq, 0,
 AMDGPU_RING_PRIO_DEFAULT,
 &adev->vcn.inst[inst].sched_score);
if (r)
@@ -1734,9 +1738,12 @@ static const struct amdgpu_irq_src_funcs 
vcn_v4_0_3_irq_funcs = {
  */
 static void vcn_v4_0_3_set_irq_funcs(struct amdgpu_device *adev, int inst)
 {
-   adev->vcn.inst->irq.num_types++;
+   if (adev->vcn.harvest_config & (1 << inst))
+   return;
+
+   adev->vcn.inst[inst].irq.num_types = adev->vcn.num_enc_rings + 1;
 
-   adev->vcn.inst->irq.funcs = &vcn_v4_0_3_irq_funcs;
+   adev->vcn.inst[inst].irq.funcs = &vcn_v4_0_3_irq_funcs;
 }
 
 static void vcn_v4_0_3_print_ip_state(struct amdgpu_ip_block *ip_block, struct 
drm_printer *p)
-- 
2.25.1



[PATCH 1/2] drm/amdgpu: fix vcn sw init failed

2024-11-12 Thread jesse.zh...@amd.com
For multiple vcn instances, to avoid creating reset sysfs multiple times,
add the instance parameter in reset mask init.

Signed-off-by: Jesse Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c |  8 
 drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h |  4 ++--
 drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c   | 10 --
 drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c |  4 ++--
 drivers/gpu/drm/amd/amdgpu/vcn_v5_0_0.c |  4 ++--
 5 files changed, 14 insertions(+), 16 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
index 25f490ad3a85..1d4eda649845 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
@@ -1295,11 +1295,11 @@ static ssize_t amdgpu_get_vcn_reset_mask(struct device 
*dev,
 static DEVICE_ATTR(vcn_reset_mask, 0444,
   amdgpu_get_vcn_reset_mask, NULL);
 
-int amdgpu_vcn_sysfs_reset_mask_init(struct amdgpu_device *adev)
+int amdgpu_vcn_sysfs_reset_mask_init(struct amdgpu_device *adev, int inst)
 {
int r = 0;
 
-   if (adev->vcn.num_vcn_inst) {
+   if (inst == 0) {
r = device_create_file(adev->dev, &dev_attr_vcn_reset_mask);
if (r)
return r;
@@ -1308,12 +1308,12 @@ int amdgpu_vcn_sysfs_reset_mask_init(struct 
amdgpu_device *adev)
return r;
 }
 
-void amdgpu_vcn_sysfs_reset_mask_fini(struct amdgpu_device *adev)
+void amdgpu_vcn_sysfs_reset_mask_fini(struct amdgpu_device *adev, int inst)
 {
int idx;
 
if (drm_dev_enter(adev_to_drm(adev), &idx)) {
-   if (adev->vcn.num_vcn_inst)
+   if (inst == 0)
device_remove_file(adev->dev, &dev_attr_vcn_reset_mask);
drm_dev_exit(idx);
}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
index 7ff4ae2a0432..9b10044c61a3 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
@@ -519,7 +519,7 @@ int amdgpu_vcn_ras_sw_init(struct amdgpu_device *adev);
 int amdgpu_vcn_psp_update_sram(struct amdgpu_device *adev, int inst_idx,
   enum AMDGPU_UCODE_ID ucode_id);
 int amdgpu_vcn_save_vcpu_bo(struct amdgpu_device *adev, int inst);
-int amdgpu_vcn_sysfs_reset_mask_init(struct amdgpu_device *adev);
-void amdgpu_vcn_sysfs_reset_mask_fini(struct amdgpu_device *adev);
+int amdgpu_vcn_sysfs_reset_mask_init(struct amdgpu_device *adev, int inst);
+void amdgpu_vcn_sysfs_reset_mask_fini(struct amdgpu_device *adev, int inst);
 
 #endif
diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c 
b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c
index 59f83409d323..109b27904984 100644
--- a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c
@@ -250,11 +250,9 @@ static int vcn_v4_0_sw_init(struct amdgpu_ip_block 
*ip_block)
ip_block->ip_dump = ptr;
}
 
-   if (inst == 0) {
-   r = amdgpu_vcn_sysfs_reset_mask_init(adev);
-   if (r)
-   return r;
-   }
+   r = amdgpu_vcn_sysfs_reset_mask_init(adev, inst);
+   if (r)
+   return r;
 
return 0;
 }
@@ -292,7 +290,7 @@ static int vcn_v4_0_sw_fini(struct amdgpu_ip_block 
*ip_block)
if (r)
return r;
 
-   amdgpu_vcn_sysfs_reset_mask_fini(adev);
+   amdgpu_vcn_sysfs_reset_mask_fini(adev, inst);
r = amdgpu_vcn_sw_fini(adev, inst);
 
kfree(ip_block->ip_dump);
diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c 
b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c
index e9b869f373c9..ef3dfd44a022 100644
--- a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c
+++ b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c
@@ -217,7 +217,7 @@ static int vcn_v4_0_3_sw_init(struct amdgpu_ip_block 
*ip_block)
ip_block->ip_dump = ptr;
}
 
-   r = amdgpu_vcn_sysfs_reset_mask_init(adev);
+   r = amdgpu_vcn_sysfs_reset_mask_init(adev, inst);
if (r)
return r;
 
@@ -254,7 +254,7 @@ static int vcn_v4_0_3_sw_fini(struct amdgpu_ip_block 
*ip_block)
if (r)
return r;
 
-   amdgpu_vcn_sysfs_reset_mask_fini(adev);
+   amdgpu_vcn_sysfs_reset_mask_fini(adev, inst);
r = amdgpu_vcn_sw_fini(adev, inst);
 
kfree(ip_block->ip_dump);
diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v5_0_0.c 
b/drivers/gpu/drm/amd/amdgpu/vcn_v5_0_0.c
index 96ec01cffea3..8f9c19c68d88 100644
--- a/drivers/gpu/drm/amd/amdgpu/vcn_v5_0_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/vcn_v5_0_0.c
@@ -186,7 +186,7 @@ static int vcn_v5_0_0_sw_init(struct amdgpu_ip_block 
*ip_block)
ip_block->ip_dump = ptr;
}
 
-   r = amdgpu_vcn_sysfs_reset_mask_init(adev);
+   r = amdgpu_vcn_sysfs_reset_mask_init(adev, inst);
if (r)
return r;
 
@@ -223,7 +223,7 @@ static int vcn_v5_0_0_sw_fini(struct amdgpu_ip_block 
*ip_block)
if (r)
return 

[PATCH V3 5/5] drm/amdgpu: Add sysfs interface for jpeg reset mask

2024-10-24 Thread jesse.zh...@amd.com
From: "jesse.zh...@amd.com" 

Add the sysfs interface for jpeg:
jpeg_reset_mask

The interface is read-only and show the resets supported by the IP.
For example, full adapter reset (mode1/mode2/BACO/etc),
soft reset, queue reset, and pipe reset.

V2: the sysfs node returns a text string instead of some flags (Christian)
v3: add a generic helper which takes the ring as parameter
and print the strings in the order they are applied (Christian)

check amdgpu_gpu_recovery  before creating sysfs file itself,
and initialize supported_reset_types in IP version files (Lijo)

Signed-off-by: Jesse Zhang 
Suggested-by: Alex Deucher 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.c | 35 
 drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.h |  3 ++
 drivers/gpu/drm/amd/amdgpu/jpeg_v4_0.c   |  7 +
 drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c |  8 ++
 drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_5.c |  8 ++
 drivers/gpu/drm/amd/amdgpu/jpeg_v5_0_0.c |  7 +
 6 files changed, 68 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.c
index 95e2796919fc..f971ffdffce9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.c
@@ -415,3 +415,38 @@ void amdgpu_debugfs_jpeg_sched_mask_init(struct 
amdgpu_device *adev)
&amdgpu_debugfs_jpeg_sched_mask_fops);
 #endif
 }
+
+static ssize_t amdgpu_get_jpeg_reset_mask(struct device *dev,
+   struct device_attribute *attr,
+   char *buf)
+{
+   struct drm_device *ddev = dev_get_drvdata(dev);
+   struct amdgpu_device *adev = drm_to_adev(ddev);
+
+   if (!adev)
+   return -ENODEV;
+
+   return amdgpu_show_reset_mask(buf, adev->jpeg.supported_reset);
+}
+
+static DEVICE_ATTR(jpeg_reset_mask, 0444,
+  amdgpu_get_jpeg_reset_mask, NULL);
+
+int amdgpu_jpeg_sysfs_reset_mask_init(struct amdgpu_device *adev)
+{
+   int r = 0;
+
+   if (adev->jpeg.num_jpeg_inst) {
+   r = device_create_file(adev->dev, &dev_attr_jpeg_reset_mask);
+   if (r)
+   return r;
+   }
+
+   return r;
+}
+
+void amdgpu_jpeg_sysfs_reset_mask_fini(struct amdgpu_device *adev)
+{
+   if (adev->jpeg.num_jpeg_inst)
+   device_remove_file(adev->dev, &dev_attr_jpeg_reset_mask);
+}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.h
index 819dc7a0af99..3eb4a4653fce 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.h
@@ -128,6 +128,7 @@ struct amdgpu_jpeg {
uint16_t inst_mask;
uint8_t num_inst_per_aid;
boolindirect_sram;
+   uint32_t supported_reset;
 };
 
 int amdgpu_jpeg_sw_init(struct amdgpu_device *adev);
@@ -150,5 +151,7 @@ int amdgpu_jpeg_ras_sw_init(struct amdgpu_device *adev);
 int amdgpu_jpeg_psp_update_sram(struct amdgpu_device *adev, int inst_idx,
   enum AMDGPU_UCODE_ID ucode_id);
 void amdgpu_debugfs_jpeg_sched_mask_init(struct amdgpu_device *adev);
+int amdgpu_jpeg_sysfs_reset_mask_init(struct amdgpu_device *adev);
+void amdgpu_jpeg_sysfs_reset_mask_fini(struct amdgpu_device *adev);
 
 #endif /*__AMDGPU_JPEG_H__*/
diff --git a/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0.c 
b/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0.c
index 20e1fe89c463..d1ee342d91e7 100644
--- a/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0.c
@@ -121,6 +121,12 @@ static int jpeg_v4_0_sw_init(struct amdgpu_ip_block 
*ip_block)
adev->jpeg.inst->external.jpeg_pitch[0] = SOC15_REG_OFFSET(JPEG, 0, 
regUVD_JPEG_PITCH);
 
r = amdgpu_jpeg_ras_sw_init(adev);
+   if (r)
+   return r;
+   /* TODO: Check the version that supports queue reset */
+   adev->jpeg.supported_reset |=
+   
amdgpu_get_soft_full_reset_mask(&adev->jpeg.inst[0].ring_dec[0]);
+   r = amdgpu_jpeg_sysfs_reset_mask_init(adev);
if (r)
return r;
 
@@ -143,6 +149,7 @@ static int jpeg_v4_0_sw_fini(struct amdgpu_ip_block 
*ip_block)
if (r)
return r;
 
+   amdgpu_jpeg_sysfs_reset_mask_fini(adev);
r = amdgpu_jpeg_sw_fini(adev);
 
return r;
diff --git a/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c 
b/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c
index 2a53537db135..8c673fe71e5b 100644
--- a/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c
+++ b/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c
@@ -159,6 +159,13 @@ static int jpeg_v4_0_3_sw_init(struct amdgpu_ip_block 
*ip_block)
}
}
 
+   /* TODO: Check the version that supports queue reset */
+   adev->jpeg.supported_reset |=
+   
amdgpu_get_soft_full_reset_mask(&adev->jpeg.inst[0].ring_dec[0]);
+  

[PATCH] drm/amdgpu: fix warning when removing sysfs

2024-11-07 Thread jesse.zh...@amd.com
Fix similar warning when running IGT:

[  155.585721] kernfs: can not remove 'enforce_isolation', no directory
[  155.592201] WARNING: CPU: 3 PID: 6960 at fs/kernfs/dir.c:1683 
kernfs_remove_by_name_ns+0xb9/0xc0
[  155.601145] Modules linked in: xt_MASQUERADE xt_comment nft_compat veth 
bridge stp llc overlay nft_fib_inet nft_fib_ipv4 nft_fib_ipv6 nft_fib 
nft_reject_inet nf_reject_ipv4 nf_reject_ipv6 nft_reject nft_ct nft_chain_nat 
nf_nat nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 ip_set nf_tables nfnetlink 
qrtr intel_rapl_msr amd_atl intel_rapl_common amd64_edac edac_mce_amd amdgpu 
kvm_amd kvm ipmi_ssif amdxcp rapl drm_exec gpu_sched drm_buddy i2c_algo_bit 
drm_suballoc_helper drm_ttm_helper ttm pcspkr drm_display_helper acpi_cpufreq 
drm_kms_helper video wmi k10temp i2c_piix4 acpi_ipmi ipmi_si drm zram ip_tables 
loop squashfs dm_multipath crct10dif_pclmul crc32_pclmul crc32c_intel 
ghash_clmulni_intel sha512_ssse3 sha256_ssse3 sha1_ssse3 sp5100_tco ixgbe 
rfkill ccp dca sunrpc be2iscsi bnx2i cnic uio cxgb4i cxgb4 tls cxgb3i cxgb3 
mdio libcxgbi libcxgb qla4xxx iscsi_boot_sysfs iscsi_tcp libiscsi_tcp libiscsi 
scsi_transport_iscsi ipmi_devintf ipmi_msghandler fuse
[  155.685224] systemd-journald[1354]: Compressed data object 957 -> 524 using 
ZSTD
[  155.685687] CPU: 3 PID: 6960 Comm: amd_pci_unplug Not tainted 
6.10.0-1148853.1.zuul.164395107d6642bdb451071313e9378d #1
[  155.704149] Hardware name: TYAN B8021G88V2HR-2T/S8021GM2NR-2T, BIOS 
V1.03.B10 04/01/2019
[  155.712383] RIP: 0010:kernfs_remove_by_name_ns+0xb9/0xc0
[  155.717805] Code: a0 00 48 89 ef e8 37 96 c7 ff 5b b8 fe ff ff ff 5d 41 5c 
41 5d e9 f7 96 a0 00 0f 0b eb ab 48 c7 c7 48 ba 7e 8f e8 f7 66 bf ff <0f> 0b eb 
dc 0f 1f 00 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90
[  155.736766] RSP: 0018:b1685d7a3e20 EFLAGS: 00010296
[  155.742108] RAX: 0038 RBX: 929e94c8 RCX: 
[  155.749363] RDX: 928e1efaf200 RSI: 928e1efa18c0 RDI: 928e1efa18c0
[  155.756612] RBP: 0008 R08:  R09: 0003
[  155.763855] R10: b1685d7a3cd8 R11: 8fb3e1c8 R12: c1ef5341
[  155.771104] R13: 929e94cc5530 R14:  R15: 
[  155.778357] FS:  7fd9dd8d9c40() GS:928e1ef8() 
knlGS:
[  155.786594] CS:  0010 DS:  ES:  CR0: 80050033
[  155.792450] CR2: 561245ceee38 CR3: 000113018000 CR4: 003506f0
[  155.799702] Call Trace:
[  155.802254]  
[  155.804460]  ? __warn+0x80/0x120
[  155.807798]  ? kernfs_remove_by_name_ns+0xb9/0xc0
[  155.812617]  ? report_bug+0x164/0x190
[  155.816393]  ? handle_bug+0x3c/0x80
[  155.819994]  ? exc_invalid_op+0x17/0x70
[  155.823939]  ? asm_exc_invalid_op+0x1a/0x20
[  155.828235]  ? kernfs_remove_by_name_ns+0xb9/0xc0
[  155.833058]  amdgpu_gfx_sysfs_fini+0x59/0xd0 [amdgpu]
[  155.838637]  gfx_v9_0_sw_fini+0x123/0x1c0 [amdgpu]
[  155.843887]  amdgpu_device_fini_sw+0xbc/0x3e0 [amdgpu]
[  155.849432]  amdgpu_driver_release_kms+0x16/0x30 [amdgpu]
[  155.855235]  drm_dev_put.part.0+0x3c/0x60 [drm]
[  155.859914]  drm_release+0x8b/0xc0 [drm]
[  155.863978]  __fput+0xf1/0x2c0
[  155.867141]  __x64_sys_close+0x3c/0x80
[  155.870998]  do_syscall_64+0x64/0x170

Check if the sysfs directory entry exists before deleting the sysfs file.

Signed-off-by: Jesse Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 3 +++
 drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.c| 3 +++
 drivers/gpu/drm/amd/amdgpu/amdgpu_preempt_mgr.c | 3 ++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c| 3 +++
 drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c | 3 +++
 drivers/gpu/drm/amd/amdgpu/amdgpu_vpe.c | 3 +++
 drivers/gpu/drm/amd/amdgpu/df_v3_6.c| 2 ++
 7 files changed, 19 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
index f7bf5e43f16e..a9f40b28e030 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
@@ -1773,6 +1773,9 @@ int amdgpu_gfx_sysfs_init(struct amdgpu_device *adev)
 
 void amdgpu_gfx_sysfs_fini(struct amdgpu_device *adev)
 {
+   if (!adev || !adev->dev->kobj.sd)
+   return;
+
amdgpu_gfx_sysfs_xcp_fini(adev);
amdgpu_gfx_sysfs_isolation_shader_fini(adev);
amdgpu_gfx_sysfs_reset_mask_fini(adev);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.c
index 642b8c848141..257f4b712f00 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.c
@@ -447,6 +447,9 @@ int amdgpu_jpeg_sysfs_reset_mask_init(struct amdgpu_device 
*adev)
 
 void amdgpu_jpeg_sysfs_reset_mask_fini(struct amdgpu_device *adev)
 {
+   if (!adev || !adev->dev->kobj.sd)
+   return;
+
if (adev->jpeg.num_jpeg_inst)
device_remove_file(adev->dev, &dev_attr_jpeg_reset_mask);
 }
diff --git a/drivers/gp

[PATCH V2] drm/amdgpu: fix warning when removing sysfs

2024-11-08 Thread jesse.zh...@amd.com
Fix the similar warning:

[  155.585721] kernfs: can not remove 'enforce_isolation', no directory
[  155.592201] WARNING: CPU: 3 PID: 6960 at fs/kernfs/dir.c:1683 
kernfs_remove_by_name_ns+0xb9/0xc0
[  155.601145] Modules linked in: xt_MASQUERADE xt_comment nft_compat veth 
bridge stp llc overlay nft_fib_inet nft_fib_ipv4 nft_fib_ipv6 nft_fib 
nft_reject_inet nf_reject_ipv4 nf_reject_ipv6 nft_reject nft_ct nft_chain_nat 
nf_nat nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 ip_set nf_tables nfnetlink 
qrtr intel_rapl_msr amd_atl intel_rapl_common amd64_edac edac_mce_amd amdgpu 
kvm_amd kvm ipmi_ssif amdxcp rapl drm_exec gpu_sched drm_buddy i2c_algo_bit 
drm_suballoc_helper drm_ttm_helper ttm pcspkr drm_display_helper acpi_cpufreq 
drm_kms_helper video wmi k10temp i2c_piix4 acpi_ipmi ipmi_si drm zram ip_tables 
loop squashfs dm_multipath crct10dif_pclmul crc32_pclmul crc32c_intel 
ghash_clmulni_intel sha512_ssse3 sha256_ssse3 sha1_ssse3 sp5100_tco ixgbe 
rfkill ccp dca sunrpc be2iscsi bnx2i cnic uio cxgb4i cxgb4 tls cxgb3i cxgb3 
mdio libcxgbi libcxgb qla4xxx iscsi_boot_sysfs iscsi_tcp libiscsi_tcp libiscsi 
scsi_transport_iscsi ipmi_devintf ipmi_msghandler fuse
[  155.685224] systemd-journald[1354]: Compressed data object 957 -> 524 using 
ZSTD
[  155.685687] CPU: 3 PID: 6960 Comm: amd_pci_unplug Not tainted 
6.10.0-1148853.1.zuul.164395107d6642bdb451071313e9378d #1
[  155.704149] Hardware name: TYAN B8021G88V2HR-2T/S8021GM2NR-2T, BIOS 
V1.03.B10 04/01/2019
[  155.712383] RIP: 0010:kernfs_remove_by_name_ns+0xb9/0xc0
[  155.717805] Code: a0 00 48 89 ef e8 37 96 c7 ff 5b b8 fe ff ff ff 5d 41 5c 
41 5d e9 f7 96 a0 00 0f 0b eb ab 48 c7 c7 48 ba 7e 8f e8 f7 66 bf ff <0f> 0b eb 
dc 0f 1f 00 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90
[  155.736766] RSP: 0018:b1685d7a3e20 EFLAGS: 00010296
[  155.742108] RAX: 0038 RBX: 929e94c8 RCX: 
[  155.749363] RDX: 928e1efaf200 RSI: 928e1efa18c0 RDI: 928e1efa18c0
[  155.756612] RBP: 0008 R08:  R09: 0003
[  155.763855] R10: b1685d7a3cd8 R11: 8fb3e1c8 R12: c1ef5341
[  155.771104] R13: 929e94cc5530 R14:  R15: 
[  155.778357] FS:  7fd9dd8d9c40() GS:928e1ef8() 
knlGS:
[  155.786594] CS:  0010 DS:  ES:  CR0: 80050033
[  155.792450] CR2: 561245ceee38 CR3: 000113018000 CR4: 003506f0
[  155.799702] Call Trace:
[  155.802254]  
[  155.804460]  ? __warn+0x80/0x120
[  155.807798]  ? kernfs_remove_by_name_ns+0xb9/0xc0
[  155.812617]  ? report_bug+0x164/0x190
[  155.816393]  ? handle_bug+0x3c/0x80
[  155.819994]  ? exc_invalid_op+0x17/0x70
[  155.823939]  ? asm_exc_invalid_op+0x1a/0x20
[  155.828235]  ? kernfs_remove_by_name_ns+0xb9/0xc0
[  155.833058]  amdgpu_gfx_sysfs_fini+0x59/0xd0 [amdgpu]
[  155.838637]  gfx_v9_0_sw_fini+0x123/0x1c0 [amdgpu]
[  155.843887]  amdgpu_device_fini_sw+0xbc/0x3e0 [amdgpu]
[  155.849432]  amdgpu_driver_release_kms+0x16/0x30 [amdgpu]
[  155.855235]  drm_dev_put.part.0+0x3c/0x60 [drm]
[  155.859914]  drm_release+0x8b/0xc0 [drm]
[  155.863978]  __fput+0xf1/0x2c0
[  155.867141]  __x64_sys_close+0x3c/0x80
[  155.870998]  do_syscall_64+0x64/0x170

Check if the sysfs directory entry exists before deleting the sysfs file.

Signed-off-by: Jesse Zhang 
Suggested-by: Lijo Lazar 
Reviewed-by: Tim Huang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 12 +---
 drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.c| 10 --
 drivers/gpu/drm/amd/amdgpu/amdgpu_preempt_mgr.c |  8 ++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c| 10 --
 drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c |  9 +++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_vpe.c |  9 +++--
 drivers/gpu/drm/amd/amdgpu/df_v3_6.c|  8 ++--
 7 files changed, 51 insertions(+), 15 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
index f7bf5e43f16e..f552d7dfae96 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
@@ -25,6 +25,7 @@
 
 #include 
 #include 
+#include 
 
 #include "amdgpu.h"
 #include "amdgpu_gfx.h"
@@ -1773,9 +1774,14 @@ int amdgpu_gfx_sysfs_init(struct amdgpu_device *adev)
 
 void amdgpu_gfx_sysfs_fini(struct amdgpu_device *adev)
 {
-   amdgpu_gfx_sysfs_xcp_fini(adev);
-   amdgpu_gfx_sysfs_isolation_shader_fini(adev);
-   amdgpu_gfx_sysfs_reset_mask_fini(adev);
+   int idx;
+
+   if (drm_dev_enter(adev_to_drm(adev), &idx)) {
+   amdgpu_gfx_sysfs_xcp_fini(adev);
+   amdgpu_gfx_sysfs_isolation_shader_fini(adev);
+   amdgpu_gfx_sysfs_reset_mask_fini(adev);
+   drm_dev_exit(idx);
+   }
 }
 
 int amdgpu_gfx_cleaner_shader_sw_init(struct amdgpu_device *adev,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.c
index 642b8c8481

[PATCH 1/3] Revert "drm/amdgpu: fix a mistake when removing mem_info_preempt_used sysfs"

2024-11-18 Thread jesse.zh...@amd.com
From: "jesse.zh...@amd.com" 

This reverts commit 10aec8943bcc5123288ded8c97e78312bcf17fb1.
The dev->unplugged flag is also set to true when the driver is merely
unloaded via amdgpu_exit, without the device actually being unplugged;
that will cause a new issue.

Signed-off-by: Jesse Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_preempt_mgr.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_preempt_mgr.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_preempt_mgr.c
index 9a0346ed6ea4..33a714ddfbbc 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_preempt_mgr.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_preempt_mgr.c
@@ -138,7 +138,7 @@ void amdgpu_preempt_mgr_fini(struct amdgpu_device *adev)
if (ret)
return;
 
-   if (drm_dev_enter(adev_to_drm(adev), &idx)) {
+   if (!drm_dev_enter(adev_to_drm(adev), &idx)) {
device_remove_file(adev->dev, &dev_attr_mem_info_preempt_used);
drm_dev_exit(idx);
}
-- 
2.25.1



[PATCH 3/3 V2] drm/amdgpu: Fix sysfs warning when hotplugging

2024-11-18 Thread jesse.zh...@amd.com
Fix the similar warning when hotplugging:

[  155.585721] kernfs: can not remove 'enforce_isolation', no directory
[  155.592201] WARNING: CPU: 3 PID: 6960 at fs/kernfs/dir.c:1683 
kernfs_remove_by_name_ns+0xb9/0xc0
[  155.601145] Modules linked in: xt_MASQUERADE xt_comment nft_compat veth 
bridge stp llc overlay nft_fib_inet nft_fib_ipv4 nft_fib_ipv6 nft_fib 
nft_reject_inet nf_reject_ipv4 nf_reject_ipv6 nft_reject nft_ct nft_chain_nat 
nf_nat nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 ip_set nf_tables nfnetlink 
qrtr intel_rapl_msr amd_atl intel_rapl_common amd64_edac edac_mce_amd amdgpu 
kvm_amd kvm ipmi_ssif amdxcp rapl drm_exec gpu_sched drm_buddy i2c_algo_bit 
drm_suballoc_helper drm_ttm_helper ttm pcspkr drm_display_helper acpi_cpufreq 
drm_kms_helper video wmi k10temp i2c_piix4 acpi_ipmi ipmi_si drm zram ip_tables 
loop squashfs dm_multipath crct10dif_pclmul crc32_pclmul crc32c_intel 
ghash_clmulni_intel sha512_ssse3 sha256_ssse3 sha1_ssse3 sp5100_tco ixgbe 
rfkill ccp dca sunrpc be2iscsi bnx2i cnic uio cxgb4i cxgb4 tls cxgb3i cxgb3 
mdio libcxgbi libcxgb qla4xxx iscsi_boot_sysfs iscsi_tcp libiscsi_tcp libiscsi 
scsi_transport_iscsi ipmi_devintf ipmi_msghandler fuse
[  155.685224] systemd-journald[1354]: Compressed data object 957 -> 524 using 
ZSTD
[  155.685687] CPU: 3 PID: 6960 Comm: amd_pci_unplug Not tainted 
6.10.0-1148853.1.zuul.164395107d6642bdb451071313e9378d #1
[  155.704149] Hardware name: TYAN B8021G88V2HR-2T/S8021GM2NR-2T, BIOS 
V1.03.B10 04/01/2019
[  155.712383] RIP: 0010:kernfs_remove_by_name_ns+0xb9/0xc0
[  155.717805] Code: a0 00 48 89 ef e8 37 96 c7 ff 5b b8 fe ff ff ff 5d 41 5c 
41 5d e9 f7 96 a0 00 0f 0b eb ab 48 c7 c7 48 ba 7e 8f e8 f7 66 bf ff <0f> 0b eb 
dc 0f 1f 00 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90
[  155.736766] RSP: 0018:b1685d7a3e20 EFLAGS: 00010296
[  155.742108] RAX: 0038 RBX: 929e94c8 RCX: 
[  155.749363] RDX: 928e1efaf200 RSI: 928e1efa18c0 RDI: 928e1efa18c0
[  155.756612] RBP: 0008 R08:  R09: 0003
[  155.763855] R10: b1685d7a3cd8 R11: 8fb3e1c8 R12: c1ef5341
[  155.771104] R13: 929e94cc5530 R14:  R15: 
[  155.778357] FS:  7fd9dd8d9c40() GS:928e1ef8() 
knlGS:
[  155.786594] CS:  0010 DS:  ES:  CR0: 80050033
[  155.792450] CR2: 561245ceee38 CR3: 000113018000 CR4: 003506f0
[  155.799702] Call Trace:
[  155.802254]  
[  155.804460]  ? __warn+0x80/0x120
[  155.807798]  ? kernfs_remove_by_name_ns+0xb9/0xc0
[  155.812617]  ? report_bug+0x164/0x190
[  155.816393]  ? handle_bug+0x3c/0x80
[  155.819994]  ? exc_invalid_op+0x17/0x70
[  155.823939]  ? asm_exc_invalid_op+0x1a/0x20
[  155.828235]  ? kernfs_remove_by_name_ns+0xb9/0xc0
[  155.833058]  amdgpu_gfx_sysfs_fini+0x59/0xd0 [amdgpu]
[  155.838637]  gfx_v9_0_sw_fini+0x123/0x1c0 [amdgpu]
[  155.843887]  amdgpu_device_fini_sw+0xbc/0x3e0 [amdgpu]
[  155.849432]  amdgpu_driver_release_kms+0x16/0x30 [amdgpu]
[  155.855235]  drm_dev_put.part.0+0x3c/0x60 [drm]
[  155.859914]  drm_release+0x8b/0xc0 [drm]
[  155.863978]  __fput+0xf1/0x2c0
[  155.867141]  __x64_sys_close+0x3c/0x80
[  155.870998]  do_syscall_64+0x64/0x170

V2: Add details in comments (Tim)

Signed-off-by: Jesse Zhang 
Reported-by: Andy Dong 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 8 +---
 drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.c| 6 --
 drivers/gpu/drm/amd/amdgpu/amdgpu_preempt_mgr.c | 3 ++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c| 6 --
 drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c | 6 --
 drivers/gpu/drm/amd/amdgpu/amdgpu_vpe.c | 6 --
 drivers/gpu/drm/amd/amdgpu/df_v3_6.c| 4 ++--
 7 files changed, 25 insertions(+), 14 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
index 3c89c74d67e0..e54f42e3797e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
@@ -1778,9 +1778,11 @@ int amdgpu_gfx_sysfs_init(struct amdgpu_device *adev)
 
 void amdgpu_gfx_sysfs_fini(struct amdgpu_device *adev)
 {
-   amdgpu_gfx_sysfs_xcp_fini(adev);
-   amdgpu_gfx_sysfs_isolation_shader_fini(adev);
-   amdgpu_gfx_sysfs_reset_mask_fini(adev);
+   if (adev->dev->kobj.sd) {
+   amdgpu_gfx_sysfs_xcp_fini(adev);
+   amdgpu_gfx_sysfs_isolation_shader_fini(adev);
+   amdgpu_gfx_sysfs_reset_mask_fini(adev);
+   }
 }
 
 int amdgpu_gfx_cleaner_shader_sw_init(struct amdgpu_device *adev,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.c
index 43ea76ebbad8..9a1a317d4fd9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.c
@@ -447,6 +447,8 @@ int amdgpu_jpeg_sysfs_reset_mask_init(struct amdgpu_device 
*adev)
 
 void amdgpu_jpeg_sysfs_reset_mask_fini(struct amdg

[PATCH] drm/amdgpu: Fix sysfs warning when hotplugging

2024-11-14 Thread jesse.zh...@amd.com
Replace the drm_dev_enter check with a check of the sysfs directory entry,
because the dev->unplugged flag is also set to true when the driver is
merely unloaded via amdgpu_exit, without the device actually being unplugged.

Signed-off-by: Jesse Zhang 
Reported-by: Andy Dong 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 5 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.c| 5 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_preempt_mgr.c | 6 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c| 5 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c | 5 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_vpe.c | 5 +
 drivers/gpu/drm/amd/amdgpu/df_v3_6.c| 6 +-
 7 files changed, 8 insertions(+), 29 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
index c65feb97167d..cda0efd4d73c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
@@ -1779,13 +1779,10 @@ int amdgpu_gfx_sysfs_init(struct amdgpu_device *adev)
 
 void amdgpu_gfx_sysfs_fini(struct amdgpu_device *adev)
 {
-   int idx;
-
-   if (drm_dev_enter(adev_to_drm(adev), &idx)) {
+   if (adev->dev->kobj.sd) {
amdgpu_gfx_sysfs_xcp_fini(adev);
amdgpu_gfx_sysfs_isolation_shader_fini(adev);
amdgpu_gfx_sysfs_reset_mask_fini(adev);
-   drm_dev_exit(idx);
}
 }
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.c
index bf4dbceb18e1..7444b556e78a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.c
@@ -448,11 +448,8 @@ int amdgpu_jpeg_sysfs_reset_mask_init(struct amdgpu_device 
*adev)
 
 void amdgpu_jpeg_sysfs_reset_mask_fini(struct amdgpu_device *adev)
 {
-   int idx;
-
-   if (drm_dev_enter(adev_to_drm(adev), &idx)) {
+   if (adev->dev->kobj.sd) {
if (adev->jpeg.num_jpeg_inst)
device_remove_file(adev->dev, 
&dev_attr_jpeg_reset_mask);
-   drm_dev_exit(idx);
}
 }
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_preempt_mgr.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_preempt_mgr.c
index 9a0346ed6ea4..ead1ca43e14e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_preempt_mgr.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_preempt_mgr.c
@@ -130,7 +130,7 @@ int amdgpu_preempt_mgr_init(struct amdgpu_device *adev)
 void amdgpu_preempt_mgr_fini(struct amdgpu_device *adev)
 {
struct ttm_resource_manager *man = &adev->mman.preempt_mgr;
-   int idx, ret;
+   int ret;
 
ttm_resource_manager_set_used(man, false);
 
@@ -138,10 +138,8 @@ void amdgpu_preempt_mgr_fini(struct amdgpu_device *adev)
if (ret)
return;
 
-   if (drm_dev_enter(adev_to_drm(adev), &idx)) {
+   if (adev->dev->kobj.sd)
device_remove_file(adev->dev, &dev_attr_mem_info_preempt_used);
-   drm_dev_exit(idx);
-   }
 
ttm_resource_manager_cleanup(man);
ttm_set_driver_manager(&adev->mman.bdev, AMDGPU_PL_PREEMPT, NULL);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
index 24e9daacaabb..11c64f087efd 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
@@ -449,14 +449,11 @@ int amdgpu_sdma_sysfs_reset_mask_init(struct 
amdgpu_device *adev)
 
 void amdgpu_sdma_sysfs_reset_mask_fini(struct amdgpu_device *adev)
 {
-   int idx;
-
if (!amdgpu_gpu_recovery)
return;
 
-   if (drm_dev_enter(adev_to_drm(adev), &idx)) {
+   if (adev->dev->kobj.sd) {
if (adev->sdma.num_instances)
device_remove_file(adev->dev, 
&dev_attr_sdma_reset_mask);
-   drm_dev_exit(idx);
}
 }
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
index 25f490ad3a85..ed9c795e7b35 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
@@ -1310,11 +1310,8 @@ int amdgpu_vcn_sysfs_reset_mask_init(struct 
amdgpu_device *adev)
 
 void amdgpu_vcn_sysfs_reset_mask_fini(struct amdgpu_device *adev)
 {
-   int idx;
-
-   if (drm_dev_enter(adev_to_drm(adev), &idx)) {
+   if (adev->dev->kobj.sd) {
if (adev->vcn.num_vcn_inst)
device_remove_file(adev->dev, &dev_attr_vcn_reset_mask);
-   drm_dev_exit(idx);
}
 }
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vpe.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vpe.c
index b5f5a1a81c29..dc96e81235df 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vpe.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vpe.c
@@ -904,12 +904,9 @@ int amdgpu_vpe_sysfs_reset_mask_init(struct amdgpu_device 
*adev)
 
 void amdgpu_vpe_sysfs_reset_mask_fini(struct amdgpu_device *adev)
 {
-   int idx;
-
-   if (drm_dev_enter(adev_to_drm(adev), &idx)) {
+   if (adev->dev->kobj.sd) {
   

[PATCH 3/3] drm/amdgpu: Fix sysfs warning when hotplugging

2024-11-17 Thread jesse.zh...@amd.com
Replace the drm_dev_enter check with a check of the sysfs directory entry,
because the dev->unplugged flag is also set to true when the driver is
merely unloaded via amdgpu_exit, without the device actually being unplugged.

Signed-off-by: Jesse Zhang 
Reported-by: Andy Dong 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 8 +---
 drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.c| 6 --
 drivers/gpu/drm/amd/amdgpu/amdgpu_preempt_mgr.c | 3 ++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c| 6 --
 drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c | 6 --
 drivers/gpu/drm/amd/amdgpu/amdgpu_vpe.c | 6 --
 drivers/gpu/drm/amd/amdgpu/df_v3_6.c| 4 ++--
 7 files changed, 25 insertions(+), 14 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
index 3c89c74d67e0..e54f42e3797e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
@@ -1778,9 +1778,11 @@ int amdgpu_gfx_sysfs_init(struct amdgpu_device *adev)
 
 void amdgpu_gfx_sysfs_fini(struct amdgpu_device *adev)
 {
-   amdgpu_gfx_sysfs_xcp_fini(adev);
-   amdgpu_gfx_sysfs_isolation_shader_fini(adev);
-   amdgpu_gfx_sysfs_reset_mask_fini(adev);
+   if (adev->dev->kobj.sd) {
+   amdgpu_gfx_sysfs_xcp_fini(adev);
+   amdgpu_gfx_sysfs_isolation_shader_fini(adev);
+   amdgpu_gfx_sysfs_reset_mask_fini(adev);
+   }
 }
 
 int amdgpu_gfx_cleaner_shader_sw_init(struct amdgpu_device *adev,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.c
index 43ea76ebbad8..9a1a317d4fd9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.c
@@ -447,6 +447,8 @@ int amdgpu_jpeg_sysfs_reset_mask_init(struct amdgpu_device 
*adev)
 
 void amdgpu_jpeg_sysfs_reset_mask_fini(struct amdgpu_device *adev)
 {
-   if (adev->jpeg.num_jpeg_inst)
-   device_remove_file(adev->dev, &dev_attr_jpeg_reset_mask);
+   if (adev->dev->kobj.sd) {
+   if (adev->jpeg.num_jpeg_inst)
+   device_remove_file(adev->dev, 
&dev_attr_jpeg_reset_mask);
+   }
 }
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_preempt_mgr.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_preempt_mgr.c
index e8adfd0a570a..34b5e22b44e5 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_preempt_mgr.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_preempt_mgr.c
@@ -137,7 +137,8 @@ void amdgpu_preempt_mgr_fini(struct amdgpu_device *adev)
if (ret)
return;
 
-   device_remove_file(adev->dev, &dev_attr_mem_info_preempt_used);
+   if (adev->dev->kobj.sd)
+   device_remove_file(adev->dev, &dev_attr_mem_info_preempt_used);
 
ttm_resource_manager_cleanup(man);
ttm_set_driver_manager(&adev->mman.bdev, AMDGPU_PL_PREEMPT, NULL);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
index 8c89b69edc20..113f0d242618 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
@@ -451,6 +451,8 @@ void amdgpu_sdma_sysfs_reset_mask_fini(struct amdgpu_device 
*adev)
if (!amdgpu_gpu_recovery)
return;
 
-   if (adev->sdma.num_instances)
-   device_remove_file(adev->dev, &dev_attr_sdma_reset_mask);
+   if (adev->dev->kobj.sd) {
+   if (adev->sdma.num_instances)
+   device_remove_file(adev->dev, 
&dev_attr_sdma_reset_mask);
+   }
 }
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
index 60e19052a1e2..ed9c795e7b35 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
@@ -1310,6 +1310,8 @@ int amdgpu_vcn_sysfs_reset_mask_init(struct amdgpu_device 
*adev)
 
 void amdgpu_vcn_sysfs_reset_mask_fini(struct amdgpu_device *adev)
 {
-   if (adev->vcn.num_vcn_inst)
-   device_remove_file(adev->dev, &dev_attr_vcn_reset_mask);
+   if (adev->dev->kobj.sd) {
+   if (adev->vcn.num_vcn_inst)
+   device_remove_file(adev->dev, &dev_attr_vcn_reset_mask);
+   }
 }
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vpe.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vpe.c
index 02bda187f982..dc96e81235df 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vpe.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vpe.c
@@ -904,8 +904,10 @@ int amdgpu_vpe_sysfs_reset_mask_init(struct amdgpu_device 
*adev)
 
 void amdgpu_vpe_sysfs_reset_mask_fini(struct amdgpu_device *adev)
 {
-   if (adev->vpe.num_instances)
-   device_remove_file(adev->dev, &dev_attr_vpe_reset_mask);
+   if (adev->dev->kobj.sd) {
+   if (adev->vpe.num_instances)
+   device_remove_file(adev->dev, &dev_attr_vpe_reset_mask);
+   }
 }
 
 static const struct amdgpu_ring_funcs vpe_ring_funcs = {
diff --git a/drivers/gpu/drm/amd/amdgpu/df_v3_6.c 
b/driv

[PATCH 2/3] Revert "drm/amdgpu: fix warning when removing sysfs"

2024-11-17 Thread jesse.zh...@amd.com
From: "jesse.zh...@amd.com" 

This reverts commit 330d97e9b14e0c85cc8b63e0092e4abcb9ce99c8.
The dev->unplugged flag is also set to true when the driver is merely
unloaded via amdgpu_exit, without the device actually being unplugged;
that will cause a new issue.

Signed-off-by: Jesse Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 12 +++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.c| 10 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_preempt_mgr.c |  8 ++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c| 10 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c |  9 ++---
 drivers/gpu/drm/amd/amdgpu/amdgpu_vpe.c |  9 ++---
 drivers/gpu/drm/amd/amdgpu/df_v3_6.c|  8 ++--
 7 files changed, 15 insertions(+), 51 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
index c65feb97167d..3c89c74d67e0 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
@@ -25,7 +25,6 @@
 
 #include 
 #include 
-#include 
 
 #include "amdgpu.h"
 #include "amdgpu_gfx.h"
@@ -1779,14 +1778,9 @@ int amdgpu_gfx_sysfs_init(struct amdgpu_device *adev)
 
 void amdgpu_gfx_sysfs_fini(struct amdgpu_device *adev)
 {
-   int idx;
-
-   if (drm_dev_enter(adev_to_drm(adev), &idx)) {
-   amdgpu_gfx_sysfs_xcp_fini(adev);
-   amdgpu_gfx_sysfs_isolation_shader_fini(adev);
-   amdgpu_gfx_sysfs_reset_mask_fini(adev);
-   drm_dev_exit(idx);
-   }
+   amdgpu_gfx_sysfs_xcp_fini(adev);
+   amdgpu_gfx_sysfs_isolation_shader_fini(adev);
+   amdgpu_gfx_sysfs_reset_mask_fini(adev);
 }
 
 int amdgpu_gfx_cleaner_shader_sw_init(struct amdgpu_device *adev,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.c
index bf4dbceb18e1..43ea76ebbad8 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.c
@@ -24,7 +24,6 @@
  *
  */
 
-#include 
 #include "amdgpu.h"
 #include "amdgpu_jpeg.h"
 #include "amdgpu_pm.h"
@@ -448,11 +447,6 @@ int amdgpu_jpeg_sysfs_reset_mask_init(struct amdgpu_device 
*adev)
 
 void amdgpu_jpeg_sysfs_reset_mask_fini(struct amdgpu_device *adev)
 {
-   int idx;
-
-   if (drm_dev_enter(adev_to_drm(adev), &idx)) {
-   if (adev->jpeg.num_jpeg_inst)
-   device_remove_file(adev->dev, 
&dev_attr_jpeg_reset_mask);
-   drm_dev_exit(idx);
-   }
+   if (adev->jpeg.num_jpeg_inst)
+   device_remove_file(adev->dev, &dev_attr_jpeg_reset_mask);
 }
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_preempt_mgr.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_preempt_mgr.c
index 33a714ddfbbc..e8adfd0a570a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_preempt_mgr.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_preempt_mgr.c
@@ -23,7 +23,6 @@
  * Authors: Christian König, Felix Kuehling
  */
 
-#include 
 #include "amdgpu.h"
 
 /**
@@ -130,7 +129,7 @@ int amdgpu_preempt_mgr_init(struct amdgpu_device *adev)
 void amdgpu_preempt_mgr_fini(struct amdgpu_device *adev)
 {
struct ttm_resource_manager *man = &adev->mman.preempt_mgr;
-   int idx, ret;
+   int ret;
 
ttm_resource_manager_set_used(man, false);
 
@@ -138,10 +137,7 @@ void amdgpu_preempt_mgr_fini(struct amdgpu_device *adev)
if (ret)
return;
 
-   if (!drm_dev_enter(adev_to_drm(adev), &idx)) {
-   device_remove_file(adev->dev, &dev_attr_mem_info_preempt_used);
-   drm_dev_exit(idx);
-   }
+   device_remove_file(adev->dev, &dev_attr_mem_info_preempt_used);
 
ttm_resource_manager_cleanup(man);
ttm_set_driver_manager(&adev->mman.bdev, AMDGPU_PL_PREEMPT, NULL);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
index 24e9daacaabb..8c89b69edc20 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
@@ -21,7 +21,6 @@
  *
  */
 
-#include 
 #include 
 #include "amdgpu.h"
 #include "amdgpu_sdma.h"
@@ -449,14 +448,9 @@ int amdgpu_sdma_sysfs_reset_mask_init(struct amdgpu_device 
*adev)
 
 void amdgpu_sdma_sysfs_reset_mask_fini(struct amdgpu_device *adev)
 {
-   int idx;
-
if (!amdgpu_gpu_recovery)
return;
 
-   if (drm_dev_enter(adev_to_drm(adev), &idx)) {
-   if (adev->sdma.num_instances)
-   device_remove_file(adev->dev, 
&dev_attr_sdma_reset_mask);
-   drm_dev_exit(idx);
-   }
+   if (adev->sdma.num_instances)
+   device_remove_file(adev->dev, &dev_attr_sdma_reset_mask);
 }
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
index 25f490ad3a85..60e19052a1e2 100

[PATCH V4] drm/amdkfd: pause autosuspend when creating pdd

2024-12-05 Thread jesse.zh...@amd.com
When using MES creating a pdd will require talking to the GPU to
setup the relevant context. The code here forgot to wake up the GPU
in case it was in suspend, this causes KVM to EFAULT for passthrough
GPU for example. This issue can be masked if the GPU was woken up by
other things (e.g. opening the KMS node) first and have not yet gone to sleep.
Fixes: cc009e613de6 ("drm/amdkfd: Add KFD support for soc21 v3")

v4: do the allocation of proc_ctx_bo in a lazy fashion
when the first queue is created in a process (Felix)

Signed-off-by: Jesse Zhang 
Reviewed-by: Yunxiang Li 
---
 .../drm/amd/amdkfd/kfd_device_queue_manager.c | 15 
 drivers/gpu/drm/amd/amdkfd/kfd_process.c  | 23 ++-
 2 files changed, 17 insertions(+), 21 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index c79fe9069e22..16b5daaa272f 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -207,6 +207,21 @@ static int add_queue_mes(struct device_queue_manager *dqm, 
struct queue *q,
if (!down_read_trylock(&adev->reset_domain->sem))
return -EIO;
 
+   if (!pdd->proc_ctx_cpu_ptr) {
+   r = amdgpu_amdkfd_alloc_gtt_mem(adev,
+   AMDGPU_MES_PROC_CTX_SIZE,
+   &pdd->proc_ctx_bo,
+   &pdd->proc_ctx_gpu_addr,
+   &pdd->proc_ctx_cpu_ptr,
+   false);
+   if (r) {
+   dev_err(adev->dev,
+   "failed to allocate process context bo\n");
+   return r;
+   }
+   memset(pdd->proc_ctx_cpu_ptr, 0, AMDGPU_MES_PROC_CTX_SIZE);
+   }
+
memset(&queue_input, 0x0, sizeof(struct mes_add_queue_input));
queue_input.process_id = qpd->pqm->process->pasid;
queue_input.page_table_base_addr =  qpd->page_table_base;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
index 785d68b76f69..0976b5b0e8e8 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
@@ -1076,7 +1076,8 @@ static void kfd_process_destroy_pdds(struct kfd_process 
*p)
 
kfd_free_process_doorbells(pdd->dev->kfd, pdd);
 
-   if (pdd->dev->kfd->shared_resources.enable_mes)
+   if (pdd->dev->kfd->shared_resources.enable_mes &&
+   pdd->proc_ctx_cpu_ptr)
amdgpu_amdkfd_free_gtt_mem(pdd->dev->adev,
   &pdd->proc_ctx_bo);
/*
@@ -1608,7 +1609,6 @@ struct kfd_process_device 
*kfd_create_process_device_data(struct kfd_node *dev,
struct kfd_process *p)
 {
struct kfd_process_device *pdd = NULL;
-   int retval = 0;
 
if (WARN_ON_ONCE(p->n_pdds >= MAX_GPU_INSTANCE))
return NULL;
@@ -1632,21 +1632,6 @@ struct kfd_process_device 
*kfd_create_process_device_data(struct kfd_node *dev,
pdd->user_gpu_id = dev->id;
atomic64_set(&pdd->evict_duration_counter, 0);
 
-   if (dev->kfd->shared_resources.enable_mes) {
-   retval = amdgpu_amdkfd_alloc_gtt_mem(dev->adev,
-   AMDGPU_MES_PROC_CTX_SIZE,
-   &pdd->proc_ctx_bo,
-   &pdd->proc_ctx_gpu_addr,
-   &pdd->proc_ctx_cpu_ptr,
-   false);
-   if (retval) {
-   dev_err(dev->adev->dev,
-   "failed to allocate process context bo\n");
-   goto err_free_pdd;
-   }
-   memset(pdd->proc_ctx_cpu_ptr, 0, AMDGPU_MES_PROC_CTX_SIZE);
-   }
-
p->pdds[p->n_pdds++] = pdd;
if (kfd_dbg_is_per_vmid_supported(pdd->dev))
pdd->spi_dbg_override = pdd->dev->kfd2kgd->disable_debug_trap(
@@ -1658,10 +1643,6 @@ struct kfd_process_device 
*kfd_create_process_device_data(struct kfd_node *dev,
idr_init(&pdd->alloc_idr);
 
return pdd;
-
-err_free_pdd:
-   kfree(pdd);
-   return NULL;
 }
 
 /**
-- 
2.25.1



[PATCH 2/2] drm/amdgpu/gfx12: implement kgq reset via mmio

2025-01-05 Thread jesse.zh...@amd.com
replace MES kgq reset with MMIO.

Signed-off-by: Jesse Zhang 
---
 drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c
index 69941442f00b..ba2ab9296eb4 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c
@@ -5254,9 +5254,9 @@ static int gfx_v12_0_reset_kgq(struct amdgpu_ring *ring, 
unsigned int vmid)
if (amdgpu_sriov_vf(adev))
return -EINVAL;
 
-   r = amdgpu_mes_reset_legacy_queue(ring->adev, ring, vmid, false);
+   r = amdgpu_mes_reset_legacy_queue(ring->adev, ring, vmid, true);
if (r) {
-   dev_err(adev->dev, "reset via MES failed %d\n", r);
+   dev_err(adev->dev, "reset via MMIO failed %d\n", r);
return r;
}
 
-- 
2.25.1



[PATCH 1/2] drm/amdgpu: enable gfx12 queue reset flag

2025-01-05 Thread jesse.zh...@amd.com
Enable the kgq and kcq queue reset flags

Signed-off-by: Jesse Zhang 
---
 drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c | 10 +-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c
index 3aa34c4d..69941442f00b 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c
@@ -1477,11 +1477,19 @@ static int gfx_v12_0_sw_init(struct amdgpu_ip_block 
*ip_block)
}
}
 
-   /* TODO: Add queue reset mask when FW fully supports it */
adev->gfx.gfx_supported_reset =
amdgpu_get_soft_full_reset_mask(&adev->gfx.gfx_ring[0]);
adev->gfx.compute_supported_reset =
amdgpu_get_soft_full_reset_mask(&adev->gfx.compute_ring[0]);
+   switch (amdgpu_ip_version(adev, GC_HWIP, 0)) {
+   case IP_VERSION(12, 0, 0):
+   case IP_VERSION(12, 0, 1):
+   if ((adev->gfx.me_fw_version >= 2660) &&
+   (adev->gfx.mec_fw_version >= 2920)) {
+   adev->gfx.compute_supported_reset |= 
AMDGPU_RESET_TYPE_PER_QUEUE;
+   adev->gfx.gfx_supported_reset |= 
AMDGPU_RESET_TYPE_PER_QUEUE;
+   }
+   }
 
if (!adev->enable_mes_kiq) {
r = amdgpu_gfx_kiq_init(adev, GFX12_MEC_HPD_SIZE, 0);
-- 
2.25.1



[PATCH 1/4] drm/amdgpu/kfd: Add shared SDMA reset functionality with callback support

2025-02-09 Thread jesse.zh...@amd.com
From: "jesse.zh...@amd.com" 

This patch introduces shared SDMA reset functionality between AMDGPU and KFD.
The implementation includes the following key changes:

1. Added `amdgpu_sdma_reset_queue`:
   - Resets a specific SDMA queue by instance ID.
   - Invokes registered pre-reset and post-reset callbacks to allow KFD and 
AMDGPU
 to save/restore their state during the reset process.

2. Added `amdgpu_set_on_reset_callbacks`:
   - Allows KFD and AMDGPU to register callback functions for pre-reset and
 post-reset operations.
   - Callbacks are stored in a global linked list and invoked in the correct 
order
 during SDMA reset.

This patch ensures that both AMDGPU and KFD can handle SDMA reset events
gracefully, with proper state saving and restoration. It also provides a 
flexible
callback mechanism for future extensions.

v2: fix CamelCase and put the SDMA helper into amdgpu_sdma.c (Alex)
v3: rename the `amdgpu_register_on_reset_callbacks` function to
  `amdgpu_sdma_register_on_reset_callbacks`
move global reset_callback_list to struct amdgpu_sdma (Alex)

Suggested-by: Alex Deucher 
Suggested-by: Jiadong Zhu 
Signed-off-by: Jesse Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c | 72 
 drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h | 11 
 drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c |  2 +-
 3 files changed, 84 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
index 174badca27e7..19c8be7d72e2 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
@@ -460,3 +460,75 @@ void amdgpu_sdma_sysfs_reset_mask_fini(struct 
amdgpu_device *adev)
device_remove_file(adev->dev, 
&dev_attr_sdma_reset_mask);
}
 }
+
+/**
+ * amdgpu_sdma_register_on_reset_callbacks - Register SDMA reset callbacks
+ * @funcs: Pointer to the callback structure containing pre_reset and 
post_reset functions
+ *
+ * This function allows KFD and AMDGPU to register their own callbacks for 
handling
+ * pre-reset and post-reset operations. The callbacks are added to a global 
list.
+ */
+void amdgpu_sdma_register_on_reset_callbacks(struct amdgpu_device *adev, 
struct sdma_on_reset_funcs *funcs)
+{
+   if (!funcs)
+   return;
+
+   /* Initialize the list node in the callback structure */
+   INIT_LIST_HEAD(&funcs->list);
+
+   /* Add the callback structure to the global list */
+   list_add_tail(&funcs->list, &adev->sdma.reset_callback_list);
+}
+
+/**
+ * amdgpu_sdma_reset_instance - Reset a specific SDMA instance
+ * @adev: Pointer to the AMDGPU device
+ * @instance_id: ID of the SDMA engine instance to reset
+ *
+ * This function performs the following steps:
+ * 1. Calls all registered pre_reset callbacks to allow KFD and AMDGPU to save 
their state.
+ * 2. Resets the specified SDMA engine instance.
+ * 3. Calls all registered post_reset callbacks to allow KFD and AMDGPU to 
restore their state.
+ *
+ * Returns: 0 on success, or a negative error code on failure.
+ */
+int amdgpu_sdma_reset_instance(struct amdgpu_device *adev, uint32_t 
instance_id)
+{
+   struct sdma_on_reset_funcs *funcs;
+   int ret;
+
+   /* Invoke all registered pre_reset callbacks */
+   list_for_each_entry(funcs, &adev->sdma.reset_callback_list, list) {
+   if (funcs->pre_reset) {
+   ret = funcs->pre_reset(adev, instance_id);
+   if (ret) {
+   dev_err(adev->dev,
+   "beforeReset callback failed for instance %u: 
%d\n",
+   instance_id, ret);
+   return ret;
+   }
+   }
+   }
+
+   /* Perform the SDMA reset for the specified instance */
+   ret = amdgpu_dpm_reset_sdma(adev, 1 << instance_id);
+   if (ret) {
+   dev_err(adev->dev, "Failed to reset SDMA instance %u\n", 
instance_id);
+   return ret;
+   }
+
+   /* Invoke all registered post_reset callbacks */
+   list_for_each_entry(funcs, &adev->sdma.reset_callback_list, list) {
+   if (funcs->post_reset) {
+   ret = funcs->post_reset(adev, instance_id);
+   if (ret) {
+   dev_err(adev->dev,
+   "afterReset callback failed for instance %u: 
%d\n",
+   instance_id, ret);
+   return ret;
+   }
+   }
+   }
+
+   return 0;
+}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h
index 5f60736051d1..fbb8b04ef2cb 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdm

[PATCH 4/4] drm/amdgpu: Improve SDMA reset logic with guilty queue tracking

2025-02-09 Thread jesse.zh...@amd.com
From: "jesse.zh...@amd.com" 

This commit introduces several improvements to the SDMA reset logic:

1. Added `cached_rptr` to the `amdgpu_ring` structure to store the read pointer
   before a reset, ensuring proper state restoration after reset.

2. Introduced `gfx_guilty` and `page_guilty` flags in the `amdgpu_sdma` 
structure
   to track which queue (GFX or PAGE) caused a timeout or error.

3. Replaced the `caller` parameter with a `guilty` boolean in the reset and 
resume
   functions to simplify the logic and handle resets based on the guilty state.

4. Added a helper function `sdma_v4_4_2_is_queue_selected` to check the
   `SDMA*_*_CONTEXT_STATUS.SELECTED` register and determine if a queue is 
guilty.

v2:
   1. Replace the caller with a guilty bool.
   If the queue is the guilty one, set the rptr and wptr to the saved wptr
   value; otherwise, set the rptr and wptr to the saved rptr value. (Alex)
   2. Cache the rptr before the reset. (Alex)

v3: add a new ring callback, is_guilty(), which will get called to check if
the ring in amdgpu_job_timedout() is actually the guilty ring. If it's not,
we can skip error handling and goto exit. (Alex)

v4: cache the rptr for page ring

Suggested-by: Alex Deucher 
Suggested-by: Jiadong Zhu 
Signed-off-by: Jesse Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_job.c  | 10 +++
 drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c |  2 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h |  3 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c |  6 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h |  3 +
 drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c | 96 
 6 files changed, 106 insertions(+), 14 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
index 100f04475943..ce3e7a9d6688 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
@@ -102,6 +102,16 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct 
drm_sched_job *s_job)
return DRM_GPU_SCHED_STAT_ENODEV;
}
 
+   /* Check if the ring is actually guilty of causing the timeout.
+* If not, skip error handling and fence completion.
+*/
+   if (amdgpu_gpu_recovery && ring->funcs->is_guilty) {
+   if (!ring->funcs->is_guilty(ring)) {
+   dev_err(adev->dev, "ring %s timeout, but not guilty\n",
+   s_job->sched->name);
+   goto exit;
+   }
+   }
/*
 * Do the coredump immediately after a job timeout to get a very
 * close dump/snapshot/representation of GPU's current error status
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
index a6e28fe3f8d6..20cd21df38ba 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
@@ -342,6 +342,8 @@ int amdgpu_ring_init(struct amdgpu_device *adev, struct 
amdgpu_ring *ring,
ring->buf_mask = (ring->ring_size / 4) - 1;
ring->ptr_mask = ring->funcs->support_64bit_ptrs ?
0x : ring->buf_mask;
+   /*  Initialize cached_rptr to 0 */
+   ring->cached_rptr = 0;
 
/* Allocate ring buffer */
if (ring->is_mes_queue) {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
index 04af26536f97..182aa535d395 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
@@ -237,6 +237,7 @@ struct amdgpu_ring_funcs {
void (*patch_de)(struct amdgpu_ring *ring, unsigned offset);
int (*reset)(struct amdgpu_ring *ring, unsigned int vmid);
void (*emit_cleaner_shader)(struct amdgpu_ring *ring);
+   bool (*is_guilty)(struct amdgpu_ring *ring);
 };
 
 struct amdgpu_ring {
@@ -306,6 +307,8 @@ struct amdgpu_ring {
 
boolis_sw_ring;
unsigned intentry_index;
+   /* store the cached rptr to restore after reset */
+   uint64_t cached_rptr;
 
 };
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
index 8864a9d7455b..02d3685d10fc 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
@@ -474,6 +474,10 @@ void amdgpu_sdma_register_on_reset_callbacks(struct 
amdgpu_device *adev, struct
if (!funcs)
return;
 
+   /* Ensure the reset_callback_list is initialized */
+   if (!adev->sdma.reset_callback_list.next) {
+   INIT_LIST_HEAD(&adev->sdma.reset_callback_list);
+   }
/* Initialize the list node in the callback structure */
INIT_LIST_HEAD(&funcs->list);
 
@@ -513,7 +517,7 @@ int amdgpu_sdma_reset_instance(struct amdgpu_device *adev, 
uint32_t instance_id,
*/
if (!amdgpu_ring_sched_ready(gfx_ring))

[PATCH 3/4] drm/amdgpu: Add common lock and reset caller parameter for SDMA reset synchronization

2025-02-09 Thread jesse.zh...@amd.com
From: "jesse.zh...@amd.com" 

This commit introduces a caller parameter to the amdgpu_sdma_reset_instance 
function to differentiate
between reset requests originating from the KGD and KFD.
This change ensures proper synchronization between KGD and KFD during SDMA 
resets.

If the caller is KFD, the function now acquires and releases the scheduler lock 
(ring->sched.job_list_lock)
to protect the SDMA queue during the reset.

These changes prevent race conditions and ensure safe SDMA reset operations
when initiated by KFD, improving system stability and reliability.

V2: replace the ring_lock with the existed the scheduler
locks for the queues (ring->sched) on the sdma engine.(Alex)

v3: call drm_sched_wqueue_stop() rather than job_list_lock.
If a GPU ring reset was already initiated for one ring at 
amdgpu_job_timedout,
skip resetting that ring and call drm_sched_wqueue_stop()
for the other rings (Alex)

   replace the common lock (sdma_reset_lock) with the DQM lock to
   resolve reset races between the two driver sections during KFD
   eviction. (Jon)

   Rename the caller to reset_src and
   change AMDGPU_RESET_SRC_SDMA_KGD/KFD to AMDGPU_RESET_SRC_SDMA_HWS/RING. (Jon)
v4: restart the wqueue if the reset was successful,
or fall back to a full adapter reset. (Alex)

   move definition of reset source to enumeration AMDGPU_RESET_SRCS, and
   check reset src in amdgpu_sdma_reset_instance (Jon)

Suggested-by: Alex Deucher 
Suggested-by: Jiadong Zhu 
Suggested-by: Jonathan Kim 
Signed-off-by: Jesse Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h |  2 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c  | 54 +++
 drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h  |  6 +--
 drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c  |  8 ++--
 4 files changed, 56 insertions(+), 14 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
index 4d9b9701139b..5b86e12ff9fe 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
@@ -43,6 +43,8 @@ enum AMDGPU_RESET_SRCS {
AMDGPU_RESET_SRC_MES,
AMDGPU_RESET_SRC_HWS,
AMDGPU_RESET_SRC_USER,
+   AMDGPU_RESET_SRC_SDMA_RING,
+   AMDGPU_RESET_SRC_SDMA_HWS,
 };
 
 struct amdgpu_reset_context {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
index 19c8be7d72e2..8864a9d7455b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
@@ -25,6 +25,7 @@
 #include "amdgpu.h"
 #include "amdgpu_sdma.h"
 #include "amdgpu_ras.h"
+#include "amdgpu_reset.h"
 
 #define AMDGPU_CSA_SDMA_SIZE 64
 /* SDMA CSA reside in the 3rd page of CSA */
@@ -484,6 +485,7 @@ void amdgpu_sdma_register_on_reset_callbacks(struct 
amdgpu_device *adev, struct
  * amdgpu_sdma_reset_instance - Reset a specific SDMA instance
  * @adev: Pointer to the AMDGPU device
  * @instance_id: ID of the SDMA engine instance to reset
+ * @src: The source of reset function (KGD or KFD)
  *
  * This function performs the following steps:
  * 1. Calls all registered pre_reset callbacks to allow KFD and AMDGPU to save 
their state.
@@ -492,20 +494,42 @@ void amdgpu_sdma_register_on_reset_callbacks(struct 
amdgpu_device *adev, struct
  *
  * Returns: 0 on success, or a negative error code on failure.
  */
-int amdgpu_sdma_reset_instance(struct amdgpu_device *adev, uint32_t 
instance_id)
+int amdgpu_sdma_reset_instance(struct amdgpu_device *adev, uint32_t 
instance_id, int src)
 {
struct sdma_on_reset_funcs *funcs;
-   int ret;
+   int ret = 0;
+   struct amdgpu_sdma_instance *sdma_instance = 
&adev->sdma.instance[instance_id];;
+   struct amdgpu_ring *gfx_ring = &sdma_instance->ring;
+   struct amdgpu_ring *page_ring = &sdma_instance->page;
+   bool gfx_sched_stopped = false, page_sched_stopped = false;
+
+   /* Check if the reset source is valid for SDMA ring reset */
+   if (src != AMDGPU_RESET_SRC_SDMA_RING && src != AMDGPU_RESET_SRC_HWS)
+   return -EINVAL;
+
+   /* Stop the scheduler's work queue for the GFX and page rings if they 
are running.
+   * This ensures that no new tasks are submitted to the queues while
+   * the reset is in progress.
+   */
+   if (!amdgpu_ring_sched_ready(gfx_ring)) {
+   drm_sched_wqueue_stop(&gfx_ring->sched);
+   gfx_sched_stopped = true;;
+   }
+
+   if (adev->sdma.has_page_queue && !amdgpu_ring_sched_ready(page_ring)) {
+   drm_sched_wqueue_stop(&page_ring->sched);
+   page_sched_stopped = true;
+   }
 
/* Invoke all registered pre_reset callbacks */
list_for_each_entry(funcs, &adev->sdma.reset_callback_list, list) {
if (funcs->pre_reset) {
-  

[PATCH 2/4] drm/amdgpu/sdma: Refactor SDMA reset functionality and add callback support

2025-02-09 Thread jesse.zh...@amd.com
From: "jesse.zh...@amd.com" 

This patch refactors the SDMA reset functionality in the `sdma_v4_4_2` driver
to improve modularity and support shared usage between AMDGPU and KFD. The
changes include:

1. **Refactored SDMA Reset Logic**:
   - Split the `sdma_v4_4_2_reset_queue` function into two separate functions:
 - `sdma_v4_4_2_stop_queue`: Stops the SDMA queue before reset.
 - `sdma_v4_4_2_restore_queue`: Restores the SDMA queue after reset.
   - These functions are now used as callbacks for the shared reset mechanism.

2. **Added Callback Support**:
   - Introduced a new structure `sdma_v4_4_2_reset_funcs` to hold the stop and
 restore callbacks.
   - Added `sdma_v4_4_2_set_reset_funcs` to register these callbacks with the
 shared reset mechanism using `amdgpu_set_on_reset_callbacks`.

3. **Fixed Reset Queue Function**:
   - Modified `sdma_v4_4_2_reset_queue` to use the shared 
`amdgpu_sdma_reset_queue`
 function, ensuring consistency across the driver.

This patch ensures that SDMA reset functionality is more modular, reusable, and
aligned with the shared reset mechanism between AMDGPU and KFD.

Suggested-by: Jiadong Zhu 
Signed-off-by: Jesse Zhang 
---
 drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c | 32 +---
 1 file changed, 28 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c 
b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
index 64c163dd708f..3e60456b0db0 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
@@ -105,6 +105,7 @@ static void sdma_v4_4_2_set_buffer_funcs(struct 
amdgpu_device *adev);
 static void sdma_v4_4_2_set_vm_pte_funcs(struct amdgpu_device *adev);
 static void sdma_v4_4_2_set_irq_funcs(struct amdgpu_device *adev);
 static void sdma_v4_4_2_set_ras_funcs(struct amdgpu_device *adev);
+static void sdma_v4_4_2_set_reset_funcs(struct amdgpu_device *adev);
 
 static u32 sdma_v4_4_2_get_reg_offset(struct amdgpu_device *adev,
u32 instance, u32 offset)
@@ -1330,6 +1331,7 @@ static int sdma_v4_4_2_early_init(struct amdgpu_ip_block 
*ip_block)
sdma_v4_4_2_set_vm_pte_funcs(adev);
sdma_v4_4_2_set_irq_funcs(adev);
sdma_v4_4_2_set_ras_funcs(adev);
+   sdma_v4_4_2_set_reset_funcs(adev);
 
return 0;
 }
@@ -1605,8 +1607,14 @@ static int sdma_v4_4_2_soft_reset(struct amdgpu_ip_block 
*ip_block)
 static int sdma_v4_4_2_reset_queue(struct amdgpu_ring *ring, unsigned int vmid)
 {
struct amdgpu_device *adev = ring->adev;
-   int i, r;
+   u32 id = GET_INST(SDMA0, ring->me);
+   return amdgpu_sdma_reset_instance(adev, id);
+}
+
+static int sdma_v4_4_2_stop_queue(struct amdgpu_device *adev, uint32_t 
instance_id)
+{
u32 inst_mask;
+   struct amdgpu_ring *ring = &adev->sdma.instance[instance_id].ring;
 
if (amdgpu_sriov_vf(adev))
return -EINVAL;
@@ -1617,10 +1625,16 @@ static int sdma_v4_4_2_reset_queue(struct amdgpu_ring 
*ring, unsigned int vmid)
if (adev->sdma.has_page_queue)
sdma_v4_4_2_inst_page_stop(adev, inst_mask);
 
-   r = amdgpu_dpm_reset_sdma(adev, 1 << GET_INST(SDMA0, ring->me));
-   if (r)
-   return r;
+   return 0;
+}
 
+static int sdma_v4_4_2_restore_queue(struct amdgpu_device *adev, uint32_t 
instance_id)
+{
+   int i;
+   u32 inst_mask;
+   struct amdgpu_ring *ring = &adev->sdma.instance[instance_id].ring;
+
+   inst_mask = 1 << ring->me;
udelay(50);
 
for (i = 0; i < adev->usec_timeout; i++) {
@@ -1638,6 +1652,16 @@ static int sdma_v4_4_2_reset_queue(struct amdgpu_ring 
*ring, unsigned int vmid)
return sdma_v4_4_2_inst_start(adev, inst_mask, true);
 }
 
+static struct sdma_on_reset_funcs sdma_v4_4_2_reset_funcs = {
+   .pre_reset = sdma_v4_4_2_stop_queue,
+   .post_reset = sdma_v4_4_2_restore_queue,
+};
+
+static void sdma_v4_4_2_set_reset_funcs(struct amdgpu_device *adev)
+{
+   amdgpu_sdma_register_on_reset_callbacks(adev, &sdma_v4_4_2_reset_funcs);
+}
+
 static int sdma_v4_4_2_set_trap_irq_state(struct amdgpu_device *adev,
struct amdgpu_irq_src *source,
unsigned type,
-- 
2.25.1



[PATCH v5 4/4] drm/amdgpu: Improve SDMA reset logic with guilty queue tracking

2025-02-10 Thread jesse.zh...@amd.com
From: "jesse.zh...@amd.com" 

This commit introduces several improvements to the SDMA reset logic:

1. Added `cached_rptr` to the `amdgpu_ring` structure to store the read pointer
   before a reset, ensuring proper state restoration after reset.

2. Introduced `gfx_guilty` and `page_guilty` flags in the `amdgpu_sdma` 
structure
   to track which queue (GFX or PAGE) caused a timeout or error.

3. Replaced the `caller` parameter with a `guilty` boolean in the reset and 
resume
   functions to simplify the logic and handle resets based on the guilty state.

4. Added a helper function `sdma_v4_4_2_is_queue_selected` to check the
   `SDMA*_*_CONTEXT_STATUS.SELECTED` register and determine if a queue is 
guilty.

v2:
   1. Replace the caller with a guilty bool.
   If the queue is the guilty one, set the rptr and wptr to the saved wptr
   value; otherwise, set the rptr and wptr to the saved rptr value. (Alex)
   2. Cache the rptr before the reset. (Alex)

v3: add a new ring callback, is_guilty(), which will get called to check if
the ring in amdgpu_job_timedout() is actually the guilty ring. If it's not,
we can skip error handling and goto exit. (Alex)

v4: cache the rptr for page ring

v5: update the register addresses to correctly use the page ring registers
  (regSDMA_PAGE_RB_RPTR) in page resume.

Suggested-by: Alex Deucher 
Suggested-by: Jiadong Zhu 
Signed-off-by: Jesse Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_job.c  | 10 +++
 drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c |  2 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h |  3 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c |  6 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h |  3 +
 drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c | 96 
 6 files changed, 106 insertions(+), 14 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
index 100f04475943..ce3e7a9d6688 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
@@ -102,6 +102,16 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct 
drm_sched_job *s_job)
return DRM_GPU_SCHED_STAT_ENODEV;
}
 
+   /* Check if the ring is actually guilty of causing the timeout.
+* If not, skip error handling and fence completion.
+*/
+   if (amdgpu_gpu_recovery && ring->funcs->is_guilty) {
+   if (!ring->funcs->is_guilty(ring)) {
+   dev_err(adev->dev, "ring %s timeout, but not guilty\n",
+   s_job->sched->name);
+   goto exit;
+   }
+   }
/*
 * Do the coredump immediately after a job timeout to get a very
 * close dump/snapshot/representation of GPU's current error status
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
index a6e28fe3f8d6..20cd21df38ba 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
@@ -342,6 +342,8 @@ int amdgpu_ring_init(struct amdgpu_device *adev, struct 
amdgpu_ring *ring,
ring->buf_mask = (ring->ring_size / 4) - 1;
ring->ptr_mask = ring->funcs->support_64bit_ptrs ?
0x : ring->buf_mask;
+   /*  Initialize cached_rptr to 0 */
+   ring->cached_rptr = 0;
 
/* Allocate ring buffer */
if (ring->is_mes_queue) {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
index 04af26536f97..182aa535d395 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
@@ -237,6 +237,7 @@ struct amdgpu_ring_funcs {
void (*patch_de)(struct amdgpu_ring *ring, unsigned offset);
int (*reset)(struct amdgpu_ring *ring, unsigned int vmid);
void (*emit_cleaner_shader)(struct amdgpu_ring *ring);
+   bool (*is_guilty)(struct amdgpu_ring *ring);
 };
 
 struct amdgpu_ring {
@@ -306,6 +307,8 @@ struct amdgpu_ring {
 
boolis_sw_ring;
unsigned intentry_index;
+   /* store the cached rptr to restore after reset */
+   uint64_t cached_rptr;
 
 };
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
index 8864a9d7455b..02d3685d10fc 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
@@ -474,6 +474,10 @@ void amdgpu_sdma_register_on_reset_callbacks(struct 
amdgpu_device *adev, struct
if (!funcs)
return;
 
+   /* Ensure the reset_callback_list is initialized */
+   if (!adev->sdma.reset_callback_list.next) {
+   INIT_LIST_HEAD(&adev->sdma.reset_callback_list);
+   }
/* Initialize the list node in the callback structure */
INIT_LIST_HEAD(&funcs->list);
 
@@ -513,7 +517,7 @@ int

[PATCH 2/4 v6] drm/amdgpu/sdma: Refactor SDMA reset functionality and add callback support

2025-02-10 Thread jesse.zh...@amd.com
From: "jesse.zh...@amd.com" 

This patch refactors the SDMA reset functionality in the `sdma_v4_4_2` driver
to improve modularity and support shared usage between AMDGPU and KFD. The
changes include:

1. **Refactored SDMA Reset Logic**:
   - Split the `sdma_v4_4_2_reset_queue` function into two separate functions:
 - `sdma_v4_4_2_stop_queue`: Stops the SDMA queue before reset.
 - `sdma_v4_4_2_restore_queue`: Restores the SDMA queue after reset.
   - These functions are now used as callbacks for the shared reset mechanism.

2. **Added Callback Support**:
   - Introduced a new structure `sdma_v4_4_2_reset_funcs` to hold the stop and
 restore callbacks.
   - Added `sdma_v4_4_2_set_reset_funcs` to register these callbacks with the
 shared reset mechanism using `amdgpu_set_on_reset_callbacks`.

3. **Fixed Reset Queue Function**:
   - Modified `sdma_v4_4_2_reset_queue` to use the shared 
`amdgpu_sdma_reset_queue`
 function, ensuring consistency across the driver.

This patch ensures that SDMA reset functionality is more modular, reusable, and
aligned with the shared reset mechanism between AMDGPU and KFD.

Suggested-by: Jiadong Zhu 
Signed-off-by: Jesse Zhang 
---
 drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c | 32 +---
 1 file changed, 28 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c 
b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
index 64c163dd708f..3e60456b0db0 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
@@ -105,6 +105,7 @@ static void sdma_v4_4_2_set_buffer_funcs(struct 
amdgpu_device *adev);
 static void sdma_v4_4_2_set_vm_pte_funcs(struct amdgpu_device *adev);
 static void sdma_v4_4_2_set_irq_funcs(struct amdgpu_device *adev);
 static void sdma_v4_4_2_set_ras_funcs(struct amdgpu_device *adev);
+static void sdma_v4_4_2_set_reset_funcs(struct amdgpu_device *adev);
 
 static u32 sdma_v4_4_2_get_reg_offset(struct amdgpu_device *adev,
u32 instance, u32 offset)
@@ -1330,6 +1331,7 @@ static int sdma_v4_4_2_early_init(struct amdgpu_ip_block 
*ip_block)
sdma_v4_4_2_set_vm_pte_funcs(adev);
sdma_v4_4_2_set_irq_funcs(adev);
sdma_v4_4_2_set_ras_funcs(adev);
+   sdma_v4_4_2_set_reset_funcs(adev);
 
return 0;
 }
@@ -1605,8 +1607,14 @@ static int sdma_v4_4_2_soft_reset(struct amdgpu_ip_block 
*ip_block)
 static int sdma_v4_4_2_reset_queue(struct amdgpu_ring *ring, unsigned int vmid)
 {
struct amdgpu_device *adev = ring->adev;
-   int i, r;
+   u32 id = GET_INST(SDMA0, ring->me);
+   return amdgpu_sdma_reset_instance(adev, id);
+}
+
+static int sdma_v4_4_2_stop_queue(struct amdgpu_device *adev, uint32_t 
instance_id)
+{
u32 inst_mask;
+   struct amdgpu_ring *ring = &adev->sdma.instance[instance_id].ring;
 
if (amdgpu_sriov_vf(adev))
return -EINVAL;
@@ -1617,10 +1625,16 @@ static int sdma_v4_4_2_reset_queue(struct amdgpu_ring 
*ring, unsigned int vmid)
if (adev->sdma.has_page_queue)
sdma_v4_4_2_inst_page_stop(adev, inst_mask);
 
-   r = amdgpu_dpm_reset_sdma(adev, 1 << GET_INST(SDMA0, ring->me));
-   if (r)
-   return r;
+   return 0;
+}
 
+static int sdma_v4_4_2_restore_queue(struct amdgpu_device *adev, uint32_t 
instance_id)
+{
+   int i;
+   u32 inst_mask;
+   struct amdgpu_ring *ring = &adev->sdma.instance[instance_id].ring;
+
+   inst_mask = 1 << ring->me;
udelay(50);
 
for (i = 0; i < adev->usec_timeout; i++) {
@@ -1638,6 +1652,16 @@ static int sdma_v4_4_2_reset_queue(struct amdgpu_ring 
*ring, unsigned int vmid)
return sdma_v4_4_2_inst_start(adev, inst_mask, true);
 }
 
+static struct sdma_on_reset_funcs sdma_v4_4_2_reset_funcs = {
+   .pre_reset = sdma_v4_4_2_stop_queue,
+   .post_reset = sdma_v4_4_2_restore_queue,
+};
+
+static void sdma_v4_4_2_set_reset_funcs(struct amdgpu_device *adev)
+{
+   amdgpu_sdma_register_on_reset_callbacks(adev, &sdma_v4_4_2_reset_funcs);
+}
+
 static int sdma_v4_4_2_set_trap_irq_state(struct amdgpu_device *adev,
struct amdgpu_irq_src *source,
unsigned type,
-- 
2.25.1



[PATCH 1/4 v6] drm/amdgpu/kfd: Add shared SDMA reset functionality with callback support

2025-02-10 Thread jesse.zh...@amd.com
From: "jesse.zh...@amd.com" 

This patch introduces shared SDMA reset functionality between AMDGPU and KFD.
The implementation includes the following key changes:

1. Added `amdgpu_sdma_reset_queue`:
   - Resets a specific SDMA queue by instance ID.
   - Invokes registered pre-reset and post-reset callbacks to allow KFD and 
AMDGPU
 to save/restore their state during the reset process.

2. Added `amdgpu_set_on_reset_callbacks`:
   - Allows KFD and AMDGPU to register callback functions for pre-reset and
 post-reset operations.
   - Callbacks are stored in a global linked list and invoked in the correct 
order
 during SDMA reset.

This patch ensures that both AMDGPU and KFD can handle SDMA reset events
gracefully, with proper state saving and restoration. It also provides a 
flexible
callback mechanism for future extensions.

v2: fix CamelCase and put the SDMA helper into amdgpu_sdma.c (Alex)
v3: rename the `amdgpu_register_on_reset_callbacks` function to
  `amdgpu_sdma_register_on_reset_callbacks`
move global reset_callback_list to struct amdgpu_sdma (Alex)

Suggested-by: Alex Deucher 
Suggested-by: Jiadong Zhu 
Signed-off-by: Jesse Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c | 72 
 drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h | 11 
 drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c |  2 +-
 3 files changed, 84 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
index 174badca27e7..19c8be7d72e2 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
@@ -460,3 +460,75 @@ void amdgpu_sdma_sysfs_reset_mask_fini(struct 
amdgpu_device *adev)
device_remove_file(adev->dev, 
&dev_attr_sdma_reset_mask);
}
 }
+
+/**
+ * amdgpu_sdma_register_on_reset_callbacks - Register SDMA reset callbacks
+ * @funcs: Pointer to the callback structure containing pre_reset and 
post_reset functions
+ *
+ * This function allows KFD and AMDGPU to register their own callbacks for 
handling
+ * pre-reset and post-reset operations. The callbacks are added to a global 
list.
+ */
+void amdgpu_sdma_register_on_reset_callbacks(struct amdgpu_device *adev, 
struct sdma_on_reset_funcs *funcs)
+{
+   if (!funcs)
+   return;
+
+   /* Initialize the list node in the callback structure */
+   INIT_LIST_HEAD(&funcs->list);
+
+   /* Add the callback structure to the global list */
+   list_add_tail(&funcs->list, &adev->sdma.reset_callback_list);
+}
+
+/**
+ * amdgpu_sdma_reset_instance - Reset a specific SDMA instance
+ * @adev: Pointer to the AMDGPU device
+ * @instance_id: ID of the SDMA engine instance to reset
+ *
+ * This function performs the following steps:
+ * 1. Calls all registered pre_reset callbacks to allow KFD and AMDGPU to save 
their state.
+ * 2. Resets the specified SDMA engine instance.
+ * 3. Calls all registered post_reset callbacks to allow KFD and AMDGPU to 
restore their state.
+ *
+ * Returns: 0 on success, or a negative error code on failure.
+ */
+int amdgpu_sdma_reset_instance(struct amdgpu_device *adev, uint32_t 
instance_id)
+{
+   struct sdma_on_reset_funcs *funcs;
+   int ret;
+
+   /* Invoke all registered pre_reset callbacks */
+   list_for_each_entry(funcs, &adev->sdma.reset_callback_list, list) {
+   if (funcs->pre_reset) {
+   ret = funcs->pre_reset(adev, instance_id);
+   if (ret) {
+   dev_err(adev->dev,
+   "beforeReset callback failed for instance %u: 
%d\n",
+   instance_id, ret);
+   return ret;
+   }
+   }
+   }
+
+   /* Perform the SDMA reset for the specified instance */
+   ret = amdgpu_dpm_reset_sdma(adev, 1 << instance_id);
+   if (ret) {
+   dev_err(adev->dev, "Failed to reset SDMA instance %u\n", 
instance_id);
+   return ret;
+   }
+
+   /* Invoke all registered post_reset callbacks */
+   list_for_each_entry(funcs, &adev->sdma.reset_callback_list, list) {
+   if (funcs->post_reset) {
+   ret = funcs->post_reset(adev, instance_id);
+   if (ret) {
+   dev_err(adev->dev,
+   "afterReset callback failed for instance %u: 
%d\n",
+   instance_id, ret);
+   return ret;
+   }
+   }
+   }
+
+   return 0;
+}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h
index 5f60736051d1..fbb8b04ef2cb 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdm

[PATCH 4/4 V6] drm/amdgpu: Improve SDMA reset logic with guilty queue tracking

2025-02-10 Thread jesse.zh...@amd.com
From: "jesse.zh...@amd.com" 

This commit introduces several improvements to the SDMA reset logic:

1. Added `cached_rptr` to the `amdgpu_ring` structure to store the read pointer
   before a reset, ensuring proper state restoration after reset.

2. Introduced `gfx_guilty` and `page_guilty` flags in the `amdgpu_sdma` 
structure
   to track which queue (GFX or PAGE) caused a timeout or error.

3. Replaced the `caller` parameter with a `guilty` boolean in the reset and 
resume
   functions to simplify the logic and handle resets based on the guilty state.

4. Added a helper function `sdma_v4_4_2_is_queue_selected` to check the
   `SDMA*_*_CONTEXT_STATUS.SELECTED` register and determine if a queue is 
guilty.

v2:
   1.replace the caller with a guilty bool.
   If the queue is the guilty one, set the rptr and wptr  to the saved wptr 
value,
   else, set the rptr and wptr to the saved rptr value. (Alex)
   2. cache the rptr before the reset. (Alex)

v3: add a new ring callback, is_guilty(), which will get called to check if
the ring in amdgpu_job_timedout() is actually the guilty ring. If it's not,
we can return goto exit(Alex)

v4: cache the rptr for page ring

v5: update the register addresses to correctly use the page ring registers
  (regSDMA_PAGE_RB_RPTR) in page resume.

v6: Keeping intermediate variables like u64 rwptr simplifies restoring
rptr/wptr. (Lijo)

Suggested-by: Alex Deucher 
Suggested-by: Jiadong Zhu 
Signed-off-by: Jesse Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_job.c  | 10 +++
 drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c |  2 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h |  3 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c |  6 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h |  3 +
 drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c | 96 
 6 files changed, 106 insertions(+), 14 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
index 100f04475943..ce3e7a9d6688 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
@@ -102,6 +102,16 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct 
drm_sched_job *s_job)
return DRM_GPU_SCHED_STAT_ENODEV;
}
 
+   /* Check if the ring is actually guilty of causing the timeout.
+* If not, skip error handling and fence completion.
+*/
+   if (amdgpu_gpu_recovery && ring->funcs->is_guilty) {
+   if (!ring->funcs->is_guilty(ring)) {
+   dev_err(adev->dev, "ring %s timeout, but not guilty\n",
+   s_job->sched->name);
+   goto exit;
+   }
+   }
/*
 * Do the coredump immediately after a job timeout to get a very
 * close dump/snapshot/representation of GPU's current error status
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
index a6e28fe3f8d6..20cd21df38ba 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
@@ -342,6 +342,8 @@ int amdgpu_ring_init(struct amdgpu_device *adev, struct 
amdgpu_ring *ring,
ring->buf_mask = (ring->ring_size / 4) - 1;
ring->ptr_mask = ring->funcs->support_64bit_ptrs ?
0x : ring->buf_mask;
+   /*  Initialize cached_rptr to 0 */
+   ring->cached_rptr = 0;
 
/* Allocate ring buffer */
if (ring->is_mes_queue) {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
index 04af26536f97..182aa535d395 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
@@ -237,6 +237,7 @@ struct amdgpu_ring_funcs {
void (*patch_de)(struct amdgpu_ring *ring, unsigned offset);
int (*reset)(struct amdgpu_ring *ring, unsigned int vmid);
void (*emit_cleaner_shader)(struct amdgpu_ring *ring);
+   bool (*is_guilty)(struct amdgpu_ring *ring);
 };
 
 struct amdgpu_ring {
@@ -306,6 +307,8 @@ struct amdgpu_ring {
 
boolis_sw_ring;
unsigned intentry_index;
+   /* store the cached rptr to restore after reset */
+   uint64_t cached_rptr;
 
 };
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
index 32eebf9d4408..6ba785798a4a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
@@ -474,6 +474,10 @@ void amdgpu_sdma_register_on_reset_callbacks(struct 
amdgpu_device *adev, struct
if (!funcs)
return;
 
+   /* Ensure the reset_callback_list is initialized */
+   if (!adev->sdma.reset_callback_list.next) {
+   INIT_LIST_HEAD(&adev->sdma.reset_callback_list);
+   }
/* Initialize the list node in the callback structure 

[PATCH 3/4 v6] drm/amdgpu: Add common lock and reset caller parameter for SDMA reset synchronization

2025-02-10 Thread jesse.zh...@amd.com
From: "jesse.zh...@amd.com" 

This commit introduces a caller parameter to the amdgpu_sdma_reset_instance 
function to differentiate
between reset requests originating from the KGD and KFD.
This change ensures proper synchronization between KGD and KFD during SDMA 
resets.

If the caller is KFD, the function now acquires and releases the scheduler lock 
(ring->sched.job_list_lock)
to protect the SDMA queue during the reset.

These changes prevent race conditions and ensure safe SDMA reset operations
when initiated by KFD, improving system stability and reliability.

V2: replace the ring_lock with the existing scheduler
locks for the queues (ring->sched) on the SDMA engine. (Alex)

v3: call drm_sched_wqueue_stop() rather than job_list_lock.
If a GPU ring reset was already initiated for one ring at 
amdgpu_job_timedout,
skip resetting that ring and call drm_sched_wqueue_stop()
for the other rings (Alex)

   replace the common lock (sdma_reset_lock) with the DQM lock
   to resolve reset races between the two driver sections during KFD
eviction. (Jon)

   Rename the caller to Reset_src and
   Change AMDGPU_RESET_SRC_SDMA_KGD/KFD to AMDGPU_RESET_SRC_SDMA_HWS/RING (Jon)
v4: restart the wqueue if the reset was successful,
or fall back to a full adapter reset. (Alex)

   move definition of reset source to enumeration AMDGPU_RESET_SRCS, and
   check reset src in amdgpu_sdma_reset_instance (Jon)

v5: Call amdgpu_amdkfd_suspend/resume at the start/end of reset function 
respectively under !SRC_HWS
conditions only (Jon)

Suggested-by: Alex Deucher 
Suggested-by: Jiadong Zhu 
Suggested-by: Jonathan Kim 
Signed-off-by: Jesse Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h |  2 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c  | 65 ---
 drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h  |  6 +--
 drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c  |  8 +--
 4 files changed, 67 insertions(+), 14 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
index 4d9b9701139b..5b86e12ff9fe 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
@@ -43,6 +43,8 @@ enum AMDGPU_RESET_SRCS {
AMDGPU_RESET_SRC_MES,
AMDGPU_RESET_SRC_HWS,
AMDGPU_RESET_SRC_USER,
+   AMDGPU_RESET_SRC_SDMA_RING,
+   AMDGPU_RESET_SRC_SDMA_HWS,
 };
 
 struct amdgpu_reset_context {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
index 19c8be7d72e2..32eebf9d4408 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
@@ -25,6 +25,7 @@
 #include "amdgpu.h"
 #include "amdgpu_sdma.h"
 #include "amdgpu_ras.h"
+#include "amdgpu_reset.h"
 
 #define AMDGPU_CSA_SDMA_SIZE 64
 /* SDMA CSA reside in the 3rd page of CSA */
@@ -484,6 +485,7 @@ void amdgpu_sdma_register_on_reset_callbacks(struct 
amdgpu_device *adev, struct
  * amdgpu_sdma_reset_instance - Reset a specific SDMA instance
  * @adev: Pointer to the AMDGPU device
  * @instance_id: ID of the SDMA engine instance to reset
+ * @src: The source of reset function (KGD or KFD)
  *
  * This function performs the following steps:
  * 1. Calls all registered pre_reset callbacks to allow KFD and AMDGPU to save 
their state.
@@ -492,20 +494,49 @@ void amdgpu_sdma_register_on_reset_callbacks(struct 
amdgpu_device *adev, struct
  *
  * Returns: 0 on success, or a negative error code on failure.
  */
-int amdgpu_sdma_reset_instance(struct amdgpu_device *adev, uint32_t 
instance_id)
+int amdgpu_sdma_reset_instance(struct amdgpu_device *adev, uint32_t 
instance_id, int src)
 {
struct sdma_on_reset_funcs *funcs;
-   int ret;
+   int ret = 0;
+   struct amdgpu_sdma_instance *sdma_instance = 
&adev->sdma.instance[instance_id];;
+   struct amdgpu_ring *gfx_ring = &sdma_instance->ring;
+   struct amdgpu_ring *page_ring = &sdma_instance->page;
+   bool gfx_sched_stopped = false, page_sched_stopped = false;
+
+   /* Check if the reset source is valid for SDMA ring reset */
+   if (src != AMDGPU_RESET_SRC_SDMA_RING && src != AMDGPU_RESET_SRC_HWS)
+   return -EINVAL;
+
+   /* Suspend KFD if the reset source is not SDMA_HWS.
+* prevent the destruction of in-flight healthy user queue packets and
+ * avoid race conditions between KFD and KGD during the reset process.
+ */
+   if (src != AMDGPU_RESET_SRC_SDMA_HWS)
+   amdgpu_amdkfd_suspend(adev, false);
+
+   /* Stop the scheduler's work queue for the GFX and page rings if they 
are running.
+   * This ensures that no new tasks are submitted to the queues while
+   * the reset is in progress.
+   */
+   if (!amdgpu_ring_sched_ready(gfx_ring)) {
+   drm_sched_wqueue_stop(&gfx_ring->sched);
+   gfx_sch

[PATCH] drm/amdgpu: Add support for page queue scheduling

2025-02-11 Thread jesse.zh...@amd.com
This patch updates the sdma engine to support scheduling for
the page queue. The main changes include:

- Introduce a new variable `page` to handle the page queue if it exists.
- Update the scheduling logic to conditionally set the `sched.ready` flag for
  both the sdma gfx queue and the page queue based on the provided mask.
- Ensure that the scheduling flags are updated correctly for both queues when
  the mask is applied.

The patch ensures that the SDMA engine can handle scheduling for both the SDMA
gfx queue and the page queue.

Signed-off-by: Jesse Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c | 14 +++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
index 6ba785798a4a..e82ded95540c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
@@ -358,7 +358,7 @@ static int amdgpu_debugfs_sdma_sched_mask_set(void *data, 
u64 val)
struct amdgpu_device *adev = (struct amdgpu_device *)data;
u32 i;
u64 mask = 0;
-   struct amdgpu_ring *ring;
+   struct amdgpu_ring *ring, *page;
 
if (!adev)
return -ENODEV;
@@ -369,10 +369,18 @@ static int amdgpu_debugfs_sdma_sched_mask_set(void *data, 
u64 val)
 
for (i = 0; i < adev->sdma.num_instances; ++i) {
ring = &adev->sdma.instance[i].ring;
-   if (val & BIT_ULL(i))
+   if (adev->sdma.has_page_queue)
+   page = &adev->sdma.instance[i].page;
+   if (val & BIT_ULL(i)) {
ring->sched.ready = true;
-   else
+   if (adev->sdma.has_page_queue)
+   page->sched.ready = true;
+   } else {
ring->sched.ready = false;
+   if (adev->sdma.has_page_queue)
+   page->sched.ready = false;
+   }
+
}
/* publish sched.ready flag update effective immediately across smp */
smp_rmb();
-- 
2.25.1



[PATCH 1/5] drm/amdgpu/sdma7: Implement resume function for each instance

2024-12-09 Thread jesse.zh...@amd.com
Extract the resume sequence for each SDMA instance from sdma_v7_0_gfx_resume.
This function can be used in start or restart scenarios of specific instances.

Signed-off-by: Jesse Zhang 
---
 drivers/gpu/drm/amd/amdgpu/sdma_v7_0.c | 259 ++---
 1 file changed, 141 insertions(+), 118 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v7_0.c 
b/drivers/gpu/drm/amd/amdgpu/sdma_v7_0.c
index 10ddf2c9e1fd..8cc8eaff0680 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v7_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v7_0.c
@@ -491,162 +491,185 @@ static void sdma_v7_0_enable(struct amdgpu_device 
*adev, bool enable)
 }
 
 /**
- * sdma_v7_0_gfx_resume - setup and start the async dma engines
+ * sdma_v7_0_gfx_resume_instance - start/restart a certain sdma engine
  *
  * @adev: amdgpu_device pointer
+ * @i: instance
+ * @restore: used to restore wptr when restart
  *
- * Set up the gfx DMA ring buffers and enable them.
- * Returns 0 for success, error for failure.
+ * Set up the gfx DMA ring buffers and enable them. On restart, we will 
restore wptr and rptr.
+ * Return 0 for success.
  */
-static int sdma_v7_0_gfx_resume(struct amdgpu_device *adev)
+static int sdma_v7_0_gfx_resume_instance(struct amdgpu_device *adev, int i, 
bool restore)
 {
struct amdgpu_ring *ring;
u32 rb_cntl, ib_cntl;
u32 rb_bufsz;
u32 doorbell;
u32 doorbell_offset;
-   u32 tmp;
+   u32 temp;
u64 wptr_gpu_addr;
-   int i, r;
-
-   for (i = 0; i < adev->sdma.num_instances; i++) {
-   ring = &adev->sdma.instance[i].ring;
+   int r;
 
-   //if (!amdgpu_sriov_vf(adev))
-   //  WREG32_SOC15_IP(GC, sdma_v7_0_get_reg_offset(adev, i, 
regSDMA0_SEM_WAIT_FAIL_TIMER_CNTL), 0);
+   ring = &adev->sdma.instance[i].ring;
 
-   /* Set ring buffer size in dwords */
-   rb_bufsz = order_base_2(ring->ring_size / 4);
-   rb_cntl = RREG32_SOC15_IP(GC, sdma_v7_0_get_reg_offset(adev, i, 
regSDMA0_QUEUE0_RB_CNTL));
-   rb_cntl = REG_SET_FIELD(rb_cntl, SDMA0_QUEUE0_RB_CNTL, RB_SIZE, 
rb_bufsz);
+   /* Set ring buffer size in dwords */
+   rb_bufsz = order_base_2(ring->ring_size / 4);
+   rb_cntl = RREG32_SOC15_IP(GC, sdma_v7_0_get_reg_offset(adev, i, 
regSDMA0_QUEUE0_RB_CNTL));
+   rb_cntl = REG_SET_FIELD(rb_cntl, SDMA0_QUEUE0_RB_CNTL, RB_SIZE, 
rb_bufsz);
 #ifdef __BIG_ENDIAN
-   rb_cntl = REG_SET_FIELD(rb_cntl, SDMA0_QUEUE0_RB_CNTL, 
RB_SWAP_ENABLE, 1);
-   rb_cntl = REG_SET_FIELD(rb_cntl, SDMA0_QUEUE0_RB_CNTL,
-   RPTR_WRITEBACK_SWAP_ENABLE, 1);
+   rb_cntl = REG_SET_FIELD(rb_cntl, SDMA0_QUEUE0_RB_CNTL, RB_SWAP_ENABLE, 
1);
+   rb_cntl = REG_SET_FIELD(rb_cntl, SDMA0_QUEUE0_RB_CNTL,
+   RPTR_WRITEBACK_SWAP_ENABLE, 1);
 #endif
-   rb_cntl = REG_SET_FIELD(rb_cntl, SDMA0_QUEUE0_RB_CNTL, RB_PRIV, 
1);
-   WREG32_SOC15_IP(GC, sdma_v7_0_get_reg_offset(adev, i, 
regSDMA0_QUEUE0_RB_CNTL), rb_cntl);
-
-   /* Initialize the ring buffer's read and write pointers */
+   rb_cntl = REG_SET_FIELD(rb_cntl, SDMA0_QUEUE0_RB_CNTL, RB_PRIV, 1);
+   WREG32_SOC15_IP(GC, sdma_v7_0_get_reg_offset(adev, i, 
regSDMA0_QUEUE0_RB_CNTL), rb_cntl);
+
+   /* Initialize the ring buffer's read and write pointers */
+   if (restore) {
+   WREG32_SOC15_IP(GC, sdma_v7_0_get_reg_offset(adev, i, 
regSDMA0_QUEUE0_RB_RPTR), lower_32_bits(ring->wptr << 2));
+   WREG32_SOC15_IP(GC, sdma_v7_0_get_reg_offset(adev, i, 
regSDMA0_QUEUE0_RB_RPTR_HI), upper_32_bits(ring->wptr << 2));
+   WREG32_SOC15_IP(GC, sdma_v7_0_get_reg_offset(adev, i, 
regSDMA0_QUEUE0_RB_WPTR), lower_32_bits(ring->wptr << 2));
+   WREG32_SOC15_IP(GC, sdma_v7_0_get_reg_offset(adev, i, 
regSDMA0_QUEUE0_RB_WPTR_HI), upper_32_bits(ring->wptr << 2));
+   } else {
WREG32_SOC15_IP(GC, sdma_v7_0_get_reg_offset(adev, i, 
regSDMA0_QUEUE0_RB_RPTR), 0);
WREG32_SOC15_IP(GC, sdma_v7_0_get_reg_offset(adev, i, 
regSDMA0_QUEUE0_RB_RPTR_HI), 0);
WREG32_SOC15_IP(GC, sdma_v7_0_get_reg_offset(adev, i, 
regSDMA0_QUEUE0_RB_WPTR), 0);
WREG32_SOC15_IP(GC, sdma_v7_0_get_reg_offset(adev, i, 
regSDMA0_QUEUE0_RB_WPTR_HI), 0);
+   }
+   /* setup the wptr shadow polling */
+   wptr_gpu_addr = ring->wptr_gpu_addr;
+   WREG32_SOC15_IP(GC, sdma_v7_0_get_reg_offset(adev, i, 
regSDMA0_QUEUE0_RB_WPTR_POLL_ADDR_LO),
+  lower_32_bits(wptr_gpu_addr));
+   WREG32_SOC15_IP(GC, sdma_v7_0_get_reg_offset(adev, i, 
regSDMA0_QUEUE0_RB_WPTR_POLL_ADDR_HI),
+  upper_32_bits(wptr_gpu_addr));
+
+   /* set the wb address whether it's enabled or not */
+   WREG32_SOC15_IP(GC, sdma_v7_0_get_reg_offset(adev, i, 
regSDMA0_QUEUE0_RB_RPTR_ADDR_HI),
+  upper_32_bits(ring->rptr_gpu_addr) & 0

[PATCH 2/5] drm/amdgpu/sdma7: implement queue reset callback for sdma7

2024-12-09 Thread jesse.zh...@amd.com
Implement sdma queue reset callback by mes_reset_queue_mmio.

Signed-off-by: Jesse Zhang 
---
 drivers/gpu/drm/amd/amdgpu/sdma_v7_0.c | 26 ++
 1 file changed, 26 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v7_0.c 
b/drivers/gpu/drm/amd/amdgpu/sdma_v7_0.c
index 8cc8eaff0680..627e0173b64d 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v7_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v7_0.c
@@ -830,6 +830,31 @@ static bool sdma_v7_0_check_soft_reset(struct 
amdgpu_ip_block *ip_block)
return false;
 }
 
+static int sdma_v7_0_reset_queue(struct amdgpu_ring *ring, unsigned int vmid)
+{
+   struct amdgpu_device *adev = ring->adev;
+   int i, r;
+
+   if (amdgpu_sriov_vf(adev))
+   return -EINVAL;
+
+   for (i = 0; i < adev->sdma.num_instances; i++) {
+   if (ring == &adev->sdma.instance[i].ring)
+   break;
+   }
+
+   if (i == adev->sdma.num_instances) {
+   DRM_ERROR("sdma instance not found\n");
+   return -EINVAL;
+   }
+
+   r = amdgpu_mes_reset_legacy_queue(adev, ring, vmid, true);
+   if (r)
+   return r;
+
+   return sdma_v7_0_gfx_resume_instance(adev, i, true);
+}
+
 /**
  * sdma_v7_0_start - setup and start the async dma engines
  *
@@ -1668,6 +1693,7 @@ static const struct amdgpu_ring_funcs 
sdma_v7_0_ring_funcs = {
.emit_reg_write_reg_wait = sdma_v7_0_ring_emit_reg_write_reg_wait,
.init_cond_exec = sdma_v7_0_ring_init_cond_exec,
.preempt_ib = sdma_v7_0_ring_preempt_ib,
+   .reset = sdma_v7_0_reset_queue,
 };
 
 static void sdma_v7_0_set_ring_funcs(struct amdgpu_device *adev)
-- 
2.25.1



[PATCH 4/5] drm/amdgpu/mes12: Implement reset gfx/compute queue function by mmio

2024-12-09 Thread jesse.zh...@amd.com
Reset gfx/compute queue through mmio based on me_id and queue_id.

Signed-off-by: Jesse Zhang 
---
 drivers/gpu/drm/amd/amdgpu/gfx_v12_0.h |  2 +
 drivers/gpu/drm/amd/amdgpu/mes_v12_0.c | 88 +-
 2 files changed, 89 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.h 
b/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.h
index bcc9c72ccbde..f7184b2dc4e8 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.h
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.h
@@ -26,4 +26,6 @@
 
 extern const struct amdgpu_ip_block_version gfx_v12_0_ip_block;
 
+int gfx_v12_0_request_gfx_index_mutex(struct amdgpu_device *adev,
+ bool req);
 #endif
diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c 
b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
index 0f6635ee84a5..d24a0e7fff15 100644
--- a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
@@ -24,6 +24,7 @@
 #include 
 #include 
 #include "amdgpu.h"
+#include "gfx_v12_0.h"
 #include "soc15_common.h"
 #include "soc21.h"
 #include "gc/gc_12_0_0_offset.h"
@@ -350,6 +351,38 @@ static int mes_v12_0_remove_hw_queue(struct amdgpu_mes 
*mes,
offsetof(union MESAPI__REMOVE_QUEUE, api_status));
 }
 
+int gfx_v12_0_request_gfx_index_mutex(struct amdgpu_device *adev,
+ bool req)
+{
+   u32 i, tmp, val;
+
+   for (i = 0; i < adev->usec_timeout; i++) {
+   /* Request with MeId=2, PipeId=0 */
+   tmp = REG_SET_FIELD(0, CP_GFX_INDEX_MUTEX, REQUEST, req);
+   tmp = REG_SET_FIELD(tmp, CP_GFX_INDEX_MUTEX, CLIENTID, 4);
+   WREG32_SOC15(GC, 0, regCP_GFX_INDEX_MUTEX, tmp);
+
+   val = RREG32_SOC15(GC, 0, regCP_GFX_INDEX_MUTEX);
+   if (req) {
+   if (val == tmp)
+   break;
+   } else {
+   tmp = REG_SET_FIELD(tmp, CP_GFX_INDEX_MUTEX,
+   REQUEST, 1);
+
+   /* unlocked or locked by firmware */
+   if (val != tmp)
+   break;
+   }
+   udelay(1);
+   }
+
+   if (i >= adev->usec_timeout)
+   return -EINVAL;
+
+   return 0;
+}
+
 static int mes_v12_0_reset_queue_mmio(struct amdgpu_mes *mes, uint32_t 
queue_type,
  uint32_t me_id, uint32_t pipe_id,
  uint32_t queue_id, uint32_t vmid)
@@ -360,7 +393,60 @@ static int mes_v12_0_reset_queue_mmio(struct amdgpu_mes 
*mes, uint32_t queue_typ
 
amdgpu_gfx_rlc_enter_safe_mode(adev, 0);
 
-   if (queue_type == AMDGPU_RING_TYPE_SDMA) {
+   if (queue_type == AMDGPU_RING_TYPE_GFX) {
+   dev_info(adev->dev, "reset gfx queue (%d:%d:%d: vmid:%d)\n",
+me_id, pipe_id, queue_id, vmid);
+
+   mutex_lock(&adev->gfx.reset_sem_mutex);
+   gfx_v12_0_request_gfx_index_mutex(adev, true);
+   /* all se allow writes */
+   WREG32_SOC15(GC, 0, regGRBM_GFX_INDEX,
+(uint32_t)(0x1 << 
GRBM_GFX_INDEX__SE_BROADCAST_WRITES__SHIFT));
+   value = REG_SET_FIELD(0, CP_VMID_RESET, RESET_REQUEST, 1 << 
vmid);
+   if (pipe_id == 0)
+   value = REG_SET_FIELD(value, CP_VMID_RESET, 
PIPE0_QUEUES, 1 << queue_id);
+   else
+   value = REG_SET_FIELD(value, CP_VMID_RESET, 
PIPE1_QUEUES, 1 << queue_id);
+   WREG32_SOC15(GC, 0, regCP_VMID_RESET, value);
+   gfx_v12_0_request_gfx_index_mutex(adev, false);
+   mutex_unlock(&adev->gfx.reset_sem_mutex);
+
+   mutex_lock(&adev->srbm_mutex);
+   soc21_grbm_select(adev, me_id, pipe_id, queue_id, 0);
+   /* wait till dequeue take effects */
+   for (i = 0; i < adev->usec_timeout; i++) {
+   if (!(RREG32_SOC15(GC, 0, regCP_GFX_HQD_ACTIVE) & 1))
+   break;
+   udelay(1);
+   }
+   if (i >= adev->usec_timeout) {
+   dev_err(adev->dev, "failed to wait on gfx hqd 
deactivate\n");
+   r = -ETIMEDOUT;
+   }
+
+   soc21_grbm_select(adev, 0, 0, 0, 0);
+   mutex_unlock(&adev->srbm_mutex);
+   } else if (queue_type == AMDGPU_RING_TYPE_COMPUTE) {
+   dev_info(adev->dev, "reset compute queue (%d:%d:%d)\n",
+me_id, pipe_id, queue_id);
+   mutex_lock(&adev->srbm_mutex);
+   soc21_grbm_select(adev, me_id, pipe_id, queue_id, 0);
+   WREG32_SOC15(GC, 0, regCP_HQD_DEQUEUE_REQUEST, 0x2);
+   WREG32_SOC15(GC, 0, regSPI_COMPUTE_QUEUE_RESET, 0x1);
+
+   /* wait till dequeue take effects */
+  

[PATCH 3/5] drm/amdgpu/mes12: Implement reset sdmav7 queue function by mmio

2024-12-09 Thread jesse.zh...@amd.com
Reset sdma queue through mmio based on me_id and queue_id.

Signed-off-by: Jesse Zhang 
---
 drivers/gpu/drm/amd/amdgpu/mes_v12_0.c | 46 ++
 1 file changed, 46 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c 
b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
index c63b3053eb7d..0f6635ee84a5 100644
--- a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
@@ -350,6 +350,47 @@ static int mes_v12_0_remove_hw_queue(struct amdgpu_mes 
*mes,
offsetof(union MESAPI__REMOVE_QUEUE, api_status));
 }
 
+static int mes_v12_0_reset_queue_mmio(struct amdgpu_mes *mes, uint32_t 
queue_type,
+ uint32_t me_id, uint32_t pipe_id,
+ uint32_t queue_id, uint32_t vmid)
+{
+   struct amdgpu_device *adev = mes->adev;
+   uint32_t value, reg;
+   int i, r = 0;
+
+   amdgpu_gfx_rlc_enter_safe_mode(adev, 0);
+
+   if (queue_type == AMDGPU_RING_TYPE_SDMA) {
+   dev_info(adev->dev, "reset sdma queue (%d:%d:%d)\n",
+me_id, pipe_id, queue_id);
+   switch (me_id) {
+   case 1:
+   reg = SOC15_REG_OFFSET(GC, 0, regSDMA1_QUEUE_RESET_REQ);
+   break;
+   case 0:
+   default:
+   reg = SOC15_REG_OFFSET(GC, 0, regSDMA0_QUEUE_RESET_REQ);
+   break;
+   }
+
+   value = 1 << queue_id;
+   WREG32(reg, value);
+   /* wait for queue reset done */
+   for (i = 0; i < adev->usec_timeout; i++) {
+   if (!(RREG32(reg) & value))
+   break;
+   udelay(1);
+   }
+   if (i >= adev->usec_timeout) {
+   dev_err(adev->dev, "failed to wait on sdma queue reset 
done\n");
+   r = -ETIMEDOUT;
+   }
+   }
+
+   amdgpu_gfx_rlc_exit_safe_mode(adev, 0);
+   return r;
+}
+
 static int mes_v12_0_reset_hw_queue(struct amdgpu_mes *mes,
struct mes_reset_queue_input *input)
 {
@@ -730,6 +771,11 @@ static int mes_v12_0_reset_legacy_queue(struct amdgpu_mes 
*mes,
union MESAPI__RESET mes_reset_queue_pkt;
int pipe;
 
+   if (input->use_mmio)
+   return mes_v12_0_reset_queue_mmio(mes, input->queue_type,
+ input->me_id, input->pipe_id,
+ input->queue_id, input->vmid);
+
memset(&mes_reset_queue_pkt, 0, sizeof(mes_reset_queue_pkt));
 
mes_reset_queue_pkt.header.type = MES_API_TYPE_SCHEDULER;
-- 
2.25.1



[PATCH 5/5] drm/amdgpu/sdma7: Add queue reset sysfs for sdmav7

2024-12-09 Thread jesse.zh...@amd.com
SDMA v7 queue reset is already supported via MMIO; add its sysfs file.

Signed-off-by: Jesse Zhang 
---
 drivers/gpu/drm/amd/amdgpu/sdma_v7_0.c | 8 +++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v7_0.c 
b/drivers/gpu/drm/amd/amdgpu/sdma_v7_0.c
index 627e0173b64d..8e69b84e0165 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v7_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v7_0.c
@@ -1368,6 +1368,9 @@ static int sdma_v7_0_sw_init(struct amdgpu_ip_block 
*ip_block)
return r;
}
 
+   adev->sdma.supported_reset =
+   amdgpu_get_soft_full_reset_mask(&adev->sdma.instance[0].ring);
+   adev->sdma.supported_reset |= AMDGPU_RESET_TYPE_PER_QUEUE;
/* Allocate memory for SDMA IP Dump buffer */
ptr = kcalloc(adev->sdma.num_instances * reg_count, sizeof(uint32_t), 
GFP_KERNEL);
if (ptr)
@@ -1378,7 +1381,9 @@ static int sdma_v7_0_sw_init(struct amdgpu_ip_block 
*ip_block)
 #ifdef CONFIG_DRM_AMDGPU_NAVI3X_USERQ
adev->userq_funcs[AMDGPU_HW_IP_DMA] = &userq_mes_funcs;
 #endif
-
+   r = amdgpu_sdma_sysfs_reset_mask_init(adev);
+   if (r)
+   return r;
 
return r;
 }
@@ -1391,6 +1396,7 @@ static int sdma_v7_0_sw_fini(struct amdgpu_ip_block 
*ip_block)
for (i = 0; i < adev->sdma.num_instances; i++)
amdgpu_ring_fini(&adev->sdma.instance[i].ring);
 
+   amdgpu_sdma_sysfs_reset_mask_fini(adev);
amdgpu_sdma_destroy_inst_ctx(adev, true);
 
if (adev->firmware.load_type == AMDGPU_FW_LOAD_DIRECT)
-- 
2.25.1



[PATCH 6/7 v2] drm/amdgpu/gfx12: clean up kcq reset code

2024-12-10 Thread jesse.zh...@amd.com
Replace kcq queue reset with existing function amdgpu_mes_reset_legacy_queue.

Signed-off-by: Jesse Zhang 
---
 drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c | 18 +-
 1 file changed, 5 insertions(+), 13 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c
index d0697b0869e3..035fc392526d 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c
@@ -5277,24 +5277,16 @@ static int gfx_v12_0_reset_kgq(struct amdgpu_ring 
*ring, unsigned int vmid)
 static int gfx_v12_0_reset_kcq(struct amdgpu_ring *ring, unsigned int vmid)
 {
struct amdgpu_device *adev = ring->adev;
-   int r, i;
+   int r;
 
if (amdgpu_sriov_vf(adev))
return -EINVAL;
 
-   amdgpu_gfx_rlc_enter_safe_mode(adev, 0);
-   mutex_lock(&adev->srbm_mutex);
-   soc24_grbm_select(adev, ring->me, ring->pipe, ring->queue, 0);
-   WREG32_SOC15(GC, 0, regCP_HQD_DEQUEUE_REQUEST, 0x2);
-   WREG32_SOC15(GC, 0, regSPI_COMPUTE_QUEUE_RESET, 0x1);
-   for (i = 0; i < adev->usec_timeout; i++) {
-   if (!(RREG32_SOC15(GC, 0, regCP_HQD_ACTIVE) & 1))
-   break;
-   udelay(1);
+   r = amdgpu_mes_reset_legacy_queue(ring->adev, ring, vmid, true);
+   if (r) {
+   dev_err(adev->dev, "reset via MMIO failed %d\n", r);
+   return r;
}
-   soc24_grbm_select(adev, 0, 0, 0, 0);
-   mutex_unlock(&adev->srbm_mutex);
-   amdgpu_gfx_rlc_exit_safe_mode(adev, 0);
 
r = amdgpu_bo_reserve(ring->mqd_obj, false);
if (unlikely(r != 0)) {
-- 
2.25.1



[PATCH 1/7 v2] drm/amdgpu/sdma7: Implement resume function for each instance

2024-12-09 Thread jesse.zh...@amd.com
Extract the resume sequence for each SDMA instance from sdma_v7_0_gfx_resume.
This function can be used in start or restart scenarios of specific instances.

Signed-off-by: Jesse Zhang 
---
 drivers/gpu/drm/amd/amdgpu/sdma_v7_0.c | 259 ++---
 1 file changed, 141 insertions(+), 118 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v7_0.c 
b/drivers/gpu/drm/amd/amdgpu/sdma_v7_0.c
index 10ddf2c9e1fd..8cc8eaff0680 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v7_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v7_0.c
@@ -491,162 +491,185 @@ static void sdma_v7_0_enable(struct amdgpu_device 
*adev, bool enable)
 }
 
 /**
- * sdma_v7_0_gfx_resume - setup and start the async dma engines
+ * sdma_v7_0_gfx_resume_instance - start/restart a certain sdma engine
  *
  * @adev: amdgpu_device pointer
+ * @i: instance
+ * @restore: used to restore wptr when restart
  *
- * Set up the gfx DMA ring buffers and enable them.
- * Returns 0 for success, error for failure.
+ * Set up the gfx DMA ring buffers and enable them. On restart, we will 
restore wptr and rptr.
+ * Return 0 for success.
  */
-static int sdma_v7_0_gfx_resume(struct amdgpu_device *adev)
+static int sdma_v7_0_gfx_resume_instance(struct amdgpu_device *adev, int i, 
bool restore)
 {
struct amdgpu_ring *ring;
u32 rb_cntl, ib_cntl;
u32 rb_bufsz;
u32 doorbell;
u32 doorbell_offset;
-   u32 tmp;
+   u32 temp;
u64 wptr_gpu_addr;
-   int i, r;
-
-   for (i = 0; i < adev->sdma.num_instances; i++) {
-   ring = &adev->sdma.instance[i].ring;
+   int r;
 
-   //if (!amdgpu_sriov_vf(adev))
-   //  WREG32_SOC15_IP(GC, sdma_v7_0_get_reg_offset(adev, i, 
regSDMA0_SEM_WAIT_FAIL_TIMER_CNTL), 0);
+   ring = &adev->sdma.instance[i].ring;
 
-   /* Set ring buffer size in dwords */
-   rb_bufsz = order_base_2(ring->ring_size / 4);
-   rb_cntl = RREG32_SOC15_IP(GC, sdma_v7_0_get_reg_offset(adev, i, 
regSDMA0_QUEUE0_RB_CNTL));
-   rb_cntl = REG_SET_FIELD(rb_cntl, SDMA0_QUEUE0_RB_CNTL, RB_SIZE, 
rb_bufsz);
+   /* Set ring buffer size in dwords */
+   rb_bufsz = order_base_2(ring->ring_size / 4);
+   rb_cntl = RREG32_SOC15_IP(GC, sdma_v7_0_get_reg_offset(adev, i, 
regSDMA0_QUEUE0_RB_CNTL));
+   rb_cntl = REG_SET_FIELD(rb_cntl, SDMA0_QUEUE0_RB_CNTL, RB_SIZE, 
rb_bufsz);
 #ifdef __BIG_ENDIAN
-   rb_cntl = REG_SET_FIELD(rb_cntl, SDMA0_QUEUE0_RB_CNTL, 
RB_SWAP_ENABLE, 1);
-   rb_cntl = REG_SET_FIELD(rb_cntl, SDMA0_QUEUE0_RB_CNTL,
-   RPTR_WRITEBACK_SWAP_ENABLE, 1);
+   rb_cntl = REG_SET_FIELD(rb_cntl, SDMA0_QUEUE0_RB_CNTL, RB_SWAP_ENABLE, 
1);
+   rb_cntl = REG_SET_FIELD(rb_cntl, SDMA0_QUEUE0_RB_CNTL,
+   RPTR_WRITEBACK_SWAP_ENABLE, 1);
 #endif
-   rb_cntl = REG_SET_FIELD(rb_cntl, SDMA0_QUEUE0_RB_CNTL, RB_PRIV, 
1);
-   WREG32_SOC15_IP(GC, sdma_v7_0_get_reg_offset(adev, i, 
regSDMA0_QUEUE0_RB_CNTL), rb_cntl);
-
-   /* Initialize the ring buffer's read and write pointers */
+   rb_cntl = REG_SET_FIELD(rb_cntl, SDMA0_QUEUE0_RB_CNTL, RB_PRIV, 1);
+   WREG32_SOC15_IP(GC, sdma_v7_0_get_reg_offset(adev, i, 
regSDMA0_QUEUE0_RB_CNTL), rb_cntl);
+
+   /* Initialize the ring buffer's read and write pointers */
+   if (restore) {
+   WREG32_SOC15_IP(GC, sdma_v7_0_get_reg_offset(adev, i, 
regSDMA0_QUEUE0_RB_RPTR), lower_32_bits(ring->wptr << 2));
+   WREG32_SOC15_IP(GC, sdma_v7_0_get_reg_offset(adev, i, 
regSDMA0_QUEUE0_RB_RPTR_HI), upper_32_bits(ring->wptr << 2));
+   WREG32_SOC15_IP(GC, sdma_v7_0_get_reg_offset(adev, i, 
regSDMA0_QUEUE0_RB_WPTR), lower_32_bits(ring->wptr << 2));
+   WREG32_SOC15_IP(GC, sdma_v7_0_get_reg_offset(adev, i, 
regSDMA0_QUEUE0_RB_WPTR_HI), upper_32_bits(ring->wptr << 2));
+   } else {
WREG32_SOC15_IP(GC, sdma_v7_0_get_reg_offset(adev, i, 
regSDMA0_QUEUE0_RB_RPTR), 0);
WREG32_SOC15_IP(GC, sdma_v7_0_get_reg_offset(adev, i, 
regSDMA0_QUEUE0_RB_RPTR_HI), 0);
WREG32_SOC15_IP(GC, sdma_v7_0_get_reg_offset(adev, i, 
regSDMA0_QUEUE0_RB_WPTR), 0);
WREG32_SOC15_IP(GC, sdma_v7_0_get_reg_offset(adev, i, 
regSDMA0_QUEUE0_RB_WPTR_HI), 0);
+   }
+   /* setup the wptr shadow polling */
+   wptr_gpu_addr = ring->wptr_gpu_addr;
+   WREG32_SOC15_IP(GC, sdma_v7_0_get_reg_offset(adev, i, 
regSDMA0_QUEUE0_RB_WPTR_POLL_ADDR_LO),
+  lower_32_bits(wptr_gpu_addr));
+   WREG32_SOC15_IP(GC, sdma_v7_0_get_reg_offset(adev, i, 
regSDMA0_QUEUE0_RB_WPTR_POLL_ADDR_HI),
+  upper_32_bits(wptr_gpu_addr));
+
+   /* set the wb address whether it's enabled or not */
+   WREG32_SOC15_IP(GC, sdma_v7_0_get_reg_offset(adev, i, 
regSDMA0_QUEUE0_RB_RPTR_ADDR_HI),
+  upper_32_bits(ring->rptr_gpu_addr) & 0

[PATCH 4/7 v2] drm/amdgpu/mes12: Implement reset gfx/compute queue function by mmio

2024-12-09 Thread jesse.zh...@amd.com
Reset gfx/compute queue through mmio based on me_id and queue_id.

Signed-off-by: Jesse Zhang 
---
 drivers/gpu/drm/amd/amdgpu/gfx_v12_0.h |  2 +
 drivers/gpu/drm/amd/amdgpu/mes_v12_0.c | 88 +-
 2 files changed, 89 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.h 
b/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.h
index bcc9c72ccbde..f7184b2dc4e8 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.h
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.h
@@ -26,4 +26,6 @@
 
 extern const struct amdgpu_ip_block_version gfx_v12_0_ip_block;
 
+int gfx_v12_0_request_gfx_index_mutex(struct amdgpu_device *adev,
+ bool req);
 #endif
diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c 
b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
index 0f6635ee84a5..d24a0e7fff15 100644
--- a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
@@ -24,6 +24,7 @@
 #include 
 #include 
 #include "amdgpu.h"
+#include "gfx_v12_0.h"
 #include "soc15_common.h"
 #include "soc21.h"
 #include "gc/gc_12_0_0_offset.h"
@@ -350,6 +351,38 @@ static int mes_v12_0_remove_hw_queue(struct amdgpu_mes 
*mes,
offsetof(union MESAPI__REMOVE_QUEUE, api_status));
 }
 
+int gfx_v12_0_request_gfx_index_mutex(struct amdgpu_device *adev,
+ bool req)
+{
+   u32 i, tmp, val;
+
+   for (i = 0; i < adev->usec_timeout; i++) {
+   /* Request with MeId=2, PipeId=0 */
+   tmp = REG_SET_FIELD(0, CP_GFX_INDEX_MUTEX, REQUEST, req);
+   tmp = REG_SET_FIELD(tmp, CP_GFX_INDEX_MUTEX, CLIENTID, 4);
+   WREG32_SOC15(GC, 0, regCP_GFX_INDEX_MUTEX, tmp);
+
+   val = RREG32_SOC15(GC, 0, regCP_GFX_INDEX_MUTEX);
+   if (req) {
+   if (val == tmp)
+   break;
+   } else {
+   tmp = REG_SET_FIELD(tmp, CP_GFX_INDEX_MUTEX,
+   REQUEST, 1);
+
+   /* unlocked or locked by firmware */
+   if (val != tmp)
+   break;
+   }
+   udelay(1);
+   }
+
+   if (i >= adev->usec_timeout)
+   return -EINVAL;
+
+   return 0;
+}
+
 static int mes_v12_0_reset_queue_mmio(struct amdgpu_mes *mes, uint32_t 
queue_type,
  uint32_t me_id, uint32_t pipe_id,
  uint32_t queue_id, uint32_t vmid)
@@ -360,7 +393,60 @@ static int mes_v12_0_reset_queue_mmio(struct amdgpu_mes 
*mes, uint32_t queue_typ
 
amdgpu_gfx_rlc_enter_safe_mode(adev, 0);
 
-   if (queue_type == AMDGPU_RING_TYPE_SDMA) {
+   if (queue_type == AMDGPU_RING_TYPE_GFX) {
+   dev_info(adev->dev, "reset gfx queue (%d:%d:%d: vmid:%d)\n",
+me_id, pipe_id, queue_id, vmid);
+
+   mutex_lock(&adev->gfx.reset_sem_mutex);
+   gfx_v12_0_request_gfx_index_mutex(adev, true);
+   /* all se allow writes */
+   WREG32_SOC15(GC, 0, regGRBM_GFX_INDEX,
+(uint32_t)(0x1 << 
GRBM_GFX_INDEX__SE_BROADCAST_WRITES__SHIFT));
+   value = REG_SET_FIELD(0, CP_VMID_RESET, RESET_REQUEST, 1 << 
vmid);
+   if (pipe_id == 0)
+   value = REG_SET_FIELD(value, CP_VMID_RESET, 
PIPE0_QUEUES, 1 << queue_id);
+   else
+   value = REG_SET_FIELD(value, CP_VMID_RESET, 
PIPE1_QUEUES, 1 << queue_id);
+   WREG32_SOC15(GC, 0, regCP_VMID_RESET, value);
+   gfx_v12_0_request_gfx_index_mutex(adev, false);
+   mutex_unlock(&adev->gfx.reset_sem_mutex);
+
+   mutex_lock(&adev->srbm_mutex);
+   soc21_grbm_select(adev, me_id, pipe_id, queue_id, 0);
+   /* wait till dequeue take effects */
+   for (i = 0; i < adev->usec_timeout; i++) {
+   if (!(RREG32_SOC15(GC, 0, regCP_GFX_HQD_ACTIVE) & 1))
+   break;
+   udelay(1);
+   }
+   if (i >= adev->usec_timeout) {
+   dev_err(adev->dev, "failed to wait on gfx hqd 
deactivate\n");
+   r = -ETIMEDOUT;
+   }
+
+   soc21_grbm_select(adev, 0, 0, 0, 0);
+   mutex_unlock(&adev->srbm_mutex);
+   } else if (queue_type == AMDGPU_RING_TYPE_COMPUTE) {
+   dev_info(adev->dev, "reset compute queue (%d:%d:%d)\n",
+me_id, pipe_id, queue_id);
+   mutex_lock(&adev->srbm_mutex);
+   soc21_grbm_select(adev, me_id, pipe_id, queue_id, 0);
+   WREG32_SOC15(GC, 0, regCP_HQD_DEQUEUE_REQUEST, 0x2);
+   WREG32_SOC15(GC, 0, regSPI_COMPUTE_QUEUE_RESET, 0x1);
+
+   /* wait till dequeue take effects */
+  

[PATCH 3/7 v2] drm/amdgpu/mes12: Implement reset sdmav7 queue function by mmio

2024-12-09 Thread jesse.zh...@amd.com
Reset sdma queue through mmio based on me_id and queue_id.

Signed-off-by: Jesse Zhang 
---
 drivers/gpu/drm/amd/amdgpu/mes_v12_0.c | 46 ++
 1 file changed, 46 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c 
b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
index c63b3053eb7d..0f6635ee84a5 100644
--- a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
@@ -350,6 +350,47 @@ static int mes_v12_0_remove_hw_queue(struct amdgpu_mes 
*mes,
offsetof(union MESAPI__REMOVE_QUEUE, api_status));
 }
 
+static int mes_v12_0_reset_queue_mmio(struct amdgpu_mes *mes, uint32_t 
queue_type,
+ uint32_t me_id, uint32_t pipe_id,
+ uint32_t queue_id, uint32_t vmid)
+{
+   struct amdgpu_device *adev = mes->adev;
+   uint32_t value, reg;
+   int i, r = 0;
+
+   amdgpu_gfx_rlc_enter_safe_mode(adev, 0);
+
+   if (queue_type == AMDGPU_RING_TYPE_SDMA) {
+   dev_info(adev->dev, "reset sdma queue (%d:%d:%d)\n",
+me_id, pipe_id, queue_id);
+   switch (me_id) {
+   case 1:
+   reg = SOC15_REG_OFFSET(GC, 0, regSDMA1_QUEUE_RESET_REQ);
+   break;
+   case 0:
+   default:
+   reg = SOC15_REG_OFFSET(GC, 0, regSDMA0_QUEUE_RESET_REQ);
+   break;
+   }
+
+   value = 1 << queue_id;
+   WREG32(reg, value);
+   /* wait for queue reset done */
+   for (i = 0; i < adev->usec_timeout; i++) {
+   if (!(RREG32(reg) & value))
+   break;
+   udelay(1);
+   }
+   if (i >= adev->usec_timeout) {
+   dev_err(adev->dev, "failed to wait on sdma queue reset 
done\n");
+   r = -ETIMEDOUT;
+   }
+   }
+
+   amdgpu_gfx_rlc_exit_safe_mode(adev, 0);
+   return r;
+}
+
 static int mes_v12_0_reset_hw_queue(struct amdgpu_mes *mes,
struct mes_reset_queue_input *input)
 {
@@ -730,6 +771,11 @@ static int mes_v12_0_reset_legacy_queue(struct amdgpu_mes 
*mes,
union MESAPI__RESET mes_reset_queue_pkt;
int pipe;
 
+   if (input->use_mmio)
+   return mes_v12_0_reset_queue_mmio(mes, input->queue_type,
+ input->me_id, input->pipe_id,
+ input->queue_id, input->vmid);
+
memset(&mes_reset_queue_pkt, 0, sizeof(mes_reset_queue_pkt));
 
mes_reset_queue_pkt.header.type = MES_API_TYPE_SCHEDULER;
-- 
2.25.1



[PATCH 7/7 v2] drm/amdgpu/gfx11: clean up kcq reset code

2024-12-09 Thread jesse.zh...@amd.com
Replace kcq queue reset with existing function amdgpu_mes_reset_legacy_queue.

Signed-off-by: Jesse Zhang 
---
 drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c | 22 +++---
 1 file changed, 3 insertions(+), 19 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
index 67cd42031571..b741dcb0a5a3 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
@@ -6675,30 +6675,14 @@ static int gfx_v11_0_reset_kgq(struct amdgpu_ring 
*ring, unsigned int vmid)
 static int gfx_v11_0_reset_kcq(struct amdgpu_ring *ring, unsigned int vmid)
 {
struct amdgpu_device *adev = ring->adev;
-   int i, r = 0;
+   int r = 0;
 
if (amdgpu_sriov_vf(adev))
return -EINVAL;
 
-   amdgpu_gfx_rlc_enter_safe_mode(adev, 0);
-   mutex_lock(&adev->srbm_mutex);
-   soc21_grbm_select(adev, ring->me, ring->pipe, ring->queue, 0);
-   WREG32_SOC15(GC, 0, regCP_HQD_DEQUEUE_REQUEST, 0x2);
-   WREG32_SOC15(GC, 0, regSPI_COMPUTE_QUEUE_RESET, 0x1);
-
-   /* make sure dequeue is complete*/
-   for (i = 0; i < adev->usec_timeout; i++) {
-   if (!(RREG32_SOC15(GC, 0, regCP_HQD_ACTIVE) & 1))
-   break;
-   udelay(1);
-   }
-   if (i >= adev->usec_timeout)
-   r = -ETIMEDOUT;
-   soc21_grbm_select(adev, 0, 0, 0, 0);
-   mutex_unlock(&adev->srbm_mutex);
-   amdgpu_gfx_rlc_exit_safe_mode(adev, 0);
+   r = amdgpu_mes_reset_legacy_queue(ring->adev, ring, vmid, true);
if (r) {
-   dev_err(adev->dev, "fail to wait on hqd deactivate\n");
+   dev_err(adev->dev, "reset via MMIO failed %d\n", r);
return r;
}
 
-- 
2.25.1



[PATCH 2/7 v2] drm/amdgpu/sdma7: implement queue reset callback for sdma7

2024-12-09 Thread jesse.zh...@amd.com
Implement the SDMA queue reset callback using mes_reset_queue_mmio.

Signed-off-by: Jesse Zhang 
---
 drivers/gpu/drm/amd/amdgpu/sdma_v7_0.c | 26 ++
 1 file changed, 26 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v7_0.c 
b/drivers/gpu/drm/amd/amdgpu/sdma_v7_0.c
index 8cc8eaff0680..627e0173b64d 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v7_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v7_0.c
@@ -830,6 +830,31 @@ static bool sdma_v7_0_check_soft_reset(struct 
amdgpu_ip_block *ip_block)
return false;
 }
 
+static int sdma_v7_0_reset_queue(struct amdgpu_ring *ring, unsigned int vmid)
+{
+   struct amdgpu_device *adev = ring->adev;
+   int i, r;
+
+   if (amdgpu_sriov_vf(adev))
+   return -EINVAL;
+
+   for (i = 0; i < adev->sdma.num_instances; i++) {
+   if (ring == &adev->sdma.instance[i].ring)
+   break;
+   }
+
+   if (i == adev->sdma.num_instances) {
+   DRM_ERROR("sdma instance not found\n");
+   return -EINVAL;
+   }
+
+   r = amdgpu_mes_reset_legacy_queue(adev, ring, vmid, true);
+   if (r)
+   return r;
+
+   return sdma_v7_0_gfx_resume_instance(adev, i, true);
+}
+
 /**
  * sdma_v7_0_start - setup and start the async dma engines
  *
@@ -1668,6 +1693,7 @@ static const struct amdgpu_ring_funcs 
sdma_v7_0_ring_funcs = {
.emit_reg_write_reg_wait = sdma_v7_0_ring_emit_reg_write_reg_wait,
.init_cond_exec = sdma_v7_0_ring_init_cond_exec,
.preempt_ib = sdma_v7_0_ring_preempt_ib,
+   .reset = sdma_v7_0_reset_queue,
 };
 
 static void sdma_v7_0_set_ring_funcs(struct amdgpu_device *adev)
-- 
2.25.1



[PATCH 5/7 v2] drm/amdgpu/sdma7: Add queue reset sysfs for sdmav7

2024-12-09 Thread jesse.zh...@amd.com
SDMA v7 queue reset is already supported via MMIO; add its sysfs file.

Signed-off-by: Jesse Zhang 
---
 drivers/gpu/drm/amd/amdgpu/sdma_v7_0.c | 8 +++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v7_0.c 
b/drivers/gpu/drm/amd/amdgpu/sdma_v7_0.c
index 627e0173b64d..8e69b84e0165 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v7_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v7_0.c
@@ -1368,6 +1368,9 @@ static int sdma_v7_0_sw_init(struct amdgpu_ip_block 
*ip_block)
return r;
}
 
+   adev->sdma.supported_reset =
+   amdgpu_get_soft_full_reset_mask(&adev->sdma.instance[0].ring);
+   adev->sdma.supported_reset |= AMDGPU_RESET_TYPE_PER_QUEUE;
/* Allocate memory for SDMA IP Dump buffer */
ptr = kcalloc(adev->sdma.num_instances * reg_count, sizeof(uint32_t), 
GFP_KERNEL);
if (ptr)
@@ -1378,7 +1381,9 @@ static int sdma_v7_0_sw_init(struct amdgpu_ip_block 
*ip_block)
 #ifdef CONFIG_DRM_AMDGPU_NAVI3X_USERQ
adev->userq_funcs[AMDGPU_HW_IP_DMA] = &userq_mes_funcs;
 #endif
-
+   r = amdgpu_sdma_sysfs_reset_mask_init(adev);
+   if (r)
+   return r;
 
return r;
 }
@@ -1391,6 +1396,7 @@ static int sdma_v7_0_sw_fini(struct amdgpu_ip_block 
*ip_block)
for (i = 0; i < adev->sdma.num_instances; i++)
amdgpu_ring_fini(&adev->sdma.instance[i].ring);
 
+   amdgpu_sdma_sysfs_reset_mask_fini(adev);
amdgpu_sdma_destroy_inst_ctx(adev, true);
 
if (adev->firmware.load_type == AMDGPU_FW_LOAD_DIRECT)
-- 
2.25.1



[PATCH 2/3] drm/amdgpu/pm: add PPSMC_MSG_ResetSDMA2 definition

2024-12-16 Thread jesse.zh...@amd.com
add the PPSMC_MSG_ResetSDMA2 definition for smu 13.0.6

Signed-off-by: Jesse Zhang 
---
 drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu_v13_0_6_ppsmc.h | 1 +
 drivers/gpu/drm/amd/pm/swsmu/inc/smu_types.h | 3 ++-
 drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c | 1 +
 3 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu_v13_0_6_ppsmc.h 
b/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu_v13_0_6_ppsmc.h
index 147bfb12fd75..7b65a27fb302 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu_v13_0_6_ppsmc.h
+++ b/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu_v13_0_6_ppsmc.h
@@ -93,6 +93,7 @@
 #define PPSMC_MSG_SelectPLPDMode0x40
 #define PPSMC_MSG_RmaDueToBadPageThreshold  0x43
 #define PPSMC_MSG_SelectPstatePolicy0x44
+#define PPSMC_MSG_ResetSDMA20x45
 #define PPSMC_MSG_ResetSDMA 0x4D
 #define PPSMC_Message_Count 0x4E
 
diff --git a/drivers/gpu/drm/amd/pm/swsmu/inc/smu_types.h 
b/drivers/gpu/drm/amd/pm/swsmu/inc/smu_types.h
index e4cd6a0d13da..b0dab9797c70 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/inc/smu_types.h
+++ b/drivers/gpu/drm/amd/pm/swsmu/inc/smu_types.h
@@ -276,7 +276,8 @@
__SMU_DUMMY_MAP(SelectPstatePolicy), \
__SMU_DUMMY_MAP(MALLPowerController), \
__SMU_DUMMY_MAP(MALLPowerState), \
-   __SMU_DUMMY_MAP(ResetSDMA),
+   __SMU_DUMMY_MAP(ResetSDMA), \
+   __SMU_DUMMY_MAP(ResetSDMA2),
 
 #undef __SMU_DUMMY_MAP
 #define __SMU_DUMMY_MAP(type)  SMU_MSG_##type
diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
index 5b86df0c8536..9222e7a777a6 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
@@ -194,6 +194,7 @@ static const struct cmn2asic_msg_mapping 
smu_v13_0_6_message_map[SMU_MSG_MAX_COU
MSG_MAP(RmaDueToBadPageThreshold,
PPSMC_MSG_RmaDueToBadPageThreshold,0),
MSG_MAP(SelectPstatePolicy,  
PPSMC_MSG_SelectPstatePolicy,  0),
MSG_MAP(ResetSDMA,   PPSMC_MSG_ResetSDMA,   
0),
+   MSG_MAP(ResetSDMA2,  PPSMC_MSG_ResetSDMA2,  
0),
 };
 
 // clang-format on
-- 
2.25.1



[PATCH 1/3] drm/amdgpu/sdma4.4.2: add apu support in sdma queue reset

2024-12-16 Thread jesse.zh...@amd.com
From: "jesse.zh...@amd.com" 

Remove apu check in sdma queue reset.

Signed-off-by: Jesse Zhang 
---
 drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c 
b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
index 5e1cb1c2c0f8..e39f1f495ea8 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
@@ -1600,7 +1600,7 @@ static int sdma_v4_4_2_reset_queue(struct amdgpu_ring 
*ring, unsigned int vmid)
int i, r;
u32 inst_mask;
 
-   if ((adev->flags & AMD_IS_APU) || amdgpu_sriov_vf(adev))
+   if (amdgpu_sriov_vf(adev))
return -EINVAL;
 
/* stop queue */
-- 
2.25.1



[PATCH 3/3] drm/amdgpu/pm: Implement SDMA queue reset for different asic

2024-12-16 Thread jesse.zh...@amd.com
Implement sdma queue reset by SMU_MSG_ResetSDMA2

Suggested-by: Tim Huang 
Signed-off-by: Jesse Zhang 
---
 .../drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c  | 30 ++-
 1 file changed, 22 insertions(+), 8 deletions(-)

diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
index 9222e7a777a6..446959145058 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
@@ -2721,17 +2721,31 @@ static int smu_v13_0_6_send_rma_reason(struct 
smu_context *smu)
 
 static int smu_v13_0_6_reset_sdma(struct smu_context *smu, uint32_t inst_mask)
 {
-   struct amdgpu_device *adev = smu->adev;
+   uint32_t smu_program;
int ret = 0;
 
-   /* the message is only valid on SMU 13.0.6 with pmfw 85.121.00 and 
above */
-   if ((adev->flags & AMD_IS_APU) ||
-   amdgpu_ip_version(adev, MP1_HWIP, 0) != IP_VERSION(13, 0, 6) ||
-   smu->smc_fw_version < 0x00557900)
-   return 0;
+   smu_program = (smu->smc_fw_version >> 24) & 0xff;
+   switch (amdgpu_ip_version(smu->adev, MP1_HWIP, 0)) {
+   case IP_VERSION(13, 0, 6):
+   if (((smu_program == 7) && (smu->smc_fw_version > 0x07550763)) 
||
+   ((smu_program == 0) && (smu->smc_fw_version > 
0x00557700)))
+   ret = smu_cmn_send_smc_msg_with_param(smu,
+   SMU_MSG_ResetSDMA, inst_mask, NULL);
+   else if ((smu_program == 4) &&
+   (smu->smc_fw_version > 0x4556e6c))
+   ret = smu_cmn_send_smc_msg_with_param(smu,
+ SMU_MSG_ResetSDMA2, inst_mask, NULL);
+   break;
+   case IP_VERSION(13, 0, 14):
+   if ((smu_program == 5) &&
+   (smu->smc_fw_version > 0x05550f00))
+   ret = smu_cmn_send_smc_msg_with_param(smu,
+ SMU_MSG_ResetSDMA2, inst_mask, NULL);
+   break;
+   default:
+   break;
+   }
 
-   ret = smu_cmn_send_smc_msg_with_param(smu,
- SMU_MSG_ResetSDMA, inst_mask, 
NULL);
if (ret)
dev_err(smu->adev->dev,
"failed to send ResetSDMA event with mask 0x%x\n",
-- 
2.25.1



[PATCH] drm/amdkfd: fixed page fault when enable MES shader debugger

2024-12-18 Thread jesse.zh...@amd.com
Initialize the process context address before setting the shader debugger.

[  260.781212] amdgpu :03:00.0: amdgpu: [gfxhub] page fault (src_id:0 
ring:32 vmid:0 pasid:0)
[  260.781236] amdgpu :03:00.0: amdgpu:   in page starting at address 
0x from client 10
[  260.781255] amdgpu :03:00.0: amdgpu: 
GCVM_L2_PROTECTION_FAULT_STATUS:0x00040A40
[  260.781270] amdgpu :03:00.0: amdgpu:  Faulty UTCL2 client ID: CPC 
(0x5)
[  260.781284] amdgpu :03:00.0: amdgpu:  MORE_FAULTS: 0x0
[  260.781296] amdgpu :03:00.0: amdgpu:  WALKER_ERROR: 0x0
[  260.781308] amdgpu :03:00.0: amdgpu:  PERMISSION_FAULTS: 0x4
[  260.781320] amdgpu :03:00.0: amdgpu:  MAPPING_ERROR: 0x0
[  260.781332] amdgpu :03:00.0: amdgpu:  RW: 0x1
[  260.782017] amdgpu :03:00.0: amdgpu: [gfxhub] page fault (src_id:0 
ring:32 vmid:0 pasid:0)
[  260.782039] amdgpu :03:00.0: amdgpu:   in page starting at address 
0x from client 10
[  260.782058] amdgpu :03:00.0: amdgpu: 
GCVM_L2_PROTECTION_FAULT_STATUS:0x00040A41
[  260.782073] amdgpu :03:00.0: amdgpu:  Faulty UTCL2 client ID: CPC 
(0x5)
[  260.782087] amdgpu :03:00.0: amdgpu:  MORE_FAULTS: 0x1
[  260.782098] amdgpu :03:00.0: amdgpu:  WALKER_ERROR: 0x0
[  260.782110] amdgpu :03:00.0: amdgpu:  PERMISSION_FAULTS: 0x4
[  260.782122] amdgpu :03:00.0: amdgpu:  MAPPING_ERROR: 0x0
[  260.782137] amdgpu :03:00.0: amdgpu:  RW: 0x1
[  260.782155] amdgpu :03:00.0: amdgpu: [gfxhub] page fault (src_id:0 
ring:32 vmid:0 pasid:0)
[  260.782166] amdgpu :03:00.0: amdgpu:   in page starting at address 
0x from client 10

Signed-off-by: Jesse Zhang 
---
 drivers/gpu/drm/amd/amdkfd/kfd_debug.c | 17 +
 1 file changed, 17 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
index 312dfa84f29f..a8abc3091801 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
@@ -350,10 +350,27 @@ int kfd_dbg_set_mes_debug_mode(struct kfd_process_device 
*pdd, bool sq_trap_en)
 {
uint32_t spi_dbg_cntl = pdd->spi_dbg_override | 
pdd->spi_dbg_launch_mode;
uint32_t flags = pdd->process->dbg_flags;
+   struct amdgpu_device *adev = pdd->dev->adev;
+   int r;
 
if (!kfd_dbg_is_per_vmid_supported(pdd->dev))
return 0;
 
+   if (!pdd->proc_ctx_cpu_ptr) {
+   r = amdgpu_amdkfd_alloc_gtt_mem(adev,
+   AMDGPU_MES_PROC_CTX_SIZE,
+   &pdd->proc_ctx_bo,
+   &pdd->proc_ctx_gpu_addr,
+   &pdd->proc_ctx_cpu_ptr,
+   false);
+   if (r) {
+   dev_err(adev->dev,
+   "failed to allocate process context bo\n");
+   return r;
+   }
+   memset(pdd->proc_ctx_cpu_ptr, 0, AMDGPU_MES_PROC_CTX_SIZE);
+   }
+
return amdgpu_mes_set_shader_debugger(pdd->dev->adev, 
pdd->proc_ctx_gpu_addr, spi_dbg_cntl,
pdd->watch_points, flags, 
sq_trap_en);
 }
-- 
2.25.1



[PATCH 1/3] drm/amdgpu/sdma4.4.2: add apu support in sdma queue reset

2024-12-13 Thread jesse.zh...@amd.com
From: "jesse.zh...@amd.com" 

Remove apu check in sdma queue reset.

Signed-off-by: Jesse Zhang 
---
 drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c 
b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
index 5e1cb1c2c0f8..e39f1f495ea8 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
@@ -1600,7 +1600,7 @@ static int sdma_v4_4_2_reset_queue(struct amdgpu_ring 
*ring, unsigned int vmid)
int i, r;
u32 inst_mask;
 
-   if ((adev->flags & AMD_IS_APU) || amdgpu_sriov_vf(adev))
+   if (amdgpu_sriov_vf(adev))
return -EINVAL;
 
/* stop queue */
-- 
2.25.1



[PATCH 2/3] drm/amd/pm: update 13_0_6 ppsmc header

2024-12-13 Thread jesse.zh...@amd.com
From: "jesse.zh...@amd.com" 

add the definition PPSMC_MSG_ResetSDMA2.

Signed-off-by: Jesse Zhang 
---
 drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu_v13_0_6_ppsmc.h | 1 +
 drivers/gpu/drm/amd/pm/swsmu/inc/smu_types.h | 3 ++-
 drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c | 1 +
 3 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu_v13_0_6_ppsmc.h 
b/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu_v13_0_6_ppsmc.h
index 147bfb12fd75..7b65a27fb302 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu_v13_0_6_ppsmc.h
+++ b/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu_v13_0_6_ppsmc.h
@@ -93,6 +93,7 @@
 #define PPSMC_MSG_SelectPLPDMode0x40
 #define PPSMC_MSG_RmaDueToBadPageThreshold  0x43
 #define PPSMC_MSG_SelectPstatePolicy0x44
+#define PPSMC_MSG_ResetSDMA20x45
 #define PPSMC_MSG_ResetSDMA 0x4D
 #define PPSMC_Message_Count 0x4E
 
diff --git a/drivers/gpu/drm/amd/pm/swsmu/inc/smu_types.h 
b/drivers/gpu/drm/amd/pm/swsmu/inc/smu_types.h
index e4cd6a0d13da..b0dab9797c70 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/inc/smu_types.h
+++ b/drivers/gpu/drm/amd/pm/swsmu/inc/smu_types.h
@@ -276,7 +276,8 @@
__SMU_DUMMY_MAP(SelectPstatePolicy), \
__SMU_DUMMY_MAP(MALLPowerController), \
__SMU_DUMMY_MAP(MALLPowerState), \
-   __SMU_DUMMY_MAP(ResetSDMA),
+   __SMU_DUMMY_MAP(ResetSDMA), \
+   __SMU_DUMMY_MAP(ResetSDMA2),
 
 #undef __SMU_DUMMY_MAP
 #define __SMU_DUMMY_MAP(type)  SMU_MSG_##type
diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
index 5b86df0c8536..9222e7a777a6 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
@@ -194,6 +194,7 @@ static const struct cmn2asic_msg_mapping 
smu_v13_0_6_message_map[SMU_MSG_MAX_COU
MSG_MAP(RmaDueToBadPageThreshold,
PPSMC_MSG_RmaDueToBadPageThreshold,0),
MSG_MAP(SelectPstatePolicy,  
PPSMC_MSG_SelectPstatePolicy,  0),
MSG_MAP(ResetSDMA,   PPSMC_MSG_ResetSDMA,   
0),
+   MSG_MAP(ResetSDMA2,  PPSMC_MSG_ResetSDMA2,  
0),
 };
 
 // clang-format on
-- 
2.25.1



[PATCH 3/3] drm/amdgpu/pm: Implement SDMA queue reset for different ASICs

2024-12-13 Thread jesse.zh...@amd.com
From: "jesse.zh...@amd.com" 

Implement SDMA queue reset via SMU_MSG_ResetSDMA2.

Signed-off-by: Jesse Zhang 
---
 .../drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c  | 28 ++-
 1 file changed, 21 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
index 9222e7a777a6..e57d83099f4c 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
@@ -2724,14 +2724,28 @@ static int smu_v13_0_6_reset_sdma(struct smu_context 
*smu, uint32_t inst_mask)
struct amdgpu_device *adev = smu->adev;
int ret = 0;
 
-   /* the message is only valid on SMU 13.0.6 with pmfw 85.121.00 and 
above */
-   if ((adev->flags & AMD_IS_APU) ||
-   amdgpu_ip_version(adev, MP1_HWIP, 0) != IP_VERSION(13, 0, 6) ||
-   smu->smc_fw_version < 0x00557900)
-   return 0;
+   switch (amdgpu_ip_version(smu->adev, MP1_HWIP, 0)) {
+   case IP_VERSION(13, 0, 6):
+   if (((smu->smc_fw_version > 0x07550763) &&
+   (smu->smc_fw_version < 0x08000)) ||
+   ((smu->smc_fw_version > 0x00557700) &&
+   (smu->smc_fw_version < 0x0100)))
+   ret = smu_cmn_send_smc_msg_with_param(smu,
+   SMU_MSG_ResetSDMA, inst_mask, NULL);
+   else if ((adev->flags & AMD_IS_APU) &&
+   (smu->smc_fw_version > 0x4556e6c))
+   ret = smu_cmn_send_smc_msg_with_param(smu,
+ SMU_MSG_ResetSDMA2, inst_mask, NULL);
+   break;
+   case IP_VERSION(13, 0, 14):
+   if (smu->smc_fw_version > 0x05550f00)
+   ret = smu_cmn_send_smc_msg_with_param(smu,
+ SMU_MSG_ResetSDMA2, inst_mask, NULL);
+   break;
+   default:
+   break;
+   }
 
-   ret = smu_cmn_send_smc_msg_with_param(smu,
- SMU_MSG_ResetSDMA, inst_mask, 
NULL);
if (ret)
dev_err(smu->adev->dev,
"failed to send ResetSDMA event with mask 0x%x\n",
-- 
2.25.1



[PATCH 2/2] drm/amdgpu/gfx10: implement gfx queue reset via MMIO

2025-01-08 Thread jesse.zh...@amd.com
Implement GFX10 KGQ reset via MMIO.

Signed-off-by: Jesse Zhang 
---
 drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 98 ++
 1 file changed, 70 insertions(+), 28 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
index 89409cb7d195..aac250c121d3 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
@@ -9464,6 +9464,8 @@ static int gfx_v10_0_queue_reset(struct amdgpu_ring 
*ring, uint32_t vmid, bool u
struct amdgpu_ring *kiq_ring = &kiq->ring;
uint32_t queue_type = ring->funcs->type;
unsigned long flags;
+   uint32_t tmp;
+   uint64_t addr;
int i, r = 0;
 
if (use_mmio) {
@@ -9498,6 +9500,40 @@ static int gfx_v10_0_queue_reset(struct amdgpu_ring 
*ring, uint32_t vmid, bool u
nv_grbm_select(adev, 0, 0, 0, 0);
mutex_unlock(&adev->srbm_mutex);
amdgpu_gfx_rlc_exit_safe_mode(adev, 0);
+   } else if (queue_type == AMDGPU_RING_TYPE_GFX) {
+   amdgpu_gfx_rlc_enter_safe_mode(adev, 0);
+   mutex_lock(&adev->srbm_mutex);
+   nv_grbm_select(adev, ring->me, ring->pipe, ring->queue, 
0);
+
+   WREG32_SOC15(GC, 0, mmGRBM_GFX_INDEX,
+(uint32_t)(0x1 << 
GRBM_GFX_INDEX__SE_BROADCAST_WRITES__SHIFT));
+   tmp = REG_SET_FIELD(0, CP_VMID_RESET, RESET_REQUEST, 1 
<< vmid);
+   if (ring->pipe == 0)
+   tmp = REG_SET_FIELD(tmp, CP_VMID_RESET, 
PIPE0_QUEUES, 1 << ring->queue);
+   else
+   tmp = REG_SET_FIELD(tmp, CP_VMID_RESET, 
PIPE1_QUEUES, 1 << ring->queue);
+   WREG32_SOC15(GC, 0, mmCP_VMID_RESET, tmp);
+
+
+   nv_grbm_select(adev, 0, 0, 0, 0);
+   mutex_unlock(&adev->srbm_mutex);
+   amdgpu_gfx_rlc_exit_safe_mode(adev, 0);
+
+   mutex_lock(&adev->srbm_mutex);
+   nv_grbm_select(adev, ring->me, ring->pipe, ring->queue, 
0);
+   /* wait till dequeue take effects */
+   for (i = 0; i < adev->usec_timeout; i++) {
+   if (!(RREG32_SOC15(GC, 0, mmCP_GFX_HQD_ACTIVE) 
& 1))
+   break;
+   udelay(1);
+   }
+   if (i >= adev->usec_timeout) {
+   dev_err(adev->dev, "failed to wait on gfx hqd 
deactivate\n");
+   r = -ETIMEDOUT;
+   }
+
+   nv_grbm_select(adev, 0, 0, 0, 0);
+   mutex_unlock(&adev->srbm_mutex);
}
} else {
if (queue_type == AMDGPU_RING_TYPE_COMPUTE) {
@@ -9537,6 +9573,28 @@ static int gfx_v10_0_queue_reset(struct amdgpu_ring 
*ring, uint32_t vmid, bool u
dev_err(adev->dev, "fail to wait on hqd 
deactivate\n");
}
 
+   } else if (queue_type == AMDGPU_RING_TYPE_GFX) {
+   if (amdgpu_ring_alloc(kiq_ring, 5 + 7 + 7)) {
+   spin_unlock_irqrestore(&kiq->ring_lock, flags);
+   return -ENOMEM;
+   }
+
+   addr = amdgpu_bo_gpu_offset(ring->mqd_obj) +
+   offsetof(struct v10_gfx_mqd, cp_gfx_hqd_active);
+   tmp = REG_SET_FIELD(0, CP_VMID_RESET, RESET_REQUEST, 1 
<< vmid);
+   if (ring->pipe == 0)
+   tmp = REG_SET_FIELD(tmp, CP_VMID_RESET, 
PIPE0_QUEUES, 1 << ring->queue);
+   else
+   tmp = REG_SET_FIELD(tmp, CP_VMID_RESET, 
PIPE1_QUEUES, 1 << ring->queue);
+
+   gfx_v10_0_ring_emit_wreg(kiq_ring,
+SOC15_REG_OFFSET(GC, 0, 
mmCP_VMID_RESET), tmp);
+   gfx_v10_0_wait_reg_mem(kiq_ring, 0, 1, 0,
+  lower_32_bits(addr), 
upper_32_bits(addr),
+  0, 1, 0x20);
+   gfx_v10_0_ring_emit_reg_wait(kiq_ring,
+SOC15_REG_OFFSET(GC, 0, 
mmCP_VMID_RESET), 0, 0x);
+   amdgpu_ring_commit(kiq_ring);
}
}
 
@@ -9549,8 +9607,6 @@ static int gfx_v10_0_reset_kgq(struct amdgpu_ring *ring, 
unsigned int vmid)
struct amdgpu_kiq *kiq = &adev->gfx.kiq[0];
struct amdgpu_ring *kiq_ring = &kiq->ring;
unsigned long flags;
-   u32 tmp;
-   u64 addr;
int r;
 
if (amdgpu_sriov_vf(adev))
@@ -9560,

[PATCH 1/2] drm/amdgpu/gfx10: implement queue reset via MMIO

2025-01-08 Thread jesse.zh...@amd.com
Implement GFX10 KCQ reset via MMIO.

Signed-off-by: Jesse Zhang 
---
 drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 121 ++---
 1 file changed, 88 insertions(+), 33 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
index 88393c2c08e4..89409cb7d195 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
@@ -9457,6 +9457,92 @@ static void gfx_v10_ring_insert_nop(struct amdgpu_ring 
*ring, uint32_t num_nop)
amdgpu_ring_insert_nop(ring, num_nop - 1);
 }
 
+static int gfx_v10_0_queue_reset(struct amdgpu_ring *ring, uint32_t vmid, bool 
use_mmio)
+{
+   struct amdgpu_device *adev = ring->adev;
+   struct amdgpu_kiq *kiq = &adev->gfx.kiq[0];
+   struct amdgpu_ring *kiq_ring = &kiq->ring;
+   uint32_t queue_type = ring->funcs->type;
+   unsigned long flags;
+   int i, r = 0;
+
+   if (use_mmio) {
+   if (queue_type == AMDGPU_RING_TYPE_COMPUTE) {
+   amdgpu_gfx_rlc_enter_safe_mode(adev, 0);
+   mutex_lock(&adev->srbm_mutex);
+   nv_grbm_select(adev, ring->me, ring->pipe, ring->queue, 
0);
+
+   WREG32_SOC15(GC, 0, mmCP_HQD_DEQUEUE_REQUEST, 0x2);
+   WREG32_SOC15(GC, 0, mmSPI_COMPUTE_QUEUE_RESET, 0x1);
+
+   nv_grbm_select(adev, 0, 0, 0, 0);
+   mutex_unlock(&adev->srbm_mutex);
+   amdgpu_gfx_rlc_exit_safe_mode(adev, 0);
+
+   /* Make sure dequeue is complete */
+   amdgpu_gfx_rlc_enter_safe_mode(adev, 0);
+   mutex_lock(&adev->srbm_mutex);
+   nv_grbm_select(adev, ring->me, ring->pipe, ring->queue, 
0);
+
+   for (i = 0; i < adev->usec_timeout; i++) {
+   if (!(RREG32_SOC15(GC, 0, mmCP_HQD_ACTIVE) & 1))
+   break;
+   udelay(1);
+   }
+
+   if (i >= adev->usec_timeout) {
+   r = -ETIMEDOUT;
+   dev_err(adev->dev, "fail to wait on hqd 
deactivate\n");
+   }
+
+   nv_grbm_select(adev, 0, 0, 0, 0);
+   mutex_unlock(&adev->srbm_mutex);
+   amdgpu_gfx_rlc_exit_safe_mode(adev, 0);
+   }
+   } else {
+   if (queue_type == AMDGPU_RING_TYPE_COMPUTE) {
+   spin_lock_irqsave(&kiq->ring_lock, flags);
+
+   if (amdgpu_ring_alloc(kiq_ring, 
kiq->pmf->unmap_queues_size)) {
+   spin_unlock_irqrestore(&kiq->ring_lock, flags);
+   return -ENOMEM;
+   }
+
+   kiq->pmf->kiq_unmap_queues(kiq_ring, ring, 
RESET_QUEUES, 0, 0);
+
+   amdgpu_ring_commit(kiq_ring);
+   spin_unlock_irqrestore(&kiq->ring_lock, flags);
+
+   r = amdgpu_ring_test_ring(kiq_ring);
+   if (r)
+   return r;
+
+   /* Make sure dequeue is complete */
+   amdgpu_gfx_rlc_enter_safe_mode(adev, 0);
+   mutex_lock(&adev->srbm_mutex);
+   nv_grbm_select(adev, ring->me, ring->pipe, ring->queue, 
0);
+
+   for (i = 0; i < adev->usec_timeout; i++) {
+   if (!(RREG32_SOC15(GC, 0, mmCP_HQD_ACTIVE) & 1))
+   break;
+   udelay(1);
+   }
+   if (i >= adev->usec_timeout)
+   r = -ETIMEDOUT;
+   nv_grbm_select(adev, 0, 0, 0, 0);
+   mutex_unlock(&adev->srbm_mutex);
+   amdgpu_gfx_rlc_exit_safe_mode(adev, 0);
+   if (i >= adev->usec_timeout) {
+   r = -ETIMEDOUT;
+   dev_err(adev->dev, "fail to wait on hqd 
deactivate\n");
+   }
+
+   }
+   }
+
+   return r;
+}
+
 static int gfx_v10_0_reset_kgq(struct amdgpu_ring *ring, unsigned int vmid)
 {
struct amdgpu_device *adev = ring->adev;
@@ -9531,7 +9617,7 @@ static int gfx_v10_0_reset_kcq(struct amdgpu_ring *ring,
struct amdgpu_kiq *kiq = &adev->gfx.kiq[0];
struct amdgpu_ring *kiq_ring = &kiq->ring;
unsigned long flags;
-   int i, r;
+   int r;
 
if (amdgpu_sriov_vf(adev))
return -EINVAL;
@@ -9539,41 +9625,10 @@ static int gfx_v10_0_reset_kcq(struct amdgpu_ring *ring,
if (!kiq->pmf || !kiq->pmf->kiq_unmap_queues)
return -EINVAL;
 
-   spin_lock_irqsave(&kiq->ring_lock, flags);
-

[PATCH 1/2 V2] drm/amdgpu/gfx10: implement queue reset via MMIO

2025-01-09 Thread jesse.zh...@amd.com
From: "jesse.zh...@amd.com" 

Using mmio to do queue reset.

v2: Alignment this function with gfx9/gfx9.4.3.

Signed-off-by: Jesse Zhang 
---
 drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 34 ++
 1 file changed, 34 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
index 5ba263fe5512..6157e1126030 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
@@ -3790,12 +3790,46 @@ static void gfx10_kiq_invalidate_tlbs(struct 
amdgpu_ring *kiq_ring,
gfx_v10_0_ring_invalidate_tlbs(kiq_ring, pasid, flush_type, all_hub, 1);
 }
 
+static void gfx_v10_0_kiq_reset_hw_queue(struct amdgpu_ring *kiq_ring, 
uint32_t queue_type,
+   uint32_t me_id, uint32_t pipe_id, 
uint32_t queue_id,
+   uint32_t xcc_id, uint32_t vmid)
+{
+   struct amdgpu_device *adev = kiq_ring->adev;
+   unsigned i;
+
+   /* enter save mode */
+   amdgpu_gfx_rlc_enter_safe_mode(adev, xcc_id);
+   mutex_lock(&adev->srbm_mutex);
+   nv_grbm_select(adev, me_id, pipe_id, queue_id, 0);
+
+   if (queue_type == AMDGPU_RING_TYPE_COMPUTE) {
+   WREG32_SOC15(GC, 0, mmCP_HQD_DEQUEUE_REQUEST, 0x2);
+   WREG32_SOC15(GC, 0, mmSPI_COMPUTE_QUEUE_RESET, 0x1);
+   /* wait till dequeue take effects */
+   for (i = 0; i < adev->usec_timeout; i++) {
+   if (!(RREG32_SOC15(GC, 0, mmCP_HQD_ACTIVE) & 1))
+   break;
+   udelay(1);
+   }
+   if (i >= adev->usec_timeout)
+   dev_err(adev->dev, "fail to wait on hqd deactive\n");
+   } else {
+   dev_err(adev->dev, "reset queue_type(%d) not supported\n", 
queue_type);
+   }
+
+   nv_grbm_select(adev, 0, 0, 0, 0);
+   mutex_unlock(&adev->srbm_mutex);
+   /* exit safe mode */
+   amdgpu_gfx_rlc_exit_safe_mode(adev, 0);
+}
+
 static const struct kiq_pm4_funcs gfx_v10_0_kiq_pm4_funcs = {
.kiq_set_resources = gfx10_kiq_set_resources,
.kiq_map_queues = gfx10_kiq_map_queues,
.kiq_unmap_queues = gfx10_kiq_unmap_queues,
.kiq_query_status = gfx10_kiq_query_status,
.kiq_invalidate_tlbs = gfx10_kiq_invalidate_tlbs,
+   .kiq_reset_hw_queue = gfx_v10_0_kiq_reset_hw_queue,
.set_resources_size = 8,
.map_queues_size = 7,
.unmap_queues_size = 6,
-- 
2.25.1



[PATCH 2/2 V2] drm/amdgpu/gfx10: implement gfx queue reset via MMIO

2025-01-09 Thread jesse.zh...@amd.com
From: "jesse.zh...@amd.com" 

Using mmio to do queue reset

v2: Alignment the function with gfx9/gfx9.4.3.

Signed-off-by: Jesse Zhang adev;
unsigned i;
+   uint32_t tmp;
 
/* enter save mode */
amdgpu_gfx_rlc_enter_safe_mode(adev, xcc_id);
@@ -3813,7 +3814,25 @@ static void gfx_v10_0_kiq_reset_hw_queue(struct 
amdgpu_ring *kiq_ring, uint32_t
}
if (i >= adev->usec_timeout)
dev_err(adev->dev, "fail to wait on hqd deactive\n");
-   } else {
+   } else if (queue_type == AMDGPU_RING_TYPE_GFX) {
+   WREG32_SOC15(GC, 0, mmGRBM_GFX_INDEX,
+(uint32_t)(0x1 << 
GRBM_GFX_INDEX__SE_BROADCAST_WRITES__SHIFT));
+   tmp = REG_SET_FIELD(0, CP_VMID_RESET, RESET_REQUEST, 1 << vmid);
+   if (pipe_id == 0)
+   tmp = REG_SET_FIELD(tmp, CP_VMID_RESET, PIPE0_QUEUES, 1 
<< queue_id);
+   else
+   tmp = REG_SET_FIELD(tmp, CP_VMID_RESET, PIPE1_QUEUES, 1 
<< queue_id);
+   WREG32_SOC15(GC, 0, mmCP_VMID_RESET, tmp);
+
+   /* wait till dequeue take effects */
+   for (i = 0; i < adev->usec_timeout; i++) {
+   if (!(RREG32_SOC15(GC, 0, mmCP_GFX_HQD_ACTIVE) & 1))
+   break;
+   udelay(1);
+   }
+   if (i >= adev->usec_timeout)
+   dev_err(adev->dev, "failed to wait on gfx hqd 
deactivate\n");
+   }else {
dev_err(adev->dev, "reset queue_type(%d) not supported\n", 
queue_type);
}
 
-- 
2.25.1



[PATCH 1/3] revert "drm/amdgpu/pm: add definition PPSMC_MSG_ResetSDMA2"

2025-01-14 Thread jesse.zh...@amd.com
From: "jesse.zh...@amd.com" 

pmfw now unifies PPSMC_MSG_ResetSDMA definitions for different devices.
PPSMC_MSG_ResetSDMA2 is not needed.

Signed-off-by: Jesse Zhang 
---
 drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu_v13_0_6_ppsmc.h | 1 -
 drivers/gpu/drm/amd/pm/swsmu/inc/smu_types.h | 3 +--
 drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c | 1 -
 3 files changed, 1 insertion(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu_v13_0_6_ppsmc.h 
b/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu_v13_0_6_ppsmc.h
index 7b65a27fb302..147bfb12fd75 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu_v13_0_6_ppsmc.h
+++ b/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu_v13_0_6_ppsmc.h
@@ -93,7 +93,6 @@
 #define PPSMC_MSG_SelectPLPDMode0x40
 #define PPSMC_MSG_RmaDueToBadPageThreshold  0x43
 #define PPSMC_MSG_SelectPstatePolicy0x44
-#define PPSMC_MSG_ResetSDMA20x45
 #define PPSMC_MSG_ResetSDMA 0x4D
 #define PPSMC_Message_Count 0x4E
 
diff --git a/drivers/gpu/drm/amd/pm/swsmu/inc/smu_types.h 
b/drivers/gpu/drm/amd/pm/swsmu/inc/smu_types.h
index b0dab9797c70..e4cd6a0d13da 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/inc/smu_types.h
+++ b/drivers/gpu/drm/amd/pm/swsmu/inc/smu_types.h
@@ -276,8 +276,7 @@
__SMU_DUMMY_MAP(SelectPstatePolicy), \
__SMU_DUMMY_MAP(MALLPowerController), \
__SMU_DUMMY_MAP(MALLPowerState), \
-   __SMU_DUMMY_MAP(ResetSDMA), \
-   __SMU_DUMMY_MAP(ResetSDMA2),
+   __SMU_DUMMY_MAP(ResetSDMA),
 
 #undef __SMU_DUMMY_MAP
 #define __SMU_DUMMY_MAP(type)  SMU_MSG_##type
diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
index 8ab30b2f7119..4ec339d17499 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
@@ -209,7 +209,6 @@ static const struct cmn2asic_msg_mapping 
smu_v13_0_6_message_map[SMU_MSG_MAX_COU
MSG_MAP(RmaDueToBadPageThreshold,
PPSMC_MSG_RmaDueToBadPageThreshold,0),
MSG_MAP(SelectPstatePolicy,  
PPSMC_MSG_SelectPstatePolicy,  0),
MSG_MAP(ResetSDMA,   PPSMC_MSG_ResetSDMA,   
0),
-   MSG_MAP(ResetSDMA2,  PPSMC_MSG_ResetSDMA2,  
0),
 };
 
 // clang-format on
-- 
2.25.1



[PATCH 3/3] drm/amd/pm: Refactor SMU 13.0.6 SDMA reset firmware version checks

2025-01-14 Thread jesse.zh...@amd.com
From: "jesse.zh...@amd.com" 

This patch refactors the firmware version checks in `smu_v13_0_6_reset_sdma`
to support multiple SMU programs with different firmware version thresholds.

Signed-off-by: Jesse Zhang 
---
 .../gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c   | 14 +-
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
index 650aa9d0548a..5a5742571d29 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
@@ -2745,11 +2745,15 @@ static int smu_v13_0_6_reset_sdma(struct smu_context 
*smu, uint32_t inst_mask)
 {
struct amdgpu_device *adev = smu->adev;
int ret = 0;
-
-   /* the message is only valid on SMU 13.0.6 with pmfw 85.121.00 and 
above */
-   if ((adev->flags & AMD_IS_APU) ||
-   amdgpu_ip_version(adev, MP1_HWIP, 0) != IP_VERSION(13, 0, 6) ||
-   smu->smc_fw_version < 0x00557900)
+   uint32_t smu_program;
+
+   smu_program = (smu->smc_fw_version >> 24) & 0xff;
+   /* the message is only valid on SMU 13.0.6 with these pmfw and above */
+   if (amdgpu_ip_version(adev, MP1_HWIP, 0) != IP_VERSION(13, 0, 6) ||
+   ((smu_program == 0) && (smu->smc_fw_version < 0x00557900)) ||
+   ((smu_program == 4) && (smu->smc_fw_version < 0x4556e00)) ||
+   ((smu_program == 5) && (smu->smc_fw_version < 0x5551200)) ||
+   ((smu_program == 7) && (smu->smc_fw_version < 0x7550700)))
return 0;
 
ret = smu_cmn_send_smc_msg_with_param(smu,
-- 
2.25.1



[PATCH 2/3] revert "drm/amdgpu/pm: Implement SDMA queue reset for different asic"

2025-01-14 Thread jesse.zh...@amd.com
From: "jesse.zh...@amd.com" 

pmfw unified PPSMC_MSG_ResetSDMA definitions for different devices.
PPSMC_MSG_ResetSDMA2 is not needed.

Signed-off-by: Jesse Zhang 
---
 .../drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c  | 30 +--
 1 file changed, 8 insertions(+), 22 deletions(-)

diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
index 4ec339d17499..650aa9d0548a 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
@@ -2743,31 +2743,17 @@ static int smu_v13_0_6_send_rma_reason(struct 
smu_context *smu)
 
 static int smu_v13_0_6_reset_sdma(struct smu_context *smu, uint32_t inst_mask)
 {
-   uint32_t smu_program;
+   struct amdgpu_device *adev = smu->adev;
int ret = 0;
 
-   smu_program = (smu->smc_fw_version >> 24) & 0xff;
-   switch (amdgpu_ip_version(smu->adev, MP1_HWIP, 0)) {
-   case IP_VERSION(13, 0, 6):
-   if (((smu_program == 7) && (smu->smc_fw_version > 0x07550700)) 
||
-   ((smu_program == 0) && (smu->smc_fw_version > 
0x00557700)))
-   ret = smu_cmn_send_smc_msg_with_param(smu,
-   SMU_MSG_ResetSDMA, inst_mask, NULL);
-   else if ((smu_program == 4) &&
-   (smu->smc_fw_version > 0x4556e6c))
-   ret = smu_cmn_send_smc_msg_with_param(smu,
- SMU_MSG_ResetSDMA2, inst_mask, NULL);
-   break;
-   case IP_VERSION(13, 0, 14):
-   if ((smu_program == 5) &&
-   (smu->smc_fw_version > 0x05550f00))
-   ret = smu_cmn_send_smc_msg_with_param(smu,
- SMU_MSG_ResetSDMA2, inst_mask, NULL);
-   break;
-   default:
-   break;
-   }
+   /* the message is only valid on SMU 13.0.6 with pmfw 85.121.00 and 
above */
+   if ((adev->flags & AMD_IS_APU) ||
+   amdgpu_ip_version(adev, MP1_HWIP, 0) != IP_VERSION(13, 0, 6) ||
+   smu->smc_fw_version < 0x00557900)
+   return 0;
 
+   ret = smu_cmn_send_smc_msg_with_param(smu,
+ SMU_MSG_ResetSDMA, inst_mask, 
NULL);
if (ret)
dev_err(smu->adev->dev,
"failed to send ResetSDMA event with mask 0x%x\n",
-- 
2.25.1



[PATCH 3/3 V2] drm/amd/pm: Refactor SMU 13.0.6 SDMA reset firmware version checks

2025-01-15 Thread jesse.zh...@amd.com
From: "jesse.zh...@amd.com" 

This patch refactors the firmware version checks in `smu_v13_0_6_reset_sdma`
to support multiple SMU programs with different firmware version thresholds.

V2: return -EOPNOTSUPP for unsupported pmfw

Suggested-by: Lazar Lijo 
Signed-off-by: Jesse Zhang 
---
 .../drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c  | 23 ++-
 1 file changed, 17 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
index 650aa9d0548a..f68282238303 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
@@ -2745,12 +2745,23 @@ static int smu_v13_0_6_reset_sdma(struct smu_context 
*smu, uint32_t inst_mask)
 {
struct amdgpu_device *adev = smu->adev;
int ret = 0;
-
-   /* the message is only valid on SMU 13.0.6 with pmfw 85.121.00 and 
above */
-   if ((adev->flags & AMD_IS_APU) ||
-   amdgpu_ip_version(adev, MP1_HWIP, 0) != IP_VERSION(13, 0, 6) ||
-   smu->smc_fw_version < 0x00557900)
-   return 0;
+   uint32_t smu_program;
+
+   smu_program = (smu->smc_fw_version >> 24) & 0xff;
+   /* the message is only valid on SMU 13.0.6 with these pmfw and above */
+   if (amdgpu_ip_version(adev, MP1_HWIP, 0) != IP_VERSION(13, 0, 6) ||
+   ((smu_program == 0) && (smu->smc_fw_version < 0x00557900)) ||
+   ((smu_program == 4) && (smu->smc_fw_version < 0x4557000)) ||
+   ((smu_program == 5) && (smu->smc_fw_version < 0x5551200)) ||
+   ((smu_program == 7) && (smu->smc_fw_version < 0x7550700))) {
+   dev_err(smu->adev->dev,
+   "ResetSDMA not supported: SMU program %u requires PMFW >= 
0x%x\n"
+   "Current PMFW version: 0x%x\n",smu_program,
+   smu_program == 0 ? 0x00557900 : smu_program == 4 ? 0x4557000 :
+   smu_program == 5 ? 0x5551200 : smu_program == 7 ? 0x7550700 : 0,
+   smu->smc_fw_version);
+   return -EOPNOTSUPP;
+   }
 
ret = smu_cmn_send_smc_msg_with_param(smu,
  SMU_MSG_ResetSDMA, inst_mask, 
NULL);
-- 
2.25.1



[PATCH] drm/amdgpu: Use -ENODATA for GPU job timeout queue recovery

2025-01-14 Thread jesse.zh...@amd.com
When a GPU job times out, the driver attempts to recover by restarting
the scheduler. Previously, the scheduler was restarted with an error
code of 0, which does not distinguish between a full GPU reset and a
queue reset. This patch changes the error code to -ENODATA for queue
resets, while -ECANCELED or -ETIME is used for full GPU resets.

This change improves error handling by:
1. Clearly differentiating between queue resets and full GPU resets.
2. Providing more specific error codes for better debugging and recovery.
3. Aligning with kernel best practices for error reporting.

The related commit "b2ef808786d93df3658" (drm/sched: add optional errno
to drm_sched_start())
introduced support for passing an error code to
drm_sched_start(), enabling this improvement.

Signed-off-by: Vitaly Prosyak 
Signed-off-by: Jesse Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
index 100f04475943..b18b316872a0 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
@@ -148,7 +148,7 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct 
drm_sched_job *s_job)
atomic_inc(&ring->adev->gpu_reset_counter);
amdgpu_fence_driver_force_completion(ring);
if (amdgpu_ring_sched_ready(ring))
-   drm_sched_start(&ring->sched, 0);
+   drm_sched_start(&ring->sched, -ENODATA);
goto exit;
}
dev_err(adev->dev, "Ring %s reset failure\n", ring->sched.name);
-- 
2.25.1



[PATCH V3] drm/amd/pm: Refactor SMU 13.0.6 SDMA reset firmware version checks

2025-01-16 Thread jesse.zh...@amd.com
From: "jesse.zh...@amd.com" 

This patch refactors the firmware version checks in `smu_v13_0_6_reset_sdma`
to support multiple SMU programs with different firmware version thresholds.

V2: return -EOPNOTSUPP for unsupported pmfw
V3: exclude IP_VERSION(13, 0, 12), which is not supported.

Suggested-by: Lazar Lijo 
Signed-off-by: Jesse Zhang 
Reviewed-by: Alex Deucher 
---
 .../drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c  | 23 ++-
 1 file changed, 17 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
index 35e2f0662fb5..591466e90f8b 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
@@ -2746,12 +2746,23 @@ static int smu_v13_0_6_reset_sdma(struct smu_context 
*smu, uint32_t inst_mask)
 {
struct amdgpu_device *adev = smu->adev;
int ret = 0;
-
-   /* the message is only valid on SMU 13.0.6 with pmfw 85.121.00 and 
above */
-   if ((adev->flags & AMD_IS_APU) ||
-   amdgpu_ip_version(adev, MP1_HWIP, 0) != IP_VERSION(13, 0, 6) ||
-   smu->smc_fw_version < 0x00557900)
-   return 0;
+   uint32_t smu_program;
+
+   smu_program = (smu->smc_fw_version >> 24) & 0xff;
+   /* the message is only valid on SMU 13.0.6/13.0.14 with these pmfw and 
above */
+   if (amdgpu_ip_version(adev, MP1_HWIP, 0) != IP_VERSION(13, 0, 12) ||
+   ((smu_program == 0) && (smu->smc_fw_version < 0x00557900)) ||
+   ((smu_program == 4) && (smu->smc_fw_version < 0x4557000)) ||
+   ((smu_program == 5) && (smu->smc_fw_version < 0x5551200)) ||
+   ((smu_program == 7) && (smu->smc_fw_version < 0x7550700))) {
+   dev_err(smu->adev->dev,
+   "ResetSDMA not supported: SMU program %u requires PMFW >= 
0x%x\n"
+   "Current PMFW version: 0x%x\n", smu_program,
+   smu_program == 0 ? 0x00557900 : smu_program == 4 ? 0x4557000 :
+   smu_program == 5 ? 0x5551200 : smu_program == 7 ? 0x7550700 : 0,
+   smu->smc_fw_version);
+   return -EOPNOTSUPP;
+   }
 
ret = smu_cmn_send_smc_msg_with_param(smu,
  SMU_MSG_ResetSDMA, inst_mask, 
NULL);
-- 
2.25.1



[PATCH] drm/amd/pm: Refactor SMU 13.0.6 SDMA reset firmware version checks

2025-01-16 Thread jesse.zh...@amd.com
From: "jesse.zh...@amd.com" 

This patch refactors the firmware version checks in `smu_v13_0_6_reset_sdma`
to support multiple SMU programs with different firmware version thresholds.

V2: return -EOPNOTSUPP for unsupported pmfw
V3: exclude IP_VERSION(13, 0, 12), which is not supported.

Suggested-by: Lazar Lijo 
Signed-off-by: Jesse Zhang 
Reviewed-by: Alex Deucher 
---
 .../drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c  | 23 ++-
 1 file changed, 17 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
index 35e2f0662fb5..31d9aafe0e83 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
@@ -2746,12 +2746,23 @@ static int smu_v13_0_6_reset_sdma(struct smu_context 
*smu, uint32_t inst_mask)
 {
struct amdgpu_device *adev = smu->adev;
int ret = 0;
-
-   /* the message is only valid on SMU 13.0.6 with pmfw 85.121.00 and 
above */
-   if ((adev->flags & AMD_IS_APU) ||
-   amdgpu_ip_version(adev, MP1_HWIP, 0) != IP_VERSION(13, 0, 6) ||
-   smu->smc_fw_version < 0x00557900)
-   return 0;
+   uint32_t smu_program;
+
+   smu_program = (smu->smc_fw_version >> 24) & 0xff;
+   /* the message is only valid on SMU 13.0.6/13.0.14 with these pmfw and 
above */
+   if (amdgpu_ip_version(adev, MP1_HWIP, 0) == IP_VERSION(13, 0, 12) ||
+   ((smu_program == 0) && (smu->smc_fw_version < 0x00557900)) ||
+   ((smu_program == 4) && (smu->smc_fw_version < 0x4557000)) ||
+   ((smu_program == 5) && (smu->smc_fw_version < 0x5551200)) ||
+   ((smu_program == 7) && (smu->smc_fw_version < 0x7550700))) {
+   dev_err(smu->adev->dev,
+   "ResetSDMA not supported: SMU program %u requires PMFW >= 
0x%x\n"
+   "Current PMFW version: 0x%x\n", smu_program,
+   smu_program == 0 ? 0x00557900 : smu_program == 4 ? 0x4557000 :
+   smu_program == 5 ? 0x5551200 : smu_program == 7 ? 0x7550700 : 0,
+   smu->smc_fw_version);
+   return -EOPNOTSUPP;
+   }
 
ret = smu_cmn_send_smc_msg_with_param(smu,
  SMU_MSG_ResetSDMA, inst_mask, 
NULL);
-- 
2.25.1



[PATCH] drm/amdgpu: Fix SDMA engine reset logic

2025-03-16 Thread jesse.zh...@amd.com
The scheduler should restart only if the reset operation
succeeds. This ensures that new tasks are only submitted
to the queues after a successful reset.

Fixes: 9b5d66721b66308a5 ("drm/amdgpu: Introduce conditional user queue 
suspension for SDMA resets")

Suggested-by: Alex Deucher 
Signed-off-by: Jesse.Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
index 3a4cef896018..1334c209201f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
@@ -609,7 +609,7 @@ int amdgpu_sdma_reset_engine(struct amdgpu_device *adev, 
uint32_t instance_id, b
 * if they were stopped by this function. This allows new tasks
 * to be submitted to the queues after the reset is complete.
 */
-   if (ret) {
+   if (!ret) {
if (gfx_sched_stopped && amdgpu_ring_sched_ready(gfx_ring)) {
drm_sched_wqueue_start(&gfx_ring->sched);
}
-- 
2.25.1



[PATCH V9 1/3] drm/amd/amdgpu: Increase max rings to enable SDMA page ring

2025-03-19 Thread jesse.zh...@amd.com
From: "jesse.zh...@amd.com" 

Increase the maximum number of rings supported by the AMDGPU driver from 133 to 
149.
This change is necessary to enable support for the SDMA page ring.

Signed-off-by: Jesse Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
index b4fd1e17205e..bb2b66385223 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
@@ -37,7 +37,7 @@ struct amdgpu_job;
 struct amdgpu_vm;
 
 /* max number of rings */
-#define AMDGPU_MAX_RINGS   133
+#define AMDGPU_MAX_RINGS   149
 #define AMDGPU_MAX_HWIP_RINGS  64
 #define AMDGPU_MAX_GFX_RINGS   2
 #define AMDGPU_MAX_SW_GFX_RINGS 2
-- 
2.25.1



[PATCH v9 2/3] drm/amdgpu: Optimize VM invalidation engine allocation and synchronize GPU TLB flush

2025-03-19 Thread jesse.zh...@amd.com
From: "jesse.zh...@amd.com" 

- Modify the VM invalidation engine allocation logic to handle SDMA page rings.
  SDMA page rings now share the VM invalidation engine with SDMA gfx rings 
instead of
  allocating a separate engine. This change ensures efficient resource 
management and
  avoids the issue of insufficient VM invalidation engines.

- Add synchronization for GPU TLB flush operations in gmc_v9_0.c.
  Use spin_lock and spin_unlock to ensure thread safety and prevent race 
conditions
  during TLB flush operations. This improves the stability and reliability of 
the driver,
  especially in multi-threaded environments.

 v2: replace the sdma ring check with a function `amdgpu_sdma_is_page_queue`
 to check if a ring is an SDMA page queue.(Lijo)

 v3: Add GC version check, only enabled on GC9.4.3/9.4.4/9.5.0
 v4: Fix code style and add more detailed description (Christian)
 v5: Remove dependency on vm_inv_eng loop order, explicitly lookup shared 
inv_eng(Christian/Lijo)
 v6: Added search shared ring function amdgpu_sdma_get_shared_ring (Lijo)

Suggested-by: Lijo Lazar 
Signed-off-by: Jesse Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c  | 20 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c | 33 +++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h |  3 +++
 3 files changed, 55 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
index 4eefa17fa39b..464625282872 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
@@ -573,6 +573,7 @@ int amdgpu_gmc_allocate_vm_inv_eng(struct amdgpu_device 
*adev)
unsigned vm_inv_engs[AMDGPU_MAX_VMHUBS] = {0};
unsigned i;
unsigned vmhub, inv_eng;
+   struct amdgpu_ring *shared_ring;
 
/* init the vm inv eng for all vmhubs */
for_each_set_bit(i, adev->vmhubs_mask, AMDGPU_MAX_VMHUBS) {
@@ -595,6 +596,10 @@ int amdgpu_gmc_allocate_vm_inv_eng(struct amdgpu_device 
*adev)
ring == &adev->cper.ring_buf)
continue;
 
+   /* Skip if the ring is a shared ring */
+   if (amdgpu_sdma_is_shared_inv_eng(adev, ring))
+   continue;
+
inv_eng = ffs(vm_inv_engs[vmhub]);
if (!inv_eng) {
dev_err(adev->dev, "no VM inv eng for ring %s\n",
@@ -607,6 +612,21 @@ int amdgpu_gmc_allocate_vm_inv_eng(struct amdgpu_device 
*adev)
 
dev_info(adev->dev, "ring %s uses VM inv eng %u on hub %u\n",
 ring->name, ring->vm_inv_eng, ring->vm_hub);
+   /* SDMA has a special packet which allows it to use the same
+* invalidation engine for all the rings in one instance.
+* Therefore, we do not allocate a separate VM invalidation 
engine
+* for SDMA page rings. Instead, they share the VM invalidation
+* engine with the SDMA gfx ring. This change ensures efficient
+* resource management and avoids the issue of insufficient VM
+* invalidation engines.
+*/
+   shared_ring = amdgpu_sdma_get_shared_ring(adev, ring);
+   if (shared_ring) {
+   shared_ring->vm_inv_eng = ring->vm_inv_eng;
+   dev_info(adev->dev, "ring %s shares VM invalidation 
engine %u with ring %s on hub %u\n",
+   ring->name, ring->vm_inv_eng, 
shared_ring->name, ring->vm_hub);
+   continue;
+   }
}
 
return 0;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
index 39669f8788a7..45dae38f802b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
@@ -504,6 +504,37 @@ void amdgpu_sdma_sysfs_reset_mask_fini(struct 
amdgpu_device *adev)
}
 }
 
+struct amdgpu_ring *amdgpu_sdma_get_shared_ring(struct amdgpu_device *adev, 
struct amdgpu_ring *ring)
+{
+   if (adev->sdma.has_page_queue && ring == 
&adev->sdma.instance[ring->me].ring)
+   return &adev->sdma.instance[ring->me].page;
+   else
+   return NULL;
+}
+
+/**
+* amdgpu_sdma_is_shared_inv_eng - Check if a ring is an SDMA ring that shares 
a VM invalidation engine
+* @adev: Pointer to the AMDGPU device structure
+* @ring: Pointer to the ring structure to check
+*
+* This function checks if the given ring is an SDMA ring that shares a VM 
invalidation engine.
+* It returns true if the ring is such an SDMA ring, false otherwise.
+*/
+bool amdgpu_sdma_is_shared_inv_eng(struct amdgpu_device *adev, struct 
amdgpu_ring *ring)
+{
+   int i = ring->me;
+
+   if (!adev->sdma.has_page_queue || i >= a

[PATCH v9 3/3] drm/amdgpu/sdma_v4_4_2: update VM flush implementation for SDMA

2025-03-19 Thread jesse.zh...@amd.com
From: "jesse.zh...@amd.com" 

This commit updates the VM flush implementation for the SDMA engine.

- Added a new function `sdma_v4_4_2_get_invalidate_req` to construct the 
VM_INVALIDATE_ENG0_REQ
  register value for the specified VMID and flush type. This function ensures 
that all relevant
  page table cache levels (L1 PTEs, L2 PTEs, and L2 PDEs) are invalidated.

- Modified the `sdma_v4_4_2_ring_emit_vm_flush` function to use the new 
`sdma_v4_4_2_get_invalidate_req`
  function. The updated function emits the necessary register writes and waits 
to perform a VM flush
  for the specified VMID. It updates the PTB address registers and issues a VM 
invalidation request
  using the specified VM invalidation engine.

- Included the necessary header file `gc/gc_9_0_sh_mask.h` to provide access to 
the required register
  definitions.

v2: vm flush by the vm invalidation packet (Lijo)
v3: code style and define the macro for the vm invalidation packet (Christian)
v4: Format definition sdma vm invalidate packet (Lijo)

Suggested-by: Lijo Lazar 
Signed-off-by: Jesse Zhang 
---
 drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c  | 77 +++
 .../gpu/drm/amd/amdgpu/vega10_sdma_pkt_open.h | 70 +
 2 files changed, 133 insertions(+), 14 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c 
b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
index fd34dc138081..0a9238e70e7e 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
@@ -31,6 +31,7 @@
 #include "amdgpu_ucode.h"
 #include "amdgpu_trace.h"
 #include "amdgpu_reset.h"
+#include "gc/gc_9_0_sh_mask.h"
 
 #include "sdma/sdma_4_4_2_offset.h"
 #include "sdma/sdma_4_4_2_sh_mask.h"
@@ -1292,21 +1293,71 @@ static void sdma_v4_4_2_ring_emit_pipeline_sync(struct 
amdgpu_ring *ring)
   seq, 0x, 4);
 }
 
-
-/**
- * sdma_v4_4_2_ring_emit_vm_flush - vm flush using sDMA
+/*
+ * sdma_v4_4_2_get_invalidate_req - Construct the VM_INVALIDATE_ENG0_REQ 
register value
+ * @vmid: The VMID to invalidate
+ * @flush_type: The type of flush (0 = legacy, 1 = lightweight, 2 = 
heavyweight)
  *
- * @ring: amdgpu_ring pointer
- * @vmid: vmid number to use
- * @pd_addr: address
+ * This function constructs the VM_INVALIDATE_ENG0_REQ register value for the 
specified VMID
+ * and flush type. It ensures that all relevant page table cache levels (L1 
PTEs, L2 PTEs, and
+ * L2 PDEs) are invalidated.
+ */
+static uint32_t sdma_v4_4_2_get_invalidate_req(unsigned int vmid,
+   uint32_t flush_type)
+{
+   u32 req = 0;
+
+   req = REG_SET_FIELD(req, VM_INVALIDATE_ENG0_REQ,
+   PER_VMID_INVALIDATE_REQ, 1 << vmid);
+   req = REG_SET_FIELD(req, VM_INVALIDATE_ENG0_REQ, FLUSH_TYPE, 
flush_type);
+   req = REG_SET_FIELD(req, VM_INVALIDATE_ENG0_REQ, INVALIDATE_L2_PTES, 1);
+   req = REG_SET_FIELD(req, VM_INVALIDATE_ENG0_REQ, INVALIDATE_L2_PDE0, 1);
+   req = REG_SET_FIELD(req, VM_INVALIDATE_ENG0_REQ, INVALIDATE_L2_PDE1, 1);
+   req = REG_SET_FIELD(req, VM_INVALIDATE_ENG0_REQ, INVALIDATE_L2_PDE2, 1);
+   req = REG_SET_FIELD(req, VM_INVALIDATE_ENG0_REQ, INVALIDATE_L1_PTES, 1);
+   req = REG_SET_FIELD(req, VM_INVALIDATE_ENG0_REQ,
+   CLEAR_PROTECTION_FAULT_STATUS_ADDR, 0);
+
+   return req;
+}
+
+/*
+ * sdma_v4_4_2_ring_emit_vm_flush - Emit VM flush commands for SDMA
+ * @ring: The SDMA ring
+ * @vmid: The VMID to flush
+ * @pd_addr: The page directory address
  *
- * Update the page table base and flush the VM TLB
- * using sDMA.
+ * This function emits the necessary register writes and waits to perform a VM 
flush for the
+ * specified VMID. It updates the PTB address registers and issues a VM 
invalidation request
+ * using the specified VM invalidation engine.
  */
 static void sdma_v4_4_2_ring_emit_vm_flush(struct amdgpu_ring *ring,
-unsigned vmid, uint64_t pd_addr)
+   unsigned int vmid, uint64_t pd_addr)
 {
-   amdgpu_gmc_emit_flush_gpu_tlb(ring, vmid, pd_addr);
+   struct amdgpu_device *adev = ring->adev;
+   uint32_t req = sdma_v4_4_2_get_invalidate_req(vmid, 0);
+   unsigned int eng = ring->vm_inv_eng;
+   struct amdgpu_vmhub *hub = &adev->vmhub[ring->vm_hub];
+
+   amdgpu_ring_emit_wreg(ring, hub->ctx0_ptb_addr_lo32 +
+  (hub->ctx_addr_distance * vmid),
+  lower_32_bits(pd_addr));
+
+amdgpu_ring_emit_wreg(ring, hub->ctx0_ptb_addr_hi32 +
+  (hub->ctx_addr_distance * vmid),
+  upper_32_bits(pd_addr));
+   /*
+* Construct and emit the VM invalidation packet
+*/
+   amdgpu_ring_write(ring,
+   SDMA_PKT_VM

[PATCH v9 3/3] drm/amdgpu/sdma_v4_4_2: update VM flush implementation for SDMA

2025-03-19 Thread jesse.zh...@amd.com
From: "jesse.zh...@amd.com" 

This commit updates the VM flush implementation for the SDMA engine.

- Added a new function `sdma_v4_4_2_get_invalidate_req` to construct the 
VM_INVALIDATE_ENG0_REQ
  register value for the specified VMID and flush type. This function ensures 
that all relevant
  page table cache levels (L1 PTEs, L2 PTEs, and L2 PDEs) are invalidated.

- Modified the `sdma_v4_4_2_ring_emit_vm_flush` function to use the new 
`sdma_v4_4_2_get_invalidate_req`
  function. The updated function emits the necessary register writes and waits 
to perform a VM flush
  for the specified VMID. It updates the PTB address registers and issues a VM 
invalidation request
  using the specified VM invalidation engine.

- Included the necessary header file `gc/gc_9_0_sh_mask.h` to provide access to 
the required register
  definitions.

v2: vm flush by the vm invalidation packet (Lijo)
v3: code style and define the macro for the vm invalidation packet (Christian)
v4: Format definition sdma vm invalidate packet (Lijo)

Suggested-by: Lijo Lazar 
Signed-off-by: Jesse Zhang 
---
 drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c  | 77 +++
 .../gpu/drm/amd/amdgpu/vega10_sdma_pkt_open.h | 70 +
 2 files changed, 133 insertions(+), 14 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c 
b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
index fd34dc138081..0a9238e70e7e 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
@@ -31,6 +31,7 @@
 #include "amdgpu_ucode.h"
 #include "amdgpu_trace.h"
 #include "amdgpu_reset.h"
+#include "gc/gc_9_0_sh_mask.h"
 
 #include "sdma/sdma_4_4_2_offset.h"
 #include "sdma/sdma_4_4_2_sh_mask.h"
@@ -1292,21 +1293,71 @@ static void sdma_v4_4_2_ring_emit_pipeline_sync(struct 
amdgpu_ring *ring)
   seq, 0x, 4);
 }
 
-
-/**
- * sdma_v4_4_2_ring_emit_vm_flush - vm flush using sDMA
+/*
+ * sdma_v4_4_2_get_invalidate_req - Construct the VM_INVALIDATE_ENG0_REQ 
register value
+ * @vmid: The VMID to invalidate
+ * @flush_type: The type of flush (0 = legacy, 1 = lightweight, 2 = 
heavyweight)
  *
- * @ring: amdgpu_ring pointer
- * @vmid: vmid number to use
- * @pd_addr: address
+ * This function constructs the VM_INVALIDATE_ENG0_REQ register value for the 
specified VMID
+ * and flush type. It ensures that all relevant page table cache levels (L1 
PTEs, L2 PTEs, and
+ * L2 PDEs) are invalidated.
+ */
+static uint32_t sdma_v4_4_2_get_invalidate_req(unsigned int vmid,
+   uint32_t flush_type)
+{
+   u32 req = 0;
+
+   req = REG_SET_FIELD(req, VM_INVALIDATE_ENG0_REQ,
+   PER_VMID_INVALIDATE_REQ, 1 << vmid);
+   req = REG_SET_FIELD(req, VM_INVALIDATE_ENG0_REQ, FLUSH_TYPE, 
flush_type);
+   req = REG_SET_FIELD(req, VM_INVALIDATE_ENG0_REQ, INVALIDATE_L2_PTES, 1);
+   req = REG_SET_FIELD(req, VM_INVALIDATE_ENG0_REQ, INVALIDATE_L2_PDE0, 1);
+   req = REG_SET_FIELD(req, VM_INVALIDATE_ENG0_REQ, INVALIDATE_L2_PDE1, 1);
+   req = REG_SET_FIELD(req, VM_INVALIDATE_ENG0_REQ, INVALIDATE_L2_PDE2, 1);
+   req = REG_SET_FIELD(req, VM_INVALIDATE_ENG0_REQ, INVALIDATE_L1_PTES, 1);
+   req = REG_SET_FIELD(req, VM_INVALIDATE_ENG0_REQ,
+   CLEAR_PROTECTION_FAULT_STATUS_ADDR, 0);
+
+   return req;
+}
+
+/*
+ * sdma_v4_4_2_ring_emit_vm_flush - Emit VM flush commands for SDMA
+ * @ring: The SDMA ring
+ * @vmid: The VMID to flush
+ * @pd_addr: The page directory address
  *
- * Update the page table base and flush the VM TLB
- * using sDMA.
+ * This function emits the necessary register writes and waits to perform a VM 
flush for the
+ * specified VMID. It updates the PTB address registers and issues a VM 
invalidation request
+ * using the specified VM invalidation engine.
  */
 static void sdma_v4_4_2_ring_emit_vm_flush(struct amdgpu_ring *ring,
-unsigned vmid, uint64_t pd_addr)
+   unsigned int vmid, uint64_t pd_addr)
 {
-   amdgpu_gmc_emit_flush_gpu_tlb(ring, vmid, pd_addr);
+   struct amdgpu_device *adev = ring->adev;
+   uint32_t req = sdma_v4_4_2_get_invalidate_req(vmid, 0);
+   unsigned int eng = ring->vm_inv_eng;
+   struct amdgpu_vmhub *hub = &adev->vmhub[ring->vm_hub];
+
+   amdgpu_ring_emit_wreg(ring, hub->ctx0_ptb_addr_lo32 +
+  (hub->ctx_addr_distance * vmid),
+  lower_32_bits(pd_addr));
+
+amdgpu_ring_emit_wreg(ring, hub->ctx0_ptb_addr_hi32 +
+  (hub->ctx_addr_distance * vmid),
+  upper_32_bits(pd_addr));
+   /*
+* Construct and emit the VM invalidation packet
+*/
+   amdgpu_ring_write(ring,
+   SDMA_PKT_VM

[v3 1/7] drm/amd/amdgpu: Simplify SDMA reset mechanism by removing dynamic callbacks

2025-04-04 Thread jesse.zh...@amd.com
Since KFD no longer registers its own callbacks for SDMA resets, and only KGD 
uses the reset mechanism,
we can simplify the SDMA reset flow by directly calling the ring's `stop_queue` 
and `start_queue` functions.
This patch removes the dynamic callback mechanism and prepares for its eventual 
deprecation.

1. **Remove Dynamic Callbacks**:
   - The `pre_reset` and `post_reset` callback invocations in 
`amdgpu_sdma_reset_engine` are removed.
   - Instead, the ring's `stop_queue` and `start_queue` functions are called 
directly during the reset process.

2. **Prepare for Deprecation of Dynamic Mechanism**:
   - By removing the callback invocations, this patch prepares the codebase for 
the eventual removal of the dynamic callback registration mechanism.

v2: Update stop_queue/start_queue function parameters to use ring pointer 
instead of device/instance (Christian)

Signed-off-by: Jesse Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h |  2 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c | 34 +++-
 drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c | 13 -
 3 files changed, 13 insertions(+), 36 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
index 615c3d5c5a8d..23ea221e26de 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
@@ -237,6 +237,8 @@ struct amdgpu_ring_funcs {
void (*patch_ce)(struct amdgpu_ring *ring, unsigned offset);
void (*patch_de)(struct amdgpu_ring *ring, unsigned offset);
int (*reset)(struct amdgpu_ring *ring, unsigned int vmid);
+   int (*stop_queue)(struct amdgpu_ring *ring);
+   int (*start_queue)(struct amdgpu_ring *ring);
void (*emit_cleaner_shader)(struct amdgpu_ring *ring);
bool (*is_guilty)(struct amdgpu_ring *ring);
 };
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
index 0a9893fee828..b51fe644940f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
@@ -558,16 +558,10 @@ void amdgpu_sdma_register_on_reset_callbacks(struct 
amdgpu_device *adev, struct
  * @adev: Pointer to the AMDGPU device
  * @instance_id: ID of the SDMA engine instance to reset
  *
- * This function performs the following steps:
- * 1. Calls all registered pre_reset callbacks to allow KFD and AMDGPU to save 
their state.
- * 2. Resets the specified SDMA engine instance.
- * 3. Calls all registered post_reset callbacks to allow KFD and AMDGPU to 
restore their state.
- *
  * Returns: 0 on success, or a negative error code on failure.
  */
 int amdgpu_sdma_reset_engine(struct amdgpu_device *adev, uint32_t instance_id)
 {
-   struct sdma_on_reset_funcs *funcs;
int ret = 0;
struct amdgpu_sdma_instance *sdma_instance = 
&adev->sdma.instance[instance_id];
struct amdgpu_ring *gfx_ring = &sdma_instance->ring;
@@ -589,18 +583,8 @@ int amdgpu_sdma_reset_engine(struct amdgpu_device *adev, 
uint32_t instance_id)
page_sched_stopped = true;
}
 
-   /* Invoke all registered pre_reset callbacks */
-   list_for_each_entry(funcs, &adev->sdma.reset_callback_list, list) {
-   if (funcs->pre_reset) {
-   ret = funcs->pre_reset(adev, instance_id);
-   if (ret) {
-   dev_err(adev->dev,
-   "beforeReset callback failed for instance %u: 
%d\n",
-   instance_id, ret);
-   goto exit;
-   }
-   }
-   }
+   if (gfx_ring->funcs->stop_queue)
+   gfx_ring->funcs->stop_queue(gfx_ring);
 
/* Perform the SDMA reset for the specified instance */
ret = amdgpu_dpm_reset_sdma(adev, 1 << instance_id);
@@ -609,18 +593,8 @@ int amdgpu_sdma_reset_engine(struct amdgpu_device *adev, 
uint32_t instance_id)
goto exit;
}
 
-   /* Invoke all registered post_reset callbacks */
-   list_for_each_entry(funcs, &adev->sdma.reset_callback_list, list) {
-   if (funcs->post_reset) {
-   ret = funcs->post_reset(adev, instance_id);
-   if (ret) {
-   dev_err(adev->dev,
-   "afterReset callback failed for instance %u: 
%d\n",
-   instance_id, ret);
-   goto exit;
-   }
-   }
-   }
+   if (gfx_ring->funcs->start_queue)
+   gfx_ring->funcs->start_queue(gfx_ring);
 
 exit:
/* Restart the scheduler's work queue for the GFX and page rings
diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c 
b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
index 688a720d..a8330504692d 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
+++ b/drivers/gpu/drm/amd/amdg

[v3 4/7] drm/amd/amdgpu: Refactor SDMA v5.0 reset logic into stop_queue and restore_queue functions

2025-04-05 Thread jesse.zh...@amd.com
From: "jesse.zh...@amd.com" 

This patch refactors the SDMA v5.0 reset logic by splitting the 
`sdma_v5_0_reset_queue` function into two separate functions: 
`sdma_v5_0_stop_queue` and `sdma_v5_0_restore_queue`.
 This change aligns with the new SDMA reset mechanism, where the reset process 
is divided into stopping the queue, performing the reset, and restoring the 
queue.

1. **Split `sdma_v5_0_reset_queue`**:
   - Extracted the queue stopping logic into `sdma_v5_0_stop_queue`.
   - Extracted the queue restoration logic into `sdma_v5_0_restore_queue`.
   - The soft reset step is now handled by the caller 
(`amdgpu_sdma_reset_engine`).

2. **Update Ring Functions**:
   - Added `stop_queue` and `start_queue` to the `sdma_v5_0_ring_funcs` 
structure to support the new reset mechanism.

v2: remove the suspend_user_queues param when calling amdgpu_sdma_reset_engine()
v3: Update stop_queue/start_queue function parameters to use ring pointer 
instead of device/instance (Christian)

Signed-off-by: Jesse Zhang 
---
 drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c | 41 --
 1 file changed, 26 insertions(+), 15 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c 
b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
index 9501652f903d..cd2d4c2af77e 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
@@ -1508,17 +1508,25 @@ static int sdma_v5_0_soft_reset(struct amdgpu_ip_block 
*ip_block)
 static int sdma_v5_0_reset_queue(struct amdgpu_ring *ring, unsigned int vmid)
 {
struct amdgpu_device *adev = ring->adev;
-   int j, r;
-   u32 f32_cntl, freeze, cntl, preempt, soft_reset, stat1_reg;
-   u32 inst_id;
+   u32 inst_id = ring->me;
+
+   return amdgpu_sdma_reset_engine(adev, inst_id);
+}
+
+static int sdma_v5_0_stop_queue(struct amdgpu_ring *ring)
+{
+   int j, r = 0;
+   u32 f32_cntl, freeze, cntl, preempt, stat1_reg;
+   struct amdgpu_device *adev = ring->adev;
+   u32 inst_id = ring->me;
 
if (amdgpu_sriov_vf(adev))
return -EINVAL;
-   inst_id = ring->me;
+
amdgpu_gfx_rlc_enter_safe_mode(adev, 0);
 
/* stop queue */
-   sdma_v5_0_gfx_stop(adev, 1 << ring->me);
+   sdma_v5_0_gfx_stop(adev, inst_id);
 
/* engine stop SDMA1_F32_CNTL.HALT to 1 and SDMAx_FREEZE freeze bit to 
1 */
freeze = RREG32(sdma_v5_0_get_reg_offset(adev, inst_id, 
mmSDMA0_FREEZE));
@@ -1554,17 +1562,19 @@ static int sdma_v5_0_reset_queue(struct amdgpu_ring 
*ring, unsigned int vmid)
preempt = RREG32(sdma_v5_0_get_reg_offset(adev, inst_id, 
mmSDMA0_GFX_PREEMPT));
preempt = REG_SET_FIELD(preempt, SDMA0_GFX_PREEMPT, IB_PREEMPT, 0);
WREG32(sdma_v5_0_get_reg_offset(adev, inst_id, mmSDMA0_GFX_PREEMPT), 
preempt);
+err0:
+   amdgpu_gfx_rlc_exit_safe_mode(adev, 0);
+   return r;
+}
 
-   soft_reset = RREG32_SOC15(GC, 0, mmGRBM_SOFT_RESET);
-   soft_reset |= 1 << GRBM_SOFT_RESET__SOFT_RESET_SDMA0__SHIFT << inst_id;
-
-   WREG32_SOC15(GC, 0, mmGRBM_SOFT_RESET, soft_reset);
-
-   udelay(50);
-
-   soft_reset &= ~(1 << GRBM_SOFT_RESET__SOFT_RESET_SDMA0__SHIFT << 
inst_id);
-   WREG32_SOC15(GC, 0, mmGRBM_SOFT_RESET, soft_reset);
+static int sdma_v5_0_restore_queue(struct amdgpu_ring *ring)
+{
+   int r;
+   u32 freeze;
+   struct amdgpu_device *adev = ring->adev;
+   u32 inst_id = ring->me;
 
+   amdgpu_gfx_rlc_enter_safe_mode(adev, 0);
/* unfreeze*/
freeze = RREG32(sdma_v5_0_get_reg_offset(adev, inst_id, 
mmSDMA0_FREEZE));
freeze = REG_SET_FIELD(freeze, SDMA0_FREEZE, FREEZE, 0);
@@ -1572,7 +1582,6 @@ static int sdma_v5_0_reset_queue(struct amdgpu_ring 
*ring, unsigned int vmid)
 
r = sdma_v5_0_gfx_resume_instance(adev, inst_id, true);
 
-err0:
amdgpu_gfx_rlc_exit_safe_mode(adev, 0);
return r;
 }
@@ -1919,6 +1928,8 @@ static const struct amdgpu_ring_funcs 
sdma_v5_0_ring_funcs = {
.init_cond_exec = sdma_v5_0_ring_init_cond_exec,
.preempt_ib = sdma_v5_0_ring_preempt_ib,
.reset = sdma_v5_0_reset_queue,
+   .stop_queue = sdma_v5_0_stop_queue,
+   .start_queue = sdma_v5_0_restore_queue,
 };
 
 static void sdma_v5_0_set_ring_funcs(struct amdgpu_device *adev)
-- 
2.25.1



[PATCH 1/2] drm/amdgpu: Enable per-queue reset for SDMA v4.4.2 on IP v9.5.0

2025-04-05 Thread jesse.zh...@amd.com
Add support for per-queue reset on SDMA v4.4.2 when running with:
1. MEC firmware version 0xb0 or later
2. DPM indicates SDMA reset is supported

Signed-off-by: Jesse.Zhang 
---
 drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c 
b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
index 688a720d..9a2a5eb93763 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
@@ -2373,7 +2373,9 @@ static void sdma_v4_4_2_update_reset_mask(struct 
amdgpu_device *adev)
adev->sdma.supported_reset |= 
AMDGPU_RESET_TYPE_PER_QUEUE;
break;
case IP_VERSION(9, 5, 0):
-   /*TODO: enable the queue reset flag until fw supported */
+   if ((adev->gfx.mec_fw_version >= 0xb0) && 
amdgpu_dpm_reset_sdma_is_supported(adev))
+   adev->sdma.supported_reset |= 
AMDGPU_RESET_TYPE_PER_QUEUE;
+   break;
default:
break;
}
-- 
2.25.1



[v2 1/2] drm/amdgpu: Enable per-queue reset for SDMA v4.4.2 on IP v9.5.0

2025-04-07 Thread jesse.zh...@amd.com
Add support for per-queue reset on SDMA v4.4.2 when running with:
1. MEC firmware version 17 or later
2. DPM indicates SDMA reset is supported

v2: Fixed supported firmware versions (Lijo)

Signed-off-by: Jesse.Zhang 
---
 drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c 
b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
index 688a720d..673ecd208c6d 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
@@ -2373,7 +2373,9 @@ static void sdma_v4_4_2_update_reset_mask(struct 
amdgpu_device *adev)
adev->sdma.supported_reset |= 
AMDGPU_RESET_TYPE_PER_QUEUE;
break;
case IP_VERSION(9, 5, 0):
-   /*TODO: enable the queue reset flag until fw supported */
+   if ((adev->gfx.mec_fw_version >= 0xf) && 
amdgpu_dpm_reset_sdma_is_supported(adev))
+   adev->sdma.supported_reset |= 
AMDGPU_RESET_TYPE_PER_QUEUE;
+   break;
default:
break;
}
-- 
2.25.1



[v2 2/2] drm/amdgpu: Enable TMZ support for GC 11.0.0

2025-04-07 Thread jesse.zh...@amd.com
Add GC11.0.0 to the list of GPU generations that support TMZ.

Signed-off-by: Jesse Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
index 464625282872..1eb9242436ee 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
@@ -843,6 +843,7 @@ void amdgpu_gmc_tmz_set(struct amdgpu_device *adev)
case IP_VERSION(9, 3, 0):
/* GC 10.3.7 */
case IP_VERSION(10, 3, 7):
+   case IP_VERSION(11, 0, 0):
/* GC 11.0.1 */
case IP_VERSION(11, 0, 1):
if (amdgpu_tmz == 0) {
-- 
2.25.1



[v4 1/7] drm/amd/amdgpu: Simplify SDMA reset mechanism by removing dynamic callbacks

2025-04-08 Thread jesse.zh...@amd.com
Since KFD no longer registers its own callbacks for SDMA resets, and only KGD 
uses the reset mechanism,
we can simplify the SDMA reset flow by directly calling the ring's `stop_queue` 
and `start_queue` functions.
This patch removes the dynamic callback mechanism and prepares for its eventual 
deprecation.

1. **Remove Dynamic Callbacks**:
   - The `pre_reset` and `post_reset` callback invocations in 
`amdgpu_sdma_reset_engine` are removed.
   - Instead, the ring's `stop_queue` and `start_queue` functions are called 
directly during the reset process.

2. **Prepare for Deprecation of Dynamic Mechanism**:
   - By removing the callback invocations, this patch prepares the codebase for 
the eventual removal of the dynamic callback registration mechanism.

v2: Update stop_queue/start_queue function parameters to use ring pointer 
instead of device/instance (Christian)
v3: The general coding style is to declare variables like "i" or "r" last. E.g. 
longest lines first and short lasts. (Christian)
v4: move stop_queue/start_queue to struct amdgpu_sdma_instance and rename them. 
(Alex)

Signed-off-by: Jesse Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c | 54 ++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h |  7 ++-
 drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c | 22 ++
 3 files changed, 34 insertions(+), 49 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
index 0a9893fee828..541b349e0310 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
@@ -558,20 +558,14 @@ void amdgpu_sdma_register_on_reset_callbacks(struct 
amdgpu_device *adev, struct
  * @adev: Pointer to the AMDGPU device
  * @instance_id: ID of the SDMA engine instance to reset
  *
- * This function performs the following steps:
- * 1. Calls all registered pre_reset callbacks to allow KFD and AMDGPU to save 
their state.
- * 2. Resets the specified SDMA engine instance.
- * 3. Calls all registered post_reset callbacks to allow KFD and AMDGPU to 
restore their state.
- *
  * Returns: 0 on success, or a negative error code on failure.
  */
 int amdgpu_sdma_reset_engine(struct amdgpu_device *adev, uint32_t instance_id)
 {
-   struct sdma_on_reset_funcs *funcs;
int ret = 0;
struct amdgpu_sdma_instance *sdma_instance = 
&adev->sdma.instance[instance_id];
-   struct amdgpu_ring *gfx_ring = &sdma_instance->ring;
-   struct amdgpu_ring *page_ring = &sdma_instance->page;
+   struct amdgpu_ring *sdma_gfx_ring = &sdma_instance->ring;
+   struct amdgpu_ring *sdma_page_ring = &sdma_instance->page;
bool gfx_sched_stopped = false, page_sched_stopped = false;
 
mutex_lock(&sdma_instance->engine_reset_mutex);
@@ -579,28 +573,18 @@ int amdgpu_sdma_reset_engine(struct amdgpu_device *adev, 
uint32_t instance_id)
* This ensures that no new tasks are submitted to the queues while
* the reset is in progress.
*/
-   if (!amdgpu_ring_sched_ready(gfx_ring)) {
-   drm_sched_wqueue_stop(&gfx_ring->sched);
+   if (!amdgpu_ring_sched_ready(sdma_gfx_ring)) {
+   drm_sched_wqueue_stop(&sdma_gfx_ring->sched);
gfx_sched_stopped = true;
}
 
-   if (adev->sdma.has_page_queue && !amdgpu_ring_sched_ready(page_ring)) {
-   drm_sched_wqueue_stop(&page_ring->sched);
+   if (adev->sdma.has_page_queue && 
!amdgpu_ring_sched_ready(sdma_page_ring)) {
+   drm_sched_wqueue_stop(&sdma_page_ring->sched);
page_sched_stopped = true;
}
 
-   /* Invoke all registered pre_reset callbacks */
-   list_for_each_entry(funcs, &adev->sdma.reset_callback_list, list) {
-   if (funcs->pre_reset) {
-   ret = funcs->pre_reset(adev, instance_id);
-   if (ret) {
-   dev_err(adev->dev,
-   "beforeReset callback failed for instance %u: 
%d\n",
-   instance_id, ret);
-   goto exit;
-   }
-   }
-   }
+   if (sdma_instance->funcs->stop_kernel_queue)
+   sdma_instance->funcs->stop_kernel_queue(sdma_gfx_ring);
 
/* Perform the SDMA reset for the specified instance */
ret = amdgpu_dpm_reset_sdma(adev, 1 << instance_id);
@@ -609,18 +593,8 @@ int amdgpu_sdma_reset_engine(struct amdgpu_device *adev, 
uint32_t instance_id)
goto exit;
}
 
-   /* Invoke all registered post_reset callbacks */
-   list_for_each_entry(funcs, &adev->sdma.reset_callback_list, list) {
-   if (funcs->post_reset) {
-   ret = funcs->post_reset(adev, instance_id);
-   if (ret) {
-   dev_err(adev->dev,
-   "afterReset callback failed for instance %u: 
%d\n",
-

[v4 3/7] drm/amdgpu: Optimize SDMA v5.0 queue reset and stop logic

2025-04-08 Thread jesse.zh...@amd.com
From: "jesse.zh...@amd.com" 

This patch refactors the SDMA v5.0 queue reset and stop logic to improve
code readability, maintainability, and performance. The key changes include:

1. **Generalized `sdma_v5_0_gfx_stop` Function**:
   - Added an `inst_mask` parameter to allow stopping specific SDMA instances
 instead of all instances. This is useful for resetting individual queues.

2. **Simplified `sdma_v5_0_reset_queue` Function**:
   - Removed redundant loops and checks by directly using the `ring->me` field
 to identify the SDMA instance.
   - Reused the `sdma_v5_0_gfx_stop` function to stop the queue, reducing code
 duplication.

Signed-off-by: Jesse Zhang 
Acked-by: Alex Deucher 
---
 drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c | 65 +++---
 1 file changed, 26 insertions(+), 39 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c 
b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
index d516add85dd4..38eee309b27e 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
@@ -555,15 +555,15 @@ static void sdma_v5_0_ring_emit_fence(struct amdgpu_ring 
*ring, u64 addr, u64 se
  * sdma_v5_0_gfx_stop - stop the gfx async dma engines
  *
  * @adev: amdgpu_device pointer
- *
+ * @inst_mask: mask of dma engine instances to be disabled
  * Stop the gfx async dma ring buffers (NAVI10).
  */
-static void sdma_v5_0_gfx_stop(struct amdgpu_device *adev)
+static void sdma_v5_0_gfx_stop(struct amdgpu_device *adev, uint32_t inst_mask)
 {
u32 rb_cntl, ib_cntl;
int i;
 
-   for (i = 0; i < adev->sdma.num_instances; i++) {
+   for_each_inst(i, inst_mask) {
rb_cntl = RREG32_SOC15_IP(GC, sdma_v5_0_get_reg_offset(adev, i, 
mmSDMA0_GFX_RB_CNTL));
rb_cntl = REG_SET_FIELD(rb_cntl, SDMA0_GFX_RB_CNTL, RB_ENABLE, 
0);
WREG32_SOC15_IP(GC, sdma_v5_0_get_reg_offset(adev, i, 
mmSDMA0_GFX_RB_CNTL), rb_cntl);
@@ -655,9 +655,11 @@ static void sdma_v5_0_enable(struct amdgpu_device *adev, 
bool enable)
 {
u32 f32_cntl;
int i;
+   uint32_t inst_mask;
 
+   inst_mask = GENMASK(adev->sdma.num_instances - 1, 0);
if (!enable) {
-   sdma_v5_0_gfx_stop(adev);
+   sdma_v5_0_gfx_stop(adev, 1 << inst_mask);
sdma_v5_0_rlc_stop(adev);
}
 
@@ -1535,40 +1537,25 @@ static int sdma_v5_0_soft_reset(struct amdgpu_ip_block 
*ip_block)
 static int sdma_v5_0_reset_queue(struct amdgpu_ring *ring, unsigned int vmid)
 {
struct amdgpu_device *adev = ring->adev;
-   int i, j, r;
-   u32 rb_cntl, ib_cntl, f32_cntl, freeze, cntl, preempt, soft_reset, 
stat1_reg;
+   int j, r;
+   u32 f32_cntl, freeze, cntl, preempt, soft_reset, stat1_reg;
+   u32 inst_id;
 
if (amdgpu_sriov_vf(adev))
return -EINVAL;
-
-   for (i = 0; i < adev->sdma.num_instances; i++) {
-   if (ring == &adev->sdma.instance[i].ring)
-   break;
-   }
-
-   if (i == adev->sdma.num_instances) {
-   DRM_ERROR("sdma instance not found\n");
-   return -EINVAL;
-   }
-
+   inst_id = ring->me;
amdgpu_gfx_rlc_enter_safe_mode(adev, 0);
 
/* stop queue */
-   ib_cntl = RREG32(sdma_v5_0_get_reg_offset(adev, i, 
mmSDMA0_GFX_IB_CNTL));
-   ib_cntl = REG_SET_FIELD(ib_cntl, SDMA0_GFX_IB_CNTL, IB_ENABLE, 0);
-   WREG32(sdma_v5_0_get_reg_offset(adev, i, mmSDMA0_GFX_IB_CNTL), ib_cntl);
-
-   rb_cntl = RREG32(sdma_v5_0_get_reg_offset(adev, i, 
mmSDMA0_GFX_RB_CNTL));
-   rb_cntl = REG_SET_FIELD(rb_cntl, SDMA0_GFX_RB_CNTL, RB_ENABLE, 0);
-   WREG32(sdma_v5_0_get_reg_offset(adev, i, mmSDMA0_GFX_RB_CNTL), rb_cntl);
+   sdma_v5_0_gfx_stop(adev, 1 << ring->me);
 
/* engine stop SDMA1_F32_CNTL.HALT to 1 and SDMAx_FREEZE freeze bit to 
1 */
-   freeze = RREG32(sdma_v5_0_get_reg_offset(adev, i, mmSDMA0_FREEZE));
+   freeze = RREG32(sdma_v5_0_get_reg_offset(adev, inst_id, 
mmSDMA0_FREEZE));
freeze = REG_SET_FIELD(freeze, SDMA0_FREEZE, FREEZE, 1);
-   WREG32(sdma_v5_0_get_reg_offset(adev, i, mmSDMA0_FREEZE), freeze);
+   WREG32(sdma_v5_0_get_reg_offset(adev, inst_id, mmSDMA0_FREEZE), freeze);
 
for (j = 0; j < adev->usec_timeout; j++) {
-   freeze = RREG32(sdma_v5_0_get_reg_offset(adev, i, 
mmSDMA0_FREEZE));
+   freeze = RREG32(sdma_v5_0_get_reg_offset(adev, inst_id, 
mmSDMA0_FREEZE));
if (REG_GET_FIELD(freeze, SDMA0_FREEZE, FROZEN) & 1)
break;
udelay(1);
@@ -1576,7 +1563,7 @@ static int sdma_v5_0_reset_queue(struct amdgpu_ring 
*ring, unsigned int vmid)
 
/* check sdma copy engine all idle if frozen not received*/
if (j == adev->usec_timeout) {
-   stat1_reg = RREG32(sdma_v5_0_get_reg_offset(adev, i, 
mmSDMA0_STATUS

[v4 2/7] drm/amd/amdgpu: Implement SDMA soft reset directly for sdma v5

2025-04-08 Thread jesse.zh...@amd.com
From: "jesse.zh...@amd.com" 

This patch introduces a new function `amdgpu_sdma_soft_reset` to handle SDMA 
soft resets directly,
rather than relying on the DPM interface.

1. **New `amdgpu_sdma_soft_reset` Function**:
   - Implements a soft reset for SDMA engines by directly writing to the 
hardware registers.
   - Handles SDMA versions 4.x and 5.x separately:
 - For SDMA 4.x, the existing `amdgpu_dpm_reset_sdma` function is used for 
backward compatibility.
 - For SDMA 5.x, the driver directly manipulates the `GRBM_SOFT_RESET` 
register to reset the specified SDMA instance.

2. **Integration into `amdgpu_sdma_reset_engine`**:
   - The `amdgpu_sdma_soft_reset` function is called during the SDMA reset 
process, replacing the previous call to `amdgpu_dpm_reset_sdma`.

v2: move soft reset into a helper function (Alex)

Suggested-by: Alex Deucher 
Signed-off-by: Jesse Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c | 38 +++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h |  1 +
 drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c   | 29 +++
 drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c   | 45 +++-
 4 files changed, 95 insertions(+), 18 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
index 541b349e0310..96d0350c7754 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
@@ -26,6 +26,8 @@
 #include "amdgpu_sdma.h"
 #include "amdgpu_ras.h"
 #include "amdgpu_reset.h"
+#include "gc/gc_10_1_0_offset.h"
+#include "gc/gc_10_3_0_sh_mask.h"
 
 #define AMDGPU_CSA_SDMA_SIZE 64
 /* SDMA CSA reside in the 3rd page of CSA */
@@ -553,6 +555,40 @@ void amdgpu_sdma_register_on_reset_callbacks(struct 
amdgpu_device *adev, struct
list_add_tail(&funcs->list, &adev->sdma.reset_callback_list);
 }
 
+static int amdgpu_sdma_soft_reset(struct amdgpu_device *adev, u32 instance_id)
+{
+   struct amdgpu_sdma_instance *sdma_instance = 
&adev->sdma.instance[instance_id];
+   int r = 0;
+
+   switch (amdgpu_ip_version(adev, SDMA0_HWIP, 0)) {
+   case IP_VERSION(4, 4, 2):
+   case IP_VERSION(4, 4, 4):
+   case IP_VERSION(4, 4, 5):
+   /* For SDMA 4.x, use the existing DPM interface for backward 
compatibility */
+   r = amdgpu_dpm_reset_sdma(adev, 1 << instance_id);
+   break;
+   case IP_VERSION(5, 0, 0):
+   case IP_VERSION(5, 0, 1):
+   case IP_VERSION(5, 0, 2):
+   case IP_VERSION(5, 0, 5):
+   case IP_VERSION(5, 2, 0):
+   case IP_VERSION(5, 2, 2):
+   case IP_VERSION(5, 2, 4):
+   case IP_VERSION(5, 2, 5):
+   case IP_VERSION(5, 2, 6):
+   case IP_VERSION(5, 2, 3):
+   case IP_VERSION(5, 2, 1):
+   case IP_VERSION(5, 2, 7):
+   if (sdma_instance->funcs->soft_reset_kernel_queue)
+   r = sdma_instance->funcs->soft_reset_kernel_queue(adev, 
instance_id);
+   break;
+   default:
+   break;
+   }
+
+   return r;
+}
+
 /**
  * amdgpu_sdma_reset_engine - Reset a specific SDMA engine
  * @adev: Pointer to the AMDGPU device
@@ -587,7 +623,7 @@ int amdgpu_sdma_reset_engine(struct amdgpu_device *adev, 
uint32_t instance_id)
sdma_instance->funcs->stop_kernel_queue(sdma_gfx_ring);
 
/* Perform the SDMA reset for the specified instance */
-   ret = amdgpu_dpm_reset_sdma(adev, 1 << instance_id);
+   ret = amdgpu_sdma_soft_reset(adev, instance_id);
if (ret) {
dev_err(adev->dev, "Failed to reset SDMA instance %u\n", 
instance_id);
goto exit;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h
index 620fd7663526..bf83d6646238 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h
@@ -53,6 +53,7 @@ enum amdgpu_sdma_irq {
 struct amdgpu_sdma_funcs {
int (*stop_kernel_queue)(struct amdgpu_ring *ring);
int (*start_kernel_queue)(struct amdgpu_ring *ring);
+   int (*soft_reset_kernel_queue)(struct amdgpu_device *adev, u32 
instance_id);
 };
 
 struct amdgpu_sdma_instance {
diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c 
b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
index e1348b6d9c6a..d516add85dd4 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
@@ -1323,6 +1323,34 @@ static void 
sdma_v5_0_ring_emit_reg_write_reg_wait(struct amdgpu_ring *ring,
amdgpu_ring_emit_reg_wait(ring, reg1, mask, mask);
 }
 
+static int sdma_v5_0_soft_reset_engine(struct amdgpu_device *adev, u32 
instance_id)
+{
+   u32 grbm_soft_reset;
+   u32 tmp;
+
+   grbm_soft_reset = REG_SET_FIELD(0,
+   GRBM_SOFT_RESET, SOFT_RESET_SDMA0,
+ 

[v4 4/7] drm/amd/amdgpu: Refactor SDMA v5.0 reset logic into stop_queue and restore_queue functions

2025-04-08 Thread jesse.zh...@amd.com
From: "jesse.zh...@amd.com" 

This patch refactors the SDMA v5.0 reset logic by splitting the 
`sdma_v5_0_reset_queue` function into two separate functions: 
`sdma_v5_0_stop_queue` and `sdma_v5_0_restore_queue`.
 This change aligns with the new SDMA reset mechanism, where the reset process 
is divided into stopping the queue, performing the reset, and restoring the 
queue.

1. **Split `sdma_v5_0_reset_queue`**:
   - Extracted the queue stopping logic into `sdma_v5_0_stop_queue`.
   - Extracted the queue restoration logic into `sdma_v5_0_restore_queue`.
   - The soft reset step is now handled by the caller 
(`amdgpu_sdma_reset_engine`).

2. **Update Ring Functions**:
   - Added `stop_queue` and `start_queue` to the `sdma_v5_0_ring_funcs` 
structure to support the new reset mechanism.

v2: remove the suspend_user_queues param when calling amdgpu_sdma_reset_engine()
v3: Update stop_queue/start_queue function parameters to use ring pointer 
instead of device/instance (Christian)
v4: The general coding style is to declare variables like "i" or "r" last. E.g. 
longest lines first and short lasts. (Christian)
v5: move stop_queue/start_queue to struct amdgpu_sdma_instance and rename them. 
(Alex)

Signed-off-by: Jesse Zhang 
Acked-by: Alex Deucher 
---
 drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c | 43 +-
 1 file changed, 28 insertions(+), 15 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c 
b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
index 38eee309b27e..5c354552c47f 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
@@ -112,6 +112,8 @@ static void sdma_v5_0_set_ring_funcs(struct amdgpu_device 
*adev);
 static void sdma_v5_0_set_buffer_funcs(struct amdgpu_device *adev);
 static void sdma_v5_0_set_vm_pte_funcs(struct amdgpu_device *adev);
 static void sdma_v5_0_set_irq_funcs(struct amdgpu_device *adev);
+static int sdma_v5_0_stop_queue(struct amdgpu_ring *ring);
+static int sdma_v5_0_restore_queue(struct amdgpu_ring *ring);
 
 static const struct soc15_reg_golden golden_settings_sdma_5[] = {
SOC15_REG_GOLDEN_VALUE(GC, 0, mmSDMA0_CHICKEN_BITS, 0xffbf1f0f, 
0x03ab0107),
@@ -1350,6 +1352,8 @@ static int sdma_v5_0_soft_reset_engine(struct 
amdgpu_device *adev, u32 instance_
 }
 
 static const struct amdgpu_sdma_funcs sdma_v5_0_sdma_funcs = {
+   .stop_kernel_queue = &sdma_v5_0_stop_queue,
+   .start_kernel_queue = &sdma_v5_0_restore_queue,
.soft_reset_kernel_queue = &sdma_v5_0_soft_reset_engine,
 };
 
@@ -1537,17 +1541,25 @@ static int sdma_v5_0_soft_reset(struct amdgpu_ip_block 
*ip_block)
 static int sdma_v5_0_reset_queue(struct amdgpu_ring *ring, unsigned int vmid)
 {
struct amdgpu_device *adev = ring->adev;
-   int j, r;
-   u32 f32_cntl, freeze, cntl, preempt, soft_reset, stat1_reg;
-   u32 inst_id;
+   u32 inst_id = ring->me;
+
+   return amdgpu_sdma_reset_engine(adev, inst_id);
+}
+
+static int sdma_v5_0_stop_queue(struct amdgpu_ring *ring)
+{
+   struct amdgpu_device *adev = ring->adev;
+   u32 f32_cntl, freeze, cntl, preempt, stat1_reg;
+   u32 inst_id = ring->me;
+   int j, r = 0;
 
if (amdgpu_sriov_vf(adev))
return -EINVAL;
-   inst_id = ring->me;
+
amdgpu_gfx_rlc_enter_safe_mode(adev, 0);
 
/* stop queue */
-   sdma_v5_0_gfx_stop(adev, 1 << ring->me);
+   sdma_v5_0_gfx_stop(adev, inst_id);
 
/* engine stop SDMA1_F32_CNTL.HALT to 1 and SDMAx_FREEZE freeze bit to 
1 */
freeze = RREG32(sdma_v5_0_get_reg_offset(adev, inst_id, 
mmSDMA0_FREEZE));
@@ -1583,17 +1595,19 @@ static int sdma_v5_0_reset_queue(struct amdgpu_ring 
*ring, unsigned int vmid)
preempt = RREG32(sdma_v5_0_get_reg_offset(adev, inst_id, 
mmSDMA0_GFX_PREEMPT));
preempt = REG_SET_FIELD(preempt, SDMA0_GFX_PREEMPT, IB_PREEMPT, 0);
WREG32(sdma_v5_0_get_reg_offset(adev, inst_id, mmSDMA0_GFX_PREEMPT), 
preempt);
+err0:
+   amdgpu_gfx_rlc_exit_safe_mode(adev, 0);
+   return r;
+}
 
-   soft_reset = RREG32_SOC15(GC, 0, mmGRBM_SOFT_RESET);
-   soft_reset |= 1 << GRBM_SOFT_RESET__SOFT_RESET_SDMA0__SHIFT << inst_id;
-
-   WREG32_SOC15(GC, 0, mmGRBM_SOFT_RESET, soft_reset);
-
-   udelay(50);
-
-   soft_reset &= ~(1 << GRBM_SOFT_RESET__SOFT_RESET_SDMA0__SHIFT << 
inst_id);
-   WREG32_SOC15(GC, 0, mmGRBM_SOFT_RESET, soft_reset);
+static int sdma_v5_0_restore_queue(struct amdgpu_ring *ring)
+{
+   struct amdgpu_device *adev = ring->adev;
+   u32 inst_id = ring->me;
+   u32 freeze;
+   int r;
 
+   amdgpu_gfx_rlc_enter_safe_mode(adev, 0);
/* unfreeze*/
freeze = RREG32(sdma_v5_0_get_reg_offset(adev, inst_id, 
mmSDMA0_FREEZE));
freeze = REG_SET_FIELD(freeze, SDMA0_FREEZE, FREEZE, 0);
@@ -1601,7 +1615,6 @@ static int sdma_v5_0_reset_

[v4 7/7] drm/amd/amdgpu: Remove deprecated SDMA reset callback mechanism

2025-04-08 Thread jesse.zh...@amd.com
From: "jesse.zh...@amd.com" 

This patch removes the deprecated SDMA reset callback mechanism, which was 
previously used to register pre-reset and post-reset callbacks for SDMA engine 
resets.
 The callback mechanism has been replaced with a more direct and efficient 
approach using `stop_queue` and `start_queue` functions in the ring's function 
table.

The SDMA reset callback mechanism allowed KFD and AMDGPU to register pre-reset 
and post-reset functions for handling SDMA engine resets.
However, this approach added unnecessary complexity and was no longer needed 
after the introduction of the `stop_queue` and `start_queue` functions in the 
ring's function table.

1. **Remove Callback Mechanism**:
   - Removed the `amdgpu_sdma_register_on_reset_callbacks` function and its 
associated data structures (`sdma_on_reset_funcs`).
   - Removed the callback registration logic from the SDMA v4.4.2 
initialization code.

2. **Clean Up Related Code**:
   - Removed the `sdma_v4_4_2_set_engine_reset_funcs` function, which was used 
to register the callbacks.
   - Removed the `sdma_v4_4_2_engine_reset_funcs` structure, which contained 
the pre-reset and post-reset callback functions.

Signed-off-by: Jesse Zhang 
Reviewed-by: Alex Deucher 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c | 24 
 drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h |  8 
 drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c | 10 --
 3 files changed, 42 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
index 96d0350c7754..ca42d47dd072 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
@@ -531,30 +531,6 @@ bool amdgpu_sdma_is_shared_inv_eng(struct amdgpu_device 
*adev, struct amdgpu_rin
return false;
 }
 
-/**
- * amdgpu_sdma_register_on_reset_callbacks - Register SDMA reset callbacks
- * @funcs: Pointer to the callback structure containing pre_reset and 
post_reset functions
- *
- * This function allows KFD and AMDGPU to register their own callbacks for 
handling
- * pre-reset and post-reset operations for engine reset. These are needed 
because engine
- * reset will stop all queues on that engine.
- */
-void amdgpu_sdma_register_on_reset_callbacks(struct amdgpu_device *adev, 
struct sdma_on_reset_funcs *funcs)
-{
-   if (!funcs)
-   return;
-
-   /* Ensure the reset_callback_list is initialized */
-   if (!adev->sdma.reset_callback_list.next) {
-   INIT_LIST_HEAD(&adev->sdma.reset_callback_list);
-   }
-   /* Initialize the list node in the callback structure */
-   INIT_LIST_HEAD(&funcs->list);
-
-   /* Add the callback structure to the global list */
-   list_add_tail(&funcs->list, &adev->sdma.reset_callback_list);
-}
-
 static int amdgpu_sdma_soft_reset(struct amdgpu_device *adev, u32 instance_id)
 {
struct amdgpu_sdma_instance *sdma_instance = 
&adev->sdma.instance[instance_id];
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h
index bf83d6646238..89a114680053 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h
@@ -109,13 +109,6 @@ struct amdgpu_sdma_ras {
struct amdgpu_ras_block_object ras_block;
 };
 
-struct sdma_on_reset_funcs {
-   int (*pre_reset)(struct amdgpu_device *adev, uint32_t instance_id);
-   int (*post_reset)(struct amdgpu_device *adev, uint32_t instance_id);
-   /* Linked list node to store this structure in a list; */
-   struct list_head list;
-};
-
 struct amdgpu_sdma {
struct amdgpu_sdma_instance instance[AMDGPU_MAX_SDMA_INSTANCES];
struct amdgpu_irq_src   trap_irq;
@@ -176,7 +169,6 @@ struct amdgpu_buffer_funcs {
 uint32_t byte_count);
 };
 
-void amdgpu_sdma_register_on_reset_callbacks(struct amdgpu_device *adev, 
struct sdma_on_reset_funcs *funcs);
 int amdgpu_sdma_reset_engine(struct amdgpu_device *adev, uint32_t instance_id);
 
 #define amdgpu_emit_copy_buffer(adev, ib, s, d, b, t) 
(adev)->mman.buffer_funcs->emit_copy_buffer((ib),  (s), (d), (b), (t))
diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c 
b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
index c663c63485f4..e343aaaea12d 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
@@ -106,7 +106,6 @@ static void sdma_v4_4_2_set_buffer_funcs(struct 
amdgpu_device *adev);
 static void sdma_v4_4_2_set_vm_pte_funcs(struct amdgpu_device *adev);
 static void sdma_v4_4_2_set_irq_funcs(struct amdgpu_device *adev);
 static void sdma_v4_4_2_set_ras_funcs(struct amdgpu_device *adev);
-static void sdma_v4_4_2_set_engine_reset_funcs(struct amdgpu_device *adev);
 static void sdma_v4_4_2_update_reset_mask(struct amdgpu_device *adev);
 static int sdma_v4_4_2_stop_queue(struct amdgpu_ring 

[v4 6/7] drm/amd/amdgpu: Refactor SDMA v5.2 reset logic into stop_queue and restore_queue functions

2025-04-08 Thread jesse.zh...@amd.com
This patch refactors the SDMA v5.2 reset logic by splitting the 
`sdma_v5_2_reset_queue` function into two separate functions: 
`sdma_v5_2_stop_queue` and `sdma_v5_2_restore_queue`.
This change aligns with the new SDMA reset mechanism, where the reset process 
is divided into stopping the queue, performing the reset, and restoring the 
queue.

1. **Split `sdma_v5_2_reset_queue`**:
- Extracted the queue stopping logic into `sdma_v5_2_stop_queue`.
- Extracted the queue restoration logic into `sdma_v5_2_restore_queue`.
- The soft reset step is now handled by the caller 
(`amdgpu_sdma_reset_engine`).

2. **Update SDMA Function Table**:
- Added `stop_kernel_queue` and `start_kernel_queue` to the `sdma_v5_2_sdma_funcs` 
structure to support the new reset mechanism.

v2: remove the suspend_user_queues param when calling amdgpu_sdma_reset_engine()
v3: Update stop_queue/start_queue function parameters to use ring pointer 
instead of device/instance (Christian)
v4: move stop_queue/start_queue to struct amdgpu_sdma_instance and rename them. 
(Alex)

Signed-off-by: Jesse Zhang 
---
 drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c | 44 --
 1 file changed, 27 insertions(+), 17 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c 
b/drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c
index 6972f5dd63ef..38016fdc094a 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c
@@ -113,6 +113,8 @@ static void sdma_v5_2_set_ring_funcs(struct amdgpu_device 
*adev);
 static void sdma_v5_2_set_buffer_funcs(struct amdgpu_device *adev);
 static void sdma_v5_2_set_vm_pte_funcs(struct amdgpu_device *adev);
 static void sdma_v5_2_set_irq_funcs(struct amdgpu_device *adev);
+static int sdma_v5_2_stop_queue(struct amdgpu_ring *ring);
+static int sdma_v5_2_restore_queue(struct amdgpu_ring *ring);
 
 static u32 sdma_v5_2_get_reg_offset(struct amdgpu_device *adev, u32 instance, 
u32 internal_offset)
 {
@@ -799,6 +801,8 @@ static int sdma_v5_2_soft_reset(struct amdgpu_ip_block 
*ip_block)
 }
 
 static const struct amdgpu_sdma_funcs sdma_v5_2_sdma_funcs = {
+   .stop_kernel_queue = &sdma_v5_2_stop_queue,
+   .start_kernel_queue = &sdma_v5_2_restore_queue,
.soft_reset_kernel_queue = &sdma_v5_2_soft_reset_engine,
 };
 
@@ -1450,18 +1454,24 @@ static int sdma_v5_2_wait_for_idle(struct 
amdgpu_ip_block *ip_block)
 static int sdma_v5_2_reset_queue(struct amdgpu_ring *ring, unsigned int vmid)
 {
struct amdgpu_device *adev = ring->adev;
-   int j, r;
-   u32 f32_cntl, freeze, cntl, preempt, soft_reset, stat1_reg;
-   u32 inst_id;
+   u32 inst_id = ring->me;
+
+   return amdgpu_sdma_reset_engine(adev, inst_id);
+}
+
+static int sdma_v5_2_stop_queue(struct amdgpu_ring *ring)
+{
+   u32 f32_cntl, freeze, cntl, preempt, stat1_reg;
+   struct amdgpu_device *adev = ring->adev;
+   u32 inst_id = ring->me;
+   int j, r = 0;
 
if (amdgpu_sriov_vf(adev))
return -EINVAL;
 
-   inst_id = ring->me;
amdgpu_gfx_rlc_enter_safe_mode(adev, 0);
-
/* stop queue */
-   sdma_v5_2_gfx_stop(adev, 1 << ring->me);
+   sdma_v5_2_gfx_stop(adev, 1 << inst_id);
 
/*engine stop SDMA1_F32_CNTL.HALT to 1 and SDMAx_FREEZE freeze bit to 1 
*/
freeze = RREG32(sdma_v5_2_get_reg_offset(adev, inst_id, 
mmSDMA0_FREEZE));
@@ -1499,18 +1509,19 @@ static int sdma_v5_2_reset_queue(struct amdgpu_ring 
*ring, unsigned int vmid)
preempt = REG_SET_FIELD(preempt, SDMA0_GFX_PREEMPT, IB_PREEMPT, 0);
WREG32(sdma_v5_2_get_reg_offset(adev, inst_id, mmSDMA0_GFX_PREEMPT), 
preempt);
 
-   soft_reset = RREG32_SOC15(GC, 0, mmGRBM_SOFT_RESET);
-   soft_reset |= 1 << GRBM_SOFT_RESET__SOFT_RESET_SDMA0__SHIFT << inst_id;
-
-
-   WREG32_SOC15(GC, 0, mmGRBM_SOFT_RESET, soft_reset);
-
-   udelay(50);
-
-   soft_reset &= ~(1 << GRBM_SOFT_RESET__SOFT_RESET_SDMA0__SHIFT << 
inst_id);
+err0:
+   amdgpu_gfx_rlc_exit_safe_mode(adev, 0);
+   return r;
+}
 
-   WREG32_SOC15(GC, 0, mmGRBM_SOFT_RESET, soft_reset);
+static int sdma_v5_2_restore_queue(struct amdgpu_ring *ring)
+{
+   struct amdgpu_device *adev = ring->adev;
+   u32 inst_id = ring->me;
+   u32 freeze;
+   int r;
 
+   amdgpu_gfx_rlc_enter_safe_mode(adev, 0);
/* unfreeze and unhalt */
freeze = RREG32(sdma_v5_2_get_reg_offset(adev, inst_id, 
mmSDMA0_FREEZE));
freeze = REG_SET_FIELD(freeze, SDMA0_FREEZE, FREEZE, 0);
@@ -1518,7 +1529,6 @@ static int sdma_v5_2_reset_queue(struct amdgpu_ring 
*ring, unsigned int vmid)
 
r = sdma_v5_2_gfx_resume_instance(adev, inst_id, true);
 
-err0:
amdgpu_gfx_rlc_exit_safe_mode(adev, 0);
return r;
 }
-- 
2.25.1



[v4 5/7] drm/amdgpu: Optimize SDMA v5.2 queue reset and stop logic

2025-04-10 Thread jesse.zh...@amd.com
From: "jesse.zh...@amd.com" 

This patch refactors the SDMA v5.2 queue reset and stop logic to improve
code readability, maintainability, and performance. The key changes include:

1. **Generalized `sdma_v5_2_gfx_stop` Function**:
- Added an `inst_mask` parameter to allow stopping specific SDMA 
instances
  instead of all instances. This is useful for resetting individual 
queues.

2. **Simplified `sdma_v5_2_reset_queue` Function**:
- Removed redundant loops and checks by directly using the `ring->me` 
field
  to identify the SDMA instance.
- Reused the `sdma_v5_2_gfx_stop` function to stop the queue, reducing 
code
  duplication.

Signed-off-by: Jesse Zhang 
---
 drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c | 64 +++---
 1 file changed, 26 insertions(+), 38 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c 
b/drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c
index 6f9a5ff7880e..6972f5dd63ef 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c
@@ -405,15 +405,15 @@ static void sdma_v5_2_ring_emit_fence(struct amdgpu_ring 
*ring, u64 addr, u64 se
  * sdma_v5_2_gfx_stop - stop the gfx async dma engines
  *
  * @adev: amdgpu_device pointer
- *
+ * @inst_mask: mask of dma engine instances to be disabled
  * Stop the gfx async dma ring buffers.
  */
-static void sdma_v5_2_gfx_stop(struct amdgpu_device *adev)
+static void sdma_v5_2_gfx_stop(struct amdgpu_device *adev,  uint32_t inst_mask)
 {
u32 rb_cntl, ib_cntl;
int i;
 
-   for (i = 0; i < adev->sdma.num_instances; i++) {
+   for_each_inst(i, inst_mask) {
rb_cntl = RREG32_SOC15_IP(GC, sdma_v5_2_get_reg_offset(adev, i, 
mmSDMA0_GFX_RB_CNTL));
rb_cntl = REG_SET_FIELD(rb_cntl, SDMA0_GFX_RB_CNTL, RB_ENABLE, 
0);
WREG32_SOC15_IP(GC, sdma_v5_2_get_reg_offset(adev, i, 
mmSDMA0_GFX_RB_CNTL), rb_cntl);
@@ -504,9 +504,11 @@ static void sdma_v5_2_enable(struct amdgpu_device *adev, 
bool enable)
 {
u32 f32_cntl;
int i;
+   uint32_t inst_mask;
 
+   inst_mask = GENMASK(adev->sdma.num_instances - 1, 0);
if (!enable) {
-   sdma_v5_2_gfx_stop(adev);
+   sdma_v5_2_gfx_stop(adev, inst_mask);
sdma_v5_2_rlc_stop(adev);
}
 
@@ -1448,40 +1450,26 @@ static int sdma_v5_2_wait_for_idle(struct 
amdgpu_ip_block *ip_block)
 static int sdma_v5_2_reset_queue(struct amdgpu_ring *ring, unsigned int vmid)
 {
struct amdgpu_device *adev = ring->adev;
-   int i, j, r;
-   u32 rb_cntl, ib_cntl, f32_cntl, freeze, cntl, preempt, soft_reset, 
stat1_reg;
+   int j, r;
+   u32 f32_cntl, freeze, cntl, preempt, soft_reset, stat1_reg;
+   u32 inst_id;
 
if (amdgpu_sriov_vf(adev))
return -EINVAL;
 
-   for (i = 0; i < adev->sdma.num_instances; i++) {
-   if (ring == &adev->sdma.instance[i].ring)
-   break;
-   }
-
-   if (i == adev->sdma.num_instances) {
-   DRM_ERROR("sdma instance not found\n");
-   return -EINVAL;
-   }
-
+   inst_id = ring->me;
amdgpu_gfx_rlc_enter_safe_mode(adev, 0);
 
/* stop queue */
-   ib_cntl = RREG32(sdma_v5_2_get_reg_offset(adev, i, 
mmSDMA0_GFX_IB_CNTL));
-   ib_cntl = REG_SET_FIELD(ib_cntl, SDMA0_GFX_IB_CNTL, IB_ENABLE, 0);
-   WREG32(sdma_v5_2_get_reg_offset(adev, i, mmSDMA0_GFX_IB_CNTL), ib_cntl);
-
-   rb_cntl = RREG32(sdma_v5_2_get_reg_offset(adev, i, 
mmSDMA0_GFX_RB_CNTL));
-   rb_cntl = REG_SET_FIELD(rb_cntl, SDMA0_GFX_RB_CNTL, RB_ENABLE, 0);
-   WREG32(sdma_v5_2_get_reg_offset(adev, i, mmSDMA0_GFX_RB_CNTL), rb_cntl);
+   sdma_v5_2_gfx_stop(adev, 1 << ring->me);
 
/*engine stop SDMA1_F32_CNTL.HALT to 1 and SDMAx_FREEZE freeze bit to 1 
*/
-   freeze = RREG32(sdma_v5_2_get_reg_offset(adev, i, mmSDMA0_FREEZE));
+   freeze = RREG32(sdma_v5_2_get_reg_offset(adev, inst_id, 
mmSDMA0_FREEZE));
freeze = REG_SET_FIELD(freeze, SDMA0_FREEZE, FREEZE, 1);
-   WREG32(sdma_v5_2_get_reg_offset(adev, i, mmSDMA0_FREEZE), freeze);
+   WREG32(sdma_v5_2_get_reg_offset(adev, inst_id, mmSDMA0_FREEZE), freeze);
 
for (j = 0; j < adev->usec_timeout; j++) {
-   freeze = RREG32(sdma_v5_2_get_reg_offset(adev, i, 
mmSDMA0_FREEZE));
+   freeze = RREG32(sdma_v5_2_get_reg_offset(adev, inst_id, 
mmSDMA0_FREEZE));
 
if (REG_GET_FIELD(freeze, SDMA0_FREEZE, FROZEN) & 1)
break;
@@ -1490,7 +1478,7 @@ static int sdma_v5_2_reset_queue(struct amdgpu_ring 
*ring, unsigned int vmid)
 
 
if (j == adev->usec_timeout) {
-   stat1_reg = RREG32(sdma_v5_2_get_reg_offset(adev, i, 
mmSDMA0_STATUS1_REG));
+   stat1_reg = RREG32(sdma_v5_2_get_reg_offset(adev, inst_

  1   2   >