from:"jesse.zhang"

[PATCH] drm/amdkfd: Fix resource leak in criu restore queue

2024-09-05 Thread jesse.zhang

From: "jesse.zh...@amd.com" 

To avoid memory leaks, release q_extra_data when exiting the restore queue.

Signed-off-by: Jesse Zhang 
---
 drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
index 20ea745729ee..b439d4d0bd84 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
@@ -1046,6 +1046,7 @@ int kfd_criu_restore_queue(struct kfd_process *p,
pr_debug("Queue id %d was restored successfully\n", queue_id);
 
kfree(q_data);
+   kfree(q_extra_data);
 
return ret;
 }
-- 
2.25.1

[PATCH V2] Revert "drm/amdgpu: remove vm sanity check from amdgpu_vm_make_compute" for Raven

2024-02-28 Thread jesse.zhang

From: "Jesse.Zhang" 

fix the issue:
"amdgpu: Failed to create process VM object".

[Why]
When amdgpu is initialized, seq64 does its mapping and updates the BO mapping
in the VM page table.
But when clinfo runs, it also initializes a VM for a process device through the
function kfd_process_device_init_vm
and ensures the root PD is clean through the function amdgpu_vm_pt_is_root_clean.
So the two conflict, and clinfo always fails.

[HOW]
Skip the seq64 entry check in vm page table.

Signed-off-by: Jesse Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c | 13 +
 1 file changed, 13 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
index a160265ddc07..bdae5381887e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
@@ -746,8 +746,21 @@ bool amdgpu_vm_pt_is_root_clean(struct amdgpu_device *adev,
enum amdgpu_vm_level root = adev->vm_manager.root_level;
unsigned int entries = amdgpu_vm_pt_num_entries(adev, root);
unsigned int i = 0;
+   u64 seq64_addr = (adev->vm_manager.max_pfn << AMDGPU_GPU_PAGE_SHIFT) - 
AMDGPU_VA_RESERVED_TOP;
+
+   seq64_addr /= AMDGPU_GPU_PAGE_SIZE;
+   mask = amdgpu_vm_pt_entries_mask(adev, adev->vm_manager.root_level);
+   shift = amdgpu_vm_pt_level_shift(adev, adev->vm_manager.root_level);
+   seq64_entry = (seq64_addr >> shift) & mask;
 
for (i = 0; i < entries; i++) {
+   /* seq64  reserve 2M memory from top of address space.
+* Then do the mapping and update the vm page table at amdgpu 
initialize.
+* So skip the know result.
+*/
+
+   if(i == seq64_entry)
+   continue;
if (to_amdgpu_bo_vm(vm->root.bo)->entries[i].bo)
return false;
}
-- 
2.34.1

[PATCH V2] drm/amdkfd: fix shift out of bounds about gpu debug

2024-03-03 Thread jesse.zhang

From: Jesse Zhang 

[ 3810.410040] UBSAN: shift-out-of-bounds in 
drivers/gpu/drm/amd/amdgpu/../amdkfd/kfd_int_process_v10.c:345:5
[ 3810.410044] shift exponent 4294967295 is too large for 64-bit type 'long 
long unsigned int'
[ 3810.410047] CPU: 6 PID: 331 Comm: kworker/6:1H Not tainted 6.5.0+ #508
[ 3810.410050] Hardware name: AMD Splinter/Splinter-GNR, BIOS WS54117N_140 
01/16/2024
[ 3810.410052] Workqueue: KFD IH interrupt_wq [amdgpu]
[ 3810.410273] Call Trace:
[ 3810.410274]  
[ 3810.410277]  dump_stack_lvl+0x4c/0x70
[ 3810.410283]  dump_stack+0x14/0x20
[ 3810.410285]  ubsan_epilogue+0x9/0x40
[ 3810.410290]  __ubsan_handle_shift_out_of_bounds+0x113/0x170
[ 3810.410292]  ? 
ZSTD_decompressSequencesSplitLitBuffer_default.isra.0+0x1389/0x1b50
[ 3810.410296]  event_interrupt_wq_v10.cold+0x16/0x1e [amdgpu]
[ 3810.410523]  ? raw_spin_rq_unlock+0x14/0x40
[ 3810.410526]  ? finish_task_switch+0x85/0x2b0
[ 3810.410528]  interrupt_wq+0xb2/0x120 [amdgpu]
[ 3810.410692]  ? interrupt_wq+0xb2/0x120 [amdgpu]
[ 3810.410806]  process_one_work+0x229/0x430
[ 3810.410810]  worker_thread+0x4e/0x3c0
[ 3810.410811]  ? __pfx_worker_thread+0x10/0x10
[ 3810.410813]  kthread+0xfb/0x130
[ 3810.410815]  ? __pfx_kthread+0x10/0x10
[ 3810.410816]  ret_from_fork+0x3d/0x60
[ 3810.410819]  ? __pfx_kthread+0x10/0x10
[ 3810.410820]  ret_from_fork_asm+0x1b/0x30
[ 3810.410823]  

 -v2: define a macro. KFD process interrupts v9, v10, v11 can use that check 
prior to mask conversion
  and user space may find it useful as well.(Jon)

Signed-off-by: Jesse Zhang 
---
 drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c | 3 +++
 drivers/gpu/drm/amd/amdkfd/kfd_int_process_v11.c | 6 +-
 drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c  | 3 +++
 include/uapi/linux/kfd_ioctl.h   | 6 ++
 4 files changed, 17 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c
index 9a06c6fb6605..110ec5f71056 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c
@@ -340,6 +340,9 @@ static void event_interrupt_wq_v10(struct kfd_node *dev,
}
kfd_signal_event_interrupt(pasid, context_id0 & 
0x7f, 23);
} else if (source_id == SOC15_INTSRC_CP_BAD_OPCODE) {
+   /* filter out the invalidate context_id0 */
+   if (KFD_DBG_EC_RANGE_CHECK(context_id0))
+   return;
kfd_set_dbg_ev_from_interrupt(dev, pasid,
KFD_DEBUG_DOORBELL_ID(context_id0),

KFD_EC_MASK(KFD_DEBUG_CP_BAD_OP_ECODE(context_id0)),
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v11.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v11.c
index 7e2859736a55..c28cafa4b902 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v11.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v11.c
@@ -328,11 +328,15 @@ static void event_interrupt_wq_v11(struct kfd_node *dev,
/* CP */
if (source_id == SOC15_INTSRC_CP_END_OF_PIPE)
kfd_signal_event_interrupt(pasid, context_id0, 32);
-   else if (source_id == SOC15_INTSRC_CP_BAD_OPCODE)
+   else if (source_id == SOC15_INTSRC_CP_BAD_OPCODE) {
+   /* filter out the invalidate context_id0 */
+   if (KFD_DBG_EC_RANGE_CHECK(context_id0))
+   return;
kfd_set_dbg_ev_from_interrupt(dev, pasid,
KFD_CTXID0_DOORBELL_ID(context_id0),

KFD_EC_MASK(KFD_CTXID0_CP_BAD_OP_ECODE(context_id0)),
NULL, 0);
+   }
 
/* SDMA */
else if (source_id == SOC21_INTSRC_SDMA_TRAP)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
index 91dd5e045b51..89dbefbd3081 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
@@ -389,6 +389,9 @@ static void event_interrupt_wq_v9(struct kfd_node *dev,
}
kfd_signal_event_interrupt(pasid, sq_int_data, 24);
} else if (source_id == SOC15_INTSRC_CP_BAD_OPCODE) {
+/* filter out the invalidate context_id0 */
+   if (KFD_DBG_EC_RANGE_CHECK(context_id0))
+   return;
kfd_set_dbg_ev_from_interrupt(dev, pasid,
KFD_DEBUG_DOORBELL_ID(context_id0),

KFD_EC_MASK(KFD_DEBUG_CP_BAD_OP_ECODE(context_id0)),
diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h
index 9ce46edc62a5..9cd3aa83aac3 100644
--- a/include/uapi/linux/kfd_

[PATCH] drm/amdgpu : remove unused code

2024-03-04 Thread jesse.zhang

From: Jesse Zhang 

Remove the unused function - amdgpu_vm_pt_is_root_clean
and remove the impossible condition

v1: entries == 0 is not possible any more,
   so this condition could probably be removed (Felix)

Signed-off-by: Jesse Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h|  2 -
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c | 61 ++-
 2 files changed, 16 insertions(+), 47 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
index 7f95039bb37d..047ec1930d12 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
@@ -537,8 +537,6 @@ int amdgpu_vm_pt_create(struct amdgpu_device *adev, struct 
amdgpu_vm *vm,
int level, bool immediate, struct amdgpu_bo_vm **vmbo,
int32_t xcp_id);
 void amdgpu_vm_pt_free_root(struct amdgpu_device *adev, struct amdgpu_vm *vm);
-bool amdgpu_vm_pt_is_root_clean(struct amdgpu_device *adev,
-   struct amdgpu_vm *vm);
 
 int amdgpu_vm_pde_update(struct amdgpu_vm_update_params *params,
 struct amdgpu_vm_bo_base *entry);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
index 8bce4da67131..7ecddb77b3ae 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
@@ -367,6 +367,7 @@ int amdgpu_vm_pt_clear(struct amdgpu_device *adev, struct 
amdgpu_vm *vm,
struct amdgpu_bo *bo = &vmbo->bo;
uint64_t addr;
int r, idx;
+   uint64_t value = 0, flags = 0;
 
/* Figure out our place in the hierarchy */
if (ancestor->parent) {
@@ -409,27 +410,24 @@ int amdgpu_vm_pt_clear(struct amdgpu_device *adev, struct 
amdgpu_vm *vm,
 
addr = 0;
 
-   if (entries) {
-   uint64_t value = 0, flags = 0;
-
-   if (adev->asic_type >= CHIP_VEGA10) {
-   if (level != AMDGPU_VM_PTB) {
-   /* Handle leaf PDEs as PTEs */
-   flags |= AMDGPU_PDE_PTE;
-   amdgpu_gmc_get_vm_pde(adev, level,
- &value, &flags);
-   } else {
-   /* Workaround for fault priority problem on 
GMC9 */
-   flags = AMDGPU_PTE_EXECUTABLE;
-   }
-   }
 
-   r = vm->update_funcs->update(&params, vmbo, addr, 0, entries,
-value, flags);
-   if (r)
-   goto exit;
+   if (adev->asic_type >= CHIP_VEGA10) {
+   if (level != AMDGPU_VM_PTB) {
+   /* Handle leaf PDEs as PTEs */
+   flags |= AMDGPU_PDE_PTE;
+   amdgpu_gmc_get_vm_pde(adev, level,
+ &value, &flags);
+   } else {
+   /* Workaround for fault priority problem on GMC9 */
+   flags = AMDGPU_PTE_EXECUTABLE;
+   }
}
 
+   r = vm->update_funcs->update(&params, vmbo, addr, 0, entries,
+value, flags);
+   if (r)
+   goto exit;
+
r = vm->update_funcs->commit(&params, NULL);
 exit:
drm_dev_exit(idx);
@@ -673,33 +671,6 @@ void amdgpu_vm_pt_free_root(struct amdgpu_device *adev, 
struct amdgpu_vm *vm)
amdgpu_vm_pt_free_dfs(adev, vm, NULL, false);
 }
 
-/**
- * amdgpu_vm_pt_is_root_clean - check if a root PD is clean
- *
- * @adev: amdgpu_device pointer
- * @vm: the VM to check
- *
- * Check all entries of the root PD, if any subsequent PDs are allocated,
- * it means there are page table creating and filling, and is no a clean
- * VM
- *
- * Returns:
- * 0 if this VM is clean
- */
-bool amdgpu_vm_pt_is_root_clean(struct amdgpu_device *adev,
-   struct amdgpu_vm *vm)
-{
-   enum amdgpu_vm_level root = adev->vm_manager.root_level;
-   unsigned int entries = amdgpu_vm_pt_num_entries(adev, root);
-   unsigned int i = 0;
-
-   for (i = 0; i < entries; i++) {
-   if (to_amdgpu_bo_vm(vm->root.bo)->entries[i].bo)
-   return false;
-   }
-   return true;
-}
-
 /**
  * amdgpu_vm_pde_update - update a single level in the hierarchy
  *
-- 
2.25.1

[PATCH V2] drm/ttm: remove unused parameter

2024-03-31 Thread jesse.zhang

From: Jesse Zhang 

Remove the unused parameter in the functions
ttm_bo_bounce_temp_buffer and ttm_bo_add_move_fence.
 V2: rebase the patch on top of drm-misc-next (Christian)

Signed-off-by: Jesse Zhang 
Reviewed-by: Christian König 
---
 drivers/gpu/drm/ttm/ttm_bo.c | 8 +++-
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c
index e059b1e1b13b..6396dece0db1 100644
--- a/drivers/gpu/drm/ttm/ttm_bo.c
+++ b/drivers/gpu/drm/ttm/ttm_bo.c
@@ -402,7 +402,6 @@ void ttm_bo_put(struct ttm_buffer_object *bo)
 EXPORT_SYMBOL(ttm_bo_put);
 
 static int ttm_bo_bounce_temp_buffer(struct ttm_buffer_object *bo,
-struct ttm_resource **mem,
 struct ttm_operation_ctx *ctx,
 struct ttm_place *hop)
 {
@@ -469,7 +468,7 @@ static int ttm_bo_evict(struct ttm_buffer_object *bo,
if (ret != -EMULTIHOP)
break;
 
-   ret = ttm_bo_bounce_temp_buffer(bo, &evict_mem, ctx, &hop);
+   ret = ttm_bo_bounce_temp_buffer(bo, ctx, &hop);
} while (!ret);
 
if (ret) {
@@ -698,7 +697,6 @@ EXPORT_SYMBOL(ttm_bo_unpin);
  */
 static int ttm_bo_add_move_fence(struct ttm_buffer_object *bo,
 struct ttm_resource_manager *man,
-struct ttm_resource *mem,
 bool no_wait_gpu)
 {
struct dma_fence *fence;
@@ -787,7 +785,7 @@ static int ttm_bo_alloc_resource(struct ttm_buffer_object 
*bo,
if (ret)
continue;
 
-   ret = ttm_bo_add_move_fence(bo, man, *res, ctx->no_wait_gpu);
+   ret = ttm_bo_add_move_fence(bo, man, ctx->no_wait_gpu);
if (unlikely(ret)) {
ttm_resource_free(bo, res);
if (ret == -EBUSY)
@@ -894,7 +892,7 @@ int ttm_bo_validate(struct ttm_buffer_object *bo,
 bounce:
ret = ttm_bo_handle_move_mem(bo, res, false, ctx, &hop);
if (ret == -EMULTIHOP) {
-   ret = ttm_bo_bounce_temp_buffer(bo, &res, ctx, &hop);
+   ret = ttm_bo_bounce_temp_buffer(bo, ctx, &hop);
/* try and move to final place now. */
if (!ret)
goto bounce;
-- 
2.25.1

[PATCH 1/4] drm/amdgpu: add check before free wb entry

2024-04-23 Thread jesse.zhang

From: Jesse Zhang 

Check that the ring is not a MES queue before freeing the wb entry.

Signed-off-by: Jesse Zhang 
---
 drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c | 3 ++-
 drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c | 3 ++-
 drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c | 3 ++-
 3 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c 
b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
index 45a2d0a5a2d7..b7d33d78bce0 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
@@ -999,7 +999,8 @@ static int sdma_v5_0_ring_test_ring(struct amdgpu_ring 
*ring)
r = amdgpu_ring_alloc(ring, 20);
if (r) {
DRM_ERROR("amdgpu: dma failed to lock ring %d (%d).\n", 
ring->idx, r);
-   amdgpu_device_wb_free(adev, index);
+   if (!ring->is_mes_queue)
+   amdgpu_device_wb_free(adev, index);
return r;
}
 
diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c 
b/drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c
index 43e64b2da575..cc9e961f0078 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c
@@ -839,7 +839,8 @@ static int sdma_v5_2_ring_test_ring(struct amdgpu_ring 
*ring)
r = amdgpu_ring_alloc(ring, 20);
if (r) {
DRM_ERROR("amdgpu: dma failed to lock ring %d (%d).\n", 
ring->idx, r);
-   amdgpu_device_wb_free(adev, index);
+   if (!ring->is_mes_queue)
+   amdgpu_device_wb_free(adev, index);
return r;
}
 
diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c 
b/drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c
index 1f4877195213..c833b6b8373b 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c
@@ -861,7 +861,8 @@ static int sdma_v6_0_ring_test_ring(struct amdgpu_ring 
*ring)
r = amdgpu_ring_alloc(ring, 5);
if (r) {
DRM_ERROR("amdgpu: dma failed to lock ring %d (%d).\n", 
ring->idx, r);
-   amdgpu_device_wb_free(adev, index);
+   if (!ring->is_mes_queue)
+   amdgpu_device_wb_free(adev, index);
return r;
}
 
-- 
2.25.1

[PATCH 2/4] Initialize the last_jump_jiffies in atom_exec_context before it is used

2024-04-23 Thread jesse.zhang

From: Jesse Zhang 

The field "last_jump_jiffies" of atom_exec_context should be initialized before
it is used in the function atom_op_jump.

Signed-off-by: Jesse Zhang 
---
 drivers/gpu/drm/amd/amdgpu/atom.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/atom.c 
b/drivers/gpu/drm/amd/amdgpu/atom.c
index 72362df352f6..d552e013354c 100644
--- a/drivers/gpu/drm/amd/amdgpu/atom.c
+++ b/drivers/gpu/drm/amd/amdgpu/atom.c
@@ -1243,6 +1243,7 @@ static int amdgpu_atom_execute_table_locked(struct 
atom_context *ctx, int index,
ectx.ps_size = params_size;
ectx.abort = false;
ectx.last_jump = 0;
+   ectx.last_jump_jiffies = 0;
if (ws) {
ectx.ws = kcalloc(4, ws, GFP_KERNEL);
ectx.ws_size = ws;
-- 
2.25.1

[PATCH 3/4] drm/amdgpu: Using uninitialized value new_state.jpeg when calling adev->vcn.pause_dpg_mode

2024-04-23 Thread jesse.zhang

From: Jesse Zhang 

Initialize new_state.jpeg before it is used.

Signed-off-by: Jesse Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c | 5 +
 1 file changed, 5 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
index 677eb141554e..13125ddd5e86 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
@@ -410,6 +410,11 @@ static void amdgpu_vcn_idle_work_handler(struct 
work_struct *work)
else
new_state.fw_based = VCN_DPG_STATE__UNPAUSE;
 
+   if 
(amdgpu_fence_count_emitted(adev->jpeg.inst->ring_dec))
+   new_state.jpeg = VCN_DPG_STATE__PAUSE;
+   else
+   new_state.jpeg = VCN_DPG_STATE__UNPAUSE;
+
adev->vcn.pause_dpg_mode(adev, j, &new_state);
}
 
-- 
2.25.1

[PATCH 4/4] drm/amdgpu: Using uninitialized value *size when calling amdgpu_vce_cs_reloc

2024-04-23 Thread jesse.zhang

From: Jesse Zhang 

Initialize the size before calling amdgpu_vce_cs_reloc, such as case 0x0301.

Signed-off-by: Jesse Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c
index 59acf424a078..60d97cd14855 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c
@@ -742,7 +742,7 @@ int amdgpu_vce_ring_parse_cs(struct amdgpu_cs_parser *p,
uint32_t destroyed = 0;
uint32_t created = 0;
uint32_t allocated = 0;
-   uint32_t tmp, handle = 0;
+   uint32_t tmp = 0, handle = 0;
uint32_t *size = &tmp;
unsigned int idx;
int i, r = 0;
-- 
2.25.1

[PATCH 1/2] drm/sched: adding a new scheduling policy

2024-10-10 Thread jesse.zhang

From: "jesse.zh...@amd.com" 

Add ring ID scheduling.
In some cases, userspace needs to run a job on a specific ring
instead of having the best ring selected based on the ring score.
For example, the user may want to run a bad job on a specific ring to check
whether the ring can recover from a queue reset.

Signed-off-by: Jesse Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c   |  2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_job.c  |  2 +-
 drivers/gpu/drm/etnaviv/etnaviv_sched.c  |  2 +-
 drivers/gpu/drm/imagination/pvr_queue.c  |  2 +-
 drivers/gpu/drm/lima/lima_sched.c|  2 +-
 drivers/gpu/drm/msm/msm_gem_submit.c |  2 +-
 drivers/gpu/drm/nouveau/nouveau_sched.c  |  2 +-
 drivers/gpu/drm/panfrost/panfrost_job.c  |  2 +-
 drivers/gpu/drm/scheduler/sched_entity.c | 11 +--
 drivers/gpu/drm/scheduler/sched_main.c   |  4 ++--
 drivers/gpu/drm/v3d/v3d_submit.c |  2 +-
 include/drm/gpu_scheduler.h  |  4 ++--
 12 files changed, 22 insertions(+), 15 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
index d891ab779ca7..18887128a973 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
@@ -1286,7 +1286,7 @@ static int amdgpu_cs_submit(struct amdgpu_cs_parser *p,
int r;
 
for (i = 0; i < p->gang_size; ++i)
-   drm_sched_job_arm(&p->jobs[i]->base);
+   drm_sched_job_arm(&p->jobs[i]->base, -1);
 
for (i = 0; i < p->gang_size; ++i) {
struct dma_fence *fence;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
index 717adcedf096..8d75ffa9a097 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
@@ -320,7 +320,7 @@ struct dma_fence *amdgpu_job_submit(struct amdgpu_job *job)
 {
struct dma_fence *f;
 
-   drm_sched_job_arm(&job->base);
+   drm_sched_job_arm(&job->base, -1);
f = dma_fence_get(&job->base.s_fence->finished);
amdgpu_job_free_resources(job);
drm_sched_entity_push_job(&job->base);
diff --git a/drivers/gpu/drm/etnaviv/etnaviv_sched.c 
b/drivers/gpu/drm/etnaviv/etnaviv_sched.c
index 62dcfdc7894d..98d003757af1 100644
--- a/drivers/gpu/drm/etnaviv/etnaviv_sched.c
+++ b/drivers/gpu/drm/etnaviv/etnaviv_sched.c
@@ -107,7 +107,7 @@ int etnaviv_sched_push_job(struct etnaviv_gem_submit 
*submit)
 */
mutex_lock(&gpu->sched_lock);
 
-   drm_sched_job_arm(&submit->sched_job);
+   drm_sched_job_arm(&submit->sched_job, -1);
 
submit->out_fence = dma_fence_get(&submit->sched_job.s_fence->finished);
ret = xa_alloc_cyclic(&gpu->user_fences, &submit->out_fence_id,
diff --git a/drivers/gpu/drm/imagination/pvr_queue.c 
b/drivers/gpu/drm/imagination/pvr_queue.c
index 5ed9c98fb599..ed7398a0ff21 100644
--- a/drivers/gpu/drm/imagination/pvr_queue.c
+++ b/drivers/gpu/drm/imagination/pvr_queue.c
@@ -1115,7 +1115,7 @@ int pvr_queue_job_init(struct pvr_job *job)
  */
 struct dma_fence *pvr_queue_job_arm(struct pvr_job *job)
 {
-   drm_sched_job_arm(&job->base);
+   drm_sched_job_arm(&job->base, -1);
 
return &job->base.s_fence->finished;
 }
diff --git a/drivers/gpu/drm/lima/lima_sched.c 
b/drivers/gpu/drm/lima/lima_sched.c
index bbf3f8feab94..cc83b2aab9ce 100644
--- a/drivers/gpu/drm/lima/lima_sched.c
+++ b/drivers/gpu/drm/lima/lima_sched.c
@@ -130,7 +130,7 @@ int lima_sched_task_init(struct lima_sched_task *task,
return err;
}
 
-   drm_sched_job_arm(&task->base);
+   drm_sched_job_arm(&task->base, -1);
 
task->num_bos = num_bos;
task->vm = lima_vm_get(vm);
diff --git a/drivers/gpu/drm/msm/msm_gem_submit.c 
b/drivers/gpu/drm/msm/msm_gem_submit.c
index fba78193127d..74c4e1b4df78 100644
--- a/drivers/gpu/drm/msm/msm_gem_submit.c
+++ b/drivers/gpu/drm/msm/msm_gem_submit.c
@@ -831,7 +831,7 @@ int msm_ioctl_gem_submit(struct drm_device *dev, void *data,
goto out;
}
 
-   drm_sched_job_arm(&submit->base);
+   drm_sched_job_arm(&submit->base, -1);
 
submit->user_fence = dma_fence_get(&submit->base.s_fence->finished);
 
diff --git a/drivers/gpu/drm/nouveau/nouveau_sched.c 
b/drivers/gpu/drm/nouveau/nouveau_sched.c
index 32fa2e273965..3ff8142b5370 100644
--- a/drivers/gpu/drm/nouveau/nouveau_sched.c
+++ b/drivers/gpu/drm/nouveau/nouveau_sched.c
@@ -309,7 +309,7 @@ nouveau_job_submit(struct nouveau_job *job)
list_add(&job->entry, &sched->job.list.head);
spin_unlock(&sched->job.list.lock);
 
-   drm_sched_job_arm(&job->base);
+   drm_sched_job_arm(&job->base, -1);
job->done_fence = dma_fence_get(&job->base.s_fence->finished);
if (job->sync)
done_fence = dma_fence_get(job->done_fence);
diff --git a/drivers/gpu/drm/panfrost/panfrost_job.c 
b/drivers/gpu/drm/panfrost/panfrost_job.c
index a61ef0af9a4e..cc937420cd35 100644
--- a/driv

[PATCH 2/2] drm/amdgpu: add the ring id schedule module parameter for amdgpu

2024-10-10 Thread jesse.zhang

From: "jesse.zh...@amd.com" 

Added ring id schedule to switch scheduling policy when cs submits.
Schedule the ring by setting the ring id.

Signed-off-by: Jesse Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h | 1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c  | 9 +++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 8 
 3 files changed, 16 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 48c9b9b06905..3fd3e4eeab47 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -227,6 +227,7 @@ extern int amdgpu_noretry;
 extern int amdgpu_force_asic_type;
 extern int amdgpu_smartshift_bias;
 extern int amdgpu_use_xgmi_p2p;
+extern int amdgpu_ring_id_schedule;
 extern int amdgpu_mtype_local;
 extern bool enforce_isolation;
 #ifdef CONFIG_HSA_AMD
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
index 18887128a973..33658bef5513 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
@@ -1276,6 +1276,7 @@ static void amdgpu_cs_post_dependencies(struct 
amdgpu_cs_parser *p)
 static int amdgpu_cs_submit(struct amdgpu_cs_parser *p,
union drm_amdgpu_cs *cs)
 {
+   struct drm_amdgpu_cs_chunk_ib *chunk_ib = p->chunks[0].kdata;
struct amdgpu_fpriv *fpriv = p->filp->driver_priv;
struct amdgpu_job *leader = p->gang_leader;
struct amdgpu_bo_list_entry *e;
@@ -1285,8 +1286,12 @@ static int amdgpu_cs_submit(struct amdgpu_cs_parser *p,
uint64_t seq;
int r;
 
-   for (i = 0; i < p->gang_size; ++i)
-   drm_sched_job_arm(&p->jobs[i]->base, -1);
+   for (i = 0; i < p->gang_size; ++i) {
+   if (amdgpu_ring_id_schedule)
+   drm_sched_job_arm(&p->jobs[i]->base, chunk_ib->ring);
+   else
+   drm_sched_job_arm(&p->jobs[i]->base, -1);
+   }
 
for (i = 0; i < p->gang_size; ++i) {
struct dma_fence *fence;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index 680e44fdee6e..55fba9e93a8b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -221,6 +221,7 @@ int amdgpu_reset_method = -1; /* auto */
 int amdgpu_num_kcq = -1;
 int amdgpu_smartshift_bias;
 int amdgpu_use_xgmi_p2p = 1;
+int amdgpu_ring_id_schedule = 0;
 int amdgpu_vcnfw_log;
 int amdgpu_sg_display = -1; /* auto */
 int amdgpu_user_partt_mode = AMDGPU_AUTO_COMPUTE_PARTITION_MODE;
@@ -740,6 +741,13 @@ MODULE_PARM_DESC(use_xgmi_p2p,
"Enable XGMI P2P interface (0 = disable; 1 = enable (default))");
 module_param_named(use_xgmi_p2p, amdgpu_use_xgmi_p2p, int, 0444);
 
+/**
+ * DOC: ring_id_schedule (int)
+ * Enables/disables ring id schedule interface (0 = disable, 1 = enable, -1 
auto (default))
+ */
+MODULE_PARM_DESC(ring_id_schedule,
+   "Enable ring id schedule interface(0 = disable, 1 = enable, -1 auto 
(default))");
+module_param_named(ring_id_schedule, amdgpu_ring_id_schedule, int, 0644);
 
 #ifdef CONFIG_HSA_AMD
 /**
-- 
2.25.1

[PATCH 2/4] drm/amdgpu/sdma: Refactor SDMA reset functionality and add callback support

2025-02-07 Thread jesse.zhang

From: "jesse.zh...@amd.com" 

This patch refactors the SDMA reset functionality in the `sdma_v4_4_2` driver
to improve modularity and support shared usage between AMDGPU and KFD. The
changes include:

1. **Refactored SDMA Reset Logic**:
   - Split the `sdma_v4_4_2_reset_queue` function into two separate functions:
 - `sdma_v4_4_2_stop_queue`: Stops the SDMA queue before reset.
 - `sdma_v4_4_2_restore_queue`: Restores the SDMA queue after reset.
   - These functions are now used as callbacks for the shared reset mechanism.

2. **Added Callback Support**:
   - Introduced a new structure `sdma_v4_4_2_reset_funcs` to hold the stop and
 restore callbacks.
   - Added `sdma_v4_4_2_set_reset_funcs` to register these callbacks with the
 shared reset mechanism using `amdgpu_set_on_reset_callbacks`.

3. **Fixed Reset Queue Function**:
   - Modified `sdma_v4_4_2_reset_queue` to use the shared 
`amdgpu_sdma_reset_queue`
 function, ensuring consistency across the driver.

This patch ensures that SDMA reset functionality is more modular, reusable, and
aligned with the shared reset mechanism between AMDGPU and KFD.

Suggested-by: Jiadong Zhu 
Signed-off-by: Jesse Zhang 
---
 drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c | 32 +---
 1 file changed, 28 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c 
b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
index 64c163dd708f..3e60456b0db0 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
@@ -105,6 +105,7 @@ static void sdma_v4_4_2_set_buffer_funcs(struct 
amdgpu_device *adev);
 static void sdma_v4_4_2_set_vm_pte_funcs(struct amdgpu_device *adev);
 static void sdma_v4_4_2_set_irq_funcs(struct amdgpu_device *adev);
 static void sdma_v4_4_2_set_ras_funcs(struct amdgpu_device *adev);
+static void sdma_v4_4_2_set_reset_funcs(struct amdgpu_device *adev);
 
 static u32 sdma_v4_4_2_get_reg_offset(struct amdgpu_device *adev,
u32 instance, u32 offset)
@@ -1330,6 +1331,7 @@ static int sdma_v4_4_2_early_init(struct amdgpu_ip_block 
*ip_block)
sdma_v4_4_2_set_vm_pte_funcs(adev);
sdma_v4_4_2_set_irq_funcs(adev);
sdma_v4_4_2_set_ras_funcs(adev);
+   sdma_v4_4_2_set_reset_funcs(adev);
 
return 0;
 }
@@ -1605,8 +1607,14 @@ static int sdma_v4_4_2_soft_reset(struct amdgpu_ip_block 
*ip_block)
 static int sdma_v4_4_2_reset_queue(struct amdgpu_ring *ring, unsigned int vmid)
 {
struct amdgpu_device *adev = ring->adev;
-   int i, r;
+   u32 id = GET_INST(SDMA0, ring->me);
+   return amdgpu_sdma_reset_instance(adev, id);
+}
+
+static int sdma_v4_4_2_stop_queue(struct amdgpu_device *adev, uint32_t 
instance_id)
+{
u32 inst_mask;
+   struct amdgpu_ring *ring = &adev->sdma.instance[instance_id].ring;
 
if (amdgpu_sriov_vf(adev))
return -EINVAL;
@@ -1617,10 +1625,16 @@ static int sdma_v4_4_2_reset_queue(struct amdgpu_ring 
*ring, unsigned int vmid)
if (adev->sdma.has_page_queue)
sdma_v4_4_2_inst_page_stop(adev, inst_mask);
 
-   r = amdgpu_dpm_reset_sdma(adev, 1 << GET_INST(SDMA0, ring->me));
-   if (r)
-   return r;
+   return 0;
+}
 
+static int sdma_v4_4_2_restore_queue(struct amdgpu_device *adev, uint32_t 
instance_id)
+{
+   int i;
+   u32 inst_mask;
+   struct amdgpu_ring *ring = &adev->sdma.instance[instance_id].ring;
+
+   inst_mask = 1 << ring->me;
udelay(50);
 
for (i = 0; i < adev->usec_timeout; i++) {
@@ -1638,6 +1652,16 @@ static int sdma_v4_4_2_reset_queue(struct amdgpu_ring 
*ring, unsigned int vmid)
return sdma_v4_4_2_inst_start(adev, inst_mask, true);
 }
 
+static struct sdma_on_reset_funcs sdma_v4_4_2_reset_funcs = {
+   .pre_reset = sdma_v4_4_2_stop_queue,
+   .post_reset = sdma_v4_4_2_restore_queue,
+};
+
+static void sdma_v4_4_2_set_reset_funcs(struct amdgpu_device *adev)
+{
+   amdgpu_sdma_register_on_reset_callbacks(adev, &sdma_v4_4_2_reset_funcs);
+}
+
 static int sdma_v4_4_2_set_trap_irq_state(struct amdgpu_device *adev,
struct amdgpu_irq_src *source,
unsigned type,
-- 
2.25.1

[PATCH 4/4] drm/amdgpu: Improve SDMA reset logic with guilty queue tracking

2025-02-07 Thread jesse.zhang

From: "jesse.zh...@amd.com" 

This commit introduces several improvements to the SDMA reset logic:

1. Added `cached_rptr` to the `amdgpu_ring` structure to store the read pointer
   before a reset, ensuring proper state restoration after reset.

2. Introduced `gfx_guilty` and `page_guilty` flags in the `amdgpu_sdma` 
structure
   to track which queue (GFX or PAGE) caused a timeout or error.

3. Replaced the `caller` parameter with a `guilty` boolean in the reset and 
resume
   functions to simplify the logic and handle resets based on the guilty state.

4. Added a helper function `sdma_v4_4_2_is_queue_selected` to check the
   `SDMA*_*_CONTEXT_STATUS.SELECTED` register and determine if a queue is 
guilty.

v2:
   1.replace the caller with a guilty bool.
   If the queue is the guilty one, set the rptr and wptr  to the saved wptr 
value,
   else, set the rptr and wptr to the saved rptr value. (Alex)
   2. cache the rptr before the reset. (Alex)

v3: add a new ring callback, is_guilty(), which is called to check whether
the ring in amdgpu_job_timedout() is actually the guilty ring. If it is not,
we can skip the error handling and jump to exit. (Alex)

Suggested-by: Alex Deucher 
Suggested-by: Jiadong Zhu 
Signed-off-by: Jesse Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_job.c  | 10 +++
 drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c |  2 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h |  3 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c |  6 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h |  3 +
 drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c | 90 
 6 files changed, 100 insertions(+), 14 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
index 100f04475943..ce3e7a9d6688 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
@@ -102,6 +102,16 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct 
drm_sched_job *s_job)
return DRM_GPU_SCHED_STAT_ENODEV;
}
 
+   /* Check if the ring is actually guilty of causing the timeout.
+* If not, skip error handling and fence completion.
+*/
+   if (amdgpu_gpu_recovery && ring->funcs->is_guilty) {
+   if (!ring->funcs->is_guilty(ring)) {
+   dev_err(adev->dev, "ring %s timeout, but not guilty\n",
+   s_job->sched->name);
+   goto exit;
+   }
+   }
/*
 * Do the coredump immediately after a job timeout to get a very
 * close dump/snapshot/representation of GPU's current error status
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
index a6e28fe3f8d6..20cd21df38ba 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
@@ -342,6 +342,8 @@ int amdgpu_ring_init(struct amdgpu_device *adev, struct 
amdgpu_ring *ring,
ring->buf_mask = (ring->ring_size / 4) - 1;
ring->ptr_mask = ring->funcs->support_64bit_ptrs ?
0x : ring->buf_mask;
+   /*  Initialize cached_rptr to 0 */
+   ring->cached_rptr = 0;
 
/* Allocate ring buffer */
if (ring->is_mes_queue) {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
index 04af26536f97..182aa535d395 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
@@ -237,6 +237,7 @@ struct amdgpu_ring_funcs {
void (*patch_de)(struct amdgpu_ring *ring, unsigned offset);
int (*reset)(struct amdgpu_ring *ring, unsigned int vmid);
void (*emit_cleaner_shader)(struct amdgpu_ring *ring);
+   bool (*is_guilty)(struct amdgpu_ring *ring);
 };
 
 struct amdgpu_ring {
@@ -306,6 +307,8 @@ struct amdgpu_ring {
 
boolis_sw_ring;
unsigned intentry_index;
+   /* store the cached rptr to restore after reset */
+   uint64_t cached_rptr;
 
 };
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
index 8864a9d7455b..02d3685d10fc 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
@@ -474,6 +474,10 @@ void amdgpu_sdma_register_on_reset_callbacks(struct 
amdgpu_device *adev, struct
if (!funcs)
return;
 
+   /* Ensure the reset_callback_list is initialized */
+   if (!adev->sdma.reset_callback_list.next) {
+   INIT_LIST_HEAD(&adev->sdma.reset_callback_list);
+   }
/* Initialize the list node in the callback structure */
INIT_LIST_HEAD(&funcs->list);
 
@@ -513,7 +517,7 @@ int amdgpu_sdma_reset_instance(struct amdgpu_device *adev, 
uint32_t instance_id,
*/
if (!amdgpu_ring_sched_ready(gfx_ring)) {
drm_sched_wqueue_stop(&gfx_ring->sched);
-   gfx_sched_stopped = true;;
+   gfx_sched_stopped = t

[PATCH 3/4] drm/amdgpu: Add common lock and reset caller parameter for SDMA reset synchronization

2025-02-07 Thread jesse.zhang

From: "jesse.zh...@amd.com" 

This commit introduces a caller parameter to the amdgpu_sdma_reset_instance 
function to differentiate
between reset requests originating from the KGD and KFD.
This change ensures proper synchronization between KGD and KFD during SDMA 
resets.

If the caller is KFD, the function now acquires and releases the scheduler lock 
(ring->sched.job_list_lock)
to protect the SDMA queue during the reset.

These changes prevent race conditions and ensure safe SDMA reset operations
when initiated by KFD, improving system stability and reliability.

V2: replace the ring_lock with the existing scheduler
locks for the queues (ring->sched) on the SDMA engine. (Alex)

v3: call drm_sched_wqueue_stop() rather than job_list_lock.
If a GPU ring reset was already initiated for one ring at 
amdgpu_job_timedout,
skip resetting that ring and call drm_sched_wqueue_stop()
for the other rings (Alex)

   replace the common lock (sdma_reset_lock) with the DQM lock
   to resolve reset races between the two driver sections during KFD
eviction. (Jon)

   Rename the caller to Reset_src and
   Change AMDGPU_RESET_SRC_SDMA_KGD/KFD to AMDGPU_RESET_SRC_SDMA_HWS/RING (Jon)
v4: restart the wqueue if the reset was successful,
or fall back to a full adapter reset. (Alex)

   move definition of reset source to enumeration AMDGPU_RESET_SRCS, and
   check reset src in amdgpu_sdma_reset_instance (Jon)

Suggested-by: Alex Deucher 
Suggested-by: Jiadong Zhu 
Suggested-by: Jonathan Kim 
Signed-off-by: Jesse Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h |  2 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c  | 54 +++
 drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h  |  6 +--
 drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c  |  8 ++--
 4 files changed, 56 insertions(+), 14 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
index 4d9b9701139b..5b86e12ff9fe 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
@@ -43,6 +43,8 @@ enum AMDGPU_RESET_SRCS {
AMDGPU_RESET_SRC_MES,
AMDGPU_RESET_SRC_HWS,
AMDGPU_RESET_SRC_USER,
+   AMDGPU_RESET_SRC_SDMA_RING,
+   AMDGPU_RESET_SRC_SDMA_HWS,
 };
 
 struct amdgpu_reset_context {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
index 19c8be7d72e2..8864a9d7455b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
@@ -25,6 +25,7 @@
 #include "amdgpu.h"
 #include "amdgpu_sdma.h"
 #include "amdgpu_ras.h"
+#include "amdgpu_reset.h"
 
 #define AMDGPU_CSA_SDMA_SIZE 64
 /* SDMA CSA reside in the 3rd page of CSA */
@@ -484,6 +485,7 @@ void amdgpu_sdma_register_on_reset_callbacks(struct 
amdgpu_device *adev, struct
  * amdgpu_sdma_reset_instance - Reset a specific SDMA instance
  * @adev: Pointer to the AMDGPU device
  * @instance_id: ID of the SDMA engine instance to reset
+ * @src: The source of reset function (KGD or KFD)
  *
  * This function performs the following steps:
  * 1. Calls all registered pre_reset callbacks to allow KFD and AMDGPU to save 
their state.
@@ -492,20 +494,42 @@ void amdgpu_sdma_register_on_reset_callbacks(struct 
amdgpu_device *adev, struct
  *
  * Returns: 0 on success, or a negative error code on failure.
  */
-int amdgpu_sdma_reset_instance(struct amdgpu_device *adev, uint32_t 
instance_id)
+int amdgpu_sdma_reset_instance(struct amdgpu_device *adev, uint32_t 
instance_id, int src)
 {
struct sdma_on_reset_funcs *funcs;
-   int ret;
+   int ret = 0;
+   struct amdgpu_sdma_instance *sdma_instance = 
&adev->sdma.instance[instance_id];;
+   struct amdgpu_ring *gfx_ring = &sdma_instance->ring;
+   struct amdgpu_ring *page_ring = &sdma_instance->page;
+   bool gfx_sched_stopped = false, page_sched_stopped = false;
+
+   /* Check if the reset source is valid for SDMA ring reset */
+   if (src != AMDGPU_RESET_SRC_SDMA_RING && src != AMDGPU_RESET_SRC_HWS)
+   return -EINVAL;
+
+   /* Stop the scheduler's work queue for the GFX and page rings if they 
are running.
+   * This ensures that no new tasks are submitted to the queues while
+   * the reset is in progress.
+   */
+   if (!amdgpu_ring_sched_ready(gfx_ring)) {
+   drm_sched_wqueue_stop(&gfx_ring->sched);
+   gfx_sched_stopped = true;;
+   }
+
+   if (adev->sdma.has_page_queue && !amdgpu_ring_sched_ready(page_ring)) {
+   drm_sched_wqueue_stop(&page_ring->sched);
+   page_sched_stopped = true;
+   }
 
/* Invoke all registered pre_reset callbacks */
list_for_each_entry(funcs, &adev->sdma.reset_callback_list, list) {
if (funcs->pre_reset) {
-   ret = funcs->pre_reset(adev, instance_id);
+   ret = funcs->pre_reset(adev, instance_id, src);

[PATCH 1/4] drm/amdgpu/kfd: Add shared SDMA reset functionality with callback support

2025-02-07 Thread jesse.zhang

From: "jesse.zh...@amd.com" 

This patch introduces shared SDMA reset functionality between AMDGPU and KFD.
The implementation includes the following key changes:

1. Added `amdgpu_sdma_reset_queue`:
   - Resets a specific SDMA queue by instance ID.
   - Invokes registered pre-reset and post-reset callbacks to allow KFD and 
AMDGPU
 to save/restore their state during the reset process.

2. Added `amdgpu_set_on_reset_callbacks`:
   - Allows KFD and AMDGPU to register callback functions for pre-reset and
 post-reset operations.
   - Callbacks are stored in a global linked list and invoked in the correct 
order
 during SDMA reset.

This patch ensures that both AMDGPU and KFD can handle SDMA reset events
gracefully, with proper state saving and restoration. It also provides a 
flexible
callback mechanism for future extensions.

v2: fix CamelCase and put the SDMA helper into amdgpu_sdma.c (Alex)
v3: rename the `amdgpu_register_on_reset_callbacks` function to
  `amdgpu_sdma_register_on_reset_callbacks`
move global reset_callback_list to struct amdgpu_sdma (Alex)

Suggested-by: Alex Deucher 
Suggested-by: Jiadong Zhu 
Signed-off-by: Jesse Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c | 72 
 drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h | 11 
 drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c |  2 +-
 3 files changed, 84 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
index 174badca27e7..19c8be7d72e2 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
@@ -460,3 +460,75 @@ void amdgpu_sdma_sysfs_reset_mask_fini(struct 
amdgpu_device *adev)
device_remove_file(adev->dev, 
&dev_attr_sdma_reset_mask);
}
 }
+
+/**
+ * amdgpu_sdma_register_on_reset_callbacks - Register SDMA reset callbacks
+ * @funcs: Pointer to the callback structure containing pre_reset and 
post_reset functions
+ *
+ * This function allows KFD and AMDGPU to register their own callbacks for 
handling
+ * pre-reset and post-reset operations. The callbacks are added to a global 
list.
+ */
+void amdgpu_sdma_register_on_reset_callbacks(struct amdgpu_device *adev, 
struct sdma_on_reset_funcs *funcs)
+{
+   if (!funcs)
+   return;
+
+   /* Initialize the list node in the callback structure */
+   INIT_LIST_HEAD(&funcs->list);
+
+   /* Add the callback structure to the global list */
+   list_add_tail(&funcs->list, &adev->sdma.reset_callback_list);
+}
+
+/**
+ * amdgpu_sdma_reset_instance - Reset a specific SDMA instance
+ * @adev: Pointer to the AMDGPU device
+ * @instance_id: ID of the SDMA engine instance to reset
+ *
+ * This function performs the following steps:
+ * 1. Calls all registered pre_reset callbacks to allow KFD and AMDGPU to save 
their state.
+ * 2. Resets the specified SDMA engine instance.
+ * 3. Calls all registered post_reset callbacks to allow KFD and AMDGPU to 
restore their state.
+ *
+ * Returns: 0 on success, or a negative error code on failure.
+ */
+int amdgpu_sdma_reset_instance(struct amdgpu_device *adev, uint32_t 
instance_id)
+{
+   struct sdma_on_reset_funcs *funcs;
+   int ret;
+
+   /* Invoke all registered pre_reset callbacks */
+   list_for_each_entry(funcs, &adev->sdma.reset_callback_list, list) {
+   if (funcs->pre_reset) {
+   ret = funcs->pre_reset(adev, instance_id);
+   if (ret) {
+   dev_err(adev->dev,
+   "beforeReset callback failed for instance %u: 
%d\n",
+   instance_id, ret);
+   return ret;
+   }
+   }
+   }
+
+   /* Perform the SDMA reset for the specified instance */
+   ret = amdgpu_dpm_reset_sdma(adev, 1 << instance_id);
+   if (ret) {
+   dev_err(adev->dev, "Failed to reset SDMA instance %u\n", 
instance_id);
+   return ret;
+   }
+
+   /* Invoke all registered post_reset callbacks */
+   list_for_each_entry(funcs, &adev->sdma.reset_callback_list, list) {
+   if (funcs->post_reset) {
+   ret = funcs->post_reset(adev, instance_id);
+   if (ret) {
+   dev_err(adev->dev,
+   "afterReset callback failed for instance %u: 
%d\n",
+   instance_id, ret);
+   return ret;
+   }
+   }
+   }
+
+   return 0;
+}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h
index 5f60736051d1..fbb8b04ef2cb 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h
@@ -98,6 +98,13 @@ struct amdgpu_sdma_ras {
struct amdgpu_ras_blo

[PATCH 1/2] drm/amd/pm: add support for checking SDMA reset capability

2025-02-13 Thread jesse.zhang

From: "jesse.zh...@amd.com" 

This patch introduces a new function to check if the SMU supports resetting the 
SDMA engine.
This capability check ensures that the driver does not attempt to reset the 
SDMA engine
on hardware that does not support it.

The following changes are included:
- New function `amdgpu_dpm_reset_sdma_is_supported` to check SDMA reset
  support at the AMDGPU driver level.
- New function `smu_reset_sdma_is_supported` to check SDMA reset support
  at the SMU level.
- Implementation of `smu_v13_0_6_reset_sdma_is_supported` for the specific
  SMU version v13.0.6.
- Updated `smu_v13_0_6_reset_sdma` to use the new capability check before
  attempting to reset the SDMA engine.

Signed-off-by: Vitaly Prosyak 
Signed-off-by: Jesse Zhang 
---
 drivers/gpu/drm/amd/pm/amdgpu_dpm.c   | 23 +++
 drivers/gpu/drm/amd/pm/inc/amdgpu_dpm.h   |  1 +
 drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c | 17 ++
 drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h |  5 
 .../drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c  | 23 ++-
 5 files changed, 68 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/pm/amdgpu_dpm.c 
b/drivers/gpu/drm/amd/pm/amdgpu_dpm.c
index 6a9e26905edf..010f05a44287 100644
--- a/drivers/gpu/drm/amd/pm/amdgpu_dpm.c
+++ b/drivers/gpu/drm/amd/pm/amdgpu_dpm.c
@@ -719,6 +719,29 @@ int amdgpu_dpm_send_rma_reason(struct amdgpu_device *adev)
return ret;
 }
 
+/**
+ * amdgpu_dpm_reset_sdma_is_supported - Check if SDMA reset is supported
+ * @adev: amdgpu_device pointer
+ *
+ * This function checks if the SMU supports resetting the SDMA engine.
+ * It returns -EOPNOTSUPP if the hardware does not support software SMU or
+ * if the feature is not supported.
+ */
+int amdgpu_dpm_reset_sdma_is_supported(struct amdgpu_device *adev)
+{
+   struct smu_context *smu = adev->powerplay.pp_handle;
+   int ret;
+
+   if (!is_support_sw_smu(adev))
+   return -EOPNOTSUPP;
+
+   mutex_lock(&adev->pm.mutex);
+   ret = smu_reset_sdma_is_supported(smu);
+   mutex_unlock(&adev->pm.mutex);
+
+   return ret;
+}
+
 int amdgpu_dpm_reset_sdma(struct amdgpu_device *adev, uint32_t inst_mask)
 {
struct smu_context *smu = adev->powerplay.pp_handle;
diff --git a/drivers/gpu/drm/amd/pm/inc/amdgpu_dpm.h 
b/drivers/gpu/drm/amd/pm/inc/amdgpu_dpm.h
index 1f5ac7e0230d..353a10119dc5 100644
--- a/drivers/gpu/drm/amd/pm/inc/amdgpu_dpm.h
+++ b/drivers/gpu/drm/amd/pm/inc/amdgpu_dpm.h
@@ -603,5 +603,6 @@ int amdgpu_dpm_set_pm_policy(struct amdgpu_device *adev, 
int policy_type,
 ssize_t amdgpu_dpm_get_pm_policy_info(struct amdgpu_device *adev,
  enum pp_pm_policy p_type, char *buf);
 int amdgpu_dpm_reset_sdma(struct amdgpu_device *adev, uint32_t inst_mask);
+int amdgpu_dpm_reset_sdma_is_supported(struct amdgpu_device *adev);
 
 #endif
diff --git a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c 
b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
index 488360d2aaae..a61c80aed24c 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
@@ -3903,6 +3903,23 @@ int smu_send_rma_reason(struct smu_context *smu)
return ret;
 }
 
+/**
+ * smu_reset_sdma_is_supported - Check if SDMA reset is supported by SMU
+ * @smu: smu_context pointer
+ *
+ * This function checks if the SMU supports resetting the SDMA engine.
+ * It returns 0 if supported, -EOPNOTSUPP otherwise.
+ */
+int smu_reset_sdma_is_supported(struct smu_context *smu)
+{
+   int ret = 0;
+
+   if (smu->ppt_funcs && smu->ppt_funcs->reset_sdma_is_supported)
+   ret = smu->ppt_funcs->reset_sdma_is_supported(smu);
+
+   return ret;
+}
+
 int smu_reset_sdma(struct smu_context *smu, uint32_t inst_mask)
 {
int ret = 0;
diff --git a/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h 
b/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h
index 3630593bce61..090a2b3b81a0 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h
+++ b/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h
@@ -1376,6 +1376,10 @@ struct pptable_funcs {
 * @reset_sdma: message SMU to soft reset sdma instance.
 */
int (*reset_sdma)(struct smu_context *smu, uint32_t inst_mask);
+   /**
+* @reset_sdma_is_supported: Check if support resets the SDMA engine.
+*/
+   int (*reset_sdma_is_supported)(struct smu_context *smu);
 
/**
 * @get_ecc_table:  message SMU to get ECC INFO table.
@@ -1637,6 +1641,7 @@ int smu_send_hbm_bad_pages_num(struct smu_context *smu, 
uint32_t size);
 int smu_send_hbm_bad_channel_flag(struct smu_context *smu, uint32_t size);
 int smu_send_rma_reason(struct smu_context *smu);
 int smu_reset_sdma(struct smu_context *smu, uint32_t inst_mask);
+int smu_reset_sdma_is_supported(struct smu_context *smu);
 int smu_set_pm_policy(struct smu_context *smu, enum pp_pm_policy p_type,
  int level);
 ssize_t smu_get_pm_policy_info(struc

[PATCH 2/2] drm/amdgpu: Enable per-queue reset support

2025-02-13 Thread jesse.zhang

From: "jesse.zh...@amd.com" 

This patch updates the SDMA v4.4.2 software initialization to enable per-queue
reset support when the MEC firmware version is 0xb0 or higher and the PMFW
supports SDMA reset.

The following changes are included:
- Added a condition to check if the MEC firmware version is at least 0xb0 and if
  the PMFW supports SDMA reset using `amdgpu_dpm_reset_sdma_is_supported`.
- If both conditions are met, the `AMDGPU_RESET_TYPE_PER_QUEUE` flag is set in
  `adev->sdma.supported_reset`.

Suggested-by: Jonathan Kim 
Signed-off-by: Vitaly Prosyak 
Signed-off-by: Jesse Zhang 
---
 drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c 
b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
index b24a1ff5d743..e01d97b96655 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
@@ -1481,9 +1481,10 @@ static int sdma_v4_4_2_sw_init(struct amdgpu_ip_block 
*ip_block)
}
}
 
-   /* TODO: Add queue reset mask when FW fully supports it */
adev->sdma.supported_reset =
amdgpu_get_soft_full_reset_mask(&adev->sdma.instance[0].ring);
+   if (adev->gfx.mec_fw_version >= 0xb0 && 
amdgpu_dpm_reset_sdma_is_supported(adev))
+   adev->sdma.supported_reset |= AMDGPU_RESET_TYPE_PER_QUEUE;
 
if (amdgpu_sdma_ras_sw_init(adev)) {
dev_err(adev->dev, "fail to initialize sdma ras block\n");
-- 
2.25.1

[PATCH v7 2/9] drm/amdgpu/sdma: Refactor SDMA reset functionality and add callback support

2025-02-12 Thread jesse.zhang

From: "jesse.zh...@amd.com" 

This patch refactors the SDMA reset functionality in the `sdma_v4_4_2` driver
to improve modularity and support shared usage between AMDGPU and KFD. The
changes include:

1. **Refactored SDMA Reset Logic**:
   - Split the `sdma_v4_4_2_reset_queue` function into two separate functions:
 - `sdma_v4_4_2_stop_queue`: Stops the SDMA queue before reset.
 - `sdma_v4_4_2_restore_queue`: Restores the SDMA queue after reset.
   - These functions are now used as callbacks for the shared reset mechanism.

2. **Added Callback Support**:
   - Introduced a new structure `sdma_v4_4_2_reset_funcs` to hold the stop and
 restore callbacks.
   - Added `sdma_v4_4_2_set_reset_funcs` to register these callbacks with the
 shared reset mechanism using `amdgpu_set_on_reset_callbacks`.

3. **Fixed Reset Queue Function**:
   - Modified `sdma_v4_4_2_reset_queue` to use the shared 
`amdgpu_sdma_reset_queue`
 function, ensuring consistency across the driver.

This patch ensures that SDMA reset functionality is more modular, reusable, and
aligned with the shared reset mechanism between AMDGPU and KFD.

v2: Renamed sdma_v4_4_2_set_reset_funcs to sdma_v4_4_2_set_engine_reset_funcs.
Renamed sdma_v4_4_2_reset_funcs to sdma_v4_4_2_engine_reset_funcs.(Alex)

Suggested-by: Jiadong Zhu 
Suggested-by: Alex Deucher 
Signed-off-by: Jesse Zhang 
---
 drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c | 32 +---
 1 file changed, 28 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c 
b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
index 64c163dd708f..29a123be90b7 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
@@ -105,6 +105,7 @@ static void sdma_v4_4_2_set_buffer_funcs(struct 
amdgpu_device *adev);
 static void sdma_v4_4_2_set_vm_pte_funcs(struct amdgpu_device *adev);
 static void sdma_v4_4_2_set_irq_funcs(struct amdgpu_device *adev);
 static void sdma_v4_4_2_set_ras_funcs(struct amdgpu_device *adev);
+static void sdma_v4_4_2_set_engine_reset_funcs(struct amdgpu_device *adev);
 
 static u32 sdma_v4_4_2_get_reg_offset(struct amdgpu_device *adev,
u32 instance, u32 offset)
@@ -1330,6 +1331,7 @@ static int sdma_v4_4_2_early_init(struct amdgpu_ip_block 
*ip_block)
sdma_v4_4_2_set_vm_pte_funcs(adev);
sdma_v4_4_2_set_irq_funcs(adev);
sdma_v4_4_2_set_ras_funcs(adev);
+   sdma_v4_4_2_set_engine_reset_funcs(adev);
 
return 0;
 }
@@ -1605,8 +1607,14 @@ static int sdma_v4_4_2_soft_reset(struct amdgpu_ip_block 
*ip_block)
 static int sdma_v4_4_2_reset_queue(struct amdgpu_ring *ring, unsigned int vmid)
 {
struct amdgpu_device *adev = ring->adev;
-   int i, r;
+   u32 id = GET_INST(SDMA0, ring->me);
+   return amdgpu_sdma_reset_engine(adev, id);
+}
+
+static int sdma_v4_4_2_stop_queue(struct amdgpu_device *adev, uint32_t 
instance_id)
+{
u32 inst_mask;
+   struct amdgpu_ring *ring = &adev->sdma.instance[instance_id].ring;
 
if (amdgpu_sriov_vf(adev))
return -EINVAL;
@@ -1617,10 +1625,16 @@ static int sdma_v4_4_2_reset_queue(struct amdgpu_ring 
*ring, unsigned int vmid)
if (adev->sdma.has_page_queue)
sdma_v4_4_2_inst_page_stop(adev, inst_mask);
 
-   r = amdgpu_dpm_reset_sdma(adev, 1 << GET_INST(SDMA0, ring->me));
-   if (r)
-   return r;
+   return 0;
+}
 
+static int sdma_v4_4_2_restore_queue(struct amdgpu_device *adev, uint32_t 
instance_id)
+{
+   int i;
+   u32 inst_mask;
+   struct amdgpu_ring *ring = &adev->sdma.instance[instance_id].ring;
+
+   inst_mask = 1 << ring->me;
udelay(50);
 
for (i = 0; i < adev->usec_timeout; i++) {
@@ -1638,6 +1652,16 @@ static int sdma_v4_4_2_reset_queue(struct amdgpu_ring 
*ring, unsigned int vmid)
return sdma_v4_4_2_inst_start(adev, inst_mask, true);
 }
 
+static struct sdma_on_reset_funcs sdma_v4_4_2_engine_reset_funcs = {
+   .pre_reset = sdma_v4_4_2_stop_queue,
+   .post_reset = sdma_v4_4_2_restore_queue,
+};
+
+static void sdma_v4_4_2_set_engine_reset_funcs(struct amdgpu_device *adev)
+{
+   amdgpu_sdma_register_on_reset_callbacks(adev, 
&sdma_v4_4_2_engine_reset_funcs);
+}
+
 static int sdma_v4_4_2_set_trap_irq_state(struct amdgpu_device *adev,
struct amdgpu_irq_src *source,
unsigned type,
-- 
2.25.1

[PATCH v7 1/9] drm/amdgpu/kfd: Add shared SDMA reset functionality with callback support

2025-02-12 Thread jesse.zhang

From: "jesse.zh...@amd.com" 

This patch introduces shared SDMA reset functionality between AMDGPU and KFD.
The implementation includes the following key changes:

1. Added `amdgpu_sdma_reset_queue`:
   - Resets a specific SDMA queue by instance ID.
   - Invokes registered pre-reset and post-reset callbacks to allow KFD and 
AMDGPU
 to save/restore their state during the reset process.

2. Added `amdgpu_set_on_reset_callbacks`:
   - Allows KFD and AMDGPU to register callback functions for pre-reset and
 post-reset operations.
   - Callbacks are stored in a global linked list and invoked in the correct 
order
 during SDMA reset.

This patch ensures that both AMDGPU and KFD can handle SDMA reset events
gracefully, with proper state saving and restoration. It also provides a 
flexible
callback mechanism for future extensions.

v2: fix CamelCase and put the SDMA helper into amdgpu_sdma.c (Alex)

v3: rename the `amdgpu_register_on_reset_callbacks` function to
  `amdgpu_sdma_register_on_reset_callbacks`
move global reset_callback_list to struct amdgpu_sdma (Alex)

v4: Update the reset callback function description and
   rename the reset function to amdgpu_sdma_reset_engine (Alex)

Suggested-by: Alex Deucher 
Suggested-by: Jiadong Zhu 
Signed-off-by: Jesse Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c | 73 
 drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h | 11 
 drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c |  2 +-
 3 files changed, 85 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
index 174badca27e7..fe39198307ec 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
@@ -460,3 +460,76 @@ void amdgpu_sdma_sysfs_reset_mask_fini(struct 
amdgpu_device *adev)
device_remove_file(adev->dev, 
&dev_attr_sdma_reset_mask);
}
 }
+
+/**
+ * amdgpu_sdma_register_on_reset_callbacks - Register SDMA reset callbacks
+ * @funcs: Pointer to the callback structure containing pre_reset and 
post_reset functions
+ *
+ * This function allows KFD and AMDGPU to register their own callbacks for 
handling
+ * pre-reset and post-reset operations for engine reset. These are needed 
because engine
+ * reset will stop all queues on that engine.
+ */
+void amdgpu_sdma_register_on_reset_callbacks(struct amdgpu_device *adev, 
struct sdma_on_reset_funcs *funcs)
+{
+   if (!funcs)
+   return;
+
+   /* Initialize the list node in the callback structure */
+   INIT_LIST_HEAD(&funcs->list);
+
+   /* Add the callback structure to the global list */
+   list_add_tail(&funcs->list, &adev->sdma.reset_callback_list);
+}
+
+/**
+ * amdgpu_sdma_reset_engine - Reset a specific SDMA engine
+ * @adev: Pointer to the AMDGPU device
+ * @instance_id: ID of the SDMA engine instance to reset
+ *
+ * This function performs the following steps:
+ * 1. Calls all registered pre_reset callbacks to allow KFD and AMDGPU to save 
their state.
+ * 2. Resets the specified SDMA engine instance.
+ * 3. Calls all registered post_reset callbacks to allow KFD and AMDGPU to 
restore their state.
+ *
+ * Returns: 0 on success, or a negative error code on failure.
+ */
+int amdgpu_sdma_reset_engine(struct amdgpu_device *adev, uint32_t instance_id)
+{
+   struct sdma_on_reset_funcs *funcs;
+   int ret;
+
+   /* Invoke all registered pre_reset callbacks */
+   list_for_each_entry(funcs, &adev->sdma.reset_callback_list, list) {
+   if (funcs->pre_reset) {
+   ret = funcs->pre_reset(adev, instance_id);
+   if (ret) {
+   dev_err(adev->dev,
+   "beforeReset callback failed for instance %u: 
%d\n",
+   instance_id, ret);
+   return ret;
+   }
+   }
+   }
+
+   /* Perform the SDMA reset for the specified instance */
+   ret = amdgpu_dpm_reset_sdma(adev, 1 << instance_id);
+   if (ret) {
+   dev_err(adev->dev, "Failed to reset SDMA instance %u\n", 
instance_id);
+   return ret;
+   }
+
+   /* Invoke all registered post_reset callbacks */
+   list_for_each_entry(funcs, &adev->sdma.reset_callback_list, list) {
+   if (funcs->post_reset) {
+   ret = funcs->post_reset(adev, instance_id);
+   if (ret) {
+   dev_err(adev->dev,
+   "afterReset callback failed for instance %u: 
%d\n",
+   instance_id, ret);
+   return ret;
+   }
+   }
+   }
+
+   return 0;
+}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h
index 5f60736051d1..f91d75848557 100644

[PATCH V7 3/9] drm/amdgpu: Add common lock and reset caller parameter for SDMA reset synchronization

2025-02-12 Thread jesse.zhang

From: "jesse.zh...@amd.com" 

This commit introduces a caller parameter to the amdgpu_sdma_reset_instance 
function to differentiate
between reset requests originating from the KGD and KFD.
This change ensures proper synchronization between KGD and KFD during SDMA 
resets.

If the caller is KFD, the function now acquires and releases the scheduler lock 
(ring->sched.job_list_lock)
to protect the SDMA queue during the reset.

These changes prevent race conditions and ensure safe SDMA reset operations
when initiated by KFD, improving system stability and reliability.

V2: replace the ring_lock with the existing scheduler
locks for the queues (ring->sched) on the SDMA engine. (Alex)

v3: call drm_sched_wqueue_stop() rather than job_list_lock.
If a GPU ring reset was already initiated for one ring at 
amdgpu_job_timedout,
skip resetting that ring and call drm_sched_wqueue_stop()
for the other rings (Alex)

   replace the common lock (sdma_reset_lock) with the DQM lock
   to resolve reset races between the two driver sections during KFD
eviction. (Jon)

   Rename the caller to Reset_src and
   Change AMDGPU_RESET_SRC_SDMA_KGD/KFD to AMDGPU_RESET_SRC_SDMA_HWS/RING (Jon)
v4: restart the wqueue if the reset was successful,
or fall back to a full adapter reset. (Alex)

   move definition of reset source to enumeration AMDGPU_RESET_SRCS, and
   check reset src in amdgpu_sdma_reset_instance (Jon)

v5: Call amdgpu_amdkfd_suspend/resume at the start/end of reset function 
respectively under !SRC_HWS
conditions only (Jon)

Suggested-by: Alex Deucher 
Suggested-by: Jiadong Zhu 
Suggested-by: Jonathan Kim 
Signed-off-by: Jesse Zhang 
Reviewed-by: Alex Deucher 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h |  2 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c  | 65 ---
 drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h  |  6 +--
 drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c  |  8 +--
 4 files changed, 67 insertions(+), 14 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
index 4d9b9701139b..5b86e12ff9fe 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
@@ -43,6 +43,8 @@ enum AMDGPU_RESET_SRCS {
AMDGPU_RESET_SRC_MES,
AMDGPU_RESET_SRC_HWS,
AMDGPU_RESET_SRC_USER,
+   AMDGPU_RESET_SRC_SDMA_RING,
+   AMDGPU_RESET_SRC_SDMA_HWS,
 };
 
 struct amdgpu_reset_context {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
index fe39198307ec..808c7112ef10 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
@@ -25,6 +25,7 @@
 #include "amdgpu.h"
 #include "amdgpu_sdma.h"
 #include "amdgpu_ras.h"
+#include "amdgpu_reset.h"
 
 #define AMDGPU_CSA_SDMA_SIZE 64
 /* SDMA CSA reside in the 3rd page of CSA */
@@ -485,6 +486,7 @@ void amdgpu_sdma_register_on_reset_callbacks(struct 
amdgpu_device *adev, struct
  * amdgpu_sdma_reset_engine - Reset a specific SDMA engine
  * @adev: Pointer to the AMDGPU device
  * @instance_id: ID of the SDMA engine instance to reset
+ * @src: The source of reset function (KGD or KFD)
  *
  * This function performs the following steps:
  * 1. Calls all registered pre_reset callbacks to allow KFD and AMDGPU to save 
their state.
@@ -493,20 +495,49 @@ void amdgpu_sdma_register_on_reset_callbacks(struct 
amdgpu_device *adev, struct
  *
  * Returns: 0 on success, or a negative error code on failure.
  */
-int amdgpu_sdma_reset_engine(struct amdgpu_device *adev, uint32_t instance_id)
+int amdgpu_sdma_reset_engine(struct amdgpu_device *adev, uint32_t instance_id, 
int src)
 {
struct sdma_on_reset_funcs *funcs;
-   int ret;
+   int ret = 0;
+   struct amdgpu_sdma_instance *sdma_instance = 
&adev->sdma.instance[instance_id];;
+   struct amdgpu_ring *gfx_ring = &sdma_instance->ring;
+   struct amdgpu_ring *page_ring = &sdma_instance->page;
+   bool gfx_sched_stopped = false, page_sched_stopped = false;
+
+   /* Check if the reset source is valid for SDMA ring reset */
+   if (src != AMDGPU_RESET_SRC_SDMA_RING && src != AMDGPU_RESET_SRC_HWS)
+   return -EINVAL;
+
+   /* Suspend KFD if the reset source is not SDMA_HWS.
+* prevent the destruction of in-flight healthy user queue packets and
+* avoid race conditions between KFD and KGD during the reset process.
+*/
+   if (src != AMDGPU_RESET_SRC_SDMA_HWS)
+   amdgpu_amdkfd_suspend(adev, false);
+
+   /* Stop the scheduler's work queue for the GFX and page rings if they 
are running.
+   * This ensures that no new tasks are submitted to the queues while
+   * the reset is in progress.
+   */
+   if (!amdgpu_ring_sched_ready(gfx_ring)) {
+   drm_sched_wqueue_stop(&gfx_ring->sched);
+   gfx_sched_stopped = true;;
+   }
+
+   if (adev->sdma.has_page_queue && !amdgpu_rin

[PATCH v7 6/9] drm/amdgpu/sdma: Introduce is_guilty callbacks for sdma GFX and PAGE rings

2025-02-12 Thread jesse.zhang

From: "jesse.zh...@amd.com" 

This patch introduces the `is_guilty` callbacks for the GFX and PAGE rings.
These callbacks check if a ring is guilty of causing a timeout or error.

Suggested-by: Alex Deucher 
Signed-off-by: Jesse Zhang 
---
 drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c | 30 
 1 file changed, 30 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c 
b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
index 50a086264792..b6de4eaf6088 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
@@ -1606,6 +1606,34 @@ static int sdma_v4_4_2_soft_reset(struct amdgpu_ip_block 
*ip_block)
return 0;
 }
 
+static bool sdma_v4_4_2_is_queue_selected(struct amdgpu_device *adev, uint32_t 
instance_id, bool is_page_queue)
+{
+   uint32_t reg_offset = is_page_queue ? regSDMA_PAGE_CONTEXT_STATUS : 
regSDMA_GFX_CONTEXT_STATUS;
+   uint32_t context_status = RREG32(sdma_v4_4_2_get_reg_offset(adev, 
instance_id, reg_offset));
+
+   /* Check if the SELECTED bit is set */
+   return (context_status & SDMA_GFX_CONTEXT_STATUS__SELECTED_MASK) != 0;
+}
+
+static bool sdma_v4_4_2_ring_is_guilty(struct amdgpu_ring *ring)
+{
+   struct amdgpu_device *adev = ring->adev;
+   uint32_t instance_id = ring->me;
+
+   return sdma_v4_4_2_is_queue_selected(adev, instance_id, false);
+}
+
+static bool sdma_v4_4_2_page_ring_is_guilty(struct amdgpu_ring *ring)
+{
+   struct amdgpu_device *adev = ring->adev;
+   uint32_t instance_id = ring->me;
+
+   if (!adev->sdma.has_page_queue)
+   return false;
+
+   return sdma_v4_4_2_is_queue_selected(adev, instance_id, true);
+}
+
 static int sdma_v4_4_2_reset_queue(struct amdgpu_ring *ring, unsigned int vmid)
 {
struct amdgpu_device *adev = ring->adev;
@@ -2055,6 +2083,7 @@ static const struct amdgpu_ring_funcs 
sdma_v4_4_2_ring_funcs = {
.emit_reg_wait = sdma_v4_4_2_ring_emit_reg_wait,
.emit_reg_write_reg_wait = amdgpu_ring_emit_reg_write_reg_wait_helper,
.reset = sdma_v4_4_2_reset_queue,
+   .is_guilty = sdma_v4_4_2_ring_is_guilty,
 };
 
 static const struct amdgpu_ring_funcs sdma_v4_4_2_page_ring_funcs = {
@@ -2086,6 +2115,7 @@ static const struct amdgpu_ring_funcs 
sdma_v4_4_2_page_ring_funcs = {
.emit_wreg = sdma_v4_4_2_ring_emit_wreg,
.emit_reg_wait = sdma_v4_4_2_ring_emit_reg_wait,
.emit_reg_write_reg_wait = amdgpu_ring_emit_reg_write_reg_wait_helper,
+   .is_guilty = sdma_v4_4_2_page_ring_is_guilty,
 };
 
 static void sdma_v4_4_2_set_ring_funcs(struct amdgpu_device *adev)
-- 
2.25.1

[PATCH V7 5/9] drm/amdgpu: Update amdgpu_job_timedout to check if the ring is guilty

2025-02-12 Thread jesse.zhang

From: "jesse.zh...@amd.com" 

This patch updates the `amdgpu_job_timedout` function to check if
the ring is actually guilty of causing the timeout. If not, it
skips error handling and fence completion.

Suggested-by: Alex Deucher 
Signed-off-by: Jesse Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 10 ++
 1 file changed, 10 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
index 100f04475943..f94c876db72b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
@@ -101,6 +101,16 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct 
drm_sched_job *s_job)
/* Effectively the job is aborted as the device is gone */
return DRM_GPU_SCHED_STAT_ENODEV;
}
+   /* Check if the ring is actually guilty of causing the timeout.
+   * If not, skip error handling and fence completion.
+   */
+   if (amdgpu_gpu_recovery && ring->funcs->is_guilty) {
+   if (!ring->funcs->is_guilty(ring)) {
+   dev_err(adev->dev, "ring %s timeout, but not guilty\n",
+   s_job->sched->name);
+   goto exit;
+   }
+   }
 
/*
 * Do the coredump immediately after a job timeout to get a very
-- 
2.25.1

[PATCH V7 4/9] drm/amdgpu: Introduce cached_rptr and is_guilty callback in amdgpu_ring

2025-02-12 Thread jesse.zhang

From: "jesse.zh...@amd.com" 

This patch introduces the following changes:
- Add `cached_rptr` to the `amdgpu_ring` structure to store the read pointer 
before a reset.
- Add `is_guilty` callback to the `amdgpu_ring_funcs` structure to check if a 
ring is guilty of causing a timeout.

Suggested-by: Alex Deucher 
Signed-off-by: Jesse Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c | 2 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h | 3 +++
 2 files changed, 5 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
index a6e28fe3f8d6..20cd21df38ba 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
@@ -342,6 +342,8 @@ int amdgpu_ring_init(struct amdgpu_device *adev, struct 
amdgpu_ring *ring,
ring->buf_mask = (ring->ring_size / 4) - 1;
ring->ptr_mask = ring->funcs->support_64bit_ptrs ?
0x : ring->buf_mask;
+   /*  Initialize cached_rptr to 0 */
+   ring->cached_rptr = 0;
 
/* Allocate ring buffer */
if (ring->is_mes_queue) {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
index 04af26536f97..182aa535d395 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
@@ -237,6 +237,7 @@ struct amdgpu_ring_funcs {
void (*patch_de)(struct amdgpu_ring *ring, unsigned offset);
int (*reset)(struct amdgpu_ring *ring, unsigned int vmid);
void (*emit_cleaner_shader)(struct amdgpu_ring *ring);
+   bool (*is_guilty)(struct amdgpu_ring *ring);
 };
 
 struct amdgpu_ring {
@@ -306,6 +307,8 @@ struct amdgpu_ring {
 
boolis_sw_ring;
unsigned intentry_index;
+   /* store the cached rptr to restore after reset */
+   uint64_t cached_rptr;
 
 };
 
-- 
2.25.1

[PATCH V7 7/9] drm/amdgpu: Improve SDMA reset logic with guilty queue tracking

2025-02-12 Thread jesse.zhang

From: "jesse.zh...@amd.com" 

This patch includes the remaining improvements to the SDMA reset logic:
- Added `gfx_guilty` and `page_guilty` flags to track guilty queues.
- Updated the reset and resume functions to handle the guilty state.
- Cached the `rptr` before reset.

v2:
   1.replace the caller with a guilty bool.
   If the queue is the guilty one, set the rptr and wptr  to the saved wptr 
value,
   else, set the rptr and wptr to the saved rptr value. (Alex)
   2. cache the rptr before the reset. (Alex)

v3: Keeping an intermediate variable like u64 rwptr simplifies restoring 
rptr/wptr. (Lijo)

Suggested-by: Alex Deucher 
Suggested-by: Jiadong Zhu 
Signed-off-by: Jesse Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c |  6 ++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h |  3 ++
 drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c | 66 +++-
 3 files changed, 61 insertions(+), 14 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
index 808c7112ef10..b9f0c78a6d77 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
@@ -475,6 +475,10 @@ void amdgpu_sdma_register_on_reset_callbacks(struct 
amdgpu_device *adev, struct
if (!funcs)
return;
 
+   /* Ensure the reset_callback_list is initialized */
+   if (!adev->sdma.reset_callback_list.next) {
+   INIT_LIST_HEAD(&adev->sdma.reset_callback_list);
+   }
/* Initialize the list node in the callback structure */
INIT_LIST_HEAD(&funcs->list);
 
@@ -521,7 +525,7 @@ int amdgpu_sdma_reset_engine(struct amdgpu_device *adev, 
uint32_t instance_id, i
*/
if (!amdgpu_ring_sched_ready(gfx_ring)) {
drm_sched_wqueue_stop(&gfx_ring->sched);
-   gfx_sched_stopped = true;;
+   gfx_sched_stopped = true;
}
 
if (adev->sdma.has_page_queue && !amdgpu_ring_sched_ready(page_ring)) {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h
index 2ef2da772254..7effc2673466 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h
@@ -126,6 +126,9 @@ struct amdgpu_sdma {
uint32_t*ip_dump;
uint32_tsupported_reset;
struct list_headreset_callback_list;
+   /* track guilty state of GFX and PAGE queues */
+   bool gfx_guilty;
+   bool page_guilty;
 };
 
 /*
diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c 
b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
index b6de4eaf6088..350506b65cb4 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
@@ -671,11 +671,12 @@ static uint32_t sdma_v4_4_2_rb_cntl(struct amdgpu_ring 
*ring, uint32_t rb_cntl)
  * @adev: amdgpu_device pointer
  * @i: instance to resume
  * @restore: used to restore wptr when restart
+ * @guilty: boolean indicating whether this queue is the guilty one (caused 
the timeout/error)
  *
  * Set up the gfx DMA ring buffers and enable them.
  * Returns 0 for success, error for failure.
  */
-static void sdma_v4_4_2_gfx_resume(struct amdgpu_device *adev, unsigned int i, 
bool restore)
+static void sdma_v4_4_2_gfx_resume(struct amdgpu_device *adev, unsigned int i, 
bool restore, bool guilty)
 {
struct amdgpu_ring *ring = &adev->sdma.instance[i].ring;
u32 rb_cntl, ib_cntl, wptr_poll_cntl;
@@ -683,6 +684,7 @@ static void sdma_v4_4_2_gfx_resume(struct amdgpu_device 
*adev, unsigned int i, b
u32 doorbell;
u32 doorbell_offset;
u64 wptr_gpu_addr;
+   u64 rwptr;
 
wb_offset = (ring->rptr_offs * 4);
 
@@ -708,12 +710,20 @@ static void sdma_v4_4_2_gfx_resume(struct amdgpu_device 
*adev, unsigned int i, b
/* before programing wptr to a less value, need set minor_ptr_update 
first */
WREG32_SDMA(i, regSDMA_GFX_MINOR_PTR_UPDATE, 1);
 
+   /* For the guilty queue, set RPTR to the current wptr to skip bad 
commands,
+* It is not a guilty queue, restore cache_rptr and continue execution.
+ */
+   if (guilty)
+   rwptr = ring->wptr;
+   else
+   rwptr = ring->cached_rptr;
+
/* Initialize the ring buffer's read and write pointers */
if (restore) {
-   WREG32_SDMA(i, regSDMA_GFX_RB_RPTR, lower_32_bits(ring->wptr << 
2));
-   WREG32_SDMA(i, regSDMA_GFX_RB_RPTR_HI, upper_32_bits(ring->wptr 
<< 2));
-   WREG32_SDMA(i, regSDMA_GFX_RB_WPTR, lower_32_bits(ring->wptr << 
2));
-   WREG32_SDMA(i, regSDMA_GFX_RB_WPTR_HI, upper_32_bits(ring->wptr 
<< 2));
+   WREG32_SDMA(i, regSDMA_GFX_RB_RPTR, lower_32_bits(rwptr << 2));
+   WREG32_SDMA(i, regSDMA_GFX_RB_RPTR_HI, upper_32_bits(rwptr << 
2));
+   WREG32_SDMA(i, regSDMA_GFX_RB_WPTR, lower_32_bits(rwptr << 2));
+   WREG32_SDMA(i, regSDMA_GF

[PATCH v7 9/9] drm/amdgpu: Update SDMA scheduler mask handling to include page queue

2025-02-12 Thread jesse.zhang

From: "jesse.zh...@amd.com" 

This patch updates the SDMA scheduler mask handling to include the page queue
if it exists. The scheduler mask is calculated based on the number of SDMA
instances and the presence of the page queue. The mask is updated to reflect
the state of both the SDMA gfx ring and the page queue.

Changes:
- Add handling for the SDMA page queue in `amdgpu_debugfs_sdma_sched_mask_set`.
- Update scheduler mask calculations to include the page queue.
- Modify `amdgpu_debugfs_sdma_sched_mask_get` to return the correct mask value.

This change is necessary to verify multiple queues (SDMA gfx queue + page queue)
and ensure proper scheduling and state management for SDMA instances.

Signed-off-by: Jesse Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c | 55 +---
 1 file changed, 48 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
index b9f0c78a6d77..8de214a8ba6d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
@@ -356,23 +356,44 @@ int amdgpu_sdma_ras_sw_init(struct amdgpu_device *adev)
 static int amdgpu_debugfs_sdma_sched_mask_set(void *data, u64 val)
 {
struct amdgpu_device *adev = (struct amdgpu_device *)data;
-   u32 i;
+   u32 i, num_ring;
u64 mask = 0;
-   struct amdgpu_ring *ring;
+   struct amdgpu_ring *ring, *page = NULL;
 
if (!adev)
return -ENODEV;
 
-   mask = BIT_ULL(adev->sdma.num_instances) - 1;
+   /* Determine the number of rings per SDMA instance
+* (1 for sdma gfx ring, 2 if page queue exists)
+*/
+   if (adev->sdma.has_page_queue)
+   num_ring = 2;
+   else
+   num_ring = 1;
+
+   /* Calculate the maximum possible mask value
+* based on the number of SDMA instances and rings
+   */
+   mask = BIT_ULL(adev->sdma.num_instances * num_ring) - 1;
+
if ((val & mask) == 0)
return -EINVAL;
 
for (i = 0; i < adev->sdma.num_instances; ++i) {
ring = &adev->sdma.instance[i].ring;
-   if (val & BIT_ULL(i))
+   if (adev->sdma.has_page_queue)
+   page = &adev->sdma.instance[i].page;
+   if (val & BIT_ULL(i * num_ring))
ring->sched.ready = true;
else
ring->sched.ready = false;
+
+   if (page) {
+   if (val & BIT_ULL(i * num_ring + 1))
+   page->sched.ready = true;
+   else
+   page->sched.ready = false;
+   }
}
/* publish sched.ready flag update effective immediately across smp */
smp_rmb();
@@ -382,16 +403,36 @@ static int amdgpu_debugfs_sdma_sched_mask_set(void *data, 
u64 val)
 static int amdgpu_debugfs_sdma_sched_mask_get(void *data, u64 *val)
 {
struct amdgpu_device *adev = (struct amdgpu_device *)data;
-   u32 i;
+   u32 i, num_ring;
u64 mask = 0;
-   struct amdgpu_ring *ring;
+   struct amdgpu_ring *ring, *page = NULL;
 
if (!adev)
return -ENODEV;
+
+   /* Determine the number of rings per SDMA instance
+* (1 for sdma gfx ring, 2 if page queue exists)
+*/
+   if (adev->sdma.has_page_queue)
+   num_ring = 2;
+   else
+   num_ring = 1;
+
for (i = 0; i < adev->sdma.num_instances; ++i) {
ring = &adev->sdma.instance[i].ring;
+   if (adev->sdma.has_page_queue)
+   page = &adev->sdma.instance[i].page;
+
if (ring->sched.ready)
-   mask |= 1 << i;
+   mask |= 1 << (i * num_ring);
+   else
+   mask &= ~(1 << (i * num_ring));
+
+   if (page && page->sched.ready) {
+   mask |= 1 << (i * num_ring + 1);
+   } else {
+   mask &= ~(1 << (i * num_ring + 1));
+   }
}
 
*val = mask;
-- 
2.25.1

[PATCH V7 8/9] drm/amdgpu: Add reset function pointer for SDMA v4.4.2 page ring

2025-02-12 Thread jesse.zhang

From: "jesse.zh...@amd.com" 

This patch adds a reset function pointer to the SDMA v4.4.2 page ring
functionality. The new function pointer `reset` is set to
`sdma_v4_4_2_reset_queue`, which is responsible for resetting the SDMA queue.

Changes:
- Add `reset` function pointer to `sdma_v4_4_2_page_ring_funcs`.

Signed-off-by: Jesse Zhang 
---
 drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c 
b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
index 350506b65cb4..b24a1ff5d743 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
@@ -2155,6 +2155,7 @@ static const struct amdgpu_ring_funcs 
sdma_v4_4_2_page_ring_funcs = {
.emit_wreg = sdma_v4_4_2_ring_emit_wreg,
.emit_reg_wait = sdma_v4_4_2_ring_emit_reg_wait,
.emit_reg_write_reg_wait = amdgpu_ring_emit_reg_write_reg_wait_helper,
+   .reset = sdma_v4_4_2_reset_queue,
.is_guilty = sdma_v4_4_2_page_ring_is_guilty,
 };
 
-- 
2.25.1

[PATCH 3/3 V8] drm/amdgpu/sdma_v4_4_2: update VM flush implementation for SDMA

2025-03-18 Thread jesse.zhang

From: "jesse.zh...@amd.com" 

This commit updates the VM flush implementation for the SDMA engine.

- Added a new function `sdma_v4_4_2_get_invalidate_req` to construct the 
VM_INVALIDATE_ENG0_REQ
  register value for the specified VMID and flush type. This function ensures 
that all relevant
  page table cache levels (L1 PTEs, L2 PTEs, and L2 PDEs) are invalidated.

- Modified the `sdma_v4_4_2_ring_emit_vm_flush` function to use the new 
`sdma_v4_4_2_get_invalidate_req`
  function. The updated function emits the necessary register writes and waits 
to perform a VM flush
  for the specified VMID. It updates the PTB address registers and issues a VM 
invalidation request
  using the specified VM invalidation engine.

- Included the necessary header file `gc/gc_9_0_sh_mask.h` to provide access to 
the required register
  definitions.

v2: vm flush by the vm invalidation packet (Lijo)
v3: code style and define the macro for the vm invalidation packet (Christian)
v4: Format definition sdma vm invalidate packet (Lijo)

Suggested-by: Lijo Lazar 
Signed-off-by: Jesse Zhang 
---
 drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c  | 77 +++
 .../gpu/drm/amd/amdgpu/vega10_sdma_pkt_open.h | 54 +
 2 files changed, 117 insertions(+), 14 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c 
b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
index fd34dc138081..06ce0c98ef5d 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
@@ -31,6 +31,7 @@
 #include "amdgpu_ucode.h"
 #include "amdgpu_trace.h"
 #include "amdgpu_reset.h"
+#include "gc/gc_9_0_sh_mask.h"
 
 #include "sdma/sdma_4_4_2_offset.h"
 #include "sdma/sdma_4_4_2_sh_mask.h"
@@ -1292,21 +1293,71 @@ static void sdma_v4_4_2_ring_emit_pipeline_sync(struct 
amdgpu_ring *ring)
   seq, 0x, 4);
 }
 
-
-/**
- * sdma_v4_4_2_ring_emit_vm_flush - vm flush using sDMA
+/*
+ * sdma_v4_4_2_get_invalidate_req - Construct the VM_INVALIDATE_ENG0_REQ 
register value
+ * @vmid: The VMID to invalidate
+ * @flush_type: The type of flush (0 = legacy, 1 = lightweight, 2 = 
heavyweight)
  *
- * @ring: amdgpu_ring pointer
- * @vmid: vmid number to use
- * @pd_addr: address
+ * This function constructs the VM_INVALIDATE_ENG0_REQ register value for the 
specified VMID
+ * and flush type. It ensures that all relevant page table cache levels (L1 
PTEs, L2 PTEs, and
+ * L2 PDEs) are invalidated.
+ */
+static uint32_t sdma_v4_4_2_get_invalidate_req(unsigned int vmid,
+   uint32_t flush_type)
+{
+   u32 req = 0;
+
+   req = REG_SET_FIELD(req, VM_INVALIDATE_ENG0_REQ,
+   PER_VMID_INVALIDATE_REQ, 1 << vmid);
+   req = REG_SET_FIELD(req, VM_INVALIDATE_ENG0_REQ, FLUSH_TYPE, 
flush_type);
+   req = REG_SET_FIELD(req, VM_INVALIDATE_ENG0_REQ, INVALIDATE_L2_PTES, 1);
+   req = REG_SET_FIELD(req, VM_INVALIDATE_ENG0_REQ, INVALIDATE_L2_PDE0, 1);
+   req = REG_SET_FIELD(req, VM_INVALIDATE_ENG0_REQ, INVALIDATE_L2_PDE1, 1);
+   req = REG_SET_FIELD(req, VM_INVALIDATE_ENG0_REQ, INVALIDATE_L2_PDE2, 1);
+   req = REG_SET_FIELD(req, VM_INVALIDATE_ENG0_REQ, INVALIDATE_L1_PTES, 1);
+   req = REG_SET_FIELD(req, VM_INVALIDATE_ENG0_REQ,
+   CLEAR_PROTECTION_FAULT_STATUS_ADDR, 0);
+
+   return req;
+}
+
+/*
+ * sdma_v4_4_2_ring_emit_vm_flush - Emit VM flush commands for SDMA
+ * @ring: The SDMA ring
+ * @vmid: The VMID to flush
+ * @pd_addr: The page directory address
  *
- * Update the page table base and flush the VM TLB
- * using sDMA.
+ * This function emits the necessary register writes and waits to perform a VM 
flush for the
+ * specified VMID. It updates the PTB address registers and issues a VM 
invalidation request
+ * using the specified VM invalidation engine.
  */
 static void sdma_v4_4_2_ring_emit_vm_flush(struct amdgpu_ring *ring,
-unsigned vmid, uint64_t pd_addr)
+   unsigned int vmid, uint64_t pd_addr)
 {
-   amdgpu_gmc_emit_flush_gpu_tlb(ring, vmid, pd_addr);
+   struct amdgpu_device *adev = ring->adev;
+   uint32_t req = sdma_v4_4_2_get_invalidate_req(vmid, 0);
+   unsigned int eng = ring->vm_inv_eng;
+   struct amdgpu_vmhub *hub = &adev->vmhub[ring->vm_hub];
+
+   amdgpu_ring_emit_wreg(ring, hub->ctx0_ptb_addr_lo32 +
+  (hub->ctx_addr_distance * vmid),
+  lower_32_bits(pd_addr));
+
+amdgpu_ring_emit_wreg(ring, hub->ctx0_ptb_addr_hi32 +
+  (hub->ctx_addr_distance * vmid),
+  upper_32_bits(pd_addr));
+   /*
+* Construct and emit the VM invalidation packet
+*/
+   amdgpu_ring_write(ring,
+   SDMA_PKT_VM_INVALIDATE_HEADER_OP(SDMA_OP_VM_INVALIDATE) |
+   SDMA_PKT_VM_INVALIDATE_HEADER_SUB_OP(SDMA_SUBOP_VM_INVALI

[PATCH 1/3 v8] drm/amd/amdgpu: Increase max rings to enable SDMA page ring

2025-03-18 Thread jesse.zhang

From: "jesse.zh...@amd.com" 

Increase the maximum number of rings supported by the AMDGPU driver from 133 to 
149.
This change is necessary to enable support for the SDMA page ring.

Signed-off-by: Jesse Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
index b4fd1e17205e..bb2b66385223 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
@@ -37,7 +37,7 @@ struct amdgpu_job;
 struct amdgpu_vm;
 
 /* max number of rings */
-#define AMDGPU_MAX_RINGS   133
+#define AMDGPU_MAX_RINGS   149
 #define AMDGPU_MAX_HWIP_RINGS  64
 #define AMDGPU_MAX_GFX_RINGS   2
 #define AMDGPU_MAX_SW_GFX_RINGS 2
-- 
2.25.1

[PATCH 2/3 V8] drm/amdgpu: Optimize VM invalidation engine allocation and synchronize GPU TLB flush

2025-03-18 Thread jesse.zhang

From: "jesse.zh...@amd.com" 

- Modify the VM invalidation engine allocation logic to handle SDMA page rings.
  SDMA page rings now share the VM invalidation engine with SDMA gfx rings 
instead of
  allocating a separate engine. This change ensures efficient resource 
management and
  avoids the issue of insufficient VM invalidation engines.

- Add synchronization for GPU TLB flush operations in gmc_v9_0.c.
  Use spin_lock and spin_unlock to ensure thread safety and prevent race 
conditions
  during TLB flush operations. This improves the stability and reliability of 
the driver,
  especially in multi-threaded environments.

 v2: replace the sdma ring check with a function `amdgpu_sdma_is_page_queue`
 to check if a ring is an SDMA page queue.(Lijo)

 v3: Add GC version check, only enabled on GC9.4.3/9.4.4/9.5.0
 v4: Fix code style and add more detailed description (Christian)
 v5: Remove dependency on vm_inv_eng loop order, explicitly lookup shared 
inv_eng(Christian/Lijo)
 v6: Added search shared ring function amdgpu_sdma_get_shared_ring (Lijo)

Suggested-by: Lijo Lazar 
Signed-off-by: Jesse Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c  | 19 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c | 33 +++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h |  3 +++
 3 files changed, 54 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
index 4eefa17fa39b..26a90576792c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
@@ -573,6 +573,7 @@ int amdgpu_gmc_allocate_vm_inv_eng(struct amdgpu_device 
*adev)
unsigned vm_inv_engs[AMDGPU_MAX_VMHUBS] = {0};
unsigned i;
unsigned vmhub, inv_eng;
+   struct amdgpu_ring *shared_ring;
 
/* init the vm inv eng for all vmhubs */
for_each_set_bit(i, adev->vmhubs_mask, AMDGPU_MAX_VMHUBS) {
@@ -602,6 +603,24 @@ int amdgpu_gmc_allocate_vm_inv_eng(struct amdgpu_device 
*adev)
return -EINVAL;
}
 
+   /* SDMA has a special packet which allows it to use the same
+* invalidation engine for all the rings in one instance.
+* Therefore, we do not allocate a separate VM invalidation 
engine
+* for SDMA page rings. Instead, they share the VM invalidation
+* engine with the SDMA gfx ring. This change ensures efficient
+* resource management and avoids the issue of insufficient VM
+* invalidation engines.
+*/
+if (amdgpu_sdma_is_shared_inv_eng(adev, ring)) {
+   shared_ring = amdgpu_sdma_get_shared_ring(adev, ring);
+   if (shared_ring) {
+   ring->vm_inv_eng = shared_ring->vm_inv_eng;
+   dev_info(adev->dev, "ring %s shares VM 
invalidation engine %u with ring %s on hub %u\n",
+   ring->name, ring->vm_inv_eng, 
shared_ring->name, ring->vm_hub);
+   continue;
+   }
+   }
+
ring->vm_inv_eng = inv_eng - 1;
vm_inv_engs[vmhub] &= ~(1 << ring->vm_inv_eng);
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
index 39669f8788a7..6287159dab62 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
@@ -504,6 +504,37 @@ void amdgpu_sdma_sysfs_reset_mask_fini(struct 
amdgpu_device *adev)
}
 }
 
+struct amdgpu_ring *amdgpu_sdma_get_shared_ring(struct amdgpu_device *adev, 
struct amdgpu_ring *ring)
+{
+   if (adev->sdma.has_page_queue && ring == 
&adev->sdma.instance[ring->me].page)
+   return &adev->sdma.instance[ring->me].ring;
+   else
+   return NULL;
+}
+
+/**
+* amdgpu_sdma_is_shared_inv_eng - Check if a ring is an SDMA ring that shares 
a VM invalidation engine
+* @adev: Pointer to the AMDGPU device structure
+* @ring: Pointer to the ring structure to check
+*
+* This function checks if the given ring is an SDMA ring that shares a VM 
invalidation engine.
+* It returns true if the ring is such an SDMA ring, false otherwise.
+*/
+bool amdgpu_sdma_is_shared_inv_eng(struct amdgpu_device *adev, struct 
amdgpu_ring *ring)
+{
+   int i = ring->me;
+
+   if (!adev->sdma.has_page_queue || i >= adev->sdma.num_instances)
+   return false;
+
+   if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) ||
+   amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4) ||
+   amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 5, 0))
+   return (ring == &adev->sdma.instance[i].page);
+   else
+   return false;
+}
+
 /**
  * amdgpu_sdma_register_on_reset_callbacks - Register SDMA reset callbacks
  * @funcs: Pointer to

[PATCH 6/7 V2] drm/amd/amdgpu: Refactor SDMA v5.2 reset logic into stop_queue and restore_queue functions

2025-04-05 Thread jesse.zhang

From: "jesse.zh...@amd.com" 

This patch refactors the SDMA v5.2 reset logic by splitting the 
`sdma_v5_2_reset_queue` function into two separate functions: 
`sdma_v5_2_stop_queue` and `sdma_v5_2_restore_queue`.
This change aligns with the new SDMA reset mechanism, where the reset process 
is divided into stopping the queue, performing the reset, and restoring the 
queue.

1. **Split `sdma_v5_2_reset_queue`**:
- Extracted the queue stopping logic into `sdma_v5_2_stop_queue`.
- Extracted the queue restoration logic into `sdma_v5_2_restore_queue`.
- The soft reset step is now handled by the caller 
(`amdgpu_sdma_reset_engine`).

2. **Update Ring Functions**:
- Added `stop_queue` and `start_queue` to the `sdma_v5_2_ring_funcs` 
structure to support the new reset mechanism.

v2: remove the suspend_user_queues param when calling amdgpu_sdma_reset_engine()

Signed-off-by: Jesse Zhang 
---
 drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c | 38 ++
 1 file changed, 21 insertions(+), 17 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c 
b/drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c
index 96b02c3e4993..67b7d84c15dd 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c
@@ -1439,18 +1439,22 @@ static int sdma_v5_2_wait_for_idle(struct 
amdgpu_ip_block *ip_block)
 static int sdma_v5_2_reset_queue(struct amdgpu_ring *ring, unsigned int vmid)
 {
struct amdgpu_device *adev = ring->adev;
-   int j, r;
-   u32 f32_cntl, freeze, cntl, preempt, soft_reset, stat1_reg;
-   u32 inst_id;
+   u32 inst_id = ring->me;
+
+   return amdgpu_sdma_reset_engine(adev, inst_id);
+}
+
+static int sdma_v5_2_stop_queue(struct amdgpu_device *adev, uint32_t inst_id)
+{
+   int j, r = 0;
+   u32 f32_cntl, freeze, cntl, preempt, stat1_reg;
 
if (amdgpu_sriov_vf(adev))
return -EINVAL;
 
-   inst_id = ring->me;
amdgpu_gfx_rlc_enter_safe_mode(adev, 0);
-
/* stop queue */
-   sdma_v5_2_gfx_stop(adev, 1 << ring->me);
+   sdma_v5_2_gfx_stop(adev, 1 << inst_id);
 
/*engine stop SDMA1_F32_CNTL.HALT to 1 and SDMAx_FREEZE freeze bit to 1 
*/
freeze = RREG32(sdma_v5_2_get_reg_offset(adev, inst_id, 
mmSDMA0_FREEZE));
@@ -1488,18 +1492,17 @@ static int sdma_v5_2_reset_queue(struct amdgpu_ring 
*ring, unsigned int vmid)
preempt = REG_SET_FIELD(preempt, SDMA0_GFX_PREEMPT, IB_PREEMPT, 0);
WREG32(sdma_v5_2_get_reg_offset(adev, inst_id, mmSDMA0_GFX_PREEMPT), 
preempt);
 
-   soft_reset = RREG32_SOC15(GC, 0, mmGRBM_SOFT_RESET);
-   soft_reset |= 1 << GRBM_SOFT_RESET__SOFT_RESET_SDMA0__SHIFT << inst_id;
-
-
-   WREG32_SOC15(GC, 0, mmGRBM_SOFT_RESET, soft_reset);
-
-   udelay(50);
-
-   soft_reset &= ~(1 << GRBM_SOFT_RESET__SOFT_RESET_SDMA0__SHIFT << 
inst_id);
+err0:
+   amdgpu_gfx_rlc_exit_safe_mode(adev, 0);
+   return r;
+}
 
-   WREG32_SOC15(GC, 0, mmGRBM_SOFT_RESET, soft_reset);
+static int sdma_v5_2_restore_queue(struct amdgpu_device *adev, uint32_t 
inst_id)
+{
+   u32 freeze;
+   int r;
 
+   amdgpu_gfx_rlc_enter_safe_mode(adev, 0);
/* unfreeze and unhalt */
freeze = RREG32(sdma_v5_2_get_reg_offset(adev, inst_id, 
mmSDMA0_FREEZE));
freeze = REG_SET_FIELD(freeze, SDMA0_FREEZE, FREEZE, 0);
@@ -1507,7 +1510,6 @@ static int sdma_v5_2_reset_queue(struct amdgpu_ring 
*ring, unsigned int vmid)
 
r = sdma_v5_2_gfx_resume_instance(adev, inst_id, true);
 
-err0:
amdgpu_gfx_rlc_exit_safe_mode(adev, 0);
return r;
 }
@@ -1947,6 +1949,8 @@ static const struct amdgpu_ring_funcs 
sdma_v5_2_ring_funcs = {
.init_cond_exec = sdma_v5_2_ring_init_cond_exec,
.preempt_ib = sdma_v5_2_ring_preempt_ib,
.reset = sdma_v5_2_reset_queue,
+   .stop_queue = sdma_v5_2_stop_queue,
+   .start_queue = sdma_v5_2_restore_queue,
 };
 
 static void sdma_v5_2_set_ring_funcs(struct amdgpu_device *adev)
-- 
2.25.1

[PATCH 4/7 V2] drm/amd/amdgpu: Refactor SDMA v5.0 reset logic into stop_queue and restore_queue functions

2025-04-01 Thread jesse.zhang

From: "jesse.zh...@amd.com" 

This patch refactors the SDMA v5.0 reset logic by splitting the 
`sdma_v5_0_reset_queue` function into two separate functions: 
`sdma_v5_0_stop_queue` and `sdma_v5_0_restore_queue`.
This change aligns with the new SDMA reset mechanism, where the reset process 
is divided into stopping the queue, performing the reset, and restoring the 
queue.

1. **Split `sdma_v5_0_reset_queue`**:
   - Extracted the queue stopping logic into `sdma_v5_0_stop_queue`.
   - Extracted the queue restoration logic into `sdma_v5_0_restore_queue`.
   - The soft reset step is now handled by the caller 
(`amdgpu_sdma_reset_engine`).

2. **Update Ring Functions**:
 - Added `stop_queue` and `start_queue` to the `sdma_v5_0_ring_funcs` 
structure to support the new reset mechanism.

v2: remove the suspend_user_queues param when calling amdgpu_sdma_reset_engine()

Signed-off-by: Jesse Zhang 
---
 drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c | 37 +++---
 1 file changed, 22 insertions(+), 15 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c 
b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
index 9501652f903d..df77bf2639cf 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
@@ -1508,17 +1508,23 @@ static int sdma_v5_0_soft_reset(struct amdgpu_ip_block 
*ip_block)
 static int sdma_v5_0_reset_queue(struct amdgpu_ring *ring, unsigned int vmid)
 {
struct amdgpu_device *adev = ring->adev;
-   int j, r;
-   u32 f32_cntl, freeze, cntl, preempt, soft_reset, stat1_reg;
-   u32 inst_id;
+   u32 inst_id = ring->me;
+
+   return amdgpu_sdma_reset_engine(adev, inst_id);
+}
+
+static int sdma_v5_0_stop_queue(struct amdgpu_device *adev, uint32_t inst_id)
+{
+   int j, r = 0;
+   u32 f32_cntl, freeze, cntl, preempt, stat1_reg;
 
if (amdgpu_sriov_vf(adev))
return -EINVAL;
-   inst_id = ring->me;
+
amdgpu_gfx_rlc_enter_safe_mode(adev, 0);
 
/* stop queue */
-   sdma_v5_0_gfx_stop(adev, 1 << ring->me);
+   sdma_v5_0_gfx_stop(adev, inst_id);
 
/* engine stop SDMA1_F32_CNTL.HALT to 1 and SDMAx_FREEZE freeze bit to 
1 */
freeze = RREG32(sdma_v5_0_get_reg_offset(adev, inst_id, 
mmSDMA0_FREEZE));
@@ -1554,17 +1560,17 @@ static int sdma_v5_0_reset_queue(struct amdgpu_ring 
*ring, unsigned int vmid)
preempt = RREG32(sdma_v5_0_get_reg_offset(adev, inst_id, 
mmSDMA0_GFX_PREEMPT));
preempt = REG_SET_FIELD(preempt, SDMA0_GFX_PREEMPT, IB_PREEMPT, 0);
WREG32(sdma_v5_0_get_reg_offset(adev, inst_id, mmSDMA0_GFX_PREEMPT), 
preempt);
+err0:
+   amdgpu_gfx_rlc_exit_safe_mode(adev, 0);
+   return r;
+}
 
-   soft_reset = RREG32_SOC15(GC, 0, mmGRBM_SOFT_RESET);
-   soft_reset |= 1 << GRBM_SOFT_RESET__SOFT_RESET_SDMA0__SHIFT << inst_id;
-
-   WREG32_SOC15(GC, 0, mmGRBM_SOFT_RESET, soft_reset);
-
-   udelay(50);
-
-   soft_reset &= ~(1 << GRBM_SOFT_RESET__SOFT_RESET_SDMA0__SHIFT << 
inst_id);
-   WREG32_SOC15(GC, 0, mmGRBM_SOFT_RESET, soft_reset);
+static int sdma_v5_0_restore_queue(struct amdgpu_device *adev, uint32_t 
inst_id)
+{
+   int r;
+   u32 freeze;
 
+   amdgpu_gfx_rlc_enter_safe_mode(adev, 0);
/* unfreeze*/
freeze = RREG32(sdma_v5_0_get_reg_offset(adev, inst_id, 
mmSDMA0_FREEZE));
freeze = REG_SET_FIELD(freeze, SDMA0_FREEZE, FREEZE, 0);
@@ -1572,7 +1578,6 @@ static int sdma_v5_0_reset_queue(struct amdgpu_ring 
*ring, unsigned int vmid)
 
r = sdma_v5_0_gfx_resume_instance(adev, inst_id, true);
 
-err0:
amdgpu_gfx_rlc_exit_safe_mode(adev, 0);
return r;
 }
@@ -1919,6 +1924,8 @@ static const struct amdgpu_ring_funcs 
sdma_v5_0_ring_funcs = {
.init_cond_exec = sdma_v5_0_ring_init_cond_exec,
.preempt_ib = sdma_v5_0_ring_preempt_ib,
.reset = sdma_v5_0_reset_queue,
+   .stop_queue = sdma_v5_0_stop_queue,
+   .start_queue = sdma_v5_0_restore_queue,
 };
 
 static void sdma_v5_0_set_ring_funcs(struct amdgpu_device *adev)
-- 
2.25.1

[PATCH 7/7 V2] drm/amd/amdgpu: Remove deprecated SDMA reset callback mechanism

2025-04-01 Thread jesse.zhang

From: "jesse.zh...@amd.com" 

This patch removes the deprecated SDMA reset callback mechanism, which allowed
KFD and AMDGPU to register pre-reset and post-reset callbacks for handling SDMA
engine resets.
The mechanism has been replaced with a more direct and efficient approach: the
`stop_queue` and `start_queue` functions in the ring's function table.

1. **Remove Callback Mechanism**:
   - Removed the `amdgpu_sdma_register_on_reset_callbacks` function and its 
associated data structures (`sdma_on_reset_funcs`).
   - Removed the callback registration logic from the SDMA v4.4.2 
initialization code.

2. **Clean Up Related Code**:
   - Removed the `sdma_v4_4_2_set_engine_reset_funcs` function, which was used 
to register the callbacks.
   - Removed the `sdma_v4_4_2_engine_reset_funcs` structure, which contained 
the pre-reset and post-reset callback functions.

Signed-off-by: Jesse Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c | 24 
 drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h |  8 
 drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c | 12 
 3 files changed, 44 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
index dbc7c7cfee01..2d61f25528dd 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
@@ -531,30 +531,6 @@ bool amdgpu_sdma_is_shared_inv_eng(struct amdgpu_device 
*adev, struct amdgpu_rin
return false;
 }
 
-/**
- * amdgpu_sdma_register_on_reset_callbacks - Register SDMA reset callbacks
- * @funcs: Pointer to the callback structure containing pre_reset and 
post_reset functions
- *
- * This function allows KFD and AMDGPU to register their own callbacks for 
handling
- * pre-reset and post-reset operations for engine reset. These are needed 
because engine
- * reset will stop all queues on that engine.
- */
-void amdgpu_sdma_register_on_reset_callbacks(struct amdgpu_device *adev, 
struct sdma_on_reset_funcs *funcs)
-{
-   if (!funcs)
-   return;
-
-   /* Ensure the reset_callback_list is initialized */
-   if (!adev->sdma.reset_callback_list.next) {
-   INIT_LIST_HEAD(&adev->sdma.reset_callback_list);
-   }
-   /* Initialize the list node in the callback structure */
-   INIT_LIST_HEAD(&funcs->list);
-
-   /* Add the callback structure to the global list */
-   list_add_tail(&funcs->list, &adev->sdma.reset_callback_list);
-}
-
 static int amdgpu_sdma_soft_reset(struct amdgpu_device *adev, u32 instance_id)
 {
u32 soft_reset;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h
index 47d56fd0589f..419531cc8207 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h
@@ -103,13 +103,6 @@ struct amdgpu_sdma_ras {
struct amdgpu_ras_block_object ras_block;
 };
 
-struct sdma_on_reset_funcs {
-   int (*pre_reset)(struct amdgpu_device *adev, uint32_t instance_id);
-   int (*post_reset)(struct amdgpu_device *adev, uint32_t instance_id);
-   /* Linked list node to store this structure in a list; */
-   struct list_head list;
-};
-
 struct amdgpu_sdma {
struct amdgpu_sdma_instance instance[AMDGPU_MAX_SDMA_INSTANCES];
struct amdgpu_irq_src   trap_irq;
@@ -170,7 +163,6 @@ struct amdgpu_buffer_funcs {
 uint32_t byte_count);
 };
 
-void amdgpu_sdma_register_on_reset_callbacks(struct amdgpu_device *adev, 
struct sdma_on_reset_funcs *funcs);
 int amdgpu_sdma_reset_engine(struct amdgpu_device *adev, uint32_t instance_id);
 
 #define amdgpu_emit_copy_buffer(adev, ib, s, d, b, t) 
(adev)->mman.buffer_funcs->emit_copy_buffer((ib),  (s), (d), (b), (t))
diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c 
b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
index df82a97a5388..29dbee7302c7 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
@@ -106,7 +106,6 @@ static void sdma_v4_4_2_set_buffer_funcs(struct 
amdgpu_device *adev);
 static void sdma_v4_4_2_set_vm_pte_funcs(struct amdgpu_device *adev);
 static void sdma_v4_4_2_set_irq_funcs(struct amdgpu_device *adev);
 static void sdma_v4_4_2_set_ras_funcs(struct amdgpu_device *adev);
-static void sdma_v4_4_2_set_engine_reset_funcs(struct amdgpu_device *adev);
 static void sdma_v4_4_2_update_reset_mask(struct amdgpu_device *adev);
 
 static u32 sdma_v4_4_2_get_reg_offset(struct amdgpu_device *adev,
@@ -1351,7 +1350,6 @@ static int sdma_v4_4_2_early_init(struct amdgpu_ip_block 
*ip_block)
sdma_v4_4_2_set_vm_pte_funcs(adev);
sdma_v4_4_2_set_irq_funcs(adev);
sdma_v4_4_2_set_ras_funcs(adev);
-   sdma_v4_4_2_set_engine_reset_funcs(adev);
 
return 0;
 }
@@ -1739,16 +1737,6 @@ static int sd

[PATCH 1/7 V2] drm/amd/amdgpu: Simplify SDMA reset mechanism by removing dynamic callbacks

2025-04-01 Thread jesse.zhang

From: "jesse.zh...@amd.com" 

Since KFD no longer registers its own callbacks for SDMA resets, and only KGD 
uses the reset mechanism,
we can simplify the SDMA reset flow by directly calling the ring's `stop_queue` 
and `start_queue` functions.
This patch removes the dynamic callback mechanism and prepares for its eventual 
deprecation.

1. **Remove Dynamic Callbacks**:
   - The `pre_reset` and `post_reset` callback invocations in 
`amdgpu_sdma_reset_engine` are removed.
   - Instead, the ring's `stop_queue` and `start_queue` functions are called 
directly during the reset process.

2. **Prepare for Deprecation of Dynamic Mechanism**:
   - By removing the callback invocations, this patch prepares the codebase for 
the eventual removal of the dynamic callback registration mechanism.

Signed-off-by: Jesse Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h |  2 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c | 34 +++-
 drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c |  2 ++
 3 files changed, 8 insertions(+), 30 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
index 615c3d5c5a8d..1b66be2b49dc 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
@@ -237,6 +237,8 @@ struct amdgpu_ring_funcs {
void (*patch_ce)(struct amdgpu_ring *ring, unsigned offset);
void (*patch_de)(struct amdgpu_ring *ring, unsigned offset);
int (*reset)(struct amdgpu_ring *ring, unsigned int vmid);
+   int (*stop_queue)(struct amdgpu_device *adev, uint32_t instance_id);
+   int (*start_queue)(struct amdgpu_device *adev, uint32_t instance_id);
void (*emit_cleaner_shader)(struct amdgpu_ring *ring);
bool (*is_guilty)(struct amdgpu_ring *ring);
 };
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
index 0a9893fee828..7d862c887a1a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
@@ -558,16 +558,10 @@ void amdgpu_sdma_register_on_reset_callbacks(struct 
amdgpu_device *adev, struct
  * @adev: Pointer to the AMDGPU device
  * @instance_id: ID of the SDMA engine instance to reset
  *
- * This function performs the following steps:
- * 1. Calls all registered pre_reset callbacks to allow KFD and AMDGPU to save 
their state.
- * 2. Resets the specified SDMA engine instance.
- * 3. Calls all registered post_reset callbacks to allow KFD and AMDGPU to 
restore their state.
- *
  * Returns: 0 on success, or a negative error code on failure.
  */
 int amdgpu_sdma_reset_engine(struct amdgpu_device *adev, uint32_t instance_id)
 {
-   struct sdma_on_reset_funcs *funcs;
int ret = 0;
struct amdgpu_sdma_instance *sdma_instance = 
&adev->sdma.instance[instance_id];
struct amdgpu_ring *gfx_ring = &sdma_instance->ring;
@@ -589,18 +583,8 @@ int amdgpu_sdma_reset_engine(struct amdgpu_device *adev, 
uint32_t instance_id)
page_sched_stopped = true;
}
 
-   /* Invoke all registered pre_reset callbacks */
-   list_for_each_entry(funcs, &adev->sdma.reset_callback_list, list) {
-   if (funcs->pre_reset) {
-   ret = funcs->pre_reset(adev, instance_id);
-   if (ret) {
-   dev_err(adev->dev,
-   "beforeReset callback failed for instance %u: 
%d\n",
-   instance_id, ret);
-   goto exit;
-   }
-   }
-   }
+   if (gfx_ring->funcs->stop_queue)
+   gfx_ring->funcs->stop_queue(adev, instance_id);
 
/* Perform the SDMA reset for the specified instance */
ret = amdgpu_dpm_reset_sdma(adev, 1 << instance_id);
@@ -609,18 +593,8 @@ int amdgpu_sdma_reset_engine(struct amdgpu_device *adev, 
uint32_t instance_id)
goto exit;
}
 
-   /* Invoke all registered post_reset callbacks */
-   list_for_each_entry(funcs, &adev->sdma.reset_callback_list, list) {
-   if (funcs->post_reset) {
-   ret = funcs->post_reset(adev, instance_id);
-   if (ret) {
-   dev_err(adev->dev,
-   "afterReset callback failed for instance %u: 
%d\n",
-   instance_id, ret);
-   goto exit;
-   }
-   }
-   }
+   if (gfx_ring->funcs->start_queue)
+   gfx_ring->funcs->start_queue(adev, instance_id);
 
 exit:
/* Restart the scheduler's work queue for the GFX and page rings
diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c 
b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
index 688a720d..df82a97a5388 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
@@ -2143

[PATCH 3/7 V2] drm/amdgpu: Optimize SDMA v5.0 queue reset and stop logic

2025-04-01 Thread jesse.zhang

From: "jesse.zh...@amd.com" 

This patch refactors the SDMA v5.0 queue reset and stop logic to improve
code readability, maintainability, and performance. The key changes include:

1. **Generalized `sdma_v5_0_gfx_stop` Function**:
   - Added an `inst_mask` parameter to allow stopping specific SDMA instances
 instead of all instances. This is useful for resetting individual queues.

2. **Simplified `sdma_v5_0_reset_queue` Function**:
   - Removed redundant loops and checks by directly using the `ring->me` field
 to identify the SDMA instance.
   - Reused the `sdma_v5_0_gfx_stop` function to stop the queue, reducing code
 duplication.

Signed-off-by: Jesse Zhang 
---
 drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c | 65 +++---
 1 file changed, 26 insertions(+), 39 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c 
b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
index e1348b6d9c6a..9501652f903d 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
@@ -555,15 +555,15 @@ static void sdma_v5_0_ring_emit_fence(struct amdgpu_ring 
*ring, u64 addr, u64 se
  * sdma_v5_0_gfx_stop - stop the gfx async dma engines
  *
  * @adev: amdgpu_device pointer
- *
+ * @inst_mask: mask of dma engine instances to be disabled
  * Stop the gfx async dma ring buffers (NAVI10).
  */
-static void sdma_v5_0_gfx_stop(struct amdgpu_device *adev)
+static void sdma_v5_0_gfx_stop(struct amdgpu_device *adev, uint32_t inst_mask)
 {
u32 rb_cntl, ib_cntl;
int i;
 
-   for (i = 0; i < adev->sdma.num_instances; i++) {
+   for_each_inst(i, inst_mask) {
rb_cntl = RREG32_SOC15_IP(GC, sdma_v5_0_get_reg_offset(adev, i, 
mmSDMA0_GFX_RB_CNTL));
rb_cntl = REG_SET_FIELD(rb_cntl, SDMA0_GFX_RB_CNTL, RB_ENABLE, 
0);
WREG32_SOC15_IP(GC, sdma_v5_0_get_reg_offset(adev, i, 
mmSDMA0_GFX_RB_CNTL), rb_cntl);
@@ -655,9 +655,11 @@ static void sdma_v5_0_enable(struct amdgpu_device *adev, 
bool enable)
 {
u32 f32_cntl;
int i;
+   uint32_t inst_mask;
 
+   inst_mask = GENMASK(adev->sdma.num_instances - 1, 0);
if (!enable) {
-   sdma_v5_0_gfx_stop(adev);
+   sdma_v5_0_gfx_stop(adev, 1 << inst_mask);
sdma_v5_0_rlc_stop(adev);
}
 
@@ -1506,40 +1508,25 @@ static int sdma_v5_0_soft_reset(struct amdgpu_ip_block 
*ip_block)
 static int sdma_v5_0_reset_queue(struct amdgpu_ring *ring, unsigned int vmid)
 {
struct amdgpu_device *adev = ring->adev;
-   int i, j, r;
-   u32 rb_cntl, ib_cntl, f32_cntl, freeze, cntl, preempt, soft_reset, 
stat1_reg;
+   int j, r;
+   u32 f32_cntl, freeze, cntl, preempt, soft_reset, stat1_reg;
+   u32 inst_id;
 
if (amdgpu_sriov_vf(adev))
return -EINVAL;
-
-   for (i = 0; i < adev->sdma.num_instances; i++) {
-   if (ring == &adev->sdma.instance[i].ring)
-   break;
-   }
-
-   if (i == adev->sdma.num_instances) {
-   DRM_ERROR("sdma instance not found\n");
-   return -EINVAL;
-   }
-
+   inst_id = ring->me;
amdgpu_gfx_rlc_enter_safe_mode(adev, 0);
 
/* stop queue */
-   ib_cntl = RREG32(sdma_v5_0_get_reg_offset(adev, i, 
mmSDMA0_GFX_IB_CNTL));
-   ib_cntl = REG_SET_FIELD(ib_cntl, SDMA0_GFX_IB_CNTL, IB_ENABLE, 0);
-   WREG32(sdma_v5_0_get_reg_offset(adev, i, mmSDMA0_GFX_IB_CNTL), ib_cntl);
-
-   rb_cntl = RREG32(sdma_v5_0_get_reg_offset(adev, i, 
mmSDMA0_GFX_RB_CNTL));
-   rb_cntl = REG_SET_FIELD(rb_cntl, SDMA0_GFX_RB_CNTL, RB_ENABLE, 0);
-   WREG32(sdma_v5_0_get_reg_offset(adev, i, mmSDMA0_GFX_RB_CNTL), rb_cntl);
+   sdma_v5_0_gfx_stop(adev, 1 << ring->me);
 
/* engine stop SDMA1_F32_CNTL.HALT to 1 and SDMAx_FREEZE freeze bit to 
1 */
-   freeze = RREG32(sdma_v5_0_get_reg_offset(adev, i, mmSDMA0_FREEZE));
+   freeze = RREG32(sdma_v5_0_get_reg_offset(adev, inst_id, 
mmSDMA0_FREEZE));
freeze = REG_SET_FIELD(freeze, SDMA0_FREEZE, FREEZE, 1);
-   WREG32(sdma_v5_0_get_reg_offset(adev, i, mmSDMA0_FREEZE), freeze);
+   WREG32(sdma_v5_0_get_reg_offset(adev, inst_id, mmSDMA0_FREEZE), freeze);
 
for (j = 0; j < adev->usec_timeout; j++) {
-   freeze = RREG32(sdma_v5_0_get_reg_offset(adev, i, 
mmSDMA0_FREEZE));
+   freeze = RREG32(sdma_v5_0_get_reg_offset(adev, inst_id, 
mmSDMA0_FREEZE));
if (REG_GET_FIELD(freeze, SDMA0_FREEZE, FROZEN) & 1)
break;
udelay(1);
@@ -1547,7 +1534,7 @@ static int sdma_v5_0_reset_queue(struct amdgpu_ring 
*ring, unsigned int vmid)
 
/* check sdma copy engine all idle if frozen not received*/
if (j == adev->usec_timeout) {
-   stat1_reg = RREG32(sdma_v5_0_get_reg_offset(adev, i, 
mmSDMA0_STATUS1_REG));
+   stat1_reg = RREG32(sdma_v5_0_get_reg_offset(adev, inst_id, 
mmSDMA0_STATUS1_REG));

[PATCH 2/2] drm/amdgpu: Optimize VM invalidation engine allocation and synchronize GPU TLB flush

2025-02-18 Thread jesse.zhang

From: "jesse.zh...@amd.com" 

- Modify the VM invalidation engine allocation logic to handle SDMA page rings.
  SDMA page rings now share the VM invalidation engine with SDMA gfx rings 
instead of
  allocating a separate engine. This change ensures efficient resource 
management and
  avoids the issue of insufficient VM invalidation engines.

- Add synchronization for GPU TLB flush operations in gmc_v9_0.c.
  Use spin_lock and spin_unlock to ensure thread safety and prevent race 
conditions
  during TLB flush operations. This improves the stability and reliability of 
the driver,
  especially in multi-threaded environments.

Signed-off-by: Jesse Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c | 9 +
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c   | 2 ++
 2 files changed, 11 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
index cb914ce82eb5..013d31f2794b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
@@ -601,8 +601,17 @@ int amdgpu_gmc_allocate_vm_inv_eng(struct amdgpu_device 
*adev)
return -EINVAL;
}
 
+   if (ring->funcs->type == AMDGPU_RING_TYPE_SDMA &&
+   adev->sdma.has_page_queue &&
+   (strncmp(ring->name, "sdma", 4) == 0)) {
+   /* Do not allocate a separate VM invalidation engine for SDMA 
page rings.
+* Shared VM invalid engine with sdma gfx ring.
+*/
+   ring->vm_inv_eng = inv_eng - 1;
+   } else {
ring->vm_inv_eng = inv_eng - 1;
vm_inv_engs[vmhub] &= ~(1 << ring->vm_inv_eng);
+   }
 
dev_info(adev->dev, "ring %s uses VM inv eng %u on hub %u\n",
 ring->name, ring->vm_inv_eng, ring->vm_hub);
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c 
b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
index 2aa87fdf715f..2599da8677da 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -1000,6 +1000,7 @@ static uint64_t gmc_v9_0_emit_flush_gpu_tlb(struct 
amdgpu_ring *ring,
 * to WA the Issue
 */
 
+   spin_lock(&adev->gmc.invalidate_lock);
/* TODO: It needs to continue working on debugging with semaphore for 
GFXHUB as well. */
if (use_semaphore)
/* a read return value of 1 means semaphore acuqire */
@@ -1030,6 +1031,7 @@ static uint64_t gmc_v9_0_emit_flush_gpu_tlb(struct 
amdgpu_ring *ring,
amdgpu_ring_emit_wreg(ring, hub->vm_inv_eng0_sem +
  hub->eng_distance * eng, 0);
 
+   spin_unlock(&adev->gmc.invalidate_lock);
return pd_addr;
 }
 
-- 
2.25.1

[PATCH 1/2] drm/amd/amdgpu: Increase max rings to enable SDMA page ring

2025-02-18 Thread jesse.zhang

From: "jesse.zh...@amd.com" 

Increase the maximum number of rings supported by the AMDGPU driver from 132 to 
148.
This change is necessary to enable support for the SDMA page ring.

Signed-off-by: Jesse Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
index 182aa535d395..ae1dd7d16048 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
@@ -37,7 +37,7 @@ struct amdgpu_job;
 struct amdgpu_vm;
 
 /* max number of rings */
-#define AMDGPU_MAX_RINGS   132
+#define AMDGPU_MAX_RINGS   148
 #define AMDGPU_MAX_HWIP_RINGS  64
 #define AMDGPU_MAX_GFX_RINGS   2
 #define AMDGPU_MAX_SW_GFX_RINGS 2
-- 
2.25.1

[PATCH V2 1/2] drm/amd/amdgpu: Increase max rings to enable SDMA page ring

2025-02-19 Thread jesse.zhang

From: "jesse.zh...@amd.com" 

Increase the maximum number of rings supported by the AMDGPU driver from 132 to 
148.
This change is necessary to enable support for the SDMA page ring.

Signed-off-by: Jesse Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
index 182aa535d395..ae1dd7d16048 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
@@ -37,7 +37,7 @@ struct amdgpu_job;
 struct amdgpu_vm;
 
 /* max number of rings */
-#define AMDGPU_MAX_RINGS   132
+#define AMDGPU_MAX_RINGS   148
 #define AMDGPU_MAX_HWIP_RINGS  64
 #define AMDGPU_MAX_GFX_RINGS   2
 #define AMDGPU_MAX_SW_GFX_RINGS 2
-- 
2.25.1

[PATCH V2 2/2] drm/amdgpu: Optimize VM invalidation engine allocation and synchronize GPU TLB flush

2025-02-19 Thread jesse.zhang

From: "jesse.zh...@amd.com" 

- Modify the VM invalidation engine allocation logic to handle SDMA page rings.
  SDMA page rings now share the VM invalidation engine with SDMA gfx rings 
instead of
  allocating a separate engine. This change ensures efficient resource 
management and
  avoids the issue of insufficient VM invalidation engines.

- Add synchronization for GPU TLB flush operations in gmc_v9_0.c.
  Use spin_lock and spin_unlock to ensure thread safety and prevent race 
conditions
  during TLB flush operations. This improves the stability and reliability of 
the driver,
  especially in multi-threaded environments.

V2: replace the sdma ring check with a function `amdgpu_sdma_is_page_queue`
 to check if a ring is an SDMA page queue. (Lijo)

Suggested-by: Lijo Lazar 
Signed-off-by: Jesse Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c  |  7 +++
 drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c | 18 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h |  1 +
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c|  2 ++
 4 files changed, 28 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
index cb914ce82eb5..da719ec6c6c7 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
@@ -601,8 +601,15 @@ int amdgpu_gmc_allocate_vm_inv_eng(struct amdgpu_device 
*adev)
return -EINVAL;
}
 
+   if(amdgpu_sdma_is_page_queue(adev, ring)) {
+   /* Do not allocate a separate VM invalidation engine for SDMA 
page rings.
+* Shared VM invalid engine with sdma gfx ring.
+*/
+   ring->vm_inv_eng = inv_eng - 1;
+   } else {
ring->vm_inv_eng = inv_eng - 1;
vm_inv_engs[vmhub] &= ~(1 << ring->vm_inv_eng);
+   }
 
dev_info(adev->dev, "ring %s uses VM inv eng %u on hub %u\n",
 ring->name, ring->vm_inv_eng, ring->vm_hub);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
index 8de214a8ba6d..96df544feb67 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
@@ -503,6 +503,24 @@ void amdgpu_sdma_sysfs_reset_mask_fini(struct 
amdgpu_device *adev)
}
 }
 
+/**
+* amdgpu_sdma_is_page_queue - Check if a ring is an SDMA page queue
+* @adev: Pointer to the AMDGPU device structure
+* @ring: Pointer to the ring structure to check
+*
+* This function checks if the given ring is an SDMA page queue.
+* It returns true if the ring is an SDMA page queue, false otherwise.
+*/
+bool amdgpu_sdma_is_page_queue(struct amdgpu_device *adev, struct amdgpu_ring* 
ring)
+{
+   int i = ring->me;
+
+   if (!adev->sdma.has_page_queue || i >= adev->sdma.num_instances)
+   return false;
+
+   return (ring == &adev->sdma.instance[i].page);
+}
+
 /**
  * amdgpu_sdma_register_on_reset_callbacks - Register SDMA reset callbacks
  * @funcs: Pointer to the callback structure containing pre_reset and 
post_reset functions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h
index 7effc2673466..c2df9c3ab882 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h
@@ -194,4 +194,5 @@ int amdgpu_sdma_ras_sw_init(struct amdgpu_device *adev);
 void amdgpu_debugfs_sdma_sched_mask_init(struct amdgpu_device *adev);
 int amdgpu_sdma_sysfs_reset_mask_init(struct amdgpu_device *adev);
 void amdgpu_sdma_sysfs_reset_mask_fini(struct amdgpu_device *adev);
+bool amdgpu_sdma_is_page_queue(struct amdgpu_device *adev, struct amdgpu_ring* 
ring);
 #endif
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c 
b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
index 2aa87fdf715f..2599da8677da 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -1000,6 +1000,7 @@ static uint64_t gmc_v9_0_emit_flush_gpu_tlb(struct 
amdgpu_ring *ring,
 * to WA the Issue
 */
 
+   spin_lock(&adev->gmc.invalidate_lock);
/* TODO: It needs to continue working on debugging with semaphore for 
GFXHUB as well. */
if (use_semaphore)
/* a read return value of 1 means semaphore acuqire */
@@ -1030,6 +1031,7 @@ static uint64_t gmc_v9_0_emit_flush_gpu_tlb(struct 
amdgpu_ring *ring,
amdgpu_ring_emit_wreg(ring, hub->vm_inv_eng0_sem +
  hub->eng_distance * eng, 0);
 
+   spin_unlock(&adev->gmc.invalidate_lock);
return pd_addr;
 }
 
-- 
2.25.1

[PATCH v3 1/2] drm/amd/amdgpu: Increase max rings to enable SDMA page ring

2025-02-19 Thread jesse.zhang

From: "jesse.zh...@amd.com" 

Increase the maximum number of rings supported by the AMDGPU driver from 132 to 
148.
This change is necessary to enable support for the SDMA page ring.

Signed-off-by: Jesse Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
index 182aa535d395..ae1dd7d16048 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
@@ -37,7 +37,7 @@ struct amdgpu_job;
 struct amdgpu_vm;
 
 /* max number of rings */
-#define AMDGPU_MAX_RINGS   132
+#define AMDGPU_MAX_RINGS   148
 #define AMDGPU_MAX_HWIP_RINGS  64
 #define AMDGPU_MAX_GFX_RINGS   2
 #define AMDGPU_MAX_SW_GFX_RINGS 2
-- 
2.25.1

[PATCH v3 2/2] drm/amdgpu: Optimize VM invalidation engine allocation and synchronize GPU TLB flush

2025-02-19 Thread jesse.zhang

From: "jesse.zh...@amd.com" 

- Modify the VM invalidation engine allocation logic to handle SDMA page rings.
  SDMA page rings now share the VM invalidation engine with SDMA gfx rings 
instead of
  allocating a separate engine. This change ensures efficient resource 
management and
  avoids the issue of insufficient VM invalidation engines.

- Add synchronization for GPU TLB flush operations in gmc_v9_0.c.
  Use spin_lock and spin_unlock to ensure thread safety and prevent race 
conditions
  during TLB flush operations. This improves the stability and reliability of 
the driver,
  especially in multi-threaded environments.

V3: replace the sdma ring check with a function `amdgpu_sdma_is_shared_inv_eng`
 to check if a ring is an SDMA ring that shares a VM invalidation engine.

Suggested-by: Lijo Lazar 
Signed-off-by: Jesse Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c  |  7 +++
 drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c | 18 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h |  1 +
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c|  2 ++
 4 files changed, 28 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
index cb914ce82eb5..8ccc3fb34940 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
@@ -601,8 +601,15 @@ int amdgpu_gmc_allocate_vm_inv_eng(struct amdgpu_device 
*adev)
return -EINVAL;
}
 
+   if(amdgpu_sdma_is_shared_inv_eng(adev, ring)) {
+   /* Do not allocate a separate VM invalidation engine for SDMA 
page rings.
+* Shared VM invalid engine with sdma gfx ring.
+*/
+   ring->vm_inv_eng = inv_eng - 1;
+   } else {
ring->vm_inv_eng = inv_eng - 1;
vm_inv_engs[vmhub] &= ~(1 << ring->vm_inv_eng);
+   }
 
dev_info(adev->dev, "ring %s uses VM inv eng %u on hub %u\n",
 ring->name, ring->vm_inv_eng, ring->vm_hub);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
index 8de214a8ba6d..159ebd9ee62f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
@@ -503,6 +503,24 @@ void amdgpu_sdma_sysfs_reset_mask_fini(struct 
amdgpu_device *adev)
}
 }
 
+/**
+* amdgpu_sdma_is_shared_inv_eng - Check if a ring is an SDMA ring that shares 
a VM invalidation engine
+* @adev: Pointer to the AMDGPU device structure
+* @ring: Pointer to the ring structure to check
+*
+* This function checks if the given ring is an SDMA ring that shares a VM 
invalidation engine.
+* It returns true if the ring is such an SDMA ring, false otherwise.
+*/
+bool amdgpu_sdma_is_shared_inv_eng(struct amdgpu_device *adev, struct 
amdgpu_ring* ring)
+{
+   int i = ring->me;
+
+   if (!adev->sdma.has_page_queue || i >= adev->sdma.num_instances)
+   return false;
+
+   return (ring == &adev->sdma.instance[i].ring);
+}
+
 /**
  * amdgpu_sdma_register_on_reset_callbacks - Register SDMA reset callbacks
  * @funcs: Pointer to the callback structure containing pre_reset and 
post_reset functions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h
index 7effc2673466..da3ec6655be7 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h
@@ -194,4 +194,5 @@ int amdgpu_sdma_ras_sw_init(struct amdgpu_device *adev);
 void amdgpu_debugfs_sdma_sched_mask_init(struct amdgpu_device *adev);
 int amdgpu_sdma_sysfs_reset_mask_init(struct amdgpu_device *adev);
 void amdgpu_sdma_sysfs_reset_mask_fini(struct amdgpu_device *adev);
+bool amdgpu_sdma_is_shared_inv_eng(struct amdgpu_device *adev, struct 
amdgpu_ring* ring);
 #endif
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c 
b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
index 2aa87fdf715f..2599da8677da 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -1000,6 +1000,7 @@ static uint64_t gmc_v9_0_emit_flush_gpu_tlb(struct 
amdgpu_ring *ring,
 * to WA the Issue
 */
 
+   spin_lock(&adev->gmc.invalidate_lock);
/* TODO: It needs to continue working on debugging with semaphore for 
GFXHUB as well. */
if (use_semaphore)
/* a read return value of 1 means semaphore acuqire */
@@ -1030,6 +1031,7 @@ static uint64_t gmc_v9_0_emit_flush_gpu_tlb(struct 
amdgpu_ring *ring,
amdgpu_ring_emit_wreg(ring, hub->vm_inv_eng0_sem +
  hub->eng_distance * eng, 0);
 
+   spin_unlock(&adev->gmc.invalidate_lock);
return pd_addr;
 }
 
-- 
2.25.1

[PATCH v3 1/2] drm/amd/pm: add support for checking SDMA reset capability

2025-02-20 Thread jesse.zhang

From: "jesse.zh...@amd.com" 

This patch introduces a new function to check if the SMU supports resetting the 
SDMA engine.
This capability check ensures that the driver does not attempt to reset the 
SDMA engine
on hardware that does not support it.

The following changes are included:
- New function `amdgpu_dpm_reset_sdma_is_supported` to check SDMA reset
  support at the AMDGPU driver level.
- New function `smu_reset_sdma_is_supported` to check SDMA reset support
  at the SMU level.
- Implementation of `smu_v13_0_6_reset_sdma_is_supported` for the specific
  SMU version v13.0.6.
- Updated `smu_v13_0_6_reset_sdma` to use the new capability check before
  attempting to reset the SDMA engine.

v3: change smu_reset_sdma_is_supported type to bool (Tim)

Signed-off-by: Vitaly Prosyak 
Signed-off-by: Jesse Zhang 
---
 drivers/gpu/drm/amd/pm/amdgpu_dpm.c   | 23 +++
 drivers/gpu/drm/amd/pm/inc/amdgpu_dpm.h   |  1 +
 drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c | 17 ++
 drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h |  5 
 .../drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c  | 23 ++-
 5 files changed, 68 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/pm/amdgpu_dpm.c 
b/drivers/gpu/drm/amd/pm/amdgpu_dpm.c
index faae9bf48aa4..7c4ff12269d9 100644
--- a/drivers/gpu/drm/amd/pm/amdgpu_dpm.c
+++ b/drivers/gpu/drm/amd/pm/amdgpu_dpm.c
@@ -722,6 +722,29 @@ int amdgpu_dpm_send_rma_reason(struct amdgpu_device *adev)
return ret;
 }
 
+/**
+ * amdgpu_dpm_reset_sdma_is_supported - Check if SDMA reset is supported
+ * @adev: amdgpu_device pointer
+ *
+ * This function checks if the SMU supports resetting the SDMA engine.
+ * It returns false if the hardware does not support software SMU or
+ * if the feature is not supported.
+ */
+bool amdgpu_dpm_reset_sdma_is_supported(struct amdgpu_device *adev)
+{
+   struct smu_context *smu = adev->powerplay.pp_handle;
+   bool ret;
+
+   if (!is_support_sw_smu(adev))
+   return false;
+
+   mutex_lock(&adev->pm.mutex);
+   ret = smu_reset_sdma_is_supported(smu);
+   mutex_unlock(&adev->pm.mutex);
+
+   return ret;
+}
+
 int amdgpu_dpm_reset_sdma(struct amdgpu_device *adev, uint32_t inst_mask)
 {
struct smu_context *smu = adev->powerplay.pp_handle;
diff --git a/drivers/gpu/drm/amd/pm/inc/amdgpu_dpm.h 
b/drivers/gpu/drm/amd/pm/inc/amdgpu_dpm.h
index 1f5ac7e0230d..9fb26b5c8ae7 100644
--- a/drivers/gpu/drm/amd/pm/inc/amdgpu_dpm.h
+++ b/drivers/gpu/drm/amd/pm/inc/amdgpu_dpm.h
@@ -603,5 +603,6 @@ int amdgpu_dpm_set_pm_policy(struct amdgpu_device *adev, 
int policy_type,
 ssize_t amdgpu_dpm_get_pm_policy_info(struct amdgpu_device *adev,
  enum pp_pm_policy p_type, char *buf);
 int amdgpu_dpm_reset_sdma(struct amdgpu_device *adev, uint32_t inst_mask);
+bool amdgpu_dpm_reset_sdma_is_supported(struct amdgpu_device *adev);
 
 #endif
diff --git a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c 
b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
index 0b32c6cf6924..d71c8c58caa4 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
@@ -3907,6 +3907,23 @@ int smu_send_rma_reason(struct smu_context *smu)
return ret;
 }
 
+/**
+ * smu_reset_sdma_is_supported - Check if SDMA reset is supported by SMU
+ * @smu: smu_context pointer
+ *
+ * This function checks if the SMU supports resetting the SDMA engine.
+ * It returns true if supported, false otherwise.
+ */
+bool smu_reset_sdma_is_supported(struct smu_context *smu)
+{
+   bool ret = false;
+
+   if (smu->ppt_funcs && smu->ppt_funcs->reset_sdma_is_supported)
+   ret = smu->ppt_funcs->reset_sdma_is_supported(smu);
+
+   return ret;
+}
+
 int smu_reset_sdma(struct smu_context *smu, uint32_t inst_mask)
 {
int ret = 0;
diff --git a/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h 
b/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h
index 3630593bce61..3ba169639f54 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h
+++ b/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h
@@ -1376,6 +1376,10 @@ struct pptable_funcs {
 * @reset_sdma: message SMU to soft reset sdma instance.
 */
int (*reset_sdma)(struct smu_context *smu, uint32_t inst_mask);
+   /**
+* @reset_sdma_is_supported: Check if support resets the SDMA engine.
+*/
+   bool (*reset_sdma_is_supported)(struct smu_context *smu);
 
/**
 * @get_ecc_table:  message SMU to get ECC INFO table.
@@ -1637,6 +1641,7 @@ int smu_send_hbm_bad_pages_num(struct smu_context *smu, 
uint32_t size);
 int smu_send_hbm_bad_channel_flag(struct smu_context *smu, uint32_t size);
 int smu_send_rma_reason(struct smu_context *smu);
 int smu_reset_sdma(struct smu_context *smu, uint32_t inst_mask);
+bool smu_reset_sdma_is_supported(struct smu_context *smu);
 int smu_set_pm_policy(struct smu_context *smu, enum pp_pm_policy p_type,

[PATCH v3 2/2] drm/amdgpu: Initialize SDMA sysfs reset mask in late_init

2025-02-20 Thread jesse.zhang

From: "jesse.zh...@amd.com" 

- Introduce a new function `sdma_v4_4_2_init_sysfs_reset_mask` to initialize 
the sysfs reset mask for SDMA.
- Move the initialization of the sysfs reset mask to the `late_init` stage to 
ensure that the SMU initialization and capability setup are completed before 
checking the SDMA reset capability.
- Consolidate the logic for setting the supported reset types and initializing 
the sysfs reset mask into the new function.
- For IP versions 9.4.3 and 9.4.4, enable per-queue reset if the MEC firmware 
version is at least 0xb0 and PMFW supports queue reset.
- Add a TODO comment for future support of per-queue reset for IP version 9.4.5.

This change ensures that per-queue reset is only enabled when the MEC and PMFW 
support it.

Suggested-by: Jonathan Kim 
Signed-off-by: Vitaly Prosyak 
Signed-off-by: Jesse Zhang 
---
 drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c | 55 
 1 file changed, 47 insertions(+), 8 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c 
b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
index 4fa688e00f5e..fd2884de2dc4 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
@@ -107,6 +107,7 @@ static void sdma_v4_4_2_set_vm_pte_funcs(struct 
amdgpu_device *adev);
 static void sdma_v4_4_2_set_irq_funcs(struct amdgpu_device *adev);
 static void sdma_v4_4_2_set_ras_funcs(struct amdgpu_device *adev);
 static void sdma_v4_4_2_set_engine_reset_funcs(struct amdgpu_device *adev);
+static int  sdma_v4_4_2_init_sysfs_reset_mask(struct amdgpu_device *adev);
 
 static u32 sdma_v4_4_2_get_reg_offset(struct amdgpu_device *adev,
u32 instance, u32 offset)
@@ -1366,6 +1367,7 @@ static int sdma_v4_4_2_process_ras_data_cb(struct 
amdgpu_device *adev,
 static int sdma_v4_4_2_late_init(struct amdgpu_ip_block *ip_block)
 {
struct amdgpu_device *adev = ip_block->adev;
+   int r;
 #if 0
struct ras_ih_if ih_info = {
.cb = sdma_v4_4_2_process_ras_data_cb,
@@ -1374,7 +1376,12 @@ static int sdma_v4_4_2_late_init(struct amdgpu_ip_block 
*ip_block)
if (!amdgpu_persistent_edc_harvesting_supported(adev))
amdgpu_ras_reset_error_count(adev, AMDGPU_RAS_BLOCK__SDMA);
 
-   return 0;
+   /* The initialization is done in the late_init stage to ensure that the 
SMU
+* initialization and capability setup are completed before we check 
the SDMA
+* reset capability
+*/
+   r = sdma_v4_4_2_init_sysfs_reset_mask(adev);
+   return r;
 }
 
 static int sdma_v4_4_2_sw_init(struct amdgpu_ip_block *ip_block)
@@ -1481,10 +1488,6 @@ static int sdma_v4_4_2_sw_init(struct amdgpu_ip_block 
*ip_block)
}
}
 
-   /* TODO: Add queue reset mask when FW fully supports it */
-   adev->sdma.supported_reset =
-   amdgpu_get_soft_full_reset_mask(&adev->sdma.instance[0].ring);
-
if (amdgpu_sdma_ras_sw_init(adev)) {
dev_err(adev->dev, "fail to initialize sdma ras block\n");
return -EINVAL;
@@ -1497,9 +1500,6 @@ static int sdma_v4_4_2_sw_init(struct amdgpu_ip_block 
*ip_block)
else
DRM_ERROR("Failed to allocated memory for SDMA IP Dump\n");
 
-   r = amdgpu_sdma_sysfs_reset_mask_init(adev);
-   if (r)
-   return r;
/* Initialize guilty flags for GFX and PAGE queues */
adev->sdma.gfx_guilty = false;
adev->sdma.page_guilty = false;
@@ -2328,6 +2328,45 @@ static void sdma_v4_4_2_set_vm_pte_funcs(struct 
amdgpu_device *adev)
adev->vm_manager.vm_pte_num_scheds = adev->sdma.num_instances;
 }
 
+/**
+ * sdma_v4_4_2_init_sysfs_reset_mask - Initialize sysfs reset mask for SDMA
+ * @adev: Pointer to the AMDGPU device structure
+ *
+ * This function initializes the sysfs reset mask for SDMA and sets the 
supported
+ * reset types based on the IP version and firmware versions.
+ *
+ * Returns: 0 on success, or a negative error code on failure.
+ */
+static int sdma_v4_4_2_init_sysfs_reset_mask(struct amdgpu_device *adev)
+{
+   int r = 0;
+
+   /* Set the supported reset types */
+   adev->sdma.supported_reset =
+   amdgpu_get_soft_full_reset_mask(&adev->sdma.instance[0].ring);
+   /*
+* the user queue relies on MEC fw and pmfw when the sdma queue do 
reset.
+* it needs to check both of them at here to skip old mec and pmfw.
+*/
+   switch (amdgpu_ip_version(adev, GC_HWIP, 0)) {
+   case IP_VERSION(9, 4, 3):
+   case IP_VERSION(9, 4, 4):
+   if ((adev->gfx.mec_fw_version >= 0xb0) && 
amdgpu_dpm_reset_sdma_is_supported(adev))
+   adev->sdma.supported_reset |= 
AMDGPU_RESET_TYPE_PER_QUEUE;
+   break;
+   case IP_VERSION(9, 4, 5):
+   /*TODO: enable the queue reset flag until fw supported */
+   default:
+   break;
+   }
+
+   /* Initialize the sy

[PATCH] drm/amdgpu: update SDMA reset mask in late_init

2025-02-21 Thread jesse.zhang

From: "jesse.zh...@amd.com" 

- Add the `sdma_v4_4_2_update_reset_mask` function to update the reset mask.
- Update the sysfs reset mask in the `late_init` stage to ensure that the SMU 
initialization and capability setup are completed before checking the SDMA 
reset capability.
- For IP versions 9.4.3 and 9.4.4, enable per-queue reset if the MEC firmware 
version is at least 0xb0 and PMFW supports queue reset.
- Add a TODO comment for future support of per-queue reset for IP version 9.4.5.

This change ensures that per-queue reset is only enabled when the MEC and PMFW 
support it.

Suggested-by: Jonathan Kim 
Signed-off-by: Vitaly Prosyak 
Signed-off-by: Jesse Zhang 
---
 drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c | 37 +++-
 1 file changed, 36 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c 
b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
index 4fa688e00f5e..17e7e36f4477 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
@@ -107,6 +107,7 @@ static void sdma_v4_4_2_set_vm_pte_funcs(struct 
amdgpu_device *adev);
 static void sdma_v4_4_2_set_irq_funcs(struct amdgpu_device *adev);
 static void sdma_v4_4_2_set_ras_funcs(struct amdgpu_device *adev);
 static void sdma_v4_4_2_set_engine_reset_funcs(struct amdgpu_device *adev);
+static void sdma_v4_4_2_update_reset_mask(struct amdgpu_device *adev);
 
 static u32 sdma_v4_4_2_get_reg_offset(struct amdgpu_device *adev,
u32 instance, u32 offset)
@@ -1374,6 +1375,12 @@ static int sdma_v4_4_2_late_init(struct amdgpu_ip_block 
*ip_block)
if (!amdgpu_persistent_edc_harvesting_supported(adev))
amdgpu_ras_reset_error_count(adev, AMDGPU_RAS_BLOCK__SDMA);
 
+   /* The initialization is done in the late_init stage to ensure that the 
SMU
+* initialization and capability setup are completed before we check 
the SDMA
+* reset capability
+*/
+   sdma_v4_4_2_update_reset_mask(adev);
+
return 0;
 }
 
@@ -1481,7 +1488,6 @@ static int sdma_v4_4_2_sw_init(struct amdgpu_ip_block 
*ip_block)
}
}
 
-   /* TODO: Add queue reset mask when FW fully supports it */
adev->sdma.supported_reset =
amdgpu_get_soft_full_reset_mask(&adev->sdma.instance[0].ring);
 
@@ -2328,6 +2334,35 @@ static void sdma_v4_4_2_set_vm_pte_funcs(struct 
amdgpu_device *adev)
adev->vm_manager.vm_pte_num_scheds = adev->sdma.num_instances;
 }
 
+/**
+ * sdma_v4_4_2_update_reset_mask - update  reset mask for SDMA
+ * @adev: Pointer to the AMDGPU device structure
+ *
+ * This function update reset mask for SDMA and sets the supported
+ * reset types based on the IP version and firmware versions.
+ *
+ */
+static void sdma_v4_4_2_update_reset_mask(struct amdgpu_device *adev)
+{
+
+   /*
+* the user queue relies on MEC fw and pmfw when the sdma queue do 
reset.
+* it needs to check both of them at here to skip old mec and pmfw.
+*/
+   switch (amdgpu_ip_version(adev, GC_HWIP, 0)) {
+   case IP_VERSION(9, 4, 3):
+   case IP_VERSION(9, 4, 4):
+   if ((adev->gfx.mec_fw_version >= 0xb0) && 
amdgpu_dpm_reset_sdma_is_supported(adev))
+   adev->sdma.supported_reset |= 
AMDGPU_RESET_TYPE_PER_QUEUE;
+   break;
+   case IP_VERSION(9, 4, 5):
+   /*TODO: enable the queue reset flag until fw supported */
+   default:
+   break;
+   }
+
+}
+
 const struct amdgpu_ip_block_version sdma_v4_4_2_ip_block = {
.type = AMD_IP_BLOCK_TYPE_SDMA,
.major = 4,
-- 
2.25.1

[PATCH] drm/amdgpu: update SDMA sysfs reset mask in late_init

2025-02-24 Thread jesse.zhang

From: "jesse.zh...@amd.com" 

- Add the `sdma_v4_4_2_update_reset_mask` function to update the reset mask.
- Update the sysfs reset mask in the `late_init` stage to ensure that the SMU 
initialization and capability setup are completed before checking the SDMA 
reset capability.
- For IP versions 9.4.3 and 9.4.4, enable per-queue reset if the MEC firmware 
version is at least 0xb0 and PMFW supports queue reset.
- Add a TODO comment for future support of per-queue reset for IP version 9.5.0.

This change ensures that per-queue reset is only enabled when the MEC and PMFW 
support it.

v2: fix IP version (9.4.5 -> 9.5.0) (Lijo)

Suggested-by: Jonathan Kim 
Signed-off-by: Vitaly Prosyak 
Signed-off-by: Jesse Zhang 
---
 drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c | 37 +++-
 1 file changed, 36 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c 
b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
index 4fa688e00f5e..ba43c8f46f45 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
@@ -107,6 +107,7 @@ static void sdma_v4_4_2_set_vm_pte_funcs(struct 
amdgpu_device *adev);
 static void sdma_v4_4_2_set_irq_funcs(struct amdgpu_device *adev);
 static void sdma_v4_4_2_set_ras_funcs(struct amdgpu_device *adev);
 static void sdma_v4_4_2_set_engine_reset_funcs(struct amdgpu_device *adev);
+static void sdma_v4_4_2_update_reset_mask(struct amdgpu_device *adev);
 
 static u32 sdma_v4_4_2_get_reg_offset(struct amdgpu_device *adev,
u32 instance, u32 offset)
@@ -1374,6 +1375,12 @@ static int sdma_v4_4_2_late_init(struct amdgpu_ip_block 
*ip_block)
if (!amdgpu_persistent_edc_harvesting_supported(adev))
amdgpu_ras_reset_error_count(adev, AMDGPU_RAS_BLOCK__SDMA);
 
+   /* The initialization is done in the late_init stage to ensure that the 
SMU
+* initialization and capability setup are completed before we check 
the SDMA
+* reset capability
+*/
+   sdma_v4_4_2_update_reset_mask(adev);
+
return 0;
 }
 
@@ -1481,7 +1488,6 @@ static int sdma_v4_4_2_sw_init(struct amdgpu_ip_block 
*ip_block)
}
}
 
-   /* TODO: Add queue reset mask when FW fully supports it */
adev->sdma.supported_reset =
amdgpu_get_soft_full_reset_mask(&adev->sdma.instance[0].ring);
 
@@ -2328,6 +2334,35 @@ static void sdma_v4_4_2_set_vm_pte_funcs(struct 
amdgpu_device *adev)
adev->vm_manager.vm_pte_num_scheds = adev->sdma.num_instances;
 }
 
+/**
+ * sdma_v4_4_2_update_reset_mask - update  reset mask for SDMA
+ * @adev: Pointer to the AMDGPU device structure
+ *
+ * This function update reset mask for SDMA and sets the supported
+ * reset types based on the IP version and firmware versions.
+ *
+ */
+static void sdma_v4_4_2_update_reset_mask(struct amdgpu_device *adev)
+{
+
+   /*
+* the user queue relies on MEC fw and pmfw when the sdma queue do 
reset.
+* it needs to check both of them at here to skip old mec and pmfw.
+*/
+   switch (amdgpu_ip_version(adev, GC_HWIP, 0)) {
+   case IP_VERSION(9, 4, 3):
+   case IP_VERSION(9, 4, 4):
+   if ((adev->gfx.mec_fw_version >= 0xb0) && 
amdgpu_dpm_reset_sdma_is_supported(adev))
+   adev->sdma.supported_reset |= 
AMDGPU_RESET_TYPE_PER_QUEUE;
+   break;
+   case IP_VERSION(9, 5, 0):
+   /*TODO: enable the queue reset flag until fw supported */
+   default:
+   break;
+   }
+
+}
+
 const struct amdgpu_ip_block_version sdma_v4_4_2_ip_block = {
.type = AMD_IP_BLOCK_TYPE_SDMA,
.major = 4,
-- 
2.25.1

[PATCH v2 1/2] drm/amd/pm: add support for checking SDMA reset capability

2025-02-20 Thread jesse.zhang

From: "jesse.zh...@amd.com" 

This patch introduces a new function to check if the SMU supports resetting the 
SDMA engine.
This capability check ensures that the driver does not attempt to reset the 
SDMA engine
on hardware that does not support it.

The following changes are included:
- New function `amdgpu_dpm_reset_sdma_is_supported` to check SDMA reset
  support at the AMDGPU driver level.
- New function `smu_reset_sdma_is_supported` to check SDMA reset support
  at the SMU level.
- Implementation of `smu_v13_0_6_reset_sdma_is_supported` for the specific
  SMU version v13.0.6.
- Updated `smu_v13_0_6_reset_sdma` to use the new capability check before
  attempting to reset the SDMA engine.

Signed-off-by: Vitaly Prosyak 
Signed-off-by: Jesse Zhang 
---
 drivers/gpu/drm/amd/pm/amdgpu_dpm.c   | 23 +++
 drivers/gpu/drm/amd/pm/inc/amdgpu_dpm.h   |  1 +
 drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c | 17 ++
 drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h |  5 
 .../drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c  | 23 ++-
 5 files changed, 68 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/pm/amdgpu_dpm.c 
b/drivers/gpu/drm/amd/pm/amdgpu_dpm.c
index faae9bf48aa4..26209d5ff787 100644
--- a/drivers/gpu/drm/amd/pm/amdgpu_dpm.c
+++ b/drivers/gpu/drm/amd/pm/amdgpu_dpm.c
@@ -722,6 +722,29 @@ int amdgpu_dpm_send_rma_reason(struct amdgpu_device *adev)
return ret;
 }
 
+/**
+ * amdgpu_dpm_reset_sdma_is_supported - Check if SDMA reset is supported
+ * @adev: amdgpu_device pointer
+ *
+ * This function checks if the SMU supports resetting the SDMA engine.
+ * It returns -EOPNOTSUPP if the hardware does not support software SMU or
+ * if the feature is not supported.
+ */
+int amdgpu_dpm_reset_sdma_is_supported(struct amdgpu_device *adev)
+{
+   struct smu_context *smu = adev->powerplay.pp_handle;
+   int ret;
+
+   if (!is_support_sw_smu(adev))
+   return -EOPNOTSUPP;
+
+   mutex_lock(&adev->pm.mutex);
+   ret = smu_reset_sdma_is_supported(smu);
+   mutex_unlock(&adev->pm.mutex);
+
+   return ret;
+}
+
 int amdgpu_dpm_reset_sdma(struct amdgpu_device *adev, uint32_t inst_mask)
 {
struct smu_context *smu = adev->powerplay.pp_handle;
diff --git a/drivers/gpu/drm/amd/pm/inc/amdgpu_dpm.h 
b/drivers/gpu/drm/amd/pm/inc/amdgpu_dpm.h
index 1f5ac7e0230d..353a10119dc5 100644
--- a/drivers/gpu/drm/amd/pm/inc/amdgpu_dpm.h
+++ b/drivers/gpu/drm/amd/pm/inc/amdgpu_dpm.h
@@ -603,5 +603,6 @@ int amdgpu_dpm_set_pm_policy(struct amdgpu_device *adev, 
int policy_type,
 ssize_t amdgpu_dpm_get_pm_policy_info(struct amdgpu_device *adev,
  enum pp_pm_policy p_type, char *buf);
 int amdgpu_dpm_reset_sdma(struct amdgpu_device *adev, uint32_t inst_mask);
+int amdgpu_dpm_reset_sdma_is_supported(struct amdgpu_device *adev);
 
 #endif
diff --git a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c 
b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
index 0b32c6cf6924..f860590ef893 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
@@ -3907,6 +3907,23 @@ int smu_send_rma_reason(struct smu_context *smu)
return ret;
 }
 
+/**
+ * smu_reset_sdma_is_supported - Check if SDMA reset is supported by SMU
+ * @smu: smu_context pointer
+ *
+ * This function checks if the SMU supports resetting the SDMA engine.
+ * It returns 0 if supported, -EOPNOTSUPP otherwise.
+ */
+int smu_reset_sdma_is_supported(struct smu_context *smu)
+{
+   int ret = 0;
+
+   if (smu->ppt_funcs && smu->ppt_funcs->reset_sdma_is_supported)
+   ret = smu->ppt_funcs->reset_sdma_is_supported(smu);
+
+   return ret;
+}
+
 int smu_reset_sdma(struct smu_context *smu, uint32_t inst_mask)
 {
int ret = 0;
diff --git a/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h 
b/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h
index 3630593bce61..090a2b3b81a0 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h
+++ b/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h
@@ -1376,6 +1376,10 @@ struct pptable_funcs {
 * @reset_sdma: message SMU to soft reset sdma instance.
 */
int (*reset_sdma)(struct smu_context *smu, uint32_t inst_mask);
+   /**
+* @reset_sdma_is_supported: Check if support resets the SDMA engine.
+*/
+   int (*reset_sdma_is_supported)(struct smu_context *smu);
 
/**
 * @get_ecc_table:  message SMU to get ECC INFO table.
@@ -1637,6 +1641,7 @@ int smu_send_hbm_bad_pages_num(struct smu_context *smu, 
uint32_t size);
 int smu_send_hbm_bad_channel_flag(struct smu_context *smu, uint32_t size);
 int smu_send_rma_reason(struct smu_context *smu);
 int smu_reset_sdma(struct smu_context *smu, uint32_t inst_mask);
+int smu_reset_sdma_is_supported(struct smu_context *smu);
 int smu_set_pm_policy(struct smu_context *smu, enum pp_pm_policy p_type,
  int level);
 ssize_t smu_get_pm_policy_info(struc

[PATCH V2 2/2] drm/amdgpu: Enable per-queue reset support

2025-02-20 Thread jesse.zhang

From: "jesse.zh...@amd.com" 

- Modify the `sdma_v4_4_2_sw_init` function to conditionally enable per-queue 
reset support.
- For IP versions 9.4.3 and 9.4.4, enable per-queue reset if the MEC firmware 
version is at least 0xb0 and PMFW supports queue reset.
- Add a TODO comment for future support of per-queue reset for IP version 9.4.5.

This change ensures that per-queue reset is only enabled when the MEC and PMFW 
support it.

Suggested-by: Jonathan Kim 
Signed-off-by: Vitaly Prosyak 
Signed-off-by: Jesse Zhang 
---
 drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c | 16 +++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c 
b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
index 9925b183c07f..0e004b156e95 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
@@ -1458,9 +1458,23 @@ static int sdma_v4_4_2_sw_init(struct amdgpu_ip_block 
*ip_block)
}
}
 
-   /* TODO: Add queue reset mask when FW fully supports it */
adev->sdma.supported_reset =
amdgpu_get_soft_full_reset_mask(&adev->sdma.instance[0].ring);
+   /*
+* the user queue relies on MEC fw and pmfw when the sdma queue do 
reset.
+* it needs to check both of them at here to skip old mec and pmfw.
+*/
+   switch (amdgpu_ip_version(adev, GC_HWIP, 0)) {
+   case IP_VERSION(9, 4, 3):
+   case IP_VERSION(9, 4, 4):
+   if ((adev->gfx.mec_fw_version >= 0xb0) && 
amdgpu_dpm_reset_sdma_is_supported(adev))
+   adev->gfx.compute_supported_reset |= 
AMDGPU_RESET_TYPE_PER_QUEUE;
+   break;
+   case IP_VERSION(9, 4, 5):
+   /*TODO: enable the queue reset flag until fw supported */
+   default:
+   break;
+   }
 
if (amdgpu_sdma_ras_sw_init(adev)) {
dev_err(adev->dev, "fail to initialize sdma ras block\n");
-- 
2.25.1

[PATCH 2/7 V2] drm/amd/amdgpu: Implement SDMA soft reset directly for sdma v5

2025-04-04 Thread jesse.zhang

From: "jesse.zh...@amd.com" 

This patch introduces a new function `amdgpu_sdma_soft_reset` to handle SDMA 
soft resets directly,
rather than relying on the DPM interface.

1. **New `amdgpu_sdma_soft_reset` Function**:
   - Implements a soft reset for SDMA engines by directly writing to the 
hardware registers.
   - Handles SDMA versions 4.x and 5.x separately:
     - For SDMA 4.x, the existing `amdgpu_dpm_reset_sdma` function is used for 
       backward compatibility.
     - For SDMA 5.x, the driver directly manipulates the `GRBM_SOFT_RESET` 
       register to reset the specified SDMA instance.

2. **Integration into `amdgpu_sdma_reset_engine`**:
   - The `amdgpu_sdma_soft_reset` function is called during the SDMA reset 
process, replacing the previous call to `amdgpu_dpm_reset_sdma`.

Suggested-by: Alex Deucher 
Signed-off-by: Jesse Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c | 46 +++-
 1 file changed, 45 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
index 7d862c887a1a..dbc7c7cfee01 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
@@ -26,6 +26,8 @@
 #include "amdgpu_sdma.h"
 #include "amdgpu_ras.h"
 #include "amdgpu_reset.h"
+#include "gc/gc_10_1_0_offset.h"
+#include "gc/gc_10_3_0_sh_mask.h"
 
 #define AMDGPU_CSA_SDMA_SIZE 64
 /* SDMA CSA reside in the 3rd page of CSA */
@@ -553,6 +555,48 @@ void amdgpu_sdma_register_on_reset_callbacks(struct 
amdgpu_device *adev, struct
list_add_tail(&funcs->list, &adev->sdma.reset_callback_list);
 }
 
+static int amdgpu_sdma_soft_reset(struct amdgpu_device *adev, u32 instance_id)
+{
+   u32 soft_reset;
+   int r = 0;
+
+   switch (amdgpu_ip_version(adev, SDMA0_HWIP, 0)) {
+   case IP_VERSION(4, 4, 2):
+   case IP_VERSION(4, 4, 4):
+   case IP_VERSION(4, 4, 5):
+   /* For SDMA 4.x, use the existing DPM interface for backward 
compatibility */
+   r = amdgpu_dpm_reset_sdma(adev, 1 << instance_id);
+   break;
+   case IP_VERSION(5, 0, 0):
+   case IP_VERSION(5, 0, 1):
+   case IP_VERSION(5, 0, 2):
+   case IP_VERSION(5, 0, 5):
+   case IP_VERSION(5, 2, 0):
+   case IP_VERSION(5, 2, 2):
+   case IP_VERSION(5, 2, 4):
+   case IP_VERSION(5, 2, 5):
+   case IP_VERSION(5, 2, 6):
+   case IP_VERSION(5, 2, 3):
+   case IP_VERSION(5, 2, 1):
+   case IP_VERSION(5, 2, 7):
+   /* For SDMA 5.x, directly manipulate the GRBM_SOFT_RESET 
register */
+   soft_reset = RREG32_SOC15(GC, 0, mmGRBM_SOFT_RESET);
+   soft_reset |= 1 << GRBM_SOFT_RESET__SOFT_RESET_SDMA0__SHIFT << 
instance_id;
+   /* Issue the soft reset */
+   WREG32_SOC15(GC, 0, mmGRBM_SOFT_RESET, soft_reset);
+
+   udelay(50);
+   /* Clear the soft reset bit */
+   soft_reset &= ~(1 << GRBM_SOFT_RESET__SOFT_RESET_SDMA0__SHIFT 
<< instance_id);
+   WREG32_SOC15(GC, 0, mmGRBM_SOFT_RESET, soft_reset);
+   break;
+   default:
+   break;
+   }
+
+   return r;
+}
+
 /**
  * amdgpu_sdma_reset_engine - Reset a specific SDMA engine
  * @adev: Pointer to the AMDGPU device
@@ -587,7 +631,7 @@ int amdgpu_sdma_reset_engine(struct amdgpu_device *adev, 
uint32_t instance_id)
gfx_ring->funcs->stop_queue(adev, instance_id);
 
/* Perform the SDMA reset for the specified instance */
-   ret = amdgpu_dpm_reset_sdma(adev, 1 << instance_id);
+   ret = amdgpu_sdma_soft_reset(adev, instance_id);
if (ret) {
dev_err(adev->dev, "Failed to reset SDMA instance %u\n", 
instance_id);
goto exit;
-- 
2.25.1

[PATCH 5/7 V2] drm/amdgpu: Optimize SDMA v5.2 queue reset and stop logic

2025-04-04 Thread jesse.zhang

From: "jesse.zh...@amd.com" 

This patch refactors the SDMA v5.2 queue reset and stop logic to improve
code readability, maintainability, and performance. The key changes include:

1. **Generalized `sdma_v5_2_gfx_stop` Function**:
- Added an `inst_mask` parameter to allow stopping specific SDMA 
instances
  instead of all instances. This is useful for resetting individual 
queues.

2. **Simplified `sdma_v5_2_reset_queue` Function**:
- Removed redundant loops and checks by directly using the `ring->me` 
field
  to identify the SDMA instance.
- Reused the `sdma_v5_2_gfx_stop` function to stop the queue, reducing 
code
  duplication.

Signed-off-by: Jesse Zhang 
---
 drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c | 64 +++---
 1 file changed, 26 insertions(+), 38 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c 
b/drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c
index 964f12afac9e..96b02c3e4993 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c
@@ -405,15 +405,15 @@ static void sdma_v5_2_ring_emit_fence(struct amdgpu_ring 
*ring, u64 addr, u64 se
  * sdma_v5_2_gfx_stop - stop the gfx async dma engines
  *
  * @adev: amdgpu_device pointer
- *
+ * @inst_mask: mask of dma engine instances to be disabled
  * Stop the gfx async dma ring buffers.
  */
-static void sdma_v5_2_gfx_stop(struct amdgpu_device *adev)
+static void sdma_v5_2_gfx_stop(struct amdgpu_device *adev,  uint32_t inst_mask)
 {
u32 rb_cntl, ib_cntl;
int i;
 
-   for (i = 0; i < adev->sdma.num_instances; i++) {
+   for_each_inst(i, inst_mask) {
rb_cntl = RREG32_SOC15_IP(GC, sdma_v5_2_get_reg_offset(adev, i, 
mmSDMA0_GFX_RB_CNTL));
rb_cntl = REG_SET_FIELD(rb_cntl, SDMA0_GFX_RB_CNTL, RB_ENABLE, 
0);
WREG32_SOC15_IP(GC, sdma_v5_2_get_reg_offset(adev, i, 
mmSDMA0_GFX_RB_CNTL), rb_cntl);
@@ -504,9 +504,11 @@ static void sdma_v5_2_enable(struct amdgpu_device *adev, 
bool enable)
 {
u32 f32_cntl;
int i;
+   uint32_t inst_mask;
 
+   inst_mask = GENMASK(adev->sdma.num_instances - 1, 0);
if (!enable) {
-   sdma_v5_2_gfx_stop(adev);
+   sdma_v5_2_gfx_stop(adev, inst_mask);
sdma_v5_2_rlc_stop(adev);
}
 
@@ -1437,40 +1439,26 @@ static int sdma_v5_2_wait_for_idle(struct 
amdgpu_ip_block *ip_block)
 static int sdma_v5_2_reset_queue(struct amdgpu_ring *ring, unsigned int vmid)
 {
struct amdgpu_device *adev = ring->adev;
-   int i, j, r;
-   u32 rb_cntl, ib_cntl, f32_cntl, freeze, cntl, preempt, soft_reset, 
stat1_reg;
+   int j, r;
+   u32 f32_cntl, freeze, cntl, preempt, soft_reset, stat1_reg;
+   u32 inst_id;
 
if (amdgpu_sriov_vf(adev))
return -EINVAL;
 
-   for (i = 0; i < adev->sdma.num_instances; i++) {
-   if (ring == &adev->sdma.instance[i].ring)
-   break;
-   }
-
-   if (i == adev->sdma.num_instances) {
-   DRM_ERROR("sdma instance not found\n");
-   return -EINVAL;
-   }
-
+   inst_id = ring->me;
amdgpu_gfx_rlc_enter_safe_mode(adev, 0);
 
/* stop queue */
-   ib_cntl = RREG32(sdma_v5_2_get_reg_offset(adev, i, 
mmSDMA0_GFX_IB_CNTL));
-   ib_cntl = REG_SET_FIELD(ib_cntl, SDMA0_GFX_IB_CNTL, IB_ENABLE, 0);
-   WREG32(sdma_v5_2_get_reg_offset(adev, i, mmSDMA0_GFX_IB_CNTL), ib_cntl);
-
-   rb_cntl = RREG32(sdma_v5_2_get_reg_offset(adev, i, 
mmSDMA0_GFX_RB_CNTL));
-   rb_cntl = REG_SET_FIELD(rb_cntl, SDMA0_GFX_RB_CNTL, RB_ENABLE, 0);
-   WREG32(sdma_v5_2_get_reg_offset(adev, i, mmSDMA0_GFX_RB_CNTL), rb_cntl);
+   sdma_v5_2_gfx_stop(adev, 1 << ring->me);
 
/*engine stop SDMA1_F32_CNTL.HALT to 1 and SDMAx_FREEZE freeze bit to 1 
*/
-   freeze = RREG32(sdma_v5_2_get_reg_offset(adev, i, mmSDMA0_FREEZE));
+   freeze = RREG32(sdma_v5_2_get_reg_offset(adev, inst_id, 
mmSDMA0_FREEZE));
freeze = REG_SET_FIELD(freeze, SDMA0_FREEZE, FREEZE, 1);
-   WREG32(sdma_v5_2_get_reg_offset(adev, i, mmSDMA0_FREEZE), freeze);
+   WREG32(sdma_v5_2_get_reg_offset(adev, inst_id, mmSDMA0_FREEZE), freeze);
 
for (j = 0; j < adev->usec_timeout; j++) {
-   freeze = RREG32(sdma_v5_2_get_reg_offset(adev, i, 
mmSDMA0_FREEZE));
+   freeze = RREG32(sdma_v5_2_get_reg_offset(adev, inst_id, 
mmSDMA0_FREEZE));
 
if (REG_GET_FIELD(freeze, SDMA0_FREEZE, FROZEN) & 1)
break;
@@ -1479,7 +1467,7 @@ static int sdma_v5_2_reset_queue(struct amdgpu_ring 
*ring, unsigned int vmid)
 
 
if (j == adev->usec_timeout) {
-   stat1_reg = RREG32(sdma_v5_2_get_reg_offset(adev, i, 
mmSDMA0_STATUS1_REG));
+   stat1_reg = RREG32(sdma_v5_2_get_reg_offset(adev, inst_id, 
mmSDMA0_STATUS1_REG));
if ((stat1_reg & 0x3FF) != 0x3FF) {

48 matches

Mail list logo