[PATCH v4] drm/amdgpu: fix the memleak caused by fence not released

2025-03-03 Thread Arvind Yadav
A taint warning shows up during gpu_sched unload because a fence is
never released/put. In this context, amdgpu_vm_clear_freed is
responsible for creating a job to update the page table (PT). It
allocates a drm_sched_fence from the drm_sched_fence kmem_cache and
returns the finished fence associated with job->base.s_fence. In the
case of a usermode queue, this finished fence is added to the timeline
sync object through amdgpu_gem_update_bo_mapping, which user space uses
to wait for completion of the PT update.
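
The fix reduces to a single ownership rule, sketched below (simplified
from the patch; amdgpu_gem_update_bo_mapping is assumed here to take
over the fence reference when a timeline syncobj is present):

	struct dma_fence *fence;

	/* returns the finished fence with one reference owned by the caller */
	fence = amdgpu_gem_va_update_vm(adev, &fpriv->vm, bo_va, args->operation);

	if (timeline_syncobj)
		/* hand the reference over to the timeline path */
		amdgpu_gem_update_bo_mapping(filp, bo_va, args->operation,
					     args->vm_timeline_point, fence,
					     timeline_syncobj, timeline_chain);
	else
		/* nobody consumed it: drop it so the drm_sched_fence
		 * kmem_cache can shut down cleanly at module unload */
		dma_fence_put(fence);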

[  508.900587] =============================================================
[  508.900605] BUG drm_sched_fence (Tainted: G N): Objects remaining in drm_sched_fence on __kmem_cache_shutdown()
[  508.900617] -------------------------------------------------------------

[  508.900627] Slab 0xe0cc04548780 objects=32 used=2 fp=0x8ea81521f000 flags=0x17c240(workingset|head|node=0|zone=2|lastcpupid=0x1f)
[  508.900645] CPU: 3 UID: 0 PID: 2337 Comm: rmmod Tainted: G N 6.12.0+ #1
[  508.900651] Tainted: [N]=TEST
[  508.900653] Hardware name: Gigabyte Technology Co., Ltd. X570 AORUS ELITE/X570 AORUS ELITE, BIOS F34 06/10/2021
[  508.900656] Call Trace:
[  508.900659]  <TASK>
[  508.900665]  dump_stack_lvl+0x70/0x90
[  508.900674]  dump_stack+0x14/0x20
[  508.900678]  slab_err+0xcb/0x110
[  508.900687]  ? srso_return_thunk+0x5/0x5f
[  508.900692]  ? try_to_grab_pending+0xd3/0x1d0
[  508.900697]  ? srso_return_thunk+0x5/0x5f
[  508.900701]  ? mutex_lock+0x17/0x50
[  508.900708]  __kmem_cache_shutdown+0x144/0x2d0
[  508.900713]  ? flush_rcu_work+0x50/0x60
[  508.900719]  kmem_cache_destroy+0x46/0x1f0
[  508.900728]  drm_sched_fence_slab_fini+0x19/0x970 [gpu_sched]
[  508.900736]  __do_sys_delete_module.constprop.0+0x184/0x320
[  508.900744]  ? srso_return_thunk+0x5/0x5f
[  508.900747]  ? debug_smp_processor_id+0x1b/0x30
[  508.900754]  __x64_sys_delete_module+0x16/0x20
[  508.900758]  x64_sys_call+0xdf/0x20d0
[  508.900763]  do_syscall_64+0x51/0x120
[  508.900769]  entry_SYSCALL_64_after_hwframe+0x76/0x7e

v2: call dma_fence_put in amdgpu_gem_va_update_vm
v3: Addressed review comments from Christian.
- calling amdgpu_gem_update_timeline_node before the switch.
- putting the dma_fence in case of error or !timeline_syncobj.
v4: Addressed review comments from Christian.

Cc: Alex Deucher 
Cc: Christian König 
Cc: Shashank Sharma 
Cc: Sunil Khatri 
Signed-off-by: Le Ma 
Signed-off-by: Arvind Yadav 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c | 28 ++---
 1 file changed, 16 insertions(+), 12 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
index 8b67aae6c2fe..c1d8cee7894b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
@@ -931,6 +931,14 @@ int amdgpu_gem_va_ioctl(struct drm_device *dev, void *data,
bo_va = NULL;
}
 
+   r = amdgpu_gem_update_timeline_node(filp,
+   args->vm_timeline_syncobj_out,
+   args->vm_timeline_point,
+   &timeline_syncobj,
+   &timeline_chain);
+   if (r)
+   goto error;
+
switch (args->operation) {
case AMDGPU_VA_OP_MAP:
va_flags = amdgpu_gem_va_map_flags(adev, args->flags);
@@ -957,22 +965,18 @@ int amdgpu_gem_va_ioctl(struct drm_device *dev, void *data,
break;
}
if (!r && !(args->flags & AMDGPU_VM_DELAY_UPDATE) && !adev->debug_vm) {
-
-	r = amdgpu_gem_update_timeline_node(filp,
-					    args->vm_timeline_syncobj_out,
-					    args->vm_timeline_point,
-					    &timeline_syncobj,
-					    &timeline_chain);
-
fence = amdgpu_gem_va_update_vm(adev, &fpriv->vm, bo_va,
args->operation);
 
-   if (!r)
+   if (timeline_syncobj)
amdgpu_gem_update_bo_mapping(filp, bo_va,
-					args->operation,
-					args->vm_timeline_point,
-					fence, timeline_syncobj,
-					timeline_chain);
+					     args->operation,
+					     args->vm_timeline_point,
+					     fence, timeline_syncobj,
+					     timeline_chain);
+	else
+		dma_fence_put(fence);
+
}
 
 error:
-- 
2.34.1



Re: [PATCH 2/2] drm/amdgpu/gfx12: don't read registers in mqd init

2025-03-03 Thread Alex Deucher
ping on this series.

Alex

On Wed, Feb 26, 2025 at 11:09 PM Alex Deucher  wrote:
>
> Just use the default values.  There's no need to
> get the value from hardware and it could cause problems
> if we do that at runtime and gfxoff is active.
>
> Signed-off-by: Alex Deucher 
> ---
>  drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c | 48 ++
>  1 file changed, 33 insertions(+), 15 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c 
> b/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c
> index 667c8013b7738..804e9552a608a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c
> @@ -52,6 +52,24 @@
>
>  #define RLCG_UCODE_LOADING_START_ADDRESS   0x2000L
>
> +#define regCP_GFX_MQD_CONTROL_DEFAULT            0x00000100
> +#define regCP_GFX_HQD_VMID_DEFAULT               0x00000000
> +#define regCP_GFX_HQD_QUEUE_PRIORITY_DEFAULT     0x00000000
> +#define regCP_GFX_HQD_QUANTUM_DEFAULT            0x00000a01
> +#define regCP_GFX_HQD_CNTL_DEFAULT               0x00f00000
> +#define regCP_RB_DOORBELL_CONTROL_DEFAULT        0x00000000
> +#define regCP_GFX_HQD_RPTR_DEFAULT               0x00000000
> +
> +#define regCP_HQD_EOP_CONTROL_DEFAULT            0x00000006
> +#define regCP_HQD_PQ_DOORBELL_CONTROL_DEFAULT    0x00000000
> +#define regCP_MQD_CONTROL_DEFAULT                0x00000100
> +#define regCP_HQD_PQ_CONTROL_DEFAULT             0x00308509
> +#define regCP_HQD_PQ_DOORBELL_CONTROL_DEFAULT    0x00000000
> +#define regCP_HQD_PQ_RPTR_DEFAULT                0x00000000
> +#define regCP_HQD_PERSISTENT_STATE_DEFAULT       0x0be05501
> +#define regCP_HQD_IB_CONTROL_DEFAULT             0x00300000
> +
> +
>  MODULE_FIRMWARE("amdgpu/gc_12_0_0_pfp.bin");
>  MODULE_FIRMWARE("amdgpu/gc_12_0_0_me.bin");
>  MODULE_FIRMWARE("amdgpu/gc_12_0_0_mec.bin");
> @@ -2926,25 +2944,25 @@ static int gfx_v12_0_gfx_mqd_init(struct 
> amdgpu_device *adev, void *m,
> mqd->cp_mqd_base_addr_hi = upper_32_bits(prop->mqd_gpu_addr);
>
> /* set up mqd control */
> -   tmp = RREG32_SOC15(GC, 0, regCP_GFX_MQD_CONTROL);
> +   tmp = regCP_GFX_MQD_CONTROL_DEFAULT;
> tmp = REG_SET_FIELD(tmp, CP_GFX_MQD_CONTROL, VMID, 0);
> tmp = REG_SET_FIELD(tmp, CP_GFX_MQD_CONTROL, PRIV_STATE, 1);
> tmp = REG_SET_FIELD(tmp, CP_GFX_MQD_CONTROL, CACHE_POLICY, 0);
> mqd->cp_gfx_mqd_control = tmp;
>
> /* set up gfx_hqd_vimd with 0x0 to indicate the ring buffer's vmid */
> -   tmp = RREG32_SOC15(GC, 0, regCP_GFX_HQD_VMID);
> +   tmp = regCP_GFX_HQD_VMID_DEFAULT;
> tmp = REG_SET_FIELD(tmp, CP_GFX_HQD_VMID, VMID, 0);
> mqd->cp_gfx_hqd_vmid = 0;
>
> /* set up default queue priority level
>  * 0x0 = low priority, 0x1 = high priority */
> -   tmp = RREG32_SOC15(GC, 0, regCP_GFX_HQD_QUEUE_PRIORITY);
> +   tmp = regCP_GFX_HQD_QUEUE_PRIORITY_DEFAULT;
> tmp = REG_SET_FIELD(tmp, CP_GFX_HQD_QUEUE_PRIORITY, PRIORITY_LEVEL, 
> 0);
> mqd->cp_gfx_hqd_queue_priority = tmp;
>
> /* set up time quantum */
> -   tmp = RREG32_SOC15(GC, 0, regCP_GFX_HQD_QUANTUM);
> +   tmp = regCP_GFX_HQD_QUANTUM_DEFAULT;
> tmp = REG_SET_FIELD(tmp, CP_GFX_HQD_QUANTUM, QUANTUM_EN, 1);
> mqd->cp_gfx_hqd_quantum = tmp;
>
> @@ -2966,7 +2984,7 @@ static int gfx_v12_0_gfx_mqd_init(struct amdgpu_device 
> *adev, void *m,
>
> /* set up the gfx_hqd_control, similar as CP_RB0_CNTL */
> rb_bufsz = order_base_2(prop->queue_size / 4) - 1;
> -   tmp = RREG32_SOC15(GC, 0, regCP_GFX_HQD_CNTL);
> +   tmp = regCP_GFX_HQD_CNTL_DEFAULT;
> tmp = REG_SET_FIELD(tmp, CP_GFX_HQD_CNTL, RB_BUFSZ, rb_bufsz);
> tmp = REG_SET_FIELD(tmp, CP_GFX_HQD_CNTL, RB_BLKSZ, rb_bufsz - 2);
>  #ifdef __BIG_ENDIAN
> @@ -2975,7 +2993,7 @@ static int gfx_v12_0_gfx_mqd_init(struct amdgpu_device 
> *adev, void *m,
> mqd->cp_gfx_hqd_cntl = tmp;
>
> /* set up cp_doorbell_control */
> -   tmp = RREG32_SOC15(GC, 0, regCP_RB_DOORBELL_CONTROL);
> +   tmp = regCP_RB_DOORBELL_CONTROL_DEFAULT;
> if (prop->use_doorbell) {
> tmp = REG_SET_FIELD(tmp, CP_RB_DOORBELL_CONTROL,
> DOORBELL_OFFSET, prop->doorbell_index);
> @@ -2987,7 +3005,7 @@ static int gfx_v12_0_gfx_mqd_init(struct amdgpu_device 
> *adev, void *m,
> mqd->cp_rb_doorbell_control = tmp;
>
> /* reset read and write 
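
The mechanical pattern of the change, condensed (register names from the
patch above; a sketch, not the full diff):

	/* before: an MMIO read that can misbehave if gfxoff has GFX power-gated */
	tmp = RREG32_SOC15(GC, 0, regCP_GFX_MQD_CONTROL);

	/* after: start from the known reset default -- no register access,
	 * so it is safe to run while gfxoff is active
	 */
	tmp = regCP_GFX_MQD_CONTROL_DEFAULT;
	tmp = REG_SET_FIELD(tmp, CP_GFX_MQD_CONTROL, PRIV_STATE, 1);
	mqd->cp_gfx_mqd_control = tmp;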

Re: [PATCH] drm/radeon: Fix rs400_gpu_init for ATI mobility radeon Xpress 200M

2025-03-03 Thread Alex Deucher
Applied with a small tweak to fix a warning.

Thanks,

Alex

On Mon, Mar 3, 2025 at 7:55 AM Marek Olšák  wrote:
>
> Reviewed-by: Marek Olšák 
>
> On Tue, Jun 18, 2019 at 3:19 AM Richard Thier  wrote:
>>
>> num_gb_pipes was set to a wrong value using r420_pipe_config
>>
>> This has led to HyperZ glitches on fast Z clearing.
>>
>> See: https://bugs.freedesktop.org/show_bug.cgi?id=110897
>>
>> Signed-off-by: Richard Thier 
>> ---
>>  drivers/gpu/drm/radeon/r300.c  |  3 ++-
>>  drivers/gpu/drm/radeon/rs400.c | 21 +++--
>>  2 files changed, 21 insertions(+), 3 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/radeon/r300.c b/drivers/gpu/drm/radeon/r300.c
>> index 652126f..6724c15 100644
>> --- a/drivers/gpu/drm/radeon/r300.c
>> +++ b/drivers/gpu/drm/radeon/r300.c
>> @@ -355,7 +355,8 @@ int r300_mc_wait_for_idle(struct radeon_device *rdev)
>> return -1;
>>  }
>>
>> -static void r300_gpu_init(struct radeon_device *rdev)
>> +/* rs400_gpu_init also calls this! */
>> +void r300_gpu_init(struct radeon_device *rdev)
>>  {
>> uint32_t gb_tile_config, tmp;
>>
>> diff --git a/drivers/gpu/drm/radeon/rs400.c b/drivers/gpu/drm/radeon/rs400.c
>> index 4121209..4117572 100644
>> --- a/drivers/gpu/drm/radeon/rs400.c
>> +++ b/drivers/gpu/drm/radeon/rs400.c
>> @@ -32,6 +32,9 @@
>>  #include "radeon_asic.h"
>>  #include "rs400d.h"
>>
>> +/* Needed for rs400_gpu_init that forwards to the r300 one */
>> +void r300_gpu_init(struct radeon_device *rdev);
>> +
>>  /* This files gather functions specifics to : rs400,rs480 */
>>  static int rs400_debugfs_pcie_gart_info_init(struct radeon_device *rdev);
>>
>> @@ -252,8 +255,22 @@ int rs400_mc_wait_for_idle(struct radeon_device *rdev)
>>
>>  static void rs400_gpu_init(struct radeon_device *rdev)
>>  {
>> -   /* FIXME: is this correct ? */
>> -   r420_pipes_init(rdev);
>> +   /* Earlier code was calling r420_pipes_init and then
>> +* rs400_mc_wait_for_idle(rdev). The problem is that
>> +* at least on my Mobility Radeon Xpress 200M RC410 card,
>> +* which ends up in this code path, num_gb_pipes ends up == 3
>> +* while the card seems to have only one pipe with the
>> +* r420 pipe initialization method.
>> +*
>> +* Problems shown up as HyperZ glitches, see:
>> +* https://bugs.freedesktop.org/show_bug.cgi?id=110897
>> +*
>> +* Delegating initialization to r300 code seems to work
>> +* and results in proper pipe numbers. The rs400 cards
>> +* are said to be not r400, but r300 kind of cards.
>> +*/
>> +   r300_gpu_init(rdev);
>> +
>> if (rs400_mc_wait_for_idle(rdev)) {
>> pr_warn("rs400: Failed to wait MC idle while programming 
>> pipes. Bad things might happen. %08x\n",
>> RREG32(RADEON_MC_STATUS));
>> --
>> 2.21.0
>>
>> ___
>> amd-gfx mailing list
>> amd-gfx@lists.freedesktop.org
>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx


Re: [PATCH 6/6] drm/amdgpu: Remove unused pqm_get_kernel_queue

2025-03-03 Thread Alex Deucher
Applied the series.  Thanks!

Alex

On Mon, Mar 3, 2025 at 9:49 AM  wrote:
>
> From: "Dr. David Alan Gilbert" 
>
> pqm_get_kernel_queue() has been unused since 2022's
> commit 5bdd3eb25354 ("drm/amdkfd: Remove unused old debugger
> implementation")
>
> Remove it.
>
> Signed-off-by: Dr. David Alan Gilbert 
> ---
>  drivers/gpu/drm/amd/amdkfd/kfd_priv.h   |  2 --
>  .../gpu/drm/amd/amdkfd/kfd_process_queue_manager.c  | 13 -
>  2 files changed, 15 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h 
> b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> index 2c22e335a02e..bc26a2609f64 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> @@ -1363,8 +1363,6 @@ int pqm_update_mqd(struct process_queue_manager *pqm, 
> unsigned int qid,
> struct mqd_update_info *minfo);
>  int pqm_set_gws(struct process_queue_manager *pqm, unsigned int qid,
> void *gws);
> -struct kernel_queue *pqm_get_kernel_queue(struct process_queue_manager *pqm,
> -   unsigned int qid);
>  struct queue *pqm_get_user_queue(struct process_queue_manager *pqm,
> unsigned int qid);
>  int pqm_get_wave_state(struct process_queue_manager *pqm,
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c 
> b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
> index bd36a75309e1..2b30ed0cecb6 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
> @@ -652,19 +652,6 @@ int pqm_update_mqd(struct process_queue_manager *pqm,
> return 0;
>  }
>
> -struct kernel_queue *pqm_get_kernel_queue(
> -   struct process_queue_manager *pqm,
> -   unsigned int qid)
> -{
> -   struct process_queue_node *pqn;
> -
> -   pqn = get_queue_by_qid(pqm, qid);
> -   if (pqn && pqn->kq)
> -   return pqn->kq;
> -
> -   return NULL;
> -}
> -
>  struct queue *pqm_get_user_queue(struct process_queue_manager *pqm,
> unsigned int qid)
>  {
> --
> 2.48.1
>


Re: [PATCH v2 1/5] drm/amd: Copy entire structure in amdgpu_acpi_get_backlight_caps()

2025-03-03 Thread Alex Hung

Reviewed-by: Alex Hung 

On 2/28/25 11:51, Mario Limonciello wrote:

As new members are introduced to the structure, copying the entire
structure will help avoid missing them.

Signed-off-by: Mario Limonciello 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_acpi.c | 6 +-
  1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_acpi.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_acpi.c
index b8d4e07d2043..515c6f32448d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_acpi.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_acpi.c
@@ -1277,11 +1277,7 @@ void amdgpu_acpi_get_backlight_caps(struct 
amdgpu_dm_backlight_caps *caps)
  {
struct amdgpu_atif *atif = &amdgpu_acpi_priv.atif;
  
-	caps->caps_valid = atif->backlight_caps.caps_valid;

-   caps->min_input_signal = atif->backlight_caps.min_input_signal;
-   caps->max_input_signal = atif->backlight_caps.max_input_signal;
-   caps->ac_level = atif->backlight_caps.ac_level;
-   caps->dc_level = atif->backlight_caps.dc_level;
+   memcpy(caps, &atif->backlight_caps, sizeof(*caps));
  }
  
  /**
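
The rationale generalizes: in C, a plain struct assignment has the same
effect and the same future-proofing as the memcpy(). A minimal sketch
(members taken from the removed lines; the real driver struct has more
fields):

	struct backlight_caps_example {
		bool caps_valid;
		int min_input_signal, max_input_signal;
		int ac_level, dc_level;
		/* a member added later is still copied in full below */
	};

	static void copy_caps(struct backlight_caps_example *dst,
			      const struct backlight_caps_example *src)
	{
		*dst = *src;	/* equivalent to memcpy(dst, src, sizeof(*dst)) */
	}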




Re: [PATCH v2 1/4] drm/amdgpu/gfx11: Implement the GFX11 KGQ pipe reset

2025-03-03 Thread Alex Deucher
On Fri, Feb 21, 2025 at 8:38 AM Prike Liang  wrote:
>
> Implement the kernel graphics queue pipe reset, and the driver
> will fall back to pipe reset when the queue reset fails. However,
> the ME FW doesn't fully support pipe reset yet, so disable the
> KGQ pipe reset temporarily.
>
> Signed-off-by: Prike Liang 
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.h |  2 +
>  drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c| 71 ++-
>  2 files changed, 71 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.h 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.h
> index 4eedd92f000b..06fe21e15ed6 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.h
> @@ -25,6 +25,8 @@
>
>  #include "amdgpu_socbb.h"
>
> +#define RS64_FW_UC_START_ADDR_LO 0x3000
> +
>  struct common_firmware_header {
> uint32_t size_bytes; /* size of the entire header+image(s) in bytes */
> uint32_t header_size_bytes; /* size of just the header in bytes */
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c 
> b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
> index 2c7f0bb242ff..7e53c0b63f88 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
> @@ -6653,6 +6653,68 @@ static void gfx_v11_0_emit_mem_sync(struct amdgpu_ring 
> *ring)
> amdgpu_ring_write(ring, gcr_cntl); /* GCR_CNTL */
>  }
>
> +static bool gfx_v11_pipe_reset_support(struct amdgpu_device *adev)
> +{
> +   /* Disable the pipe reset until the CPFW fully supports it. */
> +   dev_warn_once(adev->dev, "The CPFW doesn't support pipe reset yet.\n");

I'd drop these or make them debug only for now.  Same for gfx12.  With
that fixed, the series is:
Acked-by: Alex Deucher 


> +   return false;
> +}
> +
> +
> +static int gfx_v11_reset_gfx_pipe(struct amdgpu_ring *ring)
> +{
> +   struct amdgpu_device *adev = ring->adev;
> +   uint32_t reset_pipe = 0, clean_pipe = 0;
> +   int r;
> +
> +   if (!gfx_v11_pipe_reset_support(adev))
> +   return -EOPNOTSUPP;
> +
> +   gfx_v11_0_set_safe_mode(adev, 0);
> +   mutex_lock(&adev->srbm_mutex);
> +   soc21_grbm_select(adev, ring->me, ring->pipe, ring->queue, 0);
> +
> +   switch (ring->pipe) {
> +   case 0:
> +   reset_pipe = REG_SET_FIELD(reset_pipe, CP_ME_CNTL,
> +  PFP_PIPE0_RESET, 1);
> +   reset_pipe = REG_SET_FIELD(reset_pipe, CP_ME_CNTL,
> +  ME_PIPE0_RESET, 1);
> +   clean_pipe = REG_SET_FIELD(clean_pipe, CP_ME_CNTL,
> +  PFP_PIPE0_RESET, 0);
> +   clean_pipe = REG_SET_FIELD(clean_pipe, CP_ME_CNTL,
> +  ME_PIPE0_RESET, 0);
> +   break;
> +   case 1:
> +   reset_pipe = REG_SET_FIELD(reset_pipe, CP_ME_CNTL,
> +  PFP_PIPE1_RESET, 1);
> +   reset_pipe = REG_SET_FIELD(reset_pipe, CP_ME_CNTL,
> +  ME_PIPE1_RESET, 1);
> +   clean_pipe = REG_SET_FIELD(clean_pipe, CP_ME_CNTL,
> +  PFP_PIPE1_RESET, 0);
> +   clean_pipe = REG_SET_FIELD(clean_pipe, CP_ME_CNTL,
> +  ME_PIPE1_RESET, 0);
> +   break;
> +   default:
> +   break;
> +   }
> +
> +   WREG32_SOC15(GC, 0, regCP_ME_CNTL, reset_pipe);
> +   WREG32_SOC15(GC, 0, regCP_ME_CNTL, clean_pipe);
> +
> +   r = (RREG32(SOC15_REG_OFFSET(GC, 0, regCP_GFX_RS64_INSTR_PNTR1)) << 
> 2) - RS64_FW_UC_START_ADDR_LO;
> +   soc21_grbm_select(adev, 0, 0, 0, 0);
> +   mutex_unlock(&adev->srbm_mutex);
> +   gfx_v11_0_unset_safe_mode(adev, 0);
> +
> +   dev_info(adev->dev,"The ring %s pipe reset to the ME firmware start 
> PC: %s\n", ring->name,
> +   r == 0 ? "successfully" : "failed");
> +   /* FIXME: Sometimes driver can't cache the ME firmware start PC 
> correctly, so the pipe reset status
> +* relies on the later gfx ring test result.
> +*/
> +   return 0;
> +}
> +
>  static int gfx_v11_0_reset_kgq(struct amdgpu_ring *ring, unsigned int vmid)
>  {
> struct amdgpu_device *adev = ring->adev;
> @@ -6662,8 +6724,13 @@ static int gfx_v11_0_reset_kgq(struct amdgpu_ring 
> *ring, unsigned int vmid)
> return -EINVAL;
>
> r = amdgpu_mes_reset_legacy_queue(ring->adev, ring, vmid, false);
> -   if (r)
> -   return r;
> +   if (r) {
> +
> +   dev_warn(adev->dev, "reset via MES failed (%d), trying pipe reset\n", r);
> +   r = gfx_v11_reset_gfx_pipe(ring);
> +   if (r)
> +   return r;
> +   }
>
> r = amdgpu_bo_reserve(ring->mqd_obj, false);
> if (unlikely(r != 0)) {
> --
> 
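
One subtlety of the series is how gfx_v11_reset_gfx_pipe() judges
success: after the PFP/ME reset bits are toggled, the RS64 core is
expected to restart at its firmware entry point. Condensed (names from
the patch; INSTR_PNTR is assumed to hold a dword-granular PC):

	u32 pc = RREG32(SOC15_REG_OFFSET(GC, 0, regCP_GFX_RS64_INSTR_PNTR1)) << 2;

	/* zero means the ME restarted exactly at the ucode start (0x3000) */
	r = pc - RS64_FW_UC_START_ADDR_LO;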

RE: [PATCH 2/2] drm/amdgpu: Fix core reset sequence for JPEG5_0_1

2025-03-03 Thread Liu, Leo
[AMD Official Use Only - AMD Internal Distribution Only]

The series is:
Reviewed-by: Leo Liu 

> -Original Message-
> From: Sundararaju, Sathishkumar 
> Sent: February 26, 2025 11:51 AM
> To: amd-gfx@lists.freedesktop.org
> Cc: Liu, Leo ; Sundararaju, Sathishkumar
> 
> Subject: [PATCH 2/2] drm/amdgpu: Fix core reset sequence for JPEG5_0_1
>
> For cores 1 through 9, repair the core reset sequence by adjusting offsets to
> access the expected registers.
>
> Signed-off-by: Sathishkumar S 
> ---
>  drivers/gpu/drm/amd/amdgpu/jpeg_v5_0_1.c | 14 +-
>  1 file changed, 5 insertions(+), 9 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/jpeg_v5_0_1.c
> b/drivers/gpu/drm/amd/amdgpu/jpeg_v5_0_1.c
> index 6b8ef8e8c0eb..220f3af01748 100644
> --- a/drivers/gpu/drm/amd/amdgpu/jpeg_v5_0_1.c
> +++ b/drivers/gpu/drm/amd/amdgpu/jpeg_v5_0_1.c
> @@ -669,24 +669,20 @@ static void jpeg_v5_0_1_core_stall_reset(struct
> amdgpu_ring *ring)
>   WREG32_SOC15_OFFSET(JPEG, jpeg_inst,
>   regUVD_JMI0_UVD_JMI_CLIENT_STALL,
>   reg_offset, 0x1F);
> - SOC15_WAIT_ON_RREG(JPEG, jpeg_inst,
> -regUVD_JMI0_UVD_JMI_CLIENT_CLEAN_STATUS,
> -0x1F, 0x1F);
> + SOC15_WAIT_ON_RREG_OFFSET(JPEG, jpeg_inst,
> +
> regUVD_JMI0_UVD_JMI_CLIENT_CLEAN_STATUS,
> +   reg_offset, 0x1F, 0x1F);
>   WREG32_SOC15_OFFSET(JPEG, jpeg_inst,
>   regUVD_JMI0_JPEG_LMI_DROP,
>   reg_offset, 0x1F);
> - WREG32_SOC15_OFFSET(JPEG, jpeg_inst,
> - regJPEG_CORE_RST_CTRL,
> - reg_offset, 1 << ring->pipe);
> + WREG32_SOC15(JPEG, jpeg_inst, regJPEG_CORE_RST_CTRL, 1 << ring-
> >pipe);
>   WREG32_SOC15_OFFSET(JPEG, jpeg_inst,
>   regUVD_JMI0_UVD_JMI_CLIENT_STALL,
>   reg_offset, 0x00);
>   WREG32_SOC15_OFFSET(JPEG, jpeg_inst,
>   regUVD_JMI0_JPEG_LMI_DROP,
>   reg_offset, 0x00);
> - WREG32_SOC15_OFFSET(JPEG, jpeg_inst,
> - regJPEG_CORE_RST_CTRL,
> - reg_offset, 0x00);
> + WREG32_SOC15(JPEG, jpeg_inst, regJPEG_CORE_RST_CTRL, 0x00);
>  }
>
>  static int jpeg_v5_0_1_ring_reset(struct amdgpu_ring *ring, unsigned int
> vmid)
> --
> 2.25.1



Re: [PATCH 5/5] drm/amdgpu/userq: remove BROKEN from config

2025-03-03 Thread Alex Deucher
On Mon, Mar 3, 2025 at 7:09 AM Liang, Prike  wrote:
>
> [Public]
>
> Question: Why use the CONFIG_DRM_AMDGPU_NAVI3X_USERQ to guard all the various 
> GFX user mode queue enablement paths? It seems more generic and reasonable to 
> use the config name CONFIG_DRM_AMDGPU_NAVI3X_USERQ.
>

I assume you mean something more generic like CONFIG_DRM_AMDGPU_USERQ?
 At this point it would make more sense, but at the time it was added
there was only gfx11 support.  We could change it, but at this point,
I'd be more inclined to just remove it.

> Except for that question, the series patch is Reviewed-by: Prike Liang 
>  .

Thanks.

Alex

>
> Regards,
>   Prike
>
> > -Original Message-
> > From: amd-gfx  On Behalf Of Alex
> > Deucher
> > Sent: Saturday, March 1, 2025 3:54 AM
> > To: amd-gfx@lists.freedesktop.org
> > Cc: Deucher, Alexander 
> > Subject: [PATCH 5/5] drm/amdgpu/userq: remove BROKEN from config
> >
> > This can be enabled now.  We have the firmware checks in place.
> >
> > Signed-off-by: Alex Deucher 
> > ---
> >  drivers/gpu/drm/amd/amdgpu/Kconfig | 1 -
> >  1 file changed, 1 deletion(-)
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/Kconfig
> > b/drivers/gpu/drm/amd/amdgpu/Kconfig
> > index a614320114796..23c014da0f324 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/Kconfig
> > +++ b/drivers/gpu/drm/amd/amdgpu/Kconfig
> > @@ -98,7 +98,6 @@ config DRM_AMDGPU_WERROR  config
> > DRM_AMDGPU_NAVI3X_USERQ
> >   bool "Enable amdgpu usermode queues"
> >   depends on DRM_AMDGPU
> > - depends on BROKEN
> >   default n
> >   help
> > Choose this option to enable GFX usermode queue support for
> > GFX/SDMA/Compute
> > --
> > 2.48.1
>


RE: [PATCH] drm/amdkfd: Change error handling at prange update in svm_range_set_attr

2025-03-03 Thread Chen, Xiaogang
[AMD Official Use Only - AMD Internal Distribution Only]

Ping

-Original Message-
From: Xiaogang.Chen 
Sent: Friday, January 31, 2025 10:59 AM
To: amd-gfx@lists.freedesktop.org
Cc: Chen, Xiaogang 
Subject: [PATCH] drm/amdkfd: Change error handling at prange update in 
svm_range_set_attr

From: Xiaogang Chen 

When registering a vm range at svm, the added vm range may be split into
multiple subranges and/or existing pranges may get split. The new pranges
need to be validated and mapped. This patch changes error handling for
pranges that fail updating:

1: free the prange's resources and remove it from svms if its update fails,
as it will not be used.
2: return -EAGAIN when all pranges on update_list need to redo validate/map;
otherwise return the non -EAGAIN error to user space to indicate failure.
That removes unnecessary retries.

Signed-off-by: Xiaogang Chen 
---
 drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 27 +++
 1 file changed, 23 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
index e32e19196f6b..455cb98bf16a 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
@@ -3716,8 +3716,19 @@ svm_range_set_attr(struct kfd_process *p, struct 
mm_struct *mm,

 out_unlock_range:
mutex_unlock(&prange->migrate_mutex);
-   if (r)
-   ret = r;
+   /* this prange cannot be migrated, validated or mapped */
+   if (r) {
+   /* free this prange resources, remove it from svms */
+   svm_range_unlink(prange);
+   svm_range_remove_notifier(prange);
+   svm_range_free(prange, false);
+
+   /* ret gets updated when any r != -EAGAIN;
+* return -EAGAIN only when all pranges on update_list
+* need to redo validate/map */
+   if (r != -EAGAIN || !ret)
+   ret = r;
+   }
}

list_for_each_entry(prange, &remap_list, update_list) { @@ -3729,8 
+3740,16 @@ svm_range_set_attr(struct kfd_process *p, struct mm_struct *mm,
if (r)
pr_debug("failed %d on remap svm range\n", r);
mutex_unlock(&prange->migrate_mutex);
-   if (r)
-   ret = r;
+
+   if (r) {
+   /* remove this prange */
+   svm_range_unlink(prange);
+   svm_range_remove_notifier(prange);
+   svm_range_free(prange, false);
+
+   if (r != -EAGAIN || !ret)
+   ret = r;
+   }
}

dynamic_svm_range_dump(svms);
--
2.25.1
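
The aggregation rule in both hunks is worth spelling out: any non
-EAGAIN error wins, so -EAGAIN only reaches user space when every
failed prange merely needs its validate/map redone. A condensed trace
of the logic (r values are per failed prange, ret starts at 0):

	if (r != -EAGAIN || !ret)
		ret = r;

	/* r = -EAGAIN: taken only while ret is still 0  -> ret = -EAGAIN
	 * r = -ENOMEM: taken unconditionally            -> ret = -ENOMEM
	 * r = -EAGAIN: ret already -ENOMEM, not taken   -> -ENOMEM sticks
	 */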



Re: [PATCH v2 1/2] drm/amdgpu: add dce_v6_0_soft_reset() to DCE6

2025-03-03 Thread Alex Deucher
Applied.  Thanks.

Alex

On Fri, Feb 28, 2025 at 11:19 PM Alexandre Demers
 wrote:
>
> DCE6 was missing soft reset, but it was easily identifiable under radeon.
> This should be it, pretty much as it is done under DCE8 and DCE10.
>
> Signed-off-by: Alexandre Demers 
> ---
>  drivers/gpu/drm/amd/amdgpu/dce_v6_0.c | 53 ++-
>  1 file changed, 51 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/dce_v6_0.c 
> b/drivers/gpu/drm/amd/amdgpu/dce_v6_0.c
> index 185401d66961..2ccb450b35a6 100644
> --- a/drivers/gpu/drm/amd/amdgpu/dce_v6_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/dce_v6_0.c
> @@ -372,13 +372,41 @@ static u32 dce_v6_0_hpd_get_gpio_reg(struct 
> amdgpu_device *adev)
> return mmDC_GPIO_HPD_A;
>  }
>
> +static bool dce_v6_0_is_display_hung(struct amdgpu_device *adev)
> +{
> +   u32 crtc_hung = 0;
> +   u32 crtc_status[6];
> +   u32 i, j, tmp;
> +
> +   for (i = 0; i < adev->mode_info.num_crtc; i++) {
> +   if (RREG32(mmCRTC_CONTROL + crtc_offsets[i]) & 
> CRTC_CONTROL__CRTC_MASTER_EN_MASK) {
> +   crtc_status[i] = RREG32(mmCRTC_STATUS_HV_COUNT + 
> crtc_offsets[i]);
> +   crtc_hung |= (1 << i);
> +   }
> +   }
> +
> +   for (j = 0; j < 10; j++) {
> +   for (i = 0; i < adev->mode_info.num_crtc; i++) {
> +   if (crtc_hung & (1 << i)) {
> +   tmp = RREG32(mmCRTC_STATUS_HV_COUNT + 
> crtc_offsets[i]);
> +   if (tmp != crtc_status[i])
> +   crtc_hung &= ~(1 << i);
> +   }
> +   }
> +   if (crtc_hung == 0)
> +   return false;
> +   udelay(100);
> +   }
> +
> +   return true;
> +}
> +
>  static void dce_v6_0_set_vga_render_state(struct amdgpu_device *adev,
>   bool render)
>  {
> if (!render)
> WREG32(mmVGA_RENDER_CONTROL,
> RREG32(mmVGA_RENDER_CONTROL) & VGA_VSTATUS_CNTL);
> -
>  }
>
>  static int dce_v6_0_get_num_crtc(struct amdgpu_device *adev)
> @@ -2860,7 +2888,28 @@ static bool dce_v6_0_is_idle(void *handle)
>
>  static int dce_v6_0_soft_reset(struct amdgpu_ip_block *ip_block)
>  {
> -   DRM_INFO(": dce_v6_0_soft_reset --- no impl!!\n");
> +   u32 srbm_soft_reset = 0, tmp;
> +   struct amdgpu_device *adev = ip_block->adev;
> +
> +   if (dce_v6_0_is_display_hung(adev))
> +   srbm_soft_reset |= SRBM_SOFT_RESET__SOFT_RESET_DC_MASK;
> +
> +   if (srbm_soft_reset) {
> +   tmp = RREG32(mmSRBM_SOFT_RESET);
> +   tmp |= srbm_soft_reset;
> +   dev_info(adev->dev, "SRBM_SOFT_RESET=0x%08X\n", tmp);
> +   WREG32(mmSRBM_SOFT_RESET, tmp);
> +   tmp = RREG32(mmSRBM_SOFT_RESET);
> +
> +   udelay(50);
> +
> +   tmp &= ~srbm_soft_reset;
> +   WREG32(mmSRBM_SOFT_RESET, tmp);
> +   tmp = RREG32(mmSRBM_SOFT_RESET);
> +
> +   /* Wait a little for things to settle down */
> +   udelay(50);
> +   }
> return 0;
>  }
>
> --
> 2.48.1
>
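
The hang check underpinning this rests on a simple invariant: an active
CRTC's HV counter keeps advancing. dce_v6_0_is_display_hung() samples
the counters, waits, and resamples; in condensed form (a sketch of the
quoted code, not a drop-in replacement):

	/* crtc_hung: bitmask of enabled CRTCs still suspected hung */
	for (j = 0; j < 10; j++) {
		for (i = 0; i < adev->mode_info.num_crtc; i++) {
			if ((crtc_hung & (1 << i)) &&
			    RREG32(mmCRTC_STATUS_HV_COUNT + crtc_offsets[i]) != crtc_status[i])
				crtc_hung &= ~(1 << i);	/* counter moved: alive */
		}
		if (!crtc_hung)
			return false;	/* every suspect proved alive */
		udelay(100);
	}
	return true;	/* some counter froze: display is hung */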


Re: [PATCH v2 0/2] drm/amdgpu: fix style and comments in DCE6

2025-03-03 Thread Alex Deucher
Applied these.  thanks.

Alex

On Fri, Feb 28, 2025 at 9:22 PM Alexandre Demers
 wrote:
>
> While going through the DCE6 code, I took it on myself to add some comments
> and to fix style in a few places.
>
> Alexandre Demers (2):
>   drm/amdgpu: add some comments in DCE6
>   dmr/amdgpu: fix style in DCE6
>
>  drivers/gpu/drm/amd/amdgpu/dce_v6_0.c | 32 ++-
>  1 file changed, 21 insertions(+), 11 deletions(-)
>
> --
> 2.48.1
>


Re: [PATCH v2 2/2] drm/amdgpu: complete dce_v6_0_set_crtc_vline_interrupt_state() in DCE6

2025-03-03 Thread Alex Deucher
On Fri, Feb 28, 2025 at 11:11 PM Alexandre Demers
 wrote:
>
> dce_v6_0_set_crtc_vline_interrupt_state() was an empty stub, with no
> message to inform the user.

I don't see much point in adding this.  As I mentioned previously,
this interrupt source is never used.  Would be better to just remove
it.

Alex

>
> Based on DCE8 and DCE10 code.
>
> Signed-off-by: Alexandre Demers 
> ---
>  drivers/gpu/drm/amd/amdgpu/dce_v6_0.c | 44 +++
>  1 file changed, 44 insertions(+)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/dce_v6_0.c 
> b/drivers/gpu/drm/amd/amdgpu/dce_v6_0.c
> index 2ccb450b35a6..f8a743aade52 100644
> --- a/drivers/gpu/drm/amd/amdgpu/dce_v6_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/dce_v6_0.c
> @@ -2968,7 +2968,51 @@ static void 
> dce_v6_0_set_crtc_vline_interrupt_state(struct amdgpu_device *adev,
> int crtc,
> enum 
> amdgpu_interrupt_state state)
>  {
> +   u32 reg_block, lb_interrupt_mask;
>
> +   if (crtc >= adev->mode_info.num_crtc) {
> +   DRM_DEBUG("invalid crtc %d\n", crtc);
> +   return;
> +   }
> +
> +   switch (crtc) {
> +   case 0:
> +   reg_block = CRTC0_REGISTER_OFFSET;
> +   break;
> +   case 1:
> +   reg_block = CRTC1_REGISTER_OFFSET;
> +   break;
> +   case 2:
> +   reg_block = CRTC2_REGISTER_OFFSET;
> +   break;
> +   case 3:
> +   reg_block = CRTC3_REGISTER_OFFSET;
> +   break;
> +   case 4:
> +   reg_block = CRTC4_REGISTER_OFFSET;
> +   break;
> +   case 5:
> +   reg_block = CRTC5_REGISTER_OFFSET;
> +   break;
> +   default:
> +   DRM_DEBUG("invalid crtc %d\n", crtc);
> +   return;
> +   }
> +
> +   switch (state) {
> +   case AMDGPU_IRQ_STATE_DISABLE:
> +   lb_interrupt_mask = RREG32(mmINT_MASK + reg_block);
> +   lb_interrupt_mask &= ~VLINE_INT_MASK;
> +   WREG32(mmINT_MASK + reg_block, lb_interrupt_mask);
> +   break;
> +   case AMDGPU_IRQ_STATE_ENABLE:
> +   lb_interrupt_mask = RREG32(mmINT_MASK + reg_block);
> +   lb_interrupt_mask |= VLINE_INT_MASK;
> +   WREG32(mmINT_MASK + reg_block, lb_interrupt_mask);
> +   break;
> +   default:
> +   break;
> +   }
>  }
>
>  static int dce_v6_0_set_hpd_interrupt_state(struct amdgpu_device *adev,
> --
> 2.48.1
>


RE: [PATCH v2 4/4] drm/amdgpu/gfx12: Implement the GFX12 KCQ pipe reset

2025-03-03 Thread Liang, Prike
[Public]

Thank you for the input. I have confirmed with the firmware team that the CP FW 
_start instruction address will be kept consistent.
Could we move this patch series forward now?

Regards,
  Prike

> -Original Message-
> From: Alex Deucher 
> Sent: Thursday, February 27, 2025 10:38 PM
> To: Liang, Prike 
> Cc: amd-gfx@lists.freedesktop.org; Deucher, Alexander
> 
> Subject: Re: [PATCH v2 4/4] drm/amdgpu/gfx12: Implement the GFX12 KCQ pipe
> reset
>
> On Thu, Feb 27, 2025 at 7:36 AM Liang, Prike  wrote:
> >
> > [AMD Official Use Only - AMD Internal Distribution Only]
> >
> > Please review the series, rebased to catch up with the latest gfx base and
> > to avoid commit merge problems.
>
> See my comment on patch 1:
>
> #define RS64_FW_UC_START_ADDR_LO 0x3000
>
> Will this potentially change in future firmware versions or is it fixed?  If 
> it will
> change, then let's just read it back from registers and store it in adev-
> >gfx.rs64_fw_use_start_addr_lo so that it will be correct if the user has a 
> >mix of
> GPUs in the system.
> Other than those comments, the series looks good to me.
>
> Alex
>
>
> >
> > Regards,
> >   Prike
> >
> > > -Original Message-
> > > From: Liang, Prike 
> > > Sent: Friday, February 21, 2025 9:01 PM
> > > To: amd-gfx@lists.freedesktop.org
> > > Cc: Deucher, Alexander ; Liang, Prike
> > > 
> > > Subject: [PATCH v2 4/4] drm/amdgpu/gfx12: Implement the GFX12 KCQ
> > > pipe reset
> > >
> > > Implement the GFX12 KCQ pipe reset, and disable the GFX12 kernel
> > > compute queue until the CPFW fully supports it.
> > >
> > > Signed-off-by: Prike Liang 
> > > ---
> > >  drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c | 90
> > > +-
> > >  1 file changed, 88 insertions(+), 2 deletions(-)
> > >
> > > diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c
> > > b/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c
> > > index 79ae7d538844..103298938d22 100644
> > > --- a/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c
> > > +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c
> > > @@ -5352,6 +5352,90 @@ static int gfx_v12_0_reset_kgq(struct
> > > amdgpu_ring *ring, unsigned int vmid)
> > >   return amdgpu_ring_test_ring(ring);  }
> > >
> > > +static int gfx_v12_0_reset_compute_pipe(struct amdgpu_ring *ring) {
> > > +
> > > + struct amdgpu_device *adev = ring->adev;
> > > + uint32_t reset_pipe = 0, clean_pipe = 0;
> > > + int r;
> > > +
> > > + if (!gfx_v12_pipe_reset_support(adev))
> > > + return -EOPNOTSUPP;
> > > +
> > > + gfx_v12_0_set_safe_mode(adev, 0);
> > > + mutex_lock(&adev->srbm_mutex);
> > > + soc24_grbm_select(adev, ring->me, ring->pipe, ring->queue, 0);
> > > +
> > > + reset_pipe = RREG32_SOC15(GC, 0, regCP_MEC_RS64_CNTL);
> > > + clean_pipe = reset_pipe;
> > > +
> > > + if (adev->gfx.rs64_enable) {
> > > +
> > > + switch (ring->pipe) {
> > > + case 0:
> > > + reset_pipe = REG_SET_FIELD(reset_pipe,
> > > CP_MEC_RS64_CNTL,
> > > +MEC_PIPE0_RESET, 1);
> > > + clean_pipe = REG_SET_FIELD(clean_pipe,
> > > CP_MEC_RS64_CNTL,
> > > +MEC_PIPE0_RESET, 0);
> > > + break;
> > > + case 1:
> > > + reset_pipe = REG_SET_FIELD(reset_pipe,
> > > CP_MEC_RS64_CNTL,
> > > +MEC_PIPE1_RESET, 1);
> > > + clean_pipe = REG_SET_FIELD(clean_pipe,
> > > CP_MEC_RS64_CNTL,
> > > +MEC_PIPE1_RESET, 0);
> > > + break;
> > > + case 2:
> > > + reset_pipe = REG_SET_FIELD(reset_pipe,
> > > CP_MEC_RS64_CNTL,
> > > +MEC_PIPE2_RESET, 1);
> > > + clean_pipe = REG_SET_FIELD(clean_pipe,
> > > CP_MEC_RS64_CNTL,
> > > +MEC_PIPE2_RESET, 0);
> > > + break;
> > > + case 3:
> > > + reset_pipe = REG_SET_FIELD(reset_pipe,
> > > CP_MEC_RS64_CNTL,
> > > +MEC_PIPE3_RESET, 1);
> > > + clean_pipe = REG_SET_FIELD(clean_pipe,
> > > CP_MEC_RS64_CNTL,
> > > +MEC_PIPE3_RESET, 0);
> > > + break;
> > > + default:
> > > + break;
> > > + }
> > > + WREG32_SOC15(GC, 0, regCP_MEC_RS64_CNTL, reset_pipe);
> > > + WREG32_SOC15(GC, 0, regCP_MEC_RS64_CNTL, clean_pipe);
> > > + r = (RREG32_SOC15(GC, 0, regCP_MEC_RS64_INSTR_PNTR)
> > > << 2) - RS64_FW_UC_START_ADDR_LO;
> > > + } else {
> > > + switch (ring->pipe) {
> > > + case 0:
> > > + reset_pipe = REG_SET_FIELD(reset_pipe, CP_MEC_CNTL
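
Had the start address not been fixed, the readback Alex suggested would
look roughly like the sketch below (the gfx struct field and the use of
the RS64 program-counter-start register are assumptions for
illustration, not code from this series):

	/* cache the per-device ucode start address at init time instead
	 * of hardcoding RS64_FW_UC_START_ADDR_LO (0x3000)
	 */
	adev->gfx.rs64_fw_uc_start_addr_lo =
		RREG32_SOC15(GC, 0, regCP_MEC_RS64_PRGRM_CNTR_START) << 2;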

[PATCH] drm/amdgpu: Fix missing drain retry fault the last entry

2025-03-03 Thread Emily Deng
For the equal case, it also needs to be dropped.

Signed-off-by: Emily Deng 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h
index 7d4395a5d8ac..73b8bcb54734 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h
@@ -76,7 +76,7 @@ struct amdgpu_ih_ring {
 
 /* return true if time stamp t2 is after t1 with 48bit wrap around */
 #define amdgpu_ih_ts_after(t1, t2) \
-   (((int64_t)((t2) << 16) - (int64_t)((t1) << 16)) > 0LL)
+   (((int64_t)((t2) << 16) - (int64_t)((t1) << 16)) >= 0LL)
 
 /* provided by the ih block */
 struct amdgpu_ih_funcs {
-- 
2.34.1
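
The macro's shift trick deserves a note: IH timestamps are 48 bits wide,
and shifting both operands left by 16 places them in the top 48 bits of
a signed 64-bit value, so the subtraction wraps exactly at the 48-bit
boundary and ordering reduces to a sign test. A standalone illustration
(plain C, not driver code):

	#include <stdint.h>
	#include <stdio.h>

	/* same shape as the patched amdgpu_ih_ts_after(), with >= so an
	 * entry whose timestamp equals the checkpoint is also drained
	 */
	#define ih_ts_after_or_eq(t1, t2) \
		(((int64_t)((t2) << 16) - (int64_t)((t1) << 16)) >= 0LL)

	int main(void)
	{
		uint64_t t1 = 0xFFFFFFFFFFFFULL; /* last tick before the 48-bit wrap */
		uint64_t t2 = 0x000000000001ULL; /* first tick after the wrap */

		printf("%d\n", (int)ih_ts_after_or_eq(t1, t2)); /* 1: t2 is after t1 */
		printf("%d\n", (int)ih_ts_after_or_eq(t2, t1)); /* 0: t1 is not after t2 */
		printf("%d\n", (int)ih_ts_after_or_eq(t1, t1)); /* 1: equal now counts */
		return 0;
	}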



[PATCH 2/2] drm/amd/display: use drm_* instead of DRM_ in apply_edid_quirks()

2025-03-03 Thread Aurabindo Pillai
drm_* macros are more helpful than DRM_* macros since the former
indicate the associated DRM device that prints the error, which may be
helpful when debugging.

Signed-off-by: Aurabindo Pillai 
---
 drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_helpers.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_helpers.c 
b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_helpers.c
index 253aac93e3d8..2cd35392e2da 100644
--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_helpers.c
+++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_helpers.c
@@ -69,7 +69,7 @@ static void apply_edid_quirks(struct drm_device *dev, struct 
edid *edid, struct
case drm_edid_encode_panel_id('S', 'A', 'M', 0x0E5E):
case drm_edid_encode_panel_id('S', 'A', 'M', 0x7053):
case drm_edid_encode_panel_id('S', 'A', 'M', 0x71AC):
-   DRM_DEBUG_DRIVER("Disabling FAMS on monitor with panel id 
%X\n", panel_id);
+   drm_dbg_driver(dev, "Disabling FAMS on monitor with panel id 
%X\n", panel_id);
edid_caps->panel_patch.disable_fams = true;
break;
/* Workaround for some monitors that do not clear DPCD 0x317 if 
FreeSync is unsupported */
@@ -78,11 +78,11 @@ static void apply_edid_quirks(struct drm_device *dev, 
struct edid *edid, struct
case drm_edid_encode_panel_id('B', 'O', 'E', 0x092A):
case drm_edid_encode_panel_id('L', 'G', 'D', 0x06D1):
case drm_edid_encode_panel_id('M', 'S', 'F', 0x1003):
-   DRM_DEBUG_DRIVER("Clearing DPCD 0x317 on monitor with panel id 
%X\n", panel_id);
+   drm_dbg_driver(dev, "Clearing DPCD 0x317 on monitor with panel 
id %X\n", panel_id);
edid_caps->panel_patch.remove_sink_ext_caps = true;
break;
case drm_edid_encode_panel_id('S', 'D', 'C', 0x4154):
-   DRM_DEBUG_DRIVER("Disabling VSC on monitor with panel id %X\n", 
panel_id);
+   drm_dbg_driver(dev, "Disabling VSC on monitor with panel id 
%X\n", panel_id);
edid_caps->panel_patch.disable_colorimetry = true;
break;
default:
-- 
2.48.1
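
For reference, the practical difference in output looks roughly like
this (illustrative lines; the exact device prefix varies with kernel
and GPU):

	DRM_DEBUG_DRIVER(...):     [drm:apply_edid_quirks] Disabling FAMS on monitor with panel id ...
	drm_dbg_driver(dev, ...):  amdgpu 0000:03:00.0: [drm:apply_edid_quirks] Disabling FAMS on monitor with panel id ...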



[PATCH 1/2] drm/amd/display: Add a temp w/a for a panel

2025-03-03 Thread Aurabindo Pillai
Implement a w/a for a panel which requires a 10s delay after link detect.

Signed-off-by: Aurabindo Pillai 
---
 .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 24 ++-
 .../amd/display/amdgpu_dm/amdgpu_dm_helpers.c | 10 ++--
 drivers/gpu/drm/amd/display/dc/dc_types.h |  1 +
 3 files changed, 32 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c 
b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
index 73c95c3c39f9..3a2843e3367e 100644
--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
+++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
@@ -3327,6 +3327,21 @@ static void dm_gpureset_commit_state(struct dc_state 
*dc_state,
}
 }
 
+static void apply_delay_after_dpcd_poweroff(struct amdgpu_device *adev, struct dc_sink *sink) {
+   struct dc_panel_patch *ppatch = NULL;
+
+   if (!sink)
+   return;
+
+   ppatch = &sink->edid_caps.panel_patch;
+   if (ppatch->wait_after_dpcd_poweroff_ms) {
+   msleep(ppatch->wait_after_dpcd_poweroff_ms);
+   drm_dbg_driver(adev_to_drm(adev), "%s: adding a %ds delay as 
w/a for panel\n",
+   __func__,
+   ppatch->wait_after_dpcd_poweroff_ms / 1000);
+   }
+}
+
 static int dm_resume(struct amdgpu_ip_block *ip_block)
 {
struct amdgpu_device *adev = ip_block->adev;
@@ -3448,6 +3463,7 @@ static int dm_resume(struct amdgpu_ip_block *ip_block)
/* Do detection*/
drm_connector_list_iter_begin(ddev, &iter);
drm_for_each_connector_iter(connector, &iter) {
+   bool ret;
 
if (connector->connector_type == DRM_MODE_CONNECTOR_WRITEBACK)
continue;
@@ -3473,7 +3489,11 @@ static int dm_resume(struct amdgpu_ip_block *ip_block)
} else {
guard(mutex)(&dm->dc_lock);
dc_exit_ips_for_hw_access(dm->dc);
-   dc_link_detect(aconnector->dc_link, 
DETECT_REASON_RESUMEFROMS3S4);
+   ret = dc_link_detect(aconnector->dc_link, 
DETECT_REASON_RESUMEFROMS3S4);
+   if (ret) {
+   /* w/a delay for certain panels */
+   apply_delay_after_dpcd_poweroff(adev, 
aconnector->dc_sink);
+   }
}
 
if (aconnector->fake_enable && aconnector->dc_link->local_sink)
@@ -3834,6 +3854,8 @@ static void handle_hpd_irq_helper(struct 
amdgpu_dm_connector *aconnector)
ret = dc_link_detect(aconnector->dc_link, 
DETECT_REASON_HPD);
}
if (ret) {
+   /* w/a delay for certain panels */
+   apply_delay_after_dpcd_poweroff(adev, 
aconnector->dc_sink);
amdgpu_dm_update_connector_after_detect(aconnector);
 
drm_modeset_lock_all(dev);
diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_helpers.c 
b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_helpers.c
index fbd80d8545a8..253aac93e3d8 100644
--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_helpers.c
+++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_helpers.c
@@ -55,11 +55,16 @@ static u32 edid_extract_panel_id(struct edid *edid)
   (u32)EDID_PRODUCT_ID(edid);
 }
 
-static void apply_edid_quirks(struct edid *edid, struct dc_edid_caps 
*edid_caps)
+static void apply_edid_quirks(struct drm_device *dev, struct edid *edid, 
struct dc_edid_caps *edid_caps)
 {
uint32_t panel_id = edid_extract_panel_id(edid);
 
switch (panel_id) {
+   /* Workaround for monitors that need a delay after detecting the link */
+   case drm_edid_encode_panel_id('G', 'B', 'T', 0x3215):
+   drm_dbg_driver(dev, "Add 10s delay for link detection for panel id %X\n", panel_id);
+   edid_caps->panel_patch.wait_after_dpcd_poweroff_ms = 10 * 1000;
+   break;
/* Workaround for some monitors which does not work well with FAMS */
case drm_edid_encode_panel_id('S', 'A', 'M', 0x0E5E):
case drm_edid_encode_panel_id('S', 'A', 'M', 0x7053):
@@ -101,6 +106,7 @@ enum dc_edid_status dm_helpers_parse_edid_caps(
 {
struct amdgpu_dm_connector *aconnector = link->priv;
struct drm_connector *connector = &aconnector->base;
+   struct drm_device *dev = connector->dev;
struct edid *edid_buf = edid ? (struct edid *) edid->raw_edid : NULL;
struct cea_sad *sads;
int sad_count = -1;
@@ -130,7 +136,7 @@ enum dc_edid_status dm_helpers_parse_edid_caps(
 
edid_caps->edid_hdmi = connector->display_info.is_hdmi;
 
-   apply_edid_quirks(edid_buf, edid_caps);
+   apply_edid_quirks(dev, edid_buf, edid_caps);
 
sad_count = drm_edid_to_sad((struct edid *) edid->raw_edid, &sads);
if (sad_count <= 0)
diff --git a/drivers/gpu/drm/amd/display/d

Re: [PATCH 2/2] drm/amdgpu: Make use of drm_wedge_app_info

2025-03-03 Thread André Almeida

Hi Raag,

On 2/28/25 11:58, Raag Jadav wrote:

On Fri, Feb 28, 2025 at 09:13:53AM -0300, André Almeida wrote:

To notify userspace about which app (if any) made the device get in a
wedge state, make use of drm_wedge_app_info parameter, filling it with
the app PID and name.

Signed-off-by: André Almeida 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 19 +--
  drivers/gpu/drm/amd/amdgpu/amdgpu_job.c|  6 +-
  2 files changed, 22 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 00b9b87dafd8..e06adf6f34fd 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -6123,8 +6123,23 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
  
  	atomic_set(&adev->reset_domain->reset_res, r);
  
-	if (!r)

-   drm_dev_wedged_event(adev_to_drm(adev), 
DRM_WEDGE_RECOVERY_NONE, NULL);
+   if (!r) {
+   struct drm_wedge_app_info aux, *info = NULL;
+
+   if (job) {
+   struct amdgpu_task_info *ti;
+
+   ti = amdgpu_vm_get_task_info_pasid(adev, job->pasid);
+   if (ti) {
+   aux.pid = ti->pid;
+   aux.comm = ti->process_name;
+   info = &aux;
+   amdgpu_vm_put_task_info(ti);
+   }
+   }

Is this guaranteed to be the guilty app and not some scheduled worker?


This is how amdgpu decides which app is the guilty one earlier in the 
code as in the print:


    ti = amdgpu_vm_get_task_info_pasid(ring->adev, job->pasid);

    "Process information: process %s pid %d thread %s pid %d\n"

So I think it's consistent with what the driver thinks it's the guilty 
process.



Raag



Re: [PATCH 2/2] drm/amdgpu: Make use of drm_wedge_app_info

2025-03-03 Thread Raag Jadav
On Fri, Feb 28, 2025 at 09:13:53AM -0300, André Almeida wrote:
> To notify userspace about which app (if any) made the device get in a
> wedge state, make use of drm_wedge_app_info parameter, filling it with
> the app PID and name.
> 
> Signed-off-by: André Almeida 
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 19 +--
>  drivers/gpu/drm/amd/amdgpu/amdgpu_job.c|  6 +-
>  2 files changed, 22 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 00b9b87dafd8..e06adf6f34fd 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -6123,8 +6123,23 @@ int amdgpu_device_gpu_recover(struct amdgpu_device 
> *adev,
>  
>   atomic_set(&adev->reset_domain->reset_res, r);
>  
> - if (!r)
> - drm_dev_wedged_event(adev_to_drm(adev), 
> DRM_WEDGE_RECOVERY_NONE, NULL);
> + if (!r) {
> + struct drm_wedge_app_info aux, *info = NULL;
> +
> + if (job) {
> + struct amdgpu_task_info *ti;
> +
> + ti = amdgpu_vm_get_task_info_pasid(adev, job->pasid);
> + if (ti) {
> + aux.pid = ti->pid;
> + aux.comm = ti->process_name;
> + info = &aux;
> + amdgpu_vm_put_task_info(ti);
> + }
> + }

Is this guaranteed to be the guilty app and not some scheduled worker?

Raag


Re: [PATCH 00/21] Add Picasso support

2025-03-03 Thread Adam Lemieszka



Re: [PATCH] drm/radeon: Simplify maximum determination in radeon_uvd_calc_upll_dividers()

2025-03-03 Thread Natalie Vock

On 28.02.25 17:36, Markus Elfring wrote:

From: Markus Elfring 
Date: Fri, 28 Feb 2025 17:32:45 +0100

Replace nested max() calls by single max3() call in this
function implementation.

This issue was transformed by using the Coccinelle software.


How about something like "this change was made" or "this code was
transformed"? Coccinelle didn't transform the issue, it transformed the
code to solve the issue.

Cheers,
Natalie



Signed-off-by: Markus Elfring 
---
  drivers/gpu/drm/radeon/radeon_uvd.c | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/radeon/radeon_uvd.c 
b/drivers/gpu/drm/radeon/radeon_uvd.c
index 058a1c8451b2..ded5747a58d1 100644
--- a/drivers/gpu/drm/radeon/radeon_uvd.c
+++ b/drivers/gpu/drm/radeon/radeon_uvd.c
@@ -961,7 +961,7 @@ int radeon_uvd_calc_upll_dividers(struct radeon_device 
*rdev,
unsigned optimal_score = ~0;

/* loop through vco from low to high */
-   vco_min = max(max(vco_min, vclk), dclk);
+   vco_min = max3(vco_min, vclk, dclk);
for (vco_freq = vco_min; vco_freq <= vco_max; vco_freq += 100) {

uint64_t fb_div = (uint64_t)vco_freq * fb_factor;
--
2.48.1





[PATCH] drm/amdgpu: Simplify maximum determination in si_calc_upll_dividers()

2025-03-03 Thread Markus Elfring
From: Markus Elfring 
Date: Fri, 28 Feb 2025 16:37:00 +0100

Replace nested max() calls by single max3() call in this
function implementation.

This issue was transformed by using the Coccinelle software.

Signed-off-by: Markus Elfring 
---
 drivers/gpu/drm/amd/amdgpu/si.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/si.c b/drivers/gpu/drm/amd/amdgpu/si.c
index 026e8376e2c0..2a255fb15768 100644
--- a/drivers/gpu/drm/amd/amdgpu/si.c
+++ b/drivers/gpu/drm/amd/amdgpu/si.c
@@ -1726,7 +1726,7 @@ static int si_calc_upll_dividers(struct amdgpu_device 
*adev,
unsigned optimal_score = ~0;

/* Loop through vco from low to high */
-   vco_min = max(max(vco_min, vclk), dclk);
+   vco_min = max3(vco_min, vclk, dclk);
for (vco_freq = vco_min; vco_freq <= vco_max; vco_freq += 100) {
uint64_t fb_div = (uint64_t)vco_freq * fb_factor;
unsigned vclk_div, dclk_div, score;
--
2.48.1



Re: [PATCH 1/2] drm: Create an app info option for wedge events

2025-03-03 Thread Raag Jadav
On Fri, Feb 28, 2025 at 06:54:12PM -0300, André Almeida wrote:
> Hi Raag,
> 
> On 2/28/25 11:20, Raag Jadav wrote:
> > Cc: Lucas
> > 
> > On Fri, Feb 28, 2025 at 09:13:52AM -0300, André Almeida wrote:
> > > When a device gets wedged, it might be caused by a guilty application.
> > > For userspace, knowing which app was the cause can be useful in some
> > > situations, like implementing a policy, logging, or giving the compositor
> > > a chance to let the user know which app caused the problem.
> > > This is an optional argument: when `PID=-1`, there's no information about
> > > which app caused the problem, or whether any app was involved during the hang.
> > > 
> > > Sometimes just the PID isn't enough given that the app might already be
> > > dead by the time userspace tries to check that PID's name, so to make
> > > life easier also notify the app's name in the user event.
> > > 
> > > Signed-off-by: André Almeida 
> > > ---
> > >   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  2 +-
> > >   drivers/gpu/drm/amd/amdgpu/amdgpu_job.c|  2 +-
> > >   drivers/gpu/drm/drm_drv.c  | 16 +---
> > >   drivers/gpu/drm/i915/gt/intel_reset.c  |  3 ++-
> > >   drivers/gpu/drm/xe/xe_device.c |  3 ++-
> > >   include/drm/drm_device.h   |  8 
> > >   include/drm/drm_drv.h  |  3 ++-
> > >   7 files changed, 29 insertions(+), 8 deletions(-)
> > > 
> > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
> > > b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> > > index 24ba52d76045..00b9b87dafd8 100644
> > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> > > @@ -6124,7 +6124,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device 
> > > *adev,
> > >   atomic_set(&adev->reset_domain->reset_res, r);
> > >   if (!r)
> > > - drm_dev_wedged_event(adev_to_drm(adev), 
> > > DRM_WEDGE_RECOVERY_NONE);
> > > + drm_dev_wedged_event(adev_to_drm(adev), 
> > > DRM_WEDGE_RECOVERY_NONE, NULL);
> > >   return r;
> > >   }
> > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c 
> > > b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> > > index ef1b77f1e88f..3ed9cbcab1ad 100644
> > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> > > @@ -150,7 +150,7 @@ static enum drm_gpu_sched_stat 
> > > amdgpu_job_timedout(struct drm_sched_job *s_job)
> > >   amdgpu_fence_driver_force_completion(ring);
> > >   if (amdgpu_ring_sched_ready(ring))
> > >   drm_sched_start(&ring->sched, 0);
> > > - drm_dev_wedged_event(adev_to_drm(adev), 
> > > DRM_WEDGE_RECOVERY_NONE);
> > > + drm_dev_wedged_event(adev_to_drm(adev), 
> > > DRM_WEDGE_RECOVERY_NONE, NULL);
> > >   dev_err(adev->dev, "Ring %s reset succeeded\n", 
> > > ring->sched.name);
> > >   goto exit;
> > >   }
> > > diff --git a/drivers/gpu/drm/drm_drv.c b/drivers/gpu/drm/drm_drv.c
> > > index 17fc5dc708f4..48faafd82a99 100644
> > > --- a/drivers/gpu/drm/drm_drv.c
> > > +++ b/drivers/gpu/drm/drm_drv.c
> > > @@ -522,6 +522,7 @@ static const char *drm_get_wedge_recovery(unsigned 
> > > int opt)
> > >* drm_dev_wedged_event - generate a device wedged uevent
> > >* @dev: DRM device
> > >* @method: method(s) to be used for recovery
> > > + * @info: optional information about the guilty app
> > >*
> > >* This generates a device wedged uevent for the DRM device specified 
> > > by @dev.
> > >* Recovery @method\(s) of choice will be sent in the uevent 
> > > environment as
> > > @@ -534,13 +535,14 @@ static const char *drm_get_wedge_recovery(unsigned 
> > > int opt)
> > >*
> > >* Returns: 0 on success, negative error code otherwise.
> > >*/
> > > -int drm_dev_wedged_event(struct drm_device *dev, unsigned long method)
> > > +int drm_dev_wedged_event(struct drm_device *dev, unsigned long method,
> > > +  struct drm_wedge_app_info *info)
> > >   {
> > >   const char *recovery = NULL;
> > >   unsigned int len, opt;
> > >   /* Event string length up to 28+ characters with available 
> > > methods */
> > > - char event_string[32];
> > > - char *envp[] = { event_string, NULL };
> > > + char event_string[32], pid_string[15], comm_string[TASK_COMM_LEN];
> > > + char *envp[] = { event_string, pid_string, comm_string, NULL };
> > >   len = scnprintf(event_string, sizeof(event_string), "%s", 
> > > "WEDGED=");
> > > @@ -562,6 +564,14 @@ int drm_dev_wedged_event(struct drm_device *dev, 
> > > unsigned long method)
> > >   drm_info(dev, "device wedged, %s\n", method == 
> > > DRM_WEDGE_RECOVERY_NONE ?
> > >"but recovered through reset" : "needs recovery");
> > > + if (info) {
> > 

Re: [PATCH 1/2] drm: Create an app info option for wedge events

2025-03-03 Thread André Almeida

Hi Raag,

On 2/28/25 11:20, Raag Jadav wrote:

Cc: Lucas

On Fri, Feb 28, 2025 at 09:13:52AM -0300, André Almeida wrote:

When a device gets wedged, it might be caused by a guilty application.
For userspace, knowing which app was the cause can be useful in some
situations, like implementing a policy, logging, or giving the compositor
a chance to let the user know which app caused the problem.
This is an optional argument: when `PID=-1`, there's no information about
which app caused the problem, or whether any app was involved during the hang.

Sometimes just the PID isn't enough given that the app might already be
dead by the time userspace tries to check that PID's name, so to make
life easier also notify the app's name in the user event.

Signed-off-by: André Almeida 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  2 +-
  drivers/gpu/drm/amd/amdgpu/amdgpu_job.c|  2 +-
  drivers/gpu/drm/drm_drv.c  | 16 +---
  drivers/gpu/drm/i915/gt/intel_reset.c  |  3 ++-
  drivers/gpu/drm/xe/xe_device.c |  3 ++-
  include/drm/drm_device.h   |  8 
  include/drm/drm_drv.h  |  3 ++-
  7 files changed, 29 insertions(+), 8 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 24ba52d76045..00b9b87dafd8 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -6124,7 +6124,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
atomic_set(&adev->reset_domain->reset_res, r);
  
  	if (!r)

-   drm_dev_wedged_event(adev_to_drm(adev), 
DRM_WEDGE_RECOVERY_NONE);
+   drm_dev_wedged_event(adev_to_drm(adev), 
DRM_WEDGE_RECOVERY_NONE, NULL);
  
  	return r;

  }
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
index ef1b77f1e88f..3ed9cbcab1ad 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
@@ -150,7 +150,7 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct 
drm_sched_job *s_job)
amdgpu_fence_driver_force_completion(ring);
if (amdgpu_ring_sched_ready(ring))
drm_sched_start(&ring->sched, 0);
-   drm_dev_wedged_event(adev_to_drm(adev), 
DRM_WEDGE_RECOVERY_NONE);
+   drm_dev_wedged_event(adev_to_drm(adev), 
DRM_WEDGE_RECOVERY_NONE, NULL);
dev_err(adev->dev, "Ring %s reset succeeded\n", 
ring->sched.name);
goto exit;
}
diff --git a/drivers/gpu/drm/drm_drv.c b/drivers/gpu/drm/drm_drv.c
index 17fc5dc708f4..48faafd82a99 100644
--- a/drivers/gpu/drm/drm_drv.c
+++ b/drivers/gpu/drm/drm_drv.c
@@ -522,6 +522,7 @@ static const char *drm_get_wedge_recovery(unsigned int opt)
   * drm_dev_wedged_event - generate a device wedged uevent
   * @dev: DRM device
   * @method: method(s) to be used for recovery
+ * @info: optional information about the guilty app
   *
   * This generates a device wedged uevent for the DRM device specified by @dev.
   * Recovery @method\(s) of choice will be sent in the uevent environment as
@@ -534,13 +535,14 @@ static const char *drm_get_wedge_recovery(unsigned int 
opt)
   *
   * Returns: 0 on success, negative error code otherwise.
   */
-int drm_dev_wedged_event(struct drm_device *dev, unsigned long method)
+int drm_dev_wedged_event(struct drm_device *dev, unsigned long method,
+struct drm_wedge_app_info *info)
  {
const char *recovery = NULL;
unsigned int len, opt;
/* Event string length up to 28+ characters with available methods */
-   char event_string[32];
-   char *envp[] = { event_string, NULL };
+   char event_string[32], pid_string[15], comm_string[TASK_COMM_LEN];
+   char *envp[] = { event_string, pid_string, comm_string, NULL };
  
  	len = scnprintf(event_string, sizeof(event_string), "%s", "WEDGED=");
  
@@ -562,6 +564,14 @@ int drm_dev_wedged_event(struct drm_device *dev, unsigned long method)

drm_info(dev, "device wedged, %s\n", method == DRM_WEDGE_RECOVERY_NONE ?
 "but recovered through reset" : "needs recovery");
  
+	if (info) {

+   snprintf(pid_string, sizeof(pid_string), "PID=%u", info->pid);
+   snprintf(comm_string, sizeof(comm_string), "APP=%s", 
info->comm);
+   } else {
+   snprintf(pid_string, sizeof(pid_string), "%s", "PID=-1");
+   snprintf(comm_string, sizeof(comm_string), "%s", "APP=none");
+   }

This is not much use for wedge cases that need recovery, since at that point
the userspace will need to clean house anyway.

Which leaves us with only 'none' case and perhaps the need for standardization
of "optional telemetry collection".

Thoughts?


I had the feeling that 'none' was already 

Re: [PATCH 2/2] drm/amdgpu: Make use of drm_wedge_app_info

2025-03-03 Thread Raag Jadav
On Fri, Feb 28, 2025 at 06:49:43PM -0300, André Almeida wrote:
> Hi Raag,
> 
> On 2/28/25 11:58, Raag Jadav wrote:
> > On Fri, Feb 28, 2025 at 09:13:53AM -0300, André Almeida wrote:
> > > To notify userspace about which app (if any) made the device get in a
> > > wedge state, make use of drm_wedge_app_info parameter, filling it with
> > > the app PID and name.
> > > 
> > > Signed-off-by: André Almeida 
> > > ---
> > >   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 19 +--
> > >   drivers/gpu/drm/amd/amdgpu/amdgpu_job.c|  6 +-
> > >   2 files changed, 22 insertions(+), 3 deletions(-)
> > > 
> > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
> > > b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> > > index 00b9b87dafd8..e06adf6f34fd 100644
> > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> > > @@ -6123,8 +6123,23 @@ int amdgpu_device_gpu_recover(struct amdgpu_device 
> > > *adev,
> > >   atomic_set(&adev->reset_domain->reset_res, r);
> > > - if (!r)
> > > - drm_dev_wedged_event(adev_to_drm(adev), 
> > > DRM_WEDGE_RECOVERY_NONE, NULL);
> > > + if (!r) {
> > > + struct drm_wedge_app_info aux, *info = NULL;
> > > +
> > > + if (job) {
> > > + struct amdgpu_task_info *ti;
> > > +
> > > + ti = amdgpu_vm_get_task_info_pasid(adev, job->pasid);
> > > + if (ti) {
> > > + aux.pid = ti->pid;
> > > + aux.comm = ti->process_name;
> > > + info = &aux;
> > > + amdgpu_vm_put_task_info(ti);
> > > + }
> > > + }
> > Is this guaranteed to be the guilty app and not some scheduled worker?
> 
> This is how amdgpu decides which app is the guilty one earlier in the code,
> as in this print:
> 
>     ti = amdgpu_vm_get_task_info_pasid(ring->adev, job->pasid);
> 
>     "Process information: process %s pid %d thread %s pid %d\n"
> 
> So I think it's consistent with what the driver thinks is the guilty
> process.

Sure, but with something like app_info we're kind of hinting to userspace
that an application was _indeed_ involved with reset. Is that also guaranteed?

Is it possible that an application needlessly suffers from a false positive
scenario (reset due to other factors)?

Raag


Re: [PATCH] drm/radeon: Fix rs400_gpu_init for ATI mobility radeon Xpress 200M

2025-03-03 Thread Marek Olšák
Reviewed-by: Marek Olšák 

On Tue, Jun 18, 2019 at 3:19 AM Richard Thier  wrote:

> num_gb_pipes was set to a wrong value using r420_pipe_config
>
> This has led to HyperZ glitches on fast Z clearing.
>
> See: https://bugs.freedesktop.org/show_bug.cgi?id=110897
>
> Signed-off-by: Richard Thier 
> ---
>  drivers/gpu/drm/radeon/r300.c  |  3 ++-
>  drivers/gpu/drm/radeon/rs400.c | 21 +++--
>  2 files changed, 21 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/gpu/drm/radeon/r300.c b/drivers/gpu/drm/radeon/r300.c
> index 652126f..6724c15 100644
> --- a/drivers/gpu/drm/radeon/r300.c
> +++ b/drivers/gpu/drm/radeon/r300.c
> @@ -355,7 +355,8 @@ int r300_mc_wait_for_idle(struct radeon_device *rdev)
> return -1;
>  }
>
> -static void r300_gpu_init(struct radeon_device *rdev)
> +/* rs400_gpu_init also calls this! */
> +void r300_gpu_init(struct radeon_device *rdev)
>  {
> uint32_t gb_tile_config, tmp;
>
> diff --git a/drivers/gpu/drm/radeon/rs400.c
> b/drivers/gpu/drm/radeon/rs400.c
> index 4121209..4117572 100644
> --- a/drivers/gpu/drm/radeon/rs400.c
> +++ b/drivers/gpu/drm/radeon/rs400.c
> @@ -32,6 +32,9 @@
>  #include "radeon_asic.h"
>  #include "rs400d.h"
>
> +/* Needed for rs400_gpu_init that forwards to the r300 one */
> +void r300_gpu_init(struct radeon_device *rdev);
> +
>  /* This files gather functions specifics to : rs400,rs480 */
>  static int rs400_debugfs_pcie_gart_info_init(struct radeon_device *rdev);
>
> @@ -252,8 +255,22 @@ int rs400_mc_wait_for_idle(struct radeon_device *rdev)
>
>  static void rs400_gpu_init(struct radeon_device *rdev)
>  {
> -   /* FIXME: is this correct ? */
> -   r420_pipes_init(rdev);
> +   /* Earlier code was calling r420_pipes_init and then
> +* rs400_mc_wait_for_idle(rdev). The problem is that
> +* at least on my Mobility Radeon Xpress 200M RC410 card
> +* that ends up in this code path ends up num_gb_pipes == 3
> +* while the card seems to have only one pipe. With the
> +* r420 pipe initialization method.
> +*
> +* Problems shown up as HyperZ glitches, see:
> +* https://bugs.freedesktop.org/show_bug.cgi?id=110897
> +*
> +* Delegating initialization to r300 code seems to work
> +* and results in proper pipe numbers. The rs400 cards
> +* are said to be not r400, but r300 kind of cards.
> +*/
> +   r300_gpu_init(rdev);
> +
> if (rs400_mc_wait_for_idle(rdev)) {
> pr_warn("rs400: Failed to wait MC idle while programming
> pipes. Bad things might happen. %08x\n",
> RREG32(RADEON_MC_STATUS));
> --
> 2.21.0
>
> ___
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
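An editorial aside on the forward declaration in the rs400 patch above: a more
conventional alternative is to put the prototype in a header both r300.c and
rs400.c already include. Whether radeon_asic.h is the right home is an assumption
here, not something the patch does; this is only a sketch of the idea.

/* In a shared header (radeon_asic.h is a hypothetical placement): */
void r300_gpu_init(struct radeon_device *rdev);

/* rs400.c then calls it without a local extern declaration: */
static void rs400_gpu_init(struct radeon_device *rdev)
{
	r300_gpu_init(rdev);
	/* ... remainder of rs400 init (MC idle wait, etc.) ... */
}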


RE: [PATCH 5/5] drm/amdgpu/userq: remove BROKEN from config

2025-03-03 Thread Liang, Prike
[Public]

Question: Why use the CONFIG_DRM_AMDGPU_NAVI3X_USERQ to guard all the various 
GFX user mode queue enablement paths? It seems more generic and reasonable to 
use the config name CONFIG_DRM_AMDGPU_NAVI3X_USERQ.

Except for that question, the series patch is Reviewed-by: Prike Liang 
 .

Regards,
  Prike

> -Original Message-
> From: amd-gfx  On Behalf Of Alex
> Deucher
> Sent: Saturday, March 1, 2025 3:54 AM
> To: amd-gfx@lists.freedesktop.org
> Cc: Deucher, Alexander 
> Subject: [PATCH 5/5] drm/amdgpu/userq: remove BROKEN from config
>
> This can be enabled now.  We have the firmware checks in place.
>
> Signed-off-by: Alex Deucher 
> ---
>  drivers/gpu/drm/amd/amdgpu/Kconfig | 1 -
>  1 file changed, 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/Kconfig
> b/drivers/gpu/drm/amd/amdgpu/Kconfig
> index a614320114796..23c014da0f324 100644
> --- a/drivers/gpu/drm/amd/amdgpu/Kconfig
> +++ b/drivers/gpu/drm/amd/amdgpu/Kconfig
> @@ -98,7 +98,6 @@ config DRM_AMDGPU_WERROR  config
> DRM_AMDGPU_NAVI3X_USERQ
>   bool "Enable amdgpu usermode queues"
>   depends on DRM_AMDGPU
> - depends on BROKEN
>   default n
>   help
> Choose this option to enable GFX usermode queue support for
> GFX/SDMA/Compute
> --
> 2.48.1



[PATCH] drm/radeon: Simplify maximum determination in radeon_uvd_calc_upll_dividers()

2025-03-03 Thread Markus Elfring
From: Markus Elfring 
Date: Fri, 28 Feb 2025 17:32:45 +0100

Replace nested max() calls by single max3() call in this
function implementation.

This issue was transformed by using the Coccinelle software.

Signed-off-by: Markus Elfring 
---
 drivers/gpu/drm/radeon/radeon_uvd.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/radeon/radeon_uvd.c 
b/drivers/gpu/drm/radeon/radeon_uvd.c
index 058a1c8451b2..ded5747a58d1 100644
--- a/drivers/gpu/drm/radeon/radeon_uvd.c
+++ b/drivers/gpu/drm/radeon/radeon_uvd.c
@@ -961,7 +961,7 @@ int radeon_uvd_calc_upll_dividers(struct radeon_device 
*rdev,
unsigned optimal_score = ~0;

/* loop through vco from low to high */
-   vco_min = max(max(vco_min, vclk), dclk);
+   vco_min = max3(vco_min, vclk, dclk);
for (vco_freq = vco_min; vco_freq <= vco_max; vco_freq += 100) {

uint64_t fb_div = (uint64_t)vco_freq * fb_factor;
--
2.48.1
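For the record, the equivalence behind this transformation is easy to check in
user space. The kernel's max()/max3() in include/linux/minmax.h additionally
enforce type compatibility and avoid double evaluation, which this simplified
sketch omits:

#include <assert.h>

#define max_(a, b)     ((a) > (b) ? (a) : (b))	/* note: evaluates args twice */
#define max3_(a, b, c) max_(max_(a, b), (c))

int main(void)
{
	/* the nested and flat forms pick the same maximum */
	assert(max3_(3, 7, 5) == max_(max_(3, 7), 5));
	assert(max3_(3, 7, 5) == 7);
	return 0;
}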



Re: [PATCH 1/2] drm: Create an app info option for wedge events

2025-03-03 Thread Raag Jadav
Cc: Lucas

On Fri, Feb 28, 2025 at 09:13:52AM -0300, André Almeida wrote:
> When a device gets wedged, it might be caused by a guilty application.
> For userspace, knowing which app was the cause can be useful in some
> situations, like for implementing a policy, for logs, or for giving the
> compositor a chance to let the user know which app caused the problem.
> This is an optional argument; when `PID=-1` there's no information about
> the app that caused the problem, or whether any app was involved during
> the hang.
> 
> Sometimes just the PID isn't enough, given that the app might already be
> dead by the time userspace tries to check that PID's name, so to make
> life easier the app's name is also included in the uevent.
> 
> Signed-off-by: André Almeida 
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  2 +-
>  drivers/gpu/drm/amd/amdgpu/amdgpu_job.c|  2 +-
>  drivers/gpu/drm/drm_drv.c  | 16 +---
>  drivers/gpu/drm/i915/gt/intel_reset.c  |  3 ++-
>  drivers/gpu/drm/xe/xe_device.c |  3 ++-
>  include/drm/drm_device.h   |  8 
>  include/drm/drm_drv.h  |  3 ++-
>  7 files changed, 29 insertions(+), 8 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 24ba52d76045..00b9b87dafd8 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -6124,7 +6124,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device 
> *adev,
>   atomic_set(&adev->reset_domain->reset_res, r);
>  
>   if (!r)
> - drm_dev_wedged_event(adev_to_drm(adev), 
> DRM_WEDGE_RECOVERY_NONE);
> + drm_dev_wedged_event(adev_to_drm(adev), 
> DRM_WEDGE_RECOVERY_NONE, NULL);
>  
>   return r;
>  }
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> index ef1b77f1e88f..3ed9cbcab1ad 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> @@ -150,7 +150,7 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct 
> drm_sched_job *s_job)
>   amdgpu_fence_driver_force_completion(ring);
>   if (amdgpu_ring_sched_ready(ring))
>   drm_sched_start(&ring->sched, 0);
> - drm_dev_wedged_event(adev_to_drm(adev), 
> DRM_WEDGE_RECOVERY_NONE);
> + drm_dev_wedged_event(adev_to_drm(adev), 
> DRM_WEDGE_RECOVERY_NONE, NULL);
>   dev_err(adev->dev, "Ring %s reset succeeded\n", 
> ring->sched.name);
>   goto exit;
>   }
> diff --git a/drivers/gpu/drm/drm_drv.c b/drivers/gpu/drm/drm_drv.c
> index 17fc5dc708f4..48faafd82a99 100644
> --- a/drivers/gpu/drm/drm_drv.c
> +++ b/drivers/gpu/drm/drm_drv.c
> @@ -522,6 +522,7 @@ static const char *drm_get_wedge_recovery(unsigned int 
> opt)
>   * drm_dev_wedged_event - generate a device wedged uevent
>   * @dev: DRM device
>   * @method: method(s) to be used for recovery
> + * @info: optional information about the guilty app
>   *
>   * This generates a device wedged uevent for the DRM device specified by 
> @dev.
>   * Recovery @method\(s) of choice will be sent in the uevent environment as
> @@ -534,13 +535,14 @@ static const char *drm_get_wedge_recovery(unsigned int 
> opt)
>   *
>   * Returns: 0 on success, negative error code otherwise.
>   */
> -int drm_dev_wedged_event(struct drm_device *dev, unsigned long method)
> +int drm_dev_wedged_event(struct drm_device *dev, unsigned long method,
> +  struct drm_wedge_app_info *info)
>  {
>   const char *recovery = NULL;
>   unsigned int len, opt;
>   /* Event string length up to 28+ characters with available methods */
> - char event_string[32];
> - char *envp[] = { event_string, NULL };
> + char event_string[32], pid_string[15], comm_string[TASK_COMM_LEN];
> + char *envp[] = { event_string, pid_string, comm_string, NULL };
>  
>   len = scnprintf(event_string, sizeof(event_string), "%s", "WEDGED=");
>  
> @@ -562,6 +564,14 @@ int drm_dev_wedged_event(struct drm_device *dev, 
> unsigned long method)
>   drm_info(dev, "device wedged, %s\n", method == DRM_WEDGE_RECOVERY_NONE ?
>"but recovered through reset" : "needs recovery");
>  
> + if (info) {
> + snprintf(pid_string, sizeof(pid_string), "PID=%u", info->pid);
> + snprintf(comm_string, sizeof(comm_string), "APP=%s", 
> info->comm);
> + } else {
> + snprintf(pid_string, sizeof(pid_string), "%s", "PID=-1");
> + snprintf(comm_string, sizeof(comm_string), "%s", "APP=none");
> + }

This is not much use for wedge cases that need recovery, since at that point
userspace will need to clean house anyway.

Which leaves us with only the 'none' case and perhaps the need for standardization
of "optional telemetry collection".

[PATCH v2 2/2] drm/amdgpu: Reduce dequeue retry timeout for gfx9 family

2025-03-03 Thread Harish Kasiviswanathan
Dequeue retry timeout controls the interval between checks for unmet
conditions. On MI series, reduce this from 0x40 to 0x1 (~ 1 uS). The
cost of additional bandwidth consumed by CP when polling memory
shouldn't be substantial.

Signed-off-by: Harish Kasiviswanathan 
Reviewed-by: Jonathan Kim 
---
 .../drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c  |  2 +-
 .../drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c   |  2 +-
 .../drm/amd/amdgpu/amdgpu_amdkfd_gc_9_4_3.c   |  4 +-
 .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c| 28 ++---
 .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.h|  5 ++-
 .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10_3.c  |  2 +-
 .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c | 28 ++---
 .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h |  5 ++-
 .../drm/amd/amdkfd/kfd_packet_manager_v9.c| 40 +--
 .../gpu/drm/amd/include/kgd_kfd_interface.h   |  5 ++-
 10 files changed, 70 insertions(+), 51 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c
index 8dfdb18197c4..53f5f1885870 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c
@@ -189,7 +189,7 @@ const struct kfd2kgd_calls aldebaran_kfd2kgd = {
.set_address_watch = kgd_gfx_aldebaran_set_address_watch,
.clear_address_watch = kgd_gfx_v9_clear_address_watch,
.get_iq_wait_times = kgd_gfx_v9_get_iq_wait_times,
-   .build_grace_period_packet_info = 
kgd_gfx_v9_build_grace_period_packet_info,
+   .build_dequeue_wait_counts_packet_info = 
kgd_gfx_v9_build_dequeue_wait_counts_packet_info,
.program_trap_handler_settings = 
kgd_gfx_v9_program_trap_handler_settings,
.hqd_get_pq_addr = kgd_gfx_v9_hqd_get_pq_addr,
.hqd_reset = kgd_gfx_v9_hqd_reset,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c
index 9abf29b58ac7..6fd41aece7e1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c
@@ -415,7 +415,7 @@ const struct kfd2kgd_calls arcturus_kfd2kgd = {
.set_address_watch = kgd_gfx_v9_set_address_watch,
.clear_address_watch = kgd_gfx_v9_clear_address_watch,
.get_iq_wait_times = kgd_gfx_v9_get_iq_wait_times,
-   .build_grace_period_packet_info = 
kgd_gfx_v9_build_grace_period_packet_info,
+   .build_dequeue_wait_counts_packet_info = 
kgd_gfx_v9_build_dequeue_wait_counts_packet_info,
.get_cu_occupancy = kgd_gfx_v9_get_cu_occupancy,
.program_trap_handler_settings = 
kgd_gfx_v9_program_trap_handler_settings,
.hqd_get_pq_addr = kgd_gfx_v9_hqd_get_pq_addr,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gc_9_4_3.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gc_9_4_3.c
index e2ae714a700f..95f249896275 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gc_9_4_3.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gc_9_4_3.c
@@ -530,8 +530,8 @@ const struct kfd2kgd_calls gc_9_4_3_kfd2kgd = {
.get_cu_occupancy = kgd_gfx_v9_get_cu_occupancy,
.program_trap_handler_settings =
kgd_gfx_v9_program_trap_handler_settings,
-   .build_grace_period_packet_info =
-   kgd_gfx_v9_build_grace_period_packet_info,
+   .build_dequeue_wait_counts_packet_info =
+   
kgd_gfx_v9_build_dequeue_wait_counts_packet_info,
.get_iq_wait_times = kgd_gfx_v9_get_iq_wait_times,
.enable_debug_trap = kgd_aldebaran_enable_debug_trap,
.disable_debug_trap = kgd_gfx_v9_4_3_disable_debug_trap,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
index 62176d607bef..0b03f2e9a858 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
@@ -1021,25 +1021,25 @@ void kgd_gfx_v10_get_iq_wait_times(struct amdgpu_device 
*adev,
*wait_times = RREG32(SOC15_REG_OFFSET(GC, 0, mmCP_IQ_WAIT_TIME2));
 }
 
-void kgd_gfx_v10_build_grace_period_packet_info(struct amdgpu_device *adev,
+void kgd_gfx_v10_build_dequeue_wait_counts_packet_info(struct amdgpu_device 
*adev,
uint32_t wait_times,
-   uint32_t grace_period,
+   uint32_t sch_wave,
+   uint32_t que_sleep,
uint32_t *reg_offset,
uint32_t *reg_data)
 {
*reg_data = wait_times;
 
-   /*
-* The CP cannont handle a 0 grace period input and will result in
-* an infinite grace period being set so set to 1 to prevent this.
-*/
-   if (grace_period == 0)
-   grace_period = 1;

[PATCH v2 1/2] drm/amdkfd: Add pm_config_dequeue_wait_counts API

2025-03-03 Thread Harish Kasiviswanathan
Update pm_update_grace_period() to the cleaner
pm_config_dequeue_wait_counts(). Previously, grace_period was
overloaded as both a variable and a macro, making it inflexible to
configure additional dequeue wait times.

pm_config_dequeue_wait_counts() now takes a command and a value. This
allows flexibility to update different dequeue wait times.
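Both call sites visible in the hunks below illustrate the shape of the new API;
the command selects which wait count is being updated:

/* On scheduler start: program the optimized defaults */
pm_config_dequeue_wait_counts(&dqm->packet_mgr,
			      KFD_DEQUEUE_WAIT_INIT, 0 /* unused */);

/* Debugger path: override only CP_IQ_WAIT_TIME2.SCH_WAVE */
pm_config_dequeue_wait_counts(&dqm->packet_mgr,
			      KFD_DEQUEUE_WAIT_SET_SCH_WAVE, grace_period);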

Signed-off-by: Harish Kasiviswanathan 
---
 .../drm/amd/amdkfd/kfd_device_queue_manager.c | 45 ++
 .../drm/amd/amdkfd/kfd_device_queue_manager.h | 11 +++-
 .../gpu/drm/amd/amdkfd/kfd_packet_manager.c   | 26 +++-
 .../drm/amd/amdkfd/kfd_packet_manager_v9.c| 59 ++-
 .../drm/amd/amdkfd/kfd_packet_manager_vi.c|  4 +-
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 28 +++--
 6 files changed, 123 insertions(+), 50 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index 94b1ac8a4735..cc7465f9562a 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -42,6 +42,8 @@
 /* Size of the per-pipe EOP queue */
 #define CIK_HPD_EOP_BYTES_LOG2 11
 #define CIK_HPD_EOP_BYTES (1U << CIK_HPD_EOP_BYTES_LOG2)
+/* See unmap_queues_cpsch() */
+#define USE_DEFAULT_GRACE_PERIOD 0xffffffff
 
 static int set_pasid_vmid_mapping(struct device_queue_manager *dqm,
  u32 pasid, unsigned int vmid);
@@ -1745,10 +1747,7 @@ static int initialize_cpsch(struct device_queue_manager 
*dqm)
 
init_sdma_bitmaps(dqm);
 
-   if (dqm->dev->kfd2kgd->get_iq_wait_times)
-   dqm->dev->kfd2kgd->get_iq_wait_times(dqm->dev->adev,
-   &dqm->wait_times,
-   ffs(dqm->dev->xcc_mask) - 1);
+   update_dqm_wait_times(dqm);
return 0;
 }
 
@@ -1844,25 +1843,11 @@ static int start_cpsch(struct device_queue_manager *dqm)
/* clear hang status when driver try to start the hw scheduler */
dqm->sched_running = true;
 
-   if (!dqm->dev->kfd->shared_resources.enable_mes)
+   if (!dqm->dev->kfd->shared_resources.enable_mes) {
+   if (pm_config_dequeue_wait_counts(&dqm->packet_mgr,
+   KFD_DEQUEUE_WAIT_INIT, 0 /* unused */))
+   dev_err(dev, "Setting optimized dequeue wait failed. Using default values\n");
execute_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0, USE_DEFAULT_GRACE_PERIOD);
-
-   /* Set CWSR grace period to 1x1000 cycle for GFX9.4.3 APU */
-   if (amdgpu_emu_mode == 0 && dqm->dev->adev->gmc.is_app_apu &&
-   (KFD_GC_VERSION(dqm->dev) == IP_VERSION(9, 4, 3))) {
-   uint32_t reg_offset = 0;
-   uint32_t grace_period = 1;
-
-   retval = pm_update_grace_period(&dqm->packet_mgr,
-   grace_period);
-   if (retval)
-   dev_err(dev, "Setting grace timeout failed\n");
-   else if (dqm->dev->kfd2kgd->build_grace_period_packet_info)
-   /* Update dqm->wait_times maintained in software */
-   dqm->dev->kfd2kgd->build_grace_period_packet_info(
-   dqm->dev->adev, dqm->wait_times,
-   grace_period, &reg_offset,
-   &dqm->wait_times);
}
 
/* setup per-queue reset detection buffer  */
@@ -2261,7 +2246,14 @@ static int reset_queues_on_hws_hang(struct 
device_queue_manager *dqm)
return r;
 }
 
-/* dqm->lock mutex has to be locked before calling this function */
+/* dqm->lock mutex has to be locked before calling this function
+ *
+ * @grace_period: If USE_DEFAULT_GRACE_PERIOD then default wait time
+ *   for context switch latency. Lower values are used by debugger
+ *   since context switching are triggered at high frequency.
+ *   This is configured by setting CP_IQ_WAIT_TIME2.SCH_WAVE
+ *
+ */
 static int unmap_queues_cpsch(struct device_queue_manager *dqm,
enum kfd_unmap_queues_filter filter,
uint32_t filter_param,
@@ -2280,7 +2272,8 @@ static int unmap_queues_cpsch(struct device_queue_manager 
*dqm,
return -EIO;
 
if (grace_period != USE_DEFAULT_GRACE_PERIOD) {
-   retval = pm_update_grace_period(&dqm->packet_mgr, grace_period);
+   retval = pm_config_dequeue_wait_counts(&dqm->packet_mgr,
+   KFD_DEQUEUE_WAIT_SET_SCH_WAVE, grace_period);
if (retval)
goto out;
}
@@ -2324,8 +2317,8 @@ static int unmap_queues_cpsch(struct device_queue_manager 
*dqm,
 
/* We need to reset the grace period value for this device */
if (grace_period != USE_DEFAULT_GRACE_PERIOD) {
-   if

RE: [PATCH v2 2/2] drm/amdgpu: Reduce dequeue retry timeout for gfx9 family

2025-03-03 Thread Kim, Jonathan
[Public]

> -Original Message-
> From: Kasiviswanathan, Harish 
> Sent: Monday, March 3, 2025 3:44 PM
> To: amd-gfx@lists.freedesktop.org
> Cc: Kasiviswanathan, Harish ; Kim, Jonathan
> 
> Subject: [PATCH v2 2/2] drm/amdgpu: Reduce dequeue retry timeout for gfx9 
> family
>
> Dequeue retry timeout controls the interval between checks for unmet
> conditions. On MI series, reduce this from 0x40 to 0x1 (~ 1 uS). The
> cost of additional bandwidth consumed by CP when polling memory
> shouldn't be substantial.
>
> Signed-off-by: Harish Kasiviswanathan 
> Reviewed-by: : Jonathan Kim 
> ---
>  .../drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c  |  2 +-
>  .../drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c   |  2 +-
>  .../drm/amd/amdgpu/amdgpu_amdkfd_gc_9_4_3.c   |  4 +-
>  .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c| 28 ++---
>  .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.h|  5 ++-
>  .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10_3.c  |  2 +-
>  .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c | 28 ++---
>  .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h |  5 ++-
>  .../drm/amd/amdkfd/kfd_packet_manager_v9.c| 40 +--
>  .../gpu/drm/amd/include/kgd_kfd_interface.h   |  5 ++-
>  10 files changed, 70 insertions(+), 51 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c
> index 8dfdb18197c4..53f5f1885870 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c
> @@ -189,7 +189,7 @@ const struct kfd2kgd_calls aldebaran_kfd2kgd = {
>   .set_address_watch = kgd_gfx_aldebaran_set_address_watch,
>   .clear_address_watch = kgd_gfx_v9_clear_address_watch,
>   .get_iq_wait_times = kgd_gfx_v9_get_iq_wait_times,
> - .build_grace_period_packet_info =
> kgd_gfx_v9_build_grace_period_packet_info,
> + .build_dequeue_wait_counts_packet_info =
> kgd_gfx_v9_build_dequeue_wait_counts_packet_info,
>   .program_trap_handler_settings =
> kgd_gfx_v9_program_trap_handler_settings,
>   .hqd_get_pq_addr = kgd_gfx_v9_hqd_get_pq_addr,
>   .hqd_reset = kgd_gfx_v9_hqd_reset,
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c
> index 9abf29b58ac7..6fd41aece7e1 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c
> @@ -415,7 +415,7 @@ const struct kfd2kgd_calls arcturus_kfd2kgd = {
>   .set_address_watch = kgd_gfx_v9_set_address_watch,
>   .clear_address_watch = kgd_gfx_v9_clear_address_watch,
>   .get_iq_wait_times = kgd_gfx_v9_get_iq_wait_times,
> - .build_grace_period_packet_info =
> kgd_gfx_v9_build_grace_period_packet_info,
> + .build_dequeue_wait_counts_packet_info =
> kgd_gfx_v9_build_dequeue_wait_counts_packet_info,
>   .get_cu_occupancy = kgd_gfx_v9_get_cu_occupancy,
>   .program_trap_handler_settings =
> kgd_gfx_v9_program_trap_handler_settings,
>   .hqd_get_pq_addr = kgd_gfx_v9_hqd_get_pq_addr,
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gc_9_4_3.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gc_9_4_3.c
> index e2ae714a700f..95f249896275 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gc_9_4_3.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gc_9_4_3.c
> @@ -530,8 +530,8 @@ const struct kfd2kgd_calls gc_9_4_3_kfd2kgd = {
>   .get_cu_occupancy = kgd_gfx_v9_get_cu_occupancy,
>   .program_trap_handler_settings =
>   kgd_gfx_v9_program_trap_handler_settings,
> - .build_grace_period_packet_info =
> - kgd_gfx_v9_build_grace_period_packet_info,
> + .build_dequeue_wait_counts_packet_info =
> + 
> kgd_gfx_v9_build_dequeue_wait_counts_packet_info,
>   .get_iq_wait_times = kgd_gfx_v9_get_iq_wait_times,
>   .enable_debug_trap = kgd_aldebaran_enable_debug_trap,
>   .disable_debug_trap = kgd_gfx_v9_4_3_disable_debug_trap,
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
> index 62176d607bef..0b03f2e9a858 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
> @@ -1021,25 +1021,25 @@ void kgd_gfx_v10_get_iq_wait_times(struct
> amdgpu_device *adev,
>   *wait_times = RREG32(SOC15_REG_OFFSET(GC, 0,
> mmCP_IQ_WAIT_TIME2));
>  }
>
> -void kgd_gfx_v10_build_grace_period_packet_info(struct amdgpu_device *adev,
> +void kgd_gfx_v10_build_dequeue_wait_counts_packet_info(struct amdgpu_device
> *adev,
>   uint32_t wait_times,
> - uint32_t grace_period,
> + uint32_t sch_wave,
> + uint32_t que_sleep,
>

Re: [PATCH] drm/amdkfd: remove unnecessary cpu domain validation

2025-03-03 Thread Felix Kuehling
On 2025-03-03 13:48, Christian König wrote:
> On 03.03.25 at 19:45, James Zhu wrote:
>> before moving to the GTT domain.
> That might not be unnecessary. We sometimes intentionally move BOs to the CPU 
> domain to invalidate all VM mappings.

We discussed this in our VM sync meeting this morning, and I wasn't 100% sure 
either. In this case it's causing a near live-lock of mutual evictions when 
two processes share the same BO and have both their eviction fences on the 
shared reservation object.

I was thinking the VM invalidation should be taken care of by the DMABuf move 
notifier, so this explicit invalidation seemed redundant. Is there a way we can 
test or otherwise verify that we're not missing anything?

Thanks,
  Felix


>
> Christian.
>
>> Signed-off-by: James Zhu 
>> ---
>>  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 6 --
>>  1 file changed, 6 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>> index 62ca12e94581..2ac6d4fa0601 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>> @@ -595,12 +595,6 @@ kfd_mem_dmamap_dmabuf(struct kfd_mem_attachment 
>> *attachment)
>>  {
>>  struct ttm_operation_ctx ctx = {.interruptible = true};
>>  struct amdgpu_bo *bo = attachment->bo_va->base.bo;
>> -int ret;
>> -
>> -amdgpu_bo_placement_from_domain(bo, AMDGPU_GEM_DOMAIN_CPU);
>> -ret = ttm_bo_validate(&bo->tbo, &bo->placement, &ctx);
>> -if (ret)
>> -return ret;
>>  
>>  amdgpu_bo_placement_from_domain(bo, AMDGPU_GEM_DOMAIN_GTT);
>>  return ttm_bo_validate(&bo->tbo, &bo->placement, &ctx);


RE: [PATCH 2/2] drm/amdgpu: Reduce dequeue retry timeout for gfx9 family

2025-03-03 Thread Kim, Jonathan
[Public]

> -Original Message-
> From: amd-gfx  On Behalf Of Harish
> Kasiviswanathan
> Sent: Wednesday, February 26, 2025 2:23 PM
> To: amd-gfx@lists.freedesktop.org
> Cc: Kasiviswanathan, Harish 
> Subject: [PATCH 2/2] drm/amdgpu: Reduce dequeue retry timeout for gfx9 family
>
> Dequeue retry timeout controls the interval between checks for unmet
> conditions. On MI series, reduce this from 0x40 to 0x1 (~ 1 uS). The
> cost of additional bandwidth consumed by CP when polling memory
> shouldn't be substantial.
>
> Signed-off-by: Harish Kasiviswanathan 
> ---
>  .../drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c  |  2 +-
>  .../drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c   |  2 +-
>  .../drm/amd/amdgpu/amdgpu_amdkfd_gc_9_4_3.c   |  4 +--
>  .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c| 28 -
>  .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.h|  5 +--
>  .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10_3.c  |  2 +-
>  .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c | 28 -
>  .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h |  5 +--
>  .../drm/amd/amdkfd/kfd_packet_manager_v9.c| 31 ++-
>  .../gpu/drm/amd/include/kgd_kfd_interface.h   |  5 +--
>  10 files changed, 65 insertions(+), 47 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c
> index 8dfdb18197c4..53f5f1885870 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c
> @@ -189,7 +189,7 @@ const struct kfd2kgd_calls aldebaran_kfd2kgd = {
>   .set_address_watch = kgd_gfx_aldebaran_set_address_watch,
>   .clear_address_watch = kgd_gfx_v9_clear_address_watch,
>   .get_iq_wait_times = kgd_gfx_v9_get_iq_wait_times,
> - .build_grace_period_packet_info =
> kgd_gfx_v9_build_grace_period_packet_info,
> + .build_dequeue_wait_counts_packet_info =
> kgd_gfx_v9_build_dequeue_wait_counts_packet_info,
>   .program_trap_handler_settings =
> kgd_gfx_v9_program_trap_handler_settings,
>   .hqd_get_pq_addr = kgd_gfx_v9_hqd_get_pq_addr,
>   .hqd_reset = kgd_gfx_v9_hqd_reset,
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c
> index 9abf29b58ac7..6fd41aece7e1 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c
> @@ -415,7 +415,7 @@ const struct kfd2kgd_calls arcturus_kfd2kgd = {
>   .set_address_watch = kgd_gfx_v9_set_address_watch,
>   .clear_address_watch = kgd_gfx_v9_clear_address_watch,
>   .get_iq_wait_times = kgd_gfx_v9_get_iq_wait_times,
> - .build_grace_period_packet_info =
> kgd_gfx_v9_build_grace_period_packet_info,
> + .build_dequeue_wait_counts_packet_info =
> kgd_gfx_v9_build_dequeue_wait_counts_packet_info,
>   .get_cu_occupancy = kgd_gfx_v9_get_cu_occupancy,
>   .program_trap_handler_settings =
> kgd_gfx_v9_program_trap_handler_settings,
>   .hqd_get_pq_addr = kgd_gfx_v9_hqd_get_pq_addr,
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gc_9_4_3.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gc_9_4_3.c
> index e2ae714a700f..95f249896275 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gc_9_4_3.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gc_9_4_3.c
> @@ -530,8 +530,8 @@ const struct kfd2kgd_calls gc_9_4_3_kfd2kgd = {
>   .get_cu_occupancy = kgd_gfx_v9_get_cu_occupancy,
>   .program_trap_handler_settings =
>   kgd_gfx_v9_program_trap_handler_settings,
> - .build_grace_period_packet_info =
> - kgd_gfx_v9_build_grace_period_packet_info,
> + .build_dequeue_wait_counts_packet_info =
> + 
> kgd_gfx_v9_build_dequeue_wait_counts_packet_info,
>   .get_iq_wait_times = kgd_gfx_v9_get_iq_wait_times,
>   .enable_debug_trap = kgd_aldebaran_enable_debug_trap,
>   .disable_debug_trap = kgd_gfx_v9_4_3_disable_debug_trap,
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
> index 62176d607bef..0b03f2e9a858 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
> @@ -1021,25 +1021,25 @@ void kgd_gfx_v10_get_iq_wait_times(struct
> amdgpu_device *adev,
>   *wait_times = RREG32(SOC15_REG_OFFSET(GC, 0,
> mmCP_IQ_WAIT_TIME2));
>  }
>
> -void kgd_gfx_v10_build_grace_period_packet_info(struct amdgpu_device *adev,
> +void kgd_gfx_v10_build_dequeue_wait_counts_packet_info(struct amdgpu_device
> *adev,
>   uint32_t wait_times,
> - uint32_t grace_period,
> + uint32_t sch_wave,
> + uint32_t que_sleep,
> 

RE: [PATCH] drm/amdgpu: Fix missing drain retry fault the last entry

2025-03-03 Thread Deng, Emily
[AMD Official Use Only - AMD Internal Distribution Only]

Ping..

Emily Deng
Best Wishes



>-Original Message-
>From: Emily Deng 
>Sent: Monday, March 3, 2025 5:35 PM
>To: amd-gfx@lists.freedesktop.org
>Cc: Deng, Emily 
>Subject: [PATCH] drm/amdgpu: Fix missing drain retry fault the last entry
>
>For the equal case, it also needs to be dropped.
>
>Signed-off-by: Emily Deng 
>---
> drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h | 2 +-
> 1 file changed, 1 insertion(+), 1 deletion(-)
>
>diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h
>b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h
>index 7d4395a5d8ac..73b8bcb54734 100644
>--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h
>+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h
>@@ -76,7 +76,7 @@ struct amdgpu_ih_ring {
>
> /* return true if time stamp t2 is after t1 with 48bit wrap around */  #define
>amdgpu_ih_ts_after(t1, t2) \
>-  (((int64_t)((t2) << 16) - (int64_t)((t1) << 16)) > 0LL)
>+  (((int64_t)((t2) << 16) - (int64_t)((t1) << 16)) >= 0LL)
>
> /* provided by the ih block */
> struct amdgpu_ih_funcs {
>--
>2.34.1
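The macro's shift trick deserves a note: the 48-bit IH timestamp is moved into
the upper bits of a signed 64-bit value, so the subtraction wraps modulo 2^64
and the ordering survives the 48-bit rollover. A small user-space sketch of the
patched comparison (relying on two's-complement wraparound, as the kernel does):

#include <stdint.h>
#include <stdio.h>

/* the patched comparison: t2 is at or after t1, 48-bit wrap-safe */
#define amdgpu_ih_ts_after(t1, t2) \
	(((int64_t)((t2) << 16) - (int64_t)((t1) << 16)) >= 0LL)

int main(void)
{
	uint64_t t1 = 0xffffffffffffULL;	/* just before the 48-bit wrap */
	uint64_t t2 = 0x000000000001ULL;	/* just after the wrap */

	printf("%d\n", amdgpu_ih_ts_after(t1, t2));	/* 1: t2 is after t1 */
	printf("%d\n", amdgpu_ih_ts_after(t2, t1));	/* 0: t1 is not after t2 */
	return 0;
}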



RE: [PATCH 2/2] drm/amdgpu: Add support for CPERs on virtualization

2025-03-03 Thread Zhang, Hawking
[AMD Official Use Only - AMD Internal Distribution Only]

Series is

Reviewed-by: Hawking Zhang 

Regards,
Hawking
From: Yi, Tony 
Sent: Friday, February 28, 2025 22:01
To: Zhang, Hawking ; Skvortsov, Victor 
; amd-gfx@lists.freedesktop.org; Luo, Zhigang 
; Liu, Xiang(Dean) ; Zhou1, Tao 

Subject: Re: [PATCH 2/2] drm/amdgpu: Add support for CPERs on virtualization


[AMD Official Use Only - AMD Internal Distribution Only]

Hi Hawking,

We still use the BM CPER ring (through amdgpu_cper_ring_write) but as SRIOV 
requires an extra step to query the CPERs, we created a new function 
amdgpu_debugfs_virt_ring_read within amdgpu_debugfs_virt_ring_fops that does 
exactly that. It is separate to avoid affecting the BM hot path with any 
extraneous calculations. The debugfs is exactly the same (amdgpu_ring_cper).

Thanks, Tony


From: Zhang, Hawking mailto:hawking.zh...@amd.com>>
Sent: Friday, February 28, 2025 1:00 AM
To: Yi, Tony mailto:tony...@amd.com>>; Skvortsov, Victor 
mailto:victor.skvort...@amd.com>>; 
amd-gfx@lists.freedesktop.org 
mailto:amd-gfx@lists.freedesktop.org>>; Luo, 
Zhigang mailto:zhigang@amd.com>>; Liu, Xiang(Dean) 
mailto:xiang@amd.com>>; Zhou1, Tao 
mailto:tao.zh...@amd.com>>
Subject: RE: [PATCH 2/2] drm/amdgpu: Add support for CPERs on virtualization

[AMD Official Use Only - AMD Internal Distribution Only]

+ @Liu, Xiang(Dean)/@Zhou1, 
Tao for the code review

+   if (amdgpu_sriov_vf(adev))
+   debugfs_create_file_size(name, S_IFREG | 0444, root, ring,
+&amdgpu_debugfs_virt_ring_fops,
+ring->ring_size + 12);
+   else
+   debugfs_create_file_size(name, S_IFREG | 0444, root, ring,
+&amdgpu_debugfs_ring_fops,
+ring->ring_size + 12);

Hi Tony,

Is there any reason the VF requires a separate file system node? Is it because 
the VF has its own CPER ring? If so, can you please check if the VF can reuse 
the CPER created for bare-metal?

Regards,
Hawking

-Original Message-
From: Yi, Tony mailto:tony...@amd.com>>
Sent: Thursday, February 27, 2025 23:12
To: Yi, Tony mailto:tony...@amd.com>>; Skvortsov, Victor 
mailto:victor.skvort...@amd.com>>; 
amd-gfx@lists.freedesktop.org; Zhang, 
Hawking mailto:hawking.zh...@amd.com>>; Luo, Zhigang 
mailto:zhigang@amd.com>>
Cc: Yi, Tony mailto:tony...@amd.com>>
Subject: [PATCH 2/2] drm/amdgpu: Add support for CPERs on virtualization

Add support for CPERs on VFs.

VFs do not receive PMFW messages directly; as such, they need to query them
from the host. To avoid hitting the host event guard, CPER queries need to be
rate limited. CPER queries share the same RAS telemetry buffer as the error
count query, so a mutex protecting the shared buffer was added as well.

For readability, amdgpu_detect_virtualization was refactored into multiple
individual functions.
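The rate limiting mentioned above is the stock linux/ratelimit.h pattern; the
patch relocates the init of adev->virt.ras_telemetry_rs out of amdgpu_device.c,
as the removal hunks below show. A hedged sketch of the idea, with illustrative
names rather than the actual amdgpu symbols:

#include <linux/ratelimit.h>

/* illustrative: allow at most one host CPER query per 5 seconds */
static DEFINE_RATELIMIT_STATE(cper_query_rs, 5 * HZ, 1);

static int query_host_cper_example(void)
{
	if (!__ratelimit(&cper_query_rs))
		return -EBUSY;	/* too soon since the last query; skip */

	/* ... take the shared telemetry-buffer mutex and query the host ... */
	return 0;
}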

Signed-off-by: Tony Yi mailto:tony...@amd.com>>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |   7 +-
drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c   |  31 -
drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c   | 138 -
drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h   |  18 ++-
drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c  |  14 +++
5 files changed, 195 insertions(+), 13 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 5e1d8f0039d0..198d29faa754 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3099,7 +3099,8 @@ static int amdgpu_device_ip_init(struct amdgpu_device 
*adev)

 amdgpu_fru_get_product_info(adev);

-   r = amdgpu_cper_init(adev);
+   if (!amdgpu_sriov_vf(adev) || amdgpu_sriov_ras_cper_en(adev))
+   r = amdgpu_cper_init(adev);

init_failed:

@@ -4335,10 +4336,8 @@ int amdgpu_device_init(struct amdgpu_device *adev,
  * for throttling interrupt) = 60 seconds.
  */
 ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1);
-   ratelimit_state_init(&adev->virt.ras_telemetry_rs, 5 * HZ, 1);

 ratelimit_set_flags(&adev->throttling_logging_rs, 
RATELIMIT_MSG_ON_RELEASE);
-   ratelimit_set_flags(&adev->virt.ras_telemetry_rs, 
RATELIMIT_MSG_ON_RELEASE);

 /* Registers mapping */
 /* TODO: block userspace mapping of io register */ @@ -4370,7 +4369,7 
@@ int amdgpu_device_init(struct amdgpu_device *adev,
 return -ENOMEM;

 /* detect hw virtualization here */
-   amdgpu_detect_virtualization(adev);
+   amdgpu_virt_init(adev);

 amdgpu_device_get_pcie_info(adev);

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
index 81a7d4faac9c..d

RE: [PATCH v2 1/2] drm/amdkfd: Add pm_config_dequeue_wait_counts API

2025-03-03 Thread Kim, Jonathan
[Public]

> -Original Message-
> From: amd-gfx  On Behalf Of Harish
> Kasiviswanathan
> Sent: Monday, March 3, 2025 3:44 PM
> To: amd-gfx@lists.freedesktop.org
> Cc: Kasiviswanathan, Harish 
> Subject: [PATCH v2 1/2] drm/amdkfd: Add pm_config_dequeue_wait_counts API
>
> Update pm_update_grace_period() to more cleaner
> pm_config_dequeue_wait_counts(). Previously, grace_period variable was
> overloaded as a variable and a macro, making it inflexible to configure
> additional dequeue wait times.
>
> pm_config_dequeue_wait_counts() now takes in a cmd / variable. This
> allows flexibility to update different dequeue wait times.
>
> Signed-off-by: Harish Kasiviswanathan 

Reviewed-by: Jonathan Kim 

> ---
>  .../drm/amd/amdkfd/kfd_device_queue_manager.c | 45 ++
>  .../drm/amd/amdkfd/kfd_device_queue_manager.h | 11 +++-
>  .../gpu/drm/amd/amdkfd/kfd_packet_manager.c   | 26 +++-
>  .../drm/amd/amdkfd/kfd_packet_manager_v9.c| 59 ++-
>  .../drm/amd/amdkfd/kfd_packet_manager_vi.c|  4 +-
>  drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 28 +++--
>  6 files changed, 123 insertions(+), 50 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> index 94b1ac8a4735..cc7465f9562a 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> @@ -42,6 +42,8 @@
>  /* Size of the per-pipe EOP queue */
>  #define CIK_HPD_EOP_BYTES_LOG2 11
>  #define CIK_HPD_EOP_BYTES (1U << CIK_HPD_EOP_BYTES_LOG2)
> +/* See unmap_queues_cpsch() */
> +#define USE_DEFAULT_GRACE_PERIOD 0xffffffff
>
>  static int set_pasid_vmid_mapping(struct device_queue_manager *dqm,
> u32 pasid, unsigned int vmid);
> @@ -1745,10 +1747,7 @@ static int initialize_cpsch(struct device_queue_manager
> *dqm)
>
>   init_sdma_bitmaps(dqm);
>
> - if (dqm->dev->kfd2kgd->get_iq_wait_times)
> - dqm->dev->kfd2kgd->get_iq_wait_times(dqm->dev->adev,
> - &dqm->wait_times,
> - ffs(dqm->dev->xcc_mask) - 1);
> + update_dqm_wait_times(dqm);
>   return 0;
>  }
>
> @@ -1844,25 +1843,11 @@ static int start_cpsch(struct device_queue_manager
> *dqm)
>   /* clear hang status when driver try to start the hw scheduler */
>   dqm->sched_running = true;
>
> - if (!dqm->dev->kfd->shared_resources.enable_mes)
> + if (!dqm->dev->kfd->shared_resources.enable_mes) {
> + if (pm_config_dequeue_wait_counts(&dqm->packet_mgr,
> + KFD_DEQUEUE_WAIT_INIT, 0 /* unused */))
> + dev_err(dev, "Setting optimized dequeue wait failed. 
> Using
> default values\n");
>   execute_queues_cpsch(dqm,
> KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0,
> USE_DEFAULT_GRACE_PERIOD);
> -
> - /* Set CWSR grace period to 1x1000 cycle for GFX9.4.3 APU */
> - if (amdgpu_emu_mode == 0 && dqm->dev->adev->gmc.is_app_apu &&
> - (KFD_GC_VERSION(dqm->dev) == IP_VERSION(9, 4, 3))) {
> - uint32_t reg_offset = 0;
> - uint32_t grace_period = 1;
> -
> - retval = pm_update_grace_period(&dqm->packet_mgr,
> - grace_period);
> - if (retval)
> - dev_err(dev, "Setting grace timeout failed\n");
> - else if (dqm->dev->kfd2kgd->build_grace_period_packet_info)
> - /* Update dqm->wait_times maintained in software */
> - dqm->dev->kfd2kgd->build_grace_period_packet_info(
> - dqm->dev->adev, dqm->wait_times,
> - grace_period, &reg_offset,
> - &dqm->wait_times);
>   }
>
>   /* setup per-queue reset detection buffer  */
> @@ -2261,7 +2246,14 @@ static int reset_queues_on_hws_hang(struct
> device_queue_manager *dqm)
>   return r;
>  }
>
> -/* dqm->lock mutex has to be locked before calling this function */
> +/* dqm->lock mutex has to be locked before calling this function
> + *
> + * @grace_period: If USE_DEFAULT_GRACE_PERIOD then default wait time
> + *   for context switch latency. Lower values are used by debugger
> + *   since context switching are triggered at high frequency.
> + *   This is configured by setting CP_IQ_WAIT_TIME2.SCH_WAVE
> + *
> + */
>  static int unmap_queues_cpsch(struct device_queue_manager *dqm,
>   enum kfd_unmap_queues_filter filter,
>   uint32_t filter_param,
> @@ -2280,7 +2272,8 @@ static int unmap_queues_cpsch(struct
> device_queue_manager *dqm,
>   return -EIO;
>
>   if (grace_period != USE_DEFAULT_GRACE_PERIOD) {
> - retval = pm_update_grace_period(&dqm->packet_mgr,
> grace_period);
> + retval = pm_config_dequeue_w

[PATCH] drm/amdgpu: add initial documentation for debugfs files

2025-03-03 Thread Alex Deucher
Describes what debugfs files are available and what
they are used for.

Signed-off-by: Alex Deucher 
---
 Documentation/gpu/amdgpu/debugfs.rst | 201 +++
 Documentation/gpu/amdgpu/index.rst   |   1 +
 2 files changed, 202 insertions(+)
 create mode 100644 Documentation/gpu/amdgpu/debugfs.rst

diff --git a/Documentation/gpu/amdgpu/debugfs.rst 
b/Documentation/gpu/amdgpu/debugfs.rst
new file mode 100644
index 0..9d82c770c1e78
--- /dev/null
+++ b/Documentation/gpu/amdgpu/debugfs.rst
@@ -0,0 +1,201 @@
+==============
+AMDGPU DebugFS
+==============
+
+The amdgpu driver provides a number of debugfs files to aid in debugging
+issues in the driver.
+
+DebugFS Files
+=============
+
+amdgpu_benchmark
+----------------
+
+Run benchmarks using the DMA engine the driver uses for GPU memory paging.
+Write a number to the file to run the test.  The results are written to the
+kernel log.  The following tests are available:
+
+- 1: simple test, VRAM to GTT and GTT to VRAM
+- 2: simple test, VRAM to VRAM
+- 3: GTT to VRAM, buffer size sweep, powers of 2
+- 4: VRAM to GTT, buffer size sweep, powers of 2
+- 5: VRAM to VRAM, buffer size sweep, powers of 2
+- 6: GTT to VRAM, buffer size sweep, common modes
+- 7: VRAM to GTT, buffer size sweep, common modes
+- 8: VRAM to VRAM, buffer size sweep, common modes
+
+amdgpu_test_ib
+--------------
+
+Read this file to run simple IB (Indirect Buffer) tests on all kernel managed
+rings.  IBs are command buffers usually generated by userspace applications
+which are submitted to the kernel for execution on a particular GPU engine.
+This just runs the simple IB tests included in the kernel.
+
+amdgpu_discovery
+----------------
+
+Provides raw access to the IP discovery binary provided by the GPU.  Read this
+file to access the raw binary.
+
+amdgpu_vbios
+------------
+
+Provides raw access to the ROM binary image from the GPU.  Read this file to
+access the raw binary.
+
+amdgpu_evict_gtt
+----------------
+
+Evict all buffers from the GTT memory pool.  Read this file to evict all
+buffers from this pool.
+
+amdgpu_evict_vram
+-----------------
+
+Evict all buffers from the VRAM memory pool.  Read this file to evict all
+buffers from this pool.
+
+amdgpu_gpu_recover
+------------------
+
+Read this file to trigger a full GPU reset.  All work currently running
+on the GPU will be lost.
+
+amdgpu_ring_<ring>
+------------------
+
+Provides read access to the kernel managed ring buffers for each ring <ring>.
+These are useful for debugging problems on a particular ring.  The ring buffer
+is how the CPU sends commands to the GPU.  The CPU writes commands into the
+buffer and then asks the GPU engine to process it.
+
+amdgpu_mqd_<ring>
+-----------------
+
+Provides read access to the kernel managed MQD (Memory Queue Descriptor) for
+ring <ring> managed by the kernel driver.  MQDs define the features of the ring
+and are used to store the ring's state when it is not connected to hardware.
+The driver writes the requested ring features and metadata (GPU addresses of
+the ring itself and associated buffers) to the MQD and the firmware uses the MQD
+to populate the hardware when the ring is mapped to a hardware slot.  Only
+available on engines which use MQDs.
+
+amdgpu_error_<ring>
+-------------------
+
+Provides an interface to set an error on fences associated with ring <ring>.
+The error code specified is propagated to all fences associated with the
+ring.
+
+amdgpu_pm_info
+--------------
+
+Provides human readable information about the power management features
+and state of the GPU.  This includes current GFX clock, Memory clock,
+voltages, average SoC power, temperature, GFX load, Memory load, SMU
+feature mask, VCN power state, clock and power gating features.
+
+amdgpu_firmware_info
+--------------------
+
+Lists the firmware versions for all firmwares used by the GPU.  Only
+entries with a non-0 version are valid.  If the version is 0, the firmware
+is not valid for the GPU.
+
+amdgpu_fence_info
+-----------------
+
+Shows the last signalled and emitted fence sequence numbers for each
+kernel driver managed ring.  Fences are associated with submissions
+to the engine.  Emitted fences have been submitted to the ring
+and signalled fences have been signalled by the GPU.  Rings with a
+larger emitted fence value have outstanding work that is still being
+processed by the engine that owns that ring.  When the emitted and
+signalled fence values are equal, the ring is idle.
+
+amdgpu_gem_info
+---------------
+
+Lists all of the PIDs using the GPU and the GPU buffers that they have
+allocated.  This lists the buffer size, pool (VRAM, GTT, etc.), and buffer
+attributes (CPU access required, CPU cache attributes, etc.).
+
+amdgpu_vm_info
+--------------
+
+Lists all of the PIDs using the GPU and the GPU buffers that they have
+allocated as well as the status of those buffers relative to that process'
+GPU virtual address space (e.g., evicted, idle, invalidated, etc.).
+
+amdgpu_
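To make the interfaces described above concrete, here is a hedged user-space
sketch that triggers benchmark test 1 and dumps the firmware table. The debugfs
mount point and the DRI minor number 0 are assumptions about the local system,
and root is required:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

#define DBG "/sys/kernel/debug/dri/0/"	/* minor 0 assumed */

int main(void)
{
	char buf[4096];
	ssize_t n;
	int fd;

	/* Write a test number to run a paging-DMA benchmark */
	fd = open(DBG "amdgpu_benchmark", O_WRONLY);
	if (fd >= 0) {
		write(fd, "1", 1);	/* test 1: VRAM<->GTT; results land in dmesg */
		close(fd);
	}

	/* Read back the firmware version table */
	fd = open(DBG "amdgpu_firmware_info", O_RDONLY);
	if (fd >= 0) {
		while ((n = read(fd, buf, sizeof(buf))) > 0)
			fwrite(buf, 1, n, stdout);
		close(fd);
	}
	return 0;
}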

RE: [PATCH 5/5] drm/amdgpu/userq: remove BROKEN from config

2025-03-03 Thread Liang, Prike
[Public]

> From: amd-gfx  On Behalf Of Alex
> Deucher
> Sent: Tuesday, March 4, 2025 12:15 AM
> To: Liang, Prike 
> Cc: Deucher, Alexander ; amd-
> g...@lists.freedesktop.org
> Subject: Re: [PATCH 5/5] drm/amdgpu/userq: remove BROKEN from config
>
> On Mon, Mar 3, 2025 at 7:09 AM Liang, Prike  wrote:
> >
> > [Public]
> >
> > Question: Why use the CONFIG_DRM_AMDGPU_NAVI3X_USERQ to guard all
> the various GFX user mode queue enablement paths? It seems more generic and
> reasonable to use the config name CONFIG_DRM_AMDGPU_NAVI3X_USERQ.
> >
>
> I assume you mean something more generic like
> CONFIG_DRM_AMDGPU_USERQ?
>  At this point it would make more sense, but at the time it was added there 
> was
> only gfx11 support.  We could change it, but at this point, I'd be more 
> inclined to
> just remove it.
>
Yes, it's a typo, and I mean we may need to change the userq config label to 
CONFIG_DRM_AMDGPU_USERQ later.

Thanks,
Prike

> > Except for that question, the series patch is Reviewed-by: Prike Liang
>  .
>
> Thanks.
>
> Alex
>
> >
> > Regards,
> >   Prike
> >
> > > -Original Message-
> > > From: amd-gfx  On Behalf Of
> > > Alex Deucher
> > > Sent: Saturday, March 1, 2025 3:54 AM
> > > To: amd-gfx@lists.freedesktop.org
> > > Cc: Deucher, Alexander 
> > > Subject: [PATCH 5/5] drm/amdgpu/userq: remove BROKEN from config
> > >
> > > This can be enabled now.  We have the firmware checks in place.
> > >
> > > Signed-off-by: Alex Deucher 
> > > ---
> > >  drivers/gpu/drm/amd/amdgpu/Kconfig | 1 -
> > >  1 file changed, 1 deletion(-)
> > >
> > > diff --git a/drivers/gpu/drm/amd/amdgpu/Kconfig
> > > b/drivers/gpu/drm/amd/amdgpu/Kconfig
> > > index a614320114796..23c014da0f324 100644
> > > --- a/drivers/gpu/drm/amd/amdgpu/Kconfig
> > > +++ b/drivers/gpu/drm/amd/amdgpu/Kconfig
> > > @@ -98,7 +98,6 @@ config DRM_AMDGPU_WERROR  config
> > > DRM_AMDGPU_NAVI3X_USERQ
> > >   bool "Enable amdgpu usermode queues"
> > >   depends on DRM_AMDGPU
> > > - depends on BROKEN
> > >   default n
> > >   help
> > > Choose this option to enable GFX usermode queue support for
> > > GFX/SDMA/Compute
> > > --
> > > 2.48.1
> >


Re: [PATCH 1/2] drm/amd/display: Add a temp w/a for a panel

2025-03-03 Thread Alex Hung

This has conflicts with the latest asdn and a minor spacing issue below.

The others look good to me.

Reviewed-by: Alex Hung 

On 3/3/25 06:50, Aurabindo Pillai wrote:

Implement w/a for a panel which requires a 10s delay after link detect.

Signed-off-by: Aurabindo Pillai 
---
  .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 24 ++-
  .../amd/display/amdgpu_dm/amdgpu_dm_helpers.c | 10 ++--
  drivers/gpu/drm/amd/display/dc/dc_types.h |  1 +
  3 files changed, 32 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c 
b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
index 73c95c3c39f9..3a2843e3367e 100644
--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
+++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
@@ -3327,6 +3327,21 @@ static void dm_gpureset_commit_state(struct dc_state *dc_state,
}
  }
  
+static void apply_delay_after_dpcd_poweroff(struct amdgpu_device *adev, struct dc_sink *sink)
+{

+   struct dc_panel_patch *ppatch = NULL;
+
+   if (!sink)
+   return;
+
+   ppatch = &sink->edid_caps.panel_patch;
+   if (ppatch->wait_after_dpcd_poweroff_ms) {
+   msleep(ppatch->wait_after_dpcd_poweroff_ms);
+   drm_dbg_driver(adev_to_drm(adev), "%s: adding a %ds delay as w/a for panel\n",
+   __func__,
+   ppatch->wait_after_dpcd_poweroff_ms / 1000);
This doesn't look aligned to me, and maybe the two lines can fit into
one. But this is just my opinion.



+   }
+}
+
  static int dm_resume(struct amdgpu_ip_block *ip_block)
  {
struct amdgpu_device *adev = ip_block->adev;
@@ -3448,6 +3463,7 @@ static int dm_resume(struct amdgpu_ip_block *ip_block)
/* Do detection*/
drm_connector_list_iter_begin(ddev, &iter);
drm_for_each_connector_iter(connector, &iter) {
+   bool ret;
  
  		if (connector->connector_type == DRM_MODE_CONNECTOR_WRITEBACK)

continue;
@@ -3473,7 +3489,11 @@ static int dm_resume(struct amdgpu_ip_block *ip_block)
} else {
guard(mutex)(&dm->dc_lock);
dc_exit_ips_for_hw_access(dm->dc);
-   dc_link_detect(aconnector->dc_link, DETECT_REASON_RESUMEFROMS3S4);
+   ret = dc_link_detect(aconnector->dc_link, DETECT_REASON_RESUMEFROMS3S4);
+   if (ret) {
+   /* w/a delay for certain panels */
+   apply_delay_after_dpcd_poweroff(adev, aconnector->dc_sink);
+   }
}
  
  		if (aconnector->fake_enable && aconnector->dc_link->local_sink)

@@ -3834,6 +3854,8 @@ static void handle_hpd_irq_helper(struct amdgpu_dm_connector *aconnector)
ret = dc_link_detect(aconnector->dc_link, DETECT_REASON_HPD);
}
if (ret) {
+   /* w/a delay for certain panels */
+   apply_delay_after_dpcd_poweroff(adev, aconnector->dc_sink);
amdgpu_dm_update_connector_after_detect(aconnector);
  
  			drm_modeset_lock_all(dev);

diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_helpers.c 
b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_helpers.c
index fbd80d8545a8..253aac93e3d8 100644
--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_helpers.c
+++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_helpers.c
@@ -55,11 +55,16 @@ static u32 edid_extract_panel_id(struct edid *edid)
   (u32)EDID_PRODUCT_ID(edid);
  }
  
-static void apply_edid_quirks(struct edid *edid, struct dc_edid_caps *edid_caps)
+static void apply_edid_quirks(struct drm_device *dev, struct edid *edid, struct dc_edid_caps *edid_caps)
  {
uint32_t panel_id = edid_extract_panel_id(edid);
  
  	switch (panel_id) {

+   /* Workaround for monitors that need a delay after detecting the link */
+   case drm_edid_encode_panel_id('G', 'B', 'T', 0x3215):
+   drm_dbg_driver(dev, "Add 10s delay for link detection for panel id 
%X\n", panel_id);
+   edid_caps->panel_patch.wait_after_dpcd_poweroff_ms = 1;
+   break;
/* Workaround for some monitors which does not work well with FAMS */
case drm_edid_encode_panel_id('S', 'A', 'M', 0x0E5E):
case drm_edid_encode_panel_id('S', 'A', 'M', 0x7053):
@@ -101,6 +106,7 @@ enum dc_edid_status dm_helpers_parse_edid_caps(
  {
struct amdgpu_dm_connector *aconnector = link->priv;
struct drm_connector *connector = &aconnector->base;
+   struct drm_device *dev = connector->dev;
struct edid *edid_buf = edid ? (struct edid *) edid->raw_edid : NULL;
struct cea_sad *sads;
int sad_count = -1;
@@ -130,7 +136,7 @@ enum dc_edid_status dm_helpers_parse_edid_caps(
  
  	edid_caps->edid_hdmi = connector->display_info.is_hdmi;
  
-	apply

Re: [PATCH 2/2] drm/amd/display: use drm_* instead of DRM_ in apply_edid_quirks()

2025-03-03 Thread Alex Hung

Reviewed-by: Alex Hung 

On 3/3/25 06:50, Aurabindo Pillai wrote:

drm_* macros are more helpful than DRM_* macros since the former
indicate the associated DRM device that prints the error, which may be
helpful when debugging.

Signed-off-by: Aurabindo Pillai 
---
  drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_helpers.c | 6 +++---
  1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_helpers.c 
b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_helpers.c
index 253aac93e3d8..2cd35392e2da 100644
--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_helpers.c
+++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_helpers.c
@@ -69,7 +69,7 @@ static void apply_edid_quirks(struct drm_device *dev, struct edid *edid, struct
case drm_edid_encode_panel_id('S', 'A', 'M', 0x0E5E):
case drm_edid_encode_panel_id('S', 'A', 'M', 0x7053):
case drm_edid_encode_panel_id('S', 'A', 'M', 0x71AC):
-   DRM_DEBUG_DRIVER("Disabling FAMS on monitor with panel id 
%X\n", panel_id);
+   drm_dbg_driver(dev, "Disabling FAMS on monitor with panel id 
%X\n", panel_id);
edid_caps->panel_patch.disable_fams = true;
break;
/* Workaround for some monitors that do not clear DPCD 0x317 if FreeSync is unsupported */
@@ -78,11 +78,11 @@ static void apply_edid_quirks(struct drm_device *dev, struct edid *edid, struct
case drm_edid_encode_panel_id('B', 'O', 'E', 0x092A):
case drm_edid_encode_panel_id('L', 'G', 'D', 0x06D1):
case drm_edid_encode_panel_id('M', 'S', 'F', 0x1003):
-   DRM_DEBUG_DRIVER("Clearing DPCD 0x317 on monitor with panel id 
%X\n", panel_id);
+   drm_dbg_driver(dev, "Clearing DPCD 0x317 on monitor with panel id 
%X\n", panel_id);
edid_caps->panel_patch.remove_sink_ext_caps = true;
break;
case drm_edid_encode_panel_id('S', 'D', 'C', 0x4154):
-   DRM_DEBUG_DRIVER("Disabling VSC on monitor with panel id %X\n", 
panel_id);
+   drm_dbg_driver(dev, "Disabling VSC on monitor with panel id 
%X\n", panel_id);
edid_caps->panel_patch.disable_colorimetry = true;
break;
default:




RE: [PATCH v2 2/2] drm/amdgpu: Reduce dequeue retry timeout for gfx9 family

2025-03-03 Thread Kasiviswanathan, Harish
[Public]

-Original Message-
From: Kim, Jonathan 
Sent: Monday, March 3, 2025 3:54 PM
To: Kasiviswanathan, Harish ; 
amd-gfx@lists.freedesktop.org
Subject: RE: [PATCH v2 2/2] drm/amdgpu: Reduce dequeue retry timeout for gfx9 
family

[Public]

> -Original Message-
> From: Kasiviswanathan, Harish 
> Sent: Monday, March 3, 2025 3:44 PM
> To: amd-gfx@lists.freedesktop.org
> Cc: Kasiviswanathan, Harish ; Kim, Jonathan
> 
> Subject: [PATCH v2 2/2] drm/amdgpu: Reduce dequeue retry timeout for gfx9 
> family
>
> Dequeue retry timeout controls the interval between checks for unmet
> conditions. On MI series, reduce this from 0x40 to 0x1 (~ 1 uS). The
> cost of additional bandwidth consumed by CP when polling memory
> shouldn't be substantial.
>
> Signed-off-by: Harish Kasiviswanathan 
> Reviewed-by: : Jonathan Kim 
> ---
>  .../drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c  |  2 +-
>  .../drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c   |  2 +-
>  .../drm/amd/amdgpu/amdgpu_amdkfd_gc_9_4_3.c   |  4 +-
>  .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c| 28 ++---
>  .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.h|  5 ++-
>  .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10_3.c  |  2 +-
>  .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c | 28 ++---
>  .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h |  5 ++-
>  .../drm/amd/amdkfd/kfd_packet_manager_v9.c| 40 +--
>  .../gpu/drm/amd/include/kgd_kfd_interface.h   |  5 ++-
>  10 files changed, 70 insertions(+), 51 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c
> index 8dfdb18197c4..53f5f1885870 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c
> @@ -189,7 +189,7 @@ const struct kfd2kgd_calls aldebaran_kfd2kgd = {
>   .set_address_watch = kgd_gfx_aldebaran_set_address_watch,
>   .clear_address_watch = kgd_gfx_v9_clear_address_watch,
>   .get_iq_wait_times = kgd_gfx_v9_get_iq_wait_times,
> - .build_grace_period_packet_info = kgd_gfx_v9_build_grace_period_packet_info,
> + .build_dequeue_wait_counts_packet_info = kgd_gfx_v9_build_dequeue_wait_counts_packet_info,
>   .program_trap_handler_settings = kgd_gfx_v9_program_trap_handler_settings,
>   .hqd_get_pq_addr = kgd_gfx_v9_hqd_get_pq_addr,
>   .hqd_reset = kgd_gfx_v9_hqd_reset,
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c
> index 9abf29b58ac7..6fd41aece7e1 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c
> @@ -415,7 +415,7 @@ const struct kfd2kgd_calls arcturus_kfd2kgd = {
>   .set_address_watch = kgd_gfx_v9_set_address_watch,
>   .clear_address_watch = kgd_gfx_v9_clear_address_watch,
>   .get_iq_wait_times = kgd_gfx_v9_get_iq_wait_times,
> - .build_grace_period_packet_info = kgd_gfx_v9_build_grace_period_packet_info,
> + .build_dequeue_wait_counts_packet_info = kgd_gfx_v9_build_dequeue_wait_counts_packet_info,
>   .get_cu_occupancy = kgd_gfx_v9_get_cu_occupancy,
>   .program_trap_handler_settings = kgd_gfx_v9_program_trap_handler_settings,
>   .hqd_get_pq_addr = kgd_gfx_v9_hqd_get_pq_addr,
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gc_9_4_3.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gc_9_4_3.c
> index e2ae714a700f..95f249896275 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gc_9_4_3.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gc_9_4_3.c
> @@ -530,8 +530,8 @@ const struct kfd2kgd_calls gc_9_4_3_kfd2kgd = {
>   .get_cu_occupancy = kgd_gfx_v9_get_cu_occupancy,
>   .program_trap_handler_settings =
>   kgd_gfx_v9_program_trap_handler_settings,
> - .build_grace_period_packet_info =
> - kgd_gfx_v9_build_grace_period_packet_info,
> + .build_dequeue_wait_counts_packet_info =
> + kgd_gfx_v9_build_dequeue_wait_counts_packet_info,
>   .get_iq_wait_times = kgd_gfx_v9_get_iq_wait_times,
>   .enable_debug_trap = kgd_aldebaran_enable_debug_trap,
>   .disable_debug_trap = kgd_gfx_v9_4_3_disable_debug_trap,
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
> index 62176d607bef..0b03f2e9a858 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
> @@ -1021,25 +1021,25 @@ void kgd_gfx_v10_get_iq_wait_times(struct amdgpu_device *adev,
>   *wait_times = RREG32(SOC15_REG_OFFSET(GC, 0, mmCP_IQ_WAIT_TIME2));
>  }
>
> -void kgd_gfx_v10_build_grace_period_packet_info(struct amdgpu_device *adev,
> +void kgd_gfx_v10_build_dequeue_wait_counts_packet_info(struct amdgpu_device *adev,
>   uint32_t wait_

Re: [PATCH 1/2] drm/amd/display: Add a temp w/a for a panel

2025-03-03 Thread Pillai, Aurabindo
[AMD Official Use Only - AMD Internal Distribution Only]

Thanks Alex, I'll fix that and push.



--

Regards,
Jay

From: Hung, Alex 
Sent: Monday, March 3, 2025 4:48 PM
To: Pillai, Aurabindo ; amd-gfx@lists.freedesktop.org 

Cc: Wentland, Harry ; Li, Sun peng (Leo) 
; Deucher, Alexander 
Subject: Re: [PATCH 1/2] drm/amd/display: Add a temp w/a for a panel

This has conflicts with the latest asdn and a minor spacing issue below.

The others look good to me.

Reviewed-by: Alex Hung 

On 3/3/25 06:50, Aurabindo Pillai wrote:
> Implement a w/a for a panel which requires a 10s delay after link detect.
>
> Signed-off-by: Aurabindo Pillai 
> ---
>   .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 24 ++-
>   .../amd/display/amdgpu_dm/amdgpu_dm_helpers.c | 10 ++--
>   drivers/gpu/drm/amd/display/dc/dc_types.h |  1 +
>   3 files changed, 32 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
> index 73c95c3c39f9..3a2843e3367e 100644
> --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
> +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
> @@ -3327,6 +3327,21 @@ static void dm_gpureset_commit_state(struct dc_state *dc_state,
>}
>   }
>
> +static void apply_delay_after_dpcd_poweroff(struct amdgpu_device *adev, struct dc_sink *sink) {
> + struct dc_panel_patch *ppatch = NULL;
> +
> + if (!sink)
> + return;
> +
> + ppatch = &sink->edid_caps.panel_patch;
> + if (ppatch->wait_after_dpcd_poweroff_ms) {
> + msleep(ppatch->wait_after_dpcd_poweroff_ms);
> + drm_dbg_driver(adev_to_drm(adev), "%s: adding a %ds delay as w/a for panel\n",
> + __func__,
> + ppatch->wait_after_dpcd_poweroff_ms / 1000);
This doesn't look aligned to me, and maybe the two lines can fit into
one. But this is just my opinion.

> + }
> +}
> +
>   static int dm_resume(struct amdgpu_ip_block *ip_block)
>   {
>struct amdgpu_device *adev = ip_block->adev;
> @@ -3448,6 +3463,7 @@ static int dm_resume(struct amdgpu_ip_block *ip_block)
>/* Do detection*/
>drm_connector_list_iter_begin(ddev, &iter);
>drm_for_each_connector_iter(connector, &iter) {
> + bool ret;
>
>if (connector->connector_type == DRM_MODE_CONNECTOR_WRITEBACK)
>continue;
> @@ -3473,7 +3489,11 @@ static int dm_resume(struct amdgpu_ip_block *ip_block)
>} else {
>guard(mutex)(&dm->dc_lock);
>dc_exit_ips_for_hw_access(dm->dc);
> - dc_link_detect(aconnector->dc_link, DETECT_REASON_RESUMEFROMS3S4);
> + ret = dc_link_detect(aconnector->dc_link, DETECT_REASON_RESUMEFROMS3S4);
> + if (ret) {
> + /* w/a delay for certain panels */
> + apply_delay_after_dpcd_poweroff(adev, aconnector->dc_sink);
> + }
>}
>
>if (aconnector->fake_enable && aconnector->dc_link->local_sink)
> @@ -3834,6 +3854,8 @@ static void handle_hpd_irq_helper(struct amdgpu_dm_connector *aconnector)
>ret = dc_link_detect(aconnector->dc_link, DETECT_REASON_HPD);
>}
>if (ret) {
> + /* w/a delay for certain panels */
> + apply_delay_after_dpcd_poweroff(adev, aconnector->dc_sink);
>amdgpu_dm_update_connector_after_detect(aconnector);
>
>drm_modeset_lock_all(dev);
> diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_helpers.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_helpers.c
> index fbd80d8545a8..253aac93e3d8 100644
> --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_helpers.c
> +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_helpers.c
> @@ -55,11 +55,16 @@ static u32 edid_extract_panel_id(struct edid *edid)
>   (u32)EDID_PRODUCT_ID(edid);
>   }
>
> -static void apply_edid_quirks(struct edid *edid, struct dc_edid_caps *edid_caps)
> +static void apply_edid_quirks(struct drm_device *dev, struct edid *edid, struct dc_edid_caps *edid_caps)
>   {
>uint32_t panel_id = edid_extract_panel_id(edid);
>
>switch (panel_id) {
> + /* Workaround for monitors that need a delay after detecting the link */
> + case drm_edid_encode_panel_id('G', 'B', 'T', 0x3215):
> + drm_dbg_driver(dev, "Add 10s delay for link detection for panel id %X\n", panel_id);
> + edid_caps->panel_patch.wait_after_dpcd_poweroff_ms = 1;
> + break;
>/* Workaround for some monitors which does not work well with FAMS */
>case drm_edid_encode_panel_id('S', 'A', 'M', 0x0E5E):
>case drm_edid_encode_panel_id('

Re: [PATCH] drm/amdkfd: Change error handling at prange update in svm_range_set_attr

2025-03-03 Thread Felix Kuehling


On 2025-01-31 11:58, Xiaogang.Chen wrote:
> From: Xiaogang Chen 
>
> When registering a vm range at svm, the added vm range may be split into
> multiple subranges and/or existing pranges may get split. The new pranges need
> to be validated and mapped. This patch changes error handling for pranges that
> fail updating:

It may help if you clearly state the problem you're trying to solve to justify 
the changes in error handling. See more comments inline.


>
> 1: free prange resources and remove it from svms if its updating fails as it
> will not be used.
> 2: return -EAGAIN only when all pranges at update_list need to redo
> validate/map; otherwise return the non -EAGAIN error to user space to indicate
> failure. That removes unnecessary retries.
>
> Signed-off-by: Xiaogang Chen 
> ---
>  drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 27 +++
>  1 file changed, 23 insertions(+), 4 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c 
> b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
> index e32e19196f6b..455cb98bf16a 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
> @@ -3716,8 +3716,19 @@ svm_range_set_attr(struct kfd_process *p, struct 
> mm_struct *mm,
>  
>  out_unlock_range:
>   mutex_unlock(&prange->migrate_mutex);
> - if (r)
> - ret = r;
> + /* this prange cannot be migrated, validated or mapped */
> + if (r) {
> + /* free this prange resources, remove it from svms */
> + svm_range_unlink(prange);
> + svm_range_remove_notifier(prange);
> + svm_range_free(prange, false);

Freeing the prange removes SVM mappings from the process. This will break the 
subsequent execution of the application. In case you were going to return 
-EAGAIN that's definitely wrong because the application would expect the SVM 
range to work after a successful retry.

I'm not sure the resource waste is a valid argument in case of a fatal error. I 
would expect the application to terminate anyways in this case, which would 
result in freeing the resources. Do you see a scenario where an application 
wants to continue running after this function returned a fatal error?


> +
> + /* ret gets updated when any r != -EAGAIN;
> +  * return -EAGAIN only when all pranges at update_list
> +  * need to redo validate/map */
> + if (r != -EAGAIN || !ret)
> + ret = r;

This is a good point. But the explanation is a bit misleading: "all pranges ... 
need redo" is not really true. There may also be ranges that were validated 
successfully. I think the point you're trying to make is this: Don't return 
-EAGAIN if there was any previous fatal error found.

I could potentially see a different optimization. If you encounter a fatal 
error, you can skip the rest of the ranges and return the error immediately.
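
Roughly, a minimal sketch of that short-circuit (untested, written against
the loop locals quoted above; "out_free" is a hypothetical cleanup label):

		if (r && r != -EAGAIN) {
			/* fatal error: stop processing the remaining
			 * pranges and report the failure right away
			 */
			ret = r;
			goto out_free;
		}
		if (r == -EAGAIN)
			ret = r;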


> + }
>   }
>  
>   list_for_each_entry(prange, &remap_list, update_list) {
> @@ -3729,8 +3740,16 @@ svm_range_set_attr(struct kfd_process *p, struct 
> mm_struct *mm,
>   if (r)
>   pr_debug("failed %d on remap svm range\n", r);
>   mutex_unlock(&prange->migrate_mutex);
> - if (r)
> - ret = r;
> +
> + if (r) {
> + /* remove this prange */
> + svm_range_unlink(prange);
> + svm_range_remove_notifier(prange);
> + svm_range_free(prange, false);

Same as above.

Regards,
  Felix


> +
> + if (r != -EAGAIN || !ret)
> + ret = r;
> + }
>   }
>  
>   dynamic_svm_range_dump(svms);


RE: [PATCH V6 2/3] drm/amdgpu: Optimize VM invalidation engine allocation and synchronize GPU TLB flush

2025-03-03 Thread Zhang, Jesse(Jie)
[AMD Official Use Only - AMD Internal Distribution Only]

Ping on this series?

-Original Message-
From: jesse.zh...@amd.com 
Sent: Friday, February 28, 2025 6:27 PM
To: amd-gfx@lists.freedesktop.org
Cc: Deucher, Alexander ; Koenig, Christian 
; Lazar, Lijo ; Zhu, Jiadong 
; Zhang, Jesse(Jie) ; Zhang, 
Jesse(Jie) 
Subject: [PATCH V6 2/3] drm/amdgpu: Optimize VM invalidation engine allocation 
and synchronize GPU TLB flush

From: "jesse.zh...@amd.com" 

- Modify the VM invalidation engine allocation logic to handle SDMA page
  rings. SDMA page rings now share the VM invalidation engine with SDMA gfx
  rings instead of allocating a separate engine. This change ensures
  efficient resource management and avoids the issue of insufficient VM
  invalidation engines.

- Add synchronization for GPU TLB flush operations in gmc_v9_0.c.
  Use spin_lock and spin_unlock to ensure thread safety and prevent race
  conditions during TLB flush operations. This improves the stability and
  reliability of the driver, especially in multi-threaded environments.

 v2: replace the sdma ring check with a function `amdgpu_sdma_is_page_queue`
     to check if a ring is an SDMA page queue. (Lijo)
 v3: Add GC version check, only enabled on GC 9.4.3/9.4.4/9.5.0
 v4: Fix code style and add more detailed description (Christian)

Suggested-by: Lijo Lazar 
Signed-off-by: Jesse Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c  | 12 ++++++++++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c | 25 ++++++++++++++++++++++++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h |  1 +
 3 files changed, 37 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
index 4eefa17fa39b..aad3c5ea8557 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
@@ -602,8 +602,20 @@ int amdgpu_gmc_allocate_vm_inv_eng(struct amdgpu_device *adev)
 		return -EINVAL;
 	}
 
+	if (amdgpu_sdma_is_shared_inv_eng(adev, ring)) {
+		/* SDMA has a special packet which allows it to use the same
+		 * invalidation engine for all the rings in one instance.
+		 * Therefore, we do not allocate a separate VM invalidation engine
+		 * for SDMA page rings. Instead, they share the VM invalidation
+		 * engine with the SDMA gfx ring. This change ensures efficient
+		 * resource management and avoids the issue of insufficient VM
+		 * invalidation engines.
+		 */
+		ring->vm_inv_eng = inv_eng - 1;
+	} else {
 		ring->vm_inv_eng = inv_eng - 1;
 		vm_inv_engs[vmhub] &= ~(1 << ring->vm_inv_eng);
+	}

 	dev_info(adev->dev, "ring %s uses VM inv eng %u on hub %u\n",
 		 ring->name, ring->vm_inv_eng, ring->vm_hub);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
index 42a7b86e41c3..9b958d6202bc 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
@@ -462,6 +462,29 @@ void amdgpu_sdma_sysfs_reset_mask_fini(struct amdgpu_device *adev)
 	}
 }
 
+/**
+* amdgpu_sdma_is_shared_inv_eng - Check if a ring is an SDMA ring that shares a VM invalidation engine
+* @adev: Pointer to the AMDGPU device structure
+* @ring: Pointer to the ring structure to check
+*
+* This function checks if the given ring is an SDMA ring that shares a VM invalidation engine.
+* It returns true if the ring is such an SDMA ring, false otherwise.
+*/
+bool amdgpu_sdma_is_shared_inv_eng(struct amdgpu_device *adev, struct amdgpu_ring *ring)
+{
+	int i = ring->me;
+
+	if (!adev->sdma.has_page_queue || i >= adev->sdma.num_instances)
+		return false;
+
+	if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) ||
+	    amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4) ||
+	    amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 5, 0))
+		return (ring == &adev->sdma.instance[i].ring);
+	else
+		return false;
+}
+
 /**
  * amdgpu_sdma_register_on_reset_callbacks - Register SDMA reset callbacks
  * @funcs: Pointer to the callback structure containing pre_reset and post_reset functions
@@ -503,7 +526,7 @@ int amdgpu_sdma_reset_engine(struct amdgpu_device *adev, uint32_t instance_id, b
 {
 	struct sdma_on_reset_funcs *funcs;
 	int ret = 0;
-	struct amdgpu_sdma_instance *sdma_instance = &adev->sdma.instance[instance_id];;
+	struct amdgpu_sdma_instance *sdma_instance = &adev->sdma.instance[instance_id];
 	struct amdgpu_ring *gfx_ring = &sdma_instance->ring;
 	struct amdgpu_ring *page_ring = &sdma_instance->page;
 	bool gfx_sched_stopped = false, page_sched_stopped = false;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_s

Re: [PATCH] amdkfd: initialize svm lists at where they are defined

2025-03-03 Thread Zhu Lingshan
On 3/4/2025 1:49 PM, Felix Kuehling wrote:
> On 2025-02-21 4:23, Zhu Lingshan wrote:
>> This commit initializes svm lists where they are
>> defined. This is defensive programming for security
>> and consistency.
>>
>> Initializing variables ensures that their states are
>> always valid, avoiding issues caused by
>> uninitialized variables that could lead to
>> unpredictable behaviors.
> The lists are clearly documented as output parameters in the svm_range_add 
> function definition. I think it's more readable to do the list initialization 
> in svm_range_add to keep the logic of the caller more readable. One 
> suggestion inline that would move the initialization to the caller without 
> cluttering the calling function's code.
>
>
>> And we should not assume the callee would always
>> initialize them
>>
>> Signed-off-by: Zhu Lingshan 
>> ---
>>  drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 9 +
>>  1 file changed, 5 insertions(+), 4 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c 
>> b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
>> index bd3e20d981e0..cbc997449379 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
>> @@ -2130,11 +2130,7 @@ svm_range_add(struct kfd_process *p, uint64_t start, 
>> uint64_t size,
>>  
>>  pr_debug("svms 0x%p [0x%llx 0x%lx]\n", &p->svms, start, last);
>>  
>> -INIT_LIST_HEAD(update_list);
>> -INIT_LIST_HEAD(insert_list);
>> -INIT_LIST_HEAD(remove_list);
>>  INIT_LIST_HEAD(&new_list);
>> -INIT_LIST_HEAD(remap_list);
>>  
>>  node = interval_tree_iter_first(&svms->objects, start, last);
>>  while (node) {
>> @@ -3635,6 +3631,11 @@ svm_range_set_attr(struct kfd_process *p, struct 
>> mm_struct *mm,
>>  if (r)
>>  return r;
>>  
>> +INIT_LIST_HEAD(&update_list);
>> +INIT_LIST_HEAD(&insert_list);
>> +INIT_LIST_HEAD(&remove_list);
>> +INIT_LIST_HEAD(&remap_list);
>> +
> You could initialize these where they are defined by replacing the struct 
> list_head ... definitions with
>
>   LIST_HEAD(update_list);
>   LIST_HEAD(insert_list);
Yes, this is better. I will use LIST_HEAD and remove the initialization in
svm_range_add because we don't need to init them twice.
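
For reference, a minimal sketch of the difference (plain kernel list API,
nothing amdkfd-specific assumed):

	#include <linux/list.h>

	/* LIST_HEAD() defines and initializes the head in one statement;
	 * it expands to:
	 *   struct list_head update_list = LIST_HEAD_INIT(update_list);
	 */
	LIST_HEAD(update_list);

	/* ...replacing the two-step pattern, where a forgotten init leaves
	 * the head with garbage prev/next pointers:
	 */
	struct list_head insert_list;
	INIT_LIST_HEAD(&insert_list);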

By the way, I am not sure the lists are "output parameters"; usually an output
parameter should carry some information back for other functions, but here the
lists are just defined, not even initialized, and live on the stack. The callee
only fills the lists and the caller itself uses them. I suggest removing
"output parameters" from the code comments.

Thanks
Lingshan
>   ...
>
> Regards,
>   Felix
>
>
>>  svms = &p->svms;
>>  
>>  mutex_lock(&process_info->lock);



Re: [PATCH] amdkfd: initialize svm lists at where they are defined

2025-03-03 Thread Felix Kuehling
On 2025-02-21 4:23, Zhu Lingshan wrote:
> This commit initializes svm lists where they are
> defined. This is defensive programming for security
> and consistency.
>
> Initializing variables ensures that their states are
> always valid, avoiding issues caused by
> uninitialized variables that could lead to
> unpredictable behaviors.

The lists are clearly documented as output parameters in the svm_range_add 
function definition. I think it's more readable to do the list initialization 
in svm_range_add to keep the logic of the caller more readable. One suggestion 
inline that would move the initialization to the caller without cluttering the 
calling function's code.


>
> And we should not assume the callee would always
> initialize them
>
> Signed-off-by: Zhu Lingshan 
> ---
>  drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 9 +
>  1 file changed, 5 insertions(+), 4 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c 
> b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
> index bd3e20d981e0..cbc997449379 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
> @@ -2130,11 +2130,7 @@ svm_range_add(struct kfd_process *p, uint64_t start, 
> uint64_t size,
>  
>   pr_debug("svms 0x%p [0x%llx 0x%lx]\n", &p->svms, start, last);
>  
> - INIT_LIST_HEAD(update_list);
> - INIT_LIST_HEAD(insert_list);
> - INIT_LIST_HEAD(remove_list);
>   INIT_LIST_HEAD(&new_list);
> - INIT_LIST_HEAD(remap_list);
>  
>   node = interval_tree_iter_first(&svms->objects, start, last);
>   while (node) {
> @@ -3635,6 +3631,11 @@ svm_range_set_attr(struct kfd_process *p, struct 
> mm_struct *mm,
>   if (r)
>   return r;
>  
> + INIT_LIST_HEAD(&update_list);
> + INIT_LIST_HEAD(&insert_list);
> + INIT_LIST_HEAD(&remove_list);
> + INIT_LIST_HEAD(&remap_list);
> +

You could initialize these where they are defined by replacing the struct 
list_head ... definitions with

LIST_HEAD(update_list);
LIST_HEAD(insert_list);
...

Regards,
  Felix


>   svms = &p->svms;
>  
>   mutex_lock(&process_info->lock);


Re: [PATCH V6 2/3] drm/amdgpu: Optimize VM invalidation engine allocation and synchronize GPU TLB flush

2025-03-03 Thread Lazar, Lijo



On 2/28/2025 3:57 PM, jesse.zh...@amd.com wrote:
> From: "jesse.zh...@amd.com" 
> 
> - Modify the VM invalidation engine allocation logic to handle SDMA page 
> rings.
>   SDMA page rings now share the VM invalidation engine with SDMA gfx rings 
> instead of
>   allocating a separate engine. This change ensures efficient resource 
> management and
>   avoids the issue of insufficient VM invalidation engines.
> 
> - Add synchronization for GPU TLB flush operations in gmc_v9_0.c.
>   Use spin_lock and spin_unlock to ensure thread safety and prevent race 
> conditions
>   during TLB flush operations. This improves the stability and reliability of 
> the driver,
>   especially in multi-threaded environments.
> 
>  v2: replace the sdma ring check with a function `amdgpu_sdma_is_page_queue`
>  to check if a ring is an SDMA page queue.(Lijo)
> 
>  v3: Add GC version check, only enabled on GC9.4.3/9.4.4/9.5.0
>  v4: Fix code style and add more detailed description (Christian)
> 
> Suggested-by: Lijo Lazar 
> Signed-off-by: Jesse Zhang 
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c  | 12 ++++++++++++
>  drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c | 25 ++++++++++++++++++++++++-
>  drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h |  1 +
>  3 files changed, 37 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
> index 4eefa17fa39b..aad3c5ea8557 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
> @@ -602,8 +602,20 @@ int amdgpu_gmc_allocate_vm_inv_eng(struct amdgpu_device *adev)
>   return -EINVAL;
>   }
>  
> + if (amdgpu_sdma_is_shared_inv_eng(adev, ring)) {
> + /* SDMA has a special packet which allows it to use the same
> +  * invalidation engine for all the rings in one instance.
> +  * Therefore, we do not allocate a separate VM invalidation engine
> +  * for SDMA page rings. Instead, they share the VM invalidation
> +  * engine with the SDMA gfx ring. This change ensures efficient
> +  * resource management and avoids the issue of insufficient VM
> +  * invalidation engines.
> +  */
> + ring->vm_inv_eng = inv_eng - 1;

As mentioned in a previous comment also, strongly discourage to assume
vm_inv_eng based just on loop order. If two rings of the same instance
need to share the inv_eng, call them out explicitly and assign.
Shouldn't have any implicit assumption that the previous/next ring in
the loop order will be part of the same engine.
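
For illustration, an explicit pairing could look roughly like this
(untested sketch; it assumes the gfx ring of the same instance has
already been assigned its engine by the time the page ring is handled):

	struct amdgpu_sdma_instance *inst = &adev->sdma.instance[ring->me];

	if (adev->sdma.has_page_queue && ring == &inst->page) {
		/* explicitly reuse the engine assigned to the gfx ring of
		 * the same SDMA instance, instead of assuming the previous
		 * loop iteration happened to be that gfx ring
		 */
		ring->vm_inv_eng = inst->ring.vm_inv_eng;
	} else {
		ring->vm_inv_eng = inv_eng - 1;
		vm_inv_engs[vmhub] &= ~(1 << ring->vm_inv_eng);
	}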

Thanks,
Lijo

> + } else {
>   ring->vm_inv_eng = inv_eng - 1;
>   vm_inv_engs[vmhub] &= ~(1 << ring->vm_inv_eng);
> + }
>  
>   dev_info(adev->dev, "ring %s uses VM inv eng %u on hub %u\n",
>ring->name, ring->vm_inv_eng, ring->vm_hub);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
> index 42a7b86e41c3..9b958d6202bc 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
> @@ -462,6 +462,29 @@ void amdgpu_sdma_sysfs_reset_mask_fini(struct amdgpu_device *adev)
>   }
>  }
>  
> +/**
> +* amdgpu_sdma_is_shared_inv_eng - Check if a ring is an SDMA ring that shares a VM invalidation engine
> +* @adev: Pointer to the AMDGPU device structure
> +* @ring: Pointer to the ring structure to check
> +*
> +* This function checks if the given ring is an SDMA ring that shares a VM invalidation engine.
> +* It returns true if the ring is such an SDMA ring, false otherwise.
> +*/
> +bool amdgpu_sdma_is_shared_inv_eng(struct amdgpu_device *adev, struct amdgpu_ring *ring)
> +{
> + int i = ring->me;
> +
> + if (!adev->sdma.has_page_queue || i >= adev->sdma.num_instances)
> + return false;
> +
> + if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) ||
> + amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4) ||
> + amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 5, 0))
> + return (ring == &adev->sdma.instance[i].ring);
> + else
> + return false;
> +}
> +
>  /**
>   * amdgpu_sdma_register_on_reset_callbacks - Register SDMA reset callbacks
>   * @funcs: Pointer to the callback structure containing pre_reset and post_reset functions
> @@ -503,7 +526,7 @@ int amdgpu_sdma_reset_engine(struct amdgpu_device *adev, uint32_t instance_id, b
>  {
>   struct sdma_on_reset_funcs *funcs;
>   int ret = 0;
> - struct amdgpu_sdma_instance *sdma_instance = &adev->sdma.instance[instance_id];;
> + struct amdgpu_sdma_instance *sdma_instance = &adev->sdma.instance[instance_id];
>   struct amdgpu_ring *gfx_ring = &sdma_instance->ring;
>   struct amdgpu_ring *page_ring = &sdma_instance->page;
>   bool gfx_sched_stopped = false, page_sched_

[PATCH] drm/amdkfd: remove unnecessary cpu domain validation

2025-03-03 Thread James Zhu
before moving to GTT domain.

Signed-off-by: James Zhu 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 6 --
 1 file changed, 6 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index 62ca12e94581..2ac6d4fa0601 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -595,12 +595,6 @@ kfd_mem_dmamap_dmabuf(struct kfd_mem_attachment 
*attachment)
 {
struct ttm_operation_ctx ctx = {.interruptible = true};
struct amdgpu_bo *bo = attachment->bo_va->base.bo;
-   int ret;
-
-   amdgpu_bo_placement_from_domain(bo, AMDGPU_GEM_DOMAIN_CPU);
-   ret = ttm_bo_validate(&bo->tbo, &bo->placement, &ctx);
-   if (ret)
-   return ret;
 
amdgpu_bo_placement_from_domain(bo, AMDGPU_GEM_DOMAIN_GTT);
return ttm_bo_validate(&bo->tbo, &bo->placement, &ctx);
-- 
2.34.1



Re: [PATCH] drm/amdkfd: remove unnecessary cpu domain validation

2025-03-03 Thread Christian König
On 03.03.25 at 19:45, James Zhu wrote:
> before moving to GTT domain.

That might not be unnecessary. We sometimes intentionally move BOs to the CPU 
domain to invalidate all VM mappings.

Christian.
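
For context, the bounce being removed reads like this; as far as I
understand the TTM side, the CPU-domain validate is what tears down the
old GTT binding (and with it the existing VM mappings) before the BO is
validated back into GTT:

	amdgpu_bo_placement_from_domain(bo, AMDGPU_GEM_DOMAIN_CPU);
	ret = ttm_bo_validate(&bo->tbo, &bo->placement, &ctx);	/* unbind */
	if (ret)
		return ret;

	amdgpu_bo_placement_from_domain(bo, AMDGPU_GEM_DOMAIN_GTT);
	return ttm_bo_validate(&bo->tbo, &bo->placement, &ctx);	/* rebind */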

>
> Signed-off-by: James Zhu 
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 6 --
>  1 file changed, 6 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> index 62ca12e94581..2ac6d4fa0601 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> @@ -595,12 +595,6 @@ kfd_mem_dmamap_dmabuf(struct kfd_mem_attachment 
> *attachment)
>  {
>   struct ttm_operation_ctx ctx = {.interruptible = true};
>   struct amdgpu_bo *bo = attachment->bo_va->base.bo;
> - int ret;
> -
> - amdgpu_bo_placement_from_domain(bo, AMDGPU_GEM_DOMAIN_CPU);
> - ret = ttm_bo_validate(&bo->tbo, &bo->placement, &ctx);
> - if (ret)
> - return ret;
>  
>   amdgpu_bo_placement_from_domain(bo, AMDGPU_GEM_DOMAIN_GTT);
>   return ttm_bo_validate(&bo->tbo, &bo->placement, &ctx);