The following error messages showed up on an APU and a dGPU during testing.

<3> [100.231411] BUG: sleeping function called from invalid context at include/linux/sched/mm.h:321 <3> [100.231414] in_atomic(): 1, irqs_disabled(): 0, non_block: 0, pid: 1711, name: kms_color
<3> [100.231416] preempt_count: 2, expected: 0
<3> [100.231417] RCU nest depth: 0, expected: 0
<3> [100.231418] Preemption disabled at:
<3> [100.231419] [<ffffffffc0c2843b>] dc_fpu_begin+0x2b/0xc0 [amdgpu]
<4> [100.231626] CPU: 4 UID: 0 PID: 1711 Comm: kms_color Tainted: G W 6.12.0+ #1
<4> [100.231629] Tainted: [W]=WARN
<4> [100.231631] Call Trace:
<4> [100.231632]  <TASK>
<4> [100.231633]  dump_stack_lvl+0x5b/0x70
<4> [100.231638]  dump_stack+0x10/0x20
<4> [100.231639]  __might_resched+0x170/0x1d0
<4> [100.231643]  __might_sleep+0x44/0x70
<4> [100.231645]  __alloc_pages_noprof+0x22f/0x370
<4> [100.231649]  ___kmalloc_large_node+0x95/0x150
<4> [100.231651]  ? preempt_count_add+0x4e/0xc0
<4> [100.231653]  __kmalloc_large_noprof+0x1d/0xb0
<4> [100.231655]  dml2_create_copy+0x27/0x60 [amdgpu]
<4> [100.231827]  dc_state_create_copy+0x7e/0x170 [amdgpu]
<4> [100.231995]  update_planes_and_stream_state+0x23c/0x600 [amdgpu]
<4> [100.232189]  update_planes_and_stream_v2+0x22b/0x530 [amdgpu]
<4> [100.232366]  ? amdgpu_dm_atomic_commit_tail+0x1310/0x4100 [amdgpu]
<4> [100.232569]  ? commit_tail+0x96/0x140 [drm_kms_helper]
<4> [100.232577]  dc_update_planes_and_stream+0x5b/0xe0 [amdgpu]
<4> [100.232730]  amdgpu_dm_atomic_commit_tail+0x1fa7/0x4100 [amdgpu]
<4> [100.232908]  ? stack_depot_save_flags+0x2c/0x730
<4> [100.232915]  ? wait_for_completion_timeout+0x1d/0x30
<4> [100.232917]  commit_tail+0x96/0x140 [drm_kms_helper]
<4> [100.232923]  drm_atomic_helper_commit+0x12b/0x150 [drm_kms_helper]
<4> [100.232927]  drm_atomic_commit+0xad/0xe0 [drm]
<4> [100.232939]  ? __pfx___drm_printfn_info+0x10/0x10 [drm]
<4> [100.232956]  drm_atomic_helper_set_config+0x80/0xc0 [drm_kms_helper]
<4> [100.232961]  drm_mode_setcrtc+0x22e/0x910 [drm]
<4> [100.232975]  ? kfree+0x18f/0x350
<4> [100.232977]  ? __pfx_drm_mode_setcrtc+0x10/0x10 [drm]
<4> [100.232987]  drm_ioctl_kernel+0xa7/0x100 [drm]
<4> [100.233004]  drm_ioctl+0x29d/0x500 [drm]
<4> [100.233015]  ? __pfx_drm_mode_setcrtc+0x10/0x10 [drm]
<4> [100.233026]  ? _raw_spin_unlock_irqrestore+0x1f/0x40
<4> [100.233029]  amdgpu_drm_ioctl+0x4b/0x80 [amdgpu]
<4> [100.233131]  __x64_sys_ioctl+0x92/0xd0
<4> [100.233133]  x64_sys_call+0x1205/0x20d0
<4> [100.233136]  do_syscall_64+0x50/0x110
<4> [100.233138]  entry_SYSCALL_64_after_hwframe+0x76/0x7e
<4> [100.233142] RIP: 0033:0x7fb21e71a94f
<4> [100.233144] Code: 00 48 89 44 24 18 31 c0 48 8d 44 24 60 c7 04 24 10 00 00 00 48 89 44 24 08 48 8d 44 24 20 48 89 44 24 10 b8 10 00 00 00 0f 05 <41> 89 c0 3d 00 f0 ff ff 77 1f 48 8b 44 24 18 64 48 2b 04 25 28 00 <4> [100.233145] RSP: 002b:00007ffdd9a52e50 EFLAGS: 00000246 ORIG_RAX: 0000000000000010 <4> [100.233148] RAX: ffffffffffffffda RBX: 00007ffdd9a52ee0 RCX: 00007fb21e71a94f <4> [100.233149] RDX: 00007ffdd9a52ee0 RSI: 00000000c06864a2 RDI: 0000000000000005 <4> [100.233149] RBP: 00000000c06864a2 R08: 0000000000000000 R09: 00005609537f7b08 <4> [100.233150] R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000 <4> [100.233151] R13: 0000000000000005 R14: 0000000000000000 R15: 00005609537e2848
<4> [100.233152]  </TASK>

On 3/25/25 18:02, Alex Hung wrote:


On 3/19/25 22:23, Huacai Chen wrote:
Hi, Alex,

On Thu, Mar 20, 2025 at 10:16 AM Alex Hung <alex.h...@amd.com> wrote:



On 3/18/25 05:17, Huacai Chen wrote:
Commit 7da55c27e76749b9 ("drm/amd/display: Remove incorrect FP context
start") removes the FP context protection of dml2_create(), and it said
"All the DC_FP_START/END should be used before call anything from DML2".

However, dml2_create()/dml2_copy()/dml2_create_copy() are not protected
from their callers, causing such errors:

   do_fpu invoked from kernel context![#1]:
   CPU: 0 UID: 0 PID: 239 Comm: kworker/0:5 Not tainted 6.14.0-rc6+ #1
   Workqueue: events work_for_cpu_fn
   pc ffff80000319de80 ra ffff80000319de5c tp 900000010575c000 sp 900000010575f840    a0 0000000000000000 a1 900000012f210130 a2 900000012f000000 a3 ffff80000357e268    a4 ffff80000357e260 a5 900000012ea52cf0 a6 0000000400000004 a7 0000012c00001388    t0 00001900000015e0 t1 ffff80000379d000 t2 0000000010624dd3 t3 0000006400000014    t4 00000000000003e8 t5 0000005000000018 t6 0000000000000020 t7 0000000f00000064    t8 000000000000002f u0 5f5e9200f8901912 s9 900000012d380010 s0 900000012ea51fd8    s1 900000012f000000 s2 9000000109296000 s3 0000000000000001 s4 0000000000001fd8    s5 0000000000000001 s6 ffff800003415000 s7 900000012d390000 s8 ffff800003211f80       ra: ffff80000319de5c dml21_apply_soc_bb_overrides+0x3c/0x960 [amdgpu]      ERA: ffff80000319de80 dml21_apply_soc_bb_overrides+0x60/0x960 [amdgpu]
    CRMD: 000000b0 (PLV0 -IE -DA +PG DACF=CC DACM=CC -WE)
    PRMD: 00000004 (PPLV0 +PIE -PWE)
    EUEN: 00000000 (-FPE -SXE -ASXE -BTE)
    ECFG: 00071c1d (LIE=0,2-4,10-12 VS=7)
   ESTAT: 000f0000 [FPD] (IS= ECode=15 EsubCode=0)
    PRID: 0014d010 (Loongson-64bit, Loongson-3C6000/S)
   Process kworker/0:5 (pid: 239, threadinfo=00000000927eadc6, task=000000008fd31682)    Stack : 00040dc000003164 0000000000000001 900000012f210130 900000012eabeeb8            900000012f000000 ffff80000319fe48 900000012f210000 900000012f210130            900000012f000000 900000012eabeeb8 0000000000000001 ffff8000031a0064            900000010575f9f0 900000012f210130 900000012eac0000 900000012ea80000            900000012f000000 ffff8000031cefc4 900000010575f9f0 ffff8000035859c0            ffff800003414000 900000010575fa78 900000012f000000 ffff8000031b4c50            0000000000000000 9000000101c9d700 9000000109c40000 5f5e9200f8901912            900000012d3c4bd0 900000012d3c5000 ffff8000034aed18 900000012d380010            900000012d3c4bd0 ffff800003414000 900000012d380000 ffff800002ea49dc            0000000000000001 900000012d3c6000 00000000ffffe423 0000000000010000
           ...
   Call Trace:
   [<ffff80000319de80>] dml21_apply_soc_bb_overrides+0x60/0x960 [amdgpu]
   [<ffff80000319fe44>] dml21_init+0xa4/0x280 [amdgpu]
   [<ffff8000031a0060>] dml21_create+0x40/0x80 [amdgpu]
   [<ffff8000031cefc0>] dc_state_create+0x100/0x160 [amdgpu]
   [<ffff8000031b4c4c>] dc_create+0x44c/0x640 [amdgpu]
   [<ffff800002ea49d8>] amdgpu_dm_init+0x3f8/0x2060 [amdgpu]
   [<ffff800002ea6658>] dm_hw_init+0x18/0x60 [amdgpu]
   [<ffff800002b16738>] amdgpu_device_init+0x1938/0x27e0 [amdgpu]
   [<ffff800002b18e80>] amdgpu_driver_load_kms+0x20/0xa0 [amdgpu]
   [<ffff800002b0c8f0>] amdgpu_pci_probe+0x1b0/0x580 [amdgpu]
   [<900000000448eae4>] local_pci_probe+0x44/0xc0
   [<9000000003b02b18>] work_for_cpu_fn+0x18/0x40
   [<9000000003b05da0>] process_one_work+0x160/0x300
   [<9000000003b06718>] worker_thread+0x318/0x440
   [<9000000003b11b8c>] kthread+0x12c/0x220
   [<9000000003ac1484>] ret_from_kernel_thread+0x8/0xa4

So protect dml2_create()/dml2_copy()/dml2_create_copy() with DC_FP_START
and DC_FP_END.

Hi Huacai,

Can you try to put DC_FP_START DC_FP_END in the
dml2_create()/dml2_copy()/dml2_create_copy()/dml2_validate() instead?
The code will be cleaner and less error-prone to future changes.
At first I want to add them in
dml2_create()/dml2_copy()/dml2_create_copy()/dml2_validate(), but
commit 7da55c27e76749b9 ("drm/amd/display: Remove incorrect FP context
start")  said
that "All the DC_FP_START/END should be used before call anything from DML2".

Huacai

Thanks. I sent them to testing, and will share results later.



Thanks.



Cc: sta...@vger.kernel.org
Signed-off-by: Huacai Chen <chenhua...@loongson.cn>
---
   drivers/gpu/drm/amd/display/dc/core/dc_state.c | 16 ++++++++++++++++
   1 file changed, 16 insertions(+)

diff --git a/drivers/gpu/drm/amd/display/dc/core/dc_state.c b/ drivers/gpu/drm/amd/display/dc/core/dc_state.c
index 1b2cce127981..6e2cac08002d 100644
--- a/drivers/gpu/drm/amd/display/dc/core/dc_state.c
+++ b/drivers/gpu/drm/amd/display/dc/core/dc_state.c
@@ -210,17 +210,23 @@ struct dc_state *dc_state_create(struct dc *dc, struct dc_state_create_params *p

   #ifdef CONFIG_DRM_AMD_DC_FP
       if (dc->debug.using_dml2) {
+             DC_FP_START();
+
               dml2_opt->use_clock_dc_limits = false;
               if (!dml2_create(dc, dml2_opt, &state->bw_ctx.dml2)) {
+                     DC_FP_END();
                       dc_state_release(state);
                       return NULL;
               }

               dml2_opt->use_clock_dc_limits = true;
               if (!dml2_create(dc, dml2_opt, &state- >bw_ctx.dml2_dc_power_source)) {
+                     DC_FP_END();
                       dc_state_release(state);
                       return NULL;
               }
+
+             DC_FP_END();
       }
   #endif

@@ -240,6 +246,8 @@ void dc_state_copy(struct dc_state *dst_state, struct dc_state *src_state)
       dc_state_copy_internal(dst_state, src_state);

   #ifdef CONFIG_DRM_AMD_DC_FP
+     DC_FP_START();
+
       dst_state->bw_ctx.dml2 = dst_dml2;
       if (src_state->bw_ctx.dml2)
               dml2_copy(dst_state->bw_ctx.dml2, src_state- >bw_ctx.dml2); @@ -247,6 +255,8 @@ void dc_state_copy(struct dc_state *dst_state, struct dc_state *src_state)        dst_state->bw_ctx.dml2_dc_power_source = dst_dml2_dc_power_source;
       if (src_state->bw_ctx.dml2_dc_power_source)
               dml2_copy(dst_state->bw_ctx.dml2_dc_power_source, src_state->bw_ctx.dml2_dc_power_source);
+
+     DC_FP_END();
   #endif

       /* context refcount should not be overridden */
@@ -268,17 +278,23 @@ struct dc_state *dc_state_create_copy(struct dc_state *src_state)
       new_state->bw_ctx.dml2 = NULL;
       new_state->bw_ctx.dml2_dc_power_source = NULL;

+     DC_FP_START();
+
       if (src_state->bw_ctx.dml2 &&
                       !dml2_create_copy(&new_state->bw_ctx.dml2, src_state->bw_ctx.dml2)) {
+             DC_FP_END();
               dc_state_release(new_state);
               return NULL;
       }

       if (src_state->bw_ctx.dml2_dc_power_source &&
                       !dml2_create_copy(&new_state- >bw_ctx.dml2_dc_power_source, src_state- >bw_ctx.dml2_dc_power_source)) {
+             DC_FP_END();
               dc_state_release(new_state);
               return NULL;
       }
+
+     DC_FP_END();
   #endif

       kref_init(&new_state->refcount);



Reply via email to