An extra dma_fence_put() can drop the last reference to a fence while it
is still attached to a dma_resv object. This frees the fence prematurely
via dma_fence_release() while other users still hold the pointer.
Later accesses through dma_resv iteration may then operate on the freed
fence object, leading to refcount underflow warnings and potential hangs
when walking reservation fences.
Fix this by removing the extra dma_fence_put() calls in
amdgpu_userq_wait_ioctl(), correcting the fence lifetime so the dma_resv
object retains a valid reference until it is done with the fence.
[ 31.133803] refcount_t: underflow; use-after-free.
[ 31.133805] WARNING: lib/refcount.c:28 at
refcount_warn_saturate+0x58/0x90, CPU#18: kworker/u96:1/188
[ 31.133815] Modules linked in: snd_seq_dummy snd_hrtimer qrtr
binfmt_misc nls_iso8859_1 snd_hda_codec_alc882
snd_hda_codec_realtek_lib snd_hda_codec_generic snd_hda_codec_atihdmi
snd_hda_codec_hdmi snd_hda_intel amd_atl snd_hda_codec intel_rapl_msr
intel_rapl_common amdgpu snd_hda_core snd_intel_dspcfg amdxcp
snd_intel_sdw_acpi drm_panel_backlight_quirks snd_hwdep gpu_sched
drm_buddy snd_pcm drm_ttm_helper ttm drm_exec drm_suballoc_helper
snd_seq_midi drm_client_lib snd_seq_midi_event drm_display_helper
snd_rawmidi cec snd_seq edac_mce_amd ghash_clmulni_intel
snd_seq_device aesni_intel rc_core drm_kms_helper gigabyte_wmi
snd_timer wmi_bmof rapl k10temp video i2c_piix4 snd i2c_smbus
input_leds soundcore joydev ccp mac_hid sch_fq_codel msr parport_pc
ppdev lp parport drm efi_pstore nfnetlink dmi_sysfs autofs4
hid_generic usbhid hid nvme igb ahci i2c_algo_bit dca libahci
nvme_core wmi
[ 31.133932] CPU: 18 UID: 0 PID: 188 Comm: kworker/u96:1 Not
tainted 6.19.0-amd-staging-drm-next #28 PREEMPT(voluntary)
[ 31.133937] Hardware name: Gigabyte Technology Co., Ltd. X570
AORUS ELITE/X570 AORUS ELITE, BIOS F37c 05/12/2022
[ 31.133940] Workqueue: sdma1 drm_sched_run_job_work [gpu_sched]
[ 31.133951] RIP: 0010:refcount_warn_saturate+0x58/0x90
[ 31.133955] Code: 74 2f 83 fe 01 75 38 48 8d 3d a4 2c 91 01 67 48
0f b9 3a eb 36 48 8d 3d a6 2c 91 01 67 48 0f b9 3a eb 28 48 8d 3d a8
2c 91 01 <67> 48 0f b9 3a eb 1a 48 8d 3d aa 2c 91 01 67 48 0f b9 3a
eb 0c 48
[ 31.133959] RSP: 0018:ffffca16807dfd68 EFLAGS: 00010246
[ 31.133962] RAX: ffff89e988f05600 RBX: 0000000000000000 RCX:
0000000000000000
[ 31.133965] RDX: 0000000000000000 RSI: 0000000000000003 RDI:
ffffffffa1fd2f30
[ 31.133967] RBP: ffffca16807dfd68 R08: 0000000000000000 R09:
0000000000000000
[ 31.133969] R10: 0000000000000000 R11: 0000000000000000 R12:
ffff89e98edf1308
[ 31.133971] R13: ffff89e9d3001380 R14: ffff89e9dab5f800 R15:
ffff89e9dab5f880
[ 31.133974] FS: 0000000000000000(0000) GS:ffff89ed0cc3e000(0000)
knlGS:0000000000000000
[ 31.133976] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 31.133979] CR2: 00007f3050081c28 CR3: 0000000117f06000 CR4:
0000000000350ef0
[ 31.133982] Call Trace:
[ 31.133985] <TASK>
[ 31.133989] drm_sched_entity_pop_job+0x414/0x420 [gpu_sched]
[ 31.133997] drm_sched_run_job_work+0x15f/0x3c0 [gpu_sched]
[ 31.134003] process_scheduled_works+0x1f0/0x450
[ 31.134011] worker_thread+0x27f/0x370
[ 31.134016] kthread+0x1ed/0x210
[ 31.134020] ? __pfx_worker_thread+0x10/0x10
[ 31.134023] ? srso_return_thunk+0x5/0x5f
[ 31.134027] ? __pfx_kthread+0x10/0x10
[ 31.134031] ret_from_fork+0x10f/0x1b0
[ 31.134035] ? __pfx_kthread+0x10/0x10
[ 31.134039] ret_from_fork_asm+0x1a/0x30
[ 31.134047] </TASK>
[ 31.134049] ---[ end trace 0000000000000000 ]---
...
[ 56.544104] watchdog: BUG: soft lockup - CPU#9 stuck for 26s!
[glxgears:cs0:3483]
[ 56.544108] Modules linked in: snd_seq_dummy snd_hrtimer qrtr
binfmt_misc nls_iso8859_1 snd_hda_codec_alc882
snd_hda_codec_realtek_lib snd_hda_codec_generic snd_hda_codec_atihdmi
snd_hda_codec_hdmi snd_hda_intel amd_atl snd_hda_codec intel_rapl_msr
intel_rapl_common amdgpu snd_hda_core snd_intel_dspcfg amdxcp
snd_intel_sdw_acpi drm_panel_backlight_quirks snd_hwdep gpu_sched
drm_buddy snd_pcm drm_ttm_helper ttm drm_exec drm_suballoc_helper
snd_seq_midi drm_client_lib snd_seq_midi_event drm_display_helper
snd_rawmidi cec snd_seq edac_mce_amd ghash_clmulni_intel
snd_seq_device aesni_intel rc_core drm_kms_helper gigabyte_wmi
snd_timer wmi_bmof rapl k10temp video i2c_piix4 snd i2c_smbus
input_leds soundcore joydev ccp mac_hid sch_fq_codel msr parport_pc
ppdev lp parport drm efi_pstore nfnetlink dmi_sysfs autofs4
hid_generic usbhid hid nvme igb ahci i2c_algo_bit dca libahci
nvme_core wmi
[ 56.544166] CPU: 9 UID: 0 PID: 3483 Comm: glxgears:cs0 Tainted:
G W 6.19.0-amd-staging-drm-next #28 PREEMPT(voluntary)
[ 56.544170] Tainted: [W]=WARN
[ 56.544171] Hardware name: Gigabyte Technology Co., Ltd. X570
AORUS ELITE/X570 AORUS ELITE, BIOS F37c 05/12/2022
[ 56.544172] RIP: 0010:dma_resv_iter_walk_unlocked+0x4e/0x180
[ 56.544179] Code: 45 31 ed eb 0e 41 8b 46 08 41 3b 46 18 0f 83 23
01 00 00 49 8b 46 10 48 85 c0 74 20 48 8d 78 38 b9 ff ff ff ff f0 0f
c1 48 38 <83> f9 01 75 07 e8 78 ce ff ff eb 06 0f 8c e3 00 00 00 41
8b 46 1c
[ 56.544180] RSP: 0018:ffffca16865bb870 EFLAGS: 00000217
[ 56.544182] RAX: ffff89e997f38d80 RBX: 0000000000000005 RCX:
0000000000000006
[ 56.544183] RDX: 0000000000000001 RSI: 0000000000000000 RDI:
ffff89e997f38db8
[ 56.544184] RBP: ffffca16865bb898 R08: 0000000000000000 R09:
0000000000000000
[ 56.544185] R10: 0000000000000000 R11: 0000000000000000 R12:
ffffca16865bb8c0
[ 56.544186] R13: 0000000000000000 R14: ffffca16865bb8a8 R15:
ffff89e997f38d80
[ 56.544187] FS: 00007f8f8d3ff6c0(0000) GS:ffff89ed0c9fe000(0000)
knlGS:0000000000000000
[ 56.544189] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 56.544190] CR2: 00007f8f9b735020 CR3: 0000000117f06000 CR4:
0000000000350ef0
[ 56.544191] Call Trace:
[ 56.544193] <TASK>
[ 56.544197] dma_resv_wait_timeout+0x55/0x190
[ 56.544202] amdgpu_bo_kmap+0x3a/0xa0 [amdgpu]
[ 56.544502] amdgpu_userq_fence_read_wptr+0x130/0x2e0 [amdgpu]
[ 56.544670] amdgpu_userq_signal_ioctl+0x1f6/0x5e0 [amdgpu]
[ 56.544847] ? srso_return_thunk+0x5/0x5f
[ 56.544851] ? amdgpu_userq_wait_ioctl+0xab7/0xb80 [amdgpu]
[ 56.545021] ? __pfx_amdgpu_userq_signal_ioctl+0x10/0x10 [amdgpu]
[ 56.545190] drm_ioctl_kernel+0xd9/0x150 [drm]
[ 56.545222] drm_ioctl+0x29a/0x4a0 [drm]
[ 56.545245] ? __pfx_amdgpu_userq_signal_ioctl+0x10/0x10 [amdgpu]
[ 56.545422] ? srso_return_thunk+0x5/0x5f
[ 56.545426] amdgpu_drm_ioctl+0x46/0x90 [amdgpu]
[ 56.545595] __se_sys_ioctl+0x73/0xd0
[ 56.545600] __x64_sys_ioctl+0x1d/0x30
[ 56.545602] x64_sys_call+0x1715/0x2d00
[ 56.545604] do_syscall_64+0x7c/0x6a0
[ 56.545608] ? __pfx_amdgpu_userq_wait_ioctl+0x10/0x10 [amdgpu]
[ 56.545778] ? srso_return_thunk+0x5/0x5f
[ 56.545781] ? amdgpu_drm_ioctl+0x6c/0x90 [amdgpu]
[ 56.545950] ? srso_return_thunk+0x5/0x5f
Signed-off-by: Sunil Khatri <[email protected]>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c | 6 +-----
1 file changed, 1 insertion(+), 5 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c
b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c
index 146ca6d7f4f5..442c08b69f7c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c
@@ -882,12 +882,9 @@ int amdgpu_userq_wait_ioctl(struct drm_device
*dev, void *data,
* be good for now
*/
r = dma_fence_wait(fences[i], true);
- if (r) {
- dma_fence_put(fences[i]);
+ if (r)
goto free_fences;
- }
- dma_fence_put(fences[i]);
continue;
}
@@ -909,7 +906,6 @@ int amdgpu_userq_wait_ioctl(struct drm_device
*dev, void *data,
fence_info[cnt].va = fence_drv->va;
fence_info[cnt].value = fences[i]->seqno;
- dma_fence_put(fences[i]);
/* Increment the actual userq fence count */
cnt++;
}