> On 6 Sep 2021, at 17:04, Christian König <ckoenig.leichtzumer...@gmail.com> wrote:
>
> On 06.09.21 at 03:12, xinhui pan wrote:
>> A long time ago, someone reported that the system hung during a memory
>> test. In recent days, I have been trying to find and understand the
>> potential deadlock in the ttm/amdgpu code.
>>
>> This patchset aims to fix the deadlock during ttm populate.
>>
>> TTM has a parameter called pages_limit; when the allocated GTT memory
>> reaches this limit, swapout is triggered. As ttm_bo_swapout does not
>> return the correct retval, populate might hang.
>>
>> The UVD ib test uses GTT, which might be insufficient, so a gpu
>> recovery would hang if populate hangs.
>
> Ah, now I understand what you are trying to do.
>
> Problem is that won't work either. Allocating VRAM can easily land you
> inside the same deadlock.
>
> We need to avoid the allocation altogether for this to work correctly.

Looks like we need to reserve some pages at sw init.
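Roughly something like the sketch below is what I have in mind, assuming
a plain page array reserved once at driver sw init; the names
(recovery_pool, recovery_pool_init, RECOVERY_POOL_PAGES) are made up for
illustration, and the real change would of course have to go through the
TTM pool code:

/* Illustrative sketch only -- not the actual amdgpu/TTM interfaces. */
#include <linux/gfp.h>
#include <linux/mm.h>

#define RECOVERY_POOL_PAGES 64	/* hypothetical size of the reserve */

static struct page *recovery_pool[RECOVERY_POOL_PAGES];

/* Called once at sw init, while allocation can still succeed. */
static int recovery_pool_init(void)
{
	int i;

	for (i = 0; i < RECOVERY_POOL_PAGES; i++) {
		recovery_pool[i] = alloc_page(GFP_KERNEL);
		if (!recovery_pool[i])
			goto err_free;
	}
	return 0;

err_free:
	while (i--)
		__free_page(recovery_pool[i]);
	return -ENOMEM;
}

/* Called at sw fini; during gpu recovery the ib tests would populate
 * from this reserve instead of allocating under memory pressure. */
static void recovery_pool_fini(void)
{
	int i;

	for (i = 0; i < RECOVERY_POOL_PAGES; i++)
		__free_page(recovery_pool[i]);
}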
>
>> I have made one drm test which allocates two GTT BOs, submits gfx copy
>> commands and frees these BOs without waiting for the fence. What's
>> more, these gfx copy commands will cause the gfx ring to hang, so gpu
>> recovery is triggered.
>
> Mhm, that should never be possible. It is perfectly valid for an
> application to terminate without waiting for the GFX submission to be
> completed.

The gfx ring hangs because the command is illegal. The packet layout is
COMMAND [30:21] | BYTE_COUNT [20:0], and I use 0xFF << 20 to hang the
ring on purpose.
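To be concrete about why that value is illegal, here is a quick
standalone check of the arithmetic; it only assumes the bit layout above
and that the requested length ends up in the dword carrying BYTE_COUNT:

#include <assert.h>
#include <stdint.h>

int main(void)
{
	/* copy packet: COMMAND [30:21] | BYTE_COUNT [20:0] */
	const uint32_t byte_count_mask = (1u << 21) - 1; /* max encodable count */
	const uint32_t length = 0xFFu << 20;             /* value used by the test */

	/* 0xFF << 20 does not fit in BYTE_COUNT; the excess bits land in
	 * the COMMAND field, so the packet the test emits is illegal. */
	assert(length > byte_count_mask);
	assert((length & ~byte_count_mask) != 0);
	return 0;
}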
> Going to push patch #1 to drm-misc-fixes or drm-misc-next-fixes in a
> moment.
>
> Thanks,
> Christian.
>
>>
>> Now here is one possible deadlock case.
>> gpu_recovery
>>  -> stop drm scheduler
>>  -> asic reset
>>   -> ib test
>>    -> tt populate (uvd ib test)
>>     -> ttm_bo_swapout (BO A) // this always fails as the fence of
>>        BO A would not be signaled by the scheduler or HW. Hit deadlock.
>>
>> I paste the drm test patch below.
>> #modprobe ttm pages_limit=65536
>> #amdgpu_test -s 1 -t 4
>> ---
>>  tests/amdgpu/basic_tests.c | 32 ++++++++++++++------------------
>>  1 file changed, 14 insertions(+), 18 deletions(-)
>>
>> diff --git a/tests/amdgpu/basic_tests.c b/tests/amdgpu/basic_tests.c
>> index dbf02fee..f85ed340 100644
>> --- a/tests/amdgpu/basic_tests.c
>> +++ b/tests/amdgpu/basic_tests.c
>> @@ -65,13 +65,16 @@ static void amdgpu_direct_gma_test(void);
>>  static void amdgpu_command_submission_write_linear_helper(unsigned ip_type);
>>  static void amdgpu_command_submission_const_fill_helper(unsigned ip_type);
>>  static void amdgpu_command_submission_copy_linear_helper(unsigned ip_type);
>> -static void amdgpu_test_exec_cs_helper(amdgpu_context_handle context_handle,
>> +static void _amdgpu_test_exec_cs_helper(amdgpu_context_handle context_handle,
>>                                         unsigned ip_type,
>>                                         int instance, int pm4_dw, uint32_t *pm4_src,
>>                                         int res_cnt, amdgpu_bo_handle *resources,
>>                                         struct amdgpu_cs_ib_info *ib_info,
>> -                                       struct amdgpu_cs_request *ibs_request);
>> +                                       struct amdgpu_cs_request *ibs_request, int sync, int repeat);
>> +#define amdgpu_test_exec_cs_helper(...) \
>> +	_amdgpu_test_exec_cs_helper(__VA_ARGS__, 1, 1)
>> +
>>  CU_TestInfo basic_tests[] = {
>>  	{ "Query Info Test", amdgpu_query_info_test },
>>  	{ "Userptr Test", amdgpu_userptr_test },
>> @@ -1341,12 +1344,12 @@ static void amdgpu_command_submission_compute(void)
>>   * pm4_src, resources, ib_info, and ibs_request
>>   * submit command stream described in ibs_request and wait for this IB accomplished
>>   */
>> -static void amdgpu_test_exec_cs_helper(amdgpu_context_handle context_handle,
>> +static void _amdgpu_test_exec_cs_helper(amdgpu_context_handle context_handle,
>>                                         unsigned ip_type,
>>                                         int instance, int pm4_dw, uint32_t *pm4_src,
>>                                         int res_cnt, amdgpu_bo_handle *resources,
>>                                         struct amdgpu_cs_ib_info *ib_info,
>> -                                       struct amdgpu_cs_request *ibs_request)
>> +                                       struct amdgpu_cs_request *ibs_request, int sync, int repeat)
>>  {
>>  	int r;
>>  	uint32_t expired;
>> @@ -1395,12 +1398,15 @@ static void amdgpu_test_exec_cs_helper(amdgpu_context_handle context_handle,
>>  	CU_ASSERT_NOT_EQUAL(ibs_request, NULL);
>>
>>  	/* submit CS */
>> -	r = amdgpu_cs_submit(context_handle, 0, ibs_request, 1);
>> +	while (repeat--)
>> +		r = amdgpu_cs_submit(context_handle, 0, ibs_request, 1);
>>  	CU_ASSERT_EQUAL(r, 0);
>>
>>  	r = amdgpu_bo_list_destroy(ibs_request->resources);
>>  	CU_ASSERT_EQUAL(r, 0);
>>
>> +	if (!sync)
>> +		return;
>>  	fence_status.ip_type = ip_type;
>>  	fence_status.ip_instance = 0;
>>  	fence_status.ring = ibs_request->ring;
>> @@ -1667,7 +1673,7 @@ static void amdgpu_command_submission_sdma_const_fill(void)
>>
>>  static void amdgpu_command_submission_copy_linear_helper(unsigned ip_type)
>>  {
>> -	const int sdma_write_length = 1024;
>> +	const int sdma_write_length = (255) << 20;
>>  	const int pm4_dw = 256;
>>  	amdgpu_context_handle context_handle;
>>  	amdgpu_bo_handle bo1, bo2;
>> @@ -1715,8 +1721,6 @@ static void amdgpu_command_submission_copy_linear_helper(unsigned ip_type)
>>  				    &bo1_va_handle);
>>  	CU_ASSERT_EQUAL(r, 0);
>>
>> -	/* set bo1 */
>> -	memset((void*)bo1_cpu, 0xaa, sdma_write_length);
>>
>>  	/* allocate UC bo2 for sDMA use */
>>  	r = amdgpu_bo_alloc_and_map(device_handle,
>> @@ -1727,8 +1731,6 @@ static void amdgpu_command_submission_copy_linear_helper(unsigned ip_type)
>>  				    &bo2_va_handle);
>>  	CU_ASSERT_EQUAL(r, 0);
>>
>> -	/* clear bo2 */
>> -	memset((void*)bo2_cpu, 0, sdma_write_length);
>>
>>  	resources[0] = bo1;
>>  	resources[1] = bo2;
>> @@ -1785,17 +1787,11 @@ static void amdgpu_command_submission_copy_linear_helper(unsigned ip_type)
>>  				}
>>  			}
>>
>> -			amdgpu_test_exec_cs_helper(context_handle,
>> +			_amdgpu_test_exec_cs_helper(context_handle,
>>  						   ip_type, ring_id,
>>  						   i, pm4,
>>  						   2, resources,
>> -						   ib_info, ibs_request);
>> -
>> -			/* verify if SDMA test result meets with expected */
>> -			i = 0;
>> -			while(i < sdma_write_length) {
>> -				CU_ASSERT_EQUAL(bo2_cpu[i++], 0xaa);
>> -			}
>> +						   ib_info, ibs_request, 0, 100);
>>
>>  			r = amdgpu_bo_unmap_and_free(bo1, bo1_va_handle, bo1_mc,
>>  						     sdma_write_length);
>>  			CU_ASSERT_EQUAL(r, 0);
>