> On 6 Sep 2021, at 17:04, Christian König <ckoenig.leichtzumer...@gmail.com> wrote:
>
> On 06.09.21 at 03:12, xinhui pan wrote:
>> A long time ago, someone reported that the system hung during a memory
>> test. In recent days, I have been trying to find and understand the
>> potential deadlock in the ttm/amdgpu code.
>>
>> This patchset aims to fix the deadlock during ttm populate.
>>
>> TTM has a parameter called pages_limit; when the allocated GTT memory
>> reaches this limit, swapout is triggered. As ttm_bo_swapout does not
>> return the correct retval, populate might hang.
>>
>> The UVD ib test uses GTT, which might be insufficient, so a gpu
>> recovery would hang if populate hangs.
>
> Ah, now I understand what you are trying to do.
>
> Problem is that won't work either. Allocating VRAM can easily land you
> inside the same deadlock.
>
> We need to avoid the allocation altogether for this to work correctly.

Looks like we need to reserve some pages at sw init.
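Roughly something like the sketch below is what I have in mind, assuming
a plain page array reserved once at driver sw init; the names
(recovery_pool, recovery_pool_init, RECOVERY_POOL_PAGES) are made up for
illustration, and the real change would of course have to go through the
TTM pool code:

/* Illustrative sketch only -- not the actual amdgpu/TTM interfaces. */
#include <linux/gfp.h>
#include <linux/mm.h>

#define RECOVERY_POOL_PAGES 64	/* hypothetical size of the reserve */

static struct page *recovery_pool[RECOVERY_POOL_PAGES];

/* Called once at sw init, while allocation can still succeed. */
static int recovery_pool_init(void)
{
	int i;

	for (i = 0; i < RECOVERY_POOL_PAGES; i++) {
		recovery_pool[i] = alloc_page(GFP_KERNEL);
		if (!recovery_pool[i])
			goto err_free;
	}
	return 0;

err_free:
	while (i--)
		__free_page(recovery_pool[i]);
	return -ENOMEM;
}

/* Called at sw fini; during gpu recovery the ib tests would populate
 * from this reserve instead of allocating under memory pressure. */
static void recovery_pool_fini(void)
{
	int i;

	for (i = 0; i < RECOVERY_POOL_PAGES; i++)
		__free_page(recovery_pool[i]);
}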
>
>> I have made one drm test which allocates two GTT BOs, submits gfx copy
>> commands and frees these BOs without waiting for the fence. What's
>> more, these gfx copy commands will cause the gfx ring to hang, so gpu
>> recovery is triggered.
>
> Mhm, that should never be possible. It is perfectly valid for an
> application to terminate without waiting for the GFX submission to be
> completed.

The gfx ring hangs because the command is illegal. The packet layout is
COMMAND [30:21] | BYTE_COUNT [20:0], and I use 0xFF << 20 to hang the
ring on purpose.
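To be concrete about why that value is illegal, here is a quick
standalone check of the arithmetic; it only assumes the bit layout above
and that the requested length ends up in the dword carrying BYTE_COUNT:

#include <assert.h>
#include <stdint.h>

int main(void)
{
	/* copy packet: COMMAND [30:21] | BYTE_COUNT [20:0] */
	const uint32_t byte_count_mask = (1u << 21) - 1; /* max encodable count */
	const uint32_t length = 0xFFu << 20;             /* value used by the test */

	/* 0xFF << 20 does not fit in BYTE_COUNT; the excess bits land in
	 * the COMMAND field, so the packet the test emits is illegal. */
	assert(length > byte_count_mask);
	assert((length & ~byte_count_mask) != 0);
	return 0;
}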
> Going to push patch #1 to drm-misc-fixes or drm-misc-next-fixes in a
> moment.
>
> Thanks,
> Christian.
>
>>
>> Now here is one possible deadlock case.
>> gpu_recovery
>>  -> stop drm scheduler
>>  -> asic reset
>>   -> ib test
>>    -> tt populate (uvd ib test)
>>     -> ttm_bo_swapout (BO A) // this always fails as the fence of
>>        BO A would not be signaled by the scheduler or HW. Hit deadlock.
>>
>> I paste the drm test patch below.
>> #modprobe ttm pages_limit=65536
>> #amdgpu_test -s 1 -t 4
>> ---
>>  tests/amdgpu/basic_tests.c | 32 ++++++++++++++------------------
>>  1 file changed, 14 insertions(+), 18 deletions(-)
>>
>> diff --git a/tests/amdgpu/basic_tests.c b/tests/amdgpu/basic_tests.c
>> index dbf02fee..f85ed340 100644
>> --- a/tests/amdgpu/basic_tests.c
>> +++ b/tests/amdgpu/basic_tests.c
>> @@ -65,13 +65,16 @@ static void amdgpu_direct_gma_test(void);
>>  static void amdgpu_command_submission_write_linear_helper(unsigned ip_type);
>>  static void amdgpu_command_submission_const_fill_helper(unsigned ip_type);
>>  static void amdgpu_command_submission_copy_linear_helper(unsigned ip_type);
>> -static void amdgpu_test_exec_cs_helper(amdgpu_context_handle context_handle,
>> +static void _amdgpu_test_exec_cs_helper(amdgpu_context_handle context_handle,
>>                                         unsigned ip_type,
>>                                         int instance, int pm4_dw, uint32_t *pm4_src,
>>                                         int res_cnt, amdgpu_bo_handle *resources,
>>                                         struct amdgpu_cs_ib_info *ib_info,
>> -                                       struct amdgpu_cs_request *ibs_request);
>> +                                       struct amdgpu_cs_request *ibs_request, int sync, int repeat);
>> +#define amdgpu_test_exec_cs_helper(...) \
>> +	_amdgpu_test_exec_cs_helper(__VA_ARGS__, 1, 1)
>> +
>>  CU_TestInfo basic_tests[] = {
>>  	{ "Query Info Test", amdgpu_query_info_test },
>>  	{ "Userptr Test", amdgpu_userptr_test },
>> @@ -1341,12 +1344,12 @@ static void amdgpu_command_submission_compute(void)
>>   * pm4_src, resources, ib_info, and ibs_request
>>   * submit command stream described in ibs_request and wait for this IB accomplished
>>   */
>> -static void amdgpu_test_exec_cs_helper(amdgpu_context_handle context_handle,
>> +static void _amdgpu_test_exec_cs_helper(amdgpu_context_handle context_handle,
>>                                         unsigned ip_type,
>>                                         int instance, int pm4_dw, uint32_t *pm4_src,
>>                                         int res_cnt, amdgpu_bo_handle *resources,
>>                                         struct amdgpu_cs_ib_info *ib_info,
>> -                                       struct amdgpu_cs_request *ibs_request)
>> +                                       struct amdgpu_cs_request *ibs_request, int sync, int repeat)
>>  {
>>  	int r;
>>  	uint32_t expired;
>> @@ -1395,12 +1398,15 @@ static void amdgpu_test_exec_cs_helper(amdgpu_context_handle context_handle,
>>  	CU_ASSERT_NOT_EQUAL(ibs_request, NULL);
>>
>>  	/* submit CS */
>> -	r = amdgpu_cs_submit(context_handle, 0, ibs_request, 1);
>> +	while (repeat--)
>> +		r = amdgpu_cs_submit(context_handle, 0, ibs_request, 1);
>>  	CU_ASSERT_EQUAL(r, 0);
>>
>>  	r = amdgpu_bo_list_destroy(ibs_request->resources);
>>  	CU_ASSERT_EQUAL(r, 0);
>>
>> +	if (!sync)
>> +		return;
>>  	fence_status.ip_type = ip_type;
>>  	fence_status.ip_instance = 0;
>>  	fence_status.ring = ibs_request->ring;
>> @@ -1667,7 +1673,7 @@ static void amdgpu_command_submission_sdma_const_fill(void)
>>
>>  static void amdgpu_command_submission_copy_linear_helper(unsigned ip_type)
>>  {
>> -	const int sdma_write_length = 1024;
>> +	const int sdma_write_length = (255) << 20;
>>  	const int pm4_dw = 256;
>>  	amdgpu_context_handle context_handle;
>>  	amdgpu_bo_handle bo1, bo2;
>> @@ -1715,8 +1721,6 @@ static void amdgpu_command_submission_copy_linear_helper(unsigned ip_type)
>>  				    &bo1_va_handle);
>>  	CU_ASSERT_EQUAL(r, 0);
>>
>> -	/* set bo1 */
>> -	memset((void*)bo1_cpu, 0xaa, sdma_write_length);
>>
>>  	/* allocate UC bo2 for sDMA use */
>>  	r = amdgpu_bo_alloc_and_map(device_handle,
>> @@ -1727,8 +1731,6 @@ static void amdgpu_command_submission_copy_linear_helper(unsigned ip_type)
>>  				    &bo2_va_handle);
>>  	CU_ASSERT_EQUAL(r, 0);
>>
>> -	/* clear bo2 */
>> -	memset((void*)bo2_cpu, 0, sdma_write_length);
>>
>>  	resources[0] = bo1;
>>  	resources[1] = bo2;
>> @@ -1785,17 +1787,11 @@ static void amdgpu_command_submission_copy_linear_helper(unsigned ip_type)
>>  				}
>>  			}
>>
>> -			amdgpu_test_exec_cs_helper(context_handle,
>> +			_amdgpu_test_exec_cs_helper(context_handle,
>>  						   ip_type, ring_id,
>>  						   i, pm4,
>>  						   2, resources,
>> -						   ib_info, ibs_request);
>> -
>> -			/* verify if SDMA test result meets with expected */
>> -			i = 0;
>> -			while(i < sdma_write_length) {
>> -				CU_ASSERT_EQUAL(bo2_cpu[i++], 0xaa);
>> -			}
>> +						   ib_info, ibs_request, 0, 100);
>>
>>  			r = amdgpu_bo_unmap_and_free(bo1, bo1_va_handle, bo1_mc,
>>  						     sdma_write_length);
>>  			CU_ASSERT_EQUAL(r, 0);
>