On 11/5/2024 11:52 AM, Lazar, Lijo wrote:
>
>
> On 11/5/2024 11:45 AM, Ma, Le wrote:
>> [AMD Official Use Only - AMD Internal Distribution Only]
>>
>>> -----Original Message-----
>>> From: amd-gfx <amd-gfx-boun...@lists.freedesktop.org> On Behalf Of Lijo
>>> Lazar
>>> Sent: Tuesday, November 5, 2024 1:39 PM
>>> To: amd-gfx@lists.freedesktop.org
>>> Cc: Zhang, Hawking <hawking.zh...@amd.com>; Deucher, Alexander
>>> <alexander.deuc...@amd.com>; Koenig, Christian <christian.koe...@amd.com>
>>> Subject: [PATCH 1/2] drm/amdgpu: Fix unmap queue logic
>>>
>>> In current logic, it calls ring_alloc followed by a ring_test. ring_test in
>>> turn will call
>>> another ring_alloc. This is illegal usage as a ring_alloc is expected to be
>>> closed
>>> properly with a ring_commit. Change to commit the unmap queue packet first
>>> followed by a ring_test. Add a comment about the usage of ring_test.
>>
>> Regarding the description " This is illegal usage as a ring_alloc is
>> expected to be closed properly with a ring_commit ", does this only apply to
>> unmap queue logic ?
>> The current logic to do map queue is also to commit once in ring_test but
>> ring_alloc twice.
>>
>
> Should be applicable for this case also - ring_alloc calls begin_use and
> ring_commit marks end_use. It could be working fine because those are
> not implemented for these rings currently.
>
> But using ring_test without a commit doesn't appear to be the correct
> usage of API.
>
> Will fix map calls in another patch.
>
After checking, it seems better to have those changes as well in a
single patch. Will send a v2.
Thanks,
Lijo
> Thanks,
> Lijo
>>>
>>> Also, reorder the current pre-condition checks of job hang or kiq ring
>>> scheduler not
>>> ready. Without them being met, it is not useful to attempt ring or memory
>>> allocations.
>>>
>>> Fixes tag refers to the original patch which introduced this issue which
>>> then got
>>> carried over into newer code.
>>>
>>> Signed-off-by: Lijo Lazar <lijo.la...@amd.com>
>>>
>>> Fixes: 6c10b5cc4eaa ("drm/amdgpu: Remove duplicate code in gfx_v8_0.c")
>>> ---
>>> drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | 13 +++++-
>>> drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 47 ++++++++++++++--------
>>> drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c | 7 ++++
>>> 3 files changed, 49 insertions(+), 18 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
>>> index c218e8f117e4..2a40150ae10f 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
>>> @@ -844,6 +844,9 @@ int amdgpu_amdkfd_unmap_hiq(struct amdgpu_device
>>> *adev, u32 doorbell_off,
>>> if (!kiq->pmf || !kiq->pmf->kiq_unmap_queues)
>>> return -EINVAL;
>>>
>>> + if (!kiq_ring->sched.ready || adev->job_hang)
>>> + return 0;
>>> +
>>> ring_funcs = kzalloc(sizeof(*ring_funcs), GFP_KERNEL);
>>> if (!ring_funcs)
>>> return -ENOMEM;
>>> @@ -868,8 +871,14 @@ int amdgpu_amdkfd_unmap_hiq(struct amdgpu_device
>>> *adev, u32 doorbell_off,
>>>
>>> kiq->pmf->kiq_unmap_queues(kiq_ring, ring, RESET_QUEUES, 0, 0);
>>>
>>> - if (kiq_ring->sched.ready && !adev->job_hang)
>>> - r = amdgpu_ring_test_helper(kiq_ring);
>>> + /* Submit unmap queue packet */
>>> + amdgpu_ring_commit(kiq_ring);
>>> + /*
>>> + * Ring test will do a basic scratch register change check. Just run
>>> + * this to ensure that unmap queues that is submitted before got
>>> + * processed successfully before returning.
>>> + */
>>> + r = amdgpu_ring_test_helper(kiq_ring);
>>>
>>> spin_unlock(&kiq->ring_lock);
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>>> index dabc01cf1fbb..6733ff5d9628 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>>> @@ -515,6 +515,17 @@ int amdgpu_gfx_disable_kcq(struct amdgpu_device *adev,
>>> int xcc_id)
>>> if (!kiq->pmf || !kiq->pmf->kiq_unmap_queues)
>>> return -EINVAL;
>>>
>>> + if (!kiq_ring->sched.ready || adev->job_hang)
>>> + return 0;
>>> + /**
>>> + * This is workaround: only skip kiq_ring test
>>> + * during ras recovery in suspend stage for gfx9.4.3
>>> + */
>>> + if ((amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) ||
>>> + amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4)) &&
>>> + amdgpu_ras_in_recovery(adev))
>>> + return 0;
>>> +
>>> spin_lock(&kiq->ring_lock);
>>> if (amdgpu_ring_alloc(kiq_ring, kiq->pmf->unmap_queues_size *
>>> adev->gfx.num_compute_rings)) {
>>> @@ -528,20 +539,15 @@ int amdgpu_gfx_disable_kcq(struct amdgpu_device
>>> *adev, int xcc_id)
>>> &adev->gfx.compute_ring[j],
>>> RESET_QUEUES, 0, 0);
>>> }
>>> -
>>> - /**
>>> - * This is workaround: only skip kiq_ring test
>>> - * during ras recovery in suspend stage for gfx9.4.3
>>> + /* Submit unmap queue packet */
>>> + amdgpu_ring_commit(kiq_ring);
>>> + /*
>>> + * Ring test will do a basic scratch register change check. Just run
>>> + * this to ensure that unmap queues that is submitted before got
>>> + * processed successfully before returning.
>>> */
>>> - if ((amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) ||
>>> - amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4)) &&
>>> - amdgpu_ras_in_recovery(adev)) {
>>> - spin_unlock(&kiq->ring_lock);
>>> - return 0;
>>> - }
>>> + r = amdgpu_ring_test_helper(kiq_ring);
>>>
>>> - if (kiq_ring->sched.ready && !adev->job_hang)
>>> - r = amdgpu_ring_test_helper(kiq_ring);
>>> spin_unlock(&kiq->ring_lock);
>>>
>>> return r;
>>> @@ -569,8 +575,11 @@ int amdgpu_gfx_disable_kgq(struct amdgpu_device *adev,
>>> int xcc_id)
>>> if (!kiq->pmf || !kiq->pmf->kiq_unmap_queues)
>>> return -EINVAL;
>>>
>>> - spin_lock(&kiq->ring_lock);
>>> + if (!adev->gfx.kiq[0].ring.sched.ready || adev->job_hang)
>>> + return 0;
>>> +
>>> if (amdgpu_gfx_is_master_xcc(adev, xcc_id)) {
>>> + spin_lock(&kiq->ring_lock);
>>> if (amdgpu_ring_alloc(kiq_ring, kiq->pmf->unmap_queues_size *
>>> adev->gfx.num_gfx_rings)) {
>>> spin_unlock(&kiq->ring_lock);
>>> @@ -583,11 +592,17 @@ int amdgpu_gfx_disable_kgq(struct amdgpu_device
>>> *adev, int xcc_id)
>>> &adev->gfx.gfx_ring[j],
>>> PREEMPT_QUEUES, 0, 0);
>>> }
>>> - }
>>> + /* Submit unmap queue packet */
>>> + amdgpu_ring_commit(kiq_ring);
>>>
>>> - if (adev->gfx.kiq[0].ring.sched.ready && !adev->job_hang)
>>> + /*
>>> + * Ring test will do a basic scratch register change check.
>>> + * Just run this to ensure that unmap queues that is submitted
>>> + * before got processed successfully before returning.
>>> + */
>>> r = amdgpu_ring_test_helper(kiq_ring);
>>> - spin_unlock(&kiq->ring_lock);
>>> + spin_unlock(&kiq->ring_lock);
>>> + }
>>>
>>> return r;
>>> }
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>>> b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>>> index f85e545653c7..553a6113fa67 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>>> @@ -4823,6 +4823,13 @@ static int gfx_v8_0_kcq_disable(struct amdgpu_device
>>> *adev)
>>> amdgpu_ring_write(kiq_ring, 0);
>>> amdgpu_ring_write(kiq_ring, 0);
>>> }
>>> + /* Submit unmap queue packet */
>>> + amdgpu_ring_commit(kiq_ring);
>>> + /*
>>> + * Ring test will do a basic scratch register change check. Just run
>>> + * this to ensure that unmap queues that is submitted before got
>>> + * processed successfully before returning.
>>> + */
>>> r = amdgpu_ring_test_helper(kiq_ring);
>>> if (r)
>>> DRM_ERROR("KCQ disable failed\n");
>>> --
>>> 2.25.1
>>