[AMD Official Use Only]
> -----Original Message----- > From: amd-gfx <amd-gfx-boun...@lists.freedesktop.org> On Behalf Of > Michel Dänzer > Sent: Monday, August 16, 2021 6:35 PM > To: Deucher, Alexander <alexander.deuc...@amd.com>; Koenig, Christian > <christian.koe...@amd.com> > Cc: Liu, Leo <leo....@amd.com>; Zhu, James <james....@amd.com>; amd- > g...@lists.freedesktop.org; dri-de...@lists.freedesktop.org > Subject: [PATCH v3] drm/amdgpu: Cancel delayed work when GFXOFF is > disabled > > From: Michel Dänzer <mdaen...@redhat.com> > > schedule_delayed_work does not push back the work if it was already > scheduled before, so amdgpu_device_delay_enable_gfx_off ran ~100 ms > after the first time GFXOFF was disabled and re-enabled, even if GFXOFF > was disabled and re-enabled again during those 100 ms. > > This resulted in frame drops / stutter with the upcoming mutter 41 > release on Navi 14, due to constantly enabling GFXOFF in the HW and > disabling it again (for getting the GPU clock counter). > > To fix this, call cancel_delayed_work_sync when the disable count > transitions from 0 to 1, and only schedule the delayed work on the > reverse transition, not if the disable count was already 0. This makes > sure the delayed work doesn't run at unexpected times, and allows it to > be lock-free. > > v2: > * Use cancel_delayed_work_sync & mutex_trylock instead of > mod_delayed_work. > v3: > * Make amdgpu_device_delay_enable_gfx_off lock-free (Christian König) > > Cc: sta...@vger.kernel.org > Signed-off-by: Michel Dänzer <mdaen...@redhat.com> > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 11 +++++------ > drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 22 +++++++++++++++++- > ---- > 2 files changed, 22 insertions(+), 11 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > index f3fd5ec710b6..f944ed858f3e 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > @@ -2777,12 +2777,11 @@ static void > amdgpu_device_delay_enable_gfx_off(struct work_struct *work) > struct amdgpu_device *adev = > container_of(work, struct amdgpu_device, > gfx.gfx_off_delay_work.work); > > - mutex_lock(&adev->gfx.gfx_off_mutex); > - if (!adev->gfx.gfx_off_state && !adev->gfx.gfx_off_req_count) { > - if (!amdgpu_dpm_set_powergating_by_smu(adev, > AMD_IP_BLOCK_TYPE_GFX, true)) > - adev->gfx.gfx_off_state = true; > - } > - mutex_unlock(&adev->gfx.gfx_off_mutex); > + WARN_ON_ONCE(adev->gfx.gfx_off_state); > + WARN_ON_ONCE(adev->gfx.gfx_off_req_count); > + > + if (!amdgpu_dpm_set_powergating_by_smu(adev, > AMD_IP_BLOCK_TYPE_GFX, true)) > + adev->gfx.gfx_off_state = true; > } > > /** > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c > index a0be0772c8b3..ca91aafcb32b 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c > @@ -563,15 +563,26 @@ void amdgpu_gfx_off_ctrl(struct amdgpu_device > *adev, bool enable) > > mutex_lock(&adev->gfx.gfx_off_mutex); > > - if (!enable) > - adev->gfx.gfx_off_req_count++; > - else if (adev->gfx.gfx_off_req_count > 0) > + if (enable) { > + /* If the count is already 0, it means there's an imbalance bug > somewhere. > + * Note that the bug may be in a different caller than the one > which triggers the > + * WARN_ON_ONCE. > + */ > + if (WARN_ON_ONCE(adev->gfx.gfx_off_req_count == 0)) > + goto unlock; > + > adev->gfx.gfx_off_req_count--; > + } else { > + adev->gfx.gfx_off_req_count++; > + } > > if (enable && !adev->gfx.gfx_off_state && !adev- > >gfx.gfx_off_req_count) { > schedule_delayed_work(&adev->gfx.gfx_off_delay_work, > GFX_OFF_DELAY_ENABLE); > - } else if (!enable && adev->gfx.gfx_off_state) { > - if (!amdgpu_dpm_set_powergating_by_smu(adev, > AMD_IP_BLOCK_TYPE_GFX, false)) { > + } else if (!enable && adev->gfx.gfx_off_req_count == 1) { [Quan, Evan] It seems here will leave a small time window for race condition. If amdgpu_device_delay_enable_gfx_off() happens to occur here, it will "WARN_ON_ONCE(adev->gfx.gfx_off_req_count);". How about something as below? @@ -573,13 +573,11 @@ void amdgpu_gfx_off_ctrl(struct amdgpu_device *adev, bool enable) goto unlock; adev->gfx.gfx_off_req_count--; - } else { - adev->gfx.gfx_off_req_count++; } if (enable && !adev->gfx.gfx_off_state && !adev->gfx.gfx_off_req_count) { schedule_delayed_work(&adev->gfx.gfx_off_delay_work, GFX_OFF_DELAY_ENABLE); - } else if (!enable && adev->gfx.gfx_off_req_count == 1) { + } else if (!enable && adev->gfx.gfx_off_req_count == 0) { cancel_delayed_work_sync(&adev->gfx.gfx_off_delay_work); if (adev->gfx.gfx_off_state && @@ -593,6 +591,9 @@ void amdgpu_gfx_off_ctrl(struct amdgpu_device *adev, bool enable) } } + if (!enable) + adev->gfx.gfx_off_req_count++; + unlock: BR Evan > + cancel_delayed_work_sync(&adev- > >gfx.gfx_off_delay_work); > + > + if (adev->gfx.gfx_off_state && > + !amdgpu_dpm_set_powergating_by_smu(adev, > AMD_IP_BLOCK_TYPE_GFX, false)) { > adev->gfx.gfx_off_state = false; > > if (adev->gfx.funcs->init_spm_golden) { > @@ -581,6 +592,7 @@ void amdgpu_gfx_off_ctrl(struct amdgpu_device > *adev, bool enable) > } > } > > +unlock: > mutex_unlock(&adev->gfx.gfx_off_mutex); > } > > -- > 2.32.0