On 5/2/2025 8:13 AM, Alex Deucher wrote:
On Fri, May 2, 2025 at 8:56 AM Alex Deucher <alexdeuc...@gmail.com> wrote:
On Thu, May 1, 2025 at 5:19 PM Mario Limonciello
<mario.limoncie...@amd.com> wrote:
On 5/1/2025 3:09 PM, Alex Deucher wrote:
Set the s3/s0ix and s4 flags in the pm notifier so that we can skip
the resource evictions properly in pm prepare based on whether
we are suspending or hibernating. Drop the eviction as processes
are not frozen at this time, we we can end up getting stuck trying
to evict VRAM while applications continue to submit work which
causes the buffers to get pulled back into VRAM.
Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/4178
Fixes: 2965e6355dcd ("drm/amd: Add Suspend/Hibernate notification callback
support")
Cc: Mario Limonciello <mario.limoncie...@amd.com>
Signed-off-by: Alex Deucher <alexander.deuc...@amd.com>
Rather than removing the callbacks (and thus introducing the "other"
bugs with memory pressure), I've sent an alternative series. LMK what
you think of this instead.
https://lore.kernel.org/amd-gfx/20250501211734.2434369-1-supe...@kernel.org/T/#m6cdc075af911868667ea6fc849bcd196477d6c20
Series looks good to me, but I think a variant of this patch (with the
evictions still in place) still makes sense so that we can properly
set the suspend and hibernate flags in amdgpu so that we can optimize
the evictions for suspend on APUs. Will respin.
I think this set still makes sense for stable, but then if your patch
series lands, we can apply the attached patch on top of that.
OK. Let me re-look through your series once more and I'll leave
comments or a tag.
Regarding the patch to re-introduced it you attached, I suppose the
drm_warn() message doesn't need to talk about freezing processes as a
solution to eviction failure because freezing is implied with the new
notifier in use.
Alex
Alex
---
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 25 +++++++++++-----------
drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 22 ++-----------------
2 files changed, 15 insertions(+), 32 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 71d95f16067a4..d232e4a26d7bf 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -4974,28 +4974,29 @@ static int amdgpu_device_evict_resources(struct
amdgpu_device *adev)
* @data: data
*
* This function is called when the system is about to suspend or hibernate.
- * It is used to evict resources from the device before the system goes to
- * sleep while there is still access to swap.
+ * It is used to set the appropriate flags so that eviction can be optimized
+ * in the pm prepare callback.
*/
static int amdgpu_device_pm_notifier(struct notifier_block *nb, unsigned
long mode,
void *data)
{
struct amdgpu_device *adev = container_of(nb, struct amdgpu_device,
pm_nb);
- int r;
switch (mode) {
case PM_HIBERNATION_PREPARE:
adev->in_s4 = true;
- fallthrough;
+ break;
+ case PM_POST_HIBERNATION:
+ adev->in_s4 = false;
+ break;
case PM_SUSPEND_PREPARE:
- r = amdgpu_device_evict_resources(adev);
- /*
- * This is considered non-fatal at this time because
- * amdgpu_device_prepare() will also fatally evict resources.
- * See https://gitlab.freedesktop.org/drm/amd/-/issues/3781
- */
- if (r)
- drm_warn(adev_to_drm(adev), "Failed to evict resources, freeze
active processes if problems occur: %d\n", r);
+ if (amdgpu_acpi_is_s0ix_active(adev))
+ adev->in_s0ix = true;
+ else if (amdgpu_acpi_is_s3_active(adev))
+ adev->in_s3 = true;
+ break;
+ case PM_POST_SUSPEND:
+ adev->in_s0ix = adev->in_s3 = false;
break;
}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index cec041a556013..6599fb6313220 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -2572,10 +2572,6 @@ static int amdgpu_pmops_suspend(struct device *dev)
struct drm_device *drm_dev = dev_get_drvdata(dev);
struct amdgpu_device *adev = drm_to_adev(drm_dev);
- if (amdgpu_acpi_is_s0ix_active(adev))
- adev->in_s0ix = true;
- else if (amdgpu_acpi_is_s3_active(adev))
- adev->in_s3 = true;
if (!adev->in_s0ix && !adev->in_s3) {
/* don't allow going deep first time followed by s2idle the next
time */
if (adev->last_suspend_state != PM_SUSPEND_ON &&
@@ -2608,7 +2604,6 @@ static int amdgpu_pmops_resume(struct device *dev)
{
struct drm_device *drm_dev = dev_get_drvdata(dev);
struct amdgpu_device *adev = drm_to_adev(drm_dev);
- int r;
if (!adev->in_s0ix && !adev->in_s3)
return 0;
@@ -2617,12 +2612,7 @@ static int amdgpu_pmops_resume(struct device *dev)
if (!pci_device_is_present(adev->pdev))
adev->no_hw_access = true;
- r = amdgpu_device_resume(drm_dev, true);
- if (amdgpu_acpi_is_s0ix_active(adev))
- adev->in_s0ix = false;
- else
- adev->in_s3 = false;
- return r;
+ return amdgpu_device_resume(drm_dev, true);
}
static int amdgpu_pmops_freeze(struct device *dev)
@@ -2643,13 +2633,8 @@ static int amdgpu_pmops_freeze(struct device *dev)
static int amdgpu_pmops_thaw(struct device *dev)
{
struct drm_device *drm_dev = dev_get_drvdata(dev);
- struct amdgpu_device *adev = drm_to_adev(drm_dev);
- int r;
- r = amdgpu_device_resume(drm_dev, true);
- adev->in_s4 = false;
-
- return r;
+ return amdgpu_device_resume(drm_dev, true);
}
static int amdgpu_pmops_poweroff(struct device *dev)
@@ -2662,9 +2647,6 @@ static int amdgpu_pmops_poweroff(struct device *dev)
static int amdgpu_pmops_restore(struct device *dev)
{
struct drm_device *drm_dev = dev_get_drvdata(dev);
- struct amdgpu_device *adev = drm_to_adev(drm_dev);
-
- adev->in_s4 = false;
return amdgpu_device_resume(drm_dev, true);
}