Add a GPU reset check and exception handling for
page retirement.

Signed-off-by: YiPeng Chai <yipeng.c...@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 43 +++++++++++++++++++++++++
 1 file changed, 43 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 7f8e6ca07957..635dc86dbfd8 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1386,10 +1386,15 @@ int amdgpu_ras_query_error_status(struct amdgpu_device 
*adev, struct ras_query_i
        memset(&qctx, 0, sizeof(qctx));
        qctx.event_id = amdgpu_ras_acquire_event_id(adev, 
amdgpu_ras_intr_triggered() ?
                                                   RAS_EVENT_TYPE_ISR : 
RAS_EVENT_TYPE_INVALID);
+
+       if (!down_read_trylock(&adev->reset_domain->sem))
+               return -EIO;
+
        ret = amdgpu_ras_query_error_status_helper(adev, info,
                                                   &err_data,
                                                   &qctx,
                                                   error_query_mode);
+       up_read(&adev->reset_domain->sem);
        if (ret)
                goto out_fini_err_data;
 
@@ -2884,6 +2889,14 @@ static int amdgpu_ras_poison_creation_handler(struct 
amdgpu_device *adev,
        return 0;
 }
 
+static void amdgpu_ras_clear_poison_fifo(struct amdgpu_device *adev)
+{
+       struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+       struct ras_poison_msg msg; /* scratch slot: written by kfifo_get(), never read */
+
+       while (kfifo_get(&con->poison_fifo, &msg)); /* empty body on purpose: discard entries until kfifo_get() returns 0 */
+}
+
 static int amdgpu_ras_poison_consumption_handler(struct amdgpu_device *adev,
                        uint32_t msg_count, uint32_t *gpu_reset)
 {
@@ -2913,6 +2926,11 @@ static int amdgpu_ras_poison_consumption_handler(struct 
amdgpu_device *adev,
                else
                        reset = reset_flags;
 
+               /* Check if gpu is in reset state */
+               if (!down_read_trylock(&adev->reset_domain->sem))
+                       return -EIO;
+               up_read(&adev->reset_domain->sem);
+
                flush_delayed_work(&con->page_retirement_dwork);
 
                reinit_completion(&con->ras_recovery_completion);
@@ -2977,6 +2995,31 @@ static int amdgpu_ras_page_retirement_thread(void *param)
                        }
                }
 
+               if ((ret == -EIO) || (gpu_reset == 
AMDGPU_RAS_GPU_RESET_MODE1_RESET)) {
+                       /* gpu is in mode-1 reset state */
+                       /* Clear poison creation request */
+                       while (atomic_read(&con->poison_creation_count))
+                               atomic_dec(&con->poison_creation_count);
+
+                       /* Clear poison consumption fifo */
+                       amdgpu_ras_clear_poison_fifo(adev);
+
+                       while (atomic_read(&con->page_retirement_req_cnt))
+                               atomic_dec(&con->page_retirement_req_cnt);
+
+                       if (ret == -EIO) {
+                               /* Wait for mode-1 reset to complete */
+                               down_read(&adev->reset_domain->sem);
+                               up_read(&adev->reset_domain->sem);
+                       }
+
+                       /* Wake up work queue to save bad pages to eeprom */
+                       schedule_delayed_work(&con->page_retirement_dwork, 0);
+               } else if (gpu_reset) {
+                       /* gpu is in mode-2 reset or other reset state */
+                       /* Wake up work queue to save bad pages to eeprom */
+                       schedule_delayed_work(&con->page_retirement_dwork, 0);
+               }
 #else
         dev_info(adev->dev, "Start processing page retirement. request:%d\n",
                     atomic_read(&con->page_retirement_req_cnt));
-- 
2.34.1

Reply via email to