amdgpu: reuse code of ras bad page's bo create

Chen, Guchun Sun, 29 Sep 2019 20:26:23 -0700

Regards,
Guchun

-----Original Message-----
From: Zhou1, Tao <tao.zh...@amd.com> 
Sent: Monday, September 30, 2019 11:20 AM
To: amd-gfx@lists.freedesktop.org; Grodzovsky, Andrey 
<andrey.grodzov...@amd.com>; Chen, Guchun <guchun.c...@amd.com>; Zhang, Hawking 
<hawking.zh...@amd.com>
Cc: Zhou1, Tao <tao.zh...@amd.com>
Subject: [PATCH 3/3] drm/amdgpu: reuse code of ras bad page's bo create


implement ras_create_bad_pages_bo to simplify ras code

Signed-off-by: Tao Zhou <tao.zh...@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 72 +++++++++++--------------
 1 file changed, 31 insertions(+), 41 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index d1bafa92ca91..fe3a57e567c8 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1430,41 +1430,53 @@ static int amdgpu_ras_load_bad_pages(struct 
amdgpu_device *adev)
        return ret;
 }
 
-/* called in gpu recovery/init */
-int amdgpu_ras_reserve_bad_pages(struct amdgpu_device *adev)
+static void amdgpu_ras_create_bad_pages_bo(struct amdgpu_device *adev)
 {
+       /* Note: the caller should guarantee con and data are not NULL */
        struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
-       struct ras_err_handler_data *data;
+       struct ras_err_handler_data *data = con->eh_data;
        uint64_t bp;
-       struct amdgpu_bo *bo = NULL;
-       int i, ret = 0;
-
-       if (!con || !con->eh_data)
-               return 0;
+       struct amdgpu_bo *bo;
+       int i;
 
-       mutex_lock(&con->recovery_lock);
-       data = con->eh_data;
-       if (!data)
-               goto out;
-       /* reserve vram at driver post stage. */
        for (i = data->last_reserved; i < data->count; i++) {
+               bo = NULL;
                bp = data->bps[i].retired_page;
 
-               /* There are two cases of reserve error should be ignored:
+               /* There are three cases of reserve error should be ignored:
                 * 1) a ras bad page has been allocated (used by someone);
                 * 2) a ras bad page has been reserved (duplicate error 
injection
                 *    for one page);
+                * 3) bo info isn't lost in gpu reset
                 */
                if (amdgpu_bo_create_kernel_at(adev, bp << 
AMDGPU_GPU_PAGE_SHIFT,
                                               AMDGPU_GPU_PAGE_SIZE,
                                               AMDGPU_GEM_DOMAIN_VRAM,
                                               &bo, NULL))
-                       DRM_WARN("RAS WARN: reserve vram for retired page %llx 
fail\n", bp);
-
-               data->bps_bo[i] = bo;
+                       DRM_NOTE("RAS NOTE: reserve vram for retired page 
0x%llx fail\n", bp);
+               else
+                       data->bps_bo[i] = bo;
[Guchun]The "else" should not needed? Otherwise, if amdgpu_bo_create_kernel_at 
always succeeds, we don't catch a chance to update bps_bo.
Is that true?
                data->last_reserved = i + 1;
-               bo = NULL;
        }
+}
+
+/* called in gpu recovery/init */
+int amdgpu_ras_reserve_bad_pages(struct amdgpu_device *adev) {
+       struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+       struct ras_err_handler_data *data;
+       int ret = 0;
+
+       if (!con || !con->eh_data)
+               return 0;
+
+       mutex_lock(&con->recovery_lock);
+       data = con->eh_data;
+       if (!data)
+               goto out;
+
+       /* reserve vram at driver post stage. */
+       amdgpu_ras_create_bad_pages_bo(adev);
 
        /* continue to save bad pages to eeprom even reesrve_vram fails */
        ret = amdgpu_ras_save_bad_pages(adev); @@ -1583,9 +1595,6 @@ void 
amdgpu_ras_recovery_reset(struct amdgpu_device *adev)  {
        struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
        struct ras_err_handler_data *data;
-       uint64_t bp;
-       struct amdgpu_bo *bo = NULL;
-       int i;
 
        if (!con || !con->eh_data)
                return;
@@ -1600,26 +1609,7 @@ void amdgpu_ras_recovery_reset(struct amdgpu_device 
*adev)
        data = con->eh_data;
        /* force to reserve all retired page again */
        data->last_reserved = 0;
-
-       for (i = data->last_reserved; i < data->count; i++) {
-               bp = data->bps[i].retired_page;
-
-               /* There are three cases of reserve error should be ignored:
-                * 1) a ras bad page has been allocated (used by someone);
-                * 2) a ras bad page has been reserved (duplicate error 
injection
-                *    for one page);
-                * 3) bo info isn't lost in gpu reset
-                */
-               if (amdgpu_bo_create_kernel_at(adev, bp << 
AMDGPU_GPU_PAGE_SHIFT,
-                                              AMDGPU_GPU_PAGE_SIZE,
-                                              AMDGPU_GEM_DOMAIN_VRAM,
-                                              &bo, NULL))
-                       DRM_NOTE("RAS NOTE: recreate bo for retired page 0x%llx 
fail\n", bp);
-               else
-                       data->bps_bo[i] = bo;
-               data->last_reserved = i + 1;
-               bo = NULL;
-       }
+       amdgpu_ras_create_bad_pages_bo(adev);
 }
 /* recovery end */
 
--
2.17.1

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

RE: [PATCH 3/3] drm/amdgpu: reuse code of ras bad page's bo create

Reply via email to