Almost error count registers are automatically cleared
after reading once, so both CE and UE count needs to be
read in one loop.

Signed-off-by: Guchun Chen <guchun.c...@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c | 16 +++++++---------
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 14 +++++++++-----
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h |  4 ++--
 3 files changed, 18 insertions(+), 16 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
index c06cb06398b1..29fa6b6b9d3e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
@@ -335,7 +335,7 @@ static int amdgpu_ctx_query2(struct amdgpu_device *adev,
 {
        struct amdgpu_ctx *ctx;
        struct amdgpu_ctx_mgr *mgr;
-       unsigned long ras_counter;
+       unsigned long ras_counter_ue, ras_counter_ce;
 
        if (!fpriv)
                return -EINVAL;
@@ -360,19 +360,17 @@ static int amdgpu_ctx_query2(struct amdgpu_device *adev,
        if (atomic_read(&ctx->guilty))
                out->state.flags |= AMDGPU_CTX_QUERY2_FLAGS_GUILTY;
 
-       /*query ue count*/
-       ras_counter = amdgpu_ras_query_error_count(adev, false);
+       /*query both ue and ce count*/
+       amdgpu_ras_query_error_count(adev, &ras_counter_ue, &ras_counter_ce);
        /*ras counter is monotonic increasing*/
-       if (ras_counter != ctx->ras_counter_ue) {
+       if (ras_counter_ue != ctx->ras_counter_ue) {
                out->state.flags |= AMDGPU_CTX_QUERY2_FLAGS_RAS_UE;
-               ctx->ras_counter_ue = ras_counter;
+               ctx->ras_counter_ue = ras_counter_ue;
        }
 
-       /*query ce count*/
-       ras_counter = amdgpu_ras_query_error_count(adev, true);
-       if (ras_counter != ctx->ras_counter_ce) {
+       if (ras_counter_ce != ctx->ras_counter_ce) {
                out->state.flags |= AMDGPU_CTX_QUERY2_FLAGS_RAS_CE;
-               ctx->ras_counter_ce = ras_counter;
+               ctx->ras_counter_ce = ras_counter_ce;
        }
 
        mutex_unlock(&mgr->lock);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 337bf2da7bdc..109eff2869b9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -861,15 +861,18 @@ int amdgpu_ras_error_cure(struct amdgpu_device *adev,
 }
 
 /* get the total error counts on all IPs */
-unsigned long amdgpu_ras_query_error_count(struct amdgpu_device *adev,
-               bool is_ce)
+void amdgpu_ras_query_error_count(struct amdgpu_device *adev,
+               unsigned long *ue_cnt, unsigned long *ce_cnt)
 {
        struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
        struct ras_manager *obj;
        struct ras_err_data data = {0, 0};
 
+       *ue_cnt = 0;
+       *ce_cnt = 0;
+
        if (!con)
-               return 0;
+               return;
 
        list_for_each_entry(obj, &con->head, node) {
                struct ras_query_if info = {
@@ -877,13 +880,14 @@ unsigned long amdgpu_ras_query_error_count(struct 
amdgpu_device *adev,
                };
 
                if (amdgpu_ras_error_query(adev, &info))
-                       return 0;
+                       continue;
 
                data.ce_count += info.ce_count;
                data.ue_count += info.ue_count;
        }
 
-       return is_ce ? data.ce_count : data.ue_count;
+       *ue_cnt = data.ue_count;
+       *ce_cnt = data.ce_count;
 }
 /* query/inject/cure end */
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index e7df5d8429f8..733eab5bc512 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -487,8 +487,8 @@ int amdgpu_ras_request_reset_on_boot(struct amdgpu_device 
*adev,
 void amdgpu_ras_resume(struct amdgpu_device *adev);
 void amdgpu_ras_suspend(struct amdgpu_device *adev);
 
-unsigned long amdgpu_ras_query_error_count(struct amdgpu_device *adev,
-               bool is_ce);
+void amdgpu_ras_query_error_count(struct amdgpu_device *adev,
+               unsigned long *ue_cnt, unsigned long *ce_cnt);
 
 /* error handling functions */
 int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
-- 
2.17.1

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

Reply via email to