To maintain system error state when SMU errors occurred,
which will aid in debugging SMU firmware issues, add SMU
debug option support.

It can be enabled or disabled via amdgpu_smu_debug
debugfs file. When enabled, it makes SMU errors fatal.
It is disabled by default.

== Command Guide ==

1, enable SMU debug option

 # echo 1 > /sys/kernel/debug/dri/0/amdgpu_smu_debug

2, disable SMU debug option

 # echo 0 > /sys/kernel/debug/dri/0/amdgpu_smu_debug

v3:
 - Use debugfs_create_bool().(Christian)
 - Put variable into smu_context struct.
 - Don't resend command when timeout.

v2:
 - Resend command when timeout.(Lijo)
 - Use debugfs file instead of module parameter.

Signed-off-by: Lang Yu <lang...@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c        | 3 +++
 drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h            | 5 +++++
 drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c | 2 ++
 drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c             | 8 +++++++-
 4 files changed, 17 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
index 164d6a9e9fbb..86cd888c7822 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
@@ -1618,6 +1618,9 @@ int amdgpu_debugfs_init(struct amdgpu_device *adev)
        if (!debugfs_initialized())
                return 0;
 
+       debugfs_create_bool("amdgpu_smu_debug", 0600, root,
+                                 &adev->smu.smu_debug_mode);
+
        ent = debugfs_create_file("amdgpu_preempt_ib", 0600, root, adev,
                                  &fops_ib_preempt);
        if (IS_ERR(ent)) {
diff --git a/drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h 
b/drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h
index f738f7dc20c9..50dbf5594a9d 100644
--- a/drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h
+++ b/drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h
@@ -569,6 +569,11 @@ struct smu_context
        struct smu_user_dpm_profile user_dpm_profile;
 
        struct stb_context stb_context;
+       /*
+        * When enabled, it makes SMU errors fatal.
+        * (0 = disabled (default), 1 = enabled)
+        */
+       bool smu_debug_mode;
 };
 
 struct i2c_adapter;
diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
index 6e781cee8bb6..d3797a2d6451 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
@@ -1919,6 +1919,8 @@ static int aldebaran_mode2_reset(struct smu_context *smu)
 out:
        mutex_unlock(&smu->message_lock);
 
+       BUG_ON(unlikely(smu->smu_debug_mode) && ret);
+
        return ret;
 }
 
diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
index 048ca1673863..9be005eb4241 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
@@ -349,15 +349,21 @@ int smu_cmn_send_smc_msg_with_param(struct smu_context 
*smu,
                __smu_cmn_reg_print_error(smu, reg, index, param, msg);
                goto Out;
        }
+
        __smu_cmn_send_msg(smu, (uint16_t) index, param);
        reg = __smu_cmn_poll_stat(smu);
        res = __smu_cmn_reg2errno(smu, reg);
-       if (res != 0)
+       if (res != 0) {
                __smu_cmn_reg_print_error(smu, reg, index, param, msg);
+               goto Out;
+       }
        if (read_arg)
                smu_cmn_read_arg(smu, read_arg);
 Out:
        mutex_unlock(&smu->message_lock);
+
+       BUG_ON(unlikely(smu->smu_debug_mode) && res);
+
        return res;
 }
 
-- 
2.25.1

Reply via email to