Add a debug option to enable mode2 reset for poison recovery.
For testing purposes only.

Signed-off-by: Hawking Zhang <hawking.zh...@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h             |  1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c         |  6 ++++++
 drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c | 16 ++++++++++------
 3 files changed, 17 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index e6b641cb362a..c34819f947ed 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -1201,6 +1201,7 @@ struct amdgpu_device {
        bool                            debug_disable_soft_recovery;
        bool                            debug_use_vram_fw_buf;
        bool                            debug_enable_ras_aca;
+       bool                            debug_mode2_for_poison_recovery;
 };
 
 static inline uint32_t amdgpu_ip_version(const struct amdgpu_device *adev,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index afe3b8bd35a1..be6b920933d6 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -133,6 +133,7 @@ enum AMDGPU_DEBUG_MASK {
        AMDGPU_DEBUG_DISABLE_GPU_SOFT_RECOVERY = BIT(2),
        AMDGPU_DEBUG_USE_VRAM_FW_BUF = BIT(3),
        AMDGPU_DEBUG_ENABLE_RAS_ACA = BIT(4),
+       AMDGPU_DEBUG_MODE2_FOR_POISON_RECOVERY = BIT(5),
 };
 
 unsigned int amdgpu_vram_limit = UINT_MAX;
@@ -2229,6 +2230,11 @@ static void amdgpu_init_debug_options(struct amdgpu_device *adev)
                pr_info("debug: enable RAS ACA\n");
                adev->debug_enable_ras_aca = true;
        }
+
+       if (amdgpu_debug_mask & AMDGPU_DEBUG_MODE2_FOR_POISON_RECOVERY) {
+               pr_info("debug: enable mode2 reset for poison consumption recovery");
+               adev->debug_mode2_for_poison_recovery = true;
+       }
 }
 
 static unsigned long amdgpu_fix_asic_type(struct pci_dev *pdev, unsigned long flags)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
index 816800555f7f..a355b2bc2214 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
@@ -164,10 +164,12 @@ static void event_interrupt_poison_consumption_v9(struct kfd_node *dev,
        case SOC15_IH_CLIENTID_SE3SH:
        case SOC15_IH_CLIENTID_UTCL2:
                block = AMDGPU_RAS_BLOCK__GFX;
-               if (amdgpu_ip_version(dev->adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3))
-                       reset = AMDGPU_RAS_GPU_RESET_MODE1_RESET;
-               else
+               if (amdgpu_ip_version(dev->adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3)) {
+                       reset = ((dev->adev->debug_mode2_for_poison_recovery) ?
+                                AMDGPU_RAS_GPU_RESET_MODE2_RESET : AMDGPU_RAS_GPU_RESET_MODE1_RESET);
+               } else {
                        reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET;
+               }
                break;
        case SOC15_IH_CLIENTID_VMC:
        case SOC15_IH_CLIENTID_VMC1:
@@ -180,10 +182,12 @@ static void event_interrupt_poison_consumption_v9(struct kfd_node *dev,
        case SOC15_IH_CLIENTID_SDMA3:
        case SOC15_IH_CLIENTID_SDMA4:
                block = AMDGPU_RAS_BLOCK__SDMA;
-               if (amdgpu_ip_version(dev->adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3))
-                       reset = AMDGPU_RAS_GPU_RESET_MODE1_RESET;
-               else
+               if (amdgpu_ip_version(dev->adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3)) {
+                       reset = ((dev->adev->debug_mode2_for_poison_recovery) ?
+                                AMDGPU_RAS_GPU_RESET_MODE2_RESET : AMDGPU_RAS_GPU_RESET_MODE1_RESET);
+               } else {
                        reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET;
+               }
                break;
        default:
                dev_warn(dev->adev->dev,
-- 
2.17.1

Reply via email to