From: Nicholas Kazlauskas <[email protected]>

[Why]
Improve system reliability when the DMU times out because of an
infrequent or potentially recoverable hang.

[How]
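Attempt to recover the ASIC when the DMU hangs by triggering a crash
recovery callback that the DM forwards to the base driver. Recovery is
gated behind the new enable_dmu_recovery debug option, and the amdgpu DM
callback is currently a TODO stub.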

Reviewed-by: Dillon Varone <[email protected]>
Signed-off-by: Nicholas Kazlauskas <[email protected]>
Signed-off-by: Matthew Stewart <[email protected]>
---
 .../amd/display/amdgpu_dm/amdgpu_dm_helpers.c |  6 ++++
 drivers/gpu/drm/amd/display/dc/dc.h           |  1 +
 drivers/gpu/drm/amd/display/dc/dc_dmub_srv.c  | 35 ++++++++++++-------
 drivers/gpu/drm/amd/display/dc/dm_helpers.h   |  1 +
 4 files changed, 30 insertions(+), 13 deletions(-)

diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_helpers.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_helpers.c
index e5e993d3ef74..1f41d6540b83 100644
--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_helpers.c
+++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_helpers.c
@@ -1121,6 +1121,12 @@ void dm_set_dcn_clocks(struct dc_context *ctx, struct dc_clocks *clks)
        /* TODO: something */
 }
 
+void dm_helpers_dmu_timeout(struct dc_context *ctx)
+{
+       // TODO: trigger GPU recovery through the base driver, e.g.
+       //amdgpu_device_gpu_recover(ctx->driver_context, NULL);
+}
+
 void dm_helpers_smu_timeout(struct dc_context *ctx, unsigned int msg_id, unsigned int param, unsigned int timeout_us)
 {
        // TODO:
diff --git a/drivers/gpu/drm/amd/display/dc/dc.h b/drivers/gpu/drm/amd/display/dc/dc.h
index 94f62cf2cd30..15b90fac723a 100644
--- a/drivers/gpu/drm/amd/display/dc/dc.h
+++ b/drivers/gpu/drm/amd/display/dc/dc.h
@@ -1209,6 +1209,7 @@ struct dc_debug_options {
        uint32_t custom_psp_footer_size;
        bool disable_deferred_minimal_transitions;
        unsigned int num_fast_flips_to_steady_state_override;
+       bool enable_dmu_recovery;
 };
 
 
diff --git a/drivers/gpu/drm/amd/display/dc/dc_dmub_srv.c b/drivers/gpu/drm/amd/display/dc/dc_dmub_srv.c
index 2dc6ae6b5bea..dc1b3f6c22c9 100644
--- a/drivers/gpu/drm/amd/display/dc/dc_dmub_srv.c
+++ b/drivers/gpu/drm/amd/display/dc/dc_dmub_srv.c
@@ -41,6 +41,8 @@
 #define DC_LOGGER CTX->logger
 #define GPINT_RETRY_NUM 20
 
+#define MAX_WAIT_US 100000
+
 static void dc_dmub_srv_construct(struct dc_dmub_srv *dc_srv, struct dc *dc,
                                  struct dmub_srv *dmub)
 {
@@ -48,6 +50,13 @@ static void dc_dmub_srv_construct(struct dc_dmub_srv *dc_srv, struct dc *dc,
        dc_srv->ctx = dc->ctx;
 }
 
+static void dc_dmub_srv_handle_failure(struct dc_dmub_srv *dc_dmub_srv)
+{
+       dc_dmub_srv_log_diagnostic_data(dc_dmub_srv);
+       if (dc_dmub_srv->ctx->dc->debug.enable_dmu_recovery)
+               dm_helpers_dmu_timeout(dc_dmub_srv->ctx);
+}
+
 struct dc_dmub_srv *dc_dmub_srv_create(struct dc *dc, struct dmub_srv *dmub)
 {
        struct dc_dmub_srv *dc_srv =
@@ -84,12 +93,12 @@ bool dc_dmub_srv_wait_for_pending(struct dc_dmub_srv *dc_dmub_srv)
        dmub = dc_dmub_srv->dmub;
 
        do {
-               status = dmub_srv_wait_for_pending(dmub, 100000);
+               status = dmub_srv_wait_for_pending(dmub, MAX_WAIT_US);
        } while (dc_dmub_srv->ctx->dc->debug.disable_timeout && status != DMUB_STATUS_OK);
 
        if (status != DMUB_STATUS_OK) {
                DC_ERROR("Error waiting for DMUB idle: status=%d\n", status);
-               dc_dmub_srv_log_diagnostic_data(dc_dmub_srv);
+               dc_dmub_srv_handle_failure(dc_dmub_srv);
        }
 
        return status == DMUB_STATUS_OK;
@@ -104,7 +113,7 @@ void dc_dmub_srv_clear_inbox0_ack(struct dc_dmub_srv *dc_dmub_srv)
        status = dmub_srv_clear_inbox0_ack(dmub);
        if (status != DMUB_STATUS_OK) {
                DC_ERROR("Error clearing INBOX0 ack: status=%d\n", status);
-               dc_dmub_srv_log_diagnostic_data(dc_dmub_srv);
+               dc_dmub_srv_handle_failure(dc_dmub_srv);
        }
 }
 
@@ -114,10 +123,10 @@ void dc_dmub_srv_wait_for_inbox0_ack(struct dc_dmub_srv *dc_dmub_srv)
        struct dc_context *dc_ctx = dc_dmub_srv->ctx;
        enum dmub_status status = DMUB_STATUS_OK;
 
-       status = dmub_srv_wait_for_inbox0_ack(dmub, 100000);
+       status = dmub_srv_wait_for_inbox0_ack(dmub, MAX_WAIT_US);
        if (status != DMUB_STATUS_OK) {
                DC_ERROR("Error waiting for INBOX0 HW Lock Ack\n");
-               dc_dmub_srv_log_diagnostic_data(dc_dmub_srv);
+               dc_dmub_srv_handle_failure(dc_dmub_srv);
        }
 }
 
@@ -131,7 +140,7 @@ void dc_dmub_srv_send_inbox0_cmd(struct dc_dmub_srv *dc_dmub_srv,
        status = dmub_srv_send_inbox0_cmd(dmub, data);
        if (status != DMUB_STATUS_OK) {
                DC_ERROR("Error sending INBOX0 cmd\n");
-               dc_dmub_srv_log_diagnostic_data(dc_dmub_srv);
+               dc_dmub_srv_handle_failure(dc_dmub_srv);
        }
 }
 
@@ -153,7 +162,7 @@ static bool dc_dmub_srv_reg_cmd_list_queue_execute(struct dc_dmub_srv *dc_dmub_s
        for (i = 0 ; i < count; i++) {
                /* confirm no messages pending */
                do {
-                       status = dmub_srv_wait_for_idle(dmub, 100000);
+                       status = dmub_srv_wait_for_idle(dmub, MAX_WAIT_US);
                } while (dc_dmub_srv->ctx->dc->debug.disable_timeout && status != DMUB_STATUS_OK);
 
                /* queue command */
@@ -169,7 +178,7 @@ static bool dc_dmub_srv_reg_cmd_list_queue_execute(struct dc_dmub_srv *dc_dmub_s
        if (status != DMUB_STATUS_OK) {
                if (status != DMUB_STATUS_POWER_STATE_D3) {
                        DC_ERROR("Error starting DMUB execution: status=%d\n", status);
-                       dc_dmub_srv_log_diagnostic_data(dc_dmub_srv);
+                       dc_dmub_srv_handle_failure(dc_dmub_srv);
                }
                return false;
        }
@@ -208,7 +217,7 @@ static bool dc_dmub_srv_fb_cmd_list_queue_execute(struct dc_dmub_srv *dc_dmub_sr
                                return false;
 
                        do {
-                                       status = dmub_srv_wait_for_inbox_free(dmub, 100000, count - i);
+                                       status = dmub_srv_wait_for_inbox_free(dmub, MAX_WAIT_US, count - i);
                        } while (dc_dmub_srv->ctx->dc->debug.disable_timeout && status != DMUB_STATUS_OK);
 
                        /* Requeue the command. */
@@ -218,7 +227,7 @@ static bool dc_dmub_srv_fb_cmd_list_queue_execute(struct dc_dmub_srv *dc_dmub_sr
                if (status != DMUB_STATUS_OK) {
                        if (status != DMUB_STATUS_POWER_STATE_D3) {
                                DC_ERROR("Error queueing DMUB command: status=%d\n", status);
-                               dc_dmub_srv_log_diagnostic_data(dc_dmub_srv);
+                               dc_dmub_srv_handle_failure(dc_dmub_srv);
                        }
                        return false;
                }
@@ -228,7 +237,7 @@ static bool dc_dmub_srv_fb_cmd_list_queue_execute(struct dc_dmub_srv *dc_dmub_sr
        if (status != DMUB_STATUS_OK) {
                if (status != DMUB_STATUS_POWER_STATE_D3) {
                        DC_ERROR("Error starting DMUB execution: status=%d\n", status);
-                       dc_dmub_srv_log_diagnostic_data(dc_dmub_srv);
+                       dc_dmub_srv_handle_failure(dc_dmub_srv);
                }
                return false;
        }
@@ -271,7 +280,7 @@ bool dc_dmub_srv_wait_for_idle(struct dc_dmub_srv *dc_dmub_srv,
        // Wait for DMUB to process command
        if (wait_type != DM_DMUB_WAIT_TYPE_NO_WAIT) {
                do {
-                       status = dmub_srv_wait_for_idle(dmub, 100000);
+                       status = dmub_srv_wait_for_idle(dmub, MAX_WAIT_US);
                } while (dc_dmub_srv->ctx->dc->debug.disable_timeout && status != DMUB_STATUS_OK);
 
                if (status != DMUB_STATUS_OK) {
@@ -282,7 +291,7 @@ bool dc_dmub_srv_wait_for_idle(struct dc_dmub_srv *dc_dmub_srv,
                                        dmub->debug.timeout_info.timeout_cmd = *cmd_list;
                                dmub->debug.timeout_info.timestamp = dm_get_timestamp(dc_dmub_srv->ctx);
                        }
-                       dc_dmub_srv_log_diagnostic_data(dc_dmub_srv);
+                       dc_dmub_srv_handle_failure(dc_dmub_srv);
                        return false;
                }
 
diff --git a/drivers/gpu/drm/amd/display/dc/dm_helpers.h b/drivers/gpu/drm/amd/display/dc/dm_helpers.h
index 9d160b39e8c5..7014b8c2c956 100644
--- a/drivers/gpu/drm/amd/display/dc/dm_helpers.h
+++ b/drivers/gpu/drm/amd/display/dc/dm_helpers.h
@@ -197,6 +197,7 @@ void dm_set_phyd32clk(struct dc_context *ctx, int freq_khz);
 
 bool dm_helpers_dmub_outbox_interrupt_control(struct dc_context *ctx, bool enable);
 
+void dm_helpers_dmu_timeout(struct dc_context *ctx);
 void dm_helpers_smu_timeout(struct dc_context *ctx, unsigned int msg_id, unsigned int param, unsigned int timeout_us);
 
 // 0x1 = Result_OK, 0xFE = Result_UnkmownCmd, 0x0 = Status_Busy
-- 
2.52.0

Reply via email to