From: Nicholas Kazlauskas <[email protected]> [Why] To improve reliability of the system in the case of infrequent or potentially recoverable hangs when the DMU times out.
[How] Attempt to recover the ASIC when DMU hangs by triggering a crash recovery callback for the DM to forward to the base driver. Reviewed-by: Dillon Varone <[email protected]> Signed-off-by: Nicholas Kazlauskas <[email protected]> Signed-off-by: Matthew Stewart <[email protected]> --- .../amd/display/amdgpu_dm/amdgpu_dm_helpers.c | 6 ++++ drivers/gpu/drm/amd/display/dc/dc.h | 1 + drivers/gpu/drm/amd/display/dc/dc_dmub_srv.c | 35 ++++++++++++------- drivers/gpu/drm/amd/display/dc/dm_helpers.h | 1 + 4 files changed, 30 insertions(+), 13 deletions(-) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_helpers.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_helpers.c index e5e993d3ef74..1f41d6540b83 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_helpers.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_helpers.c @@ -1121,6 +1121,12 @@ void dm_set_dcn_clocks(struct dc_context *ctx, struct dc_clocks *clks) /* TODO: something */ } +void dm_helpers_dmu_timeout(struct dc_context *ctx) +{ + // TODO: + //amdgpu_device_gpu_recover(dc_context->driver-context, NULL); +} + void dm_helpers_smu_timeout(struct dc_context *ctx, unsigned int msg_id, unsigned int param, unsigned int timeout_us) { // TODO: diff --git a/drivers/gpu/drm/amd/display/dc/dc.h b/drivers/gpu/drm/amd/display/dc/dc.h index 94f62cf2cd30..15b90fac723a 100644 --- a/drivers/gpu/drm/amd/display/dc/dc.h +++ b/drivers/gpu/drm/amd/display/dc/dc.h @@ -1209,6 +1209,7 @@ struct dc_debug_options { uint32_t custom_psp_footer_size; bool disable_deferred_minimal_transitions; unsigned int num_fast_flips_to_steady_state_override; + bool enable_dmu_recovery; }; diff --git a/drivers/gpu/drm/amd/display/dc/dc_dmub_srv.c b/drivers/gpu/drm/amd/display/dc/dc_dmub_srv.c index 2dc6ae6b5bea..dc1b3f6c22c9 100644 --- a/drivers/gpu/drm/amd/display/dc/dc_dmub_srv.c +++ b/drivers/gpu/drm/amd/display/dc/dc_dmub_srv.c @@ -41,6 +41,8 @@ #define DC_LOGGER CTX->logger #define GPINT_RETRY_NUM 20 +#define MAX_WAIT_US 100000 + static void dc_dmub_srv_construct(struct dc_dmub_srv *dc_srv, struct dc *dc, struct dmub_srv *dmub) { @@ -48,6 +50,13 @@ static void dc_dmub_srv_construct(struct dc_dmub_srv *dc_srv, struct dc *dc, dc_srv->ctx = dc->ctx; } +static void dc_dmub_srv_handle_failure(struct dc_dmub_srv *dc_dmub_srv) +{ + dc_dmub_srv_log_diagnostic_data(dc_dmub_srv); + if (dc_dmub_srv->ctx->dc->debug.enable_dmu_recovery) + dm_helpers_dmu_timeout(dc_dmub_srv->ctx); +} + struct dc_dmub_srv *dc_dmub_srv_create(struct dc *dc, struct dmub_srv *dmub) { struct dc_dmub_srv *dc_srv = @@ -84,12 +93,12 @@ bool dc_dmub_srv_wait_for_pending(struct dc_dmub_srv *dc_dmub_srv) dmub = dc_dmub_srv->dmub; do { - status = dmub_srv_wait_for_pending(dmub, 100000); + status = dmub_srv_wait_for_pending(dmub, MAX_WAIT_US); } while (dc_dmub_srv->ctx->dc->debug.disable_timeout && status != DMUB_STATUS_OK); if (status != DMUB_STATUS_OK) { DC_ERROR("Error waiting for DMUB idle: status=%d\n", status); - dc_dmub_srv_log_diagnostic_data(dc_dmub_srv); + dc_dmub_srv_handle_failure(dc_dmub_srv); } return status == DMUB_STATUS_OK; @@ -104,7 +113,7 @@ void dc_dmub_srv_clear_inbox0_ack(struct dc_dmub_srv *dc_dmub_srv) status = dmub_srv_clear_inbox0_ack(dmub); if (status != DMUB_STATUS_OK) { DC_ERROR("Error clearing INBOX0 ack: status=%d\n", status); - dc_dmub_srv_log_diagnostic_data(dc_dmub_srv); + dc_dmub_srv_handle_failure(dc_dmub_srv); } } @@ -114,10 +123,10 @@ void dc_dmub_srv_wait_for_inbox0_ack(struct dc_dmub_srv *dc_dmub_srv) struct dc_context *dc_ctx = dc_dmub_srv->ctx; enum dmub_status status = DMUB_STATUS_OK; - status = dmub_srv_wait_for_inbox0_ack(dmub, 100000); + status = dmub_srv_wait_for_inbox0_ack(dmub, MAX_WAIT_US); if (status != DMUB_STATUS_OK) { DC_ERROR("Error waiting for INBOX0 HW Lock Ack\n"); - dc_dmub_srv_log_diagnostic_data(dc_dmub_srv); + dc_dmub_srv_handle_failure(dc_dmub_srv); } } @@ -131,7 +140,7 @@ void dc_dmub_srv_send_inbox0_cmd(struct dc_dmub_srv *dc_dmub_srv, status = dmub_srv_send_inbox0_cmd(dmub, data); if (status != DMUB_STATUS_OK) { DC_ERROR("Error sending INBOX0 cmd\n"); - dc_dmub_srv_log_diagnostic_data(dc_dmub_srv); + dc_dmub_srv_handle_failure(dc_dmub_srv); } } @@ -153,7 +162,7 @@ static bool dc_dmub_srv_reg_cmd_list_queue_execute(struct dc_dmub_srv *dc_dmub_s for (i = 0 ; i < count; i++) { /* confirm no messages pending */ do { - status = dmub_srv_wait_for_idle(dmub, 100000); + status = dmub_srv_wait_for_idle(dmub, MAX_WAIT_US); } while (dc_dmub_srv->ctx->dc->debug.disable_timeout && status != DMUB_STATUS_OK); /* queue command */ @@ -169,7 +178,7 @@ static bool dc_dmub_srv_reg_cmd_list_queue_execute(struct dc_dmub_srv *dc_dmub_s if (status != DMUB_STATUS_OK) { if (status != DMUB_STATUS_POWER_STATE_D3) { DC_ERROR("Error starting DMUB execution: status=%d\n", status); - dc_dmub_srv_log_diagnostic_data(dc_dmub_srv); + dc_dmub_srv_handle_failure(dc_dmub_srv); } return false; } @@ -208,7 +217,7 @@ static bool dc_dmub_srv_fb_cmd_list_queue_execute(struct dc_dmub_srv *dc_dmub_sr return false; do { - status = dmub_srv_wait_for_inbox_free(dmub, 100000, count - i); + status = dmub_srv_wait_for_inbox_free(dmub, MAX_WAIT_US, count - i); } while (dc_dmub_srv->ctx->dc->debug.disable_timeout && status != DMUB_STATUS_OK); /* Requeue the command. */ @@ -218,7 +227,7 @@ static bool dc_dmub_srv_fb_cmd_list_queue_execute(struct dc_dmub_srv *dc_dmub_sr if (status != DMUB_STATUS_OK) { if (status != DMUB_STATUS_POWER_STATE_D3) { DC_ERROR("Error queueing DMUB command: status=%d\n", status); - dc_dmub_srv_log_diagnostic_data(dc_dmub_srv); + dc_dmub_srv_handle_failure(dc_dmub_srv); } return false; } @@ -228,7 +237,7 @@ static bool dc_dmub_srv_fb_cmd_list_queue_execute(struct dc_dmub_srv *dc_dmub_sr if (status != DMUB_STATUS_OK) { if (status != DMUB_STATUS_POWER_STATE_D3) { DC_ERROR("Error starting DMUB execution: status=%d\n", status); - dc_dmub_srv_log_diagnostic_data(dc_dmub_srv); + dc_dmub_srv_handle_failure(dc_dmub_srv); } return false; } @@ -271,7 +280,7 @@ bool dc_dmub_srv_wait_for_idle(struct dc_dmub_srv *dc_dmub_srv, // Wait for DMUB to process command if (wait_type != DM_DMUB_WAIT_TYPE_NO_WAIT) { do { - status = dmub_srv_wait_for_idle(dmub, 100000); + status = dmub_srv_wait_for_idle(dmub, MAX_WAIT_US); } while (dc_dmub_srv->ctx->dc->debug.disable_timeout && status != DMUB_STATUS_OK); if (status != DMUB_STATUS_OK) { @@ -282,7 +291,7 @@ bool dc_dmub_srv_wait_for_idle(struct dc_dmub_srv *dc_dmub_srv, dmub->debug.timeout_info.timeout_cmd = *cmd_list; dmub->debug.timeout_info.timestamp = dm_get_timestamp(dc_dmub_srv->ctx); } - dc_dmub_srv_log_diagnostic_data(dc_dmub_srv); + dc_dmub_srv_handle_failure(dc_dmub_srv); return false; } diff --git a/drivers/gpu/drm/amd/display/dc/dm_helpers.h b/drivers/gpu/drm/amd/display/dc/dm_helpers.h index 9d160b39e8c5..7014b8c2c956 100644 --- a/drivers/gpu/drm/amd/display/dc/dm_helpers.h +++ b/drivers/gpu/drm/amd/display/dc/dm_helpers.h @@ -197,6 +197,7 @@ void dm_set_phyd32clk(struct dc_context *ctx, int freq_khz); bool dm_helpers_dmub_outbox_interrupt_control(struct dc_context *ctx, bool enable); +void dm_helpers_dmu_timeout(struct dc_context *ctx); void dm_helpers_smu_timeout(struct dc_context *ctx, unsigned int msg_id, unsigned int param, unsigned int timeout_us); // 0x1 = Result_OK, 0xFE = Result_UnkmownCmd, 0x0 = Status_Busy -- 2.52.0
