From: Hawking Zhang <hawking.zh...@amd.com>

Introduce new functions that are used to generate CPER UE or CE records.
v2: return -ENOMEM instead of false
v2: check return value of fill section function

Signed-off-by: Hawking Zhang <hawking.zh...@amd.com>
Signed-off-by: Xiang Liu <xiang....@amd.com>
Reviewed-by: Yang Wang <keivnyang.w...@amd.com>
Reviewed-by: Tao Zhou <tao.zh...@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c  |  12 +--
 drivers/gpu/drm/amd/amdgpu/amdgpu_aca.h  |  12 ++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_cper.c | 124 +++++++++++++++++++++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_cper.h |   9 +-
 4 files changed, 144 insertions(+), 13 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c
index 1a26b8ad14cb..ed1c20bd8114 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c
@@ -30,16 +30,6 @@
 
 typedef int bank_handler_t(struct aca_handle *handle, struct aca_bank *bank, enum aca_smu_type type, void *data);
 
-struct aca_banks {
-	int nr_banks;
-	struct list_head list;
-};
-
-struct aca_hwip {
-	int hwid;
-	int mcatype;
-};
-
 static struct aca_hwip aca_hwid_mcatypes[ACA_HWIP_TYPE_COUNT] = {
 	ACA_BANK_HWID(SMU, 0x01, 0x01),
 	ACA_BANK_HWID(PCS_XGMI, 0x50, 0x00),
@@ -111,7 +101,7 @@ static struct aca_regs_dump {
 	{"STATUS", ACA_REG_IDX_STATUS},
 	{"ADDR", ACA_REG_IDX_ADDR},
 	{"MISC", ACA_REG_IDX_MISC0},
-	{"CONFIG", ACA_REG_IDX_CONFG},
+	{"CONFIG", ACA_REG_IDX_CONFIG},
 	{"IPID", ACA_REG_IDX_IPID},
 	{"SYND", ACA_REG_IDX_SYND},
 	{"DESTAT", ACA_REG_IDX_DESTAT},
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.h
index 3cd0115b0244..b84a3489b116 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.h
@@ -81,7 +81,7 @@ enum aca_reg_idx {
 	ACA_REG_IDX_STATUS = 1,
 	ACA_REG_IDX_ADDR = 2,
 	ACA_REG_IDX_MISC0 = 3,
-	ACA_REG_IDX_CONFG = 4,
+	ACA_REG_IDX_CONFIG = 4,
 	ACA_REG_IDX_IPID = 5,
 	ACA_REG_IDX_SYND = 6,
 	ACA_REG_IDX_DESTAT = 8,
@@ -114,6 +114,11 @@ enum aca_smu_type {
 	ACA_SMU_TYPE_COUNT,
 };
 
+struct aca_hwip {
+	int hwid;
+	int mcatype;
+};
+
 struct aca_bank {
 	enum aca_error_type aca_err_type;
 	enum aca_smu_type smu_err_type;
@@ -125,6 +130,11 @@ struct aca_bank_node {
 	struct list_head node;
 };
 
+struct aca_banks {
+	int nr_banks;
+	struct list_head list;
+};
+
 struct aca_bank_info {
 	int die_id;
 	int socket_id;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cper.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cper.c
index 8ce5dc6efcf9..f82aa12a88f4 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cper.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cper.c
@@ -21,6 +21,7 @@
  * OTHER DEALINGS IN THE SOFTWARE.
  *
  */
+#include <linux/list.h>
 #include "amdgpu.h"
 
 static const guid_t MCE = CPER_NOTIFY_MCE;
@@ -257,6 +258,129 @@ struct cper_hdr *amdgpu_cper_alloc_entry(struct amdgpu_device *adev,
 	return hdr;
 }
 
+/*
+ * Build a fatal CPER entry from the STATUS, ADDR, IPID and SYND registers
+ * of @bank. Returns 0 on success, -ENOMEM if no entry can be allocated, or
+ * the error returned while filling the fatal section.
+ */
+int amdgpu_cper_generate_ue_record(struct amdgpu_device *adev,
+				   struct aca_bank *bank)
+{
+	struct cper_hdr *fatal = NULL;
+	struct cper_sec_crashdump_reg_data reg_data = { 0 };
+	int ret;
+
+	fatal = amdgpu_cper_alloc_entry(adev, AMDGPU_CPER_TYPE_FATAL, 1);
+	if (!fatal) {
+		dev_err(adev->dev, "fail to alloc cper entry for ue record\n");
+		return -ENOMEM;
+	}
+
+	reg_data.status_lo = lower_32_bits(bank->regs[ACA_REG_IDX_STATUS]);
+	reg_data.status_hi = upper_32_bits(bank->regs[ACA_REG_IDX_STATUS]);
+	reg_data.addr_lo = lower_32_bits(bank->regs[ACA_REG_IDX_ADDR]);
+	reg_data.addr_hi = upper_32_bits(bank->regs[ACA_REG_IDX_ADDR]);
+	reg_data.ipid_lo = lower_32_bits(bank->regs[ACA_REG_IDX_IPID]);
+	reg_data.ipid_hi = upper_32_bits(bank->regs[ACA_REG_IDX_IPID]);
+	reg_data.synd_lo = lower_32_bits(bank->regs[ACA_REG_IDX_SYND]);
+	reg_data.synd_hi = upper_32_bits(bank->regs[ACA_REG_IDX_SYND]);
+
+	amdgpu_cper_entry_fill_hdr(adev, fatal, AMDGPU_CPER_TYPE_FATAL, CPER_SEV_FATAL);
+	ret = amdgpu_cper_entry_fill_fatal_section(adev, fatal, 0, reg_data);
+	if (ret)
+		return ret;
+
+	/*TODO: commit the cper entry to cper ring */
+
+	return 0;
+}
+
+/*
+ * Map an ACA error type to its CPER severity. Unknown types are logged
+ * and reported as fatal.
+ */
+static enum cper_error_severity amdgpu_aca_err_type_to_cper_sev(struct amdgpu_device *adev,
+								enum aca_error_type aca_err_type)
+{
+	switch (aca_err_type) {
+	case ACA_ERROR_TYPE_UE:
+		return CPER_SEV_FATAL;
+	case ACA_ERROR_TYPE_CE:
+		return CPER_SEV_NON_FATAL_CORRECTED;
+	case ACA_ERROR_TYPE_DEFERRED:
+		return CPER_SEV_NON_FATAL_UNCORRECTED;
+	default:
+		dev_err(adev->dev, "Unknown ACA error type!\n");
+		return CPER_SEV_FATAL;
+	}
+}
+
+/*
+ * Build one runtime CPER entry with one section per bank on @banks; the
+ * entry is allocated with @bank_count sections. The header severity is
+ * raised to uncorrected when any bank carries a deferred error. Returns 0
+ * on success, -ENOMEM if no entry can be allocated, or the error returned
+ * while filling a section.
+ */
+int amdgpu_cper_generate_ce_records(struct amdgpu_device *adev,
+				    struct aca_banks *banks,
+				    uint16_t bank_count)
+{
+	struct cper_hdr *corrected = NULL;
+	enum cper_error_severity sev = CPER_SEV_NON_FATAL_CORRECTED;
+	uint32_t reg_data[CPER_ACA_REG_COUNT] = { 0 };
+	struct aca_bank_node *node;
+	struct aca_bank *bank;
+	uint32_t i = 0;
+	int ret;
+
+	corrected = amdgpu_cper_alloc_entry(adev, AMDGPU_CPER_TYPE_RUNTIME, bank_count);
+	if (!corrected) {
+		dev_err(adev->dev, "fail to allocate cper entry for ce records\n");
+		return -ENOMEM;
+	}
+
+	/* Raise severity if any DE is detected in the ACA bank list */
+	list_for_each_entry(node, &banks->list, node) {
+		bank = &node->bank;
+		if (bank->aca_err_type == ACA_ERROR_TYPE_DEFERRED) {
+			sev = CPER_SEV_NON_FATAL_UNCORRECTED;
+			break;
+		}
+	}
+
+	amdgpu_cper_entry_fill_hdr(adev, corrected, AMDGPU_CPER_TYPE_RUNTIME, sev);
+
+	/* Combine CE and DE in cper record */
+	list_for_each_entry(node, &banks->list, node) {
+		bank = &node->bank;
+		reg_data[CPER_ACA_REG_CTL_LO] = lower_32_bits(bank->regs[ACA_REG_IDX_CTL]);
+		reg_data[CPER_ACA_REG_CTL_HI] = upper_32_bits(bank->regs[ACA_REG_IDX_CTL]);
+		reg_data[CPER_ACA_REG_STATUS_LO] = lower_32_bits(bank->regs[ACA_REG_IDX_STATUS]);
+		reg_data[CPER_ACA_REG_STATUS_HI] = upper_32_bits(bank->regs[ACA_REG_IDX_STATUS]);
+		reg_data[CPER_ACA_REG_ADDR_LO] = lower_32_bits(bank->regs[ACA_REG_IDX_ADDR]);
+		reg_data[CPER_ACA_REG_ADDR_HI] = upper_32_bits(bank->regs[ACA_REG_IDX_ADDR]);
+		reg_data[CPER_ACA_REG_MISC0_LO] = lower_32_bits(bank->regs[ACA_REG_IDX_MISC0]);
+		reg_data[CPER_ACA_REG_MISC0_HI] = upper_32_bits(bank->regs[ACA_REG_IDX_MISC0]);
+		reg_data[CPER_ACA_REG_CONFIG_LO] = lower_32_bits(bank->regs[ACA_REG_IDX_CONFIG]);
+		reg_data[CPER_ACA_REG_CONFIG_HI] = upper_32_bits(bank->regs[ACA_REG_IDX_CONFIG]);
+		reg_data[CPER_ACA_REG_IPID_LO] = lower_32_bits(bank->regs[ACA_REG_IDX_IPID]);
+		reg_data[CPER_ACA_REG_IPID_HI] = upper_32_bits(bank->regs[ACA_REG_IDX_IPID]);
+		reg_data[CPER_ACA_REG_SYND_LO] = lower_32_bits(bank->regs[ACA_REG_IDX_SYND]);
+		reg_data[CPER_ACA_REG_SYND_HI] = upper_32_bits(bank->regs[ACA_REG_IDX_SYND]);
+
+		ret = amdgpu_cper_entry_fill_runtime_section(adev, corrected, i++,
+				amdgpu_aca_err_type_to_cper_sev(adev, bank->aca_err_type),
+				reg_data, CPER_ACA_REG_COUNT);
+		if (ret)
+			return ret;
+	}
+
+	/*TODO: commit the cper entry to cper ring */
+
+	return 0;
+}
+
 int amdgpu_cper_init(struct amdgpu_device *adev)
 {
 	mutex_init(&adev->cper.cper_lock);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cper.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_cper.h
index 0ae845420983..6860a809f2f5 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cper.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cper.h
@@ -26,6 +26,7 @@
 #define __AMDGPU_CPER_H__
 
 #include "amd_cper.h"
+#include "amdgpu_aca.h"
 
 #define CPER_MAX_ALLOWED_COUNT 0x1000
 #define HDR_LEN (sizeof(struct cper_hdr))
@@ -84,7 +85,13 @@ int amdgpu_cper_entry_fill_bad_page_threshold_section(struct amdgpu_device *adev
 struct cper_hdr *amdgpu_cper_alloc_entry(struct amdgpu_device *adev,
 					 enum amdgpu_cper_type type,
 					 uint16_t section_count);
-
+/* UE must be encoded into separate cper entries, 1 UE 1 cper */
+int amdgpu_cper_generate_ue_record(struct amdgpu_device *adev,
+				   struct aca_bank *bank);
+/* CEs and DEs are combined into 1 cper entry */
+int amdgpu_cper_generate_ce_records(struct amdgpu_device *adev,
+				    struct aca_banks *banks,
+				    uint16_t bank_count);
 int amdgpu_cper_init(struct amdgpu_device *adev);
 int amdgpu_cper_fini(struct amdgpu_device *adev);
 
-- 
2.34.1