On 2025-07-03 09:15:04 Thu, Ganesh Goudar wrote: > parse AER uncorrectable and correctable error status > registers to print error type and severity. > > output looks like > EEH:AER Uncorrectable Error > EEH:AER Error Type: Data Link Protocol Error [Fatal]
Thanks for working on this. But how do we know which PHB this error is reported on? Can we have the PHB details as a prefix in the error message? Also, can we have the error message in a format something like the one below? 0000:50:00.0: PCIe Bus Error: severity=Uncorrected (Fatal), type=Transaction Layer, id=0500(Requester ID) Thanks, -Mahesh. > > Signed-off-by: Ganesh Goudar <[email protected]> > --- > arch/powerpc/kernel/eeh.c | 84 ++++++++++++++++++++++++++++++++++++++- > 1 file changed, 83 insertions(+), 1 deletion(-) > > diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c > index 83fe99861eb1..03e1e2eeb679 100644 > --- a/arch/powerpc/kernel/eeh.c > +++ b/arch/powerpc/kernel/eeh.c > @@ -139,6 +139,49 @@ struct eeh_stats { > > static struct eeh_stats eeh_stats; > > +static const char * const aer_uncor_errors[] = { > + "Undefined", > + "Undefined", > + "Undefined", > + "Undefined", > + "Data Link Protocol", > + "Surprise Down", > + "Poisoned TLP", > + "Flow Control Protocol", > + "Completion Timeout", > + "Completer Abort", > + "Unexpected Completion", > + "Receiver Overflow", > + "Malformed TLP", > + "ECRC Error", > + "Unsupported Request", > + "ACS Violation", > + "Uncorrectable Internal Error", > + "MC Blocked TLP", > + "AtomicOp Egress Blocked", > + "TLPPrefix Blocked", > + "Poisoned TLP Egress Blocked" > +}; > + > +static const char * const aer_cor_errors[] = { > + "Receiver Error", > + "Undefined", > + "Undefined", > + "Undefined", > + "Undefined", > + "Undefined", > + "Bad TLP", > + "Bad DLLP", > + "Replay Num Rollover", > + "Undefined", > + "Undefined", > + "Undefined", > + "Replay Timer Timeout", > + "Advisory Non-Fatal Error", > + "Corrected Internal Error", > + "Header Log Overflow", > +}; > + > static int __init eeh_setup(char *str) > { > if (!strcmp(str, "off")) > @@ -160,6 +203,43 @@ void eeh_show_enabled(void) > pr_info("EEH: No capable adapters found: recovery disabled.\n"); > } > > +static void eeh_parse_aer_registers(struct eeh_dev *edev, int cap) > 
+{ > + int i; > + const char *error_type; > + u32 uncor_status, uncor_severity, cor_status; > + > + eeh_ops->read_config(edev, cap + PCI_ERR_UNCOR_STATUS, 4, > &uncor_status); > + eeh_ops->read_config(edev, cap + PCI_ERR_UNCOR_SEVER, 4, > &uncor_severity); > + eeh_ops->read_config(edev, cap + PCI_ERR_COR_STATUS, 4, &cor_status); > + > + if (!uncor_status && !cor_status) > + return; > + > + if (uncor_status) { > + pr_err("EEH:AER Uncorrectable Error\n"); > + for (i = 0; i < ARRAY_SIZE(aer_uncor_errors); i++) { > + if (uncor_status & (1 << i)) { > + error_type = (i < ARRAY_SIZE(aer_uncor_errors)) > + ? aer_uncor_errors[i] : "Unknown"; > + pr_err("EEH:AER Error Type: %s [%s]\n", > error_type, > + (uncor_severity & (1 << i)) ? "Fatal" : > "Non-Fatal"); > + } > + } > + } > + > + if (cor_status) { > + pr_err("EEH:AER Correctable Error\n"); > + for (i = 0; i < ARRAY_SIZE(aer_cor_errors); i++) { > + if (cor_status & (1 << i)) { > + error_type = (i < ARRAY_SIZE(aer_cor_errors)) > + ? aer_cor_errors[i] : "Unknown"; > + pr_err("EEH:AER Error Type: %s\n", error_type); > + } > + } > + } > +} > + > /* > * This routine captures assorted PCI configuration space data > * for the indicated PCI device, and puts them into a buffer > @@ -237,9 +317,11 @@ static size_t eeh_dump_dev_log(struct eeh_dev *edev, > char *buf, size_t len) > pr_warn("%s\n", buffer); > } > > - /* If AER capable, dump it */ > + /* If AER capable, parse and dump it */ > cap = edev->aer_cap; > if (cap) { > + eeh_parse_aer_registers(edev, cap); > + > n += scnprintf(buf+n, len-n, "pci-e AER:\n"); > pr_warn("EEH: PCI-E AER capability register set follows:\n"); > > -- > 2.48.1 > > -- Mahesh J Salgaonkar
