Module Name: src Committed By: riastradh Date: Sun Oct 27 12:59:09 UTC 2024
Modified Files: src/sys/dev/acpi: apei.c apei_cper.h Log Message: apei(4): Print PCIe errors. PR kern/58775: apei(4) spamming console To generate a diff of this commit: cvs rdiff -u -r1.6 -r1.7 src/sys/dev/acpi/apei.c cvs rdiff -u -r1.4 -r1.5 src/sys/dev/acpi/apei_cper.h Please note that diffs are not public domain; they are subject to the copyright notices on the relevant files.
Modified files:

Index: src/sys/dev/acpi/apei.c
diff -u src/sys/dev/acpi/apei.c:1.6 src/sys/dev/acpi/apei.c:1.7
--- src/sys/dev/acpi/apei.c:1.6	Sun Oct 27 12:14:07 2024
+++ src/sys/dev/acpi/apei.c	Sun Oct 27 12:59:08 2024
@@ -1,4 +1,4 @@
-/*	$NetBSD: apei.c,v 1.6 2024/10/27 12:14:07 riastradh Exp $	*/
+/*	$NetBSD: apei.c,v 1.7 2024/10/27 12:59:08 riastradh Exp $	*/
 
 /*-
  * Copyright (c) 2024 The NetBSD Foundation, Inc.
@@ -38,12 +38,13 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: apei.c,v 1.6 2024/10/27 12:14:07 riastradh Exp $");
+__KERNEL_RCSID(0, "$NetBSD: apei.c,v 1.7 2024/10/27 12:59:08 riastradh Exp $");
 
 #include <sys/param.h>
 #include <sys/types.h>
 
 #include <sys/atomic.h>
+#include <sys/endian.h>
 #include <sys/device.h>
 #include <sys/module.h>
 #include <sys/sysctl.h>
@@ -58,6 +59,7 @@ __KERNEL_RCSID(0, "$NetBSD: apei.c,v 1.6
 #include <dev/acpi/apei_hestvar.h>
 #include <dev/acpi/apei_interp.h>
 #include <dev/acpi/apeivar.h>
+#include <dev/pci/pcireg.h>
 
 #define	_COMPONENT	ACPI_RESOURCE_COMPONENT
 ACPI_MODULE_NAME	("apei")
@@ -489,6 +491,257 @@ out:	/*
 }
 
 /*
+ * N.2.7. PCI Express Error Section
+ *
+ * https://uefi.org/specs/UEFI/2.10/Apx_N_Common_Platform_Error_Record.html#pci-express-error-section
+ */
+static const struct uuid CPER_PCIE_ERROR_SECTION =
+    {0xd995e954,0xbbc1,0x430f,0xad,0x91,{0xb4,0x4d,0xcb,0x3c,0x6f,0x35}};
+
+static const char *const cper_pcie_error_port_type[] = {
+#define	F(LN, SN, V)	[LN] = #SN,
+	CPER_PCIE_ERROR_PORT_TYPES(F)
+#undef	F
+};
+
+/*
+ * apei_cper_pcie_error_report(sc, buf, len, ctx, ratelimitok)
+ *
+ *	Print the fields of the PCI Express Error Section at buf that
+ *	ValidationBits marks valid, each line prefixed with ctx.
+ *	Print nothing if ratelimitok is false.
+ *
+ *	len is not used here; buf is read as a struct cper_pcie_error
+ *	(presumably the caller checks the length against the size
+ *	registered in apei_cper_reports -- XXX confirm).
+ */
+static void
+apei_cper_pcie_error_report(struct apei_softc *sc, const void *buf, size_t len,
+    const char *ctx, bool ratelimitok)
+{
+	const struct cper_pcie_error *PE = buf;
+	char bitbuf[1024];
+
+	/*
+	 * If we've hit the rate limit, skip printing the error.
+	 */
+	if (!ratelimitok)
+		goto out;
+
+	snprintb(bitbuf, sizeof(bitbuf),
+	    CPER_PCIE_ERROR_VALIDATION_BITS_FMT, PE->ValidationBits);
+	aprint_debug_dev(sc->sc_dev, "%s: ValidationBits=%s\n", ctx, bitbuf);
+	if (PE->ValidationBits & CPER_PCIE_ERROR_VALID_PORT_TYPE) {
+		const uint32_t t = PE->PortType;
+		const char *n = t < __arraycount(cper_pcie_error_port_type)
+		    ? cper_pcie_error_port_type[t] : NULL;
+
+		if (n) {
+			device_printf(sc->sc_dev, "%s: PortType=%"PRIu32
+			    " (%s)\n", ctx, t, n);
+		} else {
+			device_printf(sc->sc_dev, "%s: PortType=%"PRIu32"\n",
+			    ctx, t);
+		}
+	}
+	if (PE->ValidationBits & CPER_PCIE_ERROR_VALID_VERSION) {
+		/* XXX BCD */
+		device_printf(sc->sc_dev, "%s: Version=0x%08"PRIx32"\n",
+		    ctx, PE->Version);
+	}
+	if (PE->ValidationBits & CPER_PCIE_ERROR_VALID_COMMAND_STATUS) {
+		/* XXX move me to pcireg.h */
+		snprintb(bitbuf, sizeof(bitbuf), "\177\020"
+		    /* command */
+		    "b\000"	"IO_ENABLE\0"
+		    "b\001"	"MEM_ENABLE\0"
+		    "b\002"	"MASTER_ENABLE\0"
+		    "b\003"	"SPECIAL_ENABLE\0"
+		    "b\004"	"INVALIDATE_ENABLE\0"
+		    "b\005"	"PALETTE_ENABLE\0"
+		    "b\006"	"PARITY_ENABLE\0"
+		    "b\007"	"STEPPING_ENABLE\0"
+		    "b\010"	"SERR_ENABLE\0"
+		    "b\011"	"BACKTOBACK_ENABLE\0"
+		    "b\012"	"INTERRUPT_DISABLE\0"
+		    /* status */
+		    "b\023"	"INT_STATUS\0"
+		    "b\024"	"CAPLIST_SUPPORT\0"
+		    "b\025"	"66MHZ_SUPPORT\0"
+		    "b\026"	"UDF_SUPPORT\0"
+		    "b\027"	"BACKTOBACK_SUPPORT\0"
+		    "b\030"	"PARITY_ERROR\0"
+		    "f\031\002"	"DEVSEL\0"
+			"=\000"	"FAST\0"
+			"=\001"	"MEDIUM\0"
+			"=\002"	"SLOW\0"
+		    "b\033"	"TARGET_TARGET_ABORT\0"
+		    "b\034"	"MASTER_TARGET_ABORT\0"
+		    "b\035"	"MASTER_ABORT\0"
+		    "b\036"	"SPECIAL_ERROR\0"
+		    "b\037"	"PARITY_DETECT\0"
+		    "\0", PE->CommandStatus);
+		device_printf(sc->sc_dev, "%s: CommandStatus=%s\n",
+		    ctx, bitbuf);
+	}
+	if (PE->ValidationBits & CPER_PCIE_ERROR_VALID_DEVICE_ID) {
+		device_printf(sc->sc_dev, "%s: DeviceID:"
+		    " VendorID=0x%04"PRIx16
+		    " DeviceID=0x%04"PRIx16
+		    " ClassCode=0x%06"PRIx32
+		    " Function=%"PRIu8
+		    " Device=%"PRIu8
+		    " Segment=%"PRIu16
+		    " Bus=%"PRIu8
+		    " SecondaryBus=%"PRIu8
+		    " Slot=0x%04"PRIx16
+		    " Reserved0=0x%02"PRIx8
+		    "\n",
+		    ctx,
+		    le16dec(PE->DeviceID.VendorID),
+		    le16dec(PE->DeviceID.DeviceID),
+		    (PE->DeviceID.ClassCode[0] | /* le24dec */
+			((uint32_t)PE->DeviceID.ClassCode[1] << 8) |
+			((uint32_t)PE->DeviceID.ClassCode[2] << 16)),
+		    PE->DeviceID.Function, PE->DeviceID.Device,
+		    le16dec(PE->DeviceID.Segment), PE->DeviceID.Bus,
+		    PE->DeviceID.SecondaryBus, le16dec(PE->DeviceID.Slot),
+		    PE->DeviceID.Reserved0);
+	}
+	if (PE->ValidationBits & CPER_PCIE_ERROR_VALID_DEVICE_SERIAL) {
+		device_printf(sc->sc_dev, "%s: DeviceSerial={%016"PRIx64"}\n",
+		    ctx, PE->DeviceSerial);
+	}
+	if (PE->ValidationBits & CPER_PCIE_ERROR_VALID_BRIDGE_CONTROL_STATUS) {
+		/* XXX snprintb */
+		device_printf(sc->sc_dev, "%s: BridgeControlStatus=%"PRIx32
+		    "\n", ctx, PE->BridgeControlStatus);
+	}
+	if (PE->ValidationBits & CPER_PCIE_ERROR_VALID_CAPABILITY_STRUCTURE) {
+		uint32_t dcsr, dsr;
+		char hex[2*sizeof(PE->CapabilityStructure) + 1];
+		unsigned i;
+
+		for (i = 0; i < sizeof(PE->CapabilityStructure); i++) {
+			snprintf(hex + 2*i, sizeof(hex) - 2*i, "%02hhx",
+			    PE->CapabilityStructure[i]);
+		}
+		device_printf(sc->sc_dev, "%s: CapabilityStructure={%s}\n",
+		    ctx, hex);
+
+		dcsr = le32dec(&PE->CapabilityStructure[PCIE_DCSR]);
+		dsr = __SHIFTOUT(dcsr, __BITS(31,16));
+		if (dsr != 0) {
+			/*
+			 * XXX move me to pcireg.h; note: high
+			 * half of DCSR
+			 */
+			snprintb(bitbuf, sizeof(bitbuf), "\177\020"
+			    "b\000"	"CORRECTABLE_ERROR\0"
+			    "b\001"	"NONFATAL_UNCORRECTABLE_ERROR\0"
+			    "b\002"	"FATAL_ERROR\0"
+			    "b\003"	"UNSUPPORTED_REQUEST\0"
+			    "b\004"	"AUX_POWER\0"
+			    "b\005"	"TRANSACTIONS_PENDING\0"
+			    "\0", dsr);
+			device_printf(sc->sc_dev, "%s: PCIe Device Status:"
+			    " %s\n",
+			    ctx, bitbuf);
+		}
+	}
+	if (PE->ValidationBits & CPER_PCIE_ERROR_VALID_AER_INFO) {
+		uint32_t uc_status, uc_sev;
+		uint32_t cor_status;
+		uint32_t control;
+		char hex[2*sizeof(PE->AERInfo) + 1];
+		unsigned i;
+
+		for (i = 0; i < sizeof(PE->AERInfo); i++) {
+			snprintf(hex + 2*i, sizeof(hex) - 2*i, "%02hhx",
+			    PE->AERInfo[i]);
+		}
+		device_printf(sc->sc_dev, "%s: AERInfo={%s}\n", ctx, hex);
+
+		/* XXX move me to pcireg.h */
+#define	PCI_AER_UC_STATUS_FMT	"\177\020"				      \
+	"b\000"	"UNDEFINED\0"						      \
+	"b\004"	"DL_PROTOCOL_ERROR\0"					      \
+	"b\005"	"SURPRISE_DOWN_ERROR\0"					      \
+	"b\014"	"POISONED_TLP\0"					      \
+	"b\015"	"FC_PROTOCOL_ERROR\0"					      \
+	"b\016"	"COMPLETION_TIMEOUT\0"					      \
+	"b\017"	"COMPLETION_ABORT\0"					      \
+	"b\020"	"UNEXPECTED_COMPLETION\0"				      \
+	"b\021"	"RECEIVER_OVERFLOW\0"					      \
+	"b\022"	"MALFORMED_TLP\0"					      \
+	"b\023"	"ECRC_ERROR\0"						      \
+	"b\024"	"UNSUPPORTED_REQUEST_ERROR\0"				      \
+	"b\025"	"ACS_VIOLATION\0"					      \
+	"b\026"	"INTERNAL_ERROR\0"					      \
+	"b\027"	"MC_BLOCKED_TLP\0"					      \
+	"b\030"	"ATOMIC_OP_EGRESS_BLOCKED\0"				      \
+	"b\031"	"TLP_PREFIX_BLOCKED_ERROR\0"				      \
+	"b\032"	"POISONTLP_EGRESS_BLOCKED\0"				      \
+	"\0"
+
+		uc_status = le32dec(&PE->AERInfo[PCI_AER_UC_STATUS]);
+		uc_sev = le32dec(&PE->AERInfo[PCI_AER_UC_SEVERITY]);
+		cor_status = le32dec(&PE->AERInfo[PCI_AER_COR_STATUS]);
+		control = le32dec(&PE->AERInfo[PCI_AER_CAP_CONTROL]);
+
+		/*
+		 * Split uncorrectable errors by severity: status bits
+		 * whose uc_sev bit is set are reported as fatal, the
+		 * rest as non-fatal.
+		 */
+		if (uc_status & uc_sev) {
+			snprintb(bitbuf, sizeof(bitbuf), PCI_AER_UC_STATUS_FMT,
+			    uc_status & uc_sev);
+			device_printf(sc->sc_dev, "%s:"
+			    " AER hardware fatal uncorrectable errors: %s\n",
+			    ctx, bitbuf);
+		}
+		if (uc_status & ~uc_sev) {
+			snprintb(bitbuf, sizeof(bitbuf), PCI_AER_UC_STATUS_FMT,
+			    uc_status & ~uc_sev);
+			device_printf(sc->sc_dev, "%s:"
+			    " AER hardware non-fatal uncorrectable errors: %s\n",
+			    ctx, bitbuf);
+		}
+		if (uc_status) {
+			unsigned first = __SHIFTOUT(control,
+			    PCI_AER_FIRST_ERROR_PTR);
+			snprintb(bitbuf, sizeof(bitbuf), PCI_AER_UC_STATUS_FMT,
+			    (uint32_t)1 << first);
+			device_printf(sc->sc_dev, "%s:"
+			    " AER hardware first uncorrectable error: %s\n",
+			    ctx, bitbuf);
+		}
+		if (cor_status) {
+			/* XXX move me to pcireg.h */
+			snprintb(bitbuf, sizeof(bitbuf), "\177\020"
+			    "b\000"	"RECEIVER_ERROR\0"
+			    "b\006"	"BAD_TLP\0"
+			    "b\007"	"BAD_DLLP\0"
+			    "b\010"	"REPLAY_NUM_ROLLOVER\0"
+			    "b\014"	"REPLAY_TIMER_TIMEOUT\0"
+			    "b\015"	"ADVISORY_NF_ERROR\0"
+			    "b\016"	"INTERNAL_ERROR\0"
+			    "b\017"	"HEADER_LOG_OVERFLOW\0"
+			    "\0", cor_status);
+			device_printf(sc->sc_dev, "%s:"
+			    " AER hardware corrected error: %s\n",
+			    ctx, bitbuf);
+		}
+	}
+
+out:	/*
+	 * XXX pass this on to the PCI subsystem to handle
+	 */
+	return;
+}
+
+/*
  * apei_cper_reports
  *
  *	Table of known Common Platform Error Record types, symbolic
@@ -509,6 +762,9 @@ static const struct apei_cper_report {
 	{ "memory", &CPER_MEMORY_ERROR_SECTION,
 	  sizeof(struct cper_memory_error),
 	  apei_cper_memory_error_report },
+	{ "PCIe", &CPER_PCIE_ERROR_SECTION,
+	  sizeof(struct cper_pcie_error),
+	  apei_cper_pcie_error_report },
 };
 
 /*

Index: src/sys/dev/acpi/apei_cper.h
diff -u src/sys/dev/acpi/apei_cper.h:1.4 src/sys/dev/acpi/apei_cper.h:1.5
--- src/sys/dev/acpi/apei_cper.h:1.4	Sun Oct 27 12:14:07 2024
+++ src/sys/dev/acpi/apei_cper.h	Sun Oct 27 12:59:08 2024
@@ -1,4 +1,4 @@
-/*	$NetBSD: apei_cper.h,v 1.4 2024/10/27 12:14:07 riastradh Exp $	*/
+/*	$NetBSD: apei_cper.h,v 1.5 2024/10/27 12:59:08 riastradh Exp $	*/
 
 /*-
  * Copyright (c) 2024 The NetBSD Foundation, Inc.
@@ -233,4 +233,80 @@ enum {				/* struct cper_memory_error_ex
 	CPER_MEMORY_ERROR_EXTENDED_CHIPID		= __BITS(7,5),
 };
 
+/*
+ * N.2.7. PCI Express Error Section
+ *
+ * https://uefi.org/specs/UEFI/2.10/Apx_N_Common_Platform_Error_Record.html#pci-express-error-section
+ *
+ * Type: {0xd995e954,0xbbc1,0x430f,{0xad,0x91,0xb4,0x4d,0xcb,0x3c,0x6f,0x35}}
+ */
+
+struct cper_pcie_error {
+	uint64_t	ValidationBits;
+	uint32_t	PortType;
+	uint32_t	Version;
+	uint32_t	CommandStatus;
+	uint32_t	Reserved0;
+	struct {
+		uint8_t		VendorID[2];
+		uint8_t		DeviceID[2];	/* product */
+		uint8_t		ClassCode[3];
+		uint8_t		Function;
+		uint8_t		Device;
+		uint8_t		Segment[2];
+		uint8_t		Bus;
+		uint8_t		SecondaryBus;
+		uint8_t		Slot[2];	/* bits 0:2 resv, bits 3:15 slot */
+		uint8_t		Reserved0;
+	}		DeviceID;
+	uint64_t	DeviceSerial;
+	uint32_t	BridgeControlStatus;
+	uint8_t		CapabilityStructure[60];
+	uint8_t		AERInfo[96];
+} __packed;
+__CTASSERT(sizeof(struct cper_pcie_error) == 208);
+
+enum {				/* struct cper_pcie_error::ValidationBits */
+	CPER_PCIE_ERROR_VALID_PORT_TYPE			= __BIT(0),
+	CPER_PCIE_ERROR_VALID_VERSION			= __BIT(1),
+	CPER_PCIE_ERROR_VALID_COMMAND_STATUS		= __BIT(2),
+	CPER_PCIE_ERROR_VALID_DEVICE_ID			= __BIT(3),
+	CPER_PCIE_ERROR_VALID_DEVICE_SERIAL		= __BIT(4),
+	CPER_PCIE_ERROR_VALID_BRIDGE_CONTROL_STATUS	= __BIT(5),
+	CPER_PCIE_ERROR_VALID_CAPABILITY_STRUCTURE	= __BIT(6),
+	CPER_PCIE_ERROR_VALID_AER_INFO			= __BIT(7),
+};
+
+#define	CPER_PCIE_ERROR_VALIDATION_BITS_FMT	"\177\020"		      \
+	"b\000"	"PORT_TYPE\0"						      \
+	"b\001"	"VERSION\0"						      \
+	"b\002"	"COMMAND_STATUS\0"					      \
+	"b\003"	"DEVICE_ID\0"						      \
+	"b\004"	"DEVICE_SERIAL\0"					      \
+	"b\005"	"BRIDGE_CONTROL_STATUS\0"				      \
+	"b\006"	"CAPABILITY_STRUCTURE\0"				      \
+	"b\007"	"AER_INFO\0"						      \
+	"\0"
+
+#define	CPER_PCIE_ERROR_PORT_TYPES(F)					      \
+	F(CPER_PCIE_ERROR_PORT_TYPE_PCIE_ENDPOINT, PCIE_ENDPOINT, 0)	      \
+	F(CPER_PCIE_ERROR_PORT_TYPE_LEGACY_PCI_ENDPOINT, LEGACY_PCI_ENDPOINT, \
+	    1)								      \
+	F(CPER_PCIE_ERROR_PORT_TYPE_ROOTPORT5_UPSTREAMSWITCH,		      \
+	    ROOTPORT5_UPSTREAMSWITCH, 4)				      \
+	F(CPER_PCIE_ERROR_PORT_TYPE_DOWNSTREAMSWITCH, DOWNSTREAMSWITCH, 6)    \
+	F(CPER_PCIE_ERROR_PORT_TYPE_PCIE_PCI_BRIDGE, PCIE_PCI_BRIDGE, 7)     \
+	F(CPER_PCIE_ERROR_PORT_TYPE_PCI_PCIE_BRIDGE, PCI_PCIE_BRIDGE, 8)     \
+	F(CPER_PCIE_ERROR_PORT_TYPE_RCIEP_DEV, RCIEP_DEV, 9)		      \
+		/* Root Complex Integrated Endpoint Device */		      \
+	F(CPER_PCIE_ERROR_PORT_TYPE_RCEC, RCEC, 10)			      \
+		/* Root Complex Event Collector */			      \
+	/* end of CPER_PCIE_ERROR_PORT_TYPES */
+
+enum cper_pcie_error_port_type { /* struct cper_pcie_error::PortType */
+#define	CPER_PCIE_ERROR_PORT_TYPE_DEF(LN, SN, V)	LN = V,
+	CPER_PCIE_ERROR_PORT_TYPES(CPER_PCIE_ERROR_PORT_TYPE_DEF)
+#undef	CPER_PCIE_ERROR_PORT_TYPE_DEF
+};
+
 #endif	/* _SYS_DEV_ACPI_APEI_CPER_H_ */