Module Name:    src
Committed By:   riastradh
Date:           Sun Oct 27 12:59:09 UTC 2024

Modified Files:
        src/sys/dev/acpi: apei.c apei_cper.h

Log Message:
apei(4): Print PCIe errors.

PR kern/58775: apei(4) spamming console


To generate a diff of this commit:
cvs rdiff -u -r1.6 -r1.7 src/sys/dev/acpi/apei.c
cvs rdiff -u -r1.4 -r1.5 src/sys/dev/acpi/apei_cper.h

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.

Modified files:

Index: src/sys/dev/acpi/apei.c
diff -u src/sys/dev/acpi/apei.c:1.6 src/sys/dev/acpi/apei.c:1.7
--- src/sys/dev/acpi/apei.c:1.6	Sun Oct 27 12:14:07 2024
+++ src/sys/dev/acpi/apei.c	Sun Oct 27 12:59:08 2024
@@ -1,4 +1,4 @@
-/*	$NetBSD: apei.c,v 1.6 2024/10/27 12:14:07 riastradh Exp $	*/
+/*	$NetBSD: apei.c,v 1.7 2024/10/27 12:59:08 riastradh Exp $	*/
 
 /*-
  * Copyright (c) 2024 The NetBSD Foundation, Inc.
@@ -38,12 +38,13 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: apei.c,v 1.6 2024/10/27 12:14:07 riastradh Exp $");
+__KERNEL_RCSID(0, "$NetBSD: apei.c,v 1.7 2024/10/27 12:59:08 riastradh Exp $");
 
 #include <sys/param.h>
 #include <sys/types.h>
 
 #include <sys/atomic.h>
+#include <sys/endian.h>
 #include <sys/device.h>
 #include <sys/module.h>
 #include <sys/sysctl.h>
@@ -58,6 +59,7 @@ __KERNEL_RCSID(0, "$NetBSD: apei.c,v 1.6
 #include <dev/acpi/apei_hestvar.h>
 #include <dev/acpi/apei_interp.h>
 #include <dev/acpi/apeivar.h>
+#include <dev/pci/pcireg.h>
 
 #define	_COMPONENT	ACPI_RESOURCE_COMPONENT
 ACPI_MODULE_NAME	("apei")
@@ -489,6 +491,241 @@ out:	/*
 }
 
 /*
+ * N.2.7. PCI Express Error Section -- section type UUID
+ *
+ * https://uefi.org/specs/UEFI/2.10/Apx_N_Common_Platform_Error_Record.html#pci-express-error-section
+ */
+static const struct uuid CPER_PCIE_ERROR_SECTION =
+    {0xd995e954,0xbbc1,0x430f,0xad,0x91,{0xb4,0x4d,0xcb,0x3c,0x6f,0x35}};
+
+static const char *const cper_pcie_error_port_type[] = {
+#define	F(LN, SN, V)	[LN] = #SN,
+	CPER_PCIE_ERROR_PORT_TYPES(F)	/* unlisted values stay NULL */
+#undef	F
+};
+
+static void
+apei_cper_pcie_error_report(struct apei_softc *sc, const void *buf, size_t len,
+    const char *ctx, bool ratelimitok)
+{
+	const struct cper_pcie_error *PE = buf;
+	char bitbuf[1024];
+
+	/*
+	 * Print the fields marked valid, unless we've hit the rate limit.
+	 */
+	if (!ratelimitok)
+		goto out;
+
+	snprintb(bitbuf, sizeof(bitbuf),
+	    CPER_PCIE_ERROR_VALIDATION_BITS_FMT, PE->ValidationBits);
+	aprint_debug_dev(sc->sc_dev, "%s: ValidationBits=%s\n", ctx, bitbuf);
+	if (PE->ValidationBits & CPER_PCIE_ERROR_VALID_PORT_TYPE) {
+		const uint32_t t = PE->PortType;
+		const char *n = t < __arraycount(cper_pcie_error_port_type)
+		    ? cper_pcie_error_port_type[t] : NULL;
+
+		if (n) {
+			device_printf(sc->sc_dev, "%s: PortType=%"PRIu32
+			    " (%s)\n", ctx, t, n);
+		} else {
+			device_printf(sc->sc_dev, "%s: PortType=%"PRIu32"\n",
+			    ctx, t);
+		}
+	}
+	if (PE->ValidationBits & CPER_PCIE_ERROR_VALID_VERSION) {
+		/* XXX BCD */
+		device_printf(sc->sc_dev, "%s: Version=0x%08"PRIx32"\n",
+		    ctx, PE->Version);
+	}
+	if (PE->ValidationBits & CPER_PCIE_ERROR_VALID_COMMAND_STATUS) {
+		/* XXX move me to pcireg.h */
+		snprintb(bitbuf, sizeof(bitbuf), "\177\020"
+			/* command */
+		    "b\000"	"IO_ENABLE\0"
+		    "b\001"	"MEM_ENABLE\0"
+		    "b\002"	"MASTER_ENABLE\0"
+		    "b\003"	"SPECIAL_ENABLE\0"
+		    "b\004"	"INVALIDATE_ENABLE\0"
+		    "b\005"	"PALETTE_ENABLE\0"
+		    "b\006"	"PARITY_ENABLE\0"
+		    "b\007"	"STEPPING_ENABLE\0"
+		    "b\010"	"SERR_ENABLE\0"
+		    "b\011"	"BACKTOBACK_ENABLE\0"
+		    "b\012"	"INTERRUPT_DISABLE\0"
+			/* status */
+		    "b\023"	"INT_STATUS\0"
+		    "b\024"	"CAPLIST_SUPPORT\0"
+		    "b\025"	"66MHZ_SUPPORT\0"
+		    "b\026"	"UDF_SUPPORT\0"
+		    "b\027"	"BACKTOBACK_SUPPORT\0"
+		    "b\030"	"PARITY_ERROR\0"
+		    "f\031\002"	"DEVSEL\0"
+			"=\000"		"FAST\0"
+			"=\001"		"MEDIUM\0"
+			"=\002"		"SLOW\0"
+		    "b\033"	"TARGET_TARGET_ABORT\0"
+		    "b\034"	"MASTER_TARGET_ABORT\0"
+		    "b\035"	"MASTER_ABORT\0"
+		    "b\036"	"SPECIAL_ERROR\0"
+		    "b\037"	"PARITY_DETECT\0"
+		    "\0", PE->CommandStatus);
+		device_printf(sc->sc_dev, "%s: CommandStatus=%s\n",
+		    ctx, bitbuf);
+	}
+	if (PE->ValidationBits & CPER_PCIE_ERROR_VALID_DEVICE_ID) {
+		device_printf(sc->sc_dev, "%s: DeviceID:"
+		    " VendorID=0x%04"PRIx16
+		    " DeviceID=0x%04"PRIx16
+		    " ClassCode=0x%06"PRIx32
+		    " Function=%"PRIu8
+		    " Device=%"PRIu8
+		    " Segment=%"PRIu16
+		    " Bus=%"PRIu8
+		    " SecondaryBus=%"PRIu8
+		    " Slot=0x%04"PRIx16
+		    " Reserved0=0x%02"PRIx8
+		    "\n",
+		    ctx,
+		    le16dec(PE->DeviceID.VendorID),
+		    le16dec(PE->DeviceID.DeviceID),
+		    (PE->DeviceID.ClassCode[0] |	/* le24dec */
+			((uint32_t)PE->DeviceID.ClassCode[1] << 8) |
+			((uint32_t)PE->DeviceID.ClassCode[2] << 16)),
+		    PE->DeviceID.Function, PE->DeviceID.Device,
+		    le16dec(PE->DeviceID.Segment), PE->DeviceID.Bus,
+		    PE->DeviceID.SecondaryBus, le16dec(PE->DeviceID.Slot),
+		    PE->DeviceID.Reserved0);
+	}
+	if (PE->ValidationBits & CPER_PCIE_ERROR_VALID_DEVICE_SERIAL) {
+		device_printf(sc->sc_dev, "%s: DeviceSerial={%016"PRIx64"}\n",
+		    ctx, PE->DeviceSerial);
+	}
+	if (PE->ValidationBits & CPER_PCIE_ERROR_VALID_BRIDGE_CONTROL_STATUS) {
+		/* XXX snprintb */
+		device_printf(sc->sc_dev, "%s: BridgeControlStatus=%"PRIx32
+		    "\n", ctx, PE->BridgeControlStatus);
+	}
+	if (PE->ValidationBits & CPER_PCIE_ERROR_VALID_CAPABILITY_STRUCTURE) {
+		uint32_t dcsr, dsr;
+		char hex[2*sizeof(PE->CapabilityStructure) + 1];
+		unsigned i;
+
+		for (i = 0; i < sizeof(PE->CapabilityStructure); i++) {
+			snprintf(hex + 2*i, sizeof(hex) - 2*i, "%02hhx",
+			    PE->CapabilityStructure[i]);
+		}
+		device_printf(sc->sc_dev, "%s: CapabilityStructure={%s}\n",
+		    ctx, hex);
+
+		dcsr = le32dec(&PE->CapabilityStructure[PCIE_DCSR]);
+		dsr = __SHIFTOUT(dcsr, __BITS(31,16));
+		if (dsr != 0) {
+			/*
+			 * XXX move me to pcireg.h; note: high
+			 * half of DCSR
+			 */
+			snprintb(bitbuf, sizeof(bitbuf), "\177\020"
+			    "b\000"	"CORRECTABLE_ERROR\0"
+			    "b\001"	"NONFATAL_UNCORRECTABLE_ERROR\0"
+			    "b\002"	"FATAL_ERROR\0"
+			    "b\003"	"UNSUPPORTED_REQUEST\0"
+			    "b\004"	"AUX_POWER\0"
+			    "b\005"	"TRANSACTIONS_PENDING\0"
+			    "\0", dsr);
+			device_printf(sc->sc_dev, "%s: PCIe Device Status:"
+			    " %s\n",
+			    ctx, bitbuf);
+		}
+	}
+	if (PE->ValidationBits & CPER_PCIE_ERROR_VALID_AER_INFO) {
+		uint32_t uc_status, uc_sev;
+		uint32_t cor_status;
+		uint32_t control;
+		char hex[2*sizeof(PE->AERInfo) + 1];
+		unsigned i;
+
+		for (i = 0; i < sizeof(PE->AERInfo); i++) {
+			snprintf(hex + 2*i, sizeof(hex) - 2*i, "%02hhx",
+			    PE->AERInfo[i]);
+		}
+		device_printf(sc->sc_dev, "%s: AERInfo={%s}\n", ctx, hex);
+
+			/* XXX move me to pcireg.h */
+#define	PCI_AER_UC_STATUS_FMT	"\177\020"				      \
+	"b\000"	"UNDEFINED\0"						      \
+	"b\004"	"DL_PROTOCOL_ERROR\0"					      \
+	"b\005"	"SURPRISE_DOWN_ERROR\0"					      \
+	"b\014"	"POISONED_TLP\0"					      \
+	"b\015"	"FC_PROTOCOL_ERROR\0"					      \
+	"b\016"	"COMPLETION_TIMEOUT\0"					      \
+	"b\017"	"COMPLETION_ABORT\0"					      \
+	"b\020"	"UNEXPECTED_COMPLETION\0"				      \
+	"b\021"	"RECEIVER_OVERFLOW\0"					      \
+	"b\022"	"MALFORMED_TLP\0"					      \
+	"b\023"	"ECRC_ERROR\0"						      \
+	"b\024"	"UNSUPPORTED_REQUEST_ERROR\0"				      \
+	"b\025"	"ACS_VIOLATION\0"					      \
+	"b\026"	"INTERNAL_ERROR\0"					      \
+	"b\027"	"MC_BLOCKED_TLP\0"					      \
+	"b\030"	"ATOMIC_OP_EGRESS_BLOCKED\0"				      \
+	"b\031"	"TLP_PREFIX_BLOCKED_ERROR\0"				      \
+	"b\032"	"POISONTLP_EGRESS_BLOCKED\0"				      \
+	"\0"
+
+		uc_status = le32dec(&PE->AERInfo[PCI_AER_UC_STATUS]);
+		uc_sev = le32dec(&PE->AERInfo[PCI_AER_UC_SEVERITY]);
+		cor_status = le32dec(&PE->AERInfo[PCI_AER_COR_STATUS]);
+		control = le32dec(&PE->AERInfo[PCI_AER_CAP_CONTROL]);
+		/* Severity bit set => that uncorrectable error is fatal. */
+		if (uc_status & uc_sev) {
+			snprintb(bitbuf, sizeof(bitbuf), PCI_AER_UC_STATUS_FMT,
+			    uc_status & uc_sev);
+			device_printf(sc->sc_dev, "%s:"
+			    " AER hardware fatal uncorrectable errors: %s\n",
+			    ctx, bitbuf);
+		}
+		if (uc_status & ~uc_sev) {
+			snprintb(bitbuf, sizeof(bitbuf), PCI_AER_UC_STATUS_FMT,
+			    uc_status & ~uc_sev);
+			device_printf(sc->sc_dev, "%s:"
+			    " AER hardware nonfatal uncorrectable errors: %s\n",
+			    ctx, bitbuf);
+		}
+		if (uc_status) {
+			unsigned first = __SHIFTOUT(control,
+			    PCI_AER_FIRST_ERROR_PTR);
+			snprintb(bitbuf, sizeof(bitbuf), PCI_AER_UC_STATUS_FMT,
+			    (uint32_t)1 << first);
+			device_printf(sc->sc_dev, "%s:"
+			    " AER hardware first uncorrectable error: %s\n",
+			    ctx, bitbuf);
+		}
+		if (cor_status) {
+			/* XXX move me to pcireg.h */
+			snprintb(bitbuf, sizeof(bitbuf), "\177\020"
+			    "b\000"	"RECEIVER_ERROR\0"
+			    "b\006"	"BAD_TLP\0"
+			    "b\007"	"BAD_DLLP\0"
+			    "b\010"	"REPLAY_NUM_ROLLOVER\0"
+			    "b\014"	"REPLAY_TIMER_TIMEOUT\0"
+			    "b\015"	"ADVISORY_NF_ERROR\0"
+			    "b\016"	"INTERNAL_ERROR\0"
+			    "b\017"	"HEADER_LOG_OVERFLOW\0"
+			    "\0", cor_status);
+			device_printf(sc->sc_dev, "%s:"
+			    " AER hardware corrected error: %s\n",
+			    ctx, bitbuf);
+		}
+	}
+
+out:	/*
+	 * XXX pass this on to the PCI subsystem to handle
+	 */
+	return;
+}
+
+/*
  * apei_cper_reports
  *
  *	Table of known Common Platform Error Record types, symbolic
@@ -509,6 +746,9 @@ static const struct apei_cper_report {
 	{ "memory", &CPER_MEMORY_ERROR_SECTION,
 	  sizeof(struct cper_memory_error),
 	  apei_cper_memory_error_report },
+	{ "PCIe", &CPER_PCIE_ERROR_SECTION,
+	  sizeof(struct cper_pcie_error),
+	  apei_cper_pcie_error_report },
 };
 
 /*

Index: src/sys/dev/acpi/apei_cper.h
diff -u src/sys/dev/acpi/apei_cper.h:1.4 src/sys/dev/acpi/apei_cper.h:1.5
--- src/sys/dev/acpi/apei_cper.h:1.4	Sun Oct 27 12:14:07 2024
+++ src/sys/dev/acpi/apei_cper.h	Sun Oct 27 12:59:08 2024
@@ -1,4 +1,4 @@
-/*	$NetBSD: apei_cper.h,v 1.4 2024/10/27 12:14:07 riastradh Exp $	*/
+/*	$NetBSD: apei_cper.h,v 1.5 2024/10/27 12:59:08 riastradh Exp $	*/
 
 /*-
  * Copyright (c) 2024 The NetBSD Foundation, Inc.
@@ -233,4 +233,80 @@ enum {				/* struct cper_memory_error_ex
 	CPER_MEMORY_ERROR_EXTENDED_CHIPID		= __BITS(7,5),
 };
 
+/*
+ * N.2.7. PCI Express Error Section
+ *
+ * https://uefi.org/specs/UEFI/2.10/Apx_N_Common_Platform_Error_Record.html#pci-express-error-section
+ *
+ * Type: {0xd995e954,0xbbc1,0x430f,{0xad,0x91,0xb4,0x4d,0xcb,0x3c,0x6f,0x35}}
+ */
+
+struct cper_pcie_error {
+	uint64_t	ValidationBits;	/* CPER_PCIE_ERROR_VALID_* */
+	uint32_t	PortType;	/* enum cper_pcie_error_port_type */
+	uint32_t	Version;
+	uint32_t	CommandStatus;
+	uint32_t	Reserved0;
+	struct {
+		uint8_t		VendorID[2];	/* decoded with le16dec */
+		uint8_t		DeviceID[2]; /* product */
+		uint8_t		ClassCode[3];	/* low byte first */
+		uint8_t		Function;
+		uint8_t		Device;
+		uint8_t		Segment[2];	/* decoded with le16dec */
+		uint8_t		Bus;
+		uint8_t		SecondaryBus;
+		uint8_t		Slot[2]; /* bits 0:2 resv, bits 3:15 slot */
+		uint8_t		Reserved0;
+	}		DeviceID;
+	uint64_t	DeviceSerial;
+	uint32_t	BridgeControlStatus;
+	uint8_t		CapabilityStructure[60]; /* PCIE_* register offsets */
+	uint8_t		AERInfo[96];	/* PCI_AER_* register offsets */
+} __packed;
+__CTASSERT(sizeof(struct cper_pcie_error) == 208);
+
+enum {	/* struct cper_pcie_error::ValidationBits; bit set = field valid */
+	CPER_PCIE_ERROR_VALID_PORT_TYPE			= __BIT(0),
+	CPER_PCIE_ERROR_VALID_VERSION			= __BIT(1),
+	CPER_PCIE_ERROR_VALID_COMMAND_STATUS		= __BIT(2),
+	CPER_PCIE_ERROR_VALID_DEVICE_ID			= __BIT(3),
+	CPER_PCIE_ERROR_VALID_DEVICE_SERIAL		= __BIT(4),
+	CPER_PCIE_ERROR_VALID_BRIDGE_CONTROL_STATUS	= __BIT(5),
+	CPER_PCIE_ERROR_VALID_CAPABILITY_STRUCTURE	= __BIT(6),
+	CPER_PCIE_ERROR_VALID_AER_INFO			= __BIT(7),
+};
+
+#define	CPER_PCIE_ERROR_VALIDATION_BITS_FMT	"\177\020"		      \
+	"b\000"	"PORT_TYPE\0"						      \
+	"b\001"	"VERSION\0"						      \
+	"b\002"	"COMMAND_STATUS\0"					      \
+	"b\003"	"DEVICE_ID\0"						      \
+	"b\004"	"DEVICE_SERIAL\0"					      \
+	"b\005"	"BRIDGE_CONTROL_STATUS\0"				      \
+	"b\006"	"CAPABILITY_STRUCTURE\0"				      \
+	"b\007"	"AER_INFO\0"						      \
+	"\0"	/* snprintb format naming the ValidationBits flags */
+
+#define	CPER_PCIE_ERROR_PORT_TYPES(F)					      \
+	F(CPER_PCIE_ERROR_PORT_TYPE_PCIE_ENDPOINT, PCIE_ENDPOINT, 0)	      \
+	F(CPER_PCIE_ERROR_PORT_TYPE_LEGACY_PCI_ENDPOINT, LEGACY_PCI_ENDPOINT, \
+	    1)								      \
+	F(CPER_PCIE_ERROR_PORT_TYPE_ROOTPORT, ROOTPORT, 4)		      \
+	F(CPER_PCIE_ERROR_PORT_TYPE_UPSTREAMSWITCH, UPSTREAMSWITCH, 5)	      \
+	F(CPER_PCIE_ERROR_PORT_TYPE_DOWNSTREAMSWITCH, DOWNSTREAMSWITCH, 6)    \
+	F(CPER_PCIE_ERROR_PORT_TYPE_PCIE_PCI_BRIDGE, PCIE_PCI_BRIDGE, 7)      \
+	F(CPER_PCIE_ERROR_PORT_TYPE_PCI_PCIE_BRIDGE, PCI_PCIE_BRIDGE, 8)      \
+	F(CPER_PCIE_ERROR_PORT_TYPE_RCIEP_DEV, RCIEP_DEV, 9)		      \
+		/* Root Complex Integrated Endpoint Device */		      \
+	F(CPER_PCIE_ERROR_PORT_TYPE_RCEC, RCEC, 10)			      \
+		/* Root Complex Event Collector */			      \
+	/* end of CPER_PCIE_ERROR_PORT_TYPES; values 2-3 are not listed */
+
+enum cper_pcie_error_port_type { /* values of cper_pcie_error::PortType */
+#define	CPER_PCIE_ERROR_PORT_TYPE_ENUMERATOR(LN, SN, V)	LN = V,
+	CPER_PCIE_ERROR_PORT_TYPES(CPER_PCIE_ERROR_PORT_TYPE_ENUMERATOR)
+#undef	CPER_PCIE_ERROR_PORT_TYPE_ENUMERATOR
+};
+
 #endif	/* _SYS_DEV_ACPI_APEI_CPER_H_ */

Reply via email to