Module Name: src
Committed By: riastradh
Date: Sun Oct 27 12:13:42 UTC 2024
Modified Files:
src/sys/dev/acpi: apei.c
Log Message:
apei(4): Rate-limit error reports to console.
For each severity level (corrected, uncorrectable/recoverable,
uncorrectable/fatal, other), print at most one report per minute,
and, if any reports were suppressed, the number of reports that had
been suppressed since the last one, up to a maximum of 2^32 - 1.
PR kern/58775: apei(4) spamming console
To generate a diff of this commit:
cvs rdiff -u -r1.4 -r1.5 src/sys/dev/acpi/apei.c
Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.
Modified files:
Index: src/sys/dev/acpi/apei.c
diff -u src/sys/dev/acpi/apei.c:1.4 src/sys/dev/acpi/apei.c:1.5
--- src/sys/dev/acpi/apei.c:1.4 Sun Oct 27 12:13:28 2024
+++ src/sys/dev/acpi/apei.c Sun Oct 27 12:13:42 2024
@@ -1,4 +1,4 @@
-/* $NetBSD: apei.c,v 1.4 2024/10/27 12:13:28 riastradh Exp $ */
+/* $NetBSD: apei.c,v 1.5 2024/10/27 12:13:42 riastradh Exp $ */
/*-
* Copyright (c) 2024 The NetBSD Foundation, Inc.
@@ -38,7 +38,7 @@
*/
#include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: apei.c,v 1.4 2024/10/27 12:13:28 riastradh Exp $");
+__KERNEL_RCSID(0, "$NetBSD: apei.c,v 1.5 2024/10/27 12:13:42 riastradh Exp $");
#include <sys/param.h>
#include <sys/types.h>
@@ -363,11 +363,17 @@ static const struct uuid CPER_MEMORY_ERR
static void
apei_cper_memory_error_report(struct apei_softc *sc, const void *buf,
- size_t len, const char *ctx)
+ size_t len, const char *ctx, bool ratelimitok)
{
const struct cper_memory_error *ME = buf;
char bitbuf[1024];
+ /*
+ * If we've hit the rate limit, skip printing the error.
+ */
+ if (!ratelimitok)
+ goto out;
+
snprintb(bitbuf, sizeof(bitbuf),
CPER_MEMORY_ERROR_VALIDATION_BITS_FMT, ME->ValidationBits);
aprint_debug_dev(sc->sc_dev, "%s: ValidationBits=%s\n", ctx, bitbuf);
@@ -472,6 +478,12 @@ apei_cper_memory_error_report(struct ape
ctx, t);
}
}
+
+out: /*
+ * XXX pass this through to uvm(9) or userland for decisions
+ * like page retirement
+ */
+ return;
}
/*
@@ -489,7 +501,8 @@ static const struct apei_cper_report {
const char *name;
const struct uuid *type;
size_t minlength;
- void (*func)(struct apei_softc *, const void *, size_t, const char *);
+ void (*func)(struct apei_softc *, const void *, size_t, const char *,
+ bool);
} apei_cper_reports[] = {
{ "memory", &CPER_MEMORY_ERROR_SECTION,
sizeof(struct cper_memory_error),
@@ -497,10 +510,10 @@ static const struct apei_cper_report {
};
/*
- * apei_gede_report_header(sc, gede, ctx, &headerlen, &report)
+ * apei_gede_report_header(sc, gede, ctx, ratelimitok, &headerlen, &report)
*
* Report the header of the ith Generic Error Data Entry in the
- * given context.
+ * given context, if ratelimitok is true.
*
* Return the actual length of the header in headerlen, or 0 if
* not known because the revision isn't recognized.
@@ -510,7 +523,7 @@ static const struct apei_cper_report {
*/
static void
apei_gede_report_header(struct apei_softc *sc,
- const ACPI_HEST_GENERIC_DATA *gede, const char *ctx,
+ const ACPI_HEST_GENERIC_DATA *gede, const char *ctx, bool ratelimitok,
size_t *headerlenp, const struct apei_cper_report **reportp)
{
const ACPI_HEST_GENERIC_DATA_V300 *const gede_v3 = (const void *)gede;
@@ -538,14 +551,19 @@ apei_gede_report_header(struct apei_soft
if (memcmp(§ype, report->type, sizeof(sectype)) != 0)
continue;
- device_printf(sc->sc_dev, "%s: SectionType=%s (%s error)\n",
- ctx, guidstr, report->name);
+ if (ratelimitok) {
+ device_printf(sc->sc_dev, "%s:"
+ " SectionType=%s (%s error)\n",
+ ctx, guidstr, report->name);
+ }
*reportp = report;
break;
}
if (i == __arraycount(apei_cper_reports)) {
- device_printf(sc->sc_dev, "%s: SectionType=%s\n", ctx,
- guidstr);
+ if (ratelimitok) {
+ device_printf(sc->sc_dev, "%s: SectionType=%s\n", ctx,
+ guidstr);
+ }
*reportp = NULL;
}
@@ -553,11 +571,14 @@ apei_gede_report_header(struct apei_soft
* Print the numeric severity and, if we have it, a symbolic
* name for it.
*/
- device_printf(sc->sc_dev, "%s: ErrorSeverity=%"PRIu32" (%s)\n", ctx,
- gede->ErrorSeverity,
- (gede->ErrorSeverity < __arraycount(apei_gede_severity)
- ? apei_gede_severity[gede->ErrorSeverity]
- : "unknown"));
+ if (ratelimitok) {
+ device_printf(sc->sc_dev, "%s: ErrorSeverity=%"PRIu32" (%s)\n",
+ ctx,
+ gede->ErrorSeverity,
+ (gede->ErrorSeverity < __arraycount(apei_gede_severity)
+ ? apei_gede_severity[gede->ErrorSeverity]
+ : "unknown"));
+ }
/*
* The Revision may not often be useful, but this is only ever
@@ -565,8 +586,10 @@ apei_gede_report_header(struct apei_soft
* you can glean at your convenience with acpidump. So print
* it anyway.
*/
- device_printf(sc->sc_dev, "%s: Revision=0x%"PRIx16"\n", ctx,
- gede->Revision);
+ if (ratelimitok) {
+ device_printf(sc->sc_dev, "%s: Revision=0x%"PRIx16"\n", ctx,
+ gede->Revision);
+ }
/*
* Don't touch anything past the Revision until we've
@@ -587,38 +610,49 @@ apei_gede_report_header(struct apei_soft
* Print the validation bits at debug level. Only really
* helpful if there are bits we _don't_ know about.
*/
- /* XXX define this format somewhere */
- snprintb(buf, sizeof(buf), "\177\020"
- "b\000" "FRU_ID\0"
- "b\001" "FRU_TEXT\0" /* `FRU string', sometimes */
- "b\002" "TIMESTAMP\0"
- "\0", gede->ValidationBits);
- aprint_debug_dev(sc->sc_dev, "%s: ValidationBits=%s\n", ctx, buf);
+ if (ratelimitok) {
+ /* XXX define this format somewhere */
+ snprintb(buf, sizeof(buf), "\177\020"
+ "b\000" "FRU_ID\0"
+ "b\001" "FRU_TEXT\0" /* `FRU string', sometimes */
+ "b\002" "TIMESTAMP\0"
+ "\0", gede->ValidationBits);
+ aprint_debug_dev(sc->sc_dev, "%s: ValidationBits=%s\n", ctx,
+ buf);
+ }
/*
* Print the CPER section flags.
*/
- snprintb(buf, sizeof(buf), CPER_SECTION_FLAGS_FMT, gede->Flags);
- device_printf(sc->sc_dev, "%s: Flags=%s\n", ctx, buf);
+ if (ratelimitok) {
+ snprintb(buf, sizeof(buf), CPER_SECTION_FLAGS_FMT,
+ gede->Flags);
+ device_printf(sc->sc_dev, "%s: Flags=%s\n", ctx, buf);
+ }
/*
* The ErrorDataLength is unlikely to be useful for the log, so
* print it at debug level only.
*/
- aprint_debug_dev(sc->sc_dev, "%s: ErrorDataLength=0x%"PRIu32"\n",
- ctx, gede->ErrorDataLength);
+ if (ratelimitok) {
+ aprint_debug_dev(sc->sc_dev, "%s:"
+ " ErrorDataLength=0x%"PRIu32"\n",
+ ctx, gede->ErrorDataLength);
+ }
/*
* Print the FRU Id and text, if available.
*/
- if (gede->ValidationBits & ACPI_HEST_GEN_VALID_FRU_ID) {
+ if (ratelimitok &&
+ (gede->ValidationBits & ACPI_HEST_GEN_VALID_FRU_ID) != 0) {
struct uuid fruid;
apei_cper_guid_dec(gede->FruId, &fruid);
apei_format_guid(&fruid, guidstr);
device_printf(sc->sc_dev, "%s: FruId=%s\n", ctx, guidstr);
}
- if (gede->ValidationBits & ACPI_HEST_GEN_VALID_FRU_STRING) {
+ if (ratelimitok &&
+ (gede->ValidationBits & ACPI_HEST_GEN_VALID_FRU_STRING) != 0) {
device_printf(sc->sc_dev, "%s: FruText=%.20s\n",
ctx, gede->FruText);
}
@@ -627,7 +661,8 @@ apei_gede_report_header(struct apei_soft
* Print the timestamp, if available by the revision number and
* the validation bits.
*/
- if (gede->Revision >= 0x0300 && gede->Revision < 0x0400 &&
+ if (ratelimitok &&
+ gede->Revision >= 0x0300 && gede->Revision < 0x0400 &&
gede->ValidationBits & ACPI_HEST_GEN_VALID_TIMESTAMP) {
const uint8_t *const t = (const uint8_t *)&gede_v3->TimeStamp;
const uint8_t s = t[0];
@@ -648,6 +683,106 @@ apei_gede_report_header(struct apei_soft
}
/*
+ * apei_gesb_ratelimit
+ *
+ * State to limit the rate of console log messages about hardware
+ * errors. For each of the four severity levels in a Generic
+ * Error Status Block,
+ *
+ * 0 - Recoverable (uncorrectable),
+ * 1 - Fatal (uncorrectable),
+ * 2 - Corrected, and
+ * 3 - None (including ill-formed errors),
+ *
+ * we record the last time it happened, protected by a CPU simple
+ * lock that we only try-acquire so it is safe to use in any
+ * context, including non-maskable interrupt context.
+ */
+
+static struct {
+ __cpu_simple_lock_t lock;
+ struct timeval lasttime;
+ volatile uint32_t suppressed;
+} __aligned(COHERENCY_UNIT) apei_gesb_ratelimit[4] __cacheline_aligned = {
+ [ACPI_HEST_GEN_ERROR_RECOVERABLE] = { .lock = __SIMPLELOCK_UNLOCKED },
+ [ACPI_HEST_GEN_ERROR_FATAL] = { .lock = __SIMPLELOCK_UNLOCKED },
+ [ACPI_HEST_GEN_ERROR_CORRECTED] = { .lock = __SIMPLELOCK_UNLOCKED },
+ [ACPI_HEST_GEN_ERROR_NONE] = { .lock = __SIMPLELOCK_UNLOCKED },
+};
+
+static void
+atomic_incsat_32(volatile uint32_t *p)
+{
+ uint32_t o, n;
+
+ do {
+ o = atomic_load_relaxed(p);
+ if (__predict_false(o == UINT_MAX))
+ return;
+ n = o + 1;
+ } while (__predict_false(atomic_cas_32(p, o, n) != o));
+}
+
+/*
+ * apei_gesb_ratecheck(sc, severity, suppressed)
+ *
+ * Check for a rate limit on errors of the specified severity.
+ *
+ * => Return true if the error should be printed, and format into
+ * the buffer suppressed a message saying how many errors were
+ * previously suppressed.
+ *
+ * => Return false if the error should be suppressed because the
+ * last one printed was too recent.
+ */
+static bool
+apei_gesb_ratecheck(struct apei_softc *sc, uint32_t severity,
+ char suppressed[static sizeof(" (4294967295 or more errors suppressed)")])
+{
+ /* one of each type per minute (XXX worth making configurable?) */
+ const struct timeval mininterval = {60, 0};
+ unsigned i = MIN(severity, ACPI_HEST_GEN_ERROR_NONE); /* paranoia */
+ bool ok = false;
+
+ /*
+ * If the lock is contended, the rate limit is probably
+ * exceeded, so it's not OK to print.
+ *
+ * Otherwise, with the lock held, ask ratecheck(9) whether it's
+ * OK to print.
+ */
+ if (!__cpu_simple_lock_try(&apei_gesb_ratelimit[i].lock))
+ goto out;
+ ok = ratecheck(&apei_gesb_ratelimit[i].lasttime, &mininterval);
+ __cpu_simple_unlock(&apei_gesb_ratelimit[i].lock);
+
+out: /*
+ * If it's OK to print, report the number of errors that were
+ * suppressed. If it's not OK to print, count a suppressed
+ * error.
+ */
+ if (ok) {
+ const uint32_t n =
+ atomic_swap_32(&apei_gesb_ratelimit[i].suppressed, 0);
+
+ if (n == 0) {
+ suppressed[0] = '\0';
+ } else {
+ snprintf(suppressed,
+ sizeof(" (4294967295 or more errors suppressed)"),
+ " (%u%s error%s suppressed)",
+ n,
+ n == UINT32_MAX ? " or more" : "",
+ n == 1 ? "" : "s");
+ }
+ } else {
+ atomic_incsat_32(&apei_gesb_ratelimit[i].suppressed);
+ suppressed[0] = '\0';
+ }
+ return ok;
+}
+
+/*
* apei_gesb_report(sc, gesb, size, ctx)
*
* Check a Generic Error Status Block, of at most the specified
@@ -663,7 +798,8 @@ apei_gesb_report(struct apei_softc *sc,
uint32_t datalen, rawdatalen;
const ACPI_HEST_GENERIC_DATA *gede0, *gede;
const unsigned char *rawdata;
- char statusbuf[128];
+ bool ratelimitok = false;
+ char suppressed[sizeof(" (4294967295 or more errors suppressed)")];
bool fatal = false;
/*
@@ -671,8 +807,13 @@ apei_gesb_report(struct apei_softc *sc,
* Block before we try to touch anything in it.
*/
if (size < sizeof(*gesb)) {
- device_printf(sc->sc_dev, "%s: truncated GESB, %zu < %zu\n",
- ctx, size, sizeof(*gesb));
+ ratelimitok = apei_gesb_ratecheck(sc, ACPI_HEST_GEN_ERROR_NONE,
+ suppressed);
+ if (ratelimitok) {
+ device_printf(sc->sc_dev,
+ "%s: truncated GESB, %zu < %zu%s\n",
+ ctx, size, sizeof(*gesb), suppressed);
+ }
status = 0;
goto out;
}
@@ -696,29 +837,42 @@ apei_gesb_report(struct apei_softc *sc,
goto out;
}
- /* XXX define this format somewhere */
- snprintb(statusbuf, sizeof(statusbuf), "\177\020"
- "b\000" "UE\0"
- "b\001" "CE\0"
- "b\002" "MULTI_UE\0"
- "b\003" "MULTI_CE\0"
- "f\004\010" "GEDE_COUNT\0"
- "\0", status);
+ /*
+ * Read out the severity and get the number of entries in this
+ * status block.
+ */
+ severity = gesb->ErrorSeverity;
+ nentries = __SHIFTOUT(status, ACPI_HEST_ERROR_ENTRY_COUNT);
/*
* Print a message to the console and dmesg about the severity
* of the error.
*/
- severity = gesb->ErrorSeverity;
- nentries = __SHIFTOUT(status, ACPI_HEST_ERROR_ENTRY_COUNT);
- if (severity < __arraycount(apei_gesb_severity)) {
- device_printf(sc->sc_dev, "%s reported hardware error:"
- " severity=%s nentries=%u status=%s\n",
- ctx, apei_gesb_severity[severity], nentries, statusbuf);
- } else {
- device_printf(sc->sc_dev, "%s reported error:"
- " severity=%"PRIu32" nentries=%u status=%s\n",
- ctx, severity, nentries, statusbuf);
+ ratelimitok = apei_gesb_ratecheck(sc, severity, suppressed);
+ if (ratelimitok) {
+ char statusbuf[128];
+
+ /* XXX define this format somewhere */
+ snprintb(statusbuf, sizeof(statusbuf), "\177\020"
+ "b\000" "UE\0"
+ "b\001" "CE\0"
+ "b\002" "MULTI_UE\0"
+ "b\003" "MULTI_CE\0"
+ "f\004\010" "GEDE_COUNT\0"
+ "\0", status);
+
+ if (severity < __arraycount(apei_gesb_severity)) {
+ device_printf(sc->sc_dev, "%s"
+ " reported hardware error%s:"
+ " severity=%s nentries=%u status=%s\n",
+ ctx, suppressed,
+ apei_gesb_severity[severity], nentries, statusbuf);
+ } else {
+ device_printf(sc->sc_dev, "%s reported error%s:"
+ " severity=%"PRIu32" nentries=%u status=%s\n",
+ ctx, suppressed,
+ severity, nentries, statusbuf);
+ }
}
/*
@@ -750,9 +904,8 @@ apei_gesb_report(struct apei_softc *sc,
unknownstatus &= ~ACPI_HEST_CORRECTABLE;
unknownstatus &= ~ACPI_HEST_MULTIPLE_CORRECTABLE;
unknownstatus &= ~ACPI_HEST_ERROR_ENTRY_COUNT;
- if (unknownstatus != 0) {
+ if (ratelimitok && unknownstatus != 0) {
/* XXX dtrace */
- /* XXX rate-limit? */
device_printf(sc->sc_dev, "%s: unknown BlockStatus bits:"
" 0x%"PRIx32"\n", ctx, unknownstatus);
}
@@ -769,9 +922,12 @@ apei_gesb_report(struct apei_softc *sc,
*/
datalen = gesb->DataLength;
if (size < datalen) {
- device_printf(sc->sc_dev, "%s:"
- " GESB DataLength exceeds bounds: %zu < %"PRIu32"\n",
- ctx, size, datalen);
+ if (ratelimitok) {
+ device_printf(sc->sc_dev, "%s:"
+ " GESB DataLength exceeds bounds:"
+ " %zu < %"PRIu32"\n",
+ ctx, size, datalen);
+ }
datalen = size;
}
size -= datalen;
@@ -795,9 +951,11 @@ apei_gesb_report(struct apei_softc *sc,
* GEDE header, stop here.
*/
if (datalen < sizeof(*gede)) {
- device_printf(sc->sc_dev, "%s:"
- " truncated GEDE: %"PRIu32" < %zu bytes\n",
- subctx, datalen, sizeof(*gede));
+ if (ratelimitok) {
+ device_printf(sc->sc_dev, "%s:"
+ " truncated GEDE: %"PRIu32" < %zu bytes\n",
+ subctx, datalen, sizeof(*gede));
+ }
break;
}
@@ -806,7 +964,7 @@ apei_gesb_report(struct apei_softc *sc,
* vary from revision to revision of the GEDE) and the
* CPER report function if possible.
*/
- apei_gede_report_header(sc, gede, subctx,
+ apei_gede_report_header(sc, gede, subctx, ratelimitok,
&headerlen, &report);
/*
@@ -814,9 +972,11 @@ apei_gesb_report(struct apei_softc *sc,
* unfamiliar revision, stop here.
*/
if (headerlen == 0) {
- device_printf(sc->sc_dev, "%s:"
- " unknown revision: 0x%"PRIx16"\n",
- subctx, gede->Revision);
+ if (ratelimitok) {
+ device_printf(sc->sc_dev, "%s:"
+ " unknown revision: 0x%"PRIx16"\n",
+ subctx, gede->Revision);
+ }
break;
}
@@ -826,9 +986,12 @@ apei_gesb_report(struct apei_softc *sc,
*/
datalen -= headerlen;
if (datalen < gede->ErrorDataLength) {
- device_printf(sc->sc_dev, "%s: truncated GEDE payload:"
- " %"PRIu32" < %"PRIu32" bytes\n",
- subctx, datalen, gede->ErrorDataLength);
+ if (ratelimitok) {
+ device_printf(sc->sc_dev, "%s:"
+ " truncated GEDE payload:"
+ " %"PRIu32" < %"PRIu32" bytes\n",
+ subctx, datalen, gede->ErrorDataLength);
+ }
break;
}
@@ -837,10 +1000,14 @@ apei_gesb_report(struct apei_softc *sc,
* this Generic Error Data Entry.
*/
if (report == NULL) {
- device_printf(sc->sc_dev, "%s: [unknown type]\n", ctx);
+ if (ratelimitok) {
+ device_printf(sc->sc_dev, "%s:"
+ " [unknown type]\n", ctx);
+ }
} else {
+ /* XXX pass ratelimit through */
(*report->func)(sc, (const char *)gede + headerlen,
- gede->ErrorDataLength, subctx);
+ gede->ErrorDataLength, subctx, ratelimitok);
}
/*
@@ -866,9 +1033,12 @@ apei_gesb_report(struct apei_softc *sc,
*/
rawdatalen = gesb->RawDataLength;
if (size < rawdatalen) {
- device_printf(sc->sc_dev, "%s:"
- " GESB RawDataLength exceeds bounds: %zu < %"PRIu32"\n",
- ctx, size, rawdatalen);
+ if (ratelimitok) {
+ device_printf(sc->sc_dev, "%s:"
+ " GESB RawDataLength exceeds bounds:"
+ " %zu < %"PRIu32"\n",
+ ctx, size, rawdatalen);
+ }
rawdatalen = size;
}
size -= rawdatalen;
@@ -876,7 +1046,7 @@ apei_gesb_report(struct apei_softc *sc,
/*
* Hexdump the raw data, if any.
*/
- if (rawdatalen > 0) {
+ if (ratelimitok && rawdatalen > 0) {
char devctx[128];
snprintf(devctx, sizeof(devctx), "%s: %s: raw data",
@@ -887,7 +1057,7 @@ apei_gesb_report(struct apei_softc *sc,
/*
* If there's anything left after the raw data, warn.
*/
- if (size > 0) {
+ if (ratelimitok && size > 0) {
device_printf(sc->sc_dev, "%s: excess data: %zu bytes\n",
ctx, size);
}