Module Name: src Committed By: riastradh Date: Sun Oct 27 12:13:42 UTC 2024
Modified Files: src/sys/dev/acpi: apei.c Log Message: apei(4): Rate-limit error reports to console. For each severity level (corrected, uncorrectable/recoverable, uncorrectable/fatal, other), print at most one report per minute, and, if any reports were suppressed, the number of reports that had been suppressed since the last one, up to a maximum of 2^32 - 1. PR kern/58775: apei(4) spamming console To generate a diff of this commit: cvs rdiff -u -r1.4 -r1.5 src/sys/dev/acpi/apei.c Please note that diffs are not public domain; they are subject to the copyright notices on the relevant files.
Modified files: Index: src/sys/dev/acpi/apei.c diff -u src/sys/dev/acpi/apei.c:1.4 src/sys/dev/acpi/apei.c:1.5 --- src/sys/dev/acpi/apei.c:1.4 Sun Oct 27 12:13:28 2024 +++ src/sys/dev/acpi/apei.c Sun Oct 27 12:13:42 2024 @@ -1,4 +1,4 @@ -/* $NetBSD: apei.c,v 1.4 2024/10/27 12:13:28 riastradh Exp $ */ +/* $NetBSD: apei.c,v 1.5 2024/10/27 12:13:42 riastradh Exp $ */ /*- * Copyright (c) 2024 The NetBSD Foundation, Inc. @@ -38,7 +38,7 @@ */ #include <sys/cdefs.h> -__KERNEL_RCSID(0, "$NetBSD: apei.c,v 1.4 2024/10/27 12:13:28 riastradh Exp $"); +__KERNEL_RCSID(0, "$NetBSD: apei.c,v 1.5 2024/10/27 12:13:42 riastradh Exp $"); #include <sys/param.h> #include <sys/types.h> @@ -363,11 +363,17 @@ static const struct uuid CPER_MEMORY_ERR static void apei_cper_memory_error_report(struct apei_softc *sc, const void *buf, - size_t len, const char *ctx) + size_t len, const char *ctx, bool ratelimitok) { const struct cper_memory_error *ME = buf; char bitbuf[1024]; + /* + * If we've hit the rate limit, skip printing the error. + */ + if (!ratelimitok) + goto out; + snprintb(bitbuf, sizeof(bitbuf), CPER_MEMORY_ERROR_VALIDATION_BITS_FMT, ME->ValidationBits); aprint_debug_dev(sc->sc_dev, "%s: ValidationBits=%s\n", ctx, bitbuf); @@ -472,6 +478,12 @@ apei_cper_memory_error_report(struct ape ctx, t); } } + +out: /* + * XXX pass this through to uvm(9) or userland for decisions + * like page retirement + */ + return; } /* @@ -489,7 +501,8 @@ static const struct apei_cper_report { const char *name; const struct uuid *type; size_t minlength; - void (*func)(struct apei_softc *, const void *, size_t, const char *); + void (*func)(struct apei_softc *, const void *, size_t, const char *, + bool); } apei_cper_reports[] = { { "memory", &CPER_MEMORY_ERROR_SECTION, sizeof(struct cper_memory_error), @@ -497,10 +510,10 @@ static const struct apei_cper_report { }; /* - * apei_gede_report_header(sc, gede, ctx, &headerlen, &report) + * apei_gede_report_header(sc, gede, ctx, ratelimitok, &headerlen, &report) * * Report the header of the ith Generic Error Data Entry in the - * given context. + * given context, if ratelimitok is true. * * Return the actual length of the header in headerlen, or 0 if * not known because the revision isn't recognized. @@ -510,7 +523,7 @@ static const struct apei_cper_report { */ static void apei_gede_report_header(struct apei_softc *sc, - const ACPI_HEST_GENERIC_DATA *gede, const char *ctx, + const ACPI_HEST_GENERIC_DATA *gede, const char *ctx, bool ratelimitok, size_t *headerlenp, const struct apei_cper_report **reportp) { const ACPI_HEST_GENERIC_DATA_V300 *const gede_v3 = (const void *)gede; @@ -538,14 +551,19 @@ apei_gede_report_header(struct apei_soft if (memcmp(§ype, report->type, sizeof(sectype)) != 0) continue; - device_printf(sc->sc_dev, "%s: SectionType=%s (%s error)\n", - ctx, guidstr, report->name); + if (ratelimitok) { + device_printf(sc->sc_dev, "%s:" + " SectionType=%s (%s error)\n", + ctx, guidstr, report->name); + } *reportp = report; break; } if (i == __arraycount(apei_cper_reports)) { - device_printf(sc->sc_dev, "%s: SectionType=%s\n", ctx, - guidstr); + if (ratelimitok) { + device_printf(sc->sc_dev, "%s: SectionType=%s\n", ctx, + guidstr); + } *reportp = NULL; } @@ -553,11 +571,14 @@ apei_gede_report_header(struct apei_soft * Print the numeric severity and, if we have it, a symbolic * name for it. */ - device_printf(sc->sc_dev, "%s: ErrorSeverity=%"PRIu32" (%s)\n", ctx, - gede->ErrorSeverity, - (gede->ErrorSeverity < __arraycount(apei_gede_severity) - ? apei_gede_severity[gede->ErrorSeverity] - : "unknown")); + if (ratelimitok) { + device_printf(sc->sc_dev, "%s: ErrorSeverity=%"PRIu32" (%s)\n", + ctx, + gede->ErrorSeverity, + (gede->ErrorSeverity < __arraycount(apei_gede_severity) + ? apei_gede_severity[gede->ErrorSeverity] + : "unknown")); + } /* * The Revision may not often be useful, but this is only ever @@ -565,8 +586,10 @@ apei_gede_report_header(struct apei_soft * you can glean at your convenience with acpidump. So print * it anyway. */ - device_printf(sc->sc_dev, "%s: Revision=0x%"PRIx16"\n", ctx, - gede->Revision); + if (ratelimitok) { + device_printf(sc->sc_dev, "%s: Revision=0x%"PRIx16"\n", ctx, + gede->Revision); + } /* * Don't touch anything past the Revision until we've @@ -587,38 +610,49 @@ apei_gede_report_header(struct apei_soft * Print the validation bits at debug level. Only really * helpful if there are bits we _don't_ know about. */ - /* XXX define this format somewhere */ - snprintb(buf, sizeof(buf), "\177\020" - "b\000" "FRU_ID\0" - "b\001" "FRU_TEXT\0" /* `FRU string', sometimes */ - "b\002" "TIMESTAMP\0" - "\0", gede->ValidationBits); - aprint_debug_dev(sc->sc_dev, "%s: ValidationBits=%s\n", ctx, buf); + if (ratelimitok) { + /* XXX define this format somewhere */ + snprintb(buf, sizeof(buf), "\177\020" + "b\000" "FRU_ID\0" + "b\001" "FRU_TEXT\0" /* `FRU string', sometimes */ + "b\002" "TIMESTAMP\0" + "\0", gede->ValidationBits); + aprint_debug_dev(sc->sc_dev, "%s: ValidationBits=%s\n", ctx, + buf); + } /* * Print the CPER section flags. */ - snprintb(buf, sizeof(buf), CPER_SECTION_FLAGS_FMT, gede->Flags); - device_printf(sc->sc_dev, "%s: Flags=%s\n", ctx, buf); + if (ratelimitok) { + snprintb(buf, sizeof(buf), CPER_SECTION_FLAGS_FMT, + gede->Flags); + device_printf(sc->sc_dev, "%s: Flags=%s\n", ctx, buf); + } /* * The ErrorDataLength is unlikely to be useful for the log, so * print it at debug level only. */ - aprint_debug_dev(sc->sc_dev, "%s: ErrorDataLength=0x%"PRIu32"\n", - ctx, gede->ErrorDataLength); + if (ratelimitok) { + aprint_debug_dev(sc->sc_dev, "%s:" + " ErrorDataLength=0x%"PRIu32"\n", + ctx, gede->ErrorDataLength); + } /* * Print the FRU Id and text, if available. */ - if (gede->ValidationBits & ACPI_HEST_GEN_VALID_FRU_ID) { + if (ratelimitok && + (gede->ValidationBits & ACPI_HEST_GEN_VALID_FRU_ID) != 0) { struct uuid fruid; apei_cper_guid_dec(gede->FruId, &fruid); apei_format_guid(&fruid, guidstr); device_printf(sc->sc_dev, "%s: FruId=%s\n", ctx, guidstr); } - if (gede->ValidationBits & ACPI_HEST_GEN_VALID_FRU_STRING) { + if (ratelimitok && + (gede->ValidationBits & ACPI_HEST_GEN_VALID_FRU_STRING) != 0) { device_printf(sc->sc_dev, "%s: FruText=%.20s\n", ctx, gede->FruText); } @@ -627,7 +661,8 @@ apei_gede_report_header(struct apei_soft * Print the timestamp, if available by the revision number and * the validation bits. */ - if (gede->Revision >= 0x0300 && gede->Revision < 0x0400 && + if (ratelimitok && + gede->Revision >= 0x0300 && gede->Revision < 0x0400 && gede->ValidationBits & ACPI_HEST_GEN_VALID_TIMESTAMP) { const uint8_t *const t = (const uint8_t *)&gede_v3->TimeStamp; const uint8_t s = t[0]; @@ -648,6 +683,106 @@ apei_gede_report_header(struct apei_soft } /* + * apei_gesb_ratelimit + * + * State to limit the rate of console log messages about hardware + * errors. For each of the four severity levels in a Generic + * Error Status Block, + * + * 0 - Recoverable (uncorrectable), + * 1 - Fatal (uncorrectable), + * 2 - Corrected, and + * 3 - None (including ill-formed errors), + * + * we record the last time it happened, protected by a CPU simple + * lock that we only try-acquire so it is safe to use in any + * context, including non-maskable interrupt context. + */ + +static struct { + __cpu_simple_lock_t lock; + struct timeval lasttime; + volatile uint32_t suppressed; +} __aligned(COHERENCY_UNIT) apei_gesb_ratelimit[4] __cacheline_aligned = { + [ACPI_HEST_GEN_ERROR_RECOVERABLE] = { .lock = __SIMPLELOCK_UNLOCKED }, + [ACPI_HEST_GEN_ERROR_FATAL] = { .lock = __SIMPLELOCK_UNLOCKED }, + [ACPI_HEST_GEN_ERROR_CORRECTED] = { .lock = __SIMPLELOCK_UNLOCKED }, + [ACPI_HEST_GEN_ERROR_NONE] = { .lock = __SIMPLELOCK_UNLOCKED }, +}; + +static void +atomic_incsat_32(volatile uint32_t *p) +{ + uint32_t o, n; + + do { + o = atomic_load_relaxed(p); + if (__predict_false(o == UINT_MAX)) + return; + n = o + 1; + } while (__predict_false(atomic_cas_32(p, o, n) != o)); +} + +/* + * apei_gesb_ratecheck(sc, severity, suppressed) + * + * Check for a rate limit on errors of the specified severity. + * + * => Return true if the error should be printed, and format into + * the buffer suppressed a message saying how many errors were + * previously suppressed. + * + * => Return false if the error should be suppressed because the + * last one printed was too recent. + */ +static bool +apei_gesb_ratecheck(struct apei_softc *sc, uint32_t severity, + char suppressed[static sizeof(" (4294967295 or more errors suppressed)")]) +{ + /* one of each type per minute (XXX worth making configurable?) */ + const struct timeval mininterval = {60, 0}; + unsigned i = MIN(severity, ACPI_HEST_GEN_ERROR_NONE); /* paranoia */ + bool ok = false; + + /* + * If the lock is contended, the rate limit is probably + * exceeded, so it's not OK to print. + * + * Otherwise, with the lock held, ask ratecheck(9) whether it's + * OK to print. + */ + if (!__cpu_simple_lock_try(&apei_gesb_ratelimit[i].lock)) + goto out; + ok = ratecheck(&apei_gesb_ratelimit[i].lasttime, &mininterval); + __cpu_simple_unlock(&apei_gesb_ratelimit[i].lock); + +out: /* + * If it's OK to print, report the number of errors that were + * suppressed. If it's not OK to print, count a suppressed + * error. + */ + if (ok) { + const uint32_t n = + atomic_swap_32(&apei_gesb_ratelimit[i].suppressed, 0); + + if (n == 0) { + suppressed[0] = '\0'; + } else { + snprintf(suppressed, + sizeof(" (4294967295 or more errors suppressed)"), + " (%u%s error%s suppressed)", + n, + n == UINT32_MAX ? " or more" : "", + n == 1 ? "" : "s"); + } + } else { + atomic_incsat_32(&apei_gesb_ratelimit[i].suppressed); + suppressed[0] = '\0'; + } + return ok; +} + +/* * apei_gesb_report(sc, gesb, size, ctx) * * Check a Generic Error Status Block, of at most the specified @@ -663,7 +798,8 @@ apei_gesb_report(struct apei_softc *sc, uint32_t datalen, rawdatalen; const ACPI_HEST_GENERIC_DATA *gede0, *gede; const unsigned char *rawdata; - char statusbuf[128]; + bool ratelimitok = false; + char suppressed[sizeof(" (4294967295 or more errors suppressed)")]; bool fatal = false; /* @@ -671,8 +807,13 @@ apei_gesb_report(struct apei_softc *sc, * Block before we try to touch anything in it. */ if (size < sizeof(*gesb)) { - device_printf(sc->sc_dev, "%s: truncated GESB, %zu < %zu\n", - ctx, size, sizeof(*gesb)); + ratelimitok = apei_gesb_ratecheck(sc, ACPI_HEST_GEN_ERROR_NONE, + suppressed); + if (ratelimitok) { + device_printf(sc->sc_dev, + "%s: truncated GESB, %zu < %zu%s\n", + ctx, size, sizeof(*gesb), suppressed); + } status = 0; goto out; } @@ -696,29 +837,42 @@ apei_gesb_report(struct apei_softc *sc, goto out; } - /* XXX define this format somewhere */ - snprintb(statusbuf, sizeof(statusbuf), "\177\020" - "b\000" "UE\0" - "b\001" "CE\0" - "b\002" "MULTI_UE\0" - "b\003" "MULTI_CE\0" - "f\004\010" "GEDE_COUNT\0" - "\0", status); + /* + * Read out the severity and get the number of entries in this + * status block. + */ + severity = gesb->ErrorSeverity; + nentries = __SHIFTOUT(status, ACPI_HEST_ERROR_ENTRY_COUNT); /* * Print a message to the console and dmesg about the severity * of the error. */ - severity = gesb->ErrorSeverity; - nentries = __SHIFTOUT(status, ACPI_HEST_ERROR_ENTRY_COUNT); - if (severity < __arraycount(apei_gesb_severity)) { - device_printf(sc->sc_dev, "%s reported hardware error:" - " severity=%s nentries=%u status=%s\n", - ctx, apei_gesb_severity[severity], nentries, statusbuf); - } else { - device_printf(sc->sc_dev, "%s reported error:" - " severity=%"PRIu32" nentries=%u status=%s\n", - ctx, severity, nentries, statusbuf); + ratelimitok = apei_gesb_ratecheck(sc, severity, suppressed); + if (ratelimitok) { + char statusbuf[128]; + + /* XXX define this format somewhere */ + snprintb(statusbuf, sizeof(statusbuf), "\177\020" + "b\000" "UE\0" + "b\001" "CE\0" + "b\002" "MULTI_UE\0" + "b\003" "MULTI_CE\0" + "f\004\010" "GEDE_COUNT\0" + "\0", status); + + if (severity < __arraycount(apei_gesb_severity)) { + device_printf(sc->sc_dev, "%s" + " reported hardware error%s:" + " severity=%s nentries=%u status=%s\n", + ctx, suppressed, + apei_gesb_severity[severity], nentries, statusbuf); + } else { + device_printf(sc->sc_dev, "%s reported error%s:" + " severity=%"PRIu32" nentries=%u status=%s\n", + ctx, suppressed, + severity, nentries, statusbuf); + } } /* @@ -750,9 +904,8 @@ apei_gesb_report(struct apei_softc *sc, unknownstatus &= ~ACPI_HEST_CORRECTABLE; unknownstatus &= ~ACPI_HEST_MULTIPLE_CORRECTABLE; unknownstatus &= ~ACPI_HEST_ERROR_ENTRY_COUNT; - if (unknownstatus != 0) { + if (ratelimitok && unknownstatus != 0) { /* XXX dtrace */ - /* XXX rate-limit? */ device_printf(sc->sc_dev, "%s: unknown BlockStatus bits:" " 0x%"PRIx32"\n", ctx, unknownstatus); } @@ -769,9 +922,12 @@ apei_gesb_report(struct apei_softc *sc, */ datalen = gesb->DataLength; if (size < datalen) { - device_printf(sc->sc_dev, "%s:" - " GESB DataLength exceeds bounds: %zu < %"PRIu32"\n", - ctx, size, datalen); + if (ratelimitok) { + device_printf(sc->sc_dev, "%s:" + " GESB DataLength exceeds bounds:" + " %zu < %"PRIu32"\n", + ctx, size, datalen); + } datalen = size; } size -= datalen; @@ -795,9 +951,11 @@ apei_gesb_report(struct apei_softc *sc, * GEDE header, stop here. */ if (datalen < sizeof(*gede)) { - device_printf(sc->sc_dev, "%s:" - " truncated GEDE: %"PRIu32" < %zu bytes\n", - subctx, datalen, sizeof(*gede)); + if (ratelimitok) { + device_printf(sc->sc_dev, "%s:" + " truncated GEDE: %"PRIu32" < %zu bytes\n", + subctx, datalen, sizeof(*gede)); + } break; } @@ -806,7 +964,7 @@ apei_gesb_report(struct apei_softc *sc, * vary from revision to revision of the GEDE) and the * CPER report function if possible. */ - apei_gede_report_header(sc, gede, subctx, + apei_gede_report_header(sc, gede, subctx, ratelimitok, &headerlen, &report); /* @@ -814,9 +972,11 @@ apei_gesb_report(struct apei_softc *sc, * unfamiliar revision, stop here. */ if (headerlen == 0) { - device_printf(sc->sc_dev, "%s:" - " unknown revision: 0x%"PRIx16"\n", - subctx, gede->Revision); + if (ratelimitok) { + device_printf(sc->sc_dev, "%s:" + " unknown revision: 0x%"PRIx16"\n", + subctx, gede->Revision); + } break; } @@ -826,9 +986,12 @@ apei_gesb_report(struct apei_softc *sc, */ datalen -= headerlen; if (datalen < gede->ErrorDataLength) { - device_printf(sc->sc_dev, "%s: truncated GEDE payload:" - " %"PRIu32" < %"PRIu32" bytes\n", - subctx, datalen, gede->ErrorDataLength); + if (ratelimitok) { + device_printf(sc->sc_dev, "%s:" + " truncated GEDE payload:" + " %"PRIu32" < %"PRIu32" bytes\n", + subctx, datalen, gede->ErrorDataLength); + } break; } @@ -837,10 +1000,14 @@ apei_gesb_report(struct apei_softc *sc, * this Generic Error Data Entry. */ if (report == NULL) { - device_printf(sc->sc_dev, "%s: [unknown type]\n", ctx); + if (ratelimitok) { + device_printf(sc->sc_dev, "%s:" + " [unknown type]\n", ctx); + } } else { + /* XXX pass ratelimit through */ (*report->func)(sc, (const char *)gede + headerlen, - gede->ErrorDataLength, subctx); + gede->ErrorDataLength, subctx, ratelimitok); } /* @@ -866,9 +1033,12 @@ apei_gesb_report(struct apei_softc *sc, */ rawdatalen = gesb->RawDataLength; if (size < rawdatalen) { - device_printf(sc->sc_dev, "%s:" - " GESB RawDataLength exceeds bounds: %zu < %"PRIu32"\n", - ctx, size, rawdatalen); + if (ratelimitok) { + device_printf(sc->sc_dev, "%s:" + " GESB RawDataLength exceeds bounds:" + " %zu < %"PRIu32"\n", + ctx, size, rawdatalen); + } rawdatalen = size; } size -= rawdatalen; @@ -876,7 +1046,7 @@ apei_gesb_report(struct apei_softc *sc, /* * Hexdump the raw data, if any. */ - if (rawdatalen > 0) { + if (ratelimitok && rawdatalen > 0) { char devctx[128]; snprintf(devctx, sizeof(devctx), "%s: %s: raw data", @@ -887,7 +1057,7 @@ apei_gesb_report(struct apei_softc *sc, /* * If there's anything left after the raw data, warn. */ - if (size > 0) { + if (ratelimitok && size > 0) { device_printf(sc->sc_dev, "%s: excess data: %zu bytes\n", ctx, size); }