Greetings. I am soliciting feedback on a patch to detect and mitigate 
uncontrolled ACPI GPE interrupt storms.

Rationale: There have been a number of threads in the recent past on bugs@ and 
misc@ with acpi0 spinning a CPU at 100% [1][2][3][4]. The immediate cause is 
likely a buggy BIOS and its ACPI implementation. However, this type of bug is 
not exclusive to no-name hardware from China, nor is it specific to a 
particular hardware vendor, BIOS vendor, or GPE pin. Hardware that is or was 
affected can include Intel [5], Lenovo [6], HP [7], ASUS [8], and Apple [9].

I have been testing with a half-dozen ACPI-equipped systems in various states: 
storming and behaving, booting and resuming, 7.2 and -current, SMALL and not. 
The attached diff uses a minimum 5-second evaluation window, driven by the 
firing of ACPI GPE interrupts (no additional accounting thread, etc.). An 
uncontrolled GPE storm will be logged like this (the rate shown is a real observed value):

Feb 17 22:57:06 acpitest3 /bsd: uncontrolled GPE storm 7242/s, disabling GPE 06

Alternatively, if this is still too close to papering over the problem, perhaps 
a smaller diff that only logs the problem, allowing a user to see what the storm 
is and report it to their BIOS/hardware vendor?

Thank you for your time.

Brian Conway

[1] https://marc.info/?t=166422981800001
[2] https://marc.info/?t=166497726600004
[3] https://marc.info/?t=167356490500003
[4] https://marc.info/?t=167614389600001
[5] https://community.intel.com/t5/Intel-NUCs/APCI-GPE-0x6F-Interrupt-Storm-under-OpenBSD/m-p/1426755
[6] https://forums.lenovo.com/t5/ThinkPad-T400-T500-and-newer-T/T480s-ACPI-bug/m-p/4057604
[7] https://h30434.www3.hp.com/t5/Gaming-Notebooks/High-CPU-Usage-System-ACPI-sys-GPE-L6F-Storm-Omen-15-17/td-p/7169255
[8] https://answers.microsoft.com/en-us/windows/forum/all/stopping-a-gpe-event-acpi-system-interrupts/cec51e6c-1ed4-4369-9e6f-108c4d6333a6
[9] https://bugzilla.kernel.org/show_bug.cgi?id=117481

diff --git sys/dev/acpi/acpi.c sys/dev/acpi/acpi.c
index 853bad1ab..26a5c1702 100644
--- sys/dev/acpi/acpi.c
+++ sys/dev/acpi/acpi.c
@@ -52,6 +52,9 @@
 #define APMDEV_NORMAL  0
 #define APMDEV_CTL     8
 
+#define GPE_RATE_MIN_CYCLE     5               /* seconds    */
+#define GPE_RATE_MAX           2000    /* per second */
+
 #include "wd.h"
 
 #ifdef ACPI_DEBUG
@@ -98,6 +101,8 @@ void acpi_disable_allgpes(struct acpi_softc *);
 struct gpe_block *acpi_find_gpe(struct acpi_softc *, int);
 void   acpi_enable_onegpe(struct acpi_softc *, int);
 int    acpi_gpe(struct acpi_softc *, int, void *);
+void   acpi_init_gpe_rate(struct acpi_softc *, int);
+int    acpi_gpe_rate(struct acpi_softc *, int);
 
 void   acpi_enable_rungpes(struct acpi_softc *);
 
@@ -2229,6 +2234,7 @@ acpi_enable_onegpe(struct acpi_softc *sc, int gpe)
        dnprintf(50, "enabling GPE %.2x (current: %sabled) %.2x\n",
            gpe, (en & mask) ? "en" : "dis", en);
        acpi_write_pmreg(sc, ACPIREG_GPE_EN, gpe>>3, en | mask);
+       acpi_init_gpe_rate(sc, gpe);
 }
 
 /* Clear all GPEs */
@@ -2307,7 +2313,40 @@ acpi_gpe(struct acpi_softc *sc, int gpe, void *arg)
        if (sc->gpe_table[gpe].flags & GPE_LEVEL)
                acpi_write_pmreg(sc, ACPIREG_GPE_STS, gpe>>3, mask);
        en = acpi_read_pmreg(sc, ACPIREG_GPE_EN,  gpe>>3);
-       acpi_write_pmreg(sc, ACPIREG_GPE_EN,  gpe>>3, en | mask);
+       /* Re-enable if GPE rate passes, otherwise leave disabled */
+       if (!acpi_gpe_rate(sc, gpe))
+               acpi_write_pmreg(sc, ACPIREG_GPE_EN,  gpe>>3, en | mask);
+       return (0);
+}
+
+void
+acpi_init_gpe_rate(struct acpi_softc *sc, int gpe)
+{
+       sc->gpe_table[gpe].rate_start = getuptime();
+       sc->gpe_table[gpe].rate_count = 0;
+}
+
+int
+acpi_gpe_rate(struct acpi_softc *sc, int gpe)
+{
+       struct gpe_block *pgpe = &sc->gpe_table[gpe];
+       time_t cycle;
+
+       pgpe->rate_count++;
+       dnprintf(10, "rate GPE %.2x start %lld elapsed %lld count %zu\n", gpe,
+           pgpe->rate_start, getuptime() - pgpe->rate_start, pgpe->rate_count);
+
+       cycle = getuptime() - pgpe->rate_start;
+       if (cycle >= GPE_RATE_MIN_CYCLE) {
+               if (pgpe->rate_count > (GPE_RATE_MAX * cycle)) {
+                       printf("uncontrolled GPE storm %lld/s, disabling GPE %.2x\n",
+                           pgpe->rate_count / cycle, gpe);
+                       return (1);
+               }
+
+               /* Reset and start a new cycle */
+               acpi_init_gpe_rate(sc, gpe);
+       }
        return (0);
 }
 
diff --git sys/dev/acpi/acpivar.h sys/dev/acpi/acpivar.h
index a9b4a2ae9..4e2f47053 100644
--- sys/dev/acpi/acpivar.h
+++ sys/dev/acpi/acpivar.h
@@ -185,6 +185,9 @@ struct gpe_block {
        void *arg;
        int   active;
        int   flags;
+
+       time_t rate_start;
+       size_t rate_count;
 };
 
 struct acpi_devlist {

Reply via email to