Author: jhb
Date: Wed Jul 14 21:10:14 2010
New Revision: 210079
URL: http://svn.freebsd.org/changeset/base/210079

Log:
  MFC 208507,208556,208621:
  Add support for corrected machine check interrupts.  CMCI is a new local
  APIC interrupt that fires when a threshold of corrected machine check
  events is reached.  CMCI also includes a count of events when reporting
  corrected errors in the bank's status register.  Note that individual
  banks may or may not support CMCI.  If they do, each bank includes its own
  threshold register that determines when the interrupt fires.  Currently
  the code uses a very simple strategy where it doubles the threshold on
  each interrupt until it succeeds in throttling the interrupt to occur
  only once a minute (this interval can be tuned via sysctl).  The threshold
  is also adjusted on each hourly poll which will lower the threshold once
  events stop occurring.

Modified:
  stable/8/sys/amd64/amd64/apic_vector.S
  stable/8/sys/amd64/amd64/local_apic.c
  stable/8/sys/amd64/amd64/machdep.c
  stable/8/sys/amd64/amd64/mca.c
  stable/8/sys/amd64/include/apicreg.h
  stable/8/sys/amd64/include/apicvar.h
  stable/8/sys/amd64/include/mca.h
  stable/8/sys/amd64/include/pcpu.h
  stable/8/sys/amd64/include/specialreg.h
  stable/8/sys/i386/i386/apic_vector.s
  stable/8/sys/i386/i386/local_apic.c
  stable/8/sys/i386/i386/machdep.c
  stable/8/sys/i386/i386/mca.c
  stable/8/sys/i386/include/apicreg.h
  stable/8/sys/i386/include/apicvar.h
  stable/8/sys/i386/include/mca.h
  stable/8/sys/i386/include/pcpu.h
  stable/8/sys/i386/include/specialreg.h
Directory Properties:
  stable/8/sys/   (props changed)
  stable/8/sys/amd64/include/xen/   (props changed)
  stable/8/sys/cddl/contrib/opensolaris/   (props changed)
  stable/8/sys/contrib/dev/acpica/   (props changed)
  stable/8/sys/contrib/pf/   (props changed)
  stable/8/sys/dev/xen/xenpci/   (props changed)

Modified: stable/8/sys/amd64/amd64/apic_vector.S
==============================================================================
--- stable/8/sys/amd64/amd64/apic_vector.S      Wed Jul 14 20:55:45 2010        
(r210078)
+++ stable/8/sys/amd64/amd64/apic_vector.S      Wed Jul 14 21:10:14 2010        
(r210079)
@@ -105,6 +105,18 @@ IDTVEC(timerint)
        jmp     doreti
 
 /*
+ * Local APIC CMCI handler.
+ */
+       .text
+       SUPERALIGN_TEXT
+IDTVEC(cmcint)
+       PUSH_FRAME
+       FAKE_MCOUNT(TF_RIP(%rsp))
+       call    lapic_handle_cmc
+       MEXITCOUNT
+       jmp     doreti
+
+/*
  * Local APIC error interrupt handler.
  */
        .text

Modified: stable/8/sys/amd64/amd64/local_apic.c
==============================================================================
--- stable/8/sys/amd64/amd64/local_apic.c       Wed Jul 14 20:55:45 2010        
(r210078)
+++ stable/8/sys/amd64/amd64/local_apic.c       Wed Jul 14 21:10:14 2010        
(r210079)
@@ -59,6 +59,7 @@ __FBSDID("$FreeBSD$");
 #include <machine/frame.h>
 #include <machine/intr_machdep.h>
 #include <machine/apicvar.h>
+#include <machine/mca.h>
 #include <machine/md_var.h>
 #include <machine/smp.h>
 #include <machine/specialreg.h>
@@ -123,6 +124,7 @@ static struct lvt lvts[LVT_MAX + 1] = {
        { 1, 1, 0, 1, APIC_LVT_DM_FIXED, APIC_ERROR_INT },      /* Error */
        { 1, 1, 1, 1, APIC_LVT_DM_NMI, 0 },     /* PMC */
        { 1, 1, 1, 1, APIC_LVT_DM_FIXED, APIC_THERMAL_INT },    /* Thermal */
+       { 1, 1, 1, 1, APIC_LVT_DM_FIXED, APIC_CMC_INT },        /* CMCI */
 };
 
 static inthand_t *ioint_handlers[] = {
@@ -227,6 +229,9 @@ lapic_init(vm_paddr_t addr)
        setidt(APIC_ERROR_INT, IDTVEC(errorint), SDT_SYSIGT, SEL_KPL, 0);
 
        /* XXX: Thermal interrupt */
+
+       /* Local APIC CMCI. */
+       setidt(APIC_CMC_INT, IDTVEC(cmcint), SDT_SYSIGT, SEL_KPL, 0);
 }
 
 /*
@@ -252,7 +257,7 @@ lapic_create(u_int apic_id, int boot_cpu
         */
        lapics[apic_id].la_present = 1;
        lapics[apic_id].la_id = apic_id;
-       for (i = 0; i < LVT_MAX; i++) {
+       for (i = 0; i <= LVT_MAX; i++) {
                lapics[apic_id].la_lvts[i] = lvts[i];
                lapics[apic_id].la_lvts[i].lvt_active = 0;
        }
@@ -282,6 +287,7 @@ lapic_dump(const char* str)
        printf("  timer: 0x%08x therm: 0x%08x err: 0x%08x pmc: 0x%08x\n",
            lapic->lvt_timer, lapic->lvt_thermal, lapic->lvt_error,
            lapic->lvt_pcint);
+       printf("   cmci: 0x%08x\n", lapic->lvt_cmci);
 }
 
 void
@@ -333,6 +339,10 @@ lapic_setup(int boot)
 
        /* XXX: Thermal LVT */
 
+       /* Program the CMCI LVT entry if present. */
+       if (maxlvt >= LVT_CMCI)
+               lapic->lvt_cmci = lvt_mode(la, LVT_CMCI, lapic->lvt_cmci);
+           
        intr_restore(eflags);
 }
 
@@ -857,6 +867,34 @@ lapic_timer_enable_intr(void)
 }
 
 void
+lapic_handle_cmc(void)
+{
+
+       lapic_eoi();
+       cmc_intr();
+}
+
+/*
+ * Called from the mca_init() to activate the CMC interrupt if this CPU is
+ * responsible for monitoring any MC banks for CMC events.  Since mca_init()
+ * is called prior to lapic_setup() during boot, this just needs to unmask
+ * this CPU's LVT_CMCI entry.
+ */
+void
+lapic_enable_cmc(void)
+{
+       u_int apic_id;
+
+       apic_id = PCPU_GET(apic_id);
+       KASSERT(lapics[apic_id].la_present,
+           ("%s: missing APIC %u", __func__, apic_id));
+       lapics[apic_id].la_lvts[LVT_CMCI].lvt_masked = 0;
+       lapics[apic_id].la_lvts[LVT_CMCI].lvt_active = 1;
+       if (bootverbose)
+               printf("lapic%u: CMCI unmasked\n", apic_id);
+}
+
+void
 lapic_handle_error(void)
 {
        u_int32_t esr;

Modified: stable/8/sys/amd64/amd64/machdep.c
==============================================================================
--- stable/8/sys/amd64/amd64/machdep.c  Wed Jul 14 20:55:45 2010        
(r210078)
+++ stable/8/sys/amd64/amd64/machdep.c  Wed Jul 14 21:10:14 2010        
(r210079)
@@ -283,7 +283,6 @@ cpu_startup(dummy)
        vm_pager_bufferinit();
 
        cpu_setregs();
-       mca_init();
 }
 
 /*

Modified: stable/8/sys/amd64/amd64/mca.c
==============================================================================
--- stable/8/sys/amd64/amd64/mca.c      Wed Jul 14 20:55:45 2010        
(r210078)
+++ stable/8/sys/amd64/amd64/mca.c      Wed Jul 14 21:10:14 2010        
(r210079)
@@ -33,6 +33,8 @@
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
+#include <sys/bus.h>
+#include <sys/interrupt.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
@@ -43,11 +45,29 @@ __FBSDID("$FreeBSD$");
 #include <sys/sysctl.h>
 #include <sys/systm.h>
 #include <sys/taskqueue.h>
+#include <machine/intr_machdep.h>
+#include <machine/apicvar.h>
 #include <machine/cputypes.h>
 #include <machine/mca.h>
 #include <machine/md_var.h>
 #include <machine/specialreg.h>
 
+/* Modes for mca_scan() */
+enum scan_mode {
+       POLLED,
+       MCE,
+       CMCI,
+};
+
+/*
+ * State maintained for each monitored MCx bank to control the
+ * corrected machine check interrupt threshold.
+ */
+struct cmc_state {
+       int     max_threshold;
+       int     last_intr;
+};
+
 struct mca_internal {
        struct mca_record rec;
        int             logged;
@@ -79,19 +99,22 @@ static struct callout mca_timer;
 static int mca_ticks = 3600;   /* Check hourly by default. */
 static struct task mca_task;
 static struct mtx mca_lock;
+static struct cmc_state **cmc_state;   /* Indexed by cpuid, bank */
+static int cmc_banks;
+static int cmc_throttle = 60;  /* Time in seconds to throttle CMCI. */
 
 static int
-sysctl_mca_ticks(SYSCTL_HANDLER_ARGS)
+sysctl_positive_int(SYSCTL_HANDLER_ARGS)
 {
        int error, value;
 
-       value = mca_ticks;
+       value = *(int *)arg1;
        error = sysctl_handle_int(oidp, &value, 0, req);
        if (error || req->newptr == NULL)
                return (error);
        if (value <= 0)
                return (EINVAL);
-       mca_ticks = value;
+       *(int *)arg1 = value;
        return (0);
 }
 
@@ -401,31 +424,112 @@ mca_record_entry(const struct mca_record
 }
 
 /*
+ * Update the interrupt threshold for a CMCI.  The strategy is to use
+ * a low trigger that interrupts as soon as the first event occurs.
+ * However, if a steady stream of events arrive, the threshold is
+ * increased until the interrupts are throttled to once every
+ * cmc_throttle seconds or the periodic scan.  If a periodic scan
+ * finds that the threshold is too high, it is lowered.
+ */
+static void
+cmci_update(enum scan_mode mode, int bank, int valid, struct mca_record *rec)
+{
+       struct cmc_state *cc;
+       uint64_t ctl;
+       u_int delta;
+       int count, limit;
+
+       /* Fetch the current limit for this bank. */
+       cc = &cmc_state[PCPU_GET(cpuid)][bank];
+       ctl = rdmsr(MSR_MC_CTL2(bank));
+       count = (rec->mr_status & MC_STATUS_COR_COUNT) >> 38;
+       delta = (u_int)(ticks - cc->last_intr);
+
+       /*
+        * If an interrupt was received less than cmc_throttle seconds
+        * since the previous interrupt and the count from the current
+        * event is greater than or equal to the current threshold,
+        * double the threshold up to the max.
+        */
+       if (mode == CMCI && valid) {
+               limit = ctl & MC_CTL2_THRESHOLD;
+               if (delta < cmc_throttle && count >= limit &&
+                   limit < cc->max_threshold) {
+                       limit = min(limit << 1, cc->max_threshold);
+                       ctl &= ~MC_CTL2_THRESHOLD;
+                       ctl |= limit;
+                       wrmsr(MSR_MC_CTL2(bank), limit);
+               }
+               cc->last_intr = ticks;
+               return;
+       }
+
+       /*
+        * When the banks are polled, check to see if the threshold
+        * should be lowered.
+        */
+       if (mode != POLLED)
+               return;
+
+       /* If a CMCI occurred recently, do nothing for now. */
+       if (delta < cmc_throttle)
+               return;
+
+       /*
+        * Compute a new limit based on the average rate of events per
+        * cmc_throttle seconds since the last interrupt.
+        */
+       if (valid) {
+               count = (rec->mr_status & MC_STATUS_COR_COUNT) >> 38;
+               limit = count * cmc_throttle / delta;
+               if (limit <= 0)
+                       limit = 1;
+               else if (limit > cc->max_threshold)
+                       limit = cc->max_threshold;
+       } else
+               limit = 1;
+       if ((ctl & MC_CTL2_THRESHOLD) != limit) {
+               ctl &= ~MC_CTL2_THRESHOLD;
+               ctl |= limit;
+               wrmsr(MSR_MC_CTL2(bank), limit);
+       }
+}
+
+/*
  * This scans all the machine check banks of the current CPU to see if
  * there are any machine checks.  Any non-recoverable errors are
  * reported immediately via mca_log().  The current thread must be
- * pinned when this is called.  The 'mcip' parameter indicates if we
- * are being called from the MC exception handler.  In that case this
- * function returns true if the system is restartable.  Otherwise, it
- * returns a count of the number of valid MC records found.
+ * pinned when this is called.  The 'mode' parameter indicates if we
+ * are being called from the MC exception handler, the CMCI handler,
+ * or the periodic poller.  In the MC exception case this function
+ * returns true if the system is restartable.  Otherwise, it returns a
+ * count of the number of valid MC records found.
  */
 static int
-mca_scan(int mcip)
+mca_scan(enum scan_mode mode)
 {
        struct mca_record rec;
        uint64_t mcg_cap, ucmask;
-       int count, i, recoverable;
+       int count, i, recoverable, valid;
 
        count = 0;
        recoverable = 1;
        ucmask = MC_STATUS_UC | MC_STATUS_PCC;
 
        /* When handling a MCE#, treat the OVER flag as non-restartable. */
-       if (mcip)
+       if (mode == MCE)
                ucmask |= MC_STATUS_OVER;
        mcg_cap = rdmsr(MSR_MCG_CAP);
        for (i = 0; i < (mcg_cap & MCG_CAP_COUNT); i++) {
-               if (mca_check_status(i, &rec)) {
+               /*
+                * For a CMCI, only check banks this CPU is
+                * responsible for.
+                */
+               if (mode == CMCI && !(PCPU_GET(cmci_mask) & 1 << i))
+                       continue;
+
+               valid = mca_check_status(i, &rec);
+               if (valid) {
                        count++;
                        if (rec.mr_status & ucmask) {
                                recoverable = 0;
@@ -433,8 +537,15 @@ mca_scan(int mcip)
                        }
                        mca_record_entry(&rec);
                }
+       
+               /*
+                * If this is a bank this CPU monitors via CMCI,
+                * update the threshold.
+                */
+               if (PCPU_GET(cmci_mask) & (1 << i))
+                       cmci_update(mode, i, valid, &rec);
        }
-       return (mcip ? recoverable : count);
+       return (mode == MCE ? recoverable : count);
 }
 
 /*
@@ -457,7 +568,7 @@ mca_scan_cpus(void *context, int pending
                        continue;
                sched_bind(td, cpu);
                thread_unlock(td);
-               count += mca_scan(0);
+               count += mca_scan(POLLED);
                thread_lock(td);
                sched_unbind(td);
        }
@@ -511,7 +622,24 @@ mca_startup(void *dummy)
 SYSINIT(mca_startup, SI_SUB_SMP, SI_ORDER_ANY, mca_startup, NULL);
 
 static void
-mca_setup(void)
+cmci_setup(uint64_t mcg_cap)
+{
+       int i;
+
+       cmc_state = malloc((mp_maxid + 1) * sizeof(struct cmc_state **),
+           M_MCA, M_WAITOK);
+       cmc_banks = mcg_cap & MCG_CAP_COUNT;
+       for (i = 0; i <= mp_maxid; i++)
+               cmc_state[i] = malloc(sizeof(struct cmc_state) * cmc_banks,
+                   M_MCA, M_WAITOK | M_ZERO);
+       SYSCTL_ADD_PROC(NULL, SYSCTL_STATIC_CHILDREN(_hw_mca), OID_AUTO,
+           "cmc_throttle", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
+           &cmc_throttle, 0, sysctl_positive_int, "I",
+           "Interval in seconds to throttle corrected MC interrupts");
+}
+
+static void
+mca_setup(uint64_t mcg_cap)
 {
 
        mtx_init(&mca_lock, "mca", NULL, MTX_SPIN);
@@ -522,13 +650,62 @@ mca_setup(void)
            "count", CTLFLAG_RD, &mca_count, 0, "Record count");
        SYSCTL_ADD_PROC(NULL, SYSCTL_STATIC_CHILDREN(_hw_mca), OID_AUTO,
            "interval", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, &mca_ticks,
-           0, sysctl_mca_ticks, "I",
+           0, sysctl_positive_int, "I",
            "Periodic interval in seconds to scan for machine checks");
        SYSCTL_ADD_NODE(NULL, SYSCTL_STATIC_CHILDREN(_hw_mca), OID_AUTO,
            "records", CTLFLAG_RD, sysctl_mca_records, "Machine check records");
        SYSCTL_ADD_PROC(NULL, SYSCTL_STATIC_CHILDREN(_hw_mca), OID_AUTO,
            "force_scan", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 0,
            sysctl_mca_scan, "I", "Force an immediate scan for machine checks");
+       if (mcg_cap & MCG_CAP_CMCI_P)
+               cmci_setup(mcg_cap);
+}
+
+/*
+ * See if we should monitor CMCI for this bank.  If CMCI_EN is already
+ * set in MC_CTL2, then another CPU is responsible for this bank, so
+ * ignore it.  If CMCI_EN returns zero after being set, then this bank
+ * does not support CMCI_EN.  If this CPU sets CMCI_EN, then it should
+ * now monitor this bank.
+ */
+static void
+cmci_monitor(int i)
+{
+       struct cmc_state *cc;
+       uint64_t ctl;
+
+       KASSERT(i < cmc_banks, ("CPU %d has more MC banks", PCPU_GET(cpuid)));
+
+       ctl = rdmsr(MSR_MC_CTL2(i));
+       if (ctl & MC_CTL2_CMCI_EN)
+               /* Already monitored by another CPU. */
+               return;
+
+       /* Set the threshold to one event for now. */
+       ctl &= ~MC_CTL2_THRESHOLD;
+       ctl |= MC_CTL2_CMCI_EN | 1;
+       wrmsr(MSR_MC_CTL2(i), ctl);
+       ctl = rdmsr(MSR_MC_CTL2(i));
+       if (!(ctl & MC_CTL2_CMCI_EN))
+               /* This bank does not support CMCI. */
+               return;
+
+       cc = &cmc_state[PCPU_GET(cpuid)][i];
+
+       /* Determine maximum threshold. */
+       ctl &= ~MC_CTL2_THRESHOLD;
+       ctl |= 0x7fff;
+       wrmsr(MSR_MC_CTL2(i), ctl);
+       ctl = rdmsr(MSR_MC_CTL2(i));
+       cc->max_threshold = ctl & MC_CTL2_THRESHOLD;
+
+       /* Start off with a threshold of 1. */
+       ctl &= ~MC_CTL2_THRESHOLD;
+       ctl |= 1;
+       wrmsr(MSR_MC_CTL2(i), ctl);
+
+       /* Mark this bank as monitored. */
+       PCPU_SET(cmci_mask, PCPU_GET(cmci_mask) | 1 << i);
 }
 
 /* Must be executed on each CPU. */
@@ -554,14 +731,14 @@ mca_init(void)
                workaround_erratum383 = 1;
 
        if (cpu_feature & CPUID_MCA) {
-               if (PCPU_GET(cpuid) == 0)
-                       mca_setup();
+               PCPU_SET(cmci_mask, 0);
 
-               sched_pin();
                mcg_cap = rdmsr(MSR_MCG_CAP);
                if (mcg_cap & MCG_CAP_CTL_P)
                        /* Enable MCA features. */
                        wrmsr(MSR_MCG_CTL, MCG_CTL_ENABLE);
+               if (PCPU_GET(cpuid) == 0)
+                       mca_setup(mcg_cap);
 
                /*
                 * Disable logging of level one TLB parity (L1TP) errors by
@@ -597,15 +774,34 @@ mca_init(void)
 
                        if (!skip)
                                wrmsr(MSR_MC_CTL(i), ctl);
+
+                       if (mcg_cap & MCG_CAP_CMCI_P)
+                               cmci_monitor(i);
+
                        /* Clear all errors. */
                        wrmsr(MSR_MC_STATUS(i), 0);
                }
-               sched_unpin();
+
+               if (PCPU_GET(cmci_mask) != 0)
+                       lapic_enable_cmc();
        }
 
        load_cr4(rcr4() | CR4_MCE);
 }
 
+/*
+ * The machine check registers for the BSP cannot be initialized until
+ * the local APIC is initialized.  This happens at SI_SUB_CPU,
+ * SI_ORDER_SECOND.
+ */
+static void
+mca_init_bsp(void *arg __unused)
+{
+
+       mca_init();
+}
+SYSINIT(mca_init_bsp, SI_SUB_CPU, SI_ORDER_ANY, mca_init_bsp, NULL);
+
 /* Called when a machine check exception fires. */
 int
 mca_intr(void)
@@ -624,7 +820,7 @@ mca_intr(void)
        }
 
        /* Scan the banks and check for any non-recoverable errors. */
-       recoverable = mca_scan(1);
+       recoverable = mca_scan(MCE);
        mcg_status = rdmsr(MSR_MCG_STATUS);
        if (!(mcg_status & MCG_STATUS_RIPV))
                recoverable = 0;
@@ -633,3 +829,31 @@ mca_intr(void)
        wrmsr(MSR_MCG_STATUS, mcg_status & ~MCG_STATUS_MCIP);
        return (recoverable);
 }
+
+/* Called for a CMCI (correctable machine check interrupt). */
+void
+cmc_intr(void)
+{
+       struct mca_internal *mca;
+       int count;
+
+       /*
+        * Serialize MCA bank scanning to prevent collisions from
+        * sibling threads.
+        */
+       count = mca_scan(CMCI);
+
+       /* If we found anything, log them to the console. */
+       if (count != 0) {
+               mtx_lock_spin(&mca_lock);
+               STAILQ_FOREACH(mca, &mca_records, link) {
+                       if (!mca->logged) {
+                               mca->logged = 1;
+                               mtx_unlock_spin(&mca_lock);
+                               mca_log(&mca->rec);
+                               mtx_lock_spin(&mca_lock);
+                       }
+               }
+               mtx_unlock_spin(&mca_lock);
+       }
+}

Modified: stable/8/sys/amd64/include/apicreg.h
==============================================================================
--- stable/8/sys/amd64/include/apicreg.h        Wed Jul 14 20:55:45 2010        
(r210078)
+++ stable/8/sys/amd64/include/apicreg.h        Wed Jul 14 21:10:14 2010        
(r210079)
@@ -89,7 +89,7 @@
  * 2C0             Reserved
  * 2D0             Reserved
  * 2E0             Reserved
- * 2F0             Reserved
+ * 2F0             Local Vector Table (CMCI)       R/W
  * 300 ICR_LOW     Interrupt Command Reg. (0-31)   R/W
  * 310 ICR_HI      Interrupt Command Reg. (32-63)  R/W
  * 320             Local Vector Table (Timer)      R/W
@@ -172,7 +172,7 @@ struct LAPIC {
        /* reserved */          PAD4;
        /* reserved */          PAD4;
        /* reserved */          PAD4;
-       /* reserved */          PAD4;
+       u_int32_t lvt_cmci;     PAD3;
        u_int32_t icr_lo;       PAD3;
        u_int32_t icr_hi;       PAD3;
        u_int32_t lvt_timer;    PAD3;

Modified: stable/8/sys/amd64/include/apicvar.h
==============================================================================
--- stable/8/sys/amd64/include/apicvar.h        Wed Jul 14 20:55:45 2010        
(r210078)
+++ stable/8/sys/amd64/include/apicvar.h        Wed Jul 14 21:10:14 2010        
(r210079)
@@ -108,8 +108,9 @@
 #define        APIC_LOCAL_INTS 240
 #define        APIC_ERROR_INT  APIC_LOCAL_INTS
 #define        APIC_THERMAL_INT (APIC_LOCAL_INTS + 1)
+#define        APIC_CMC_INT    (APIC_LOCAL_INTS + 2)
 
-#define        APIC_IPI_INTS   (APIC_LOCAL_INTS + 2)
+#define        APIC_IPI_INTS   (APIC_LOCAL_INTS + 3)
 #define        IPI_RENDEZVOUS  (APIC_IPI_INTS)         /* Inter-CPU 
rendezvous. */
 #define        IPI_INVLTLB     (APIC_IPI_INTS + 1)     /* TLB Shootdown IPIs */
 #define        IPI_INVLPG      (APIC_IPI_INTS + 2)
@@ -143,7 +144,8 @@
 #define        LVT_ERROR       3
 #define        LVT_PMC         4
 #define        LVT_THERMAL     5
-#define        LVT_MAX         LVT_THERMAL
+#define        LVT_CMCI        6
+#define        LVT_MAX         LVT_CMCI
 
 #ifndef LOCORE
 
@@ -179,8 +181,8 @@ struct apic_enumerator {
 inthand_t
        IDTVEC(apic_isr1), IDTVEC(apic_isr2), IDTVEC(apic_isr3),
        IDTVEC(apic_isr4), IDTVEC(apic_isr5), IDTVEC(apic_isr6),
-       IDTVEC(apic_isr7), IDTVEC(errorint), IDTVEC(spuriousint),
-       IDTVEC(timerint);
+       IDTVEC(apic_isr7), IDTVEC(cmcint), IDTVEC(errorint),
+       IDTVEC(spuriousint), IDTVEC(timerint);
 
 extern vm_paddr_t lapic_paddr;
 extern int apic_cpuids[];
@@ -210,6 +212,7 @@ void        lapic_create(u_int apic_id, int boo
 void   lapic_disable(void);
 void   lapic_disable_pmc(void);
 void   lapic_dump(const char *str);
+void   lapic_enable_cmc(void);
 int    lapic_enable_pmc(void);
 void   lapic_eoi(void);
 int    lapic_id(void);
@@ -218,6 +221,7 @@ int lapic_intr_pending(u_int vector);
 void   lapic_ipi_raw(register_t icrlo, u_int dest);
 void   lapic_ipi_vectored(u_int vector, int dest);
 int    lapic_ipi_wait(int delay);
+void   lapic_handle_cmc(void);
 void   lapic_handle_error(void);
 void   lapic_handle_intr(int vector, struct trapframe *frame);
 void   lapic_handle_timer(struct trapframe *frame);

Modified: stable/8/sys/amd64/include/mca.h
==============================================================================
--- stable/8/sys/amd64/include/mca.h    Wed Jul 14 20:55:45 2010        
(r210078)
+++ stable/8/sys/amd64/include/mca.h    Wed Jul 14 21:10:14 2010        
(r210079)
@@ -46,6 +46,7 @@ struct mca_record {
 
 #ifdef _KERNEL
 
+void   cmc_intr(void);
 void   mca_init(void);
 int    mca_intr(void);
 

Modified: stable/8/sys/amd64/include/pcpu.h
==============================================================================
--- stable/8/sys/amd64/include/pcpu.h   Wed Jul 14 20:55:45 2010        
(r210078)
+++ stable/8/sys/amd64/include/pcpu.h   Wed Jul 14 21:10:14 2010        
(r210079)
@@ -75,7 +75,8 @@
        /* Pointer to the CPU LDT descriptor */                         \
        struct system_segment_descriptor *pc_ldt;                       \
        /* Pointer to the CPU TSS descriptor */                         \
-       struct system_segment_descriptor *pc_tss                        \
+       struct system_segment_descriptor *pc_tss;                       \
+       u_int   pc_cmci_mask            /* MCx banks for CMCI */        \
        PCPU_XEN_FIELDS
 
 #ifdef _KERNEL

Modified: stable/8/sys/amd64/include/specialreg.h
==============================================================================
--- stable/8/sys/amd64/include/specialreg.h     Wed Jul 14 20:55:45 2010        
(r210078)
+++ stable/8/sys/amd64/include/specialreg.h     Wed Jul 14 21:10:14 2010        
(r210079)
@@ -385,7 +385,7 @@
 #define        MC_STATUS_VAL           0x8000000000000000
 #define        MC_MISC_RA_LSB          0x000000000000003f      /* If 
MCG_CAP_SER_P */
 #define        MC_MISC_ADDRESS_MODE    0x00000000000001c0      /* If 
MCG_CAP_SER_P */
-#define        MC_CTL2_THRESHOLD       0x0000000000003fff
+#define        MC_CTL2_THRESHOLD       0x0000000000007fff
 #define        MC_CTL2_CMCI_EN         0x0000000040000000
 
 /*

Modified: stable/8/sys/i386/i386/apic_vector.s
==============================================================================
--- stable/8/sys/i386/i386/apic_vector.s        Wed Jul 14 20:55:45 2010        
(r210078)
+++ stable/8/sys/i386/i386/apic_vector.s        Wed Jul 14 21:10:14 2010        
(r210079)
@@ -113,6 +113,19 @@ IDTVEC(timerint)
        jmp     doreti
 
 /*
+ * Local APIC CMCI handler.
+ */
+       .text
+       SUPERALIGN_TEXT
+IDTVEC(cmcint)
+       PUSH_FRAME
+       SET_KERNEL_SREGS
+       FAKE_MCOUNT(TF_EIP(%esp))
+       call    lapic_handle_cmc
+       MEXITCOUNT
+       jmp     doreti
+
+/*
  * Local APIC error interrupt handler.
  */
        .text

Modified: stable/8/sys/i386/i386/local_apic.c
==============================================================================
--- stable/8/sys/i386/i386/local_apic.c Wed Jul 14 20:55:45 2010        
(r210078)
+++ stable/8/sys/i386/i386/local_apic.c Wed Jul 14 21:10:14 2010        
(r210079)
@@ -60,6 +60,7 @@ __FBSDID("$FreeBSD$");
 #include <machine/frame.h>
 #include <machine/intr_machdep.h>
 #include <machine/apicvar.h>
+#include <machine/mca.h>
 #include <machine/md_var.h>
 #include <machine/smp.h>
 #include <machine/specialreg.h>
@@ -124,6 +125,7 @@ static struct lvt lvts[LVT_MAX + 1] = {
        { 1, 1, 0, 1, APIC_LVT_DM_FIXED, APIC_ERROR_INT },      /* Error */
        { 1, 1, 1, 1, APIC_LVT_DM_NMI, 0 },     /* PMC */
        { 1, 1, 1, 1, APIC_LVT_DM_FIXED, APIC_THERMAL_INT },    /* Thermal */
+       { 1, 1, 1, 1, APIC_LVT_DM_FIXED, APIC_CMC_INT },        /* CMCI */
 };
 
 static inthand_t *ioint_handlers[] = {
@@ -231,6 +233,10 @@ lapic_init(vm_paddr_t addr)
            GSEL(GCODE_SEL, SEL_KPL));
 
        /* XXX: Thermal interrupt */
+
+       /* Local APIC CMCI. */
+       setidt(APIC_CMC_INT, IDTVEC(cmcint), SDT_SYS386TGT, SEL_KPL,
+           GSEL(GCODE_SEL, SEL_KPL));
 }
 
 /*
@@ -256,7 +262,7 @@ lapic_create(u_int apic_id, int boot_cpu
         */
        lapics[apic_id].la_present = 1;
        lapics[apic_id].la_id = apic_id;
-       for (i = 0; i < LVT_MAX; i++) {
+       for (i = 0; i <= LVT_MAX; i++) {
                lapics[apic_id].la_lvts[i] = lvts[i];
                lapics[apic_id].la_lvts[i].lvt_active = 0;
        }
@@ -286,6 +292,7 @@ lapic_dump(const char* str)
        printf("  timer: 0x%08x therm: 0x%08x err: 0x%08x pmc: 0x%08x\n",
            lapic->lvt_timer, lapic->lvt_thermal, lapic->lvt_error,
            lapic->lvt_pcint);
+       printf("   cmci: 0x%08x\n", lapic->lvt_cmci);
 }
 
 void
@@ -337,6 +344,10 @@ lapic_setup(int boot)
 
        /* XXX: Thermal LVT */
 
+       /* Program the CMCI LVT entry if present. */
+       if (maxlvt >= LVT_CMCI)
+               lapic->lvt_cmci = lvt_mode(la, LVT_CMCI, lapic->lvt_cmci);
+           
        intr_restore(eflags);
 }
 
@@ -858,6 +869,34 @@ lapic_timer_enable_intr(void)
 }
 
 void
+lapic_handle_cmc(void)
+{
+
+       lapic_eoi();
+       cmc_intr();
+}
+
+/*
+ * Called from the mca_init() to activate the CMC interrupt if this CPU is
+ * responsible for monitoring any MC banks for CMC events.  Since mca_init()
+ * is called prior to lapic_setup() during boot, this just needs to unmask
+ * this CPU's LVT_CMCI entry.
+ */
+void
+lapic_enable_cmc(void)
+{
+       u_int apic_id;
+
+       apic_id = PCPU_GET(apic_id);
+       KASSERT(lapics[apic_id].la_present,
+           ("%s: missing APIC %u", __func__, apic_id));
+       lapics[apic_id].la_lvts[LVT_CMCI].lvt_masked = 0;
+       lapics[apic_id].la_lvts[LVT_CMCI].lvt_active = 1;
+       if (bootverbose)
+               printf("lapic%u: CMCI unmasked\n", apic_id);
+}
+
+void
 lapic_handle_error(void)
 {
        u_int32_t esr;

Modified: stable/8/sys/i386/i386/machdep.c
==============================================================================
--- stable/8/sys/i386/i386/machdep.c    Wed Jul 14 20:55:45 2010        
(r210078)
+++ stable/8/sys/i386/i386/machdep.c    Wed Jul 14 21:10:14 2010        
(r210079)
@@ -328,7 +328,6 @@ cpu_startup(dummy)
 #ifndef XEN
        cpu_setregs();
 #endif
-       mca_init();
 }
 
 /*

Modified: stable/8/sys/i386/i386/mca.c
==============================================================================
--- stable/8/sys/i386/i386/mca.c        Wed Jul 14 20:55:45 2010        
(r210078)
+++ stable/8/sys/i386/i386/mca.c        Wed Jul 14 21:10:14 2010        
(r210079)
@@ -32,7 +32,11 @@
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
+#include "opt_apic.h"
+
 #include <sys/param.h>
+#include <sys/bus.h>
+#include <sys/interrupt.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
@@ -43,11 +47,31 @@ __FBSDID("$FreeBSD$");
 #include <sys/sysctl.h>
 #include <sys/systm.h>
 #include <sys/taskqueue.h>
+#include <machine/intr_machdep.h>
+#include <machine/apicvar.h>
 #include <machine/cputypes.h>
 #include <machine/mca.h>
 #include <machine/md_var.h>
 #include <machine/specialreg.h>
 
+/* Modes for mca_scan() */
+enum scan_mode {
+       POLLED,
+       MCE,
+       CMCI,
+};
+
+#ifdef DEV_APIC
+/*
+ * State maintained for each monitored MCx bank to control the
+ * corrected machine check interrupt threshold.
+ */
+struct cmc_state {
+       int     max_threshold;
+       int     last_intr;
+};
+#endif
+
 struct mca_internal {
        struct mca_record rec;
        int             logged;
@@ -80,18 +104,24 @@ static int mca_ticks = 3600;       /* Check ho
 static struct task mca_task;
 static struct mtx mca_lock;
 
+#ifdef DEV_APIC
+static struct cmc_state **cmc_state;   /* Indexed by cpuid, bank */
+static int cmc_banks;
+static int cmc_throttle = 60;  /* Time in seconds to throttle CMCI. */
+#endif
+
 static int
-sysctl_mca_ticks(SYSCTL_HANDLER_ARGS)
+sysctl_positive_int(SYSCTL_HANDLER_ARGS)
 {
        int error, value;
 
-       value = mca_ticks;
+       value = *(int *)arg1;
        error = sysctl_handle_int(oidp, &value, 0, req);
        if (error || req->newptr == NULL)
                return (error);
        if (value <= 0)
                return (EINVAL);
-       mca_ticks = value;
+       *(int *)arg1 = value;
        return (0);
 }
 
@@ -400,32 +430,117 @@ mca_record_entry(const struct mca_record
        mtx_unlock_spin(&mca_lock);
 }
 
+#ifdef DEV_APIC
+/*
+ * Update the interrupt threshold for a CMCI.  The strategy is to use
+ * a low trigger that interrupts as soon as the first event occurs.
+ * However, if a steady stream of events arrive, the threshold is
+ * increased until the interrupts are throttled to once every
+ * cmc_throttle seconds or the periodic scan.  If a periodic scan
+ * finds that the threshold is too high, it is lowered.
+ */
+static void
+cmci_update(enum scan_mode mode, int bank, int valid, struct mca_record *rec)
+{
+       struct cmc_state *cc;
+       uint64_t ctl;
+       u_int delta;
+       int count, limit;
+
+       /* Fetch the current limit for this bank. */
+       cc = &cmc_state[PCPU_GET(cpuid)][bank];
+       ctl = rdmsr(MSR_MC_CTL2(bank));
+
+       /*
+        * Convert the elapsed tick count into seconds so that it can
+        * be compared against cmc_throttle, which is in seconds.
+        */
+       delta = (u_int)(ticks - cc->last_intr) / hz;
+
+       /*
+        * If an interrupt was received less than cmc_throttle seconds
+        * since the previous interrupt and the count from the current
+        * event is greater than or equal to the current threshold,
+        * double the threshold up to the max.  'rec' is only filled in
+        * when 'valid' is set, so only read the count in that case.
+        */
+       if (mode == CMCI && valid) {
+               count = (rec->mr_status & MC_STATUS_COR_COUNT) >> 38;
+               limit = ctl & MC_CTL2_THRESHOLD;
+               if (delta < cmc_throttle && count >= limit &&
+                   limit < cc->max_threshold) {
+                       limit = min(limit << 1, cc->max_threshold);
+                       ctl &= ~MC_CTL2_THRESHOLD;
+                       ctl |= limit;
+
+                       /*
+                        * Write back the full control word, not just the
+                        * threshold, so that MC_CTL2_CMCI_EN (and any other
+                        * control bits) are preserved.  Writing only 'limit'
+                        * would disable CMCI for this bank.
+                        */
+                       wrmsr(MSR_MC_CTL2(bank), ctl);
+               }
+               cc->last_intr = ticks;
+               return;
+       }
+
+       /*
+        * When the banks are polled, check to see if the threshold
+        * should be lowered.
+        */
+       if (mode != POLLED)
+               return;
+
+       /* If a CMCI occurred recently, do nothing for now. */
+       if (delta < cmc_throttle)
+               return;
+
+       /*
+        * Compute a new limit based on the average rate of events per
+        * cmc_throttle seconds since the last interrupt.  'delta' is
+        * at least cmc_throttle (checked above), so it is non-zero.
+        */
+       if (valid) {
+               count = (rec->mr_status & MC_STATUS_COR_COUNT) >> 38;
+               limit = count * cmc_throttle / delta;
+               if (limit <= 0)
+                       limit = 1;
+               else if (limit > cc->max_threshold)
+                       limit = cc->max_threshold;
+       } else
+               limit = 1;
+       if ((ctl & MC_CTL2_THRESHOLD) != limit) {
+               ctl &= ~MC_CTL2_THRESHOLD;
+               ctl |= limit;
+               /* As above, preserve MC_CTL2_CMCI_EN by writing 'ctl'. */
+               wrmsr(MSR_MC_CTL2(bank), ctl);
+       }
+}
+#endif
+
 /*
  * This scans all the machine check banks of the current CPU to see if
  * there are any machine checks.  Any non-recoverable errors are
  * reported immediately via mca_log().  The current thread must be
- * pinned when this is called.  The 'mcip' parameter indicates if we
- * are being called from the MC exception handler.  In that case this
- * function returns true if the system is restartable.  Otherwise, it
- * returns a count of the number of valid MC records found.
+ * pinned when this is called.  The 'mode' parameter indicates if we
+ * are being called from the MC exception handler, the CMCI handler,
+ * or the periodic poller.  In the MC exception case this function
+ * returns true if the system is restartable.  Otherwise, it returns a
+ * count of the number of valid MC records found.
  */
 static int
-mca_scan(int mcip)
+mca_scan(enum scan_mode mode)
 {
        struct mca_record rec;
        uint64_t mcg_cap, ucmask;
-       int count, i, recoverable;
+       int count, i, recoverable, valid;
 
        count = 0;
        recoverable = 1;
        ucmask = MC_STATUS_UC | MC_STATUS_PCC;
 
        /* When handling a MCE#, treat the OVER flag as non-restartable. */
-       if (mcip)
+       if (mode == MCE)
                ucmask |= MC_STATUS_OVER;
        mcg_cap = rdmsr(MSR_MCG_CAP);
        for (i = 0; i < (mcg_cap & MCG_CAP_COUNT); i++) {
-               if (mca_check_status(i, &rec)) {
+#ifdef DEV_APIC
+               /*
+                * For a CMCI, only check banks this CPU is
+                * responsible for.
+                */
+               if (mode == CMCI && !(PCPU_GET(cmci_mask) & 1 << i))
+                       continue;
+#endif
+
+               valid = mca_check_status(i, &rec);
+               if (valid) {
                        count++;
                        if (rec.mr_status & ucmask) {
                                recoverable = 0;
@@ -433,8 +548,17 @@ mca_scan(int mcip)
                        }
                        mca_record_entry(&rec);
                }
+
+#ifdef DEV_APIC
+               /*
+                * If this is a bank this CPU monitors via CMCI,
+                * update the threshold.
+                */
+               if (PCPU_GET(cmci_mask) & (1 << i))
+                       cmci_update(mode, i, valid, &rec);
+#endif
        }
-       return (mcip ? recoverable : count);
+       return (mode == MCE ? recoverable : count);
 }
 
 /*
@@ -457,7 +581,7 @@ mca_scan_cpus(void *context, int pending
                        continue;
                sched_bind(td, cpu);
                thread_unlock(td);
-               count += mca_scan(0);
+               count += mca_scan(POLLED);
                thread_lock(td);
                sched_unbind(td);
        }
@@ -510,8 +634,27 @@ mca_startup(void *dummy)
 }
 SYSINIT(mca_startup, SI_SUB_SMP, SI_ORDER_ANY, mca_startup, NULL);
 
+#ifdef DEV_APIC
 static void
-mca_setup(void)
+cmci_setup(uint64_t mcg_cap)
+{
+       int i;
+
+       /*
+        * cmc_state is an array of pointers (one per CPU), so size the
+        * allocation by the element type 'struct cmc_state *', not
+        * 'struct cmc_state **'.
+        */
+       cmc_state = malloc((mp_maxid + 1) * sizeof(struct cmc_state *),
+           M_MCA, M_WAITOK);
+       cmc_banks = mcg_cap & MCG_CAP_COUNT;
+       for (i = 0; i <= mp_maxid; i++)
+               cmc_state[i] = malloc(sizeof(struct cmc_state) * cmc_banks,
+                   M_MCA, M_WAITOK | M_ZERO);
+       SYSCTL_ADD_PROC(NULL, SYSCTL_STATIC_CHILDREN(_hw_mca), OID_AUTO,
+           "cmc_throttle", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
+           &cmc_throttle, 0, sysctl_positive_int, "I",
+           "Interval in seconds to throttle corrected MC interrupts");
+}
+#endif
+

*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***
_______________________________________________
svn-src-all@freebsd.org mailing list
http://lists.freebsd.org/mailman/listinfo/svn-src-all
To unsubscribe, send any mail to "svn-src-all-unsubscr...@freebsd.org"

Reply via email to