On Mon, 17 Dec 2007 10:54:53 -0800 (PST) [EMAIL PROTECTED] (Russell Leidich) 
wrote:

> Hi,
> 
> This patch to 2.6.24-rc5 enables AMD Barcelona CPUs to register thermal 
> throttling events as machine checks, in the
> same fashion as the analogous Intel code.
> 
> Changed files:
> 
> arch/x86/kernel/cpu/mcheck/Makefile
> arch/x86/kernel/cpu/mcheck/mce_amd_64.c
> arch/x86/kernel/cpu/mcheck/mce_intel_64.c
> include/asm-x86/mce.h
> 
> New files:
> 
> arch/x86/kernel/cpu/mcheck/mce_thermal.c
> 
> ...
>
> +extern int num_k8_northbridges;
> +extern struct pci_dev **k8_northbridges;

Please never add extern declarations in C files - find a suitable header
which is seen by the definition and by all users.

> -static int threshold_cpu_callback(struct notifier_block *nfb,
> +static int amd_cpu_callback(struct notifier_block *nfb,
>                                           unsigned long action, void *hcpu)
>  {
>       /* cpu was unsigned int to begin with */
>       unsigned int cpu = (unsigned long)hcpu;
>  
> -     if (cpu >= NR_CPUS)
> -             goto out;
> -
>       switch (action) {
>       case CPU_ONLINE:
>       case CPU_ONLINE_FROZEN:
>               threshold_create_device(cpu);
> -             break;
> +             if (thermal_apic_init_allowed) {
> +                     /*
> +                      * We need to run thermal_apic_init() on the core that
> +                      * just came online.  If we're already on that core,
> +                      * then directly run it.  Otherwise
> +                      * smp_call_function_single() to that core.
> +                      */
> +                     if (cpu == get_cpu())
> +                             thermal_apic_init(NULL);
> +                     else
> +                             smp_call_function_single(cpu,
> +                                             &thermal_apic_init, NULL, 1, 0);
> +                     put_cpu();
> +             }

smp_call_function_single() already takes care of the
called-on-the-right-cpu case.

> +             break;
>       case CPU_DEAD:
>       case CPU_DEAD_FROZEN:
>               threshold_remove_device(cpu);
> @@ -665,12 +692,11 @@ static int threshold_cpu_callback(struct
>       default:
>               break;
>       }
> -      out:
>       return NOTIFY_OK;
>  }
>  
> +
> +/*
> + * Enable the delivery of thermal interrupts via the local APIC.
> + */
> +static void thermal_apic_init(void *unused) {

eek, google coding "style" ;)

Please feed patches through scripts/checkpatch.pl.

> +     unsigned int apic_lv_therm;
> +
> +     /* Set up APIC_LVTTHMR to issue THERMAL_APIC_VECTOR. */
> +     apic_lv_therm = apic_read(APIC_LVTTHMR);
> +     /*
> +      * See if some agent other than this routine has already initialized
> +      * APIC_LVTTHMR, i.e. if it's unmasked, but not equal to the value that
> +      * we would have programmed, had we been here before on this core.
> +      */
> +     if ((!(apic_lv_therm & APIC_LVT_MASKED)) && ((apic_lv_therm &
> +             (APIC_MODE_MASK | APIC_VECTOR_MASK)) != (APIC_DM_FIXED |
> +             THERMAL_APIC_VECTOR))) {
> +             unsigned int cpu = smp_processor_id();

afaict this function is called while the calling thread is running
preemptibly.  This smp_processor_id() call should have generated a runtime
warning if it was tested with all debug options enabled?

Please see Documentation/SumitChecklist.

> +             printk(KERN_CRIT "CPU 0x%x: Thermal monitoring not "
> +                     "functional.\n", cpu);
> +             if ((apic_lv_therm & APIC_MODE_MASK) == APIC_DM_SMI)
> +                     printk(KERN_DEBUG "Thermal interrupts already "
> +                             "handled by SMI according to (((local APIC "
> +                             "base) + 0x330) bit 0x9).\n");
> +             else
> +                     printk(KERN_DEBUG "Thermal interrupts unexpectedly "
> +                             "enabled at (((local APIC base) + 0x330) bit "
> +                             "0x10).\n");
> +     } else {
> +             /*
> +              * Configure the Local Thermal Vector Table Entry to issue
> +              * issue thermal interrupts to THERMAL_APIC_VECTOR.
> +              *
> +              * Start by masking off Delivery Mode and Vector.
> +              */
> +             apic_lv_therm &= ~(APIC_MODE_MASK | APIC_VECTOR_MASK);
> +             /* Fixed interrupt, masked for now. */
> +             apic_lv_therm |= APIC_LVT_MASKED | APIC_DM_FIXED |
> +                                                     THERMAL_APIC_VECTOR;
> +             apic_write(APIC_LVTTHMR, apic_lv_therm);
> +             /*
> +              * The Intel thermal kernel code implies that there may be a
> +              * race involving the mask bit, so clear it only now, after
> +              * the other bits have settled.
> +              */
> +             apic_write(APIC_LVTTHMR, apic_lv_therm & ~APIC_LVT_MASKED);
> +     }
> +}
> +
> +
> +/*
> + * Determine whether or not the northbridges support thermal throttling
> + * interrupts.  If so, initialize them for receiving the same, then perform
> + * corresponding APIC initialization on each core.
> + */
> +static int smp_thermal_interrupt_init(void)
> +{
> +     int nb_num;
> +     int thermal_registers_functional;
> +
> +     /*
> +      * If there are no recognized northbridges, then we can't talk to the
> +      * thermal registers.
> +      */
> +     thermal_registers_functional = num_k8_northbridges; 
> +     /*
> +      * If any of the northbridges has PCI ID 0x1103, then its thermal
> +      * hardware suffers from an erratum which prevents this code from 
> working,
> +      * so abort.
> +      */
> +     for (nb_num = 0; nb_num < num_k8_northbridges; nb_num++) {
> +             if ((k8_northbridges[nb_num]->device) == 0x1103) {
> +                     thermal_registers_functional = 0;
> +                     break;
> +             }
> +     }
> +     if (thermal_registers_functional) {
> +             /*
> +              * Assert that we should log thermal throttling events, whenever
> +              * we eventually get around to enabling them.
> +              */
> +             atomic_set(&therm_throt_en, 1);
> +             /*
> +              * Bind cpu_specific_smp_thermal_interrupt() to
> +              * amd_smp_thermal_interrupt().
> +              */
> +             cpu_specific_smp_thermal_interrupt = amd_smp_thermal_interrupt;
> +             smp_thermal_northbridge_init();
> +             /*
> +              * We've now initialized sufficient fabric to permit the
> +              * initialization of the thermal interrupt APIC vectors, such as
> +              * when a core comes online and calls amd_cpu_callback().
> +              */
> +             thermal_apic_init_allowed = 1;                  
> +             /*
> +              * Call thermal_apic_init() on each core.
> +              */
> +             on_each_cpu(&thermal_apic_init, NULL, 1, 0);
> +             smp_thermal_early_throttle_check();
> +     }
> +     return 0;
> +}

The logic in this function looks more complex than it needs to be.

        if (num_k8_northbridges == 0)
                goto out;
        for (nb_num = 0; nb_num < num_k8_northbridges; nb_num++) {
                if ((k8_northbridges[nb_num]->device) == 0x1103)
                        goto out;
        }

maybe?  

> --- linux-2.6.24-rc5.orig/arch/x86/kernel/cpu/mcheck/mce_thermal.c    
> 1969-12-31 16:00:00.000000000 -0800
> +++ linux-2.6.24-rc5/arch/x86/kernel/cpu/mcheck/mce_thermal.c 2007-12-13 
> 12:26:13.057412000 -0800
> @@ -0,0 +1,32 @@
> +/*
> + * Copyright (c) 2007 Google Inc.
> + *
> + * Written by Mike Waychison <[EMAIL PROTECTED]> and Russell Leidich <[EMAIL 
> PROTECTED]>.
> + *
> + * CPU-independent thermal interrupt handler.
> + */
> +
> +#include <linux/init.h>
> +#include <linux/interrupt.h>
> +#include "mce.h"
> +#include <asm/hw_irq.h>
> +#include <asm/idle.h>
> +
> +static void default_smp_thermal_interrupt(void) {}

static void default_smp_thermal_interrupt(void)
{
}

please.


Can this function ever actually be called?

> +cpu_specific_smp_thermal_interrupt_callback 
> cpu_specific_smp_thermal_interrupt =
> +                                             default_smp_thermal_interrupt;
> +
> +/*
> + * Wrapper for the CPU-specific thermal interrupt service routine.  Without
> + * this, we'd have to discern the CPU brand at runtime (because support could
> + * be installed for more than one).
> + */
> +asmlinkage void smp_thermal_interrupt(void)
> +{
> +     ack_APIC_irq();
> +     exit_idle();
> +     irq_enter();
> +     cpu_specific_smp_thermal_interrupt();
> +     irq_exit();
> +}
> diff -uprN linux-2.6.24-rc5.orig/include/asm-x86/mce.h 
> linux-2.6.24-rc5/include/asm-x86/mce.h
> --- linux-2.6.24-rc5.orig/include/asm-x86/mce.h       2007-12-11 
> 14:31:34.263515000 -0800
> +++ linux-2.6.24-rc5/include/asm-x86/mce.h    2007-12-13 14:10:37.923970000 
> -0800
> @@ -113,7 +113,10 @@ static inline void mce_amd_feature_init(
>  #endif
>  
>  void mce_log_therm_throt_event(unsigned int cpu, __u64 status);
> +typedef void (*cpu_specific_smp_thermal_interrupt_callback)(void);

Yes, this is the case where typedefs are OK.  But please put a _t at the end
of the name.

> +extern cpu_specific_smp_thermal_interrupt_callback
> +                                       cpu_specific_smp_thermal_interrupt;
>  extern atomic_t mce_entry;
>  
>  extern void do_machine_check(struct pt_regs *, long);

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Reply via email to