Hi Simon, On Wed, Apr 29, 2015 at 10:25 AM, Simon Glass <s...@chromium.org> wrote: > Most modern x86 CPUs include more than one CPU core. The OS normally requires > that these 'Application Processors' (APs) be brought up by the boot loader. > Add the required support to U-Boot to init additional APs. > > Signed-off-by: Simon Glass <s...@chromium.org> > --- > > Changes in v2: None > > arch/x86/Kconfig | 25 ++ > arch/x86/cpu/Makefile | 2 + > arch/x86/cpu/ivybridge/model_206ax.c | 4 +- > arch/x86/cpu/mp_init.c | 507 > +++++++++++++++++++++++++++++++++++ > arch/x86/cpu/sipi.S | 215 +++++++++++++++ > arch/x86/include/asm/mp.h | 94 +++++++ > arch/x86/include/asm/sipi.h | 79 ++++++ > arch/x86/include/asm/smm.h | 14 + > 8 files changed, 938 insertions(+), 2 deletions(-) > create mode 100644 arch/x86/cpu/mp_init.c > create mode 100644 arch/x86/cpu/sipi.S > create mode 100644 arch/x86/include/asm/mp.h > create mode 100644 arch/x86/include/asm/sipi.h > create mode 100644 arch/x86/include/asm/smm.h > > diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig > index f38e9ba..f89ee5c 100644 > --- a/arch/x86/Kconfig > +++ b/arch/x86/Kconfig > @@ -361,6 +361,31 @@ config FSP_TEMP_RAM_ADDR > Stack top address which is used in FspInit after DRAM is ready and > CAR is disabled. > > +config MAX_CPUS > + int "Maximum number of CPUs permitted" > + default 4 > + help > + When using multi-CPU chips it is possible for U-Boot to start up > + more than one CPU. The stack memory used by all of these CPUs is > + pre-allocated so at present U-Boot wants to know the maximum > + number of CPUs that may be present. Set this to at least as high > + as the number of CPUs in your system (it uses about 4KB of RAM for > + each CPU). > + > +config SMP > + bool "Enable Symmetric Multiprocessing" > + default n > + help > + Enable use of more than one CPU in U-Boot and the Operating System > + when loaded. Each CPU will be started up and information can be > + obtained using the 'cpu' command. If this option is disabled, then > + only one CPU will be enabled regardless of the number of CPUs > + available. > + > +config STACK_SIZE > + hex > + default 0x1000
The name looks too generic. Maybe AP_STACK_SIZE? Please also include a help text. > + > config TSC_CALIBRATION_BYPASS > bool "Bypass Time-Stamp Counter (TSC) calibration" > default n > diff --git a/arch/x86/cpu/Makefile b/arch/x86/cpu/Makefile > index 6ded0a7..9a08ab4 100644 > --- a/arch/x86/cpu/Makefile > +++ b/arch/x86/cpu/Makefile > @@ -19,6 +19,8 @@ obj-$(CONFIG_NORTHBRIDGE_INTEL_IVYBRIDGE) += ivybridge/ > obj-$(CONFIG_INTEL_QUARK) += quark/ > obj-$(CONFIG_INTEL_QUEENSBAY) += queensbay/ > obj-y += lapic.o > +obj-$(CONFIG_SMP) += mp_init.o > obj-y += mtrr.o > obj-$(CONFIG_PCI) += pci.o > +obj-$(CONFIG_SMP) += sipi.o > obj-y += turbo.o > diff --git a/arch/x86/cpu/ivybridge/model_206ax.c > b/arch/x86/cpu/ivybridge/model_206ax.c > index 11dc625..8b08c40 100644 > --- a/arch/x86/cpu/ivybridge/model_206ax.c > +++ b/arch/x86/cpu/ivybridge/model_206ax.c > @@ -435,8 +435,8 @@ static int intel_cores_init(struct x86_cpu_priv *cpu) > > debug("CPU: %u has core %u\n", cpu->apic_id, > new_cpu->apic_id); > > -#if CONFIG_SMP && CONFIG_MAX_CPUS > 1 > - /* Start the new cpu */ > +#if 0 && CONFIG_SMP && CONFIG_MAX_CPUS > 1 > + /* TODO(s...@chromium.org): Start the new cpu */ > if (!start_cpu(new_cpu)) { > /* Record the error in cpu? */ > printk(BIOS_ERR, "CPU %u would not start!\n", > diff --git a/arch/x86/cpu/mp_init.c b/arch/x86/cpu/mp_init.c > new file mode 100644 > index 0000000..c1fd4e0 > --- /dev/null > +++ b/arch/x86/cpu/mp_init.c > @@ -0,0 +1,507 @@ > +/* > + * Copyright (C) 2015 Google, Inc > + * > + * SPDX-License-Identifier: GPL-2.0+ > + * > + * Based on code from the coreboot file of the same name > + */ > + > +#include <common.h> > +#include <cpu.h> > +#include <dm.h> > +#include <errno.h> > +#include <malloc.h> > +#include <asm/atomic.h> > +#include <asm/cpu.h> > +#include <asm/interrupt.h> > +#include <asm/lapic.h> > +#include <asm/mp.h> > +#include <asm/mtrr.h> > +#include <asm/sipi.h> > +#include <asm/smm.h> > +#include <dm/device-internal.h> > +#include <dm/uclass-internal.h> > +#include <linux/linkage.h> > + > +/* This also needs to match the sipi.S assembly code for saved MSR encoding > */ > +struct saved_msr { > + uint32_t index; > + uint32_t lo; > + uint32_t hi; > +} __packed; > + > + > +/* > + * The SIPI vector is loaded at the SMM_DEFAULT_BASE. The reason is at the > + * memory range is already reserved so the OS cannot use it. That region is > + * free to use for AP bringup before SMM is initialised. > + */ > +static const uint32_t sipi_vector_location = SMM_DEFAULT_BASE; > +static const int sipi_vector_location_size = SMM_DEFAULT_SIZE; Since they are static and const, we can just use the macro directly without declare a variable. > + > +struct mp_flight_plan { > + int num_records; > + struct mp_flight_record *records; > +}; > + > +static struct mp_flight_plan mp_info; > + > +struct cpu_map { > + struct udevice *dev; > + int apic_id; > + int err_code; > +}; > + > +static inline void barrier_wait(atomic_t *b) > +{ > + while (atomic_read(b) == 0) > + asm("pause"); > + mfence(); > +} > + > +static inline void release_barrier(atomic_t *b) > +{ > + mfence(); > + atomic_set(b, 1); > +} > + > +/* Returns 1 if timeout waiting for APs. 0 if target APs found */ > +static int wait_for_aps(atomic_t *val, int target, int total_delay, > + int delay_step) > +{ > + int timeout = 0; > + int delayed = 0; > + > + while (atomic_read(val) != target) { > + udelay(delay_step); > + delayed += delay_step; > + if (delayed >= total_delay) { > + timeout = 1; > + break; > + } > + } > + > + return timeout; > +} > + > +static void ap_do_flight_plan(struct udevice *cpu) > +{ > + int i; > + > + for (i = 0; i < mp_info.num_records; i++) { > + struct mp_flight_record *rec = &mp_info.records[i]; > + > + atomic_inc(&rec->cpus_entered); > + barrier_wait(&rec->barrier); > + > + if (rec->ap_call != NULL) > + rec->ap_call(cpu, rec->ap_arg); > + } > +} > + > +static int find_cpu_by_apid_id(int apic_id, struct udevice **devp) > +{ > + struct udevice *dev; > + > + *devp = NULL; > + for (uclass_find_first_device(UCLASS_CPU, &dev); > + dev; > + uclass_find_next_device(&dev)) { > + struct cpu_platdata *plat = dev_get_parent_platdata(dev); > + > + if (plat->cpu_id == apic_id) { > + *devp = dev; > + return 0; > + } > + } > + > + return -ENOENT; > +} > + > +/* > + * By the time APs call ap_init() caching has been setup, and microcode has > + * been loaded > + */ > +static void asmlinkage ap_init(unsigned int cpu_index) Is asmlinkage a must? I think we can pass cpu_index via eax. > +{ > + struct udevice *dev; > + int apic_id; > + int ret; > + > + /* Ensure the local apic is enabled */ > + enable_lapic(); > + > + apic_id = lapicid(); > + ret = find_cpu_by_apid_id(apic_id, &dev); > + if (ret) { > + debug("Unknown CPU apic_id %x\n", apic_id); > + goto done; > + } > + > + debug("AP: slot %d apic_id %x, dev %s\n", cpu_index, apic_id, > + dev ? dev->name : "(apic_id not found)"); > + > + /* Walk the flight plan */ > + ap_do_flight_plan(dev); > + > + /* Park the AP */ > + debug("parking\n"); > +done: > + stop_this_cpu(); > +} > + > +#define NUM_FIXED_MTRRS 11 Can we move this to the MTRR patch (#11 of this series)? So that NUM_FIXED_RANGES in patch#11 can be defined as #define NUM_FIXED_RANGES (NUM_FIXED_MTRRS * RANGES_PER_FIXED_MTRR) > + > +static const unsigned int fixed_mtrrs[NUM_FIXED_MTRRS] = { > + MTRR_FIX_64K_00000_MSR, MTRR_FIX_16K_80000_MSR, > MTRR_FIX_16K_A0000_MSR, > + MTRR_FIX_4K_C0000_MSR, MTRR_FIX_4K_C8000_MSR, MTRR_FIX_4K_D0000_MSR, > + MTRR_FIX_4K_D8000_MSR, MTRR_FIX_4K_E0000_MSR, MTRR_FIX_4K_E8000_MSR, > + MTRR_FIX_4K_F0000_MSR, MTRR_FIX_4K_F8000_MSR, > +}; > + > +static inline struct saved_msr *save_msr(int index, struct saved_msr *entry) > +{ > + msr_t msr; > + > + msr = msr_read(index); > + entry->index = index; > + entry->lo = msr.lo; > + entry->hi = msr.hi; > + > + /* Return the next entry */ > + entry++; > + return entry; > +} > + > +static int save_bsp_msrs(char *start, int size) > +{ > + int msr_count; > + int num_var_mtrrs; > + struct saved_msr *msr_entry; > + int i; > + msr_t msr; > + > + /* Determine number of MTRRs need to be saved */ > + msr = msr_read(MTRR_CAP_MSR); > + num_var_mtrrs = msr.lo & 0xff; > + > + /* 2 * num_var_mtrrs for base and mask. +1 for IA32_MTRR_DEF_TYPE */ > + msr_count = 2 * num_var_mtrrs + NUM_FIXED_MTRRS + 1; > + > + if ((msr_count * sizeof(struct saved_msr)) > size) { > + printf("Cannot mirror all %d msrs.\n", msr_count); > + return -ENOSPC; > + } > + > + msr_entry = (void *)start; > + for (i = 0; i < NUM_FIXED_MTRRS; i++) > + msr_entry = save_msr(fixed_mtrrs[i], msr_entry); > + > + for (i = 0; i < num_var_mtrrs; i++) { > + msr_entry = save_msr(MTRR_PHYS_BASE_MSR(i), msr_entry); > + msr_entry = save_msr(MTRR_PHYS_MASK_MSR(i), msr_entry); > + } > + > + msr_entry = save_msr(MTRR_DEF_TYPE_MSR, msr_entry); > + > + return msr_count; > +} > + > +static int load_sipi_vector(atomic_t **ap_countp) > +{ > + struct sipi_params_16bit *params16; > + struct sipi_params *params; > + static char msr_save[512]; > + char *stack; > + ulong addr; > + int code_len; > + int size; > + int ret; > + > + /* Copy in the code */ > + code_len = ap_start16_code_end - ap_start16; > + debug("Copying SIPI code to %x: %d bytes\n", SMM_DEFAULT_BASE, > + code_len); > + memcpy((void *)SMM_DEFAULT_BASE, ap_start16, code_len); > + > + addr = SMM_DEFAULT_BASE + (ulong)sipi_params_16bit - > (ulong)ap_start16; > + params16 = (struct sipi_params_16bit *)addr; > + params16->ap_start = (uint32_t)ap_start; My read of this is that AP will jump to the flash address of ap_start (pre-relocation)? Is this intentional? > + params16->gdt = (uint32_t)gd->arch.gdt; > + params16->gdt_limit = X86_GDT_SIZE - 1; > + debug("gdt = %x, gdt_limit = %x\n", params16->gdt, > params16->gdt_limit); > + > + params = (struct sipi_params *)sipi_params; > + debug("SIPI 32-bit params at %p\n", params); > + params->idt_ptr = (uint32_t)x86_get_idt(); > + > + params->stack_size = 4096; > + size = params->stack_size * CONFIG_MAX_CPUS; > + stack = memalign(size, 4096); > + if (!stack) > + return -ENOMEM; > + params->stack_top = (u32)(stack + size); > + > + params->microcode_ptr = 0; > + params->msr_table_ptr = (u32)msr_save; > + ret = save_bsp_msrs(msr_save, sizeof(msr_save)); > + if (ret < 0) > + return ret; > + params->msr_count = ret; > + > + params->c_handler = (uint32_t)&ap_init; Also here flash address of ap_init()? > + > + *ap_countp = ¶ms->ap_count; > + atomic_set(*ap_countp, 0); > + debug("SIPI vector is ready\n"); > + > + return 0; > +} > + > +static int check_cpu_devices(int expected_cpus) > +{ > + int i; > + > + for (i = 0; i < expected_cpus; i++) { > + struct udevice *dev; > + int ret; > + > + ret = uclass_find_device(UCLASS_CPU, i, &dev); > + if (ret) { > + debug("Cannot find CPU %d in device tree\n", i); > + return ret; > + } > + } > + > + return 0; > +} > + > +/* Returns 1 for timeout. 0 on success */ > +static int apic_wait_timeout(int total_delay, int delay_step) > +{ > + int total = 0; > + int timeout = 0; > + > + while (lapic_read(LAPIC_ICR) & LAPIC_ICR_BUSY) { > + udelay(delay_step); > + total += delay_step; > + if (total >= total_delay) { > + timeout = 1; > + break; > + } > + } > + > + return timeout; > +} > + > +static int start_aps(int ap_count, atomic_t *num_aps) > +{ > + int sipi_vector; > + /* Max location is 4KiB below 1MiB */ > + const int max_vector_loc = ((1 << 20) - (1 << 12)) >> 12; > + > + if (ap_count == 0) > + return 0; > + > + /* The vector is sent as a 4k aligned address in one byte */ > + sipi_vector = sipi_vector_location >> 12; > + > + if (sipi_vector > max_vector_loc) { > + printf("SIPI vector too large! 0x%08x\n", > + sipi_vector); > + return -1; > + } Guess this sanity check is not necessary given we hardcode the sipi_vector to 0x30. > + > + debug("Attempting to start %d APs\n", ap_count); > + > + if ((lapic_read(LAPIC_ICR) & LAPIC_ICR_BUSY)) { > + debug("Waiting for ICR not to be busy..."); > + if (apic_wait_timeout(1000 /* 1 ms */, 50)) { Can we move the /* 1ms */ comment out of the codes? If we fix, please fix this globally in this file. > + debug("timed out. Aborting.\n"); > + return -1; > + } else { > + debug("done.\n"); > + } > + } > + > + /* Send INIT IPI to all but self */ > + lapic_write_around(LAPIC_ICR2, SET_LAPIC_DEST_FIELD(0)); > + lapic_write_around(LAPIC_ICR, LAPIC_DEST_ALLBUT | LAPIC_INT_ASSERT | > + LAPIC_DM_INIT); > + debug("Waiting for 10ms after sending INIT.\n"); > + mdelay(10); > + > + /* Send 1st SIPI */ > + if ((lapic_read(LAPIC_ICR) & LAPIC_ICR_BUSY)) { > + debug("Waiting for ICR not to be busy..."); > + if (apic_wait_timeout(1000 /* 1 ms */, 50)) { > + debug("timed out. Aborting.\n"); > + return -1; > + } else { > + debug("done.\n"); > + } > + } > + > + lapic_write_around(LAPIC_ICR2, SET_LAPIC_DEST_FIELD(0)); > + lapic_write_around(LAPIC_ICR, LAPIC_DEST_ALLBUT | LAPIC_INT_ASSERT | > + LAPIC_DM_STARTUP | sipi_vector); > + debug("Waiting for 1st SIPI to complete..."); > + if (apic_wait_timeout(10000 /* 10 ms */, 50 /* us */)) { > + debug("timed out.\n"); > + return -1; > + } else { > + debug("done.\n"); > + } > + > + /* Wait for CPUs to check in up to 200 us */ > + wait_for_aps(num_aps, ap_count, 200 /* us */, 15 /* us */); > + > + /* Send 2nd SIPI */ > + if ((lapic_read(LAPIC_ICR) & LAPIC_ICR_BUSY)) { > + debug("Waiting for ICR not to be busy..."); > + if (apic_wait_timeout(1000 /* 1 ms */, 50)) { > + debug("timed out. Aborting.\n"); > + return -1; > + } else { > + debug("done.\n"); > + } > + } > + > + lapic_write_around(LAPIC_ICR2, SET_LAPIC_DEST_FIELD(0)); > + lapic_write_around(LAPIC_ICR, LAPIC_DEST_ALLBUT | LAPIC_INT_ASSERT | > + LAPIC_DM_STARTUP | sipi_vector); > + debug("Waiting for 2nd SIPI to complete..."); > + if (apic_wait_timeout(10000 /* 10 ms */, 50 /* us */)) { > + debug("timed out.\n"); > + return -1; > + } else { > + debug("done.\n"); > + } > + > + /* Wait for CPUs to check in */ > + if (wait_for_aps(num_aps, ap_count, 10000 /* 10 ms */, 50 /* us */)) { > + debug("Not all APs checked in: %d/%d.\n", > + atomic_read(num_aps), ap_count); > + return -1; > + } > + > + return 0; > +} > + > +static int bsp_do_flight_plan(struct udevice *cpu, struct mp_params > *mp_params) > +{ > + int i; > + int ret = 0; > + const int timeout_us = 100000; > + const int step_us = 100; > + int num_aps = mp_params->num_cpus - 1; > + > + for (i = 0; i < mp_params->num_records; i++) { > + struct mp_flight_record *rec = &mp_params->flight_plan[i]; > + > + /* Wait for APs if the record is not released */ > + if (atomic_read(&rec->barrier) == 0) { > + /* Wait for the APs to check in */ > + if (wait_for_aps(&rec->cpus_entered, num_aps, > + timeout_us, step_us)) { > + debug("MP record %d timeout.\n", i); > + ret = -1; > + } > + } > + > + if (rec->bsp_call != NULL) > + rec->bsp_call(cpu, rec->bsp_arg); > + > + release_barrier(&rec->barrier); > + } > + return ret; > +} > + > +static int init_bsp(struct udevice **devp) > +{ > + char processor_name[CPU_MAX_NAME_LEN]; > + int apic_id; > + int ret; > + > + cpu_get_name(processor_name); > + debug("CPU: %s.\n", processor_name); > + > + enable_lapic(); > + > + apic_id = lapicid(); > + ret = find_cpu_by_apid_id(apic_id, devp); > + if (ret) { > + printf("Cannot find boot CPU, APIC ID %d\n", apic_id); > + return ret; > + } > + > + return 0; > +} > + > +int mp_init(struct mp_params *p) > +{ > + int num_aps; > + atomic_t *ap_count; > + struct udevice *cpu; > + int ret; > + > + /* This will cause the CPUs devices to be bound */ > + struct uclass *uc; > + ret = uclass_get(UCLASS_CPU, &uc); > + if (ret) > + return ret; > + > + ret = init_bsp(&cpu); > + if (ret) { > + debug("Cannot init boot CPU: err=%d\n", ret); > + return ret; > + } > + > + if (p == NULL || p->flight_plan == NULL || p->num_records < 1) { > + printf("Invalid MP parameters\n"); > + return -1; > + } > + > + ret = check_cpu_devices(p->num_cpus); > + if (ret) > + debug("Warning: Device tree does not describe all CPUs. Extra > ones will not be started correctly\n"); > + > + /* Copy needed parameters so that APs have a reference to the plan */ > + mp_info.num_records = p->num_records; > + mp_info.records = p->flight_plan; > + > + /* Load the SIPI vector */ > + ret = load_sipi_vector(&ap_count); > + if (ap_count == NULL) > + return -1; > + > + /* > + * Make sure SIPI data hits RAM so the APs that come up will see > + * the startup code even if the caches are disabled > + */ > + wbinvd(); > + > + /* Start the APs providing number of APs and the cpus_entered field */ > + num_aps = p->num_cpus - 1; > + ret = start_aps(num_aps, ap_count); > + if (ret) { > + mdelay(1000); > + debug("%d/%d eventually checked in?\n", atomic_read(ap_count), > + num_aps); > + return ret; > + } > + > + /* Walk the flight plan for the BSP */ > + ret = bsp_do_flight_plan(cpu, p); > + if (ret) { > + debug("CPU init failed: err=%d\n", ret); > + return ret; > + } > + > + return 0; > +} > + > +int mp_init_cpu(struct udevice *cpu, void *unused) > +{ > + return device_probe(cpu); > +} > diff --git a/arch/x86/cpu/sipi.S b/arch/x86/cpu/sipi.S > new file mode 100644 > index 0000000..09fde21 > --- /dev/null > +++ b/arch/x86/cpu/sipi.S > @@ -0,0 +1,215 @@ > +/* > + * Copyright (c) 2015 Google, Inc > + * > + * SPDX-License-Identifier: GPL-2.0 > + * > + * Taken from coreboot file of the same name Looks that this is from coreboot is sipi_vector.S. Should we keep the same file name as coreboot? > + */ > + > +/* > + * The SIPI vector is responsible for initializing the APs in the sytem. It > + * loads microcode, sets up MSRs, and enables caching before calling into > + * C code > + */ > + > +#include <asm/global_data.h> > +#include <asm/msr-index.h> > +#include <asm/processor.h> > +#include <asm/processor-flags.h> > +#include <asm/smm.h> > + > +#define CODE_SEG (X86_GDT_ENTRY_32BIT_CS * X86_GDT_ENTRY_SIZE) > +#define DATA_SEG (X86_GDT_ENTRY_32BIT_DS * X86_GDT_ENTRY_SIZE) > + > +#define o32 .byte 0x66; data32 directly? > + > +/* > + * First we have the 16-bit section. Every AP process starts here. > + * The simple task is to load U-Boot's Global Descriptor Table (GDT) to allow > + * U-Boot's 32-bit code to become visible, then jump to ap_start32. ap_start > + * > + * Note that this code is copied to RAM below 1MB in mp_init.c, and runs from > + * there, but the 32-bit code (ap_start32 and onwards) is part of U-Boot and ap_start > + * is therefore relocated to the top of RAM with other U-Boot code. This > + * means that for the 16-bit code we must write relocatable code, but for the > + * rest, we can do what we like. > + */ > +.text > +.code16 > +.globl ap_start16 > +ap_start16: > + cli > + xorl %eax, %eax > + movl %eax, %cr3 /* Invalidate TLB */ > + > + /* setup the data segment */ > + movw %cs, %ax > + movw %ax, %ds > + > + /* Use an address relative to the data segment for the GDT */ > + movl $gdtaddr, %ebx > + subl $ap_start16, %ebx > + > + data32 lgdt (%ebx) > + > + movl %cr0, %eax > + andl $0x7ffaffd1, %eax /* PG, AM, WP, NE, TS, EM, MP = 0 */ > + orl $0x60000001, %eax /* CD, NW, PE = 1 */ > + movl %eax, %cr0 Can we use macros for the cr0 bit fileds? Like we used in arch/x86/cpu/start16.S > + movl $ap_start_jmp, %eax > + subl $ap_start16, %eax > + movw %ax, %bp > + > + /* Jump to ap_start32 within U-Boot */ ap_start > +o32 cs ljmp *(%bp) > + > + .align 4 > +.globl sipi_params_16bit > +sipi_params_16bit: > + /* 48-bit far pointer */ > +ap_start_jmp: > + .long 0 /* offset set to ap_start by U-Boot */ > + .word CODE_SEG /* segment */ > + > + .word 0 /* padding */ > +gdtaddr: > + .word 0 /* limit */ > + .long 0 /* table */ > + .word 0 /* unused */ > + > +.globl ap_start16_code_end > +ap_start16_code_end: > + > +/* > + * Set up the special 'fs' segment for global_data. Then jump to ap_continue > + * to set up the AP. > + */ > +.globl ap_start > +ap_start: > + .code32 > + movw $DATA_SEG, %ax > + movw %ax, %ds > + movw %ax, %es > + movw %ax, %ss > + movw %ax, %gs > + > + movw $(X86_GDT_ENTRY_32BIT_FS * X86_GDT_ENTRY_SIZE), %ax > + movw %ax, %fs > + > + /* Load the Interrupt descriptor table */ > + mov idt_ptr, %ebx > + lidt (%ebx) > + > + /* Obtain cpu number */ > + movl ap_count, %eax > +1: > + movl %eax, %ecx > + inc %ecx > + lock cmpxchg %ecx, ap_count > + jnz 1b > + > + /* Setup stacks for each CPU */ > + movl stack_size, %eax > + mul %ecx > + movl stack_top, %edx > + subl %eax, %edx > + mov %edx, %esp > + /* Save cpu number */ > + mov %ecx, %esi > + > + /* Determine if one should check microcode versions */ > + mov microcode_ptr, %edi > + test %edi, %edi > + jz microcode_done /* Bypass if no microde exists */ > + > + /* Get the Microcode version */ > + mov $1, %eax > + cpuid > + mov $MSR_IA32_UCODE_REV, %ecx > + rdmsr > + /* If something already loaded skip loading again */ > + test %edx, %edx > + jnz microcode_done > + > + /* Determine if parallel microcode loading is allowed */ > + cmp $0xffffffff, microcode_lock > + je load_microcode > + > + /* Protect microcode loading */ > +lock_microcode: > + lock bts $0, microcode_lock > + jc lock_microcode > + > +load_microcode: > + /* Load new microcode */ > + mov $MSR_IA32_UCODE_WRITE, %ecx > + xor %edx, %edx > + mov %edi, %eax > + /* The microcode pointer is passed in pointing to the header. Adjust > + * pointer to reflect the payload (header size is 48 bytes) */ Please fix the multi-line comment style. > + add $48, %eax 48->UCODE_HEADER_LEN > + pusha > + wrmsr > + popa > + > + /* Unconditionally unlock microcode loading */ > + cmp $0xffffffff, microcode_lock > + je microcode_done > + > + xor %eax, %eax > + mov %eax, microcode_lock > + > +microcode_done: > + /* > + * Load MSRs. Each entry in the table consists of: > + * 0: index, > + * 4: value[31:0] > + * 8: value[63:32] > + * See struct saved_msr in mp_init.c. > + */ > + mov msr_table_ptr, %edi > + mov msr_count, %ebx > + test %ebx, %ebx > + jz 1f > +load_msr: > + mov (%edi), %ecx > + mov 4(%edi), %eax > + mov 8(%edi), %edx > + wrmsr > + add $12, %edi > + dec %ebx > + jnz load_msr > + > +1: > + /* Enable caching */ > + mov %cr0, %eax > + and $0x9fffffff, %eax /* CD, NW = 0 */ Please use existing macros. > + mov %eax, %cr0 > + > + /* c_handler(cpu_num) */ > + push %esi /* cpu_num */ > + mov c_handler, %eax > + call *%eax > + > + .align 4 > +.globl sipi_params > +sipi_params: > +idt_ptr: > + .long 0 > +stack_top: > + .long 0 > +stack_size: > + .long 0 > +microcode_lock: > + .long 0 > +microcode_ptr: > + .long 0 > +msr_table_ptr: > + .long 0 > +msr_count: > + .long 0 > +c_handler: > + .long 0 > +ap_count: > + .long 0 > diff --git a/arch/x86/include/asm/mp.h b/arch/x86/include/asm/mp.h > new file mode 100644 > index 0000000..5dc0e33 > --- /dev/null > +++ b/arch/x86/include/asm/mp.h > @@ -0,0 +1,94 @@ > +/* > + * Copyright (c) 2015 Google, Inc > + * > + * SPDX-License-Identifier: GPL-2.0 > + * > + * Taken from coreboot file of the same name > + */ > + > +#ifndef _X86_MP_H_ > +#define _X86_MP_H_ > + > +#include <asm/atomic.h> > + > +typedef int (*mp_callback_t)(struct udevice *cpu, void *arg); > + > +/* > + * A mp_flight_record details a sequence of calls for the APs to perform > + * along with the BSP to coordinate sequencing. Each flight record either > + * provides a barrier for each AP before calling the callback or the APs > + * are allowed to perform the callback without waiting. Regardless, each > + * record has the cpus_entered field incremented for each record. When > + * the BSP observes that the cpus_entered matches the number of APs > + * the bsp_call is called with bsp_arg and upon returning releases the > + * barrier allowing the APs to make further progress. > + * > + * Note that ap_call() and bsp_call() can be NULL. In the NULL case the > + * callback will just not be called. > + */ > +struct mp_flight_record { > + atomic_t barrier; > + atomic_t cpus_entered; > + mp_callback_t ap_call; > + void *ap_arg; > + mp_callback_t bsp_call; > + void *bsp_arg; > +} __attribute__((aligned(ARCH_DMA_MINALIGN))); > + > +#define _MP_FLIGHT_RECORD(barrier_, ap_func_, ap_arg_, bsp_func_, bsp_arg_) \ Can we remove the underscore in those variables? > + { \ > + .barrier = ATOMIC_INIT(barrier_), \ > + .cpus_entered = ATOMIC_INIT(0), \ > + .ap_call = ap_func_, \ > + .ap_arg = ap_arg_, \ > + .bsp_call = bsp_func_, \ > + .bsp_arg = bsp_arg_, \ > + } > + > +#define MP_FR_BLOCK_APS(ap_func_, ap_arg_, bsp_func_, bsp_arg_) \ > + _MP_FLIGHT_RECORD(0, ap_func_, ap_arg_, bsp_func_, bsp_arg_) underscore > + > +#define MP_FR_NOBLOCK_APS(ap_func_, ap_arg_, bsp_func_, bsp_arg_) \ > + _MP_FLIGHT_RECORD(1, ap_func_, ap_arg_, bsp_func_, bsp_arg_) underscore > + > +/* > + * The mp_params structure provides the arguments to the mp subsystem > + * for bringing up APs. > + * > + * At present this is overkill for U-Boot, but it may make it easier to add > + * SMM support. > + */ > +struct mp_params { > + int num_cpus; /* Total cpus include BSP */ > + int parallel_microcode_load; > + const void *microcode_pointer; > + /* Flight plan for APs and BSP */ > + struct mp_flight_record *flight_plan; > + int num_records; > +}; > + > +/* > + * mp_init() will set up the SIPI vector and bring up the APs according to > + * mp_params. Each flight record will be executed according to the plan. Note > + * that the MP infrastructure uses SMM default area without saving it. It's > + * up to the chipset or mainboard to either e820 reserve this area or save > this > + * region prior to calling mp_init() and restoring it after mp_init returns. > + * > + * At the time mp_init() is called the MTRR MSRs are mirrored into APs then > + * caching is enabled before running the flight plan. > + * > + * The MP init has the following properties: > + * 1. APs are brought up in parallel. > + * 2. The ordering of coreboot cpu number and APIC ids is not deterministic. coreboot->U-Boot > + * Therefore, one cannot rely on this property or the order of devices in > + * the device tree unless the chipset or mainboard know the APIC ids > + * a priori. > + * > + * mp_init() returns < 0 on error, 0 on success. > + */ > +int mp_init(struct mp_params *params); > + > +/* Probes the CPU device */ > +int mp_init_cpu(struct udevice *cpu, void *unused); > + > +#endif /* _X86_MP_H_ */ > diff --git a/arch/x86/include/asm/sipi.h b/arch/x86/include/asm/sipi.h > new file mode 100644 > index 0000000..bb2f0de > --- /dev/null > +++ b/arch/x86/include/asm/sipi.h > @@ -0,0 +1,79 @@ > +/* > + * Copyright (c) 2015 Gooogle, Inc > + * Written by Simon Glass <s...@chromium.org> > + * > + * SPDX-License-Identifier: GPL-2.0+ > + */ > + > +#ifndef _ASM_SIPI_H > +#define _ASM_SIPI_H > + > +/** > + * struct sipi_params_16bit - 16-bit SIPI entry-point parameters > + * > + * These are set up in the same space as the SIPI 16-bit code so that each AP > + * can access the parameters when it boots. > + * > + * Each of these must be set up for the AP to boot, except @segment which is > + * set in the assembly code. > + * > + * @ap_start: 32-bit SIPI entry point for U-Boot > + * @segment: Code segment for U-Boot > + * @pad: Padding (not used) > + * @gdt_limit: U-Boot GDT limit (X86_GDT_SIZE - 1) > + * @gdt: U-Boot GDT (gd->arch.gdt) > + * @unused: Not used > + */ > +struct __packed sipi_params_16bit { > + u32 ap_start; > + u16 segment; > + u16 pad; > + Please remove this blank line. > + u16 gdt_limit; > + u32 gdt; > + u16 unused; > +}; > + > +/** > + * struct sipi_params - 32-bit SIP entry-point parameters > + * > + * These are used by the AP init code and must be set up before the APs > start. > + * > + * The stack area extends down from @stack_top, with @stack_size allocated > + * for each AP. > + * > + * @idt_ptr: Interrupt descriptor table pointer > + * @stack_top: Top of the AP stack area > + * @stack_size: Size of each AP's stack > + * @microcode_lock: Used to ensure only one AP loads microcode at once We should document: 0xffffffff means parallel loading > + * @microcode_ptr: Pointer to microcode, or 0 if none > + * @msr_table_ptr: Pointer to saved MSRs, a list of struct saved_msr > + * @msr_count: Number of saved MSRs > + * @c_handler: C function to call once early init is complete > + * @ap_count: Shared atomic value to allocate CPU indexes > + */ > +struct sipi_params { > + u32 idt_ptr; > + u32 stack_top; > + u32 stack_size; > + u32 microcode_lock; > + u32 microcode_ptr; > + u32 msr_table_ptr; > + u32 msr_count; > + u32 c_handler; > + atomic_t ap_count; > +}; > + > +/* 16-bit AP entry point */ > +void ap_start16(void); > + > +/* end of 16-bit code/data, marks the region to be copied to SIP vector */ > +void ap_start16_code_end(void); > + > +/* 32-bit AP entry point */ > +void ap_start(void); > + > +extern char sipi_params_16bit[]; > +extern char sipi_params[]; > + > +#endif > diff --git a/arch/x86/include/asm/smm.h b/arch/x86/include/asm/smm.h > new file mode 100644 > index 0000000..79b4a8e > --- /dev/null > +++ b/arch/x86/include/asm/smm.h > @@ -0,0 +1,14 @@ > +/* > + * Copyright (c) 2015 Google, Inc > + * Copyright (C) 2008-2009 coresystems GmbH > + * > + * SPDX-License-Identifier: GPL-2.0 > + */ > + > +#ifndef CPU_X86_SMM_H > +#define CPU_X86_SMM_H > + > +#define SMM_DEFAULT_BASE 0x30000 > +#define SMM_DEFAULT_SIZE 0x10000 Actually they are not related to SMM, but just for AP start-up codes entry. Should we rename them to AP_xxx and move them to sipi.h? > +#endif > -- Regards, Bin _______________________________________________ U-Boot mailing list U-Boot@lists.denx.de http://lists.denx.de/mailman/listinfo/u-boot