* link kernel at 0x40000000 as the xen version; higher values cause linker errors. * we can't use full segmentation in long mode, so we need to create a temporary mapping during early boot to be able to jump to high addresses * build direct map for the first 4G in boothdr (seems required by Linux drivers) * also enable the write page access check in kernel mode
Signed-off-by: Luca Dariz <l...@orpolo.org> --- configure.ac | 3 +- i386/configfrag.ac | 2 + i386/i386/vm_param.h | 4 +- i386/intel/pmap.c | 4 +- i386/intel/pmap.h | 4 + x86_64/Makefrag.am | 18 +++- x86_64/boothdr.S | 214 +++++++++++++++++++++++++++++++++++++++++++ x86_64/interrupt.S | 4 +- x86_64/ldscript | 28 ++++-- x86_64/locore.S | 4 +- 10 files changed, 264 insertions(+), 21 deletions(-) create mode 100644 x86_64/boothdr.S diff --git a/configure.ac b/configure.ac index 019842db..3aaa935c 100644 --- a/configure.ac +++ b/configure.ac @@ -56,8 +56,7 @@ case $host_platform:$host_cpu in default:i?86) host_platform=at;; default:x86_64)] - AC_MSG_WARN([Platform set to Xen by default, this can not boot on non-Xen systems, you currently need a 32bit build for that.]) - [host_platform=xen;; + [host_platform=at;; at:i?86 | xen:i?86 | at:x86_64 | xen:x86_64) :;; *)] diff --git a/i386/configfrag.ac b/i386/configfrag.ac index f697e277..f07a98ca 100644 --- a/i386/configfrag.ac +++ b/i386/configfrag.ac @@ -106,6 +106,8 @@ AC_ARG_ENABLE([apic], enable_pae=${enable_pae-yes};; *:i?86) :;; + *:x86_64) + enable_pae=${enable_pae-yes};; *) if [ x"$enable_pae" = xyes ]; then] AC_MSG_ERROR([can only enable the `PAE' feature on ix86.]) diff --git a/i386/i386/vm_param.h b/i386/i386/vm_param.h index edd9522c..c00c05b2 100644 --- a/i386/i386/vm_param.h +++ b/i386/i386/vm_param.h @@ -36,7 +36,7 @@ * for better trace support in kdb; the _START symbol has to be offset by the * same amount. */ #ifdef __x86_64__ -#define VM_MIN_KERNEL_ADDRESS 0x40000000UL +#define VM_MIN_KERNEL_ADDRESS KERNEL_MAP_BASE #else #define VM_MIN_KERNEL_ADDRESS 0xC0000000UL #endif @@ -73,7 +73,7 @@ /* This is the kernel address range in linear addresses. 
*/ #ifdef __x86_64__ #define LINEAR_MIN_KERNEL_ADDRESS VM_MIN_KERNEL_ADDRESS -#define LINEAR_MAX_KERNEL_ADDRESS (0x00007fffffffffffUL) +#define LINEAR_MAX_KERNEL_ADDRESS (0xffffffffffffffffUL) #else /* On x86, the kernel virtual address space is actually located at high linear addresses. */ diff --git a/i386/intel/pmap.c b/i386/intel/pmap.c index 3bf00659..91835b30 100644 --- a/i386/intel/pmap.c +++ b/i386/intel/pmap.c @@ -655,7 +655,7 @@ void pmap_bootstrap(void) pa_to_pte(_kvtophys((void *) kernel_page_dir + i * INTEL_PGBYTES)) | INTEL_PTE_VALID -#ifdef MACH_PV_PAGETABLES +#if defined(MACH_PV_PAGETABLES) || defined(__x86_64__) | INTEL_PTE_WRITE #endif ); @@ -1297,7 +1297,7 @@ pmap_t pmap_create(vm_size_t size) WRITE_PTE(&p->pdpbase[i], pa_to_pte(kvtophys((vm_offset_t) page_dir[i])) | INTEL_PTE_VALID -#ifdef MACH_PV_PAGETABLES +#if defined(MACH_PV_PAGETABLES) || defined(__x86_64__) | INTEL_PTE_WRITE #endif ); diff --git a/i386/intel/pmap.h b/i386/intel/pmap.h index f24b3a71..d9222e95 100644 --- a/i386/intel/pmap.h +++ b/i386/intel/pmap.h @@ -156,7 +156,11 @@ typedef phys_addr_t pt_entry_t; #endif /* MACH_PV_PAGETABLES */ #define INTEL_PTE_WIRED 0x00000200 #ifdef PAE +#ifdef __x86_64__ +#define INTEL_PTE_PFN 0xfffffffffffff000ULL +#else /* __x86_64__ */ #define INTEL_PTE_PFN 0x00007ffffffff000ULL +#endif/* __x86_64__ */ #else #define INTEL_PTE_PFN 0xfffff000 #endif diff --git a/x86_64/Makefrag.am b/x86_64/Makefrag.am index 40b50bc9..5da734de 100644 --- a/x86_64/Makefrag.am +++ b/x86_64/Makefrag.am @@ -207,11 +207,27 @@ nodist_libkernel_a_SOURCES += \ EXTRA_DIST += \ x86_64/ldscript + if PLATFORM_at +# This should probably be 0xffffffff80000000 for mcmodel=kernel, but let's try +# to stay in the first 8G first, otherwise we have to fix the pmap module to +# actually use the l4 page level +#KERNEL_MAP_BASE=0x100000000 +# but for nor try with < 4G, otherwise we have linker errors +KERNEL_MAP_BASE=0x40000000 gnumach_LINKFLAGS += \ --defsym _START_MAP=$(_START_MAP) \ - 
--defsym _START=_START_MAP+0x40000000 \ + --defsym _START=_START_MAP \ + --defsym KERNEL_MAP_BASE=$(KERNEL_MAP_BASE) \ -T '$(srcdir)'/x86_64/ldscript + +AM_CFLAGS += -D_START_MAP=$(_START_MAP) \ + -DKERNEL_MAP_BASE=$(KERNEL_MAP_BASE) +AM_CCASFLAGS += -D_START_MAP=$(_START_MAP) \ + -DKERNEL_MAP_BASE=$(KERNEL_MAP_BASE) + +AM_CCASFLAGS += \ + -Ii386 endif AM_CPPFLAGS += \ diff --git a/x86_64/boothdr.S b/x86_64/boothdr.S new file mode 100644 index 00000000..3375c6c9 --- /dev/null +++ b/x86_64/boothdr.S @@ -0,0 +1,214 @@ + +#include <mach/machine/asm.h> + +#include <i386/i386asm.h> + /* + * This section will be put first into .boot. See also x86_64/ldscript. + */ + .section .boot.text,"ax" + .globl boot_start + + /* We should never be entered this way. */ + .code32 +boot_start: + jmp boot_entry + + /* MultiBoot header - see multiboot.h. */ +#define MULTIBOOT_MAGIC 0x1BADB002 +#ifdef __ELF__ +#define MULTIBOOT_FLAGS 0x00000003 +#else /* __ELF__ */ +#define MULTIBOOT_FLAGS 0x00010003 +#endif /* __ELF__ */ + P2ALIGN(2) +boot_hdr: + .long MULTIBOOT_MAGIC + .long MULTIBOOT_FLAGS + /* + * The next item here is the checksum. + * XX this works OK until we need at least the 30th bit. + */ + .long - (MULTIBOOT_MAGIC+MULTIBOOT_FLAGS) +#ifndef __ELF__ /* a.out kludge */ + .long boot_hdr /* header_addr */ + .long _start /* load_addr */ + .long _edata /* load_end_addr */ + .long _end /* bss_end_addr */ + .long boot_entry /* entry */ +#endif /* __ELF__ */ + +boot_entry: + /* + * Prepare minimal page mapping to jump to 64 bit and to C code. + * The first 4GB is identity mapped, and the first 2GB are re-mapped + * to high addresses at KERNEL_MAP_BASE + */ + + movl $p3table,%eax + or $0b11,%eax + movl %eax,(p4table) + /* + * Fill 4 entries to cover the whole 32-bit 4GB address space, just to + * be sure. Part of it might be remapped later if the kernel is mapped + * below 4G. 
+ */ + movl $p2table,%eax + or $0b11,%eax + movl %eax,(p3table) + movl $p2table1,%eax + or $0b11,%eax + movl %eax,(p3table + 8) + movl $p2table2,%eax + or $0b11,%eax + movl %eax,(p3table + 16) + movl $p2table3,%eax + or $0b11,%eax + movl %eax,(p3table + 24) + /* point each page table level two entry to a page */ + mov $0,%ecx +.map_p2_table: + mov $0x200000,%eax // 2MiB page, should be always available + mul %ecx + or $0b10000011,%eax + mov %eax,p2table(,%ecx,8) + inc %ecx + cmp $2048,%ecx + jne .map_p2_table + + /* KERNEL_MAP_BASE must me aligned to 2GB */ +.kernel_map: +#if KERNEL_MAP_BASE >= (1U << 39) + movl $p3ktable,%eax + or $0b11,%eax + movl %eax,(p4table + (8 * ((KERNEL_MAP_BASE >> 39) & 0x1FF))) // entry for 0b111111111 mask + movl $p2ktable1,%eax + or $0b11,%eax + movl %eax,(p3ktable + (8 * ((KERNEL_MAP_BASE >> 30) & 0x1FF) )) // entry for 0b111111110 mask + movl $p2ktable2,%eax + or $0b11,%eax + movl %eax,(p3ktable + (8 * (((KERNEL_MAP_BASE >> 30) & 0x1FF) + 1) )) // entry for 0b111111111 mask +#else + movl $p2ktable1,%eax + or $0b11,%eax + movl %eax,(p3table + (8 * ((KERNEL_MAP_BASE >> 30) & 0x1FF) )) + movl $p2ktable2,%eax + or $0b11,%eax + movl %eax,(p3table + (8 * (((KERNEL_MAP_BASE >> 30) & 0x1FF) + 1) )) +#endif + + mov $0,%ecx +.map_p2k_table: + mov $0x200000,%eax // 2MiB page + mul %ecx + or $0b10000011,%eax + mov %eax,p2ktable1(,%ecx,8) + inc %ecx + cmp $1024,%ecx + jne .map_p2k_table + +switch64: + /* + * Jump to 64 bit code, we have to + * - enable PAE + * - enable long mode + * - enable paging by loading the tables filled above in $cr3 + * - jump to a 64-bit code segment + */ + mov %cr4,%eax + or $(1 << 5),%eax // PAE bit + mov %eax,%cr4 + mov $0xC0000080,%ecx // select EFER register + rdmsr + or $(1 << 8),%eax // long mode enable bit + wrmsr + mov $p4table,%eax + mov %eax,%cr3 + mov %cr0,%eax + or $(1 << 31),%eax // Paging bit + or $(1 << 16),%eax // Write-protect enabled also in kernel mode + mov %eax,%cr0 + + lgdt gdt64pointer + movw 
$0,%ax + movw %ax,%ds + movw %ax,%es + movw %ax,%fs + movw %ax,%gs + movw $16,%ax + movw %ax,%ds + movw %ax,%es + movw %ax,%ss + ljmp $8,$boot_entry64 + +// .section .text.start + .code64 + + /* why do we need this? it seems overwritten by linker */ + .globl _start +_start: + +boot_entry64: + /* Switch to our own interrupt stack. */ + movq $(_intstack+INTSTACK_SIZE),%rax + andq $(~15),%rax + movq %rax,%rsp + + /* Reset EFLAGS to a known state. */ + pushq $0 + popf + // save multiboot info + movq %rbx,%r8 + + /* Fix ifunc entries */ + movq $__rela_iplt_start,%rsi + movq $__rela_iplt_end,%rdi +iplt_cont: + cmpq %rdi,%rsi + jae iplt_done + movq (%rsi),%rbx /* r_offset */ + movb 4(%rsi),%al /* info */ + cmpb $42,%al /* IRELATIVE */ + jnz iplt_next + call *(%ebx) /* call ifunc */ + movq %rax,(%rbx) /* fixed address */ +iplt_next: + addq $8,%rsi + jmp iplt_cont +iplt_done: + + movq %r8,%rdi + /* Jump into C code. */ + call EXT(c_boot_entry) + /* not reached */ + nop + + .section .boot.data + .comm _intstack,INTSTACK_SIZE + + .code32 + .section .boot.data + .align 4096 +gdt64: + .quad 0 +gdt64code: + .quad (1<<44) | (1<<47) | (1<<41) | (1<<43) | (1<<53) +gdt64data: + .quad (1<<44) | (1<<47) | (1<<41) +gdt64end: + .skip (4096 - (gdt64end - gdt64)) +gdt64pointer: + // .word gdt64pointer - gdt64 - 1 + .word gdt64end - gdt64 - 1 + .quad gdt64 + + .section .boot.data + .align 4096 +p4table: .space 4096 +p3table: .space 4096 +p2table: .space 4096 +p2table1: .space 4096 +p2table2: .space 4096 +p2table3: .space 4096 +p3ktable: .space 4096 +p2ktable1: .space 4096 +p2ktable2: .space 4096 diff --git a/x86_64/interrupt.S b/x86_64/interrupt.S index fccf6e28..eab643a5 100644 --- a/x86_64/interrupt.S +++ b/x86_64/interrupt.S @@ -41,12 +41,12 @@ ENTRY(interrupt) movl 8(%esp),%edx /* set irq number as 3rd arg */ movl %edx,%ebx /* copy irq number */ shll $2,%ebx /* irq * 4 */ - movl EXT(iunit)(%ebx),%edi /* get device unit number as 1st arg */ + movl EXT(iunit)(%rbx),%edi /* get device 
unit number as 1st arg */ movl %eax, %esi /* previous ipl as 2nd arg */ movq 16(%esp), %rcx /* return address as 4th arg */ movq 24(%esp), %r8 /* address of interrupted registers as 5th arg */ shll $1,%ebx /* irq * 8 */ - call *EXT(ivect)(%ebx) /* call interrupt handler */ + call *EXT(ivect)(%rbx) /* call interrupt handler */ popq %rdi /* restore previous ipl */ call splx_cli /* restore previous ipl */ diff --git a/x86_64/ldscript b/x86_64/ldscript index 375e8104..de99795e 100644 --- a/x86_64/ldscript +++ b/x86_64/ldscript @@ -2,7 +2,7 @@ OUTPUT_FORMAT("elf64-x86-64", "elf64-x86-64", "elf64-x86-64") OUTPUT_ARCH(i386:x86-64) -ENTRY(_start) +ENTRY(boot_start) SECTIONS { /* @@ -11,22 +11,30 @@ SECTIONS * be first in there. See also `i386/i386at/boothdr.S' and * `gnumach_LINKFLAGS' in `i386/Makefrag.am'. */ - . = _START; - .text : - AT (_START_MAP) + + . = _START_MAP; + .boot : + { + *(.boot.text) + *(.boot.data) + } =0x90909090 + + . += KERNEL_MAP_BASE; + _start = .; + .text : AT(((ADDR(.text)) - KERNEL_MAP_BASE)) { - *(.text.start) + *(.text*) *(.text .stub .text.* .gnu.linkonce.t.*) *(.text.unlikely .text.*_unlikely) KEEP (*(.text.*personality*)) /* .gnu.warning sections are handled specially by elf32.em. */ *(.gnu.warning) } =0x90909090 - .init : + .init : AT(((ADDR(.init)) - KERNEL_MAP_BASE)) { KEEP (*(.init)) } =0x90909090 - .fini : + .fini : AT(((ADDR(.fini)) - KERNEL_MAP_BASE)) { KEEP (*(.fini)) } =0x90909090 @@ -69,7 +77,7 @@ SECTIONS PROVIDE_HIDDEN (__rela_iplt_end = .); } .plt : { *(.plt) *(.iplt) } - .rodata : { *(.rodata .rodata.* .gnu.linkonce.r.*) } + .rodata : AT(((ADDR(.rodata)) - KERNEL_MAP_BASE)) { *(.rodata .rodata.* .gnu.linkonce.r.*) } .rodata1 : { *(.rodata1) } .eh_frame_hdr : { *(.eh_frame_hdr) } .eh_frame : ONLY_IF_RO { KEEP (*(.eh_frame)) } @@ -139,7 +147,7 @@ SECTIONS .got : { *(.got) *(.igot) } . 
= DATA_SEGMENT_RELRO_END (24, .); .got.plt : { *(.got.plt) *(.igot.plt) } - .data : + .data : AT(((ADDR(.data)) - KERNEL_MAP_BASE)) { *(.data .data.* .gnu.linkonce.d.*) SORT(CONSTRUCTORS) @@ -147,7 +155,7 @@ SECTIONS .data1 : { *(.data1) } _edata = .; PROVIDE (edata = .); __bss_start = .; - .bss : + .bss : AT(((ADDR(.bss)) - KERNEL_MAP_BASE)) { *(.dynbss) *(.bss .bss.* .gnu.linkonce.b.*) diff --git a/x86_64/locore.S b/x86_64/locore.S index 612fc493..a7266dab 100644 --- a/x86_64/locore.S +++ b/x86_64/locore.S @@ -1136,7 +1136,7 @@ syscall_native: #endif shll $5,%eax /* manual indexing of mach_trap_t */ xorq %r10,%r10 - movl EXT(mach_trap_table)(%eax),%r10d + mov EXT(mach_trap_table)(%rax),%r10 /* get number of arguments */ andq %r10,%r10 jz mach_call_call /* skip argument copy if none */ @@ -1184,7 +1184,7 @@ mach_call_call: 0: #endif /* DEBUG */ - call *EXT(mach_trap_table)+8(%eax) + call *EXT(mach_trap_table)+8(%rax) /* call procedure */ movq %rsp,%rcx /* get kernel stack */ or $(KERNEL_STACK_SIZE-1),%rcx -- 2.30.2