* link the kernel at 0x4000000, like the Xen version; higher values cause
  linker errors.
* we can't use full segmentation in long mode, so we need to create a
  temporary mapping during early boot to be able to jump to high
  addresses
* build direct map for first 4G in boothdr (seems required by Linux
* also enable the write-protect page access check in kernel mode

Signed-off-by: Luca Dariz <l...@orpolo.org>
 configure.ac         |   3 +-
 i386/configfrag.ac   |   2 +
 i386/i386/vm_param.h |   4 +-
 i386/intel/pmap.c    |   4 +-
 i386/intel/pmap.h    |   4 +
 x86_64/Makefrag.am   |  18 +++-
 x86_64/boothdr.S     | 214 +++++++++++++++++++++++++++++++++++++++++++
 x86_64/interrupt.S   |   4 +-
 x86_64/ldscript      |  28 ++++--
 x86_64/locore.S      |   4 +-
 10 files changed, 264 insertions(+), 21 deletions(-)
 create mode 100644 x86_64/boothdr.S

diff --git a/configure.ac b/configure.ac
index 019842db..3aaa935c 100644
--- a/configure.ac
+++ b/configure.ac
@@ -56,8 +56,7 @@ case $host_platform:$host_cpu in
-    AC_MSG_WARN([Platform set to Xen by default, this can not boot on non-Xen 
systems, you currently need a 32bit build for that.])
-    [host_platform=xen;;
+    [host_platform=at;;
   at:i?86 | xen:i?86 | at:x86_64 | xen:x86_64)
diff --git a/i386/configfrag.ac b/i386/configfrag.ac
index f697e277..f07a98ca 100644
--- a/i386/configfrag.ac
+++ b/i386/configfrag.ac
@@ -106,6 +106,8 @@ AC_ARG_ENABLE([apic],
+  *:x86_64)
+    enable_pae=${enable_pae-yes};;
     if [ x"$enable_pae" = xyes ]; then]
       AC_MSG_ERROR([can only enable the `PAE' feature on ix86.])
diff --git a/i386/i386/vm_param.h b/i386/i386/vm_param.h
index edd9522c..c00c05b2 100644
--- a/i386/i386/vm_param.h
+++ b/i386/i386/vm_param.h
@@ -36,7 +36,7 @@
  * for better trace support in kdb; the _START symbol has to be offset by the
  * same amount. */
 #ifdef __x86_64__
-#define VM_MIN_KERNEL_ADDRESS  0x40000000UL
 #define VM_MIN_KERNEL_ADDRESS  0xC0000000UL
@@ -73,7 +73,7 @@
 /* This is the kernel address range in linear addresses.  */
 #ifdef __x86_64__
-#define LINEAR_MAX_KERNEL_ADDRESS      (0x00007fffffffffffUL)
+#define LINEAR_MAX_KERNEL_ADDRESS      (0xffffffffffffffffUL)
 /* On x86, the kernel virtual address space is actually located
    at high linear addresses. */
diff --git a/i386/intel/pmap.c b/i386/intel/pmap.c
index 3bf00659..91835b30 100644
--- a/i386/intel/pmap.c
+++ b/i386/intel/pmap.c
@@ -655,7 +655,7 @@ void pmap_bootstrap(void)
                                  pa_to_pte(_kvtophys((void *) kernel_page_dir
                                                      + i * INTEL_PGBYTES))
                                  | INTEL_PTE_VALID
+#if defined(MACH_PV_PAGETABLES) || defined(__x86_64__)
                                  | INTEL_PTE_WRITE
@@ -1297,7 +1297,7 @@ pmap_t pmap_create(vm_size_t size)
                                  pa_to_pte(kvtophys((vm_offset_t) page_dir[i]))
                                  | INTEL_PTE_VALID
+#if defined(MACH_PV_PAGETABLES) || defined(__x86_64__)
                                  | INTEL_PTE_WRITE
diff --git a/i386/intel/pmap.h b/i386/intel/pmap.h
index f24b3a71..d9222e95 100644
--- a/i386/intel/pmap.h
+++ b/i386/intel/pmap.h
@@ -156,7 +156,11 @@ typedef phys_addr_t pt_entry_t;
 #endif /* MACH_PV_PAGETABLES */
 #define INTEL_PTE_WIRED                0x00000200
 #ifdef PAE
+#ifdef __x86_64__
+#define INTEL_PTE_PFN          0xfffffffffffff000ULL
+#else /* __x86_64__ */
 #define INTEL_PTE_PFN          0x00007ffffffff000ULL
+#endif/* __x86_64__ */
 #define INTEL_PTE_PFN          0xfffff000
diff --git a/x86_64/Makefrag.am b/x86_64/Makefrag.am
index 40b50bc9..5da734de 100644
--- a/x86_64/Makefrag.am
+++ b/x86_64/Makefrag.am
@@ -207,11 +207,27 @@ nodist_libkernel_a_SOURCES += \
+# This should probably be 0xffffffff80000000 for mcmodel=kernel, but let's try
+# to stay in the first 8G first, otherwise we have to fix the pmap module to
+# actually use the l4 page level
+# but for now try with < 4G, otherwise we have linker errors
 gnumach_LINKFLAGS += \
        --defsym _START_MAP=$(_START_MAP) \
-       --defsym _START=_START_MAP+0x40000000 \
+       --defsym _START=_START_MAP \
+       --defsym KERNEL_MAP_BASE=$(KERNEL_MAP_BASE) \
        -T '$(srcdir)'/x86_64/ldscript
+       -Ii386
diff --git a/x86_64/boothdr.S b/x86_64/boothdr.S
new file mode 100644
index 00000000..3375c6c9
--- /dev/null
+++ b/x86_64/boothdr.S
@@ -0,0 +1,214 @@
+#include <mach/machine/asm.h>
+#include <i386/i386asm.h>
+       /*
+        * This section will be put first into .boot.  See also x86_64/ldscript.
+        */
+       .section .boot.text,"ax"
+        .globl boot_start
+       /* We should never be entered this way.  */
+        .code32
+       jmp     boot_entry
+       /* MultiBoot header - see multiboot.h.  */
+#define        MULTIBOOT_MAGIC         0x1BADB002
+#ifdef __ELF__
+#define MULTIBOOT_FLAGS                0x00000003
+#else  /* __ELF__ */
+#define MULTIBOOT_FLAGS                0x00010003
+#endif /* __ELF__ */
+       P2ALIGN(2)
+       .long   MULTIBOOT_MAGIC
+       .long   MULTIBOOT_FLAGS
+       /*
+       * The next item here is the checksum.
+       * XX this works OK until we need at least the 30th bit.
+       */
+#ifndef __ELF__        /* a.out kludge */
+       .long   boot_hdr        /* header_addr */
+       .long   _start          /* load_addr */
+       .long   _edata          /* load_end_addr */
+       .long   _end            /* bss_end_addr */
+       .long   boot_entry      /* entry */
+#endif /* __ELF__ */
+        /*
+         * Prepare minimal page mapping to jump to 64 bit and to C code.
+         * The first 4GB is identity mapped, and the first 2GB are re-mapped
+         * to high addresses at KERNEL_MAP_BASE
+         */
+       movl    $p3table,%eax
+       or      $0b11,%eax
+       movl    %eax,(p4table)
+        /*
+         * Fill 4 entries to cover the whole 32-bit 4GB address space, just to
+         * be sure. Part of it might be remapped later if the kernel is mapped
+         * below 4G.
+         */
+       movl    $p2table,%eax
+       or      $0b11,%eax
+        movl    %eax,(p3table)
+        movl    $p2table1,%eax
+        or      $0b11,%eax
+        movl    %eax,(p3table + 8)
+        movl    $p2table2,%eax
+        or      $0b11,%eax
+        movl    %eax,(p3table + 16)
+        movl    $p2table3,%eax
+        or      $0b11,%eax
+        movl    %eax,(p3table + 24)
+        /* point each page table level two entry to a page */
+        mov     $0,%ecx
+        mov     $0x200000,%eax   // 2MiB page, should be always available
+        mul     %ecx
+        or      $0b10000011,%eax
+        mov     %eax,p2table(,%ecx,8)
+        inc     %ecx
+        cmp     $2048,%ecx
+        jne     .map_p2_table
+        /* KERNEL_MAP_BASE must be aligned to 2GB */
+#if KERNEL_MAP_BASE >= (1U << 39)
+        movl    $p3ktable,%eax
+        or      $0b11,%eax
+        movl    %eax,(p4table + (8 * ((KERNEL_MAP_BASE >> 39) & 0x1FF)))  // 
entry for 0b111111111 mask
+        movl    $p2ktable1,%eax
+        or      $0b11,%eax
+        movl    %eax,(p3ktable + (8 * ((KERNEL_MAP_BASE >> 30) & 0x1FF) ))  // 
entry for 0b111111110 mask
+        movl    $p2ktable2,%eax
+        or      $0b11,%eax
+        movl    %eax,(p3ktable + (8 * (((KERNEL_MAP_BASE >> 30) & 0x1FF) + 1) 
))  // entry for 0b111111111 mask
+        movl    $p2ktable1,%eax
+        or      $0b11,%eax
+        movl    %eax,(p3table + (8 * ((KERNEL_MAP_BASE >> 30) & 0x1FF) ))
+        movl    $p2ktable2,%eax
+        or      $0b11,%eax
+        movl    %eax,(p3table + (8 * (((KERNEL_MAP_BASE >> 30) & 0x1FF) + 1) ))
+        mov     $0,%ecx
+        mov     $0x200000,%eax   // 2MiB page
+        mul     %ecx
+        or      $0b10000011,%eax
+        mov     %eax,p2ktable1(,%ecx,8)
+        inc     %ecx
+        cmp     $1024,%ecx
+        jne     .map_p2k_table
+        /*
+         * Jump to 64 bit code, we have to
+         * - enable PAE
+         * - enable long mode
+         * - enable paging by loading the tables filled above in $cr3
+         * - jump to a 64-bit code segment
+        */
+        mov     %cr4,%eax
+        or      $(1 << 5),%eax  // PAE bit
+        mov     %eax,%cr4
+        mov     $0xC0000080,%ecx  // select EFER register
+        rdmsr
+        or      $(1 << 8),%eax  // long mode enable bit
+        wrmsr
+        mov     $p4table,%eax
+        mov     %eax,%cr3
+        mov     %cr0,%eax
+        or      $(1 << 31),%eax  // Paging bit
+        or      $(1 << 16),%eax  // Write-protect enabled also in kernel mode
+        mov     %eax,%cr0
+        lgdt    gdt64pointer
+        movw   $0,%ax
+       movw    %ax,%ds
+       movw    %ax,%es
+       movw    %ax,%fs
+       movw    %ax,%gs
+        movw   $16,%ax
+       movw    %ax,%ds
+       movw    %ax,%es
+       movw    %ax,%ss
+        ljmp   $8,$boot_entry64
+//        .section .text.start
+        .code64
+        /* why do we need this? it seems overwritten by linker */
+       .globl  _start
+       /* Switch to our own interrupt stack.  */
+       movq    $(_intstack+INTSTACK_SIZE),%rax
+       andq    $(~15),%rax
+       movq    %rax,%rsp
+       /* Reset EFLAGS to a known state.  */
+       pushq   $0
+       popf
+        // save multiboot info
+       movq    %rbx,%r8
+       /* Fix ifunc entries */
+       movq    $__rela_iplt_start,%rsi
+       movq    $__rela_iplt_end,%rdi
+       cmpq    %rdi,%rsi
+       jae     iplt_done
+       movq    (%rsi),%rbx     /* r_offset */
+       movb    4(%rsi),%al     /* info */
+       cmpb    $42,%al         /* IRELATIVE */
+       jnz     iplt_next
+       call    *(%ebx)         /* call ifunc */
+       movq    %rax,(%rbx)     /* fixed address */
+       addq    $8,%rsi
+       jmp     iplt_cont
+       movq    %r8,%rdi
+       /* Jump into C code.  */
+       call    EXT(c_boot_entry)
+        /* not reached */
+        nop
+        .section .boot.data
+       .comm   _intstack,INTSTACK_SIZE
+        .code32
+        .section .boot.data
+        .align 4096
+        .quad   0
+        .quad   (1<<44) | (1<<47) | (1<<41) | (1<<43) | (1<<53)
+        .quad   (1<<44) | (1<<47) | (1<<41)
+        .skip   (4096 - (gdt64end - gdt64))
+        //        .word   gdt64pointer - gdt64 - 1
+        .word   gdt64end - gdt64 - 1
+        .quad   gdt64
+        .section .boot.data
+        .align 4096
+p4table:        .space 4096
+p3table:        .space 4096
+p2table:        .space 4096
+p2table1:       .space 4096
+p2table2:       .space 4096
+p2table3:       .space 4096
+p3ktable:       .space 4096
+p2ktable1:       .space 4096
+p2ktable2:       .space 4096
diff --git a/x86_64/interrupt.S b/x86_64/interrupt.S
index fccf6e28..eab643a5 100644
--- a/x86_64/interrupt.S
+++ b/x86_64/interrupt.S
@@ -41,12 +41,12 @@ ENTRY(interrupt)
        movl    8(%esp),%edx            /* set irq number as 3rd arg */
        movl    %edx,%ebx               /* copy irq number */
        shll    $2,%ebx                 /* irq * 4 */
-       movl    EXT(iunit)(%ebx),%edi   /* get device unit number as 1st arg */
+       movl    EXT(iunit)(%rbx),%edi   /* get device unit number as 1st arg */
        movl    %eax, %esi              /* previous ipl as 2nd arg */
        movq    16(%esp), %rcx          /* return address as 4th arg */
        movq    24(%esp), %r8           /* address of interrupted registers as 
5th arg */
        shll    $1,%ebx                 /* irq * 8 */
-       call    *EXT(ivect)(%ebx)       /* call interrupt handler */
+       call    *EXT(ivect)(%rbx)       /* call interrupt handler */
        popq    %rdi                    /* restore previous ipl */
        call    splx_cli                /* restore previous ipl */
diff --git a/x86_64/ldscript b/x86_64/ldscript
index 375e8104..de99795e 100644
--- a/x86_64/ldscript
+++ b/x86_64/ldscript
@@ -2,7 +2,7 @@
 OUTPUT_FORMAT("elf64-x86-64", "elf64-x86-64",
@@ -11,22 +11,30 @@ SECTIONS
    * be first in there.  See also `i386/i386at/boothdr.S' and
    * `gnumach_LINKFLAGS' in `i386/Makefrag.am'.
-  . = _START;
-  .text           :
+  . = _START_MAP;
+  .boot           :
+  {
+    *(.boot.text)
+    *(.boot.data)
+  } =0x90909090
+  _start = .;
+  .text           : AT(((ADDR(.text)) - KERNEL_MAP_BASE))
-    *(.text.start)
+    *(.text*)
     *(.text .stub .text.* .gnu.linkonce.t.*)
     *(.text.unlikely .text.*_unlikely)
     KEEP (*(.text.*personality*))
     /* .gnu.warning sections are handled specially by elf32.em.  */
   } =0x90909090
-  .init           :
+  .init           : AT(((ADDR(.init)) - KERNEL_MAP_BASE))
     KEEP (*(.init))
   } =0x90909090
-  .fini           :
+  .fini           : AT(((ADDR(.fini)) - KERNEL_MAP_BASE))
     KEEP (*(.fini))
   } =0x90909090
@@ -69,7 +77,7 @@ SECTIONS
       PROVIDE_HIDDEN (__rela_iplt_end = .);
   .plt            : { *(.plt) *(.iplt) }
-  .rodata         : { *(.rodata .rodata.* .gnu.linkonce.r.*) }
+  .rodata         :  AT(((ADDR(.rodata)) - KERNEL_MAP_BASE)) { *(.rodata 
.rodata.* .gnu.linkonce.r.*) }
   .rodata1        : { *(.rodata1) }
   .eh_frame_hdr : { *(.eh_frame_hdr) }
   .eh_frame       : ONLY_IF_RO { KEEP (*(.eh_frame)) }
@@ -139,7 +147,7 @@ SECTIONS
   .got            : { *(.got) *(.igot) }
   . = DATA_SEGMENT_RELRO_END (24, .);
   .got.plt        : { *(.got.plt)  *(.igot.plt) }
-  .data           :
+  .data           : AT(((ADDR(.data)) - KERNEL_MAP_BASE))
     *(.data .data.* .gnu.linkonce.d.*)
@@ -147,7 +155,7 @@ SECTIONS
   .data1          : { *(.data1) }
   _edata = .; PROVIDE (edata = .);
   __bss_start = .;
-  .bss            :
+  .bss            : AT(((ADDR(.bss)) - KERNEL_MAP_BASE))
    *(.bss .bss.* .gnu.linkonce.b.*)
diff --git a/x86_64/locore.S b/x86_64/locore.S
index 612fc493..a7266dab 100644
--- a/x86_64/locore.S
+++ b/x86_64/locore.S
@@ -1136,7 +1136,7 @@ syscall_native:
        shll    $5,%eax                 /* manual indexing of mach_trap_t */
        xorq    %r10,%r10
-       movl    EXT(mach_trap_table)(%eax),%r10d
+       mov     EXT(mach_trap_table)(%rax),%r10
                                        /* get number of arguments */
        andq    %r10,%r10
        jz      mach_call_call          /* skip argument copy if none */
@@ -1184,7 +1184,7 @@ mach_call_call:
 #endif /* DEBUG */
-       call    *EXT(mach_trap_table)+8(%eax)
+       call    *EXT(mach_trap_table)+8(%rax)
                                        /* call procedure */
        movq    %rsp,%rcx               /* get kernel stack */
        or      $(KERNEL_STACK_SIZE-1),%rcx

Reply via email to