---
 i386/Makefrag.am        |   2 +
 i386/i386/gdt.c         |  17 ++++
 i386/i386/gdt.h         |  11 ++-
 i386/i386/locore.S      | 204 ++++++++++++++++++++++++++++++++++++++++++++++++
 i386/i386/pcb.c         |  24 +++---
 i386/i386/syscall.c     | 134 ++++++++++++++++++++++++++++++++
 i386/i386/syscall.h     |  17 ++++
 i386/i386/tss.h         |   1 +
 i386/i386at/conf.c      |   8 ++
 i386/i386at/model_dep.c |   2 +
 10 files changed, 407 insertions(+), 13 deletions(-)
 create mode 100644 i386/i386/syscall.c
 create mode 100644 i386/i386/syscall.h

diff --git a/i386/Makefrag.am b/i386/Makefrag.am
index 4dd6a9f..f59ac29 100644
--- a/i386/Makefrag.am
+++ b/i386/Makefrag.am
@@ -147,6 +147,8 @@ libkernel_a_SOURCES += \
 	i386/i386/setjmp.h \
 	i386/i386/spl.S \
 	i386/i386/spl.h \
+	i386/i386/syscall.c \
+	i386/i386/syscall.h \
 	i386/i386/task.h \
 	i386/i386/thread.h \
 	i386/i386/time_stamp.h \
diff --git a/i386/i386/gdt.c b/i386/i386/gdt.c
index c895eb3..0f9d0e3 100644
--- a/i386/i386/gdt.c
+++ b/i386/i386/gdt.c
@@ -57,6 +57,23 @@ gdt_init(void)
 			    LINEAR_MIN_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS,
 			    LINEAR_MAX_KERNEL_ADDRESS - (LINEAR_MIN_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS) - 1,
 			    ACC_PL_K|ACC_DATA_W, SZ_32);
+	fill_gdt_descriptor(KERNEL_ENTER_CS,
+			    LINEAR_MIN_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS,
+			    LINEAR_MAX_KERNEL_ADDRESS - (LINEAR_MIN_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS) - 1,
+			    ACC_PL_K|ACC_CODE_R, SZ_32);
+	fill_gdt_descriptor(KERNEL_ENTER_DS,
+			    LINEAR_MIN_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS,
+			    LINEAR_MAX_KERNEL_ADDRESS - (LINEAR_MIN_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS) - 1,
+			    ACC_PL_K|ACC_DATA_W, SZ_32);
+	fill_gdt_descriptor(USER_EXIT_CS,
+			    VM_MIN_ADDRESS,
+			    VM_MAX_ADDRESS-VM_MIN_ADDRESS-4096,
+			    /* XXX LINEAR_... */
+			    ACC_PL_U|ACC_CODE_R, SZ_32);
+	fill_gdt_descriptor(USER_EXIT_DS,
+			    VM_MIN_ADDRESS,
+			    VM_MAX_ADDRESS-VM_MIN_ADDRESS-4096,
+			    ACC_PL_U|ACC_DATA_W, SZ_32);
 #ifndef	MACH_PV_DESCRIPTORS
 	fill_gdt_descriptor(LINEAR_DS,
 			    0,
diff --git a/i386/i386/gdt.h b/i386/i386/gdt.h
index d865640..37ca6f5 100644
--- a/i386/i386/gdt.h
+++ b/i386/i386/gdt.h
@@ -55,7 +55,16 @@
 #define	USER_GDT	0x48		/* user-defined GDT entries */
 #define	USER_GDT_SLOTS	2
 
-#define	GDTSZ		(USER_GDT/8 + USER_GDT_SLOTS)
+/*
+ * Segments used by SYSENTER/SYSEXIT.  The CPU derives the last three
+ * selectors from the first, so the four entries must stay consecutive.
+ */
+#define	KERNEL_ENTER_CS	(0x58 | SEL_PL_K)	/* kernel code */
+#define	KERNEL_ENTER_DS	(0x60 | SEL_PL_K)	/* kernel data */
+#define	USER_EXIT_CS	(0x68 | SEL_PL_U)	/* user code */
+#define	USER_EXIT_DS	(0x70 | SEL_PL_U)	/* user data */
+
+#define	GDTSZ		(USER_EXIT_DS/8 + 1)
 
 extern struct real_descriptor gdt[GDTSZ];
 
diff --git a/i386/i386/locore.S b/i386/i386/locore.S
index cfda86f..aa13c6b 100644
--- a/i386/i386/locore.S
+++ b/i386/i386/locore.S
@@ -592,6 +592,7 @@ ENTRY(thread_syscall_return)
 	or	$(KERNEL_STACK_SIZE-1),%ecx
 	movl	-3-IKS_SIZE(%ecx),%esp	/* switch back to PCB stack */
 	movl	%eax,R_EAX(%esp)	/* save return value */
+	/* XXX make this work with sysenter */
 	jmp	_return_from_trap
 
 ENTRY(call_continuation)
@@ -978,6 +979,18 @@ ttd_from_iret_i:			/* on interrupt stack */
 
 #endif /* MACH_TTD */
 
+/* User stub for calling the kernel using the trap gate. */
+	.globl	user_trapgate_stub_start
+user_trapgate_stub_start:
+	popl	%ecx			/* Pop return address into %ecx. */
+	popl	%eax			/* Pop syscall number into %eax. */
+	pushl	%ecx			/* Push back return address. */
+	lcall	$7, $0
+	subl	$4, %esp		/* magic */
+	ret
+	.globl	user_trapgate_stub_end
+user_trapgate_stub_end:
+
 /*
  * System call enters through a call gate.  Flags are not saved -
  * we must shuffle stack to look like trap save area.
@@ -1171,6 +1184,197 @@ syscall_addr:
 	jmp	_take_trap		/* treat as a trap */
 
+/*
+ * User stub for calling the kernel using the sysenter instruction.
+ *
+ * Loads the syscall number and the first four arguments from the
+ * caller's stack into registers, and passes the return address in
+ * %edi and the stack pointer in %ebp, as expected by sysenter_entry.
+ */
+	.globl	user_sysenter_stub_start
+user_sysenter_stub_start:
+	push	%ebp
+	mov	%esp, %ebp
+	pushf
+	push	%ebx
+	push	%esi
+	push	%edi			/* xxxmore callee-saved registers? */
+	mov	8(%ebp), %eax		/* Move syscall number into %eax. */
+	mov	12(%ebp), %ebx		/* Move first argument into %ebx. */
+	mov	16(%ebp), %ecx		/* Move second argument into %ecx. */
+	mov	20(%ebp), %edx		/* Move third argument into %edx. */
+	mov	24(%ebp), %esi		/* Move fourth argument into %esi. */
+	call	get_ip			/* compute location of sysexit */
+get_ip:	pop	%edi			/* load current ip */
+	add	$8, %edi		/* userspace return address: the
+					   instruction following sysenter */
+	movl	%esp, %ebp		/* userspace stack pointer */
+	sysenter
+	pop	%edi
+	pop	%esi
+	pop	%ebx
+	popf
+	pop	%ebp
+	ret
+	.globl	user_sysenter_stub_end
+user_sysenter_stub_end:
+
+/*
+ * SYSENTER entry point.
+ *
+ * Calling convention:
+ *	%eax - syscall number
+ *	%ebx - syscall argument 1
+ *	%ecx - syscall argument 2
+ *	%edx - syscall argument 3
+ *	%esi - syscall argument 4
+ *	%edi - userspace return address
+ *	%ebp - userspace stack pointer
+ *
+ * Kernel Stack layout:
+ *	kernel stack base ->	EAX
+ *				ECX
+ *				EDX
+ *				EBX
+ *				ESP
+ *				EBP
+ *				ESI
+ *				EDI
+ *
+ * The system call's return value is handed back to user space in %eax.
+ * An out-of-range system call number or a fault while fetching stack
+ * arguments makes the call return an error code instead.
+ *
+ * XXX: make this look like a trap save area to make thread_syscall_return work
+ * NOTE(review): the save area is on the single sysenter stack in ktss;
+ * it looks like a blocking system call could have it overwritten by
+ * another thread entering through sysenter - confirm.
+ */
+/* Offsets from %ebp */
+#define SE_EAX	(4 * 7)
+#define SE_ECX	(4 * 6)
+#define SE_EDX	(4 * 5)
+#define SE_EBX	(4 * 4)
+#define SE_ESP	(4 * 3)
+#define SE_EBP	(4 * 2)
+#define SE_ESI	(4 * 1)
+#define SE_EDI	(4 * 0)
+#define SE_STACK_POINTER	SE_EBP
+#define SE_RETURN_ADDRESS	SE_EDI
+
+#define SE_USER_SKIP	20		/* skip past the scratchpad */
+
+/* Error codes returned in %eax; must match <mach/kern_return.h>. */
+#define SE_KERN_INVALID_ADDRESS		1
+#define SE_KERN_INVALID_ARGUMENT	4
+
+ENTRY(sysenter_entry)
+	pusha				/* save all registers */
+	mov	%esp, %ebp		/* to access the sysenter stack */
+	cld				/* clear direction flag */
+
+	pushl	%ds			/* save the segment registers */
+	pushl	%es
+	pushl	%fs
+	pushl	%gs
+
+	mov	%ss,%cx			/* switch to kernel data segment */
+	mov	%cx,%ds
+	mov	%cx,%es
+	mov	%cx,%fs
+	mov	%cx,%gs
+
+	CPU_NUMBER(%edx)
+	movl	CX(EXT(kernel_stack),%edx),%ebx
+					/* get current kernel stack */
+	xchgl	%ebx, %esp		/* switch stacks */
+	pushl	%ebx			/* save sysenter sp */
+	movl	%esp,%ebx		/* save kernel sp for argument
+					   unwinding */
+
+	negl	%eax			/* get system call number */
+	jl	sysenter_mach_call_range /* out of range if it was positive */
+	cmpl	EXT(mach_trap_count),%eax /* check system call table bounds */
+	jge	sysenter_mach_call_range /* error if past the last entry */
+
+	shll	$4,%eax			/* manual indexing */
+	movl	EXT(mach_trap_table)(%eax),%ecx
+					/* get number of arguments */
+
+	cmp	$4, %ecx
+	ja	se_args_5plus
+	je	se_args_4
+	cmp	$2, %ecx
+	ja	se_args_3
+	je	se_args_2
+	cmp	$1, %ecx
+	je	se_args_1
+	jmp	se_args_0
+
+se_args_5plus:
+
+	sub	$4, %ecx		/* skip the four first arguments */
+	movl	SE_STACK_POINTER(%ebp), %esi
+					/* get user stack pointer */
+	lea	(4	/* skip user return address */\
+		+4	/* point past last argument */\
+		+16	/* skip register arguments */\
+		+SE_USER_SKIP)(%esi,%ecx,4),%esi
+					/* and skip past the userspace
+					   local storage */
+
+	movl	$USER_DS,%edx		/* use user data segment for accesses */
+	mov	%dx,%fs
+
+0:	subl	$4,%esi
+	RECOVER(sysenter_mach_call_addr_push)
+	pushl	%fs:(%esi)		/* push argument on stack */
+	loop	0b			/* loop for all arguments */
+
+se_args_4:
+	push	SE_ESI(%ebp)		/* push fourth argument */
+se_args_3:
+	push	SE_EDX(%ebp)		/* push third argument */
+se_args_2:
+	push	SE_ECX(%ebp)		/* push second argument */
+se_args_1:
+	push	SE_EBX(%ebp)		/* push first argument */
+se_args_0:
+	sti				/* xxx: sti/cli where ? */
+	call	*EXT(mach_trap_table)+4(%eax)
+					/* call procedure */
+	cli				/* xxx: sti/cli where ? */
+	movl	%ebx, %esp		/* clean parameters from stack */
+return_from_sysenter:
+	/* xxx: process ast */
+	popl	%esp			/* restore sysenter sp */
+	popl	%gs			/* restore segment registers */
+	popl	%fs
+	popl	%es
+	popl	%ds
+
+	movl	SE_RETURN_ADDRESS(%ebp), %edx
+	movl	SE_STACK_POINTER(%ebp), %ecx
+	sti				/* xxx: sti/cli where ? */
+	sysexit
+
+/*
+ * Fault while fetching a stack argument from user space: unwind the
+ * kernel stack and fail the call.  Looping here would hang the CPU,
+ * as sysenter disables interrupts and sti has not been executed yet.
+ */
+sysenter_mach_call_addr_push:
+	movl	%ebx,%esp		/* clean parameters from stack */
+	/* xxx signal page-fault */
+	movl	$SE_KERN_INVALID_ADDRESS,%eax
+	jmp	return_from_sysenter
+
+/* System call number out of range: fail the call. */
+sysenter_mach_call_range:
+	movl	$SE_KERN_INVALID_ARGUMENT,%eax
+	jmp	return_from_sysenter
+#undef SE_STACK_POINTER
+#undef SE_RETURN_ADDRESS
+
 
 	.data
 DATA(cpu_features)
 	.long	0
diff --git a/i386/i386/pcb.c b/i386/i386/pcb.c
index e8040c8..2da3804 100644
--- a/i386/i386/pcb.c
+++ b/i386/i386/pcb.c
@@ -391,12 +391,12 @@ void pcb_init(thread_t thread)
 	 *	Guarantee that the bootstrapped thread will be in user
 	 *	mode.
 	 */
-	pcb->iss.cs = USER_CS;
-	pcb->iss.ss = USER_DS;
-	pcb->iss.ds = USER_DS;
-	pcb->iss.es = USER_DS;
-	pcb->iss.fs = USER_DS;
-	pcb->iss.gs = USER_DS;
+	pcb->iss.cs = USER_EXIT_CS;
+	pcb->iss.ss = USER_EXIT_DS;
+	pcb->iss.ds = USER_EXIT_DS;
+	pcb->iss.es = USER_EXIT_DS;
+	pcb->iss.fs = USER_EXIT_DS;
+	pcb->iss.gs = USER_EXIT_DS;
 	pcb->iss.efl = EFL_USER_SET;
 
 	thread->pcb = pcb;
@@ -524,12 +524,12 @@ kern_return_t thread_setstatus(
 		     * 386 mode.  Set segment registers for flat
 		     * 32-bit address space.
 		     */
-		    saved_state->cs = USER_CS;
-		    saved_state->ss = USER_DS;
-		    saved_state->ds = USER_DS;
-		    saved_state->es = USER_DS;
-		    saved_state->fs = USER_DS;
-		    saved_state->gs = USER_DS;
+		    saved_state->cs = USER_EXIT_CS;
+		    saved_state->ss = USER_EXIT_DS;
+		    saved_state->ds = USER_EXIT_DS;
+		    saved_state->es = USER_EXIT_DS;
+		    saved_state->fs = USER_EXIT_DS;
+		    saved_state->gs = USER_EXIT_DS;
 		}
 		else {
 		    /*
diff --git a/i386/i386/syscall.c b/i386/i386/syscall.c
new file mode 100644
index 0000000..e9b17d0
--- /dev/null
+++ b/i386/i386/syscall.c
@@ -0,0 +1,134 @@
+/*
+ * System call stub page.
+ *
+ * Exports, through the "syscall" device, a page holding the user-space
+ * stub for entering the kernel (SYSENTER or trap gate, depending on the
+ * CPU), and programs the SYSENTER MSRs.
+ */
+
+#include <mach/vm_param.h>
+#include <mach/vm_prot.h>
+#include <vm/pmap.h>
+#include <vm/vm_kern.h>
+#include <string.h>
+#include <kern/debug.h>
+
+#include <machine/tss.h>
+#include <i386/i386/ktss.h>
+#include <i386/i386/gdt.h>
+#include <i386/i386/locore.h>
+
+#include "syscall.h"
+
+#include <kern/printf.h> // xxx
+
+/* Kernel address of the wired page holding the user stub. */
+static vm_offset_t msyscall = 0;
+
+/* Start and end of the user stubs, defined in locore.S. */
+void user_trapgate_stub_start(void);
+void user_trapgate_stub_end(void);
+
+void user_sysenter_stub_start(void);
+void user_sysenter_stub_end(void);
+
+/*
+ * Allocate the stub page, copy the stub matching the CPU's
+ * capabilities into it and set up SYSENTER on this CPU.
+ */
+void
+syscall_init(void)
+{
+	kern_return_t kr;
+	vm_offset_t user_stub_start;
+	vm_offset_t user_stub_end;
+	size_t user_stub_size;
+
+	kr = kmem_alloc_wired(kernel_map, &msyscall, PAGE_SIZE);
+	if (kr != KERN_SUCCESS)
+		panic("syscall_init");
+
+	memset((void *) msyscall, 0, PAGE_SIZE);
+
+	if (CPU_HAS_FEATURE (CPU_FEATURE_SEP)) {
+		printf ("syscall: using SYSENTER/SYSEXIT\n");
+		user_stub_start = (vm_offset_t) user_sysenter_stub_start;
+		user_stub_end = (vm_offset_t) user_sysenter_stub_end;
+	} else {
+		printf ("syscall: using trap gate\n");
+		user_stub_start = (vm_offset_t) user_trapgate_stub_start;
+		user_stub_end = (vm_offset_t) user_trapgate_stub_end;
+	}
+
+	/* Only one page is allocated: the stub has to fit in it. */
+	user_stub_size = (size_t) (user_stub_end - user_stub_start);
+	if (user_stub_size > PAGE_SIZE)
+		panic("syscall_init: user stub too large");
+
+	memcpy((void *) msyscall, (void *) user_stub_start, user_stub_size);
+
+	syscall_init_cpu();
+}
+
+/* Write VAL to the model-specific register MSR. */
+static void
+wrmsr(unsigned int msr, unsigned long long val)
+{
+	__asm__ __volatile__("wrmsr"
+			     : /* no Outputs */
+			     : "c" (msr), "A" (val));
+}
+
+#define MSR_IA32_SYSENTER_CS	0x00000174
+#define MSR_IA32_SYSENTER_ESP	0x00000175
+#define MSR_IA32_SYSENTER_EIP	0x00000176
+
+extern void sysenter_entry(void);
+
+/*
+ * Point the SYSENTER MSRs at the kernel code segment, the sysenter
+ * stack in ktss and sysenter_entry.  Does nothing on CPUs without SEP.
+ */
+void
+syscall_init_cpu(void)
+{
+	if (! CPU_HAS_FEATURE (CPU_FEATURE_SEP))
+		return;
+
+	//struct task_tss *tss = curr_ktss (cpu_number ());
+	struct task_tss *tss = &ktss;
+
+	wrmsr(MSR_IA32_SYSENTER_CS, KERNEL_ENTER_CS);
+	/* The stack grows down: start at the end of the array. */
+	wrmsr(MSR_IA32_SYSENTER_ESP,
+	      (unsigned long) tss->sysenter_stack + sizeof tss->sysenter_stack);
+	wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long) sysenter_entry);
+}
+
+/* Device open routine: nothing to set up. */
+int
+syscall_open(dev_t dev, int flag, io_req_t ior)
+{
+	return 0;
+}
+
+/* Device close routine: nothing to tear down. */
+void
+syscall_close(dev_t dev, int flag)
+{
+	return;
+}
+
+/*
+ * Device mmap routine: return the page number of the address that
+ * pmap_extract reports for the stub page, or -1 if a writable mapping
+ * is requested.  OFF is ignored.
+ */
+int
+syscall_mmap(dev_t dev, vm_offset_t off, vm_prot_t prot)
+{
+	if (prot & VM_PROT_WRITE)
+		return (-1);
+
+	return (i386_btop(pmap_extract(pmap_kernel(),
+				       (vm_offset_t) msyscall)));
+}
diff --git a/i386/i386/syscall.h b/i386/i386/syscall.h
new file mode 100644
index 0000000..de9670c
--- /dev/null
+++ b/i386/i386/syscall.h
@@ -0,0 +1,17 @@
+/*
+ * Declarations for the system call stub page and its "syscall" device.
+ *
+ * XXX: relies on the includer to have declared dev_t, io_req_t,
+ * vm_offset_t and vm_prot_t.
+ */
+
+#ifndef _I386_SYSCALL_H_
+#define _I386_SYSCALL_H_
+
+void syscall_init(void);
+void syscall_init_cpu(void);
+int syscall_open(dev_t dev, int flag, io_req_t ior);
+void syscall_close(dev_t dev, int flag);
+int syscall_mmap(dev_t dev, vm_offset_t off, vm_prot_t prot);
+
+#endif /* _I386_SYSCALL_H_ */
diff --git a/i386/i386/tss.h b/i386/i386/tss.h
index ff25f21..8c939c7 100644
--- a/i386/i386/tss.h
+++ b/i386/i386/tss.h
@@ -76,6 +76,7 @@ struct task_tss
   struct i386_tss tss;
   unsigned char iopb[IOPB_BYTES];
   unsigned char barrier;
+  unsigned long sysenter_stack[64]; /* xxx: stack SYSENTER enters the kernel on */
 };
 
 
diff --git a/i386/i386at/conf.c b/i386/i386at/conf.c
index ab4f680..d7f9e6f 100644
--- a/i386/i386at/conf.c
+++ b/i386/i386at/conf.c
@@ -68,6 +68,9 @@
 #define	hypcnname		"hyp"
 #endif /* MACH_HYP */
 
+#include <i386/syscall.h>
+#define syscall_name "syscall"
+
 /*
  * List of devices - console must be at slot 0
  */
@@ -143,6 +146,11 @@ struct dev_ops	dev_name_list[] =
 	  nodev },
 #endif /* MACH_HYP */
 
+	{ syscall_name,	syscall_open,	syscall_close,	nulldev_read,
+	  nulldev_write,	nulldev_getstat,	nulldev_setstat,
+	  syscall_mmap,
+	  nodev,		nulldev,	nulldev_portdeath,	0,
+	  nodev },
 };
 int	dev_name_count = sizeof(dev_name_list)/sizeof(dev_name_list[0]);
 
diff --git a/i386/i386at/model_dep.c b/i386/i386at/model_dep.c
index bc34c9b..210e54d 100644
--- a/i386/i386at/model_dep.c
+++ b/i386/i386at/model_dep.c
@@ -63,6 +63,7 @@
 #include <i386/proc_reg.h>
 #include <i386/locore.h>
 #include <i386/model_dep.h>
+#include <i386/syscall.h>
 #include <i386at/autoconf.h>
 #include <i386at/idt.h>
 #include <i386at/int_init.h>
@@ -197,6 +198,7 @@ void machine_init(void)
 	 */
 	pmap_unmap_page_zero();
 #endif
+	syscall_init();
 }
 
 /* Conserve power on processor CPU.  */
-- 
2.1.4