Luca Dariz, le lun. 27 févr. 2023 21:45:01 +0100, a écrit: > diff --git a/i386/i386/ldt.h b/i386/i386/ldt.h > index b15f11a5..4490f99f 100644 > --- a/i386/i386/ldt.h > +++ b/i386/i386/ldt.h > @@ -45,9 +45,14 @@ > #define USER_SCALL 0x07 /* system call gate */ > #ifdef __x86_64__ > /* Call gate needs two entries */ > -#endif > + > +/* The sysret instruction puts some constraints on the user segment indexes > */ > +#define USER_CS 0x1f /* user code segment */ > +#define USER_DS 0x17 /* user data segment */
I'd say we'd rather avoid changing them for the x86_64 && USER32 case? > +#else > #define USER_CS 0x17 /* user code segment */ > #define USER_DS 0x1f /* user data segment */ > +#endif > > #define LDTSZ 4 > > diff --git a/i386/include/mach/i386/syscall_sw.h > b/i386/include/mach/i386/syscall_sw.h > index 86f6ff2f..20ef7c13 100644 > --- a/i386/include/mach/i386/syscall_sw.h > +++ b/i386/include/mach/i386/syscall_sw.h > @@ -29,16 +29,16 @@ > > #include <mach/machine/asm.h> > > -#if BSD_TRAP > -#define kernel_trap(trap_name,trap_number,number_args) \ > -ENTRY(trap_name) \ > - movl $ trap_number,%eax; \ > - SVC; \ > - jb LCL(cerror); \ > - ret; \ > +#if defined(__x86_64__) && ! defined(USER32) > +#define kernel_trap(trap_name,trap_number,number_args) \ > +ENTRY(trap_name) \ > + movq $ trap_number,%rax; \ > + movq %rcx,%r10; \ What is that for? > + syscall; \ > + ret; \ > END(trap_name) > #else > -#define kernel_trap(trap_name,trap_number,number_args) \ > +#define kernel_trap(trap_name,trap_number,number_args) \ > ENTRY(trap_name) \ > movl $ trap_number,%eax; \ > SVC; \ > diff --git a/x86_64/locore.S b/x86_64/locore.S > index 47d9085c..fdf7300b 100644 > --- a/x86_64/locore.S > +++ b/x86_64/locore.S > @@ -1281,6 +1281,142 @@ DATA(cpu_features_ecx) > > END(syscall) > > + > +/* Entry point for 64-bit syscalls. > + * On entry we're still on the user stack, so better not use it. Instead we > + * save the thread state immediately in thread->pcb->iss, then try to invoke > + * the syscall. > + * TODO: > + - for now we assume the return address is canonical, but apparently > there > + can be cases where it's not (see how Linux handles this). Does it > apply > + here? > + - do we need to check for ast on syscalls? Maybe on interrupts is enough > + - check that the case where a task is suspended, and later returns via > + iretq from return_from_trap, works fine in all combinations > + - emulated syscalls - are they used anywhere? Not that I know of. 
> + */ > +ENTRY(syscall64) > + /* RFLAGS[32:63] are reserved, so combine syscall num (32 bit) and > + * eflags in RAX to allow using r11 as temporary register */ > + shlq $32,%r11 > + shlq $32,%rax /* make sure bits 32:63 of %rax are zero */ > + shrq $32,%rax > + or %r11,%rax > + > + /* Save thread state in pcb->iss, as on exception entry. > + * Since this is triggered synchronously from userspace, we can > + * save only the callee-preserved status according to the C ABI, > + * plus RIP and EFLAGS for sysret */ > + CPU_NUMBER(%r11) > + movq CX(EXT(active_threads),%r11),%r11 /* point to current thread */ > + movq TH_PCB(%r11),%r11 /* point to pcb */ > + addq $ PCB_ISS,%r11 /* point to saved state */ > + > + mov %gs,R_GS(%r11) > + mov %fs,R_FS(%r11) > + mov %rsp,R_UESP(%r11) /* callee-preserved register */ > + mov %rcx,R_EIP(%r11) /* syscall places user RIP in RCX */ > + mov %rbx,R_EBX(%r11) /* callee-preserved register */ > + mov %rax,%rbx /* Now we can unpack eflags again */ > + shr $32,%rbx > + mov %rbx,R_EFLAGS(%r11) /* ... and save them in pcb as well */ > + mov %rbp,R_EBP(%r11) /* callee-preserved register */ > + mov %r12,R_R12(%r11) /* callee-preserved register */ > + mov %r13,R_R13(%r11) /* callee-preserved register */ > + mov %r14,R_R14(%r11) /* callee-preserved register */ > + mov %r15,R_R15(%r11) /* callee-preserved register */ > + mov %r11,%rbx /* prepare for error handling */ > + mov %r10,%rcx /* fix arg3 location according to C ABI > */ > + > + /* switch to kernel stack */ > + CPU_NUMBER(%r11) > + movq CX(EXT(kernel_stack),%r11),%rsp > + > + /* Now we have saved state and args 1-6 are in place. > + * Before invoking the syscall we do some bound checking and, > + * if we have more that 6 arguments, we need to copy the > + * remaining ones to the kernel stack, handling page faults when > + * accessing the user stack. 
> + */ > + shlq $32,%rax /* make sure bits 32:63 of %rax are > zero */ > + shrq $32,%rax > + negl %eax /* get system call number */ > + jl _syscall64_range /* out of range if it was positive */ > + cmpl EXT(mach_trap_count),%eax /* check system call table > bounds */ > + jg _syscall64_range /* error if out of range */ > + shll $5,%eax /* manual indexing of mach_trap_t */ > + > + /* check if we need to place some arguments on the stack */ > +_syscall64_args_stack: > + mov EXT(mach_trap_table)(%rax),%r10 /* get number of arguments */ > + subq $6,%r10 /* the first 6 args are already in > place */ > + jl _syscall64_call /* skip argument copy if >6 args */ jle? > + > + movq R_UESP(%rbx),%r11 /* get user stack pointer */ > + addq $8,%r11 /* Skip user return address */ > + > + mov $USER_DS,%r12 /* use user data segment for accesses */ > + mov %r12,%fs > + > + lea (%r11,%r10,8),%r11 /* point past last argument */ > + xorq %r12,%r12 Why clearing it? > +0: subq $8,%r11 > + RECOVER(_syscall64_addr_push) > + mov %fs:(%r11),%r12 > + pushq %r12 /* push argument on stack */ > + dec %r10 > + jnz 0b /* loop for all remaining arguments */ > + > +_syscall64_call: > + call *EXT(mach_trap_table)+8(%rax) /* call procedure */ > + // XXX: check ast on exit? > + > + /* avoid leaking information in callee-clobbered registers */ > + mov $0,%rdi Rather xorq? 
> + mov $0,%rsi > + mov $0,%rdx > + mov $0,%r10 > + mov $0,%r9 > + mov $0,%r8 > + > + /* restore thread state and return to user using sysret */ > + CPU_NUMBER(%r11) > + movq CX(EXT(active_threads),%r11),%r11 /* point to current thread */ > + movq TH_PCB(%r11),%r11 /* point to pcb */ > + addq $ PCB_ISS,%r11 /* point to saved state */ > + > + mov R_GS(%r11),%gs > + mov R_FS(%r11),%fs > + mov R_UESP(%r11),%rsp /* callee-preserved register, > + * switch to user stack */ > + mov R_EIP(%r11),%rcx /* sysret convention */ > + mov R_EBX(%r11),%rbx /* callee-preserved register */ > + mov R_EBP(%r11),%rbp /* callee-preserved register */ > + mov R_R12(%r11),%r12 /* callee-preserved register */ > + mov R_R13(%r11),%r13 /* callee-preserved register */ > + mov R_R14(%r11),%r14 /* callee-preserved register */ > + mov R_R15(%r11),%r15 /* callee-preserved register */ > + mov R_EFLAGS(%r11),%r11 /* sysret convention */ > + > + sysretq /* fast return to user-space, the thread didn't block */ > + > +/* Error handling fragments, from here we jump directly to the trap handler > */ > +_syscall64_addr_push: > + movq %rbx,%rsp /* clean parameters from stack */ > + movq %r11,R_CR2(%rbx) /* set fault address */ > + movq $(T_PAGE_FAULT),R_TRAPNO(%rbx) /* set page-fault trap */ > + movq $(T_PF_USER),R_ERR(%rbx) /* set error code - read user space */ > + jmp _take_trap /* treat as a trap */ > + > +_syscall64_range: > + movq $(T_INVALID_OPCODE),R_TRAPNO(%rbx) > + /* set invalid-operation trap */ > + movq $0,R_ERR(%rbx) /* clear error code */ > + jmp _take_trap /* treat as a trap */ > + > +END(syscall64) > + > /* Discover what kind of cpu we have; return the family number > (3, 4, 5, 6, for 386, 486, 586, 686 respectively). */ > ENTRY(discover_x86_cpu_type) > -- > 2.30.2 > > -- Samuel --- Pour une évaluation indépendante, transparente et rigoureuse ! Je soutiens la Commission d'Évaluation de l'Inria.