Il 27/02/23 23:02, Samuel Thibault ha scritto:
Luca Dariz, le lun. 27 févr. 2023 21:45:01 +0100, a ecrit:
diff --git a/i386/i386/ldt.h b/i386/i386/ldt.h
index b15f11a5..4490f99f 100644
--- a/i386/i386/ldt.h
+++ b/i386/i386/ldt.h
@@ -45,9 +45,14 @@
#define USER_SCALL 0x07 /* system call gate */
#ifdef __x86_64__
/* Call gate needs two entries */
-#endif
+
+/* The sysret instruction puts some constraints on the user segment indexes */
+#define USER_CS 0x1f /* user code segment */
+#define USER_DS 0x17 /* user data segment */
I'd say we'd rather avoid changing them for the x86_64 && USER32 case?
Right, I forgot to add the ! USER32 condition here.
+#else
#define USER_CS 0x17 /* user code segment */
#define USER_DS 0x1f /* user data segment */
+#endif
#define LDTSZ 4
diff --git a/i386/include/mach/i386/syscall_sw.h b/i386/include/mach/i386/syscall_sw.h
index 86f6ff2f..20ef7c13 100644
--- a/i386/include/mach/i386/syscall_sw.h
+++ b/i386/include/mach/i386/syscall_sw.h
@@ -29,16 +29,16 @@
#include <mach/machine/asm.h>
-#if BSD_TRAP
-#define kernel_trap(trap_name,trap_number,number_args) \
-ENTRY(trap_name) \
- movl $ trap_number,%eax; \
- SVC; \
- jb LCL(cerror); \
- ret; \
+#if defined(__x86_64__) && ! defined(USER32)
+#define kernel_trap(trap_name,trap_number,number_args) \
+ENTRY(trap_name) \
+ movq $ trap_number,%rax; \
+ movq %rcx,%r10; \
What is that for?
The syscall instruction automatically stores the return RIP in RCX, but RCX
is also where the C ABI passes the 4th function argument, so the wrapper has
to move that argument to another register first. R10 is the only
caller-saved register still free for this. In the syscall64 code below, the
value is moved back to RCX after the thread state has been saved.
+ syscall; \
+ ret; \
END(trap_name)
#else
-#define kernel_trap(trap_name,trap_number,number_args) \
+#define kernel_trap(trap_name,trap_number,number_args) \
ENTRY(trap_name) \
movl $ trap_number,%eax; \
SVC; \
diff --git a/x86_64/locore.S b/x86_64/locore.S
index 47d9085c..fdf7300b 100644
--- a/x86_64/locore.S
+++ b/x86_64/locore.S
@@ -1281,6 +1281,142 @@ DATA(cpu_features_ecx)
END(syscall)
+
+/* Entry point for 64-bit syscalls.
+ * On entry we're still on the user stack, so better not use it. Instead we
+ * save the thread state immediately in thread->pcb->iss, then try to invoke
+ * the syscall.
+ * TODO:
+ - for now we assume the return address is canonical, but apparently there
+ can be cases where it's not (see how Linux handles this). Does it apply
+ here?
+ - do we need to check for ast on syscalls? Maybe on interrupts is enough
+ - check that the case where a task is suspended, and later returns via
+ iretq from return_from_trap, works fine in all combinations
+ - emulated syscalls - are they used anywhere?
Not that I know of.
Ok, I'll update the comment about emulated syscalls.
+ */
+ENTRY(syscall64)
+ /* RFLAGS[32:63] are reserved, so combine syscall num (32 bit) and
+ * eflags in RAX to allow using r11 as temporary register */
+ shlq $32,%r11
+ shlq $32,%rax /* make sure bits 32:63 of %rax are zero */
+ shrq $32,%rax
+ or %r11,%rax
+
+ /* Save thread state in pcb->iss, as on exception entry.
+ * Since this is triggered synchronously from userspace, we can
+ * save only the callee-preserved status according to the C ABI,
+ * plus RIP and EFLAGS for sysret */
+ CPU_NUMBER(%r11)
+ movq CX(EXT(active_threads),%r11),%r11 /* point to current thread */
+ movq TH_PCB(%r11),%r11 /* point to pcb */
+ addq $ PCB_ISS,%r11 /* point to saved state */
+
+ mov %gs,R_GS(%r11)
+ mov %fs,R_FS(%r11)
+ mov %rsp,R_UESP(%r11) /* callee-preserved register */
+ mov %rcx,R_EIP(%r11) /* syscall places user RIP in RCX */
+ mov %rbx,R_EBX(%r11) /* callee-preserved register */
+ mov %rax,%rbx /* Now we can unpack eflags again */
+ shr $32,%rbx
+ mov %rbx,R_EFLAGS(%r11) /* ... and save them in pcb as well */
+ mov %rbp,R_EBP(%r11) /* callee-preserved register */
+ mov %r12,R_R12(%r11) /* callee-preserved register */
+ mov %r13,R_R13(%r11) /* callee-preserved register */
+ mov %r14,R_R14(%r11) /* callee-preserved register */
+ mov %r15,R_R15(%r11) /* callee-preserved register */
+ mov %r11,%rbx /* prepare for error handling */
+ mov %r10,%rcx /* fix arg3 location according to C ABI */
+
+ /* switch to kernel stack */
+ CPU_NUMBER(%r11)
+ movq CX(EXT(kernel_stack),%r11),%rsp
+
+ /* Now we have saved state and args 1-6 are in place.
+ * Before invoking the syscall we do some bound checking and,
+ * if we have more than 6 arguments, we need to copy the
+ * remaining ones to the kernel stack, handling page faults when
+ * accessing the user stack.
+ */
+ shlq $32,%rax /* make sure bits 32:63 of %rax are zero */
+ shrq $32,%rax
+ negl %eax /* get system call number */
+ jl _syscall64_range /* out of range if it was positive */
+ cmpl EXT(mach_trap_count),%eax /* check system call table bounds */
+ jg _syscall64_range /* error if out of range */
+ shll $5,%eax /* manual indexing of mach_trap_t */
+
+ /* check if we need to place some arguments on the stack */
+_syscall64_args_stack:
+ mov EXT(mach_trap_table)(%rax),%r10 /* get number of arguments */
+ subq $6,%r10 /* the first 6 args are already in place */
+ jl _syscall64_call /* skip argument copy unless >6 args */
jle?
Right, I didn't test a 6-args syscall.
+
+ movq R_UESP(%rbx),%r11 /* get user stack pointer */
+ addq $8,%r11 /* Skip user return address */
+
+ mov $USER_DS,%r12 /* use user data segment for accesses */
+ mov %r12,%fs
+
+ lea (%r11,%r10,8),%r11 /* point past last argument */
+ xorq %r12,%r12
Why clearing it?
Actually there is no need to do it; it's overwritten later.
+0: subq $8,%r11
+ RECOVER(_syscall64_addr_push)
+ mov %fs:(%r11),%r12
+ pushq %r12 /* push argument on stack */
+ dec %r10
+ jnz 0b /* loop for all remaining arguments */
+
+_syscall64_call:
+ call *EXT(mach_trap_table)+8(%rax) /* call procedure */
+ // XXX: check ast on exit?
+
+ /* avoid leaking information in callee-clobbered registers */
+ mov $0,%rdi
Rather xorq?
Will do.
Thanks!
Luca