Il 27/02/23 23:02, Samuel Thibault ha scritto:
Luca Dariz, le lun. 27 févr. 2023 21:45:01 +0100, a ecrit:
diff --git a/i386/i386/ldt.h b/i386/i386/ldt.h
index b15f11a5..4490f99f 100644
--- a/i386/i386/ldt.h
+++ b/i386/i386/ldt.h
@@ -45,9 +45,14 @@
  #define       USER_SCALL      0x07            /* system call gate */
  #ifdef __x86_64__
  /* Call gate needs two entries */
+/* The sysret instruction puts some constraints on the user segment indexes */
+#define        USER_CS         0x1f            /* user code segment */
+#define        USER_DS         0x17            /* user data segment */

I'd say we'd rather avoid changing them for the x86_64 && USER32 case?

Right, I forgot to add `! defined(USER32)` to the condition here, so that the x86_64 && USER32 case keeps the existing values.

  #define       USER_CS         0x17            /* user code segment */
  #define       USER_DS         0x1f            /* user data segment */
#define LDTSZ 4
diff --git a/i386/include/mach/i386/syscall_sw.h b/i386/include/mach/i386/syscall_sw.h
index 86f6ff2f..20ef7c13 100644
--- a/i386/include/mach/i386/syscall_sw.h
+++ b/i386/include/mach/i386/syscall_sw.h
@@ -29,16 +29,16 @@
#include <mach/machine/asm.h>
-#if BSD_TRAP
-#define kernel_trap(trap_name,trap_number,number_args) \
-ENTRY(trap_name) \
-       movl    $ trap_number,%eax; \
-       SVC; \
-       jb LCL(cerror); \
-       ret; \
+#if defined(__x86_64__) && ! defined(USER32)
+#define kernel_trap(trap_name,trap_number,number_args)  \
+ENTRY(trap_name)                                       \
+       movq    $ trap_number,%rax;                     \

+       movq    %rcx,%r10;                              \

What is that for?

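The syscall instruction automatically stores the return RIP in RCX, but RCX is also the register that carries the 4th argument of a function call, so we need another register to hold that argument across the syscall. R10 is the only non-callee-preserved register left for this: the others carry arguments or the syscall number, and R11 is overwritten with RFLAGS by the syscall instruction. In the syscall64 code below, the value is moved back from R10 to RCX after saving the thread state.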

+       syscall;                                        \
+       ret;                                            \
-#define kernel_trap(trap_name,trap_number,number_args) \
+#define kernel_trap(trap_name,trap_number,number_args)  \
  ENTRY(trap_name) \
        movl    $ trap_number,%eax; \
        SVC; \
diff --git a/x86_64/locore.S b/x86_64/locore.S
index 47d9085c..fdf7300b 100644
--- a/x86_64/locore.S
+++ b/x86_64/locore.S
@@ -1281,6 +1281,142 @@ DATA(cpu_features_ecx)
END(syscall)
+
+/* Entry point for 64-bit syscalls.
+ * On entry we're still on the user stack, so better not use it. Instead we
+ * save the thread state immediately in thread->pcb->iss, then try to invoke
+ * the syscall.
+ * TODO:
+     - for now we assume the return address is canonical, but apparently there
+       can be cases where it's not (see how Linux handles this). Does it apply
+       here?
+     - do we need to check for ast on syscalls? Maybe on interrupts is enough
+     - check that the case where a task is suspended, and later returns via
+       iretq from return_from_trap, works fine in all combinations
+     - emulated syscalls - are they used anywhere?

Not that I know of.

Ok, I'll update the comment about emulated syscalls.

+ */
+       /* RFLAGS[32:63] are reserved, so combine syscall num (32 bit) and
+        * eflags in RAX to allow using r11 as temporary register */
+       shlq    $32,%r11
+       shlq    $32,%rax        /* make sure bits 32:63 of %rax are zero */
+       shrq    $32,%rax
+       or      %r11,%rax
+       /* Save thread state in pcb->iss, as on exception entry.
+        * Since this is triggered synchronously from userspace, we can
+        * save only the callee-preserved status according to the C ABI,
+        * plus RIP and EFLAGS for sysret */
+       CPU_NUMBER(%r11)
+       movq    CX(EXT(active_threads),%r11),%r11 /* point to current thread */
+       movq    TH_PCB(%r11),%r11               /* point to pcb */
+       addq    $ PCB_ISS,%r11                  /* point to saved state */
+       mov     %gs,R_GS(%r11)
+       mov     %fs,R_FS(%r11)
+       mov     %rsp,R_UESP(%r11)       /* callee-preserved register */
+       mov     %rcx,R_EIP(%r11)        /* syscall places user RIP in RCX */
+       mov     %rbx,R_EBX(%r11)        /* callee-preserved register */
+       mov     %rax,%rbx               /* Now we can unpack eflags again */
+       shr     $32,%rbx
+       mov     %rbx,R_EFLAGS(%r11)     /* ... and save them in pcb as well */
+       mov     %rbp,R_EBP(%r11)        /* callee-preserved register */
+       mov     %r12,R_R12(%r11)        /* callee-preserved register */
+       mov     %r13,R_R13(%r11)        /* callee-preserved register */
+       mov     %r14,R_R14(%r11)        /* callee-preserved register */
+       mov     %r15,R_R15(%r11)        /* callee-preserved register */
+       mov     %r11,%rbx               /* prepare for error handling */
+       mov     %r10,%rcx               /* fix arg3 location according to C ABI */
+       /* switch to kernel stack */
+       CPU_NUMBER(%r11)
+       movq    CX(EXT(kernel_stack),%r11),%rsp
+       /* Now we have saved state and args 1-6 are in place.
+        * Before invoking the syscall we do some bound checking and,
+        * if we have more that 6 arguments, we need to copy the
+        * remaining ones to the kernel stack, handling page faults when
+        * accessing the user stack.
+        */
+       shlq    $32,%rax                /* make sure bits 32:63 of %rax are zero */
+       shrq    $32,%rax
+       negl    %eax                    /* get system call number */
+       jl      _syscall64_range        /* out of range if it was positive */
+       cmpl    EXT(mach_trap_count),%eax       /* check system call table bounds */
+       jg      _syscall64_range        /* error if out of range */
+       shll    $5,%eax                 /* manual indexing of mach_trap_t */
+       /* check if we need to place some arguments on the stack */
+       mov     EXT(mach_trap_table)(%rax),%r10 /* get number of arguments */
+       subq    $6,%r10                 /* the first 6 args are already in place */
+       jl      _syscall64_call         /* skip argument copy if >6 args */


Right, I didn't test a 6-args syscall.

+       movq    R_UESP(%rbx),%r11       /* get user stack pointer */
+       addq    $8,%r11                 /* Skip user return address */
+       mov     $USER_DS,%r12           /* use user data segment for accesses */
+       mov     %r12,%fs
+       lea     (%r11,%r10,8),%r11      /* point past last argument */

+       xorq    %r12,%r12

Why clearing it?

Actually there is no need to clear it: %r12 is overwritten later by the load from the user stack inside the copy loop.

+0:     subq    $8,%r11
+       RECOVER(_syscall64_addr_push)
+       mov     %fs:(%r11),%r12
+       pushq   %r12                    /* push argument on stack */
+       dec     %r10
+       jnz     0b                      /* loop for all remaining arguments */
+       call    *EXT(mach_trap_table)+8(%rax)  /* call procedure */
+       // XXX: check ast on exit?
+       /* avoid leaking information in callee-clobbered registers */
+       mov     $0,%rdi

Rather xorq?

Will do.



Reply via email to