Applied, thanks! Luca Dariz, le sam. 29 juil. 2023 19:47:52 +0200, a ecrit: > The actual values are not saved together with the rest of the thread > state, both because it would be quite espensive (reading MSR, unless > rdfsbase instructions are supported, but that's optional) and not > really needed. The only way the user has to change its value is with a > specific RPC, so we can intercept the change easily. Furthermore, > Leaving the values there exposes them to being corrupted in case of a > double interruption, e.g. an irq is handled just before iretq but > after starting to restore the thread state. This solution was > suggested by Sergey Bugaev. > > * i386/i386/db_trace.c: remove fsbase/gsbase from the registers > available > * i386/i386/debug_i386.c: remove fsbase/gsbase from the printed thread > state > * i386/i386/i386asm.sym: remove fsbase/gsbase as it's not needed in > asm anymore > * i386/i386/pcb.c: point fsbase/gsbase to the new location > * i386/i386/thread.h: move fsbase/gsbase to the machine state > * x86_64/locore.S: generalize segment-handling including es/ds/gs/fs > and remove fsbase/gsbase handling. Also, factor out kernel segment > selector setting to a macro. > --- > i386/i386/db_trace.c | 2 - > i386/i386/debug_i386.c | 2 - > i386/i386/i386asm.sym | 2 - > i386/i386/pcb.c | 12 +-- > i386/i386/thread.h | 22 +++- > x86_64/locore.S | 228 +++++++++++++++-------------------------- > 6 files changed, 108 insertions(+), 160 deletions(-) > > diff --git a/i386/i386/db_trace.c b/i386/i386/db_trace.c > index 2b6ad741..8bd86fa5 100644 > --- a/i386/i386/db_trace.c > +++ b/i386/i386/db_trace.c > @@ -78,8 +78,6 @@ struct db_variable db_regs[] = { > { "r13",(long *)&ddb_regs.r13, db_i386_reg_value }, > { "r14",(long *)&ddb_regs.r14, db_i386_reg_value }, > { "r15",(long *)&ddb_regs.r15, db_i386_reg_value }, > - { "fsb",(long *)&ddb_regs.fsbase,db_i386_reg_value }, > - { "gsb",(long *)&ddb_regs.gsbase,db_i386_reg_value }, > #endif > }; > struct db_variable *db_eregs = db_regs + sizeof(db_regs)/sizeof(db_regs[0]); > diff --git a/i386/i386/debug_i386.c b/i386/i386/debug_i386.c > index b5465796..41d032e3 100644 > --- a/i386/i386/debug_i386.c > +++ b/i386/i386/debug_i386.c > @@ -40,8 +40,6 @@ void dump_ss(const struct i386_saved_state *st) > st->r8, st->r9, st->r10, st->r11); > printf("R12 %016lx R13 %016lx R14 %016lx R15 %016lx\n", > st->r12, st->r13, st->r14, st->r15); > - printf("FSBASE %016lx GSBASE %016lx\n", > - st->fsbase, st->gsbase); > printf("RIP %016lx EFLAGS %08lx\n", st->eip, st->efl); > #else > printf("EAX %08lx EBX %08lx ECX %08lx EDX %08lx\n", > diff --git a/i386/i386/i386asm.sym b/i386/i386/i386asm.sym > index fd0be557..1b9b40bb 100644 > --- a/i386/i386/i386asm.sym > +++ b/i386/i386/i386asm.sym > @@ -108,8 +108,6 @@ offset i386_saved_state r r12 > offset i386_saved_state r r13 > offset i386_saved_state r r14 > offset i386_saved_state r r15 > -offset i386_saved_state r fsbase > -offset i386_saved_state r gsbase > #endif > > offset i386_interrupt_state i eip > diff --git a/i386/i386/pcb.c b/i386/i386/pcb.c > index d1c5fb50..1cf87eb1 100644 > --- a/i386/i386/pcb.c > +++ b/i386/i386/pcb.c > @@ -229,8 +229,8 @@ void switch_ktss(pcb_t pcb) > #endif /* MACH_PV_DESCRIPTORS */ > > #if defined(__x86_64__) && !defined(USER32) > - wrmsr(MSR_REG_FSBASE, pcb->iss.fsbase); > - wrmsr(MSR_REG_GSBASE, pcb->iss.gsbase); > + wrmsr(MSR_REG_FSBASE, pcb->ims.sbs.fsbase); > + wrmsr(MSR_REG_GSBASE, pcb->ims.sbs.gsbase); > #endif > > db_load_context(pcb); > @@ -687,8 +687,8 @@ kern_return_t thread_setstatus( > return KERN_INVALID_ARGUMENT; > > state = (struct i386_fsgs_base_state *) tstate; > - thread->pcb->iss.fsbase = state->fs_base; > - thread->pcb->iss.gsbase = state->gs_base; > + thread->pcb->ims.sbs.fsbase = state->fs_base; > + thread->pcb->ims.sbs.gsbase = state->gs_base; > if (thread == current_thread()) { > wrmsr(MSR_REG_FSBASE, state->fs_base); > wrmsr(MSR_REG_GSBASE, state->gs_base); > @@ -889,8 +889,8 @@ kern_return_t thread_getstatus( > return KERN_INVALID_ARGUMENT; > > state = (struct i386_fsgs_base_state *) tstate; > - state->fs_base = thread->pcb->iss.fsbase; > - state->gs_base = thread->pcb->iss.gsbase; > + state->fs_base = thread->pcb->ims.sbs.fsbase; > + state->gs_base = thread->pcb->ims.sbs.gsbase; > *count = i386_FSGS_BASE_STATE_COUNT; > break; > } > diff --git a/i386/i386/thread.h b/i386/i386/thread.h > index 2378154f..86a44098 100644 > --- a/i386/i386/thread.h > +++ b/i386/i386/thread.h > @@ -51,10 +51,6 @@ > */ > > struct i386_saved_state { > -#ifdef __x86_64__ > - unsigned long fsbase; > - unsigned long gsbase; > -#endif > unsigned long gs; > unsigned long fs; > unsigned long es; > @@ -162,6 +158,13 @@ struct v86_assist_state { > #define V86_IF_PENDING 0x8000 /* unused bit */ > #endif > > +#if defined(__x86_64__) && !defined(USER32) > +struct i386_segment_base_state { > + unsigned long fsbase; > + unsigned long gsbase; > +}; > +#endif > + > /* > * i386_interrupt_state: > * > @@ -206,14 +209,23 @@ struct i386_machine_state { > #endif > struct real_descriptor user_gdt[USER_GDT_SLOTS]; > struct i386_debug_state ids; > +#if defined(__x86_64__) && !defined(USER32) > + struct i386_segment_base_state sbs; > +#endif > }; > > typedef struct pcb { > + /* START of the exception stack. > + * NOTE: this area is used as exception stack when switching > + * CPL, and it MUST be big enough to save the thread state and > + * switch to a proper stack area, even considering recursive > + * exceptions, otherwise it could corrupt nearby memory */ > struct i386_interrupt_state iis[2]; /* interrupt and NMI */ > #ifdef __x86_64__ > - unsigned long pad; /* ensure exception stack is aligned to 16 */ > + unsigned long pad; /* ensure exception stack is aligned to 16 */ > #endif > struct i386_saved_state iss; > + /* END of exception stack*/ > struct i386_machine_state ims; > decl_simple_lock_data(, lock) > unsigned short init_control; /* Initial FPU control to set */ > diff --git a/x86_64/locore.S b/x86_64/locore.S > index 7127957b..66a9436a 100644 > --- a/x86_64/locore.S > +++ b/x86_64/locore.S > @@ -39,6 +39,10 @@ > #include <i386/i386/cpu_number.h> > #include <i386/i386/xen.h> > > + > +/* > + * Helpers for thread state as saved in the pcb area, during trap or irq > handling > + */ > #define pusha \ > pushq %rax ;\ > pushq %rcx ;\ > @@ -75,45 +79,74 @@ > popq %rcx ;\ > popq %rax > > +/* > + * Note that we have to load the kernel segment registers even if this > + * is a trap from the kernel, because the kernel uses user segment > + * registers for copyin/copyout. > + * (XXX Would it be smarter just to use fs or gs for that?) > + */ > #ifdef USER32 > -#define PUSH_FSGS \ > +#define PUSH_SEGMENTS(reg) \ > + movq %ds,reg ;\ > + pushq reg ;\ > + movq %es,reg ;\ > + pushq reg ;\ > pushq %fs ;\ > - pushq %gs ;\ > - subq $16,%rsp > + pushq %gs > #else > -#define PUSH_FSGS \ > +#define PUSH_SEGMENTS(reg) \ > subq $32,%rsp > #endif > > #ifdef USER32 > -#define POP_FSGS \ > +#define POP_SEGMENTS(reg) \ > popq %gs ;\ > popq %fs ;\ > - addq $16,%rsp > + popq reg ;\ > + movq reg,%es ;\ > + popq reg ;\ > + movq reg,%ds > #else > -#define POP_FSGS \ > +#define POP_SEGMENTS(reg) \ > addq $32,%rsp > #endif > > #ifdef USER32 > -#define PUSH_FSGS_ISR \ > +#define PUSH_SEGMENTS_ISR(reg) \ > + movq %ds,reg ;\ > + pushq reg ;\ > + movq %es,reg ;\ > + pushq reg ;\ > pushq %fs ;\ > pushq %gs > #else > -#define PUSH_FSGS_ISR \ > - subq $16,%rsp > +#define PUSH_SEGMENTS_ISR(reg) \ > + subq $32,%rsp > #endif > > #ifdef USER32 > -#define POP_FSGS_ISR \ > +#define POP_SEGMENTS_ISR(reg) \ > popq %gs ;\ > - popq %fs > + popq %fs ;\ > + popq reg ;\ > + movq reg,%es ;\ > + popq reg ;\ > + movq reg,%ds > #else > -#define POP_FSGS_ISR \ > - addq $16,%rsp > +#define POP_SEGMENTS_ISR(reg) \ > + addq $32,%rsp > #endif > > - > +#ifdef USER32 > +#define SET_KERNEL_SEGMENTS \ > + mov %ss,%dx /* switch to kernel segments */ ;\ > + mov %dx,%ds ;\ > + mov %dx,%es ;\ > + mov %dx,%fs ;\ > + mov %dx,%gs > +#else > +#define SET_KERNEL_SEGMENTS > +#endif > > /* > * Fault recovery. > @@ -350,32 +383,17 @@ ENTRY(start_timer) > /* > * Trap/interrupt entry points. > * > - * All traps must create the following save area on the kernel stack: > - * > - * gs > - * fs > - * es > - * ds > - * edi > - * esi > - * ebp > - * cr2 if page fault - otherwise unused > - * ebx > - * edx > - * ecx > - * eax > - * trap number > - * error code > - * eip > - * cs > - * eflags > - * user rsp - if from user > - * user ss - if from user > - * es - if from V86 thread > - * ds - if from V86 thread > - * fs - if from V86 thread > - * gs - if from V86 thread > + * All traps must create the i386_saved_state struct on the stack on > + * entry. Note that: > + * - CR2 is only used if the trap is a page fault > + * - user_rsp/user_ss are only used if entering from user space > + * - v86_regs are used only from V86 threads > + * (TODO check if V86 is still used with USER32) > * > + * Depending the CPL before entry, the stack might be switched or not; > + * if entering from user-space the CPU loads TSS->RSP0 in RSP, > + * otherwise RSP is unchanged. After this, the cpu pushes > + * SS/RSP/RFLAFS/CS/RIP and optionally ErrorCode and executes the handler. > */ > > /* Try to save/show some information when a double fault happens > @@ -426,16 +444,16 @@ trap_check_kernel_exit: > /* check for the kernel exit sequence */ > cmpq $_kret_iret,16(%rsp) /* on IRET? */ > je fault_iret > -#if 0 > +#ifdef USER32 > cmpq $_kret_popl_ds,16(%rsp) /* popping DS? */ > je fault_popl_ds > cmpq $_kret_popl_es,16(%rsp) /* popping ES? */ > je fault_popl_es > -#endif > cmpq $_kret_popl_fs,16(%rsp) /* popping FS? */ > je fault_popl_fs > cmpq $_kret_popl_gs,16(%rsp) /* popping GS? */ > je fault_popl_gs > +#endif > take_fault: /* if none of the above: */ > jmp EXT(alltraps) /* treat as normal trap. */ > > @@ -464,6 +482,7 @@ fault_iret: > popq %rax /* restore eax */ > jmp EXT(alltraps) /* take fault */ > > +#ifdef USER32 > /* > * Fault restoring a segment register. The user's registers are still > * saved on the stack. The offending segment register has not been > @@ -499,6 +518,7 @@ push_gs: > push_gsbase: > pushq $0 > pushq $0 > +#endif > push_segregs: > movq %rax,R_TRAPNO(%rsp) /* set trap number */ > movq %rdx,R_ERR(%rsp) /* set error code */ > @@ -562,23 +582,8 @@ ENTRY(t_page_fault) > ENTRY(alltraps) > pusha /* save the general registers */ > trap_push_segs: > - movq %ds,%rax /* and the segment registers */ > - pushq %rax > - movq %es,%rax /* and the segment registers */ > - pushq %rax > - PUSH_FSGS > - > - /* Note that we have to load the segment registers > - even if this is a trap from the kernel, > - because the kernel uses user segment registers for copyin/copyout. > - (XXX Would it be smarter just to use fs or gs for that?) */ > - mov %ss,%ax /* switch to kernel data segment */ > - mov %ax,%ds /* (same as kernel stack segment) */ > - mov %ax,%es > -#ifdef USER32 > - mov %ax,%fs > - mov %ax,%gs > -#endif > + PUSH_SEGMENTS(%rax) > + SET_KERNEL_SEGMENTS > trap_set_segs: > cld /* clear direction flag */ > #ifdef USER32 > @@ -634,23 +639,20 @@ _return_to_user: > */ > > _return_from_kernel: > - addq $16,%rsp /* skip FS/GS base */ > #ifndef USER32 > -_kret_popl_gs: > -_kret_popl_fs: > - addq $16,%rsp /* skip FS/GS selector */ > + addq $32,%rsp /* skip FS/GS selector */ > #else > _kret_popl_gs: > popq %gs /* restore segment registers */ > _kret_popl_fs: > popq %fs > -#endif > _kret_popl_es: > popq %rax > movq %rax,%es > _kret_popl_ds: > popq %rax > movq %rax,%ds > +#endif > popa /* restore general registers */ > addq $16,%rsp /* discard trap number and error code */ > _kret_iret: > @@ -777,8 +779,11 @@ INTERRUPT(255) > /* XXX handle NMI - at least print a warning like Linux does. */ > > /* > - * All interrupts enter here. > - * old %eax on stack; interrupt number in %eax. > + * All interrupts enter here. The cpu might have loaded a new RSP, > + * depending on the previous CPL, as in alltraps. > + * Old %eax on stack, interrupt number in %eax; we need to fill the remaining > + * fields of struct i386_interrupt_state, which might be in the pcb or in the > + * interrupt stack. > */ > ENTRY(all_intrs) > pushq %rcx /* save registers */ > @@ -791,24 +796,15 @@ ENTRY(all_intrs) > pushq %r11 > cld /* clear direction flag */ > > - movq %ds,%rdx /* save segment registers */ > - pushq %rdx > - movq %es,%rdx > - pushq %rdx > - PUSH_FSGS_ISR > + PUSH_SEGMENTS_ISR(%rdx) > > movq %rsp,%rdx /* on an interrupt stack? */ > and $(~(INTSTACK_SIZE-1)),%rdx > cmpq %ss:EXT(int_stack_base),%rdx > je int_from_intstack /* if not: */ > > - mov %ss,%dx /* switch to kernel segments */ > - mov %dx,%ds > - mov %dx,%es > -#ifdef USER32 > - mov %dx,%fs > - mov %dx,%gs > -#endif > + SET_KERNEL_SEGMENTS > + > CPU_NUMBER(%edx) > > movq CX(EXT(int_stack_top),%edx),%rcx > @@ -849,11 +845,7 @@ LEXT(return_to_iret) /* ( label for > kdb_kintr and hardclock) */ > cmpq $0,CX(EXT(need_ast),%edx) > jnz ast_from_interrupt /* take it if so */ > 1: > - POP_FSGS_ISR > - pop %rdx > - mov %rdx,%es > - pop %rdx > - mov %rdx,%ds > + POP_SEGMENTS_ISR(%rdx) > pop %r11 > pop %r10 > pop %r9 > @@ -871,12 +863,7 @@ int_from_intstack: > jb stack_overflowed /* if not: */ > call EXT(interrupt) /* call interrupt routine */ > _return_to_iret_i: /* ( label for kdb_kintr) */ > - POP_FSGS_ISR > - pop %rdx > - mov %rdx,%es > - pop %rdx > - mov %rdx,%ds > - > + POP_SEGMENTS_ISR(%rdx) > pop %r11 > pop %r10 > pop %r9 > @@ -909,11 +896,7 @@ stack_overflowed: > * ss > */ > ast_from_interrupt: > - POP_FSGS_ISR > - pop %rdx > - mov %rdx,%es > - pop %rdx > - mov %rdx,%ds > + POP_SEGMENTS_ISR(%rdx) > popq %r11 > popq %r10 > popq %r9 > @@ -926,19 +909,8 @@ ast_from_interrupt: > pushq $0 /* zero code */ > pushq $0 /* zero trap number */ > pusha /* save general registers */ > - mov %ds,%rdx /* save segment registers */ > - push %rdx > - mov %es,%rdx > - push %rdx > - PUSH_FSGS_ISR > - > - mov %ss,%dx /* switch to kernel segments */ > - mov %dx,%ds > - mov %dx,%es > -#ifdef USER32 > - mov %dx,%fs > - mov %dx,%gs > -#endif > + PUSH_SEGMENTS_ISR(%rdx) > + SET_KERNEL_SEGMENTS > CPU_NUMBER(%edx) > TIME_TRAP_UENTRY > > @@ -1056,20 +1028,12 @@ kdb_from_iret_i: /* on interrupt > stack */ > pushq $0 /* zero error code */ > pushq $0 /* zero trap number */ > pusha /* save general registers */ > - mov %ds,%rdx /* save segment registers */ > - push %rdx > - mov %es,%rdx > - push %rdx > - PUSH_FSGS > + PUSH_SEGMENTS(%rdx) > movq %rsp,%rdx /* pass regs, */ > movq $0,%rsi /* code, */ > movq $-1,%rdi /* type to kdb */ > call EXT(kdb_trap) > - POP_FSGS > - pop %rdx > - mov %rdx,%es > - pop %rdx > - mov %rdx,%ds > + POP_SEGMENTS(%rdx) > popa /* restore general registers */ > addq $16,%rsp > > @@ -1144,23 +1108,13 @@ ttd_from_iret_i: /* on interrupt > stack */ > pushq $0 /* zero error code */ > pushq $0 /* zero trap number */ > pusha /* save general registers */ > - mov %ds,%rdx /* save segment registers */ > - push %rdx > - mov %es,%rdx > - push %rdx > - push %fs > - push %gs > + PUSH_SEGMENTS_ISR(%rdx) > ud2 // TEST it > movq %rsp,%rdx /* pass regs, */ > movq $0,%rsi /* code, */ > movq $-1,%rdi /* type to kdb */ > call _kttd_trap > - pop %gs /* restore segment registers */ > - pop %fs > - pop %rdx > - mov %rdx,%es > - pop %rdx > - mov %rdx,%ds > + POP_SEGMENTS_ISR(%rdx) > popa /* restore general registers */ > addq $16,%rsp > > @@ -1193,20 +1147,8 @@ syscall_entry_2: > pushq $0 /* clear trap number slot */ > > pusha /* save the general registers */ > - movq %ds,%rdx /* and the segment registers */ > - pushq %rdx > - movq %es,%rdx > - pushq %rdx > - pushq %fs > - pushq %gs > - pushq $0 // gsbase > - pushq $0 // fsbase > - > - mov %ss,%dx /* switch to kernel data segment */ > - mov %dx,%ds > - mov %dx,%es > - mov %dx,%fs > - mov %dx,%gs > + PUSH_SEGMENTS(%rdx) > + SET_KERNEL_SEGMENTS > > /* > * Shuffle eflags,eip,cs into proper places > -- > 2.39.2 > >
-- Samuel --- Pour une évaluation indépendante, transparente et rigoureuse ! Je soutiens la Commission d'Évaluation de l'Inria.