"Cherry G. Mathew" <che...@netbsd.org> writes: > Hello, > > So recently I've been looking at the interrupt subsystem for NetBSD/Xen, > which is a little unusual in that several interrupts can batch > together before a virtual cpu that is scheduled can be woken up to > handle them. > > When this happens, a single callback handler is invoked, which > demultiplexes all the pending interrupts, and passes on to everyone of > them the same trapframe that was passed to the callback. This is > particularly interesting for the clock handler, because what would have > been multiple ticks during any dormant period for the guest OS would be > only called once, and so the handler invokes hardclock() a number of > times to make up for lost time. Once again, it accounts for it using the > exact same trapframe that it got from the demultiplexing callback > handler - which if it were from userland, would account every tick to > that particular userland process (which was presumably dormant for the > duration lost). > > This got me thinking about deferred interrupts using spl(9), and how > time is accounted for those. (I will confess that I have an insiduous > interest, but I won't divulge what that is right now). > > For eg: if a clock interrupt from userland got deferred as pending, even > if it came in from userland (is this possible ?), because the current > spl level was at, say, SPL_HIGH, it now seems to be the case that the > system accounts for the delayed execution by charging the *entire* time > (from the last hardclock() inclusive of whatever was executing at > SPL_HIGH, to the system and not the userland process, thus charging the > time interval between when the last hardclock() was called, and when it > was actually serviced, to the system instead of the user process that > was originally interrupted. > > To emphasise my point, I've added a patch below that I think should > reflect the accounting correctly. > > I'm not sure I've understood this correctly, so I'd appreciate it if > someone who has a good understanding of this would be able to comment. > > Many Thanks, > > Cherry > > > > --- kern_clock.c.~1.138.~ 2018-09-21 11:28:02.792675611 +0000 > +++ kern_clock.c 2018-10-16 12:06:38.753987323 +0000 > @@ -352,6 +352,10 @@ > } > > if (CLKF_USERMODE(frame)) { > + if (p == NULL) { /* Account deferred clock ticks to > current user. */ > + p = l->l_proc; > + mutex_spin_enter(&p->p_stmutex); > + } > KASSERT(p != NULL); > if ((p->p_stflag & PST_PROFIL) && profsrc == > PROFSRC_CLOCK) > addupc_intr(l, CLKF_PC(frame));
Here's a full patch implementing what I have in mind for native and port-xen on arch/amd64 -- ~~cherry
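In rough pseudo-C, the scheme is this (interrupt_entry() is a stand-in
name for illustration only; in the patch the stash is the single
movq %rsp,CPUVAR(TF) added to the interrupt entry paths, and handlers
really do read curcpu()->ci_tf):

	/* At interrupt entry, remember where this interrupt's
	 * trapframe lives on the current cpu. */
	void
	interrupt_entry(struct trapframe *tf)
	{
		curcpu()->ci_tf = tf;
		/* ... demux and dispatch handlers, which now take
		 * only their (void *) cookie ... */
	}

	/* A clock handler fetches the saved frame from the per-cpu
	 * area instead of taking it as a second argument. */
	static int
	clockintr(void *arg)
	{
		hardclock((struct clockframe *)curcpu()->ci_tf);
		return 0;
	}

This keeps the frame out of every handler signature while still letting
hardclock() see the frame of the interrupt actually being serviced on
this cpu.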
diff -r 5a1052452d40 sys/arch/amd64/amd64/amd64_trap.S
--- a/sys/arch/amd64/amd64/amd64_trap.S	Tue Oct 16 09:33:04 2018 +0000
+++ b/sys/arch/amd64/amd64/amd64_trap.S	Tue Oct 16 14:35:48 2018 +0000
@@ -628,6 +628,7 @@
  */
 ENTRY(alltraps)
 	INTRENTRY
+	movq	%rsp,CPUVAR(TF)
 .Lalltraps_noentry:
 	STI(si)
 
diff -r 5a1052452d40 sys/arch/amd64/amd64/genassym.cf
--- a/sys/arch/amd64/amd64/genassym.cf	Tue Oct 16 09:33:04 2018 +0000
+++ b/sys/arch/amd64/amd64/genassym.cf	Tue Oct 16 14:35:48 2018 +0000
@@ -259,6 +259,7 @@
 define	CPU_INFO_CPUID		offsetof(struct cpu_info, ci_cpuid)
 define	CPU_INFO_ISTATE		offsetof(struct cpu_info, ci_istate)
 define	CPU_INFO_CC_SKEW	offsetof(struct cpu_info, ci_data.cpu_cc_skew)
+define	CPU_INFO_TF		offsetof(struct cpu_info, ci_tf)
 
 define	ACPI_SUSPEND_GDT	offsetof(struct cpu_info, ci_suspend_gdt)
 define	ACPI_SUSPEND_IDT	offsetof(struct cpu_info, ci_suspend_idt)
diff -r 5a1052452d40 sys/arch/amd64/amd64/vector.S
--- a/sys/arch/amd64/amd64/vector.S	Tue Oct 16 09:33:04 2018 +0000
+++ b/sys/arch/amd64/amd64/vector.S	Tue Oct 16 14:35:48 2018 +0000
@@ -238,7 +238,6 @@
 	movl	$IPL_CLOCK,CPUVAR(ILEVEL)
 	sti
 	pushq	%rbx
-	movq	%rsp,%rsi
 	xorq	%rdi,%rdi
 	call	_C_LABEL(lapic_clockintr)
 	jmp	_C_LABEL(Xdoreti)
@@ -252,12 +251,14 @@
 	pushq	$0
 	pushq	$T_ASTFLT
 	INTRENTRY
+	movq	%rsp,CPUVAR(TF)
 	jmp	_C_LABEL(Xhandle_x2apic_ltimer)
 IDTVEC_END(intr_x2apic_ltimer)
 IDTVEC(intr_lapic_ltimer)
 	pushq	$0
 	pushq	$T_ASTFLT
 	INTRENTRY
+	movq	%rsp,CPUVAR(TF)
 	jmp	_C_LABEL(Xhandle_lapic_ltimer)
 IDTVEC_END(intr_lapic_ltimer)
 TEXT_USER_END
@@ -346,7 +347,6 @@
 	movl	IH_LEVEL(%rbx),%r12d			;\
 	cmpl	%r13d,%r12d				;\
 	jle	7f					;\
-	movq	%rsp,%rsi				;\
 	movq	IH_ARG(%rbx),%rdi			;\
 	movl	%r12d,CPUVAR(ILEVEL)			;\
 	call	*IH_FUN(%rbx)		/* call it */	;\
@@ -382,6 +382,7 @@
 	pushq	$0			/* dummy error code */		;\
 	pushq	$T_ASTFLT		/* trap # for doing ASTs */	;\
 	INTRENTRY							;\
+	movq	%rsp,CPUVAR(TF)						;\
 	jmp	_C_LABEL(Xhandle_ ## name ## num)			;\
 IDTVEC_END(intr_ ## name ## num)					;\
 TEXT_USER_END
@@ -665,7 +666,6 @@
 	movq	IS_HANDLERS(%r14),%rbx	;\
 6:						\
 	movq	IH_ARG(%rbx),%rdi	;\
-	movq	%rsp,%rsi		;\
 	call	*IH_FUN(%rbx)		/* call it */		;\
 	movq	IH_NEXT(%rbx),%rbx	/* next handler in chain */ ;\
 	testq	%rbx,%rbx		;\
@@ -765,6 +765,8 @@
 	/* sti?? */
 	movq	%rsp,%rdi
 	subq	$8,%rdi;	/* don't forget if_ppl */
+	movq	%rdi,CPUVAR(TF)
+	xorq	%rdi, %rdi	/* We don't want to pass on as arg */
 	call	do_hypervisor_callback
 	testb	$SEL_RPL,TF_CS(%rsp)
 	jnz	doreti_checkast
@@ -782,6 +784,8 @@
 	INTRENTRY
 	movq	%rsp,%rdi
 	subq	$8,%rdi;	/* don't forget if_ppl */
+	movq	%rdi,CPUVAR(TF)
+	xorq	%rdi, %rdi	/* We don't want to pass on as arg */
 	call	xen_failsafe_handler
 	INTRFASTEXIT
/* jmp HYPERVISOR_iret */
diff -r 5a1052452d40 sys/arch/i386/i386/genassym.cf
--- a/sys/arch/i386/i386/genassym.cf	Tue Oct 16 09:33:04 2018 +0000
+++ b/sys/arch/i386/i386/genassym.cf	Tue Oct 16 14:35:48 2018 +0000
@@ -262,6 +262,7 @@
 define	CPU_INFO_NINTR		offsetof(struct cpu_info, ci_data.cpu_nintr)
 define	CPU_INFO_CURPRIORITY	offsetof(struct cpu_info, ci_schedstate.spc_curpriority)
 define	CPU_INFO_CC_SKEW	offsetof(struct cpu_info, ci_data.cpu_cc_skew)
+define	CPU_INFO_TF		offsetof(struct cpu_info, ci_tf)
 
 define	CPU_INFO_VENDOR		offsetof(struct cpu_info, ci_vendor[0])
 
diff -r 5a1052452d40 sys/arch/i386/i386/vector.S
--- a/sys/arch/i386/i386/vector.S	Tue Oct 16 09:33:04 2018 +0000
+++ b/sys/arch/i386/i386/vector.S	Tue Oct 16 14:35:48 2018 +0000
@@ -1027,9 +1027,9 @@
 	cmpl	$ecrit,%eax
 	jb	critical_region_fixup
 11:	pushl	CPUVAR(ILEVEL)
-	push	%esp
+	movl	%esp, CPUVAR(TF)
 	call	do_hypervisor_callback
-	add	$8,%esp
+	add	$4,%esp
 	xorl	%eax,%eax
 	movb	TF_CS(%esp),%cl
 	test	$CHK_UPL,%cl		/* slow return to ring 2 or 3 */
diff -r 5a1052452d40 sys/arch/x86/include/cpu.h
--- a/sys/arch/x86/include/cpu.h	Tue Oct 16 09:33:04 2018 +0000
+++ b/sys/arch/x86/include/cpu.h	Tue Oct 16 14:35:48 2018 +0000
@@ -167,7 +167,7 @@
 	const struct cpu_functions *ci_func;  /* start/stop functions */
 
 	struct trapframe *ci_ddb_regs;
-
+	struct trapframe *ci_tf;	/* Saved for clock handlers */
 	u_int ci_cflush_lsize;	/* CLFLUSH insn line size */
 	struct x86_cache_info ci_cinfo[CAI_COUNT];
 
diff -r 5a1052452d40 sys/arch/x86/isa/clock.c
--- a/sys/arch/x86/isa/clock.c	Tue Oct 16 09:33:04 2018 +0000
+++ b/sys/arch/x86/isa/clock.c	Tue Oct 16 14:35:48 2018 +0000
@@ -188,7 +188,7 @@
 void		sysbeep(int, int);
 static void	tickle_tc(void);
 
-static int	clockintr(void *, struct intrframe *);
+static int	clockintr(void *);
 
 int 		sysbeepdetach(device_t, int);
 
@@ -371,11 +371,11 @@
 }
 
 static int
-clockintr(void *arg, struct intrframe *frame)
+clockintr(void *arg)
 {
 	tickle_tc();
 
-	hardclock((struct clockframe *)frame);
+	hardclock((struct clockframe *)curcpu()->ci_tf);
 
 #if NMCA > 0
 	if (MCA_system) {
diff -r 5a1052452d40 sys/arch/x86/x86/lapic.c
--- a/sys/arch/x86/x86/lapic.c	Tue Oct 16 09:33:04 2018 +0000
+++ b/sys/arch/x86/x86/lapic.c	Tue Oct 16 14:35:48 2018 +0000
@@ -90,7 +90,7 @@
 #include <x86/x86/vmtvar.h>	/* for vmt_hvcall() */
 
 /* Referenced from vector.S */
-void		lapic_clockintr(void *, struct intrframe *);
+void		lapic_clockintr(void *);
 
 static void	lapic_delay(unsigned int);
 static uint32_t lapic_gettick(void);
@@ -561,13 +561,13 @@
 extern u_int i8254_get_timecount(struct timecounter *);
 
 void
-lapic_clockintr(void *arg, struct intrframe *frame)
+lapic_clockintr(void *arg)
 {
 	struct cpu_info *ci = curcpu();
 
 	ci->ci_lapic_counter += lapic_tval;
 	ci->ci_isources[LIR_TIMER]->is_evcnt.ev_count++;
 
-	hardclock((struct clockframe *)frame);
+	hardclock((struct clockframe *)ci->ci_tf);
 }
 
 void
diff -r 5a1052452d40 sys/arch/xen/include/evtchn.h
--- a/sys/arch/xen/include/evtchn.h	Tue Oct 16 09:33:04 2018 +0000
+++ b/sys/arch/xen/include/evtchn.h	Tue Oct 16 14:35:48 2018 +0000
@@ -38,9 +38,7 @@
 bool events_suspend(void);
 bool events_resume(void);
 
-unsigned int evtchn_do_event(int, struct intrframe *);
-void call_evtchn_do_event(int, struct intrframe *);
-void call_xenevt_event(int);
+unsigned int evtchn_do_event(int);
 int event_set_handler(int, int (*func)(void *), void *, int, const char *,
     const char *);
 int event_remove_handler(int, int (*func)(void *), void *);
diff -r 5a1052452d40 sys/arch/xen/include/hypervisor.h
--- a/sys/arch/xen/include/hypervisor.h	Tue Oct 16 09:33:04 2018 +0000
+++ b/sys/arch/xen/include/hypervisor.h	Tue Oct 16 14:35:48 2018 +0000
@@ -129,7 +129,7 @@
 /* hypervisor.c */
 struct intrframe;
 struct cpu_info;
-void do_hypervisor_callback(struct intrframe *regs);
+void do_hypervisor_callback(void);
 void hypervisor_prime_pirq_event(int, unsigned int);
 void hypervisor_enable_event(unsigned int);
diff -r 5a1052452d40 sys/arch/xen/x86/hypervisor_machdep.c
--- a/sys/arch/xen/x86/hypervisor_machdep.c	Tue Oct 16 09:33:04 2018 +0000
+++ b/sys/arch/xen/x86/hypervisor_machdep.c	Tue Oct 16 14:35:48 2018 +0000
@@ -215,10 +215,9 @@
 evt_do_hypervisor_callback(unsigned int port, unsigned int l1i,
     unsigned int l2i, void *args)
 {
-	KASSERT(args != NULL);
+	KASSERT(args == NULL);
 
 	struct cpu_info *ci = curcpu();
-	struct intrframe *regs = args;
 
 #ifdef PORT_DEBUG
 	if (port == PORT_DEBUG)
@@ -226,7 +225,7 @@
 #endif
 	if (evtsource[port]) {
 		ci->ci_idepth++;
-		evtchn_do_event(port, regs);
+		evtchn_do_event(port);
 		ci->ci_idepth--;
 	}
 #ifdef DOM0OPS
@@ -248,7 +247,7 @@
 }
 
 void
-do_hypervisor_callback(struct intrframe *regs)
+do_hypervisor_callback(void)
 {
 	volatile shared_info_t *s = HYPERVISOR_shared_info;
 	struct cpu_info *ci;
@@ -273,7 +272,7 @@
 
 	evt_iterate_bits(&vci->evtchn_pending_sel,
 	    s->evtchn_pending, s->evtchn_mask,
-	    evt_do_hypervisor_callback, regs);
+	    evt_do_hypervisor_callback, NULL);
 
 #ifdef DIAGNOSTIC
diff -r 5a1052452d40 sys/arch/xen/xen/clock.c
--- a/sys/arch/xen/xen/clock.c	Tue Oct 16 09:33:04 2018 +0000
+++ b/sys/arch/xen/xen/clock.c	Tue Oct 16 14:35:48 2018 +0000
@@ -76,7 +76,7 @@
 static unsigned	xen_get_timecount(struct timecounter *);
 static int	xen_rtc_get(struct todr_chip_handle *, struct timeval *);
 static int	xen_rtc_set(struct todr_chip_handle *, struct timeval *);
-static int	xen_timer_handler(void *, struct clockframe *);
+static int	xen_timer_handler(void *);
 
 /*
  * xen timecounter:
@@ -802,7 +802,7 @@
  * The cookie is the pointer to struct cpu_info.
  */
 static int
-xen_timer_handler(void *cookie, struct clockframe *frame)
+xen_timer_handler(void *cookie)
 {
 	struct cpu_info *ci = curcpu();
 	uint64_t last, now, delta, next;
@@ -836,7 +836,7 @@
 	while (delta >= NS_PER_TICK) {
 		ci->ci_xen_hardclock_systime_ns += NS_PER_TICK;
 		delta -= NS_PER_TICK;
-		hardclock(frame);
+		hardclock((struct clockframe *)ci->ci_tf);
 		if (__predict_false(delta >= NS_PER_TICK))
 			ci->ci_xen_missed_hardclock_evcnt.ev_count++;
 	}
diff -r 5a1052452d40 sys/arch/xen/xen/evtchn.c
--- a/sys/arch/xen/xen/evtchn.c	Tue Oct 16 09:33:04 2018 +0000
+++ b/sys/arch/xen/xen/evtchn.c	Tue Oct 16 14:35:48 2018 +0000
@@ -305,12 +305,12 @@
 
 unsigned int
-evtchn_do_event(int evtch, struct intrframe *regs)
+evtchn_do_event(int evtch)
 {
 	struct cpu_info *ci;
 	int ilevel;
 	struct intrhand *ih;
-	int (*ih_fun)(void *, void *);
+	int (*ih_fun)(void *);
 	uint32_t iplmask;
 	int i;
 	uint32_t iplbit;
@@ -386,7 +386,7 @@
 			iplmask &= ~IUNMASK(ci, ih->ih_level);
 			ci->ci_ilevel = ih->ih_level;
 			ih_fun = (void *)ih->ih_fun;
-			ih_fun(ih->ih_arg, regs);
+			ih_fun(ih->ih_arg);
 			ih = ih->ih_evt_next;
 		}
 		mutex_spin_exit(&evtlock[evtch]);
@@ -410,7 +410,7 @@
 			KASSERT(ih->ih_cpu == ci);
 			sti();
 			ih_fun = (void *)ih->ih_fun;
-			ih_fun(ih->ih_arg, regs);
+			ih_fun(ih->ih_arg);
 			cli();
 		}
 		hypervisor_enable_ipl(i);
diff -r 5a1052452d40 sys/kern/kern_clock.c
--- a/sys/kern/kern_clock.c	Tue Oct 16 09:33:04 2018 +0000
+++ b/sys/kern/kern_clock.c	Tue Oct 16 14:35:48 2018 +0000
@@ -352,6 +352,10 @@
 	}
 
 	if (CLKF_USERMODE(frame)) {
+		if (p == NULL) { /* Account deferred clock ticks to current user. */
+			p = l->l_proc;
+			mutex_spin_enter(&p->p_stmutex);
+		}
 		KASSERT(p != NULL);
 		if ((p->p_stflag & PST_PROFIL) && profsrc == PROFSRC_CLOCK)
 			addupc_intr(l, CLKF_PC(frame));
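To spell out the interface change for handler writers: the frame
parameter disappears everywhere, and a handler that needs a clockframe
picks up the one saved at interrupt entry. A hypothetical handler
(foo_clockintr is a made-up name), before and after:

	/* before: the dispatcher passed the frame explicitly */
	static int
	foo_clockintr_old(void *arg, struct intrframe *frame)
	{
		hardclock((struct clockframe *)frame);
		return 0;
	}

	/* after: only the cookie; the frame comes from cpu_info */
	static int
	foo_clockintr(void *arg)
	{
		hardclock((struct clockframe *)curcpu()->ci_tf);
		return 0;
	}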