This extends the previous debug utility to trace system calls with more events and the ability to control the tracing from userspace, collecting a trace of the whole systems.
This tool was quite useful in porting the rpc format to 64 bit and handle the 32to64 translation, but also to debug user-space programs. It's still a draft with some limitations and open points: - DEBUG_TRACE for 64 bit is missing - check if we can have meaningful and reliable data removing the syscall number on syscall return; this would avoid saving it in the thread structure. - add more info to each trace entry, e.g. current CPU or timestamps - trace more events - add more filtering capabilities - trace a single subhurd - trace format is not exported --- Makefrag.am | 7 + configfrag.ac | 12 ++ i386/i386/debug.h | 16 +- i386/i386/debug_i386.c | 45 ++-- i386/i386/locore.S | 33 ++- i386/i386/pcb.c | 4 + i386/i386/trap.c | 5 + include/mach/gnumach.defs | 33 +++ include/mach/trace.h | 61 ++++++ ipc/ipc_kmsg.c | 6 +- ipc/mach_msg.c | 2 + kern/exception.c | 1 + kern/startup.c | 2 + kern/thread.h | 4 + kern/trace.c | 432 ++++++++++++++++++++++++++++++++++++++ kern/trace.h | 42 ++++ scripts/gnumach-gdb.py | 63 ++++++ tests/test-trace.c | 58 +++++ tests/user-qemu.mk | 3 +- x86_64/debug_trace.S | 9 +- x86_64/interrupt.S | 19 ++ x86_64/locore.S | 84 +++++++- 22 files changed, 887 insertions(+), 54 deletions(-) create mode 100644 include/mach/trace.h create mode 100644 kern/trace.c create mode 100644 kern/trace.h create mode 100644 scripts/gnumach-gdb.py create mode 100644 tests/test-trace.c diff --git a/Makefrag.am b/Makefrag.am index 82fce628..25d73ec3 100644 --- a/Makefrag.am +++ b/Makefrag.am @@ -221,6 +221,12 @@ libkernel_a_SOURCES += \ kern/xpr.h \ kern/elf-load.c \ kern/boot_script.c + +if KERNEL_TRACE +libkernel_a_SOURCES += \ + kern/trace.c +endif + EXTRA_DIST += \ kern/exc.defs \ kern/mach.srv \ @@ -407,6 +413,7 @@ include_mach_HEADERS = \ include/mach/profilparam.h \ include/mach/std_types.h \ include/mach/syscall_sw.h \ + include/mach/trace.h \ include/mach/task_info.h \ include/mach/task_special_ports.h \ include/mach/thread_info.h \ diff --git a/configfrag.ac b/configfrag.ac index b8b41261..f4dbe63d 100644 --- a/configfrag.ac +++ b/configfrag.ac @@ -139,6 +139,18 @@ AC_DEFINE([SLAB_VERIFY], [0], [SLAB_VERIFY]) # Enable the CPU pool layer in the slab allocator. AC_DEFINE([SLAB_USE_CPU_POOLS], [0], [SLAB_USE_CPU_POOLS]) + +# enable support for tracing various kernel events +AC_ARG_ENABLE([kernel-trace], + AS_HELP_STRING([--enable-kernel-trace], [Enable kernel event tracing])) +[if [ x"$enable_kernel_trace" = xyes ]; then] + AC_DEFINE([KERNEL_TRACE], [1], [Enable kernel event tracing]) + AM_CONDITIONAL([KERNEL_TRACE], [true]) +[else] + AC_DEFINE([KERNEL_TRACE], [0], [Enable kernel event tracing]) + AM_CONDITIONAL([KERNEL_TRACE], [false]) +[fi] + # # Options. diff --git a/i386/i386/debug.h b/i386/i386/debug.h index 84397ba8..eff330c6 100644 --- a/i386/i386/debug.h +++ b/i386/i386/debug.h @@ -54,6 +54,7 @@ void debug_trace_dump(void); #else /* __ASSEMBLER__ */ +#ifndef __x86_64__ #define DEBUG_TRACE \ pushl $__LINE__ ;\ pushl $9f ;\ @@ -62,10 +63,21 @@ void debug_trace_dump(void); .data ;\ 9: .ascii __FILE__"\0" ;\ .text - +#else /* __x86_64__ */ +#define DEBUG_TRACE \ + pushq %rdi ;\ + pushq %rs1 ;\ + movq $__LINE__,%rdi ;\ + movq $9f,%rsi ;\ + call __debug_trace ;\ + popq %rsi ;\ + popq %rdi ;\ + .data ;\ +9: .ascii __FILE__"\0" ;\ + .text +#endif #endif /* __ASSEMBLER__ */ - #endif /* DEBUG */ /* XXX #include_next "debug.h" */ diff --git a/i386/i386/debug_i386.c b/i386/i386/debug_i386.c index 41d032e3..61d73dff 100644 --- a/i386/i386/debug_i386.c +++ b/i386/i386/debug_i386.c @@ -22,6 +22,7 @@ */ #include <kern/printf.h> +#include <kern/trace.h> #include "thread.h" #include "trap.h" @@ -140,39 +141,23 @@ debug_trace_dump(void) splx(s); } +#endif /* DEBUG */ -#include <kern/syscall_sw.h> - -int syscall_trace = 0; -task_t syscall_trace_task; +#if KERNEL_TRACE -int -syscall_trace_print(int syscallvec, ...) +#ifndef __x86_64__ +#define SYSCALL_NUM_SHIFT 4 +#else +#define SYSCALL_NUM_SHIFT 5 +#endif +long +syscall_trace_enter(long *sysv) { - int syscallnum = syscallvec >> 4; - int i; - const mach_trap_t *trap = &mach_trap_table[syscallnum]; - - if (syscall_trace_task && syscall_trace_task != current_task()) - goto out; - - printf("0x%08x:0x%08x:%s(", - current_task(), current_thread(), trap->mach_trap_name); - for (i = 0; i < trap->mach_trap_arg_count; i++) { - unsigned long value = (&syscallvec)[1+i]; - /* Use a crude heuristic to format pointers. */ - if (value > 1024) - printf("0x%08x", value); - else - printf("%d", value); - - if (i + 1 < trap->mach_trap_arg_count) - printf(", "); - } - printf(")\n"); + long sysnum = sysv[0]; - out: - return syscallvec; + syscall_trace_common(sysnum >> SYSCALL_NUM_SHIFT, sysv + 1); + + return sysnum; } -#endif /* DEBUG */ +#endif /* KERNEL_TRACE */ diff --git a/i386/i386/locore.S b/i386/i386/locore.S index 9d0513a1..26c5843c 100644 --- a/i386/i386/locore.S +++ b/i386/i386/locore.S @@ -634,6 +634,15 @@ ENTRY(thread_bootstrap_return) */ ENTRY(thread_syscall_return) +#if KERNEL_TRACE + testb $0xff,EXT(syscall_trace) + jz 1f + movl S_ARG0,%eax /* get return value */ + pushl %eax + call syscall_trace_return + movl %eax,S_ARG0 /* restore return value */ +1: +#endif movl S_ARG0,%eax /* get return value */ movl %esp,%ecx /* get kernel stack */ or $(KERNEL_STACK_SIZE-1),%ecx @@ -1174,18 +1183,26 @@ syscall_native: mach_call_call: -#ifdef DEBUG +#if KERNEL_TRACE testb $0xff,EXT(syscall_trace) jz 0f - pushl %eax - call EXT(syscall_trace_print) - /* will return with syscallofs still (or again) in eax */ - addl $4,%esp + pushl %eax /* add syscall num to args array */ + pushl %esp /* args array is the first argument*/ + call EXT(syscall_trace_enter) + popl %eax + popl %eax 0: -#endif /* DEBUG */ +#endif /* KERNEL_TRACE */ + call *EXT(mach_trap_table)+4(%eax) /* call procedure */ +#if KERNEL_TRACE + testb $0xff,EXT(syscall_trace) + jz 1f + pushl %eax + call syscall_trace_return + /* will return with return value still (or again) in eax */ +1: +#endif /* KERNEL_TRACE */ - call *EXT(mach_trap_table)+4(%eax) - /* call procedure */ movl %esp,%ecx /* get kernel stack */ or $(KERNEL_STACK_SIZE-1),%ecx movl -3-IKS_SIZE(%ecx),%esp /* switch back to PCB stack */ diff --git a/i386/i386/pcb.c b/i386/i386/pcb.c index e8901550..a3ed7d67 100644 --- a/i386/i386/pcb.c +++ b/i386/i386/pcb.c @@ -37,6 +37,7 @@ #include <kern/counters.h> #include <kern/debug.h> #include <kern/thread.h> +#include <kern/trace.h> #include <kern/sched_prim.h> #include <kern/slab.h> #include <vm/vm_kern.h> @@ -308,6 +309,7 @@ void stack_handoff( /* * Load the rest of the user state for the new thread */ + context_switch_trace(old, NULL, new); switch_ktss(new->pcb); /* @@ -332,6 +334,7 @@ void stack_handoff( */ void load_context(thread_t new) { + context_switch_trace(NULL, NULL, new); switch_ktss(new->pcb); Load_context(new); } @@ -382,6 +385,7 @@ thread_t switch_context( /* * Load the rest of the user state for the new thread */ + context_switch_trace(old, continuation, new); switch_ktss(new->pcb); return Switch_context(old, continuation, new); } diff --git a/i386/i386/trap.c b/i386/i386/trap.c index 555d7498..17f7eb3c 100644 --- a/i386/i386/trap.c +++ b/i386/i386/trap.c @@ -53,6 +53,7 @@ #include <kern/printf.h> #include <kern/thread.h> #include <kern/task.h> +#include <kern/trace.h> #include <kern/sched.h> #include <kern/sched_prim.h> #include <kern/exception.h> @@ -162,6 +163,8 @@ void kernel_trap(struct i386_saved_state *regs) thread_t thread; extern char _start[], etext[]; + trap_trace(TRACE_TRAP_KERNEL, regs); + type = regs->trapno; code = regs->err; thread = current_thread(); @@ -366,6 +369,7 @@ int user_trap(struct i386_saved_state *regs) unsigned long type; thread_t thread = current_thread(); + trap_trace(TRACE_TRAP_USER, regs); #ifdef __x86_64__ assert(regs == &thread->pcb->iss); #endif @@ -592,6 +596,7 @@ int user_trap(struct i386_saved_state *regs) void i386_astintr(void) { + trap_trace(TRACE_TRAP_ASTINTR, &(current_thread()->pcb->iss)); (void) splsched(); /* block interrupts to check reasons */ #ifndef MACH_RING1 int mycpu = cpu_number(); diff --git a/include/mach/gnumach.defs b/include/mach/gnumach.defs index 7ecf74d3..583bdc36 100644 --- a/include/mach/gnumach.defs +++ b/include/mach/gnumach.defs @@ -215,3 +215,36 @@ routine vm_pages_phys( simpleroutine thread_set_name( thread : thread_t; name : kernel_debug_name_t); + +/* + * Start tracing kernel events, using a buffer of BUFSIZE bytes and + * tracing only events set in KINDS. + */ +simpleroutine ktrace_start( + host_priv : host_priv_t; + bufsize : rpc_vm_size_t; + kinds : rpc_long_integer_t); + +/* + * Stop tracing kernel events. + */ +simpleroutine ktrace_stop( + host_priv : host_priv_t); + +/* + * Free the tracing buffer allocated in ktrace_start(). + */ +simpleroutine ktrace_free( + host_priv : host_priv_t); + +/* + * Read the content of the tracing buffer in BUF; additionally we get + * a pointer to the first free byte after the tracing data in LAST_PTR + * and some trace parameters in FLAGS. + */ +routine ktrace_collect( + host_priv : host_priv_t; + out buf : pointer_t; + out last_ptr : vm_offset_t; + out flags : natural_t); + diff --git a/include/mach/trace.h b/include/mach/trace.h new file mode 100644 index 00000000..3fccbbc7 --- /dev/null +++ b/include/mach/trace.h @@ -0,0 +1,61 @@ +/* + * Copyright (C) 2023 Free Software Foundation + * + * This program is free software ; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation ; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY ; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with the program ; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#ifndef _MACH_TRACE_H_ +#define _MACH_TRACE_H_ + +#include <stdint.h> +#include <mach/std_types.h> + +/* trace types as encoded in the ring buffer */ +#define DEBUG_TRACE_INVALID 0 +#define DEBUG_TRACE_SYSCALL 1 +#define DEBUG_TRACE_SAMPLE 2 +#define DEBUG_TRACE_TRAP 3 +#define DEBUG_TRACE_CSW 4 +#define DEBUG_TRACE_IRQ 5 +#define DEBUG_TRACE_RPC 6 +#define DEBUG_TRACE_DUMMY 15 + +/* trap events */ +#define TRACE_TRAP_USER 0 +#define TRACE_TRAP_KERNEL 1 +#define TRACE_TRAP_ASTINTR 2 + +/* trace types bitmask, to be used in trace_start()*/ +#define TRACE_TYPE_SYSCALL 0x01 +#define TRACE_TYPE_TRAP 0x02 +#define TRACE_TYPE_CSW 0x04 +#define TRACE_TYPE_IRQ 0x08 +#define TRACE_TYPE_RPC 0x10 +#define TRACE_TYPE_ALL 0x1F + +/* Trace file header definitions */ +#define TH_MAGIC "GMTR" +#define TH_OVERRUN_MASK 0x0f +#define TH_KERNEL64 0x10 +#define TH_USER64 0x20 + +typedef struct { + char magic[4]; + uint32_t flags; + uint32_t last; + uint32_t ov_first; +} trace_header_t; + +#endif /* _MACH_TRACE_H_ */ diff --git a/ipc/ipc_kmsg.c b/ipc/ipc_kmsg.c index 8bd645ff..8a153386 100644 --- a/ipc/ipc_kmsg.c +++ b/ipc/ipc_kmsg.c @@ -509,7 +509,7 @@ ipc_kmsg_get( ikm_free(kmsg); return MACH_SEND_INVALID_DATA; } - + rpc_trace(0, msg, size); *kmsgp = kmsg; return MACH_MSG_SUCCESS; } @@ -576,8 +576,10 @@ ipc_kmsg_put( if (copyoutmsg(&kmsg->ikm_header, msg, size)) mr = MACH_RCV_INVALID_DATA; - else + else { mr = MACH_MSG_SUCCESS; + rpc_trace(1, msg, msg->msgh_size); + } ikm_cache_free(kmsg); diff --git a/ipc/mach_msg.c b/ipc/mach_msg.c index ff5e5b09..3a4298ff 100644 --- a/ipc/mach_msg.c +++ b/ipc/mach_msg.c @@ -464,6 +464,7 @@ mach_msg_trap( ikm_free(kmsg); goto slow_get; } + rpc_trace(0, msg, send_size); fast_copyin: /* @@ -1194,6 +1195,7 @@ mach_msg_trap( if (!ikm_cache_free_try(kmsg)) goto slow_put; + rpc_trace(1, msg, msg->msgh_size); thread_syscall_return(MACH_MSG_SUCCESS); /*NOTREACHED*/ return MACH_MSG_SUCCESS; /* help for the compiler */ diff --git a/kern/exception.c b/kern/exception.c index 15f29705..0e0260f0 100644 --- a/kern/exception.c +++ b/kern/exception.c @@ -702,6 +702,7 @@ exception_raise( /*NOTREACHED*/ } + rpc_trace(1, receiver->ith_msg, receiver->ith_msg->msgh_size); thread_syscall_return(MACH_MSG_SUCCESS); /*NOTREACHED*/ #ifndef __GNUC__ diff --git a/kern/startup.c b/kern/startup.c index 61751d35..ce8f0123 100644 --- a/kern/startup.c +++ b/kern/startup.c @@ -47,6 +47,7 @@ #include <kern/thread.h> #include <kern/thread_swap.h> #include <kern/timer.h> +#include <kern/trace.h> #include <kern/xpr.h> #include <kern/bootstrap.h> #include <kern/startup.h> @@ -163,6 +164,7 @@ void setup_main(void) compute_mach_factor(); gsync_setup (); + trace_init(); /* * Create a kernel thread to start the other kernel diff --git a/kern/thread.h b/kern/thread.h index 81d32924..5b5d2235 100644 --- a/kern/thread.h +++ b/kern/thread.h @@ -240,6 +240,10 @@ struct thread { #endif char name[THREAD_NAME_SIZE]; + +#if KERNEL_TRACE + unsigned int syscall_num; /* for syscall tracing */ +#endif }; #include <kern/cpu_number.h> diff --git a/kern/trace.c b/kern/trace.c new file mode 100644 index 00000000..69e6eaf0 --- /dev/null +++ b/kern/trace.c @@ -0,0 +1,432 @@ +/* + * Copyright (C) 2024 Free Software Foundation + * + * This program is free software ; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation ; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY ; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with the program ; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#include "kern/thread.h" +#include <kern/gnumach.server.h> +#include <kern/kalloc.h> +#include <kern/printf.h> +#include <kern/syscall_sw.h> +#include <kern/trace.h> +#include <mach/message.h> +#include <mach/trace.h> +#include <machine/locore.h> +#include <string.h> +#include "vm/vm_kern.h" + +/* Common information for each trace entry */ +struct msg_trace_header { + char task_name[TASK_NAME_SIZE]; + thread_t thread; + //int time; // to address thread/task reuse? we should trace also task creation/destruction... + unsigned int type : 8; + unsigned int size : 24; // in bytes, can probably be reduced + uint8_t data[]; +} __attribute__((packed)); + +struct syscall_trace_data { + unsigned int num : 7; // syscall id + unsigned int direction : 1; // entry/exit + unsigned int reserved : 24; + uint8_t data[]; // args/retcode, depending on the syscall number and direction +} __attribute__((packed)); + +struct rpc_trace_data { + unsigned int size : 16; // in bytes + unsigned int direction : 1; // send-copyin/recv-copyout + unsigned int reserved : 15; + uint8_t data[]; +} __attribute__((packed)); + +struct trap_trace_data { + unsigned int origin : 4; // user/kernel + unsigned int reserved : 28; + struct i386_saved_state regs; +} __attribute__((packed)); + +struct csw_trace_data { + char old_task_name[TASK_NAME_SIZE]; + thread_t old_thread; + continuation_t continuation; + char new_task_name[TASK_NAME_SIZE]; + thread_t new_thread; +} __attribute__((packed)); + +struct irq_trace_data { + int irqn; +} __attribute__((packed)); + +struct sample_trace_data { + unsigned int linenum; + char filename[]; +} __attribute__((packed)); + +static uint8_t *msg_trace_buf = NULL; +static vm_size_t msg_trace_size = 0; +static uint8_t *msg_trace_ptr = NULL; +static int msg_trace_overrun = 0; + +int syscall_trace; +static int trace_rpc_enable; +static int trace_trap_enable; +static int trace_csw_enable; +int interrupt_trace_enable; + +task_t syscall_trace_task = 0; +unsigned int syscall_seqn = 0; + +/* Called to initialize kernel tracing */ +void trace_init(void) +{ + msg_trace_buf = NULL; + msg_trace_ptr = NULL; + msg_trace_size = 0; + msg_trace_overrun = 0; + + syscall_trace = 0; + trace_rpc_enable = 0; + trace_trap_enable = 0; + trace_csw_enable = 0; + interrupt_trace_enable = 0; + + /* Useful for debugging small tests and the bootstrap modules */ + extern char* kernel_cmdline; // TODO: find a nice header + if (strstr(kernel_cmdline, "tracestart") != NULL) + { + kern_return_t ret = ktrace_start(&realhost, 20*1024*1024, TRACE_TYPE_ALL); + if (ret == KERN_SUCCESS) + printf("trace: started from command line\n"); + else + panic("Can't initialize trace buffer: error %d\n", ret); + } +} + +kern_return_t ktrace_start(const host_t host_priv, rpc_vm_size_t bufsize, + rpc_long_integer_t kinds) +{ + if (host_priv == HOST_NULL) + return KERN_INVALID_ARGUMENT; + + if (msg_trace_buf != NULL) + return KERN_FAILURE; + + int kr = kmem_alloc(kernel_map, (vm_offset_t*) &msg_trace_buf, bufsize); + if (kr != KERN_SUCCESS) + return KERN_RESOURCE_SHORTAGE; + + if (msg_trace_buf == 0) + return KERN_RESOURCE_SHORTAGE; + + msg_trace_ptr = msg_trace_buf; + msg_trace_size = bufsize; + msg_trace_overrun = 0; + + if (kinds & TRACE_TYPE_SYSCALL) + syscall_trace = 1; + if (kinds & TRACE_TYPE_RPC) + trace_rpc_enable = 1; + if (kinds & TRACE_TYPE_TRAP) + trace_trap_enable = 1; + if (kinds & TRACE_TYPE_CSW) + trace_csw_enable = 1; + if (kinds & TRACE_TYPE_IRQ) + interrupt_trace_enable = 1; + + if (current_thread()) + current_thread()->syscall_num = 25; // fake mach_msg() entry for better trace + + return KERN_SUCCESS; +} + +kern_return_t ktrace_stop(const host_t host_priv) +{ + if (host_priv == HOST_NULL) + return KERN_INVALID_ARGUMENT; + + if (msg_trace_buf == NULL) + return KERN_FAILURE; + + syscall_trace = 0; + trace_rpc_enable = 0; + trace_trap_enable = 0; + trace_csw_enable = 0; + interrupt_trace_enable = 0; + return KERN_SUCCESS; +} + +kern_return_t ktrace_free(const host_t host_priv) +{ + if (host_priv == HOST_NULL) + return KERN_INVALID_ARGUMENT; + + if (msg_trace_buf == NULL) + return KERN_FAILURE; + + kmem_free(kernel_map, (vm_offset_t) msg_trace_buf, msg_trace_size); + msg_trace_buf = NULL; + msg_trace_ptr = NULL; + return KERN_SUCCESS; +} + +kern_return_t ktrace_collect(const host_t host_priv, pointer_t *buf, + mach_msg_type_number_t *bufsize, + vm_size_t *last_ptr, natural_t *flags) +{ + if (host_priv == HOST_NULL) + return KERN_INVALID_ARGUMENT; + + if (msg_trace_buf == NULL) + return KERN_FAILURE; + + kern_return_t err; + vm_map_copy_t copy; + err = vm_map_copyin(kernel_map, (vm_offset_t) msg_trace_buf, msg_trace_size, + FALSE, ©); + if (err == KERN_SUCCESS) + { + *buf = (pointer_t) copy; + *bufsize = msg_trace_size; + *last_ptr = msg_trace_ptr - msg_trace_buf; + *flags = 0; + if (msg_trace_overrun > 0) + { + if (msg_trace_overrun < TH_OVERRUN_MASK) + *flags |= msg_trace_overrun & TH_OVERRUN_MASK; + else + *flags |= TH_OVERRUN_MASK; + } +#ifdef __LP64__ + *flags |= TH_KERNEL64; +#ifndef USER32 + *flags |= TH_USER64; +#endif /* USER32 */ +#endif /* __x86_64__ */ + } + return err; +} + +/* + * Starts a new trace entry. It pre-allocates a given size and returns a pointer + * to a region that can be filled with the same amount of data. + * In this way we can copy the data only once, e.g. when tracing mach_msg() + */ +static void* trace_msg(unsigned int type, size_t size) +{ + const size_t msg_size = size + sizeof(struct msg_trace_header); + int overrun = 0; + uint8_t *tptr, *tptr_orig; + struct msg_trace_header *cur; + + /* + * Handle interrupitons from trap/irq in reserving the trace data + * entry. + * + * TODO: ensure this is SMP-safe and maybe add cpuid to the trace + * header. + */ + do { + tptr = __atomic_load_n(&msg_trace_ptr, __ATOMIC_RELAXED); + tptr_orig = tptr; + + if ((tptr - msg_trace_buf) > (msg_trace_size - msg_size)) + { + // circular buffer, reset to the beginning + tptr = msg_trace_buf; + overrun = 1; + } + + tptr += msg_size; + + } while (! __atomic_compare_exchange_n(&msg_trace_ptr, &tptr_orig, tptr, 0, + __ATOMIC_RELAXED, __ATOMIC_RELAXED)); + + if (overrun) + msg_trace_overrun += 1; + + cur = (struct msg_trace_header*)(tptr - msg_size); + memcpy(cur->task_name, current_task()->name, TASK_NAME_SIZE); + cur->thread = current_thread(); + cur->type = type; + cur->size = msg_size; + return cur->data; +} + +#ifdef SYSCALL_TRACE_RECORD +static void trace_syscall_entry(int num, long *args) +{ + const mach_trap_t *trap = &mach_trap_table[num]; + size_t argsize = sizeof(long)*trap->mach_trap_arg_count; + size_t size = sizeof(struct syscall_trace_data) + argsize; + struct syscall_trace_data *data = trace_msg(DEBUG_TRACE_SYSCALL, size); + data->direction = 0; + data->num = num; + memcpy(data->data, args, argsize); +} + +static void trace_syscall_exit(int num, long ret) +{ + size_t argsize = sizeof(long); + size_t size = sizeof(struct syscall_trace_data) + argsize; + struct syscall_trace_data *data = trace_msg(DEBUG_TRACE_SYSCALL, size); + data->direction = 1; + data->num = num; + memcpy(data->data, &ret, argsize); +} +#endif /* SYSCALL_TRACE_RECORD */ + +/* This needs to be called from the machine-specific entry point */ +void syscall_trace_common(long sysnum, long argv[MAX_SYSCALL_ARGS]) +{ + + if (syscall_trace == 0) + goto out; + + current_thread()->syscall_num = sysnum; + if (syscall_trace_task && syscall_trace_task != current_task()) + goto out; + +#ifdef SYSCALL_TRACE_PRINT + if (sysnum == 30) // don't track mach_print() + goto out; + + syscall_seqn++; + const mach_trap_t *trap = &mach_trap_table[sysnum]; + printf("%08d:%s:0x%p:%s(", + syscall_seqn, current_task()->name, current_thread(), trap->mach_trap_name); + for (int i=0; i<MAX_SYSCALL_ARGS; i++) + { + int value = argv[i]; + if (trap->mach_trap_arg_count <= i) + continue; + if ((value > 1024) || (value < -1024)) + printf("0x%08x", value); + else + printf("%d", value); + if (i + 1 < trap->mach_trap_arg_count) + printf(", "); + } + printf(")\n"); +#else /* SYSCALL_TRACE_PRINT */ + trace_syscall_entry(sysnum, argv); +#endif + out: + return; +} + +/* This needs to be called from the machine-specific entry point */ +int syscall_trace_return(int val) +{ + if (syscall_trace == 0) + return val; + if (syscall_trace_task && syscall_trace_task != current_task()) + return val; +#ifdef SYSCALL_TRACE_PRINT + if (current_thread()->syscall_num != 30) // don't track mach_print() + { + const mach_trap_t *trap = &mach_trap_table[current_thread()->syscall_num]; + printf("%08d:%s:0x%p: %s return(%08x)\n", + syscall_seqn, current_task()->name, current_thread(), + trap->mach_trap_name, val); + } +#else /* SYSCALL_TRACE_PRINT */ + trace_syscall_exit(current_thread()->syscall_num, val); +#endif + + return val; +} + +void rpc_trace(int direction, void *msg, long msize) +{ + if (!trace_rpc_enable) + return; + + size_t size = sizeof(struct rpc_trace_data); + struct rpc_trace_data *data = trace_msg(DEBUG_TRACE_RPC, size + msize); + data->direction = direction; + data->size = msize; + if (copyin(msg, data->data, msize)) + memset(data->data, 0, msize); +} + +/* This needs to be called from the machine-specific entry point */ +void trap_trace(int origin, struct i386_saved_state *regs) +{ + if (!trace_trap_enable) + return; + + size_t size = sizeof(struct trap_trace_data); + struct trap_trace_data *data = trace_msg(DEBUG_TRACE_TRAP, size); + data->origin = origin; + memcpy(&data->regs, regs, sizeof(*regs)); +} + +void context_switch_trace(thread_t old, continuation_t continuation, thread_t new) +{ + if (!trace_csw_enable) + return; + + size_t size = sizeof(struct csw_trace_data); + struct csw_trace_data *data = trace_msg(DEBUG_TRACE_CSW, size); + if (old) + strncpy(data->old_task_name, old->task->name, TASK_NAME_SIZE); + else + data->old_task_name[0] = '\0'; + data->old_thread = old; + data->continuation = continuation; + strncpy(data->new_task_name, new->task->name, TASK_NAME_SIZE); + data->new_thread = new; +} + +void interrupt_trace(int iunit, /* 'unit' number */ + int old_ipl, /* old interrupt level */ + const char *ret_addr, /* return address in interrupt handler */ + struct i386_interrupt_state *regs, /* saved registers */ + int irqn) +{ + size_t size = sizeof(struct irq_trace_data); + struct irq_trace_data *data = trace_msg(DEBUG_TRACE_IRQ, size); + data->irqn = irqn; +} + +/* This trace type is useful during debug to create on-the-fly custom + traces, e.g. to follow the evolution of a specific value or + expression along other trace points */ +void dummy_trace(void) +{ + struct dummy_data { + int nthreads; + }; + size_t size = sizeof(struct dummy_data); + struct dummy_data *data = trace_msg(DEBUG_TRACE_DUMMY, size); + + // as an example, we count the threads + task_t task; + thread_t thread; + processor_set_t pset; + data->nthreads = 0; + if (queue_first(&all_psets) == 0) + return; + queue_iterate(&all_psets, pset, processor_set_t, all_psets) { + if (queue_first(&pset->tasks) == 0) + continue; + queue_iterate(&pset->tasks, task, task_t, pset_tasks) { + queue_iterate(&task->thread_list, thread, thread_t, thread_list) { + data->nthreads++; + } + } + } +} diff --git a/kern/trace.h b/kern/trace.h new file mode 100644 index 00000000..06d77e93 --- /dev/null +++ b/kern/trace.h @@ -0,0 +1,42 @@ +/* + * Copyright (C) 2023 Free Software Foundation + * + * This program is free software ; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation ; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY ; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with the program ; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#if KERNEL_TRACE + +#include <mach/trace.h> +#include <kern/thread.h> + +/* syscall events */ +#define MAX_SYSCALL_ARGS 12 +//#define SYSCALL_TRACE_PRINT // useful for small tests +#define SYSCALL_TRACE_RECORD + +void trace_init(void); +void rpc_trace(int direction, void *msg, long size); +void trap_trace(int origin, struct i386_saved_state *regs); +void syscall_trace_common(long sysnum, long argv[MAX_SYSCALL_ARGS]); +int syscall_trace_return(int val); +void context_switch_trace(thread_t old, continuation_t continuation, thread_t new); + +#else /* ! KERNEL_TRACE */ + +#define trace_init() +#define trap_trace(origin, regs) +#define context_switch_trace(old, cont, new) + +#endif /* KERNEL_TRACE */ diff --git a/scripts/gnumach-gdb.py b/scripts/gnumach-gdb.py new file mode 100644 index 00000000..16728c06 --- /dev/null +++ b/scripts/gnumach-gdb.py @@ -0,0 +1,63 @@ + +import logging + +import gdb + + +log = logging.getLogger("gnumach-gdb") +on64bit = gdb.selected_inferior().architecture().name() == "i386:x86-64" +on64bituser = gdb.lookup_symbol("syscall64")[0] is not None + + +class TraceCmd(gdb.Command): + """gnumach-trace [FILE] + +Save the content of the kernel trace buffer to a file (default ./trace.bin). +""" + + def __init__(self): + super().__init__("gnumach-trace", gdb.COMMAND_USER) + + def invoke(self, arg_string, from_tty): + try: + self._invoke(arg_string, from_tty) + except Exception: + log.exception(arg_string) + + def _invoke(self, arg_string, from_tty): + args = gdb.string_to_argv(arg_string) + if args in ['-h', '--help']: + print(TraceCmd.__doc__) + return + outfile = "trace.bin" + if len(args) >= 1: + outfile = args[0] + + tbuf = gdb.lookup_static_symbol("msg_trace_buf") + if tbuf is None: + log.error("trace buffer not available.") + return + tbuf_base = gdb.lookup_static_symbol("msg_trace_buf").value() + tbuf_top = gdb.lookup_static_symbol("msg_trace_ptr").value() + tbuf_overrun = gdb.lookup_static_symbol("msg_trace_overrun").value() + tbuf_len = tbuf_top - tbuf_base + + print(f"Writing {tbuf_len} bytes to {outfile} (overrun {tbuf_overrun})") + proc = gdb.selected_inferior() + with open(outfile, 'wb') as tf: + # trace header (see mach/trace.h) + tf.write(b"GMTR") + flags = int(tbuf_overrun & 0xF) + if on64bit: + flags |= 0x10 + if on64bituser: + flags |= 0x20 + tf.write(flags.to_bytes(4, byteorder='little')) + tf.write(int(tbuf_len).to_bytes(4, byteorder='little')) + tf.write(int(0).to_bytes(4, byteorder='little')) + # trace payload + tf.write(proc.read_memory(tbuf_base, tbuf_len)) + + +logging.basicConfig(level=logging.INFO) +TraceCmd() diff --git a/tests/test-trace.c b/tests/test-trace.c new file mode 100644 index 00000000..d6bc8051 --- /dev/null +++ b/tests/test-trace.c @@ -0,0 +1,58 @@ +/* + * Copyright (C) 2024 Free Software Foundation + * + * This program is free software ; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation ; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY ; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with the program ; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#include <syscalls.h> +#include <testlib.h> + +#include <mach/trace.h> +#include <gnumach.user.h> +#include <mach.user.h> + +void test_trace(void) +{ + int err; + err = ktrace_start(host_priv(), 40 * 1024 * 1024, TRACE_TYPE_ALL); + ASSERT_RET(err, "ktrace_start"); + + err = mach_msg(NULL, MACH_MSG_OPTION_NONE, 0, 0, + MACH_PORT_NULL, 0, MACH_PORT_NULL); + ASSERT_RET(err, "syscall path with thread_syscall_return()"); + + err = ktrace_stop(host_priv()); + ASSERT_RET(err, "ktrace_stop"); + + vm_offset_t buf, last_ptr; + mach_msg_type_number_t bufsize; + int flags; + err = ktrace_collect(host_priv(), &buf, &bufsize, &last_ptr, &flags); + ASSERT_RET(err, "ktrace_collect"); + printf("size %lu last %lu flags %x\n", bufsize, last_ptr, flags); + ASSERT(bufsize >= last_ptr, "ERROR overrun?"); + + err = vm_deallocate(mach_task_self(), buf, bufsize); + ASSERT_RET(err, "vm_deallocate"); + + err = ktrace_free(host_priv()); + ASSERT_RET(err, "ktrace_free"); +} + +int main(int argc, char *argv[], int envc, char *envp[]) +{ + test_trace(); + return 0; +} diff --git a/tests/user-qemu.mk b/tests/user-qemu.mk index fd5ae1ab..c4df73f9 100644 --- a/tests/user-qemu.mk +++ b/tests/user-qemu.mk @@ -204,7 +204,8 @@ USER_TESTS := \ tests/test-syscalls \ tests/test-machmsg \ tests/test-task \ - tests/test-threads + tests/test-threads \ + tests/test-trace USER_TESTS_CLEAN = $(subst tests/,clean-,$(USER_TESTS)) diff --git a/x86_64/debug_trace.S b/x86_64/debug_trace.S index 7bed5ccc..97e48063 100644 --- a/x86_64/debug_trace.S +++ b/x86_64/debug_trace.S @@ -30,19 +30,18 @@ .text ENTRY(_debug_trace) + ud2 // to test, do we need to keep it in asm? pushf cli pushq %rax pushq %rbx - .byte 0x36 /* SS: bug in gas? */ movl %ss:EXT(debug_trace_pos),%eax movq S_ARG0,%rbx - movq %rbx,%ss:EXT(debug_trace_buf)(,%eax,16) - movl S_ARG1,%ebx - movl %ebx,%ss:EXT(debug_trace_buf)+8(,%eax,16) + movq %rbx,%ss:EXT(debug_trace_buf)(,%eax,8) + movq S_ARG1,%rbx + movq %rbx,%ss:EXT(debug_trace_buf)+8(,%eax,8) incl %eax andl $DEBUG_TRACE_LEN-1,%eax - .byte 0x36 /* SS: bug in gas? */ movl %eax,%ss:EXT(debug_trace_pos) popq %rbx popq %rax diff --git a/x86_64/interrupt.S b/x86_64/interrupt.S index 6fb77727..72a2fd0c 100644 --- a/x86_64/interrupt.S +++ b/x86_64/interrupt.S @@ -115,6 +115,25 @@ ENTRY(interrupt) shll $2,%eax /* irq * 4 */ movl EXT(iunit)(%rax),%edi /* get device unit number as 1st arg */ +#if KERNEL_TRACE + testb $0xff,EXT(interrupt_trace_enable) + jz 0f + movq S_IRQ,S_ARG4 + pushq %rax + // save args for the real handler + pushq S_ARG0 + pushq S_ARG1 + pushq S_ARG2 + pushq S_ARG3 + callq EXT(interrupt_trace) + popq S_ARG3 + popq S_ARG2 + popq S_ARG1 + popq S_ARG0 + popq %rax +0: +#endif /* KERNEL_TRACE */ + shll $1,%eax /* irq * 8 */ call *EXT(ivect)(%rax) /* call interrupt handler */ diff --git a/x86_64/locore.S b/x86_64/locore.S index 8f39a677..9e9e5833 100644 --- a/x86_64/locore.S +++ b/x86_64/locore.S @@ -747,6 +747,15 @@ ENTRY(thread_bootstrap_return) */ ENTRY(thread_syscall_return) +#if KERNEL_TRACE + testb $0xff,EXT(syscall_trace) + jz 1f + /* first arg is already set in rdi, save it */ + pushq %rdi + callq syscall_trace_return + popq %rdi +1: +#endif movq S_ARG0,%rax /* get return value */ movq %rsp,%rcx /* get kernel stack */ or $(KERNEL_STACK_SIZE-1),%rcx @@ -1275,16 +1284,41 @@ syscall_native: jnz 0b /* loop for all arguments */ mach_call_call: - -#ifdef DEBUG +#if KERNEL_TRACE testb $0xff,EXT(syscall_trace) jz 0f - movq %rax,%rdi - call EXT(syscall_trace_print) - /* will return with syscallofs still (or again) in eax */ + /* + * We need to save register arguments, for the actual tracing + * and to restore them before calling the real syscall. + */ + pushq %r9 + pushq %r8 + pushq %rcx + pushq %rdx + pushq %rsi + pushq %rdi + pushq %rax + movq %rsp,%rdi + callq EXT(syscall_trace_enter) + popq %rax + popq %rdi + popq %rsi + popq %rdx + popq %rcx + popq %r8 + popq %r9 + movq %rax,%r12 /* save sysnum for tracing return value */ 0: -#endif /* DEBUG */ +#endif /* KERNEL_TRACE */ call *EXT(mach_trap_table)+8(%rax) /* call procedure */ +#if KERNEL_TRACE + testb $0xff,EXT(syscall_trace) + jz 1f + movq %rax,%rdi + callq syscall_trace_return + /* will return with return value still (or again) in eax */ +1: +#endif /* KERNEL_TRACE */ movq %rsp,%rcx /* get kernel stack */ or $(KERNEL_STACK_SIZE-1),%rcx movq -7-IKS_SIZE(%rcx),%rsp /* switch back to PCB stack */ @@ -1460,7 +1494,45 @@ _syscall64_args_stack: jnz 0b /* loop for all remaining arguments */ _syscall64_call: +#if KERNEL_TRACE + testb $0xff,EXT(syscall_trace) + jz 0f + /* + * We need to save register arguments, for the actual tracing + * and to restore them before calling the real syscall. + * We store all arguments and the syscall number in an array + * on the stack; fortunately args > 6 are already there, so + * push the rest and restore them before actually calling the + * handler. + */ + pushq %r9 + pushq %r8 + pushq %rcx + pushq %rdx + pushq %rsi + pushq %rdi + pushq %rax + movq %rsp,%rdi + callq EXT(syscall_trace_enter) + popq %rax + popq %rdi + popq %rsi + popq %rdx + popq %rcx + popq %r8 + popq %r9 + movq %rax,%r12 /* save sysnum for tracing return value */ +0: +#endif /* KERNEL_TRACE */ call *EXT(mach_trap_table)+8(%rax) /* call procedure */ +#if KERNEL_TRACE + testb $0xff,EXT(syscall_trace) + jz 1f + movq %rax,%rdi + callq syscall_trace_return + /* will return with return value still (or again) in eax */ +1: +#endif /* KERNEL_TRACE */ _syscall64_check_for_ast: /* Check for ast. */ -- 2.39.2