Hi, On 2017-06-22 14:08:45 -0700, Andres Freund wrote: > At pgcon some people were talking about the difficulty of instrumenting > the time actually spent waiting for lwlocks and related measurements. > I'd mentioned that linux these days provides infrastructure to measure > such things in unmodified binaries. > > Attached is a prototype of a script that measures the time spent inside > PGSemaphoreLock(), aggregates that inside the kernel, grouped by pid and > stacktrace. That allows one to generate nice flame graphs showing which > part of the code waits how long for lwlocks. > > The attached script clearly needs improvements, but I thought I'd show > some of the results it can get. To run it you need the the python > library of the 'bcc' project [1], and a sufficiently new kernel. Some > distributions, e.g. newer debian versions, package this as python-bpfcc > and similar. > > The output of the script can be turned into a flamegraph with > https://github.com/brendangregg/FlameGraph 's flamegraph.pl.
The script has bitrot slightly, due to python3 and postgres changes (the move to posix semaphores). Updated version attached. Based on the discussion in https://www.postgresql.org/message-id/20200813003934.yrm4qqngfygr6ii7%40alap3.anarazel.de Greetings, Andres Freund
#!/usr/bin/python from __future__ import print_function from bcc import BPF from time import sleep import argparse import signal def positive_int(val): try: ival = int(val) except ValueError: raise argparse.ArgumentTypeError("must be an integer") if ival < 0: raise argparse.ArgumentTypeError("must be positive") return ival def positive_nonzero_int(val): ival = positive_int(val) if ival == 0: raise argparse.ArgumentTypeError("must be nonzero") return ival bpf_text = """ #include <uapi/linux/ptrace.h> #include <linux/sched.h> struct stats_key_t { u32 pid; int user_stack_id; }; struct stats_value_t { u64 total_time; }; struct start_key_t { u32 pid; }; struct start_value_t { u64 last_start; }; // map_type, key_type, leaf_type, table_name, num_entry BPF_HASH(stats, struct stats_key_t, struct stats_value_t); BPF_HASH(start, struct start_key_t, struct start_value_t); BPF_STACK_TRACE(stack_traces, STACK_STORAGE_SIZE); int trace_sem_entry(struct pt_regs *ctx) { u32 pid = bpf_get_current_pid_tgid(); struct start_key_t start_key = {}; struct start_value_t start_value = {}; if (!(THREAD_FILTER)) { return 0; } start_key.pid = pid; start_value.last_start = bpf_ktime_get_ns(); start.update(&start_key, &start_value); return 0; } int trace_sem_return(struct pt_regs *ctx) { u32 pid = bpf_get_current_pid_tgid(); struct stats_key_t stats_key = {}; struct start_key_t start_key = {}; struct stats_value_t zero = {}; struct stats_value_t *stats_value; struct start_value_t *start_value; u64 delta; if (!(THREAD_FILTER)) { return 0; } start_key.pid = pid; start_value = start.lookup(&start_key); if (!start_value) return 0; /* missed start */; delta = bpf_ktime_get_ns() - start_value->last_start; start.delete(&start_key); stats_key.pid = pid; stats_key.user_stack_id = stack_traces.get_stackid(ctx, BPF_F_REUSE_STACKID | BPF_F_USER_STACK); stats_value = stats.lookup_or_init(&stats_key, &zero); stats_value->total_time += delta; return 0; } """ examples = """examples: ./pgsemwait.py -x BINARY # trace postgres lwlock wait time until Ctrl-C ./pgsemwait.py -x BINARY 5 # trace for 5 seconds only ./pgsemwait.py -x BINARY -p PID # trace PID """ parser = argparse.ArgumentParser( description="Measure Postgres LWLock Wait Time", formatter_class=argparse.RawDescriptionHelpFormatter, epilog=examples) parser.add_argument("-x", "--binary", metavar="BINARY", dest="binary", required = True, help="path to postgres binary") parser.add_argument("-p", "--pid", metavar="PID", dest="pid", help="trace this PID only", type=positive_int) parser.add_argument("-f", "--folded", action="store_true", help="output folded format") parser.add_argument("duration", nargs="?", default=99999999, type=positive_nonzero_int, help="duration of trace, in seconds") parser.add_argument("--stack-storage-size", default=1024, type=positive_nonzero_int, help="the number of unique stack traces that can be stored and " "displayed (default 1024)") args = parser.parse_args() folded = args.folded duration = int(args.duration) # set stack storage size bpf_text = bpf_text.replace('STACK_STORAGE_SIZE', str(args.stack_storage_size)) # setup pid filter thread_filter = '1' if args.pid is not None: thread_filter = 'pid == %d' % args.pid bpf_text = bpf_text.replace('THREAD_FILTER', thread_filter) binary = args.binary b = BPF(text=bpf_text) libpath = BPF.find_exe(binary) if not libpath: bail("can't resolve library %s" % library) library = libpath b.attach_uprobe(name=library, sym_re='PGSemaphoreLock|futex', fn_name="trace_sem_entry", pid = -1) b.attach_uretprobe(name=library, sym_re='PGSemaphoreLock|futex', fn_name="trace_sem_return", pid = -1) matched = b.num_open_uprobes() if matched == 0: print("error: 0 functions traced. Exiting.", file=stderr) exit(1) # signal handler def signal_ignore(signal, frame): print() signal.signal(signal.SIGINT, signal_ignore) sleep(duration) stats = b.get_table("stats") stack_traces = b.get_table("stack_traces") for k, v in stats.items(): user_stack = [] if k.user_stack_id < 0 else \ stack_traces.walk(k.user_stack_id) name = 'postgres' if v.total_time == 0: continue if folded: # print folded stack output user_stack = list(user_stack) line = [name] + \ [b.sym(addr, k.pid).decode() for addr in reversed(user_stack)] print("%s %d" % (";".join(line), v.total_time)) else: # print default multi-line stack output for addr in user_stack: print(" %s" % b.sym(addr, k.pid).decode()) print(" %-16s %s (%d)" % ("-", name, k.pid)) print(" %d\n" % v.total_time)