Re: [HACKERS] Dynamic instrumentation of lwlock wait times (lwlock flamegraphs)

Andres Freund Wed, 12 Aug 2020 17:43:23 -0700

Hi,

On 2017-06-22 14:08:45 -0700, Andres Freund wrote:
> At pgcon some people were talking about the difficulty of instrumenting
> the time actually spent waiting for lwlocks and related measurements.
> I'd mentioned that linux these days provides infrastructure to measure
> such things in unmodified binaries.
> 
> Attached is a prototype of a script that measures the time spent inside
> PGSemaphoreLock(), aggregates that inside the kernel, grouped by pid and
> stacktrace.  That allows one to generate nice flame graphs showing which
> part of the code waits how long for lwlocks.
> 
> The attached script clearly needs improvements, but I thought I'd show
> some of the results it can get.  To run it you need the the python
> library of the 'bcc' project [1], and a sufficiently new kernel.  Some
> distributions, e.g. newer debian versions, package this as python-bpfcc
> and similar.
> 
> The output of the script can be turned into a flamegraph with
> https://github.com/brendangregg/FlameGraph 's flamegraph.pl.


The script has bitrot slightly, due to python3 and postgres changes (the
move to posix semaphores). Updated version attached.

Based on the discussion in
https://www.postgresql.org/message-id/20200813003934.yrm4qqngfygr6ii7%40alap3.anarazel.de

Greetings,

Andres Freund

#!/usr/bin/python

from __future__ import print_function
from bcc import BPF
from time import sleep
import argparse
import signal

def positive_int(val):
    try:
        ival = int(val)
    except ValueError:
        raise argparse.ArgumentTypeError("must be an integer")

    if ival < 0:
        raise argparse.ArgumentTypeError("must be positive")
    return ival

def positive_nonzero_int(val):
    ival = positive_int(val)
    if ival == 0:
        raise argparse.ArgumentTypeError("must be nonzero")
    return ival

bpf_text = """
#include <uapi/linux/ptrace.h>
#include <linux/sched.h>

struct stats_key_t {
    u32 pid;
    int user_stack_id;
};

struct stats_value_t {
    u64 total_time;
};

struct start_key_t {
    u32 pid;
};

struct start_value_t {
    u64 last_start;
};

// map_type, key_type, leaf_type, table_name, num_entry
BPF_HASH(stats, struct stats_key_t, struct stats_value_t);
BPF_HASH(start, struct start_key_t, struct start_value_t);

BPF_STACK_TRACE(stack_traces, STACK_STORAGE_SIZE);

int trace_sem_entry(struct pt_regs *ctx)
{
    u32 pid = bpf_get_current_pid_tgid();
    struct start_key_t start_key = {};
    struct start_value_t start_value = {};

    if (!(THREAD_FILTER)) {
        return 0;
    }

    start_key.pid = pid;
    start_value.last_start = bpf_ktime_get_ns();

    start.update(&start_key, &start_value);

    return 0;
}

int trace_sem_return(struct pt_regs *ctx)
{
    u32 pid = bpf_get_current_pid_tgid();
    struct stats_key_t stats_key = {};
    struct start_key_t start_key = {};
    struct stats_value_t zero = {};
    struct stats_value_t *stats_value;
    struct start_value_t *start_value;
    u64 delta;

    if (!(THREAD_FILTER)) {
        return 0;
    }

    start_key.pid = pid;
    start_value = start.lookup(&start_key);

    if (!start_value)
        return 0; /* missed start */;

    delta = bpf_ktime_get_ns() - start_value->last_start;
    start.delete(&start_key);

    stats_key.pid = pid;
    stats_key.user_stack_id = stack_traces.get_stackid(ctx, BPF_F_REUSE_STACKID | BPF_F_USER_STACK);

    stats_value = stats.lookup_or_init(&stats_key, &zero);
    stats_value->total_time += delta;

    return 0;
}

"""

examples = """examples:
    ./pgsemwait.py -x BINARY        # trace postgres lwlock wait time until Ctrl-C
    ./pgsemwait.py -x BINARY 5      # trace for 5 seconds only
    ./pgsemwait.py -x BINARY -p PID # trace PID
"""

parser = argparse.ArgumentParser(
    description="Measure Postgres LWLock Wait Time",
    formatter_class=argparse.RawDescriptionHelpFormatter,
    epilog=examples)
parser.add_argument("-x", "--binary", metavar="BINARY", dest="binary", required = True,
    help="path to postgres binary")
parser.add_argument("-p", "--pid", metavar="PID", dest="pid",
    help="trace this PID only", type=positive_int)
parser.add_argument("-f", "--folded", action="store_true",
    help="output folded format")
parser.add_argument("duration", nargs="?", default=99999999,
    type=positive_nonzero_int,
    help="duration of trace, in seconds")
parser.add_argument("--stack-storage-size", default=1024,
    type=positive_nonzero_int,
    help="the number of unique stack traces that can be stored and "
         "displayed (default 1024)")

args = parser.parse_args()

folded = args.folded
duration = int(args.duration)

# set stack storage size
bpf_text = bpf_text.replace('STACK_STORAGE_SIZE', str(args.stack_storage_size))

# setup pid filter
thread_filter = '1'
if args.pid is not None:
    thread_filter = 'pid == %d' % args.pid
bpf_text = bpf_text.replace('THREAD_FILTER', thread_filter)

binary = args.binary

b = BPF(text=bpf_text)

libpath = BPF.find_exe(binary)
if not libpath:
    bail("can't resolve library %s" % library)
library = libpath

b.attach_uprobe(name=library, sym_re='PGSemaphoreLock|futex',
    fn_name="trace_sem_entry",
    pid = -1)

b.attach_uretprobe(name=library, sym_re='PGSemaphoreLock|futex',
    fn_name="trace_sem_return",
    pid = -1)

matched = b.num_open_uprobes()
if matched == 0:
    print("error: 0 functions traced. Exiting.", file=stderr)
    exit(1)

# signal handler
def signal_ignore(signal, frame):
    print()

signal.signal(signal.SIGINT, signal_ignore)

sleep(duration)

stats = b.get_table("stats")
stack_traces = b.get_table("stack_traces")

for k, v in stats.items():
    user_stack = [] if k.user_stack_id < 0 else \
        stack_traces.walk(k.user_stack_id)
    name = 'postgres'
    if v.total_time == 0:
        continue
    if folded:
        # print folded stack output
        user_stack = list(user_stack)
        line = [name] + \
            [b.sym(addr, k.pid).decode() for addr in reversed(user_stack)]
        print("%s %d" % (";".join(line), v.total_time))
    else:
        # print default multi-line stack output
        for addr in user_stack:
            print("    %s" % b.sym(addr, k.pid).decode())
        print("    %-16s %s (%d)" % ("-", name, k.pid))
        print("        %d\n" % v.total_time)

Re: [HACKERS] Dynamic instrumentation of lwlock wait times (lwlock flamegraphs)

Reply via email to