Hi, The attached patch registers a signal handler for SIGSEGV and launches GDB in batch mode on its own pid so that the stack leading to the SEGV can be dumped in the server logs. Also attached is an example of the stack dumped by gdb in server log file (caused by a `kill -segv nnn` on the backend).
Since this patch calls fork() inside a signal handler, I investigated a bit and found that, per POSIX, fork() is async-signal-safe and hence it can be called inside a handler. This in itself might not be very useful because I haven't seen many crash reports in the community, but it can be extended to dump the stack on Assert so that it helps developers and our beta testers. It can also be used to dump the stack of a process we are about to kill for deadlock reasons, and before certain PANIC conditions too. Right now it works only for gdb (setting the GUC to true actually checks for the presence of gdb), but it can be made generic, the way our archive_command etc. work, so that we take a string and replace certain parameters with binary path and pid so that any debugger can be used. It also looks pretty easy to port it to Windows since all we really want to do is create an external process with certain parameters, and CreateProcess() is all we need. I haven't seriously investigated that, but if there's interest in this patch then I can spend some time on that too. Regards, -- Gurjeet Singh EnterpriseDB Corporation The Enterprise PostgreSQL Company
diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c index dca5efc..04cd900 100644 --- a/src/backend/postmaster/postmaster.c +++ b/src/backend/postmaster/postmaster.c @@ -5049,3 +5049,34 @@ InitPostmasterDeathWatchHandle(void) (int) GetLastError()))); #endif /* WIN32 */ } + +/* Fork a gdb process such that it emits my stack trace to the logs */ +static void +print_self_stack() +{ + char pid_buf[30]; + int child_pid; + + sprintf(pid_buf, "%d", getpid()); + child_pid = fork(); + + if (child_pid == 0) + { + fprintf(stderr, "stack trace for %s pid=%s\n", my_exec_path, pid_buf); + execlp("gdb", "gdb", "--batch", "-n", "-ex", "bt", my_exec_path, pid_buf, NULL); + abort(); /* If gdb failed to start */ + } + else + { + waitpid(child_pid,NULL,0); + } +} + +/* SIGSEGV handler, controlled by GUC */ +void +dump_stack(SIGNAL_ARGS) +{ + print_self_stack(); + + abort(); +} diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index 5841631..7262839 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -40,6 +40,7 @@ #include "libpq/auth.h" #include "libpq/be-fsstubs.h" #include "libpq/pqformat.h" +#include "libpq/pqsignal.h" #include "miscadmin.h" #include "optimizer/cost.h" #include "optimizer/geqo.h" @@ -167,6 +168,10 @@ static bool call_enum_check_hook(struct config_enum * conf, int *newval, static bool check_log_destination(char **newval, void **extra, GucSource source); static void assign_log_destination(const char *newval, void *extra); +bool dump_stack_on_crash = false; +static bool check_dump_stack_on_crash(bool *newval, void **extra, GucSource source); +static void assign_dump_stack_on_crash(bool newval, void *extra); + #ifdef HAVE_SYSLOG static int syslog_facility = LOG_LOCAL0; #else @@ -1422,6 +1427,17 @@ static struct config_bool ConfigureNamesBool[] = NULL, NULL, NULL }, + { + {"dump_stack_on_crash", PGC_USERSET, ERROR_HANDLING_OPTIONS, + gettext_noop("Use GDB to dump the stack of a 
crashing backend process."), + gettext_noop("This requires that GDB be already installed and accessible to Postgres."), + }, + &quote_all_identifiers, + false, + check_dump_stack_on_crash, assign_dump_stack_on_crash, NULL + }, + + /* End-of-list marker */ { {NULL, 0, 0, NULL, NULL}, NULL, false, NULL, NULL, NULL @@ -8672,4 +8688,35 @@ show_log_file_mode(void) return buf; } +static bool +check_dump_stack_on_crash(bool *newval, void **extra, GucSource source) +{ + /* TODO: Check if GDB is available. If not, then complain and return false */ + + char gdb_path[MAXPGPATH]; + + if (*newval == false) + return true; + + Assert(*newval == true); + + if (find_my_exec("gdb", gdb_path) < 0) + { + elog(WARNING, "Could not locate gdb."); + + return false; + } + else + return true; +} + +static void +assign_dump_stack_on_crash(const bool newval, void *extra) +{ + if (newval) + pqsignal(SIGSEGV, dump_stack); + else + pqsignal(SIGSEGV, SIG_DFL); +} + #include "guc-file.c" diff --git a/src/include/postmaster/postmaster.h b/src/include/postmaster/postmaster.h index be4f8a7..f624d51 100644 --- a/src/include/postmaster/postmaster.h +++ b/src/include/postmaster/postmaster.h @@ -55,6 +55,8 @@ extern int SubPostmasterMain(int argc, char *argv[]); extern Size ShmemBackendArraySize(void); extern void ShmemBackendArrayAllocation(void); +extern void dump_stack(SIGNAL_ARGS); + #endif #endif /* _POSTMASTER_H */
LOG: database system is ready to accept connections stack trace for /mnt/storage/gurjeet/dev/builds/gdb_integration/db/bin/postgres pid=11477 0x00007f58cf016dee in waitpid () from /lib/libc.so.6 #0 0x00007f58cf016dee in waitpid () from /lib/libc.so.6 #1 0x00000000006bc3fd in print_self_stack () at /mnt/storage/gurjeet/dev/POSTGRES/src/backend/postmaster/postmaster.c:5072 #2 0x00000000006bc42d in dump_stack (postgres_signal_arg=11) at /mnt/storage/gurjeet/dev/POSTGRES/src/backend/postmaster/postmaster.c:5080 #3 <signal handler called> #4 0x00007f58cf054690 in recv () from /lib/libc.so.6 #5 0x000000000061b523 in secure_read (port=0x2be7910, ptr=0xc68700, len=8192) at /mnt/storage/gurjeet/dev/POSTGRES/src/backend/libpq/be-secure.c:303 #6 0x000000000062612d in pq_recvbuf () at /mnt/storage/gurjeet/dev/POSTGRES/src/backend/libpq/pqcomm.c:816 #7 0x00000000006261c3 in pq_getbyte () at /mnt/storage/gurjeet/dev/POSTGRES/src/backend/libpq/pqcomm.c:857 #8 0x000000000070e185 in SocketBackend (inBuf=0x7fffb9042eb0) at /mnt/storage/gurjeet/dev/POSTGRES/src/backend/tcop/postgres.c:343 #9 0x000000000070e4bc in ReadCommand (inBuf=0x7fffb9042eb0) at /mnt/storage/gurjeet/dev/POSTGRES/src/backend/tcop/postgres.c:465 #10 0x0000000000712f53 in PostgresMain (argc=2, argv=0x2bc9060, username=0x2bc8e98 "postgres") at /mnt/storage/gurjeet/dev/POSTGRES/src/backend/tcop/postgres.c:3888 #11 0x00000000006bb794 in BackendRun (port=0x2be7910) at /mnt/storage/gurjeet/dev/POSTGRES/src/backend/postmaster/postmaster.c:3507 #12 0x00000000006bae14 in BackendStartup (port=0x2be7910) at /mnt/storage/gurjeet/dev/POSTGRES/src/backend/postmaster/postmaster.c:3192 #13 0x00000000006b7f99 in ServerLoop () at /mnt/storage/gurjeet/dev/POSTGRES/src/backend/postmaster/postmaster.c:1348 #14 0x00000000006b7952 in PostmasterMain (argc=3, argv=0x2bc6ae0) at /mnt/storage/gurjeet/dev/POSTGRES/src/backend/postmaster/postmaster.c:1108 #15 0x00000000006284dd in main (argc=3, argv=0x2bc6ae0) at 
/mnt/storage/gurjeet/dev/POSTGRES/src/backend/main/main.c:199 LOG: server process (PID 11477) was terminated by signal 6: Aborted L
-- Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org) To make changes to your subscription: http://www.postgresql.org/mailpref/pgsql-hackers