Hi list, This patch adds the backend's current running query to the "backend crash" message.
The crashing query is often a good starting point in debugging the cause, and much more easily accessible than core dumps. Example output: LOG: server process (PID 31451) was terminated by signal 9: Killed DETAIL: Running query: DO LANGUAGE plpythonu 'import os;os.kill(os.getpid(),9)' The message "Running query" might not be entirely true, as it might be e.g. a vacuum activity string too. But it sounds better than "Activity string" or anything else I could come up with. Also refactored pgstat_get_backend_current_activity() such that it always returns consistent results. (Previously it returned a pointer to shared memory that could theoretically change and was vulnerable to races) The function can also deal with shared memory corruption (if isCrashed is true), so that corruption doesn't cause a postmaster crash or hang. I'm not happy with the indenting depth in the function, but it would be hard to split anything out. Regards, Marti
From 0eec93a7b9d59ab9d7e101f515b0c72c0c7c383f Mon Sep 17 00:00:00 2001 From: Marti Raudsepp <ma...@juffo.org> Date: Fri, 2 Sep 2011 03:04:24 +0300 Subject: [PATCH] Log crashed backend's query (activity string) The crashing query is often a good starting point in debugging the cause, and much more easily accessible than core dumps. Example output: LOG: server process (PID 31451) was terminated by signal 9: Killed DETAIL: Running query: DO LANGUAGE plpythonu 'import os;os.kill(os.getpid(),9)' Also refactored pgstat_get_backend_current_activity() such that it always returns consistent results. (Previously it returned a pointer to shared memory that could theoretically change and was vulnerable to races) The function can also deal with shared memory corruption (if isCrashed is true), so that corruption doesn't cause a postmaster crash or hang. --- src/backend/postmaster/pgstat.c | 58 ++++++++++++++++++++++++----------- src/backend/postmaster/postmaster.c | 17 +++++++--- src/backend/storage/lmgr/deadlock.c | 2 +- src/include/pgstat.h | 2 +- 4 files changed, 54 insertions(+), 25 deletions(-) diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c index be32ca8..3b13e58 100644 --- a/src/backend/postmaster/pgstat.c +++ b/src/backend/postmaster/pgstat.c @@ -2681,22 +2681,20 @@ pgstat_read_current_status(void) * and so will provide current information regardless of the age of our * transaction's snapshot of the status array. * - * It is the caller's responsibility to invoke this only for backends whose - * state is expected to remain stable while the result is in use. The - * only current use is in deadlock reporting, where we can expect that - * the target backend is blocked on a lock. (There are corner cases - * where the target's wait could get aborted while we are looking at it, - * but the very worst consequence is to return a pointer to a string - * that's been changed, so we won't worry too much.) + * If isCrashed is true, we try to do it in a way that's safe from shared + * memory corruption. This is used for reporting the crashed backend's query. * * Note: return strings for special cases match pg_stat_get_backend_activity. * ---------- */ const char * -pgstat_get_backend_current_activity(int pid, bool checkUser) +pgstat_get_backend_current_activity(int pid, bool checkUser, bool isCrashed) { PgBackendStatus *beentry; int i; + int maxlen = pgstat_track_activity_query_size; + char *result = NULL; /* silence uninitialized warning */ + char *buffer = NULL; beentry = BackendStatusArray; for (i = 1; i <= MaxBackends; i++) @@ -2720,8 +2718,40 @@ pgstat_get_backend_current_activity(int pid, bool checkUser) found = (vbeentry->st_procpid == pid); + if (found) + { + char *activity = vbeentry->st_activity; + + if (checkUser && !superuser() && beentry->st_userid != GetUserId()) + result = "<insufficient privilege>"; + /* + * Don't access activity pointer before we verify that it + * falls into BackendActivityBuffer. To make sure that the + * string's ending is contained within the buffer, the string + * can start at offset (MaxBackends - 1) at most. + */ + else if (activity < BackendActivityBuffer || + activity > (BackendActivityBuffer + + mul_size(maxlen, MaxBackends - 1))) + result = "<command string corrupt>"; + else if (*(activity) == '\0') + result = "<command string not enabled>"; + else + { + if (!buffer) + buffer = (char *) palloc(maxlen); + + strncpy(buffer, activity, maxlen); + buffer[maxlen - 1] = '\0'; + result = buffer; + } + } + + /* Don't check changecount oddness if the backend crashed -- it + * might be corrupt and stuck at an odd value + */ if (save_changecount == vbeentry->st_changecount && - (save_changecount & 1) == 0) + (isCrashed || (save_changecount & 1) == 0)) break; /* Make sure we can break out of loop if stuck... */ @@ -2729,15 +2759,7 @@ pgstat_get_backend_current_activity(int pid, bool checkUser) } if (found) - { - /* Now it is safe to use the non-volatile pointer */ - if (checkUser && !superuser() && beentry->st_userid != GetUserId()) - return "<insufficient privilege>"; - else if (*(beentry->st_activity) == '\0') - return "<command string not enabled>"; - else - return beentry->st_activity; - } + return result; beentry++; } diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c index 6e231a5..57be402 100644 --- a/src/backend/postmaster/postmaster.c +++ b/src/backend/postmaster/postmaster.c @@ -2777,6 +2777,8 @@ HandleChildCrash(int pid, int exitstatus, const char *procname) static void LogChildExit(int lev, const char *procname, int pid, int exitstatus) { + const char *activity = pgstat_get_backend_current_activity(pid, false, true); + if (WIFEXITED(exitstatus)) ereport(lev, @@ -2784,7 +2786,8 @@ LogChildExit(int lev, const char *procname, int pid, int exitstatus) translator: %s is a noun phrase describing a child process, such as "server process" */ (errmsg("%s (PID %d) exited with exit code %d", - procname, pid, WEXITSTATUS(exitstatus)))); + procname, pid, WEXITSTATUS(exitstatus)), + errdetail("Running query: %s", activity))); else if (WIFSIGNALED(exitstatus)) #if defined(WIN32) ereport(lev, @@ -2794,7 +2797,8 @@ LogChildExit(int lev, const char *procname, int pid, int exitstatus) "server process" */ (errmsg("%s (PID %d) was terminated by exception 0x%X", procname, pid, WTERMSIG(exitstatus)), - errhint("See C include file \"ntstatus.h\" for a description of the hexadecimal value."))); + errhint("See C include file \"ntstatus.h\" for a description of the hexadecimal value."), + errdetail("Running query: %s", activity))); #elif defined(HAVE_DECL_SYS_SIGLIST) && HAVE_DECL_SYS_SIGLIST ereport(lev, @@ -2804,7 +2808,8 @@ LogChildExit(int lev, const char *procname, int pid, int exitstatus) (errmsg("%s (PID %d) was terminated by signal %d: %s", procname, pid, WTERMSIG(exitstatus), WTERMSIG(exitstatus) < NSIG ? - sys_siglist[WTERMSIG(exitstatus)] : "(unknown)"))); + sys_siglist[WTERMSIG(exitstatus)] : "(unknown)"), + errdetail("Running query: %s", activity))); #else ereport(lev, @@ -2812,7 +2817,8 @@ LogChildExit(int lev, const char *procname, int pid, int exitstatus) translator: %s is a noun phrase describing a child process, such as "server process" */ (errmsg("%s (PID %d) was terminated by signal %d", - procname, pid, WTERMSIG(exitstatus)))); + procname, pid, WTERMSIG(exitstatus)), + errdetail("Running query: %s", activity))); #endif else ereport(lev, @@ -2821,7 +2827,8 @@ LogChildExit(int lev, const char *procname, int pid, int exitstatus) translator: %s is a noun phrase describing a child process, such as "server process" */ (errmsg("%s (PID %d) exited with unrecognized status %d", - procname, pid, exitstatus))); + procname, pid, exitstatus), + errdetail("Running query: %s", activity))); } /* diff --git a/src/backend/storage/lmgr/deadlock.c b/src/backend/storage/lmgr/deadlock.c index 7e7f6af..f3d0553 100644 --- a/src/backend/storage/lmgr/deadlock.c +++ b/src/backend/storage/lmgr/deadlock.c @@ -933,7 +933,7 @@ DeadLockReport(void) appendStringInfo(&logbuf, _("Process %d: %s"), info->pid, - pgstat_get_backend_current_activity(info->pid, false)); + pgstat_get_backend_current_activity(info->pid, false, false)); } ereport(ERROR, diff --git a/src/include/pgstat.h b/src/include/pgstat.h index 5446fa0..97b114e 100644 --- a/src/include/pgstat.h +++ b/src/include/pgstat.h @@ -718,7 +718,7 @@ extern void pgstat_report_activity(const char *cmd_str); extern void pgstat_report_appname(const char *appname); extern void pgstat_report_xact_timestamp(TimestampTz tstamp); extern void pgstat_report_waiting(bool waiting); -extern const char *pgstat_get_backend_current_activity(int pid, bool checkUser); +extern const char *pgstat_get_backend_current_activity(int pid, bool checkUser, bool isCrashed); extern PgStat_TableStatus *find_tabstat_entry(Oid rel_id); extern PgStat_BackendFunctionEntry *find_funcstat_entry(Oid func_id); -- 1.7.6.1
-- Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org) To make changes to your subscription: http://www.postgresql.org/mailpref/pgsql-hackers