Hi, I noticed that an assertion failure is reported instead of a proper PANIC message when an fsync of a WAL file fails. The cause is that there are multiple places where we call XLogFileNameP(), which calls palloc(), inside a critical section; XLogWrite() is one example.
TRAP: FailedAssertion("CritSectionCount == 0 || (context)->allowInCritSection", File: "mcxt.c", Line: 956) As far as I can see, there are five places that need to be fixed. I've attached a patch. Regards, -- Masahiko Sawada http://www.2ndQuadrant.com/ PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 5f0ee50092..436409bca1 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -2499,14 +2499,21 @@ XLogWrite(XLogwrtRqst WriteRqst, bool flexible) pgstat_report_wait_end(); if (written <= 0) { + char xlogfname[MAXFNAMELEN]; + int save_errno; + if (errno == EINTR) continue; + + save_errno = errno; + XLogFileName(xlogfname, ThisTimeLineID, openLogSegNo, + wal_segment_size); + errno = save_errno; ereport(PANIC, (errcode_for_file_access(), errmsg("could not write to log file %s " "at offset %u, length %zu: %m", - XLogFileNameP(ThisTimeLineID, openLogSegNo), - startoffset, nleft))); + xlogfname, startoffset, nleft))); } nleft -= written; from += written; @@ -3792,10 +3799,17 @@ XLogFileClose(void) #endif if (close(openLogFile) != 0) + { + char xlogfname[MAXFNAMELEN]; + int save_errno = errno; + + XLogFileName(xlogfname, ThisTimeLineID, openLogSegNo); + errno = save_errno; ereport(PANIC, (errcode_for_file_access(), - errmsg("could not close file \"%s\": %m", - XLogFileNameP(ThisTimeLineID, openLogSegNo)))); + errmsg("could not close file \"%s\": %m", xlogfname))); + } + openLogFile = -1; } @@ -10100,32 +10114,25 @@ assign_xlog_sync_method(int new_sync_method, void *extra) void issue_xlog_fsync(int fd, XLogSegNo segno) { + char *msg = NULL; + pgstat_report_wait_start(WAIT_EVENT_WAL_SYNC); switch (sync_method) { case SYNC_METHOD_FSYNC: if (pg_fsync_no_writethrough(fd) != 0) - ereport(PANIC, - (errcode_for_file_access(), - errmsg("could not fsync file \"%s\": %m", - XLogFileNameP(ThisTimeLineID, segno)))); + msg = "could not fsync file \"%s\": %m"; break; #ifdef HAVE_FSYNC_WRITETHROUGH case SYNC_METHOD_FSYNC_WRITETHROUGH: if (pg_fsync_writethrough(fd) != 0) - ereport(PANIC, - (errcode_for_file_access(), - errmsg("could not fsync write-through file \"%s\": %m", - XLogFileNameP(ThisTimeLineID, segno)))); + msg = "could not fsync write-through file 
\"%s\": %m"; break; #endif #ifdef HAVE_FDATASYNC case SYNC_METHOD_FDATASYNC: if (pg_fdatasync(fd) != 0) - ereport(PANIC, - (errcode_for_file_access(), - errmsg("could not fdatasync file \"%s\": %m", - XLogFileNameP(ThisTimeLineID, segno)))); + msg = "could not fdatasync file \"%s\": %m"; break; #endif case SYNC_METHOD_OPEN: @@ -10136,6 +10143,21 @@ issue_xlog_fsync(int fd, XLogSegNo segno) elog(PANIC, "unrecognized wal_sync_method: %d", sync_method); break; } + + /* PANIC if failed to fsync */ + if (msg) + { + char xlogfname[MAXFNAMELEN]; + int save_errno = errno; + + XLogFileName(xlogfname, ThisTimeLineID, openLogSegNo, + wal_segment_size); + errno = save_errno; + ereport(PANIC, + (errcode_for_file_access(), + errmsg(msg, xlogfname))); + } + pgstat_report_wait_end(); }