From cf39fc78537af29931b9e8051f4a680f0b671dcf Mon Sep 17 00:00:00 2001
From: Michael Paquier <michael@paquier.xyz>
Date: Tue, 28 Mar 2017 14:09:13 +0900
Subject: [PATCH 1/2] Change detection of corrupted 2PC files as FATAL

When scanning 2PC files, be it for initializing a hot standby node or
finding the XID range at the end of recovery, bumping on a corrupted
2PC file is reported as a WARNING and are blindly corrupted. If it
happened that the corrupted file was actually legit, there is actually
a risk of corruption, so switch that to a hard failure.
---
 src/backend/access/transam/twophase.c | 25 ++++++++++---------------
 src/backend/access/transam/xlog.c     | 10 +++++++---
 2 files changed, 17 insertions(+), 18 deletions(-)

diff --git a/src/backend/access/transam/twophase.c b/src/backend/access/transam/twophase.c
index 83169cccc3..e1e4052253 100644
--- a/src/backend/access/transam/twophase.c
+++ b/src/backend/access/transam/twophase.c
@@ -1689,6 +1689,10 @@ CheckPointTwoPhase(XLogRecPtr redo_horizon)
  * write a WAL entry, and so there might be no evidence in WAL of those
  * subxact XIDs.
  *
+ * On corrupted two-phase files, fail immediately. Keeping around broken
+ * entries and let replay continue causes harm on the system, and likely
+ * a new backup should be rolled in.
+ *
  * Our other responsibility is to determine and return the oldest valid XID
  * among the prepared xacts (if none, return ShmemVariableCache->nextXid).
  * This is needed to synchronize pg_subtrans startup properly.
@@ -1740,25 +1744,16 @@ PrescanPreparedTransactions(TransactionId **xids_p, int *nxids_p)
 			/* Read and validate file */
 			buf = ReadTwoPhaseFile(xid, true);
 			if (buf == NULL)
-			{
-				ereport(WARNING,
-					  (errmsg("removing corrupt two-phase state file \"%s\"",
-							  clde->d_name)));
-				RemoveTwoPhaseFile(xid, true);
-				continue;
-			}
+				ereport(FATAL,
+						(errmsg("corrupted two-phase state file \"%s\"",
+								clde->d_name)));
 
 			/* Deconstruct header */
 			hdr = (TwoPhaseFileHeader *) buf;
 			if (!TransactionIdEquals(hdr->xid, xid))
-			{
-				ereport(WARNING,
-					  (errmsg("removing corrupt two-phase state file \"%s\"",
-							  clde->d_name)));
-				RemoveTwoPhaseFile(xid, true);
-				pfree(buf);
-				continue;
-			}
+				ereport(FATAL,
+						(errmsg("corrupted two-phase state file \"%s\"",
+								clde->d_name)));
 
 			/*
 			 * OK, we think this file is valid.  Incorporate xid into the
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index 61ca81d1d2..af40b0497e 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -7371,6 +7371,13 @@ StartupXLOG(void)
 	}
 
 	/*
+	 * Pre-scan prepared transactions to find out the range of XIDs present.
+	 * We don't need this information quite yet, but if it fails for some
+	 * reason, better to fail before we make any on-disk changes.
+	 */
+	oldestActiveXID = PrescanPreparedTransactions(NULL, NULL);
+
+	/*
 	 * Consider whether we need to assign a new timeline ID.
 	 *
 	 * If we are doing an archive recovery, we always assign a new ID.  This
@@ -7493,9 +7500,6 @@ StartupXLOG(void)
 	XLogCtl->LogwrtRqst.Write = EndOfLog;
 	XLogCtl->LogwrtRqst.Flush = EndOfLog;
 
-	/* Pre-scan prepared transactions to find out the range of XIDs present */
-	oldestActiveXID = PrescanPreparedTransactions(NULL, NULL);
-
 	/*
 	 * Update full_page_writes in shared memory and write an XLOG_FPW_CHANGE
 	 * record before resource manager writes cleanup WAL records or checkpoint
-- 
2.12.2

