Re: Add missing error codes to PANIC/FATAL error reports in xlogrecovery

Krishnakumar R Mon, 04 Dec 2023 01:07:56 -0800

Hi,

I have updated the patch to use ERRCODE_CLUSTER_CORRUPTED, and kept
ERRCODE_DATA_CORRUPTED for the cases where recovery is not consistent.


> > > Hm, this one arguably is not corruption, but we still cannot
> > > continue. ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE or maybe a new error 
> > > code?

I added a new error code, ERRCODE_TIMELINE_INCONSISTENT, to be specific
about the scenarios involving timeline mismatches. Thoughts?

>> Another aside: Isn't the hint here obsolete since we've removed exclusive
>> backups? I can't think of any scenario now where removing backup_label would
>> be correct in a non-exclusive backup.

Attached is another patch, which applies on top of the first one, to
remove the obsolete hint.

- KK

From b779b53ee0cde0ab239c44f5c6c83ec530c194ab Mon Sep 17 00:00:00 2001
From: "Krishnakumar R (KK)" <kksrcv...@gmail.com>
Date: Thu, 30 Nov 2023 00:56:40 -0800
Subject: [PATCH v2 1/2] Add missing error codes to PANIC/FATAL error reports.

---
 src/backend/access/transam/xlogrecovery.c | 45 +++++++++++++++--------
 src/backend/utils/errcodes.txt            |  2 +
 2 files changed, 32 insertions(+), 15 deletions(-)

diff --git a/src/backend/access/transam/xlogrecovery.c b/src/backend/access/transam/xlogrecovery.c
index c61566666a..cb54f21de2 100644
--- a/src/backend/access/transam/xlogrecovery.c
+++ b/src/backend/access/transam/xlogrecovery.c
@@ -630,7 +630,8 @@ InitWalRecovery(ControlFileData *ControlFile, bool *wasShutdown_ptr,
 				if (!ReadRecord(xlogprefetcher, LOG, false,
 								checkPoint.ThisTimeLineID))
 					ereport(FATAL,
-							(errmsg("could not find redo location referenced by checkpoint record"),
+							(errcode(ERRCODE_CLUSTER_CORRUPTED),
+							 errmsg("could not find redo location referenced by checkpoint record"),
 							 errhint("If you are restoring from a backup, touch \"%s/recovery.signal\" or \"%s/standby.signal\" and add required recovery options.\n"
 									 "If you are not restoring from a backup, try removing the file \"%s/backup_label\".\n"
 									 "Be careful: removing \"%s/backup_label\" will result in a corrupt cluster if restoring from a backup.",
@@ -640,7 +641,8 @@ InitWalRecovery(ControlFileData *ControlFile, bool *wasShutdown_ptr,
 		else
 		{
 			ereport(FATAL,
-					(errmsg("could not locate required checkpoint record"),
+					(errcode(ERRCODE_CLUSTER_CORRUPTED),
+					 errmsg("could not locate required checkpoint record"),
 					 errhint("If you are restoring from a backup, touch \"%s/recovery.signal\" or \"%s/standby.signal\" and add required recovery options.\n"
 							 "If you are not restoring from a backup, try removing the file \"%s/backup_label\".\n"
 							 "Be careful: removing \"%s/backup_label\" will result in a corrupt cluster if restoring from a backup.",
@@ -764,7 +766,8 @@ InitWalRecovery(ControlFileData *ControlFile, bool *wasShutdown_ptr,
 			 * simplify processing around checkpoints.
 			 */
 			ereport(PANIC,
-					(errmsg("could not locate a valid checkpoint record")));
+					(errcode(ERRCODE_CLUSTER_CORRUPTED),
+					 errmsg("could not locate a valid checkpoint record")));
 		}
 		memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
 		wasShutdown = ((record->xl_info & ~XLR_INFO_MASK) == XLOG_CHECKPOINT_SHUTDOWN);
@@ -817,7 +820,8 @@ InitWalRecovery(ControlFileData *ControlFile, bool *wasShutdown_ptr,
 		 */
 		switchpoint = tliSwitchPoint(ControlFile->checkPointCopy.ThisTimeLineID, expectedTLEs, NULL);
 		ereport(FATAL,
-				(errmsg("requested timeline %u is not a child of this server's history",
+				(errcode(ERRCODE_TIMELINE_INCONSISTENT),
+				 errmsg("requested timeline %u is not a child of this server's history",
 						recoveryTargetTLI),
 				 errdetail("Latest checkpoint is at %X/%X on timeline %u, but in the history of the requested timeline, the server forked off from that timeline at %X/%X.",
 						   LSN_FORMAT_ARGS(ControlFile->checkPoint),
@@ -833,7 +837,8 @@ InitWalRecovery(ControlFileData *ControlFile, bool *wasShutdown_ptr,
 		tliOfPointInHistory(ControlFile->minRecoveryPoint - 1, expectedTLEs) !=
 		ControlFile->minRecoveryPointTLI)
 		ereport(FATAL,
-				(errmsg("requested timeline %u does not contain minimum recovery point %X/%X on timeline %u",
+				(errcode(ERRCODE_TIMELINE_INCONSISTENT),
+				 errmsg("requested timeline %u does not contain minimum recovery point %X/%X on timeline %u",
 						recoveryTargetTLI,
 						LSN_FORMAT_ARGS(ControlFile->minRecoveryPoint),
 						ControlFile->minRecoveryPointTLI)));
@@ -861,12 +866,14 @@ InitWalRecovery(ControlFileData *ControlFile, bool *wasShutdown_ptr,
 							 checkPoint.newestCommitTsXid)));
 	if (!TransactionIdIsNormal(XidFromFullTransactionId(checkPoint.nextXid)))
 		ereport(PANIC,
-				(errmsg("invalid next transaction ID")));
+				(errcode(ERRCODE_CLUSTER_CORRUPTED),
+				 errmsg("invalid next transaction ID")));
 
 	/* sanity check */
 	if (checkPoint.redo > CheckPointLoc)
 		ereport(PANIC,
-				(errmsg("invalid redo in checkpoint record")));
+				(errcode(ERRCODE_CLUSTER_CORRUPTED),
+				 errmsg("invalid redo in checkpoint record")));
 
 	/*
 	 * Check whether we need to force recovery from WAL.  If it appears to
@@ -877,7 +884,8 @@ InitWalRecovery(ControlFileData *ControlFile, bool *wasShutdown_ptr,
 	{
 		if (wasShutdown)
 			ereport(PANIC,
-					(errmsg("invalid redo record in shutdown checkpoint")));
+					(errcode(ERRCODE_CLUSTER_CORRUPTED),
+					 errmsg("invalid redo record in shutdown checkpoint")));
 		InRecovery = true;
 	}
 	else if (ControlFile->state != DB_SHUTDOWNED)
@@ -953,7 +961,8 @@ InitWalRecovery(ControlFileData *ControlFile, bool *wasShutdown_ptr,
 				if (dbstate_at_startup != DB_IN_ARCHIVE_RECOVERY &&
 					dbstate_at_startup != DB_SHUTDOWNED_IN_RECOVERY)
 					ereport(FATAL,
-							(errmsg("backup_label contains data inconsistent with control file"),
+							(errcode(ERRCODE_CLUSTER_CORRUPTED),
+							 errmsg("backup_label contains data inconsistent with control file"),
 							 errhint("This means that the backup is corrupted and you will "
 									 "have to use another backup for recovery.")));
 				ControlFile->backupEndPoint = ControlFile->minRecoveryPoint;
@@ -1664,7 +1673,8 @@ PerformWalRecovery(void)
 		if (record->xl_rmid != RM_XLOG_ID ||
 			(record->xl_info & ~XLR_INFO_MASK) != XLOG_CHECKPOINT_REDO)
 			ereport(FATAL,
-					(errmsg("unexpected record type found at redo point %X/%X",
+					(errcode(ERRCODE_CLUSTER_CORRUPTED),
+					 errmsg("unexpected record type found at redo point %X/%X",
 							LSN_FORMAT_ARGS(xlogreader->ReadRecPtr))));
 	}
 	else
@@ -1792,7 +1802,8 @@ PerformWalRecovery(void)
 		{
 			if (!reachedConsistency)
 				ereport(FATAL,
-						(errmsg("requested recovery stop point is before consistent recovery point")));
+						(errcode(ERRCODE_DATA_CORRUPTED),
+						 errmsg("requested recovery stop point is before consistent recovery point")));
 
 			/*
 			 * This is the last point where we can restart recovery with a new
@@ -1850,7 +1861,8 @@ PerformWalRecovery(void)
 		recoveryTarget != RECOVERY_TARGET_UNSET &&
 		!reachedRecoveryTarget)
 		ereport(FATAL,
-				(errmsg("recovery ended before configured recovery target was reached")));
+				(errcode(ERRCODE_DATA_CORRUPTED),
+				 errmsg("recovery ended before configured recovery target was reached")));
 }
 
 /*
@@ -2324,7 +2336,8 @@ checkTimeLineSwitch(XLogRecPtr lsn, TimeLineID newTLI, TimeLineID prevTLI,
 	/* Check that the record agrees on what the current (old) timeline is */
 	if (prevTLI != replayTLI)
 		ereport(PANIC,
-				(errmsg("unexpected previous timeline ID %u (current timeline ID %u) in checkpoint record",
+				(errcode(ERRCODE_CLUSTER_CORRUPTED),
+				 errmsg("unexpected previous timeline ID %u (current timeline ID %u) in checkpoint record",
 						prevTLI, replayTLI)));
 
 	/*
@@ -2333,7 +2346,8 @@ checkTimeLineSwitch(XLogRecPtr lsn, TimeLineID newTLI, TimeLineID prevTLI,
 	 */
 	if (newTLI < replayTLI || !tliInHistory(newTLI, expectedTLEs))
 		ereport(PANIC,
-				(errmsg("unexpected timeline ID %u (after %u) in checkpoint record",
+				(errcode(ERRCODE_CLUSTER_CORRUPTED),
+				 errmsg("unexpected timeline ID %u (after %u) in checkpoint record",
 						newTLI, replayTLI)));
 
 	/*
@@ -2349,7 +2363,8 @@ checkTimeLineSwitch(XLogRecPtr lsn, TimeLineID newTLI, TimeLineID prevTLI,
 		lsn < minRecoveryPoint &&
 		newTLI > minRecoveryPointTLI)
 		ereport(PANIC,
-				(errmsg("unexpected timeline ID %u in checkpoint record, before reaching minimum recovery point %X/%X on timeline %u",
+				(errcode(ERRCODE_CLUSTER_CORRUPTED),
+				 errmsg("unexpected timeline ID %u in checkpoint record, before reaching minimum recovery point %X/%X on timeline %u",
 						newTLI,
 						LSN_FORMAT_ARGS(minRecoveryPoint),
 						minRecoveryPointTLI)));
diff --git a/src/backend/utils/errcodes.txt b/src/backend/utils/errcodes.txt
index 8e97a0150f..9e9e1c0948 100644
--- a/src/backend/utils/errcodes.txt
+++ b/src/backend/utils/errcodes.txt
@@ -484,6 +484,7 @@ P0001    E    ERRCODE_RAISE_EXCEPTION                                        rai
 P0002    E    ERRCODE_NO_DATA_FOUND                                          no_data_found
 P0003    E    ERRCODE_TOO_MANY_ROWS                                          too_many_rows
 P0004    E    ERRCODE_ASSERT_FAILURE                                         assert_failure
+P0005    E    ERRCODE_TIMELINE_INCONSISTENT                                  timeline_inconsistent
 
 Section: Class XX - Internal Error
 
@@ -491,3 +492,4 @@ Section: Class XX - Internal Error
 XX000    E    ERRCODE_INTERNAL_ERROR                                         internal_error
 XX001    E    ERRCODE_DATA_CORRUPTED                                         data_corrupted
 XX002    E    ERRCODE_INDEX_CORRUPTED                                        index_corrupted
+XX003    E    ERRCODE_CLUSTER_CORRUPTED                                      cluster_corrupted
\ No newline at end of file
-- 
2.40.1

From 2d506a309036f801d338266ca933cc4e2a137183 Mon Sep 17 00:00:00 2001
From: "Krishnakumar R (KK)" <kksrcv...@gmail.com>
Date: Mon, 4 Dec 2023 00:29:34 -0800
Subject: [PATCH v1] Purge error hints which are obsolete due to the removal of
 exclusive backup.

---
 src/backend/access/transam/xlogrecovery.c | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

diff --git a/src/backend/access/transam/xlogrecovery.c b/src/backend/access/transam/xlogrecovery.c
index cb54f21de2..2b3fda6f08 100644
--- a/src/backend/access/transam/xlogrecovery.c
+++ b/src/backend/access/transam/xlogrecovery.c
@@ -632,10 +632,8 @@ InitWalRecovery(ControlFileData *ControlFile, bool *wasShutdown_ptr,
 					ereport(FATAL,
 							(errcode(ERRCODE_CLUSTER_CORRUPTED),
 							 errmsg("could not find redo location referenced by checkpoint record"),
-							 errhint("If you are restoring from a backup, touch \"%s/recovery.signal\" or \"%s/standby.signal\" and add required recovery options.\n"
-									 "If you are not restoring from a backup, try removing the file \"%s/backup_label\".\n"
-									 "Be careful: removing \"%s/backup_label\" will result in a corrupt cluster if restoring from a backup.",
-									 DataDir, DataDir, DataDir, DataDir)));
+							 errhint("If not found, touch \"%s/recovery.signal\" or \"%s/standby.signal\" and add required recovery options.\n",
+									 DataDir, DataDir)));
 			}
 		}
 		else
@@ -643,10 +641,8 @@ InitWalRecovery(ControlFileData *ControlFile, bool *wasShutdown_ptr,
 			ereport(FATAL,
 					(errcode(ERRCODE_CLUSTER_CORRUPTED),
 					 errmsg("could not locate required checkpoint record"),
-					 errhint("If you are restoring from a backup, touch \"%s/recovery.signal\" or \"%s/standby.signal\" and add required recovery options.\n"
-							 "If you are not restoring from a backup, try removing the file \"%s/backup_label\".\n"
-							 "Be careful: removing \"%s/backup_label\" will result in a corrupt cluster if restoring from a backup.",
-							 DataDir, DataDir, DataDir, DataDir)));
+					 errhint("If not found, touch \"%s/recovery.signal\" or \"%s/standby.signal\" and add required recovery options.\n",
+							 DataDir, DataDir)));
 			wasShutdown = false;	/* keep compiler quiet */
 		}
 
-- 
2.40.1

Re: Add missing error codes to PANIC/FATAL error reports in xlogrecovery

Reply via email to