Hi, I've updated the patch to use ERRCODE_CLUSTER_CORRUPTED, and kept ERRCODE_DATA_CORRUPTED for the cases where recovery is not consistent.
> > > Hm, this one arguably is not corruption, but we still cannot
> > > continue. ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE or maybe a new error
> > > code?

I added a new error code, ERRCODE_TIMELINE_INCONSISTENT, to be specific about the scenarios with timeline mismatches. Thoughts?

>> Another aside: Isn't the hint here obsolete since we've removed exclusive backups? I can't think of any scenario now where removing backup_label would be correct in a non-exclusive backup.

I've attached another patch, which applies on top of the first patch, to remove the obsolete hint.

- KK
From b779b53ee0cde0ab239c44f5c6c83ec530c194ab Mon Sep 17 00:00:00 2001 From: "Krishnakumar R (KK)" <kksrcv...@gmail.com> Date: Thu, 30 Nov 2023 00:56:40 -0800 Subject: [PATCH v2 1/2] Add missing error codes to PANIC/FATAL error reports. --- src/backend/access/transam/xlogrecovery.c | 45 +++++++++++++++-------- src/backend/utils/errcodes.txt | 2 + 2 files changed, 32 insertions(+), 15 deletions(-) diff --git a/src/backend/access/transam/xlogrecovery.c b/src/backend/access/transam/xlogrecovery.c index c61566666a..cb54f21de2 100644 --- a/src/backend/access/transam/xlogrecovery.c +++ b/src/backend/access/transam/xlogrecovery.c @@ -630,7 +630,8 @@ InitWalRecovery(ControlFileData *ControlFile, bool *wasShutdown_ptr, if (!ReadRecord(xlogprefetcher, LOG, false, checkPoint.ThisTimeLineID)) ereport(FATAL, - (errmsg("could not find redo location referenced by checkpoint record"), + (errcode(ERRCODE_CLUSTER_CORRUPTED), + errmsg("could not find redo location referenced by checkpoint record"), errhint("If you are restoring from a backup, touch \"%s/recovery.signal\" or \"%s/standby.signal\" and add required recovery options.\n" "If you are not restoring from a backup, try removing the file \"%s/backup_label\".\n" "Be careful: removing \"%s/backup_label\" will result in a corrupt cluster if restoring from a backup.", @@ -640,7 +641,8 @@ InitWalRecovery(ControlFileData *ControlFile, bool *wasShutdown_ptr, else { ereport(FATAL, - (errmsg("could not locate required checkpoint record"), + (errcode(ERRCODE_CLUSTER_CORRUPTED), + errmsg("could not locate required checkpoint record"), errhint("If you are restoring from a backup, touch \"%s/recovery.signal\" or \"%s/standby.signal\" and add required recovery options.\n" "If you are not restoring from a backup, try removing the file \"%s/backup_label\".\n" "Be careful: removing \"%s/backup_label\" will result in a corrupt cluster if restoring from a backup.", @@ -764,7 +766,8 @@ InitWalRecovery(ControlFileData *ControlFile, bool 
*wasShutdown_ptr, * simplify processing around checkpoints. */ ereport(PANIC, - (errmsg("could not locate a valid checkpoint record"))); + (errcode(ERRCODE_CLUSTER_CORRUPTED), + errmsg("could not locate a valid checkpoint record"))); } memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint)); wasShutdown = ((record->xl_info & ~XLR_INFO_MASK) == XLOG_CHECKPOINT_SHUTDOWN); @@ -817,7 +820,8 @@ InitWalRecovery(ControlFileData *ControlFile, bool *wasShutdown_ptr, */ switchpoint = tliSwitchPoint(ControlFile->checkPointCopy.ThisTimeLineID, expectedTLEs, NULL); ereport(FATAL, - (errmsg("requested timeline %u is not a child of this server's history", + (errcode(ERRCODE_TIMELINE_INCONSISTENT), + errmsg("requested timeline %u is not a child of this server's history", recoveryTargetTLI), errdetail("Latest checkpoint is at %X/%X on timeline %u, but in the history of the requested timeline, the server forked off from that timeline at %X/%X.", LSN_FORMAT_ARGS(ControlFile->checkPoint), @@ -833,7 +837,8 @@ InitWalRecovery(ControlFileData *ControlFile, bool *wasShutdown_ptr, tliOfPointInHistory(ControlFile->minRecoveryPoint - 1, expectedTLEs) != ControlFile->minRecoveryPointTLI) ereport(FATAL, - (errmsg("requested timeline %u does not contain minimum recovery point %X/%X on timeline %u", + (errcode(ERRCODE_TIMELINE_INCONSISTENT), + errmsg("requested timeline %u does not contain minimum recovery point %X/%X on timeline %u", recoveryTargetTLI, LSN_FORMAT_ARGS(ControlFile->minRecoveryPoint), ControlFile->minRecoveryPointTLI))); @@ -861,12 +866,14 @@ InitWalRecovery(ControlFileData *ControlFile, bool *wasShutdown_ptr, checkPoint.newestCommitTsXid))); if (!TransactionIdIsNormal(XidFromFullTransactionId(checkPoint.nextXid))) ereport(PANIC, - (errmsg("invalid next transaction ID"))); + (errcode(ERRCODE_CLUSTER_CORRUPTED), + errmsg("invalid next transaction ID"))); /* sanity check */ if (checkPoint.redo > CheckPointLoc) ereport(PANIC, - (errmsg("invalid redo in checkpoint 
record"))); + (errcode(ERRCODE_CLUSTER_CORRUPTED), + errmsg("invalid redo in checkpoint record"))); /* * Check whether we need to force recovery from WAL. If it appears to @@ -877,7 +884,8 @@ InitWalRecovery(ControlFileData *ControlFile, bool *wasShutdown_ptr, { if (wasShutdown) ereport(PANIC, - (errmsg("invalid redo record in shutdown checkpoint"))); + (errcode(ERRCODE_CLUSTER_CORRUPTED), + errmsg("invalid redo record in shutdown checkpoint"))); InRecovery = true; } else if (ControlFile->state != DB_SHUTDOWNED) @@ -953,7 +961,8 @@ InitWalRecovery(ControlFileData *ControlFile, bool *wasShutdown_ptr, if (dbstate_at_startup != DB_IN_ARCHIVE_RECOVERY && dbstate_at_startup != DB_SHUTDOWNED_IN_RECOVERY) ereport(FATAL, - (errmsg("backup_label contains data inconsistent with control file"), + (errcode(ERRCODE_CLUSTER_CORRUPTED), + errmsg("backup_label contains data inconsistent with control file"), errhint("This means that the backup is corrupted and you will " "have to use another backup for recovery."))); ControlFile->backupEndPoint = ControlFile->minRecoveryPoint; @@ -1664,7 +1673,8 @@ PerformWalRecovery(void) if (record->xl_rmid != RM_XLOG_ID || (record->xl_info & ~XLR_INFO_MASK) != XLOG_CHECKPOINT_REDO) ereport(FATAL, - (errmsg("unexpected record type found at redo point %X/%X", + (errcode(ERRCODE_CLUSTER_CORRUPTED), + errmsg("unexpected record type found at redo point %X/%X", LSN_FORMAT_ARGS(xlogreader->ReadRecPtr)))); } else @@ -1792,7 +1802,8 @@ PerformWalRecovery(void) { if (!reachedConsistency) ereport(FATAL, - (errmsg("requested recovery stop point is before consistent recovery point"))); + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("requested recovery stop point is before consistent recovery point"))); /* * This is the last point where we can restart recovery with a new @@ -1850,7 +1861,8 @@ PerformWalRecovery(void) recoveryTarget != RECOVERY_TARGET_UNSET && !reachedRecoveryTarget) ereport(FATAL, - (errmsg("recovery ended before configured recovery target was 
reached"))); + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("recovery ended before configured recovery target was reached"))); } /* @@ -2324,7 +2336,8 @@ checkTimeLineSwitch(XLogRecPtr lsn, TimeLineID newTLI, TimeLineID prevTLI, /* Check that the record agrees on what the current (old) timeline is */ if (prevTLI != replayTLI) ereport(PANIC, - (errmsg("unexpected previous timeline ID %u (current timeline ID %u) in checkpoint record", + (errcode(ERRCODE_CLUSTER_CORRUPTED), + errmsg("unexpected previous timeline ID %u (current timeline ID %u) in checkpoint record", prevTLI, replayTLI))); /* @@ -2333,7 +2346,8 @@ checkTimeLineSwitch(XLogRecPtr lsn, TimeLineID newTLI, TimeLineID prevTLI, */ if (newTLI < replayTLI || !tliInHistory(newTLI, expectedTLEs)) ereport(PANIC, - (errmsg("unexpected timeline ID %u (after %u) in checkpoint record", + (errcode(ERRCODE_CLUSTER_CORRUPTED), + errmsg("unexpected timeline ID %u (after %u) in checkpoint record", newTLI, replayTLI))); /* @@ -2349,7 +2363,8 @@ checkTimeLineSwitch(XLogRecPtr lsn, TimeLineID newTLI, TimeLineID prevTLI, lsn < minRecoveryPoint && newTLI > minRecoveryPointTLI) ereport(PANIC, - (errmsg("unexpected timeline ID %u in checkpoint record, before reaching minimum recovery point %X/%X on timeline %u", + (errcode(ERRCODE_CLUSTER_CORRUPTED), + errmsg("unexpected timeline ID %u in checkpoint record, before reaching minimum recovery point %X/%X on timeline %u", newTLI, LSN_FORMAT_ARGS(minRecoveryPoint), minRecoveryPointTLI))); diff --git a/src/backend/utils/errcodes.txt b/src/backend/utils/errcodes.txt index 8e97a0150f..9e9e1c0948 100644 --- a/src/backend/utils/errcodes.txt +++ b/src/backend/utils/errcodes.txt @@ -484,6 +484,7 @@ P0001 E ERRCODE_RAISE_EXCEPTION rai P0002 E ERRCODE_NO_DATA_FOUND no_data_found P0003 E ERRCODE_TOO_MANY_ROWS too_many_rows P0004 E ERRCODE_ASSERT_FAILURE assert_failure +P0005 E ERRCODE_TIMELINE_INCONSISTENT timeline_inconsistent Section: Class XX - Internal Error @@ -491,3 +492,4 @@ Section: Class 
XX - Internal Error XX000 E ERRCODE_INTERNAL_ERROR internal_error XX001 E ERRCODE_DATA_CORRUPTED data_corrupted XX002 E ERRCODE_INDEX_CORRUPTED index_corrupted +XX003 E ERRCODE_CLUSTER_CORRUPTED cluster_corrupted \ No newline at end of file -- 2.40.1
From 2d506a309036f801d338266ca933cc4e2a137183 Mon Sep 17 00:00:00 2001 From: "Krishnakumar R (KK)" <kksrcv...@gmail.com> Date: Mon, 4 Dec 2023 00:29:34 -0800 Subject: [PATCH v1] Purge error hints which are obsolete due to the removal of exclusive backup. --- src/backend/access/transam/xlogrecovery.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/src/backend/access/transam/xlogrecovery.c b/src/backend/access/transam/xlogrecovery.c index cb54f21de2..2b3fda6f08 100644 --- a/src/backend/access/transam/xlogrecovery.c +++ b/src/backend/access/transam/xlogrecovery.c @@ -632,10 +632,8 @@ InitWalRecovery(ControlFileData *ControlFile, bool *wasShutdown_ptr, ereport(FATAL, (errcode(ERRCODE_CLUSTER_CORRUPTED), errmsg("could not find redo location referenced by checkpoint record"), - errhint("If you are restoring from a backup, touch \"%s/recovery.signal\" or \"%s/standby.signal\" and add required recovery options.\n" - "If you are not restoring from a backup, try removing the file \"%s/backup_label\".\n" - "Be careful: removing \"%s/backup_label\" will result in a corrupt cluster if restoring from a backup.", - DataDir, DataDir, DataDir, DataDir))); + errhint("If not found, touch \"%s/recovery.signal\" or \"%s/standby.signal\" and add required recovery options.\n", + DataDir, DataDir))); } } else @@ -643,10 +641,8 @@ InitWalRecovery(ControlFileData *ControlFile, bool *wasShutdown_ptr, ereport(FATAL, (errcode(ERRCODE_CLUSTER_CORRUPTED), errmsg("could not locate required checkpoint record"), - errhint("If you are restoring from a backup, touch \"%s/recovery.signal\" or \"%s/standby.signal\" and add required recovery options.\n" - "If you are not restoring from a backup, try removing the file \"%s/backup_label\".\n" - "Be careful: removing \"%s/backup_label\" will result in a corrupt cluster if restoring from a backup.", - DataDir, DataDir, DataDir, DataDir))); + errhint("If not found, touch \"%s/recovery.signal\" or \"%s/standby.signal\" and add 
required recovery options.\n", + DataDir, DataDir))); wasShutdown = false; /* keep compiler quiet */ } -- 2.40.1