Hi!

Currently we amcheck supports lossy checking for missing parent
downlinks.  It collects bitmap of downlink hashes and use it to check
subsequent tree level.  We've experienced some large corrupted indexes
which pass this check due to its looseness.

However, it seems to me we can implement this check in non-lossy
manner without making it significantly slower.  We anyway traverse
downlinks from parent to children in order to verify that hikeys are
corresponding to downlink keys.  We can also traverse from one
downlink to subsequent using rightlinks.  So, if there are some
intermediate pages between them, they are candidates to have missing
parent downlinks.  The patch is attached.

With this patch amcheck could successfully detect corruption for our
customer, which unpatched amcheck couldn't find.

Opinions?

------
Alexander Korotkov
Postgres Professional: http://www.postgrespro.com
The Russian Postgres Company
commit f433c2cc817eb5b6b11e4974127ec3e6904bcec0
Author: Alexander Korotkov <akorotkov@postgresql.org>
Date:   Thu Dec 6 01:04:33 2018 +0300

    Improve checking for missing parent links
    
    Instead of collecting lossy bitmap, traverse from one downlink to subsequent
    using rightlinks.  Intermediate pages are candidates to have lost parent
    downlinks.

diff --git a/contrib/amcheck/verify_nbtree.c b/contrib/amcheck/verify_nbtree.c
index 591e0a3e46a..f2b08d5dbdb 100644
--- a/contrib/amcheck/verify_nbtree.c
+++ b/contrib/amcheck/verify_nbtree.c
@@ -92,6 +92,8 @@ typedef struct BtreeCheckState
 	BlockNumber targetblock;
 	/* Target page's LSN */
 	XLogRecPtr	targetlsn;
+	/* Previous downlink block number */
+	BlockNumber	prevdownlink;
 
 	/*
 	 * Mutable state, for optional heapallindexed verification:
@@ -99,8 +101,6 @@ typedef struct BtreeCheckState
 
 	/* Bloom filter fingerprints B-Tree index */
 	bloom_filter *filter;
-	/* Bloom filter fingerprints downlink blocks within tree */
-	bloom_filter *downlinkfilter;
 	/* Right half of incomplete split marker */
 	bool		rightsplit;
 	/* Debug counter */
@@ -137,7 +137,10 @@ static void bt_target_page_check(BtreeCheckState *state);
 static BTScanInsert bt_right_page_check_scankey(BtreeCheckState *state);
 static void bt_downlink_check(BtreeCheckState *state, BTScanInsert targetkey,
 				  BlockNumber childblock);
-static void bt_downlink_missing_check(BtreeCheckState *state);
+static void bt_downlink_connectivity_check(BtreeCheckState *state,
+				   BlockNumber nextdownlink, uint32 parent_level);
+static void bt_downlink_missing_check(BtreeCheckState *state, bool rightsplit,
+						  BlockNumber targetblock, Page target);
 static void bt_tuple_present_callback(Relation index, HeapTuple htup,
 						  Datum *values, bool *isnull,
 						  bool tupleIsAlive, void *checkstate);
@@ -420,23 +423,6 @@ bt_check_every_level(Relation rel, Relation heaprel, bool heapkeyspace,
 						 errmsg("index \"%s\" cannot be verified using transaction snapshot",
 								RelationGetRelationName(rel))));
 		}
-		else
-		{
-			int64		total_pages;
-
-			/*
-			 * Extra readonly downlink check.
-			 *
-			 * In readonly case, we know that there cannot be a concurrent
-			 * page split or a concurrent page deletion, which gives us the
-			 * opportunity to verify that every non-ignorable page had a
-			 * downlink one level up.  We must be tolerant of interrupted page
-			 * splits and page deletions, though.  This is taken care of in
-			 * bt_downlink_missing_check().
-			 */
-			total_pages = (int64) state->rel->rd_rel->relpages;
-			state->downlinkfilter = bloom_create(total_pages, work_mem, seed);
-		}
 	}
 
 	Assert(!state->rootdescend || state->readonly);
@@ -514,16 +500,6 @@ bt_check_every_level(Relation rel, Relation heaprel, bool heapkeyspace,
 		IndexInfo  *indexinfo = BuildIndexInfo(state->rel);
 		TableScanDesc scan;
 
-		/* Report on extra downlink checks performed in readonly case */
-		if (state->readonly)
-		{
-			ereport(DEBUG1,
-					(errmsg_internal("finished verifying presence of downlink blocks within index \"%s\" with bitset %.2f%% set",
-									 RelationGetRelationName(rel),
-									 100.0 * bloom_prop_bits_set(state->downlinkfilter))));
-			bloom_free(state->downlinkfilter);
-		}
-
 		/*
 		 * Create our own scan for table_index_build_scan(), rather than
 		 * getting it to do so for us.  This is required so that we can
@@ -626,6 +602,8 @@ bt_check_level_from_leftmost(BtreeCheckState *state, BtreeLevel level)
 		 level.istruerootlevel ?
 		 " (true root level)" : level.level == 0 ? " (leaf level)" : "");
 
+	state->prevdownlink = InvalidBlockNumber;
+
 	do
 	{
 		/* Don't rely on CHECK_FOR_INTERRUPTS() calls at lower level */
@@ -923,14 +901,14 @@ bt_target_page_check(BtreeCheckState *state)
 										(uint32) state->targetlsn)));
 		}
 
-		/* Fingerprint downlink blocks in heapallindexed + readonly case */
-		if (state->heapallindexed && state->readonly && !P_ISLEAF(topaque))
+		/*
+		 * Check connectivity of downlinks.
+		 */
+		if (!P_ISLEAF(topaque) && state->readonly)
 		{
 			BlockNumber childblock = BTreeInnerTupleGetDownLink(itup);
 
-			bloom_add_element(state->downlinkfilter,
-							  (unsigned char *) &childblock,
-							  sizeof(BlockNumber));
+			bt_downlink_connectivity_check(state, childblock, topaque->btpo.level);
 		}
 
 		/*
@@ -1219,13 +1197,10 @@ bt_target_page_check(BtreeCheckState *state)
 		}
 	}
 
-	/*
-	 * * Check if page has a downlink in parent *
-	 *
-	 * This can only be checked in heapallindexed + readonly case.
-	 */
-	if (state->heapallindexed && state->readonly)
-		bt_downlink_missing_check(state);
+	if (!P_ISLEAF(topaque) && P_RIGHTMOST(topaque) && state->readonly)
+	{
+		bt_downlink_connectivity_check(state, P_NONE, topaque->btpo.level);
+	}
 }
 
 /*
@@ -1441,6 +1416,83 @@ bt_right_page_check_scankey(BtreeCheckState *state)
 	return bt_mkscankey_pivotsearch(state->rel, firstitup);
 }
 
+/*
+ * Check connectivity of downlinks.  Traverse rightlinks from previous downlink
+ * to the current one.  Check that there are no intermediate pages with missing
+ * downlinks.
+ */
+static void
+bt_downlink_connectivity_check(BtreeCheckState *state,
+							   BlockNumber nextdownlink,
+							   uint32 parent_level)
+{
+	BlockNumber		blkno = state->prevdownlink;
+	Page			page;
+	BTPageOpaque	opaque;
+	bool			rightsplit = true;
+	bool			first = true;
+
+	/*
+	 * If no previos downlink is memorized, just save our downlink to future
+	 * usage.
+	 */
+	if (!BlockNumberIsValid(blkno))
+	{
+		state->prevdownlink = nextdownlink;
+		return;
+	}
+
+	while (true)
+	{
+		/* Did we traverse the whole tree level and don't find next downlink? */
+		if (blkno == P_NONE)
+			ereport(ERROR,
+					(errcode(ERRCODE_INDEX_CORRUPTED),
+					 errmsg("can't traverse from downlink %u to downlink %u of index \"%s\"",
+							state->prevdownlink, nextdownlink,
+							RelationGetRelationName(state->rel))));
+
+		page = palloc_btree_page(state, blkno);
+		opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+
+		/* Check level for non-ignorable page */
+		if (!P_IGNORE(opaque) && opaque->btpo.level != parent_level - 1)
+			ereport(ERROR,
+					(errcode(ERRCODE_INDEX_CORRUPTED),
+					 errmsg("block found while traversing rightlinks from downlink of index \"%s\" has invalid level",
+							RelationGetRelationName(state->rel)),
+					 errdetail_internal("Block pointed to=%u expected level=%u level in pointed to block=%u.",
+										blkno, parent_level - 1, opaque->btpo.level)));
+
+		/* Try to detect circular links */
+		if ((!first && blkno == state->prevdownlink) || blkno == opaque->btpo_prev)
+			ereport(ERROR,
+					(errcode(ERRCODE_INDEX_CORRUPTED),
+					 errmsg("circular link chain found in block %u of index \"%s\"",
+							blkno, RelationGetRelationName(state->rel))));
+
+		if (!first && !P_IGNORE(opaque))
+		{
+			/* blkno probably has missing parent downlink */
+			bt_downlink_missing_check(state, rightsplit, blkno, page);
+		}
+
+		/* Exit if we already found next downlink */
+		if (opaque->btpo_next == nextdownlink)
+		{
+			state->prevdownlink = nextdownlink;
+			pfree(page);
+			return;
+		}
+
+		/* Traverse to the next page using rightlink */
+		blkno = opaque->btpo_next;
+		rightsplit = P_INCOMPLETE_SPLIT(opaque);
+		pfree(page);
+		first = false;
+	}
+}
+
 /*
  * Checks one of target's downlink against its child page.
  *
@@ -1604,23 +1656,27 @@ bt_downlink_check(BtreeCheckState *state, BTScanInsert targetkey,
  * be concerned about concurrent page splits or page deletions.
  */
 static void
-bt_downlink_missing_check(BtreeCheckState *state)
+bt_downlink_missing_check(BtreeCheckState *state, bool rightsplit,
+						  BlockNumber targetblock, Page target)
 {
-	BTPageOpaque topaque = (BTPageOpaque) PageGetSpecialPointer(state->target);
+	BTPageOpaque topaque = (BTPageOpaque) PageGetSpecialPointer(target);
 	ItemId		itemid;
 	IndexTuple	itup;
 	Page		child;
 	BTPageOpaque copaque;
 	uint32		level;
 	BlockNumber childblk;
+	XLogRecPtr	targetlsn;
 
-	Assert(state->heapallindexed && state->readonly);
+	Assert(state->readonly);
 	Assert(!P_IGNORE(topaque));
 
 	/* No next level up with downlinks to fingerprint from the true root */
 	if (P_ISROOT(topaque))
 		return;
 
+	targetlsn = PageGetLSN(target);
+
 	/*
 	 * Incomplete (interrupted) page splits can account for the lack of a
 	 * downlink.  Some inserting transaction should eventually complete the
@@ -1642,26 +1698,20 @@ bt_downlink_missing_check(BtreeCheckState *state)
 	 * by design, so it shouldn't be taken to indicate corruption.  See
 	 * _bt_pagedel() for full details.
 	 */
-	if (state->rightsplit)
+	if (rightsplit)
 	{
 		ereport(DEBUG1,
 				(errcode(ERRCODE_NO_DATA),
 				 errmsg("harmless interrupted page split detected in index %s",
 						RelationGetRelationName(state->rel)),
 				 errdetail_internal("Block=%u level=%u left sibling=%u page lsn=%X/%X.",
-									state->targetblock, topaque->btpo.level,
+									targetblock, topaque->btpo.level,
 									topaque->btpo_prev,
-									(uint32) (state->targetlsn >> 32),
-									(uint32) state->targetlsn)));
+									(uint32) (targetlsn >> 32),
+									(uint32) targetlsn)));
 		return;
 	}
 
-	/* Target's downlink is typically present in parent/fingerprinted */
-	if (!bloom_lacks_element(state->downlinkfilter,
-							 (unsigned char *) &state->targetblock,
-							 sizeof(BlockNumber)))
-		return;
-
 	/*
 	 * Target is probably the "top parent" of a multi-level page deletion.
 	 * We'll need to descend the subtree to make sure that descendant pages
@@ -1678,17 +1728,17 @@ bt_downlink_missing_check(BtreeCheckState *state)
 				 errmsg("leaf index block lacks downlink in index \"%s\"",
 						RelationGetRelationName(state->rel)),
 				 errdetail_internal("Block=%u page lsn=%X/%X.",
-									state->targetblock,
-									(uint32) (state->targetlsn >> 32),
-									(uint32) state->targetlsn)));
+									targetblock,
+									(uint32) (targetlsn >> 32),
+									(uint32) targetlsn)));
 
 	/* Descend from the target page, which is an internal page */
 	elog(DEBUG1, "checking for interrupted multi-level deletion due to missing downlink in index \"%s\"",
 		 RelationGetRelationName(state->rel));
 
 	level = topaque->btpo.level;
-	itemid = PageGetItemId(state->target, P_FIRSTDATAKEY(topaque));
-	itup = (IndexTuple) PageGetItem(state->target, itemid);
+	itemid = PageGetItemId(target, P_FIRSTDATAKEY(topaque));
+	itup = (IndexTuple) PageGetItem(target, itemid);
 	childblk = BTreeInnerTupleGetDownLink(itup);
 	for (;;)
 	{
@@ -1707,7 +1757,7 @@ bt_downlink_missing_check(BtreeCheckState *state)
 					 errmsg_internal("downlink points to block in index \"%s\" whose level is not one level down",
 									 RelationGetRelationName(state->rel)),
 					 errdetail_internal("Top parent/target block=%u block pointed to=%u expected level=%u level in pointed to block=%u.",
-										state->targetblock, childblk,
+										targetblock, childblk,
 										level - 1, copaque->btpo.level)));
 
 		level = copaque->btpo.level;
@@ -1744,9 +1794,9 @@ bt_downlink_missing_check(BtreeCheckState *state)
 				 errmsg_internal("downlink to deleted leaf page found in index \"%s\"",
 								 RelationGetRelationName(state->rel)),
 				 errdetail_internal("Top parent/target block=%u leaf block=%u top parent/target lsn=%X/%X.",
-									state->targetblock, childblk,
-									(uint32) (state->targetlsn >> 32),
-									(uint32) state->targetlsn)));
+									targetblock, childblk,
+									(uint32) (targetlsn >> 32),
+									(uint32) targetlsn)));
 
 	/*
 	 * Iff leaf page is half-dead, its high key top parent link should point
@@ -1762,7 +1812,7 @@ bt_downlink_missing_check(BtreeCheckState *state)
 	{
 		itemid = PageGetItemId(child, P_HIKEY);
 		itup = (IndexTuple) PageGetItem(child, itemid);
-		if (BTreeTupleGetTopParent(itup) == state->targetblock)
+		if (BTreeTupleGetTopParent(itup) == targetblock)
 			return;
 	}
 
@@ -1771,9 +1821,9 @@ bt_downlink_missing_check(BtreeCheckState *state)
 			 errmsg("internal index block lacks downlink in index \"%s\"",
 					RelationGetRelationName(state->rel)),
 			 errdetail_internal("Block=%u level=%u page lsn=%X/%X.",
-								state->targetblock, topaque->btpo.level,
-								(uint32) (state->targetlsn >> 32),
-								(uint32) state->targetlsn)));
+								targetblock, topaque->btpo.level,
+								(uint32) (targetlsn >> 32),
+								(uint32) targetlsn)));
 }
 
 /*

Reply via email to