Hi! Currently we amcheck supports lossy checking for missing parent downlinks. It collects bitmap of downlink hashes and use it to check subsequent tree level. We've experienced some large corrupted indexes which pass this check due to its looseness.
However, it seems to me we can implement this check in non-lossy manner without making it significantly slower. We anyway traverse downlinks from parent to children in order to verify that hikeys are corresponding to downlink keys. We can also traverse from one downlink to subsequent using rightlinks. So, if there are some intermediate pages between them, they are candidates to have missing parent downlinks. The patch is attached. With this patch amcheck could successfully detect corruption for our customer, which unpatched amcheck couldn't find. Opinions? ------ Alexander Korotkov Postgres Professional: http://www.postgrespro.com The Russian Postgres Company
commit f433c2cc817eb5b6b11e4974127ec3e6904bcec0 Author: Alexander Korotkov <akorotkov@postgresql.org> Date: Thu Dec 6 01:04:33 2018 +0300 Improve checking for missing parent links Instead of collecting lossy bitmap, traverse from one downlink to subsequent using rightlinks. Intermediate pages are candidates to have lost parent downlinks. diff --git a/contrib/amcheck/verify_nbtree.c b/contrib/amcheck/verify_nbtree.c index 591e0a3e46a..f2b08d5dbdb 100644 --- a/contrib/amcheck/verify_nbtree.c +++ b/contrib/amcheck/verify_nbtree.c @@ -92,6 +92,8 @@ typedef struct BtreeCheckState BlockNumber targetblock; /* Target page's LSN */ XLogRecPtr targetlsn; + /* Previous downlink block number */ + BlockNumber prevdownlink; /* * Mutable state, for optional heapallindexed verification: @@ -99,8 +101,6 @@ typedef struct BtreeCheckState /* Bloom filter fingerprints B-Tree index */ bloom_filter *filter; - /* Bloom filter fingerprints downlink blocks within tree */ - bloom_filter *downlinkfilter; /* Right half of incomplete split marker */ bool rightsplit; /* Debug counter */ @@ -137,7 +137,10 @@ static void bt_target_page_check(BtreeCheckState *state); static BTScanInsert bt_right_page_check_scankey(BtreeCheckState *state); static void bt_downlink_check(BtreeCheckState *state, BTScanInsert targetkey, BlockNumber childblock); -static void bt_downlink_missing_check(BtreeCheckState *state); +static void bt_downlink_connectivity_check(BtreeCheckState *state, + BlockNumber nextdownlink, uint32 parent_level); +static void bt_downlink_missing_check(BtreeCheckState *state, bool rightsplit, + BlockNumber targetblock, Page target); static void bt_tuple_present_callback(Relation index, HeapTuple htup, Datum *values, bool *isnull, bool tupleIsAlive, void *checkstate); @@ -420,23 +423,6 @@ bt_check_every_level(Relation rel, Relation heaprel, bool heapkeyspace, errmsg("index \"%s\" cannot be verified using transaction snapshot", RelationGetRelationName(rel)))); } - else - { - int64 total_pages; - - /* - * Extra readonly downlink check. - * - * In readonly case, we know that there cannot be a concurrent - * page split or a concurrent page deletion, which gives us the - * opportunity to verify that every non-ignorable page had a - * downlink one level up. We must be tolerant of interrupted page - * splits and page deletions, though. This is taken care of in - * bt_downlink_missing_check(). - */ - total_pages = (int64) state->rel->rd_rel->relpages; - state->downlinkfilter = bloom_create(total_pages, work_mem, seed); - } } Assert(!state->rootdescend || state->readonly); @@ -514,16 +500,6 @@ bt_check_every_level(Relation rel, Relation heaprel, bool heapkeyspace, IndexInfo *indexinfo = BuildIndexInfo(state->rel); TableScanDesc scan; - /* Report on extra downlink checks performed in readonly case */ - if (state->readonly) - { - ereport(DEBUG1, - (errmsg_internal("finished verifying presence of downlink blocks within index \"%s\" with bitset %.2f%% set", - RelationGetRelationName(rel), - 100.0 * bloom_prop_bits_set(state->downlinkfilter)))); - bloom_free(state->downlinkfilter); - } - /* * Create our own scan for table_index_build_scan(), rather than * getting it to do so for us. This is required so that we can @@ -626,6 +602,8 @@ bt_check_level_from_leftmost(BtreeCheckState *state, BtreeLevel level) level.istruerootlevel ? " (true root level)" : level.level == 0 ? " (leaf level)" : ""); + state->prevdownlink = InvalidBlockNumber; + do { /* Don't rely on CHECK_FOR_INTERRUPTS() calls at lower level */ @@ -923,14 +901,14 @@ bt_target_page_check(BtreeCheckState *state) (uint32) state->targetlsn))); } - /* Fingerprint downlink blocks in heapallindexed + readonly case */ - if (state->heapallindexed && state->readonly && !P_ISLEAF(topaque)) + /* + * Check connectivity of downlinks. + */ + if (!P_ISLEAF(topaque) && state->readonly) { BlockNumber childblock = BTreeInnerTupleGetDownLink(itup); - bloom_add_element(state->downlinkfilter, - (unsigned char *) &childblock, - sizeof(BlockNumber)); + bt_downlink_connectivity_check(state, childblock, topaque->btpo.level); } /* @@ -1219,13 +1197,10 @@ bt_target_page_check(BtreeCheckState *state) } } - /* - * * Check if page has a downlink in parent * - * - * This can only be checked in heapallindexed + readonly case. - */ - if (state->heapallindexed && state->readonly) - bt_downlink_missing_check(state); + if (!P_ISLEAF(topaque) && P_RIGHTMOST(topaque) && state->readonly) + { + bt_downlink_connectivity_check(state, P_NONE, topaque->btpo.level); + } } /* @@ -1441,6 +1416,83 @@ bt_right_page_check_scankey(BtreeCheckState *state) return bt_mkscankey_pivotsearch(state->rel, firstitup); } +/* + * Check connectivity of downlinks. Traverse rightlinks from previous downlink + * to the current one. Check that there are no intermediate pages with missing + * downlinks. + */ +static void +bt_downlink_connectivity_check(BtreeCheckState *state, + BlockNumber nextdownlink, + uint32 parent_level) +{ + BlockNumber blkno = state->prevdownlink; + Page page; + BTPageOpaque opaque; + bool rightsplit = true; + bool first = true; + + /* + * If no previos downlink is memorized, just save our downlink to future + * usage. + */ + if (!BlockNumberIsValid(blkno)) + { + state->prevdownlink = nextdownlink; + return; + } + + while (true) + { + /* Did we traverse the whole tree level and don't find next downlink? */ + if (blkno == P_NONE) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("can't traverse from downlink %u to downlink %u of index \"%s\"", + state->prevdownlink, nextdownlink, + RelationGetRelationName(state->rel)))); + + page = palloc_btree_page(state, blkno); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + + /* Check level for non-ignorable page */ + if (!P_IGNORE(opaque) && opaque->btpo.level != parent_level - 1) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("block found while traversing rightlinks from downlink of index \"%s\" has invalid level", + RelationGetRelationName(state->rel)), + errdetail_internal("Block pointed to=%u expected level=%u level in pointed to block=%u.", + blkno, parent_level - 1, opaque->btpo.level))); + + /* Try to detect circular links */ + if ((!first && blkno == state->prevdownlink) || blkno == opaque->btpo_prev) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("circular link chain found in block %u of index \"%s\"", + blkno, RelationGetRelationName(state->rel)))); + + if (!first && !P_IGNORE(opaque)) + { + /* blkno probably has missing parent downlink */ + bt_downlink_missing_check(state, rightsplit, blkno, page); + } + + /* Exit if we already found next downlink */ + if (opaque->btpo_next == nextdownlink) + { + state->prevdownlink = nextdownlink; + pfree(page); + return; + } + + /* Traverse to the next page using rightlink */ + blkno = opaque->btpo_next; + rightsplit = P_INCOMPLETE_SPLIT(opaque); + pfree(page); + first = false; + } +} + /* * Checks one of target's downlink against its child page. * @@ -1604,23 +1656,27 @@ bt_downlink_check(BtreeCheckState *state, BTScanInsert targetkey, * be concerned about concurrent page splits or page deletions. */ static void -bt_downlink_missing_check(BtreeCheckState *state) +bt_downlink_missing_check(BtreeCheckState *state, bool rightsplit, + BlockNumber targetblock, Page target) { - BTPageOpaque topaque = (BTPageOpaque) PageGetSpecialPointer(state->target); + BTPageOpaque topaque = (BTPageOpaque) PageGetSpecialPointer(target); ItemId itemid; IndexTuple itup; Page child; BTPageOpaque copaque; uint32 level; BlockNumber childblk; + XLogRecPtr targetlsn; - Assert(state->heapallindexed && state->readonly); + Assert(state->readonly); Assert(!P_IGNORE(topaque)); /* No next level up with downlinks to fingerprint from the true root */ if (P_ISROOT(topaque)) return; + targetlsn = PageGetLSN(target); + /* * Incomplete (interrupted) page splits can account for the lack of a * downlink. Some inserting transaction should eventually complete the @@ -1642,26 +1698,20 @@ bt_downlink_missing_check(BtreeCheckState *state) * by design, so it shouldn't be taken to indicate corruption. See * _bt_pagedel() for full details. */ - if (state->rightsplit) + if (rightsplit) { ereport(DEBUG1, (errcode(ERRCODE_NO_DATA), errmsg("harmless interrupted page split detected in index %s", RelationGetRelationName(state->rel)), errdetail_internal("Block=%u level=%u left sibling=%u page lsn=%X/%X.", - state->targetblock, topaque->btpo.level, + targetblock, topaque->btpo.level, topaque->btpo_prev, - (uint32) (state->targetlsn >> 32), - (uint32) state->targetlsn))); + (uint32) (targetlsn >> 32), + (uint32) targetlsn))); return; } - /* Target's downlink is typically present in parent/fingerprinted */ - if (!bloom_lacks_element(state->downlinkfilter, - (unsigned char *) &state->targetblock, - sizeof(BlockNumber))) - return; - /* * Target is probably the "top parent" of a multi-level page deletion. * We'll need to descend the subtree to make sure that descendant pages @@ -1678,17 +1728,17 @@ bt_downlink_missing_check(BtreeCheckState *state) errmsg("leaf index block lacks downlink in index \"%s\"", RelationGetRelationName(state->rel)), errdetail_internal("Block=%u page lsn=%X/%X.", - state->targetblock, - (uint32) (state->targetlsn >> 32), - (uint32) state->targetlsn))); + targetblock, + (uint32) (targetlsn >> 32), + (uint32) targetlsn))); /* Descend from the target page, which is an internal page */ elog(DEBUG1, "checking for interrupted multi-level deletion due to missing downlink in index \"%s\"", RelationGetRelationName(state->rel)); level = topaque->btpo.level; - itemid = PageGetItemId(state->target, P_FIRSTDATAKEY(topaque)); - itup = (IndexTuple) PageGetItem(state->target, itemid); + itemid = PageGetItemId(target, P_FIRSTDATAKEY(topaque)); + itup = (IndexTuple) PageGetItem(target, itemid); childblk = BTreeInnerTupleGetDownLink(itup); for (;;) { @@ -1707,7 +1757,7 @@ bt_downlink_missing_check(BtreeCheckState *state) errmsg_internal("downlink points to block in index \"%s\" whose level is not one level down", RelationGetRelationName(state->rel)), errdetail_internal("Top parent/target block=%u block pointed to=%u expected level=%u level in pointed to block=%u.", - state->targetblock, childblk, + targetblock, childblk, level - 1, copaque->btpo.level))); level = copaque->btpo.level; @@ -1744,9 +1794,9 @@ bt_downlink_missing_check(BtreeCheckState *state) errmsg_internal("downlink to deleted leaf page found in index \"%s\"", RelationGetRelationName(state->rel)), errdetail_internal("Top parent/target block=%u leaf block=%u top parent/target lsn=%X/%X.", - state->targetblock, childblk, - (uint32) (state->targetlsn >> 32), - (uint32) state->targetlsn))); + targetblock, childblk, + (uint32) (targetlsn >> 32), + (uint32) targetlsn))); /* * Iff leaf page is half-dead, its high key top parent link should point @@ -1762,7 +1812,7 @@ bt_downlink_missing_check(BtreeCheckState *state) { itemid = PageGetItemId(child, P_HIKEY); itup = (IndexTuple) PageGetItem(child, itemid); - if (BTreeTupleGetTopParent(itup) == state->targetblock) + if (BTreeTupleGetTopParent(itup) == targetblock) return; } @@ -1771,9 +1821,9 @@ bt_downlink_missing_check(BtreeCheckState *state) errmsg("internal index block lacks downlink in index \"%s\"", RelationGetRelationName(state->rel)), errdetail_internal("Block=%u level=%u page lsn=%X/%X.", - state->targetblock, topaque->btpo.level, - (uint32) (state->targetlsn >> 32), - (uint32) state->targetlsn))); + targetblock, topaque->btpo.level, + (uint32) (targetlsn >> 32), + (uint32) targetlsn))); } /*