Hi, On 2022-09-22 08:19:09 -0700, Andres Freund wrote: > Hi, > > On 2022-08-17 17:28:02 +0500, Andrey Borodin wrote: > > Here's v13. Changes: > > 1. Fixed passing through downlink in GIN index > > 2. Fixed GIN tests (one test case was not working) > > > > Thanks to Vitaliy Kukharik for trying this patches. > > Due to the merge of the meson based build, this patch needs to be > adjusted. See > https://cirrus-ci.com/build/6637154947301376 > > The changes should be fairly simple, just mirroring the Makefile ones.
Here's an updated patch adding meson compat. I didn't fix the following warnings: [25/28 3 89%] Compiling C object contrib/amcheck/amcheck.dll.p/amcheck.c.obj ../../home/andres/src/postgresql/contrib/amcheck/amcheck.c: In function ‘amcheck_lock_relation_and_check’: ../../home/andres/src/postgresql/contrib/amcheck/amcheck.c:81:20: warning: implicit declaration of function ‘NewGUCNestLevel’ [-Wimplicit-function-declaration] 81 | save_nestlevel = NewGUCNestLevel(); | ^~~~~~~~~~~~~~~ ../../home/andres/src/postgresql/contrib/amcheck/amcheck.c:124:2: warning: implicit declaration of function ‘AtEOXact_GUC’; did you mean ‘AtEOXact_SMgr’? [-Wimplicit-function-declaration] 124 | AtEOXact_GUC(false, save_nestlevel); | ^~~~~~~~~~~~ | AtEOXact_SMgr [26/28 2 92%] Compiling C object contrib/amcheck/amcheck.dll.p/verify_gin.c.obj ../../home/andres/src/postgresql/contrib/amcheck/verify_gin.c: In function ‘gin_check_parent_keys_consistency’: ../../home/andres/src/postgresql/contrib/amcheck/verify_gin.c:423:8: warning: unused variable ‘heapallindexed’ [-Wunused-variable] 423 | bool heapallindexed = *((bool*)callback_state); | ^~~~~~~~~~~~~~ [28/28 1 100%] Linking target contrib/amcheck/amcheck.dll Greetings, Andres Freund
>From 9c2919df1419216e596819e9a3e23616d431d8d0 Mon Sep 17 00:00:00 2001 From: "Andrey M. Borodin" <x4mmm@flight.local> Date: Sat, 23 Jul 2022 14:08:10 +0500 Subject: [PATCH v14 1/3] Refactor amcheck to extract common locking routines --- contrib/amcheck/Makefile | 2 + contrib/amcheck/amcheck.c | 188 +++++++++++++++++++ contrib/amcheck/amcheck.h | 27 +++ contrib/amcheck/meson.build | 1 + contrib/amcheck/verify_nbtree.c | 308 ++++++++------------------------ 5 files changed, 297 insertions(+), 229 deletions(-) create mode 100644 contrib/amcheck/amcheck.c create mode 100644 contrib/amcheck/amcheck.h diff --git a/contrib/amcheck/Makefile b/contrib/amcheck/Makefile index b82f221e50b..f10fd9d89d5 100644 --- a/contrib/amcheck/Makefile +++ b/contrib/amcheck/Makefile @@ -3,11 +3,13 @@ MODULE_big = amcheck OBJS = \ $(WIN32RES) \ + amcheck.o \ verify_heapam.o \ verify_nbtree.o EXTENSION = amcheck DATA = amcheck--1.2--1.3.sql amcheck--1.1--1.2.sql amcheck--1.0--1.1.sql amcheck--1.0.sql + PGFILEDESC = "amcheck - function for verifying relation integrity" REGRESS = check check_btree check_heap diff --git a/contrib/amcheck/amcheck.c b/contrib/amcheck/amcheck.c new file mode 100644 index 00000000000..0194ef0d7a2 --- /dev/null +++ b/contrib/amcheck/amcheck.c @@ -0,0 +1,188 @@ +/*------------------------------------------------------------------------- + * + * amcheck.c + * Utility functions common to all access methods. + * + * Copyright (c) 2017-2019, PostgreSQL Global Development Group + * + * IDENTIFICATION + * contrib/amcheck/amcheck.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/genam.h" +#include "access/table.h" +#include "access/tableam.h" +#include "amcheck.h" +#include "catalog/index.h" +#include "commands/tablecmds.h" + + +static bool +amcheck_index_mainfork_expected(Relation rel); + +/* + * Check if index relation should have a file for its main relation + * fork. Verification uses this to skip unlogged indexes when in hot standby + * mode, where there is simply nothing to verify. + * + * NB: Caller should call index_checkable() + * before calling here. + */ +static bool +amcheck_index_mainfork_expected(Relation rel) +{ + if (rel->rd_rel->relpersistence != RELPERSISTENCE_UNLOGGED || + !RecoveryInProgress()) + return true; + + ereport(NOTICE, + (errcode(ERRCODE_READ_ONLY_SQL_TRANSACTION), + errmsg("cannot verify unlogged index \"%s\" during recovery, skipping", + RelationGetRelationName(rel)))); + + return false; +} + +void +amcheck_lock_relation_and_check(Oid indrelid, IndexCheckableCallback checkable, + IndexDoCheckCallback check, LOCKMODE lockmode, void *state) +{ + Oid heapid; + Relation indrel; + Relation heaprel; + Oid save_userid; + int save_sec_context; + int save_nestlevel; + + /* + * We must lock table before index to avoid deadlocks. However, if the + * passed indrelid isn't an index then IndexGetRelation() will fail. + * Rather than emitting a not-very-helpful error message, postpone + * complaining, expecting that the is-it-an-index test below will fail. + * + * In hot standby mode this will raise an error when parentcheck is true. + */ + heapid = IndexGetRelation(indrelid, true); + if (OidIsValid(heapid)) + { + heaprel = table_open(heapid, lockmode); + + /* + * Switch to the table owner's userid, so that any index functions are + * run as that user. Also lock down security-restricted operations + * and arrange to make GUC variable changes local to this command. + */ + GetUserIdAndSecContext(&save_userid, &save_sec_context); + SetUserIdAndSecContext(heaprel->rd_rel->relowner, + save_sec_context | SECURITY_RESTRICTED_OPERATION); + save_nestlevel = NewGUCNestLevel(); + } + else + { + heaprel = NULL; + /* for "gcc -Og" https://gcc.gnu.org/bugzilla/show_bug.cgi?id=78394 */ + save_userid = InvalidOid; + save_sec_context = -1; + save_nestlevel = -1; + } + + /* + * Open the target index relations separately (like relation_openrv(), but + * with heap relation locked first to prevent deadlocking). In hot + * standby mode this will raise an error when parentcheck is true. + * + * There is no need for the usual indcheckxmin usability horizon test + * here, even in the heapallindexed case, because index undergoing + * verification only needs to have entries for a new transaction snapshot. + * (If this is a parentcheck verification, there is no question about + * committed or recently dead heap tuples lacking index entries due to + * concurrent activity.) + */ + indrel = index_open(indrelid, lockmode); + + /* + * Since we did the IndexGetRelation call above without any lock, it's + * barely possible that a race against an index drop/recreation could have + * netted us the wrong table. + */ + if (heaprel == NULL || heapid != IndexGetRelation(indrelid, false)) + ereport(ERROR, + (errcode(ERRCODE_UNDEFINED_TABLE), + errmsg("could not open parent table of index \"%s\"", + RelationGetRelationName(indrel)))); + + /* Relation suitable for checking */ + checkable(indrel); + + if (amcheck_index_mainfork_expected(indrel)) + check(indrel, heaprel, state); + + /* Roll back any GUC changes executed by index functions */ + AtEOXact_GUC(false, save_nestlevel); + + /* Restore userid and security context */ + SetUserIdAndSecContext(save_userid, save_sec_context); + + /* + * Release locks early. That's ok here because nothing in the called + * routines will trigger shared cache invalidations to be sent, so we can + * relax the usual pattern of only releasing locks after commit. + */ + index_close(indrel, lockmode); + if (heaprel) + table_close(heaprel, lockmode); +} + +/* + * PageGetItemId() wrapper that validates returned line pointer. + * + * Buffer page/page item access macros generally trust that line pointers are + * not corrupt, which might cause problems for verification itself. For + * example, there is no bounds checking in PageGetItem(). Passing it a + * corrupt line pointer can cause it to return a tuple/pointer that is unsafe + * to dereference. + * + * Validating line pointers before tuples avoids undefined behavior and + * assertion failures with corrupt indexes, making the verification process + * more robust and predictable. + */ +ItemId +PageGetItemIdCareful(Relation rel, BlockNumber block, Page page, + OffsetNumber offset, size_t opaquesize) +{ + ItemId itemid = PageGetItemId(page, offset); + + Assert(opaquesize == MAXALIGN(opaquesize)); + + if (ItemIdGetOffset(itemid) + ItemIdGetLength(itemid) > + BLCKSZ - MAXALIGN(opaquesize)) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("line pointer points past end of tuple space in index \"%s\"", + RelationGetRelationName(rel)), + errdetail_internal("Index tid=(%u,%u) lp_off=%u, lp_len=%u lp_flags=%u.", + block, offset, ItemIdGetOffset(itemid), + ItemIdGetLength(itemid), + ItemIdGetFlags(itemid)))); + + /* + * Verify that line pointer isn't LP_REDIRECT or LP_UNUSED, since nbtree and gist + * never uses either. Verify that line pointer has storage, too, since + * even LP_DEAD items should. + */ + if (ItemIdIsRedirected(itemid) || !ItemIdIsUsed(itemid) || + ItemIdGetLength(itemid) == 0) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("invalid line pointer storage in index \"%s\"", + RelationGetRelationName(rel)), + errdetail_internal("Index tid=(%u,%u) lp_off=%u, lp_len=%u lp_flags=%u.", + block, offset, ItemIdGetOffset(itemid), + ItemIdGetLength(itemid), + ItemIdGetFlags(itemid)))); + + return itemid; +} diff --git a/contrib/amcheck/amcheck.h b/contrib/amcheck/amcheck.h new file mode 100644 index 00000000000..10906efd8a5 --- /dev/null +++ b/contrib/amcheck/amcheck.h @@ -0,0 +1,27 @@ +/*------------------------------------------------------------------------- + * + * amcheck.h + * Shared routines for amcheck verifications. + * + * Copyright (c) 2019, PostgreSQL Global Development Group + * + * IDENTIFICATION + * contrib/amcheck/amcheck.h + * + *------------------------------------------------------------------------- + */ +#include "storage/lockdefs.h" +#include "utils/relcache.h" +#include "miscadmin.h" + +/* Typedefs for callback functions for amcheck_lock_relation */ +typedef void (*IndexCheckableCallback) (Relation index); +typedef void (*IndexDoCheckCallback) (Relation rel, Relation heaprel, void* state); + +extern void amcheck_lock_relation_and_check(Oid indrelid, + IndexCheckableCallback checkable, + IndexDoCheckCallback check, + LOCKMODE lockmode, void *state); + +extern ItemId PageGetItemIdCareful(Relation rel, BlockNumber block, + Page page, OffsetNumber offset, size_t opaquesize); \ No newline at end of file diff --git a/contrib/amcheck/meson.build b/contrib/amcheck/meson.build index 1db3d20349e..227e68ff834 100644 --- a/contrib/amcheck/meson.build +++ b/contrib/amcheck/meson.build @@ -1,4 +1,5 @@ amcheck = shared_module('amcheck', [ + 'amcheck.c', 'verify_heapam.c', 'verify_nbtree.c', ], diff --git a/contrib/amcheck/verify_nbtree.c b/contrib/amcheck/verify_nbtree.c index 9021d156eb7..950014f19d5 100644 --- a/contrib/amcheck/verify_nbtree.c +++ b/contrib/amcheck/verify_nbtree.c @@ -41,6 +41,8 @@ #include "utils/memutils.h" #include "utils/snapmgr.h" +#include "amcheck.h" + PG_MODULE_MAGIC; @@ -138,10 +140,8 @@ typedef struct BtreeLevel PG_FUNCTION_INFO_V1(bt_index_check); PG_FUNCTION_INFO_V1(bt_index_parent_check); -static void bt_index_check_internal(Oid indrelid, bool parentcheck, - bool heapallindexed, bool rootdescend); +static void bt_index_check_internal_callback(Relation indrel, Relation heaprel, void* state); static inline void btree_index_checkable(Relation rel); -static inline bool btree_index_mainfork_expected(Relation rel); static void bt_check_every_level(Relation rel, Relation heaprel, bool heapkeyspace, bool readonly, bool heapallindexed, bool rootdescend); @@ -184,12 +184,17 @@ static inline bool invariant_l_nontarget_offset(BtreeCheckState *state, static Page palloc_btree_page(BtreeCheckState *state, BlockNumber blocknum); static inline BTScanInsert bt_mkscankey_pivotsearch(Relation rel, IndexTuple itup); -static ItemId PageGetItemIdCareful(BtreeCheckState *state, BlockNumber block, - Page page, OffsetNumber offset); static inline ItemPointer BTreeTupleGetHeapTIDCareful(BtreeCheckState *state, IndexTuple itup, bool nonpivot); static inline ItemPointer BTreeTupleGetPointsToTID(IndexTuple itup); +typedef struct BTCheckCallbackState +{ + bool parentcheck; + bool heapallindexed; + bool rootdescend; +} BTCheckCallbackState; + /* * bt_index_check(index regclass, heapallindexed boolean) * @@ -203,12 +208,17 @@ Datum bt_index_check(PG_FUNCTION_ARGS) { Oid indrelid = PG_GETARG_OID(0); - bool heapallindexed = false; + BTCheckCallbackState args; - if (PG_NARGS() == 2) - heapallindexed = PG_GETARG_BOOL(1); + args.heapallindexed = false; + args.rootdescend = false; + args.parentcheck = false; - bt_index_check_internal(indrelid, false, heapallindexed, false); + if (PG_NARGS() >= 2) + args.heapallindexed = PG_GETARG_BOOL(1); + + amcheck_lock_relation_and_check(indrelid, btree_index_checkable, + bt_index_check_internal_callback, AccessShareLock, &args); PG_RETURN_VOID(); } @@ -226,15 +236,18 @@ Datum bt_index_parent_check(PG_FUNCTION_ARGS) { Oid indrelid = PG_GETARG_OID(0); - bool heapallindexed = false; - bool rootdescend = false; + BTCheckCallbackState args; + args.heapallindexed = false; + args.rootdescend = false; + args.parentcheck = true; if (PG_NARGS() >= 2) - heapallindexed = PG_GETARG_BOOL(1); + args.heapallindexed = PG_GETARG_BOOL(1); if (PG_NARGS() == 3) - rootdescend = PG_GETARG_BOOL(2); + args.rootdescend = PG_GETARG_BOOL(2); - bt_index_check_internal(indrelid, true, heapallindexed, rootdescend); + amcheck_lock_relation_and_check(indrelid, btree_index_checkable, + bt_index_check_internal_callback, ShareLock, &args); PG_RETURN_VOID(); } @@ -242,126 +255,35 @@ bt_index_parent_check(PG_FUNCTION_ARGS) /* * Helper for bt_index_[parent_]check, coordinating the bulk of the work. */ -static void -bt_index_check_internal(Oid indrelid, bool parentcheck, bool heapallindexed, - bool rootdescend) +static void bt_index_check_internal_callback(Relation indrel, Relation heaprel, void* state) { - Oid heapid; - Relation indrel; - Relation heaprel; - LOCKMODE lockmode; - Oid save_userid; - int save_sec_context; - int save_nestlevel; - - if (parentcheck) - lockmode = ShareLock; - else - lockmode = AccessShareLock; - - /* - * We must lock table before index to avoid deadlocks. However, if the - * passed indrelid isn't an index then IndexGetRelation() will fail. - * Rather than emitting a not-very-helpful error message, postpone - * complaining, expecting that the is-it-an-index test below will fail. - * - * In hot standby mode this will raise an error when parentcheck is true. - */ - heapid = IndexGetRelation(indrelid, true); - if (OidIsValid(heapid)) - { - heaprel = table_open(heapid, lockmode); - - /* - * Switch to the table owner's userid, so that any index functions are - * run as that user. Also lock down security-restricted operations - * and arrange to make GUC variable changes local to this command. - */ - GetUserIdAndSecContext(&save_userid, &save_sec_context); - SetUserIdAndSecContext(heaprel->rd_rel->relowner, - save_sec_context | SECURITY_RESTRICTED_OPERATION); - save_nestlevel = NewGUCNestLevel(); - } - else - { - heaprel = NULL; - /* Set these just to suppress "uninitialized variable" warnings */ - save_userid = InvalidOid; - save_sec_context = -1; - save_nestlevel = -1; - } - - /* - * Open the target index relations separately (like relation_openrv(), but - * with heap relation locked first to prevent deadlocking). In hot - * standby mode this will raise an error when parentcheck is true. - * - * There is no need for the usual indcheckxmin usability horizon test - * here, even in the heapallindexed case, because index undergoing - * verification only needs to have entries for a new transaction snapshot. - * (If this is a parentcheck verification, there is no question about - * committed or recently dead heap tuples lacking index entries due to - * concurrent activity.) - */ - indrel = index_open(indrelid, lockmode); - - /* - * Since we did the IndexGetRelation call above without any lock, it's - * barely possible that a race against an index drop/recreation could have - * netted us the wrong table. - */ - if (heaprel == NULL || heapid != IndexGetRelation(indrelid, false)) - ereport(ERROR, - (errcode(ERRCODE_UNDEFINED_TABLE), - errmsg("could not open parent table of index \"%s\"", - RelationGetRelationName(indrel)))); - - /* Relation suitable for checking as B-Tree? */ - btree_index_checkable(indrel); - - if (btree_index_mainfork_expected(indrel)) - { - bool heapkeyspace, + BTCheckCallbackState* args = (BTCheckCallbackState*) state; + bool heapkeyspace, allequalimage; - if (!smgrexists(RelationGetSmgr(indrel), MAIN_FORKNUM)) - ereport(ERROR, - (errcode(ERRCODE_INDEX_CORRUPTED), - errmsg("index \"%s\" lacks a main relation fork", - RelationGetRelationName(indrel)))); + if (!smgrexists(RelationGetSmgr(indrel), MAIN_FORKNUM)) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("index \"%s\" lacks a main relation fork", + RelationGetRelationName(indrel)))); - /* Extract metadata from metapage, and sanitize it in passing */ - _bt_metaversion(indrel, &heapkeyspace, &allequalimage); - if (allequalimage && !heapkeyspace) - ereport(ERROR, - (errcode(ERRCODE_INDEX_CORRUPTED), - errmsg("index \"%s\" metapage has equalimage field set on unsupported nbtree version", - RelationGetRelationName(indrel)))); - if (allequalimage && !_bt_allequalimage(indrel, false)) - ereport(ERROR, - (errcode(ERRCODE_INDEX_CORRUPTED), - errmsg("index \"%s\" metapage incorrectly indicates that deduplication is safe", - RelationGetRelationName(indrel)))); + /* Extract metadata from metapage, and sanitize it in passing */ + _bt_metaversion(indrel, &heapkeyspace, &allequalimage); + if (allequalimage && !heapkeyspace) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("index \"%s\" metapage has equalimage field set on unsupported nbtree version", + RelationGetRelationName(indrel)))); + if (allequalimage && !_bt_allequalimage(indrel, false)) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("index \"%s\" metapage incorrectly indicates that deduplication is safe", + RelationGetRelationName(indrel)))); - /* Check index, possibly against table it is an index on */ - bt_check_every_level(indrel, heaprel, heapkeyspace, parentcheck, - heapallindexed, rootdescend); - } + /* Check index, possibly against table it is an index on */ + bt_check_every_level(indrel, heaprel, heapkeyspace, args->parentcheck, + args->heapallindexed, args->rootdescend); - /* Roll back any GUC changes executed by index functions */ - AtEOXact_GUC(false, save_nestlevel); - - /* Restore userid and security context */ - SetUserIdAndSecContext(save_userid, save_sec_context); - - /* - * Release locks early. That's ok here because nothing in the called - * routines will trigger shared cache invalidations to be sent, so we can - * relax the usual pattern of only releasing locks after commit. - */ - index_close(indrel, lockmode); - if (heaprel) - table_close(heaprel, lockmode); } /* @@ -398,29 +320,6 @@ btree_index_checkable(Relation rel) errdetail("Index is not valid."))); } -/* - * Check if B-Tree index relation should have a file for its main relation - * fork. Verification uses this to skip unlogged indexes when in hot standby - * mode, where there is simply nothing to verify. We behave as if the - * relation is empty. - * - * NB: Caller should call btree_index_checkable() before calling here. - */ -static inline bool -btree_index_mainfork_expected(Relation rel) -{ - if (rel->rd_rel->relpersistence != RELPERSISTENCE_UNLOGGED || - !RecoveryInProgress()) - return true; - - ereport(DEBUG1, - (errcode(ERRCODE_READ_ONLY_SQL_TRANSACTION), - errmsg("cannot verify unlogged index \"%s\" during recovery, skipping", - RelationGetRelationName(rel)))); - - return false; -} - /* * Main entry point for B-Tree SQL-callable functions. Walks the B-Tree in * logical order, verifying invariants as it goes. Optionally, verification @@ -793,9 +692,9 @@ bt_check_level_from_leftmost(BtreeCheckState *state, BtreeLevel level) ItemId itemid; /* Internal page -- downlink gets leftmost on next level */ - itemid = PageGetItemIdCareful(state, state->targetblock, + itemid = PageGetItemIdCareful(state->rel, state->targetblock, state->target, - P_FIRSTDATAKEY(opaque)); + P_FIRSTDATAKEY(opaque), sizeof(BTPageOpaqueData)); itup = (IndexTuple) PageGetItem(state->target, itemid); nextleveldown.leftmost = BTreeTupleGetDownLink(itup); nextleveldown.level = opaque->btpo_level - 1; @@ -875,8 +774,8 @@ nextpage: IndexTuple itup; ItemId itemid; - itemid = PageGetItemIdCareful(state, state->targetblock, - state->target, P_HIKEY); + itemid = PageGetItemIdCareful(state->rel, state->targetblock, + state->target, P_HIKEY, sizeof(BTPageOpaqueData)); itup = (IndexTuple) PageGetItem(state->target, itemid); state->lowkey = MemoryContextAlloc(oldcontext, IndexTupleSize(itup)); @@ -1093,8 +992,8 @@ bt_target_page_check(BtreeCheckState *state) IndexTuple itup; /* Verify line pointer before checking tuple */ - itemid = PageGetItemIdCareful(state, state->targetblock, - state->target, P_HIKEY); + itemid = PageGetItemIdCareful(state->rel, state->targetblock, + state->target, P_HIKEY, sizeof(BTPageOpaqueData)); if (!_bt_check_natts(state->rel, state->heapkeyspace, state->target, P_HIKEY)) { @@ -1129,8 +1028,8 @@ bt_target_page_check(BtreeCheckState *state) CHECK_FOR_INTERRUPTS(); - itemid = PageGetItemIdCareful(state, state->targetblock, - state->target, offset); + itemid = PageGetItemIdCareful(state->rel, state->targetblock, + state->target, offset, sizeof(BTPageOpaqueData)); itup = (IndexTuple) PageGetItem(state->target, itemid); tupsize = IndexTupleSize(itup); @@ -1442,9 +1341,9 @@ bt_target_page_check(BtreeCheckState *state) OffsetNumberNext(offset)); /* Reuse itup to get pointed-to heap location of second item */ - itemid = PageGetItemIdCareful(state, state->targetblock, + itemid = PageGetItemIdCareful(state->rel, state->targetblock, state->target, - OffsetNumberNext(offset)); + OffsetNumberNext(offset), sizeof(BTPageOpaqueData)); itup = (IndexTuple) PageGetItem(state->target, itemid); tid = BTreeTupleGetPointsToTID(itup); nhtid = psprintf("(%u,%u)", @@ -1735,8 +1634,8 @@ bt_right_page_check_scankey(BtreeCheckState *state) if (P_ISLEAF(opaque) && nline >= P_FIRSTDATAKEY(opaque)) { /* Return first data item (if any) */ - rightitem = PageGetItemIdCareful(state, targetnext, rightpage, - P_FIRSTDATAKEY(opaque)); + rightitem = PageGetItemIdCareful(state->rel, targetnext, rightpage, + P_FIRSTDATAKEY(opaque), sizeof(BTPageOpaqueData)); } else if (!P_ISLEAF(opaque) && nline >= OffsetNumberNext(P_FIRSTDATAKEY(opaque))) @@ -1745,8 +1644,8 @@ bt_right_page_check_scankey(BtreeCheckState *state) * Return first item after the internal page's "negative infinity" * item */ - rightitem = PageGetItemIdCareful(state, targetnext, rightpage, - OffsetNumberNext(P_FIRSTDATAKEY(opaque))); + rightitem = PageGetItemIdCareful(state->rel, targetnext, rightpage, + OffsetNumberNext(P_FIRSTDATAKEY(opaque)), sizeof(BTPageOpaqueData)); } else { @@ -1865,8 +1764,8 @@ bt_child_highkey_check(BtreeCheckState *state, if (OffsetNumberIsValid(target_downlinkoffnum)) { - itemid = PageGetItemIdCareful(state, state->targetblock, - state->target, target_downlinkoffnum); + itemid = PageGetItemIdCareful(state->rel, state->targetblock, + state->target, target_downlinkoffnum, sizeof(BTPageOpaqueData)); itup = (IndexTuple) PageGetItem(state->target, itemid); downlink = BTreeTupleGetDownLink(itup); } @@ -1969,7 +1868,7 @@ bt_child_highkey_check(BtreeCheckState *state, OffsetNumber pivotkey_offset; /* Get high key */ - itemid = PageGetItemIdCareful(state, blkno, page, P_HIKEY); + itemid = PageGetItemIdCareful(state->rel, blkno, page, P_HIKEY, sizeof(BTPageOpaqueData)); highkey = (IndexTuple) PageGetItem(page, itemid); /* @@ -2020,8 +1919,8 @@ bt_child_highkey_check(BtreeCheckState *state, LSN_FORMAT_ARGS(state->targetlsn)))); pivotkey_offset = P_HIKEY; } - itemid = PageGetItemIdCareful(state, state->targetblock, - state->target, pivotkey_offset); + itemid = PageGetItemIdCareful(state->rel, state->targetblock, + state->target, pivotkey_offset, sizeof(BTPageOpaqueData)); itup = (IndexTuple) PageGetItem(state->target, itemid); } else @@ -2107,8 +2006,8 @@ bt_child_check(BtreeCheckState *state, BTScanInsert targetkey, BTPageOpaque copaque; BTPageOpaque topaque; - itemid = PageGetItemIdCareful(state, state->targetblock, - state->target, downlinkoffnum); + itemid = PageGetItemIdCareful(state->rel, state->targetblock, + state->target, downlinkoffnum, sizeof(BTPageOpaqueData)); itup = (IndexTuple) PageGetItem(state->target, itemid); childblock = BTreeTupleGetDownLink(itup); @@ -2339,7 +2238,7 @@ bt_downlink_missing_check(BtreeCheckState *state, bool rightsplit, RelationGetRelationName(state->rel)); level = opaque->btpo_level; - itemid = PageGetItemIdCareful(state, blkno, page, P_FIRSTDATAKEY(opaque)); + itemid = PageGetItemIdCareful(state->rel, blkno, page, P_FIRSTDATAKEY(opaque), sizeof(BTPageOpaqueData)); itup = (IndexTuple) PageGetItem(page, itemid); childblk = BTreeTupleGetDownLink(itup); for (;;) @@ -2363,8 +2262,8 @@ bt_downlink_missing_check(BtreeCheckState *state, bool rightsplit, level - 1, copaque->btpo_level))); level = copaque->btpo_level; - itemid = PageGetItemIdCareful(state, childblk, child, - P_FIRSTDATAKEY(copaque)); + itemid = PageGetItemIdCareful(state->rel, childblk, child, + P_FIRSTDATAKEY(copaque), sizeof(BTPageOpaqueData)); itup = (IndexTuple) PageGetItem(child, itemid); childblk = BTreeTupleGetDownLink(itup); /* Be slightly more pro-active in freeing this memory, just in case */ @@ -2412,7 +2311,7 @@ bt_downlink_missing_check(BtreeCheckState *state, bool rightsplit, */ if (P_ISHALFDEAD(copaque) && !P_RIGHTMOST(copaque)) { - itemid = PageGetItemIdCareful(state, childblk, child, P_HIKEY); + itemid = PageGetItemIdCareful(state->rel, childblk, child, P_HIKEY, sizeof(BTPageOpaqueData)); itup = (IndexTuple) PageGetItem(child, itemid); if (BTreeTupleGetTopParent(itup) == blkno) return; @@ -2782,8 +2681,8 @@ invariant_l_offset(BtreeCheckState *state, BTScanInsert key, Assert(key->pivotsearch); /* Verify line pointer before checking tuple */ - itemid = PageGetItemIdCareful(state, state->targetblock, state->target, - upperbound); + itemid = PageGetItemIdCareful(state->rel, state->targetblock, state->target, + upperbound, sizeof(BTPageOpaqueData)); /* pg_upgrade'd indexes may legally have equal sibling tuples */ if (!key->heapkeyspace) return invariant_leq_offset(state, key, upperbound); @@ -2905,8 +2804,8 @@ invariant_l_nontarget_offset(BtreeCheckState *state, BTScanInsert key, Assert(key->pivotsearch); /* Verify line pointer before checking tuple */ - itemid = PageGetItemIdCareful(state, nontargetblock, nontarget, - upperbound); + itemid = PageGetItemIdCareful(state->rel, nontargetblock, nontarget, + upperbound, sizeof(BTPageOpaqueData)); cmp = _bt_compare(state->rel, key, nontarget, upperbound); /* pg_upgrade'd indexes may legally have equal sibling tuples */ @@ -3143,55 +3042,6 @@ bt_mkscankey_pivotsearch(Relation rel, IndexTuple itup) return skey; } -/* - * PageGetItemId() wrapper that validates returned line pointer. - * - * Buffer page/page item access macros generally trust that line pointers are - * not corrupt, which might cause problems for verification itself. For - * example, there is no bounds checking in PageGetItem(). Passing it a - * corrupt line pointer can cause it to return a tuple/pointer that is unsafe - * to dereference. - * - * Validating line pointers before tuples avoids undefined behavior and - * assertion failures with corrupt indexes, making the verification process - * more robust and predictable. - */ -static ItemId -PageGetItemIdCareful(BtreeCheckState *state, BlockNumber block, Page page, - OffsetNumber offset) -{ - ItemId itemid = PageGetItemId(page, offset); - - if (ItemIdGetOffset(itemid) + ItemIdGetLength(itemid) > - BLCKSZ - MAXALIGN(sizeof(BTPageOpaqueData))) - ereport(ERROR, - (errcode(ERRCODE_INDEX_CORRUPTED), - errmsg("line pointer points past end of tuple space in index \"%s\"", - RelationGetRelationName(state->rel)), - errdetail_internal("Index tid=(%u,%u) lp_off=%u, lp_len=%u lp_flags=%u.", - block, offset, ItemIdGetOffset(itemid), - ItemIdGetLength(itemid), - ItemIdGetFlags(itemid)))); - - /* - * Verify that line pointer isn't LP_REDIRECT or LP_UNUSED, since nbtree - * never uses either. Verify that line pointer has storage, too, since - * even LP_DEAD items should within nbtree. - */ - if (ItemIdIsRedirected(itemid) || !ItemIdIsUsed(itemid) || - ItemIdGetLength(itemid) == 0) - ereport(ERROR, - (errcode(ERRCODE_INDEX_CORRUPTED), - errmsg("invalid line pointer storage in index \"%s\"", - RelationGetRelationName(state->rel)), - errdetail_internal("Index tid=(%u,%u) lp_off=%u, lp_len=%u lp_flags=%u.", - block, offset, ItemIdGetOffset(itemid), - ItemIdGetLength(itemid), - ItemIdGetFlags(itemid)))); - - return itemid; -} - /* * BTreeTupleGetHeapTID() wrapper that enforces that a heap TID is present in * cases where that is mandatory (i.e. for non-pivot tuples) -- 2.37.3.542.gdd3f6c4cae
>From 5c6709a38561adf4cf2d6f72ad5fcd86fdf4bbde Mon Sep 17 00:00:00 2001 From: "Andrey M. Borodin" <x4mmm@flight.local> Date: Sat, 23 Jul 2022 14:17:44 +0500 Subject: [PATCH v14 2/3] Add gist_index_parent_check() function to verify GiST index --- contrib/amcheck/Makefile | 6 +- contrib/amcheck/amcheck--1.3--1.4.sql | 14 + contrib/amcheck/amcheck.control | 2 +- contrib/amcheck/expected/check_gist.out | 119 ++++++ contrib/amcheck/meson.build | 3 + contrib/amcheck/sql/check_gist.sql | 42 ++ contrib/amcheck/verify_gist.c | 520 ++++++++++++++++++++++++ doc/src/sgml/amcheck.sgml | 19 + 8 files changed, 722 insertions(+), 3 deletions(-) create mode 100644 contrib/amcheck/amcheck--1.3--1.4.sql create mode 100644 contrib/amcheck/expected/check_gist.out create mode 100644 contrib/amcheck/sql/check_gist.sql create mode 100644 contrib/amcheck/verify_gist.c diff --git a/contrib/amcheck/Makefile b/contrib/amcheck/Makefile index f10fd9d89d5..a8174195817 100644 --- a/contrib/amcheck/Makefile +++ b/contrib/amcheck/Makefile @@ -4,15 +4,17 @@ MODULE_big = amcheck OBJS = \ $(WIN32RES) \ amcheck.o \ + verify_gist.o \ verify_heapam.o \ verify_nbtree.o EXTENSION = amcheck -DATA = amcheck--1.2--1.3.sql amcheck--1.1--1.2.sql amcheck--1.0--1.1.sql amcheck--1.0.sql +DATA = amcheck--1.2--1.3.sql amcheck--1.1--1.2.sql amcheck--1.0--1.1.sql amcheck--1.0.sql \ + amcheck--1.3--1.4.sql PGFILEDESC = "amcheck - function for verifying relation integrity" -REGRESS = check check_btree check_heap +REGRESS = check check_btree check_heap check_gist TAP_TESTS = 1 diff --git a/contrib/amcheck/amcheck--1.3--1.4.sql b/contrib/amcheck/amcheck--1.3--1.4.sql new file mode 100644 index 00000000000..93297379efe --- /dev/null +++ b/contrib/amcheck/amcheck--1.3--1.4.sql @@ -0,0 +1,14 @@ +/* contrib/amcheck/amcheck--1.3--1.4.sql */ + +-- complain if script is sourced in psql, rather than via CREATE EXTENSION +\echo Use "ALTER EXTENSION amcheck UPDATE TO '1.4'" to load this file. \quit + + +-- gist_index_parent_check() +-- +CREATE FUNCTION gist_index_parent_check(index regclass, heapallindexed boolean) +RETURNS VOID +AS 'MODULE_PATHNAME', 'gist_index_parent_check' +LANGUAGE C STRICT; + +REVOKE ALL ON FUNCTION gist_index_parent_check(regclass, boolean) FROM PUBLIC; \ No newline at end of file diff --git a/contrib/amcheck/amcheck.control b/contrib/amcheck/amcheck.control index ab50931f754..e67ace01c99 100644 --- a/contrib/amcheck/amcheck.control +++ b/contrib/amcheck/amcheck.control @@ -1,5 +1,5 @@ # amcheck extension comment = 'functions for verifying relation integrity' -default_version = '1.3' +default_version = '1.4' module_pathname = '$libdir/amcheck' relocatable = true diff --git a/contrib/amcheck/expected/check_gist.out b/contrib/amcheck/expected/check_gist.out new file mode 100644 index 00000000000..9749adfd340 --- /dev/null +++ b/contrib/amcheck/expected/check_gist.out @@ -0,0 +1,119 @@ +SELECT setseed(1); + setseed +--------- + +(1 row) + +-- Test that index built with bulk load is correct +CREATE TABLE gist_check AS SELECT point(random(),s) c, random() p FROM generate_series(1,10000) s; +CREATE INDEX gist_check_idx1 ON gist_check USING gist(c); +CREATE INDEX gist_check_idx2 ON gist_check USING gist(c) INCLUDE(p); +SELECT gist_index_parent_check('gist_check_idx1', false); + gist_index_parent_check +------------------------- + +(1 row) + +SELECT gist_index_parent_check('gist_check_idx2', false); + gist_index_parent_check +------------------------- + +(1 row) + +SELECT gist_index_parent_check('gist_check_idx1', true); + gist_index_parent_check +------------------------- + +(1 row) + +SELECT gist_index_parent_check('gist_check_idx2', true); + gist_index_parent_check +------------------------- + +(1 row) + +-- Test that index is correct after inserts +INSERT INTO gist_check SELECT point(random(),s) c, random() p FROM generate_series(1,10000) s; +SELECT gist_index_parent_check('gist_check_idx1', false); + gist_index_parent_check +------------------------- + +(1 row) + +SELECT gist_index_parent_check('gist_check_idx2', false); + gist_index_parent_check +------------------------- + +(1 row) + +SELECT gist_index_parent_check('gist_check_idx1', true); + gist_index_parent_check +------------------------- + +(1 row) + +SELECT gist_index_parent_check('gist_check_idx2', true); + gist_index_parent_check +------------------------- + +(1 row) + +-- Test that index is correct after vacuuming +DELETE FROM gist_check WHERE c[1] < 5000; -- delete clustered data +DELETE FROM gist_check WHERE c[1]::int % 2 = 0; -- delete scattered data +-- We need two passes through the index and one global vacuum to actually +-- reuse page +VACUUM gist_check; +VACUUM; +SELECT gist_index_parent_check('gist_check_idx1', false); + gist_index_parent_check +------------------------- + +(1 row) + +SELECT gist_index_parent_check('gist_check_idx2', false); + gist_index_parent_check +------------------------- + +(1 row) + +SELECT gist_index_parent_check('gist_check_idx1', true); + gist_index_parent_check +------------------------- + +(1 row) + +SELECT gist_index_parent_check('gist_check_idx2', true); + gist_index_parent_check +------------------------- + +(1 row) + +-- Test that index is correct after reusing pages +INSERT INTO gist_check SELECT point(random(),s) c, random() p FROM generate_series(1,10000) s; +SELECT gist_index_parent_check('gist_check_idx1', false); + gist_index_parent_check +------------------------- + +(1 row) + +SELECT gist_index_parent_check('gist_check_idx2', false); + gist_index_parent_check +------------------------- + +(1 row) + +SELECT gist_index_parent_check('gist_check_idx1', true); + gist_index_parent_check +------------------------- + +(1 row) + +SELECT gist_index_parent_check('gist_check_idx2', true); + gist_index_parent_check +------------------------- + +(1 row) + +-- cleanup +DROP TABLE gist_check; diff --git a/contrib/amcheck/meson.build b/contrib/amcheck/meson.build index 227e68ff834..d84138f7fa6 100644 --- a/contrib/amcheck/meson.build +++ b/contrib/amcheck/meson.build @@ -1,5 +1,6 @@ amcheck = shared_module('amcheck', [ 'amcheck.c', + 'verify_gist.c', 'verify_heapam.c', 'verify_nbtree.c', ], @@ -13,6 +14,7 @@ install_data( 'amcheck--1.0--1.1.sql', 'amcheck--1.1--1.2.sql', 'amcheck--1.2--1.3.sql', + 'amcheck--1.3--1.4.sql', kwargs: contrib_data_args, ) @@ -25,6 +27,7 @@ tests += { 'check', 'check_btree', 'check_heap', + 'check_gist', ], }, 'tap': { diff --git a/contrib/amcheck/sql/check_gist.sql b/contrib/amcheck/sql/check_gist.sql new file mode 100644 index 00000000000..75b9ff4b436 --- /dev/null +++ b/contrib/amcheck/sql/check_gist.sql @@ -0,0 +1,42 @@ + +SELECT setseed(1); + +-- Test that index built with bulk load is correct +CREATE TABLE gist_check AS SELECT point(random(),s) c, random() p FROM generate_series(1,10000) s; +CREATE INDEX gist_check_idx1 ON gist_check USING gist(c); +CREATE INDEX gist_check_idx2 ON gist_check USING gist(c) INCLUDE(p); +SELECT gist_index_parent_check('gist_check_idx1', false); +SELECT gist_index_parent_check('gist_check_idx2', false); +SELECT gist_index_parent_check('gist_check_idx1', true); +SELECT gist_index_parent_check('gist_check_idx2', true); + +-- Test that index is correct after inserts +INSERT INTO gist_check SELECT point(random(),s) c, random() p FROM generate_series(1,10000) s; +SELECT gist_index_parent_check('gist_check_idx1', false); +SELECT gist_index_parent_check('gist_check_idx2', false); +SELECT gist_index_parent_check('gist_check_idx1', true); +SELECT gist_index_parent_check('gist_check_idx2', true); + +-- Test that index is correct after vacuuming +DELETE FROM gist_check WHERE c[1] < 5000; -- delete clustered data +DELETE FROM gist_check WHERE c[1]::int % 2 = 0; -- delete scattered data + +-- We need two passes through the index and one global vacuum to actually +-- reuse page +VACUUM gist_check; +VACUUM; + +SELECT gist_index_parent_check('gist_check_idx1', false); +SELECT gist_index_parent_check('gist_check_idx2', false); +SELECT gist_index_parent_check('gist_check_idx1', true); +SELECT gist_index_parent_check('gist_check_idx2', true); + + +-- Test that index is correct after reusing pages +INSERT INTO gist_check SELECT point(random(),s) c, random() p FROM generate_series(1,10000) s; +SELECT gist_index_parent_check('gist_check_idx1', false); +SELECT gist_index_parent_check('gist_check_idx2', false); +SELECT gist_index_parent_check('gist_check_idx1', true); +SELECT gist_index_parent_check('gist_check_idx2', true); +-- cleanup +DROP TABLE gist_check; diff --git a/contrib/amcheck/verify_gist.c b/contrib/amcheck/verify_gist.c new file mode 100644 index 00000000000..db65880d871 --- /dev/null +++ b/contrib/amcheck/verify_gist.c @@ -0,0 +1,520 @@ +/*------------------------------------------------------------------------- + * + * verify_gist.c + * Verifies the integrity of GiST indexes based on invariants. + * + * Verification checks that all paths in GiST graph contain + * consistent keys: tuples on parent pages consistently include tuples + * from children pages. Also, verification checks graph invariants: + * internal page must have at least one downlinks, internal page can + * reference either only leaf pages or only internal pages. + * + * + * Copyright (c) 2017-2022, PostgreSQL Global Development Group + * + * IDENTIFICATION + * contrib/amcheck/verify_gist.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/gist_private.h" +#include "access/tableam.h" +#include "access/transam.h" +#include "catalog/pg_am.h" +#include "common/pg_prng.h" +#include "catalog/index.h" +#include "lib/bloomfilter.h" +#include "miscadmin.h" +#include "storage/lmgr.h" +#include "storage/smgr.h" +#include "utils/memutils.h" +#include "utils/rel.h" +#include "utils/snapmgr.h" + +#include "amcheck.h" + +/* + * GistScanItem represents one item of depth-first scan of GiST index. + */ +typedef struct GistScanItem +{ + int depth; + IndexTuple parenttup; + BlockNumber parentblk; + XLogRecPtr parentlsn; + BlockNumber blkno; + struct GistScanItem *next; +} GistScanItem; + +typedef struct GistCheckState +{ + /* Bloom filter fingerprints B-Tree index */ + bloom_filter *filter; + /* Debug counter */ + int64 heaptuplespresent; + /* GiST state */ + GISTSTATE *state; + + Snapshot snapshot; + Relation rel; + Relation heaprel; +} GistCheckState; + +PG_FUNCTION_INFO_V1(gist_index_parent_check); + +static GistCheckState gist_init_heapallindexed(Relation rel); +static void gist_index_checkable(Relation rel); +static void gist_check_parent_keys_consistency(Relation rel, Relation heaprel, + void* callback_state); +static void check_index_page(Relation rel, Buffer buffer, BlockNumber blockNo); +static IndexTuple gist_refind_parent(Relation rel, BlockNumber parentblkno, + BlockNumber childblkno, + BufferAccessStrategy strategy); +static void gist_tuple_present_callback(Relation index, ItemPointer tid, Datum *values, + bool *isnull, bool tupleIsAlive, void *checkstate); + +/* + * gist_index_parent_check(index regclass) + * + * Verify integrity of GiST index. + * + * Acquires AccessShareLock on heap & index relations. + */ +Datum gist_index_parent_check(PG_FUNCTION_ARGS) +{ + Oid indrelid = PG_GETARG_OID(0); + bool heapallindexed = false; + + if (PG_NARGS() >= 2) + heapallindexed = PG_GETARG_BOOL(1); + + amcheck_lock_relation_and_check(indrelid, gist_index_checkable, + gist_check_parent_keys_consistency, AccessShareLock, &heapallindexed); + + PG_RETURN_VOID(); +} + +/* + * Check that relation is eligible for GiST verification + */ +static void +gist_index_checkable(Relation rel) +{ + if (rel->rd_rel->relkind != RELKIND_INDEX || + rel->rd_rel->relam != GIST_AM_OID) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("only GiST indexes are supported as targets for this verification"), + errdetail("Relation \"%s\" is not a GiST index.", + RelationGetRelationName(rel)))); + + if (RELATION_IS_OTHER_TEMP(rel)) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot access temporary tables of other sessions"), + errdetail("Index \"%s\" is associated with temporary relation.", + RelationGetRelationName(rel)))); + + if (!rel->rd_index->indisvalid) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot check index \"%s\"", + RelationGetRelationName(rel)), + errdetail("Index is not valid"))); +} + +static GistCheckState +gist_init_heapallindexed(Relation rel) +{ + int64 total_pages; + int64 total_elems; + uint64 seed; + GistCheckState result; + + /* + * Size Bloom filter based on estimated number of tuples in index. + * This logic is similar to B-tree, see verify_btree.c . + */ + total_pages = RelationGetNumberOfBlocks(rel); + total_elems = Max(total_pages * (MaxOffsetNumber / 5), + (int64) rel->rd_rel->reltuples); + seed = pg_prng_uint64(&pg_global_prng_state); + result.filter = bloom_create(total_elems, maintenance_work_mem, seed); + + result.snapshot = RegisterSnapshot(GetTransactionSnapshot()); + + /* + * GetTransactionSnapshot() always acquires a new MVCC snapshot in + * READ COMMITTED mode. A new snapshot is guaranteed to have all + * the entries it requires in the index. + * + * We must defend against the possibility that an old xact + * snapshot was returned at higher isolation levels when that + * snapshot is not safe for index scans of the target index. This + * is possible when the snapshot sees tuples that are before the + * index's indcheckxmin horizon. Throwing an error here should be + * very rare. It doesn't seem worth using a secondary snapshot to + * avoid this. + */ + if (IsolationUsesXactSnapshot() && rel->rd_index->indcheckxmin && + !TransactionIdPrecedes(HeapTupleHeaderGetXmin(rel->rd_indextuple->t_data), + result.snapshot->xmin)) + ereport(ERROR, + (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), + errmsg("index \"%s\" cannot be verified using transaction snapshot", + RelationGetRelationName(rel)))); + + return result; +} + +/* + * Main entry point for GiST check. Allocates memory context and scans through + * GiST graph. This function verifies that tuples of internal pages cover all + * the key space of each tuple on leaf page. To do this we invoke + * gist_check_internal_page() for every internal page. + * + * gist_check_internal_page() in it's turn takes every tuple and tries to + * adjust it by tuples on referenced child page. Parent gist tuple should + * never require any adjustments. + */ +static void +gist_check_parent_keys_consistency(Relation rel, Relation heaprel, void* callback_state) +{ + BufferAccessStrategy strategy = GetAccessStrategy(BAS_BULKREAD); + GistScanItem *stack; + MemoryContext mctx; + MemoryContext oldcontext; + GISTSTATE *state; + int leafdepth; + bool heapallindexed = *((bool*)callback_state); + GistCheckState check_state; + + mctx = AllocSetContextCreate(CurrentMemoryContext, + "amcheck context", + ALLOCSET_DEFAULT_SIZES); + oldcontext = MemoryContextSwitchTo(mctx); + + state = initGISTstate(rel); + + if (heapallindexed) + check_state = gist_init_heapallindexed(rel); + check_state.state = state; + check_state.rel = rel; + check_state.heaprel = heaprel; + + + /* + * We don't know the height of the tree yet, but as soon as we encounter a + * leaf page, we will set 'leafdepth' to its depth. + */ + leafdepth = -1; + + /* Start the scan at the root page */ + stack = (GistScanItem *) palloc0(sizeof(GistScanItem)); + stack->depth = 0; + stack->parenttup = NULL; + stack->parentblk = InvalidBlockNumber; + stack->parentlsn = InvalidXLogRecPtr; + stack->blkno = GIST_ROOT_BLKNO; + + while (stack) + { + GistScanItem *stack_next; + Buffer buffer; + Page page; + OffsetNumber i, maxoff; + XLogRecPtr lsn; + + CHECK_FOR_INTERRUPTS(); + + buffer = ReadBufferExtended(rel, MAIN_FORKNUM, stack->blkno, + RBM_NORMAL, strategy); + LockBuffer(buffer, GIST_SHARE); + page = (Page) BufferGetPage(buffer); + lsn = BufferGetLSNAtomic(buffer); + + /* Do basic sanity checks on the page headers */ + check_index_page(rel, buffer, stack->blkno); + + /* + * It's possible that the page was split since we looked at the + * parent, so that we didn't missed the downlink of the right sibling + * when we scanned the parent. If so, add the right sibling to the + * stack now. + */ + if (GistFollowRight(page) || stack->parentlsn < GistPageGetNSN(page)) + { + /* split page detected, install right link to the stack */ + GistScanItem *ptr = (GistScanItem *) palloc(sizeof(GistScanItem)); + + ptr->depth = stack->depth; + ptr->parenttup = CopyIndexTuple(stack->parenttup); + ptr->parentblk = stack->parentblk; + ptr->parentlsn = stack->parentlsn; + ptr->blkno = GistPageGetOpaque(page)->rightlink; + ptr->next = stack->next; + stack->next = ptr; + } + + /* Check that the tree has the same height in all branches */ + if (GistPageIsLeaf(page)) + { + if (leafdepth == -1) + leafdepth = stack->depth; + else if (stack->depth != leafdepth) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("index \"%s\": internal pages traversal encountered leaf page unexpectedly on block %u", + RelationGetRelationName(rel), stack->blkno))); + } + + /* + * Check that each tuple looks valid, and is consistent with the + * downlink we followed when we stepped on this page. + */ + maxoff = PageGetMaxOffsetNumber(page); + for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i)) + { + ItemId iid = PageGetItemIdCareful(rel, stack->blkno, page, i, sizeof(GISTPageOpaqueData)); + IndexTuple idxtuple = (IndexTuple) PageGetItem(page, iid); + + /* + * Check that it's not a leftover invalid tuple from pre-9.1 See + * also gistdoinsert() and gistbulkdelete() handling of such + * tuples. We do consider it error here. + */ + if (GistTupleIsInvalid(idxtuple)) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("index \"%s\" contains an inner tuple marked as invalid, block %u, offset %u", + RelationGetRelationName(rel), stack->blkno, i), + errdetail("This is caused by an incomplete page split at crash recovery before upgrading to PostgreSQL 9.1."), + errhint("Please REINDEX it."))); + + if (MAXALIGN(ItemIdGetLength(iid)) != MAXALIGN(IndexTupleSize(idxtuple))) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("index \"%s\" has inconsistent tuple sizes, block %u, offset %u", + RelationGetRelationName(rel), stack->blkno, i))); + + /* + * Check if this tuple is consistent with the downlink in the + * parent. + */ + if (stack->parenttup && + gistgetadjusted(rel, stack->parenttup, idxtuple, state)) + { + /* + * There was a discrepancy between parent and child tuples. + * We need to verify it is not a result of concurrent call of + * gistplacetopage(). So, lock parent and try to find downlink + * for current page. It may be missing due to concurrent page + * split, this is OK. + * + * Note that when we aquire parent tuple now we hold lock for + * both parent and child buffers. Thus parent tuple must + * include keyspace of the child. + */ + pfree(stack->parenttup); + stack->parenttup = gist_refind_parent(rel, stack->parentblk, + stack->blkno, strategy); + + /* We found it - make a final check before failing */ + if (!stack->parenttup) + elog(NOTICE, "Unable to find parent tuple for block %u on block %u due to concurrent split", + stack->blkno, stack->parentblk); + else if (gistgetadjusted(rel, stack->parenttup, idxtuple, state)) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("index \"%s\" has inconsistent records on page %u offset %u", + RelationGetRelationName(rel), stack->blkno, i))); + else + { + /* + * But now it is properly adjusted - nothing to do here. + */ + } + } + + + if (GistPageIsLeaf(page)) + { + if (heapallindexed) + { + bloom_add_element(check_state.filter, (unsigned char *) idxtuple, + IndexTupleSize(idxtuple)); + } + } + /* If this is an internal page, recurse into the child */ + else + { + GistScanItem *ptr; + + ptr = (GistScanItem *) palloc(sizeof(GistScanItem)); + ptr->depth = stack->depth + 1; + ptr->parenttup = CopyIndexTuple(idxtuple); + ptr->parentblk = stack->blkno; + ptr->blkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid)); + ptr->parentlsn = lsn; + ptr->next = stack->next; + stack->next = ptr; + } + } + + LockBuffer(buffer, GIST_UNLOCK); + ReleaseBuffer(buffer); + + /* Step to next item in the queue */ + stack_next = stack->next; + if (stack->parenttup) + pfree(stack->parenttup); + pfree(stack); + stack = stack_next; + } + + if (heapallindexed) + { + IndexInfo *indexinfo = BuildIndexInfo(rel); + TableScanDesc scan; + + scan = table_beginscan_strat(heaprel, /* relation */ + check_state.snapshot, /* snapshot */ + 0, /* number of keys */ + NULL, /* scan key */ + true, /* buffer access strategy OK */ + true); /* syncscan OK? */ + + /* + * Scan will behave as the first scan of a CREATE INDEX CONCURRENTLY. + */ + indexinfo->ii_Concurrent = true; + + indexinfo->ii_Unique = false; + indexinfo->ii_ExclusionOps = NULL; + indexinfo->ii_ExclusionProcs = NULL; + indexinfo->ii_ExclusionStrats = NULL; + + elog(DEBUG1, "verifying that tuples from index \"%s\" are present in \"%s\"", + RelationGetRelationName(rel), + RelationGetRelationName(heaprel)); + + table_index_build_scan(heaprel, rel, indexinfo, true, false, + gist_tuple_present_callback, (void *) &check_state, scan); + + ereport(DEBUG1, + (errmsg_internal("finished verifying presence of " INT64_FORMAT " tuples from table \"%s\" with bitset %.2f%% set", + check_state.heaptuplespresent, RelationGetRelationName(heaprel), + 100.0 * bloom_prop_bits_set(check_state.filter)))); + + UnregisterSnapshot(check_state.snapshot); + bloom_free(check_state.filter); + } + + MemoryContextSwitchTo(oldcontext); + MemoryContextDelete(mctx); +} + +static void +gist_tuple_present_callback(Relation index, ItemPointer tid, Datum *values, + bool *isnull, bool tupleIsAlive, void *checkstate) +{ + GistCheckState *state = (GistCheckState *) checkstate; + IndexTuple itup = gistFormTuple(state->state, index, values, isnull, true); + itup->t_tid = *tid; + /* Probe Bloom filter -- tuple should be present */ + if (bloom_lacks_element(state->filter, (unsigned char *) itup, + IndexTupleSize(itup))) + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("heap tuple (%u,%u) from table \"%s\" lacks matching index tuple within index \"%s\"", + ItemPointerGetBlockNumber(&(itup->t_tid)), + ItemPointerGetOffsetNumber(&(itup->t_tid)), + RelationGetRelationName(state->heaprel), + RelationGetRelationName(state->rel)))); + + state->heaptuplespresent++; + + pfree(itup); +} + +static void +check_index_page(Relation rel, Buffer buffer, BlockNumber blockNo) +{ + Page page = BufferGetPage(buffer); + + gistcheckpage(rel, buffer); + + if (GistPageGetOpaque(page)->gist_page_id != GIST_PAGE_ID) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("index \"%s\" has corrupted page %d", + RelationGetRelationName(rel), blockNo))); + + if (GistPageIsDeleted(page)) + { + if (!GistPageIsLeaf(page)) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("index \"%s\" has deleted internal page %d", + RelationGetRelationName(rel), blockNo))); + if (PageGetMaxOffsetNumber(page) > InvalidOffsetNumber) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("index \"%s\" has deleted page %d with tuples", + RelationGetRelationName(rel), blockNo))); + } + else if (PageGetMaxOffsetNumber(page) > MaxIndexTuplesPerPage) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("index \"%s\" has page %d with exceeding count of tuples", + RelationGetRelationName(rel), blockNo))); +} + +/* + * Try to re-find downlink pointing to 'blkno', in 'parentblkno'. + * + * If found, returns a palloc'd copy of the downlink tuple. Otherwise, + * returns NULL. + */ +static IndexTuple +gist_refind_parent(Relation rel, BlockNumber parentblkno, + BlockNumber childblkno, BufferAccessStrategy strategy) +{ + Buffer parentbuf; + Page parentpage; + OffsetNumber o, + parent_maxoff; + IndexTuple result = NULL; + + parentbuf = ReadBufferExtended(rel, MAIN_FORKNUM, parentblkno, RBM_NORMAL, + strategy); + + LockBuffer(parentbuf, GIST_SHARE); + parentpage = BufferGetPage(parentbuf); + + if (GistPageIsLeaf(parentpage)) + { + UnlockReleaseBuffer(parentbuf); + return result; + } + + parent_maxoff = PageGetMaxOffsetNumber(parentpage); + for (o = FirstOffsetNumber; o <= parent_maxoff; o = OffsetNumberNext(o)) + { + ItemId p_iid = PageGetItemIdCareful(rel, parentblkno, parentpage, o, sizeof(GISTPageOpaqueData)); + IndexTuple itup = (IndexTuple) PageGetItem(parentpage, p_iid); + + if (ItemPointerGetBlockNumber(&(itup->t_tid)) == childblkno) + { + /* Found it! Make copy and return it */ + result = CopyIndexTuple(itup); + break; + } + } + + UnlockReleaseBuffer(parentbuf); + + return result; +} diff --git a/doc/src/sgml/amcheck.sgml b/doc/src/sgml/amcheck.sgml index 5d61a33936f..9397a69c6e5 100644 --- a/doc/src/sgml/amcheck.sgml +++ b/doc/src/sgml/amcheck.sgml @@ -179,6 +179,25 @@ ORDER BY c.relpages DESC LIMIT 10; </para> </listitem> </varlistentry> + + <varlistentry> + <term> + <function>gist_index_parent_check(index regclass, heapallindexed boolean) returns void</function> + <indexterm> + <primary>gist_index_parent_check</primary> + </indexterm> + </term> + + <listitem> + <para> + <function>gist_index_parent_check</function> tests that its target GiST + has consistent parent-child tuples relations (no parent tuples + require tuple adjustement) and page graph respects balanced-tree + invariants (internal pages reference only leaf page or only internal + pages). + </para> + </listitem> + </varlistentry> </variablelist> <tip> <para> -- 2.37.3.542.gdd3f6c4cae
>From 5af85c93cb961040c5e256f79db38d285c5f1fb7 Mon Sep 17 00:00:00 2001 From: "Andrey M. Borodin" <x4mmm@flight.local> Date: Sat, 23 Jul 2022 14:22:26 +0500 Subject: [PATCH v14 3/3] Add gin_index_parent_check() to verify GIN index --- contrib/amcheck/Makefile | 3 +- contrib/amcheck/amcheck--1.3--1.4.sql | 11 +- contrib/amcheck/amcheck.c | 2 +- contrib/amcheck/expected/check_gin.out | 64 ++ contrib/amcheck/meson.build | 2 + contrib/amcheck/sql/check_gin.sql | 40 ++ contrib/amcheck/verify_gin.c | 800 +++++++++++++++++++++++++ doc/src/sgml/amcheck.sgml | 19 + 8 files changed, 938 insertions(+), 3 deletions(-) create mode 100644 contrib/amcheck/expected/check_gin.out create mode 100644 contrib/amcheck/sql/check_gin.sql create mode 100644 contrib/amcheck/verify_gin.c diff --git a/contrib/amcheck/Makefile b/contrib/amcheck/Makefile index a8174195817..ecb849a605b 100644 --- a/contrib/amcheck/Makefile +++ b/contrib/amcheck/Makefile @@ -5,6 +5,7 @@ OBJS = \ $(WIN32RES) \ amcheck.o \ verify_gist.o \ + verify_gin.o \ verify_heapam.o \ verify_nbtree.o @@ -14,7 +15,7 @@ DATA = amcheck--1.2--1.3.sql amcheck--1.1--1.2.sql amcheck--1.0--1.1.sql amcheck PGFILEDESC = "amcheck - function for verifying relation integrity" -REGRESS = check check_btree check_heap check_gist +REGRESS = check check_btree check_heap check_gist check_gin TAP_TESTS = 1 diff --git a/contrib/amcheck/amcheck--1.3--1.4.sql b/contrib/amcheck/amcheck--1.3--1.4.sql index 93297379efe..c914e6d0baa 100644 --- a/contrib/amcheck/amcheck--1.3--1.4.sql +++ b/contrib/amcheck/amcheck--1.3--1.4.sql @@ -11,4 +11,13 @@ RETURNS VOID AS 'MODULE_PATHNAME', 'gist_index_parent_check' LANGUAGE C STRICT; -REVOKE ALL ON FUNCTION gist_index_parent_check(regclass, boolean) FROM PUBLIC; \ No newline at end of file +REVOKE ALL ON FUNCTION gist_index_parent_check(regclass, boolean) FROM PUBLIC; + +-- gin_index_parent_check() +-- +CREATE FUNCTION gin_index_parent_check(index regclass, heapallindexed boolean) +RETURNS VOID +AS 'MODULE_PATHNAME', 'gin_index_parent_check' +LANGUAGE C STRICT; + +REVOKE ALL ON FUNCTION gin_index_parent_check(regclass, boolean) FROM PUBLIC; \ No newline at end of file diff --git a/contrib/amcheck/amcheck.c b/contrib/amcheck/amcheck.c index 0194ef0d7a2..1e0c875926c 100644 --- a/contrib/amcheck/amcheck.c +++ b/contrib/amcheck/amcheck.c @@ -83,7 +83,7 @@ amcheck_lock_relation_and_check(Oid indrelid, IndexCheckableCallback checkable, else { heaprel = NULL; - /* for "gcc -Og" https://gcc.gnu.org/bugzilla/show_bug.cgi?id=78394 */ + /* Set these just to suppress "uninitialized variable" warnings */ save_userid = InvalidOid; save_sec_context = -1; save_nestlevel = -1; diff --git a/contrib/amcheck/expected/check_gin.out b/contrib/amcheck/expected/check_gin.out new file mode 100644 index 00000000000..d98d525c660 --- /dev/null +++ b/contrib/amcheck/expected/check_gin.out @@ -0,0 +1,64 @@ +-- Test of index bulk load +SELECT setseed(1); + setseed +--------- + +(1 row) + +CREATE TABLE "gin_check"("Column1" int[]); +-- posting trees (frequently used entries) +INSERT INTO gin_check select array_agg(round(random()*255) ) from generate_series(1, 100000) as i group by i % 10000; +-- posting leaves (sparse entries) +INSERT INTO gin_check select array_agg(255 + round(random()*100)) from generate_series(1, 100) as i group by i % 100; +CREATE INDEX gin_check_idx on "gin_check" USING GIN("Column1"); +SELECT gin_index_parent_check('gin_check_idx', true); + gin_index_parent_check +------------------------ + +(1 row) + +-- cleanup +DROP TABLE gin_check; +-- Test index inserts +SELECT setseed(1); + setseed +--------- + +(1 row) + +CREATE TABLE "gin_check"("Column1" int[]); +CREATE INDEX gin_check_idx on "gin_check" USING GIN("Column1"); +ALTER INDEX gin_check_idx SET (fastupdate = false); +-- posting trees +INSERT INTO gin_check select array_agg(round(random()*255) ) from generate_series(1, 100000) as i group by i % 10000; +-- posting leaves +INSERT INTO gin_check select array_agg(100 + round(random()*255)) from generate_series(1, 100) as i group by i % 100; +SELECT gin_index_parent_check('gin_check_idx', true); + gin_index_parent_check +------------------------ + +(1 row) + +-- cleanup +DROP TABLE gin_check; +-- Test GIN over text array +SELECT setseed(1); + setseed +--------- + +(1 row) + +CREATE TABLE "gin_check_text_array"("Column1" text[]); +-- posting trees +INSERT INTO gin_check_text_array select array_agg(md5(round(random()*300)::text)::text) from generate_series(1, 100000) as i group by i % 10000; +-- posting leaves +INSERT INTO gin_check_text_array select array_agg(md5(round(random()*300 + 300)::text)::text) from generate_series(1, 10000) as i group by i % 100; +CREATE INDEX gin_check_text_array_idx on "gin_check_text_array" USING GIN("Column1"); +SELECT gin_index_parent_check('gin_check_text_array_idx', true); + gin_index_parent_check +------------------------ + +(1 row) + +-- cleanup +DROP TABLE gin_check_text_array; diff --git a/contrib/amcheck/meson.build b/contrib/amcheck/meson.build index d84138f7fa6..0d7d651b24a 100644 --- a/contrib/amcheck/meson.build +++ b/contrib/amcheck/meson.build @@ -1,5 +1,6 @@ amcheck = shared_module('amcheck', [ 'amcheck.c', + 'verify_gin.c', 'verify_gist.c', 'verify_heapam.c', 'verify_nbtree.c', @@ -28,6 +29,7 @@ tests += { 'check_btree', 'check_heap', 'check_gist', + 'check_gin', ], }, 'tap': { diff --git a/contrib/amcheck/sql/check_gin.sql b/contrib/amcheck/sql/check_gin.sql new file mode 100644 index 00000000000..789259e6629 --- /dev/null +++ b/contrib/amcheck/sql/check_gin.sql @@ -0,0 +1,40 @@ +-- Test of index bulk load +SELECT setseed(1); +CREATE TABLE "gin_check"("Column1" int[]); +-- posting trees (frequently used entries) +INSERT INTO gin_check select array_agg(round(random()*255) ) from generate_series(1, 100000) as i group by i % 10000; +-- posting leaves (sparse entries) +INSERT INTO gin_check select array_agg(255 + round(random()*100)) from generate_series(1, 100) as i group by i % 100; +CREATE INDEX gin_check_idx on "gin_check" USING GIN("Column1"); +SELECT gin_index_parent_check('gin_check_idx', true); + +-- cleanup +DROP TABLE gin_check; + +-- Test index inserts +SELECT setseed(1); +CREATE TABLE "gin_check"("Column1" int[]); +CREATE INDEX gin_check_idx on "gin_check" USING GIN("Column1"); +ALTER INDEX gin_check_idx SET (fastupdate = false); +-- posting trees +INSERT INTO gin_check select array_agg(round(random()*255) ) from generate_series(1, 100000) as i group by i % 10000; +-- posting leaves +INSERT INTO gin_check select array_agg(100 + round(random()*255)) from generate_series(1, 100) as i group by i % 100; + +SELECT gin_index_parent_check('gin_check_idx', true); + +-- cleanup +DROP TABLE gin_check; + +-- Test GIN over text array +SELECT setseed(1); +CREATE TABLE "gin_check_text_array"("Column1" text[]); +-- posting trees +INSERT INTO gin_check_text_array select array_agg(md5(round(random()*300)::text)::text) from generate_series(1, 100000) as i group by i % 10000; +-- posting leaves +INSERT INTO gin_check_text_array select array_agg(md5(round(random()*300 + 300)::text)::text) from generate_series(1, 10000) as i group by i % 100; +CREATE INDEX gin_check_text_array_idx on "gin_check_text_array" USING GIN("Column1"); +SELECT gin_index_parent_check('gin_check_text_array_idx', true); + +-- cleanup +DROP TABLE gin_check_text_array; diff --git a/contrib/amcheck/verify_gin.c b/contrib/amcheck/verify_gin.c new file mode 100644 index 00000000000..90fe89501d8 --- /dev/null +++ b/contrib/amcheck/verify_gin.c @@ -0,0 +1,800 @@ +/*------------------------------------------------------------------------- + * + * verify_gin.c + * Verifies the integrity of GIN indexes based on invariants. + * + * Verification checks that all paths in GIN graph contain + * consistent keys: tuples on parent pages consistently include tuples + * from children pages. Also, verification checks graph invariants: + * internal page must have at least one downlinks, internal page can + * reference either only leaf pages or only internal pages. + * + * + * Copyright (c) 2017-2022, PostgreSQL Global Development Group + * + * IDENTIFICATION + * contrib/amcheck/verify_gin.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/gin_private.h" +#include "access/nbtree.h" +#include "amcheck.h" +#include "catalog/pg_am.h" +#include "miscadmin.h" +#include "utils/memutils.h" +#include "utils/rel.h" +#include "string.h" + +/* + * GinScanItem represents one item of depth-first scan of GIN index. + */ +typedef struct GinScanItem +{ + int depth; + IndexTuple parenttup; + BlockNumber parentblk; + XLogRecPtr parentlsn; + BlockNumber blkno; + struct GinScanItem *next; +} GinScanItem; + +/* + * GinPostingTreeScanItem represents one item of depth-first scan of GIN posting tree. + */ +typedef struct GinPostingTreeScanItem +{ + int depth; + ItemPointerData parentkey; + BlockNumber parentblk; + BlockNumber blkno; + struct GinPostingTreeScanItem *next; +} GinPostingTreeScanItem; + + +PG_FUNCTION_INFO_V1(gin_index_parent_check); + +static void gin_index_checkable(Relation rel); +static void gin_check_parent_keys_consistency(Relation rel, Relation heaprel, void* callback_state); +static bool check_index_page(Relation rel, Buffer buffer, BlockNumber blockNo); +static IndexTuple gin_refind_parent(Relation rel, BlockNumber parentblkno, + BlockNumber childblkno, + BufferAccessStrategy strategy); + +/* + * gin_index_parent_check(index regclass) + * + * Verify integrity of GIN index. + * + * Acquires AccessShareLock on heap & index relations. + */ +Datum +gin_index_parent_check(PG_FUNCTION_ARGS) +{ + Oid indrelid = PG_GETARG_OID(0); + bool heapallindexed = false; + + if (PG_NARGS() >= 2) + heapallindexed = PG_GETARG_BOOL(1); + + amcheck_lock_relation_and_check(indrelid, gin_index_checkable, + gin_check_parent_keys_consistency, AccessShareLock, &heapallindexed); + + PG_RETURN_VOID(); +} + +/* + * Read item pointers from leaf entry tuple. + * + * Returns a palloc'd array of ItemPointers. The number of items is returned + * in *nitems. + */ +static ItemPointer +ginReadTupleWithoutState(IndexTuple itup, int *nitems) +{ + Pointer ptr = GinGetPosting(itup); + int nipd = GinGetNPosting(itup); + ItemPointer ipd; + int ndecoded; + + if (GinItupIsCompressed(itup)) + { + if (nipd > 0) + { + ipd = ginPostingListDecode((GinPostingList *) ptr, &ndecoded); + if (nipd != ndecoded) + elog(ERROR, "number of items mismatch in GIN entry tuple, %d in tuple header, %d decoded", + nipd, ndecoded); + } + else + { + ipd = palloc(0); + } + } + else + { + ipd = (ItemPointer) palloc(sizeof(ItemPointerData) * nipd); + memcpy(ipd, ptr, sizeof(ItemPointerData) * nipd); + } + *nitems = nipd; + return ipd; +} + + +/* + * Check that relation is eligible for GIN verification + */ +static void +gin_index_checkable(Relation rel) +{ + if (rel->rd_rel->relkind != RELKIND_INDEX || + rel->rd_rel->relam != GIN_AM_OID) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("only GIN indexes are supported as targets for this verification"), + errdetail("Relation \"%s\" is not a GIN index.", + RelationGetRelationName(rel)))); + + if (RELATION_IS_OTHER_TEMP(rel)) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot access temporary tables of other sessions"), + errdetail("Index \"%s\" is associated with temporary relation.", + RelationGetRelationName(rel)))); + + if (!rel->rd_index->indisvalid) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot check index \"%s\"", + RelationGetRelationName(rel)), + errdetail("Index is not valid"))); +} + +/* + * Allocates memory context and scans through postigTree graph + * + */ +static void +gin_check_posting_tree_parent_keys_consistency(Relation rel, BlockNumber posting_tree_root) +{ + BufferAccessStrategy strategy = GetAccessStrategy(BAS_BULKREAD); + GinPostingTreeScanItem *stack; + MemoryContext mctx; + MemoryContext oldcontext; + + int leafdepth; + + mctx = AllocSetContextCreate(CurrentMemoryContext, + "amcheck context", + ALLOCSET_DEFAULT_SIZES); + oldcontext = MemoryContextSwitchTo(mctx); + + /* + * We don't know the height of the tree yet, but as soon as we encounter a + * leaf page, we will set 'leafdepth' to its depth. + */ + leafdepth = -1; + + /* Start the scan at the root page */ + stack = (GinPostingTreeScanItem *) palloc0(sizeof(GinPostingTreeScanItem)); + stack->depth = 0; + ItemPointerSetInvalid(&stack->parentkey); + stack->parentblk = InvalidBlockNumber; + stack->blkno = posting_tree_root; + + elog(DEBUG3, "processing posting tree at blk %u", posting_tree_root); + + while (stack) + { + GinPostingTreeScanItem *stack_next; + Buffer buffer; + Page page; + OffsetNumber i, + maxoff; + + CHECK_FOR_INTERRUPTS(); + + buffer = ReadBufferExtended(rel, MAIN_FORKNUM, stack->blkno, + RBM_NORMAL, strategy); + LockBuffer(buffer, GIN_SHARE); + page = (Page) BufferGetPage(buffer); + Assert(GinPageIsData(page)); + + /* Check that the tree has the same height in all branches */ + if (GinPageIsLeaf(page)) + { + ItemPointerData minItem; + int nlist; + ItemPointerData *list; + char tidrange_buf[100]; + + ItemPointerSetMin(&minItem); + + if (leafdepth == -1) + leafdepth = stack->depth; + else if (stack->depth != leafdepth) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("index \"%s\": internal pages traversal encountered leaf page unexpectedly on block %u", + RelationGetRelationName(rel), stack->blkno))); + list = GinDataLeafPageGetItems(page, &nlist, minItem); + + if (nlist > 0) + { + snprintf(tidrange_buf, sizeof(tidrange_buf), + "%d tids (%u, %u) - (%u, %u)", + nlist, + ItemPointerGetBlockNumberNoCheck(&list[0]), + ItemPointerGetOffsetNumberNoCheck(&list[0]), + ItemPointerGetBlockNumberNoCheck(&list[nlist - 1]), + ItemPointerGetOffsetNumberNoCheck(&list[nlist - 1])); + } else { + snprintf(tidrange_buf, sizeof(tidrange_buf), "0 tids"); + } + + if (stack->parentblk != InvalidBlockNumber) + { + elog(DEBUG3, "blk %u: parent %u highkey (%u, %u), %s", + stack->blkno, + stack->parentblk, + ItemPointerGetBlockNumberNoCheck(&stack->parentkey), + ItemPointerGetOffsetNumberNoCheck(&stack->parentkey), + tidrange_buf); + } + else + { + elog(DEBUG3, "blk %u: root leaf, %s", + stack->blkno, + tidrange_buf); + } + + if (stack->parentblk != InvalidBlockNumber && + ItemPointerGetOffsetNumberNoCheck(&stack->parentkey) != InvalidOffsetNumber && + nlist > 0 && + ItemPointerCompare(&stack->parentkey, &list[nlist - 1]) < 0) + { + ereport(WARNING, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("index \"%s\": tid exceeds parent's high key in postingTree leaf on block %u", + RelationGetRelationName(rel), stack->blkno))); + } + } + else + { + LocationIndex pd_lower; + int lowersize; + ItemPointerData bound; + + /* + * Check that tuples in each page are properly ordered and + * consistent with parent high key + */ + maxoff = GinPageGetOpaque(page)->maxoff; + if (stack->parentblk != InvalidBlockNumber) + elog(DEBUG3, "blk %u: internal posting tree page with %u items, parent %u highkey (%u, %u)", + stack->blkno, + maxoff, + stack->parentblk, + ItemPointerGetBlockNumberNoCheck(&stack->parentkey), + ItemPointerGetOffsetNumberNoCheck(&stack->parentkey)); + else + elog(DEBUG3, "blk %u: root internal posting tree page with %u items", stack->blkno, maxoff); + + /* + * A GIN posting tree internal page stores PostingItems in the + * 'lower' part of the page. The 'upper' part is unused. The + * number of elements is stored in the opaque area (maxoff). + * Make sure the size of the 'lower' part agrees with 'maxoff' + * + * We didn't set pd_lower until PostgreSQL version 9.4, so if this + * check fails, it could also be because the index was binary-upgraded + * from an earlier version. That was a long time ago, though, so let's + * warn if it doesn't match. + */ + pd_lower = ((PageHeader) page)->pd_lower; + lowersize = pd_lower - MAXALIGN(SizeOfPageHeaderData); + if ((lowersize - MAXALIGN(sizeof(ItemPointerData))) / sizeof(PostingItem) != maxoff) + { + ereport(WARNING, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("index \"%s\" has unexpected pd_lower %u in posting tree block %u with maxoff %u)", + RelationGetRelationName(rel), pd_lower, stack->blkno, maxoff))); + } + + /* + * Before the PostingItems, there's one ItemPointerData in the + * 'lower' part that stores the page's high key. + */ + bound = *GinDataPageGetRightBound(page); + + if (stack->parentblk != InvalidBlockNumber) + { + if (!ItemPointerEquals(&stack->parentkey, &bound)) + { + ereport(WARNING, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("index \"%s\": posting tree page's high key (%u, %u) doesn't match the downlink on block %u (parent blk %u, key (%u, %u))", + RelationGetRelationName(rel), + ItemPointerGetBlockNumberNoCheck(&bound), + ItemPointerGetOffsetNumberNoCheck(&bound), + stack->blkno, + stack->parentblk, + ItemPointerGetBlockNumberNoCheck(&stack->parentkey), + ItemPointerGetOffsetNumberNoCheck(&stack->parentkey)))); + } + } + + for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i)) + { + PostingItem *posting_item = GinDataPageGetPostingItem(page, i); + + elog(DEBUG3, "key (%u, %u) -> %u", + ItemPointerGetBlockNumber(&posting_item->key), + ItemPointerGetOffsetNumber(&posting_item->key), + BlockIdGetBlockNumber(&posting_item->child_blkno)); + + if (i == maxoff && GinPageGetOpaque(page)->rightlink == InvalidBlockNumber) + { + /* The rightmost item in the tree level has (0, 0) as the key */ + if (ItemPointerGetBlockNumberNoCheck(&posting_item->key) != 0 || + ItemPointerGetOffsetNumberNoCheck(&posting_item->key) != 0) + { + ereport(WARNING, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("index \"%s\": rightmost posting tree page (blk %u) has unexpected last key (%u, %u)", + RelationGetRelationName(rel), + stack->blkno, + ItemPointerGetBlockNumberNoCheck(&posting_item->key), + ItemPointerGetOffsetNumberNoCheck(&posting_item->key)))); + } + } + else if (i != FirstOffsetNumber) + { + PostingItem *previous_posting_item = GinDataPageGetPostingItem(page, i - 1); + + if (ItemPointerCompare(&posting_item->key, &previous_posting_item->key) < 0) + { + ereport(WARNING, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("index \"%s\" has wrong tuple order in posting tree, block %u, offset %u", + RelationGetRelationName(rel), stack->blkno, i))); + } + } + + /* + * Check if this tuple is consistent with the downlink in the + * parent. + */ + if (stack->parentblk != InvalidBlockNumber && i == maxoff) + { + if (ItemPointerCompare(&stack->parentkey, &posting_item->key) < 0) + { + ereport(WARNING, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("index \"%s\": posting item exceeds parent's high key in postingTree internal page on block %u offset %u", + RelationGetRelationName(rel), stack->blkno, i))); + + } + } + + /* If this is an internal page, recurse into the child */ + if (!GinPageIsLeaf(page)) + { + GinPostingTreeScanItem *ptr; + + ptr = (GinPostingTreeScanItem *) palloc(sizeof(GinPostingTreeScanItem)); + ptr->depth = stack->depth + 1; + ptr->parentkey = posting_item->key; + ptr->parentblk = stack->blkno; + ptr->blkno = BlockIdGetBlockNumber(&posting_item->child_blkno); + ptr->next = stack->next; + stack->next = ptr; + } + + } + } + LockBuffer(buffer, GIN_UNLOCK); + ReleaseBuffer(buffer); + + /* Step to next item in the queue */ + stack_next = stack->next; + pfree(stack); + stack = stack_next; + } + + MemoryContextSwitchTo(oldcontext); + MemoryContextDelete(mctx); +} + +/* + * Main entry point for GIN check. Allocates memory context and scans through + * GIN graph. + */ +static void +gin_check_parent_keys_consistency(Relation rel, Relation heaprel, void* callback_state) +{ + BufferAccessStrategy strategy = GetAccessStrategy(BAS_BULKREAD); + GinScanItem *stack; + MemoryContext mctx; + MemoryContext oldcontext; + GinState state; + bool heapallindexed = *((bool*)callback_state); + + int leafdepth; + + mctx = AllocSetContextCreate(CurrentMemoryContext, + "amcheck context", + ALLOCSET_DEFAULT_SIZES); + oldcontext = MemoryContextSwitchTo(mctx); + initGinState(&state, rel); + + /* + * We don't know the height of the tree yet, but as soon as we encounter a + * leaf page, we will set 'leafdepth' to its depth. + */ + leafdepth = -1; + + /* Start the scan at the root page */ + stack = (GinScanItem *) palloc0(sizeof(GinScanItem)); + stack->depth = 0; + stack->parenttup = NULL; + stack->parentblk = InvalidBlockNumber; + stack->parentlsn = InvalidXLogRecPtr; + stack->blkno = GIN_ROOT_BLKNO; + + while (stack) + { + GinScanItem *stack_next; + Buffer buffer; + Page page; + OffsetNumber i, + maxoff; + XLogRecPtr lsn; + IndexTuple prev_tuple; + + CHECK_FOR_INTERRUPTS(); + + buffer = ReadBufferExtended(rel, MAIN_FORKNUM, stack->blkno, + RBM_NORMAL, strategy); + LockBuffer(buffer, GIN_SHARE); + page = (Page) BufferGetPage(buffer); + lsn = BufferGetLSNAtomic(buffer); + + /* Do basic sanity checks on the page headers */ + if (!check_index_page(rel, buffer, stack->blkno)) + { + goto nextpage; + } + + /* + * It's possible that the page was split since we looked at the + * parent, so that we didn't missed the downlink of the right sibling + * when we scanned the parent. If so, add the right sibling to the + * stack now. + */ + if (stack->parenttup != NULL) + { + GinNullCategory parent_key_category; + Datum parent_key = gintuple_get_key(&state, stack->parenttup, &parent_key_category); + OffsetNumber maxoff = PageGetMaxOffsetNumber(page); + ItemId iid = PageGetItemIdCareful(rel, stack->blkno, page, maxoff, sizeof(GinPageOpaqueData)); + IndexTuple idxtuple = (IndexTuple) PageGetItem(page, iid); + OffsetNumber attnum = gintuple_get_attrnum(&state, idxtuple); + GinNullCategory page_max_key_category; + Datum page_max_key = gintuple_get_key(&state, idxtuple, &page_max_key_category); + + if (GinPageGetOpaque(page)->rightlink != InvalidBlockNumber && + ginCompareEntries(&state, attnum, page_max_key, page_max_key_category, parent_key, parent_key_category) > 0) + { + /* split page detected, install right link to the stack */ + GinScanItem *ptr; + + elog(DEBUG3, "split detected"); + + ptr = (GinScanItem *) palloc(sizeof(GinScanItem)); + ptr->depth = stack->depth; + ptr->parenttup = CopyIndexTuple(stack->parenttup); + ptr->parentblk = stack->parentblk; + ptr->parentlsn = stack->parentlsn; + ptr->blkno = GinPageGetOpaque(page)->rightlink; + ptr->next = stack->next; + stack->next = ptr; + } + } + + /* Check that the tree has the same height in all branches */ + if (GinPageIsLeaf(page)) + { + if (leafdepth == -1) + leafdepth = stack->depth; + else if (stack->depth != leafdepth) + { + ereport(WARNING, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("index \"%s\": internal pages traversal encountered leaf page unexpectedly on block %u", + RelationGetRelationName(rel), stack->blkno))); + goto nextpage; + } + } + + /* + * Check that tuples in each page are properly ordered and consistent + * with parent high key + */ + maxoff = PageGetMaxOffsetNumber(page); + prev_tuple = NULL; + for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i)) + { + ItemId iid = PageGetItemIdCareful(rel, stack->blkno, page, i, sizeof(GinPageOpaqueData)); + IndexTuple idxtuple = (IndexTuple) PageGetItem(page, iid); + OffsetNumber attnum = gintuple_get_attrnum(&state, idxtuple); + GinNullCategory prev_key_category; + Datum prev_key; + GinNullCategory current_key_category; + Datum current_key; + + if (MAXALIGN(ItemIdGetLength(iid)) != MAXALIGN(IndexTupleSize(idxtuple))) + { + ereport(WARNING, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("index \"%s\" has inconsistent tuple sizes, block %u, offset %u", + RelationGetRelationName(rel), stack->blkno, i))); + continue; + } + + current_key = gintuple_get_key(&state, idxtuple, ¤t_key_category); + + /* (apparently) first block is metadata, skip order check */ + if (i != FirstOffsetNumber && stack->blkno != (BlockNumber) 1) + { + prev_key = gintuple_get_key(&state, prev_tuple, &prev_key_category); + if (ginCompareEntries(&state, attnum, prev_key, prev_key_category, current_key, current_key_category) >= 0) + { + ereport(WARNING, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("index \"%s\" has wrong tuple order, block %u, offset %u", + RelationGetRelationName(rel), stack->blkno, i))); + } + } + + /* + * Check if this tuple is consistent with the downlink in the + * parent. + */ + if (stack->parenttup && + i == maxoff) + { + GinNullCategory parent_key_category; + Datum parent_key = gintuple_get_key(&state, stack->parenttup, &parent_key_category); + + if (ginCompareEntries(&state, attnum, current_key, current_key_category, parent_key, parent_key_category) > 0) + { + /* + * There was a discrepancy between parent and child + * tuples. We need to verify it is not a result of + * concurrent call of gistplacetopage(). So, lock parent + * and try to find downlink for current page. It may be + * missing due to concurrent page split, this is OK. + */ + pfree(stack->parenttup); + stack->parenttup = gin_refind_parent(rel, stack->parentblk, + stack->blkno, strategy); + + /* We found it - make a final check before failing */ + if (!stack->parenttup) + elog(NOTICE, "Unable to find parent tuple for block %u on block %u due to concurrent split", + stack->blkno, stack->parentblk); + else + { + parent_key = gintuple_get_key(&state, stack->parenttup, &parent_key_category); + if (ginCompareEntries(&state, attnum, current_key, current_key_category, parent_key, parent_key_category) > 0) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("index \"%s\" has inconsistent records on page %u offset %u", + RelationGetRelationName(rel), stack->blkno, i))); + else + { + /* + * But now it is properly adjusted - nothing to do + * here. + */ + } + } + } + } + + /* If this is an internal page, recurse into the child */ + if (!GinPageIsLeaf(page)) + { + GinScanItem *ptr; + + ptr = (GinScanItem *) palloc(sizeof(GinScanItem)); + ptr->depth = stack->depth + 1; + /* last tuple in layer has no high key */ + if (i != maxoff && !GinPageGetOpaque(page)->rightlink) + { + ptr->parenttup = CopyIndexTuple(idxtuple); + } + else + { + ptr->parenttup = NULL; + } + ptr->parentblk = stack->blkno; + ptr->blkno = GinGetDownlink(idxtuple); + ptr->parentlsn = lsn; + ptr->next = stack->next; + stack->next = ptr; + } + /* If this item is a pointer to a posting tree, recurse into it */ + else if (GinIsPostingTree(idxtuple)) + { + BlockNumber rootPostingTree = GinGetPostingTree(idxtuple); + + gin_check_posting_tree_parent_keys_consistency(rel, rootPostingTree); + } + else + { + ItemPointer ipd; + int nipd; + + ipd = ginReadTupleWithoutState(idxtuple, &nipd); + + for (int j = 0; j < nipd; j++) + { + if (!OffsetNumberIsValid(ItemPointerGetOffsetNumber(&ipd[j]))) + { + ereport(WARNING, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("index \"%s\": posting list contains invalid heap pointer on block %u", + RelationGetRelationName(rel), stack->blkno))); + } + } + pfree(ipd); + } + + prev_tuple = CopyIndexTuple(idxtuple); + } + +nextpage: + LockBuffer(buffer, GIN_UNLOCK); + ReleaseBuffer(buffer); + + /* Step to next item in the queue */ + stack_next = stack->next; + if (stack->parenttup) + pfree(stack->parenttup); + pfree(stack); + stack = stack_next; + } + + MemoryContextSwitchTo(oldcontext); + MemoryContextDelete(mctx); +} + +/* + * Verify that a freshly-read page looks sane. + */ +static bool +gincheckpage(Relation rel, Buffer buf) +{ + Page page = BufferGetPage(buf); + + /* + * ReadBuffer verifies that every newly-read page passes + * PageHeaderIsValid, which means it either contains a reasonably sane + * page header or is all-zero. We have to defend against the all-zero + * case, however. + */ + if (PageIsNew(page)) + { + ereport(WARNING, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("index \"%s\" contains unexpected zero page at block %u", + RelationGetRelationName(rel), + BufferGetBlockNumber(buf)), + errhint("Please REINDEX it."))); + return false; + } + + /* + * Additionally check that the special area looks sane. + */ + if (PageGetSpecialSize(page) != MAXALIGN(sizeof(GinPageOpaqueData))) + { + ereport(WARNING, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("index \"%s\" contains corrupted page at block %u", + RelationGetRelationName(rel), + BufferGetBlockNumber(buf)), + errhint("Please REINDEX it."))); + return false; + } + return true; +} + +static bool +check_index_page(Relation rel, Buffer buffer, BlockNumber blockNo) +{ + Page page = BufferGetPage(buffer); + + if (!gincheckpage(rel, buffer)) + return false; + + if (GinPageIsDeleted(page)) + { + if (!GinPageIsLeaf(page)) + { + ereport(WARNING, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("index \"%s\" has deleted internal page %d", + RelationGetRelationName(rel), blockNo))); + return false; + } + if (PageGetMaxOffsetNumber(page) > InvalidOffsetNumber) + { + ereport(WARNING, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("index \"%s\" has deleted page %d with tuples", + RelationGetRelationName(rel), blockNo))); + return false; + } + } + else if (PageGetMaxOffsetNumber(page) > MaxIndexTuplesPerPage) + { + ereport(WARNING, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("index \"%s\" has page %d with exceeding count of tuples", + RelationGetRelationName(rel), blockNo))); + return false; + } + return true; +} + +/* + * Try to re-find downlink pointing to 'blkno', in 'parentblkno'. + * + * If found, returns a palloc'd copy of the downlink tuple. Otherwise, + * returns NULL. + */ +static IndexTuple +gin_refind_parent(Relation rel, BlockNumber parentblkno, + BlockNumber childblkno, BufferAccessStrategy strategy) +{ + Buffer parentbuf; + Page parentpage; + OffsetNumber o, + parent_maxoff; + IndexTuple result = NULL; + + parentbuf = ReadBufferExtended(rel, MAIN_FORKNUM, parentblkno, RBM_NORMAL, + strategy); + + LockBuffer(parentbuf, GIN_SHARE); + parentpage = BufferGetPage(parentbuf); + + if (GinPageIsLeaf(parentpage)) + { + UnlockReleaseBuffer(parentbuf); + return result; + } + + parent_maxoff = PageGetMaxOffsetNumber(parentpage); + for (o = FirstOffsetNumber; o <= parent_maxoff; o = OffsetNumberNext(o)) + { + ItemId p_iid = PageGetItemIdCareful(rel, parentblkno, parentpage, o, sizeof(GinPageOpaqueData)); + IndexTuple itup = (IndexTuple) PageGetItem(parentpage, p_iid); + + if (ItemPointerGetBlockNumber(&(itup->t_tid)) == childblkno) + { + /* Found it! Make copy and return it */ + result = CopyIndexTuple(itup); + break; + } + } + + UnlockReleaseBuffer(parentbuf); + + return result; +} diff --git a/doc/src/sgml/amcheck.sgml b/doc/src/sgml/amcheck.sgml index 9397a69c6e5..7ffa36b2057 100644 --- a/doc/src/sgml/amcheck.sgml +++ b/doc/src/sgml/amcheck.sgml @@ -180,6 +180,25 @@ ORDER BY c.relpages DESC LIMIT 10; </listitem> </varlistentry> + <varlistentry> + <term> + <function>gin_index_parent_check(index regclass, heapallindexed boolean) returns void</function> + <indexterm> + <primary>gin_index_parent_check</primary> + </indexterm> + </term> + + <listitem> + <para> + <function>gin_index_parent_check</function> tests that its target GIN index + has consistent parent-child tuples relations (no parent tuples + require tuple adjustement) and page graph respects balanced-tree + invariants (internal pages reference only leaf page or only internal + pages). + </para> + </listitem> + </varlistentry> + <varlistentry> <term> <function>gist_index_parent_check(index regclass, heapallindexed boolean) returns void</function> -- 2.37.3.542.gdd3f6c4cae