On 27/03/2019 11:51, Andrey Borodin wrote:
Hi!
Here's new version of GiST amcheck, which takes into account recently committed
GiST VACUUM.
It tests that deleted pages do not contain any data.
Thanks! I had a look, and refactored it quite a bit.
I found the way the recursion worked confusing. On each internal page,
it looped through all the child nodes, checking the consistency of the
downlinks. And then it looped through the children again, to recurse.
This isn't performance-critical, but visiting every page twice still
seems strange.
In gist_check_page_keys(), if we get into the code to deal with a
concurrent update, we set 'parent' to point to a tuple on a parent page,
then unlock it, and continue to look at remaining tuples, using the
pointer that points to an unlocked buffer.
I came up with the attached, which fixes the above-mentioned things. I
also replaced the check that each node has only internal or leaf
children, with a different check that the tree has the same height in
all branches. That catches more potential problems, and was easier to
implement after the refactoring. This still needs at least a round of
fixing typos and tidying up comments, but it's more straightforward now,
IMHO.
What have you been using to test this?
- Heikki
diff --git a/contrib/amcheck/Makefile b/contrib/amcheck/Makefile
index dcec3b85203..dd9b5ecf926 100644
--- a/contrib/amcheck/Makefile
+++ b/contrib/amcheck/Makefile
@@ -1,13 +1,13 @@
# contrib/amcheck/Makefile
MODULE_big = amcheck
-OBJS = verify_nbtree.o $(WIN32RES)
+OBJS = verify_nbtree.o verify_gist.o $(WIN32RES)
EXTENSION = amcheck
DATA = amcheck--1.1--1.2.sql amcheck--1.0--1.1.sql amcheck--1.0.sql
PGFILEDESC = "amcheck - function for verifying relation integrity"
-REGRESS = check check_btree
+REGRESS = check check_btree check_gist
ifdef USE_PGXS
PG_CONFIG = pg_config
diff --git a/contrib/amcheck/amcheck--1.1--1.2.sql b/contrib/amcheck/amcheck--1.1--1.2.sql
index 883530dec74..1d461fff5b9 100644
--- a/contrib/amcheck/amcheck--1.1--1.2.sql
+++ b/contrib/amcheck/amcheck--1.1--1.2.sql
@@ -17,3 +17,13 @@ LANGUAGE C STRICT PARALLEL RESTRICTED;
-- Don't want this to be available to public
REVOKE ALL ON FUNCTION bt_index_parent_check(regclass, boolean, boolean) FROM PUBLIC;
+
+--
+-- gist_index_parent_check()
+--
+CREATE FUNCTION gist_index_parent_check(index regclass)
+RETURNS VOID
+AS 'MODULE_PATHNAME', 'gist_index_parent_check'
+LANGUAGE C STRICT;
+
+REVOKE ALL ON FUNCTION gist_index_parent_check(regclass) FROM PUBLIC;
diff --git a/contrib/amcheck/amcheck.h b/contrib/amcheck/amcheck.h
new file mode 100644
index 00000000000..84da7d7ec0f
--- /dev/null
+++ b/contrib/amcheck/amcheck.h
@@ -0,0 +1,31 @@
+/*-------------------------------------------------------------------------
+ *
+ * amcheck.h
+ * Shared routines for amcheck verifications.
+ *
+ * Copyright (c) 2019, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ * contrib/amcheck/amcheck.h
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "access/htup_details.h"
+#include "access/transam.h"
+#include "catalog/index.h"
+#include "catalog/pg_am.h"
+#include "commands/tablecmds.h"
+#include "miscadmin.h"
+#include "storage/lmgr.h"
+#include "utils/memutils.h"
+#include "utils/snapmgr.h"
+
+extern void
+amcheck_lock_relation(Oid indrelid, bool parentcheck,Relation *indrel,
+ Relation *heaprel, LOCKMODE *lockmode);
+
+extern void
+amcheck_unlock_relation(Oid indrelid, Relation indrel, Relation heaprel, LOCKMODE lockmode);
diff --git a/contrib/amcheck/verify_gist.c b/contrib/amcheck/verify_gist.c
new file mode 100644
index 00000000000..8b37b20bc9c
--- /dev/null
+++ b/contrib/amcheck/verify_gist.c
@@ -0,0 +1,348 @@
+/*-------------------------------------------------------------------------
+ *
+ * verify_gist.c
+ * Verifies the integrity of GiST indexes based on invariants.
+ *
+ * Verification checks that all paths in GiST graph contain
+ * consistent keys: tuples on parent pages consistently include tuples
+ * from children pages. Also, verification checks graph invariants:
+ * internal page must have at least one downlinks, internal page can
+ * reference either only leaf pages or only internal pages.
+ *
+ *
+ * Copyright (c) 2017-2019, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ * contrib/amcheck/verify_gist.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/gist_private.h"
+#include "amcheck.h"
+
+
+typedef struct GistScanItem
+{
+ int depth;
+ IndexTuple parenttup;
+ BlockNumber parentblk;
+ XLogRecPtr parentlsn;
+ BlockNumber blkno;
+ struct GistScanItem *next;
+} GistScanItem;
+
+static void check_index_page(Relation rel, Buffer buffer);
+
+static IndexTuple gist_refind_parent(Relation rel, BlockNumber parentblkno, BlockNumber childblkno, BufferAccessStrategy strategy);
+
+static void gist_check_parent_keys_consistency(Relation rel);
+
+static void gist_index_checkable(Relation rel);
+
+static void
+check_index_page(Relation rel, Buffer buffer)
+{
+ Page page = BufferGetPage(buffer);
+
+ gistcheckpage(rel, buffer);
+ if (GistPageGetOpaque(page)->gist_page_id != GIST_PAGE_ID)
+ ereport(ERROR,
+ (errcode(ERRCODE_INDEX_CORRUPTED),
+ errmsg("index \"%s\" has corrupted pages",
+ RelationGetRelationName(rel))));
+ if (GistPageIsDeleted(page))
+ {
+ elog(ERROR,"boom");
+ if (!GistPageIsLeaf(page))
+ ereport(ERROR,
+ (errcode(ERRCODE_INDEX_CORRUPTED),
+ errmsg("index \"%s\" has deleted internal page",
+ RelationGetRelationName(rel))));
+ if (PageGetMaxOffsetNumber(page) > InvalidOffsetNumber)
+ ereport(ERROR,
+ (errcode(ERRCODE_INDEX_CORRUPTED),
+ errmsg("index \"%s\" has deleted page with tuples",
+ RelationGetRelationName(rel))));
+ }
+}
+
+/*
+ * Try to re-find downlink pointing to 'blkno', in 'parentblkno'.
+ *
+ * If found, returns a palloc'd copy of the downlink tuple. Otherwise,
+ * returns NULL.
+ */
+static IndexTuple
+gist_refind_parent(Relation rel, BlockNumber parentblkno, BlockNumber childblkno,
+ BufferAccessStrategy strategy)
+{
+ Buffer parentbuf;
+ Page parentpage;
+ OffsetNumber o,
+ parent_maxoff;
+ IndexTuple result = NULL;
+
+ parentbuf = ReadBufferExtended(rel, MAIN_FORKNUM, childblkno,
+ RBM_NORMAL, strategy);
+
+ LockBuffer(parentbuf, GIST_SHARE);
+ parentpage = BufferGetPage(parentbuf);
+ parent_maxoff = PageGetMaxOffsetNumber(parentpage);
+ for (o = FirstOffsetNumber; o <= parent_maxoff; o = OffsetNumberNext(o))
+ {
+ ItemId p_iid = PageGetItemId(parentpage, o);
+ IndexTuple itup = (IndexTuple) PageGetItem(parentpage, p_iid);
+
+ if (ItemPointerGetBlockNumber(&(itup->t_tid)) == childblkno)
+ {
+ /* Found it! Make copy and return it */
+ result = CopyIndexTuple(itup);
+ break;
+ }
+ }
+
+ LockBuffer(parentbuf, GIST_UNLOCK);
+
+ return result;
+}
+
+/*
+ * Main entry point for GiST check. Allocates memory context and scans
+ * through GiST graph.
+ * This function verifies that tuples of internal pages cover all the key
+ * space of each tuple on leaf page. To do this we invoke
+ * gist_check_internal_page() for every internal page.
+ *
+ * gist_check_internal_page() in it's turn takes every tuple and tries
+ * to adjust it by tuples on referenced child page. Parent gist tuple should
+ * never requre an adjustement.
+ */
+static void
+gist_check_parent_keys_consistency(Relation rel)
+{
+ BufferAccessStrategy strategy = GetAccessStrategy(BAS_BULKREAD);
+ GistScanItem *stack;
+ MemoryContext mctx;
+ MemoryContext oldcontext;
+ GISTSTATE *state;
+ int leafdepth;
+
+ mctx = AllocSetContextCreate(CurrentMemoryContext,
+ "amcheck context",
+ ALLOCSET_DEFAULT_SIZES);
+ oldcontext = MemoryContextSwitchTo(mctx);
+
+ state = initGISTstate(rel);
+
+ /*
+ * We don't know the height of the tree yet, but as soon as we encounter
+ * a leaf page, we will set 'leafdepth' to its depth.
+ */
+ leafdepth = -1;
+
+ /* Start the scan at the root page */
+ stack = (GistScanItem *) palloc0(sizeof(GistScanItem));
+ stack->depth = 0;
+ stack->parenttup = NULL;
+ stack->parentblk = InvalidBlockNumber;
+ stack->parentlsn = InvalidXLogRecPtr;
+ stack->blkno = GIST_ROOT_BLKNO;
+
+ while (stack)
+ {
+ GistScanItem *stack_next;
+ Buffer buffer;
+ Page page;
+ OffsetNumber i,
+ maxoff;
+ XLogRecPtr lsn;
+
+ CHECK_FOR_INTERRUPTS();
+
+ buffer = ReadBufferExtended(rel, MAIN_FORKNUM, stack->blkno,
+ RBM_NORMAL, strategy);
+ LockBuffer(buffer, GIST_SHARE);
+ page = (Page) BufferGetPage(buffer);
+ lsn = BufferGetLSNAtomic(buffer);
+
+ /* Do basic sanity checks on the page headers */
+ check_index_page(rel, buffer);
+
+ /*
+ * It's possible that the page was split since we looked at the parent,
+ * so that we didn't missed the downlink of the right sibling when we
+ * scanned the parent. If so, add the right sibling to the stack now.
+ */
+ if (GistFollowRight(page) || stack->parentlsn < GistPageGetNSN(page))
+ {
+ /* split page detected, install right link to the stack */
+ GistScanItem *ptr = (GistScanItem *) palloc(sizeof(GistScanItem));
+
+ ptr->depth = stack->depth;
+ ptr->parenttup = CopyIndexTuple(stack->parenttup);
+ ptr->parentblk = stack->parentblk;
+ ptr->parentlsn = stack->parentlsn;
+ ptr->blkno = GistPageGetOpaque(page)->rightlink;
+ ptr->next = stack->next;
+ stack->next = ptr;
+ }
+
+ /* Check that the tree has the same height in all branches */
+ if (GistPageIsLeaf(page))
+ {
+ if (leafdepth == -1)
+ leafdepth = stack->depth;
+ else if (stack->depth != leafdepth)
+ {
+ ereport(ERROR,
+ (errcode(ERRCODE_INDEX_CORRUPTED),
+ errmsg("index \"%s\": internal pages traversal encountered leaf page unexpectedly",
+ RelationGetRelationName(rel))));
+ }
+ }
+
+ /*
+ * Check that each tuple looks valid, and is consistent with the
+ * downlink we followed when we stepped on this page.
+ */
+ maxoff = PageGetMaxOffsetNumber(page);
+ for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i))
+ {
+ ItemId iid = PageGetItemId(page, i);
+ IndexTuple idxtuple = (IndexTuple) PageGetItem(page, iid);
+
+ /*
+ * Check that it's not a leftover invalid tuple from pre-9.1
+ * See also gistdoinsert() and gistbulkdelete() handlding of such
+ * tuples. We do consider it error here.
+ */
+ if (GistTupleIsInvalid(idxtuple))
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("index \"%s\" contains an inner tuple marked as invalid",
+ RelationGetRelationName(rel)),
+ errdetail("This is caused by an incomplete page split at crash recovery before upgrading to PostgreSQL 9.1."),
+ errhint("Please REINDEX it.")));
+
+ if (MAXALIGN(ItemIdGetLength(iid)) != MAXALIGN(IndexTupleSize(idxtuple)))
+ ereport(ERROR,
+ (errcode(ERRCODE_INDEX_CORRUPTED),
+ errmsg("index \"%s\" has inconsistent tuple sizes",
+ RelationGetRelationName(rel))));
+
+ /*
+ * Check if this tuple is consistent with the downlink in the
+ * parent.
+ *
+ * XXX: shouldn't we rather use gist_consistent?
+ */
+ if (stack->parenttup && gistgetadjusted(rel, stack->parenttup, idxtuple, state))
+ {
+ /*
+ * There was a discrepancy between parent and child tuples.
+ * We need to verify it is not a result of concurrent call
+ * of gistplacetopage(). So, lock parent and try to find downlink
+ * for current page. It may be missing due to concurrent page
+ * split, this is OK.
+ */
+ pfree(stack->parenttup);
+ stack->parenttup = gist_refind_parent(rel, stack->parentblk, stack->blkno, strategy);
+
+ /* We found it - make a final check before failing */
+ if (stack->parenttup && gistgetadjusted(rel, stack->parenttup, idxtuple, state))
+ {
+ ereport(ERROR,
+ (errcode(ERRCODE_INDEX_CORRUPTED),
+ errmsg("index \"%s\" has inconsistent records",
+ RelationGetRelationName(rel))));
+ }
+ else
+ {
+ /*
+ * But now it is properly adjusted - nothing to do here.
+ */
+ }
+ }
+
+ /* If this is an internal page, recurse into the child */
+ if (!GistPageIsLeaf(page))
+ {
+ GistScanItem *ptr = (GistScanItem *) palloc(sizeof(GistScanItem));
+
+ ptr->depth = stack->depth + 1;
+ ptr->parenttup = CopyIndexTuple(idxtuple);
+ ptr->parentblk = stack->blkno;
+ ptr->blkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid));
+ ptr->parentlsn = lsn;
+ ptr->next = stack->next;
+ stack->next = ptr;
+ }
+ }
+
+ LockBuffer(buffer, GIST_UNLOCK);
+ ReleaseBuffer(buffer);
+
+ /* Step to next item in the queue */
+ stack_next = stack->next;
+ if (stack->parenttup)
+ pfree(stack->parenttup);
+ pfree(stack);
+ stack = stack_next;
+ }
+
+ MemoryContextSwitchTo(oldcontext);
+ MemoryContextDelete(mctx);
+}
+
+/* Check that relation is eligible for GiST verification */
+static void
+gist_index_checkable(Relation rel)
+{
+ if (rel->rd_rel->relkind != RELKIND_INDEX ||
+ rel->rd_rel->relam != GIST_AM_OID)
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("only GiST indexes are supported as targets for this"
+ " verification"),
+ errdetail("Relation \"%s\" is not a GiST index.",
+ RelationGetRelationName(rel))));
+
+ if (RELATION_IS_OTHER_TEMP(rel))
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("cannot access temporary tables of other sessions"),
+ errdetail("Index \"%s\" is associated with temporary relation.",
+ RelationGetRelationName(rel))));
+
+ if (!rel->rd_index->indisvalid)
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("cannot check index \"%s\"",
+ RelationGetRelationName(rel)),
+ errdetail("Index is not valid")));
+}
+
+PG_FUNCTION_INFO_V1(gist_index_parent_check);
+
+Datum
+gist_index_parent_check(PG_FUNCTION_ARGS)
+{
+ Oid indrelid = PG_GETARG_OID(0);
+ Relation indrel;
+ Relation heaprel;
+ LOCKMODE lockmode;
+
+ /* lock table and index with neccesary level */
+ amcheck_lock_relation(indrelid, true, &indrel, &heaprel, &lockmode);
+
+ /* verify that this is GiST eligible for check */
+ gist_index_checkable(indrel);
+ gist_check_parent_keys_consistency(indrel);
+
+ /* Unlock index and table */
+ amcheck_unlock_relation(indrelid, indrel, heaprel, lockmode);
+
+ PG_RETURN_VOID();
+}
diff --git a/contrib/amcheck/verify_nbtree.c b/contrib/amcheck/verify_nbtree.c
index 6ae3bca9536..50991a656a5 100644
--- a/contrib/amcheck/verify_nbtree.c
+++ b/contrib/amcheck/verify_nbtree.c
@@ -21,23 +21,14 @@
*
*-------------------------------------------------------------------------
*/
-#include "postgres.h"
+#include "amcheck.h"
#include "access/heapam.h"
-#include "access/htup_details.h"
#include "access/nbtree.h"
#include "access/tableam.h"
#include "access/transam.h"
#include "access/xact.h"
-#include "catalog/index.h"
-#include "catalog/pg_am.h"
-#include "commands/tablecmds.h"
#include "lib/bloomfilter.h"
-#include "miscadmin.h"
-#include "storage/lmgr.h"
-#include "utils/memutils.h"
-#include "utils/snapmgr.h"
-
PG_MODULE_MAGIC;
@@ -212,23 +203,18 @@ bt_index_parent_check(PG_FUNCTION_ARGS)
PG_RETURN_VOID();
}
-/*
- * Helper for bt_index_[parent_]check, coordinating the bulk of the work.
- */
-static void
-bt_index_check_internal(Oid indrelid, bool parentcheck, bool heapallindexed,
- bool rootdescend)
+
+/* Lock aquisition reused accross different am types */
+void
+amcheck_lock_relation(Oid indrelid, bool parentcheck, Relation *indrel,
+ Relation *heaprel, LOCKMODE *lockmode)
{
Oid heapid;
- Relation indrel;
- Relation heaprel;
- bool heapkeyspace;
- LOCKMODE lockmode;
if (parentcheck)
- lockmode = ShareLock;
+ *lockmode = ShareLock;
else
- lockmode = AccessShareLock;
+ *lockmode = AccessShareLock;
/*
* We must lock table before index to avoid deadlocks. However, if the
@@ -240,9 +226,9 @@ bt_index_check_internal(Oid indrelid, bool parentcheck, bool heapallindexed,
*/
heapid = IndexGetRelation(indrelid, true);
if (OidIsValid(heapid))
- heaprel = table_open(heapid, lockmode);
+ *heaprel = heap_open(heapid, *lockmode);
else
- heaprel = NULL;
+ *heaprel = NULL;
/*
* Open the target index relations separately (like relation_openrv(), but
@@ -256,27 +242,23 @@ bt_index_check_internal(Oid indrelid, bool parentcheck, bool heapallindexed,
* committed or recently dead heap tuples lacking index entries due to
* concurrent activity.)
*/
- indrel = index_open(indrelid, lockmode);
+ *indrel = index_open(indrelid, *lockmode);
/*
* Since we did the IndexGetRelation call above without any lock, it's
* barely possible that a race against an index drop/recreation could have
* netted us the wrong table.
*/
- if (heaprel == NULL || heapid != IndexGetRelation(indrelid, false))
+ if (*heaprel == NULL || heapid != IndexGetRelation(indrelid, false))
ereport(ERROR,
(errcode(ERRCODE_UNDEFINED_TABLE),
errmsg("could not open parent table of index %s",
- RelationGetRelationName(indrel))));
-
- /* Relation suitable for checking as B-Tree? */
- btree_index_checkable(indrel);
-
- /* Check index, possibly against table it is an index on */
- heapkeyspace = _bt_heapkeyspace(indrel);
- bt_check_every_level(indrel, heaprel, heapkeyspace, parentcheck,
- heapallindexed, rootdescend);
+ RelationGetRelationName(*indrel))));
+}
+/* Pair for amcheck_lock_relation() */
+void amcheck_unlock_relation(Oid indrelid, Relation indrel, Relation heaprel, LOCKMODE lockmode)
+{
/*
* Release locks early. That's ok here because nothing in the called
* routines will trigger shared cache invalidations to be sent, so we can
@@ -287,6 +269,33 @@ bt_index_check_internal(Oid indrelid, bool parentcheck, bool heapallindexed,
table_close(heaprel, lockmode);
}
+/*
+ * Helper for bt_index_[parent_]check, coordinating the bulk of the work.
+ */
+static void
+bt_index_check_internal(Oid indrelid, bool parentcheck, bool heapallindexed,
+ bool rootdescend)
+{
+ Relation indrel;
+ Relation heaprel;
+ LOCKMODE lockmode;
+ bool heapkeyspace;
+
+ /* lock table and index with neccesary level */
+ amcheck_lock_relation(indrelid, parentcheck, &indrel, &heaprel, &lockmode);
+
+ /* Relation suitable for checking as B-Tree? */
+ btree_index_checkable(indrel);
+
+ /* Check index, possibly against table it is an index on */
+ heapkeyspace = _bt_heapkeyspace(indrel);
+ bt_check_every_level(indrel, heaprel, heapkeyspace, parentcheck,
+ heapallindexed, rootdescend);
+
+ /* Unlock index and table */
+ amcheck_unlock_relation(indrelid, indrel, heaprel, lockmode);
+}
+
/*
* Basic checks about the suitability of a relation for checking as a B-Tree
* index.
diff --git a/doc/src/sgml/amcheck.sgml b/doc/src/sgml/amcheck.sgml
index 627651d8d4a..34842eaebf6 100644
--- a/doc/src/sgml/amcheck.sgml
+++ b/doc/src/sgml/amcheck.sgml
@@ -165,6 +165,27 @@ ORDER BY c.relpages DESC LIMIT 10;
</para>
</listitem>
</varlistentry>
+
+ <varlistentry>
+ <term>
+ <function>gist_index_parent_check(index regclass) returns void</function>
+ <indexterm>
+ <primary>gist_index_parent_check</primary>
+ </indexterm>
+ </term>
+
+ <listitem>
+ <para>
+ <function>gist_index_parent_check</function> tests that its target GiST
+ has consistent parent-child tuples relations (no parent tuples
+ require tuple adjustement) and page graph respects balanced-tree
+ invariants (internal pages reference only leaf page or only internal
+ pages). As with <function>bt_index_parent_check</function>, the
+ <function>gist_index_parent_check</function> aquires
+ <literal>ShareLock</literal> on index and heap relations.
+ </para>
+ </listitem>
+ </varlistentry>
</variablelist>
</sect2>