Suppose we start with this nbtree (subset of a diagram from verify_nbtree.c):
* 1
* / \
* 2 <-> 3
We're deleting 2, the leftmost leaf under a leftmost internal page. After the
MARK_PAGE_HALFDEAD record, the first downlink from 1 will lead to 3, which
still has a btpo_prev pointing to 2. bt_index_parent_check() complains here:
/* The first page we visit at the level should be leftmost */
if (first && !BlockNumberIsValid(state->prevrightlink) &&
!P_LEFTMOST(opaque))
ereport(ERROR,
(errcode(ERRCODE_INDEX_CORRUPTED),
errmsg("the first child of leftmost
target page is not leftmost of its level in index \"%s\"",
RelationGetRelationName(state->rel)),
errdetail_internal("Target block=%u
child block=%u target page lsn=%X/%X.",
state->targetblock, blkno,
LSN_FORMAT_ARGS(state->targetlsn))));
One can encounter this if recovery ends between a MARK_PAGE_HALFDEAD record
and its corresponding UNLINK_PAGE record. See the attached test case. The
index is actually fine in such a state, right? I lean toward fixing this by
having amcheck scan left; if left links reach only half-dead or deleted pages,
that's as good as the present child block being P_LEFTMOST. There's a
different error from bt_index_check(), and I've not yet studied how to fix
that:
ERROR: left link/right link pair in index "not_leftmost_pk" not in agreement
DETAIL: Block=0 left block=0 left link from block=4294967295.
Alternatively, one could view this as a need for the user to VACUUM between
recovery and amcheck. The documentation could direct users to "VACUUM
(DISABLE_PAGE_SKIPPING off, INDEX_CLEANUP on, TRUNCATE off)" if not done since
last recovery. Does anyone prefer that or some other alternative?
For some other amcheck expectations, the comments suggest reliance on the
bt_index_parent_check() ShareLock. I haven't tried to make test cases for
them, but perhaps recovery can trick them the same way. Examples:
errmsg("downlink or sibling link points to deleted block in index \"%s\"",
errmsg("block %u is not leftmost in index \"%s\"",
errmsg("block %u is not true root in index \"%s\"",
Thanks,
nm
Author: Noah Misch <[email protected]>
Commit: Noah Misch <[email protected]>
diff --git a/contrib/amcheck/Makefile b/contrib/amcheck/Makefile
index b82f221..9a7f4f7 100644
--- a/contrib/amcheck/Makefile
+++ b/contrib/amcheck/Makefile
@@ -13,6 +13,7 @@ PGFILEDESC = "amcheck - function for verifying relation
integrity"
REGRESS = check check_btree check_heap
TAP_TESTS = 1
+EXTRA_INSTALL=contrib/pg_walinspect
ifdef USE_PGXS
PG_CONFIG = pg_config
diff --git a/contrib/amcheck/t/004_pitr.pl b/contrib/amcheck/t/004_pitr.pl
new file mode 100644
index 0000000..ec6d87e
--- /dev/null
+++ b/contrib/amcheck/t/004_pitr.pl
@@ -0,0 +1,65 @@
+
+# Copyright (c) 2021-2023, PostgreSQL Global Development Group
+
+# Test integrity of intermediate states by PITR to those states
+use strict;
+use warnings;
+use PostgreSQL::Test::Cluster;
+use PostgreSQL::Test::Utils;
+use Test::More;
+
+# origin node: generate WAL records of interest.
+my $origin = PostgreSQL::Test::Cluster->new('origin');
+$origin->init(has_archiving => 1, allows_streaming => 1);
+$origin->append_conf('postgresql.conf', 'autovacuum = off');
+$origin->start;
+$origin->backup('my_backup');
+# Create a table with each of 6 PK values spanning 1/4 of a block. Delete the
+# first four, so one index leaf is eligible for deletion. Make a replication
+# slot just so pg_walinspect will always have access to later WAL.
+my $setup = <<EOSQL;
+BEGIN;
+CREATE EXTENSION amcheck;
+CREATE EXTENSION pg_walinspect;
+CREATE TABLE not_leftmost (c text STORAGE PLAIN);
+INSERT INTO not_leftmost
+ SELECT repeat(n::text, database_block_size / 4)
+ FROM generate_series(1,6) t(n), pg_control_init();
+ALTER TABLE not_leftmost ADD CONSTRAINT not_leftmost_pk PRIMARY KEY (c);
+DELETE FROM not_leftmost WHERE c ~ '^[1-4]';
+SELECT pg_create_physical_replication_slot('for_walinspect', true, false);
+COMMIT;
+EOSQL
+$origin->safe_psql('postgres', $setup);
+my $before_vacuum_lsn =
+ $origin->safe_psql('postgres', "SELECT pg_current_wal_lsn()");
+# VACUUM to delete the aforementioned leaf page. Find the LSN of that
+# UNLINK_PAGE record.
+my $exec = <<EOSQL;
+VACUUM VERBOSE not_leftmost;
+SELECT max(start_lsn)
+ FROM pg_get_wal_records_info('$before_vacuum_lsn', 'FFFFFFFF/FFFFFFFF')
+ WHERE resource_manager = 'Btree' AND record_type = 'UNLINK_PAGE';
+EOSQL
+my $unlink_lsn = $origin->safe_psql('postgres', $exec);
+$origin->stop;
+
+# replica node: amcheck at notable points in the WAL stream
+my $replica = PostgreSQL::Test::Cluster->new('replica');
+$replica->init_from_backup($origin, 'my_backup', has_restoring => 1);
+$replica->append_conf('postgresql.conf',
+ "recovery_target_lsn = '$unlink_lsn'");
+$replica->append_conf('postgresql.conf', 'recovery_target_inclusive = off');
+$replica->append_conf('postgresql.conf', 'recovery_target_action = promote');
+$replica->start;
+$replica->poll_query_until('postgres', "SELECT pg_is_in_recovery() = 'f';")
+ or die "Timed out while waiting for PITR promotion";
+# recovery done; run amcheck
+is( $replica->psql(
+ 'postgres', "SELECT bt_index_parent_check('not_leftmost_pk',
true)"),
+ 0);
+is( $replica->psql(
+ 'postgres', "SELECT bt_index_check('not_leftmost_pk', true)"),
+ 0);
+
+done_testing();