Hi,

On Tue, 10 Sept 2024 at 00:32, Noah Misch <n...@leadboat.com> wrote:
>
> Copying the vm file is a good technique.  In my test runs, this does catch the
> "never detect" bug, but it doesn't catch the blkno bug.  Can you make it catch
> both?  It's possible my hand-patching to recreate the blkno bug is what went
> wrong, so I'm attaching what I used.  It consists of
> v1-0002-Use-read-stream-in-pg_visibility-in-collect_corru.patch plus these
> fixes for the "never detect" bug from your v6-0002:

Your patch is correct. I wrongly assumed it would catch blockno bug,
the attached version catches it. I made blockno = 0 invisible and not
frozen before copying the vm file. So, in the blockno buggy version;
callback will skip that block but the main loop in the
collect_corrupt_items() will not skip it. I tested it with your patch
and there is exactly 1 blockno difference between expected and result
output.

--
Regards,
Nazir Bilal Yavuz
Microsoft
From 342bd8ede69942424b6d0568574db52cb574c479 Mon Sep 17 00:00:00 2001
From: Nazir Bilal Yavuz <byavuz81@gmail.com>
Date: Tue, 10 Sep 2024 13:15:11 +0300
Subject: [PATCH v7 1/2] Add tests that pg_check_[visible|frozen] report
 corrupted tuples

Currently, there are no tests that pg_check_[visible|frozen] need to
report corrupted tuples. Add that kind of tests.

To add these tests, visibility map needs to be corrupted. It is done by
overwriting visibility map by its older state.
---
 contrib/pg_visibility/meson.build         |   1 +
 contrib/pg_visibility/t/002_corrupt_vm.pl | 107 ++++++++++++++++++++++
 2 files changed, 108 insertions(+)
 create mode 100644 contrib/pg_visibility/t/002_corrupt_vm.pl

diff --git a/contrib/pg_visibility/meson.build b/contrib/pg_visibility/meson.build
index f3c1263313a..609fc5f9f3e 100644
--- a/contrib/pg_visibility/meson.build
+++ b/contrib/pg_visibility/meson.build
@@ -36,6 +36,7 @@ tests += {
   'tap': {
     'tests': [
       't/001_concurrent_transaction.pl',
+      't/002_corrupt_vm.pl',
     ],
   },
 }
diff --git a/contrib/pg_visibility/t/002_corrupt_vm.pl b/contrib/pg_visibility/t/002_corrupt_vm.pl
new file mode 100644
index 00000000000..e57f365576f
--- /dev/null
+++ b/contrib/pg_visibility/t/002_corrupt_vm.pl
@@ -0,0 +1,107 @@
+# Copyright (c) 2021-2024, PostgreSQL Global Development Group
+
+# Check that pg_check_visible() and pg_check_frozen() reports
+# correct TIDs when there is a corruption.
+use strict;
+use warnings FATAL => 'all';
+use File::Copy;
+use PostgreSQL::Test::Cluster;
+use PostgreSQL::Test::Utils;
+use Test::More;
+
+my $node = PostgreSQL::Test::Cluster->new('main');
+$node->init;
+$node->start;
+
+my $blck_size = $node->safe_psql("postgres", "SHOW block_size;");
+
+# Create a sample table with at least 10 pages and then run VACUUM. 10 is
+# selected manually as it is big enough to select 5 random tuples from the
+# relation.
+$node->safe_psql(
+	'postgres', qq(
+		CREATE EXTENSION pg_visibility;
+	    CREATE TABLE corruption_test
+            WITH (autovacuum_enabled = false) AS
+           SELECT
+				i,
+				repeat('a', 10) AS data
+			FROM
+    			generate_series(1, $blck_size) i;
+	    VACUUM (FREEZE, DISABLE_PAGE_SKIPPING) corruption_test;
+    )
+);
+
+# VACUUM is run, it is safe to get the number of pages from the relpages.
+my $npages = $node->safe_psql(
+	"postgres",
+	"SELECT relpages FROM pg_class
+		WHERE  relname = 'corruption_test';"
+);
+ok($npages >= 10, "There should be at least 10 pages in the table");
+
+my $file = $node->safe_psql("postgres",
+	"SELECT pg_relation_filepath('corruption_test');");
+
+# Delete the first block to make sure that it will be skipped as it is
+# not visible nor frozen.
+$node->safe_psql(
+	"postgres",
+	"DELETE FROM corruption_test
+		WHERE (ctid::text::point)[0] = 0;"
+);
+
+# Copy visibility map.
+$node->stop;
+my $vm_file = $node->data_dir . '/' . $file . '_vm';
+copy("$vm_file", "${vm_file}_temp");
+$node->start;
+
+# Select 5 random tuples that are starting from the second block of the
+# relation. The first block is skipped because it is deleted above.
+my $tuples = $node->safe_psql(
+	"postgres",
+	"SELECT ctid FROM (
+        SELECT ctid FROM corruption_test
+			WHERE (ctid::text::point)[0] != 0
+            ORDER BY random() LIMIT 5)
+        ORDER BY ctid ASC;"
+);
+
+# Do the changes below to use tuples in the query.
+# "\n" -> ","
+# "(" -> "'("
+# ")" -> ")'"
+(my $tuples_query = $tuples) =~ s/\n/,/g;
+$tuples_query =~ s/\(/\'\(/g;
+$tuples_query =~ s/\)/\)\'/g;
+
+$node->safe_psql(
+	"postgres",
+	"DELETE FROM corruption_test
+		WHERE ctid in ($tuples_query);"
+);
+
+# Overwrite visibility map with the old one.
+$node->stop;
+move("${vm_file}_temp", "$vm_file");
+$node->start;
+
+my $result = $node->safe_psql(
+	"postgres",
+	"SELECT DISTINCT t_ctid
+		FROM pg_check_visible('corruption_test')
+		ORDER BY t_ctid ASC;"
+);
+is($result, $tuples, 'pg_check_visible must report tuples as corrupted');
+
+$result = $node->safe_psql(
+	"postgres",
+	"SELECT DISTINCT t_ctid
+		FROM pg_check_frozen('corruption_test')
+		ORDER BY t_ctid ASC;"
+);
+is($result, $tuples, 'pg_check_frozen must report tuples as corrupted');
+
+$node->stop;
+done_testing();
-- 
2.45.2

From ec0a22440971e92a696a4c2f52d1eae7b1f62078 Mon Sep 17 00:00:00 2001
From: Noah Misch <noah@leadboat.com>
Date: Tue, 3 Sep 2024 10:46:20 -0700
Subject: [PATCH v7 2/2] Optimize pg_visibility with read streams.

We've measured 5% performance improvement, and this arranges to benefit
automatically from future optimizations to the read_stream subsystem.

Nazir Bilal Yavuz

Discussion: CAN55FZ1_Ru3XpMgTwsU67FTH2fs_FrRROmb7x6zs+F44QBEiww@mail.gmail.com">https://postgr.es/m/CAN55FZ1_Ru3XpMgTwsU67FTH2fs_FrRROmb7x6zs+F44QBEiww@mail.gmail.com
---
 contrib/pg_visibility/pg_visibility.c | 113 +++++++++++++++++++++-----
 1 file changed, 92 insertions(+), 21 deletions(-)

diff --git a/contrib/pg_visibility/pg_visibility.c b/contrib/pg_visibility/pg_visibility.c
index 773ba92e454..724122b1bc5 100644
--- a/contrib/pg_visibility/pg_visibility.c
+++ b/contrib/pg_visibility/pg_visibility.c
@@ -21,6 +21,7 @@
 #include "storage/bufmgr.h"
 #include "storage/proc.h"
 #include "storage/procarray.h"
+#include "storage/read_stream.h"
 #include "storage/smgr.h"
 #include "utils/rel.h"
 #include "utils/snapmgr.h"
@@ -41,6 +42,17 @@ typedef struct corrupt_items
 	ItemPointer tids;
 } corrupt_items;
 
+/* for collect_corrupt_items_read_stream_next_block */
+struct collect_corrupt_items_read_stream_private
+{
+	bool		all_frozen;
+	bool		all_visible;
+	BlockNumber current_blocknum;
+	BlockNumber last_exclusive;
+	Relation	rel;
+	Buffer		vmbuffer;
+};
+
 PG_FUNCTION_INFO_V1(pg_visibility_map);
 PG_FUNCTION_INFO_V1(pg_visibility_map_rel);
 PG_FUNCTION_INFO_V1(pg_visibility);
@@ -478,6 +490,8 @@ collect_visibility_data(Oid relid, bool include_pd)
 	BlockNumber blkno;
 	Buffer		vmbuffer = InvalidBuffer;
 	BufferAccessStrategy bstrategy = GetAccessStrategy(BAS_BULKREAD);
+	BlockRangeReadStreamPrivate p;
+	ReadStream *stream = NULL;
 
 	rel = relation_open(relid, AccessShareLock);
 
@@ -489,6 +503,20 @@ collect_visibility_data(Oid relid, bool include_pd)
 	info->next = 0;
 	info->count = nblocks;
 
+	/* Create a stream if reading main fork. */
+	if (include_pd)
+	{
+		p.current_blocknum = 0;
+		p.last_exclusive = nblocks;
+		stream = read_stream_begin_relation(READ_STREAM_FULL,
+											bstrategy,
+											rel,
+											MAIN_FORKNUM,
+											block_range_read_stream_cb,
+											&p,
+											0);
+	}
+
 	for (blkno = 0; blkno < nblocks; ++blkno)
 	{
 		int32		mapbits;
@@ -513,8 +541,7 @@ collect_visibility_data(Oid relid, bool include_pd)
 			Buffer		buffer;
 			Page		page;
 
-			buffer = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL,
-										bstrategy);
+			buffer = read_stream_next_buffer(stream, NULL);
 			LockBuffer(buffer, BUFFER_LOCK_SHARE);
 
 			page = BufferGetPage(buffer);
@@ -525,6 +552,12 @@ collect_visibility_data(Oid relid, bool include_pd)
 		}
 	}
 
+	if (include_pd)
+	{
+		Assert(read_stream_next_buffer(stream, NULL) == InvalidBuffer);
+		read_stream_end(stream);
+	}
+
 	/* Clean up. */
 	if (vmbuffer != InvalidBuffer)
 		ReleaseBuffer(vmbuffer);
@@ -610,6 +643,38 @@ GetStrictOldestNonRemovableTransactionId(Relation rel)
 	}
 }
 
+/*
+ * Callback function to get next block for read stream object used in
+ * collect_corrupt_items() function.
+ */
+static BlockNumber
+collect_corrupt_items_read_stream_next_block(ReadStream *stream,
+											 void *callback_private_data,
+											 void *per_buffer_data)
+{
+	struct collect_corrupt_items_read_stream_private *p = callback_private_data;
+
+	for (; p->current_blocknum < p->last_exclusive; p->current_blocknum++)
+	{
+		bool		check_frozen = false;
+		bool		check_visible = false;
+
+		/* Make sure we are interruptible. */
+		CHECK_FOR_INTERRUPTS();
+
+		if (p->all_frozen && VM_ALL_FROZEN(p->rel, p->current_blocknum, &p->vmbuffer))
+			check_frozen = true;
+		if (p->all_visible && VM_ALL_VISIBLE(p->rel, p->current_blocknum, &p->vmbuffer))
+			check_visible = true;
+		if (!check_visible && !check_frozen)
+			continue;
+
+		return p->current_blocknum++;
+	}
+
+	return InvalidBlockNumber;
+}
+
 /*
  * Returns a list of items whose visibility map information does not match
  * the status of the tuples on the page.
@@ -628,12 +693,13 @@ static corrupt_items *
 collect_corrupt_items(Oid relid, bool all_visible, bool all_frozen)
 {
 	Relation	rel;
-	BlockNumber nblocks;
 	corrupt_items *items;
-	BlockNumber blkno;
 	Buffer		vmbuffer = InvalidBuffer;
 	BufferAccessStrategy bstrategy = GetAccessStrategy(BAS_BULKREAD);
 	TransactionId OldestXmin = InvalidTransactionId;
+	struct collect_corrupt_items_read_stream_private p;
+	ReadStream *stream;
+	Buffer		buffer;
 
 	rel = relation_open(relid, AccessShareLock);
 
@@ -643,8 +709,6 @@ collect_corrupt_items(Oid relid, bool all_visible, bool all_frozen)
 	if (all_visible)
 		OldestXmin = GetStrictOldestNonRemovableTransactionId(rel);
 
-	nblocks = RelationGetNumberOfBlocks(rel);
-
 	/*
 	 * Guess an initial array size. We don't expect many corrupted tuples, so
 	 * start with a small array.  This function uses the "next" field to track
@@ -658,34 +722,38 @@ collect_corrupt_items(Oid relid, bool all_visible, bool all_frozen)
 	items->count = 64;
 	items->tids = palloc(items->count * sizeof(ItemPointerData));
 
+	p.current_blocknum = 0;
+	p.last_exclusive = RelationGetNumberOfBlocks(rel);
+	p.rel = rel;
+	p.vmbuffer = InvalidBuffer;
+	p.all_frozen = all_frozen;
+	p.all_visible = all_visible;
+	stream = read_stream_begin_relation(READ_STREAM_FULL,
+										bstrategy,
+										rel,
+										MAIN_FORKNUM,
+										collect_corrupt_items_read_stream_next_block,
+										&p,
+										0);
+
 	/* Loop over every block in the relation. */
-	for (blkno = 0; blkno < nblocks; ++blkno)
+	while ((buffer = read_stream_next_buffer(stream, NULL)) != InvalidBuffer)
 	{
-		bool		check_frozen = false;
-		bool		check_visible = false;
-		Buffer		buffer;
+		bool		check_frozen = all_frozen;
+		bool		check_visible = all_visible;
 		Page		page;
 		OffsetNumber offnum,
 					maxoff;
+		BlockNumber blkno;
 
 		/* Make sure we are interruptible. */
 		CHECK_FOR_INTERRUPTS();
 
-		/* Use the visibility map to decide whether to check this page. */
-		if (all_frozen && VM_ALL_FROZEN(rel, blkno, &vmbuffer))
-			check_frozen = true;
-		if (all_visible && VM_ALL_VISIBLE(rel, blkno, &vmbuffer))
-			check_visible = true;
-		if (!check_visible && !check_frozen)
-			continue;
-
-		/* Read and lock the page. */
-		buffer = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL,
-									bstrategy);
 		LockBuffer(buffer, BUFFER_LOCK_SHARE);
 
 		page = BufferGetPage(buffer);
 		maxoff = PageGetMaxOffsetNumber(page);
+		blkno = BufferGetBlockNumber(buffer);
 
 		/*
 		 * The visibility map bits might have changed while we were acquiring
@@ -778,10 +846,13 @@ collect_corrupt_items(Oid relid, bool all_visible, bool all_frozen)
 
 		UnlockReleaseBuffer(buffer);
 	}
+	read_stream_end(stream);
 
 	/* Clean up. */
 	if (vmbuffer != InvalidBuffer)
 		ReleaseBuffer(vmbuffer);
+	if (p.vmbuffer != InvalidBuffer)
+		ReleaseBuffer(p.vmbuffer);
 	relation_close(rel, AccessShareLock);
 
 	/*
-- 
2.45.2

Reply via email to