Hi,

On Thu, 5 Sept 2024 at 18:54, Noah Misch <n...@leadboat.com> wrote:
>
> On Thu, Sep 05, 2024 at 03:59:53PM +0300, Nazir Bilal Yavuz wrote:
> > On Wed, 4 Sept 2024 at 21:43, Noah Misch <n...@leadboat.com> wrote:
> > > https://postgr.es/m/caeudqaozv3wty5tv2t29jcwpydbmkbiwqkzd42s2ogzdixp...@mail.gmail.com
> > > then observed that collect_corrupt_items() was now guaranteed to never 
> > > detect
> > > corruption.  I have pushed revert ddfc556 of the pg_visibility.c changes. 
> > >  For
> > > the next try, could you add that testing we discussed?
> >
> > Do you think that corrupting the visibility map and then seeing if
> > pg_check_visible() and pg_check_frozen() report something is enough?
>
> I think so.  Please check that it would have caught both the blkno bug and the
> above bug.

The test and updated patch files are attached. In that test I
overwrite the visibility map file with its older state. Something like
this:

1- Create the table, then run VACUUM FREEZE on the table.
2- Shutdown the server, create a copy of the vm file, restart the server.
3- Run the DELETE command on some $random_tuples.
4- Shutdown the server, overwrite the vm file with the #2 vm file,
restart the server.
5- pg_check_visible and pg_check_frozen must report $random_tuples as corrupted.

Do you think this test makes sense and enough?

--
Regards,
Nazir Bilal Yavuz
Microsoft
From 1f7b2cfe0d2247eb4768ce79d14982701437d473 Mon Sep 17 00:00:00 2001
From: Nazir Bilal Yavuz <byavu...@gmail.com>
Date: Mon, 9 Sep 2024 15:27:53 +0300
Subject: [PATCH v6 1/2] Add tests that pg_check_[visible|frozen] report
 corrupted tuples

Currently, there are no tests that pg_check_[visible|frozen] need to
report corrupted tuples. Add that kind of tests.

To add these tests, visibility map needs to be corrupted. It is done by
overwriting visibility map by its older state.
---
 contrib/pg_visibility/meson.build         |   1 +
 contrib/pg_visibility/t/002_corrupt_vm.pl | 100 ++++++++++++++++++++++
 2 files changed, 101 insertions(+)
 create mode 100644 contrib/pg_visibility/t/002_corrupt_vm.pl

diff --git a/contrib/pg_visibility/meson.build b/contrib/pg_visibility/meson.build
index f3c1263313a..609fc5f9f3e 100644
--- a/contrib/pg_visibility/meson.build
+++ b/contrib/pg_visibility/meson.build
@@ -36,6 +36,7 @@ tests += {
   'tap': {
     'tests': [
       't/001_concurrent_transaction.pl',
+      't/002_corrupt_vm.pl',
     ],
   },
 }
diff --git a/contrib/pg_visibility/t/002_corrupt_vm.pl b/contrib/pg_visibility/t/002_corrupt_vm.pl
new file mode 100644
index 00000000000..cc6d043b8b7
--- /dev/null
+++ b/contrib/pg_visibility/t/002_corrupt_vm.pl
@@ -0,0 +1,100 @@
+# Copyright (c) 2021-2024, PostgreSQL Global Development Group
+
+# Check that pg_check_visible() and pg_check_frozen() reports
+# correct TIDs when there is a corruption.
+use strict;
+use warnings FATAL => 'all';
+use File::Copy;
+use PostgreSQL::Test::Cluster;
+use PostgreSQL::Test::Utils;
+use Test::More;
+
+my $node = PostgreSQL::Test::Cluster->new('main');
+$node->init;
+$node->start;
+
+my $blck_size = $node->safe_psql("postgres", "SHOW block_size;");
+
+# Create a sample table with at least 10 pages and then run VACUUM. 10 is
+# selected manually as it is big enough to select 5 random tuples from the
+# relation.
+$node->safe_psql(
+	'postgres', qq(
+		CREATE EXTENSION pg_visibility;
+	    CREATE TABLE corruption_test
+            WITH (autovacuum_enabled = false) AS
+           SELECT
+				i,
+				repeat('a', 10) AS data
+			FROM
+    			generate_series(1, $blck_size) i;
+	    VACUUM (FREEZE, DISABLE_PAGE_SKIPPING) corruption_test;
+    )
+);
+
+# VACUUM is run, it is safe to get the number of pages from the relpages.
+my $npages = $node->safe_psql(
+	"postgres",
+	"SELECT relpages FROM pg_class
+		WHERE  relname = 'corruption_test';"
+);
+
+ok($npages >= 10, "There should be at least 10 pages in the table");
+
+my $file = $node->safe_psql("postgres",
+	"SELECT pg_relation_filepath('corruption_test');");
+
+# Copy visibility map.
+$node->stop;
+my $vm_file = $node->data_dir . '/' . $file . '_vm';
+copy("$vm_file", "${vm_file}_temp");
+$node->start;
+
+# Select 5 random tuples from the relation.
+my $tuples = $node->safe_psql(
+	"postgres",
+	"SELECT ctid FROM (
+        SELECT ctid FROM corruption_test
+            ORDER BY random() LIMIT 5)
+        ORDER BY ctid ASC;"
+);
+
+# Do the changes below to use tuples in the query.
+# "\n" -> ","
+# "(" -> "'("
+# ")" -> ")'"
+(my $tuples_query = $tuples) =~ s/\n/,/g;
+$tuples_query =~ s/\(/\'\(/g;
+$tuples_query =~ s/\)/\)\'/g;
+
+$node->safe_psql(
+	"postgres",
+	"DELETE FROM corruption_test
+		WHERE ctid in ($tuples_query);"
+);
+
+# Overwrite visibility map with the old one.
+$node->stop;
+move("${vm_file}_temp", "$vm_file");
+$node->start;
+
+my $result = $node->safe_psql(
+	"postgres",
+	"SELECT DISTINCT t_ctid
+		FROM pg_check_visible('corruption_test')
+		ORDER BY t_ctid ASC;"
+);
+
+is($result, $tuples, 'pg_check_visible must report tuples as corrupted');
+
+$result = $node->safe_psql(
+	"postgres",
+	"SELECT DISTINCT t_ctid
+		FROM pg_check_frozen('corruption_test')
+		ORDER BY t_ctid ASC;"
+);
+
+is($result, $tuples, 'pg_check_frozen must report tuples as corrupted');
+
+$node->stop;
+done_testing();
-- 
2.45.2

From 69d095e9050f7f747cc2586fc9d2900f3bee7380 Mon Sep 17 00:00:00 2001
From: Noah Misch <n...@leadboat.com>
Date: Tue, 3 Sep 2024 10:46:20 -0700
Subject: [PATCH v6 2/2] Optimize pg_visibility with read streams.

We've measured 5% performance improvement, and this arranges to benefit
automatically from future optimizations to the read_stream subsystem.

Nazir Bilal Yavuz

Discussion: https://postgr.es/m/can55fz1_ru3xpmgtwsu67fth2fs_frrromb7x6zs+f44qbe...@mail.gmail.com
---
 contrib/pg_visibility/pg_visibility.c | 113 +++++++++++++++++++++-----
 1 file changed, 92 insertions(+), 21 deletions(-)

diff --git a/contrib/pg_visibility/pg_visibility.c b/contrib/pg_visibility/pg_visibility.c
index 773ba92e454..724122b1bc5 100644
--- a/contrib/pg_visibility/pg_visibility.c
+++ b/contrib/pg_visibility/pg_visibility.c
@@ -21,6 +21,7 @@
 #include "storage/bufmgr.h"
 #include "storage/proc.h"
 #include "storage/procarray.h"
+#include "storage/read_stream.h"
 #include "storage/smgr.h"
 #include "utils/rel.h"
 #include "utils/snapmgr.h"
@@ -41,6 +42,17 @@ typedef struct corrupt_items
 	ItemPointer tids;
 } corrupt_items;
 
+/* for collect_corrupt_items_read_stream_next_block */
+struct collect_corrupt_items_read_stream_private
+{
+	bool		all_frozen;
+	bool		all_visible;
+	BlockNumber current_blocknum;
+	BlockNumber last_exclusive;
+	Relation	rel;
+	Buffer		vmbuffer;
+};
+
 PG_FUNCTION_INFO_V1(pg_visibility_map);
 PG_FUNCTION_INFO_V1(pg_visibility_map_rel);
 PG_FUNCTION_INFO_V1(pg_visibility);
@@ -478,6 +490,8 @@ collect_visibility_data(Oid relid, bool include_pd)
 	BlockNumber blkno;
 	Buffer		vmbuffer = InvalidBuffer;
 	BufferAccessStrategy bstrategy = GetAccessStrategy(BAS_BULKREAD);
+	BlockRangeReadStreamPrivate p;
+	ReadStream *stream = NULL;
 
 	rel = relation_open(relid, AccessShareLock);
 
@@ -489,6 +503,20 @@ collect_visibility_data(Oid relid, bool include_pd)
 	info->next = 0;
 	info->count = nblocks;
 
+	/* Create a stream if reading main fork. */
+	if (include_pd)
+	{
+		p.current_blocknum = 0;
+		p.last_exclusive = nblocks;
+		stream = read_stream_begin_relation(READ_STREAM_FULL,
+											bstrategy,
+											rel,
+											MAIN_FORKNUM,
+											block_range_read_stream_cb,
+											&p,
+											0);
+	}
+
 	for (blkno = 0; blkno < nblocks; ++blkno)
 	{
 		int32		mapbits;
@@ -513,8 +541,7 @@ collect_visibility_data(Oid relid, bool include_pd)
 			Buffer		buffer;
 			Page		page;
 
-			buffer = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL,
-										bstrategy);
+			buffer = read_stream_next_buffer(stream, NULL);
 			LockBuffer(buffer, BUFFER_LOCK_SHARE);
 
 			page = BufferGetPage(buffer);
@@ -525,6 +552,12 @@ collect_visibility_data(Oid relid, bool include_pd)
 		}
 	}
 
+	if (include_pd)
+	{
+		Assert(read_stream_next_buffer(stream, NULL) == InvalidBuffer);
+		read_stream_end(stream);
+	}
+
 	/* Clean up. */
 	if (vmbuffer != InvalidBuffer)
 		ReleaseBuffer(vmbuffer);
@@ -610,6 +643,38 @@ GetStrictOldestNonRemovableTransactionId(Relation rel)
 	}
 }
 
+/*
+ * Callback function to get next block for read stream object used in
+ * collect_corrupt_items() function.
+ */
+static BlockNumber
+collect_corrupt_items_read_stream_next_block(ReadStream *stream,
+											 void *callback_private_data,
+											 void *per_buffer_data)
+{
+	struct collect_corrupt_items_read_stream_private *p = callback_private_data;
+
+	for (; p->current_blocknum < p->last_exclusive; p->current_blocknum++)
+	{
+		bool		check_frozen = false;
+		bool		check_visible = false;
+
+		/* Make sure we are interruptible. */
+		CHECK_FOR_INTERRUPTS();
+
+		if (p->all_frozen && VM_ALL_FROZEN(p->rel, p->current_blocknum, &p->vmbuffer))
+			check_frozen = true;
+		if (p->all_visible && VM_ALL_VISIBLE(p->rel, p->current_blocknum, &p->vmbuffer))
+			check_visible = true;
+		if (!check_visible && !check_frozen)
+			continue;
+
+		return p->current_blocknum++;
+	}
+
+	return InvalidBlockNumber;
+}
+
 /*
  * Returns a list of items whose visibility map information does not match
  * the status of the tuples on the page.
@@ -628,12 +693,13 @@ static corrupt_items *
 collect_corrupt_items(Oid relid, bool all_visible, bool all_frozen)
 {
 	Relation	rel;
-	BlockNumber nblocks;
 	corrupt_items *items;
-	BlockNumber blkno;
 	Buffer		vmbuffer = InvalidBuffer;
 	BufferAccessStrategy bstrategy = GetAccessStrategy(BAS_BULKREAD);
 	TransactionId OldestXmin = InvalidTransactionId;
+	struct collect_corrupt_items_read_stream_private p;
+	ReadStream *stream;
+	Buffer		buffer;
 
 	rel = relation_open(relid, AccessShareLock);
 
@@ -643,8 +709,6 @@ collect_corrupt_items(Oid relid, bool all_visible, bool all_frozen)
 	if (all_visible)
 		OldestXmin = GetStrictOldestNonRemovableTransactionId(rel);
 
-	nblocks = RelationGetNumberOfBlocks(rel);
-
 	/*
 	 * Guess an initial array size. We don't expect many corrupted tuples, so
 	 * start with a small array.  This function uses the "next" field to track
@@ -658,34 +722,38 @@ collect_corrupt_items(Oid relid, bool all_visible, bool all_frozen)
 	items->count = 64;
 	items->tids = palloc(items->count * sizeof(ItemPointerData));
 
+	p.current_blocknum = 0;
+	p.last_exclusive = RelationGetNumberOfBlocks(rel);
+	p.rel = rel;
+	p.vmbuffer = InvalidBuffer;
+	p.all_frozen = all_frozen;
+	p.all_visible = all_visible;
+	stream = read_stream_begin_relation(READ_STREAM_FULL,
+										bstrategy,
+										rel,
+										MAIN_FORKNUM,
+										collect_corrupt_items_read_stream_next_block,
+										&p,
+										0);
+
 	/* Loop over every block in the relation. */
-	for (blkno = 0; blkno < nblocks; ++blkno)
+	while ((buffer = read_stream_next_buffer(stream, NULL)) != InvalidBuffer)
 	{
-		bool		check_frozen = false;
-		bool		check_visible = false;
-		Buffer		buffer;
+		bool		check_frozen = all_frozen;
+		bool		check_visible = all_visible;
 		Page		page;
 		OffsetNumber offnum,
 					maxoff;
+		BlockNumber blkno;
 
 		/* Make sure we are interruptible. */
 		CHECK_FOR_INTERRUPTS();
 
-		/* Use the visibility map to decide whether to check this page. */
-		if (all_frozen && VM_ALL_FROZEN(rel, blkno, &vmbuffer))
-			check_frozen = true;
-		if (all_visible && VM_ALL_VISIBLE(rel, blkno, &vmbuffer))
-			check_visible = true;
-		if (!check_visible && !check_frozen)
-			continue;
-
-		/* Read and lock the page. */
-		buffer = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL,
-									bstrategy);
 		LockBuffer(buffer, BUFFER_LOCK_SHARE);
 
 		page = BufferGetPage(buffer);
 		maxoff = PageGetMaxOffsetNumber(page);
+		blkno = BufferGetBlockNumber(buffer);
 
 		/*
 		 * The visibility map bits might have changed while we were acquiring
@@ -778,10 +846,13 @@ collect_corrupt_items(Oid relid, bool all_visible, bool all_frozen)
 
 		UnlockReleaseBuffer(buffer);
 	}
+	read_stream_end(stream);
 
 	/* Clean up. */
 	if (vmbuffer != InvalidBuffer)
 		ReleaseBuffer(vmbuffer);
+	if (p.vmbuffer != InvalidBuffer)
+		ReleaseBuffer(p.vmbuffer);
 	relation_close(rel, AccessShareLock);
 
 	/*
-- 
2.45.2

Reply via email to