Hi,

please find attached version 2 of the patch.

Am Donnerstag, den 26.07.2018, 13:59 +0200 schrieb Michael Banck:
> I've now forward-ported this change to pg_verify_checksums, in order to
> make this application useful for online clusters, see attached patch.
> 
> I've tested this in a tight loop (while true; do pg_verify_checksums -D
> data1 -d > /dev/null || /bin/true; done)[2] while doing "while true; do
> createdb pgbench; pgbench -i -s 10 pgbench > /dev/null; dropdb pgbench;
> done", which I already used to develop the original code in the fork and
> which brought up a few bugs.
> 
> I got one checksums verification failure this way, all others were
> caught by the recheck (I've introduced a 500ms delay for the first ten
> failures) like this:
> 
> > pg_verify_checksums: checksum verification failed on first attempt in
> > file "data1/base/16837/16850", block 7770: calculated checksum 785 but
> > expected 5063
> > pg_verify_checksums: block 7770 in file "data1/base/16837/16850"
> > verified ok on recheck

I have now changed this from the pg_sleep() to a check against the
checkpoint LSN as discussed upthread.

> However, I am also seeing sporadic (maybe 0.5 times per pgbench run)
> failures like this:
> 
> > pg_verify_checksums: short read of block 2644 in file
> > "data1/base/16637/16650", got only 4096 bytes
> 
> This is not strictly a verification failure, should we do anything about
> this? In my fork, I am also rechecking on this[3] (and I am happy to
> extend the patch that way), but that makes the code and the patch more
> complicated and I wanted to check the general opinion on this case
> first.

I have added a retry for this as well now, without a pg_sleep() as well.
This catches around 80% of the half-reads, but a few slip through. At
that point we bail out with exit(1), and the user can try again, which I
think is fine? 

Alternatively, we could just skip to the next file then and don't make
it count as a checksum failure.

Other changes from V1:

1. Rebased to 422952ee
2. Ignore ENOENT failure during file open and skip to next file
3. Mention total number of skipped blocks during the summary at the end
of the run
4. Skip files starting with pg_internal.init*


Michael

-- 
Michael Banck
Projektleiter / Senior Berater
Tel.: +49 2166 9901-171
Fax:  +49 2166 9901-100
Email: michael.ba...@credativ.de

credativ GmbH, HRB Mönchengladbach 12080
USt-ID-Nummer: DE204566209
Trompeterallee 108, 41189 Mönchengladbach
Geschäftsführung: Dr. Michael Meskes, Jörg Folz, Sascha Heuer

Unser Umgang mit personenbezogenen Daten unterliegt
folgenden Bestimmungen: https://www.credativ.de/datenschutz
diff --git a/src/bin/pg_verify_checksums/pg_verify_checksums.c b/src/bin/pg_verify_checksums/pg_verify_checksums.c
index 1bc020ab6c..c3cc7b90b5 100644
--- a/src/bin/pg_verify_checksums/pg_verify_checksums.c
+++ b/src/bin/pg_verify_checksums/pg_verify_checksums.c
@@ -1,7 +1,7 @@
 /*
  * pg_verify_checksums
  *
- * Verifies page level checksums in an offline cluster
+ * Verifies page level checksums in a cluster
  *
  *	Copyright (c) 2010-2018, PostgreSQL Global Development Group
  *
@@ -25,7 +25,9 @@
 static int64 files = 0;
 static int64 blocks = 0;
 static int64 badblocks = 0;
+static int64 skippedblocks = 0;
 static ControlFileData *ControlFile;
+static XLogRecPtr checkpointLSN;
 
 static char *only_relfilenode = NULL;
 static bool verbose = false;
@@ -54,6 +56,7 @@ static const char *const skip[] = {
 	"pg_filenode.map",
 	"pg_internal.init",
 	"PG_VERSION",
+	"pgsql_tmp",
 	NULL,
 };
 
@@ -67,8 +70,14 @@ skipfile(const char *fn)
 		return true;
 
 	for (f = skip; *f; f++)
+	{
 		if (strcmp(*f, fn) == 0)
 			return true;
+		if (strcmp(*f, "pg_internal.init") == 0)
+			if (strncmp(*f, fn, strlen(*f)) == 0)
+				return true;
+	}
+
 	return false;
 }
 
@@ -79,10 +88,17 @@ scan_file(const char *fn, BlockNumber segmentno)
 	PageHeader	header = (PageHeader) buf.data;
 	int			f;
 	BlockNumber blockno;
+	bool		block_retry = false;
 
 	f = open(fn, O_RDONLY | PG_BINARY, 0);
 	if (f < 0)
 	{
+		if (errno == ENOENT)
+		{
+			/* File was removed in the meantime */
+			return;
+		}
+
 		fprintf(stderr, _("%s: could not open file \"%s\": %s\n"),
 				progname, fn, strerror(errno));
 		exit(1);
@@ -99,24 +115,99 @@ scan_file(const char *fn, BlockNumber segmentno)
 			break;
 		if (r != BLCKSZ)
 		{
-			fprintf(stderr, _("%s: could not read block %u in file \"%s\": read %d of %d\n"),
-					progname, blockno, fn, r, BLCKSZ);
-			exit(1);
+			if (block_retry)
+			{
+				/* We already tried once to reread the block, bail out */
+				fprintf(stderr, _("%s: could not read block %u in file \"%s\": read %d of %d\n"),
+						progname, blockno, fn, r, BLCKSZ);
+				exit(1);
+			}
+
+			/*
+			 * Retry the block. It's possible that we read the block while it
+			 * was extended or shrinked, so it it ends up looking torn to us.
+			 */
+
+			/*
+			 * Seek back by the amount of bytes we read to the beginning of
+			 * the failed block.
+			 */
+			if (lseek(f, -r, SEEK_CUR) == -1)
+			{
+				fprintf(stderr, _("%s: could not lseek in file \"%s\": %m\n"),
+						progname, fn);
+				exit(1);
+			}
+
+			/* Set flag so we know a retry was attempted */
+			block_retry = true;
+
+			/* Reset loop to validate the block again */
+			blockno--;
+
+			continue;
 		}
-		blocks++;
 
 		/* New pages have no checksum yet */
 		if (PageIsNew(header))
+		{
+			skippedblocks++;
 			continue;
+		}
+
+		blocks++;
 
 		csum = pg_checksum_page(buf.data, blockno + segmentno * RELSEG_SIZE);
 		if (csum != header->pd_checksum)
 		{
+			/*
+			 * Retry the block on the first failure.  It's
+			 * possible that we read the first 4K page of
+			 * the block just before postgres updated the
+			 * entire block so it ends up looking torn to
+			 * us.  We only need to retry once because the
+			 * LSN should be updated to something we can
+			 * ignore on the next pass.  If the error
+			 * happens again then it is a true validation
+			 * failure.
+			 */
+			if (block_retry == false)
+			{
+				/* Seek to the beginning of the failed block */
+				if (lseek(f, -BLCKSZ, SEEK_CUR) == -1)
+				{
+					fprintf(stderr, _("%s: could not lseek in file \"%s\": %m\n"),
+							progname, fn);
+					exit(1);
+				}
+
+				/* Set flag so we know a retry was attempted */
+				block_retry = true;
+
+				/* Reset loop to validate the block again */
+				blockno--;
+
+				continue;
+			}
+
+			/* The checksum verification failed on retry as well.
+			 * Check if the page has been modified since the
+			 * checkpoint and skip it in this case.
+			 */
+			if (PageGetLSN(buf.data) > checkpointLSN)
+			{
+				block_retry = false;
+				blocks--;
+				skippedblocks++;
+				continue;
+			}
+
 			if (ControlFile->data_checksum_version == PG_DATA_CHECKSUM_VERSION)
 				fprintf(stderr, _("%s: checksum verification failed in file \"%s\", block %u: calculated checksum %X but block contains %X\n"),
 						progname, fn, blockno, csum, header->pd_checksum);
 			badblocks++;
 		}
+		block_retry = false;
 	}
 
 	if (verbose)
@@ -152,6 +243,12 @@ scan_directory(const char *basedir, const char *subdir)
 		snprintf(fn, sizeof(fn), "%s/%s", path, de->d_name);
 		if (lstat(fn, &st) < 0)
 		{
+			if (errno == ENOENT)
+			{
+				/* File was removed in the meantime */
+				continue;
+			}
+
 			fprintf(stderr, _("%s: could not stat file \"%s\": %s\n"),
 					progname, fn, strerror(errno));
 			exit(1);
@@ -285,7 +382,7 @@ main(int argc, char *argv[])
 		exit(1);
 	}
 
-	/* Check if cluster is running */
+	/* Check if checksums are enabled */
 	ControlFile = get_controlfile(DataDir, progname, &crc_ok);
 	if (!crc_ok)
 	{
@@ -293,19 +390,15 @@ main(int argc, char *argv[])
 		exit(1);
 	}
 
-	if (ControlFile->state != DB_SHUTDOWNED &&
-		ControlFile->state != DB_SHUTDOWNED_IN_RECOVERY)
-	{
-		fprintf(stderr, _("%s: cluster must be shut down to verify checksums\n"), progname);
-		exit(1);
-	}
-
 	if (ControlFile->data_checksum_version == 0)
 	{
 		fprintf(stderr, _("%s: data checksums are not enabled in cluster\n"), progname);
 		exit(1);
 	}
 
+	/* Get checkpoint LSN */
+	checkpointLSN = ControlFile->checkPoint;
+
 	/* Scan all files */
 	scan_directory(DataDir, "global");
 	scan_directory(DataDir, "base");
@@ -315,6 +408,8 @@ main(int argc, char *argv[])
 	printf(_("Data checksum version: %d\n"), ControlFile->data_checksum_version);
 	printf(_("Files scanned:  %s\n"), psprintf(INT64_FORMAT, files));
 	printf(_("Blocks scanned: %s\n"), psprintf(INT64_FORMAT, blocks));
+	if (skippedblocks > 0)
+		printf(_("Blocks skipped: %s\n"), psprintf(INT64_FORMAT, skippedblocks));
 	printf(_("Bad checksums:  %s\n"), psprintf(INT64_FORMAT, badblocks));
 
 	if (badblocks > 0)

Reply via email to