On Wed, Oct 21, 2020 at 07:10:34PM +0900, Michael Paquier wrote:
> My guess is that we should be able to make use of that for base
> backups as well, but I also think that I'd rather let v13 go with more
> retries without depending on a new API layer, removing of the LSN
> check altogether.  Thinking of it, that's actually roughly what I
> posted here, but without the PageGetLSN() bit in the refactored code.
> So I see a pretty good argument to address the stable branches with
> that, and study for the future a better API to govern them all:
> https://www.postgresql.org/message-id/20201020062432.ga30...@paquier.xyz

So, I was sleeping on this one, and I could not find a reason why we
should not address both the zero case and the random data case at the
same time, as mentioned here:
https://www.postgresql.org/message-id/20201022012519.gf1...@paquier.xyz

We cannot trust the fields of the page header, because these may have
been messed up by some random corruption; what really matters is
whether the checksum matches, and we can rely on just that.  The
all-zero case is different, because such pages do not have a checksum
set yet, so I would finish with something like the attached to make
the detection more robust.  This does not make the detection perfect,
as there is no locking guarantee (we really need that, but v13 has
already been released), but with a sufficient number of retries this
makes things much more reliable than what is there now.
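
To be explicit about the rule this applies to each page read from
disk, here is a minimal sketch (illustrative only, not the patched
code itself, and assuming data checksums are enabled); the real
PageIsVerifiedExtended() in bufpage.c also keeps the existing header
sanity checks:

#include "postgres.h"
#include "storage/bufpage.h"
#include "storage/checksum.h"

static bool
page_check_sketch(Page page, BlockNumber blkno)
{
	PageHeader	p = (PageHeader) page;
	size_t	   *pagebytes = (size_t *) page;
	bool		all_zeroes = true;

	/*
	 * A new page is full of zeroes and has no checksum set yet, so it
	 * is considered valid.  Scanning by native words is why BLCKSZ has
	 * to be a multiple of sizeof(size_t).
	 */
	for (int i = 0; i < (int) (BLCKSZ / sizeof(size_t)); i++)
	{
		if (pagebytes[i] != 0)
		{
			all_zeroes = false;
			break;
		}
	}
	if (all_zeroes)
		return true;

	/*
	 * Do not trust any individual header field, as random corruption
	 * may have clobbered it; a checksum mismatch is what matters.
	 */
	return pg_checksum_page((char *) page, blkno) == p->pd_checksum;
}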

Are there any comments?  Anybody?
--
Michael
diff --git a/src/include/storage/bufpage.h b/src/include/storage/bufpage.h
index 51b8f994ac..1a28ee1130 100644
--- a/src/include/storage/bufpage.h
+++ b/src/include/storage/bufpage.h
@@ -404,26 +404,33 @@ do { \
  *		extern declarations
  * ----------------------------------------------------------------
  */
+
+/* flags for PageAddItemExtended() */
 #define PAI_OVERWRITE			(1 << 0)
 #define PAI_IS_HEAP				(1 << 1)
 
+/* flags for PageIsVerifiedExtended() */
+#define PIV_LOG_WARNING			(1 << 0)
+#define PIV_REPORT_STAT			(1 << 1)
+
 #define PageAddItem(page, item, size, offsetNumber, overwrite, is_heap) \
 	PageAddItemExtended(page, item, size, offsetNumber, \
 						((overwrite) ? PAI_OVERWRITE : 0) | \
 						((is_heap) ? PAI_IS_HEAP : 0))
 
 /*
- * Check that BLCKSZ is a multiple of sizeof(size_t).  In PageIsVerified(),
- * it is much faster to check if a page is full of zeroes using the native
- * word size.  Note that this assertion is kept within a header to make
- * sure that StaticAssertDecl() works across various combinations of
- * platforms and compilers.
+ * Check that BLCKSZ is a multiple of sizeof(size_t).  In
+ * PageIsVerifiedExtended(), it is much faster to check if a page is
+ * full of zeroes using the native word size.  Note that this assertion
+ * is kept within a header to make sure that StaticAssertDecl() works
+ * across various combinations of platforms and compilers.
  */
 StaticAssertDecl(BLCKSZ == ((BLCKSZ / sizeof(size_t)) * sizeof(size_t)),
 				 "BLCKSZ has to be a multiple of sizeof(size_t)");
 
 extern void PageInit(Page page, Size pageSize, Size specialSize);
 extern bool PageIsVerified(Page page, BlockNumber blkno);
+extern bool PageIsVerifiedExtended(Page page, BlockNumber blkno, int flags);
 extern OffsetNumber PageAddItemExtended(Page page, Item item, Size size,
 										OffsetNumber offsetNumber, int flags);
 extern Page PageGetTempPage(Page page);
diff --git a/src/backend/catalog/storage.c b/src/backend/catalog/storage.c
index dbbd3aa31f..d538f25726 100644
--- a/src/backend/catalog/storage.c
+++ b/src/backend/catalog/storage.c
@@ -443,7 +443,8 @@ RelationCopyStorage(SMgrRelation src, SMgrRelation dst,
 
 		smgrread(src, forkNum, blkno, buf.data);
 
-		if (!PageIsVerified(page, blkno))
+		if (!PageIsVerifiedExtended(page, blkno,
+									PIV_LOG_WARNING | PIV_REPORT_STAT))
 			ereport(ERROR,
 					(errcode(ERRCODE_DATA_CORRUPTED),
 					 errmsg("invalid page in block %u of relation %s",
diff --git a/src/backend/replication/basebackup.c b/src/backend/replication/basebackup.c
index b89df01fa7..6f717c8956 100644
--- a/src/backend/replication/basebackup.c
+++ b/src/backend/replication/basebackup.c
@@ -1569,21 +1569,24 @@ sendFile(const char *readfilename, const char *tarfilename,
 {
 	int			fd;
 	BlockNumber blkno = 0;
-	bool		block_retry = false;
+	int			block_attempts = 0;
 	char		buf[TAR_SEND_SIZE];
-	uint16		checksum;
 	int			checksum_failures = 0;
 	off_t		cnt;
 	int			i;
 	pgoff_t		len = 0;
 	char	   *page;
 	size_t		pad;
-	PageHeader	phdr;
 	int			segmentno = 0;
 	char	   *segmentpath;
 	bool		verify_checksum = false;
 	pg_checksum_context checksum_ctx;
 
+	/* Maximum number of checksum failures reported for this file */
+#define CHECKSUM_REPORT_THRESHOLD		5
+	/* Maximum number of retries done for a single page */
+#define PAGE_RETRY_THRESHOLD			5
+
 	pg_checksum_init(&checksum_ctx, manifest->checksum_type);
 
 	fd = OpenTransientFile(readfilename, O_RDONLY | PG_BINARY);
@@ -1661,9 +1664,7 @@ sendFile(const char *readfilename, const char *tarfilename,
 		if (verify_checksum && (cnt % BLCKSZ != 0))
 		{
 			ereport(WARNING,
-					(errmsg("could not verify checksum in file \"%s\", block "
-							"%d: read buffer size %d and page size %d "
-							"differ",
+					(errmsg("could not verify checksum in file \"%s\", block %u: read buffer size %d and page size %d differ",
 							readfilename, blkno, (int) cnt, BLCKSZ)));
 			verify_checksum = false;
 		}
@@ -1675,78 +1676,75 @@ sendFile(const char *readfilename, const char *tarfilename,
 				page = buf + BLCKSZ * i;
 
 				/*
-				 * Only check pages which have not been modified since the
-				 * start of the base backup. Otherwise, they might have been
-				 * written only halfway and the checksum would not be valid.
-				 * However, replaying WAL would reinstate the correct page in
-				 * this case. We also skip completely new pages, since they
-				 * don't have a checksum yet.
+				 * Check on-disk pages with the same set of verification
+				 * conditions used before loading pages into shared buffers.
+				 * Note that PageIsVerifiedExtended() considers pages filled
+				 * only with zeros as valid, since they don't have a checksum
+				 * yet.  Failures are not reported to pgstat yet, as these are
+				 * reported in a single batch once a file is completed.  A
+				 * page could be written halfway through while this check
+				 * runs, causing a false positive; if that happens, the page
+				 * is retried up to PAGE_RETRY_THRESHOLD times, and a failure
+				 * is reported only if the last attempt also fails.
 				 */
-				if (!PageIsNew(page) && PageGetLSN(page) < startptr)
+				if (PageIsVerifiedExtended(page, blkno, 0))
 				{
-					checksum = pg_checksum_page((char *) page, blkno + segmentno * RELSEG_SIZE);
-					phdr = (PageHeader) page;
-					if (phdr->pd_checksum != checksum)
+					/* success, move on to the next block */
+					blkno++;
+					block_attempts = 0;
+					continue;
+				}
+
+				/* Verification of the page failed, retry a few times */
+				if (block_attempts < PAGE_RETRY_THRESHOLD)
+				{
+					int			reread_cnt;
+
+					/*
+					 * Wait for 100 ms to let any concurrent page write that
+					 * may have caused this failure finish before rereading.
+					 */
+					pg_usleep(100000L);
+
+					/* Reread the failed block */
+					reread_cnt =
+						basebackup_read_file(fd, buf + BLCKSZ * i,
+											 BLCKSZ, len + BLCKSZ * i,
+											 readfilename,
+											 false);
+					if (reread_cnt == 0)
 					{
 						/*
-						 * Retry the block on the first failure.  It's
-						 * possible that we read the first 4K page of the
-						 * block just before postgres updated the entire block
-						 * so it ends up looking torn to us.  We only need to
-						 * retry once because the LSN should be updated to
-						 * something we can ignore on the next pass.  If the
-						 * error happens again then it is a true validation
-						 * failure.
+						 * If we hit end-of-file, a concurrent truncation must
+						 * have occurred, so break out of this loop just as if
+						 * the initial fread() returned 0. We'll drop through
+						 * to the same code that handles that case. (We must
+						 * fix up cnt first, though.)
 						 */
-						if (block_retry == false)
-						{
-							int			reread_cnt;
-
-							/* Reread the failed block */
-							reread_cnt =
-								basebackup_read_file(fd, buf + BLCKSZ * i,
-													 BLCKSZ, len + BLCKSZ * i,
-													 readfilename,
-													 false);
-							if (reread_cnt == 0)
-							{
-								/*
-								 * If we hit end-of-file, a concurrent
-								 * truncation must have occurred, so break out
-								 * of this loop just as if the initial fread()
-								 * returned 0. We'll drop through to the same
-								 * code that handles that case. (We must fix
-								 * up cnt first, though.)
-								 */
-								cnt = BLCKSZ * i;
-								break;
-							}
-
-							/* Set flag so we know a retry was attempted */
-							block_retry = true;
-
-							/* Reset loop to validate the block again */
-							i--;
-							continue;
-						}
-
-						checksum_failures++;
-
-						if (checksum_failures <= 5)
-							ereport(WARNING,
-									(errmsg("checksum verification failed in "
-											"file \"%s\", block %d: calculated "
-											"%X but expected %X",
-											readfilename, blkno, checksum,
-											phdr->pd_checksum)));
-						if (checksum_failures == 5)
-							ereport(WARNING,
-									(errmsg("further checksum verification "
-											"failures in file \"%s\" will not "
-											"be reported", readfilename)));
+						cnt = BLCKSZ * i;
+						break;
 					}
+
+					/* Count the retry that was just attempted */
+					block_attempts++;
+
+					/* Reset loop to validate the block again */
+					i--;
+					continue;
 				}
-				block_retry = false;
+
+				checksum_failures++;
+				if (checksum_failures <= CHECKSUM_REPORT_THRESHOLD)
+					ereport(WARNING,
+							(errmsg("checksum verification failed in block %u of file \"%s\"",
+									blkno, readfilename)));
+				if (checksum_failures == CHECKSUM_REPORT_THRESHOLD)
+					ereport(WARNING,
+							(errmsg("further checksum verification failures in file \"%s\" will not be reported",
+									readfilename)));
+
+				/* move to next block */
+				block_attempts = 0;
 				blkno++;
 			}
 		}
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index e549fa1d30..3eee86afe5 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -625,7 +625,8 @@ ReadBuffer(Relation reln, BlockNumber blockNum)
  *
  * In RBM_NORMAL mode, the page is read from disk, and the page header is
  * validated.  An error is thrown if the page header is not valid.  (But
- * note that an all-zero page is considered "valid"; see PageIsVerified().)
+ * note that an all-zero page is considered "valid"; see
+ * PageIsVerifiedExtended().)
  *
  * RBM_ZERO_ON_ERROR is like the normal mode, but if the page header is not
  * valid, the page is zeroed instead of throwing an error. This is intended
@@ -917,7 +918,8 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
 			}
 
 			/* check for garbage data */
-			if (!PageIsVerified((Page) bufBlock, blockNum))
+			if (!PageIsVerifiedExtended((Page) bufBlock, blockNum,
+										PIV_LOG_WARNING | PIV_REPORT_STAT))
 			{
 				if (mode == RBM_ZERO_ON_ERROR || zero_damaged_pages)
 				{
diff --git a/src/backend/storage/page/bufpage.c b/src/backend/storage/page/bufpage.c
index 4bc2bf955d..3a27dff04d 100644
--- a/src/backend/storage/page/bufpage.c
+++ b/src/backend/storage/page/bufpage.c
@@ -62,6 +62,18 @@ PageInit(Page page, Size pageSize, Size specialSize)
 
 /*
  * PageIsVerified
+ *		Utility wrapper for PageIsVerifiedExtended().
+ */
+bool
+PageIsVerified(Page page, BlockNumber blkno)
+{
+	return PageIsVerifiedExtended(page, blkno,
+								  PIV_LOG_WARNING | PIV_REPORT_STAT);
+}
+
+
+/*
+ * PageIsVerifiedExtended
  *		Check that the page header and checksum (if any) appear valid.
  *
  * This is called when a page has just been read in from disk.  The idea is
@@ -77,9 +89,15 @@ PageInit(Page page, Size pageSize, Size specialSize)
  * allow zeroed pages here, and are careful that the page access macros
  * treat such a page as empty and without free space.  Eventually, VACUUM
  * will clean up such a page and make it usable.
+ *
+ * If flag PIV_LOG_WARNING is set, a WARNING is logged in the event of
+ * a checksum failure.
+ *
+ * If flag PIV_REPORT_STAT is set, a checksum failure is reported directly
+ * to pgstat.
  */
 bool
-PageIsVerified(Page page, BlockNumber blkno)
+PageIsVerifiedExtended(Page page, BlockNumber blkno, int flags)
 {
 	PageHeader	p = (PageHeader) page;
 	size_t	   *pagebytes;
@@ -140,12 +158,14 @@ PageIsVerified(Page page, BlockNumber blkno)
 	 */
 	if (checksum_failure)
 	{
-		ereport(WARNING,
-				(errcode(ERRCODE_DATA_CORRUPTED),
-				 errmsg("page verification failed, calculated checksum %u but expected %u",
-						checksum, p->pd_checksum)));
+		if ((flags & PIV_LOG_WARNING) != 0)
+			ereport(WARNING,
+					(errcode(ERRCODE_DATA_CORRUPTED),
+					 errmsg("page verification failed, calculated checksum %u but expected %u",
+							checksum, p->pd_checksum)));
 
-		pgstat_report_checksum_failure();
+		if ((flags & PIV_REPORT_STAT) != 0)
+			pgstat_report_checksum_failure();
 
 		if (header_sane && ignore_checksum_failure)
 			return true;
diff --git a/src/bin/pg_basebackup/t/010_pg_basebackup.pl b/src/bin/pg_basebackup/t/010_pg_basebackup.pl
index f674a7c94e..a46a9ece42 100644
--- a/src/bin/pg_basebackup/t/010_pg_basebackup.pl
+++ b/src/bin/pg_basebackup/t/010_pg_basebackup.pl
@@ -6,7 +6,7 @@ use File::Basename qw(basename dirname);
 use File::Path qw(rmtree);
 use PostgresNode;
 use TestLib;
-use Test::More tests => 109;
+use Test::More tests => 112;
 
 program_help_ok('pg_basebackup');
 program_version_ok('pg_basebackup');
@@ -528,6 +528,24 @@ $node->command_checks_all(
 	'pg_basebackup reports checksum mismatch');
 rmtree("$tempdir/backup_corrupt");
 
+# Create a new type of corruption and zero out one page header
+# completely.
+system_or_bail 'pg_ctl', '-D', $pgdata, 'stop';
+open $file, '+<', "$pgdata/$file_corrupt1";
+seek($file, 0, 0);
+my $zero_data = "\0" x $pageheader_size;
+syswrite($file, $zero_data);
+close $file;
+system_or_bail 'pg_ctl', '-D', $pgdata, 'start';
+
+$node->command_checks_all(
+	[ 'pg_basebackup', '-D', "$tempdir/backup_corrupt1a" ],
+	1,
+	[qr{^$}],
+	[qr/^WARNING.*checksum verification failed/s],
+	"pg_basebackup reports checksum mismatch for zeroed page headers");
+rmtree("$tempdir/backup_corrupt1a");
+
 # induce further corruption in 5 more blocks
 system_or_bail 'pg_ctl', '-D', $pgdata, 'stop';
 open $file, '+<', "$pgdata/$file_corrupt1";

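In condensed form, the per-page loop in sendFile() behaves like the
sketch below after this patch (eliding the EOF handling for concurrent
truncation; variable names as in basebackup.c):

    if (PageIsVerifiedExtended(page, blkno, 0))
    {
        /* Success, move on to the next block */
        blkno++;
        block_attempts = 0;
        continue;
    }
    if (block_attempts < PAGE_RETRY_THRESHOLD)
    {
        /* Give any concurrent page write some room to complete */
        pg_usleep(100000L);
        /* ... reread the block ..., then loop to validate it again */
        block_attempts++;
        i--;
        continue;
    }
    /* Give up on this page and report the failure */
    checksum_failures++;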