Hi,
Am Mittwoch, den 28.02.2018, 19:08 +0100 schrieb Michael Banck:
> some installations have data which is only rarerly read, and if they are
> so large that dumps are not routinely taken, data corruption would only
> be detected with some large delay even with checksums enabled.
>
> The attached small patch verifies checksums (in case they are enabled)
> during a basebackup. The rationale is that we are reading every block in
> this case anyway, so this is a good opportunity to check them as well.
> Other and complementary ways of checking the checksums are possible of
> course, like the offline checking tool that Magnus just submitted.
I've attached a second version of this patch. Changes are:
1. I've included some code from Magnus' patch, notably the way the
segment numbers are determined and the skipfile() function, along with
the array of files to skip.
2. I am now checking the LSN in the pageheader and compare it against
the LSN of the basebackup start, so that no checksums are verified for
pages changed after basebackup start. I am not sure whether this
addresses all concerns by Stephen and David, as I am not re-reading the
page on a checksum mismatch as they are doing in pgbackrest.
3. pg_basebackup now exits with 1 if a checksum mismatch occured, but it
keeps the data around.
4. I added an Assert() that the TAR_SEND_SIZE is a multiple of BLCKSZ.
AFAICT we support block sizes of 1, 2, 4, 8, 16 and 32 KB, while
TAR_SEND_SIZE is set to 32 KB, so this should be fine unless somebody
mucks around with BLCKSZ manually, in which case the Assert should fire.
I compiled --with-blocksize=32 and checked that this still works as
intended.
5. I also check that the buffer we read is a multiple of BLCKSZ. If that
is not the case I emit a WARNING that the checksum cannot be checked and
pg_basebackup will exit with 1 as well.
This is how it looks like right now from pg_basebackup's POV:
postgres@fock:~$ initdb -k --pgdata=data1 1> /dev/null
WARNING: enabling "trust" authentication for local connections
You can change this by editing pg_hba.conf or using the option -A, or
--auth-local and --auth-host, the next time you run initdb.
postgres@fock:~$ pg_ctl --pgdata=data1 --log=pg1.log start
waiting for server to start.... done
server started
postgres@fock:~$ psql -h /tmp -c "SELECT pg_relation_filepath('pg_class')"
pg_relation_filepath
----------------------
base/12367/1259
(1 row)
postgres@fock:~$ echo -n "Bang!" | dd conv=notrunc oflag=seek_bytes seek=4000
bs=9 count=1 of=data1/base/12367/1259
0+1 records in
0+1 records out
5 bytes copied, 3.7487e-05 s, 133 kB/s
postgres@fock:~$ pg_basebackup --pgdata=data2 -h /tmp
WARNING: checksum mismatch in file "./base/12367/1259", segment 0, block 0:
expected CC05, found CA4D
pg_basebackup: checksum error occured
postgres@fock:~$ echo $?
1
postgres@fock:~$ ls data2
backup_label pg_dynshmem pg_multixact pg_snapshots pg_tblspc pg_xact
base pg_hba.conf pg_notify pg_stat pg_twophase
postgresql.auto.conf
global pg_ident.conf pg_replslot pg_stat_tmp PG_VERSION
postgresql.conf
pg_commit_ts pg_logical pg_serial pg_subtrans pg_wal
postgres@fock:~$
Possibly open questions:
1. I have not so far changed the replication protocol to make verifying
checksums optional. I can go about that next if the consensus is that we
need such an option (and cannot just check it everytime)?
2. The isolation tester test uses dd (similar to the above), is that
allowed, or do I have to come up with some internal Perl thing that also
works on Windows?
3. I am using basename() to get the filename, I haven't seen that used a
lot in the codebase (nor did I find an obvious internal implementation),
is that fine?
Cheers,
Michael
--
Michael Banck
Projektleiter / Senior Berater
Tel.: +49 2166 9901-171
Fax: +49 2166 9901-100
Email: michael.ba...@credativ.de
credativ GmbH, HRB Mönchengladbach 12080
USt-ID-Nummer: DE204566209
Trompeterallee 108, 41189 Mönchengladbach
Geschäftsführung: Dr. Michael Meskes, Jörg Folz, Sascha Heuer
diff --git a/src/backend/replication/basebackup.c b/src/backend/replication/basebackup.c
index 185f32a5f9..8e84c7c38e 100644
--- a/src/backend/replication/basebackup.c
+++ b/src/backend/replication/basebackup.c
@@ -16,6 +16,7 @@
#include <unistd.h>
#include <time.h>
+#include "access/xlog.h"
#include "access/xlog_internal.h" /* for pg_start/stop_backup */
#include "catalog/catalog.h"
#include "catalog/pg_type.h"
@@ -30,6 +31,8 @@
#include "replication/basebackup.h"
#include "replication/walsender.h"
#include "replication/walsender_private.h"
+#include "storage/bufpage.h"
+#include "storage/checksum.h"
#include "storage/dsm_impl.h"
#include "storage/fd.h"
#include "storage/ipc.h"
@@ -75,6 +78,9 @@ static bool backup_started_in_recovery = false;
/* Relative path of temporary statistics directory */
static char *statrelpath = NULL;
+/* The starting XLOG position of the base backup */
+static XLogRecPtr startptr;
+
/*
* Size of each block sent into the tar stream for larger files.
*/
@@ -97,6 +103,9 @@ static TimeOffset elapsed_min_unit;
/* The last check of the transfer rate. */
static TimestampTz throttled_last;
+/* Whether a checksum failure occured. */
+static bool checksum_failure;
+
/*
* The contents of these directories are removed or recreated during server
* start so they are not included in backups. The directories themselves are
@@ -189,7 +198,6 @@ base_backup_cleanup(int code, Datum arg)
static void
perform_base_backup(basebackup_options *opt)
{
- XLogRecPtr startptr;
TimeLineID starttli;
XLogRecPtr endptr;
TimeLineID endtli;
@@ -205,6 +213,8 @@ perform_base_backup(basebackup_options *opt)
labelfile = makeStringInfo();
tblspc_map_file = makeStringInfo();
+ checksum_failure = false;
+
startptr = do_pg_start_backup(opt->label, opt->fastcheckpoint, &starttli,
labelfile, &tablespaces,
tblspc_map_file,
@@ -563,6 +573,12 @@ perform_base_backup(basebackup_options *opt)
pq_putemptymessage('c');
}
SendXlogRecPtrResult(endptr, endtli);
+
+ if (checksum_failure == true)
+ ereport(ERROR,
+ (errcode(ERRCODE_DATA_CORRUPTED),
+ errmsg("checksum mismatch during basebackup")));
+
}
/*
@@ -1185,6 +1201,28 @@ sendDir(const char *path, int basepathlen, bool sizeonly, List *tablespaces,
* Copied from pg_dump, but modified to work with libpq for sending
*/
+static const char *skip[] = {
+ "pg_control",
+ "pg_filenode.map",
+ "pg_internal.init",
+ "PG_VERSION",
+ NULL,
+};
+
+static bool
+skipfile(char *fn)
+{
+ const char **f;
+
+ if (strcmp(fn, ".") == 0 ||
+ strcmp(fn, "..") == 0)
+ return true;
+
+ for (f = skip; *f; f++)
+ if (strcmp(*f, fn) == 0)
+ return true;
+ return false;
+}
/*
* Given the member, write the TAR header & send the file.
@@ -1199,10 +1237,19 @@ sendFile(const char *readfilename, const char *tarfilename, struct stat *statbuf
bool missing_ok)
{
FILE *fp;
+ BlockNumber blkno = 0;
char buf[TAR_SEND_SIZE];
+ uint16 checksum;
size_t cnt;
+ char *filename;
+ int i;
pgoff_t len = 0;
+ char page[BLCKSZ];
size_t pad;
+ PageHeader phdr;
+ int segmentno = 0;
+ char *segmentpath;
+ bool verify_checksum;
fp = AllocateFile(readfilename, "rb");
if (fp == NULL)
@@ -1214,10 +1261,87 @@ sendFile(const char *readfilename, const char *tarfilename, struct stat *statbuf
errmsg("could not open file \"%s\": %m", readfilename)));
}
+ /*
+ * Verify checksums only when checksums are enabled, and the file to
+ * send is either in one of the default tablespaces (`./global' or
+ * `./base'), or in an external tablespace with an absolute pathname
+ * (starting with `/') and is not in the list of non-heap files to be
+ * skipped.
+ */
+ filename = basename(pstrdup(readfilename));
+ if (DataChecksumsEnabled() &&
+ !skipfile(filename) &&
+ (strncmp(readfilename, "./global/", 9) == 0 ||
+ strncmp(readfilename, "./base/", 7) == 0 ||
+ strncmp(readfilename, "/", 1) == 0))
+ verify_checksum = true;
+ else
+ verify_checksum = false;
+
_tarWriteHeader(tarfilename, NULL, statbuf, false);
+ /*
+ * Cut off at the segment boundary (".") to get the segment number in order
+ * to mix it into the checksum.
+ */
+ segmentpath = strstr(filename, ".");
+ if (verify_checksum && segmentpath != NULL)
+ {
+ *segmentpath++ = '\0';
+ segmentno = atoi(segmentpath);
+ if (segmentno == 0)
+ ereport(ERROR,
+ (errmsg("invalid segment number %d in filename %s\n",
+ segmentno, filename)));
+ }
+
while ((cnt = fread(buf, 1, Min(sizeof(buf), statbuf->st_size - len), fp)) > 0)
{
+ if (verify_checksum)
+ {
+ /*
+ * The checksums are verified at block level, so we iterate over
+ * the buffer in chunks of BLCKSZ, after making sure that
+ * TAR_SEND_SIZE/buf is divisible by BLCKSZ and we read
+ * a multiple of BLCKSZ bytes.
+ */
+ Assert(TAR_SEND_SIZE % BLCKSZ == 0);
+ if (cnt % BLCKSZ != 0)
+ {
+ ereport(WARNING,
+ (errmsg("cannot verify checksum in file \"%s\", block "
+ "%d: read buffer size %d and page size %d "
+ "differ",
+ readfilename, blkno, (int)cnt, BLCKSZ)));
+ checksum_failure = true;
+ verify_checksum = false;
+ continue;
+ }
+ for (i = 0; i < cnt / BLCKSZ; i++)
+ {
+ memcpy(page, (buf + BLCKSZ * i), BLCKSZ);
+ /*
+ * Only check pages which have not been modified since the
+ * start of the base backup.
+ */
+ if (PageGetLSN(page) < startptr)
+ {
+ checksum = pg_checksum_page((char *) page, blkno + segmentno*RELSEG_SIZE);
+ phdr = (PageHeader) page;
+ if (phdr->pd_checksum != checksum)
+ {
+ ereport(WARNING,
+ (errmsg("checksum mismatch in file \"%s\", "
+ "block %d: expected %X, found %X",
+ readfilename, blkno, phdr->pd_checksum,
+ checksum)));
+ checksum_failure = true;
+ }
+ }
+ blkno++;
+ }
+ }
+
/* Send the chunk as a CopyData message */
if (pq_putmessage('d', buf, cnt))
ereport(ERROR,
diff --git a/src/bin/pg_basebackup/pg_basebackup.c b/src/bin/pg_basebackup/pg_basebackup.c
index 1b32592063..e1b364b4ef 100644
--- a/src/bin/pg_basebackup/pg_basebackup.c
+++ b/src/bin/pg_basebackup/pg_basebackup.c
@@ -39,6 +39,7 @@
#include "replication/basebackup.h"
#include "streamutil.h"
+#define ERRCODE_DATA_CORRUPTED "XX001"
typedef struct TablespaceListCell
{
@@ -81,6 +82,7 @@ static char *xlog_dir = NULL;
static char format = 'p'; /* p(lain)/t(ar) */
static char *label = "pg_basebackup base backup";
static bool noclean = false;
+static bool checksum_failure = false;
static bool showprogress = false;
static int verbose = 0;
static int compresslevel = 0;
@@ -155,7 +157,7 @@ cleanup_directories_atexit(void)
if (success || in_log_streamer)
return;
- if (!noclean)
+ if (!noclean && !checksum_failure)
{
if (made_new_pgdata)
{
@@ -195,7 +197,7 @@ cleanup_directories_atexit(void)
}
else
{
- if (made_new_pgdata || found_existing_pgdata)
+ if ((made_new_pgdata || found_existing_pgdata) && !checksum_failure)
fprintf(stderr,
_("%s: data directory \"%s\" not removed at user's request\n"),
progname, basedir);
@@ -1970,8 +1972,17 @@ BaseBackup(void)
res = PQgetResult(conn);
if (PQresultStatus(res) != PGRES_COMMAND_OK)
{
- fprintf(stderr, _("%s: final receive failed: %s"),
- progname, PQerrorMessage(conn));
+ const char *sqlstate = PQresultErrorField(res, PG_DIAG_SQLSTATE);
+ if (sqlstate &&
+ strcmp(sqlstate, ERRCODE_DATA_CORRUPTED) == 0)
+ {
+ fprintf(stderr, _("%s: checksum error occured\n"),
+ progname);
+ checksum_failure = true;
+ } else {
+ fprintf(stderr, _("%s: final receive failed: %s"),
+ progname, PQerrorMessage(conn));
+ }
disconnect_and_exit(1);
}
diff --git a/src/bin/pg_basebackup/t/010_pg_basebackup.pl b/src/bin/pg_basebackup/t/010_pg_basebackup.pl
index cdf4f5be37..60561362e1 100644
--- a/src/bin/pg_basebackup/t/010_pg_basebackup.pl
+++ b/src/bin/pg_basebackup/t/010_pg_basebackup.pl
@@ -4,7 +4,7 @@ use Cwd;
use Config;
use PostgresNode;
use TestLib;
-use Test::More tests => 79;
+use Test::More tests => 83;
program_help_ok('pg_basebackup');
program_version_ok('pg_basebackup');
@@ -15,7 +15,7 @@ my $tempdir = TestLib::tempdir;
my $node = get_new_node('main');
# Initialize node without replication settings
-$node->init;
+$node->init(extra => [ '--data-checksums' ]);
$node->start;
my $pgdata = $node->data_dir;
@@ -317,3 +317,19 @@ like(
slurp_file("$tempdir/backupxs_sl_R/recovery.conf"),
qr/^primary_slot_name = 'slot1'\n/m,
'recovery.conf sets primary_slot_name');
+
+my $checksum = $node->safe_psql('postgres', 'SHOW data_checksums;');
+is($checksum, 'on', 'checksums are enabled');
+
+# induce corruption
+my $pg_class = $node->safe_psql('postgres',
+ q{SELECT pg_relation_filepath('pg_class')}
+);
+system_or_bail 'dd', 'conv=notrunc', 'oflag=seek_bytes', 'seek=4000', 'bs=9', 'count=1', 'if=/dev/zero', "of=$pgdata/$pg_class";
+
+$node->command_checks_all([ 'pg_basebackup', '-D', "$tempdir/backup_corrupt"],
+ 1,
+ [qr{^$}],
+ [qr/^WARNING.*checksum.mismatch/s],
+ 'pg_basebackup reports checksum mismatch'
+);