Hi,
Am Montag, den 05.03.2018, 06:36 -0500 schrieb Stephen Frost:
> Michael,
>
> * Michael Banck (michael.ba...@credativ.de) wrote:
> > On Sun, Mar 04, 2018 at 06:19:00PM +0100, Magnus Hagander wrote:
> > > So sure, if we go with WARNING + exit with an errorcode, that is perhaps
> > > the best combination of the two.
> >
> > I had a look at how to go about this, but it appears to be a bit
> > complicated; the first problem is that sendFile() and sendDir() don't
> > have status return codes that could be set on checksum verifcation
> > failure. So I added a global variable and threw an ereport(ERROR) at the
> > end of perform_base_backup(), but then I realized that `pg_basebackup'
> > the client program purges the datadir it created if it gets an error:
> >
> > > pg_basebackup: final receive failed: ERROR: Checksum mismatch during
> > > basebackup
> > >
> > > pg_basebackup: removing data directory "data2"
>
> Oh, ugh.
I came up with the attached patch, which sets a checksum_failure
variable in both basebackup.c and pg_basebackup.c, and emits an ereport
with (for now) ERRCODE_DATA_CORRUPTED at the end of
perform_base_backup(), which gets caught in pg_basebackup and then used
to not cleanup the datadir, but exit with a non-zero exit code.
Does that seem feasible?
Michael
--
Michael Banck
Projektleiter / Senior Berater
Tel.: +49 2166 9901-171
Fax: +49 2166 9901-100
Email: michael.ba...@credativ.de
credativ GmbH, HRB Mönchengladbach 12080
USt-ID-Nummer: DE204566209
Trompeterallee 108, 41189 Mönchengladbach
Geschäftsführung: Dr. Michael Meskes, Jörg Folz, Sascha Heuer
diff --git a/src/backend/replication/basebackup.c b/src/backend/replication/basebackup.c
index 361cde3291..11fb009b59 100644
--- a/src/backend/replication/basebackup.c
+++ b/src/backend/replication/basebackup.c
@@ -100,6 +100,9 @@ static TimeOffset elapsed_min_unit;
/* The last check of the transfer rate. */
static TimestampTz throttled_last;
+/* Whether a checksum failure occured. */
+static bool checksum_failure;
+
/*
* The contents of these directories are removed or recreated during server
* start so they are not included in backups. The directories themselves are
@@ -208,6 +211,8 @@ perform_base_backup(basebackup_options *opt)
labelfile = makeStringInfo();
tblspc_map_file = makeStringInfo();
+ checksum_failure = false;
+
startptr = do_pg_start_backup(opt->label, opt->fastcheckpoint, &starttli,
labelfile, &tablespaces,
tblspc_map_file,
@@ -566,6 +571,12 @@ perform_base_backup(basebackup_options *opt)
pq_putemptymessage('c');
}
SendXlogRecPtrResult(endptr, endtli);
+
+ if (checksum_failure == true)
+ ereport(ERROR,
+ (errcode(ERRCODE_DATA_CORRUPTED),
+ errmsg("Checksum mismatch during basebackup\n")));
+
}
/*
@@ -1295,10 +1306,13 @@ sendFile(const char *readfilename, const char *tarfilename, struct stat *statbuf
checksum = pg_checksum_page((char *) page, blkno + segmentno*RELSEG_SIZE);
phdr = (PageHeader) page;
if (phdr->pd_checksum != checksum)
+ {
ereport(WARNING,
- (errmsg("checksum mismatch in file \"%s\", block %d: "
- "expected %x, found %x", readfilename, blkno,
+ (errmsg("checksum mismatch in file \"%s\", segment %d, block %d: "
+ "expected %x, found %x", readfilename, segmentno, blkno,
phdr->pd_checksum, checksum)));
+ checksum_failure = true;
+ }
blkno++;
}
}
diff --git a/src/bin/pg_basebackup/pg_basebackup.c b/src/bin/pg_basebackup/pg_basebackup.c
index 1b32592063..e1b364b4ef 100644
--- a/src/bin/pg_basebackup/pg_basebackup.c
+++ b/src/bin/pg_basebackup/pg_basebackup.c
@@ -39,6 +39,7 @@
#include "replication/basebackup.h"
#include "streamutil.h"
+#define ERRCODE_DATA_CORRUPTED "XX001"
typedef struct TablespaceListCell
{
@@ -81,6 +82,7 @@ static char *xlog_dir = NULL;
static char format = 'p'; /* p(lain)/t(ar) */
static char *label = "pg_basebackup base backup";
static bool noclean = false;
+static bool checksum_failure = false;
static bool showprogress = false;
static int verbose = 0;
static int compresslevel = 0;
@@ -155,7 +157,7 @@ cleanup_directories_atexit(void)
if (success || in_log_streamer)
return;
- if (!noclean)
+ if (!noclean && !checksum_failure)
{
if (made_new_pgdata)
{
@@ -195,7 +197,7 @@ cleanup_directories_atexit(void)
}
else
{
- if (made_new_pgdata || found_existing_pgdata)
+ if ((made_new_pgdata || found_existing_pgdata) && !checksum_failure)
fprintf(stderr,
_("%s: data directory \"%s\" not removed at user's request\n"),
progname, basedir);
@@ -1970,8 +1972,17 @@ BaseBackup(void)
res = PQgetResult(conn);
if (PQresultStatus(res) != PGRES_COMMAND_OK)
{
- fprintf(stderr, _("%s: final receive failed: %s"),
- progname, PQerrorMessage(conn));
+ const char *sqlstate = PQresultErrorField(res, PG_DIAG_SQLSTATE);
+ if (sqlstate &&
+ strcmp(sqlstate, ERRCODE_DATA_CORRUPTED) == 0)
+ {
+ fprintf(stderr, _("%s: checksum error occured\n"),
+ progname);
+ checksum_failure = true;
+ } else {
+ fprintf(stderr, _("%s: final receive failed: %s"),
+ progname, PQerrorMessage(conn));
+ }
disconnect_and_exit(1);
}