On Sat, Jan 12, 2019 at 5:16 AM David Steele <da...@pgmasters.net> wrote:
> On 1/11/19 10:25 PM, Magnus Hagander wrote: > > On Fri, Jan 11, 2019 at 9:20 PM Tomas Vondra > > On 1/11/19 7:40 PM, Robert Haas wrote: > > > But I'm tentatively in favor of your proposal anyway, because it's > > > pretty simple and cheap and might help people, and doing something > > > noticeably better is probably annoyingly complicated. > > > > > > > +1 > > > > Yeah, that's the idea behind it -- it's cheap, and an > > early-warning-indicator. > > +1 > PFA is a patch to do this. It tracks things that happen in the general backends. Possibly we should also consider counting the errors actually found when running base backups? OTOH, that part of the code doesn't really track things like databases (as it operates just on the raw data directory underneath), so that implementation would definitely not be as clean... -- Magnus Hagander Me: https://www.hagander.net/ <http://www.hagander.net/> Work: https://www.redpill-linpro.com/ <http://www.redpill-linpro.com/>
diff --git a/doc/src/sgml/monitoring.sgml b/doc/src/sgml/monitoring.sgml index 0e73cdcdda..e2630fd368 100644 --- a/doc/src/sgml/monitoring.sgml +++ b/doc/src/sgml/monitoring.sgml @@ -2509,6 +2509,11 @@ SELECT pid, wait_event_type, wait_event FROM pg_stat_activity WHERE wait_event i <entry>Number of deadlocks detected in this database</entry> </row> <row> + <entry><structfield>checksum_failures</structfield></entry> + <entry><type>bigint</type></entry> + <entry>Number of data page checksum failures detected in this database</entry> + </row> + <row> <entry><structfield>blk_read_time</structfield></entry> <entry><type>double precision</type></entry> <entry>Time spent reading data file blocks by backends in this database, diff --git a/src/backend/catalog/system_views.sql b/src/backend/catalog/system_views.sql index 3e229c693c..7723f01327 100644 --- a/src/backend/catalog/system_views.sql +++ b/src/backend/catalog/system_views.sql @@ -823,6 +823,7 @@ CREATE VIEW pg_stat_database AS pg_stat_get_db_temp_files(D.oid) AS temp_files, pg_stat_get_db_temp_bytes(D.oid) AS temp_bytes, pg_stat_get_db_deadlocks(D.oid) AS deadlocks, + pg_stat_get_db_checksum_failures(D.oid) AS checksum_failures, pg_stat_get_db_blk_read_time(D.oid) AS blk_read_time, pg_stat_get_db_blk_write_time(D.oid) AS blk_write_time, pg_stat_get_db_stat_reset_time(D.oid) AS stats_reset diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c index 81c6499251..c5b4fab5ae 100644 --- a/src/backend/postmaster/pgstat.c +++ b/src/backend/postmaster/pgstat.c @@ -334,6 +334,7 @@ static void pgstat_recv_funcstat(PgStat_MsgFuncstat *msg, int len); static void pgstat_recv_funcpurge(PgStat_MsgFuncpurge *msg, int len); static void pgstat_recv_recoveryconflict(PgStat_MsgRecoveryConflict *msg, int len); static void pgstat_recv_deadlock(PgStat_MsgDeadlock *msg, int len); +static void pgstat_recv_checksum_failure(PgStat_MsgChecksumFailure *msg, int len); static void pgstat_recv_tempfile(PgStat_MsgTempFile *msg, int len); /* ------------------------------------------------------------ @@ -1518,6 +1519,26 @@ pgstat_report_deadlock(void) pgstat_send(&msg, sizeof(msg)); } + +/* -------- + * pgstat_report_checksum_failure() - + * + * Tell the collector about a checksum failure. + * -------- + */ +void +pgstat_report_checksum_failure(void) +{ + PgStat_MsgDeadlock msg; + + if (pgStatSock == PGINVALID_SOCKET || !pgstat_track_counts) + return; + + pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_CHECKSUMFAILURE); + msg.m_databaseid = MyDatabaseId; + pgstat_send(&msg, sizeof(msg)); +} + /* -------- * pgstat_report_tempfile() - * @@ -4455,6 +4476,10 @@ PgstatCollectorMain(int argc, char *argv[]) pgstat_recv_tempfile((PgStat_MsgTempFile *) &msg, len); break; + case PGSTAT_MTYPE_CHECKSUMFAILURE: + pgstat_recv_checksum_failure((PgStat_MsgChecksumFailure *) &msg, len); + break; + default: break; } @@ -4554,6 +4579,7 @@ reset_dbentry_counters(PgStat_StatDBEntry *dbentry) dbentry->n_temp_files = 0; dbentry->n_temp_bytes = 0; dbentry->n_deadlocks = 0; + dbentry->n_checksum_failures = 0; dbentry->n_block_read_time = 0; dbentry->n_block_write_time = 0; @@ -6197,6 +6223,22 @@ pgstat_recv_deadlock(PgStat_MsgDeadlock *msg, int len) } /* ---------- + * pgstat_recv_checksum_failure() - + * + * Process a DEADLOCK message. + * ---------- + */ +static void +pgstat_recv_checksum_failure(PgStat_MsgChecksumFailure *msg, int len) +{ + PgStat_StatDBEntry *dbentry; + + dbentry = pgstat_get_db_entry(msg->m_databaseid, true); + + dbentry->n_checksum_failures ++; +} + +/* ---------- * pgstat_recv_tempfile() - * * Process a TEMPFILE message. diff --git a/src/backend/storage/page/bufpage.c b/src/backend/storage/page/bufpage.c index 517220bc33..14bc61b8ad 100644 --- a/src/backend/storage/page/bufpage.c +++ b/src/backend/storage/page/bufpage.c @@ -17,6 +17,7 @@ #include "access/htup_details.h" #include "access/itup.h" #include "access/xlog.h" +#include "pgstat.h" #include "storage/checksum.h" #include "utils/memdebug.h" #include "utils/memutils.h" @@ -151,6 +152,8 @@ PageIsVerified(Page page, BlockNumber blkno) errmsg("page verification failed, calculated checksum %u but expected %u", checksum, p->pd_checksum))); + pgstat_report_checksum_failure(); + if (header_sane && ignore_checksum_failure) return true; } diff --git a/src/backend/utils/adt/pgstatfuncs.c b/src/backend/utils/adt/pgstatfuncs.c index 69f7265779..da1d685c08 100644 --- a/src/backend/utils/adt/pgstatfuncs.c +++ b/src/backend/utils/adt/pgstatfuncs.c @@ -1498,6 +1498,21 @@ pg_stat_get_db_deadlocks(PG_FUNCTION_ARGS) } Datum +pg_stat_get_db_checksum_failures(PG_FUNCTION_ARGS) +{ + Oid dbid = PG_GETARG_OID(0); + int64 result; + PgStat_StatDBEntry *dbentry; + + if ((dbentry = pgstat_fetch_stat_dbentry(dbid)) == NULL) + result = 0; + else + result = (int64) (dbentry->n_checksum_failures); + + PG_RETURN_INT64(result); +} + +Datum pg_stat_get_db_blk_read_time(PG_FUNCTION_ARGS) { Oid dbid = PG_GETARG_OID(0); diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat index a4e173b484..911fad1af9 100644 --- a/src/include/catalog/pg_proc.dat +++ b/src/include/catalog/pg_proc.dat @@ -5233,6 +5233,10 @@ proname => 'pg_stat_get_db_deadlocks', provolatile => 's', proparallel => 'r', prorettype => 'int8', proargtypes => 'oid', prosrc => 'pg_stat_get_db_deadlocks' }, +{ oid => '3425', descr => 'statistics: checksum failures detected in database', + proname => 'pg_stat_get_db_checksum_failures', provolatile => 's', proparallel => 'r', + prorettype => 'int8', proargtypes => 'oid', + prosrc => 'pg_stat_get_db_checksum_failures' }, { oid => '3074', descr => 'statistics: last reset for a database', proname => 'pg_stat_get_db_stat_reset_time', provolatile => 's', proparallel => 'r', prorettype => 'timestamptz', proargtypes => 'oid', diff --git a/src/include/pgstat.h b/src/include/pgstat.h index 88a75fb798..b9e6463f2b 100644 --- a/src/include/pgstat.h +++ b/src/include/pgstat.h @@ -64,7 +64,8 @@ typedef enum StatMsgType PGSTAT_MTYPE_FUNCPURGE, PGSTAT_MTYPE_RECOVERYCONFLICT, PGSTAT_MTYPE_TEMPFILE, - PGSTAT_MTYPE_DEADLOCK + PGSTAT_MTYPE_DEADLOCK, + PGSTAT_MTYPE_CHECKSUMFAILURE } StatMsgType; /* ---------- @@ -530,6 +531,17 @@ typedef struct PgStat_MsgDeadlock Oid m_databaseid; } PgStat_MsgDeadlock; +/* ---------- + * PgStat_MsgChecksumFailure Sent by the backend to tell the collector + * about checksum failures noticed. + * ---------- + */ +typedef struct PgStat_MsgChecksumFailure +{ + PgStat_MsgHdr m_hdr; + Oid m_databaseid; +} PgStat_MsgChecksumFailure; + /* ---------- * PgStat_Msg Union over all possible messages. @@ -593,6 +605,7 @@ typedef struct PgStat_StatDBEntry PgStat_Counter n_temp_files; PgStat_Counter n_temp_bytes; PgStat_Counter n_deadlocks; + PgStat_Counter n_checksum_failures; PgStat_Counter n_block_read_time; /* times in microseconds */ PgStat_Counter n_block_write_time; @@ -1200,6 +1213,7 @@ extern void pgstat_report_analyze(Relation rel, extern void pgstat_report_recovery_conflict(int reason); extern void pgstat_report_deadlock(void); +extern void pgstat_report_checksum_failure(void); extern void pgstat_initialize(void); extern void pgstat_bestart(void); diff --git a/src/test/regress/expected/rules.out b/src/test/regress/expected/rules.out index 98f417cb57..e0f2c543ef 100644 --- a/src/test/regress/expected/rules.out +++ b/src/test/regress/expected/rules.out @@ -1817,6 +1817,7 @@ pg_stat_database| SELECT d.oid AS datid, pg_stat_get_db_temp_files(d.oid) AS temp_files, pg_stat_get_db_temp_bytes(d.oid) AS temp_bytes, pg_stat_get_db_deadlocks(d.oid) AS deadlocks, + pg_stat_get_db_checksum_failures(d.oid) AS checksum_failures, pg_stat_get_db_blk_read_time(d.oid) AS blk_read_time, pg_stat_get_db_blk_write_time(d.oid) AS blk_write_time, pg_stat_get_db_stat_reset_time(d.oid) AS stats_reset