On Sat, Jan 12, 2019 at 5:16 AM David Steele <da...@pgmasters.net> wrote:

> On 1/11/19 10:25 PM, Magnus Hagander wrote:
> > On Fri, Jan 11, 2019 at 9:20 PM Tomas Vondra
> >     On 1/11/19 7:40 PM, Robert Haas wrote:
> >      > But I'm tentatively in favor of your proposal anyway, because it's
> >      > pretty simple and cheap and might help people, and doing something
> >      > noticeably better is probably annoyingly complicated.
> >      >
> >
> >     +1
> >
> > Yeah, that's the idea behind it -- it's cheap, and an
> > early-warning-indicator.
>
> +1
>

PFA is a patch to do this.

It tracks things that happen in the general backends. Possibly we should
also consider counting the errors actually found when running base backups?
OTOH, that part of the code doesn't really track things like databases (as
it operates just on the raw data directory underneath), so that
implementation would definitely not be as clean...

-- 
 Magnus Hagander
 Me: https://www.hagander.net/ <http://www.hagander.net/>
 Work: https://www.redpill-linpro.com/ <http://www.redpill-linpro.com/>
diff --git a/doc/src/sgml/monitoring.sgml b/doc/src/sgml/monitoring.sgml
index 0e73cdcdda..e2630fd368 100644
--- a/doc/src/sgml/monitoring.sgml
+++ b/doc/src/sgml/monitoring.sgml
@@ -2509,6 +2509,11 @@ SELECT pid, wait_event_type, wait_event FROM pg_stat_activity WHERE wait_event i
      <entry>Number of deadlocks detected in this database</entry>
     </row>
     <row>
+     <entry><structfield>checksum_failures</structfield></entry>
+     <entry><type>bigint</type></entry>
+     <entry>Number of data page checksum failures detected in this database</entry>
+    </row>
+    <row>
      <entry><structfield>blk_read_time</structfield></entry>
      <entry><type>double precision</type></entry>
      <entry>Time spent reading data file blocks by backends in this database,
diff --git a/src/backend/catalog/system_views.sql b/src/backend/catalog/system_views.sql
index 3e229c693c..7723f01327 100644
--- a/src/backend/catalog/system_views.sql
+++ b/src/backend/catalog/system_views.sql
@@ -823,6 +823,7 @@ CREATE VIEW pg_stat_database AS
             pg_stat_get_db_temp_files(D.oid) AS temp_files,
             pg_stat_get_db_temp_bytes(D.oid) AS temp_bytes,
             pg_stat_get_db_deadlocks(D.oid) AS deadlocks,
+            pg_stat_get_db_checksum_failures(D.oid) AS checksum_failures,
             pg_stat_get_db_blk_read_time(D.oid) AS blk_read_time,
             pg_stat_get_db_blk_write_time(D.oid) AS blk_write_time,
             pg_stat_get_db_stat_reset_time(D.oid) AS stats_reset
diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c
index 81c6499251..c5b4fab5ae 100644
--- a/src/backend/postmaster/pgstat.c
+++ b/src/backend/postmaster/pgstat.c
@@ -334,6 +334,7 @@ static void pgstat_recv_funcstat(PgStat_MsgFuncstat *msg, int len);
 static void pgstat_recv_funcpurge(PgStat_MsgFuncpurge *msg, int len);
 static void pgstat_recv_recoveryconflict(PgStat_MsgRecoveryConflict *msg, int len);
 static void pgstat_recv_deadlock(PgStat_MsgDeadlock *msg, int len);
+static void pgstat_recv_checksum_failure(PgStat_MsgChecksumFailure *msg, int len);
 static void pgstat_recv_tempfile(PgStat_MsgTempFile *msg, int len);
 
 /* ------------------------------------------------------------
@@ -1518,6 +1519,26 @@ pgstat_report_deadlock(void)
 	pgstat_send(&msg, sizeof(msg));
 }
 
+
+/* --------
+ * pgstat_report_checksum_failure() -
+ *
+ *	Tell the collector about a checksum failure.
+ * --------
+ */
+void
+pgstat_report_checksum_failure(void)
+{
+	PgStat_MsgDeadlock msg;
+
+	if (pgStatSock == PGINVALID_SOCKET || !pgstat_track_counts)
+		return;
+
+	pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_CHECKSUMFAILURE);
+	msg.m_databaseid = MyDatabaseId;
+	pgstat_send(&msg, sizeof(msg));
+}
+
 /* --------
  * pgstat_report_tempfile() -
  *
@@ -4455,6 +4476,10 @@ PgstatCollectorMain(int argc, char *argv[])
 					pgstat_recv_tempfile((PgStat_MsgTempFile *) &msg, len);
 					break;
 
+				case PGSTAT_MTYPE_CHECKSUMFAILURE:
+					pgstat_recv_checksum_failure((PgStat_MsgChecksumFailure *) &msg, len);
+					break;
+
 				default:
 					break;
 			}
@@ -4554,6 +4579,7 @@ reset_dbentry_counters(PgStat_StatDBEntry *dbentry)
 	dbentry->n_temp_files = 0;
 	dbentry->n_temp_bytes = 0;
 	dbentry->n_deadlocks = 0;
+	dbentry->n_checksum_failures = 0;
 	dbentry->n_block_read_time = 0;
 	dbentry->n_block_write_time = 0;
 
@@ -6197,6 +6223,22 @@ pgstat_recv_deadlock(PgStat_MsgDeadlock *msg, int len)
 }
 
 /* ----------
+ * pgstat_recv_checksum_failure() -
+ *
+ *	Process a DEADLOCK message.
+ * ----------
+ */
+static void
+pgstat_recv_checksum_failure(PgStat_MsgChecksumFailure *msg, int len)
+{
+	PgStat_StatDBEntry *dbentry;
+
+	dbentry = pgstat_get_db_entry(msg->m_databaseid, true);
+
+	dbentry->n_checksum_failures ++;
+}
+
+/* ----------
  * pgstat_recv_tempfile() -
  *
  *	Process a TEMPFILE message.
diff --git a/src/backend/storage/page/bufpage.c b/src/backend/storage/page/bufpage.c
index 517220bc33..14bc61b8ad 100644
--- a/src/backend/storage/page/bufpage.c
+++ b/src/backend/storage/page/bufpage.c
@@ -17,6 +17,7 @@
 #include "access/htup_details.h"
 #include "access/itup.h"
 #include "access/xlog.h"
+#include "pgstat.h"
 #include "storage/checksum.h"
 #include "utils/memdebug.h"
 #include "utils/memutils.h"
@@ -151,6 +152,8 @@ PageIsVerified(Page page, BlockNumber blkno)
 				 errmsg("page verification failed, calculated checksum %u but expected %u",
 						checksum, p->pd_checksum)));
 
+		pgstat_report_checksum_failure();
+
 		if (header_sane && ignore_checksum_failure)
 			return true;
 	}
diff --git a/src/backend/utils/adt/pgstatfuncs.c b/src/backend/utils/adt/pgstatfuncs.c
index 69f7265779..da1d685c08 100644
--- a/src/backend/utils/adt/pgstatfuncs.c
+++ b/src/backend/utils/adt/pgstatfuncs.c
@@ -1498,6 +1498,21 @@ pg_stat_get_db_deadlocks(PG_FUNCTION_ARGS)
 }
 
 Datum
+pg_stat_get_db_checksum_failures(PG_FUNCTION_ARGS)
+{
+	Oid			dbid = PG_GETARG_OID(0);
+	int64		result;
+	PgStat_StatDBEntry *dbentry;
+
+	if ((dbentry = pgstat_fetch_stat_dbentry(dbid)) == NULL)
+		result = 0;
+	else
+		result = (int64) (dbentry->n_checksum_failures);
+
+	PG_RETURN_INT64(result);
+}
+
+Datum
 pg_stat_get_db_blk_read_time(PG_FUNCTION_ARGS)
 {
 	Oid			dbid = PG_GETARG_OID(0);
diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat
index a4e173b484..911fad1af9 100644
--- a/src/include/catalog/pg_proc.dat
+++ b/src/include/catalog/pg_proc.dat
@@ -5233,6 +5233,10 @@
   proname => 'pg_stat_get_db_deadlocks', provolatile => 's', proparallel => 'r',
   prorettype => 'int8', proargtypes => 'oid',
   prosrc => 'pg_stat_get_db_deadlocks' },
+{ oid => '3425', descr => 'statistics: checksum failures detected in database',
+  proname => 'pg_stat_get_db_checksum_failures', provolatile => 's', proparallel => 'r',
+  prorettype => 'int8', proargtypes => 'oid',
+  prosrc => 'pg_stat_get_db_checksum_failures' },
 { oid => '3074', descr => 'statistics: last reset for a database',
   proname => 'pg_stat_get_db_stat_reset_time', provolatile => 's',
   proparallel => 'r', prorettype => 'timestamptz', proargtypes => 'oid',
diff --git a/src/include/pgstat.h b/src/include/pgstat.h
index 88a75fb798..b9e6463f2b 100644
--- a/src/include/pgstat.h
+++ b/src/include/pgstat.h
@@ -64,7 +64,8 @@ typedef enum StatMsgType
 	PGSTAT_MTYPE_FUNCPURGE,
 	PGSTAT_MTYPE_RECOVERYCONFLICT,
 	PGSTAT_MTYPE_TEMPFILE,
-	PGSTAT_MTYPE_DEADLOCK
+	PGSTAT_MTYPE_DEADLOCK,
+	PGSTAT_MTYPE_CHECKSUMFAILURE
 } StatMsgType;
 
 /* ----------
@@ -530,6 +531,17 @@ typedef struct PgStat_MsgDeadlock
 	Oid			m_databaseid;
 } PgStat_MsgDeadlock;
 
+/* ----------
+ * PgStat_MsgChecksumFailure	Sent by the backend to tell the collector
+ *								about checksum failures noticed.
+ * ----------
+ */
+typedef struct PgStat_MsgChecksumFailure
+{
+	PgStat_MsgHdr m_hdr;
+	Oid			m_databaseid;
+} PgStat_MsgChecksumFailure;
+
 
 /* ----------
  * PgStat_Msg					Union over all possible messages.
@@ -593,6 +605,7 @@ typedef struct PgStat_StatDBEntry
 	PgStat_Counter n_temp_files;
 	PgStat_Counter n_temp_bytes;
 	PgStat_Counter n_deadlocks;
+	PgStat_Counter n_checksum_failures;
 	PgStat_Counter n_block_read_time;	/* times in microseconds */
 	PgStat_Counter n_block_write_time;
 
@@ -1200,6 +1213,7 @@ extern void pgstat_report_analyze(Relation rel,
 
 extern void pgstat_report_recovery_conflict(int reason);
 extern void pgstat_report_deadlock(void);
+extern void pgstat_report_checksum_failure(void);
 
 extern void pgstat_initialize(void);
 extern void pgstat_bestart(void);
diff --git a/src/test/regress/expected/rules.out b/src/test/regress/expected/rules.out
index 98f417cb57..e0f2c543ef 100644
--- a/src/test/regress/expected/rules.out
+++ b/src/test/regress/expected/rules.out
@@ -1817,6 +1817,7 @@ pg_stat_database| SELECT d.oid AS datid,
     pg_stat_get_db_temp_files(d.oid) AS temp_files,
     pg_stat_get_db_temp_bytes(d.oid) AS temp_bytes,
     pg_stat_get_db_deadlocks(d.oid) AS deadlocks,
+    pg_stat_get_db_checksum_failures(d.oid) AS checksum_failures,
     pg_stat_get_db_blk_read_time(d.oid) AS blk_read_time,
     pg_stat_get_db_blk_write_time(d.oid) AS blk_write_time,
     pg_stat_get_db_stat_reset_time(d.oid) AS stats_reset

Reply via email to