Re: add checkpoint stats of snapshot and mapping files of pg_logical dir

Bharath Rupireddy Tue, 19 Jul 2022 04:59:40 -0700

On Tue, Apr 26, 2022 at 6:31 AM Michael Paquier <[email protected]> wrote:
>
> On Mon, Apr 25, 2022 at 01:34:38PM -0700, Nathan Bossart wrote:
> > I took another look at the example output, and I think I agree that logging
> > the total time for logical decoding operations is probably the best path
> > forward.  This information would be enough to clue an administrator into
> > the possible causes of lengthy checkpoints, but it also wouldn't disrupt
> > the readability of the log statement too much.
>
> +       /* translator: the placeholders after first %s show 
> restartpoint/checkpoint options */
> +                       (errmsg("%s starting:%s%s%s%s%s%s%s%s",
> +                                       restartpoint ?
> _("restartpoint") : _("checkpoint"),
>
> 0001 breaks translation, as "checkpoint/restartpoint" and "starting"
> would treated as separate terms to translate.  That would not matter
> for English, but it does in French where we'd say "début du
> checkpoint".  You could fix that by adding "starting" to each
> refactored term or build a string.  0002 does the latter, so my take
> is that you should begin using a StringInfo in 0001.


Thanks for reviewing. I've addressed the review comments, PSA v10
patch. Note that we can't use StringInfo as the checkpointer memory
context doesn't allow pallocs in the critical section and the
checkpoint can sometimes be run in the critical section.

I've also added the total number of WAL files a checkpoint has
processed (scanned the pg_wal directory) while removing old WAL files.
This helps to estimate the pg_wal disk space at the time of a
particular checkpoint, especially useful for debugging issues.

[1] sample output:
2022-07-19 10:33:45.378 UTC [3027866] LOG:  checkpoint starting: wal
2022-07-19 10:33:51.434 UTC [3027866] LOG:  checkpoint complete: wrote
50 buffers (0.3%); 0 WAL file(s) added, 12 removed, 35 recycled, 76
processed; write=3.651 s, sync=0.011 s, total=6.136 s; sync files=11,
longest=0.004 s, average=0.001 s; distance=770045 kB, estimate=770045
kB; lsn=0/95000260, redo lsn=0/79000068; logical decoding file(s)
processing=0.007 s

Regards,
Bharath Rupireddy.

From b113dfb9e876a84c2121b64ab94a9d10c3c670b4 Mon Sep 17 00:00:00 2001
From: Bharath Rupireddy <[email protected]>
Date: Tue, 19 Jul 2022 10:35:29 +0000
Subject: [PATCH v10] Add time taken for processing logical decoding files to
 checkpoint log

At times, there can be many snapshot and mapping files under
pg_logical dir that the checkpoint might have to delete/fsync
based on the cutoff LSN which can increase the checkpoint time.
Add stats related to these files to better understand the delays
or time spent by the checkpointer processing them.

Also, add total number of WAL files processed during checkpoint to
the log message. This will be useful for debugging issues like
total time taken by checkpoint (if there are many WAL files) and
estimate the disk space occupied by pg_wal at the time of checkpoint.
---
 src/backend/access/transam/xlog.c | 30 +++++++++++++++++++++++-------
 src/include/access/xlog.h         |  5 +++++
 2 files changed, 28 insertions(+), 7 deletions(-)

diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index b809a2152c..20c1689ed2 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -3603,6 +3603,8 @@ RemoveOldXlogFiles(XLogSegNo segno, XLogRecPtr lastredoptr, XLogRecPtr endptr,
 							   insertTLI);
 			}
 		}
+
+		CheckpointStats.ckpt_segs_processed++;
 	}
 
 	FreeDir(xldir);
@@ -6092,7 +6094,8 @@ LogCheckpointEnd(bool restartpoint)
 				sync_msecs,
 				total_msecs,
 				longest_msecs,
-				average_msecs;
+				average_msecs,
+				l_dec_ops_msecs;
 	uint64		average_sync_time;
 
 	CheckpointStats.ckpt_end_t = GetCurrentTimestamp();
@@ -6129,6 +6132,9 @@ LogCheckpointEnd(bool restartpoint)
 			CheckpointStats.ckpt_sync_rels;
 	average_msecs = (long) ((average_sync_time + 999) / 1000);
 
+	l_dec_ops_msecs = TimestampDifferenceMilliseconds(CheckpointStats.l_dec_ops_start_t,
+													  CheckpointStats.l_dec_ops_end_t);
+
 	/*
 	 * ControlFileLock is not required to see ControlFile->checkPoint and
 	 * ->checkPointCopy here as we are the only updator of those variables at
@@ -6137,16 +6143,18 @@ LogCheckpointEnd(bool restartpoint)
 	if (restartpoint)
 		ereport(LOG,
 				(errmsg("restartpoint complete: wrote %d buffers (%.1f%%); "
-						"%d WAL file(s) added, %d removed, %d recycled; "
+						"%d WAL file(s) added, %d removed, %d recycled, %d processed; "
 						"write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s; "
 						"sync files=%d, longest=%ld.%03d s, average=%ld.%03d s; "
 						"distance=%d kB, estimate=%d kB; "
-						"lsn=%X/%X, redo lsn=%X/%X",
+						"lsn=%X/%X, redo lsn=%X/%X; "
+						"logical decoding file(s) processing=%ld.%03d s",
 						CheckpointStats.ckpt_bufs_written,
 						(double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
 						CheckpointStats.ckpt_segs_added,
 						CheckpointStats.ckpt_segs_removed,
 						CheckpointStats.ckpt_segs_recycled,
+						CheckpointStats.ckpt_segs_processed,
 						write_msecs / 1000, (int) (write_msecs % 1000),
 						sync_msecs / 1000, (int) (sync_msecs % 1000),
 						total_msecs / 1000, (int) (total_msecs % 1000),
@@ -6156,20 +6164,23 @@ LogCheckpointEnd(bool restartpoint)
 						(int) (PrevCheckPointDistance / 1024.0),
 						(int) (CheckPointDistanceEstimate / 1024.0),
 						LSN_FORMAT_ARGS(ControlFile->checkPoint),
-						LSN_FORMAT_ARGS(ControlFile->checkPointCopy.redo))));
+						LSN_FORMAT_ARGS(ControlFile->checkPointCopy.redo),
+						l_dec_ops_msecs / 1000, (int) (l_dec_ops_msecs % 1000))));
 	else
 		ereport(LOG,
 				(errmsg("checkpoint complete: wrote %d buffers (%.1f%%); "
-						"%d WAL file(s) added, %d removed, %d recycled; "
+						"%d WAL file(s) added, %d removed, %d recycled, %d processed; "
 						"write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s; "
 						"sync files=%d, longest=%ld.%03d s, average=%ld.%03d s; "
 						"distance=%d kB, estimate=%d kB; "
-						"lsn=%X/%X, redo lsn=%X/%X",
+						"lsn=%X/%X, redo lsn=%X/%X; "
+						"logical decoding file(s) processing=%ld.%03d s",
 						CheckpointStats.ckpt_bufs_written,
 						(double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
 						CheckpointStats.ckpt_segs_added,
 						CheckpointStats.ckpt_segs_removed,
 						CheckpointStats.ckpt_segs_recycled,
+						CheckpointStats.ckpt_segs_processed,
 						write_msecs / 1000, (int) (write_msecs % 1000),
 						sync_msecs / 1000, (int) (sync_msecs % 1000),
 						total_msecs / 1000, (int) (total_msecs % 1000),
@@ -6179,7 +6190,8 @@ LogCheckpointEnd(bool restartpoint)
 						(int) (PrevCheckPointDistance / 1024.0),
 						(int) (CheckPointDistanceEstimate / 1024.0),
 						LSN_FORMAT_ARGS(ControlFile->checkPoint),
-						LSN_FORMAT_ARGS(ControlFile->checkPointCopy.redo))));
+						LSN_FORMAT_ARGS(ControlFile->checkPointCopy.redo),
+						l_dec_ops_msecs / 1000, (int) (l_dec_ops_msecs % 1000))));
 }
 
 /*
@@ -6848,8 +6860,12 @@ CheckPointGuts(XLogRecPtr checkPointRedo, int flags)
 {
 	CheckPointRelationMap();
 	CheckPointReplicationSlots();
+
+	CheckpointStats.l_dec_ops_start_t = GetCurrentTimestamp();
 	CheckPointSnapBuild();
 	CheckPointLogicalRewriteHeap();
+	CheckpointStats.l_dec_ops_end_t = GetCurrentTimestamp();
+
 	CheckPointReplicationOrigin();
 
 	/* Write out all dirty data in SLRUs and the main buffer pool */
diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h
index cd674c3c23..75730254ed 100644
--- a/src/include/access/xlog.h
+++ b/src/include/access/xlog.h
@@ -166,6 +166,7 @@ typedef struct CheckpointStatsData
 	int			ckpt_segs_added;	/* # of new xlog segments created */
 	int			ckpt_segs_removed;	/* # of xlog segments deleted */
 	int			ckpt_segs_recycled; /* # of xlog segments recycled */
+	int			ckpt_segs_processed; /* # of xlog segments processed */
 
 	int			ckpt_sync_rels; /* # of relations synced */
 	uint64		ckpt_longest_sync;	/* Longest sync for one relation */
@@ -173,6 +174,10 @@ typedef struct CheckpointStatsData
 									 * times, which is not necessarily the
 									 * same as the total elapsed time for the
 									 * entire sync phase. */
+
+	/* start and end timestamps of logical decoding file processing */
+	TimestampTz	l_dec_ops_start_t;
+	TimestampTz	l_dec_ops_end_t;
 } CheckpointStatsData;
 
 extern PGDLLIMPORT CheckpointStatsData CheckpointStats;
-- 
2.25.1

Re: add checkpoint stats of snapshot and mapping files of pg_logical dir

Reply via email to