On Fri, Mar 18, 2022 at 11:11 AM Bharath Rupireddy
<bharath.rupireddyforpostg...@gmail.com> wrote:
>
> On Fri, Mar 18, 2022 at 2:15 AM Nathan Bossart <nathandboss...@gmail.com> 
> wrote:
> >
> > On Thu, Mar 17, 2022 at 06:48:43PM +0530, Bharath Rupireddy wrote:
> > > Personally, I tend to agree with v4-0001 (option (4)) or v4-0002
> > > (option (3)) than v4-0003 (option (1)) as it adds more unreadability
> > > with most of the code duplicated creating more diff with previous
> > > versions and maintainability problems. Having said that, I will leave
> > > it to the committer to decide on that.
> >
> > I don't think v4-0003/option 1 needs to be unreadable.  Perhaps we can use
> > a StringInfo to build the message.  That might be a net improvement in
> > readability anyway.
>
> Looks like we already use StringInfo in PerformAuthentication for
> "connection" related log messages, see [1].
>
> I will give it a try using StringInfo for the "checkpoint/restartpoint
> complete" message too.

Well, here's the v5 patch, sample output at [1]. Please have a look at it.

[1]
2022-03-18 07:44:36.102 UTC [1165375] LOG:  checkpoint starting:
immediate force wait
2022-03-18 07:44:36.158 UTC [1165375] LOG:  checkpoint complete: wrote
444 buffers (2.7%); 0 WAL file(s) added, 0 removed, 0 recycled;
write=0.006 s, sync=0.017 s, total=0.057 s; sync files=2,
longest=0.014 s, average=0.009 s; distance=6270 kB, estimate=6270 kB;
logical decoding snapshot file(s) removed=3, time=0.001 s, cutoff
LSN=0/1B97B10

2022-03-18 07:45:43.263 UTC [1165375] LOG:  checkpoint starting:
immediate force wait
2022-03-18 07:45:43.338 UTC [1165375] LOG:  checkpoint complete: wrote
968 buffers (5.9%); 0 WAL file(s) added, 0 removed, 0 recycled;
write=0.012 s, sync=0.025 s, total=0.075 s; sync files=24,
longest=0.020 s, average=0.002 s; distance=6975 kB, estimate=6975 kB;
logical decoding snapshot file(s) removed=3, time=0.001 s, cutoff
LSN=0/1BDD080; logical decoding rewrite mapping file(s) removed=1,
synced=7, time=0.001 s, cutoff LSN=0/1BDD080

Regards,
Bharath Rupireddy.
From acde82f8a9990e519819aaa2e9c1a2f32441a135 Mon Sep 17 00:00:00 2001
From: Bharath Rupireddy <bharath.rupireddyforpostgres@gmail.com>
Date: Fri, 18 Mar 2022 07:48:32 +0000
Subject: [PATCH v5] Add checkpoint stats of snapshot and mapping files of
 pg_logical dir

At times, there can be many snapshot and mapping files under
pg_logical dir that the checkpoint might have to delete/fsync
based on the cutoff LSN which can increase the checkpoint time.
Add stats related to these files to better understand the delays
or time spent by the checkpointer processing them.

Add these new log message only when necessary i.e. at least one
snapshot or mapping files is processed during the checkpoint.
---
 src/backend/access/heap/rewriteheap.c       |   6 +
 src/backend/access/transam/xlog.c           | 119 +++++++++++++-------
 src/backend/replication/logical/snapbuild.c |   4 +
 src/include/access/xlog.h                   |  13 +++
 4 files changed, 104 insertions(+), 38 deletions(-)

diff --git a/src/backend/access/heap/rewriteheap.c b/src/backend/access/heap/rewriteheap.c
index 2a53826736..59cc3fa567 100644
--- a/src/backend/access/heap/rewriteheap.c
+++ b/src/backend/access/heap/rewriteheap.c
@@ -1210,6 +1210,8 @@ CheckPointLogicalRewriteHeap(void)
 	if (cutoff != InvalidXLogRecPtr && redo < cutoff)
 		cutoff = redo;
 
+	CheckpointStats.repl_map_cutoff_lsn = cutoff;
+
 	mappings_dir = AllocateDir("pg_logical/mappings");
 	while ((mapping_de = ReadDir(mappings_dir, "pg_logical/mappings")) != NULL)
 	{
@@ -1247,6 +1249,8 @@ CheckPointLogicalRewriteHeap(void)
 				ereport(ERROR,
 						(errcode_for_file_access(),
 						 errmsg("could not remove file \"%s\": %m", path)));
+
+			CheckpointStats.repl_map_files_rmvd_cnt++;
 		}
 		else
 		{
@@ -1280,6 +1284,8 @@ CheckPointLogicalRewriteHeap(void)
 						(errcode_for_file_access(),
 						 errmsg("could not close file \"%s\": %m", path)));
 		}
+
+		CheckpointStats.repl_map_files_syncd_cnt++;
 	}
 	FreeDir(mappings_dir);
 
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index f436471b27..1fb97e09c2 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -6080,6 +6080,7 @@ LogCheckpointEnd(bool restartpoint)
 				longest_msecs,
 				average_msecs;
 	uint64		average_sync_time;
+	StringInfoData logmsg;
 
 	CheckpointStats.ckpt_end_t = GetCurrentTimestamp();
 
@@ -6115,46 +6116,81 @@ LogCheckpointEnd(bool restartpoint)
 			CheckpointStats.ckpt_sync_rels;
 	average_msecs = (long) ((average_sync_time + 999) / 1000);
 
+	initStringInfo(&logmsg);
+
 	if (restartpoint)
-		ereport(LOG,
-				(errmsg("restartpoint complete: wrote %d buffers (%.1f%%); "
-						"%d WAL file(s) added, %d removed, %d recycled; "
-						"write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s; "
-						"sync files=%d, longest=%ld.%03d s, average=%ld.%03d s; "
-						"distance=%d kB, estimate=%d kB",
-						CheckpointStats.ckpt_bufs_written,
-						(double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
-						CheckpointStats.ckpt_segs_added,
-						CheckpointStats.ckpt_segs_removed,
-						CheckpointStats.ckpt_segs_recycled,
-						write_msecs / 1000, (int) (write_msecs % 1000),
-						sync_msecs / 1000, (int) (sync_msecs % 1000),
-						total_msecs / 1000, (int) (total_msecs % 1000),
-						CheckpointStats.ckpt_sync_rels,
-						longest_msecs / 1000, (int) (longest_msecs % 1000),
-						average_msecs / 1000, (int) (average_msecs % 1000),
-						(int) (PrevCheckPointDistance / 1024.0),
-						(int) (CheckPointDistanceEstimate / 1024.0))));
+		appendStringInfo(&logmsg,
+						_("restartpoint complete: wrote %d buffers (%.1f%%); "
+						  "%d WAL file(s) added, %d removed, %d recycled; "
+						  "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s; "
+						  "sync files=%d, longest=%ld.%03d s, average=%ld.%03d s; "
+						  "distance=%d kB, estimate=%d kB"),
+						  CheckpointStats.ckpt_bufs_written,
+						  (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
+						  CheckpointStats.ckpt_segs_added,
+						  CheckpointStats.ckpt_segs_removed,
+						  CheckpointStats.ckpt_segs_recycled,
+						  write_msecs / 1000, (int) (write_msecs % 1000),
+						  sync_msecs / 1000, (int) (sync_msecs % 1000),
+						  total_msecs / 1000, (int) (total_msecs % 1000),
+						  CheckpointStats.ckpt_sync_rels,
+						  longest_msecs / 1000, (int) (longest_msecs % 1000),
+						  average_msecs / 1000, (int) (average_msecs % 1000),
+						  (int) (PrevCheckPointDistance / 1024.0),
+						  (int) (CheckPointDistanceEstimate / 1024.0));
 	else
-		ereport(LOG,
-				(errmsg("checkpoint complete: wrote %d buffers (%.1f%%); "
-						"%d WAL file(s) added, %d removed, %d recycled; "
-						"write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s; "
-						"sync files=%d, longest=%ld.%03d s, average=%ld.%03d s; "
-						"distance=%d kB, estimate=%d kB",
-						CheckpointStats.ckpt_bufs_written,
-						(double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
-						CheckpointStats.ckpt_segs_added,
-						CheckpointStats.ckpt_segs_removed,
-						CheckpointStats.ckpt_segs_recycled,
-						write_msecs / 1000, (int) (write_msecs % 1000),
-						sync_msecs / 1000, (int) (sync_msecs % 1000),
-						total_msecs / 1000, (int) (total_msecs % 1000),
-						CheckpointStats.ckpt_sync_rels,
-						longest_msecs / 1000, (int) (longest_msecs % 1000),
-						average_msecs / 1000, (int) (average_msecs % 1000),
-						(int) (PrevCheckPointDistance / 1024.0),
-						(int) (CheckPointDistanceEstimate / 1024.0))));
+		appendStringInfo(&logmsg,
+						_("checkpoint complete: wrote %d buffers (%.1f%%); "
+						  "%d WAL file(s) added, %d removed, %d recycled; "
+						  "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s; "
+						  "sync files=%d, longest=%ld.%03d s, average=%ld.%03d s; "
+						  "distance=%d kB, estimate=%d kB"),
+						  CheckpointStats.ckpt_bufs_written,
+						  (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
+						  CheckpointStats.ckpt_segs_added,
+						  CheckpointStats.ckpt_segs_removed,
+						  CheckpointStats.ckpt_segs_recycled,
+						  write_msecs / 1000, (int) (write_msecs % 1000),
+						  sync_msecs / 1000, (int) (sync_msecs % 1000),
+						  total_msecs / 1000, (int) (total_msecs % 1000),
+						  CheckpointStats.ckpt_sync_rels,
+						  longest_msecs / 1000, (int) (longest_msecs % 1000),
+						  average_msecs / 1000, (int) (average_msecs % 1000),
+						  (int) (PrevCheckPointDistance / 1024.0),
+						  (int) (CheckPointDistanceEstimate / 1024.0));
+
+	if (CheckpointStats.repl_snap_files_rmvd_cnt > 0)
+	{
+		long t_msecs;
+
+		t_msecs = TimestampDifferenceMilliseconds(CheckpointStats.repl_snap_start_t,
+												  CheckpointStats.repl_snap_end_t);
+
+		appendStringInfo(&logmsg,
+						_("; logical decoding snapshot file(s) removed=" UINT64_FORMAT ", time=%ld.%03d s, cutoff LSN=%X/%X"),
+						  CheckpointStats.repl_snap_files_rmvd_cnt,
+						  t_msecs / 1000, (int) (t_msecs % 1000),
+						  LSN_FORMAT_ARGS(CheckpointStats.repl_snap_cutoff_lsn));
+	}
+
+	if (CheckpointStats.repl_map_files_rmvd_cnt ||
+		CheckpointStats.repl_map_files_syncd_cnt > 0)
+	{
+		long t_msecs;
+
+		t_msecs = TimestampDifferenceMilliseconds(CheckpointStats.repl_snap_start_t,
+												  CheckpointStats.repl_snap_end_t);
+
+		appendStringInfo(&logmsg,
+						_("; logical decoding rewrite mapping file(s) removed=" UINT64_FORMAT ", synced=" UINT64_FORMAT ", time=%ld.%03d s, cutoff LSN=%X/%X"),
+						  CheckpointStats.repl_map_files_rmvd_cnt,
+						  CheckpointStats.repl_map_files_syncd_cnt,
+						  t_msecs / 1000, (int) (t_msecs % 1000),
+						  LSN_FORMAT_ARGS(CheckpointStats.repl_map_cutoff_lsn));
+	}
+
+	ereport(LOG, errmsg_internal("%s", logmsg.data));
+	pfree(logmsg.data);
 }
 
 /*
@@ -6811,8 +6847,15 @@ CheckPointGuts(XLogRecPtr checkPointRedo, int flags)
 {
 	CheckPointRelationMap();
 	CheckPointReplicationSlots();
+
+	CheckpointStats.repl_snap_start_t = GetCurrentTimestamp();
 	CheckPointSnapBuild();
+	CheckpointStats.repl_snap_end_t = GetCurrentTimestamp();
+
+	CheckpointStats.repl_map_start_t = GetCurrentTimestamp();
 	CheckPointLogicalRewriteHeap();
+	CheckpointStats.repl_map_end_t = GetCurrentTimestamp();
+
 	CheckPointReplicationOrigin();
 
 	/* Write out all dirty data in SLRUs and the main buffer pool */
diff --git a/src/backend/replication/logical/snapbuild.c b/src/backend/replication/logical/snapbuild.c
index 83fca8a77d..e652acfbe6 100644
--- a/src/backend/replication/logical/snapbuild.c
+++ b/src/backend/replication/logical/snapbuild.c
@@ -1941,6 +1941,8 @@ CheckPointSnapBuild(void)
 	if (redo < cutoff)
 		cutoff = redo;
 
+	CheckpointStats.repl_snap_cutoff_lsn = cutoff;
+
 	snap_dir = AllocateDir("pg_logical/snapshots");
 	while ((snap_de = ReadDir(snap_dir, "pg_logical/snapshots")) != NULL)
 	{
@@ -1997,6 +1999,8 @@ CheckPointSnapBuild(void)
 								path)));
 				continue;
 			}
+
+			CheckpointStats.repl_snap_files_rmvd_cnt++;
 		}
 	}
 	FreeDir(snap_dir);
diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h
index 09f6464331..32f8116388 100644
--- a/src/include/access/xlog.h
+++ b/src/include/access/xlog.h
@@ -172,6 +172,19 @@ typedef struct CheckpointStatsData
 									 * times, which is not necessarily the
 									 * same as the total elapsed time for the
 									 * entire sync phase. */
+
+	/* Statistics of snapshot files under "pg_logical/snapshots" */
+	uint64 repl_snap_files_rmvd_cnt;
+	XLogRecPtr repl_snap_cutoff_lsn;
+	TimestampTz	repl_snap_start_t;
+	TimestampTz	repl_snap_end_t;
+
+	/* Statistics of map files under "pg_logical/mappings" */
+	uint64 repl_map_files_syncd_cnt;
+	uint64 repl_map_files_rmvd_cnt;
+	XLogRecPtr repl_map_cutoff_lsn;
+	TimestampTz	repl_map_start_t;
+	TimestampTz	repl_map_end_t;
 } CheckpointStatsData;
 
 extern CheckpointStatsData CheckpointStats;
-- 
2.25.1

Reply via email to