On Wed, Aug 17, 2022 at 2:52 AM Nathan Bossart <nathandboss...@gmail.com> wrote: > > On Tue, Jul 19, 2022 at 05:29:10PM +0530, Bharath Rupireddy wrote: > > I've also added the total number of WAL files a checkpoint has > > processed (scanned the pg_wal directory) while removing old WAL files. > > This helps to estimate the pg_wal disk space at the time of a > > particular checkpoint, especially useful for debugging issues. > > І don't think it's clear what "processed" means here. In any case, I think > this part belongs in a separate patch or maybe even a new thread.
Agreed. PSA v11 patch. -- Bharath Rupireddy RDS Open Source Databases: https://aws.amazon.com/rds/postgresql/
From f5e0761835e863dd5b308756b262f897c99938b2 Mon Sep 17 00:00:00 2001 From: Bharath Rupireddy <bharath.rupireddyforpostgres@gmail.com> Date: Wed, 17 Aug 2022 05:45:00 +0000 Subject: [PATCH v11] Add time taken for processing logical decoding files to checkpoint log At times, there can be many snapshot and mapping files under pg_logical dir that the checkpoint might have to delete/fsync based on the cutoff LSN which can increase the checkpoint time. Add stats related to these files to better understand the delays or time spent by the checkpointer processing them. --- src/backend/access/transam/xlog.c | 22 +++++++++++++++++----- src/include/access/xlog.h | 4 ++++ 2 files changed, 21 insertions(+), 5 deletions(-) diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 9cedd6876f..251b73a4f0 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -6090,7 +6090,8 @@ LogCheckpointEnd(bool restartpoint) sync_msecs, total_msecs, longest_msecs, - average_msecs; + average_msecs, + l_dec_ops_msecs; uint64 average_sync_time; CheckpointStats.ckpt_end_t = GetCurrentTimestamp(); @@ -6127,6 +6128,9 @@ LogCheckpointEnd(bool restartpoint) CheckpointStats.ckpt_sync_rels; average_msecs = (long) ((average_sync_time + 999) / 1000); + l_dec_ops_msecs = TimestampDifferenceMilliseconds(CheckpointStats.l_dec_ops_start_t, + CheckpointStats.l_dec_ops_end_t); + /* * ControlFileLock is not required to see ControlFile->checkPoint and * ->checkPointCopy here as we are the only updator of those variables at @@ -6139,7 +6143,8 @@ LogCheckpointEnd(bool restartpoint) "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s; " "sync files=%d, longest=%ld.%03d s, average=%ld.%03d s; " "distance=%d kB, estimate=%d kB; " - "lsn=%X/%X, redo lsn=%X/%X", + "lsn=%X/%X, redo lsn=%X/%X; " + "logical decoding file(s) processing time=%ld.%03d s", CheckpointStats.ckpt_bufs_written, (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers, CheckpointStats.ckpt_segs_added, @@ -6154,7 +6159,8 @@ LogCheckpointEnd(bool restartpoint) (int) (PrevCheckPointDistance / 1024.0), (int) (CheckPointDistanceEstimate / 1024.0), LSN_FORMAT_ARGS(ControlFile->checkPoint), - LSN_FORMAT_ARGS(ControlFile->checkPointCopy.redo)))); + LSN_FORMAT_ARGS(ControlFile->checkPointCopy.redo), + l_dec_ops_msecs / 1000, (int) (l_dec_ops_msecs % 1000)))); else ereport(LOG, (errmsg("checkpoint complete: wrote %d buffers (%.1f%%); " @@ -6162,7 +6168,8 @@ LogCheckpointEnd(bool restartpoint) "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s; " "sync files=%d, longest=%ld.%03d s, average=%ld.%03d s; " "distance=%d kB, estimate=%d kB; " - "lsn=%X/%X, redo lsn=%X/%X", + "lsn=%X/%X, redo lsn=%X/%X; " + "logical decoding file(s) processing time=%ld.%03d s", CheckpointStats.ckpt_bufs_written, (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers, CheckpointStats.ckpt_segs_added, @@ -6177,7 +6184,8 @@ LogCheckpointEnd(bool restartpoint) (int) (PrevCheckPointDistance / 1024.0), (int) (CheckPointDistanceEstimate / 1024.0), LSN_FORMAT_ARGS(ControlFile->checkPoint), - LSN_FORMAT_ARGS(ControlFile->checkPointCopy.redo)))); + LSN_FORMAT_ARGS(ControlFile->checkPointCopy.redo), + l_dec_ops_msecs / 1000, (int) (l_dec_ops_msecs % 1000)))); } /* @@ -6846,8 +6854,12 @@ CheckPointGuts(XLogRecPtr checkPointRedo, int flags) { CheckPointRelationMap(); CheckPointReplicationSlots(); + + CheckpointStats.l_dec_ops_start_t = GetCurrentTimestamp(); CheckPointSnapBuild(); CheckPointLogicalRewriteHeap(); + CheckpointStats.l_dec_ops_end_t = GetCurrentTimestamp(); + CheckPointReplicationOrigin(); /* Write out all dirty data in SLRUs and the main buffer pool */ diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h index cd674c3c23..c71aca8534 100644 --- a/src/include/access/xlog.h +++ b/src/include/access/xlog.h @@ -173,6 +173,10 @@ typedef struct CheckpointStatsData * times, which is not necessarily the * same as the total elapsed time for the * entire sync phase. */ + + /* start and end timestamps of logical decoding file processing */ + TimestampTz l_dec_ops_start_t; + TimestampTz l_dec_ops_end_t; } CheckpointStatsData; extern PGDLLIMPORT CheckpointStatsData CheckpointStats; -- 2.34.1