On Fri, Jan 24, 2025 at 06:29:46PM +0300, Nazir Bilal Yavuz wrote: > By saying that, this does not solve the problem; there is still a > difference although you omit WAL initialization stats from the > pg_stat_io.
Reattaching the remaining patches to make the cfbot happy, as 0001 has been applied as 87a6690cc695. - if (track_wal_io_timing) - { - instr_time end; - - INSTR_TIME_SET_CURRENT(end); - INSTR_TIME_ACCUM_DIFF(PendingWalStats.wal_write_time, end, start); - } In 0002, you are removing PendingWalStats.wal_write_time, which does not seem OK to me because we should still aggregate this data for track_wal_io_timing, no? -- Michael
From 63d5d25c562fbe42e9c9d80d624edc0d1c2d92d9 Mon Sep 17 00:00:00 2001 From: Nazir Bilal Yavuz <byavu...@gmail.com> Date: Thu, 23 Jan 2025 10:45:23 +0300 Subject: [PATCH v10 2/3] Add WAL I/O stats to both pg_stat_io view and per backend I/O statistics This commit adds WAL I/O stats to both pg_stat_io view and per backend I/O statistics (pg_stat_get_backend_io()). This commit introduces a three new I/O concepts: - IOObject IOOBJECT_WAL is used for tracking all WAL I/Os. - IOOBJECT_WAL / IOCONTEXT_NORMAL is used for tracking I/O operations done on already created wal segments. - IOOBJECT_WAL / IOCONTEXT_INIT is used for tracking I/O operations done while creating the WAL segments. For now 027_stream_regress.pl -> 'check contents of pg_stat_statements on regression database' test fails. It is because walsenders may shutdown after the checkpointer. There is ongoing discussion to fix that problem: postgr.es/m/flat/kgng5nrvnlv335evmsuvpnh354rw7qyazl73kdysev2cr2v5zu%40m3cfzxicm5kp --- src/include/pgstat.h | 4 +- src/backend/access/transam/xlog.c | 57 ++++++-------- src/backend/access/transam/xlogreader.c | 8 ++ src/backend/access/transam/xlogrecovery.c | 14 ++++ src/backend/utils/activity/pgstat_backend.c | 9 ++- src/backend/utils/activity/pgstat_io.c | 84 ++++++++++++++++++--- src/test/regress/expected/stats.out | 53 +++++++++++++ src/test/regress/sql/stats.sql | 25 ++++++ doc/src/sgml/monitoring.sgml | 19 ++++- 9 files changed, 221 insertions(+), 52 deletions(-) diff --git a/src/include/pgstat.h b/src/include/pgstat.h index d0d45150977..0a456858c9c 100644 --- a/src/include/pgstat.h +++ b/src/include/pgstat.h @@ -274,14 +274,16 @@ typedef enum IOObject { IOOBJECT_RELATION, IOOBJECT_TEMP_RELATION, + IOOBJECT_WAL, } IOObject; -#define IOOBJECT_NUM_TYPES (IOOBJECT_TEMP_RELATION + 1) +#define IOOBJECT_NUM_TYPES (IOOBJECT_WAL + 1) typedef enum IOContext { IOCONTEXT_BULKREAD, IOCONTEXT_BULKWRITE, + IOCONTEXT_INIT, IOCONTEXT_NORMAL, IOCONTEXT_VACUUM, } IOContext; diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index bf3dbda901d..ee26e9739bd 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -2425,38 +2425,23 @@ XLogWrite(XLogwrtRqst WriteRqst, TimeLineID tli, bool flexible) Size nbytes; Size nleft; ssize_t written; - instr_time start; + instr_time io_start; /* OK to write the page(s) */ from = XLogCtl->pages + startidx * (Size) XLOG_BLCKSZ; nbytes = npages * (Size) XLOG_BLCKSZ; nleft = nbytes; + + io_start = pgstat_prepare_io_time(track_wal_io_timing); + do { errno = 0; - /* Measure I/O timing to write WAL data */ - if (track_wal_io_timing) - INSTR_TIME_SET_CURRENT(start); - else - INSTR_TIME_SET_ZERO(start); - pgstat_report_wait_start(WAIT_EVENT_WAL_WRITE); written = pg_pwrite(openLogFile, from, nleft, startoffset); pgstat_report_wait_end(); - /* - * Increment the I/O timing and the number of times WAL data - * were written out to disk. - */ - if (track_wal_io_timing) - { - instr_time end; - - INSTR_TIME_SET_CURRENT(end); - INSTR_TIME_ACCUM_DIFF(PendingWalStats.wal_write_time, end, start); - } - PendingWalStats.wal_write++; if (written <= 0) @@ -2481,6 +2466,9 @@ XLogWrite(XLogwrtRqst WriteRqst, TimeLineID tli, bool flexible) startoffset += written; } while (nleft > 0); + pgstat_count_io_op_time(IOOBJECT_WAL, IOCONTEXT_NORMAL, + IOOP_WRITE, io_start, npages, npages * XLOG_BLCKSZ); + npages = 0; /* @@ -3216,6 +3204,7 @@ XLogFileInitInternal(XLogSegNo logsegno, TimeLineID logtli, int fd; int save_errno; int open_flags = O_RDWR | O_CREAT | O_EXCL | PG_BINARY; + instr_time io_start; Assert(logtli != 0); @@ -3259,6 +3248,8 @@ XLogFileInitInternal(XLogSegNo logsegno, TimeLineID logtli, (errcode_for_file_access(), errmsg("could not create file \"%s\": %m", tmppath))); + io_start = pgstat_prepare_io_time(track_wal_io_timing); + pgstat_report_wait_start(WAIT_EVENT_WAL_INIT_WRITE); save_errno = 0; if (wal_init_zero) @@ -3294,6 +3285,9 @@ XLogFileInitInternal(XLogSegNo logsegno, TimeLineID logtli, } pgstat_report_wait_end(); + pgstat_count_io_op_time(IOOBJECT_WAL, IOCONTEXT_INIT, IOOP_WRITE, + io_start, 1, wal_segment_size); + if (save_errno) { /* @@ -3310,6 +3304,8 @@ XLogFileInitInternal(XLogSegNo logsegno, TimeLineID logtli, errmsg("could not write to file \"%s\": %m", tmppath))); } + io_start = pgstat_prepare_io_time(track_wal_io_timing); + pgstat_report_wait_start(WAIT_EVENT_WAL_INIT_SYNC); if (pg_fsync(fd) != 0) { @@ -3322,6 +3318,9 @@ XLogFileInitInternal(XLogSegNo logsegno, TimeLineID logtli, } pgstat_report_wait_end(); + pgstat_count_io_op_time(IOOBJECT_WAL, IOCONTEXT_INIT, + IOOP_FSYNC, io_start, 1, 0); + if (close(fd) != 0) ereport(ERROR, (errcode_for_file_access(), @@ -8683,7 +8682,7 @@ void issue_xlog_fsync(int fd, XLogSegNo segno, TimeLineID tli) { char *msg = NULL; - instr_time start; + instr_time io_start; Assert(tli != 0); @@ -8696,11 +8695,7 @@ issue_xlog_fsync(int fd, XLogSegNo segno, TimeLineID tli) wal_sync_method == WAL_SYNC_METHOD_OPEN_DSYNC) return; - /* Measure I/O timing to sync the WAL file */ - if (track_wal_io_timing) - INSTR_TIME_SET_CURRENT(start); - else - INSTR_TIME_SET_ZERO(start); + io_start = pgstat_prepare_io_time(track_wal_io_timing); pgstat_report_wait_start(WAIT_EVENT_WAL_SYNC); switch (wal_sync_method) @@ -8746,16 +8741,8 @@ issue_xlog_fsync(int fd, XLogSegNo segno, TimeLineID tli) pgstat_report_wait_end(); - /* - * Increment the I/O timing and the number of times WAL files were synced. - */ - if (track_wal_io_timing) - { - instr_time end; - - INSTR_TIME_SET_CURRENT(end); - INSTR_TIME_ACCUM_DIFF(PendingWalStats.wal_sync_time, end, start); - } + pgstat_count_io_op_time(IOOBJECT_WAL, IOCONTEXT_NORMAL, IOOP_FSYNC, + io_start, 1, 0); PendingWalStats.wal_sync++; } diff --git a/src/backend/access/transam/xlogreader.c b/src/backend/access/transam/xlogreader.c index 91b6a91767d..e8d97cf0732 100644 --- a/src/backend/access/transam/xlogreader.c +++ b/src/backend/access/transam/xlogreader.c @@ -1507,6 +1507,9 @@ WALRead(XLogReaderState *state, char *p; XLogRecPtr recptr; Size nbytes; +#ifndef FRONTEND + instr_time io_start; +#endif p = buf; recptr = startptr; @@ -1552,6 +1555,8 @@ WALRead(XLogReaderState *state, segbytes = nbytes; #ifndef FRONTEND + io_start = pgstat_prepare_io_time(track_wal_io_timing); + pgstat_report_wait_start(WAIT_EVENT_WAL_READ); #endif @@ -1561,6 +1566,9 @@ WALRead(XLogReaderState *state, #ifndef FRONTEND pgstat_report_wait_end(); + + pgstat_count_io_op_time(IOOBJECT_WAL, IOCONTEXT_NORMAL, IOOP_READ, + io_start, 1, readbytes); #endif if (readbytes <= 0) diff --git a/src/backend/access/transam/xlogrecovery.c b/src/backend/access/transam/xlogrecovery.c index cf2b007806f..91fa72fadcc 100644 --- a/src/backend/access/transam/xlogrecovery.c +++ b/src/backend/access/transam/xlogrecovery.c @@ -60,6 +60,7 @@ #include "utils/datetime.h" #include "utils/fmgrprotos.h" #include "utils/guc_hooks.h" +#include "utils/pgstat_internal.h" #include "utils/pg_lsn.h" #include "utils/ps_status.h" #include "utils/pg_rusage.h" @@ -1836,6 +1837,9 @@ PerformWalRecovery(void) * end of main redo apply loop */ + /* Report pending statistics to the cumulative stats system */ + pgstat_flush_io(false); + if (reachedRecoveryTarget) { if (!reachedConsistency) @@ -3306,6 +3310,7 @@ XLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, int reqLen, uint32 targetPageOff; XLogSegNo targetSegNo PG_USED_FOR_ASSERTS_ONLY; int r; + instr_time io_start; XLByteToSeg(targetPagePtr, targetSegNo, wal_segment_size); targetPageOff = XLogSegmentOffset(targetPagePtr, wal_segment_size); @@ -3398,6 +3403,8 @@ retry: /* Read the requested page */ readOff = targetPageOff; + io_start = pgstat_prepare_io_time(track_wal_io_timing); + pgstat_report_wait_start(WAIT_EVENT_WAL_READ); r = pg_pread(readFile, readBuf, XLOG_BLCKSZ, (off_t) readOff); if (r != XLOG_BLCKSZ) @@ -3406,6 +3413,10 @@ retry: int save_errno = errno; pgstat_report_wait_end(); + + pgstat_count_io_op_time(IOOBJECT_WAL, IOCONTEXT_NORMAL, IOOP_READ, + io_start, 1, r); + XLogFileName(fname, curFileTLI, readSegNo, wal_segment_size); if (r < 0) { @@ -3426,6 +3437,9 @@ retry: } pgstat_report_wait_end(); + pgstat_count_io_op_time(IOOBJECT_WAL, IOCONTEXT_NORMAL, IOOP_READ, + io_start, 1, r); + Assert(targetSegNo == readSegNo); Assert(targetPageOff == readOff); Assert(reqLen <= readLen); diff --git a/src/backend/utils/activity/pgstat_backend.c b/src/backend/utils/activity/pgstat_backend.c index bcf9e4b1487..338da73a9a9 100644 --- a/src/backend/utils/activity/pgstat_backend.c +++ b/src/backend/utils/activity/pgstat_backend.c @@ -24,6 +24,7 @@ #include "postgres.h" +#include "access/xlog.h" #include "storage/bufmgr.h" #include "utils/memutils.h" #include "utils/pgstat_internal.h" @@ -43,7 +44,7 @@ void pgstat_count_backend_io_op_time(IOObject io_object, IOContext io_context, IOOp io_op, instr_time io_time) { - Assert(track_io_timing); + Assert(track_io_timing || track_wal_io_timing); if (!pgstat_tracks_backend_bktype(MyBackendType)) return; @@ -236,9 +237,6 @@ pgstat_tracks_backend_bktype(BackendType bktype) case B_DEAD_END_BACKEND: case B_ARCHIVER: case B_LOGGER: - case B_WAL_RECEIVER: - case B_WAL_WRITER: - case B_WAL_SUMMARIZER: case B_BG_WRITER: case B_CHECKPOINTER: case B_STARTUP: @@ -249,7 +247,10 @@ pgstat_tracks_backend_bktype(BackendType bktype) case B_BG_WORKER: case B_STANDALONE_BACKEND: case B_SLOTSYNC_WORKER: + case B_WAL_RECEIVER: case B_WAL_SENDER: + case B_WAL_SUMMARIZER: + case B_WAL_WRITER: return true; } diff --git a/src/backend/utils/activity/pgstat_io.c b/src/backend/utils/activity/pgstat_io.c index 6ff5d9e96a1..17924759e90 100644 --- a/src/backend/utils/activity/pgstat_io.c +++ b/src/backend/utils/activity/pgstat_io.c @@ -16,6 +16,7 @@ #include "postgres.h" +#include "access/xlog.h" #include "executor/instrument.h" #include "storage/bufmgr.h" #include "utils/pgstat_internal.h" @@ -105,6 +106,20 @@ pgstat_prepare_io_time(bool track_io_guc) return io_start; } +/* + * Decide if IO timings need to be tracked. Timings associated to + * IOOBJECT_WAL objects are tracked if track_wal_io_timing is enabled, + * else rely on track_io_timing. + */ +static bool +pgstat_should_track_io_time(IOObject io_object) +{ + if (io_object == IOOBJECT_WAL) + return track_wal_io_timing; + + return track_io_timing; +} + /* * Like pgstat_count_io_op() except it also accumulates time. */ @@ -112,14 +127,24 @@ void pgstat_count_io_op_time(IOObject io_object, IOContext io_context, IOOp io_op, instr_time start_time, uint32 cnt, uint64 bytes) { - if (track_io_timing) + /* + * Accumulate timing data. pgstat_count_buffer is for pgstat_database. As + * pg_stat_database only counts blk_read_time and blk_write_time, it is + * set for IOOP_READ, IOOP_WRITE and IOOP_EXTEND. + * + * pgBufferUsage is for EXPLAIN. pgBufferUsage has write and read stats + * for shared, local and temporary blocks. Temporary blocks are ignored + * here. + */ + if (pgstat_should_track_io_time(io_object)) { instr_time io_time; INSTR_TIME_SET_CURRENT(io_time); INSTR_TIME_SUBTRACT(io_time, start_time); - if (io_op == IOOP_WRITE || io_op == IOOP_EXTEND) + if (io_object != IOOBJECT_WAL && + (io_op == IOOP_WRITE || io_op == IOOP_EXTEND)) { pgstat_count_buffer_write_time(INSTR_TIME_GET_MICROSEC(io_time)); if (io_object == IOOBJECT_RELATION) @@ -127,7 +152,7 @@ pgstat_count_io_op_time(IOObject io_object, IOContext io_context, IOOp io_op, else if (io_object == IOOBJECT_TEMP_RELATION) INSTR_TIME_ADD(pgBufferUsage.local_blk_write_time, io_time); } - else if (io_op == IOOP_READ) + else if (io_object != IOOBJECT_WAL && io_op == IOOP_READ) { pgstat_count_buffer_read_time(INSTR_TIME_GET_MICROSEC(io_time)); if (io_object == IOOBJECT_RELATION) @@ -241,6 +266,8 @@ pgstat_get_io_context_name(IOContext io_context) return "bulkread"; case IOCONTEXT_BULKWRITE: return "bulkwrite"; + case IOCONTEXT_INIT: + return "init"; case IOCONTEXT_NORMAL: return "normal"; case IOCONTEXT_VACUUM: @@ -260,6 +287,8 @@ pgstat_get_io_object_name(IOObject io_object) return "relation"; case IOOBJECT_TEMP_RELATION: return "temp relation"; + case IOOBJECT_WAL: + return "wal"; } elog(ERROR, "unrecognized IOObject value: %d", io_object); @@ -332,8 +361,6 @@ pgstat_io_snapshot_cb(void) * - Syslogger because it is not connected to shared memory * - Archiver because most relevant archiving IO is delegated to a * specialized command or module -* - WAL Receiver, WAL Writer, and WAL Summarizer IO are not tracked in -* pg_stat_io for now * * Function returns true if BackendType participates in the cumulative stats * subsystem for IO and false if it does not. @@ -354,9 +381,6 @@ pgstat_tracks_io_bktype(BackendType bktype) case B_DEAD_END_BACKEND: case B_ARCHIVER: case B_LOGGER: - case B_WAL_RECEIVER: - case B_WAL_WRITER: - case B_WAL_SUMMARIZER: return false; case B_AUTOVAC_LAUNCHER: @@ -368,7 +392,10 @@ pgstat_tracks_io_bktype(BackendType bktype) case B_SLOTSYNC_WORKER: case B_STANDALONE_BACKEND: case B_STARTUP: + case B_WAL_RECEIVER: case B_WAL_SENDER: + case B_WAL_SUMMARIZER: + case B_WAL_WRITER: return true; } @@ -394,6 +421,15 @@ pgstat_tracks_io_object(BackendType bktype, IOObject io_object, if (!pgstat_tracks_io_bktype(bktype)) return false; + /* + * Currently, IO on IOOBJECT_WAL objects can only occur in the + * IOCONTEXT_NORMAL and IOCONTEXT_INIT IOContexts. + */ + if (io_object == IOOBJECT_WAL && + (io_context != IOCONTEXT_NORMAL && + io_context != IOCONTEXT_INIT)) + return false; + /* * Currently, IO on temporary relations can only occur in the * IOCONTEXT_NORMAL IOContext. @@ -460,14 +496,28 @@ pgstat_tracks_io_op(BackendType bktype, IOObject io_object, /* * Some BackendTypes will not do certain IOOps. */ - if ((bktype == B_BG_WRITER || bktype == B_CHECKPOINTER) && + if ((bktype == B_BG_WRITER) && (io_op == IOOP_READ || io_op == IOOP_EVICT || io_op == IOOP_HIT)) return false; + if ((bktype == B_CHECKPOINTER) && + ((io_object != IOOBJECT_WAL && io_op == IOOP_READ) || + (io_op == IOOP_EVICT || io_op == IOOP_HIT))) + return false; + if ((bktype == B_AUTOVAC_LAUNCHER || bktype == B_BG_WRITER || bktype == B_CHECKPOINTER) && io_op == IOOP_EXTEND) return false; + /* + * Some BackendTypes don't do reads with IOOBJECT_WAL. + */ + if (io_object == IOOBJECT_WAL && io_op == IOOP_READ && + (bktype == B_WAL_RECEIVER || bktype == B_BG_WRITER || + bktype == B_AUTOVAC_WORKER || bktype == B_AUTOVAC_WORKER || + bktype == B_WAL_WRITER)) + return false; + /* * Temporary tables are not logged and thus do not require fsync'ing. * Writeback is not requested for temporary tables. @@ -492,6 +542,22 @@ pgstat_tracks_io_op(BackendType bktype, IOObject io_object, if (!strategy_io_context && io_op == IOOP_REUSE) return false; + /* + * An IOCONTEXT_INIT done for an IOOBJECT_WAL io_object does writes and + * syncs. + */ + if (io_object == IOOBJECT_WAL && io_context == IOCONTEXT_INIT && + !(io_op == IOOP_WRITE || io_op == IOOP_FSYNC)) + return false; + + /* + * An IOCONTEXT_NORMAL done for an IOOBJECT_WAL io_object does writes, + * reads and syncs. + */ + if (io_object == IOOBJECT_WAL && io_context == IOCONTEXT_NORMAL && + !(io_op == IOOP_WRITE || io_op == IOOP_READ || io_op == IOOP_FSYNC)) + return false; + /* * IOOP_FSYNC IOOps done by a backend using a BufferAccessStrategy are * counted in the IOCONTEXT_NORMAL IOContext. See comment in diff --git a/src/test/regress/expected/stats.out b/src/test/regress/expected/stats.out index a0317b7208e..7dfcaf03b78 100644 --- a/src/test/regress/expected/stats.out +++ b/src/test/regress/expected/stats.out @@ -862,6 +862,33 @@ WHERE pg_stat_get_backend_pid(beid) = pg_backend_pid(); t (1 row) +-- Test pg_stat_io for WAL in an init context, that should do writes +-- and syncs. +SELECT sum(writes) AS writes, sum(fsyncs) AS fsyncs + FROM pg_stat_io + WHERE context = 'init' and object = 'wal' \gset io_sum_wal_init_ +SELECT :io_sum_wal_init_writes > 0; + ?column? +---------- + t +(1 row) + +SELECT current_setting('fsync') = 'off' + OR :io_sum_wal_init_fsyncs > 0; + ?column? +---------- + t +(1 row) + +-- Test pg_stat_io for WAL in a normal context, that should do reads as well. +SELECT SUM(reads) > 0 + FROM pg_stat_io + WHERE context = 'normal' and object = 'wal'; + ?column? +---------- + t +(1 row) + ----- -- Test that resetting stats works for reset timestamp ----- @@ -1256,6 +1283,7 @@ SELECT pg_stat_get_subscription_stats(NULL); -- - extends of relations using shared buffers -- - fsyncs done to ensure the durability of data dirtying shared buffers -- - shared buffer hits +-- - WAL writes and fsyncs in IOContext IOCONTEXT_NORMAL -- There is no test for blocks evicted from shared buffers, because we cannot -- be sure of the state of shared buffers at the point the test is run. -- Create a regular table and insert some data to generate IOCONTEXT_NORMAL @@ -1273,6 +1301,9 @@ SELECT sum(writes) AS writes, sum(fsyncs) AS fsyncs SELECT sum(writes) AS writes, sum(fsyncs) AS fsyncs FROM pg_stat_get_backend_io(pg_backend_pid()) WHERE object = 'relation' \gset my_io_sum_shared_before_ +SELECT sum(writes) AS writes, sum(fsyncs) AS fsyncs + FROM pg_stat_io + WHERE context = 'normal' and object = 'wal' \gset io_sum_wal_normal_before_ CREATE TABLE test_io_shared(a int); INSERT INTO test_io_shared SELECT i FROM generate_series(1,100)i; SELECT pg_stat_force_next_flush(); @@ -1335,6 +1366,28 @@ SELECT current_setting('fsync') = 'off' t (1 row) +SELECT sum(writes) AS writes, sum(fsyncs) AS fsyncs + FROM pg_stat_io + WHERE context = 'normal' and object = 'wal' \gset io_sum_wal_normal_after_ +SELECT current_setting('synchronous_commit') = 'on'; + ?column? +---------- + t +(1 row) + +SELECT :io_sum_wal_normal_after_writes > :io_sum_wal_normal_before_writes; + ?column? +---------- + t +(1 row) + +SELECT current_setting('fsync') = 'off' + OR :io_sum_wal_normal_after_fsyncs > :io_sum_wal_normal_before_fsyncs; + ?column? +---------- + t +(1 row) + -- Change the tablespace so that the table is rewritten directly, then SELECT -- from it to cause it to be read back into shared buffers. SELECT sum(reads) AS io_sum_shared_before_reads diff --git a/src/test/regress/sql/stats.sql b/src/test/regress/sql/stats.sql index 399c72bbcf7..9e368637199 100644 --- a/src/test/regress/sql/stats.sql +++ b/src/test/regress/sql/stats.sql @@ -442,6 +442,20 @@ SELECT (current_schemas(true))[1] = ('pg_temp_' || beid::text) AS match FROM pg_stat_get_backend_idset() beid WHERE pg_stat_get_backend_pid(beid) = pg_backend_pid(); +-- Test pg_stat_io for WAL in an init context, that should do writes +-- and syncs. +SELECT sum(writes) AS writes, sum(fsyncs) AS fsyncs + FROM pg_stat_io + WHERE context = 'init' and object = 'wal' \gset io_sum_wal_init_ +SELECT :io_sum_wal_init_writes > 0; +SELECT current_setting('fsync') = 'off' + OR :io_sum_wal_init_fsyncs > 0; + +-- Test pg_stat_io for WAL in a normal context, that should do reads as well. +SELECT SUM(reads) > 0 + FROM pg_stat_io + WHERE context = 'normal' and object = 'wal'; + ----- -- Test that resetting stats works for reset timestamp ----- @@ -602,6 +616,7 @@ SELECT pg_stat_get_subscription_stats(NULL); -- - extends of relations using shared buffers -- - fsyncs done to ensure the durability of data dirtying shared buffers -- - shared buffer hits +-- - WAL writes and fsyncs in IOContext IOCONTEXT_NORMAL -- There is no test for blocks evicted from shared buffers, because we cannot -- be sure of the state of shared buffers at the point the test is run. @@ -621,6 +636,9 @@ SELECT sum(writes) AS writes, sum(fsyncs) AS fsyncs SELECT sum(writes) AS writes, sum(fsyncs) AS fsyncs FROM pg_stat_get_backend_io(pg_backend_pid()) WHERE object = 'relation' \gset my_io_sum_shared_before_ +SELECT sum(writes) AS writes, sum(fsyncs) AS fsyncs + FROM pg_stat_io + WHERE context = 'normal' and object = 'wal' \gset io_sum_wal_normal_before_ CREATE TABLE test_io_shared(a int); INSERT INTO test_io_shared SELECT i FROM generate_series(1,100)i; SELECT pg_stat_force_next_flush(); @@ -649,6 +667,13 @@ SELECT sum(writes) AS writes, sum(fsyncs) AS fsyncs SELECT :my_io_sum_shared_after_writes >= :my_io_sum_shared_before_writes; SELECT current_setting('fsync') = 'off' OR :my_io_sum_shared_after_fsyncs >= :my_io_sum_shared_before_fsyncs; +SELECT sum(writes) AS writes, sum(fsyncs) AS fsyncs + FROM pg_stat_io + WHERE context = 'normal' and object = 'wal' \gset io_sum_wal_normal_after_ +SELECT current_setting('synchronous_commit') = 'on'; +SELECT :io_sum_wal_normal_after_writes > :io_sum_wal_normal_before_writes; +SELECT current_setting('fsync') = 'off' + OR :io_sum_wal_normal_after_fsyncs > :io_sum_wal_normal_before_fsyncs; -- Change the tablespace so that the table is rewritten directly, then SELECT -- from it to cause it to be read back into shared buffers. diff --git a/doc/src/sgml/monitoring.sgml b/doc/src/sgml/monitoring.sgml index e5888fae2b5..c6f2c384512 100644 --- a/doc/src/sgml/monitoring.sgml +++ b/doc/src/sgml/monitoring.sgml @@ -2581,9 +2581,10 @@ description | Waiting for a newly initialized WAL file to reach durable storage </para> <para> - Currently, I/O on relations (e.g. tables, indexes) is tracked. However, - relation I/O which bypasses shared buffers (e.g. when moving a table from one - tablespace to another) is currently not tracked. + Currently, I/O on relations (e.g. tables, indexes) and WAL activity are + tracked. However, relation I/O which bypasses shared buffers + (e.g. when moving a table from one tablespace to another) is currently + not tracked. </para> <table id="pg-stat-io-view" xreflabel="pg_stat_io"> @@ -2636,6 +2637,11 @@ description | Waiting for a newly initialized WAL file to reach durable storage <literal>temp relation</literal>: Temporary relations. </para> </listitem> + <listitem> + <para> + <literal>wal</literal>: Write Ahead Logs. + </para> + </listitem> </itemizedlist> </para> </entry> @@ -2660,6 +2666,13 @@ description | Waiting for a newly initialized WAL file to reach durable storage <literal>normal</literal>. </para> </listitem> + <listitem> + <para> + <literal>init</literal>: I/O operations performed while creating the + WAL segments are tracked in <varname>context</varname> + <literal>init</literal>. + </para> + </listitem> <listitem> <para> <literal>vacuum</literal>: I/O operations performed outside of shared -- 2.47.2
From 0cf23ff24939406f28e021b706f36944c8c189a4 Mon Sep 17 00:00:00 2001 From: Nazir Bilal Yavuz <byavu...@gmail.com> Date: Thu, 23 Jan 2025 13:52:31 +0300 Subject: [PATCH v10 3/3] Fetch timing columns from pg_stat_io in the pg_stat_wal view In the prior commit WAL I/Os' timing stats are started to be tracked in the pg_stat_io view. So, to avoid double accounting and to have same timing values in both pg_stat_io and pg_stat_wal; fetch timing columns from pg_stat_io in the pg_stat_wal view. --- src/include/catalog/pg_proc.dat | 6 +++--- src/include/pgstat.h | 4 ---- src/backend/catalog/system_views.sql | 14 ++++++++++--- src/backend/utils/activity/pgstat_wal.c | 2 -- src/backend/utils/adt/pgstatfuncs.c | 14 +++---------- src/test/regress/expected/rules.out | 27 ++++++++++++++++--------- doc/src/sgml/monitoring.sgml | 10 ++++++--- 7 files changed, 41 insertions(+), 36 deletions(-) diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat index 2aafdbc3e93..aaf2432cc17 100644 --- a/src/include/catalog/pg_proc.dat +++ b/src/include/catalog/pg_proc.dat @@ -5928,9 +5928,9 @@ { oid => '1136', descr => 'statistics: information about WAL activity', proname => 'pg_stat_get_wal', proisstrict => 'f', provolatile => 's', proparallel => 'r', prorettype => 'record', proargtypes => '', - proallargtypes => '{int8,int8,numeric,int8,int8,int8,float8,float8,timestamptz}', - proargmodes => '{o,o,o,o,o,o,o,o,o}', - proargnames => '{wal_records,wal_fpi,wal_bytes,wal_buffers_full,wal_write,wal_sync,wal_write_time,wal_sync_time,stats_reset}', + proallargtypes => '{int8,int8,numeric,int8,int8,int8,timestamptz}', + proargmodes => '{o,o,o,o,o,o,o}', + proargnames => '{wal_records,wal_fpi,wal_bytes,wal_buffers_full,wal_write,wal_sync,stats_reset}', prosrc => 'pg_stat_get_wal' }, { oid => '6248', descr => 'statistics: information about WAL prefetching', proname => 'pg_stat_get_recovery_prefetch', prorows => '1', proretset => 't', diff --git a/src/include/pgstat.h b/src/include/pgstat.h index 0a456858c9c..faac508015e 100644 --- a/src/include/pgstat.h +++ b/src/include/pgstat.h @@ -477,8 +477,6 @@ typedef struct PgStat_WalStats PgStat_Counter wal_buffers_full; PgStat_Counter wal_write; PgStat_Counter wal_sync; - PgStat_Counter wal_write_time; - PgStat_Counter wal_sync_time; TimestampTz stat_reset_timestamp; } PgStat_WalStats; @@ -493,8 +491,6 @@ typedef struct PgStat_PendingWalStats PgStat_Counter wal_buffers_full; PgStat_Counter wal_write; PgStat_Counter wal_sync; - instr_time wal_write_time; - instr_time wal_sync_time; } PgStat_PendingWalStats; diff --git a/src/backend/catalog/system_views.sql b/src/backend/catalog/system_views.sql index 46868bf7e89..9448e1ec355 100644 --- a/src/backend/catalog/system_views.sql +++ b/src/backend/catalog/system_views.sql @@ -1180,6 +1180,13 @@ SELECT FROM pg_stat_get_io() b; CREATE VIEW pg_stat_wal AS + WITH pgsio_sum_write_fsync_time AS ( + SELECT + sum(write_time) as sum_write_time, + sum(fsync_time) as sum_fsync_time + FROM pg_stat_io + WHERE context = 'normal' and object = 'wal' + ) SELECT w.wal_records, w.wal_fpi, @@ -1187,10 +1194,11 @@ CREATE VIEW pg_stat_wal AS w.wal_buffers_full, w.wal_write, w.wal_sync, - w.wal_write_time, - w.wal_sync_time, + p.sum_write_time as write_time, + p.sum_fsync_time as fsync_time, w.stats_reset - FROM pg_stat_get_wal() w; + FROM pg_stat_get_wal() w + CROSS JOIN pgsio_sum_write_fsync_time p; CREATE VIEW pg_stat_progress_analyze AS SELECT diff --git a/src/backend/utils/activity/pgstat_wal.c b/src/backend/utils/activity/pgstat_wal.c index 18fa6b2936a..8ee650cdc30 100644 --- a/src/backend/utils/activity/pgstat_wal.c +++ b/src/backend/utils/activity/pgstat_wal.c @@ -126,8 +126,6 @@ pgstat_wal_flush_cb(bool nowait) WALSTAT_ACC(wal_buffers_full, PendingWalStats); WALSTAT_ACC(wal_write, PendingWalStats); WALSTAT_ACC(wal_sync, PendingWalStats); - WALSTAT_ACC_INSTR_TIME(wal_write_time); - WALSTAT_ACC_INSTR_TIME(wal_sync_time); #undef WALSTAT_ACC_INSTR_TIME #undef WALSTAT_ACC diff --git a/src/backend/utils/adt/pgstatfuncs.c b/src/backend/utils/adt/pgstatfuncs.c index 0f5e0a9778d..5de31979bc0 100644 --- a/src/backend/utils/adt/pgstatfuncs.c +++ b/src/backend/utils/adt/pgstatfuncs.c @@ -1609,7 +1609,7 @@ pg_stat_get_backend_io(PG_FUNCTION_ARGS) Datum pg_stat_get_wal(PG_FUNCTION_ARGS) { -#define PG_STAT_GET_WAL_COLS 9 +#define PG_STAT_GET_WAL_COLS 7 TupleDesc tupdesc; Datum values[PG_STAT_GET_WAL_COLS] = {0}; bool nulls[PG_STAT_GET_WAL_COLS] = {0}; @@ -1630,11 +1630,7 @@ pg_stat_get_wal(PG_FUNCTION_ARGS) INT8OID, -1, 0); TupleDescInitEntry(tupdesc, (AttrNumber) 6, "wal_sync", INT8OID, -1, 0); - TupleDescInitEntry(tupdesc, (AttrNumber) 7, "wal_write_time", - FLOAT8OID, -1, 0); - TupleDescInitEntry(tupdesc, (AttrNumber) 8, "wal_sync_time", - FLOAT8OID, -1, 0); - TupleDescInitEntry(tupdesc, (AttrNumber) 9, "stats_reset", + TupleDescInitEntry(tupdesc, (AttrNumber) 7, "stats_reset", TIMESTAMPTZOID, -1, 0); BlessTupleDesc(tupdesc); @@ -1657,11 +1653,7 @@ pg_stat_get_wal(PG_FUNCTION_ARGS) values[4] = Int64GetDatum(wal_stats->wal_write); values[5] = Int64GetDatum(wal_stats->wal_sync); - /* Convert counters from microsec to millisec for display */ - values[6] = Float8GetDatum(((double) wal_stats->wal_write_time) / 1000.0); - values[7] = Float8GetDatum(((double) wal_stats->wal_sync_time) / 1000.0); - - values[8] = TimestampTzGetDatum(wal_stats->stat_reset_timestamp); + values[6] = TimestampTzGetDatum(wal_stats->stat_reset_timestamp); /* Returns the record as Datum */ PG_RETURN_DATUM(HeapTupleGetDatum(heap_form_tuple(tupdesc, values, nulls))); diff --git a/src/test/regress/expected/rules.out b/src/test/regress/expected/rules.out index 856a8349c50..a15e0096d9c 100644 --- a/src/test/regress/expected/rules.out +++ b/src/test/regress/expected/rules.out @@ -2241,16 +2241,23 @@ pg_stat_user_tables| SELECT relid, autoanalyze_count FROM pg_stat_all_tables WHERE ((schemaname <> ALL (ARRAY['pg_catalog'::name, 'information_schema'::name])) AND (schemaname !~ '^pg_toast'::text)); -pg_stat_wal| SELECT wal_records, - wal_fpi, - wal_bytes, - wal_buffers_full, - wal_write, - wal_sync, - wal_write_time, - wal_sync_time, - stats_reset - FROM pg_stat_get_wal() w(wal_records, wal_fpi, wal_bytes, wal_buffers_full, wal_write, wal_sync, wal_write_time, wal_sync_time, stats_reset); +pg_stat_wal| WITH pgsio_sum_write_fsync_time AS ( + SELECT sum(pg_stat_io.write_time) AS sum_write_time, + sum(pg_stat_io.fsync_time) AS sum_fsync_time + FROM pg_stat_io + WHERE ((pg_stat_io.context = 'normal'::text) AND (pg_stat_io.object = 'wal'::text)) + ) + SELECT w.wal_records, + w.wal_fpi, + w.wal_bytes, + w.wal_buffers_full, + w.wal_write, + w.wal_sync, + p.sum_write_time AS write_time, + p.sum_fsync_time AS fsync_time, + w.stats_reset + FROM (pg_stat_get_wal() w(wal_records, wal_fpi, wal_bytes, wal_buffers_full, wal_write, wal_sync, stats_reset) + CROSS JOIN pgsio_sum_write_fsync_time p); pg_stat_wal_receiver| SELECT pid, status, receive_start_lsn, diff --git a/doc/src/sgml/monitoring.sgml b/doc/src/sgml/monitoring.sgml index c6f2c384512..516c4b96112 100644 --- a/doc/src/sgml/monitoring.sgml +++ b/doc/src/sgml/monitoring.sgml @@ -4965,7 +4965,9 @@ description | Waiting for a newly initialized WAL file to reach durable storage <listitem> <para> <literal>io</literal>: Reset all the counters shown in the - <structname>pg_stat_io</structname> view. + <structname>pg_stat_io</structname> view. This resets the timing + counter stored in the <structname>pg_stat_wal</structname> view + too. </para> </listitem> <listitem> @@ -4982,8 +4984,10 @@ description | Waiting for a newly initialized WAL file to reach durable storage </listitem> <listitem> <para> - <literal>wal</literal>: Reset all the counters shown in the - <structname>pg_stat_wal</structname> view. + <literal>wal</literal>: Reset all the counters except timings shown + in the <structname>pg_stat_wal</structname> view. The timing + counters can be reset by calling pg_stat_reset_shared set to + <literal>io</literal>. </para> </listitem> <listitem> -- 2.47.2
signature.asc
Description: PGP signature