Hi, Thanks for the feedback! The new version of the patch is attached.
On Tue, 5 Dec 2023 at 09:16, Michael Paquier <mich...@paquier.xyz> wrote: > > - if (io_op == IOOP_WRITE || io_op == IOOP_EXTEND) > + if (io_op == IOOP_EXTEND || io_op == IOOP_WRITE) > > Unrelated diff. Done. > > + if (io_object == IOOBJECT_WAL && io_context == IOCONTEXT_NORMAL && > + io_op == IOOP_FSYNC) > + PendingWalStats.wal_sync += cnt; > > Nah, I really don't think that adding this dependency within > pg_stat_io is a good idea. > > - PendingWalStats.wal_sync++; > + pgstat_count_io_op_time(IOOBJECT_WAL, IOCONTEXT_NORMAL, IOOP_FSYNC, > + io_start, 1); > > This is the only caller where this matters, and the count is always 1. I reverted that, pgstat_count_io_op_n doesn't count PendingWalStats.wal_sync now. > > + no_wal_normal_read = bktype == B_AUTOVAC_LAUNCHER || > + bktype == B_AUTOVAC_WORKER || bktype == B_BACKEND || > + bktype == B_BG_WORKER || bktype == B_BG_WRITER || > + bktype == B_CHECKPOINTER || bktype == B_WAL_RECEIVER || > + bktype == B_WAL_SENDER || bktype == B_WAL_WRITER; > + > + if (no_wal_normal_read && > + (io_object == IOOBJECT_WAL && > + io_op == IOOP_READ)) > + return false; > > This may be more readable if an enum is applied, without a default > clause so as it would not be forgotten if a new type is added, perhaps > in its own little routine. Done. > > - if (track_io_timing) > + if (track_io_timing || track_wal_io_timing) > INSTR_TIME_SET_CURRENT(io_start); > else > > This interface from pgstat_prepare_io_time() is not really good, > because we could finish by setting io_start in the existing code paths > calling this routine even if track_io_timing is false when > track_wal_io_timing is true. Why not changing this interface a bit > and pass down a GUC (track_io_timing or track_wal_io_timing) as an > argument of the function depending on what we expect to trigger the > timings? Done in 0001. > > - /* Convert counters from microsec to millisec for display */ > - values[6] = Float8GetDatum(((double) wal_stats->wal_write_time) / > 1000.0); > - values[7] = Float8GetDatum(((double) wal_stats->wal_sync_time) / > 1000.0); > + /* > + * There is no need to calculate timings for both pg_stat_wal and > + * pg_stat_io. So, fetch timings from pg_stat_io to make stats > gathering > + * cheaper. Note that, since timings are fetched from pg_stat_io; > + * pg_stat_reset_shared('io') will reset pg_stat_wal's timings too. > + * > + * Convert counters from microsec to millisec for display > + */ > + values[6] = Float8GetDatum(pg_stat_get_io_time(IOOBJECT_WAL, > + > IOCONTEXT_NORMAL, > + > IOOP_WRITE)); > + values[7] = Float8GetDatum(pg_stat_get_io_time(IOOBJECT_WAL, > + > IOCONTEXT_NORMAL, > + > IOOP_FSYNC)); > > Perhaps it is simpler to remove these columns from pg_stat_get_wal() > and plug an SQL upgrade to the view definition of pg_stat_wal? Done in 0003 but I am not sure if that is what you expected. > Finding a good balance between the subroutines, the two GUCs, the > contexts, the I/O operation type and the objects is the tricky part of > this patch. If the dependency to PendingWalStats is removed and if > the interface of pgstat_prepare_io_time is improved, things are a bit > cleaner, but it feels like we could do more.. Nya. I agree. The patch is not logically complicated but it is hard to select the best way. Any kind of feedback would be appreciated. -- Regards, Nazir Bilal Yavuz Microsoft
From b7bf7b92fa274775136314ecfde90fa32ed435cb Mon Sep 17 00:00:00 2001 From: Nazir Bilal Yavuz <byavuz81@gmail.com> Date: Wed, 29 Nov 2023 15:30:03 +0300 Subject: [PATCH v6 6/6] Add IOOBJECT_WAL / IOCONTEXT_NORMAL / read tests --- src/test/regress/expected/stats.out | 12 ++++++++++++ src/test/regress/sql/stats.sql | 8 ++++++++ 2 files changed, 20 insertions(+) diff --git a/src/test/regress/expected/stats.out b/src/test/regress/expected/stats.out index 4adda9e479..7f5340cd7e 100644 --- a/src/test/regress/expected/stats.out +++ b/src/test/regress/expected/stats.out @@ -881,6 +881,18 @@ SELECT current_setting('fsync') = 'off' t (1 row) +-- Test pg_stat_io IOOBJECT_WAL / IOCONTEXT_NORMAL / read. +-- When the servers starts, StartupXLOG function must be called by postmaster +-- or standalone-backend startup and WAL read must be done. +-- So, check these before stats get resetted. +SELECT SUM(reads) > 0 + FROM pg_stat_io + WHERE context = 'normal' and object = 'wal'; + ?column? +---------- + t +(1 row) + ----- -- Test that resetting stats works for reset timestamp ----- diff --git a/src/test/regress/sql/stats.sql b/src/test/regress/sql/stats.sql index 72e864a0d2..b13f17f4d2 100644 --- a/src/test/regress/sql/stats.sql +++ b/src/test/regress/sql/stats.sql @@ -452,6 +452,14 @@ SELECT :io_sum_wal_init_writes > 0; SELECT current_setting('fsync') = 'off' OR :io_sum_wal_init_fsyncs > 0; +-- Test pg_stat_io IOOBJECT_WAL / IOCONTEXT_NORMAL / read. +-- When the servers starts, StartupXLOG function must be called by postmaster +-- or standalone-backend startup and WAL read must be done. +-- So, check these before stats get resetted. +SELECT SUM(reads) > 0 + FROM pg_stat_io + WHERE context = 'normal' and object = 'wal'; + ----- -- Test that resetting stats works for reset timestamp ----- -- 2.43.0
From afcc55a01f38ab79d7dd17d68e7b1d5c4be41d8d Mon Sep 17 00:00:00 2001 From: Nazir Bilal Yavuz <byavuz81@gmail.com> Date: Tue, 12 Dec 2023 11:17:42 +0300 Subject: [PATCH v6 3/6] Fetch pg_stat_wal's timings from pg_stat_io For the not calculating WAL timings on pg_stat_wal and pg_stat_io view, pg_stat_wal view's WAL timings are fetched from pg_stat_io. Since these timings are fetched from pg_stat_io, pg_stat_reset_shared('io') will reset pg_stat_wal's timings too. --- doc/src/sgml/monitoring.sgml | 9 ++++++--- src/backend/catalog/system_views.sql | 15 ++++++++++++--- src/backend/utils/adt/pgstatfuncs.c | 14 +++----------- src/include/catalog/pg_proc.dat | 6 +++--- src/test/regress/expected/rules.out | 27 +++++++++++++++++---------- 5 files changed, 41 insertions(+), 30 deletions(-) diff --git a/doc/src/sgml/monitoring.sgml b/doc/src/sgml/monitoring.sgml index 0450f91ccb..679c527f46 100644 --- a/doc/src/sgml/monitoring.sgml +++ b/doc/src/sgml/monitoring.sgml @@ -4753,7 +4753,9 @@ description | Waiting for a newly initialized WAL file to reach durable storage <listitem> <para> <literal>io</literal>: Reset all the counters shown in the - <structname>pg_stat_io</structname> view. + <structname>pg_stat_io</structname> view. Note that, this will + reset <structname>pg_stat_wal</structname> view's timing counters + too. </para> </listitem> <listitem> @@ -4770,8 +4772,9 @@ description | Waiting for a newly initialized WAL file to reach durable storage </listitem> <listitem> <para> - <literal>wal</literal>: Reset all the counters shown in the - <structname>pg_stat_wal</structname> view. + <literal>wal</literal>: Reset all the counters except timings shown + in the <structname>pg_stat_wal</structname> view. These timing + counters can be reset by calling pg_stat_reset_shared with 'io'. </para> </listitem> <listitem> diff --git a/src/backend/catalog/system_views.sql b/src/backend/catalog/system_views.sql index 11d18ed9dd..91c6acf754 100644 --- a/src/backend/catalog/system_views.sql +++ b/src/backend/catalog/system_views.sql @@ -1169,6 +1169,13 @@ SELECT FROM pg_stat_get_io() b; CREATE VIEW pg_stat_wal AS + WITH pgsio_sum_write_fsync_time AS ( + SELECT + sum(write_time) as sum_write_time, + sum(fsync_time) as sum_fsync_time + FROM pg_stat_io + WHERE context = 'normal' and object = 'wal' + ) SELECT w.wal_records, w.wal_fpi, @@ -1176,10 +1183,12 @@ CREATE VIEW pg_stat_wal AS w.wal_buffers_full, w.wal_write, w.wal_sync, - w.wal_write_time, - w.wal_sync_time, + p.sum_write_time as write_time, + p.sum_fsync_time as fsync_time, w.stats_reset - FROM pg_stat_get_wal() w; + FROM pg_stat_get_wal() w + CROSS JOIN pgsio_sum_write_fsync_time p; + CREATE VIEW pg_stat_progress_analyze AS SELECT diff --git a/src/backend/utils/adt/pgstatfuncs.c b/src/backend/utils/adt/pgstatfuncs.c index 8d14a4183c..96e318bf69 100644 --- a/src/backend/utils/adt/pgstatfuncs.c +++ b/src/backend/utils/adt/pgstatfuncs.c @@ -1452,7 +1452,7 @@ pg_stat_get_io(PG_FUNCTION_ARGS) Datum pg_stat_get_wal(PG_FUNCTION_ARGS) { -#define PG_STAT_GET_WAL_COLS 9 +#define PG_STAT_GET_WAL_COLS 7 TupleDesc tupdesc; Datum values[PG_STAT_GET_WAL_COLS] = {0}; bool nulls[PG_STAT_GET_WAL_COLS] = {0}; @@ -1473,11 +1473,7 @@ pg_stat_get_wal(PG_FUNCTION_ARGS) INT8OID, -1, 0); TupleDescInitEntry(tupdesc, (AttrNumber) 6, "wal_sync", INT8OID, -1, 0); - TupleDescInitEntry(tupdesc, (AttrNumber) 7, "wal_write_time", - FLOAT8OID, -1, 0); - TupleDescInitEntry(tupdesc, (AttrNumber) 8, "wal_sync_time", - FLOAT8OID, -1, 0); - TupleDescInitEntry(tupdesc, (AttrNumber) 9, "stats_reset", + TupleDescInitEntry(tupdesc, (AttrNumber) 7, "stats_reset", TIMESTAMPTZOID, -1, 0); BlessTupleDesc(tupdesc); @@ -1500,11 +1496,7 @@ pg_stat_get_wal(PG_FUNCTION_ARGS) values[4] = Int64GetDatum(wal_stats->wal_write); values[5] = Int64GetDatum(wal_stats->wal_sync); - /* Convert counters from microsec to millisec for display */ - values[6] = Float8GetDatum(((double) wal_stats->wal_write_time) / 1000.0); - values[7] = Float8GetDatum(((double) wal_stats->wal_sync_time) / 1000.0); - - values[8] = TimestampTzGetDatum(wal_stats->stat_reset_timestamp); + values[6] = TimestampTzGetDatum(wal_stats->stat_reset_timestamp); /* Returns the record as Datum */ PG_RETURN_DATUM(HeapTupleGetDatum(heap_form_tuple(tupdesc, values, nulls))); diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat index 77e8b13764..38050cbb09 100644 --- a/src/include/catalog/pg_proc.dat +++ b/src/include/catalog/pg_proc.dat @@ -5770,9 +5770,9 @@ { oid => '1136', descr => 'statistics: information about WAL activity', proname => 'pg_stat_get_wal', proisstrict => 'f', provolatile => 's', proparallel => 'r', prorettype => 'record', proargtypes => '', - proallargtypes => '{int8,int8,numeric,int8,int8,int8,float8,float8,timestamptz}', - proargmodes => '{o,o,o,o,o,o,o,o,o}', - proargnames => '{wal_records,wal_fpi,wal_bytes,wal_buffers_full,wal_write,wal_sync,wal_write_time,wal_sync_time,stats_reset}', + proallargtypes => '{int8,int8,numeric,int8,int8,int8,timestamptz}', + proargmodes => '{o,o,o,o,o,o,o}', + proargnames => '{wal_records,wal_fpi,wal_bytes,wal_buffers_full,wal_write,wal_sync,stats_reset}', prosrc => 'pg_stat_get_wal' }, { oid => '6248', descr => 'statistics: information about WAL prefetching', proname => 'pg_stat_get_recovery_prefetch', prorows => '1', proretset => 't', diff --git a/src/test/regress/expected/rules.out b/src/test/regress/expected/rules.out index 05070393b9..d3f6301c9d 100644 --- a/src/test/regress/expected/rules.out +++ b/src/test/regress/expected/rules.out @@ -2222,16 +2222,23 @@ pg_stat_user_tables| SELECT relid, autoanalyze_count FROM pg_stat_all_tables WHERE ((schemaname <> ALL (ARRAY['pg_catalog'::name, 'information_schema'::name])) AND (schemaname !~ '^pg_toast'::text)); -pg_stat_wal| SELECT wal_records, - wal_fpi, - wal_bytes, - wal_buffers_full, - wal_write, - wal_sync, - wal_write_time, - wal_sync_time, - stats_reset - FROM pg_stat_get_wal() w(wal_records, wal_fpi, wal_bytes, wal_buffers_full, wal_write, wal_sync, wal_write_time, wal_sync_time, stats_reset); +pg_stat_wal| WITH pgsio_sum_write_fsync_time AS ( + SELECT sum(pg_stat_io.write_time) AS sum_write_time, + sum(pg_stat_io.fsync_time) AS sum_fsync_time + FROM pg_stat_io + WHERE ((pg_stat_io.context = 'normal'::text) AND (pg_stat_io.object = 'wal'::text)) + ) + SELECT w.wal_records, + w.wal_fpi, + w.wal_bytes, + w.wal_buffers_full, + w.wal_write, + w.wal_sync, + p.sum_write_time AS write_time, + p.sum_fsync_time AS fsync_time, + w.stats_reset + FROM (pg_stat_get_wal() w(wal_records, wal_fpi, wal_bytes, wal_buffers_full, wal_write, wal_sync, stats_reset) + CROSS JOIN pgsio_sum_write_fsync_time p); pg_stat_wal_receiver| SELECT pid, status, receive_start_lsn, -- 2.43.0
From 6367c92c957b05ad4a098c35e0df9ef51b2126c6 Mon Sep 17 00:00:00 2001 From: Nazir Bilal Yavuz <byavuz81@gmail.com> Date: Fri, 1 Dec 2023 10:03:21 +0300 Subject: [PATCH v6 5/6] Add IOOBJECT_WAL / IOCONTEXT_INIT / write and fsync tests --- src/test/regress/expected/stats.out | 19 +++++++++++++++++++ src/test/regress/sql/stats.sql | 10 ++++++++++ 2 files changed, 29 insertions(+) diff --git a/src/test/regress/expected/stats.out b/src/test/regress/expected/stats.out index 4d3a515bdd..4adda9e479 100644 --- a/src/test/regress/expected/stats.out +++ b/src/test/regress/expected/stats.out @@ -862,6 +862,25 @@ WHERE pg_stat_get_backend_pid(beid) = pg_backend_pid(); t (1 row) +-- Test pg_stat_io IOOBJECT_WAL / IOCONTEXT_INIT / write and fsync. +-- When the servers starts, the initial WAL file must be created, +-- so check these stats before stats get resetted. +SELECT sum(writes) AS writes, sum(fsyncs) AS fsyncs + FROM pg_stat_io + WHERE context = 'init' and object = 'wal' \gset io_sum_wal_init_ +SELECT :io_sum_wal_init_writes > 0; + ?column? +---------- + t +(1 row) + +SELECT current_setting('fsync') = 'off' + OR :io_sum_wal_init_fsyncs > 0; + ?column? +---------- + t +(1 row) + ----- -- Test that resetting stats works for reset timestamp ----- diff --git a/src/test/regress/sql/stats.sql b/src/test/regress/sql/stats.sql index aa48e65dc8..72e864a0d2 100644 --- a/src/test/regress/sql/stats.sql +++ b/src/test/regress/sql/stats.sql @@ -442,6 +442,16 @@ SELECT (current_schemas(true))[1] = ('pg_temp_' || beid::text) AS match FROM pg_stat_get_backend_idset() beid WHERE pg_stat_get_backend_pid(beid) = pg_backend_pid(); +-- Test pg_stat_io IOOBJECT_WAL / IOCONTEXT_INIT / write and fsync. +-- When the servers starts, the initial WAL file must be created, +-- so check these stats before stats get resetted. +SELECT sum(writes) AS writes, sum(fsyncs) AS fsyncs + FROM pg_stat_io + WHERE context = 'init' and object = 'wal' \gset io_sum_wal_init_ +SELECT :io_sum_wal_init_writes > 0; +SELECT current_setting('fsync') = 'off' + OR :io_sum_wal_init_fsyncs > 0; + ----- -- Test that resetting stats works for reset timestamp ----- -- 2.43.0
From c328c9b8862e600328b4d91592031c393cc27464 Mon Sep 17 00:00:00 2001 From: Nazir Bilal Yavuz <byavuz81@gmail.com> Date: Fri, 10 Nov 2023 14:52:22 +0300 Subject: [PATCH v6 4/6] Add IOOBJECT_WAL / IOCONTEXT_NORMAL / write and fsync tests --- src/test/regress/expected/stats.out | 26 ++++++++++++++++++++++++++ src/test/regress/sql/stats.sql | 11 +++++++++++ 2 files changed, 37 insertions(+) diff --git a/src/test/regress/expected/stats.out b/src/test/regress/expected/stats.out index 346e10a3d2..4d3a515bdd 100644 --- a/src/test/regress/expected/stats.out +++ b/src/test/regress/expected/stats.out @@ -1255,6 +1255,7 @@ SELECT pg_stat_get_subscription_stats(NULL); -- - extends of relations using shared buffers -- - fsyncs done to ensure the durability of data dirtying shared buffers -- - shared buffer hits +-- - WAL writes and fsyncs in IOContext IOCONTEXT_NORMAL -- There is no test for blocks evicted from shared buffers, because we cannot -- be sure of the state of shared buffers at the point the test is run. -- Create a regular table and insert some data to generate IOCONTEXT_NORMAL @@ -1264,6 +1265,9 @@ SELECT sum(extends) AS io_sum_shared_before_extends SELECT sum(writes) AS writes, sum(fsyncs) AS fsyncs FROM pg_stat_io WHERE object = 'relation' \gset io_sum_shared_before_ +SELECT sum(writes) AS writes, sum(fsyncs) AS fsyncs + FROM pg_stat_io + WHERE context = 'normal' and object = 'wal' \gset io_sum_wal_normal_before_ CREATE TABLE test_io_shared(a int); INSERT INTO test_io_shared SELECT i FROM generate_series(1,100)i; SELECT pg_stat_force_next_flush(); @@ -1301,6 +1305,28 @@ SELECT current_setting('fsync') = 'off' t (1 row) +SELECT sum(writes) AS writes, sum(fsyncs) AS fsyncs + FROM pg_stat_io + WHERE context = 'normal' and object = 'wal' \gset io_sum_wal_normal_after_ +SELECT current_setting('synchronous_commit') = 'on'; + ?column? +---------- + t +(1 row) + +SELECT :io_sum_wal_normal_after_writes > :io_sum_wal_normal_before_writes; + ?column? +---------- + t +(1 row) + +SELECT current_setting('fsync') = 'off' + OR :io_sum_wal_normal_after_fsyncs > :io_sum_wal_normal_before_fsyncs; + ?column? +---------- + t +(1 row) + -- Change the tablespace so that the table is rewritten directly, then SELECT -- from it to cause it to be read back into shared buffers. SELECT sum(reads) AS io_sum_shared_before_reads diff --git a/src/test/regress/sql/stats.sql b/src/test/regress/sql/stats.sql index e3b4ca96e8..aa48e65dc8 100644 --- a/src/test/regress/sql/stats.sql +++ b/src/test/regress/sql/stats.sql @@ -601,6 +601,7 @@ SELECT pg_stat_get_subscription_stats(NULL); -- - extends of relations using shared buffers -- - fsyncs done to ensure the durability of data dirtying shared buffers -- - shared buffer hits +-- - WAL writes and fsyncs in IOContext IOCONTEXT_NORMAL -- There is no test for blocks evicted from shared buffers, because we cannot -- be sure of the state of shared buffers at the point the test is run. @@ -612,6 +613,9 @@ SELECT sum(extends) AS io_sum_shared_before_extends SELECT sum(writes) AS writes, sum(fsyncs) AS fsyncs FROM pg_stat_io WHERE object = 'relation' \gset io_sum_shared_before_ +SELECT sum(writes) AS writes, sum(fsyncs) AS fsyncs + FROM pg_stat_io + WHERE context = 'normal' and object = 'wal' \gset io_sum_wal_normal_before_ CREATE TABLE test_io_shared(a int); INSERT INTO test_io_shared SELECT i FROM generate_series(1,100)i; SELECT pg_stat_force_next_flush(); @@ -630,6 +634,13 @@ SELECT sum(writes) AS writes, sum(fsyncs) AS fsyncs SELECT :io_sum_shared_after_writes > :io_sum_shared_before_writes; SELECT current_setting('fsync') = 'off' OR :io_sum_shared_after_fsyncs > :io_sum_shared_before_fsyncs; +SELECT sum(writes) AS writes, sum(fsyncs) AS fsyncs + FROM pg_stat_io + WHERE context = 'normal' and object = 'wal' \gset io_sum_wal_normal_after_ +SELECT current_setting('synchronous_commit') = 'on'; +SELECT :io_sum_wal_normal_after_writes > :io_sum_wal_normal_before_writes; +SELECT current_setting('fsync') = 'off' + OR :io_sum_wal_normal_after_fsyncs > :io_sum_wal_normal_before_fsyncs; -- Change the tablespace so that the table is rewritten directly, then SELECT -- from it to cause it to be read back into shared buffers. -- 2.43.0
From ee53af3bfaf4582a940c0b2bd93f5b98739ef423 Mon Sep 17 00:00:00 2001 From: Nazir Bilal Yavuz <byavuz81@gmail.com> Date: Tue, 12 Dec 2023 10:40:02 +0300 Subject: [PATCH v6 2/6] Show WAL stats on pg_stat_io (except streaming replication) This patch aims to showing WAL stats per backend on pg_stat_io view. With this patch, it can be seen how many WAL operations it makes, their context, types and total timings per backend in pg_stat_io view. In this path new IOContext IOCONTEXT_INIT is introduced, it is for IO operations done while creating the things. Currently, it is used only together with IOObject IOOBJECT_WAL. IOOBJECT_WAL means IO operations related to WAL. IOOBJECT_WAL / IOCONTEXT_NORMAL means IO operations done on already created WAL segments. IOOBJECT_WAL / IOCONTEXT_INIT means IO operations done while creating the WAL segments. This patch currently covers: - Documentation - IOOBJECT_WAL / IOCONTEXT_NORMAL / read, write and fsync stats on pg_stat_io. - IOOBJECT_WAL / IOCONTEXT_INIT / write and fsync stats on pg_stat_io. doesn't cover: - Streaming replication WAL IO. --- doc/src/sgml/monitoring.sgml | 19 +++- src/backend/access/transam/xlog.c | 58 ++++------ src/backend/access/transam/xlogrecovery.c | 10 ++ src/backend/utils/activity/pgstat_io.c | 124 +++++++++++++++++++++- src/backend/utils/adt/pgstatfuncs.c | 10 +- src/include/pgstat.h | 10 +- 6 files changed, 182 insertions(+), 49 deletions(-) diff --git a/doc/src/sgml/monitoring.sgml b/doc/src/sgml/monitoring.sgml index 42509042ad..0450f91ccb 100644 --- a/doc/src/sgml/monitoring.sgml +++ b/doc/src/sgml/monitoring.sgml @@ -2499,9 +2499,10 @@ description | Waiting for a newly initialized WAL file to reach durable storage </para> <para> - Currently, I/O on relations (e.g. tables, indexes) is tracked. However, - relation I/O which bypasses shared buffers (e.g. when moving a table from one - tablespace to another) is currently not tracked. + Currently, I/O on relations (e.g. tables, indexes) and WAL activities are + tracked. However, relation I/O which bypasses shared buffers + (e.g. when moving a table from one tablespace to another) is currently + not tracked. </para> <table id="pg-stat-io-view" xreflabel="pg_stat_io"> @@ -2554,6 +2555,11 @@ description | Waiting for a newly initialized WAL file to reach durable storage <literal>temp relation</literal>: Temporary relations. </para> </listitem> + <listitem> + <para> + <literal>wal</literal>: Write Ahead Logs. + </para> + </listitem> </itemizedlist> </para> </entry> @@ -2578,6 +2584,13 @@ description | Waiting for a newly initialized WAL file to reach durable storage <literal>normal</literal>. </para> </listitem> + <listitem> + <para> + <literal>init</literal>: I/O operations performed while creating the + WAL segments are tracked in <varname>context</varname> + <literal>init</literal>. + </para> + </listitem> <listitem> <para> <literal>vacuum</literal>: I/O operations performed outside of shared diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 01e0484584..6f7149084f 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -2277,38 +2277,22 @@ XLogWrite(XLogwrtRqst WriteRqst, TimeLineID tli, bool flexible) Size nbytes; Size nleft; int written; - instr_time start; + instr_time io_start; /* OK to write the page(s) */ from = XLogCtl->pages + startidx * (Size) XLOG_BLCKSZ; nbytes = npages * (Size) XLOG_BLCKSZ; nleft = nbytes; + + io_start = pgstat_prepare_io_time(track_wal_io_timing); do { errno = 0; - /* Measure I/O timing to write WAL data */ - if (track_wal_io_timing) - INSTR_TIME_SET_CURRENT(start); - else - INSTR_TIME_SET_ZERO(start); - pgstat_report_wait_start(WAIT_EVENT_WAL_WRITE); written = pg_pwrite(openLogFile, from, nleft, startoffset); pgstat_report_wait_end(); - /* - * Increment the I/O timing and the number of times WAL data - * were written out to disk. - */ - if (track_wal_io_timing) - { - instr_time end; - - INSTR_TIME_SET_CURRENT(end); - INSTR_TIME_ACCUM_DIFF(PendingWalStats.wal_write_time, end, start); - } - PendingWalStats.wal_write++; if (written <= 0) @@ -2333,6 +2317,9 @@ XLogWrite(XLogwrtRqst WriteRqst, TimeLineID tli, bool flexible) startoffset += written; } while (nleft > 0); + pgstat_count_io_op_time(IOOBJECT_WAL, IOCONTEXT_NORMAL, + IOOP_WRITE, io_start, npages); + npages = 0; /* @@ -3039,6 +3026,7 @@ XLogFileInitInternal(XLogSegNo logsegno, TimeLineID logtli, int fd; int save_errno; int open_flags = O_RDWR | O_CREAT | O_EXCL | PG_BINARY; + instr_time io_start; Assert(logtli != 0); @@ -3082,6 +3070,9 @@ XLogFileInitInternal(XLogSegNo logsegno, TimeLineID logtli, (errcode_for_file_access(), errmsg("could not create file \"%s\": %m", tmppath))); + /* start timing writes for stats */ + io_start = pgstat_prepare_io_time(track_wal_io_timing); + pgstat_report_wait_start(WAIT_EVENT_WAL_INIT_WRITE); save_errno = 0; if (wal_init_zero) @@ -3117,6 +3108,9 @@ XLogFileInitInternal(XLogSegNo logsegno, TimeLineID logtli, } pgstat_report_wait_end(); + pgstat_count_io_op_time(IOOBJECT_WAL, IOCONTEXT_INIT, IOOP_WRITE, + io_start, 1); + if (save_errno) { /* @@ -3133,6 +3127,9 @@ XLogFileInitInternal(XLogSegNo logsegno, TimeLineID logtli, errmsg("could not write to file \"%s\": %m", tmppath))); } + /* start timing fsyncs for stats */ + io_start = pgstat_prepare_io_time(track_wal_io_timing); + pgstat_report_wait_start(WAIT_EVENT_WAL_INIT_SYNC); if (pg_fsync(fd) != 0) { @@ -3145,6 +3142,9 @@ XLogFileInitInternal(XLogSegNo logsegno, TimeLineID logtli, } pgstat_report_wait_end(); + pgstat_count_io_op_time(IOOBJECT_WAL, IOCONTEXT_INIT, + IOOP_FSYNC, io_start, 1); + if (close(fd) != 0) ereport(ERROR, (errcode_for_file_access(), @@ -8317,7 +8317,7 @@ void issue_xlog_fsync(int fd, XLogSegNo segno, TimeLineID tli) { char *msg = NULL; - instr_time start; + instr_time io_start; Assert(tli != 0); @@ -8330,11 +8330,7 @@ issue_xlog_fsync(int fd, XLogSegNo segno, TimeLineID tli) wal_sync_method == WAL_SYNC_METHOD_OPEN_DSYNC) return; - /* Measure I/O timing to sync the WAL file */ - if (track_wal_io_timing) - INSTR_TIME_SET_CURRENT(start); - else - INSTR_TIME_SET_ZERO(start); + io_start = pgstat_prepare_io_time(track_wal_io_timing); pgstat_report_wait_start(WAIT_EVENT_WAL_SYNC); switch (wal_sync_method) @@ -8378,16 +8374,8 @@ issue_xlog_fsync(int fd, XLogSegNo segno, TimeLineID tli) pgstat_report_wait_end(); - /* - * Increment the I/O timing and the number of times WAL files were synced. - */ - if (track_wal_io_timing) - { - instr_time end; - - INSTR_TIME_SET_CURRENT(end); - INSTR_TIME_ACCUM_DIFF(PendingWalStats.wal_sync_time, end, start); - } + pgstat_count_io_op_time(IOOBJECT_WAL, IOCONTEXT_NORMAL, IOOP_FSYNC, + io_start, 1); PendingWalStats.wal_sync++; } diff --git a/src/backend/access/transam/xlogrecovery.c b/src/backend/access/transam/xlogrecovery.c index a2c8fa3981..16c1b9ba99 100644 --- a/src/backend/access/transam/xlogrecovery.c +++ b/src/backend/access/transam/xlogrecovery.c @@ -60,6 +60,7 @@ #include "utils/builtins.h" #include "utils/datetime.h" #include "utils/guc_hooks.h" +#include "utils/pgstat_internal.h" #include "utils/pg_lsn.h" #include "utils/ps_status.h" #include "utils/pg_rusage.h" @@ -1771,6 +1772,9 @@ PerformWalRecovery(void) */ ApplyWalRecord(xlogreader, record, &replayTLI); + /* Report pending statistics to the cumulative stats system */ + pgstat_flush_io(false); + /* Exit loop if we reached inclusive recovery target */ if (recoveryStopsAfter(xlogreader)) { @@ -3246,6 +3250,7 @@ XLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, int reqLen, uint32 targetPageOff; XLogSegNo targetSegNo PG_USED_FOR_ASSERTS_ONLY; int r; + instr_time io_start; XLByteToSeg(targetPagePtr, targetSegNo, wal_segment_size); targetPageOff = XLogSegmentOffset(targetPagePtr, wal_segment_size); @@ -3338,6 +3343,8 @@ retry: /* Read the requested page */ readOff = targetPageOff; + io_start = pgstat_prepare_io_time(track_wal_io_timing); + pgstat_report_wait_start(WAIT_EVENT_WAL_READ); r = pg_pread(readFile, readBuf, XLOG_BLCKSZ, (off_t) readOff); if (r != XLOG_BLCKSZ) @@ -3366,6 +3373,9 @@ retry: } pgstat_report_wait_end(); + pgstat_count_io_op_time(IOOBJECT_WAL, IOCONTEXT_NORMAL, IOOP_READ, + io_start, 1); + Assert(targetSegNo == readSegNo); Assert(targetPageOff == readOff); Assert(reqLen <= readLen); diff --git a/src/backend/utils/activity/pgstat_io.c b/src/backend/utils/activity/pgstat_io.c index 7263a80c72..36bf5ce708 100644 --- a/src/backend/utils/activity/pgstat_io.c +++ b/src/backend/utils/activity/pgstat_io.c @@ -18,6 +18,7 @@ #include "executor/instrument.h" #include "storage/bufmgr.h" +#include "access/xlog.h" #include "utils/pgstat_internal.h" @@ -114,6 +115,22 @@ pgstat_prepare_io_time(bool track_time_guc) return io_start; } +/* + * Decide if the io timing needs be tracked. + */ +bool +pgstat_should_track_io_time(IOObject io_object, IOContext io_context) +{ + /* + * io times of IOOBJECT_WAL IOObject needs to be tracked when + * 'track_wal_io_timing' is set regardless of 'track_io_timing'. + */ + if (io_object == IOOBJECT_WAL) + return track_wal_io_timing; + + return track_io_timing; +} + /* * Like pgstat_count_io_op_n() except it also accumulates time. */ @@ -121,7 +138,27 @@ void pgstat_count_io_op_time(IOObject io_object, IOContext io_context, IOOp io_op, instr_time start_time, uint32 cnt) { - if (track_io_timing) + /* + * Accumulate different type of times here. We want to eventually + * deduplicate these counters, so we are consolidating them first. This + * also makes it easy to compare what is tracked for which stats or + * instrumentation purpose. + * + * Some of the IO counters didn't moved here because they track at a + * different level of granularity or at a different point in the call + * stack. + * + * pgstat_count_buffer is for pgstat_database. Since pg_stat_database only + * counts blk_read_time and blk_write_time, it is set for IOOP_READ and + * IOOP_WRITE. + * + * pgBufferUsage is for EXPLAIN. pgBufferUsage has only write and read + * stats for shared/local and temporary blocks. Only shared/local blocks + * are counted here. + * + * At the end of the if case, accumulate time for the pg_stat_io. + */ + if (pgstat_should_track_io_time(io_object, io_context)) { instr_time io_time; @@ -149,6 +186,7 @@ pgstat_count_io_op_time(IOObject io_object, IOContext io_context, IOOp io_op, io_time); } + /* The IO timings are counted, now count the IO numbers */ pgstat_count_io_op_n(io_object, io_context, io_op, cnt); } @@ -229,12 +267,33 @@ pgstat_get_io_context_name(IOContext io_context) return "normal"; case IOCONTEXT_VACUUM: return "vacuum"; + case IOCONTEXT_INIT: + return "init"; } elog(ERROR, "unrecognized IOContext value: %d", io_context); pg_unreachable(); } +/* + * op_bytes can change according to IOObject and IOContext. + * Return BLCKSZ as default because most of the + * IOObject / IOContext uses BLCKSZ. + */ +int +pgstat_get_io_op_bytes(IOObject io_object, IOContext io_context) +{ + if (io_object == IOOBJECT_WAL) + { + if (io_context == IOCONTEXT_NORMAL) + return XLOG_BLCKSZ; + else if (io_context == IOCONTEXT_INIT) + return wal_segment_size; + } + + return BLCKSZ; +} + const char * pgstat_get_io_object_name(IOObject io_object) { @@ -244,6 +303,8 @@ pgstat_get_io_object_name(IOObject io_object) return "relation"; case IOOBJECT_TEMP_RELATION: return "temp relation"; + case IOOBJECT_WAL: + return "wal"; } elog(ERROR, "unrecognized IOObject value: %d", io_object); @@ -325,10 +386,10 @@ pgstat_tracks_io_bktype(BackendType bktype) case B_INVALID: case B_ARCHIVER: case B_LOGGER: - case B_WAL_RECEIVER: - case B_WAL_WRITER: return false; + case B_WAL_RECEIVER: + case B_WAL_WRITER: case B_AUTOVAC_LAUNCHER: case B_AUTOVAC_WORKER: case B_BACKEND: @@ -363,6 +424,15 @@ pgstat_tracks_io_object(BackendType bktype, IOObject io_object, if (!pgstat_tracks_io_bktype(bktype)) return false; + /* + * Currently, IO on IOOBJECT_WAL IOObject can only occur in the + * IOCONTEXT_NORMAL and IOCONTEXT_INIT IOContext. + */ + if (io_object == IOOBJECT_WAL && + (io_context != IOCONTEXT_NORMAL && + io_context != IOCONTEXT_INIT)) + return false; + /* * Currently, IO on temporary relations can only occur in the * IOCONTEXT_NORMAL IOContext. @@ -437,6 +507,33 @@ pgstat_tracks_io_op(BackendType bktype, IOObject io_object, bktype == B_CHECKPOINTER) && io_op == IOOP_EXTEND) return false; + /* + * Some BackendTypes / IOObjects will not do certain IOOps. + */ + if (io_object == IOOBJECT_WAL && io_op == IOOP_READ) + { + switch (bktype) + { + case B_STANDALONE_BACKEND: + case B_STARTUP: + break; + + case B_INVALID: + case B_ARCHIVER: + case B_AUTOVAC_LAUNCHER: + case B_AUTOVAC_WORKER: + case B_BACKEND: + case B_BG_WORKER: + case B_BG_WRITER: + case B_CHECKPOINTER: + case B_LOGGER: + case B_WAL_RECEIVER: + case B_WAL_SENDER: + case B_WAL_WRITER: + return false; + } + } + /* * Temporary tables are not logged and thus do not require fsync'ing. * Writeback is not requested for temporary tables. @@ -461,6 +558,27 @@ pgstat_tracks_io_op(BackendType bktype, IOObject io_object, if (!strategy_io_context && io_op == IOOP_REUSE) return false; + /* + * Some IOOps are not valid in certain IOContexts / IOObjects and some + * IOOps are only valid in certain IOContexts / IOObjects. + */ + + /* + * In IOOBJECT_WAL io_object, IOCONTEXT_INIT io_context means operations + * done while creating new WAL segments. + */ + if (io_object == IOOBJECT_WAL && io_context == IOCONTEXT_INIT && + !(io_op == IOOP_WRITE || io_op == IOOP_FSYNC)) + return false; + + /* + * In IOOBJECT_WAL io_object, IOCONTEXT_NORMAL io_context means operations + * done on already created WAL segments. + */ + if (io_object == IOOBJECT_WAL && io_context == IOCONTEXT_NORMAL && + !(io_op == IOOP_WRITE || io_op == IOOP_READ || io_op == IOOP_FSYNC)) + return false; + /* * IOOP_FSYNC IOOps done by a backend using a BufferAccessStrategy are * counted in the IOCONTEXT_NORMAL IOContext. See comment in diff --git a/src/backend/utils/adt/pgstatfuncs.c b/src/backend/utils/adt/pgstatfuncs.c index 0cea320c00..8d14a4183c 100644 --- a/src/backend/utils/adt/pgstatfuncs.c +++ b/src/backend/utils/adt/pgstatfuncs.c @@ -1377,6 +1377,7 @@ pg_stat_get_io(PG_FUNCTION_ARGS) for (int io_context = 0; io_context < IOCONTEXT_NUM_TYPES; io_context++) { const char *context_name = pgstat_get_io_context_name(io_context); + int op_bytes; Datum values[IO_NUM_COLUMNS] = {0}; bool nulls[IO_NUM_COLUMNS] = {0}; @@ -1395,12 +1396,11 @@ pg_stat_get_io(PG_FUNCTION_ARGS) values[IO_COL_RESET_TIME] = TimestampTzGetDatum(reset_time); /* - * Hard-code this to the value of BLCKSZ for now. Future - * values could include XLOG_BLCKSZ, once WAL IO is tracked, - * and constant multipliers, once non-block-oriented IO (e.g. - * temporary file IO) is tracked. + * op_bytes can change according to IOObject and IOContext. + * Get the correct op_bytes. */ - values[IO_COL_CONVERSION] = Int64GetDatum(BLCKSZ); + op_bytes = pgstat_get_io_op_bytes(io_obj, io_context); + values[IO_COL_CONVERSION] = Int64GetDatum(op_bytes); for (int io_op = 0; io_op < IOOP_NUM_TYPES; io_op++) { diff --git a/src/include/pgstat.h b/src/include/pgstat.h index f95d8db0c4..2a3d131dce 100644 --- a/src/include/pgstat.h +++ b/src/include/pgstat.h @@ -276,9 +276,10 @@ typedef enum IOObject { IOOBJECT_RELATION, IOOBJECT_TEMP_RELATION, + IOOBJECT_WAL, } IOObject; -#define IOOBJECT_NUM_TYPES (IOOBJECT_TEMP_RELATION + 1) +#define IOOBJECT_NUM_TYPES (IOOBJECT_WAL + 1) typedef enum IOContext { @@ -286,9 +287,10 @@ typedef enum IOContext IOCONTEXT_BULKWRITE, IOCONTEXT_NORMAL, IOCONTEXT_VACUUM, + IOCONTEXT_INIT, } IOContext; -#define IOCONTEXT_NUM_TYPES (IOCONTEXT_VACUUM + 1) +#define IOCONTEXT_NUM_TYPES (IOCONTEXT_INIT + 1) typedef enum IOOp { @@ -519,11 +521,13 @@ extern bool pgstat_bktype_io_stats_valid(PgStat_BktypeIO *backend_io, BackendType bktype); extern void pgstat_count_io_op(IOObject io_object, IOContext io_context, IOOp io_op); extern void pgstat_count_io_op_n(IOObject io_object, IOContext io_context, IOOp io_op, uint32 cnt); -extern instr_time pgstat_prepare_io_time(void); +extern instr_time pgstat_prepare_io_time(bool track_time_guc); +extern bool pgstat_should_track_io_time(IOObject io_object, IOContext io_context); extern void pgstat_count_io_op_time(IOObject io_object, IOContext io_context, IOOp io_op, instr_time start_time, uint32 cnt); extern PgStat_IO *pgstat_fetch_stat_io(void); +extern int pgstat_get_io_op_bytes(IOObject io_object, IOContext io_context); extern const char *pgstat_get_io_context_name(IOContext io_context); extern const char *pgstat_get_io_object_name(IOObject io_object); -- 2.43.0
From e439ddc128c9b9ea1e2832f339e872c27d308499 Mon Sep 17 00:00:00 2001 From: Nazir Bilal Yavuz <byavuz81@gmail.com> Date: Tue, 12 Dec 2023 10:33:55 +0300 Subject: [PATCH v6 1/6] Use timing GUCs on pgstat_prepare_io_time function --- src/backend/storage/buffer/bufmgr.c | 10 +++++----- src/backend/storage/buffer/localbuf.c | 4 ++-- src/backend/storage/smgr/md.c | 4 ++-- src/backend/utils/activity/pgstat_io.c | 13 +++++++++++-- 4 files changed, 20 insertions(+), 11 deletions(-) diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index f7c67d504c..a652907fc9 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -1143,7 +1143,7 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, MemSet((char *) bufBlock, 0, BLCKSZ); else { - instr_time io_start = pgstat_prepare_io_time(); + instr_time io_start = pgstat_prepare_io_time(track_io_timing); smgrread(smgr, forkNum, blockNum, bufBlock); @@ -2070,7 +2070,7 @@ ExtendBufferedRelShared(BufferManagerRelation bmr, } } - io_start = pgstat_prepare_io_time(); + io_start = pgstat_prepare_io_time(track_io_timing); /* * Note: if smgrzeroextend fails, we will end up with buffers that are @@ -3523,7 +3523,7 @@ FlushBuffer(BufferDesc *buf, SMgrRelation reln, IOObject io_object, */ bufToWrite = PageSetChecksumCopy((Page) bufBlock, buf->tag.blockNum); - io_start = pgstat_prepare_io_time(); + io_start = pgstat_prepare_io_time(track_io_timing); /* * bufToWrite is either the shared buffer or a copy, as appropriate. @@ -4181,7 +4181,7 @@ FlushRelationBuffers(Relation rel) PageSetChecksumInplace(localpage, bufHdr->tag.blockNum); - io_start = pgstat_prepare_io_time(); + io_start = pgstat_prepare_io_time(track_io_timing); smgrwrite(RelationGetSmgr(rel), BufTagGetForkNum(&bufHdr->tag), @@ -5614,7 +5614,7 @@ IssuePendingWritebacks(WritebackContext *wb_context, IOContext io_context) sort_pending_writebacks(wb_context->pending_writebacks, wb_context->nr_pending); - io_start = pgstat_prepare_io_time(); + io_start = pgstat_prepare_io_time(track_io_timing); /* * Coalesce neighbouring writes, but nothing else. For that we iterate diff --git a/src/backend/storage/buffer/localbuf.c b/src/backend/storage/buffer/localbuf.c index aebcf146b4..903ea97b85 100644 --- a/src/backend/storage/buffer/localbuf.c +++ b/src/backend/storage/buffer/localbuf.c @@ -246,7 +246,7 @@ GetLocalVictimBuffer(void) PageSetChecksumInplace(localpage, bufHdr->tag.blockNum); - io_start = pgstat_prepare_io_time(); + io_start = pgstat_prepare_io_time(track_io_timing); /* And write... */ smgrwrite(oreln, @@ -411,7 +411,7 @@ ExtendBufferedRelLocal(BufferManagerRelation bmr, } } - io_start = pgstat_prepare_io_time(); + io_start = pgstat_prepare_io_time(track_io_timing); /* actually extend relation */ smgrzeroextend(bmr.smgr, fork, first_block, extend_by, false); diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c index fdecbad170..62b2e79106 100644 --- a/src/backend/storage/smgr/md.c +++ b/src/backend/storage/smgr/md.c @@ -1172,7 +1172,7 @@ register_dirty_segment(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg) ereport(DEBUG1, (errmsg_internal("could not forward fsync request because request queue is full"))); - io_start = pgstat_prepare_io_time(); + io_start = pgstat_prepare_io_time(track_io_timing); if (FileSync(seg->mdfd_vfd, WAIT_EVENT_DATA_FILE_SYNC) < 0) ereport(data_sync_elevel(ERROR), @@ -1569,7 +1569,7 @@ mdsyncfiletag(const FileTag *ftag, char *path) need_to_close = true; } - io_start = pgstat_prepare_io_time(); + io_start = pgstat_prepare_io_time(track_io_timing); /* Sync the file. */ result = FileSync(file, WAIT_EVENT_DATA_FILE_SYNC); diff --git a/src/backend/utils/activity/pgstat_io.c b/src/backend/utils/activity/pgstat_io.c index 490d5a9ab7..7263a80c72 100644 --- a/src/backend/utils/activity/pgstat_io.c +++ b/src/backend/utils/activity/pgstat_io.c @@ -92,14 +92,23 @@ pgstat_count_io_op_n(IOObject io_object, IOContext io_context, IOOp io_op, uint3 have_iostats = true; } +/* + * Prepares the io_time for pgstat_count_io_op_time() function. This returns + * the current time if the 'track_time_guc' is true, otherwise returns zero. + */ instr_time -pgstat_prepare_io_time(void) +pgstat_prepare_io_time(bool track_time_guc) { instr_time io_start; - if (track_io_timing) + if (track_time_guc) INSTR_TIME_SET_CURRENT(io_start); else + + /* + * If the time won't be tracked, there is no need to set io_start but + * compiler complains about uninitialized use. So, set it to zero. + */ INSTR_TIME_SET_ZERO(io_start); return io_start; -- 2.43.0