Greetings, * Heikki Linnakangas (hlinn...@iki.fi) wrote: > On 05/02/2021 23:22, Stephen Frost wrote: > >Unless there's anything else on this, I'll commit these sometime next > >week. > > One more thing: Instead of using 'effective_io_concurrency' GUC directly, > should call get_tablespace_maintenance_io_concurrency().
Ah, yeah, of course. Updated patch attached. Thanks! Stephen
From 5aa55b44230474e6e61873260e9595a5495a1094 Mon Sep 17 00:00:00 2001 From: Stephen Frost <sfr...@snowman.net> Date: Fri, 5 Feb 2021 15:59:02 -0500 Subject: [PATCH 1/2] Improve logging of auto-vacuum and auto-analyze When logging auto-vacuum and auto-analyze activity, include the I/O timing if track_io_timing is enabled. Also, for auto-analyze, add the read rate and the dirty rate, similar to how that information has historically been logged for auto-vacuum. Stephen Frost and Jakub Wartak Reviewed-By: Heikki Linnakangas Discussion: https://www.postgresql.org/message-id/VI1PR0701MB69603A433348EDCF783C6ECBF6EF0%40VI1PR0701MB6960.eurprd07.prod.outlook.com --- doc/src/sgml/config.sgml | 8 ++- src/backend/access/heap/vacuumlazy.c | 18 +++++ src/backend/commands/analyze.c | 100 +++++++++++++++++++++++++-- 3 files changed, 116 insertions(+), 10 deletions(-) diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml index 5ef1c7ad3c..2da2f2e32a 100644 --- a/doc/src/sgml/config.sgml +++ b/doc/src/sgml/config.sgml @@ -7411,9 +7411,11 @@ COPY postgres_log FROM '/full/path/to/logfile.csv' WITH csv; I/O timing information is displayed in <link linkend="monitoring-pg-stat-database-view"> <structname>pg_stat_database</structname></link>, in the output of - <xref linkend="sql-explain"/> when the <literal>BUFFERS</literal> option is - used, and by <xref linkend="pgstatstatements"/>. Only superusers can - change this setting. + <xref linkend="sql-explain"/> when the <literal>BUFFERS</literal> option + is used, by autovacuum for auto-vacuums and auto-analyzes, when + <xref linkend="guc-log-autovacuum-min-duration"/> is set and by + <xref linkend="pgstatstatements"/>. Only superusers can change this + setting. 
</para> </listitem> </varlistentry> diff --git a/src/backend/access/heap/vacuumlazy.c b/src/backend/access/heap/vacuumlazy.c index f3d2265fad..3a5e5a1ac2 100644 --- a/src/backend/access/heap/vacuumlazy.c +++ b/src/backend/access/heap/vacuumlazy.c @@ -440,6 +440,8 @@ heap_vacuum_rel(Relation onerel, VacuumParams *params, TransactionId new_frozen_xid; MultiXactId new_min_multi; ErrorContextCallback errcallback; + PgStat_Counter startreadtime = 0; + PgStat_Counter startwritetime = 0; Assert(params != NULL); Assert(params->index_cleanup != VACOPT_TERNARY_DEFAULT); @@ -454,6 +456,11 @@ heap_vacuum_rel(Relation onerel, VacuumParams *params, { pg_rusage_init(&ru0); starttime = GetCurrentTimestamp(); + if (track_io_timing) + { + startreadtime = pgStatBlockReadTime; + startwritetime = pgStatBlockWriteTime; + } } if (params->options & VACOPT_VERBOSE) @@ -674,6 +681,17 @@ heap_vacuum_rel(Relation onerel, VacuumParams *params, (long long) VacuumPageDirty); appendStringInfo(&buf, _("avg read rate: %.3f MB/s, avg write rate: %.3f MB/s\n"), read_rate, write_rate); + if (track_io_timing) + { + appendStringInfoString(&buf, _("I/O Timings:")); + if (pgStatBlockReadTime - startreadtime > 0) + appendStringInfo(&buf, _(" read=%.3f"), + (double) (pgStatBlockReadTime - startreadtime) / 1000); + if (pgStatBlockWriteTime - startwritetime > 0) + appendStringInfo(&buf, _(" write=%.3f"), + (double) (pgStatBlockWriteTime - startwritetime) / 1000); + appendStringInfoChar(&buf, '\n'); + } appendStringInfo(&buf, _("system usage: %s\n"), pg_rusage_show(&ru0)); appendStringInfo(&buf, _("WAL usage: %ld records, %ld full page images, %llu bytes"), diff --git a/src/backend/commands/analyze.c b/src/backend/commands/analyze.c index 7295cf0215..0864cda80e 100644 --- a/src/backend/commands/analyze.c +++ b/src/backend/commands/analyze.c @@ -312,6 +312,11 @@ do_analyze_rel(Relation onerel, VacuumParams *params, Oid save_userid; int save_sec_context; int save_nestlevel; + int64 AnalyzePageHit = 
VacuumPageHit; + int64 AnalyzePageMiss = VacuumPageMiss; + int64 AnalyzePageDirty = VacuumPageDirty; + PgStat_Counter startreadtime = 0; + PgStat_Counter startwritetime = 0; if (inh) ereport(elevel, @@ -346,8 +351,14 @@ do_analyze_rel(Relation onerel, VacuumParams *params, /* measure elapsed time iff autovacuum logging requires it */ if (IsAutoVacuumWorkerProcess() && params->log_min_duration >= 0) { + if (track_io_timing) + { + startreadtime = pgStatBlockReadTime; + startwritetime = pgStatBlockWriteTime; + } + pg_rusage_init(&ru0); - if (params->log_min_duration > 0) + if (params->log_min_duration >= 0) starttime = GetCurrentTimestamp(); } @@ -682,15 +693,90 @@ do_analyze_rel(Relation onerel, VacuumParams *params, /* Log the action if appropriate */ if (IsAutoVacuumWorkerProcess() && params->log_min_duration >= 0) { + TimestampTz endtime = GetCurrentTimestamp(); + if (params->log_min_duration == 0 || - TimestampDifferenceExceeds(starttime, GetCurrentTimestamp(), + TimestampDifferenceExceeds(starttime, endtime, params->log_min_duration)) + { + long delay_in_ms; + double read_rate = 0; + double write_rate = 0; + StringInfoData buf; + + /* + * Calculate the difference in the Page Hit/Miss/Dirty that + * happened as part of the analyze by subtracting out the + * pre-analyze values which we saved above. + */ + AnalyzePageHit = VacuumPageHit - AnalyzePageHit; + AnalyzePageMiss = VacuumPageMiss - AnalyzePageMiss; + AnalyzePageDirty = VacuumPageDirty - AnalyzePageDirty; + + /* + * We do not expect an analyze to take > 25 days and it simplifies + * things a bit to use TimestampDifferenceMilliseconds. 
+ */ + delay_in_ms = TimestampDifferenceMilliseconds(starttime, endtime); + + /* + * Note that we are reporting these read/write rates in the same + * manner as VACUUM does, which means that while the 'average read + * rate' here actually corresponds to page misses and resulting + * reads which are also picked up by track_io_timing, if enabled, + * the 'average write rate' is actually talking about the rate of + * pages being dirtied, not being written out, so it's typical to + * have a non-zero 'avg write rate' while I/O Timings only reports + * reads. + * + * It's not clear that an ANALYZE will ever result in + * FlushBuffer() being called, but we track and support reporting + * on I/O write time in case that changes as it's practically free + * to do so anyway. + */ + + if (delay_in_ms > 0) + { + read_rate = (double) BLCKSZ * AnalyzePageMiss / (1024 * 1024) / + (delay_in_ms / 1000.0); + write_rate = (double) BLCKSZ * AnalyzePageDirty / (1024 * 1024) / + (delay_in_ms / 1000.0); + } + + /* + * We split this up so we don't emit empty I/O timing values when + * track_io_timing isn't enabled. 
+ */ + + initStringInfo(&buf); + appendStringInfo(&buf, _("automatic analyze of table \"%s.%s.%s\"\n"), + get_database_name(MyDatabaseId), + get_namespace_name(RelationGetNamespace(onerel)), + RelationGetRelationName(onerel)); + appendStringInfo(&buf, _("buffer usage: %lld hits, %lld misses, %lld dirtied\n"), + (long long) AnalyzePageHit, + (long long) AnalyzePageMiss, + (long long) AnalyzePageDirty); + appendStringInfo(&buf, _("avg read rate: %.3f MB/s, avg write rate: %.3f MB/s\n"), + read_rate, write_rate); + if (track_io_timing) + { + appendStringInfoString(&buf, _("I/O Timings:")); + if (pgStatBlockReadTime - startreadtime > 0) + appendStringInfo(&buf, _(" read=%.3f"), + (double) (pgStatBlockReadTime - startreadtime) / 1000); + if (pgStatBlockWriteTime - startwritetime > 0) + appendStringInfo(&buf, _(" write=%.3f"), + (double) (pgStatBlockWriteTime - startwritetime) / 1000); + appendStringInfoChar(&buf, '\n'); + } + appendStringInfo(&buf, _("system usage: %s"), pg_rusage_show(&ru0)); + ereport(LOG, - (errmsg("automatic analyze of table \"%s.%s.%s\" system usage: %s", - get_database_name(MyDatabaseId), - get_namespace_name(RelationGetNamespace(onerel)), - RelationGetRelationName(onerel), - pg_rusage_show(&ru0)))); + (errmsg_internal("%s", buf.data))); + + pfree(buf.data); + } } /* Roll back any GUC changes executed by index functions */ -- 2.25.1 From f03fad7422e0e2f88b6d962ff8c7b174a3b90d79 Mon Sep 17 00:00:00 2001 From: Stephen Frost <sfr...@snowman.net> Date: Fri, 5 Feb 2021 16:05:07 -0500 Subject: [PATCH 2/2] Use pre-fetching for ANALYZE When we have posix_fadvise() available, we can improve the performance of an ANALYZE by quite a bit by using it to inform the kernel of the blocks that we're going to be asking for. Similar to bitmap index scans, the number of buffers pre-fetched is based off of the maintenance_io_concurrency setting (for the particular tablespace or, if not set, globally, via get_tablespace_maintenance_io_concurrency()). 
Reviewed-By: Heikki Linnakangas Discussion: https://www.postgresql.org/message-id/VI1PR0701MB69603A433348EDCF783C6ECBF6EF0%40VI1PR0701MB6960.eurprd07.prod.outlook.com --- src/backend/commands/analyze.c | 76 +++++++++++++++++++++++++++++++++- 1 file changed, 74 insertions(+), 2 deletions(-) diff --git a/src/backend/commands/analyze.c b/src/backend/commands/analyze.c index 0864cda80e..4de08ad923 100644 --- a/src/backend/commands/analyze.c +++ b/src/backend/commands/analyze.c @@ -63,6 +63,7 @@ #include "utils/pg_rusage.h" #include "utils/sampling.h" #include "utils/sortsupport.h" +#include "utils/spccache.h" #include "utils/syscache.h" #include "utils/timestamp.h" @@ -1113,10 +1114,12 @@ acquire_sample_rows(Relation onerel, int elevel, double *totalrows, double *totaldeadrows) { int numrows = 0; /* # rows now in reservoir */ + int prefetch_maximum = 0; /* blocks to prefetch if enabled */ double samplerows = 0; /* total # rows collected */ double liverows = 0; /* # live rows seen */ double deadrows = 0; /* # dead rows seen */ double rowstoskip = -1; /* -1 means not set yet */ + long randseed; /* Seed for block sampler(s) */ BlockNumber totalblocks; TransactionId OldestXmin; BlockSamplerData bs; @@ -1125,6 +1128,9 @@ acquire_sample_rows(Relation onerel, int elevel, TableScanDesc scan; BlockNumber nblocks; BlockNumber blksdone = 0; +#ifdef USE_PREFETCH + BlockSamplerData prefetch_bs; +#endif Assert(targrows > 0); @@ -1134,7 +1140,15 @@ acquire_sample_rows(Relation onerel, int elevel, OldestXmin = GetOldestNonRemovableTransactionId(onerel); /* Prepare for sampling block numbers */ - nblocks = BlockSampler_Init(&bs, totalblocks, targrows, random()); + randseed = random(); + nblocks = BlockSampler_Init(&bs, totalblocks, targrows, randseed); + +#ifdef USE_PREFETCH + prefetch_maximum = get_tablespace_maintenance_io_concurrency(onerel->rd_rel->reltablespace); + /* Create another BlockSampler, using the same seed, for prefetching */ + if (prefetch_maximum) + (void) 
BlockSampler_Init(&prefetch_bs, totalblocks, targrows, randseed); +#endif /* Report sampling block numbers */ pgstat_progress_update_param(PROGRESS_ANALYZE_BLOCKS_TOTAL, @@ -1146,14 +1160,72 @@ acquire_sample_rows(Relation onerel, int elevel, scan = table_beginscan_analyze(onerel); slot = table_slot_create(onerel, NULL); +#ifdef USE_PREFETCH + + /* + * If we are doing prefetching, then go ahead and tell the kernel about + * the first set of pages we are going to want. This also moves our + * iterator out ahead of the main one being used, where we will keep it so + * that we're always pre-fetching out prefetch_maximum number of blocks + * ahead. + */ + if (prefetch_maximum) + { + for (int i = 0; i < prefetch_maximum; i++) + { + BlockNumber prefetch_block; + + if (!BlockSampler_HasMore(&prefetch_bs)) + break; + + prefetch_block = BlockSampler_Next(&prefetch_bs); + PrefetchBuffer(scan->rs_rd, MAIN_FORKNUM, prefetch_block); + } + } +#endif + /* Outer loop over blocks to sample */ while (BlockSampler_HasMore(&bs)) { + bool block_accepted; +#ifdef USE_PREFETCH + BlockNumber prefetch_targblock = InvalidBlockNumber; +#endif BlockNumber targblock = BlockSampler_Next(&bs); +#ifdef USE_PREFETCH + + /* + * Make sure that every time the main BlockSampler is moved forward + * that our prefetch BlockSampler also gets moved forward, so that we + * always stay out ahead. + */ + if (prefetch_maximum && BlockSampler_HasMore(&prefetch_bs)) + prefetch_targblock = BlockSampler_Next(&prefetch_bs); +#endif + vacuum_delay_point(); - if (!table_scan_analyze_next_block(scan, targblock, vac_strategy)) + block_accepted = table_scan_analyze_next_block(scan, targblock, vac_strategy); + +#ifdef USE_PREFETCH + + /* + * When pre-fetching, after we get a block, tell the kernel about the + * next one we will want, if there's any left. + * + * We want to do this even if the table_scan_analyze_next_block() call + * above decides against analyzing the block it picked. 
+ */ + if (prefetch_maximum && prefetch_targblock != InvalidBlockNumber) + PrefetchBuffer(scan->rs_rd, MAIN_FORKNUM, prefetch_targblock); +#endif + + /* + * Don't analyze if table_scan_analyze_next_block() indicated this + * block is unsuitable for analyzing. + */ + if (!block_accepted) continue; while (table_scan_analyze_next_tuple(scan, OldestXmin, &liverows, &deadrows, slot)) -- 2.25.1
signature.asc
Description: PGP signature