On 2020/01/13 4:11, Pavel Stehule wrote:
> The following review has been posted through the commitfest application:
> make installcheck-world:  tested, passed
> Implements feature:       tested, passed
> Spec compliant:           not tested
> Documentation:            tested, passed
> 
> I like this patch, because I used similar functionality some years ago very 
> successfully. The implementation is almost simple, and the result should be 
> valid by used method.

Thanks for your review!

> The potential problem is performance impact. Very early test show impact cca 
> 3% worst case, but I'll try to repeat these tests.

Yes, performance impact is the main concern. I want to know how it 
affects performance in various test cases or on various environments.

> There are some ending whitespaces and useless tabs.
> 
> The new status of this patch is: Waiting on Author
I attach v4 patches that remove the trailing whitespace at the ends of lines
and the useless tabs.

--
Yoshikazu Imai
From b009b1f8f6be47ae61b5e4538e2730d721ee60db Mon Sep 17 00:00:00 2001
From: "imai.yoshikazu" <imai.yoshik...@jp.fujitsu.com>
Date: Wed, 15 Jan 2020 09:13:19 +0000
Subject: [PATCH v4 1/2] Add pg_stat_waitaccum view.

pg_stat_waitaccum shows the count and total duration of each wait event.
Each backend/background process counts wait events and measures their
duration in every pgstat_report_wait_start and pgstat_report_wait_end call.
The processes store this information in local variables and send it to the
Statistics Collector, from which it can then be retrieved.

To reduce overhead, I implemented a static hash instead of a dynamic
hash. I also implemented track_wait_timing, which determines whether
wait event durations are collected.

On Windows, this feature might not work correctly, because it currently
initializes local variables in pg_stat_init, and those are not passed to
the forked processes on Windows.
---
 src/backend/catalog/system_views.sql          |   8 +
 src/backend/postmaster/pgstat.c               | 344 ++++++++++++++++++++++++++
 src/backend/storage/lmgr/lwlock.c             |  19 ++
 src/backend/utils/adt/pgstatfuncs.c           |  80 ++++++
 src/backend/utils/misc/guc.c                  |   9 +
 src/backend/utils/misc/postgresql.conf.sample |   1 +
 src/include/catalog/pg_proc.dat               |   9 +
 src/include/pgstat.h                          | 123 ++++++++-
 src/include/storage/lwlock.h                  |   1 +
 src/include/storage/proc.h                    |   1 +
 src/test/regress/expected/rules.out           |   5 +
 11 files changed, 598 insertions(+), 2 deletions(-)

diff --git a/src/backend/catalog/system_views.sql 
b/src/backend/catalog/system_views.sql
index 773edf8..80f2caa 100644
--- a/src/backend/catalog/system_views.sql
+++ b/src/backend/catalog/system_views.sql
@@ -957,6 +957,14 @@ CREATE VIEW pg_stat_bgwriter AS
         pg_stat_get_buf_alloc() AS buffers_alloc,
         pg_stat_get_bgwriter_stat_reset_time() AS stats_reset;
 
+CREATE VIEW pg_stat_waitaccum AS
+    SELECT
+        S.wait_event_type AS wait_event_type,
+        S.wait_event AS wait_event,
+        S.calls AS calls,
+        S.times AS times
+    FROM pg_stat_get_waitaccum(NULL) AS S;
+
 CREATE VIEW pg_stat_progress_vacuum AS
     SELECT
         S.pid AS pid, S.datid AS datid, D.datname AS datname,
diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c
index 51c486b..08e10ad 100644
--- a/src/backend/postmaster/pgstat.c
+++ b/src/backend/postmaster/pgstat.c
@@ -123,6 +123,7 @@
  */
 bool           pgstat_track_activities = false;
 bool           pgstat_track_counts = false;
+bool           pgstat_track_wait_timing = false;
 int                    pgstat_track_functions = TRACK_FUNC_OFF;
 int                    pgstat_track_activity_query_size = 1024;
 
@@ -153,6 +154,10 @@ static time_t last_pgstat_start_time;
 
 static bool pgStatRunningInCollector = false;
 
+WAHash *wa_hash;
+
+instr_time waitStart;
+
 /*
  * Structures in which backends store per-table info that's waiting to be
  * sent to the collector.
@@ -255,6 +260,7 @@ static int  localNumBackends = 0;
  */
 static PgStat_ArchiverStats archiverStats;
 static PgStat_GlobalStats globalStats;
+static PgStat_WaitAccumStats waitAccumStats;
 
 /*
  * List of OIDs of databases we need to write out.  If an entry is InvalidOid,
@@ -280,6 +286,8 @@ static pid_t pgstat_forkexec(void);
 #endif
 
 NON_EXEC_STATIC void PgstatCollectorMain(int argc, char *argv[]) 
pg_attribute_noreturn();
+static void pgstat_init_waitaccum_hash(WAHash **hash);
+static PgStat_WaitAccumEntry *pgstat_add_wa_entry(WAHash *hash, uint32 key);
 static void pgstat_beshutdown_hook(int code, Datum arg);
 
 static PgStat_StatDBEntry *pgstat_get_db_entry(Oid databaseid, bool create);
@@ -287,8 +295,11 @@ static PgStat_StatTabEntry 
*pgstat_get_tab_entry(PgStat_StatDBEntry *dbentry,
                                                                                
                 Oid tableoid, bool create);
 static void pgstat_write_statsfiles(bool permanent, bool allDbs);
 static void pgstat_write_db_statsfile(PgStat_StatDBEntry *dbentry, bool 
permanent);
+static void pgstat_write_waitaccum_statsfile(FILE *fpout);
 static HTAB *pgstat_read_statsfiles(Oid onlydb, bool permanent, bool deep);
 static void pgstat_read_db_statsfile(Oid databaseid, HTAB *tabhash, HTAB 
*funchash, bool permanent);
+static bool pgstat_read_waitaccum_statsfile(PgStat_WaitAccumStats *stats,
+                                                                               
        FILE *fpin, const char *statfile);
 static void backend_read_statsfile(void);
 static void pgstat_read_current_status(void);
 
@@ -324,6 +335,7 @@ static void pgstat_recv_vacuum(PgStat_MsgVacuum *msg, int 
len);
 static void pgstat_recv_analyze(PgStat_MsgAnalyze *msg, int len);
 static void pgstat_recv_archiver(PgStat_MsgArchiver *msg, int len);
 static void pgstat_recv_bgwriter(PgStat_MsgBgWriter *msg, int len);
+static void pgstat_recv_waitaccum(PgStat_MsgWaitAccum *msg, int len);
 static void pgstat_recv_funcstat(PgStat_MsgFuncstat *msg, int len);
 static void pgstat_recv_funcpurge(PgStat_MsgFuncpurge *msg, int len);
 static void pgstat_recv_recoveryconflict(PgStat_MsgRecoveryConflict *msg, int 
len);
@@ -331,6 +343,27 @@ static void pgstat_recv_deadlock(PgStat_MsgDeadlock *msg, 
int len);
 static void pgstat_recv_checksum_failure(PgStat_MsgChecksumFailure *msg, int 
len);
 static void pgstat_recv_tempfile(PgStat_MsgTempFile *msg, int len);
 
+
+/*
+ * pgstat_get_wa_entry() -
+ *
+ *     Look up the entry stored under "key" in the given wait-accum hash.
+ *     Returns the matching PgStat_WaitAccumEntry, or NULL if the chain of
+ *     the bucket selected by the key holds no element with that key.
+ */
+PgStat_WaitAccumEntry *
+pgstat_get_wa_entry(WAHash *hash, uint32 key)
+{
+       WAEntry    *item;
+
+       /* The bucket is chosen by the key modulo the number of buckets. */
+       for (item = hash->buckets[key % WA_BUCKET_SIZE];
+                item != NULL;
+                item = item->next)
+       {
+               if (item->key == key)
+                       return item->entry;
+       }
+
+       return NULL;
+}
+
+
 /* ------------------------------------------------------------
  * Public functions called from postmaster follow
  * ------------------------------------------------------------
@@ -602,6 +635,8 @@ retry2:
 
        pg_freeaddrinfo_all(hints.ai_family, addrs);
 
+       pgstat_init_waitaccum_hash(&wa_hash);
+
        return;
 
 startup_failed:
@@ -624,6 +659,75 @@ startup_failed:
        SetConfigOption("track_counts", "off", PGC_INTERNAL, PGC_S_OVERRIDE);
 }
 
+/*
+ * pgstat_add_wa_entry() -
+ *
+ *     Append a new, zero-initialized PgStat_WaitAccumEntry for "key" to the
+ *     hash and return it.  No duplicate detection is done here; callers look
+ *     the key up with pgstat_get_wa_entry() first when that matters.
+ *
+ *     The entry is allocated in the memory context that owns the hash itself,
+ *     so that it shares the hash's lifetime instead of always ending up in
+ *     TopMemoryContext, even for hashes created in a shorter-lived context.
+ *
+ *     Raises an ERROR if all WA_BUCKET_SIZE slots of entries[] are in use.
+ */
+static PgStat_WaitAccumEntry *
+pgstat_add_wa_entry(WAHash *hash, uint32 key)
+{
+       WAEntry    *tail;
+       WAEntry    *slot;
+       PgStat_WaitAccumEntry *entry;
+       int                     bucket = key % WA_BUCKET_SIZE;
+
+       /*
+        * entries[] is a fixed-size array; refuse to write past its end rather
+        * than silently corrupting the memory that follows it.
+        */
+       if (hash->entry_num >= WA_BUCKET_SIZE)
+               elog(ERROR, "too many wait event entries (maximum is %d)",
+                        WA_BUCKET_SIZE);
+
+       /*
+        * Allocate before consuming a slot, so that a failed allocation cannot
+        * leave behind a counted slot whose entry pointer is still NULL.
+        */
+       entry = MemoryContextAllocZero(GetMemoryChunkContext(hash),
+                                                                  sizeof(PgStat_WaitAccumEntry));
+
+       /* Find the last element of the bucket's chain, if there is one. */
+       tail = hash->buckets[bucket];
+       while (tail != NULL && tail->next != NULL)
+               tail = tail->next;
+
+       /* Take the next unused slot of the entries array. */
+       slot = &hash->entries[hash->entry_num++];
+       slot->key = key;
+       slot->entry = entry;
+       slot->next = NULL;
+
+       /* Link the slot at the end of the chain, or start a new chain. */
+       if (tail != NULL)
+               tail->next = slot;
+       else
+               hash->buckets[bucket] = slot;
+
+       return entry;
+}
+
+/*
+ * Register one wait event in the hash and record its wait_event_info in the
+ * newly added entry.
+ */
+static void
+pgstat_init_waitaccum_entry(WAHash *hash, uint32 wait_event_info)
+{
+       pgstat_add_wa_entry(hash, wait_event_info)->wait_event_info = wait_event_info;
+}
+
+/*
+ * pgstat_init_waitaccum_hash() -
+ *
+ *     Allocate a zeroed WAHash in TopMemoryContext, hand it back through
+ *     *hash, and register an entry for every wait event in the LWLock, Lock,
+ *     BufferPin, Activity, Client, IPC, Timeout and IO ranges, in that order.
+ *     Extension wait events are not registered here.
+ */
+static void
+pgstat_init_waitaccum_hash(WAHash **hash)
+{
+       /* Inclusive wait_event_info ranges whose bounds are constants. */
+       static const struct
+       {
+               uint32          first;
+               uint32          last;
+       }                       fixed_ranges[] = {
+               {PG_WAIT_LOCK | LOCKTAG_RELATION, PG_WAIT_LOCK | LOCKTAG_LAST_TYPE},
+               {PG_WAIT_BUFFER_PIN, PG_WAIT_BUFFER_PIN},
+               {PG_WAIT_ACTIVITY, PG_WAIT_ACTIVITY_LAST_TYPE},
+               {PG_WAIT_CLIENT, PG_WAIT_CLIENT_LAST_TYPE},
+               {PG_WAIT_IPC, PG_WAIT_IPC_LAST_TYPE},
+               {PG_WAIT_TIMEOUT, PG_WAIT_TIMEOUT_LAST_TYPE},
+               {PG_WAIT_IO, PG_WAIT_IO_LAST_TYPE}
+       };
+       WAHash     *table;
+       uint32          lwlock_last;
+       uint32          event;
+       size_t          r;
+
+       table = MemoryContextAllocZero(TopMemoryContext, sizeof(WAHash));
+       *hash = table;
+
+       /* The upper bound of the LWLock range is only known at run time. */
+       lwlock_last = PG_WAIT_LWLOCK | LWLockGetLastTrancheId();
+       for (event = PG_WAIT_LWLOCK + 1; event <= lwlock_last; event++)
+               pgstat_init_waitaccum_entry(table, event);
+
+       /* All remaining ranges are fixed; register them in table order. */
+       for (r = 0; r < sizeof(fixed_ranges) / sizeof(fixed_ranges[0]); r++)
+       {
+               for (event = fixed_ranges[r].first; event <= fixed_ranges[r].last; event++)
+                       pgstat_init_waitaccum_entry(table, event);
+       }
+}
+
 /*
  * subroutine for pgstat_reset_all
  */
@@ -904,6 +1008,9 @@ pgstat_report_stat(bool force)
 
        /* Now, send function statistics */
        pgstat_send_funcstats();
+
+       /* Send wait accumulative statistics */
+       pgstat_send_waitaccum();
 }
 
 /*
@@ -1334,6 +1441,8 @@ pgstat_reset_shared_counters(const char *target)
                msg.m_resettarget = RESET_ARCHIVER;
        else if (strcmp(target, "bgwriter") == 0)
                msg.m_resettarget = RESET_BGWRITER;
+       else if (strcmp(target, "waitaccum") == 0)
+               msg.m_resettarget = RESET_WAITACCUM;
        else
                ereport(ERROR,
                                (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
@@ -2618,6 +2727,22 @@ pgstat_fetch_global(void)
        return &globalStats;
 }
 
+/*
+ * ---------
+ * pgstat_fetch_stat_waitaccum() -
+ *
+ *     Support function for the SQL-callable pgstat* functions. Calls
+ *     backend_read_statsfile() and then returns a pointer to the
+ *     file-level wait accum statistics struct (waitAccumStats).
+ * ---------
+ */
+PgStat_WaitAccumStats *
+pgstat_fetch_stat_waitaccum(void)
+{
+       backend_read_statsfile();
+
+       return &waitAccumStats;
+}
+
 
 /* ------------------------------------------------------------
  * Functions for management of the shared-memory PgBackendStatus array
@@ -4410,6 +4535,53 @@ pgstat_send_bgwriter(void)
        MemSet(&BgWriterStats, 0, sizeof(BgWriterStats));
 }
 
+/* ----------
+ * pgstat_send_waitaccum() -
+ *
+ *     Send the wait event counters accumulated in this process's wa_hash to
+ *     the collector and reset them locally.  Entries whose call count is
+ *     zero are skipped.  Does nothing if wa_hash has not been set up.
+ * ----------
+ */
+void
+pgstat_send_waitaccum(void)
+{
+/* Size in bytes of a message carrying the first "n" elements of m_entry[]. */
+#define WAITACCUM_MSG_SIZE(n) \
+       (offsetof(PgStat_MsgWaitAccum, m_entry[0]) + \
+        (n) * sizeof(PgStat_WaitAccumEntry))
+
+       PgStat_MsgWaitAccum msg;
+       int                     i;
+
+       if (wa_hash == NULL)
+               return;
+
+       pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_WAITACCUM);
+       msg.m_nentries = 0;
+
+       for (i = 0; i < wa_hash->entry_num; i++)
+       {
+               PgStat_WaitAccumEntry *entry = wa_hash->entries[i].entry;
+
+               /* Send only wait events that have occurred. */
+               if (entry->calls == 0)
+                       continue;
+
+               memcpy(&msg.m_entry[msg.m_nentries], entry,
+                          sizeof(PgStat_WaitAccumEntry));
+
+               /* Flush the message as soon as it is full and start a new one. */
+               if (++msg.m_nentries >= PGSTAT_NUM_WAITACCUMENTRIES)
+               {
+                       pgstat_send(&msg, WAITACCUM_MSG_SIZE(msg.m_nentries));
+                       msg.m_nentries = 0;
+               }
+
+               /* The values are now in the message; clear the local counters. */
+               entry->calls = 0;
+               INSTR_TIME_SET_ZERO(entry->times);
+       }
+
+       /* Send whatever is left in a partially filled message. */
+       if (msg.m_nentries > 0)
+               pgstat_send(&msg, WAITACCUM_MSG_SIZE(msg.m_nentries));
+
+#undef WAITACCUM_MSG_SIZE
+}
+
 
 /* ----------
  * PgstatCollectorMain() -
@@ -4602,6 +4774,10 @@ PgstatCollectorMain(int argc, char *argv[])
                                        pgstat_recv_bgwriter(&msg.msg_bgwriter, 
len);
                                        break;
 
+                               case PGSTAT_MTYPE_WAITACCUM:
+                                       
pgstat_recv_waitaccum(&msg.msg_waitaccum, len);
+                                       break;
+
                                case PGSTAT_MTYPE_FUNCSTAT:
                                        pgstat_recv_funcstat(&msg.msg_funcstat, 
len);
                                        break;
@@ -4872,6 +5048,8 @@ pgstat_write_statsfiles(bool permanent, bool allDbs)
        rc = fwrite(&archiverStats, sizeof(archiverStats), 1, fpout);
        (void) rc;                                      /* we'll check for 
error with ferror */
 
+       pgstat_write_waitaccum_statsfile(fpout);
+
        /*
         * Walk through the database table.
         */
@@ -5077,6 +5255,43 @@ pgstat_write_db_statsfile(PgStat_StatDBEntry *dbentry, 
bool permanent)
 }
 
 /* ----------
+ * pgstat_write_waitaccum_statsfile() -
+ *             Write the waitAccumStats to the stat file.
+ *
+ * ----------
+ */
+static void
+pgstat_write_waitaccum_statsfile(FILE *fpout)
+{
+       PgStat_WaitAccumEntry *entry;
+       WAHash *hash = waitAccumStats.hash;
+       int                     rc;
+       int                     i;
+
+       /*
+        * Walk through the waitaccum hash in the order the entries were added.
+        */
+       for (i = 0; i < hash->entry_num; i++)
+       {
+               entry = hash->entries[i].entry;
+
+               /* Write only wait events that have occurred. */
+               if (entry->calls == 0)
+                       continue;
+
+               /*
+                * Write out the wait event entry as a 'D' marker byte followed by
+                * the raw PgStat_WaitAccumEntry struct.
+                */
+               fputc('D', fpout);
+               rc = fwrite(entry, sizeof(PgStat_WaitAccumEntry), 1, fpout);
+               (void) rc;                              /* we'll check for error with ferror */
+       }
+
+       fputc('E', fpout);      /* marks the end of the wait event entries */
+}
+
+/* ----------
  * pgstat_read_statsfiles() -
  *
  *     Reads in some existing statistics collector files and returns the
@@ -5129,6 +5344,7 @@ pgstat_read_statsfiles(Oid onlydb, bool permanent, bool 
deep)
         */
        memset(&globalStats, 0, sizeof(globalStats));
        memset(&archiverStats, 0, sizeof(archiverStats));
+       waitAccumStats.hash = MemoryContextAllocZero(pgStatLocalContext, 
sizeof(WAHash));
 
        /*
         * Set the current timestamp (will be kept only in case we can't load an
@@ -5199,6 +5415,9 @@ pgstat_read_statsfiles(Oid onlydb, bool permanent, bool 
deep)
                goto done;
        }
 
+       if(!pgstat_read_waitaccum_statsfile(&waitAccumStats, fpin, statfile))
+               goto done;
+
        /*
         * We found an existing collector stats file. Read it and put all the
         * hashtable entries into place.
@@ -5497,10 +5716,13 @@ pgstat_read_db_statsfile_timestamp(Oid databaseid, bool 
permanent,
        PgStat_StatDBEntry dbentry;
        PgStat_GlobalStats myGlobalStats;
        PgStat_ArchiverStats myArchiverStats;
+       PgStat_WaitAccumStats myWaitAccumStats;
        FILE       *fpin;
        int32           format_id;
        const char *statfile = permanent ? PGSTAT_STAT_PERMANENT_FILENAME : 
pgstat_stat_filename;
 
+       myWaitAccumStats.hash = MemoryContextAllocZero(CurrentMemoryContext, 
sizeof(WAHash));
+
        /*
         * Try to open the stats file.  As above, anything but ENOENT is worthy 
of
         * complaining about.
@@ -5551,6 +5773,9 @@ pgstat_read_db_statsfile_timestamp(Oid databaseid, bool 
permanent,
                return false;
        }
 
+       if(!pgstat_read_waitaccum_statsfile(&myWaitAccumStats, fpin, statfile))
+               return false;
+
        /* By default, we're going to return the timestamp of the global file. 
*/
        *ts = myGlobalStats.stats_timestamp;
 
@@ -5604,6 +5829,75 @@ done:
        return true;
 }
 
+/* ----------
+ * pgstat_read_waitaccum_statsfile() -
+ *
+ *     Reads the waitaccum stats from the file into stats->hash.
+ *
+ *     The data is a sequence of records, each a 'D' byte followed by one
+ *     PgStat_WaitAccumEntry struct, terminated by an 'E' byte.  Returns true
+ *     once the 'E' has been read.  A short read, a wait event that appears
+ *     twice, or any other marker byte is reported as a corrupted statistics
+ *     file (LOG in the collector, WARNING elsewhere) and makes the function
+ *     return false.
+ * ----------
+ */
+static bool
+pgstat_read_waitaccum_statsfile(PgStat_WaitAccumStats *stats,
+                                                               FILE *fpin, const char *statfile)
+{
+       WAHash     *hash = stats->hash;
+
+       for (;;)
+       {
+               PgStat_WaitAccumEntry buf;
+               PgStat_WaitAccumEntry *entry;
+
+               switch (fgetc(fpin))
+               {
+                               /*
+                                * 'D'  A PgStat_WaitAccumEntry struct describing one wait
+                                * event follows.
+                                */
+                       case 'D':
+                               if (fread(&buf, 1, sizeof(PgStat_WaitAccumEntry), fpin)
+                                       != sizeof(PgStat_WaitAccumEntry))
+                                       goto corrupted;
+
+                               /* Each wait event may appear only once in the file. */
+                               if (pgstat_get_wa_entry(hash, buf.wait_event_info) != NULL)
+                                       goto corrupted;
+
+                               /* Add it to the waitaccum hash. */
+                               entry = pgstat_add_wa_entry(hash, buf.wait_event_info);
+                               memcpy(entry, &buf, sizeof(PgStat_WaitAccumEntry));
+                               break;
+
+                               /*
+                                * 'E'  End of the wait event entries.
+                                */
+                       case 'E':
+                               return true;
+
+                       default:
+                               goto corrupted;
+               }
+       }
+
+corrupted:
+       ereport(pgStatRunningInCollector ? LOG : WARNING,
+                       (errmsg("corrupted statistics file \"%s\"",
+                                       statfile)));
+       return false;
+}
+
 /*
  * If not already done, read the statistics collector stats file into
  * some hash tables.  The results will be kept until pgstat_clear_snapshot()
@@ -6113,7 +6407,20 @@ 
pgstat_recv_resetsharedcounter(PgStat_MsgResetsharedcounter *msg, int len)
                memset(&archiverStats, 0, sizeof(archiverStats));
                archiverStats.stat_reset_timestamp = GetCurrentTimestamp();
        }
+       else if (msg->m_resettarget == RESET_WAITACCUM)
+       {
+               PgStat_WaitAccumEntry *entry;
+               WAHash *hash = waitAccumStats.hash;
+               int i;
+
+               for (i = 0; i < hash->entry_num; i++)
+               {
+                       entry = hash->entries[i].entry;
 
+                       entry->calls = 0;
+                       INSTR_TIME_SET_ZERO(entry->times);
+               }
+       }
        /*
         * Presumably the sender of this message validated the target, don't
         * complain here if it's not valid
@@ -6293,6 +6600,43 @@ pgstat_recv_bgwriter(PgStat_MsgBgWriter *msg, int len)
 }
 
 /* ----------
+ * pgstat_recv_waitaccum() -
+ *
+ *     Process a WAITACCUM message.
+ * ----------
+ */
+static void
+pgstat_recv_waitaccum(PgStat_MsgWaitAccum *msg, int len)
+{
+       WAHash     *hash = waitAccumStats.hash;
+       int                     n;
+
+       /*
+        * Process all wait event entries in the message.
+        */
+       for (n = 0; n < msg->m_nentries; n++)
+       {
+               PgStat_WaitAccumEntry *incoming = &msg->m_entry[n];
+               PgStat_WaitAccumEntry *known;
+
+               known = pgstat_get_wa_entry(hash, incoming->wait_event_info);
+
+               if (known != NULL)
+               {
+                       /* Already tracked: accumulate into the existing entry. */
+                       known->calls += incoming->calls;
+                       INSTR_TIME_ADD(known->times, incoming->times);
+                       continue;
+               }
+
+               /* First report of this wait event: start from the sent values. */
+               known = pgstat_add_wa_entry(hash, incoming->wait_event_info);
+               memcpy(known, incoming, sizeof(PgStat_WaitAccumEntry));
+       }
+}
+
+/* ----------
  * pgstat_recv_recoveryconflict() -
  *
  *     Process a RECOVERYCONFLICT message.
diff --git a/src/backend/storage/lmgr/lwlock.c 
b/src/backend/storage/lmgr/lwlock.c
index d07ce60..6f4eb19 100644
--- a/src/backend/storage/lmgr/lwlock.c
+++ b/src/backend/storage/lmgr/lwlock.c
@@ -594,6 +594,25 @@ LWLockNewTrancheId(void)
 }
 
 /*
+ * Get a last tranche ID.
+ */
+int
+LWLockGetLastTrancheId(void)
+{
+       /* The counter is the int stored immediately before MainLWLockArray. */
+       int                *counter = (int *) ((char *) MainLWLockArray - sizeof(int));
+       int                     last_id;
+
+       Assert(!lock_named_request_allowed);
+
+       /* Read the counter while holding ShmemLock. */
+       SpinLockAcquire(ShmemLock);
+       last_id = *counter;
+       SpinLockRelease(ShmemLock);
+
+       return last_id;
+}
+
+/*
  * Register a tranche ID in the lookup table for the current process.  This
  * routine will save a pointer to the tranche name passed as an argument,
  * so the name should be allocated in a backend-lifetime context
diff --git a/src/backend/utils/adt/pgstatfuncs.c 
b/src/backend/utils/adt/pgstatfuncs.c
index 3dbf604..bed7d01 100644
--- a/src/backend/utils/adt/pgstatfuncs.c
+++ b/src/backend/utils/adt/pgstatfuncs.c
@@ -1974,3 +1974,83 @@ pg_stat_get_archiver(PG_FUNCTION_ARGS)
        PG_RETURN_DATUM(HeapTupleGetDatum(
                                                                          
heap_form_tuple(tupdesc, values, nulls)));
 }
+
+/*
+ * pg_stat_get_waitaccum() -
+ *
+ *     Returns, as a materialized tuplestore, one row per entry in the wait
+ *     accum statistics: wait event type, wait event name, number of calls,
+ *     and the accumulated time as given by INSTR_TIME_GET_MICROSEC.
+ *
+ *     NOTE(review): no function argument is read here, so whatever the
+ *     caller passes is ignored.
+ */
+Datum
+pg_stat_get_waitaccum(PG_FUNCTION_ARGS)
+{
+#define PG_STAT_GET_WAITACCUM_COLS     4
+       PgStat_WaitAccumStats *waitaccum_stats;
+       PgStat_WaitAccumEntry *entry;
+       ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
+       TupleDesc       tupdesc;
+       Tuplestorestate *tupstore;
+       MemoryContext per_query_ctx;
+       MemoryContext oldcontext;
+       int                     i;
+
+       /* check to see if caller supports us returning a tuplestore */
+       if (rsinfo == NULL || !IsA(rsinfo, ReturnSetInfo))
+               ereport(ERROR,
+                               (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+                                errmsg("set-valued function called in context that cannot accept a set")));
+       if (!(rsinfo->allowedModes & SFRM_Materialize))
+               ereport(ERROR,
+                               (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+                                errmsg("materialize mode required, but it is not " \
+                                               "allowed in this context")));
+
+       /* Build a tuple descriptor for our result type */
+       if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE)
+               elog(ERROR, "return type must be a row type");
+
+       /* The tuplestore must live in the per-query context. */
+       per_query_ctx = rsinfo->econtext->ecxt_per_query_memory;
+       oldcontext = MemoryContextSwitchTo(per_query_ctx);
+
+       tupstore = tuplestore_begin_heap(true, false, work_mem);
+       rsinfo->returnMode = SFRM_Materialize;
+       rsinfo->setResult = tupstore;
+       rsinfo->setDesc = tupdesc;
+
+       MemoryContextSwitchTo(oldcontext);
+
+       /* Get the wait accum statistics */
+       waitaccum_stats = pgstat_fetch_stat_waitaccum();
+
+       for (i = 0; i < waitaccum_stats->hash->entry_num; i++)
+       {
+               Datum           values[PG_STAT_GET_WAITACCUM_COLS];
+               bool            nulls[PG_STAT_GET_WAITACCUM_COLS];
+               const char *wait_event_type;
+               const char *wait_event;
+
+               /* Initialise values and NULL flags arrays */
+               MemSet(values, 0, sizeof(values));
+               MemSet(nulls, 0, sizeof(nulls));
+
+               entry = waitaccum_stats->hash->entries[i].entry;
+
+               wait_event_type = pgstat_get_wait_event_type(entry->wait_event_info);
+               wait_event = pgstat_get_wait_event(entry->wait_event_info);
+
+               /*
+                * Emit SQL NULL instead of handing a NULL C string to
+                * CStringGetTextDatum, in case a lookup yields no name.
+                */
+               if (wait_event_type != NULL)
+                       values[0] = CStringGetTextDatum(wait_event_type);
+               else
+                       nulls[0] = true;
+
+               if (wait_event != NULL)
+                       values[1] = CStringGetTextDatum(wait_event);
+               else
+                       nulls[1] = true;
+
+               values[2] = Int64GetDatum(entry->calls);
+
+               /* The result column is int8, so build a signed 64-bit datum. */
+               values[3] = Int64GetDatum((int64) INSTR_TIME_GET_MICROSEC(entry->times));
+
+               tuplestore_putvalues(tupstore, tupdesc, values, nulls);
+       }
+
+       /* clean up and return the tuplestore */
+       tuplestore_donestoring(tupstore);
+
+       return (Datum) 0;
+}
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index e5f8a13..2924472 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -1424,6 +1424,15 @@ static struct config_bool ConfigureNamesBool[] =
                false,
                NULL, NULL, NULL
        },
+       {
+               {"track_wait_timing", PGC_SUSET, STATS_COLLECTOR,
+                       gettext_noop("Collects timing statistics for wait 
events."),
+                       NULL
+               },
+               &pgstat_track_wait_timing,
+               false,
+               NULL, NULL, NULL
+       },
 
        {
                {"update_process_title", PGC_SUSET, PROCESS_TITLE,
diff --git a/src/backend/utils/misc/postgresql.conf.sample 
b/src/backend/utils/misc/postgresql.conf.sample
index e1048c0..3a99182 100644
--- a/src/backend/utils/misc/postgresql.conf.sample
+++ b/src/backend/utils/misc/postgresql.conf.sample
@@ -570,6 +570,7 @@
 #track_activities = on
 #track_counts = on
 #track_io_timing = off
+#track_wait_timing = off
 #track_functions = none                        # none, pl, all
 #track_activity_query_size = 1024      # (change requires restart)
 #stats_temp_directory = 'pg_stat_tmp'
diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat
index 427faa3..4e5a502 100644
--- a/src/include/catalog/pg_proc.dat
+++ b/src/include/catalog/pg_proc.dat
@@ -5159,6 +5159,15 @@
   proargmodes => 
'{i,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o}',
   proargnames => 
'{pid,datid,pid,usesysid,application_name,state,query,wait_event_type,wait_event,xact_start,query_start,backend_start,state_change,client_addr,client_hostname,client_port,backend_xid,backend_xmin,backend_type,ssl,sslversion,sslcipher,sslbits,sslcompression,ssl_client_dn,ssl_client_serial,ssl_issuer_dn,gss_auth,gss_princ,gss_enc}',
   prosrc => 'pg_stat_get_activity' },
+{ oid => '2228',
+  descr => 'statistics: information about accumulative data of wait event',
+  proname => 'pg_stat_get_waitaccum', prorows => '200', proisstrict => 'f',
+  proretset => 't', provolatile => 's', proparallel => 'r',
+  prorettype => 'record', proargtypes => 'int4',
+  proallargtypes => '{int4,text,text,int8,int8}',
+  proargmodes => '{i,o,o,o,o}',
+  proargnames => '{pid,wait_event_type,wait_event,calls,times}',
+  prosrc => 'pg_stat_get_waitaccum' },
 { oid => '3318',
   descr => 'statistics: information about progress of backends running 
maintenance command',
   proname => 'pg_stat_get_progress_info', prorows => '100', proretset => 't',
diff --git a/src/include/pgstat.h b/src/include/pgstat.h
index e5a5d02..f90bb44 100644
--- a/src/include/pgstat.h
+++ b/src/include/pgstat.h
@@ -59,6 +59,7 @@ typedef enum StatMsgType
        PGSTAT_MTYPE_ANALYZE,
        PGSTAT_MTYPE_ARCHIVER,
        PGSTAT_MTYPE_BGWRITER,
+       PGSTAT_MTYPE_WAITACCUM,
        PGSTAT_MTYPE_FUNCSTAT,
        PGSTAT_MTYPE_FUNCPURGE,
        PGSTAT_MTYPE_RECOVERYCONFLICT,
@@ -119,7 +120,8 @@ typedef struct PgStat_TableCounts
 typedef enum PgStat_Shared_Reset_Target
 {
        RESET_ARCHIVER,
-       RESET_BGWRITER
+       RESET_BGWRITER,
+       RESET_WAITACCUM
 } PgStat_Shared_Reset_Target;
 
 /* Possible object types for resetting single counters */
@@ -423,6 +425,33 @@ typedef struct PgStat_MsgBgWriter
 } PgStat_MsgBgWriter;
 
 /* ----------
+ * PgStat_WaitAccumEntry       Entry in backend/background's 
per-wait_event_info hash table
+ * ----------
+ */
+typedef struct PgStat_WaitAccumEntry
+{
+       uint32                  wait_event_info;        /* wait event this entry belongs to; used as the hash key */
+       PgStat_Counter  calls;          /* call counter; entries with 0 are not sent or written */
+       instr_time              times;          /* accumulated time, summed with INSTR_TIME_ADD */
+} PgStat_WaitAccumEntry;
+
+/* ----------
+ * PgStat_MsgWaitAccum Sent by a backend/background process to update
+ *                                     wait event statistics.
+ *
+ * PGSTAT_NUM_WAITACCUMENTRIES is the number of PgStat_WaitAccumEntry
+ * structs that fit into the message payload after the entry counter.
+ * ----------
+ */
+#define PGSTAT_NUM_WAITACCUMENTRIES    \
+       ((PGSTAT_MSG_PAYLOAD - sizeof(int))  \
+        / sizeof(PgStat_WaitAccumEntry))
+
+typedef struct PgStat_MsgWaitAccum
+{
+       PgStat_MsgHdr m_hdr;
+
+       int m_nentries; /* number of elements of m_entry[] that are filled in */
+       PgStat_WaitAccumEntry m_entry[PGSTAT_NUM_WAITACCUMENTRIES];
+} PgStat_MsgWaitAccum;
+
+/* ----------
  * PgStat_MsgRecoveryConflict  Sent by the backend upon recovery conflict
  * ----------
  */
@@ -564,6 +593,7 @@ typedef union PgStat_Msg
        PgStat_MsgAnalyze msg_analyze;
        PgStat_MsgArchiver msg_archiver;
        PgStat_MsgBgWriter msg_bgwriter;
+       PgStat_MsgWaitAccum msg_waitaccum;
        PgStat_MsgFuncstat msg_funcstat;
        PgStat_MsgFuncpurge msg_funcpurge;
        PgStat_MsgRecoveryConflict msg_recoveryconflict;
@@ -581,7 +611,7 @@ typedef union PgStat_Msg
  * ------------------------------------------------------------
  */
 
-#define PGSTAT_FILE_FORMAT_ID  0x01A5BC9D
+#define PGSTAT_FILE_FORMAT_ID  0x01A5BC9E
 
 /* ----------
  * PgStat_StatDBEntry                  The collector's data per database
@@ -711,6 +741,30 @@ typedef struct PgStat_GlobalStats
        TimestampTz stat_reset_timestamp;
 } PgStat_GlobalStats;
 
+/*
+ * One element of a WAHash bucket chain.  The key is a wait_event_info value
+ * and therefore uses the same unsigned 32-bit type as the lookup functions.
+ */
+typedef struct WAEntry
+{
+       uint32 key;                                     /* wait_event_info this element is stored under */
+       PgStat_WaitAccumEntry *entry;   /* the statistics for that wait event */
+       struct WAEntry *next;           /* next element in the same bucket, or NULL */
+} WAEntry;
+
+#define WA_BUCKET_SIZE 461     /* number of buckets; also the capacity of entries[] */
+
+typedef struct WAHash
+{
+       WAEntry entries[WA_BUCKET_SIZE];        /* element storage, filled in insertion order */
+       WAEntry *buckets[WA_BUCKET_SIZE];       /* chain heads, indexed by key % WA_BUCKET_SIZE */
+       int entry_num;  /* number of elements of entries[] in use */
+} WAHash;
+
+/*
+ * WaitAccum statistics kept in the stats collector.  The per-wait-event
+ * counters live in the WAHash that "hash" points to.
+ */
+typedef struct PgStat_WaitAccumStats
+{
+       WAHash *hash;   /* wait event entries, keyed by wait_event_info */
+} PgStat_WaitAccumStats;
+
 
 /* ----------
  * Backend types
@@ -787,6 +841,8 @@ typedef enum
        WAIT_EVENT_WAL_WRITER_MAIN
 } WaitEventActivity;
 
+#define        PG_WAIT_ACTIVITY_LAST_TYPE      WAIT_EVENT_WAL_WRITER_MAIN
+
 /* ----------
  * Wait Events - Client
  *
@@ -808,6 +864,8 @@ typedef enum
        WAIT_EVENT_GSS_OPEN_SERVER,
 } WaitEventClient;
 
+#define        PG_WAIT_CLIENT_LAST_TYPE        WAIT_EVENT_GSS_OPEN_SERVER
+
 /* ----------
  * Wait Events - IPC
  *
@@ -856,6 +914,8 @@ typedef enum
        WAIT_EVENT_SYNC_REP
 } WaitEventIPC;
 
+#define        PG_WAIT_IPC_LAST_TYPE   WAIT_EVENT_SYNC_REP
+
 /* ----------
  * Wait Events - Timeout
  *
@@ -869,6 +929,8 @@ typedef enum
        WAIT_EVENT_RECOVERY_APPLY_DELAY
 } WaitEventTimeout;
 
+#define        PG_WAIT_TIMEOUT_LAST_TYPE       WAIT_EVENT_RECOVERY_APPLY_DELAY
+
 /* ----------
  * Wait Events - IO
  *
@@ -948,6 +1010,8 @@ typedef enum
        WAIT_EVENT_WAL_WRITE
 } WaitEventIO;
 
+#define        PG_WAIT_IO_LAST_TYPE    WAIT_EVENT_WAL_WRITE
+
 /* ----------
  * Command type for progress reporting purposes
  * ----------
@@ -1204,6 +1268,8 @@ typedef struct PgStat_FunctionCallUsage
        instr_time      f_start;
 } PgStat_FunctionCallUsage;
 
+extern WAHash *wa_hash;
+extern instr_time waitStart;
 
 /* ----------
  * GUC parameters
@@ -1211,6 +1277,7 @@ typedef struct PgStat_FunctionCallUsage
  */
 extern bool pgstat_track_activities;
 extern bool pgstat_track_counts;
+extern bool pgstat_track_wait_timing;
 extern int     pgstat_track_functions;
 extern PGDLLIMPORT int pgstat_track_activity_query_size;
 extern char *pgstat_stat_directory;
@@ -1228,6 +1295,7 @@ extern PgStat_MsgBgWriter BgWriterStats;
 extern PgStat_Counter pgStatBlockReadTime;
 extern PgStat_Counter pgStatBlockWriteTime;
 
+extern PgStat_WaitAccumEntry *pgstat_get_wa_entry(WAHash *hash, uint32 key);
 /* ----------
  * Functions called from postmaster
  * ----------
@@ -1315,6 +1383,50 @@ extern char *pgstat_clip_activity(const char 
*raw_activity);
  * initialized.
  * ----------
  */
+
+static inline void     /* remember when the current wait began, if wait timing is tracked */
+pgstat_report_waitaccum_start(void)
+{
+       if (wa_hash == NULL)    /* no accumulation hash in this process; nothing to do */
+               return;
+
+       if (pgstat_track_wait_timing)
+       {
+               INSTR_TIME_SET_CURRENT(waitStart);
+       }
+}
+
+static inline void     /* count one finished wait and, if timing is tracked, add its duration */
+pgstat_report_waitaccum_end(uint32 wait_event_info)
+{
+       PgStat_WaitAccumEntry *entry;
+       instr_time  diff;
+
+       if (wa_hash == NULL)
+               return;
+
+       if (pgstat_track_wait_timing)
+       {
+               INSTR_TIME_SET_CURRENT(diff);
+               INSTR_TIME_SUBTRACT(diff, waitStart);
+       }
+
+       entry = pgstat_get_wa_entry(wa_hash, wait_event_info);
+
+       if (!entry)     /* lookup found nothing for this wait_event_info */
+       {
+               printf("wait_event_info: %u.\n", wait_event_info);
+               fflush(stdout); /* NOTE(review): debug print to stdout; replace or drop before commit */
+               return;         /* nothing is counted for this wait */
+       }
+
+       entry->calls++;
+       if (pgstat_track_wait_timing)
+       {
+               INSTR_TIME_ADD(entry->times, diff);
+       }
+}
+
 static inline void
 pgstat_report_wait_start(uint32 wait_event_info)
 {
@@ -1328,6 +1440,8 @@ pgstat_report_wait_start(uint32 wait_event_info)
         * four-bytes, updates are atomic.
         */
        proc->wait_event_info = wait_event_info;
+
+       pgstat_report_waitaccum_start();
 }
 
 /* ----------
@@ -1347,6 +1461,8 @@ pgstat_report_wait_end(void)
        if (!pgstat_track_activities || !proc)
                return;
 
+       pgstat_report_waitaccum_end(proc->wait_event_info);
+
        /*
         * Since this is a four-byte field which is always read and written as
         * four-bytes, updates are atomic.
@@ -1354,6 +1470,7 @@ pgstat_report_wait_end(void)
        proc->wait_event_info = 0;
 }
 
+
 /* nontransactional event counts are simple enough to inline */
 
 #define pgstat_count_heap_scan(rel)                                            
                        \
@@ -1421,6 +1538,7 @@ extern void pgstat_twophase_postabort(TransactionId xid, 
uint16 info,
 
 extern void pgstat_send_archiver(const char *xlog, bool failed);
 extern void pgstat_send_bgwriter(void);
+extern void pgstat_send_waitaccum(void);
 
 /* ----------
  * Support functions for the SQL-callable functions to
@@ -1435,5 +1553,6 @@ extern PgStat_StatFuncEntry 
*pgstat_fetch_stat_funcentry(Oid funcid);
 extern int     pgstat_fetch_stat_numbackends(void);
 extern PgStat_ArchiverStats *pgstat_fetch_stat_archiver(void);
 extern PgStat_GlobalStats *pgstat_fetch_global(void);
+extern PgStat_WaitAccumStats *pgstat_fetch_stat_waitaccum(void);
 
 #endif                                                 /* PGSTAT_H */
diff --git a/src/include/storage/lwlock.h b/src/include/storage/lwlock.h
index 8fda8e4..2149c96 100644
--- a/src/include/storage/lwlock.h
+++ b/src/include/storage/lwlock.h
@@ -184,6 +184,7 @@ extern LWLockPadded *GetNamedLWLockTranche(const char 
*tranche_name);
  * registration in the main shared memory segment wouldn't work for that case.
  */
 extern int     LWLockNewTrancheId(void);
+extern int     LWLockGetLastTrancheId(void);
 extern void LWLockRegisterTranche(int tranche_id, const char *tranche_name);
 extern void LWLockInitialize(LWLock *lock, int tranche_id);
 
diff --git a/src/include/storage/proc.h b/src/include/storage/proc.h
index 5b407e6..bd47ccb 100644
--- a/src/include/storage/proc.h
+++ b/src/include/storage/proc.h
@@ -21,6 +21,7 @@
 #include "storage/lock.h"
 #include "storage/pg_sema.h"
 #include "storage/proclist_types.h"
+#include "portability/instr_time.h"
 
 /*
  * Each backend advertises up to PGPROC_MAX_CACHED_SUBXIDS TransactionIds
diff --git a/src/test/regress/expected/rules.out 
b/src/test/regress/expected/rules.out
index 62eaf90..82566d0 100644
--- a/src/test/regress/expected/rules.out
+++ b/src/test/regress/expected/rules.out
@@ -2063,6 +2063,11 @@ pg_stat_user_tables| SELECT pg_stat_all_tables.relid,
     pg_stat_all_tables.autoanalyze_count
    FROM pg_stat_all_tables
   WHERE ((pg_stat_all_tables.schemaname <> ALL (ARRAY['pg_catalog'::name, 
'information_schema'::name])) AND (pg_stat_all_tables.schemaname !~ 
'^pg_toast'::text));
+pg_stat_waitaccum| SELECT s.wait_event_type,
+    s.wait_event,
+    s.calls,
+    s.times
+   FROM pg_stat_get_waitaccum(NULL::integer) s(wait_event_type, wait_event, 
calls, times);
 pg_stat_wal_receiver| SELECT s.pid,
     s.status,
     s.receive_start_lsn,
-- 
1.8.3.1

From 4b9ac96fbf66222ae5fca60e5eed02209b42d1c8 Mon Sep 17 00:00:00 2001
From: "imai.yoshikazu" <imai.yoshik...@jp.fujitsu.com>
Date: Wed, 15 Jan 2020 12:42:58 +0000
Subject: [PATCH v4 2/2] [POC] Change measuring method of wait event time from
 INSTR_TIME to rdtsc.

This patch changes the method of measuring wait event time from INSTR_TIME
(which uses gettimeofday or clock_gettime) to rdtsc. This might reduce the
measurement overhead.

Supporting pieces, such as converting clock cycles to actual time and error
handling, are not implemented yet.
---
 src/include/pgstat.h | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/src/include/pgstat.h b/src/include/pgstat.h
index f90bb44..58fa1f7 100644
--- a/src/include/pgstat.h
+++ b/src/include/pgstat.h
@@ -432,7 +432,7 @@ typedef struct PgStat_WaitAccumEntry
 {
        uint32                  wait_event_info;
        PgStat_Counter  calls;
-       instr_time              times;
+       uint64                  times;
 } PgStat_WaitAccumEntry;
 
 /* ----------
@@ -1269,7 +1269,7 @@ typedef struct PgStat_FunctionCallUsage
 } PgStat_FunctionCallUsage;
 
 extern WAHash *wa_hash;
-extern instr_time waitStart;
+extern uint64 waitStart;
 
 /* ----------
  * GUC parameters
@@ -1392,7 +1392,7 @@ pgstat_report_waitaccum_start()
 
        if (pgstat_track_wait_timing)
        {
-               INSTR_TIME_SET_CURRENT(waitStart);
+               waitStart = rdtsc();
        }
 }
 
@@ -1400,15 +1400,15 @@ static inline void
 pgstat_report_waitaccum_end(uint32 wait_event_info)
 {
        PgStat_WaitAccumEntry *entry;
-       instr_time  diff;
+       uint64          diff = 0;
 
        if (wa_hash == NULL)
                return;
 
        if (pgstat_track_wait_timing)
        {
-               INSTR_TIME_SET_CURRENT(diff);
-               INSTR_TIME_SUBTRACT(diff, waitStart);
+               diff = rdtsc();
+               diff -= waitStart;
        }
 
        entry = pgstat_get_wa_entry(wa_hash, wait_event_info);
@@ -1423,7 +1423,7 @@ pgstat_report_waitaccum_end(uint32 wait_event_info)
        entry->calls++;
        if (pgstat_track_wait_timing)
        {
-               INSTR_TIME_ADD(entry->times, diff);
+               entry->times += diff;
        }
 }
 
-- 
1.8.3.1

Reply via email to