From 0694b70d681186ad10af2837560bac8ab1cba8b2 Mon Sep 17 00:00:00 2001
From: "imai.yoshikazu" <imai.yoshikazu@jp.fujitsu.com>
Date: Thu, 19 Dec 2019 04:28:28 +0000
Subject: [PATCH 1/2] Adding the pg_stat_waitaccum view which shows counts and
 duration of each wait events. Each backend/backgrounds counts and measures
 the time of wait event in every pgstat_report_wait_start and
 pgstat_report_wait_end. They store those info into their local variables and
 send to Statistics Collector. We can get those info via Statistics Collector.

For reducing overhead, I implemented statistic hash instead of
dynamic hash. I also implemented track_wait_timing which
determines wait event duration is collected or not.

On windows, this function might be not worked correctly, because
now it initialize local variables in pg_stat_init which is not
passed to fork processes on windows.
---
 src/backend/catalog/system_views.sql          |   8 +
 src/backend/postmaster/pgstat.c               | 344 ++++++++++++++++++++++++++
 src/backend/storage/lmgr/lwlock.c             |  19 ++
 src/backend/utils/adt/pgstatfuncs.c           |  80 ++++++
 src/backend/utils/misc/guc.c                  |   9 +
 src/backend/utils/misc/postgresql.conf.sample |   1 +
 src/include/catalog/pg_proc.dat               |   9 +
 src/include/pgstat.h                          | 123 ++++++++-
 src/include/storage/lwlock.h                  |   1 +
 src/include/storage/proc.h                    |   1 +
 src/test/regress/expected/rules.out           |   5 +
 11 files changed, 598 insertions(+), 2 deletions(-)

diff --git a/src/backend/catalog/system_views.sql b/src/backend/catalog/system_views.sql
index f7800f0..976ad98 100644
--- a/src/backend/catalog/system_views.sql
+++ b/src/backend/catalog/system_views.sql
@@ -951,6 +951,14 @@ CREATE VIEW pg_stat_bgwriter AS
         pg_stat_get_buf_alloc() AS buffers_alloc,
         pg_stat_get_bgwriter_stat_reset_time() AS stats_reset;
 
+CREATE VIEW pg_stat_waitaccum AS
+    SELECT
+		S.wait_event_type AS wait_event_type,
+		S.wait_event AS wait_event,
+		S.calls AS calls,
+		S.times AS times
+	FROM pg_stat_get_waitaccum(NULL) AS S;
+
 CREATE VIEW pg_stat_progress_vacuum AS
     SELECT
         S.pid AS pid, S.datid AS datid, D.datname AS datname,
diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c
index e931512..1454e77 100644
--- a/src/backend/postmaster/pgstat.c
+++ b/src/backend/postmaster/pgstat.c
@@ -123,6 +123,7 @@
  */
 bool		pgstat_track_activities = false;
 bool		pgstat_track_counts = false;
+bool		pgstat_track_wait_timing = false;
 int			pgstat_track_functions = TRACK_FUNC_OFF;
 int			pgstat_track_activity_query_size = 1024;
 
@@ -153,6 +154,10 @@ static time_t last_pgstat_start_time;
 
 static bool pgStatRunningInCollector = false;
 
+WAHash *wa_hash;
+
+instr_time waitStart;
+
 /*
  * Structures in which backends store per-table info that's waiting to be
  * sent to the collector.
@@ -255,6 +260,7 @@ static int	localNumBackends = 0;
  */
 static PgStat_ArchiverStats archiverStats;
 static PgStat_GlobalStats globalStats;
+static PgStat_WaitAccumStats waitAccumStats;
 
 /*
  * List of OIDs of databases we need to write out.  If an entry is InvalidOid,
@@ -280,6 +286,8 @@ static pid_t pgstat_forkexec(void);
 #endif
 
 NON_EXEC_STATIC void PgstatCollectorMain(int argc, char *argv[]) pg_attribute_noreturn();
+static void pgstat_init_waitaccum_hash(WAHash **hash);
+static PgStat_WaitAccumEntry *pgstat_add_wa_entry(WAHash *hash, uint32 key);
 static void pgstat_beshutdown_hook(int code, Datum arg);
 
 static PgStat_StatDBEntry *pgstat_get_db_entry(Oid databaseid, bool create);
@@ -287,8 +295,11 @@ static PgStat_StatTabEntry *pgstat_get_tab_entry(PgStat_StatDBEntry *dbentry,
 												 Oid tableoid, bool create);
 static void pgstat_write_statsfiles(bool permanent, bool allDbs);
 static void pgstat_write_db_statsfile(PgStat_StatDBEntry *dbentry, bool permanent);
+static void pgstat_write_waitaccum_statsfile(FILE *fpout);
 static HTAB *pgstat_read_statsfiles(Oid onlydb, bool permanent, bool deep);
 static void pgstat_read_db_statsfile(Oid databaseid, HTAB *tabhash, HTAB *funchash, bool permanent);
+static bool pgstat_read_waitaccum_statsfile(PgStat_WaitAccumStats *stats,
+											FILE *fpin, const char *statfile);
 static void backend_read_statsfile(void);
 static void pgstat_read_current_status(void);
 
@@ -324,6 +335,7 @@ static void pgstat_recv_vacuum(PgStat_MsgVacuum *msg, int len);
 static void pgstat_recv_analyze(PgStat_MsgAnalyze *msg, int len);
 static void pgstat_recv_archiver(PgStat_MsgArchiver *msg, int len);
 static void pgstat_recv_bgwriter(PgStat_MsgBgWriter *msg, int len);
+static void pgstat_recv_waitaccum(PgStat_MsgWaitAccum *msg, int len);
 static void pgstat_recv_funcstat(PgStat_MsgFuncstat *msg, int len);
 static void pgstat_recv_funcpurge(PgStat_MsgFuncpurge *msg, int len);
 static void pgstat_recv_recoveryconflict(PgStat_MsgRecoveryConflict *msg, int len);
@@ -331,6 +343,27 @@ static void pgstat_recv_deadlock(PgStat_MsgDeadlock *msg, int len);
 static void pgstat_recv_checksum_failure(PgStat_MsgChecksumFailure *msg, int len);
 static void pgstat_recv_tempfile(PgStat_MsgTempFile *msg, int len);
 
+
+PgStat_WaitAccumEntry *
+pgstat_get_wa_entry(WAHash *hash, uint32 key)
+{
+	WAEntry *current;
+	int bucket = key % WA_BUCKET_SIZE;
+
+	current = hash->buckets[bucket];
+
+	while (current != NULL)
+	{
+		if (current->key == key)
+			return current->entry;
+
+		current = current->next;
+	}
+
+	return NULL;
+}
+
+
 /* ------------------------------------------------------------
  * Public functions called from postmaster follow
  * ------------------------------------------------------------
@@ -602,6 +635,8 @@ retry2:
 
 	pg_freeaddrinfo_all(hints.ai_family, addrs);
 
+	pgstat_init_waitaccum_hash(&wa_hash);
+
 	return;
 
 startup_failed:
@@ -624,6 +659,75 @@ startup_failed:
 	SetConfigOption("track_counts", "off", PGC_INTERNAL, PGC_S_OVERRIDE);
 }
 
+static PgStat_WaitAccumEntry *
+pgstat_add_wa_entry(WAHash *hash, uint32 key)
+{
+	WAEntry *prev;
+	WAEntry *new;
+	int bucket = key % WA_BUCKET_SIZE;
+	
+	prev = hash->buckets[bucket];
+
+	while (prev != NULL && prev->next != NULL)
+		prev = prev->next;
+	
+	new = &hash->entries[hash->entry_num++];
+	new->key = key;
+	new->entry = MemoryContextAllocZero(TopMemoryContext, (sizeof(PgStat_WaitAccumEntry)));
+
+	if (prev != NULL)
+		prev->next = new;
+	else
+		hash->buckets[bucket] = new;
+
+	return new->entry;
+}
+
+static void
+pgstat_init_waitaccum_entry(WAHash *hash, uint32 wait_event_info)
+{
+	PgStat_WaitAccumEntry *entry;
+
+	entry = pgstat_add_wa_entry(hash, wait_event_info);
+	entry->wait_event_info = wait_event_info;
+}
+
+static void
+pgstat_init_waitaccum_hash(WAHash **hash)
+{
+	uint32 i;
+	int last_tranche_id;
+
+	*hash = MemoryContextAllocZero(TopMemoryContext, sizeof(WAHash));
+
+	last_tranche_id = LWLockGetLastTrancheId();
+	for (i = PG_WAIT_LWLOCK + 1; i <= (PG_WAIT_LWLOCK | last_tranche_id); i++)
+		pgstat_init_waitaccum_entry(*hash, i);
+
+	for (i = (PG_WAIT_LOCK | LOCKTAG_RELATION); i <= (PG_WAIT_LOCK | LOCKTAG_LAST_TYPE); i++)
+		pgstat_init_waitaccum_entry(*hash, i);
+
+	for (i = PG_WAIT_BUFFER_PIN; i <= PG_WAIT_BUFFER_PIN; i++)
+		pgstat_init_waitaccum_entry(*hash, i);
+
+	for (i = PG_WAIT_ACTIVITY; i <= PG_WAIT_ACTIVITY_LAST_TYPE; i++)
+		pgstat_init_waitaccum_entry(*hash, i);
+
+	for (i = PG_WAIT_CLIENT; i <= PG_WAIT_CLIENT_LAST_TYPE; i++)
+		pgstat_init_waitaccum_entry(*hash, i);
+
+	//do extension stuff
+
+	for (i = PG_WAIT_IPC; i <= PG_WAIT_IPC_LAST_TYPE; i++)
+		pgstat_init_waitaccum_entry(*hash, i);
+
+	for (i = PG_WAIT_TIMEOUT; i <= PG_WAIT_TIMEOUT_LAST_TYPE; i++)
+		pgstat_init_waitaccum_entry(*hash, i);
+
+	for (i = PG_WAIT_IO; i <= PG_WAIT_IO_LAST_TYPE; i++)
+		pgstat_init_waitaccum_entry(*hash, i);
+}
+
 /*
  * subroutine for pgstat_reset_all
  */
@@ -904,6 +1008,9 @@ pgstat_report_stat(bool force)
 
 	/* Now, send function statistics */
 	pgstat_send_funcstats();
+
+	/* Send wait accumulative statistics */
+	pgstat_send_waitaccum();
 }
 
 /*
@@ -1334,6 +1441,8 @@ pgstat_reset_shared_counters(const char *target)
 		msg.m_resettarget = RESET_ARCHIVER;
 	else if (strcmp(target, "bgwriter") == 0)
 		msg.m_resettarget = RESET_BGWRITER;
+	else if (strcmp(target, "waitaccum") == 0)
+		msg.m_resettarget = RESET_WAITACCUM;
 	else
 		ereport(ERROR,
 				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
@@ -2618,6 +2727,22 @@ pgstat_fetch_global(void)
 	return &globalStats;
 }
 
+/*
+ * ---------
+ * pgstat_fetch_stat_waitaccum() -
+ *
+ *	Support function for the SQL-callable pgstat* functions. Returns
+ *	a pointer to the wait accum statistics struct.
+ * ---------
+ */
+PgStat_WaitAccumStats *
+pgstat_fetch_stat_waitaccum(void)
+{
+	backend_read_statsfile();
+
+	return &waitAccumStats;
+}
+
 
 /* ------------------------------------------------------------
  * Functions for management of the shared-memory PgBackendStatus array
@@ -4407,6 +4532,53 @@ pgstat_send_bgwriter(void)
 	MemSet(&BgWriterStats, 0, sizeof(BgWriterStats));
 }
 
+/* ----------
+ * pgstat_send_waitaccum() -
+ *
+ * ----------
+ */
+void
+pgstat_send_waitaccum()
+{
+	PgStat_MsgWaitAccum msg;
+	PgStat_WaitAccumEntry *entry;
+	int i;
+
+	if (wa_hash == NULL)
+		return;
+
+	pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_WAITACCUM);
+	msg.m_nentries = 0;
+
+	for (i = 0; i < wa_hash->entry_num; i++)
+	{
+		entry = wa_hash->entries[i].entry;
+
+		/* Send only wait events that have occurred. */
+		if (entry->calls == 0)
+			continue;
+
+		/*
+		 * Prepare and send the message
+		 */
+		memcpy(&msg.m_entry[msg.m_nentries], entry, sizeof(PgStat_WaitAccumEntry));
+		if (++msg.m_nentries >= PGSTAT_NUM_WAITACCUMENTRIES)
+		{
+			pgstat_send(&msg, offsetof(PgStat_MsgWaitAccum, m_entry[0]) +
+						msg.m_nentries * sizeof(PgStat_WaitAccumEntry));
+			msg.m_nentries = 0;
+		}
+
+		/* Clear wait events information. */
+		entry->calls = 0;
+		INSTR_TIME_SET_ZERO(entry->times);
+	}
+
+	if (msg.m_nentries > 0)
+		pgstat_send(&msg, offsetof(PgStat_MsgWaitAccum, m_entry[0]) +
+					msg.m_nentries * sizeof(PgStat_WaitAccumEntry));
+}
+
 
 /* ----------
  * PgstatCollectorMain() -
@@ -4599,6 +4771,10 @@ PgstatCollectorMain(int argc, char *argv[])
 					pgstat_recv_bgwriter(&msg.msg_bgwriter, len);
 					break;
 
+				case PGSTAT_MTYPE_WAITACCUM:
+					pgstat_recv_waitaccum(&msg.msg_waitaccum, len);
+					break;
+
 				case PGSTAT_MTYPE_FUNCSTAT:
 					pgstat_recv_funcstat(&msg.msg_funcstat, len);
 					break;
@@ -4869,6 +5045,8 @@ pgstat_write_statsfiles(bool permanent, bool allDbs)
 	rc = fwrite(&archiverStats, sizeof(archiverStats), 1, fpout);
 	(void) rc;					/* we'll check for error with ferror */
 
+	pgstat_write_waitaccum_statsfile(fpout);
+
 	/*
 	 * Walk through the database table.
 	 */
@@ -5074,6 +5252,43 @@ pgstat_write_db_statsfile(PgStat_StatDBEntry *dbentry, bool permanent)
 }
 
 /* ----------
+ * pgstat_write_waitaccum_statsfile() -
+ *		Write the waitAccumStats to the stat file.
+ *
+ * ----------
+ */
+static void
+pgstat_write_waitaccum_statsfile(FILE *fpout)
+{
+	PgStat_WaitAccumEntry *entry;
+	WAHash *hash = waitAccumStats.hash;
+	int			rc;
+	int			i;
+
+	/*
+	 * Walk through the waitaccum hash.
+	 */
+	for (i = 0; i < hash->entry_num; i++)
+	{
+		entry = hash->entries[i].entry;
+
+		/* Write only wait events that have occurred. */
+		if (entry->calls == 0)
+			continue;
+
+		/*
+		 * Write out the DB entry. We don't write the tables or functions
+		 * pointers, since they're of no use to any other process.
+		 */
+		fputc('D', fpout);
+		rc = fwrite(entry, sizeof(PgStat_WaitAccumEntry), 1, fpout);
+		(void) rc;				/* we'll check for error with ferror */
+	}	
+
+	fputc('E', fpout);
+}
+
+/* ----------
  * pgstat_read_statsfiles() -
  *
  *	Reads in some existing statistics collector files and returns the
@@ -5126,6 +5341,7 @@ pgstat_read_statsfiles(Oid onlydb, bool permanent, bool deep)
 	 */
 	memset(&globalStats, 0, sizeof(globalStats));
 	memset(&archiverStats, 0, sizeof(archiverStats));
+	waitAccumStats.hash = MemoryContextAllocZero(pgStatLocalContext, sizeof(WAHash));
 
 	/*
 	 * Set the current timestamp (will be kept only in case we can't load an
@@ -5196,6 +5412,9 @@ pgstat_read_statsfiles(Oid onlydb, bool permanent, bool deep)
 		goto done;
 	}
 
+	if(!pgstat_read_waitaccum_statsfile(&waitAccumStats, fpin, statfile))
+		goto done;
+
 	/*
 	 * We found an existing collector stats file. Read it and put all the
 	 * hashtable entries into place.
@@ -5494,10 +5713,13 @@ pgstat_read_db_statsfile_timestamp(Oid databaseid, bool permanent,
 	PgStat_StatDBEntry dbentry;
 	PgStat_GlobalStats myGlobalStats;
 	PgStat_ArchiverStats myArchiverStats;
+	PgStat_WaitAccumStats myWaitAccumStats;
 	FILE	   *fpin;
 	int32		format_id;
 	const char *statfile = permanent ? PGSTAT_STAT_PERMANENT_FILENAME : pgstat_stat_filename;
 
+	myWaitAccumStats.hash = MemoryContextAllocZero(CurrentMemoryContext, sizeof(WAHash));
+
 	/*
 	 * Try to open the stats file.  As above, anything but ENOENT is worthy of
 	 * complaining about.
@@ -5548,6 +5770,9 @@ pgstat_read_db_statsfile_timestamp(Oid databaseid, bool permanent,
 		return false;
 	}
 
+	if(!pgstat_read_waitaccum_statsfile(&myWaitAccumStats, fpin, statfile))
+		return false;
+
 	/* By default, we're going to return the timestamp of the global file. */
 	*ts = myGlobalStats.stats_timestamp;
 
@@ -5601,6 +5826,75 @@ done:
 	return true;
 }
 
+/* ----------
+ * pgstat_read_statsfiles() -
+ *
+ *	Reads the waitaccum stats from the file.
+ *	If an error happens when reading file, return false. Otherwise return true.
+ *
+ * ----------
+ */
+static bool
+pgstat_read_waitaccum_statsfile(PgStat_WaitAccumStats *stats,
+								FILE *fpin, const char *statfile)
+{
+	PgStat_WaitAccumEntry *entry;
+	PgStat_WaitAccumEntry buf;
+	WAHash *hash = stats->hash;
+
+	/*
+	 * Read and put all the hashtable entries into place.
+	 */
+	for (;;)
+	{
+		switch (fgetc(fpin))
+		{
+				/*
+				 * 'D'	A PgStat_WaitAccumEntry struct describing a database
+				 * follows.
+				 */
+			case 'D':
+				if (fread(&buf, 1, sizeof(PgStat_WaitAccumEntry), fpin)
+									 != sizeof(PgStat_WaitAccumEntry))
+				{
+					ereport(pgStatRunningInCollector ? LOG : WARNING,
+							(errmsg("corrupted statistics file \"%s\"",
+									statfile)));
+					return false;
+				}
+
+				entry = pgstat_get_wa_entry(hash, buf.wait_event_info);
+
+				if (entry)
+				{
+					ereport(pgStatRunningInCollector ? LOG : WARNING,
+							(errmsg("corrupted statistics file \"%s\"",
+									statfile)));
+					return false;
+				}
+
+				/*
+				 * Add to the DB hash
+				 */
+				entry = pgstat_add_wa_entry(hash, buf.wait_event_info);
+				memcpy(entry, &buf, sizeof(PgStat_WaitAccumEntry));
+
+				break;
+
+			case 'E':
+				return true;
+
+			default:
+				ereport(pgStatRunningInCollector ? LOG : WARNING,
+						(errmsg("corrupted statistics file \"%s\"",
+								statfile)));
+				return false;
+		}
+	}
+
+	return 0;
+}
+
 /*
  * If not already done, read the statistics collector stats file into
  * some hash tables.  The results will be kept until pgstat_clear_snapshot()
@@ -6110,7 +6404,20 @@ pgstat_recv_resetsharedcounter(PgStat_MsgResetsharedcounter *msg, int len)
 		memset(&archiverStats, 0, sizeof(archiverStats));
 		archiverStats.stat_reset_timestamp = GetCurrentTimestamp();
 	}
+	else if (msg->m_resettarget == RESET_WAITACCUM)
+	{
+		PgStat_WaitAccumEntry *entry;
+		WAHash *hash = waitAccumStats.hash;
+		int i;
+
+		for (i = 0; i < hash->entry_num; i++)
+		{
+			entry = hash->entries[i].entry;
 
+			entry->calls = 0;
+			INSTR_TIME_SET_ZERO(entry->times);
+		}
+	}
 	/*
 	 * Presumably the sender of this message validated the target, don't
 	 * complain here if it's not valid
@@ -6290,6 +6597,43 @@ pgstat_recv_bgwriter(PgStat_MsgBgWriter *msg, int len)
 }
 
 /* ----------
+ * pgstat_recv_waitaccum() -
+ *
+ *	Process a WAITACCUM message.
+ * ----------
+ */
+static void
+pgstat_recv_waitaccum(PgStat_MsgWaitAccum *msg, int len)
+{
+	PgStat_WaitAccumEntry *m_entry = &(msg->m_entry[0]);
+	PgStat_WaitAccumEntry *entry;
+	WAHash *hash = waitAccumStats.hash;
+	int			i;
+
+	/*
+	 * Process all function entries in the message.
+	 */
+	for (i = 0; i < msg->m_nentries; i++, m_entry++)
+	{
+		entry = pgstat_get_wa_entry(hash, m_entry->wait_event_info);
+
+		if (!entry)
+		{
+			entry = pgstat_add_wa_entry(hash, m_entry->wait_event_info);
+			memcpy(entry, m_entry, sizeof(PgStat_WaitAccumEntry));
+		}
+		else
+		{
+			/*
+			 * Otherwise add the values to the existing entry.
+			 */
+			entry->calls += m_entry->calls;
+			INSTR_TIME_ADD(entry->times, m_entry->times);
+		}
+	}
+}
+
+/* ----------
  * pgstat_recv_recoveryconflict() -
  *
  *	Process a RECOVERYCONFLICT message.
diff --git a/src/backend/storage/lmgr/lwlock.c b/src/backend/storage/lmgr/lwlock.c
index 18e3843..8f5b0ba 100644
--- a/src/backend/storage/lmgr/lwlock.c
+++ b/src/backend/storage/lmgr/lwlock.c
@@ -594,6 +594,25 @@ LWLockNewTrancheId(void)
 }
 
 /*
+ * Get a last tranche ID.
+ */
+int
+LWLockGetLastTrancheId(void)
+{
+	int			result;
+	int		   *LWLockCounter;
+
+	Assert(!lock_named_request_allowed);
+
+	LWLockCounter = (int *) ((char *) MainLWLockArray - sizeof(int));
+	SpinLockAcquire(ShmemLock);
+	result = *LWLockCounter;
+	SpinLockRelease(ShmemLock);
+
+	return result;
+}
+
+/*
  * Register a tranche ID in the lookup table for the current process.  This
  * routine will save a pointer to the tranche name passed as an argument,
  * so the name should be allocated in a backend-lifetime context
diff --git a/src/backend/utils/adt/pgstatfuncs.c b/src/backend/utils/adt/pgstatfuncs.c
index 05240bf..b408db3 100644
--- a/src/backend/utils/adt/pgstatfuncs.c
+++ b/src/backend/utils/adt/pgstatfuncs.c
@@ -1970,3 +1970,83 @@ pg_stat_get_archiver(PG_FUNCTION_ARGS)
 	PG_RETURN_DATUM(HeapTupleGetDatum(
 									  heap_form_tuple(tupdesc, values, nulls)));
 }
+
+Datum
+pg_stat_get_waitaccum(PG_FUNCTION_ARGS)
+{
+#define PG_STAT_GET_WAITACCUM_COLS	4
+	PgStat_WaitAccumStats *waitaccum_stats;
+	PgStat_WaitAccumEntry *entry;
+	ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
+	TupleDesc	tupdesc;
+	Tuplestorestate *tupstore;
+	MemoryContext per_query_ctx;
+	MemoryContext oldcontext;
+	int i;
+
+	/* check to see if caller supports us returning a tuplestore */
+	if (rsinfo == NULL || !IsA(rsinfo, ReturnSetInfo))
+		ereport(ERROR,
+				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+				 errmsg("set-valued function called in context that cannot accept a set")));
+	if (!(rsinfo->allowedModes & SFRM_Materialize))
+		ereport(ERROR,
+				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+				 errmsg("materialize mode required, but it is not " \
+						"allowed in this context")));
+
+	/* Build a tuple descriptor for our result type */
+	if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE)
+		elog(ERROR, "return type must be a row type");
+
+	per_query_ctx = rsinfo->econtext->ecxt_per_query_memory;
+	oldcontext = MemoryContextSwitchTo(per_query_ctx);
+
+	tupstore = tuplestore_begin_heap(true, false, work_mem);
+	rsinfo->returnMode = SFRM_Materialize;
+	rsinfo->setResult = tupstore;
+	rsinfo->setDesc = tupdesc;
+
+	MemoryContextSwitchTo(oldcontext);
+
+	/* Get statistics about the waitaccum process */
+	waitaccum_stats = pgstat_fetch_stat_waitaccum();
+
+	for (i = 0; i < waitaccum_stats->hash->entry_num; i++)
+	{
+		Datum		values[PG_STAT_GET_WAITACCUM_COLS];
+		bool		nulls[PG_STAT_GET_WAITACCUM_COLS];
+		const char *wait_event_type = NULL;
+		const char *wait_event = NULL;
+
+		/* Initialise values and NULL flags arrays */
+		MemSet(values, 0, sizeof(values));
+		MemSet(nulls, 0, sizeof(nulls));
+
+		entry = waitaccum_stats->hash->entries[i].entry;
+
+		/* Fill values and NULLs */
+		{
+			uint32		raw_wait_event;
+
+			raw_wait_event = UINT32_ACCESS_ONCE(entry->wait_event_info);
+			wait_event_type = pgstat_get_wait_event_type(raw_wait_event);
+			wait_event = pgstat_get_wait_event(raw_wait_event);
+		}
+
+		values[0] = CStringGetTextDatum(wait_event_type);
+
+		values[1] = CStringGetTextDatum(wait_event);
+
+		values[2] = Int64GetDatum(entry->calls);
+
+		values[3] = UInt64GetDatum(INSTR_TIME_GET_MICROSEC(entry->times));
+
+		tuplestore_putvalues(tupstore, tupdesc, values, nulls);
+	}
+
+	/* clean up and return the tuplestore */
+	tuplestore_donestoring(tupstore);
+
+	return (Datum) 0;
+}
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index 8d951ce..ebae427 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -1424,6 +1424,15 @@ static struct config_bool ConfigureNamesBool[] =
 		false,
 		NULL, NULL, NULL
 	},
+	{
+		{"track_wait_timing", PGC_SUSET, STATS_COLLECTOR,
+			gettext_noop("Collects timing statistics for wait events."),
+			NULL
+		},
+		&pgstat_track_wait_timing,
+		false,
+		NULL, NULL, NULL
+	},
 
 	{
 		{"update_process_title", PGC_SUSET, PROCESS_TITLE,
diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample
index 087190c..070c213 100644
--- a/src/backend/utils/misc/postgresql.conf.sample
+++ b/src/backend/utils/misc/postgresql.conf.sample
@@ -569,6 +569,7 @@
 #track_activities = on
 #track_counts = on
 #track_io_timing = off
+#track_wait_timing = off
 #track_functions = none			# none, pl, all
 #track_activity_query_size = 1024	# (change requires restart)
 #stats_temp_directory = 'pg_stat_tmp'
diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat
index ac8f64b..9d7c2e8 100644
--- a/src/include/catalog/pg_proc.dat
+++ b/src/include/catalog/pg_proc.dat
@@ -5152,6 +5152,15 @@
   proargmodes => '{i,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o}',
   proargnames => '{pid,datid,pid,usesysid,application_name,state,query,wait_event_type,wait_event,xact_start,query_start,backend_start,state_change,client_addr,client_hostname,client_port,backend_xid,backend_xmin,backend_type,ssl,sslversion,sslcipher,sslbits,sslcompression,ssl_client_dn,ssl_client_serial,ssl_issuer_dn,gss_auth,gss_princ,gss_enc}',
   prosrc => 'pg_stat_get_activity' },
+{ oid => '2228',
+  descr => 'statistics: information about accumulative data of wait event',
+  proname => 'pg_stat_get_waitaccum', prorows => '200', proisstrict => 'f',
+  proretset => 't', provolatile => 's', proparallel => 'r',
+  prorettype => 'record', proargtypes => 'int4',
+  proallargtypes => '{int4,text,text,int8,int8}',
+  proargmodes => '{i,o,o,o,o}',
+  proargnames => '{pid,wait_event_type,wait_event,calls,times}',
+  prosrc => 'pg_stat_get_waitaccum' },
 { oid => '3318',
   descr => 'statistics: information about progress of backends running maintenance command',
   proname => 'pg_stat_get_progress_info', prorows => '100', proretset => 't',
diff --git a/src/include/pgstat.h b/src/include/pgstat.h
index fe076d8..32c4b5f 100644
--- a/src/include/pgstat.h
+++ b/src/include/pgstat.h
@@ -59,6 +59,7 @@ typedef enum StatMsgType
 	PGSTAT_MTYPE_ANALYZE,
 	PGSTAT_MTYPE_ARCHIVER,
 	PGSTAT_MTYPE_BGWRITER,
+	PGSTAT_MTYPE_WAITACCUM,
 	PGSTAT_MTYPE_FUNCSTAT,
 	PGSTAT_MTYPE_FUNCPURGE,
 	PGSTAT_MTYPE_RECOVERYCONFLICT,
@@ -119,7 +120,8 @@ typedef struct PgStat_TableCounts
 typedef enum PgStat_Shared_Reset_Target
 {
 	RESET_ARCHIVER,
-	RESET_BGWRITER
+	RESET_BGWRITER,
+	RESET_WAITACCUM
 } PgStat_Shared_Reset_Target;
 
 /* Possible object types for resetting single counters */
@@ -423,6 +425,33 @@ typedef struct PgStat_MsgBgWriter
 } PgStat_MsgBgWriter;
 
 /* ----------
+ * PgStat_WaitAccumEntry	Entry in backend/background's per-wait_event_info hash table
+ * ----------
+ */
+typedef struct PgStat_WaitAccumEntry
+{
+	uint32			wait_event_info;
+	PgStat_Counter	calls;
+	instr_time		times;
+} PgStat_WaitAccumEntry;
+
+/* ----------
+ * PgStat_MsgWaitAccum	Sent by backend/background's process to update statistics.
+ * ----------
+ */
+#define PGSTAT_NUM_WAITACCUMENTRIES	\
+	((PGSTAT_MSG_PAYLOAD - sizeof(int))  \
+	 / sizeof(PgStat_WaitAccumEntry))
+
+typedef struct PgStat_MsgWaitAccum
+{
+	PgStat_MsgHdr m_hdr;
+
+	int m_nentries;
+	PgStat_WaitAccumEntry m_entry[PGSTAT_NUM_WAITACCUMENTRIES];
+} PgStat_MsgWaitAccum;
+
+/* ----------
  * PgStat_MsgRecoveryConflict	Sent by the backend upon recovery conflict
  * ----------
  */
@@ -564,6 +593,7 @@ typedef union PgStat_Msg
 	PgStat_MsgAnalyze msg_analyze;
 	PgStat_MsgArchiver msg_archiver;
 	PgStat_MsgBgWriter msg_bgwriter;
+	PgStat_MsgWaitAccum msg_waitaccum;
 	PgStat_MsgFuncstat msg_funcstat;
 	PgStat_MsgFuncpurge msg_funcpurge;
 	PgStat_MsgRecoveryConflict msg_recoveryconflict;
@@ -581,7 +611,7 @@ typedef union PgStat_Msg
  * ------------------------------------------------------------
  */
 
-#define PGSTAT_FILE_FORMAT_ID	0x01A5BC9D
+#define PGSTAT_FILE_FORMAT_ID	0x01A5BC9E
 
 /* ----------
  * PgStat_StatDBEntry			The collector's data per database
@@ -711,6 +741,30 @@ typedef struct PgStat_GlobalStats
 	TimestampTz stat_reset_timestamp;
 } PgStat_GlobalStats;
 
+typedef struct WAEntry
+{
+	int key;
+	PgStat_WaitAccumEntry *entry;
+	struct WAEntry *next;
+} WAEntry;
+
+#define WA_BUCKET_SIZE 461
+
+typedef struct WAHash
+{
+	WAEntry entries[WA_BUCKET_SIZE];
+	WAEntry *buckets[WA_BUCKET_SIZE];
+	int entry_num;
+} WAHash;
+
+/*
+ * WaitAccum statistics kept in the stats collector
+ */
+typedef struct PgStat_WaitAccumStats
+{
+	WAHash *hash;
+} PgStat_WaitAccumStats;
+
 
 /* ----------
  * Backend types
@@ -787,6 +841,8 @@ typedef enum
 	WAIT_EVENT_WAL_WRITER_MAIN
 } WaitEventActivity;
 
+#define	PG_WAIT_ACTIVITY_LAST_TYPE	WAIT_EVENT_WAL_WRITER_MAIN
+
 /* ----------
  * Wait Events - Client
  *
@@ -808,6 +864,8 @@ typedef enum
 	WAIT_EVENT_GSS_OPEN_SERVER,
 } WaitEventClient;
 
+#define	PG_WAIT_CLIENT_LAST_TYPE	WAIT_EVENT_GSS_OPEN_SERVER
+
 /* ----------
  * Wait Events - IPC
  *
@@ -856,6 +914,8 @@ typedef enum
 	WAIT_EVENT_SYNC_REP
 } WaitEventIPC;
 
+#define	PG_WAIT_IPC_LAST_TYPE	WAIT_EVENT_SYNC_REP
+
 /* ----------
  * Wait Events - Timeout
  *
@@ -869,6 +929,8 @@ typedef enum
 	WAIT_EVENT_RECOVERY_APPLY_DELAY
 } WaitEventTimeout;
 
+#define	PG_WAIT_TIMEOUT_LAST_TYPE	WAIT_EVENT_RECOVERY_APPLY_DELAY
+
 /* ----------
  * Wait Events - IO
  *
@@ -947,6 +1009,8 @@ typedef enum
 	WAIT_EVENT_WAL_WRITE
 } WaitEventIO;
 
+#define	PG_WAIT_IO_LAST_TYPE	WAIT_EVENT_WAL_WRITE
+
 /* ----------
  * Command type for progress reporting purposes
  * ----------
@@ -1203,6 +1267,8 @@ typedef struct PgStat_FunctionCallUsage
 	instr_time	f_start;
 } PgStat_FunctionCallUsage;
 
+extern WAHash *wa_hash;
+extern instr_time waitStart;
 
 /* ----------
  * GUC parameters
@@ -1210,6 +1276,7 @@ typedef struct PgStat_FunctionCallUsage
  */
 extern bool pgstat_track_activities;
 extern bool pgstat_track_counts;
+extern bool pgstat_track_wait_timing;
 extern int	pgstat_track_functions;
 extern PGDLLIMPORT int pgstat_track_activity_query_size;
 extern char *pgstat_stat_directory;
@@ -1227,6 +1294,7 @@ extern PgStat_MsgBgWriter BgWriterStats;
 extern PgStat_Counter pgStatBlockReadTime;
 extern PgStat_Counter pgStatBlockWriteTime;
 
+extern PgStat_WaitAccumEntry *pgstat_get_wa_entry(WAHash *hash, uint32 key);
 /* ----------
  * Functions called from postmaster
  * ----------
@@ -1314,6 +1382,50 @@ extern char *pgstat_clip_activity(const char *raw_activity);
  * initialized.
  * ----------
  */
+
+static inline void
+pgstat_report_waitaccum_start()
+{
+	if (wa_hash == NULL)
+		return; 
+
+	if (pgstat_track_wait_timing)
+	{
+		INSTR_TIME_SET_CURRENT(waitStart);
+	}
+}
+
+static inline void
+pgstat_report_waitaccum_end(uint32 wait_event_info)
+{
+	PgStat_WaitAccumEntry *entry;
+	instr_time  diff;
+
+	if (wa_hash == NULL)
+		return; 
+
+	if (pgstat_track_wait_timing)
+	{
+		INSTR_TIME_SET_CURRENT(diff);
+		INSTR_TIME_SUBTRACT(diff, waitStart);
+	}
+
+	entry = pgstat_get_wa_entry(wa_hash, wait_event_info);
+
+	if (!entry)
+	{
+		printf("wait_event_info: %u.\n", wait_event_info);
+		fflush(stdout);
+		return;
+	}
+
+	entry->calls++;
+	if (pgstat_track_wait_timing)
+	{
+		INSTR_TIME_ADD(entry->times, diff);
+	}
+}
+
 static inline void
 pgstat_report_wait_start(uint32 wait_event_info)
 {
@@ -1327,6 +1439,8 @@ pgstat_report_wait_start(uint32 wait_event_info)
 	 * four-bytes, updates are atomic.
 	 */
 	proc->wait_event_info = wait_event_info;
+
+	pgstat_report_waitaccum_start();
 }
 
 /* ----------
@@ -1346,6 +1460,8 @@ pgstat_report_wait_end(void)
 	if (!pgstat_track_activities || !proc)
 		return;
 
+	pgstat_report_waitaccum_end(proc->wait_event_info);
+
 	/*
 	 * Since this is a four-byte field which is always read and written as
 	 * four-bytes, updates are atomic.
@@ -1353,6 +1469,7 @@ pgstat_report_wait_end(void)
 	proc->wait_event_info = 0;
 }
 
+
 /* nontransactional event counts are simple enough to inline */
 
 #define pgstat_count_heap_scan(rel)									\
@@ -1420,6 +1537,7 @@ extern void pgstat_twophase_postabort(TransactionId xid, uint16 info,
 
 extern void pgstat_send_archiver(const char *xlog, bool failed);
 extern void pgstat_send_bgwriter(void);
+extern void pgstat_send_waitaccum(void);
 
 /* ----------
  * Support functions for the SQL-callable functions to
@@ -1434,5 +1552,6 @@ extern PgStat_StatFuncEntry *pgstat_fetch_stat_funcentry(Oid funcid);
 extern int	pgstat_fetch_stat_numbackends(void);
 extern PgStat_ArchiverStats *pgstat_fetch_stat_archiver(void);
 extern PgStat_GlobalStats *pgstat_fetch_global(void);
+extern PgStat_WaitAccumStats *pgstat_fetch_stat_waitaccum(void);
 
 #endif							/* PGSTAT_H */
diff --git a/src/include/storage/lwlock.h b/src/include/storage/lwlock.h
index f9450da..cf2e953 100644
--- a/src/include/storage/lwlock.h
+++ b/src/include/storage/lwlock.h
@@ -184,6 +184,7 @@ extern LWLockPadded *GetNamedLWLockTranche(const char *tranche_name);
  * registration in the main shared memory segment wouldn't work for that case.
  */
 extern int	LWLockNewTrancheId(void);
+extern int	LWLockGetLastTrancheId(void);
 extern void LWLockRegisterTranche(int tranche_id, const char *tranche_name);
 extern void LWLockInitialize(LWLock *lock, int tranche_id);
 
diff --git a/src/include/storage/proc.h b/src/include/storage/proc.h
index 281e1db..4d03d80 100644
--- a/src/include/storage/proc.h
+++ b/src/include/storage/proc.h
@@ -21,6 +21,7 @@
 #include "storage/lock.h"
 #include "storage/pg_sema.h"
 #include "storage/proclist_types.h"
+#include "portability/instr_time.h"
 
 /*
  * Each backend advertises up to PGPROC_MAX_CACHED_SUBXIDS TransactionIds
diff --git a/src/test/regress/expected/rules.out b/src/test/regress/expected/rules.out
index 80a0782..b524fae 100644
--- a/src/test/regress/expected/rules.out
+++ b/src/test/regress/expected/rules.out
@@ -2058,6 +2058,11 @@ pg_stat_user_tables| SELECT pg_stat_all_tables.relid,
     pg_stat_all_tables.autoanalyze_count
    FROM pg_stat_all_tables
   WHERE ((pg_stat_all_tables.schemaname <> ALL (ARRAY['pg_catalog'::name, 'information_schema'::name])) AND (pg_stat_all_tables.schemaname !~ '^pg_toast'::text));
+pg_stat_waitaccum| SELECT s.wait_event_type,
+    s.wait_event,
+    s.calls,
+    s.times
+   FROM pg_stat_get_waitaccum(NULL::integer) s(wait_event_type, wait_event, calls, times);
 pg_stat_wal_receiver| SELECT s.pid,
     s.status,
     s.receive_start_lsn,
-- 
1.8.3.1

