Re: pg_stat_advisor extension

Ilia Evdokimov Tue, 06 Feb 2024 07:28:01 -0800

Hi hackers,

I've encountered and addressed errors in the"0001-pg_stat_advisor-extension.patch" when applying it to the mainbranch, specifically trailing whitespace issues at lines 117 and 118:


```
0001-pg_stat_advisor-extension.patch:117: trailing whitespace.
                                                   QUERY PLAN
0001-pg_stat_advisor-extension.patch:118: trailing whitespace.

warning: 2 lines add whitespace errors.

```

An updated patch is attached for review

I welcome your insights, feedback, and evaluations regarding thenecessity of integrating this new extension into PostgreSQL.



Kind regards,

Ilia Evdokimov,

Tantor Labs LLC.

From 6316706c42996219e507bb6ded9dd1e872180e38 Mon Sep 17 00:00:00 2001
From: Ilia Evdokimov <ilya.evdoki...@tantorlabs.ru>
Date: Tue, 6 Feb 2024 18:11:04 +0300
Subject: [PATCH] pg_stat_advisor extension

---
 contrib/Makefile                              |   1 +
 contrib/meson.build                           |   1 +
 contrib/pg_stat_advisor/.gitignore            |   3 +
 contrib/pg_stat_advisor/Makefile              |  20 +
 contrib/pg_stat_advisor/README.md             |  85 ++++
 .../expected/pg_stat_advisor.out              |  96 ++++
 contrib/pg_stat_advisor/meson.build           |  30 ++
 contrib/pg_stat_advisor/pg_stat_advisor.c     | 477 ++++++++++++++++++
 .../pg_stat_advisor/sql/pg_stat_advisor.sql   |  50 ++
 9 files changed, 763 insertions(+)
 create mode 100644 contrib/pg_stat_advisor/.gitignore
 create mode 100644 contrib/pg_stat_advisor/Makefile
 create mode 100644 contrib/pg_stat_advisor/README.md
 create mode 100644 contrib/pg_stat_advisor/expected/pg_stat_advisor.out
 create mode 100644 contrib/pg_stat_advisor/meson.build
 create mode 100644 contrib/pg_stat_advisor/pg_stat_advisor.c
 create mode 100644 contrib/pg_stat_advisor/sql/pg_stat_advisor.sql

diff --git a/contrib/Makefile b/contrib/Makefile
index da4e2316a3..da9a4ceeaa 100644
--- a/contrib/Makefile
+++ b/contrib/Makefile
@@ -34,6 +34,7 @@ SUBDIRS = \
 		pg_buffercache	\
 		pg_freespacemap \
 		pg_prewarm	\
+		pg_stat_advisor \
 		pg_stat_statements \
 		pg_surgery	\
 		pg_trgm		\
diff --git a/contrib/meson.build b/contrib/meson.build
index c12dc906ca..a20d99443b 100644
--- a/contrib/meson.build
+++ b/contrib/meson.build
@@ -49,6 +49,7 @@ subdir('pgcrypto')
 subdir('pg_freespacemap')
 subdir('pg_prewarm')
 subdir('pgrowlocks')
+subdir('pg_stat_advisor')
 subdir('pg_stat_statements')
 subdir('pgstattuple')
 subdir('pg_surgery')
diff --git a/contrib/pg_stat_advisor/.gitignore b/contrib/pg_stat_advisor/.gitignore
new file mode 100644
index 0000000000..913175ff6e
--- /dev/null
+++ b/contrib/pg_stat_advisor/.gitignore
@@ -0,0 +1,3 @@
+/log/
+/results/
+/tmp_check/
diff --git a/contrib/pg_stat_advisor/Makefile b/contrib/pg_stat_advisor/Makefile
new file mode 100644
index 0000000000..f31b939e8a
--- /dev/null
+++ b/contrib/pg_stat_advisor/Makefile
@@ -0,0 +1,20 @@
+# contrib/pg_stat_advisor/Makefile
+
+MODULE_big = pg_stat_advisor
+OBJS = \
+	$(WIN32RES) \
+	pg_stat_advisor.o
+PGFILEDESC = "pg_stat_advisor - analyze query performance and recommend the creation of additional statistics"
+
+REGRESS = pg_stat_advisor
+
+ifdef USE_PGXS
+PG_CONFIG = pg_config
+PGXS := $(shell $(PG_CONFIG) --pgxs)
+include $(PGXS)
+else
+subdir = contrib/pg_stat_advisor
+top_builddir = ../..
+include $(top_builddir)/src/Makefile.global
+include $(top_srcdir)/contrib/contrib-global.mk
+endif
diff --git a/contrib/pg_stat_advisor/README.md b/contrib/pg_stat_advisor/README.md
new file mode 100644
index 0000000000..f9610f2ed5
--- /dev/null
+++ b/contrib/pg_stat_advisor/README.md
@@ -0,0 +1,85 @@
+## pg_stat_advisor - PostgreSQL advisor to create statistics
+
+pg_stat_advisor is a PostgreSQL extension designed to analyze query performance and recommend the creation of additional statistics to improve query plan.
+
+Append pg_stat_advisor to shared_preload_libraries configuration parameter in your postgresql.conf file then restart the PostgreSQL database to apply the changes. Or you can use "LOAD 'pg_stat_advisor';"command
+```
+LOAD 'pg_stat_advisor';
+```
+
+There is the pg_stat_advisor.suggest_statistics_threshold GUC that can be used to set a suggest_statistics_threshold. It is the the ratio of total tuples produced compared to the planned rows. If parameter is set by 0, the printing switches off.
+
+For example:
+```
+SET pg_stat_advisor.suggest_statistics_threshold = 1.0;
+```
+
+Examples:
+
+
+```
+postgres=# create table t (i int, j int);
+CREATE TABLE
+postgres=# insert into t select i/10, i/100 from generate_series(1, 1000000) i;
+INSERT 0 1000000
+postgres=# analyze t;
+ANALYZE
+postgres=# explain analyze select * from t where i = 100 and j = 10;
+                                                   QUERY PLAN
+
+----------------------------------------------------------------------------------------------
+------------------
+ Gather  (cost=1000.00..11675.10 rows=1 width=8) (actual time=0.526..61.564 rows=10 loops=1)
+   Workers Planned: 2
+   Workers Launched: 2
+   ->  Parallel Seq Scan on t  (cost=0.00..10675.00 rows=1 width=8) (actual time=35.369..54.44
+7 rows=3 loops=3)
+         Filter: ((i = 100) AND (j = 10))
+         Rows Removed by Filter: 333330
+ Planning Time: 0.148 ms
+ Execution Time: 61.589 ms
+(8 rows)
+
+
+postgres=# load 'pg_stat_advisor';
+LOAD
+postgres=# set pg_stat_advisor.suggest_statistics_threshold = 0.1;
+SET
+postgres=# explain analyze select * from t where i = 100 and j = 10;
+NOTICE:  pg_stat_advisor suggestion: CREATE STATISTICS t_i_j ON i, j FROM t
+                                                   QUERY PLAN
+
+----------------------------------------------------------------------------------------------
+------------------
+ Gather  (cost=1000.00..11675.10 rows=1 width=8) (actual time=0.400..59.292 rows=10 loops=1)
+   Workers Planned: 2
+   Workers Launched: 2
+   ->  Parallel Seq Scan on t  (cost=0.00..10675.00 rows=1 width=8) (actual time=35.614..54.29
+1 rows=3 loops=3)
+         Filter: ((i = 100) AND (j = 10))
+         Rows Removed by Filter: 333330
+ Planning Time: 0.081 ms
+ Execution Time: 59.413 ms
+(8 rows)
+
+
+postgres=# create statistics stat_t_i_j on i, j from t;
+CREATE STATISTICS
+postgres=# analyze t;
+ANALYZE
+postgres=# explain analyze select * from t where i = 100 and j = 10;
+                                                   QUERY PLAN
+
+----------------------------------------------------------------------------------------------
+------------------
+ Gather  (cost=1000.00..11675.10 rows=10 width=8) (actual time=0.400..59.292 rows=10 loops=1)
+   Workers Planned: 2
+   Workers Launched: 2
+   ->  Parallel Seq Scan on t  (cost=0.00..10675.00 rows=1 width=8) (actual time=35.614..54.29
+1 rows=3 loops=3)
+         Filter: ((i = 100) AND (j = 10))
+         Rows Removed by Filter: 333330
+ Planning Time: 0.081 ms
+ Execution Time: 59.413 ms
+(8 rows)
+```
diff --git a/contrib/pg_stat_advisor/expected/pg_stat_advisor.out b/contrib/pg_stat_advisor/expected/pg_stat_advisor.out
new file mode 100644
index 0000000000..b45e7153ae
--- /dev/null
+++ b/contrib/pg_stat_advisor/expected/pg_stat_advisor.out
@@ -0,0 +1,96 @@
+LOAD 'pg_stat_advisor';
+SET pg_stat_advisor.suggest_statistics_threshold = 0.1;
+-------------------------------Check printing suggestion-----------------------------------------------------------
+CREATE TABLE t (i INT, j INT);
+INSERT INTO t SELECT i/10, i/100 FROM  GENERATE_SERIES(1,1000000) i;
+ANALYZE t;
+EXPLAIN (ANALYZE, COSTS OFF, TIMING OFF, SUMMARY OFF) SELECT * FROM t WHERE i = 100 AND j = 10;
+NOTICE:  pg_stat_advisor suggestion: CREATE STATISTICS t_i_j ON i, j FROM t
+                      QUERY PLAN                      
+------------------------------------------------------
+ Gather (actual rows=10 loops=1)
+   Workers Planned: 2
+   Workers Launched: 2
+   ->  Parallel Seq Scan on t (actual rows=3 loops=3)
+         Filter: ((i = 100) AND (j = 10))
+         Rows Removed by Filter: 333330
+(6 rows)
+
+CREATE STATISTICS t_i_j ON i, j FROM t;
+ANALYZE t;
+EXPLAIN (ANALYZE, COSTS OFF, TIMING OFF, SUMMARY OFF) SELECT * FROM t WHERE i = 100 AND j = 10;
+                      QUERY PLAN                      
+------------------------------------------------------
+ Gather (actual rows=10 loops=1)
+   Workers Planned: 2
+   Workers Launched: 2
+   ->  Parallel Seq Scan on t (actual rows=3 loops=3)
+         Filter: ((i = 100) AND (j = 10))
+         Rows Removed by Filter: 333330
+(6 rows)
+
+DROP STATISTICS t_i_j;
+-------------------------------Check existing statistics except suggested name-------------------------------------
+CREATE STATISTICS statistics_t_i_j ON i, j FROM t;
+ANALYZE t;
+EXPLAIN (ANALYZE, COSTS OFF, TIMING OFF, SUMMARY OFF) SELECT * FROM t WHERE i = 100 AND j = 10;
+                      QUERY PLAN                      
+------------------------------------------------------
+ Gather (actual rows=10 loops=1)
+   Workers Planned: 2
+   Workers Launched: 2
+   ->  Parallel Seq Scan on t (actual rows=3 loops=3)
+         Filter: ((i = 100) AND (j = 10))
+         Rows Removed by Filter: 333330
+(6 rows)
+
+DROP STATISTICS statistics_t_i_j;
+DROP TABLE t;
+-------------------------------Check suggest_statistics_threshold parameter----------------------------------------
+SET pg_stat_advisor.suggest_statistics_threshold = 10.0;
+CREATE TABLE inner_tab(x INT, y INT);
+CREATE TABLE outer_tab(pk INT PRIMARY KEY, x INT, y INT);
+CREATE INDEX ON inner_tab(x,y);
+INSERT INTO outer_tab VALUES (GENERATE_SERIES(1,100000), GENERATE_SERIES(1,100000), GENERATE_SERIES(1,100000)*10);
+INSERT INTO inner_tab VALUES (GENERATE_SERIES(1,1000000)/10, GENERATE_SERIES(1,1000000)/10*10);
+ANALYZE inner_tab;
+ANALYZE outer_tab;
+EXPLAIN (ANALYZE, COSTS OFF, TIMING OFF, SUMMARY OFF) SELECT * FROM outer_tab JOIN inner_tab USING(x,y) WHERE pk=1;
+NOTICE:  pg_stat_advisor suggestion: CREATE STATISTICS inner_tab_x_y ON x, y FROM inner_tab
+                                     QUERY PLAN                                      
+-------------------------------------------------------------------------------------
+ Nested Loop (actual rows=10 loops=1)
+   ->  Index Scan using outer_tab_pkey on outer_tab (actual rows=1 loops=1)
+         Index Cond: (pk = 1)
+   ->  Index Only Scan using inner_tab_x_y_idx on inner_tab (actual rows=10 loops=1)
+         Index Cond: ((x = outer_tab.x) AND (y = outer_tab.y))
+         Heap Fetches: 10
+(6 rows)
+
+CREATE STATISTICS inner_tab_x_y ON x, y FROM inner_tab;
+ANALYZE inner_tab;
+EXPLAIN (ANALYZE, COSTS OFF, TIMING OFF, SUMMARY OFF) SELECT * FROM outer_tab JOIN inner_tab USING(x,y) WHERE pk=1;
+                                     QUERY PLAN                                      
+-------------------------------------------------------------------------------------
+ Nested Loop (actual rows=10 loops=1)
+   ->  Index Scan using outer_tab_pkey on outer_tab (actual rows=1 loops=1)
+         Index Cond: (pk = 1)
+   ->  Index Only Scan using inner_tab_x_y_idx on inner_tab (actual rows=10 loops=1)
+         Index Cond: ((x = outer_tab.x) AND (y = outer_tab.y))
+         Heap Fetches: 10
+(6 rows)
+
+DROP STATISTICS inner_tab_x_y;
+SET pg_stat_advisor.suggest_statistics_threshold = 10.1;
+ANALYZE inner_tab;
+EXPLAIN (ANALYZE, COSTS OFF, TIMING OFF, SUMMARY OFF) SELECT * FROM outer_tab JOIN inner_tab USING(x,y) WHERE pk=1;
+                                     QUERY PLAN                                      
+-------------------------------------------------------------------------------------
+ Nested Loop (actual rows=10 loops=1)
+   ->  Index Scan using outer_tab_pkey on outer_tab (actual rows=1 loops=1)
+         Index Cond: (pk = 1)
+   ->  Index Only Scan using inner_tab_x_y_idx on inner_tab (actual rows=10 loops=1)
+         Index Cond: ((x = outer_tab.x) AND (y = outer_tab.y))
+         Heap Fetches: 10
+(6 rows)
+
diff --git a/contrib/pg_stat_advisor/meson.build b/contrib/pg_stat_advisor/meson.build
new file mode 100644
index 0000000000..292917c025
--- /dev/null
+++ b/contrib/pg_stat_advisor/meson.build
@@ -0,0 +1,30 @@
+# Copyright (c) 2022-2024, PostgreSQL Global Development Group
+
+pg_stat_advisor_sources = files(
+  'pg_stat_advisor.c',
+)
+
+if host_system == 'windows'
+  pg_stat_advisor_sources += rc_lib_gen.process(win32ver_rc, extra_args: [
+    '--NAME', 'pg_stat_advsior',
+    '--FILEDESC', 'pg_stat_advisor - suggestion of creating statistics',])
+endif
+
+pg_stat_advisor = shared_module('pg_stat_advisor',
+  pg_stat_advisor_sources,
+  kwargs: contrib_mod_args + {
+    'dependencies': contrib_mod_args['dependencies'],
+  },
+)
+contrib_targets += pg_stat_advisor
+
+tests += {
+  'name': 'pg_stat_advisor',
+  'sd': meson.current_source_dir(),
+  'bd': meson.current_build_dir(),
+  'regress': {
+    'sql': [
+      'pg_stat_advisor',
+    ],
+  },
+}
diff --git a/contrib/pg_stat_advisor/pg_stat_advisor.c b/contrib/pg_stat_advisor/pg_stat_advisor.c
new file mode 100644
index 0000000000..498a41d4bf
--- /dev/null
+++ b/contrib/pg_stat_advisor/pg_stat_advisor.c
@@ -0,0 +1,477 @@
+#include "postgres.h"
+
+#include "access/hash.h"
+#include "access/table.h"
+#include "catalog/pg_statistic_ext.h"
+#include "commands/explain.h"
+#include "optimizer/optimizer.h"
+#include "parser/parsetree.h"
+#include "statistics/statistics.h"
+#include "utils/guc.h"
+#include "utils/syscache.h"
+#include "utils/lsyscache.h"
+
+PG_MODULE_MAGIC;
+
+/* GUC variables */
+static double pg_stat_advisor_suggest_statistics_threshold = 0.0;
+
+/* Current nesting depth of ExecutorRun calls */
+static int	nesting_level = 0;
+
+#define pg_stat_advisor_enabled() (nesting_level == 0)
+
+void		_PG_init(void);
+
+/* Saved hook values in case of unload */
+static ExecutorStart_hook_type prev_ExecutorStart = NULL;
+static ExecutorRun_hook_type prev_ExecutorRun = NULL;
+static ExecutorFinish_hook_type prev_ExecutorFinish = NULL;
+static ExecutorEnd_hook_type prev_ExecutorEnd = NULL;
+
+static void explain_ExecutorStart(QueryDesc *queryDesc, int eflags);
+static void explain_ExecutorRun(QueryDesc *queryDesc,
+								ScanDirection direction,
+								uint64 count, bool execute_once);
+static void explain_ExecutorFinish(QueryDesc *queryDesc);
+static void explain_ExecutorEnd(QueryDesc *queryDesc);
+
+static void SuggestMultiColumnStatisticsForNode(PlanState *planstate, ExplainState *es);
+
+/*
+ * Module load callback
+ */
+void
+_PG_init(void)
+{
+	/* Define custom GUC variables. */
+	DefineCustomRealVariable("pg_stat_advisor.suggest_statistics_threshold",
+							 "Set the threshold for actual/estimated rows",
+							 "Zero disables suggestion of creating statistics",
+							 &pg_stat_advisor_suggest_statistics_threshold,
+							 0.0,
+							 0.0,
+							 INT_MAX,
+							 PGC_SUSET,
+							 0,
+							 NULL,
+							 NULL,
+							 NULL);
+
+	MarkGUCPrefixReserved("pg_stat_advisor");
+
+	/* Install hooks. */
+	prev_ExecutorStart = ExecutorStart_hook;
+	ExecutorStart_hook = explain_ExecutorStart;
+	prev_ExecutorRun = ExecutorRun_hook;
+	ExecutorRun_hook = explain_ExecutorRun;
+	prev_ExecutorFinish = ExecutorFinish_hook;
+	ExecutorFinish_hook = explain_ExecutorFinish;
+	prev_ExecutorEnd = ExecutorEnd_hook;
+	ExecutorEnd_hook = explain_ExecutorEnd;
+}
+
+/*
+ * ExecutorStart hook: start up logging if needed
+ */
+static void
+explain_ExecutorStart(QueryDesc *queryDesc, int eflags)
+{
+	if (prev_ExecutorStart)
+		prev_ExecutorStart(queryDesc, eflags);
+	else
+		standard_ExecutorStart(queryDesc, eflags);
+
+	if (pg_stat_advisor_enabled() && queryDesc->totaltime == NULL)
+	{
+		MemoryContext oldcxt;
+
+		oldcxt = MemoryContextSwitchTo(queryDesc->estate->es_query_cxt);
+		queryDesc->totaltime = InstrAlloc(1, INSTRUMENT_ALL, false);
+		MemoryContextSwitchTo(oldcxt);
+	}
+}
+
+/*
+ * ExecutorRun hook: all we need do is track nesting depth
+ */
+static void
+explain_ExecutorRun(QueryDesc *queryDesc, ScanDirection direction,
+					uint64 count, bool execute_once)
+{
+	nesting_level++;
+	PG_TRY();
+	{
+		if (prev_ExecutorRun)
+			prev_ExecutorRun(queryDesc, direction, count, execute_once);
+		else
+			standard_ExecutorRun(queryDesc, direction, count, execute_once);
+	}
+	PG_FINALLY();
+	{
+		nesting_level--;
+	}
+	PG_END_TRY();
+}
+
+/*
+ * ExecutorFinish hook: all we need do is track nesting depth
+ */
+static void
+explain_ExecutorFinish(QueryDesc *queryDesc)
+{
+	nesting_level++;
+	PG_TRY();
+	{
+		if (prev_ExecutorFinish)
+			prev_ExecutorFinish(queryDesc);
+		else
+			standard_ExecutorFinish(queryDesc);
+	}
+	PG_FINALLY();
+	{
+		nesting_level--;
+	}
+	PG_END_TRY();
+}
+
+/*
+ * ExecutorEnd hook: log results if needed
+ */
+static void
+explain_ExecutorEnd(QueryDesc *queryDesc)
+{
+	if (queryDesc->totaltime && pg_stat_advisor_enabled())
+	{
+		MemoryContext oldcxt;
+		ExplainState *es;
+
+		/*
+		 * Make sure we operate in the per-query context, so any cruft will be
+		 * discarded later during ExecutorEnd.
+		 */
+		oldcxt = MemoryContextSwitchTo(queryDesc->estate->es_query_cxt);
+
+		/*
+		 * Make sure stats accumulation is done.  (Note: it's okay if several
+		 * levels of hook all do this.)
+		 */
+		InstrEndLoop(queryDesc->totaltime);
+
+		es = NewExplainState();
+
+		es->analyze = queryDesc->instrument_options;
+
+		ExplainBeginOutput(es);
+		ExplainQueryText(es, queryDesc);
+
+		ExplainPrintPlan(es, queryDesc);
+		if (es->analyze)
+			ExplainPrintTriggers(es, queryDesc);
+		if (es->costs)
+			ExplainPrintJITSummary(es, queryDesc);
+		ExplainEndOutput(es);
+
+		if (pg_stat_advisor_suggest_statistics_threshold && !IsParallelWorker())
+			SuggestMultiColumnStatisticsForNode(queryDesc->planstate, es);
+
+		MemoryContextSwitchTo(oldcxt);
+	}
+
+	if (prev_ExecutorEnd)
+		prev_ExecutorEnd(queryDesc);
+	else
+		standard_ExecutorEnd(queryDesc);
+}
+
+/**
+ * Comparator used to sort Vars by name
+ */
+static int
+vars_list_comparator(const ListCell *a, const ListCell *b)
+{
+	char	   *va = strVal(linitial(((ColumnRef *) lfirst(a))->fields));
+	char	   *vb = strVal(linitial(((ColumnRef *) lfirst(b))->fields));
+
+	return strcmp(va, vb);
+}
+
+/**
+ * Suggest statistics for specified subplans.
+ */
+static void
+SuggestMultiColumnStatisticsForSubPlans(List *plans, ExplainState *es)
+{
+	ListCell   *lst;
+
+	foreach(lst, plans)
+	{
+		SubPlanState *sps = (SubPlanState *) lfirst(lst);
+
+		SuggestMultiColumnStatisticsForNode(sps->planstate, es);
+	}
+}
+
+/**
+ * Suggest statistics for plan subnodes.
+ */
+static void
+SuggestMultiColumnStatisticsForMemberNodes(PlanState **planstates, int nsubnodes,
+										   ExplainState *es)
+{
+	int			j;
+
+	for (j = 0; j < nsubnodes; j++)
+		SuggestMultiColumnStatisticsForNode(planstates[j], es);
+}
+
+/**
+ * Find extended statistics from 'relation_oid' relation on colmap
+ */
+static bool
+FindExtendedStatisticsOnVars(Oid *relation_oid, Bitmapset *colmap)
+{
+	bool		isExistExtendedStatistics = false;
+	ListCell   *oid;
+
+	/* Receive all extended statistics for current relation */
+	Relation	pg_stextdata = table_open(*relation_oid, RowExclusiveLock);
+	List	   *statistics = RelationGetStatExtList(pg_stextdata);
+
+	foreach(oid, statistics)
+	{
+		Oid			statOid = lfirst_oid(oid);
+		Form_pg_statistic_ext staForm;
+		HeapTuple	htup;
+		Bitmapset  *keys = NULL;
+		int			i;
+
+		/* Read from pg_statistic_ext */
+		htup = SearchSysCache1(STATEXTOID, ObjectIdGetDatum(statOid));
+		if (!HeapTupleIsValid(htup))
+			elog(ERROR, "cache lookup failed for statistics object %u", statOid);
+		staForm = (Form_pg_statistic_ext) GETSTRUCT(htup);
+
+		for (i = 0; i < staForm->stxkeys.dim1; i++)
+			keys = bms_add_member(keys, staForm->stxkeys.values[i]);
+
+		/* If we have statistics for current columns */
+		if (!bms_compare(keys, colmap))
+		{
+			isExistExtendedStatistics = true;
+
+			ReleaseSysCache(htup);
+			bms_free(keys);
+
+			break;
+		}
+
+		ReleaseSysCache(htup);
+		bms_free(keys);
+	}
+
+	list_free(statistics);
+	table_close(pg_stextdata, RowExclusiveLock);
+
+	return isExistExtendedStatistics;
+}
+
+/**
+ * Suggest statistics for qual
+ */
+static void
+SuggestMultiColumnStatisticsForQual(List *qual, ExplainState *es)
+{
+	List	   *vars = NULL;
+	ListCell   *lc;
+
+	/* Extract vars from all quals */
+	foreach(lc, qual)
+	{
+		Node	   *node = (Node *) lfirst(lc);
+
+		if (IsA(node, RestrictInfo))
+			node = (Node *) ((RestrictInfo *) node)->clause;
+		vars = list_concat(vars, pull_vars_of_level(node, 0));
+	}
+
+	/* Loop until we considered all vars */
+	while (vars != NULL)
+	{
+		ListCell   *cell;
+		List	   *cols = NULL;
+		Index		relno = 0;
+		Bitmapset  *colmap = NULL;
+
+		/* Contruct list of unique vars */
+		foreach(cell, vars)
+		{
+			Node	   *node = (Node *) lfirst(cell);
+
+			if (IsA(node, Var))
+			{
+				Var		   *var = (Var *) node;
+				int			varno = IS_SPECIAL_VARNO(var->varno) ? var->varnosyn : var->varno;
+
+				if (cols == NULL || varno == relno)
+				{
+					int			varattno = IS_SPECIAL_VARNO(var->varno) ? var->varattnosyn : var->varattno;
+
+					relno = varno;
+					if (var->varattno > 0 &&
+						!bms_is_member(varattno, colmap) &&
+						varno >= 1 &&	/* not synthetic var */
+						varno <= list_length(es->rtable) &&
+						list_length(cols) < STATS_MAX_DIMENSIONS)
+					{
+						RangeTblEntry *rte = rt_fetch(varno, es->rtable);
+
+						if (rte->rtekind == RTE_RELATION)
+						{
+							ColumnRef  *col = makeNode(ColumnRef);
+							char	   *colname = get_rte_attribute_name(rte, varattno);
+
+							col->fields = list_make1(makeString(colname));
+							cols = lappend(cols, col);
+							colmap = bms_add_member(colmap, varattno);
+						}
+					}
+				}
+				else
+				{
+					continue;
+				}
+			}
+			vars = foreach_delete_current(vars, cell);
+		}
+		/* To suggest statitics we need to have at least 2 columns */
+		if (list_length(cols) >= 2)
+		{
+			RangeTblEntry *rte = rt_fetch(relno, es->rtable);
+			char	   *rel_name = get_rel_name(rte->relid);
+			char	   *stat_name = rel_name;
+			char	   *create_stat_stmt = (char *) "";
+			char const *sep = "ON";
+			size_t		name_len;
+
+			/* Sort variables by name */
+			list_sort(cols, vars_list_comparator);
+
+			/*
+			 * Construct name for statistic by concatenating relation name
+			 * with all columns
+			 */
+			foreach(cell, cols)
+			{
+				char	   *col_name = strVal(linitial(((ColumnRef *) lfirst(cell))->fields));
+
+				stat_name = psprintf("%s_%s", stat_name, col_name);
+				create_stat_stmt = psprintf("%s%s %s", create_stat_stmt, sep, col_name);
+				sep = ",";
+			}
+
+			name_len = strlen(stat_name);
+			/* Truncate name if it doesn't fit in NameData */
+			if (name_len >= NAMEDATALEN)
+				stat_name = psprintf("%.*s_%08x", NAMEDATALEN - 10, stat_name, (unsigned) hash_any((uint8 *) stat_name, name_len));
+
+			if (!FindExtendedStatisticsOnVars(&rte->relid, colmap))
+			{
+				ereport(NOTICE, (errmsg("pg_stat_advisor suggestion: CREATE STATISTICS %s %s FROM %s",
+										stat_name, create_stat_stmt, rel_name),
+								 errhidestmt(true)));
+			}
+		}
+
+		bms_free(colmap);
+	}
+}
+
+/**
+ * Suggest statistics for node
+ */
+static void
+SuggestMultiColumnStatisticsForNode(PlanState *planstate, ExplainState *es)
+{
+	Plan	   *plan = planstate->plan;
+
+	if (planstate->instrument && plan->plan_rows != 0 &&
+		pg_stat_advisor_suggest_statistics_threshold > 0.0 &&
+		planstate->instrument->ntuples / plan->plan_rows >= pg_stat_advisor_suggest_statistics_threshold)
+	{
+		elog(DEBUG1, "Estimated=%f, actual=%f, error=%f: plan=%s",
+			 plan->plan_rows,
+			 planstate->instrument->ntuples,
+			 planstate->instrument->ntuples / plan->plan_rows,
+			 nodeToString(plan));
+		/* quals, sort keys, etc */
+		switch (nodeTag(plan))
+		{
+			case T_IndexScan:
+				SuggestMultiColumnStatisticsForQual(((IndexScan *) plan)->indexqualorig, es);
+				break;
+			case T_IndexOnlyScan:
+				SuggestMultiColumnStatisticsForQual(((IndexOnlyScan *) plan)->indexqual, es);
+				break;
+			case T_BitmapIndexScan:
+				SuggestMultiColumnStatisticsForQual(((BitmapIndexScan *) plan)->indexqualorig, es);
+				break;
+			case T_NestLoop:
+				SuggestMultiColumnStatisticsForQual(((NestLoop *) plan)->join.joinqual, es);
+				break;
+			case T_MergeJoin:
+				SuggestMultiColumnStatisticsForQual(((MergeJoin *) plan)->mergeclauses, es);
+				SuggestMultiColumnStatisticsForQual(((MergeJoin *) plan)->join.joinqual, es);
+				break;
+			case T_HashJoin:
+				SuggestMultiColumnStatisticsForQual(((HashJoin *) plan)->hashclauses, es);
+				SuggestMultiColumnStatisticsForQual(((HashJoin *) plan)->join.joinqual, es);
+				break;
+			default:
+				break;
+		}
+		SuggestMultiColumnStatisticsForQual(plan->qual, es);
+	}
+
+	/* initPlan-s */
+	if (planstate->initPlan)
+		SuggestMultiColumnStatisticsForSubPlans(planstate->initPlan, es);
+
+	/* lefttree */
+	if (outerPlanState(planstate))
+		SuggestMultiColumnStatisticsForNode(outerPlanState(planstate), es);
+
+	/* righttree */
+	if (innerPlanState(planstate))
+		SuggestMultiColumnStatisticsForNode(innerPlanState(planstate), es);
+
+	/* special child plans */
+	switch (nodeTag(plan))
+	{
+		case T_Append:
+			SuggestMultiColumnStatisticsForMemberNodes(((AppendState *) planstate)->appendplans,
+													   ((AppendState *) planstate)->as_nplans,
+													   es);
+			break;
+		case T_MergeAppend:
+			SuggestMultiColumnStatisticsForMemberNodes(((MergeAppendState *) planstate)->mergeplans,
+													   ((MergeAppendState *) planstate)->ms_nplans,
+													   es);
+			break;
+		case T_BitmapAnd:
+			SuggestMultiColumnStatisticsForMemberNodes(((BitmapAndState *) planstate)->bitmapplans,
+													   ((BitmapAndState *) planstate)->nplans,
+													   es);
+			break;
+		case T_BitmapOr:
+			SuggestMultiColumnStatisticsForMemberNodes(((BitmapOrState *) planstate)->bitmapplans,
+													   ((BitmapOrState *) planstate)->nplans,
+													   es);
+			break;
+		case T_SubqueryScan:
+			SuggestMultiColumnStatisticsForNode(((SubqueryScanState *) planstate)->subplan, es);
+			break;
+		default:
+			break;
+	}
+}
diff --git a/contrib/pg_stat_advisor/sql/pg_stat_advisor.sql b/contrib/pg_stat_advisor/sql/pg_stat_advisor.sql
new file mode 100644
index 0000000000..3fd8b24416
--- /dev/null
+++ b/contrib/pg_stat_advisor/sql/pg_stat_advisor.sql
@@ -0,0 +1,50 @@
+LOAD 'pg_stat_advisor';
+SET pg_stat_advisor.suggest_statistics_threshold = 0.1;
+
+-------------------------------Check printing suggestion-----------------------------------------------------------
+
+CREATE TABLE t (i INT, j INT);
+INSERT INTO t SELECT i/10, i/100 FROM  GENERATE_SERIES(1,1000000) i;
+ANALYZE t;
+EXPLAIN (ANALYZE, COSTS OFF, TIMING OFF, SUMMARY OFF) SELECT * FROM t WHERE i = 100 AND j = 10;
+
+CREATE STATISTICS t_i_j ON i, j FROM t;
+
+ANALYZE t;
+EXPLAIN (ANALYZE, COSTS OFF, TIMING OFF, SUMMARY OFF) SELECT * FROM t WHERE i = 100 AND j = 10;
+
+DROP STATISTICS t_i_j;
+
+-------------------------------Check existing statistics except suggested name-------------------------------------
+
+CREATE STATISTICS statistics_t_i_j ON i, j FROM t;
+
+ANALYZE t;
+EXPLAIN (ANALYZE, COSTS OFF, TIMING OFF, SUMMARY OFF) SELECT * FROM t WHERE i = 100 AND j = 10;
+
+DROP STATISTICS statistics_t_i_j;
+DROP TABLE t;
+
+-------------------------------Check suggest_statistics_threshold parameter----------------------------------------
+
+SET pg_stat_advisor.suggest_statistics_threshold = 10.0;
+
+CREATE TABLE inner_tab(x INT, y INT);
+CREATE TABLE outer_tab(pk INT PRIMARY KEY, x INT, y INT);
+CREATE INDEX ON inner_tab(x,y);
+INSERT INTO outer_tab VALUES (GENERATE_SERIES(1,100000), GENERATE_SERIES(1,100000), GENERATE_SERIES(1,100000)*10);
+INSERT INTO inner_tab VALUES (GENERATE_SERIES(1,1000000)/10, GENERATE_SERIES(1,1000000)/10*10);
+ANALYZE inner_tab;
+ANALYZE outer_tab;
+EXPLAIN (ANALYZE, COSTS OFF, TIMING OFF, SUMMARY OFF) SELECT * FROM outer_tab JOIN inner_tab USING(x,y) WHERE pk=1;
+
+CREATE STATISTICS inner_tab_x_y ON x, y FROM inner_tab;
+
+ANALYZE inner_tab;
+EXPLAIN (ANALYZE, COSTS OFF, TIMING OFF, SUMMARY OFF) SELECT * FROM outer_tab JOIN inner_tab USING(x,y) WHERE pk=1;
+
+DROP STATISTICS inner_tab_x_y;
+SET pg_stat_advisor.suggest_statistics_threshold = 10.1;
+
+ANALYZE inner_tab;
+EXPLAIN (ANALYZE, COSTS OFF, TIMING OFF, SUMMARY OFF) SELECT * FROM outer_tab JOIN inner_tab USING(x,y) WHERE pk=1;
\ No newline at end of file
-- 
2.34.1

Re: pg_stat_advisor extension

Reply via email to