From 59d6fa8fd1dbccd60cf4a72663157fc531dbd5f9 Mon Sep 17 00:00:00 2001
From: erthalion <9erthalion6@gmail.com>
Date: Fri, 29 Mar 2019 15:20:22 +0100
Subject: [PATCH v12 3/3] Reorder by values distribution

---
 src/backend/optimizer/path/pathkeys.c    | 218 +++++++++++++++++++++++++++++++
 src/backend/optimizer/plan/planner.c     |  33 ++++-
 src/backend/utils/misc/guc.c             |  21 ++-
 src/include/optimizer/paths.h            |  10 ++
 src/test/regress/expected/aggregates.out |  16 +--
 5 files changed, 288 insertions(+), 10 deletions(-)

diff --git a/src/backend/optimizer/path/pathkeys.c b/src/backend/optimizer/path/pathkeys.c
index 65e53ef854..ac79bc4975 100644
--- a/src/backend/optimizer/path/pathkeys.c
+++ b/src/backend/optimizer/path/pathkeys.c
@@ -24,6 +24,7 @@
 #include "nodes/plannodes.h"
 #include "optimizer/optimizer.h"
 #include "optimizer/pathnode.h"
+#include "optimizer/plancat.h"
 #include "optimizer/paths.h"
 #include "partitioning/partbounds.h"
 #include "utils/lsyscache.h"
@@ -332,6 +333,11 @@ pathkeys_contained_in(List *keys1, List *keys2)
 	return false;
 }
 
+/************************<DEBUG PART>*************************************/
+bool debug_group_by_reorder_by_pathkeys = true;
+bool debug_cheapest_group_by = true;
+/************************</DEBUG PART>************************************/
+
 /*
  * Reorder GROUP BY pathkeys and clauses to match order of pathkeys. Function
  * returns new lists,  original GROUP BY lists stay untouched.
@@ -345,6 +351,9 @@ group_keys_reorder_by_pathkeys(List *pathkeys, List **group_pathkeys,
 	ListCell	*key;
 	int			n;
 
+	if (debug_group_by_reorder_by_pathkeys == false)
+		return 0;
+
 	if (pathkeys == NIL || *group_pathkeys == NIL)
 		return 0;
 
@@ -384,6 +393,215 @@ group_keys_reorder_by_pathkeys(List *pathkeys, List **group_pathkeys,
 	return n;
 }
 
+/*
+ * get_width_multiplier
+ *
+ *	Returns relative complexity of comparing two values based on their width.
+ *	The idea behind is that long values have more expensive comparison.
+ */
+static double
+get_width_multiplier(PlannerInfo *root, Expr *expr)
+{
+	double	width = -1.0;
+
+	if (IsA(expr, RelabelType))
+		expr = (Expr *) ((RelabelType *) expr)->arg;
+
+	/* Try to find actual stat in corresonding relation */
+	if (IsA(expr, Var))
+	{
+		Var		*var = (Var *) expr;
+
+		if (var->varno > 0 && var->varno < root->simple_rel_array_size)
+		{
+			RelOptInfo	*rel = root->simple_rel_array[var->varno];
+
+			if (rel != NULL &&
+				var->varattno >= rel->min_attr &&
+				var->varattno <= rel->max_attr)
+			{
+				int	ndx = var->varattno - rel->min_attr;
+
+				if (rel->attr_widths[ndx] > 0)
+					width = rel->attr_widths[ndx];
+			}
+		}
+	}
+
+	/* Didn't find any actual stats, use estimation by type */
+	if (width < 0.0)
+	{
+		Node	*node = (Node*) expr;
+
+		width = get_typavgwidth(exprType(node), exprTypmod(node));
+	}
+
+	/*
+	 * Any value in pgsql is passed by Datum type, so any operation with value
+	 * could not be cheaper than operation with Datum type
+	 */
+	if (width <= sizeof(Datum))
+		return sizeof(Datum);
+
+	return width;
+}
+
+/*
+ * Order tail of list of group pathkeys by uniqueness descendetly. It allows to
+ * speedup sorting. Returns newly allocated lists, old ones stay untouched.
+ * n_preordered defines a head of list which order should be prevented.
+ */
+void
+get_cheapest_group_keys_order(PlannerInfo *root, double nrows,
+							  List *target_list,
+							  List **group_pathkeys, List **group_clauses,
+							  int n_preordered)
+{
+	struct
+	{
+		PathKey			*pathkey;
+		SortGroupClause	*sgc;
+		Node			*pathkeyExpr;
+	}
+				   *keys, tmp;
+	int				nkeys = list_length(*group_pathkeys) - n_preordered;
+	List		   *pathkeyExprList = NIL,
+				   *new_group_pathkeys = NIL,
+				   *new_group_clauses = NIL;
+	ListCell	   *cell;
+	int				i = 0, n_keys_to_est;
+
+	if (!debug_cheapest_group_by)
+		return;
+
+	if (nkeys < 2)
+		return; /* nothing to do */
+
+	/*
+	 * Nothing to do here, since reordering of group clauses to match ORDER BY
+	 * already performed in preprocess_groupclause
+	 */
+	if (n_preordered == 0 && root->sort_pathkeys)
+		return;
+
+	keys = palloc(nkeys * sizeof(*keys));
+
+	/*
+	 * Collect information about pathkey for subsequent usage
+	 */
+	for_each_cell(cell, list_nth_cell(*group_pathkeys, n_preordered))
+	{
+		PathKey			*pathkey = (PathKey *) lfirst(cell);
+
+		keys[i].pathkey = pathkey;
+		keys[i].sgc = get_sortgroupref_clause(pathkey->pk_eclass->ec_sortref,
+											  *group_clauses);
+		keys[i].pathkeyExpr = get_sortgroupclause_expr(keys[i].sgc,
+													   target_list);
+		i++;
+	}
+
+	/*
+	 * Find the cheapest to sort order of columns. We will find a first column
+	 * with bigger number of group, then pair (first column in pair is  already
+	 * defined in first step), them triple and so on.
+	 */
+	for(n_keys_to_est = 1; n_keys_to_est <= nkeys - 1; n_keys_to_est++)
+	{
+		ListCell   *tail_cell;
+		int			best_i = 0;
+		double		best_est_num_groups = -1;
+
+		/* expand list of columns and remeber last cell */
+		pathkeyExprList = lappend(pathkeyExprList, NULL);
+		tail_cell = list_tail(pathkeyExprList);
+
+		/*
+		 * Find the best last column - the best means bigger number of groups,
+		 * previous columns are already choosen
+		 */
+		for(i = n_keys_to_est - 1; i < nkeys; i++)
+		{
+			double  est_num_groups;
+			Expr *expr = (Expr *) keys[i].pathkeyExpr;
+			PathKey *pathkey = keys[i].pathkey;
+			EquivalenceMember *em = (EquivalenceMember *)
+									linitial(pathkey->pk_eclass->ec_members);
+
+			lfirst(tail_cell) = keys[i].pathkeyExpr;
+			est_num_groups = estimate_num_groups(root, pathkeyExprList,
+												 nrows, NULL);
+			est_num_groups /= get_width_multiplier(root, expr);
+
+			if (em->em_datatype != InvalidOid)
+			{
+				Oid			sortop;
+				QualCost	costs;
+				costs.startup = costs.per_tuple = 0;
+
+				sortop = get_opfamily_member(pathkey->pk_opfamily,
+											 em->em_datatype, em->em_datatype,
+											 pathkey->pk_strategy);
+				add_function_cost(root, get_opcode(sortop), NULL, &costs);
+
+				est_num_groups /= costs.per_tuple;
+			}
+
+			if (est_num_groups > best_est_num_groups)
+			{
+				best_est_num_groups = est_num_groups;
+				best_i = i;
+			}
+		}
+
+		/* Save the best choice */
+		lfirst(tail_cell) = keys[best_i].pathkeyExpr;
+		if (best_i != n_keys_to_est - 1)
+		{
+			tmp = keys[n_keys_to_est - 1];
+			keys[n_keys_to_est - 1] = keys[best_i];
+			keys[best_i] = tmp;
+		}
+	}
+	list_free(pathkeyExprList);
+
+	/*
+	 * Construct result lists, keys array is already ordered to get a cheapest
+	 * sort
+	 */
+	i = 0;
+	foreach(cell, *group_pathkeys)
+	{
+		PathKey	   *pathkey;
+		SortGroupClause *sgc;
+
+		if (i < n_preordered)
+		{
+			pathkey = (PathKey *) lfirst(cell);
+			sgc = get_sortgroupref_clause(pathkey->pk_eclass->ec_sortref,
+										  *group_clauses);
+		}
+		else
+		{
+			pathkey = keys[i - n_preordered].pathkey;
+			sgc = keys[i - n_preordered].sgc;
+		}
+
+		new_group_pathkeys = lappend(new_group_pathkeys, pathkey);
+		new_group_clauses = lappend(new_group_clauses, sgc);
+
+		i++;
+	}
+
+	pfree(keys);
+
+	/* Just append the rest GROUP BY clauses */
+	new_group_clauses = list_concat_unique_ptr(new_group_clauses, *group_clauses);
+
+	*group_pathkeys = new_group_pathkeys;
+	*group_clauses = new_group_clauses;
+}
+
 /*
  * get_cheapest_path_for_pathkeys
  *	  Find the cheapest path (according to the specified criterion) that
diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c
index 247e39d6ff..d480551d3c 100644
--- a/src/backend/optimizer/plan/planner.c
+++ b/src/backend/optimizer/plan/planner.c
@@ -6389,7 +6389,7 @@ add_paths_to_grouping_rel(PlannerInfo *root, RelOptInfo *input_rel,
 			bool		is_sorted;
 			List	   *group_pathkeys = root->group_pathkeys,
 					   *group_clauses = parse->groupClause;
-			int			n_preordered_groups;
+			int			n_preordered_groups = 0;
 
 			if (parse->groupingSets)
 			{
@@ -6413,11 +6413,20 @@ add_paths_to_grouping_rel(PlannerInfo *root, RelOptInfo *input_rel,
 			{
 				/* Sort the cheapest-total path if it isn't already sorted */
 				if (!is_sorted)
+				{
+					if (!parse->groupingSets)
+						get_cheapest_group_keys_order(root,
+													  path->rows,
+													  extra->targetList,
+													  &group_pathkeys,
+													  &group_clauses,
+													  n_preordered_groups);
 					path = (Path *) create_sort_path(root,
 													 grouped_rel,
 													 path,
 													 group_pathkeys,
 													 -1.0);
+				}
 
 				/* Now decide what to stick atop it */
 				if (parse->groupingSets)
@@ -6491,6 +6500,12 @@ add_paths_to_grouping_rel(PlannerInfo *root, RelOptInfo *input_rel,
 				{
 					if (path != partially_grouped_rel->cheapest_total_path)
 						continue;
+					get_cheapest_group_keys_order(root,
+												  path->rows,
+												  extra->targetList,
+												  &group_pathkeys,
+												  &group_clauses,
+												  n_preordered_groups);
 					path = (Path *) create_sort_path(root,
 													 grouped_rel,
 													 path,
@@ -6765,11 +6780,19 @@ create_partial_grouping_paths(PlannerInfo *root,
 			{
 				/* Sort the cheapest partial path, if it isn't already */
 				if (!is_sorted)
+				{
+					get_cheapest_group_keys_order(root,
+												  path->rows,
+												  extra->targetList,
+												  &group_pathkeys,
+												  &group_clauses,
+												  n_preordered_groups);
 					path = (Path *) create_sort_path(root,
 													 partially_grouped_rel,
 													 path,
 													 group_pathkeys,
 													 -1.0);
+				}
 
 				if (parse->hasAggs)
 					add_path(partially_grouped_rel, (Path *)
@@ -6816,11 +6839,19 @@ create_partial_grouping_paths(PlannerInfo *root,
 
 				/* Sort the cheapest partial path, if it isn't already */
 				if (!is_sorted)
+				{
+					get_cheapest_group_keys_order(root,
+												  path->rows,
+												  extra->targetList,
+												  &group_pathkeys,
+												  &group_clauses,
+												  n_preordered_groups);
 					path = (Path *) create_sort_path(root,
 													 partially_grouped_rel,
 													 path,
 													 group_pathkeys,
 													 -1.0);
+				}
 
 				if (parse->hasAggs)
 					add_partial_path(partially_grouped_rel, (Path *)
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index f7f726b5ae..f13ea2ab5d 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -1952,7 +1952,26 @@ static struct config_bool ConfigureNamesBool[] =
 		false,
 		NULL, NULL, NULL
 	},
-
+/************************<DEBUG OPT GROUP BY>*********************************/
+	{
+		{"debug_group_by_reorder_by_pathkeys", PGC_USERSET, QUERY_TUNING_METHOD,
+			gettext_noop("enable reorder GROUP BY by pathkeys"),
+			NULL
+		},
+		&debug_group_by_reorder_by_pathkeys,
+		true,
+		NULL, NULL, NULL
+	},
+	{
+		{"debug_enable_cheapest_group_by", PGC_USERSET, QUERY_TUNING_METHOD,
+			gettext_noop("find a cheapest order of columns in GROUP BY."),
+			NULL
+		},
+		&debug_cheapest_group_by,
+		true,
+		NULL, NULL, NULL
+	},
+/************************</DEBUG OPT GROUP BY>********************************/
 	/* End-of-list marker */
 	{
 		{NULL, 0, 0, NULL, NULL}, NULL, false, NULL, NULL, NULL
diff --git a/src/include/optimizer/paths.h b/src/include/optimizer/paths.h
index faf6449f4d..fe4c067369 100644
--- a/src/include/optimizer/paths.h
+++ b/src/include/optimizer/paths.h
@@ -186,6 +186,16 @@ extern bool pathkeys_contained_in(List *keys1, List *keys2);
 extern int group_keys_reorder_by_pathkeys(List *pathkeys,
 										  List **group_pathkeys,
 										  List **group_clauses);
+/************************<DEBUG OPT GROUP BY>*********************************/
+extern bool debug_group_by_reorder_by_pathkeys;
+extern bool debug_cheapest_group_by;
+/************************</DEBUG OPT GROUP BY>********************************/
+extern void get_cheapest_group_keys_order(PlannerInfo *root,
+										  double nrows,
+										  List *target_list,
+										  List **group_pathkeys,
+										  List **group_clauses,
+										  int	n_preordered);
 extern Path *get_cheapest_path_for_pathkeys(List *paths, List *pathkeys,
 							   Relids required_outer,
 							   CostSelector cost_criterion,
diff --git a/src/test/regress/expected/aggregates.out b/src/test/regress/expected/aggregates.out
index 265c996d5e..9afc1a827d 100644
--- a/src/test/regress/expected/aggregates.out
+++ b/src/test/regress/expected/aggregates.out
@@ -2228,9 +2228,9 @@ SELECT count(*) FROM btg GROUP BY v, p;
          QUERY PLAN          
 -----------------------------
  GroupAggregate
-   Group Key: v, p
+   Group Key: p, v
    ->  Sort
-         Sort Key: v, p
+         Sort Key: p, v
          ->  Seq Scan on btg
 (5 rows)
 
@@ -2239,9 +2239,9 @@ SELECT count(*) FROM btg GROUP BY v, p, c;
          QUERY PLAN          
 -----------------------------
  GroupAggregate
-   Group Key: v, p, c
+   Group Key: p, c, v
    ->  Sort
-         Sort Key: v, p, c
+         Sort Key: p, c, v
          ->  Seq Scan on btg
 (5 rows)
 
@@ -2261,9 +2261,9 @@ SELECT count(*) FROM btg GROUP BY v, p, d, c;
           QUERY PLAN          
 ------------------------------
  GroupAggregate
-   Group Key: v, p, d, c
+   Group Key: p, d, c, v
    ->  Sort
-         Sort Key: v, p, d, c
+         Sort Key: p, d, c, v
          ->  Seq Scan on btg
 (5 rows)
 
@@ -2318,9 +2318,9 @@ SELECT count(*) FROM btg GROUP BY p, d, e;
          QUERY PLAN          
 -----------------------------
  GroupAggregate
-   Group Key: p, d, e
+   Group Key: p, e, d
    ->  Sort
-         Sort Key: p, d, e
+         Sort Key: p, e, d
          ->  Seq Scan on btg
 (5 rows)
 
-- 
2.16.4

