Re: POC: GROUP BY optimization

Andrey V. Lepikhov Tue, 28 Dec 2021 00:48:07 -0800

On 9/2/20 9:12 PM, Tomas Vondra wrote:
> We could simply use the input "tuples" value here, and then divide the
> current and previous estimate to calculate the number of new groups.

Performing a review of this patch I made a number of changes (seecleanup.txt). Maybe it will be useful.As I see, the code, which implements the main idea, is quite stable.Doubts localized in the cost estimation routine. Maybe try to finishthis work by implementing an conservative strategy to a cost estimationof sorting?


--
regards,
Andrey Lepikhov
Postgres Professional

diff --git a/src/backend/optimizer/path/costsize.c 
b/src/backend/optimizer/path/costsize.c
index e617e2ce0e..211ae66b33 100644
--- a/src/backend/optimizer/path/costsize.c
+++ b/src/backend/optimizer/path/costsize.c
@@ -1865,7 +1865,7 @@ get_width_cost_multiplier(PlannerInfo *root, Expr *expr)
  *
  * For multi-column sorts we need to estimate the number of comparisons for
  * each individual column - for example with columns (c1, c2, ..., ck) we
- * can estimate that number of comparions on ck is roughly
+ * can estimate that number of comparisons on ck is roughly
  *
  *     ncomparisons(c1, c2, ..., ck) / ncomparisons(c1, c2, ..., c(k-1))
  *
@@ -1874,10 +1874,10 @@ get_width_cost_multiplier(PlannerInfo *root, Expr *expr)
  *
  *     N * sum( Fk * log(Gk) )
  *
- * Note: We also consider column witdth, not just the comparator cost.
+ * Note: We also consider column width, not just the comparator cost.
  *
  * NOTE: some callers currently pass NIL for pathkeys because they
- * can't conveniently supply the sort keys.  In this case, it will fallback to
+ * can't conveniently supply the sort keys. In this case, it will fallback to
  * simple comparison cost estimate.
  */
 static Cost
@@ -1925,13 +1925,13 @@ compute_cpu_sort_cost(PlannerInfo *root, List 
*pathkeys, int nPresortedKeys,
         */
        foreach(lc, pathkeys)
        {
-               PathKey                         *pathkey = (PathKey*)lfirst(lc);
+               PathKey                         *pathkey = (PathKey*) 
lfirst(lc);
                EquivalenceMember       *em;
                double                           nGroups,
                                                         correctedNGroups;
 
                /*
-                * We believe than equivalence members aren't very  different, 
so, to
+                * We believe than equivalence members aren't very different, 
so, to
                 * estimate cost we take just first member
                 */
                em = (EquivalenceMember *) 
linitial(pathkey->pk_eclass->ec_members);
@@ -1964,7 +1964,7 @@ compute_cpu_sort_cost(PlannerInfo *root, List *pathkeys, 
int nPresortedKeys,
 
                totalFuncCost += funcCost;
 
-               /* remeber if we have a fake var in pathkeys */
+               /* Remember if we have a fake var in pathkeys */
                has_fake_var |= is_fake_var(em->em_expr);
                pathkeyExprs = lappend(pathkeyExprs, em->em_expr);
 
@@ -1974,7 +1974,7 @@ compute_cpu_sort_cost(PlannerInfo *root, List *pathkeys, 
int nPresortedKeys,
                 */
                if (has_fake_var == false)
                        /*
-                        * Recursively compute number of group in group from 
previous step
+                        * Recursively compute number of groups in a group from 
previous step
                         */
                        nGroups = estimate_num_groups_incremental(root, 
pathkeyExprs,
                                                                                
                          tuplesPerPrevGroup, NULL, NULL,
@@ -1992,8 +1992,7 @@ compute_cpu_sort_cost(PlannerInfo *root, List *pathkeys, 
int nPresortedKeys,
                         *
                         * XXX What's the logic of the following formula?
                         */
-                       nGroups = ceil(2.0 + sqrt(tuples) *
-                               list_length(pathkeyExprs) / 
list_length(pathkeys));
+                       nGroups = ceil(2.0 + sqrt(tuples) * (i + 1) / 
list_length(pathkeys));
                else
                        nGroups = tuples;
 
@@ -2033,7 +2032,7 @@ compute_cpu_sort_cost(PlannerInfo *root, List *pathkeys, 
int nPresortedKeys,
 
                /*
                 * We could skip all following columns for cost estimation, 
because we
-                * believe that tuples are unique by set ot previous columns
+                * believe that tuples are unique by the set of previous columns
                 */
                if (tuplesPerPrevGroup <= 1.0)
                        break;
diff --git a/src/backend/optimizer/path/pathkeys.c 
b/src/backend/optimizer/path/pathkeys.c
index 82831a4fa4..707b5ba75b 100644
--- a/src/backend/optimizer/path/pathkeys.c
+++ b/src/backend/optimizer/path/pathkeys.c
@@ -544,9 +544,10 @@ PathkeyMutatorNext(PathkeyMutatorState *state)
        return state->elemsList;
 }
 
-typedef struct {
-       Cost cost;
-       PathKey *pathkey;
+typedef struct PathkeySortCost
+{
+       Cost            cost;
+       PathKey    *pathkey;
 } PathkeySortCost;
 
 static int
@@ -793,7 +794,7 @@ get_useful_group_keys_orderings(PlannerInfo *root, double 
nrows,
         * more complex logic to decide the ordering.
         *
         * XXX Isn't this somewhat redundant with presorted_keys? Actually, it's
-        * more a complement, because it allows benefinting from incremental 
sort
+        * more a complement, because it allows benefiting from incremental sort
         * as much as possible.
         *
         * XXX This does nothing if (n_preordered == 0). We shouldn't create the
diff --git a/src/backend/optimizer/plan/planner.c 
b/src/backend/optimizer/plan/planner.c
index 7a89cee879..f281a23bb7 100644
--- a/src/backend/optimizer/plan/planner.c
+++ b/src/backend/optimizer/plan/planner.c
@@ -6158,7 +6158,6 @@ add_paths_to_grouping_rel(PlannerInfo *root, RelOptInfo 
*input_rel,
                {
                        ListCell   *lc2;
                        Path       *path = (Path *) lfirst(lc);
-                       Path       *path_save = path;
                        Path       *path_original = path;
 
                        List       *pathkey_orderings = NIL;
@@ -6182,9 +6181,6 @@ add_paths_to_grouping_rel(PlannerInfo *root, RelOptInfo 
*input_rel,
                                int                     presorted_keys = 0;
                                PathKeyInfo *info = (PathKeyInfo *) lfirst(lc2);
 
-                               /* restore the path (we replace it in the loop) 
*/
-                               path = path_save;
-
                                is_sorted = 
pathkeys_count_contained_in(info->pathkeys,
                                                                                
                                path->pathkeys,
                                                                                
                                &presorted_keys);
@@ -6653,7 +6649,6 @@ create_partial_grouping_paths(PlannerInfo *root,
                {
                        ListCell   *lc2;
                        Path       *path = (Path *) lfirst(lc);
-                       Path       *path_save = path;
 
                        List       *pathkey_orderings = NIL;
 
@@ -6676,9 +6671,6 @@ create_partial_grouping_paths(PlannerInfo *root,
                                int                     presorted_keys = 0;
                                PathKeyInfo *info = (PathKeyInfo *) lfirst(lc2);
 
-                               /* restore the path (we replace it in the loop) 
*/
-                               path = path_save;
-
                                is_sorted = 
pathkeys_count_contained_in(info->pathkeys,
                                                                                
                                path->pathkeys,
                                                                                
                                &presorted_keys);

>From 603ce7f8f3f4febbfdeb120a9c9b97c396d50f0e Mon Sep 17 00:00:00 2001
From: Andrey Lepikhov <a.lepik...@postgrespro.ru>
Date: Mon, 20 Dec 2021 16:05:05 +0500
Subject: [PATCH] Group BY optimization

---
 .../postgres_fdw/expected/postgres_fdw.out    |  15 +-
 src/backend/optimizer/path/costsize.c         | 368 +++++++++-
 src/backend/optimizer/path/equivclass.c       |  13 +-
 src/backend/optimizer/path/pathkeys.c         | 577 ++++++++++++++++
 src/backend/optimizer/plan/planner.c          | 645 ++++++++++--------
 src/backend/optimizer/util/pathnode.c         |   2 +-
 src/backend/utils/adt/selfuncs.c              |  37 +-
 src/backend/utils/misc/guc.c                  |  29 +
 src/include/nodes/nodes.h                     |   1 +
 src/include/nodes/pathnodes.h                 |  10 +
 src/include/optimizer/cost.h                  |   4 +-
 src/include/optimizer/paths.h                 |  11 +
 src/include/utils/selfuncs.h                  |   5 +
 src/test/regress/expected/aggregates.out      | 244 ++++++-
 .../regress/expected/incremental_sort.out     |   2 +-
 src/test/regress/expected/join.out            |  51 +-
 .../regress/expected/partition_aggregate.out  | 136 ++--
 src/test/regress/expected/partition_join.out  |  75 +-
 src/test/regress/expected/union.out           |  60 +-
 src/test/regress/sql/aggregates.sql           |  99 +++
 src/test/regress/sql/incremental_sort.sql     |   2 +-
 21 files changed, 1892 insertions(+), 494 deletions(-)

diff --git a/contrib/postgres_fdw/expected/postgres_fdw.out b/contrib/postgres_fdw/expected/postgres_fdw.out
index 7720ab9c58..0a2b41707c 100644
--- a/contrib/postgres_fdw/expected/postgres_fdw.out
+++ b/contrib/postgres_fdw/expected/postgres_fdw.out
@@ -2741,16 +2741,13 @@ select c2 * (random() <= 1)::int as c2 from ft2 group by c2 * (random() <= 1)::i
 -- GROUP BY clause in various forms, cardinal, alias and constant expression
 explain (verbose, costs off)
 select count(c2) w, c2 x, 5 y, 7.0 z from ft1 group by 2, y, 9.0::int order by 2;
-                                      QUERY PLAN                                       
----------------------------------------------------------------------------------------
- Sort
+                                                 QUERY PLAN                                                 
+------------------------------------------------------------------------------------------------------------
+ Foreign Scan
    Output: (count(c2)), c2, 5, 7.0, 9
-   Sort Key: ft1.c2
-   ->  Foreign Scan
-         Output: (count(c2)), c2, 5, 7.0, 9
-         Relations: Aggregate on (public.ft1)
-         Remote SQL: SELECT count(c2), c2, 5, 7.0, 9 FROM "S 1"."T 1" GROUP BY 2, 3, 5
-(7 rows)
+   Relations: Aggregate on (public.ft1)
+   Remote SQL: SELECT count(c2), c2, 5, 7.0, 9 FROM "S 1"."T 1" GROUP BY 2, 3, 5 ORDER BY c2 ASC NULLS LAST
+(4 rows)
 
 select count(c2) w, c2 x, 5 y, 7.0 z from ft1 group by 2, y, 9.0::int order by 2;
   w  | x | y |  z  
diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c
index 1e4d404f02..211ae66b33 100644
--- a/src/backend/optimizer/path/costsize.c
+++ b/src/backend/optimizer/path/costsize.c
@@ -1754,6 +1754,324 @@ cost_recursive_union(Path *runion, Path *nrterm, Path *rterm)
 									rterm->pathtarget->width);
 }
 
+/*
+ * is_fake_var
+ *		Workaround for generate_append_tlist() which generates fake Vars with
+ *		varno == 0, that will cause a fail of estimate_num_group() call
+ *
+ * XXX Ummm, why would estimate_num_group fail with this?
+ */
+static bool
+is_fake_var(Expr *expr)
+{
+	if (IsA(expr, RelabelType))
+		expr = (Expr *) ((RelabelType *) expr)->arg;
+
+	return (IsA(expr, Var) && ((Var *) expr)->varno == 0);
+}
+
+/*
+ * get_width_cost_multiplier
+ *		Returns relative complexity of comparing two values based on it's width.
+ * The idea behind - long values have more expensive comparison. Return value is
+ * in cpu_operator_cost unit.
+ */
+static double
+get_width_cost_multiplier(PlannerInfo *root, Expr *expr)
+{
+	double	width = -1.0; /* fake value */
+
+	if (IsA(expr, RelabelType))
+		expr = (Expr *) ((RelabelType *) expr)->arg;
+
+	/* Try to find actual stat in corresponding relation */
+	if (IsA(expr, Var))
+	{
+		Var		*var = (Var *) expr;
+
+		if (var->varno > 0 && var->varno < root->simple_rel_array_size)
+		{
+			RelOptInfo	*rel = root->simple_rel_array[var->varno];
+
+			if (rel != NULL &&
+				var->varattno >= rel->min_attr &&
+				var->varattno <= rel->max_attr)
+			{
+				int	ndx = var->varattno - rel->min_attr;
+
+				if (rel->attr_widths[ndx] > 0)
+					width = rel->attr_widths[ndx];
+			}
+		}
+	}
+
+	/* Didn't find any actual stats, use estimation by type */
+	if (width < 0.0)
+	{
+		Node	*node = (Node*) expr;
+
+		width = get_typavgwidth(exprType(node), exprTypmod(node));
+	}
+
+	/*
+	 * Any value in pgsql is passed by Datum type, so any operation with value
+	 * could not be cheaper than operation with Datum type
+	 */
+	if (width <= sizeof(Datum))
+		return 1.0;
+
+	/*
+	 * Seems, cost of comparision is not directly proportional to args width,
+	 * because comparing args could be differ width (we known only average over
+	 * column) and difference often could be defined only by looking on first
+	 * bytes. So, use log16(width) as estimation.
+	 */
+	return 1.0 + 0.125 * LOG2(width / sizeof(Datum));
+}
+
+/*
+ * compute_cpu_sort_cost
+ *		compute CPU cost of sort (i.e. in-memory)
+ *
+ * The main thing we need to calculate to estimate sort CPU costs is the number
+ * of calls to the comparator functions. The difficulty is that for multi-column
+ * sorts there may be different data types involved (for some of which the calls
+ * may be much more expensive). Furthermore, the columns may have very different
+ * number of distinct values - the higher the number, the fewer comparisons will
+ * be needed for the following columns.
+ *
+ * The algoritm is incremental - we add pathkeys one by one, and at each step we
+ * estimate the number of necessary comparisons (based on the number of distinct
+ * groups in the current pathkey prefix and the new pathkey), and the comparison
+ * costs (which is data type specific).
+ *
+ * Estimation of the number of comparisons is based on ideas from two sources:
+ *
+ * 1) "Algorithms" (course), Robert Sedgewick, Kevin Wayne [https://algs4.cs.princeton.edu/home/]
+ *
+ * 2) "Quicksort Is Optimal For Many Equal Keys" (paper), Sebastian Wild,
+ * arXiv:1608.04906v4 [cs.DS] 1 Nov 2017. [https://arxiv.org/abs/1608.04906]
+ *
+ * In term of that paper, let N - number of tuples, Xi - number of tuples with
+ * key Ki, then the estimate of number of comparisons is:
+ *
+ *	log(N! / (X1! * X2! * ..))  ~  sum(Xi * log(N/Xi))
+ *
+ * In our case all Xi are the same because now we don't have any estimation of
+ * group sizes, we have only know the estimate of number of groups (distinct
+ * values). In that case, formula becomes:
+ *
+ *	N * log(NumberOfGroups)
+ *
+ * For multi-column sorts we need to estimate the number of comparisons for
+ * each individual column - for example with columns (c1, c2, ..., ck) we
+ * can estimate that number of comparisons on ck is roughly
+ *
+ *	ncomparisons(c1, c2, ..., ck) / ncomparisons(c1, c2, ..., c(k-1))
+ *
+ * Let k be a column number, Gk - number of groups defined by k columns, and Fk
+ * the cost of the comparison is
+ *
+ *	N * sum( Fk * log(Gk) )
+ *
+ * Note: We also consider column width, not just the comparator cost.
+ *
+ * NOTE: some callers currently pass NIL for pathkeys because they
+ * can't conveniently supply the sort keys. In this case, it will fallback to
+ * simple comparison cost estimate.
+ */
+static Cost
+compute_cpu_sort_cost(PlannerInfo *root, List *pathkeys, int nPresortedKeys,
+					  Cost comparison_cost, double tuples, double output_tuples,
+					  bool heapSort)
+{
+	Cost		per_tuple_cost = 0.0;
+	ListCell	*lc;
+	List		*pathkeyExprs = NIL;
+	double		tuplesPerPrevGroup = tuples;
+	double		totalFuncCost = 1.0;
+	bool		has_fake_var = false;
+	int			i = 0;
+	Oid			prev_datatype = InvalidOid;
+	Cost		funcCost = 0.0;
+	List		*cache_varinfos = NIL;
+
+	/* fallback if pathkeys is unknown */
+	if (list_length(pathkeys) == 0)
+	{
+		/*
+		 * If we'll use a bounded heap-sort keeping just K tuples in memory, for
+		 * a total number of tuple comparisons of N log2 K; but the constant
+		 * factor is a bit higher than for quicksort.  Tweak it so that the
+		 * cost curve is continuous at the crossover point.
+		 *
+		 * XXX I suppose the "quicksort factor" references to 1.5 at the end
+		 * of this function, but I'm not sure. I suggest we introduce some simple
+		 * constants for that, instead of magic values.
+		 */
+		output_tuples = (heapSort) ? 2.0 * output_tuples : tuples;
+		per_tuple_cost += 2.0 * cpu_operator_cost * LOG2(output_tuples);
+
+		/* add cost provided by caller */
+		per_tuple_cost += comparison_cost;
+
+		return per_tuple_cost * tuples;
+	}
+
+	/*
+	 * Computing total cost of sorting takes into account:
+	 * - per column comparison function cost
+	 * - we try to compute needed number of comparison per column
+	 */
+	foreach(lc, pathkeys)
+	{
+		PathKey				*pathkey = (PathKey*) lfirst(lc);
+		EquivalenceMember	*em;
+		double				 nGroups,
+							 correctedNGroups;
+
+		/*
+		 * We believe than equivalence members aren't very different, so, to
+		 * estimate cost we take just first member
+		 */
+		em = (EquivalenceMember *) linitial(pathkey->pk_eclass->ec_members);
+
+		if (em->em_datatype != InvalidOid)
+		{
+			/* do not lookup funcCost if data type is the same as previous */
+			if (prev_datatype != em->em_datatype)
+			{
+				Oid			sortop;
+				QualCost	cost;
+
+				sortop = get_opfamily_member(pathkey->pk_opfamily,
+											 em->em_datatype, em->em_datatype,
+											 pathkey->pk_strategy);
+
+				cost.startup = 0;
+				cost.per_tuple = 0;
+				add_function_cost(root, get_opcode(sortop), NULL, &cost);
+				/* we need procost, not product of procost and cpu_operator_cost */
+				funcCost = cost.per_tuple / cpu_operator_cost;
+				prev_datatype = em->em_datatype;
+			}
+		}
+		else
+			funcCost = 1.0; /* fallback */
+
+		/* Try to take into account actual width fee */
+		funcCost *= get_width_cost_multiplier(root, em->em_expr);
+
+		totalFuncCost += funcCost;
+
+		/* Remember if we have a fake var in pathkeys */
+		has_fake_var |= is_fake_var(em->em_expr);
+		pathkeyExprs = lappend(pathkeyExprs, em->em_expr);
+
+		/*
+		 * Prevent call estimate_num_groups() with fake Var. Note,
+		 * pathkeyExprs contains only previous columns
+		 */
+		if (has_fake_var == false)
+			/*
+			 * Recursively compute number of groups in a group from previous step
+			 */
+			nGroups = estimate_num_groups_incremental(root, pathkeyExprs,
+													  tuplesPerPrevGroup, NULL, NULL,
+													  &cache_varinfos,
+													  list_length(pathkeyExprs) - 1);
+		else if (tuples > 4.0)
+			/*
+			 * Use geometric mean as estimation if there is no any stats.
+			 * Don't use DEFAULT_NUM_DISTINCT because it used for only one
+			 * column while here we try to estimate number of groups over
+			 * set of columns.
+			 *
+			 * XXX Perhaps this should use DEFAULT_NUM_DISTINCT at least to
+			 * limit the calculated values, somehow?
+			 *
+			 * XXX What's the logic of the following formula?
+			 */
+			nGroups = ceil(2.0 + sqrt(tuples) * (i + 1) / list_length(pathkeys));
+		else
+			nGroups = tuples;
+
+		/*
+		 * Presorted keys aren't participated in comparison but still checked
+		 * by qsort comparator.
+		 */
+		if (i >= nPresortedKeys)
+		{
+			if (heapSort)
+			{
+				double heap_tuples;
+
+				/* have to keep at least one group, and a multiple of group size */
+				heap_tuples = Max(ceil(output_tuples / tuplesPerPrevGroup) * tuplesPerPrevGroup,
+								  tuplesPerPrevGroup);
+
+				/* so how many (whole) groups is that? */
+				correctedNGroups = ceil(heap_tuples / tuplesPerPrevGroup);
+			}
+			else
+				/* all groups in the input */
+				correctedNGroups = nGroups;
+
+			correctedNGroups = Max(1.0, ceil(correctedNGroups));
+
+			per_tuple_cost += totalFuncCost * LOG2(correctedNGroups);
+		}
+
+		i++;
+
+		/*
+		 * Real-world distribution isn't uniform but now we don't have a way to
+		 * determine that, so, add multiplier to get closer to worst case.
+		 */
+		tuplesPerPrevGroup = ceil(1.5 * tuplesPerPrevGroup / nGroups);
+
+		/*
+		 * We could skip all following columns for cost estimation, because we
+		 * believe that tuples are unique by the set of previous columns
+		 */
+		if (tuplesPerPrevGroup <= 1.0)
+			break;
+	}
+
+	list_free(pathkeyExprs);
+
+	/* per_tuple_cost is in cpu_operator_cost units */
+	per_tuple_cost *= cpu_operator_cost;
+
+	/*
+	 * Accordingly to "Introduction to algorithms", Thomas H. Cormen, Charles E.
+	 * Leiserson, Ronald L. Rivest, ISBN 0-07-013143-0, quicksort estimation
+	 * formula has additional term proportional to number of tuples (See Chapter
+	 * 8.2 and Theorem 4.1). It has meaning with low number of tuples,
+	 * approximately less that 1e4. Of course, it could be implemented as
+	 * additional multiplier under logarithm, but use more complicated formula
+	 * which takes into account number of unique tuples and it isn't clear how
+	 * to combine multiplier with groups. Estimate it as 10 in cpu_operator_cost
+	 * unit.
+	 */
+	per_tuple_cost += 10 * cpu_operator_cost;
+
+	per_tuple_cost += comparison_cost;
+
+	return tuples * per_tuple_cost;
+}
+
+/*
+ * simple wrapper just to estimate best sort path
+ */
+Cost
+cost_sort_estimate(PlannerInfo *root, List *pathkeys, int nPresortedKeys,
+				   double tuples)
+{
+	return compute_cpu_sort_cost(root, pathkeys, nPresortedKeys,
+								0, tuples, tuples, false);
+}
+
 /*
  * cost_tuplesort
  *	  Determines and returns the cost of sorting a relation using tuplesort,
@@ -1770,7 +2088,7 @@ cost_recursive_union(Path *runion, Path *nrterm, Path *rterm)
  * number of initial runs formed and M is the merge order used by tuplesort.c.
  * Since the average initial run should be about sort_mem, we have
  *		disk traffic = 2 * relsize * ceil(logM(p / sort_mem))
- *		cpu = comparison_cost * t * log2(t)
+ * 		and cpu cost (computed by compute_cpu_sort_cost()).
  *
  * If the sort is bounded (i.e., only the first k result tuples are needed)
  * and k tuples can fit into sort_mem, we use a heap method that keeps only
@@ -1789,9 +2107,11 @@ cost_recursive_union(Path *runion, Path *nrterm, Path *rterm)
  * 'comparison_cost' is the extra cost per comparison, if any
  * 'sort_mem' is the number of kilobytes of work memory allowed for the sort
  * 'limit_tuples' is the bound on the number of output tuples; -1 if no bound
+ * 'startup_cost' is expected to be 0 at input. If there is "input cost" it should
+ * be added by caller later.
  */
 static void
-cost_tuplesort(Cost *startup_cost, Cost *run_cost,
+cost_tuplesort(PlannerInfo *root, List *pathkeys, Cost *startup_cost, Cost *run_cost,
 			   double tuples, int width,
 			   Cost comparison_cost, int sort_mem,
 			   double limit_tuples)
@@ -1808,9 +2128,6 @@ cost_tuplesort(Cost *startup_cost, Cost *run_cost,
 	if (tuples < 2.0)
 		tuples = 2.0;
 
-	/* Include the default cost-per-comparison */
-	comparison_cost += 2.0 * cpu_operator_cost;
-
 	/* Do we have a useful LIMIT? */
 	if (limit_tuples > 0 && limit_tuples < tuples)
 	{
@@ -1834,12 +2151,10 @@ cost_tuplesort(Cost *startup_cost, Cost *run_cost,
 		double		log_runs;
 		double		npageaccesses;
 
-		/*
-		 * CPU costs
-		 *
-		 * Assume about N log2 N comparisons
-		 */
-		*startup_cost = comparison_cost * tuples * LOG2(tuples);
+		/* CPU costs */
+		*startup_cost = compute_cpu_sort_cost(root, pathkeys, 0,
+											  comparison_cost, tuples,
+											  tuples, false);
 
 		/* Disk costs */
 
@@ -1855,18 +2170,17 @@ cost_tuplesort(Cost *startup_cost, Cost *run_cost,
 	}
 	else if (tuples > 2 * output_tuples || input_bytes > sort_mem_bytes)
 	{
-		/*
-		 * We'll use a bounded heap-sort keeping just K tuples in memory, for
-		 * a total number of tuple comparisons of N log2 K; but the constant
-		 * factor is a bit higher than for quicksort.  Tweak it so that the
-		 * cost curve is continuous at the crossover point.
-		 */
-		*startup_cost = comparison_cost * tuples * LOG2(2.0 * output_tuples);
+		/* We'll use a bounded heap-sort keeping just K tuples in memory. */
+		*startup_cost = compute_cpu_sort_cost(root, pathkeys, 0,
+											  comparison_cost, tuples,
+											  output_tuples, true);
 	}
 	else
 	{
 		/* We'll use plain quicksort on all the input tuples */
-		*startup_cost = comparison_cost * tuples * LOG2(tuples);
+		*startup_cost = compute_cpu_sort_cost(root, pathkeys, 0,
+											  comparison_cost, tuples,
+											  tuples, false);
 	}
 
 	/*
@@ -1899,8 +2213,8 @@ cost_incremental_sort(Path *path,
 					  double input_tuples, int width, Cost comparison_cost, int sort_mem,
 					  double limit_tuples)
 {
-	Cost		startup_cost = 0,
-				run_cost = 0,
+	Cost		startup_cost,
+				run_cost,
 				input_run_cost = input_total_cost - input_startup_cost;
 	double		group_tuples,
 				input_groups;
@@ -1985,7 +2299,7 @@ cost_incremental_sort(Path *path,
 	 * pessimistic about incremental sort performance and increase its average
 	 * group size by half.
 	 */
-	cost_tuplesort(&group_startup_cost, &group_run_cost,
+	cost_tuplesort(root, pathkeys, &group_startup_cost, &group_run_cost,
 				   1.5 * group_tuples, width, comparison_cost, sort_mem,
 				   limit_tuples);
 
@@ -1993,7 +2307,7 @@ cost_incremental_sort(Path *path,
 	 * Startup cost of incremental sort is the startup cost of its first group
 	 * plus the cost of its input.
 	 */
-	startup_cost += group_startup_cost
+	startup_cost = group_startup_cost
 		+ input_startup_cost + group_input_run_cost;
 
 	/*
@@ -2002,7 +2316,7 @@ cost_incremental_sort(Path *path,
 	 * group, plus the total cost to process the remaining groups, plus the
 	 * remaining cost of input.
 	 */
-	run_cost += group_run_cost
+	run_cost = group_run_cost
 		+ (group_run_cost + group_startup_cost) * (input_groups - 1)
 		+ group_input_run_cost * (input_groups - 1);
 
@@ -2042,7 +2356,7 @@ cost_sort(Path *path, PlannerInfo *root,
 	Cost		startup_cost;
 	Cost		run_cost;
 
-	cost_tuplesort(&startup_cost, &run_cost,
+	cost_tuplesort(root, pathkeys, &startup_cost, &run_cost,
 				   tuples, width,
 				   comparison_cost, sort_mem,
 				   limit_tuples);
@@ -2140,7 +2454,7 @@ append_nonpartial_cost(List *subpaths, int numpaths, int parallel_workers)
  *	  Determines and returns the cost of an Append node.
  */
 void
-cost_append(AppendPath *apath)
+cost_append(AppendPath *apath, PlannerInfo *root)
 {
 	ListCell   *l;
 
@@ -2208,7 +2522,7 @@ cost_append(AppendPath *apath)
 					 * any child.
 					 */
 					cost_sort(&sort_path,
-							  NULL, /* doesn't currently need root */
+							  root,
 							  pathkeys,
 							  subpath->total_cost,
 							  subpath->rows,
diff --git a/src/backend/optimizer/path/equivclass.c b/src/backend/optimizer/path/equivclass.c
index 6f1abbe47d..899da5b109 100644
--- a/src/backend/optimizer/path/equivclass.c
+++ b/src/backend/optimizer/path/equivclass.c
@@ -681,7 +681,18 @@ get_eclass_for_sort_expr(PlannerInfo *root,
 
 			if (opcintype == cur_em->em_datatype &&
 				equal(expr, cur_em->em_expr))
-				return cur_ec;	/* Match! */
+			{
+				/*
+				 * Match!
+				 *
+				 * Copy sortref if it wasn't set yet, it's possible if ec was
+				 * constructed from WHERE clause, ie it doesn't have target
+				 * reference at all
+				 */
+				if (cur_ec->ec_sortref == 0 && sortref > 0)
+					cur_ec->ec_sortref = sortref;
+				return cur_ec;
+			}
 		}
 	}
 
diff --git a/src/backend/optimizer/path/pathkeys.c b/src/backend/optimizer/path/pathkeys.c
index 216dd26385..707b5ba75b 100644
--- a/src/backend/optimizer/path/pathkeys.c
+++ b/src/backend/optimizer/path/pathkeys.c
@@ -17,16 +17,19 @@
  */
 #include "postgres.h"
 
+#include "miscadmin.h"
 #include "access/stratnum.h"
 #include "catalog/pg_opfamily.h"
 #include "nodes/makefuncs.h"
 #include "nodes/nodeFuncs.h"
 #include "nodes/plannodes.h"
+#include "optimizer/cost.h"
 #include "optimizer/optimizer.h"
 #include "optimizer/pathnode.h"
 #include "optimizer/paths.h"
 #include "partitioning/partbounds.h"
 #include "utils/lsyscache.h"
+#include "utils/selfuncs.h"
 
 
 static bool pathkey_is_redundant(PathKey *new_pathkey, List *pathkeys);
@@ -334,6 +337,527 @@ pathkeys_contained_in(List *keys1, List *keys2)
 	return false;
 }
 
+/************************<DEBUG PART>*************************************/
+bool debug_group_by_reorder_by_pathkeys = true;
+bool debug_group_by_match_order_by = true;
+bool debug_cheapest_group_by = true;
+/************************</DEBUG PART>************************************/
+
+/*
+ * group_keys_reorder_by_pathkeys
+ *		Reorder GROUP BY keys to match pathkeys of input path.
+ *
+ * Function returns new lists (pathkeys and clauses), original GROUP BY lists
+ * stay untouched.
+ *
+ * Returns the number of GROUP BY keys with a matching pathkey.
+ */
+int
+group_keys_reorder_by_pathkeys(List *pathkeys, List **group_pathkeys,
+							   List **group_clauses)
+{
+	List	   *new_group_pathkeys= NIL,
+			   *new_group_clauses = NIL;
+	ListCell   *lc;
+	int			n;
+
+	if (debug_group_by_reorder_by_pathkeys == false)
+		return 0;
+
+	if (pathkeys == NIL || *group_pathkeys == NIL)
+		return 0;
+
+	/*
+	 * Walk the pathkeys (determining ordering of the input path) and see if
+	 * there's a matching GROUP BY key. If we find one, we append if to the
+	 * list, and do the same for the clauses.
+	 *
+	 * Once we find first pathkey without a matching GROUP BY key, the rest of
+	 * the pathkeys is useless and can't be used to evaluate the grouping, so
+	 * we abort the loop and ignore the remaining pathkeys.
+	 *
+	 * XXX Pathkeys are built in a way to allow simply comparing pointers.
+	 */
+	foreach(lc, pathkeys)
+	{
+		PathKey			*pathkey = (PathKey *) lfirst(lc);
+		SortGroupClause	*sgc;
+
+		/* abort on first mismatch */
+		if (!list_member_ptr(*group_pathkeys, pathkey))
+			break;
+
+		new_group_pathkeys = lappend(new_group_pathkeys, pathkey);
+
+		sgc = get_sortgroupref_clause(pathkey->pk_eclass->ec_sortref,
+									  *group_clauses);
+
+		new_group_clauses = lappend(new_group_clauses, sgc);
+	}
+
+	/* remember the number of pathkeys with a matching GROUP BY key */
+	n = list_length(new_group_pathkeys);
+
+	/* XXX maybe return when (n == 0) */
+
+	/* append the remaining group pathkeys (will be treated as not sorted) */
+	*group_pathkeys = list_concat_unique_ptr(new_group_pathkeys,
+											 *group_pathkeys);
+	*group_clauses = list_concat_unique_ptr(new_group_clauses,
+											*group_clauses);
+
+	return n;
+}
+
+/*
+ * Used to generate all permutations of a pathkey list.
+ */
+typedef struct PathkeyMutatorState {
+	List	   *elemsList;
+	ListCell  **elemCells;
+	void	  **elems;
+	int		   *positions;
+	int			 mutatorNColumns;
+	int			 count;
+} PathkeyMutatorState;
+
+
+/*
+ * PathkeyMutatorInit
+ *		Initialize state of the permutation generator.
+ *
+ * We want to generate permutations of elements in the "elems" list. We may want
+ * to skip some number of elements at the beginning (when treating as presorted)
+ * or at the end (we only permute a limited number of group keys).
+ *
+ * The list is decomposed into elements, and we also keep pointers to individual
+ * cells. This allows us to build the permuted list quickly and cheaply, without
+ * creating any copies.
+ */
+static void
+PathkeyMutatorInit(PathkeyMutatorState *state, List *elems, int start, int end)
+{
+	int i;
+	int			n = end - start;
+	ListCell	*lc;
+
+	memset(state, 0, sizeof(*state));
+
+	state->mutatorNColumns = n;
+
+	state->elemsList = list_copy(elems);
+
+	state->elems = palloc(sizeof(void*) * n);
+	state->elemCells = palloc(sizeof(ListCell*) * n);
+	state->positions = palloc(sizeof(int) * n);
+
+	i = 0;
+	for_each_cell(lc, state->elemsList, list_nth_cell(state->elemsList, start))
+	{
+		state->elemCells[i] = lc;
+		state->elems[i] = lfirst(lc);
+		state->positions[i] = i + 1;
+		i++;
+
+		if (i >= n)
+			break;
+	}
+}
+
+/* Swap two elements of an array. */
+static void
+PathkeyMutatorSwap(int *a, int i, int j)
+{
+  int s = a[i];
+
+  a[i] = a[j];
+  a[j] = s;
+}
+
+/*
+ * Generate the next permutation of elements.
+ */
+static bool
+PathkeyMutatorNextSet(int *a, int n)
+{
+	int j, k, l, r;
+
+	j = n - 2;
+
+	while (j >= 0 && a[j] >= a[j + 1])
+		j--;
+
+	if (j < 0)
+		return false;
+
+	k = n - 1;
+
+	while (k >= 0 && a[j] >= a[k])
+		k--;
+
+	PathkeyMutatorSwap(a, j, k);
+
+	l = j + 1;
+	r = n - 1;
+
+	while (l < r)
+		PathkeyMutatorSwap(a, l++, r--);
+
+	return true;
+}
+
+/*
+ * PathkeyMutatorNext
+ *		Generate the next permutation of list of elements.
+ *
+ * Returns the next permutation (as a list of elements) or NIL if there are no
+ * more permutations.
+ */
+static List *
+PathkeyMutatorNext(PathkeyMutatorState *state)
+{
+	int	i;
+
+	state->count++;
+
+	/* first permutation is original list */
+	if (state->count == 1)
+		return state->elemsList;
+
+	/* when there are no more permutations, return NIL */
+	if (!PathkeyMutatorNextSet(state->positions, state->mutatorNColumns))
+	{
+		pfree(state->elems);
+		pfree(state->elemCells);
+		pfree(state->positions);
+
+		list_free(state->elemsList);
+
+		return NIL;
+	}
+
+	/* update the list cells to point to the right elements */
+	for(i=0; i<state->mutatorNColumns; i++)
+		lfirst(state->elemCells[i]) =
+			(void *) state->elems[ state->positions[i] - 1 ];
+
+	return state->elemsList;
+}
+
+typedef struct PathkeySortCost
+{
+	Cost		cost;
+	PathKey	   *pathkey;
+} PathkeySortCost;
+
+static int
+pathkey_sort_cost_comparator(const void *_a, const void *_b)
+{
+	const PathkeySortCost *a = (PathkeySortCost *) _a;
+	const PathkeySortCost *b = (PathkeySortCost *) _b;
+
+	if (a->cost < b->cost)
+		return -1;
+	else if (a->cost == b->cost)
+		return 0;
+	return 1;
+}
+
+/*
+ * get_cheapest_group_keys_order
+ *		Returns the pathkeys in an order cheapest to evaluate.
+ *
+ * Given a list of pathkeys, we try to reorder them in a way that minimizes
+ * the CPU cost of sorting. This depends mainly on the cost of comparator
+ * function (the pathkeys may use different data types) and the number of
+ * distinct values in each column (which affects the number of comparator
+ * calls for the following pathkeys).
+ *
+ * In case the input is partially sorted, only the remaining pathkeys are
+ * considered.
+ *
+ * Returns newly allocated lists. If no reordering is possible (or needed),
+ * the lists are set to NIL.
+ */
+static bool
+get_cheapest_group_keys_order(PlannerInfo *root, double nrows,
+							  List **group_pathkeys, List **group_clauses,
+							  int n_preordered)
+{
+	List		   *new_group_pathkeys = NIL,
+				   *new_group_clauses = NIL,
+				   *var_group_pathkeys;
+
+	ListCell	   *cell;
+	PathkeyMutatorState	mstate;
+	double			cheapest_sort_cost = -1.0;
+
+	int nFreeKeys;
+	int nToPermute;
+
+	/* If this optimization is disabled, we're done. */
+	if (!debug_cheapest_group_by)
+		return false;
+
+	/* If there are less than 2 unsorted pathkeys, we're done. */
+	if (list_length(*group_pathkeys) - n_preordered < 2)
+		return false;
+
+	/*
+	 * We could exhaustively cost all possible orderings of the pathkeys, but for
+	 * large number of pathkeys that might be prohibitively expensive. So we try
+	 * to apply a simple cheap heuristics first - we sort the pathkeys by sort
+	 * cost (as if the pathkey was sorted independently) and then check only the
+	 * four cheapest pathkeys. The remaining pathkeys are kept ordered by cost.
+	 *
+	 * XXX This is a very simple heuristics, and likely to work fine for most
+	 * cases (because number of GROUP BY clauses tends to be lower than 4). But
+	 * it ignores how the number of distinct values in each pathkey affects the
+	 * following sorts. It may be better to use "more expensive" pathkey first
+	 * if it has many distinct values, because it then limits the number of
+	 * comparisons for the remaining pathkeys. But evaluating that is kinda the
+	 * expensive bit we're trying to not do.
+	 */
+	nFreeKeys = list_length(*group_pathkeys) - n_preordered;
+	nToPermute = 4;
+	if (nFreeKeys > nToPermute)
+	{
+		int i;
+		PathkeySortCost *costs = palloc(sizeof(PathkeySortCost) * nFreeKeys);
+
+		/* skip the pre-ordered pathkeys */
+		cell = list_nth_cell(*group_pathkeys, n_preordered);
+
+		/* estimate cost for sorting individual pathkeys */
+		for (i = 0; cell != NULL; i++, (cell = lnext(*group_pathkeys, cell)))
+		{
+			List *to_cost = list_make1(lfirst(cell));
+
+			Assert(i < nFreeKeys);
+
+			costs[i].pathkey = lfirst(cell);
+			costs[i].cost = cost_sort_estimate(root, to_cost, 0, nrows);
+
+			pfree(to_cost);
+		}
+
+		/* sort the pathkeys by sort cost in ascending order */
+		qsort(costs, nFreeKeys, sizeof(*costs), pathkey_sort_cost_comparator);
+
+		/*
+		 * Rebuild the list of pathkeys - first the preordered ones, then the
+		 * rest ordered by cost.
+		 */
+		new_group_pathkeys = list_truncate(list_copy(*group_pathkeys), n_preordered);
+
+		for (i = 0; i < nFreeKeys; i++)
+			new_group_pathkeys = lappend(new_group_pathkeys, costs[i].pathkey);
+
+		pfree(costs);
+	}
+	else
+	{
+		/*
+		 * Since v13 list_free() can clean list elements so for original list
+		 * not to be modified it should be copied to a new one which can then
+		 * be cleaned safely if needed.
+		 */
+		new_group_pathkeys = list_copy(*group_pathkeys);
+		nToPermute = nFreeKeys;
+	}
+
+	Assert(list_length(new_group_pathkeys) == list_length(*group_pathkeys));
+
+	/*
+	 * Generate pathkey lists with permutations of the first nToPermute pathkeys.
+	 *
+	 * XXX We simply calculate sort cost for each individual pathkey list, but
+	 * there's room for two dynamic programming optimizations here. Firstly, we
+	 * may pass the current "best" cost to cost_sort_estimate so that it can
+	 * "abort" if the estimated pathkeys list exceeds it. Secondly, it could pass
+	 * return information about the position when it exceeded the cost, and we
+	 * could skip all permutations with the same prefix.
+	 *
+	 * Imagine we've already found ordering with cost C1, and we're evaluating
+	 * another ordering - cost_sort_estimate() calculates cost by adding the
+	 * pathkeys one by one (more or less), and the cost only grows. If at any
+	 * point it exceeds C1, it can't possibly be "better" so we can discard it.
+	 * But we also know that we can discard all ordering with the same prefix,
+	 * because if we're estimating (a,b,c,d) and we exceeded C1 at (a,b) then
+	 * the same thing will happen for any ordering with this prefix.
+	 *
+	 *
+	 */
+	PathkeyMutatorInit(&mstate, new_group_pathkeys, n_preordered, n_preordered + nToPermute);
+
+	while((var_group_pathkeys = PathkeyMutatorNext(&mstate)) != NIL)
+	{
+		Cost	cost;
+
+		cost = cost_sort_estimate(root, var_group_pathkeys, n_preordered, nrows);
+
+		if (cost < cheapest_sort_cost || cheapest_sort_cost < 0)
+		{
+			list_free(new_group_pathkeys);
+			new_group_pathkeys = list_copy(var_group_pathkeys);
+			cheapest_sort_cost = cost;
+		}
+	}
+
+	/* Reorder the group clauses according to the reordered pathkeys. */
+	foreach(cell, new_group_pathkeys)
+	{
+		PathKey			*pathkey = (PathKey *) lfirst(cell);
+
+		new_group_clauses = lappend(new_group_clauses,
+						get_sortgroupref_clause(pathkey->pk_eclass->ec_sortref,
+												*group_clauses));
+	}
+
+	/* Just append the rest GROUP BY clauses */
+	new_group_clauses = list_concat_unique_ptr(new_group_clauses,
+											   *group_clauses);
+
+	*group_pathkeys = new_group_pathkeys;
+	*group_clauses = new_group_clauses;
+
+	return true;
+}
+
+/*
+ * get_useful_group_keys_orderings
+ *		Determine which orderings of GROUP BY keys are potentially interesting.
+ *
+ * Returns list of PathKeyInfo items, each representing an interesting ordering
+ * of GROUP BY keys. Each items stores pathkeys and clauses in matching order.
+ *
+ * The function considers (and keeps) multiple group by orderings:
+ *
+ * - the original ordering, as specified by the GROUP BY clause
+ *
+ * - GROUP BY keys reordered to minimize the sort cost
+ *
+ * - GROUP BY keys reordered to match path ordering (as much as possible), with
+ *   the tail reoredered to minimize the sort cost
+ *
+ * - GROUP BY keys to match target ORDER BY clause (as much as possible), with
+ *   the tail reoredered to minimize the sort cost
+ *
+ * There are other potentially interesting orderings (e.g. it might be best to
+ * match the first ORDER BY key, order the remaining keys differently and then
+ * rely on incremental sort to fix this), but we ignore those for now. To make
+ * this work we'd have to pretty much generate all possible permutations.
+ */
+List *
+get_useful_group_keys_orderings(PlannerInfo *root, double nrows,
+								List *path_pathkeys,
+								List *group_pathkeys, List *group_clauses)
+{
+	Query	   *parse = root->parse;
+	List	   *infos = NIL;
+	PathKeyInfo *info;
+	int			n_preordered = 0;
+
+	List *pathkeys = group_pathkeys;
+	List *clauses = group_clauses;
+
+	/* always return at least the original pathkeys/clauses */
+	info = makeNode(PathKeyInfo);
+	info->pathkeys = pathkeys;
+	info->clauses = clauses;
+
+	infos = lappend(infos, info);
+
+	/* for grouping sets we can't do any reordering */
+	if (parse->groupingSets)
+		return infos;
+
+	/*
+	 * Try reordering pathkeys to minimize the sort cost, ignoring both the
+	 * target ordering (ORDER BY) and ordering of the input path.
+	 */
+	if (get_cheapest_group_keys_order(root, nrows, &pathkeys, &clauses,
+									  n_preordered))
+	{
+		info = makeNode(PathKeyInfo);
+		info->pathkeys = pathkeys;
+		info->clauses = clauses;
+
+		infos = lappend(infos, info);
+	}
+
+	/*
+	 * If the path is sorted in some way, try reordering the group keys to match
+	 * as much of the ordering as possible - we get this sort for free (mostly).
+	 *
+	 * We must not do this when there are no grouping sets, because those use
+	 * more complex logic to decide the ordering.
+	 *
+	 * XXX Isn't this somewhat redundant with presorted_keys? Actually, it's
+	 * more a complement, because it allows benefiting from incremental sort
+	 * as much as possible.
+	 *
+	 * XXX This does nothing if (n_preordered == 0). We shouldn't create the
+	 * info in this case.
+	 */
+	if (path_pathkeys)
+	{
+		n_preordered = group_keys_reorder_by_pathkeys(path_pathkeys,
+													  &pathkeys,
+													  &clauses);
+
+		/* reorder the tail to minimize sort cost */
+		get_cheapest_group_keys_order(root, nrows, &pathkeys, &clauses,
+									  n_preordered);
+
+		/*
+		 * reorder the tail to minimize sort cost
+		 *
+		 * XXX Ignore the return value - there may be nothing to reorder, in
+		 * which case get_cheapest_group_keys_order returns false. But we
+		 * still want to keep the keys reordered to path_pathkeys.
+		 */
+		info = makeNode(PathKeyInfo);
+		info->pathkeys = pathkeys;
+		info->clauses = clauses;
+
+		infos = lappend(infos, info);
+	}
+
+	/*
+	 * Try reordering pathkeys to minimize the sort cost (this time consider
+	 * the ORDER BY clause, but only if set debug_group_by_match_order_by).
+	 *
+	 * XXX This does nothing if (n_preordered == 0). We shouldn't create the
+	 * info in this case.
+	 */
+	if (root->sort_pathkeys && debug_group_by_match_order_by)
+	{
+		n_preordered = group_keys_reorder_by_pathkeys(root->sort_pathkeys,
+													  &pathkeys,
+													  &clauses);
+
+		/*
+		 * reorder the tail to minimize sort cost
+		 *
+		 * XXX Ignore the return value - there may be nothing to reorder, in
+		 * which case get_cheapest_group_keys_order returns false. But we
+		 * still want to keep the keys reordered to sort_pathkeys.
+		 */
+		get_cheapest_group_keys_order(root, nrows, &pathkeys, &clauses,
+									  n_preordered);
+
+		/* keep the group keys reordered to match ordering of input path */
+		info = makeNode(PathKeyInfo);
+		info->pathkeys = pathkeys;
+		info->clauses = clauses;
+
+		infos = lappend(infos, info);
+	}
+
+	return infos;
+}
+
 /*
  * pathkeys_count_contained_in
  *    Same as pathkeys_contained_in, but also sets length of longest
@@ -1862,6 +2386,54 @@ pathkeys_useful_for_ordering(PlannerInfo *root, List *pathkeys)
 	return n_common_pathkeys;
 }
 
+/*
+ * pathkeys_useful_for_grouping
+ *		Count the number of pathkeys that are useful for grouping (instead of
+ *		explicit sort)
+ *
+ * Group pathkeys could be reordered to benefit from the odering. The ordering
+ * may not be "complete" and may require incremental sort, but that's fine. So
+ * we simply count prefix pathkeys with a matching group key, and stop once we
+ * find the first pathkey without a match.
+ *
+ * So e.g. with pathkeys (a,b,c) and group keys (a,b,e) this determines (a,b)
+ * pathkeys are useful for grouping, and we might do incremental sort to get
+ * path ordered by (a,b,e).
+ *
+ * This logic is necessary to retain paths with ordeding not matching grouping
+ * keys directly, without the reordering.
+ *
+ * Returns the length of pathkey prefix with matching group keys.
+ */
+static int
+pathkeys_useful_for_grouping(PlannerInfo *root, List *pathkeys)
+{
+	ListCell *key;
+	int		  n = 0;
+
+	/* no special ordering requested for grouping */
+	if (root->group_pathkeys == NIL)
+		return 0;
+
+	/* unordered path */
+	if (pathkeys == NIL)
+		return 0;
+
+	/* walk the pathkeys and search for matching group key */
+	foreach(key, pathkeys)
+	{
+		PathKey	*pathkey = (PathKey *) lfirst(key);
+
+		/* no matching group key, we're done */
+		if (!list_member_ptr(root->group_pathkeys, pathkey))
+			break;
+
+		n++;
+	}
+
+	return n;
+}
+
 /*
  * truncate_useless_pathkeys
  *		Shorten the given pathkey list to just the useful pathkeys.
@@ -1876,6 +2448,9 @@ truncate_useless_pathkeys(PlannerInfo *root,
 
 	nuseful = pathkeys_useful_for_merging(root, rel, pathkeys);
 	nuseful2 = pathkeys_useful_for_ordering(root, pathkeys);
+	if (nuseful2 > nuseful)
+		nuseful = nuseful2;
+	nuseful2 = pathkeys_useful_for_grouping(root, pathkeys);
 	if (nuseful2 > nuseful)
 		nuseful = nuseful2;
 
@@ -1911,6 +2486,8 @@ has_useful_pathkeys(PlannerInfo *root, RelOptInfo *rel)
 {
 	if (rel->joininfo != NIL || rel->has_eclass_joins)
 		return true;			/* might be able to use pathkeys for merging */
+	if (root->group_pathkeys != NIL)
+		return true;			/* might be able to use pathkeys for grouping */
 	if (root->query_pathkeys != NIL)
 		return true;			/* might be able to use them for ordering */
 	return false;				/* definitely useless */
diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c
index bd01ec0526..f281a23bb7 100644
--- a/src/backend/optimizer/plan/planner.c
+++ b/src/backend/optimizer/plan/planner.c
@@ -6156,24 +6156,120 @@ add_paths_to_grouping_rel(PlannerInfo *root, RelOptInfo *input_rel,
 		 */
 		foreach(lc, input_rel->pathlist)
 		{
+			ListCell   *lc2;
 			Path	   *path = (Path *) lfirst(lc);
 			Path	   *path_original = path;
-			bool		is_sorted;
-			int			presorted_keys;
 
-			is_sorted = pathkeys_count_contained_in(root->group_pathkeys,
-													path->pathkeys,
-													&presorted_keys);
+			List	   *pathkey_orderings = NIL;
+
+			List	   *group_pathkeys = root->group_pathkeys;
+			List	   *group_clauses = parse->groupClause;
+
+			/* generate alternative group orderings that might be useful */
+			pathkey_orderings = get_useful_group_keys_orderings(root,
+																path->rows,
+																path->pathkeys,
+																group_pathkeys,
+																group_clauses);
 
-			if (path == cheapest_path || is_sorted)
+			Assert(list_length(pathkey_orderings) > 0);
+
+			/* process all potentially interesting grouping reorderings */
+			foreach (lc2, pathkey_orderings)
 			{
-				/* Sort the cheapest-total path if it isn't already sorted */
-				if (!is_sorted)
-					path = (Path *) create_sort_path(root,
-													 grouped_rel,
-													 path,
-													 root->group_pathkeys,
-													 -1.0);
+				bool		is_sorted;
+				int			presorted_keys = 0;
+				PathKeyInfo *info = (PathKeyInfo *) lfirst(lc2);
+
+				is_sorted = pathkeys_count_contained_in(info->pathkeys,
+														path->pathkeys,
+														&presorted_keys);
+
+				if (path == cheapest_path || is_sorted)
+				{
+					/* Sort the cheapest-total path if it isn't already sorted */
+					if (!is_sorted)
+					{
+						path = (Path *) create_sort_path(root,
+														 grouped_rel,
+														 path,
+														 info->pathkeys,
+														 -1.0);
+					}
+
+					/* Now decide what to stick atop it */
+					if (parse->groupingSets)
+					{
+						consider_groupingsets_paths(root, grouped_rel,
+													path, true, can_hash,
+													gd, agg_costs, dNumGroups);
+					}
+					else if (parse->hasAggs)
+					{
+						/*
+						 * We have aggregation, possibly with plain GROUP BY. Make
+						 * an AggPath.
+						 */
+						add_path(grouped_rel, (Path *)
+								 create_agg_path(root,
+												 grouped_rel,
+												 path,
+												 grouped_rel->reltarget,
+												 info->clauses ? AGG_SORTED : AGG_PLAIN,
+												 AGGSPLIT_SIMPLE,
+												 info->clauses,
+												 havingQual,
+												 agg_costs,
+												 dNumGroups));
+					}
+					else if (group_clauses)
+					{
+						/*
+						 * We have GROUP BY without aggregation or grouping sets.
+						 * Make a GroupPath.
+						 */
+						add_path(grouped_rel, (Path *)
+								 create_group_path(root,
+												   grouped_rel,
+												   path,
+												   info->clauses,
+												   havingQual,
+												   dNumGroups));
+					}
+					else
+					{
+						/* Other cases should have been handled above */
+						Assert(false);
+					}
+				}
+
+				/*
+				 * Now we may consider incremental sort on this path, but only
+				 * when the path is not already sorted and when incremental sort
+				 * is enabled.
+				 */
+				if (is_sorted || !enable_incremental_sort)
+					continue;
+
+				/* Restore the input path (we might have added Sort on top). */
+				path = path_original;
+
+				/* no shared prefix, no point in building incremental sort */
+				if (presorted_keys == 0)
+					continue;
+
+				/*
+				 * We should have already excluded pathkeys of length 1 because
+				 * then presorted_keys > 0 would imply is_sorted was true.
+				 */
+				Assert(list_length(root->group_pathkeys) != 1);
+
+				path = (Path *) create_incremental_sort_path(root,
+															 grouped_rel,
+															 path,
+															 info->pathkeys,
+															 presorted_keys,
+															 -1.0);
 
 				/* Now decide what to stick atop it */
 				if (parse->groupingSets)
@@ -6185,17 +6281,17 @@ add_paths_to_grouping_rel(PlannerInfo *root, RelOptInfo *input_rel,
 				else if (parse->hasAggs)
 				{
 					/*
-					 * We have aggregation, possibly with plain GROUP BY. Make
-					 * an AggPath.
+					 * We have aggregation, possibly with plain GROUP BY. Make an
+					 * AggPath.
 					 */
 					add_path(grouped_rel, (Path *)
 							 create_agg_path(root,
 											 grouped_rel,
 											 path,
 											 grouped_rel->reltarget,
-											 parse->groupClause ? AGG_SORTED : AGG_PLAIN,
+											 info->clauses ? AGG_SORTED : AGG_PLAIN,
 											 AGGSPLIT_SIMPLE,
-											 parse->groupClause,
+											 info->clauses,
 											 havingQual,
 											 agg_costs,
 											 dNumGroups));
@@ -6203,14 +6299,14 @@ add_paths_to_grouping_rel(PlannerInfo *root, RelOptInfo *input_rel,
 				else if (parse->groupClause)
 				{
 					/*
-					 * We have GROUP BY without aggregation or grouping sets.
-					 * Make a GroupPath.
+					 * We have GROUP BY without aggregation or grouping sets. Make
+					 * a GroupPath.
 					 */
 					add_path(grouped_rel, (Path *)
 							 create_group_path(root,
 											   grouped_rel,
 											   path,
-											   parse->groupClause,
+											   info->clauses,
 											   havingQual,
 											   dNumGroups));
 				}
@@ -6220,79 +6316,6 @@ add_paths_to_grouping_rel(PlannerInfo *root, RelOptInfo *input_rel,
 					Assert(false);
 				}
 			}
-
-			/*
-			 * Now we may consider incremental sort on this path, but only
-			 * when the path is not already sorted and when incremental sort
-			 * is enabled.
-			 */
-			if (is_sorted || !enable_incremental_sort)
-				continue;
-
-			/* Restore the input path (we might have added Sort on top). */
-			path = path_original;
-
-			/* no shared prefix, no point in building incremental sort */
-			if (presorted_keys == 0)
-				continue;
-
-			/*
-			 * We should have already excluded pathkeys of length 1 because
-			 * then presorted_keys > 0 would imply is_sorted was true.
-			 */
-			Assert(list_length(root->group_pathkeys) != 1);
-
-			path = (Path *) create_incremental_sort_path(root,
-														 grouped_rel,
-														 path,
-														 root->group_pathkeys,
-														 presorted_keys,
-														 -1.0);
-
-			/* Now decide what to stick atop it */
-			if (parse->groupingSets)
-			{
-				consider_groupingsets_paths(root, grouped_rel,
-											path, true, can_hash,
-											gd, agg_costs, dNumGroups);
-			}
-			else if (parse->hasAggs)
-			{
-				/*
-				 * We have aggregation, possibly with plain GROUP BY. Make an
-				 * AggPath.
-				 */
-				add_path(grouped_rel, (Path *)
-						 create_agg_path(root,
-										 grouped_rel,
-										 path,
-										 grouped_rel->reltarget,
-										 parse->groupClause ? AGG_SORTED : AGG_PLAIN,
-										 AGGSPLIT_SIMPLE,
-										 parse->groupClause,
-										 havingQual,
-										 agg_costs,
-										 dNumGroups));
-			}
-			else if (parse->groupClause)
-			{
-				/*
-				 * We have GROUP BY without aggregation or grouping sets. Make
-				 * a GroupPath.
-				 */
-				add_path(grouped_rel, (Path *)
-						 create_group_path(root,
-										   grouped_rel,
-										   path,
-										   parse->groupClause,
-										   havingQual,
-										   dNumGroups));
-			}
-			else
-			{
-				/* Other cases should have been handled above */
-				Assert(false);
-			}
 		}
 
 		/*
@@ -6303,100 +6326,125 @@ add_paths_to_grouping_rel(PlannerInfo *root, RelOptInfo *input_rel,
 		{
 			foreach(lc, partially_grouped_rel->pathlist)
 			{
+				ListCell   *lc2;
 				Path	   *path = (Path *) lfirst(lc);
 				Path	   *path_original = path;
-				bool		is_sorted;
-				int			presorted_keys;
 
-				is_sorted = pathkeys_count_contained_in(root->group_pathkeys,
-														path->pathkeys,
-														&presorted_keys);
+				List	   *pathkey_orderings = NIL;
 
-				/*
-				 * Insert a Sort node, if required.  But there's no point in
-				 * sorting anything but the cheapest path.
-				 */
-				if (!is_sorted)
+				List	   *group_pathkeys = root->group_pathkeys;
+				List	   *group_clauses = parse->groupClause;
+
+				/* generate alternative group orderings that might be useful */
+				pathkey_orderings = get_useful_group_keys_orderings(root,
+																	path->rows,
+																	path->pathkeys,
+																	group_pathkeys,
+																	group_clauses);
+
+				Assert(list_length(pathkey_orderings) > 0);
+
+				/* process all potentially interesting grouping reorderings */
+				foreach (lc2, pathkey_orderings)
 				{
-					if (path != partially_grouped_rel->cheapest_total_path)
-						continue;
-					path = (Path *) create_sort_path(root,
-													 grouped_rel,
-													 path,
-													 root->group_pathkeys,
-													 -1.0);
-				}
+					bool		is_sorted;
+					int			presorted_keys = 0;
+					PathKeyInfo *info = (PathKeyInfo *) lfirst(lc2);
 
-				if (parse->hasAggs)
-					add_path(grouped_rel, (Path *)
-							 create_agg_path(root,
-											 grouped_rel,
-											 path,
-											 grouped_rel->reltarget,
-											 parse->groupClause ? AGG_SORTED : AGG_PLAIN,
-											 AGGSPLIT_FINAL_DESERIAL,
-											 parse->groupClause,
-											 havingQual,
-											 agg_final_costs,
-											 dNumGroups));
-				else
-					add_path(grouped_rel, (Path *)
-							 create_group_path(root,
-											   grouped_rel,
-											   path,
-											   parse->groupClause,
-											   havingQual,
-											   dNumGroups));
+					/* restore the path (we replace it in the loop) */
+					path = path_original;
 
-				/*
-				 * Now we may consider incremental sort on this path, but only
-				 * when the path is not already sorted and when incremental
-				 * sort is enabled.
-				 */
-				if (is_sorted || !enable_incremental_sort)
-					continue;
+					is_sorted = pathkeys_count_contained_in(info->pathkeys,
+															path->pathkeys,
+															&presorted_keys);
 
-				/* Restore the input path (we might have added Sort on top). */
-				path = path_original;
+					/*
+					 * Insert a Sort node, if required.  But there's no point in
+					 * sorting anything but the cheapest path.
+					 */
+					if (!is_sorted)
+					{
+						if (path != partially_grouped_rel->cheapest_total_path)
+							continue;
 
-				/* no shared prefix, not point in building incremental sort */
-				if (presorted_keys == 0)
-					continue;
+						path = (Path *) create_sort_path(root,
+														 grouped_rel,
+														 path,
+														 info->pathkeys,
+														 -1.0);
+					}
 
-				/*
-				 * We should have already excluded pathkeys of length 1
-				 * because then presorted_keys > 0 would imply is_sorted was
-				 * true.
-				 */
-				Assert(list_length(root->group_pathkeys) != 1);
+					if (parse->hasAggs)
+						add_path(grouped_rel, (Path *)
+								 create_agg_path(root,
+												 grouped_rel,
+												 path,
+												 grouped_rel->reltarget,
+												 info->clauses ? AGG_SORTED : AGG_PLAIN,
+												 AGGSPLIT_FINAL_DESERIAL,
+												 info->clauses,
+												 havingQual,
+												 agg_final_costs,
+												 dNumGroups));
+					else
+						add_path(grouped_rel, (Path *)
+								 create_group_path(root,
+												   grouped_rel,
+												   path,
+												   info->clauses,
+												   havingQual,
+												   dNumGroups));
 
-				path = (Path *) create_incremental_sort_path(root,
-															 grouped_rel,
-															 path,
-															 root->group_pathkeys,
-															 presorted_keys,
-															 -1.0);
+					/*
+					 * Now we may consider incremental sort on this path, but only
+					 * when the path is not already sorted and when incremental
+					 * sort is enabled.
+					 */
+					if (is_sorted || !enable_incremental_sort)
+						continue;
 
-				if (parse->hasAggs)
-					add_path(grouped_rel, (Path *)
-							 create_agg_path(root,
-											 grouped_rel,
-											 path,
-											 grouped_rel->reltarget,
-											 parse->groupClause ? AGG_SORTED : AGG_PLAIN,
-											 AGGSPLIT_FINAL_DESERIAL,
-											 parse->groupClause,
-											 havingQual,
-											 agg_final_costs,
-											 dNumGroups));
-				else
-					add_path(grouped_rel, (Path *)
-							 create_group_path(root,
-											   grouped_rel,
-											   path,
-											   parse->groupClause,
-											   havingQual,
-											   dNumGroups));
+					/* Restore the input path (we might have added Sort on top). */
+					path = path_original;
+
+					/* no shared prefix, not point in building incremental sort */
+					if (presorted_keys == 0)
+						continue;
+
+					/*
+					 * We should have already excluded pathkeys of length 1
+					 * because then presorted_keys > 0 would imply is_sorted was
+					 * true.
+					 */
+					Assert(list_length(root->group_pathkeys) != 1);
+
+					path = (Path *) create_incremental_sort_path(root,
+																 grouped_rel,
+																 path,
+																 info->pathkeys,
+																 presorted_keys,
+																 -1.0);
+
+					if (parse->hasAggs)
+						add_path(grouped_rel, (Path *)
+								 create_agg_path(root,
+												 grouped_rel,
+												 path,
+												 grouped_rel->reltarget,
+												 info->clauses ? AGG_SORTED : AGG_PLAIN,
+												 AGGSPLIT_FINAL_DESERIAL,
+												 info->clauses,
+												 havingQual,
+												 agg_final_costs,
+												 dNumGroups));
+					else
+						add_path(grouped_rel, (Path *)
+								 create_group_path(root,
+												   grouped_rel,
+												   path,
+												   info->clauses,
+												   havingQual,
+												   dNumGroups));
+				}
 			}
 		}
 	}
@@ -6599,41 +6647,67 @@ create_partial_grouping_paths(PlannerInfo *root,
 		 */
 		foreach(lc, input_rel->pathlist)
 		{
+			ListCell   *lc2;
 			Path	   *path = (Path *) lfirst(lc);
-			bool		is_sorted;
 
-			is_sorted = pathkeys_contained_in(root->group_pathkeys,
-											  path->pathkeys);
-			if (path == cheapest_total_path || is_sorted)
+			List	   *pathkey_orderings = NIL;
+
+			List	   *group_pathkeys = root->group_pathkeys;
+			List	   *group_clauses = parse->groupClause;
+
+			/* generate alternative group orderings that might be useful */
+			pathkey_orderings = get_useful_group_keys_orderings(root,
+																path->rows,
+																path->pathkeys,
+																group_pathkeys,
+																group_clauses);
+
+			Assert(list_length(pathkey_orderings) > 0);
+
+			/* process all potentially interesting grouping reorderings */
+			foreach (lc2, pathkey_orderings)
 			{
-				/* Sort the cheapest partial path, if it isn't already */
-				if (!is_sorted)
-					path = (Path *) create_sort_path(root,
-													 partially_grouped_rel,
-													 path,
-													 root->group_pathkeys,
-													 -1.0);
+				bool		is_sorted;
+				int			presorted_keys = 0;
+				PathKeyInfo *info = (PathKeyInfo *) lfirst(lc2);
 
-				if (parse->hasAggs)
-					add_path(partially_grouped_rel, (Path *)
-							 create_agg_path(root,
-											 partially_grouped_rel,
-											 path,
-											 partially_grouped_rel->reltarget,
-											 parse->groupClause ? AGG_SORTED : AGG_PLAIN,
-											 AGGSPLIT_INITIAL_SERIAL,
-											 parse->groupClause,
-											 NIL,
-											 agg_partial_costs,
-											 dNumPartialGroups));
-				else
-					add_path(partially_grouped_rel, (Path *)
-							 create_group_path(root,
-											   partially_grouped_rel,
-											   path,
-											   parse->groupClause,
-											   NIL,
-											   dNumPartialGroups));
+				is_sorted = pathkeys_count_contained_in(info->pathkeys,
+														path->pathkeys,
+														&presorted_keys);
+
+				if (path == cheapest_total_path || is_sorted)
+				{
+					/* Sort the cheapest partial path, if it isn't already */
+					if (!is_sorted)
+					{
+						path = (Path *) create_sort_path(root,
+														 partially_grouped_rel,
+														 path,
+														 info->pathkeys,
+														 -1.0);
+					}
+
+					if (parse->hasAggs)
+						add_path(partially_grouped_rel, (Path *)
+								 create_agg_path(root,
+												 partially_grouped_rel,
+												 path,
+												 partially_grouped_rel->reltarget,
+												 info->clauses ? AGG_SORTED : AGG_PLAIN,
+												 AGGSPLIT_INITIAL_SERIAL,
+												 info->clauses,
+												 NIL,
+												 agg_partial_costs,
+												 dNumPartialGroups));
+					else
+						add_path(partially_grouped_rel, (Path *)
+								 create_group_path(root,
+												   partially_grouped_rel,
+												   path,
+												   info->clauses,
+												   NIL,
+												   dNumPartialGroups));
+				}
 			}
 		}
 
@@ -6643,6 +6717,8 @@ create_partial_grouping_paths(PlannerInfo *root,
 		 * We can also skip the entire loop when we only have a single-item
 		 * group_pathkeys because then we can't possibly have a presorted
 		 * prefix of the list without having the list be fully sorted.
+		 *
+		 * XXX Shouldn't this also consider the group-key-reordering?
 		 */
 		if (enable_incremental_sort && list_length(root->group_pathkeys) > 1)
 		{
@@ -6701,24 +6777,100 @@ create_partial_grouping_paths(PlannerInfo *root,
 		/* Similar to above logic, but for partial paths. */
 		foreach(lc, input_rel->partial_pathlist)
 		{
+			ListCell   *lc2;
 			Path	   *path = (Path *) lfirst(lc);
 			Path	   *path_original = path;
-			bool		is_sorted;
-			int			presorted_keys;
 
-			is_sorted = pathkeys_count_contained_in(root->group_pathkeys,
-													path->pathkeys,
-													&presorted_keys);
+			List	   *pathkey_orderings = NIL;
+
+			List	   *group_pathkeys = root->group_pathkeys;
+			List	   *group_clauses = parse->groupClause;
 
-			if (path == cheapest_partial_path || is_sorted)
+			/* generate alternative group orderings that might be useful */
+			pathkey_orderings = get_useful_group_keys_orderings(root,
+																path->rows,
+																path->pathkeys,
+																group_pathkeys,
+																group_clauses);
+
+			Assert(list_length(pathkey_orderings) > 0);
+
+			/* process all potentially interesting grouping reorderings */
+			foreach (lc2, pathkey_orderings)
 			{
-				/* Sort the cheapest partial path, if it isn't already */
-				if (!is_sorted)
-					path = (Path *) create_sort_path(root,
-													 partially_grouped_rel,
-													 path,
-													 root->group_pathkeys,
-													 -1.0);
+				bool		is_sorted;
+				int			presorted_keys = 0;
+				PathKeyInfo *info = (PathKeyInfo *) lfirst(lc2);
+
+				/* restore the path (we replace it in the loop) */
+				path = path_original;
+
+				is_sorted = pathkeys_count_contained_in(info->pathkeys,
+														path->pathkeys,
+														&presorted_keys);
+
+				if (path == cheapest_partial_path || is_sorted)
+				{
+
+					/* Sort the cheapest partial path, if it isn't already */
+					if (!is_sorted)
+					{
+						path = (Path *) create_sort_path(root,
+														 partially_grouped_rel,
+														 path,
+														 info->pathkeys,
+														 -1.0);
+					}
+
+					if (parse->hasAggs)
+						add_partial_path(partially_grouped_rel, (Path *)
+										 create_agg_path(root,
+														 partially_grouped_rel,
+														 path,
+														 partially_grouped_rel->reltarget,
+														 info->clauses ? AGG_SORTED : AGG_PLAIN,
+														 AGGSPLIT_INITIAL_SERIAL,
+														 info->clauses,
+														 NIL,
+														 agg_partial_costs,
+														 dNumPartialPartialGroups));
+					else
+						add_partial_path(partially_grouped_rel, (Path *)
+										 create_group_path(root,
+														   partially_grouped_rel,
+														   path,
+														   info->clauses,
+														   NIL,
+														   dNumPartialPartialGroups));
+				}
+
+				/*
+				 * Now we may consider incremental sort on this path, but only
+				 * when the path is not already sorted and when incremental sort
+				 * is enabled.
+				 */
+				if (is_sorted || !enable_incremental_sort)
+					continue;
+
+				/* Restore the input path (we might have added Sort on top). */
+				path = path_original;
+
+				/* no shared prefix, not point in building incremental sort */
+				if (presorted_keys == 0)
+					continue;
+
+				/*
+				 * We should have already excluded pathkeys of length 1 because
+				 * then presorted_keys > 0 would imply is_sorted was true.
+				 */
+				Assert(list_length(root->group_pathkeys) != 1);
+
+				path = (Path *) create_incremental_sort_path(root,
+															 partially_grouped_rel,
+															 path,
+															 info->pathkeys,
+															 presorted_keys,
+															 -1.0);
 
 				if (parse->hasAggs)
 					add_partial_path(partially_grouped_rel, (Path *)
@@ -6726,9 +6878,9 @@ create_partial_grouping_paths(PlannerInfo *root,
 													 partially_grouped_rel,
 													 path,
 													 partially_grouped_rel->reltarget,
-													 parse->groupClause ? AGG_SORTED : AGG_PLAIN,
+													 info->clauses ? AGG_SORTED : AGG_PLAIN,
 													 AGGSPLIT_INITIAL_SERIAL,
-													 parse->groupClause,
+													 info->clauses,
 													 NIL,
 													 agg_partial_costs,
 													 dNumPartialPartialGroups));
@@ -6737,59 +6889,10 @@ create_partial_grouping_paths(PlannerInfo *root,
 									 create_group_path(root,
 													   partially_grouped_rel,
 													   path,
-													   parse->groupClause,
+													   info->clauses,
 													   NIL,
 													   dNumPartialPartialGroups));
 			}
-
-			/*
-			 * Now we may consider incremental sort on this path, but only
-			 * when the path is not already sorted and when incremental sort
-			 * is enabled.
-			 */
-			if (is_sorted || !enable_incremental_sort)
-				continue;
-
-			/* Restore the input path (we might have added Sort on top). */
-			path = path_original;
-
-			/* no shared prefix, not point in building incremental sort */
-			if (presorted_keys == 0)
-				continue;
-
-			/*
-			 * We should have already excluded pathkeys of length 1 because
-			 * then presorted_keys > 0 would imply is_sorted was true.
-			 */
-			Assert(list_length(root->group_pathkeys) != 1);
-
-			path = (Path *) create_incremental_sort_path(root,
-														 partially_grouped_rel,
-														 path,
-														 root->group_pathkeys,
-														 presorted_keys,
-														 -1.0);
-
-			if (parse->hasAggs)
-				add_partial_path(partially_grouped_rel, (Path *)
-								 create_agg_path(root,
-												 partially_grouped_rel,
-												 path,
-												 partially_grouped_rel->reltarget,
-												 parse->groupClause ? AGG_SORTED : AGG_PLAIN,
-												 AGGSPLIT_INITIAL_SERIAL,
-												 parse->groupClause,
-												 NIL,
-												 agg_partial_costs,
-												 dNumPartialPartialGroups));
-			else
-				add_partial_path(partially_grouped_rel, (Path *)
-								 create_group_path(root,
-												   partially_grouped_rel,
-												   path,
-												   parse->groupClause,
-												   NIL,
-												   dNumPartialPartialGroups));
 		}
 	}
 
diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c
index af5e8df26b..ce084f6973 100644
--- a/src/backend/optimizer/util/pathnode.c
+++ b/src/backend/optimizer/util/pathnode.c
@@ -1342,7 +1342,7 @@ create_append_path(PlannerInfo *root,
 		pathnode->path.pathkeys = child->pathkeys;
 	}
 	else
-		cost_append(pathnode);
+		cost_append(pathnode, root);
 
 	/* If the caller provided a row estimate, override the computed value. */
 	if (rows >= 0)
diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c
index 10895fb287..656c34eaaf 100644
--- a/src/backend/utils/adt/selfuncs.c
+++ b/src/backend/utils/adt/selfuncs.c
@@ -3294,7 +3294,10 @@ add_unique_group_var(PlannerInfo *root, List *varinfos,
 }
 
 /*
- * estimate_num_groups		- Estimate number of groups in a grouped query
+ * estimate_num_groups/estimate_num_groups_incremental
+ *		- Estimate number of groups in a grouped query.
+ *		  _incremental variant is performance optimization for
+ *		  case of adding one-by-one column
  *
  * Given a query having a GROUP BY clause, estimate how many groups there
  * will be --- ie, the number of distinct combinations of the GROUP BY
@@ -3368,11 +3371,22 @@ double
 estimate_num_groups(PlannerInfo *root, List *groupExprs, double input_rows,
 					List **pgset, EstimationInfo *estinfo)
 {
-	List	   *varinfos = NIL;
+	return estimate_num_groups_incremental(root, groupExprs,
+										   input_rows, pgset, estinfo,
+										   NULL, 0);
+}
+
+double
+estimate_num_groups_incremental(PlannerInfo *root, List *groupExprs,
+					double input_rows,
+					List **pgset, EstimationInfo *estinfo,
+					List **cache_varinfos, int prevNExprs)
+{
+	List	   *varinfos = (cache_varinfos) ? *cache_varinfos : NIL;
 	double		srf_multiplier = 1.0;
 	double		numdistinct;
 	ListCell   *l;
-	int			i;
+	int			i, j;
 
 	/* Zero the estinfo output parameter, if non-NULL */
 	if (estinfo != NULL)
@@ -3403,7 +3417,7 @@ estimate_num_groups(PlannerInfo *root, List *groupExprs, double input_rows,
 	 */
 	numdistinct = 1.0;
 
-	i = 0;
+	i = j = 0;
 	foreach(l, groupExprs)
 	{
 		Node	   *groupexpr = (Node *) lfirst(l);
@@ -3412,6 +3426,14 @@ estimate_num_groups(PlannerInfo *root, List *groupExprs, double input_rows,
 		List	   *varshere;
 		ListCell   *l2;
 
+		/* was done on previous call */
+		if (cache_varinfos && j++ < prevNExprs)
+		{
+			if (pgset)
+				i++; /* to keep in sync with lines below */
+			continue;
+		}
+
 		/* is expression in this grouping set? */
 		if (pgset && !list_member_int(*pgset, i++))
 			continue;
@@ -3481,7 +3503,11 @@ estimate_num_groups(PlannerInfo *root, List *groupExprs, double input_rows,
 		if (varshere == NIL)
 		{
 			if (contain_volatile_functions(groupexpr))
+			{
+				if (cache_varinfos)
+					*cache_varinfos = varinfos;
 				return input_rows;
+			}
 			continue;
 		}
 
@@ -3498,6 +3524,9 @@ estimate_num_groups(PlannerInfo *root, List *groupExprs, double input_rows,
 		}
 	}
 
+	if (cache_varinfos)
+		*cache_varinfos = varinfos;
+
 	/*
 	 * If now no Vars, we must have an all-constant or all-boolean GROUP BY
 	 * list.
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index f9504d3aec..0074cc42e4 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -2120,6 +2120,35 @@ static struct config_bool ConfigureNamesBool[] =
 		NULL, NULL, NULL
 	},
 
+/************************<DEBUG OPT GROUP BY>*********************************/
+	{
+		{"debug_group_by_reorder_by_pathkeys", PGC_USERSET, QUERY_TUNING_METHOD,
+			gettext_noop("enable reorder GROUP BY by pathkeys"),
+			NULL
+		},
+		&debug_group_by_reorder_by_pathkeys,
+		true,
+		NULL, NULL, NULL
+	},
+	{
+		{"debug_enable_group_by_match_order_by", PGC_USERSET, QUERY_TUNING_METHOD,
+			gettext_noop("enable matching GROUP BY by ORDER BY."),
+			NULL
+		},
+		&debug_group_by_match_order_by,
+		true,
+		NULL, NULL, NULL
+	},
+	{
+		{"debug_enable_cheapest_group_by", PGC_USERSET, QUERY_TUNING_METHOD,
+			gettext_noop("find a cheapest order of columns in GROUP BY."),
+			NULL
+		},
+		&debug_cheapest_group_by,
+		true,
+		NULL, NULL, NULL
+	},
+/************************</DEBUG OPT GROUP BY>********************************/
 	/* End-of-list marker */
 	{
 		{NULL, 0, 0, NULL, NULL}, NULL, false, NULL, NULL, NULL
diff --git a/src/include/nodes/nodes.h b/src/include/nodes/nodes.h
index 7c657c1241..9b8bdd1083 100644
--- a/src/include/nodes/nodes.h
+++ b/src/include/nodes/nodes.h
@@ -266,6 +266,7 @@ typedef enum NodeTag
 	T_EquivalenceClass,
 	T_EquivalenceMember,
 	T_PathKey,
+	T_PathKeyInfo,
 	T_PathTarget,
 	T_RestrictInfo,
 	T_IndexClause,
diff --git a/src/include/nodes/pathnodes.h b/src/include/nodes/pathnodes.h
index 324d92880b..97b3c0597b 100644
--- a/src/include/nodes/pathnodes.h
+++ b/src/include/nodes/pathnodes.h
@@ -1069,6 +1069,16 @@ typedef struct PathKey
 	bool		pk_nulls_first; /* do NULLs come before normal values? */
 } PathKey;
 
+/*
+ * Combines information about pathkeys and the associated clauses.
+ */
+typedef struct PathKeyInfo
+{
+	NodeTag		type;
+	List	   *pathkeys;
+	List	   *clauses;
+} PathKeyInfo;
+
 /*
  * VolatileFunctionStatus -- allows nodes to cache their
  * contain_volatile_functions properties. VOLATILITY_UNKNOWN means not yet
diff --git a/src/include/optimizer/cost.h b/src/include/optimizer/cost.h
index 2113bc82de..a3ff71fa9d 100644
--- a/src/include/optimizer/cost.h
+++ b/src/include/optimizer/cost.h
@@ -112,7 +112,9 @@ extern void cost_incremental_sort(Path *path,
 								  Cost input_startup_cost, Cost input_total_cost,
 								  double input_tuples, int width, Cost comparison_cost, int sort_mem,
 								  double limit_tuples);
-extern void cost_append(AppendPath *path);
+extern Cost cost_sort_estimate(PlannerInfo *root, List *pathkeys,
+							   int nPresortedKeys, double tuples);
+extern void cost_append(AppendPath *path, PlannerInfo *root);
 extern void cost_merge_append(Path *path, PlannerInfo *root,
 							  List *pathkeys, int n_streams,
 							  Cost input_startup_cost, Cost input_total_cost,
diff --git a/src/include/optimizer/paths.h b/src/include/optimizer/paths.h
index f1d111063c..47540b6c0e 100644
--- a/src/include/optimizer/paths.h
+++ b/src/include/optimizer/paths.h
@@ -203,6 +203,17 @@ typedef enum
 extern PathKeysComparison compare_pathkeys(List *keys1, List *keys2);
 extern bool pathkeys_contained_in(List *keys1, List *keys2);
 extern bool pathkeys_count_contained_in(List *keys1, List *keys2, int *n_common);
+extern int group_keys_reorder_by_pathkeys(List *pathkeys,
+										  List **group_pathkeys,
+										  List **group_clauses);
+/************************<DEBUG OPT GROUP BY>*********************************/
+extern bool debug_group_by_reorder_by_pathkeys;
+extern bool debug_group_by_match_order_by;
+extern bool debug_cheapest_group_by;
+/************************</DEBUG OPT GROUP BY>********************************/
+extern List *get_useful_group_keys_orderings(PlannerInfo *root, double nrows,
+											 List *path_pathkeys,
+											 List *pathkeys, List *clauses);
 extern Path *get_cheapest_path_for_pathkeys(List *paths, List *pathkeys,
 											Relids required_outer,
 											CostSelector cost_criterion,
diff --git a/src/include/utils/selfuncs.h b/src/include/utils/selfuncs.h
index 9dd444e1ff..e20f45e104 100644
--- a/src/include/utils/selfuncs.h
+++ b/src/include/utils/selfuncs.h
@@ -214,6 +214,11 @@ extern double estimate_num_groups(PlannerInfo *root, List *groupExprs,
 								  double input_rows, List **pgset,
 								  EstimationInfo *estinfo);
 
+extern double estimate_num_groups_incremental(PlannerInfo *root, List *groupExprs,
+					double input_rows, List **pgset,
+					EstimationInfo *estinfo,
+					List **cache_varinfos, int prevNExprs);
+
 extern void estimate_hash_bucket_stats(PlannerInfo *root,
 									   Node *hashkey, double nbuckets,
 									   Selectivity *mcv_freq,
diff --git a/src/test/regress/expected/aggregates.out b/src/test/regress/expected/aggregates.out
index be5fa5727d..34cf3d900c 100644
--- a/src/test/regress/expected/aggregates.out
+++ b/src/test/regress/expected/aggregates.out
@@ -1200,7 +1200,8 @@ explain (costs off)
   select distinct min(f1), max(f1) from minmaxtest;
                                          QUERY PLAN                                          
 ---------------------------------------------------------------------------------------------
- Unique
+ HashAggregate
+   Group Key: $0, $1
    InitPlan 1 (returns $0)
      ->  Limit
            ->  Merge Append
@@ -1223,10 +1224,8 @@ explain (costs off)
                  ->  Index Only Scan using minmaxtest2i on minmaxtest2 minmaxtest_8
                        Index Cond: (f1 IS NOT NULL)
                  ->  Index Only Scan Backward using minmaxtest3i on minmaxtest3 minmaxtest_9
-   ->  Sort
-         Sort Key: ($0), ($1)
-         ->  Result
-(26 rows)
+   ->  Result
+(25 rows)
 
 select distinct min(f1), max(f1) from minmaxtest;
  min | max 
@@ -2438,6 +2437,241 @@ SELECT balk(hundred) FROM tenk1;
 (1 row)
 
 ROLLBACK;
+-- GROUP BY optimization by reorder columns
+SELECT
+	i AS id,
+	i/2 AS p,
+	format('%60s', i%2) AS v,
+	i/4 AS c,
+	i/8 AS d,
+	(random() * (10000/8))::int as e --the same as d but no correlation with p
+	INTO btg
+FROM
+	generate_series(1, 10000) i;
+VACUUM btg;
+ANALYZE btg;
+-- GROUP BY optimization by reorder columns by frequency
+SET enable_hashagg=off;
+SET max_parallel_workers= 0;
+SET max_parallel_workers_per_gather = 0;
+EXPLAIN (COSTS off)
+SELECT count(*) FROM btg GROUP BY p, v;
+         QUERY PLAN          
+-----------------------------
+ GroupAggregate
+   Group Key: p, v
+   ->  Sort
+         Sort Key: p, v
+         ->  Seq Scan on btg
+(5 rows)
+
+EXPLAIN (COSTS off)
+SELECT count(*) FROM btg GROUP BY v, p;
+         QUERY PLAN          
+-----------------------------
+ GroupAggregate
+   Group Key: p, v
+   ->  Sort
+         Sort Key: p, v
+         ->  Seq Scan on btg
+(5 rows)
+
+EXPLAIN (COSTS off)
+SELECT count(*) FROM btg GROUP BY v, p, c;
+         QUERY PLAN          
+-----------------------------
+ GroupAggregate
+   Group Key: p, c, v
+   ->  Sort
+         Sort Key: p, c, v
+         ->  Seq Scan on btg
+(5 rows)
+
+EXPLAIN (COSTS off)
+SELECT count(*) FROM btg GROUP BY v, p, c ORDER BY v, p, c;
+         QUERY PLAN          
+-----------------------------
+ GroupAggregate
+   Group Key: v, p, c
+   ->  Sort
+         Sort Key: v, p, c
+         ->  Seq Scan on btg
+(5 rows)
+
+EXPLAIN (COSTS off)
+SELECT count(*) FROM btg GROUP BY v, p, d, c;
+          QUERY PLAN          
+------------------------------
+ GroupAggregate
+   Group Key: p, d, c, v
+   ->  Sort
+         Sort Key: p, d, c, v
+         ->  Seq Scan on btg
+(5 rows)
+
+EXPLAIN (COSTS off)
+SELECT count(*) FROM btg GROUP BY v, p, d, c ORDER BY v, p, d ,c;
+          QUERY PLAN          
+------------------------------
+ GroupAggregate
+   Group Key: v, p, d, c
+   ->  Sort
+         Sort Key: v, p, d, c
+         ->  Seq Scan on btg
+(5 rows)
+
+EXPLAIN (COSTS off)
+SELECT count(*) FROM btg GROUP BY v, p, d, c ORDER BY p, v, d ,c;
+          QUERY PLAN          
+------------------------------
+ GroupAggregate
+   Group Key: p, v, d, c
+   ->  Sort
+         Sort Key: p, v, d, c
+         ->  Seq Scan on btg
+(5 rows)
+
+EXPLAIN (COSTS off)
+SELECT count(*) FROM btg GROUP BY p, d, e;
+         QUERY PLAN          
+-----------------------------
+ GroupAggregate
+   Group Key: p, d, e
+   ->  Sort
+         Sort Key: p, d, e
+         ->  Seq Scan on btg
+(5 rows)
+
+EXPLAIN (COSTS off)
+SELECT count(*) FROM btg GROUP BY p, e, d;
+         QUERY PLAN          
+-----------------------------
+ GroupAggregate
+   Group Key: p, e, d
+   ->  Sort
+         Sort Key: p, e, d
+         ->  Seq Scan on btg
+(5 rows)
+
+CREATE STATISTICS btg_dep ON d, e, p FROM btg;
+ANALYZE btg;
+EXPLAIN (COSTS off)
+SELECT count(*) FROM btg GROUP BY p, d, e;
+         QUERY PLAN          
+-----------------------------
+ GroupAggregate
+   Group Key: p, d, e
+   ->  Sort
+         Sort Key: p, d, e
+         ->  Seq Scan on btg
+(5 rows)
+
+EXPLAIN (COSTS off)
+SELECT count(*) FROM btg GROUP BY p, e, d;
+         QUERY PLAN          
+-----------------------------
+ GroupAggregate
+   Group Key: p, e, d
+   ->  Sort
+         Sort Key: p, e, d
+         ->  Seq Scan on btg
+(5 rows)
+
+-- GROUP BY optimization by reorder columns by index scan
+CREATE INDEX ON btg(p, v);
+SET enable_seqscan=off;
+SET enable_bitmapscan=off;
+VACUUM btg;
+EXPLAIN (COSTS off)
+SELECT count(*) FROM btg GROUP BY p, v;
+                   QUERY PLAN                   
+------------------------------------------------
+ GroupAggregate
+   Group Key: p, v
+   ->  Index Only Scan using btg_p_v_idx on btg
+(3 rows)
+
+EXPLAIN (COSTS off)
+SELECT count(*) FROM btg GROUP BY p, v ORDER BY p, v;
+                   QUERY PLAN                   
+------------------------------------------------
+ GroupAggregate
+   Group Key: p, v
+   ->  Index Only Scan using btg_p_v_idx on btg
+(3 rows)
+
+EXPLAIN (COSTS off)
+SELECT count(*) FROM btg GROUP BY v, p;
+                   QUERY PLAN                   
+------------------------------------------------
+ GroupAggregate
+   Group Key: p, v
+   ->  Index Only Scan using btg_p_v_idx on btg
+(3 rows)
+
+EXPLAIN (COSTS off)
+SELECT count(*) FROM btg GROUP BY v, p ORDER BY p, v;
+                   QUERY PLAN                   
+------------------------------------------------
+ GroupAggregate
+   Group Key: p, v
+   ->  Index Only Scan using btg_p_v_idx on btg
+(3 rows)
+
+EXPLAIN (COSTS off)
+SELECT count(*) FROM btg GROUP BY v, p, c;
+                   QUERY PLAN                    
+-------------------------------------------------
+ GroupAggregate
+   Group Key: p, c, v
+   ->  Incremental Sort
+         Sort Key: p, c, v
+         Presorted Key: p
+         ->  Index Scan using btg_p_v_idx on btg
+(6 rows)
+
+EXPLAIN (COSTS off)
+SELECT count(*) FROM btg GROUP BY v, p, c ORDER BY p, v;
+                   QUERY PLAN                    
+-------------------------------------------------
+ GroupAggregate
+   Group Key: p, v, c
+   ->  Incremental Sort
+         Sort Key: p, v, c
+         Presorted Key: p, v
+         ->  Index Scan using btg_p_v_idx on btg
+(6 rows)
+
+EXPLAIN (COSTS off)
+SELECT count(*) FROM btg GROUP BY v, c, p, d;
+                   QUERY PLAN                    
+-------------------------------------------------
+ GroupAggregate
+   Group Key: p, c, d, v
+   ->  Incremental Sort
+         Sort Key: p, c, d, v
+         Presorted Key: p
+         ->  Index Scan using btg_p_v_idx on btg
+(6 rows)
+
+EXPLAIN (COSTS off)
+SELECT count(*) FROM btg GROUP BY v, c, p, d ORDER BY p, v;
+                   QUERY PLAN                    
+-------------------------------------------------
+ GroupAggregate
+   Group Key: p, v, c, d
+   ->  Incremental Sort
+         Sort Key: p, v, c, d
+         Presorted Key: p, v
+         ->  Index Scan using btg_p_v_idx on btg
+(6 rows)
+
+DROP TABLE btg;
+RESET enable_hashagg;
+RESET max_parallel_workers;
+RESET max_parallel_workers_per_gather;
+RESET enable_seqscan;
+RESET enable_bitmapscan;
 -- Secondly test the case of a parallel aggregate combiner function
 -- returning NULL. For that use normal transition function, but a
 -- combiner function returning NULL.
diff --git a/src/test/regress/expected/incremental_sort.out b/src/test/regress/expected/incremental_sort.out
index 545e301e48..21c429226f 100644
--- a/src/test/regress/expected/incremental_sort.out
+++ b/src/test/regress/expected/incremental_sort.out
@@ -1439,7 +1439,7 @@ set parallel_setup_cost = 0;
 set parallel_tuple_cost = 0;
 set max_parallel_workers_per_gather = 2;
 create table t (a int, b int, c int);
-insert into t select mod(i,10),mod(i,10),i from generate_series(1,10000) s(i);
+insert into t select mod(i,10),mod(i,10),i from generate_series(1,60000) s(i);
 create index on t (a);
 analyze t;
 set enable_incremental_sort = off;
diff --git a/src/test/regress/expected/join.out b/src/test/regress/expected/join.out
index d5b5b775fd..01d81292f5 100644
--- a/src/test/regress/expected/join.out
+++ b/src/test/regress/expected/join.out
@@ -1984,8 +1984,8 @@ USING (name);
 ------+----+----
  bb   | 12 | 13
  cc   | 22 | 23
- dd   |    | 33
  ee   | 42 |   
+ dd   |    | 33
 (4 rows)
 
 -- Cases with non-nullable expressions in subquery results;
@@ -2019,8 +2019,8 @@ NATURAL FULL JOIN
 ------+------+------+------+------
  bb   |   12 |    2 |   13 |    3
  cc   |   22 |    2 |   23 |    3
- dd   |      |      |   33 |    3
  ee   |   42 |    2 |      |     
+ dd   |      |      |   33 |    3
 (4 rows)
 
 SELECT * FROM
@@ -4618,18 +4618,20 @@ select d.* from d left join (select * from b group by b.id, b.c_id) s
 explain (costs off)
 select d.* from d left join (select distinct * from b) s
   on d.a = s.id;
-              QUERY PLAN              
---------------------------------------
- Merge Right Join
-   Merge Cond: (b.id = d.a)
-   ->  Unique
-         ->  Sort
-               Sort Key: b.id, b.c_id
-               ->  Seq Scan on b
+                 QUERY PLAN                  
+---------------------------------------------
+ Merge Left Join
+   Merge Cond: (d.a = s.id)
    ->  Sort
          Sort Key: d.a
          ->  Seq Scan on d
-(9 rows)
+   ->  Sort
+         Sort Key: s.id
+         ->  Subquery Scan on s
+               ->  HashAggregate
+                     Group Key: b.id, b.c_id
+                     ->  Seq Scan on b
+(11 rows)
 
 -- check join removal works when uniqueness of the join condition is enforced
 -- by a UNION
@@ -6336,44 +6338,39 @@ select * from j1 natural join j2;
 explain (verbose, costs off)
 select * from j1
 inner join (select distinct id from j3) j3 on j1.id = j3.id;
-               QUERY PLAN                
------------------------------------------
+            QUERY PLAN             
+-----------------------------------
  Nested Loop
    Output: j1.id, j3.id
    Inner Unique: true
    Join Filter: (j1.id = j3.id)
-   ->  Unique
+   ->  HashAggregate
          Output: j3.id
-         ->  Sort
+         Group Key: j3.id
+         ->  Seq Scan on public.j3
                Output: j3.id
-               Sort Key: j3.id
-               ->  Seq Scan on public.j3
-                     Output: j3.id
    ->  Seq Scan on public.j1
          Output: j1.id
-(13 rows)
+(11 rows)
 
 -- ensure group by clause allows the inner to become unique
 explain (verbose, costs off)
 select * from j1
 inner join (select id from j3 group by id) j3 on j1.id = j3.id;
-               QUERY PLAN                
------------------------------------------
+            QUERY PLAN             
+-----------------------------------
  Nested Loop
    Output: j1.id, j3.id
    Inner Unique: true
    Join Filter: (j1.id = j3.id)
-   ->  Group
+   ->  HashAggregate
          Output: j3.id
          Group Key: j3.id
-         ->  Sort
+         ->  Seq Scan on public.j3
                Output: j3.id
-               Sort Key: j3.id
-               ->  Seq Scan on public.j3
-                     Output: j3.id
    ->  Seq Scan on public.j1
          Output: j1.id
-(14 rows)
+(11 rows)
 
 drop table j1;
 drop table j2;
diff --git a/src/test/regress/expected/partition_aggregate.out b/src/test/regress/expected/partition_aggregate.out
index dfa4b036b5..a08a3825ff 100644
--- a/src/test/regress/expected/partition_aggregate.out
+++ b/src/test/regress/expected/partition_aggregate.out
@@ -952,32 +952,30 @@ SELECT a, sum(b), array_agg(distinct c), count(*) FROM pagg_tab_ml GROUP BY a HA
 --------------------------------------------------------------------------------------
  Sort
    Sort Key: pagg_tab_ml.a, (sum(pagg_tab_ml.b)), (array_agg(DISTINCT pagg_tab_ml.c))
-   ->  Gather
-         Workers Planned: 2
-         ->  Parallel Append
-               ->  GroupAggregate
-                     Group Key: pagg_tab_ml.a
-                     Filter: (avg(pagg_tab_ml.b) < '3'::numeric)
-                     ->  Sort
-                           Sort Key: pagg_tab_ml.a
-                           ->  Seq Scan on pagg_tab_ml_p1 pagg_tab_ml
-               ->  GroupAggregate
-                     Group Key: pagg_tab_ml_5.a
-                     Filter: (avg(pagg_tab_ml_5.b) < '3'::numeric)
-                     ->  Sort
-                           Sort Key: pagg_tab_ml_5.a
-                           ->  Append
-                                 ->  Seq Scan on pagg_tab_ml_p3_s1 pagg_tab_ml_5
-                                 ->  Seq Scan on pagg_tab_ml_p3_s2 pagg_tab_ml_6
-               ->  GroupAggregate
-                     Group Key: pagg_tab_ml_2.a
-                     Filter: (avg(pagg_tab_ml_2.b) < '3'::numeric)
-                     ->  Sort
-                           Sort Key: pagg_tab_ml_2.a
-                           ->  Append
-                                 ->  Seq Scan on pagg_tab_ml_p2_s1 pagg_tab_ml_2
-                                 ->  Seq Scan on pagg_tab_ml_p2_s2 pagg_tab_ml_3
-(27 rows)
+   ->  Append
+         ->  GroupAggregate
+               Group Key: pagg_tab_ml.a
+               Filter: (avg(pagg_tab_ml.b) < '3'::numeric)
+               ->  Sort
+                     Sort Key: pagg_tab_ml.a
+                     ->  Seq Scan on pagg_tab_ml_p1 pagg_tab_ml
+         ->  GroupAggregate
+               Group Key: pagg_tab_ml_2.a
+               Filter: (avg(pagg_tab_ml_2.b) < '3'::numeric)
+               ->  Sort
+                     Sort Key: pagg_tab_ml_2.a
+                     ->  Append
+                           ->  Seq Scan on pagg_tab_ml_p2_s1 pagg_tab_ml_2
+                           ->  Seq Scan on pagg_tab_ml_p2_s2 pagg_tab_ml_3
+         ->  GroupAggregate
+               Group Key: pagg_tab_ml_5.a
+               Filter: (avg(pagg_tab_ml_5.b) < '3'::numeric)
+               ->  Sort
+                     Sort Key: pagg_tab_ml_5.a
+                     ->  Append
+                           ->  Seq Scan on pagg_tab_ml_p3_s1 pagg_tab_ml_5
+                           ->  Seq Scan on pagg_tab_ml_p3_s2 pagg_tab_ml_6
+(25 rows)
 
 SELECT a, sum(b), array_agg(distinct c), count(*) FROM pagg_tab_ml GROUP BY a HAVING avg(b) < 3 ORDER BY 1, 2, 3;
  a  | sum  |  array_agg  | count 
@@ -996,34 +994,32 @@ SELECT a, sum(b), array_agg(distinct c), count(*) FROM pagg_tab_ml GROUP BY a HA
 -- Without ORDER BY clause, to test Gather at top-most path
 EXPLAIN (COSTS OFF)
 SELECT a, sum(b), array_agg(distinct c), count(*) FROM pagg_tab_ml GROUP BY a HAVING avg(b) < 3;
-                                QUERY PLAN                                 
----------------------------------------------------------------------------
- Gather
-   Workers Planned: 2
-   ->  Parallel Append
-         ->  GroupAggregate
-               Group Key: pagg_tab_ml.a
-               Filter: (avg(pagg_tab_ml.b) < '3'::numeric)
-               ->  Sort
-                     Sort Key: pagg_tab_ml.a
-                     ->  Seq Scan on pagg_tab_ml_p1 pagg_tab_ml
-         ->  GroupAggregate
-               Group Key: pagg_tab_ml_5.a
-               Filter: (avg(pagg_tab_ml_5.b) < '3'::numeric)
-               ->  Sort
-                     Sort Key: pagg_tab_ml_5.a
-                     ->  Append
-                           ->  Seq Scan on pagg_tab_ml_p3_s1 pagg_tab_ml_5
-                           ->  Seq Scan on pagg_tab_ml_p3_s2 pagg_tab_ml_6
-         ->  GroupAggregate
-               Group Key: pagg_tab_ml_2.a
-               Filter: (avg(pagg_tab_ml_2.b) < '3'::numeric)
-               ->  Sort
-                     Sort Key: pagg_tab_ml_2.a
-                     ->  Append
-                           ->  Seq Scan on pagg_tab_ml_p2_s1 pagg_tab_ml_2
-                           ->  Seq Scan on pagg_tab_ml_p2_s2 pagg_tab_ml_3
-(25 rows)
+                             QUERY PLAN                              
+---------------------------------------------------------------------
+ Append
+   ->  GroupAggregate
+         Group Key: pagg_tab_ml.a
+         Filter: (avg(pagg_tab_ml.b) < '3'::numeric)
+         ->  Sort
+               Sort Key: pagg_tab_ml.a
+               ->  Seq Scan on pagg_tab_ml_p1 pagg_tab_ml
+   ->  GroupAggregate
+         Group Key: pagg_tab_ml_2.a
+         Filter: (avg(pagg_tab_ml_2.b) < '3'::numeric)
+         ->  Sort
+               Sort Key: pagg_tab_ml_2.a
+               ->  Append
+                     ->  Seq Scan on pagg_tab_ml_p2_s1 pagg_tab_ml_2
+                     ->  Seq Scan on pagg_tab_ml_p2_s2 pagg_tab_ml_3
+   ->  GroupAggregate
+         Group Key: pagg_tab_ml_5.a
+         Filter: (avg(pagg_tab_ml_5.b) < '3'::numeric)
+         ->  Sort
+               Sort Key: pagg_tab_ml_5.a
+               ->  Append
+                     ->  Seq Scan on pagg_tab_ml_p3_s1 pagg_tab_ml_5
+                     ->  Seq Scan on pagg_tab_ml_p3_s2 pagg_tab_ml_6
+(23 rows)
 
 -- Full aggregation at level 1 as GROUP BY clause matches with PARTITION KEY
 -- for level 1 only. For subpartitions, GROUP BY clause does not match with
@@ -1379,28 +1375,26 @@ SELECT x, sum(y), avg(y), count(*) FROM pagg_tab_para GROUP BY x HAVING avg(y) <
 -- When GROUP BY clause does not match; partial aggregation is performed for each partition.
 EXPLAIN (COSTS OFF)
 SELECT y, sum(x), avg(x), count(*) FROM pagg_tab_para GROUP BY y HAVING avg(x) < 12 ORDER BY 1, 2, 3;
-                                        QUERY PLAN                                         
--------------------------------------------------------------------------------------------
+                                     QUERY PLAN                                      
+-------------------------------------------------------------------------------------
  Sort
    Sort Key: pagg_tab_para.y, (sum(pagg_tab_para.x)), (avg(pagg_tab_para.x))
-   ->  Finalize GroupAggregate
+   ->  Finalize HashAggregate
          Group Key: pagg_tab_para.y
          Filter: (avg(pagg_tab_para.x) < '12'::numeric)
-         ->  Gather Merge
+         ->  Gather
                Workers Planned: 2
-               ->  Sort
-                     Sort Key: pagg_tab_para.y
-                     ->  Parallel Append
-                           ->  Partial HashAggregate
-                                 Group Key: pagg_tab_para.y
-                                 ->  Parallel Seq Scan on pagg_tab_para_p1 pagg_tab_para
-                           ->  Partial HashAggregate
-                                 Group Key: pagg_tab_para_1.y
-                                 ->  Parallel Seq Scan on pagg_tab_para_p2 pagg_tab_para_1
-                           ->  Partial HashAggregate
-                                 Group Key: pagg_tab_para_2.y
-                                 ->  Parallel Seq Scan on pagg_tab_para_p3 pagg_tab_para_2
-(19 rows)
+               ->  Parallel Append
+                     ->  Partial HashAggregate
+                           Group Key: pagg_tab_para.y
+                           ->  Parallel Seq Scan on pagg_tab_para_p1 pagg_tab_para
+                     ->  Partial HashAggregate
+                           Group Key: pagg_tab_para_1.y
+                           ->  Parallel Seq Scan on pagg_tab_para_p2 pagg_tab_para_1
+                     ->  Partial HashAggregate
+                           Group Key: pagg_tab_para_2.y
+                           ->  Parallel Seq Scan on pagg_tab_para_p3 pagg_tab_para_2
+(17 rows)
 
 SELECT y, sum(x), avg(x), count(*) FROM pagg_tab_para GROUP BY y HAVING avg(x) < 12 ORDER BY 1, 2, 3;
  y  |  sum  |         avg         | count 
diff --git a/src/test/regress/expected/partition_join.out b/src/test/regress/expected/partition_join.out
index 27f7525b3e..100ded51b0 100644
--- a/src/test/regress/expected/partition_join.out
+++ b/src/test/regress/expected/partition_join.out
@@ -466,52 +466,41 @@ EXPLAIN (COSTS OFF)
 SELECT a, b FROM prt1 FULL JOIN prt2 p2(b,a,c) USING(a,b)
   WHERE a BETWEEN 490 AND 510
   GROUP BY 1, 2 ORDER BY 1, 2;
-                                                   QUERY PLAN                                                    
------------------------------------------------------------------------------------------------------------------
+                                                QUERY PLAN                                                 
+-----------------------------------------------------------------------------------------------------------
  Group
    Group Key: (COALESCE(prt1.a, p2.a)), (COALESCE(prt1.b, p2.b))
-   ->  Merge Append
+   ->  Sort
          Sort Key: (COALESCE(prt1.a, p2.a)), (COALESCE(prt1.b, p2.b))
-         ->  Group
-               Group Key: (COALESCE(prt1.a, p2.a)), (COALESCE(prt1.b, p2.b))
-               ->  Sort
-                     Sort Key: (COALESCE(prt1.a, p2.a)), (COALESCE(prt1.b, p2.b))
-                     ->  Merge Full Join
-                           Merge Cond: ((prt1.a = p2.a) AND (prt1.b = p2.b))
-                           Filter: ((COALESCE(prt1.a, p2.a) >= 490) AND (COALESCE(prt1.a, p2.a) <= 510))
-                           ->  Sort
-                                 Sort Key: prt1.a, prt1.b
-                                 ->  Seq Scan on prt1_p1 prt1
-                           ->  Sort
-                                 Sort Key: p2.a, p2.b
-                                 ->  Seq Scan on prt2_p1 p2
-         ->  Group
-               Group Key: (COALESCE(prt1_1.a, p2_1.a)), (COALESCE(prt1_1.b, p2_1.b))
-               ->  Sort
-                     Sort Key: (COALESCE(prt1_1.a, p2_1.a)), (COALESCE(prt1_1.b, p2_1.b))
-                     ->  Merge Full Join
-                           Merge Cond: ((prt1_1.a = p2_1.a) AND (prt1_1.b = p2_1.b))
-                           Filter: ((COALESCE(prt1_1.a, p2_1.a) >= 490) AND (COALESCE(prt1_1.a, p2_1.a) <= 510))
-                           ->  Sort
-                                 Sort Key: prt1_1.a, prt1_1.b
-                                 ->  Seq Scan on prt1_p2 prt1_1
-                           ->  Sort
-                                 Sort Key: p2_1.a, p2_1.b
-                                 ->  Seq Scan on prt2_p2 p2_1
-         ->  Group
-               Group Key: (COALESCE(prt1_2.a, p2_2.a)), (COALESCE(prt1_2.b, p2_2.b))
-               ->  Sort
-                     Sort Key: (COALESCE(prt1_2.a, p2_2.a)), (COALESCE(prt1_2.b, p2_2.b))
-                     ->  Merge Full Join
-                           Merge Cond: ((prt1_2.a = p2_2.a) AND (prt1_2.b = p2_2.b))
-                           Filter: ((COALESCE(prt1_2.a, p2_2.a) >= 490) AND (COALESCE(prt1_2.a, p2_2.a) <= 510))
-                           ->  Sort
-                                 Sort Key: prt1_2.a, prt1_2.b
-                                 ->  Seq Scan on prt1_p3 prt1_2
-                           ->  Sort
-                                 Sort Key: p2_2.a, p2_2.b
-                                 ->  Seq Scan on prt2_p3 p2_2
-(43 rows)
+         ->  Append
+               ->  Merge Full Join
+                     Merge Cond: ((prt1_1.a = p2_1.a) AND (prt1_1.b = p2_1.b))
+                     Filter: ((COALESCE(prt1_1.a, p2_1.a) >= 490) AND (COALESCE(prt1_1.a, p2_1.a) <= 510))
+                     ->  Sort
+                           Sort Key: prt1_1.a, prt1_1.b
+                           ->  Seq Scan on prt1_p1 prt1_1
+                     ->  Sort
+                           Sort Key: p2_1.a, p2_1.b
+                           ->  Seq Scan on prt2_p1 p2_1
+               ->  Merge Full Join
+                     Merge Cond: ((prt1_2.a = p2_2.a) AND (prt1_2.b = p2_2.b))
+                     Filter: ((COALESCE(prt1_2.a, p2_2.a) >= 490) AND (COALESCE(prt1_2.a, p2_2.a) <= 510))
+                     ->  Sort
+                           Sort Key: prt1_2.a, prt1_2.b
+                           ->  Seq Scan on prt1_p2 prt1_2
+                     ->  Sort
+                           Sort Key: p2_2.a, p2_2.b
+                           ->  Seq Scan on prt2_p2 p2_2
+               ->  Merge Full Join
+                     Merge Cond: ((prt1_3.b = p2_3.b) AND (prt1_3.a = p2_3.a))
+                     Filter: ((COALESCE(prt1_3.a, p2_3.a) >= 490) AND (COALESCE(prt1_3.a, p2_3.a) <= 510))
+                     ->  Sort
+                           Sort Key: prt1_3.b, prt1_3.a
+                           ->  Seq Scan on prt1_p3 prt1_3
+                     ->  Sort
+                           Sort Key: p2_3.b, p2_3.a
+                           ->  Seq Scan on prt2_p3 p2_3
+(32 rows)
 
 SELECT a, b FROM prt1 FULL JOIN prt2 p2(b,a,c) USING(a,b)
   WHERE a BETWEEN 490 AND 510
diff --git a/src/test/regress/expected/union.out b/src/test/regress/expected/union.out
index dece7310cf..7ac4a9380e 100644
--- a/src/test/regress/expected/union.out
+++ b/src/test/regress/expected/union.out
@@ -1303,24 +1303,22 @@ select distinct q1 from
    union all
    select distinct * from int8_tbl i82) ss
 where q2 = q2;
-                        QUERY PLAN                        
-----------------------------------------------------------
- Unique
-   ->  Merge Append
-         Sort Key: "*SELECT* 1".q1
+                     QUERY PLAN                     
+----------------------------------------------------
+ HashAggregate
+   Group Key: "*SELECT* 1".q1
+   ->  Append
          ->  Subquery Scan on "*SELECT* 1"
-               ->  Unique
-                     ->  Sort
-                           Sort Key: i81.q1, i81.q2
-                           ->  Seq Scan on int8_tbl i81
-                                 Filter: (q2 IS NOT NULL)
+               ->  HashAggregate
+                     Group Key: i81.q1, i81.q2
+                     ->  Seq Scan on int8_tbl i81
+                           Filter: (q2 IS NOT NULL)
          ->  Subquery Scan on "*SELECT* 2"
-               ->  Unique
-                     ->  Sort
-                           Sort Key: i82.q1, i82.q2
-                           ->  Seq Scan on int8_tbl i82
-                                 Filter: (q2 IS NOT NULL)
-(15 rows)
+               ->  HashAggregate
+                     Group Key: i82.q1, i82.q2
+                     ->  Seq Scan on int8_tbl i82
+                           Filter: (q2 IS NOT NULL)
+(13 rows)
 
 select distinct q1 from
   (select distinct * from int8_tbl i81
@@ -1339,24 +1337,22 @@ select distinct q1 from
    union all
    select distinct * from int8_tbl i82) ss
 where -q1 = q2;
-                       QUERY PLAN                       
---------------------------------------------------------
- Unique
-   ->  Merge Append
-         Sort Key: "*SELECT* 1".q1
+                    QUERY PLAN                    
+--------------------------------------------------
+ HashAggregate
+   Group Key: "*SELECT* 1".q1
+   ->  Append
          ->  Subquery Scan on "*SELECT* 1"
-               ->  Unique
-                     ->  Sort
-                           Sort Key: i81.q1, i81.q2
-                           ->  Seq Scan on int8_tbl i81
-                                 Filter: ((- q1) = q2)
+               ->  HashAggregate
+                     Group Key: i81.q1, i81.q2
+                     ->  Seq Scan on int8_tbl i81
+                           Filter: ((- q1) = q2)
          ->  Subquery Scan on "*SELECT* 2"
-               ->  Unique
-                     ->  Sort
-                           Sort Key: i82.q1, i82.q2
-                           ->  Seq Scan on int8_tbl i82
-                                 Filter: ((- q1) = q2)
-(15 rows)
+               ->  HashAggregate
+                     Group Key: i82.q1, i82.q2
+                     ->  Seq Scan on int8_tbl i82
+                           Filter: ((- q1) = q2)
+(13 rows)
 
 select distinct q1 from
   (select distinct * from int8_tbl i81
diff --git a/src/test/regress/sql/aggregates.sql b/src/test/regress/sql/aggregates.sql
index 7cf86465e9..a97d405814 100644
--- a/src/test/regress/sql/aggregates.sql
+++ b/src/test/regress/sql/aggregates.sql
@@ -1010,6 +1010,105 @@ SELECT balk(hundred) FROM tenk1;
 
 ROLLBACK;
 
+-- GROUP BY optimization by reorder columns
+
+SELECT
+	i AS id,
+	i/2 AS p,
+	format('%60s', i%2) AS v,
+	i/4 AS c,
+	i/8 AS d,
+	(random() * (10000/8))::int as e --the same as d but no correlation with p
+	INTO btg
+FROM
+	generate_series(1, 10000) i;
+
+VACUUM btg;
+ANALYZE btg;
+
+-- GROUP BY optimization by reorder columns by frequency
+
+SET enable_hashagg=off;
+SET max_parallel_workers= 0;
+SET max_parallel_workers_per_gather = 0;
+
+EXPLAIN (COSTS off)
+SELECT count(*) FROM btg GROUP BY p, v;
+
+EXPLAIN (COSTS off)
+SELECT count(*) FROM btg GROUP BY v, p;
+
+EXPLAIN (COSTS off)
+SELECT count(*) FROM btg GROUP BY v, p, c;
+
+EXPLAIN (COSTS off)
+SELECT count(*) FROM btg GROUP BY v, p, c ORDER BY v, p, c;
+
+EXPLAIN (COSTS off)
+SELECT count(*) FROM btg GROUP BY v, p, d, c;
+
+EXPLAIN (COSTS off)
+SELECT count(*) FROM btg GROUP BY v, p, d, c ORDER BY v, p, d ,c;
+
+EXPLAIN (COSTS off)
+SELECT count(*) FROM btg GROUP BY v, p, d, c ORDER BY p, v, d ,c;
+
+EXPLAIN (COSTS off)
+SELECT count(*) FROM btg GROUP BY p, d, e;
+
+EXPLAIN (COSTS off)
+SELECT count(*) FROM btg GROUP BY p, e, d;
+
+CREATE STATISTICS btg_dep ON d, e, p FROM btg;
+ANALYZE btg;
+
+EXPLAIN (COSTS off)
+SELECT count(*) FROM btg GROUP BY p, d, e;
+
+EXPLAIN (COSTS off)
+SELECT count(*) FROM btg GROUP BY p, e, d;
+
+
+-- GROUP BY optimization by reorder columns by index scan
+
+CREATE INDEX ON btg(p, v);
+SET enable_seqscan=off;
+SET enable_bitmapscan=off;
+VACUUM btg;
+
+EXPLAIN (COSTS off)
+SELECT count(*) FROM btg GROUP BY p, v;
+
+EXPLAIN (COSTS off)
+SELECT count(*) FROM btg GROUP BY p, v ORDER BY p, v;
+
+EXPLAIN (COSTS off)
+SELECT count(*) FROM btg GROUP BY v, p;
+
+EXPLAIN (COSTS off)
+SELECT count(*) FROM btg GROUP BY v, p ORDER BY p, v;
+
+EXPLAIN (COSTS off)
+SELECT count(*) FROM btg GROUP BY v, p, c;
+
+EXPLAIN (COSTS off)
+SELECT count(*) FROM btg GROUP BY v, p, c ORDER BY p, v;
+
+EXPLAIN (COSTS off)
+SELECT count(*) FROM btg GROUP BY v, c, p, d;
+
+EXPLAIN (COSTS off)
+SELECT count(*) FROM btg GROUP BY v, c, p, d ORDER BY p, v;
+
+DROP TABLE btg;
+
+RESET enable_hashagg;
+RESET max_parallel_workers;
+RESET max_parallel_workers_per_gather;
+RESET enable_seqscan;
+RESET enable_bitmapscan;
+
+
 -- Secondly test the case of a parallel aggregate combiner function
 -- returning NULL. For that use normal transition function, but a
 -- combiner function returning NULL.
diff --git a/src/test/regress/sql/incremental_sort.sql b/src/test/regress/sql/incremental_sort.sql
index d8768a6b54..1de163e039 100644
--- a/src/test/regress/sql/incremental_sort.sql
+++ b/src/test/regress/sql/incremental_sort.sql
@@ -213,7 +213,7 @@ set parallel_tuple_cost = 0;
 set max_parallel_workers_per_gather = 2;
 
 create table t (a int, b int, c int);
-insert into t select mod(i,10),mod(i,10),i from generate_series(1,10000) s(i);
+insert into t select mod(i,10),mod(i,10),i from generate_series(1,60000) s(i);
 create index on t (a);
 analyze t;
 
-- 
2.25.1

Re: POC: GROUP BY optimization

Reply via email to