On Wed, 2019-08-28 at 12:52 -0700, Taylor Vesely wrote:
> Right now the patch always initializes 32 spill partitions. Have you
> given
> any thought into how to intelligently pick an optimal number of
> partitions yet?

Attached is a new patch that addresses this.

1. Divide hash table memory used by the number of groups in the hash
table to get the average memory used per group.
2. Multiply by the number of groups spilled -- which I pessimistically
estimate as the number of tuples spilled -- to get the total amount of
memory that we'd like to have to process all spilled tuples at once.
3. Divide the desired amount of memory by work_mem to get the number of
partitions we'd like to have such that each partition can be processed
in work_mem without spilling.
4. Apply a few sanity checks, fudge factors, and limits.

Using this runtime information should be substantially better than
using estimates and projections.
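
To make that concrete, here is a condensed sketch of the calculation in C
(the patch implements it in hash_spill_npartitions() and its callers in
nodeAgg.c; the helper name and signature below are invented for
illustration, the types are the usual postgres.h typedefs, and the cap that
keeps the partitions' write buffers within a fraction of work_mem is
omitted):

/*
 * Sketch only: estimate how many spill partitions to create. Step numbers
 * refer to the list above. The real code also limits the partition count so
 * the BLCKSZ write buffers stay within a fraction of work_mem, and
 * hash_spill_init() rounds the result up to a power of two.
 */
static int
estimate_spill_partitions(Size hash_mem_used, uint64 ngroups_in_table,
                          uint64 ngroups_spilled, int work_mem_kb)
{
    double      avg_group_mem;  /* step 1: average memory per group */
    double      mem_wanted;     /* step 2: memory to finish in one pass */
    int         npartitions;

    avg_group_mem = (double) hash_mem_used / (double) ngroups_in_table;

    /* step 2: pessimistically, every spilled tuple starts a new group */
    mem_wanted = 1.5 /* HASH_PARTITION_FACTOR */ *
        (double) ngroups_spilled * avg_group_mem;

    /* step 3: enough partitions that each should fit within work_mem */
    npartitions = 1 + (int) (mem_wanted / (work_mem_kb * 1024.0));

    /* step 4: sanity limits, HASH_MIN_PARTITIONS .. HASH_MAX_PARTITIONS */
    if (npartitions < 4)
        npartitions = 4;
    if (npartitions > 256)
        npartitions = 256;

    return npartitions;
}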

Additionally, I removed some branches from the common path. I think I
still have more work to do there.

I also rebased, of course, and fixed a few other things.

Regards,
        Jeff Davis
diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml
index d4d1fe45cc1..6ddbadb2abd 100644
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -1753,6 +1753,23 @@ include_dir 'conf.d'
       </listitem>
      </varlistentry>
 
+     <varlistentry id="guc-hashagg-mem-overflow" xreflabel="hashagg_mem_overflow">
+      <term><varname>hashagg_mem_overflow</varname> (<type>boolean</type>)
+      <indexterm>
+       <primary><varname>hashagg_mem_overflow</varname> configuration parameter</primary>
+      </indexterm>
+      </term>
+      <listitem>
+       <para>
+         If <varname>hashagg_mem_overflow</varname> is set to
+         <literal>on</literal> and hash aggregation exceeds
+         <varname>work_mem</varname> at query execution time, the executor
+         continues to consume memory rather than switching to disk-based
+         hash aggregation. The default is <literal>off</literal>.
+       </para>
+      </listitem>
+     </varlistentry>
+
      <varlistentry id="guc-max-stack-depth" xreflabel="max_stack_depth">
       <term><varname>max_stack_depth</varname> (<type>integer</type>)
       <indexterm>
@@ -4453,6 +4470,24 @@ ANY <replaceable class="parameter">num_sync</replaceable> ( <replaceable class="
       </listitem>
      </varlistentry>
 
+     <varlistentry id="guc-enable-hashagg-spill" xreflabel="enable_hashagg_spill">
+      <term><varname>enable_hashagg_spill</varname> (<type>boolean</type>)
+      <indexterm>
+       <primary><varname>enable_hashagg_spill</varname> configuration parameter</primary>
+      </indexterm>
+      </term>
+      <listitem>
+       <para>
+        Enables or disables the query planner's use of hashed aggregation plan
+        types when the memory usage is expected to
+        exceed <varname>work_mem</varname>. This only affects the planner
+        choice; actual behavior at execution time is dictated by
+        <xref linkend="guc-hashagg-mem-overflow"/>. The default
+        is <literal>on</literal>.
+       </para>
+      </listitem>
+     </varlistentry>
+
      <varlistentry id="guc-enable-hashjoin" xreflabel="enable_hashjoin">
       <term><varname>enable_hashjoin</varname> (<type>boolean</type>)
       <indexterm>
diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c
index 62fb3434a32..092a79ea14f 100644
--- a/src/backend/commands/explain.c
+++ b/src/backend/commands/explain.c
@@ -102,6 +102,7 @@ static void show_tablesample(TableSampleClause *tsc, PlanState *planstate,
 							 List *ancestors, ExplainState *es);
 static void show_sort_info(SortState *sortstate, ExplainState *es);
 static void show_hash_info(HashState *hashstate, ExplainState *es);
+static void show_hashagg_info(AggState *aggstate, ExplainState *es);
 static void show_tidbitmap_info(BitmapHeapScanState *planstate,
 								ExplainState *es);
 static void show_instrumentation_count(const char *qlabel, int which,
@@ -1826,6 +1827,8 @@ ExplainNode(PlanState *planstate, List *ancestors,
 		case T_Agg:
 			show_agg_keys(castNode(AggState, planstate), ancestors, es);
 			show_upper_qual(plan->qual, "Filter", planstate, ancestors, es);
+			if (es->analyze)
+				show_hashagg_info((AggState *) planstate, es);
 			if (plan->qual)
 				show_instrumentation_count("Rows Removed by Filter", 1,
 										   planstate, es);
@@ -2715,6 +2718,56 @@ show_hash_info(HashState *hashstate, ExplainState *es)
 	}
 }
 
+/*
+ * If EXPLAIN ANALYZE, show information on hash aggregate memory usage and
+ * batches.
+ */
+static void
+show_hashagg_info(AggState *aggstate, ExplainState *es)
+{
+	Agg		*agg	   = (Agg *)aggstate->ss.ps.plan;
+	long	 memPeakKb = (aggstate->hash_mem_peak + 1023) / 1024;
+	long	 diskKb    = (aggstate->hash_disk_used + 1023) / 1024;
+
+
+	Assert(IsA(aggstate, AggState));
+
+	if (agg->aggstrategy != AGG_HASHED &&
+		agg->aggstrategy != AGG_MIXED)
+		return;
+
+	if (es->format == EXPLAIN_FORMAT_TEXT)
+	{
+		appendStringInfoSpaces(es->str, es->indent * 2);
+		appendStringInfo(
+			es->str,
+			"Memory Usage: %ldkB",
+			memPeakKb);
+
+		if (aggstate->hash_batches_used > 0)
+		{
+			appendStringInfo(
+				es->str,
+				"  Batches: %d  Disk Usage: %ldkB",
+				aggstate->hash_batches_used, diskKb);
+		}
+
+		appendStringInfo(
+			es->str,
+			"\n");
+	}
+	else
+	{
+		ExplainPropertyInteger("Peak Memory Usage", "kB", memPeakKb, es);
+		if (aggstate->hash_batches_used > 0)
+		{
+			ExplainPropertyInteger("HashAgg Batches", NULL,
+								   aggstate->hash_batches_used, es);
+			ExplainPropertyInteger("Disk Usage", "kB", diskKb, es);
+		}
+	}
+}
+
 /*
  * If it's EXPLAIN ANALYZE, show exact/lossy pages for a BitmapHeapScan node
  */
diff --git a/src/backend/executor/execExprInterp.c b/src/backend/executor/execExprInterp.c
index dbed5978162..07ac8e96fdf 100644
--- a/src/backend/executor/execExprInterp.c
+++ b/src/backend/executor/execExprInterp.c
@@ -1603,14 +1603,14 @@ ExecInterpExpr(ExprState *state, ExprContext *econtext, bool *isnull)
 		{
 			AggState   *aggstate;
 			AggStatePerGroup pergroup;
+			AggStatePerGroup pergroup_allaggs;
 
 			aggstate = op->d.agg_init_trans.aggstate;
-			pergroup = &aggstate->all_pergroups
-				[op->d.agg_init_trans.setoff]
-				[op->d.agg_init_trans.transno];
+			pergroup_allaggs = aggstate->all_pergroups[op->d.agg_init_trans.setoff];
+			pergroup = &pergroup_allaggs[op->d.agg_init_trans.transno];
 
 			/* If transValue has not yet been initialized, do so now. */
-			if (pergroup->noTransValue)
+			if (pergroup_allaggs != NULL && pergroup->noTransValue)
 			{
 				AggStatePerTrans pertrans = op->d.agg_init_trans.pertrans;
 
@@ -1631,13 +1631,14 @@ ExecInterpExpr(ExprState *state, ExprContext *econtext, bool *isnull)
 		{
 			AggState   *aggstate;
 			AggStatePerGroup pergroup;
+			AggStatePerGroup pergroup_allaggs;
 
 			aggstate = op->d.agg_strict_trans_check.aggstate;
-			pergroup = &aggstate->all_pergroups
-				[op->d.agg_strict_trans_check.setoff]
-				[op->d.agg_strict_trans_check.transno];
+			pergroup_allaggs = aggstate->all_pergroups[op->d.agg_strict_trans_check.setoff];
+			pergroup = &pergroup_allaggs[op->d.agg_strict_trans_check.transno];
 
-			if (unlikely(pergroup->transValueIsNull))
+			if (pergroup_allaggs == NULL ||
+				unlikely(pergroup->transValueIsNull))
 				EEO_JUMP(op->d.agg_strict_trans_check.jumpnull);
 
 			EEO_NEXT();
@@ -1653,6 +1654,7 @@ ExecInterpExpr(ExprState *state, ExprContext *econtext, bool *isnull)
 			AggState   *aggstate;
 			AggStatePerTrans pertrans;
 			AggStatePerGroup pergroup;
+			AggStatePerGroup pergroup_allaggs;
 			FunctionCallInfo fcinfo;
 			MemoryContext oldContext;
 			Datum		newVal;
@@ -1660,9 +1662,11 @@ ExecInterpExpr(ExprState *state, ExprContext *econtext, bool *isnull)
 			aggstate = op->d.agg_trans.aggstate;
 			pertrans = op->d.agg_trans.pertrans;
 
-			pergroup = &aggstate->all_pergroups
-				[op->d.agg_trans.setoff]
-				[op->d.agg_trans.transno];
+			pergroup_allaggs = aggstate->all_pergroups[op->d.agg_trans.setoff];
+			pergroup = &pergroup_allaggs[op->d.agg_trans.transno];
+
+			if (pergroup_allaggs == NULL)
+				EEO_NEXT();
 
 			Assert(pertrans->transtypeByVal);
 
@@ -1704,6 +1708,7 @@ ExecInterpExpr(ExprState *state, ExprContext *econtext, bool *isnull)
 			AggState   *aggstate;
 			AggStatePerTrans pertrans;
 			AggStatePerGroup pergroup;
+			AggStatePerGroup pergroup_allaggs;
 			FunctionCallInfo fcinfo;
 			MemoryContext oldContext;
 			Datum		newVal;
@@ -1711,9 +1716,11 @@ ExecInterpExpr(ExprState *state, ExprContext *econtext, bool *isnull)
 			aggstate = op->d.agg_trans.aggstate;
 			pertrans = op->d.agg_trans.pertrans;
 
-			pergroup = &aggstate->all_pergroups
-				[op->d.agg_trans.setoff]
-				[op->d.agg_trans.transno];
+			pergroup_allaggs = aggstate->all_pergroups[op->d.agg_trans.setoff];
+			pergroup = &pergroup_allaggs[op->d.agg_trans.transno];
+
+			if (pergroup_allaggs == NULL)
+				EEO_NEXT();
 
 			Assert(!pertrans->transtypeByVal);
 
diff --git a/src/backend/executor/execGrouping.c b/src/backend/executor/execGrouping.c
index 7bc5e405bcc..7c831831b5d 100644
--- a/src/backend/executor/execGrouping.c
+++ b/src/backend/executor/execGrouping.c
@@ -25,7 +25,6 @@
 #include "utils/lsyscache.h"
 #include "utils/memutils.h"
 
-static uint32 TupleHashTableHash(struct tuplehash_hash *tb, const MinimalTuple tuple);
 static int	TupleHashTableMatch(struct tuplehash_hash *tb, const MinimalTuple tuple1, const MinimalTuple tuple2);
 
 /*
@@ -299,6 +298,28 @@ ResetTupleHashTable(TupleHashTable hashtable)
 TupleHashEntry
 LookupTupleHashEntry(TupleHashTable hashtable, TupleTableSlot *slot,
 					 bool *isnew)
+{
+	MemoryContext	oldContext;
+	uint32			hash;
+
+	/* Need to run the hash functions in short-lived context */
+	oldContext = MemoryContextSwitchTo(hashtable->tempcxt);
+
+	/* set up data needed by hash and match functions */
+	hashtable->inputslot = slot;
+	hashtable->in_hash_funcs = hashtable->tab_hash_funcs;
+	hashtable->cur_eq_func = hashtable->tab_eq_func;
+
+	hash = TupleHashTableHash(hashtable->hashtab, NULL);
+
+	MemoryContextSwitchTo(oldContext);
+
+	return LookupTupleHashEntryHash(hashtable, slot, isnew, hash);
+}
+
+TupleHashEntry
+LookupTupleHashEntryHash(TupleHashTable hashtable, TupleTableSlot *slot,
+						 bool *isnew, uint32 hash)
 {
 	TupleHashEntryData *entry;
 	MemoryContext oldContext;
@@ -317,7 +338,7 @@ LookupTupleHashEntry(TupleHashTable hashtable, TupleTableSlot *slot,
 
 	if (isnew)
 	{
-		entry = tuplehash_insert(hashtable->hashtab, key, &found);
+		entry = tuplehash_insert_hash(hashtable->hashtab, key, hash, &found);
 
 		if (found)
 		{
@@ -337,7 +358,7 @@ LookupTupleHashEntry(TupleHashTable hashtable, TupleTableSlot *slot,
 	}
 	else
 	{
-		entry = tuplehash_lookup(hashtable->hashtab, key);
+		entry = tuplehash_lookup_hash(hashtable->hashtab, key, hash);
 	}
 
 	MemoryContextSwitchTo(oldContext);
@@ -382,17 +403,12 @@ FindTupleHashEntry(TupleHashTable hashtable, TupleTableSlot *slot,
 /*
  * Compute the hash value for a tuple
  *
- * The passed-in key is a pointer to TupleHashEntryData.  In an actual hash
- * table entry, the firstTuple field points to a tuple (in MinimalTuple
- * format).  LookupTupleHashEntry sets up a dummy TupleHashEntryData with a
- * NULL firstTuple field --- that cues us to look at the inputslot instead.
- * This convention avoids the need to materialize virtual input tuples unless
- * they actually need to get copied into the table.
+ * If tuple is NULL, use the input slot instead.
  *
  * Also, the caller must select an appropriate memory context for running
  * the hash functions. (dynahash.c doesn't change CurrentMemoryContext.)
  */
-static uint32
+uint32
 TupleHashTableHash(struct tuplehash_hash *tb, const MinimalTuple tuple)
 {
 	TupleHashTable hashtable = (TupleHashTable) tb->private_data;
@@ -413,9 +429,6 @@ TupleHashTableHash(struct tuplehash_hash *tb, const MinimalTuple tuple)
 	{
 		/*
 		 * Process a tuple already stored in the table.
-		 *
-		 * (this case never actually occurs due to the way simplehash.h is
-		 * used, as the hash-value is stored in the entries)
 		 */
 		slot = hashtable->tableslot;
 		ExecStoreMinimalTuple(tuple, slot, false);
diff --git a/src/backend/executor/nodeAgg.c b/src/backend/executor/nodeAgg.c
index 6ee24eab3d2..a70151cf7da 100644
--- a/src/backend/executor/nodeAgg.c
+++ b/src/backend/executor/nodeAgg.c
@@ -194,6 +194,18 @@
  *	  transition values.  hashcontext is the single context created to support
  *	  all hash tables.
  *
+ *	  When the hash table memory exceeds work_mem, we advance the transition
+ *	  states only for groups already in the hash table. For tuples that would
+ *	  need to create new hash table entries (and initialize new transition
+ *	  states), we spill them to disk to be processed later. The tuples are
+ *	  spilled in a partitioned manner, so that subsequent batches are smaller
+ *	  and less likely to exceed work_mem (if a batch does exceed work_mem, it
+ *	  must be spilled recursively).
+ *
+ *	  Note that it's possible for transition states to start small but then
+ *	  grow very large; for instance in the case of ARRAY_AGG. In such cases,
+ *	  it's still possible to significantly exceed work_mem.
+ *
  *    Transition / Combine function invocation:
  *
  *    For performance reasons transition functions, including combine
@@ -229,15 +241,65 @@
 #include "optimizer/optimizer.h"
 #include "parser/parse_agg.h"
 #include "parser/parse_coerce.h"
+#include "storage/buffile.h"
 #include "utils/acl.h"
 #include "utils/builtins.h"
 #include "utils/datum.h"
+#include "utils/dynahash.h"
 #include "utils/expandeddatum.h"
 #include "utils/lsyscache.h"
 #include "utils/memutils.h"
 #include "utils/syscache.h"
 #include "utils/tuplesort.h"
 
+/*
+ * Control how many partitions are created when spilling HashAgg to
+ * disk.
+ *
+ * HASH_PARTITION_FACTOR is multiplied by the estimated number of partitions
+ * needed such that each partition will fit in memory. The factor is set
+ * higher than one because there's not a high cost to having a few too many
+ * partitions, and it makes it less likely that a partition will need to be
+ * spilled recursively. Another benefit of having more, smaller partitions is
+ * that small hash tables may perform better than large ones due to memory
+ * caching effects.
+ *
+ * HASH_PARTITION_MEM is the approximate amount of work_mem we should reserve
+ * for the partitions themselves (i.e. buffering of the files backing the
+ * partitions). This is an estimate, because we choose the number of
+ * partitions at the time we need to spill, and because this algorithm
+ * shouldn't depend too directly on the internal memory needs of a BufFile.
+ */
+#define HASH_PARTITION_FACTOR 1.50
+#define HASH_MIN_PARTITIONS 4
+#define HASH_MAX_PARTITIONS 256
+#define HASH_PARTITION_MEM (HASH_MIN_PARTITIONS * BLCKSZ)
+
+/*
+ * Represents partitioned spill data for a single hashtable.
+ */
+typedef struct HashAggSpill
+{
+	int       n_partitions;		/* number of output partitions */
+	int       partition_bits;	/* number of bits used for the partition
+								   mask, i.e. log2(n_partitions) */
+	BufFile **partitions;		/* output partition files */
+	int64    *ntuples;			/* number of tuples in each partition */
+} HashAggSpill;
+
+/*
+ * Represents work to be done for one pass of hash aggregation. Initially,
+ * only the input fields are set. If spilled to disk, also set the spill data.
+ */
+typedef struct HashAggBatch
+{
+	BufFile *input_file;		/* input partition */
+	int      input_bits;		/* number of bits for input partition mask */
+	int64    input_groups;		/* estimated number of input groups */
+	int		 setno;				/* grouping set */
+	HashAggSpill spill;			/* spill output */
+} HashAggBatch;
+
 static void select_current_set(AggState *aggstate, int setno, bool is_hash);
 static void initialize_phase(AggState *aggstate, int newphase);
 static TupleTableSlot *fetch_input_tuple(AggState *aggstate);
@@ -272,11 +334,27 @@ static TupleTableSlot *project_aggregates(AggState *aggstate);
 static Bitmapset *find_unaggregated_cols(AggState *aggstate);
 static bool find_unaggregated_cols_walker(Node *node, Bitmapset **colnos);
 static void build_hash_table(AggState *aggstate);
-static TupleHashEntryData *lookup_hash_entry(AggState *aggstate);
+static void prepare_hash_slot(AggState *aggstate);
+static uint32 calculate_hash(AggState *aggstate);
+static AggStatePerGroup lookup_hash_entry(AggState *aggstate, uint32 hash);
 static void lookup_hash_entries(AggState *aggstate);
 static TupleTableSlot *agg_retrieve_direct(AggState *aggstate);
 static void agg_fill_hash_table(AggState *aggstate);
+static bool agg_refill_hash_table(AggState *aggstate);
 static TupleTableSlot *agg_retrieve_hash_table(AggState *aggstate);
+static TupleTableSlot *agg_retrieve_hash_table_in_memory(AggState *aggstate);
+static void hash_spill_init(HashAggSpill *spill, int input_bits,
+							uint64 input_tuples, double hashentrysize);
+static Size hash_spill_tuple(HashAggSpill *spill, int input_bits,
+							 TupleTableSlot *slot, uint32 hash);
+static MinimalTuple hash_read_spilled(BufFile *file, uint32 *hashp);
+static HashAggBatch *hash_batch_new(BufFile *input_file, int setno,
+									int64 input_groups, int input_bits);
+static void hash_finish_initial_spills(AggState *aggstate);
+static void hash_spill_finish(AggState *aggstate, HashAggSpill *spill,
+							  int setno, int input_bits);
+static void hash_reset_spill(HashAggSpill *spill);
+static void hash_reset_spills(AggState *aggstate);
 static Datum GetAggInitVal(Datum textInitVal, Oid transtype);
 static void build_pertrans_for_aggref(AggStatePerTrans pertrans,
 									  AggState *aggstate, EState *estate,
@@ -1269,6 +1347,10 @@ build_hash_table(AggState *aggstate)
 
 	Assert(aggstate->aggstrategy == AGG_HASHED || aggstate->aggstrategy == AGG_MIXED);
 
+	/* TODO: work harder to find a good nGroups for each hash table. We don't
+	 * want the hash table itself to fill up work_mem with no room for
+	 * out-of-line transition values. Also, we need to consider that there are
+	 * multiple hash tables for grouping sets. */
 	additionalsize = aggstate->numtrans * sizeof(AggStatePerGroupData);
 
 	for (i = 0; i < aggstate->num_hashes; ++i)
@@ -1294,6 +1376,24 @@ build_hash_table(AggState *aggstate)
 														tmpmem,
 														DO_AGGSPLIT_SKIPFINAL(aggstate->aggsplit));
 	}
+
+	aggstate->hash_mem_current = MemoryContextMemAllocated(
+		aggstate->hashcontext->ecxt_per_tuple_memory, true);
+	aggstate->hash_ngroups_current = 0;
+
+	/*
+	 * Initialize the threshold at which we stop creating new hash entries and
+	 * start spilling. If an empty hash table exceeds the limit, increase the
+	 * limit to be the size of the empty hash table. This ensures that at
+	 * least one entry can be added so that the algorithm can make progress.
+	 */
+	if (hashagg_mem_overflow)
+		aggstate->hash_mem_limit = SIZE_MAX;
+	else
+		aggstate->hash_mem_limit = (work_mem * 1024L) - HASH_PARTITION_MEM;
+
+	if (aggstate->hash_mem_current > aggstate->hash_mem_limit)
+		aggstate->hash_mem_limit = aggstate->hash_mem_current;
 }
 
 /*
@@ -1454,23 +1554,13 @@ hash_agg_entry_size(int numAggs)
 	return entrysize;
 }
 
-/*
- * Find or create a hashtable entry for the tuple group containing the current
- * tuple (already set in tmpcontext's outertuple slot), in the current grouping
- * set (which the caller must have selected - note that initialize_aggregate
- * depends on this).
- *
- * When called, CurrentMemoryContext should be the per-query context.
- */
-static TupleHashEntryData *
-lookup_hash_entry(AggState *aggstate)
+static void
+prepare_hash_slot(AggState *aggstate)
 {
-	TupleTableSlot *inputslot = aggstate->tmpcontext->ecxt_outertuple;
-	AggStatePerHash perhash = &aggstate->perhash[aggstate->current_set];
-	TupleTableSlot *hashslot = perhash->hashslot;
-	TupleHashEntryData *entry;
-	bool		isnew;
-	int			i;
+	TupleTableSlot	*inputslot = aggstate->tmpcontext->ecxt_outertuple;
+	AggStatePerHash	 perhash   = &aggstate->perhash[aggstate->current_set];
+	TupleTableSlot	*hashslot  = perhash->hashslot;
+	int				 i;
 
 	/* transfer just the needed columns into hashslot */
 	slot_getsomeattrs(inputslot, perhash->largestGrpColIdx);
@@ -1484,14 +1574,71 @@ lookup_hash_entry(AggState *aggstate)
 		hashslot->tts_isnull[i] = inputslot->tts_isnull[varNumber];
 	}
 	ExecStoreVirtualTuple(hashslot);
+}
+
+static uint32
+calculate_hash(AggState *aggstate)
+{
+	AggStatePerHash	 perhash   = &aggstate->perhash[aggstate->current_set];
+	TupleHashTable	 hashtable = perhash->hashtable;
+	MemoryContext	 oldContext;
+	uint32			 hash;
+
+	/* set up data needed by hash and match functions */
+	hashtable->inputslot = perhash->hashslot;
+	hashtable->in_hash_funcs = hashtable->tab_hash_funcs;
+	hashtable->cur_eq_func = hashtable->tab_eq_func;
+
+	/* Need to run the hash functions in short-lived context */
+	oldContext = MemoryContextSwitchTo(hashtable->tempcxt);
+
+	hash = TupleHashTableHash(hashtable->hashtab, NULL);
+
+	MemoryContextSwitchTo(oldContext);
+
+	return hash;
+}
+
+/*
+ * Find or create a hashtable entry for the tuple group containing the current
+ * tuple (already set in tmpcontext's outertuple slot), in the current grouping
+ * set (which the caller must have selected - note that initialize_aggregate
+ * depends on this).
+ *
+ * When called, CurrentMemoryContext should be the per-query context.
+ */
+static AggStatePerGroup
+lookup_hash_entry(AggState *aggstate, uint32 hash)
+{
+	AggStatePerHash perhash = &aggstate->perhash[aggstate->current_set];
+	TupleTableSlot *hashslot = perhash->hashslot;
+	TupleHashEntryData *entry;
+	bool		isnew = false;
+	bool	   *p_isnew;
+
+	/* if hash table memory limit is exceeded, don't create new entries */
+	p_isnew = (aggstate->hash_mem_current > aggstate->hash_mem_limit) ?
+		NULL : &isnew;
 
 	/* find or create the hashtable entry using the filtered tuple */
-	entry = LookupTupleHashEntry(perhash->hashtable, hashslot, &isnew);
+	entry = LookupTupleHashEntryHash(perhash->hashtable, hashslot, p_isnew,
+									 hash);
+
+	if (entry == NULL)
+		return NULL;
 
 	if (isnew)
 	{
-		AggStatePerGroup pergroup;
-		int			transno;
+		AggStatePerGroup	pergroup;
+		int					transno;
+
+		aggstate->hash_ngroups_current++;
+
+		aggstate->hash_mem_current = MemoryContextMemAllocated(
+			aggstate->hashcontext->ecxt_per_tuple_memory, true);
+
+		if (aggstate->hash_mem_current > aggstate->hash_mem_peak)
+			aggstate->hash_mem_peak = aggstate->hash_mem_current;
 
 		pergroup = (AggStatePerGroup)
 			MemoryContextAlloc(perhash->hashtable->tablecxt,
@@ -1511,7 +1658,7 @@ lookup_hash_entry(AggState *aggstate)
 		}
 	}
 
-	return entry;
+	return entry->additional;
 }
 
 /*
@@ -1519,18 +1666,49 @@ lookup_hash_entry(AggState *aggstate)
  * returning an array of pergroup pointers suitable for advance_aggregates.
  *
  * Be aware that lookup_hash_entry can reset the tmpcontext.
+ *
+ * If a hash table exceeds its memory limit, the tuple is spilled to disk.
  */
 static void
 lookup_hash_entries(AggState *aggstate)
 {
-	int			numHashes = aggstate->num_hashes;
 	AggStatePerGroup *pergroup = aggstate->hash_pergroup;
 	int			setno;
 
-	for (setno = 0; setno < numHashes; setno++)
+	for (setno = 0; setno < aggstate->num_hashes; setno++)
 	{
+		AggStatePerHash perhash = &aggstate->perhash[setno];
+		uint32			hash;
+
 		select_current_set(aggstate, setno, true);
-		pergroup[setno] = lookup_hash_entry(aggstate)->additional;
+		prepare_hash_slot(aggstate);
+		hash = calculate_hash(aggstate);
+		pergroup[setno] = lookup_hash_entry(aggstate, hash);
+
+		if (pergroup[setno] == NULL)
+		{
+			HashAggSpill *spill;
+			TupleTableSlot *slot = aggstate->tmpcontext->ecxt_outertuple;
+			double			hashentrysize = 0;
+
+			/* average memory cost per entry */
+			if (aggstate->hash_ngroups_current > 0)
+				hashentrysize = (double)aggstate->hash_mem_current /
+					(double)aggstate->hash_ngroups_current;
+
+			if (aggstate->hash_spills == NULL)
+				aggstate->hash_spills = palloc0(
+					sizeof(HashAggSpill) * aggstate->num_hashes);
+			aggstate->hash_spilled = true;
+
+			spill = &aggstate->hash_spills[setno];
+
+			if (spill->partitions == NULL)
+				hash_spill_init(spill, 0, perhash->aggnode->numGroups,
+								hashentrysize);
+
+			aggstate->hash_disk_used += hash_spill_tuple(spill, 0, slot, hash);
+		}
 	}
 }
 
@@ -1852,6 +2030,10 @@ agg_retrieve_direct(AggState *aggstate)
 					outerslot = fetch_input_tuple(aggstate);
 					if (TupIsNull(outerslot))
 					{
+						if (aggstate->aggstrategy == AGG_MIXED &&
+							aggstate->current_phase == 1)
+							hash_finish_initial_spills(aggstate);
+
 						/* no more outer-plan tuples available */
 						if (hasGroupingSets)
 						{
@@ -1955,6 +2137,8 @@ agg_fill_hash_table(AggState *aggstate)
 		ResetExprContext(aggstate->tmpcontext);
 	}
 
+	hash_finish_initial_spills(aggstate);
+
 	aggstate->table_filled = true;
 	/* Initialize to walk the first hash table */
 	select_current_set(aggstate, 0, true);
@@ -1962,11 +2146,149 @@ agg_fill_hash_table(AggState *aggstate)
 						   &aggstate->perhash[0].hashiter);
 }
 
+/*
+ * If any data was spilled during hash aggregation, reset the hash table and
+ * reprocess one batch of spilled data. After reprocessing a batch, the hash
+ * table will again contain data, ready to be consumed by
+ * agg_retrieve_hash_table_in_memory().
+ *
+ * Should only be called after all in-memory hash table entries have been
+ * consumed.
+ *
+ * Return false when input is exhausted and there's no more work to be done;
+ * otherwise return true.
+ */
+static bool
+agg_refill_hash_table(AggState *aggstate)
+{
+	HashAggBatch	*batch;
+	AggStatePerGroup *pergroup;
+
+	if (aggstate->hash_batches == NIL)
+		return false;
+
+	pergroup = aggstate->all_pergroups;
+	while(pergroup != aggstate->hash_pergroup) {
+		*pergroup = NULL;
+		pergroup++;
+	}
+
+	/* free memory */
+	ReScanExprContext(aggstate->hashcontext);
+	/* Rebuild an empty hash table */
+	build_hash_table(aggstate);
+
+	batch = linitial(aggstate->hash_batches);
+	aggstate->hash_batches = list_delete_first(aggstate->hash_batches);
+
+	Assert(aggstate->current_phase == 0);
+
+	/*
+	 * TODO: what should be done here to set up for advance_aggregates?
+	 */
+	if (aggstate->phase->aggstrategy == AGG_MIXED)
+	{
+		aggstate->current_phase = 1;
+		aggstate->phase = &aggstate->phases[aggstate->current_phase];
+	}
+
+	for (;;) {
+		TupleTableSlot	*slot = aggstate->hash_spill_slot;
+		MinimalTuple	 tuple;
+		uint32			 hash;
+
+		CHECK_FOR_INTERRUPTS();
+
+		tuple = hash_read_spilled(batch->input_file, &hash);
+		if (tuple == NULL)
+			break;
+
+		/*
+		 * TODO: Should we re-compile the expressions to use a minimal tuple
+		 * slot so that we don't have to create the virtual tuple here? If we
+		 * project the tuple before writing, then perhaps this is not
+		 * important.
+		 */
+		ExecForceStoreMinimalTuple(tuple, slot, true);
+		aggstate->tmpcontext->ecxt_outertuple = slot;
+
+		/* Find or build hashtable entries */
+		memset(aggstate->hash_pergroup, 0,
+			   sizeof(AggStatePerGroup) * aggstate->num_hashes);
+		select_current_set(aggstate, batch->setno, true);
+		prepare_hash_slot(aggstate);
+		aggstate->hash_pergroup[batch->setno] = lookup_hash_entry(aggstate, hash);
+		if (aggstate->hash_pergroup[batch->setno] == NULL)
+		{
+			double			hashentrysize = 0;
+
+			/* average memory cost per entry */
+			if (aggstate->hash_ngroups_current > 0)
+				hashentrysize = (double)aggstate->hash_mem_current /
+					(double)aggstate->hash_ngroups_current;
+
+			if (batch->spill.partitions == NULL)
+				hash_spill_init(&batch->spill, batch->input_bits,
+								batch->input_groups, hashentrysize);
+
+			aggstate->hash_disk_used += hash_spill_tuple(
+				&batch->spill, batch->input_bits, slot, hash);
+		}
+
+		/* Advance the aggregates (or combine functions) */
+		advance_aggregates(aggstate);
+
+		/*
+		 * Reset per-input-tuple context after each tuple, but note that the
+		 * hash lookups do this too
+		 */
+		ResetExprContext(aggstate->tmpcontext);
+	}
+
+	BufFileClose(batch->input_file);
+
+	aggstate->current_phase = 0;
+	aggstate->phase = &aggstate->phases[aggstate->current_phase];
+
+	hash_spill_finish(aggstate, &batch->spill, batch->setno,
+					  batch->input_bits);
+
+	pfree(batch);
+
+	/* Initialize to walk the first hash table */
+	select_current_set(aggstate, 0, true);
+	ResetTupleHashIterator(aggstate->perhash[0].hashtable,
+						   &aggstate->perhash[0].hashiter);
+
+	return true;
+}
+
 /*
  * ExecAgg for hashed case: retrieving groups from hash table
  */
 static TupleTableSlot *
 agg_retrieve_hash_table(AggState *aggstate)
+{
+	TupleTableSlot *result = NULL;
+
+	while (result == NULL)
+	{
+		result = agg_retrieve_hash_table_in_memory(aggstate);
+		if (result == NULL)
+		{
+			if (!agg_refill_hash_table(aggstate))
+			{
+				aggstate->agg_done = true;
+				break;
+			}
+		}
+	}
+
+	return result;
+}
+
+static TupleTableSlot *
+agg_retrieve_hash_table_in_memory(AggState *aggstate)
 {
 	ExprContext *econtext;
 	AggStatePerAgg peragg;
@@ -1995,7 +2317,7 @@ agg_retrieve_hash_table(AggState *aggstate)
 	 * We loop retrieving groups until we find one satisfying
 	 * aggstate->ss.ps.qual
 	 */
-	while (!aggstate->agg_done)
+	for (;;)
 	{
 		TupleTableSlot *hashslot = perhash->hashslot;
 		int			i;
@@ -2026,8 +2348,6 @@ agg_retrieve_hash_table(AggState *aggstate)
 			}
 			else
 			{
-				/* No more hashtables, so done */
-				aggstate->agg_done = true;
 				return NULL;
 			}
 		}
@@ -2084,6 +2404,322 @@ agg_retrieve_hash_table(AggState *aggstate)
 	return NULL;
 }
 
+/*
+ * Determine the number of partitions to create when spilling.
+ */
+static int
+hash_spill_npartitions(uint64 input_tuples, double hashentrysize)
+{
+	Size	mem_needed;
+	int		partition_limit;
+	int		npartitions;
+
+	/*
+	 * Avoid creating so many partitions that the memory requirements of the
+	 * open partition files (estimated at BLCKSZ for buffering) are greater
+	 * than 1/4 of work_mem.
+	 */
+	partition_limit = (work_mem * 1024L * 0.25) / BLCKSZ;
+
+	/* pessimistically estimate that each input tuple creates a new group */
+	mem_needed = HASH_PARTITION_FACTOR * input_tuples * hashentrysize;
+
+	/* make enough partitions so that each one is likely to fit in memory */
+	npartitions = 1 + (mem_needed / (work_mem * 1024L));
+
+	if (npartitions > partition_limit)
+		npartitions = partition_limit;
+
+	if (npartitions < HASH_MIN_PARTITIONS)
+		npartitions = HASH_MIN_PARTITIONS;
+	if (npartitions > HASH_MAX_PARTITIONS)
+		npartitions = HASH_MAX_PARTITIONS;
+
+	return npartitions;
+}
+
+/*
+ * hash_spill_init
+ *
+ * Called after we determined that spilling is necessary. Chooses the number
+ * of partitions to create, and initializes them.
+ */
+static void
+hash_spill_init(HashAggSpill *spill, int input_bits, uint64 input_tuples,
+				double hashentrysize)
+{
+	int     npartitions;
+	int     partition_bits;
+
+	npartitions = hash_spill_npartitions(input_tuples, hashentrysize);
+	partition_bits = my_log2(npartitions);
+
+	/* make sure that we don't exhaust the hash bits */
+	if (partition_bits + input_bits >= 32)
+		partition_bits = 32 - input_bits;
+
+	/* number of partitions will be a power of two */
+	npartitions = 1L << partition_bits;
+
+	spill->partition_bits = partition_bits;
+	spill->n_partitions = npartitions;
+	spill->partitions = palloc0(sizeof(BufFile *) * npartitions);
+	spill->ntuples = palloc0(sizeof(int64) * npartitions);
+}
+
+/*
+ * hash_spill_tuple
+ *
+ * Not enough memory to add tuple as new entry in hash table. Save for later
+ * in the appropriate partition.
+ */
+static Size
+hash_spill_tuple(HashAggSpill *spill, int input_bits, TupleTableSlot *slot,
+				 uint32 hash)
+{
+	int					 partition;
+	MinimalTuple		 tuple;
+	BufFile				*file;
+	int					 written;
+	int					 total_written = 0;
+	bool				 shouldFree;
+
+	Assert(spill->partitions != NULL);
+
+	/*
+	 * TODO: should we project only needed attributes from the tuple before
+	 * writing it?
+	 */
+	tuple = ExecFetchSlotMinimalTuple(slot, &shouldFree);
+
+	if (spill->partition_bits == 0)
+		partition = 0;
+	else
+		partition = (hash << input_bits) >>
+			(32 - spill->partition_bits);
+
+	spill->ntuples[partition]++;
+
+	/*
+	 * TODO: use logtape.c instead?
+	 */
+	if (spill->partitions[partition] == NULL)
+		spill->partitions[partition] = BufFileCreateTemp(false);
+	file = spill->partitions[partition];
+
+
+	written = BufFileWrite(file, (void *) &hash, sizeof(uint32));
+	if (written != sizeof(uint32))
+		ereport(ERROR,
+				(errcode_for_file_access(),
+				 errmsg("could not write to HashAgg temporary file: %m")));
+	total_written += written;
+
+	written = BufFileWrite(file, (void *) tuple, tuple->t_len);
+	if (written != tuple->t_len)
+		ereport(ERROR,
+				(errcode_for_file_access(),
+				 errmsg("could not write to HashAgg temporary file: %m")));
+	total_written += written;
+
+	if (shouldFree)
+		pfree(tuple);
+
+	return total_written;
+}
+
+/*
+ * hash_read_spilled
+ * 		read the next tuple from a batch file.  Return NULL if no more.
+ */
+static MinimalTuple
+hash_read_spilled(BufFile *file, uint32 *hashp)
+{
+	MinimalTuple	tuple;
+	uint32			t_len;
+	size_t			nread;
+	uint32			hash;
+
+	nread = BufFileRead(file, &hash, sizeof(uint32));
+	if (nread == 0)
+		return NULL;
+	if (nread != sizeof(uint32))
+		ereport(ERROR,
+				(errcode_for_file_access(),
+				 errmsg("could not read from HashAgg temporary file: %m")));
+	if (hashp != NULL)
+		*hashp = hash;
+
+	nread = BufFileRead(file, &t_len, sizeof(t_len));
+	if (nread != sizeof(uint32))
+		ereport(ERROR,
+				(errcode_for_file_access(),
+				 errmsg("could not read from HashAgg temporary file: %m")));
+
+	tuple = (MinimalTuple) palloc(t_len);
+	tuple->t_len = t_len;
+
+	nread = BufFileRead(file, (void *)((char *)tuple + sizeof(uint32)),
+						t_len - sizeof(uint32));
+	if (nread != t_len - sizeof(uint32))
+		ereport(ERROR,
+				(errcode_for_file_access(),
+				 errmsg("could not read from HashAgg temporary file: %m")));
+
+	return tuple;
+}
+
+/*
+ * hash_batch_new
+ *
+ * Construct a HashAggBatch item, which represents one iteration of HashAgg to
+ * be done. Should be called in the aggregate's memory context.
+ */
+static HashAggBatch *
+hash_batch_new(BufFile *input_file, int setno, int64 input_groups,
+			   int input_bits)
+{
+	HashAggBatch *batch = palloc0(sizeof(HashAggBatch));
+
+	batch->input_file = input_file;
+	batch->input_bits = input_bits;
+	batch->input_groups = input_groups;
+	batch->setno = setno;
+
+	/* batch->spill will be set only after spilling this batch */
+
+	return batch;
+}
+
+/*
+ * hash_finish_initial_spills
+ *
+ * After the initial pass over the input, the hash tables may have spilled
+ * tuples to disk. If so, turn each spilled partition into a new batch that
+ * must later be processed.
+ */
+static void
+hash_finish_initial_spills(AggState *aggstate)
+{
+	int setno;
+
+	if (aggstate->hash_spills == NULL)
+		return;
+
+	for (setno = 0; setno < aggstate->num_hashes; setno++)
+		hash_spill_finish(aggstate, &aggstate->hash_spills[setno], setno, 0);
+
+	pfree(aggstate->hash_spills);
+	aggstate->hash_spills = NULL;
+}
+
+/*
+ * hash_spill_finish
+ *
+ * Transform the spilled partitions into new batches to be processed later.
+ */
+static void
+hash_spill_finish(AggState *aggstate, HashAggSpill *spill, int setno, int input_bits)
+{
+	int i;
+
+	if (spill->n_partitions == 0)
+		return;	/* didn't spill */
+
+	for (i = 0; i < spill->n_partitions; i++)
+	{
+		BufFile         *file = spill->partitions[i];
+		MemoryContext    oldContext;
+		HashAggBatch    *new_batch;
+		int64            input_ngroups;
+
+		/* partition is empty */
+		if (file == NULL)
+			continue;
+
+		/* rewind file for reading */
+		if (BufFileSeek(file, 0, 0L, SEEK_SET))
+			ereport(ERROR,
+					(errcode_for_file_access(),
+					 errmsg("could not rewind HashAgg temporary file: %m")));
+
+		/*
+		 * Estimate the number of input groups for this new work item as the
+		 * total number of tuples in its input file. Although that's a worst
+		 * case, it's not bad here for two reasons: (1) overestimating is
+		 * better than underestimating; and (2) we've already scanned the
+		 * relation once, so it's likely that we've already finalized many of
+		 * the common values.
+		 */
+		input_ngroups = spill->ntuples[i];
+
+		oldContext = MemoryContextSwitchTo(aggstate->ss.ps.state->es_query_cxt);
+		new_batch = hash_batch_new(file, setno, input_ngroups,
+								   spill->partition_bits + input_bits);
+		aggstate->hash_batches = lappend(aggstate->hash_batches, new_batch);
+		aggstate->hash_batches_used++;
+		MemoryContextSwitchTo(oldContext);
+	}
+
+	pfree(spill->ntuples);
+	pfree(spill->partitions);
+}
+
+/*
+ * Clear a HashAggSpill, free its memory, and close its files.
+ */
+static void
+hash_reset_spill(HashAggSpill *spill)
+{
+	int i;
+	for (i = 0; i < spill->n_partitions; i++)
+	{
+		BufFile         *file = spill->partitions[i];
+
+		if (file != NULL)
+			BufFileClose(file);
+	}
+	if (spill->ntuples != NULL)
+		pfree(spill->ntuples);
+	if (spill->partitions != NULL)
+		pfree(spill->partitions);
+}
+
+/*
+ * Find and reset all active HashAggSpills.
+ */
+static void
+hash_reset_spills(AggState *aggstate)
+{
+	ListCell *lc;
+
+	if (aggstate->hash_spills != NULL)
+	{
+		int setno;
+
+		for (setno = 0; setno < aggstate->num_hashes; setno++)
+			hash_reset_spill(&aggstate->hash_spills[setno]);
+
+		pfree(aggstate->hash_spills);
+		aggstate->hash_spills = NULL;
+	}
+
+	foreach(lc, aggstate->hash_batches)
+	{
+		HashAggBatch *batch = (HashAggBatch*) lfirst(lc);
+		if (batch->input_file != NULL)
+		{
+			BufFileClose(batch->input_file);
+			batch->input_file = NULL;
+		}
+		hash_reset_spill(&batch->spill);
+		pfree(batch);
+	}
+	list_free(aggstate->hash_batches);
+	aggstate->hash_batches = NIL;
+}
+
+
 /* -----------------
  * ExecInitAgg
  *
@@ -2268,6 +2904,10 @@ ExecInitAgg(Agg *node, EState *estate, int eflags)
 			aggstate->ss.ps.outeropsfixed = false;
 	}
 
+	if (use_hashing)
+		aggstate->hash_spill_slot = ExecInitExtraTupleSlot(estate, scanDesc,
+														   &TTSOpsVirtual);
+
 	/*
 	 * Initialize result type, slot and projection.
 	 */
@@ -3398,6 +4038,8 @@ ExecEndAgg(AggState *node)
 	if (node->sort_out)
 		tuplesort_end(node->sort_out);
 
+	hash_reset_spills(node);
+
 	for (transno = 0; transno < node->numtrans; transno++)
 	{
 		AggStatePerTrans pertrans = &node->pertrans[transno];
@@ -3453,12 +4095,13 @@ ExecReScanAgg(AggState *node)
 			return;
 
 		/*
-		 * If we do have the hash table, and the subplan does not have any
-		 * parameter changes, and none of our own parameter changes affect
-		 * input expressions of the aggregated functions, then we can just
-		 * rescan the existing hash table; no need to build it again.
+		 * If we do have the hash table, and it never spilled, and the subplan
+		 * does not have any parameter changes, and none of our own parameter
+		 * changes affect input expressions of the aggregated functions, then
+		 * we can just rescan the existing hash table; no need to build it
+		 * again.
 		 */
-		if (outerPlan->chgParam == NULL &&
+		if (outerPlan->chgParam == NULL && !node->hash_spilled &&
 			!bms_overlap(node->ss.ps.chgParam, aggnode->aggParams))
 		{
 			ResetTupleHashIterator(node->perhash[0].hashtable,
@@ -3515,6 +4158,17 @@ ExecReScanAgg(AggState *node)
 	 */
 	if (node->aggstrategy == AGG_HASHED || node->aggstrategy == AGG_MIXED)
 	{
+		hash_reset_spills(node);
+
+		node->hash_spilled = false;
+		node->hash_mem_current = 0;
+		node->hash_ngroups_current = 0;
+
+		/* reset stats */
+		node->hash_mem_peak = 0;
+		node->hash_disk_used = 0;
+		node->hash_batches_used = 0;
+
 		ReScanExprContext(node->hashcontext);
 		/* Rebuild an empty hash table */
 		build_hash_table(node);
diff --git a/src/backend/jit/llvm/llvmjit_expr.c b/src/backend/jit/llvm/llvmjit_expr.c
index a9d362100a8..f0f742eebf5 100644
--- a/src/backend/jit/llvm/llvmjit_expr.c
+++ b/src/backend/jit/llvm/llvmjit_expr.c
@@ -2093,12 +2093,14 @@ llvm_compile_expr(ExprState *state)
 					LLVMValueRef v_allpergroupsp;
 
 					LLVMValueRef v_pergroupp;
+					LLVMValueRef v_pergroup_allaggs;
 
 					LLVMValueRef v_setoff,
 								v_transno;
 
 					LLVMValueRef v_notransvalue;
 
+					LLVMBasicBlockRef b_check_notransvalue;
 					LLVMBasicBlockRef b_init;
 
 					aggstate = op->d.agg_init_trans.aggstate;
@@ -2120,11 +2122,22 @@ llvm_compile_expr(ExprState *state)
 										  "aggstate.all_pergroups");
 					v_setoff = l_int32_const(op->d.agg_init_trans.setoff);
 					v_transno = l_int32_const(op->d.agg_init_trans.transno);
-					v_pergroupp =
-						LLVMBuildGEP(b,
-									 l_load_gep1(b, v_allpergroupsp, v_setoff, ""),
-									 &v_transno, 1, "");
+					v_pergroup_allaggs = l_load_gep1(b, v_allpergroupsp, v_setoff, "");
 
+					b_check_notransvalue = l_bb_before_v(
+						opblocks[i + 1], "op.%d.check_notransvalue", i);
+
+					LLVMBuildCondBr(b,
+									LLVMBuildICmp(b, LLVMIntEQ,
+												  LLVMBuildPtrToInt(b, v_pergroup_allaggs,
+																	TypeSizeT, ""),
+												  l_sizet_const(0), ""),
+									opblocks[i + 1],
+									b_check_notransvalue);
+
+					LLVMPositionBuilderAtEnd(b, b_check_notransvalue);
+
+					v_pergroupp = LLVMBuildGEP(b, v_pergroup_allaggs, &v_transno, 1, "");
 					v_notransvalue =
 						l_load_struct_gep(b, v_pergroupp,
 										  FIELDNO_AGGSTATEPERGROUPDATA_NOTRANSVALUE,
@@ -2191,6 +2204,9 @@ llvm_compile_expr(ExprState *state)
 
 					LLVMValueRef v_transnull;
 					LLVMValueRef v_pergroupp;
+					LLVMValueRef v_pergroup_allaggs;
+
+					LLVMBasicBlockRef b_check_transnull;
 
 					int			jumpnull = op->d.agg_strict_trans_check.jumpnull;
 
@@ -2210,11 +2226,22 @@ llvm_compile_expr(ExprState *state)
 						l_int32_const(op->d.agg_strict_trans_check.setoff);
 					v_transno =
 						l_int32_const(op->d.agg_strict_trans_check.transno);
-					v_pergroupp =
-						LLVMBuildGEP(b,
-									 l_load_gep1(b, v_allpergroupsp, v_setoff, ""),
-									 &v_transno, 1, "");
+					v_pergroup_allaggs = l_load_gep1(b, v_allpergroupsp, v_setoff, "");
+
+					b_check_transnull = l_bb_before_v(opblocks[i + 1],
+													  "op.%d.check_transnull", i);
 
+					LLVMBuildCondBr(b,
+									LLVMBuildICmp(b, LLVMIntEQ,
+												  LLVMBuildPtrToInt(b, v_pergroup_allaggs,
+																	TypeSizeT, ""),
+												  l_sizet_const(0), ""),
+									opblocks[jumpnull],
+									b_check_transnull);
+
+					LLVMPositionBuilderAtEnd(b, b_check_transnull);
+
+					v_pergroupp = LLVMBuildGEP(b, v_pergroup_allaggs, &v_transno, 1, "");
 					v_transnull =
 						l_load_struct_gep(b, v_pergroupp,
 										  FIELDNO_AGGSTATEPERGROUPDATA_TRANSVALUEISNULL,
@@ -2256,12 +2283,15 @@ llvm_compile_expr(ExprState *state)
 					LLVMValueRef v_pertransp;
 
 					LLVMValueRef v_pergroupp;
+					LLVMValueRef v_pergroup_allaggs;
 
 					LLVMValueRef v_retval;
 
 					LLVMValueRef v_tmpcontext;
 					LLVMValueRef v_oldcontext;
 
+					LLVMBasicBlockRef b_advance_transval;
+
 					aggstate = op->d.agg_trans.aggstate;
 					pertrans = op->d.agg_trans.pertrans;
 
@@ -2283,10 +2313,22 @@ llvm_compile_expr(ExprState *state)
 										  "aggstate.all_pergroups");
 					v_setoff = l_int32_const(op->d.agg_trans.setoff);
 					v_transno = l_int32_const(op->d.agg_trans.transno);
-					v_pergroupp =
-						LLVMBuildGEP(b,
-									 l_load_gep1(b, v_allpergroupsp, v_setoff, ""),
-									 &v_transno, 1, "");
+					v_pergroup_allaggs = l_load_gep1(b, v_allpergroupsp, v_setoff, "");
+
+					b_advance_transval = l_bb_before_v(opblocks[i + 1],
+													   "op.%d.advance_transval", i);
+
+					LLVMBuildCondBr(b,
+									LLVMBuildICmp(b, LLVMIntEQ,
+												  LLVMBuildPtrToInt(b, v_pergroup_allaggs,
+																	TypeSizeT, ""),
+												  l_sizet_const(0), ""),
+									opblocks[i + 1],
+									b_advance_transval);
+
+					LLVMPositionBuilderAtEnd(b, b_advance_transval);
+
+					v_pergroupp = LLVMBuildGEP(b, v_pergroup_allaggs, &v_transno, 1, "");
 
 					v_fcinfo = l_ptr_const(fcinfo,
 										   l_ptr(StructFunctionCallInfoData));
diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c
index c5f65934859..3f0d2899635 100644
--- a/src/backend/optimizer/path/costsize.c
+++ b/src/backend/optimizer/path/costsize.c
@@ -128,6 +128,7 @@ bool		enable_bitmapscan = true;
 bool		enable_tidscan = true;
 bool		enable_sort = true;
 bool		enable_hashagg = true;
+bool		enable_hashagg_spill = true;
 bool		enable_nestloop = true;
 bool		enable_material = true;
 bool		enable_mergejoin = true;
diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c
index 7fe11b59a02..511f8861a8f 100644
--- a/src/backend/optimizer/plan/planner.c
+++ b/src/backend/optimizer/plan/planner.c
@@ -4255,6 +4255,9 @@ consider_groupingsets_paths(PlannerInfo *root,
 		 * gd->rollups is empty if we have only unsortable columns to work
 		 * with.  Override work_mem in that case; otherwise, we'll rely on the
 		 * sorted-input case to generate usable mixed paths.
+		 *
+		 * TODO: think more about how to plan grouping sets when spilling hash
+		 * tables is an option
 		 */
 		if (hashsize > work_mem * 1024L && gd->rollups)
 			return;				/* nope, won't fit */
@@ -6527,7 +6530,8 @@ add_paths_to_grouping_rel(PlannerInfo *root, RelOptInfo *input_rel,
 			 * were unable to sort above, then we'd better generate a Path, so
 			 * that we at least have one.
 			 */
-			if (hashaggtablesize < work_mem * 1024L ||
+			if (enable_hashagg_spill ||
+				hashaggtablesize < work_mem * 1024L ||
 				grouped_rel->pathlist == NIL)
 			{
 				/*
@@ -6560,7 +6564,8 @@ add_paths_to_grouping_rel(PlannerInfo *root, RelOptInfo *input_rel,
 														  agg_final_costs,
 														  dNumGroups);
 
-			if (hashaggtablesize < work_mem * 1024L)
+			if (enable_hashagg_spill ||
+				hashaggtablesize < work_mem * 1024L)
 				add_path(grouped_rel, (Path *)
 						 create_agg_path(root,
 										 grouped_rel,
@@ -6829,7 +6834,7 @@ create_partial_grouping_paths(PlannerInfo *root,
 		 * Tentatively produce a partial HashAgg Path, depending on if it
 		 * looks as if the hash table will fit in work_mem.
 		 */
-		if (hashaggtablesize < work_mem * 1024L &&
+		if ((enable_hashagg_spill || hashaggtablesize < work_mem * 1024L) &&
 			cheapest_total_path != NULL)
 		{
 			add_path(partially_grouped_rel, (Path *)
@@ -6856,7 +6861,7 @@ create_partial_grouping_paths(PlannerInfo *root,
 									   dNumPartialPartialGroups);
 
 		/* Do the same for partial paths. */
-		if (hashaggtablesize < work_mem * 1024L &&
+		if ((enable_hashagg_spill || hashaggtablesize < work_mem * 1024L) &&
 			cheapest_partial_path != NULL)
 		{
 			add_partial_path(partially_grouped_rel, (Path *)
diff --git a/src/backend/utils/init/globals.c b/src/backend/utils/init/globals.c
index 3bf96de256d..b0cb1d7e6b2 100644
--- a/src/backend/utils/init/globals.c
+++ b/src/backend/utils/init/globals.c
@@ -120,6 +120,7 @@ bool		enableFsync = true;
 bool		allowSystemTableMods = false;
 int			work_mem = 1024;
 int			maintenance_work_mem = 16384;
+bool		hashagg_mem_overflow = false;
 int			max_parallel_maintenance_workers = 2;
 
 /*
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index ba4edde71a3..d588198df55 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -957,6 +957,26 @@ static struct config_bool ConfigureNamesBool[] =
 		true,
 		NULL, NULL, NULL
 	},
+	{
+		{"enable_hashagg_spill", PGC_USERSET, QUERY_TUNING_METHOD,
+			gettext_noop("Enables the planner's use of hashed aggregation plans that are expected to exceed work_mem."),
+			NULL,
+			GUC_EXPLAIN
+		},
+		&enable_hashagg_spill,
+		true,
+		NULL, NULL, NULL
+	},
+	{
+		{"hashagg_mem_overflow", PGC_USERSET, QUERY_TUNING_METHOD,
+			gettext_noop("Enables hashed aggregation to overflow work_mem at execution time."),
+			NULL,
+			GUC_EXPLAIN
+		},
+		&hashagg_mem_overflow,
+		false,
+		NULL, NULL, NULL
+	},
 	{
 		{"enable_material", PGC_USERSET, QUERY_TUNING_METHOD,
 			gettext_noop("Enables the planner's use of materialization."),
diff --git a/src/include/executor/executor.h b/src/include/executor/executor.h
index 6298c7c8cad..84a71444264 100644
--- a/src/include/executor/executor.h
+++ b/src/include/executor/executor.h
@@ -140,10 +140,15 @@ extern TupleHashTable BuildTupleHashTableExt(PlanState *parent,
 extern TupleHashEntry LookupTupleHashEntry(TupleHashTable hashtable,
 										   TupleTableSlot *slot,
 										   bool *isnew);
+extern TupleHashEntry LookupTupleHashEntryHash(TupleHashTable hashtable,
+											   TupleTableSlot *slot,
+											   bool *isnew, uint32 hash);
 extern TupleHashEntry FindTupleHashEntry(TupleHashTable hashtable,
 										 TupleTableSlot *slot,
 										 ExprState *eqcomp,
 										 FmgrInfo *hashfunctions);
+extern uint32 TupleHashTableHash(struct tuplehash_hash *tb,
+								 const MinimalTuple tuple);
 extern void ResetTupleHashTable(TupleHashTable hashtable);
 
 /*
diff --git a/src/include/miscadmin.h b/src/include/miscadmin.h
index bc6e03fbc7e..321759ead51 100644
--- a/src/include/miscadmin.h
+++ b/src/include/miscadmin.h
@@ -244,6 +244,7 @@ extern bool enableFsync;
 extern PGDLLIMPORT bool allowSystemTableMods;
 extern PGDLLIMPORT int work_mem;
 extern PGDLLIMPORT int maintenance_work_mem;
+extern PGDLLIMPORT bool hashagg_mem_overflow;
 extern PGDLLIMPORT int max_parallel_maintenance_workers;
 
 extern int	VacuumCostPageHit;
diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h
index 6eb647290be..e7b12ed39b8 100644
--- a/src/include/nodes/execnodes.h
+++ b/src/include/nodes/execnodes.h
@@ -2070,13 +2070,26 @@ typedef struct AggState
 	HeapTuple	grp_firstTuple; /* copy of first tuple of current group */
 	/* these fields are used in AGG_HASHED and AGG_MIXED modes: */
 	bool		table_filled;	/* hash table filled yet? */
-	int			num_hashes;
+	int			num_hashes;		/* number of hash tables active at once */
+	bool		hash_spilled;	/* any hash table ever spilled? */
+	struct HashAggSpill *hash_spills; /* HashAggSpill for each hash table,
+										 exists only during first pass if spilled */
+	TupleTableSlot *hash_spill_slot; /* slot for reading from spill files */
+	Size		hash_mem_limit;	/* limit before spilling hash table */
+	Size		hash_mem_peak;	/* peak hash table memory usage */
+	uint64		hash_ngroups_current;	/* number of groups currently in
+										   memory in all hash tables */
+	Size		hash_mem_current; /* current hash table memory usage */
+	uint64		hash_disk_used; /* bytes of disk space used */
+	int			hash_batches_used;	/* batches used during entire execution */
+	List	   *hash_batches;	/* hash batches remaining to be processed */
+
 	AggStatePerHash perhash;	/* array of per-hashtable data */
 	AggStatePerGroup *hash_pergroup;	/* grouping set indexed array of
 										 * per-group pointers */
 
 	/* support for evaluation of agg input expressions: */
-#define FIELDNO_AGGSTATE_ALL_PERGROUPS 34
+#define FIELDNO_AGGSTATE_ALL_PERGROUPS 44
 	AggStatePerGroup *all_pergroups;	/* array of first ->pergroups, than
 										 * ->hash_pergroup */
 	ProjectionInfo *combinedproj;	/* projection machinery */
@@ -2248,7 +2261,7 @@ typedef struct HashInstrumentation
 	int			nbuckets_original;	/* planned number of buckets */
 	int			nbatch;			/* number of batches at end of execution */
 	int			nbatch_original;	/* planned number of batches */
-	size_t		space_peak;		/* speak memory usage in bytes */
+	size_t		space_peak;		/* peak memory usage in bytes */
 } HashInstrumentation;
 
 /* ----------------
diff --git a/src/include/optimizer/cost.h b/src/include/optimizer/cost.h
index b3d0b4f6fbc..b72e2d08290 100644
--- a/src/include/optimizer/cost.h
+++ b/src/include/optimizer/cost.h
@@ -54,6 +54,7 @@ extern PGDLLIMPORT bool enable_bitmapscan;
 extern PGDLLIMPORT bool enable_tidscan;
 extern PGDLLIMPORT bool enable_sort;
 extern PGDLLIMPORT bool enable_hashagg;
+extern PGDLLIMPORT bool enable_hashagg_spill;
 extern PGDLLIMPORT bool enable_nestloop;
 extern PGDLLIMPORT bool enable_material;
 extern PGDLLIMPORT bool enable_mergejoin;
diff --git a/src/test/regress/expected/aggregates.out b/src/test/regress/expected/aggregates.out
index be4ddf86a43..8b64d15368e 100644
--- a/src/test/regress/expected/aggregates.out
+++ b/src/test/regress/expected/aggregates.out
@@ -2331,3 +2331,95 @@ explain (costs off)
                ->  Seq Scan on onek
 (8 rows)
 
+--
+-- Compare results between plans using sorting and plans using hash
+-- aggregation. Force spilling in both cases by setting work_mem low.
+--
+set work_mem='64kB';
+-- Produce results with sorting.
+set enable_hashagg = false;
+set jit_above_cost = 0;
+explain (costs off)
+select g%100000 as c1, sum(g::numeric) as c2, count(*) as c3
+  from generate_series(0, 199999) g
+  group by g%100000;
+                   QUERY PLAN                   
+------------------------------------------------
+ GroupAggregate
+   Group Key: ((g % 100000))
+   ->  Sort
+         Sort Key: ((g % 100000))
+         ->  Function Scan on generate_series g
+(5 rows)
+
+create table agg_group_1 as
+select g%100000 as c1, sum(g::numeric) as c2, count(*) as c3
+  from generate_series(0, 199999) g
+  group by g%100000;
+set jit_above_cost to default;
+create table agg_group_2 as
+select (g/2)::numeric as c1, sum(7::int4) as c2, count(*) as c3
+  from generate_series(0, 1999) g
+  group by g/2;
+create table agg_group_3 as
+select (g/2)::numeric as c1, array_agg(g::numeric) as c2, count(*) as c3
+  from generate_series(0, 1999) g
+  group by g/2;
+-- Produce results with hash aggregation
+set enable_hashagg = true;
+set enable_sort = false;
+set jit_above_cost = 0;
+explain (costs off)
+select g%100000 as c1, sum(g::numeric) as c2, count(*) as c3
+  from generate_series(0, 199999) g
+  group by g%100000;
+                QUERY PLAN                
+------------------------------------------
+ HashAggregate
+   Group Key: (g % 100000)
+   ->  Function Scan on generate_series g
+(3 rows)
+
+create table agg_hash_1 as
+select g%100000 as c1, sum(g::numeric) as c2, count(*) as c3
+  from generate_series(0, 199999) g
+  group by g%100000;
+set jit_above_cost to default;
+create table agg_hash_2 as
+select (g/2)::numeric as c1, sum(7::int4) as c2, count(*) as c3
+  from generate_series(0, 1999) g
+  group by g/2;
+create table agg_hash_3 as
+select (g/2)::numeric as c1, array_agg(g::numeric) as c2, count(*) as c3
+  from generate_series(0, 1999) g
+  group by g/2;
+set enable_sort = true;
+set work_mem to default;
+-- Compare group aggregation results to hash aggregation results
+(select * from agg_hash_1 except select * from agg_group_1)
+  union all
+(select * from agg_group_1 except select * from agg_hash_1);
+ c1 | c2 | c3 
+----+----+----
+(0 rows)
+
+(select * from agg_hash_2 except select * from agg_group_2)
+  union all
+(select * from agg_group_2 except select * from agg_hash_2);
+ c1 | c2 | c3 
+----+----+----
+(0 rows)
+
+(select * from agg_hash_3 except select * from agg_group_3)
+  union all
+(select * from agg_group_3 except select * from agg_hash_3);
+ c1 | c2 | c3 
+----+----+----
+(0 rows)
+
+drop table agg_group_1;
+drop table agg_group_2;
+drop table agg_group_3;
+drop table agg_hash_1;
+drop table agg_hash_2;
+drop table agg_hash_3;
diff --git a/src/test/regress/expected/groupingsets.out b/src/test/regress/expected/groupingsets.out
index c1f802c88a7..767f60a96c7 100644
--- a/src/test/regress/expected/groupingsets.out
+++ b/src/test/regress/expected/groupingsets.out
@@ -1633,4 +1633,127 @@ select v||'a', case when grouping(v||'a') = 1 then 1 else 0 end, count(*)
           |    1 |     2
 (4 rows)
 
+--
+-- Compare results between plans using sorting and plans using hash
+-- aggregation. Force spilling in both cases by setting work_mem low.
+--
+SET work_mem='64kB';
+-- Produce results with sorting.
+set enable_hashagg = false;
+set jit_above_cost = 0;
+explain (costs off)
+select g1000, g100, g10, sum(g::numeric), count(*), max(g::text) from
+  (select g%1000 as g1000, g%100 as g100, g%10 as g10, g
+   from generate_series(0,199999) g) s
+group by cube (g1000,g100,g10);
+                          QUERY PLAN                           
+---------------------------------------------------------------
+ GroupAggregate
+   Group Key: ((g.g % 1000)), ((g.g % 100)), ((g.g % 10))
+   Group Key: ((g.g % 1000)), ((g.g % 100))
+   Group Key: ((g.g % 1000))
+   Group Key: ()
+   Sort Key: ((g.g % 100)), ((g.g % 10))
+     Group Key: ((g.g % 100)), ((g.g % 10))
+     Group Key: ((g.g % 100))
+   Sort Key: ((g.g % 10)), ((g.g % 1000))
+     Group Key: ((g.g % 10)), ((g.g % 1000))
+     Group Key: ((g.g % 10))
+   ->  Sort
+         Sort Key: ((g.g % 1000)), ((g.g % 100)), ((g.g % 10))
+         ->  Function Scan on generate_series g
+(14 rows)
+
+create table gs_group_1 as
+select g1000, g100, g10, sum(g::numeric), count(*), max(g::text) from
+  (select g%1000 as g1000, g%100 as g100, g%10 as g10, g
+   from generate_series(0,199999) g) s
+group by cube (g1000,g100,g10);
+set jit_above_cost to default;
+create table gs_group_2 as
+select g1000, g100, g10, sum(g::numeric), count(*), max(g::text) from
+  (select g/20 as g1000, g/200 as g100, g/2000 as g10, g
+   from generate_series(0,19999) g) s
+group by cube (g1000,g100,g10);
+create table gs_group_3 as
+select g100, g10, array_agg(g) as a, count(*) as c, max(g::text) as m from
+  (select g/200 as g100, g/2000 as g10, g
+   from generate_series(0,19999) g) s
+group by grouping sets (g100,g10);
+-- Produce results with hash aggregation.
+set enable_hashagg = true;
+set enable_sort = false;
+set work_mem='64kB';
+set jit_above_cost = 0;
+explain (costs off)
+select g1000, g100, g10, sum(g::numeric), count(*), max(g::text) from
+  (select g%1000 as g1000, g%100 as g100, g%10 as g10, g
+   from generate_series(0,199999) g) s
+group by cube (g1000,g100,g10);
+                          QUERY PLAN                           
+---------------------------------------------------------------
+ GroupAggregate
+   Group Key: ((g.g % 1000)), ((g.g % 100)), ((g.g % 10))
+   Group Key: ((g.g % 1000)), ((g.g % 100))
+   Group Key: ((g.g % 1000))
+   Group Key: ()
+   Sort Key: ((g.g % 100)), ((g.g % 10))
+     Group Key: ((g.g % 100)), ((g.g % 10))
+     Group Key: ((g.g % 100))
+   Sort Key: ((g.g % 10)), ((g.g % 1000))
+     Group Key: ((g.g % 10)), ((g.g % 1000))
+     Group Key: ((g.g % 10))
+   ->  Sort
+         Sort Key: ((g.g % 1000)), ((g.g % 100)), ((g.g % 10))
+         ->  Function Scan on generate_series g
+(14 rows)
+
+create table gs_hash_1 as
+select g1000, g100, g10, sum(g::numeric), count(*), max(g::text) from
+  (select g%1000 as g1000, g%100 as g100, g%10 as g10, g
+   from generate_series(0,199999) g) s
+group by cube (g1000,g100,g10);
+set jit_above_cost to default;
+create table gs_hash_2 as
+select g1000, g100, g10, sum(g::numeric), count(*), max(g::text) from
+  (select g/20 as g1000, g/200 as g100, g/2000 as g10, g
+   from generate_series(0,19999) g) s
+group by cube (g1000,g100,g10);
+create table gs_hash_3 as
+select g100, g10, array_agg(g) as a, count(*) as c, max(g::text) as m from
+  (select g/200 as g100, g/2000 as g10, g
+   from generate_series(0,19999) g) s
+group by grouping sets (g100,g10);
+set enable_sort = true;
+set work_mem to default;
+-- Compare results
+(select * from gs_hash_1 except select * from gs_group_1)
+  union all
+(select * from gs_group_1 except select * from gs_hash_1);
+ g1000 | g100 | g10 | sum | count | max 
+-------+------+-----+-----+-------+-----
+(0 rows)
+
+(select * from gs_hash_2 except select * from gs_group_2)
+  union all
+(select * from gs_group_2 except select * from gs_hash_2);
+ g1000 | g100 | g10 | sum | count | max 
+-------+------+-----+-----+-------+-----
+(0 rows)
+
+(select g100,g10,unnest(a),c,m from gs_hash_3 except
+  select g100,g10,unnest(a),c,m from gs_group_3)
+    union all
+(select g100,g10,unnest(a),c,m from gs_group_3 except
+  select g100,g10,unnest(a),c,m from gs_hash_3);
+ g100 | g10 | unnest | c | m 
+------+-----+--------+---+---
+(0 rows)
+
+drop table gs_group_1;
+drop table gs_group_2;
+drop table gs_group_3;
+drop table gs_hash_1;
+drop table gs_hash_2;
+drop table gs_hash_3;
 -- end
diff --git a/src/test/regress/expected/select_distinct.out b/src/test/regress/expected/select_distinct.out
index f3696c6d1de..11c6f50fbfa 100644
--- a/src/test/regress/expected/select_distinct.out
+++ b/src/test/regress/expected/select_distinct.out
@@ -148,6 +148,68 @@ SELECT count(*) FROM
      4
 (1 row)
 
+--
+-- Compare results between plans using sorting and plans using hash
+-- aggregation. Force spilling in both cases by setting work_mem low.
+--
+SET work_mem='64kB';
+-- Produce results with sorting.
+SET enable_hashagg=FALSE;
+SET jit_above_cost=0;
+EXPLAIN (costs off)
+SELECT DISTINCT g%1000 FROM generate_series(0,9999) g;
+                   QUERY PLAN                   
+------------------------------------------------
+ Unique
+   ->  Sort
+         Sort Key: ((g % 1000))
+         ->  Function Scan on generate_series g
+(4 rows)
+
+CREATE TABLE distinct_group_1 AS
+SELECT DISTINCT g%1000 FROM generate_series(0,9999) g;
+SET jit_above_cost TO DEFAULT;
+CREATE TABLE distinct_group_2 AS
+SELECT DISTINCT (g%1000)::text FROM generate_series(0,9999) g;
+SET enable_hashagg=TRUE;
+-- Produce results with hash aggregation.
+SET enable_sort=FALSE;
+SET jit_above_cost=0;
+EXPLAIN (costs off)
+SELECT DISTINCT g%1000 FROM generate_series(0,9999) g;
+                QUERY PLAN                
+------------------------------------------
+ HashAggregate
+   Group Key: (g % 1000)
+   ->  Function Scan on generate_series g
+(3 rows)
+
+CREATE TABLE distinct_hash_1 AS
+SELECT DISTINCT g%1000 FROM generate_series(0,9999) g;
+SET jit_above_cost TO DEFAULT;
+CREATE TABLE distinct_hash_2 AS
+SELECT DISTINCT (g%1000)::text FROM generate_series(0,9999) g;
+SET enable_sort=TRUE;
+SET work_mem TO DEFAULT;
+-- Compare results
+(SELECT * FROM distinct_hash_1 EXCEPT SELECT * FROM distinct_group_1)
+  UNION ALL
+(SELECT * FROM distinct_group_1 EXCEPT SELECT * FROM distinct_hash_1);
+ ?column? 
+----------
+(0 rows)
+
+(SELECT * FROM distinct_hash_2 EXCEPT SELECT * FROM distinct_group_2)
+  UNION ALL
+(SELECT * FROM distinct_group_2 EXCEPT SELECT * FROM distinct_hash_2);
+ text 
+------
+(0 rows)
+
+DROP TABLE distinct_hash_1;
+DROP TABLE distinct_hash_2;
+DROP TABLE distinct_group_1;
+DROP TABLE distinct_group_2;
 --
 -- Also, some tests of IS DISTINCT FROM, which doesn't quite deserve its
 -- very own regression file.
diff --git a/src/test/regress/expected/sysviews.out b/src/test/regress/expected/sysviews.out
index a1c90eb9057..c40bf6c16eb 100644
--- a/src/test/regress/expected/sysviews.out
+++ b/src/test/regress/expected/sysviews.out
@@ -75,6 +75,7 @@ select name, setting from pg_settings where name like 'enable%';
  enable_bitmapscan              | on
  enable_gathermerge             | on
  enable_hashagg                 | on
+ enable_hashagg_spill           | on
  enable_hashjoin                | on
  enable_indexonlyscan           | on
  enable_indexscan               | on
@@ -89,7 +90,7 @@ select name, setting from pg_settings where name like 'enable%';
  enable_seqscan                 | on
  enable_sort                    | on
  enable_tidscan                 | on
-(17 rows)
+(18 rows)
 
 -- Test that the pg_timezone_names and pg_timezone_abbrevs views are
 -- more-or-less working.  We can't test their contents in any great detail
diff --git a/src/test/regress/sql/aggregates.sql b/src/test/regress/sql/aggregates.sql
index 17fb256aec5..bcd336c5812 100644
--- a/src/test/regress/sql/aggregates.sql
+++ b/src/test/regress/sql/aggregates.sql
@@ -1017,3 +1017,91 @@ select v||'a', case when v||'a' = 'aa' then 1 else 0 end, count(*)
 explain (costs off)
   select 1 from tenk1
    where (hundred, thousand) in (select twothousand, twothousand from onek);
+
+--
+-- Compare results between plans using sorting and plans using hash
+-- aggregation. Force spilling in both cases by setting work_mem low.
+--
+
+set work_mem='64kB';
+
+-- Produce results with sorting.
+
+set enable_hashagg = false;
+
+set jit_above_cost = 0;
+
+explain (costs off)
+select g%100000 as c1, sum(g::numeric) as c2, count(*) as c3
+  from generate_series(0, 199999) g
+  group by g%100000;
+
+create table agg_group_1 as
+select g%100000 as c1, sum(g::numeric) as c2, count(*) as c3
+  from generate_series(0, 199999) g
+  group by g%100000;
+
+set jit_above_cost to default;
+
+create table agg_group_2 as
+select (g/2)::numeric as c1, sum(7::int4) as c2, count(*) as c3
+  from generate_series(0, 1999) g
+  group by g/2;
+
+create table agg_group_3 as
+select (g/2)::numeric as c1, array_agg(g::numeric) as c2, count(*) as c3
+  from generate_series(0, 1999) g
+  group by g/2;
+
+-- Produce results with hash aggregation
+
+set enable_hashagg = true;
+set enable_sort = false;
+
+set jit_above_cost = 0;
+
+explain (costs off)
+select g%100000 as c1, sum(g::numeric) as c2, count(*) as c3
+  from generate_series(0, 199999) g
+  group by g%100000;
+
+create table agg_hash_1 as
+select g%100000 as c1, sum(g::numeric) as c2, count(*) as c3
+  from generate_series(0, 199999) g
+  group by g%100000;
+
+set jit_above_cost to default;
+
+create table agg_hash_2 as
+select (g/2)::numeric as c1, sum(7::int4) as c2, count(*) as c3
+  from generate_series(0, 1999) g
+  group by g/2;
+
+create table agg_hash_3 as
+select (g/2)::numeric as c1, array_agg(g::numeric) as c2, count(*) as c3
+  from generate_series(0, 1999) g
+  group by g/2;
+
+set enable_sort = true;
+set work_mem to default;
+
+-- Compare group aggregation results to hash aggregation results
+
+(select * from agg_hash_1 except select * from agg_group_1)
+  union all
+(select * from agg_group_1 except select * from agg_hash_1);
+
+(select * from agg_hash_2 except select * from agg_group_2)
+  union all
+(select * from agg_group_2 except select * from agg_hash_2);
+
+(select * from agg_hash_3 except select * from agg_group_3)
+  union all
+(select * from agg_group_3 except select * from agg_hash_3);
+
+drop table agg_group_1;
+drop table agg_group_2;
+drop table agg_group_3;
+drop table agg_hash_1;
+drop table agg_hash_2;
+drop table agg_hash_3;
diff --git a/src/test/regress/sql/groupingsets.sql b/src/test/regress/sql/groupingsets.sql
index 95ac3fb52f6..bf8bce6ed31 100644
--- a/src/test/regress/sql/groupingsets.sql
+++ b/src/test/regress/sql/groupingsets.sql
@@ -441,4 +441,103 @@ select v||'a', case when grouping(v||'a') = 1 then 1 else 0 end, count(*)
   from unnest(array[1,1], array['a','b']) u(i,v)
  group by rollup(i, v||'a') order by 1,3;
 
+--
+-- Compare results between plans using sorting and plans using hash
+-- aggregation. Force spilling in both cases by setting work_mem low.
+--
+
+SET work_mem='64kB';
+
+-- Produce results with sorting.
+
+set enable_hashagg = false;
+
+set jit_above_cost = 0;
+
+explain (costs off)
+select g1000, g100, g10, sum(g::numeric), count(*), max(g::text) from
+  (select g%1000 as g1000, g%100 as g100, g%10 as g10, g
+   from generate_series(0,199999) g) s
+group by cube (g1000,g100,g10);
+
+create table gs_group_1 as
+select g1000, g100, g10, sum(g::numeric), count(*), max(g::text) from
+  (select g%1000 as g1000, g%100 as g100, g%10 as g10, g
+   from generate_series(0,199999) g) s
+group by cube (g1000,g100,g10);
+
+set jit_above_cost to default;
+
+create table gs_group_2 as
+select g1000, g100, g10, sum(g::numeric), count(*), max(g::text) from
+  (select g/20 as g1000, g/200 as g100, g/2000 as g10, g
+   from generate_series(0,19999) g) s
+group by cube (g1000,g100,g10);
+
+create table gs_group_3 as
+select g100, g10, array_agg(g) as a, count(*) as c, max(g::text) as m from
+  (select g/200 as g100, g/2000 as g10, g
+   from generate_series(0,19999) g) s
+group by grouping sets (g100,g10);
+
+-- Produce results with hash aggregation.
+
+set enable_hashagg = true;
+set enable_sort = false;
+set work_mem='64kB';
+
+set jit_above_cost = 0;
+
+explain (costs off)
+select g1000, g100, g10, sum(g::numeric), count(*), max(g::text) from
+  (select g%1000 as g1000, g%100 as g100, g%10 as g10, g
+   from generate_series(0,199999) g) s
+group by cube (g1000,g100,g10);
+
+create table gs_hash_1 as
+select g1000, g100, g10, sum(g::numeric), count(*), max(g::text) from
+  (select g%1000 as g1000, g%100 as g100, g%10 as g10, g
+   from generate_series(0,199999) g) s
+group by cube (g1000,g100,g10);
+
+set jit_above_cost to default;
+
+create table gs_hash_2 as
+select g1000, g100, g10, sum(g::numeric), count(*), max(g::text) from
+  (select g/20 as g1000, g/200 as g100, g/2000 as g10, g
+   from generate_series(0,19999) g) s
+group by cube (g1000,g100,g10);
+
+create table gs_hash_3 as
+select g100, g10, array_agg(g) as a, count(*) as c, max(g::text) as m from
+  (select g/200 as g100, g/2000 as g10, g
+   from generate_series(0,19999) g) s
+group by grouping sets (g100,g10);
+
+set enable_sort = true;
+set work_mem to default;
+
+-- Compare results
+
+(select * from gs_hash_1 except select * from gs_group_1)
+  union all
+(select * from gs_group_1 except select * from gs_hash_1);
+
+(select * from gs_hash_2 except select * from gs_group_2)
+  union all
+(select * from gs_group_2 except select * from gs_hash_2);
+
+(select g100,g10,unnest(a),c,m from gs_hash_3 except
+  select g100,g10,unnest(a),c,m from gs_group_3)
+    union all
+(select g100,g10,unnest(a),c,m from gs_group_3 except
+  select g100,g10,unnest(a),c,m from gs_hash_3);
+
+drop table gs_group_1;
+drop table gs_group_2;
+drop table gs_group_3;
+drop table gs_hash_1;
+drop table gs_hash_2;
+drop table gs_hash_3;
+
 -- end
diff --git a/src/test/regress/sql/select_distinct.sql b/src/test/regress/sql/select_distinct.sql
index a605e86449e..33102744ebf 100644
--- a/src/test/regress/sql/select_distinct.sql
+++ b/src/test/regress/sql/select_distinct.sql
@@ -45,6 +45,68 @@ SELECT count(*) FROM
 SELECT count(*) FROM
   (SELECT DISTINCT two, four, two FROM tenk1) ss;
 
+--
+-- Compare results between plans using sorting and plans using hash
+-- aggregation. Force spilling in both cases by setting work_mem low.
+--
+
+SET work_mem='64kB';
+
+-- Produce results with sorting.
+
+SET enable_hashagg=FALSE;
+
+SET jit_above_cost=0;
+
+EXPLAIN (costs off)
+SELECT DISTINCT g%1000 FROM generate_series(0,9999) g;
+
+CREATE TABLE distinct_group_1 AS
+SELECT DISTINCT g%1000 FROM generate_series(0,9999) g;
+
+SET jit_above_cost TO DEFAULT;
+
+CREATE TABLE distinct_group_2 AS
+SELECT DISTINCT (g%1000)::text FROM generate_series(0,9999) g;
+
+SET enable_hashagg=TRUE;
+
+-- Produce results with hash aggregation.
+
+SET enable_sort=FALSE;
+
+SET jit_above_cost=0;
+
+EXPLAIN (costs off)
+SELECT DISTINCT g%1000 FROM generate_series(0,9999) g;
+
+CREATE TABLE distinct_hash_1 AS
+SELECT DISTINCT g%1000 FROM generate_series(0,9999) g;
+
+SET jit_above_cost TO DEFAULT;
+
+CREATE TABLE distinct_hash_2 AS
+SELECT DISTINCT (g%1000)::text FROM generate_series(0,9999) g;
+
+SET enable_sort=TRUE;
+
+SET work_mem TO DEFAULT;
+
+-- Compare results
+
+(SELECT * FROM distinct_hash_1 EXCEPT SELECT * FROM distinct_group_1)
+  UNION ALL
+(SELECT * FROM distinct_group_1 EXCEPT SELECT * FROM distinct_hash_1);
+
+(SELECT * FROM distinct_hash_2 EXCEPT SELECT * FROM distinct_group_2)
+  UNION ALL
+(SELECT * FROM distinct_group_2 EXCEPT SELECT * FROM distinct_hash_2);
+
+DROP TABLE distinct_hash_1;
+DROP TABLE distinct_hash_2;
+DROP TABLE distinct_group_1;
+DROP TABLE distinct_group_2;
+
 --
 -- Also, some tests of IS DISTINCT FROM, which doesn't quite deserve its
 -- very own regression file.
