On Thu, 2019-07-11 at 17:55 +0200, Tomas Vondra wrote:
> Makes sense. I haven't thought about how the hybrid approach would be
> implemented very much, so I can't quite judge how complicated it would be
> to extend "approach 1" later. But if you think it's a sensible first step,
> I trust you. And I certainly agree we need something to compare the other
> approaches against.
Is this a duplicate of your previous email? I'm slightly confused, but I will use the opportunity to put out another WIP patch. The patch could use a few rounds of cleanup and quality work, but the functionality is there and the performance seems reasonable.

I rebased on master and fixed a few bugs, and most importantly, added tests. It seems to be working fine with grouping sets.

It will take a little longer to get good performance numbers, but even for a group size of one, I'm seeing HashAgg get close to Sort+Group in some cases. You are right that the missed lookups appear to be costly, at least when the data all fits in system memory. I think it's the cache misses, because sometimes reducing work_mem improves performance. I'll try tuning the number of buckets for the hash table and see if that helps; if not, the performance still seems pretty good to me.

Of course, HashAgg can beat Sort for larger group sizes, but I'll try to gather some more data on the cross-over point.

Regards,
	Jeff Davis
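To make the cross-over measurement concrete, here is a minimal sketch along the lines of the regression tests included in the patch below; the work_mem setting and the modulus are illustrative, not a fixed benchmark:

    -- Force spilling in both plans by keeping work_mem low.
    set work_mem = '64kB';

    -- Baseline: Sort+Group.
    set enable_hashagg = false;
    explain analyze
    select g % 100000 as c1, sum(g::numeric) as c2, count(*) as c3
      from generate_series(0, 199999) g
     group by g % 100000;

    -- Same query with spill-capable hash aggregation.
    set enable_hashagg = true;
    set enable_sort = false;
    explain analyze
    select g % 100000 as c1, sum(g::numeric) as c2, count(*) as c3
      from generate_series(0, 199999) g
     group by g % 100000;

Varying the group size (here, the modulus) moves the cross-over point between the two plans.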
diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml index c91e3e1550..d2f97d5fce 100644 --- a/doc/src/sgml/config.sgml +++ b/doc/src/sgml/config.sgml @@ -1702,6 +1702,23 @@ include_dir 'conf.d' </listitem> </varlistentry> + <varlistentry id="guc-hashagg-mem-overflow" xreflabel="hashagg_mem_overflow"> + <term><varname>hashagg_mem_overflow</varname> (<type>boolean</type>) + <indexterm> + <primary><varname>hashagg_mem_overflow</varname> configuration parameter</primary> + </indexterm> + </term> + <listitem> + <para> + If hash aggregation exceeds <varname>work_mem</varname> at query + execution time, and <varname>hashagg_mem_overflow</varname> is set + to <literal>on</literal>, continue consuming more memory rather than + performing disk-based hash aggregation. The default + is <literal>off</literal>. + </para> + </listitem> + </varlistentry> + <varlistentry id="guc-max-stack-depth" xreflabel="max_stack_depth"> <term><varname>max_stack_depth</varname> (<type>integer</type>) <indexterm> @@ -4354,6 +4371,24 @@ ANY <replaceable class="parameter">num_sync</replaceable> ( <replaceable class=" </listitem> </varlistentry> + <varlistentry id="guc-enable-hashagg-spill" xreflabel="enable_hashagg_spill"> + <term><varname>enable_hashagg_spill</varname> (<type>boolean</type>) + <indexterm> + <primary><varname>enable_hashagg_spill</varname> configuration parameter</primary> + </indexterm> + </term> + <listitem> + <para> + Enables or disables the query planner's use of hashed aggregation plan + types when the memory usage is expected to + exceed <varname>work_mem</varname>. This only affects the planner + choice; actual behavior at execution time is dictated by + <xref linkend="guc-hashagg-mem-overflow"/>. The default + is <literal>on</literal>. + </para> + </listitem> + </varlistentry> + <varlistentry id="guc-enable-hashjoin" xreflabel="enable_hashjoin"> <term><varname>enable_hashjoin</varname> (<type>boolean</type>) <indexterm> diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c index dff2ed3f97..a5b7b73b13 100644 --- a/src/backend/commands/explain.c +++ b/src/backend/commands/explain.c @@ -102,6 +102,7 @@ static void show_tablesample(TableSampleClause *tsc, PlanState *planstate, List *ancestors, ExplainState *es); static void show_sort_info(SortState *sortstate, ExplainState *es); static void show_hash_info(HashState *hashstate, ExplainState *es); +static void show_hashagg_info(AggState *hashstate, ExplainState *es); static void show_tidbitmap_info(BitmapHeapScanState *planstate, ExplainState *es); static void show_instrumentation_count(const char *qlabel, int which, @@ -1826,6 +1827,8 @@ ExplainNode(PlanState *planstate, List *ancestors, case T_Agg: show_agg_keys(castNode(AggState, planstate), ancestors, es); show_upper_qual(plan->qual, "Filter", planstate, ancestors, es); + if (es->analyze) + show_hashagg_info((AggState *) planstate, es); if (plan->qual) show_instrumentation_count("Rows Removed by Filter", 1, planstate, es); @@ -2715,6 +2718,56 @@ show_hash_info(HashState *hashstate, ExplainState *es) } } +/* + * If EXPLAIN ANALYZE, show information on hash aggregate memory usage and + * batches. 
+ */ +static void +show_hashagg_info(AggState *aggstate, ExplainState *es) +{ + Agg *agg = (Agg *)aggstate->ss.ps.plan; + long memPeakKb = (aggstate->hash_mem_peak + 1023) / 1024; + long diskKb = (aggstate->hash_disk_used + 1023) / 1024; + + + Assert(IsA(aggstate, AggState)); + + if (agg->aggstrategy != AGG_HASHED && + agg->aggstrategy != AGG_MIXED) + return; + + if (es->format == EXPLAIN_FORMAT_TEXT) + { + appendStringInfoSpaces(es->str, es->indent * 2); + appendStringInfo( + es->str, + "Memory Usage: %ldkB", + memPeakKb); + + if (aggstate->hash_batches_used > 0) + { + appendStringInfo( + es->str, + " Batches: %d Disk Usage:%ldkB", + aggstate->hash_batches_used, diskKb); + } + + appendStringInfo( + es->str, + "\n"); + } + else + { + ExplainPropertyInteger("Peak Memory Usage", "kB", memPeakKb, es); + if (aggstate->hash_batches_used > 0) + { + ExplainPropertyInteger("HashAgg Batches", NULL, + aggstate->hash_batches_used, es); + ExplainPropertyInteger("Disk Usage", "kB", diskKb, es); + } + } +} + /* * If it's EXPLAIN ANALYZE, show exact/lossy pages for a BitmapHeapScan node */ diff --git a/src/backend/executor/execExprInterp.c b/src/backend/executor/execExprInterp.c index 66a67c72b2..62014e4ffb 100644 --- a/src/backend/executor/execExprInterp.c +++ b/src/backend/executor/execExprInterp.c @@ -1563,14 +1563,14 @@ ExecInterpExpr(ExprState *state, ExprContext *econtext, bool *isnull) { AggState *aggstate; AggStatePerGroup pergroup; + AggStatePerGroup pergroup_allaggs; aggstate = op->d.agg_init_trans.aggstate; - pergroup = &aggstate->all_pergroups - [op->d.agg_init_trans.setoff] - [op->d.agg_init_trans.transno]; + pergroup_allaggs = aggstate->all_pergroups[op->d.agg_init_trans.setoff]; + pergroup = &pergroup_allaggs[op->d.agg_init_trans.transno]; /* If transValue has not yet been initialized, do so now. 
*/ - if (pergroup->noTransValue) + if (pergroup_allaggs != NULL && pergroup->noTransValue) { AggStatePerTrans pertrans = op->d.agg_init_trans.pertrans; @@ -1591,13 +1591,14 @@ ExecInterpExpr(ExprState *state, ExprContext *econtext, bool *isnull) { AggState *aggstate; AggStatePerGroup pergroup; + AggStatePerGroup pergroup_allaggs; aggstate = op->d.agg_strict_trans_check.aggstate; - pergroup = &aggstate->all_pergroups - [op->d.agg_strict_trans_check.setoff] - [op->d.agg_strict_trans_check.transno]; + pergroup_allaggs = aggstate->all_pergroups[op->d.agg_strict_trans_check.setoff]; + pergroup = &pergroup_allaggs[op->d.agg_strict_trans_check.transno]; - if (unlikely(pergroup->transValueIsNull)) + if (pergroup_allaggs == NULL || + unlikely(pergroup->transValueIsNull)) EEO_JUMP(op->d.agg_strict_trans_check.jumpnull); EEO_NEXT(); @@ -1613,6 +1614,7 @@ ExecInterpExpr(ExprState *state, ExprContext *econtext, bool *isnull) AggState *aggstate; AggStatePerTrans pertrans; AggStatePerGroup pergroup; + AggStatePerGroup pergroup_allaggs; FunctionCallInfo fcinfo; MemoryContext oldContext; Datum newVal; @@ -1620,9 +1622,11 @@ ExecInterpExpr(ExprState *state, ExprContext *econtext, bool *isnull) aggstate = op->d.agg_trans.aggstate; pertrans = op->d.agg_trans.pertrans; - pergroup = &aggstate->all_pergroups - [op->d.agg_trans.setoff] - [op->d.agg_trans.transno]; + pergroup_allaggs = aggstate->all_pergroups[op->d.agg_trans.setoff]; + pergroup = &pergroup_allaggs[op->d.agg_trans.transno]; + + if (pergroup_allaggs == NULL) + EEO_NEXT(); Assert(pertrans->transtypeByVal); @@ -1664,6 +1668,7 @@ ExecInterpExpr(ExprState *state, ExprContext *econtext, bool *isnull) AggState *aggstate; AggStatePerTrans pertrans; AggStatePerGroup pergroup; + AggStatePerGroup pergroup_allaggs; FunctionCallInfo fcinfo; MemoryContext oldContext; Datum newVal; @@ -1671,9 +1676,11 @@ ExecInterpExpr(ExprState *state, ExprContext *econtext, bool *isnull) aggstate = op->d.agg_trans.aggstate; pertrans = op->d.agg_trans.pertrans; - pergroup = &aggstate->all_pergroups - [op->d.agg_trans.setoff] - [op->d.agg_trans.transno]; + pergroup_allaggs = aggstate->all_pergroups[op->d.agg_trans.setoff]; + pergroup = &pergroup_allaggs[op->d.agg_trans.transno]; + + if (pergroup_allaggs == NULL) + EEO_NEXT(); Assert(!pertrans->transtypeByVal); diff --git a/src/backend/executor/execGrouping.c b/src/backend/executor/execGrouping.c index 14ee8db3f9..8f5404b3d6 100644 --- a/src/backend/executor/execGrouping.c +++ b/src/backend/executor/execGrouping.c @@ -25,7 +25,6 @@ #include "utils/hashutils.h" #include "utils/memutils.h" -static uint32 TupleHashTableHash(struct tuplehash_hash *tb, const MinimalTuple tuple); static int TupleHashTableMatch(struct tuplehash_hash *tb, const MinimalTuple tuple1, const MinimalTuple tuple2); /* @@ -288,6 +287,28 @@ ResetTupleHashTable(TupleHashTable hashtable) TupleHashEntry LookupTupleHashEntry(TupleHashTable hashtable, TupleTableSlot *slot, bool *isnew) +{ + MemoryContext oldContext; + uint32 hash; + + /* Need to run the hash functions in short-lived context */ + oldContext = MemoryContextSwitchTo(hashtable->tempcxt); + + /* set up data needed by hash and match functions */ + hashtable->inputslot = slot; + hashtable->in_hash_funcs = hashtable->tab_hash_funcs; + hashtable->cur_eq_func = hashtable->tab_eq_func; + + hash = TupleHashTableHash(hashtable->hashtab, NULL); + + MemoryContextSwitchTo(oldContext); + + return LookupTupleHashEntryHash(hashtable, slot, isnew, hash); +} + +TupleHashEntry +LookupTupleHashEntryHash(TupleHashTable 
hashtable, TupleTableSlot *slot, + bool *isnew, uint32 hash) { TupleHashEntryData *entry; MemoryContext oldContext; @@ -306,7 +327,7 @@ LookupTupleHashEntry(TupleHashTable hashtable, TupleTableSlot *slot, if (isnew) { - entry = tuplehash_insert(hashtable->hashtab, key, &found); + entry = tuplehash_insert_hash(hashtable->hashtab, key, hash, &found); if (found) { @@ -326,7 +347,7 @@ LookupTupleHashEntry(TupleHashTable hashtable, TupleTableSlot *slot, } else { - entry = tuplehash_lookup(hashtable->hashtab, key); + entry = tuplehash_lookup_hash(hashtable->hashtab, key, hash); } MemoryContextSwitchTo(oldContext); @@ -371,17 +392,12 @@ FindTupleHashEntry(TupleHashTable hashtable, TupleTableSlot *slot, /* * Compute the hash value for a tuple * - * The passed-in key is a pointer to TupleHashEntryData. In an actual hash - * table entry, the firstTuple field points to a tuple (in MinimalTuple - * format). LookupTupleHashEntry sets up a dummy TupleHashEntryData with a - * NULL firstTuple field --- that cues us to look at the inputslot instead. - * This convention avoids the need to materialize virtual input tuples unless - * they actually need to get copied into the table. + * If tuple is NULL, use the input slot instead. * * Also, the caller must select an appropriate memory context for running * the hash functions. (dynahash.c doesn't change CurrentMemoryContext.) */ -static uint32 +uint32 TupleHashTableHash(struct tuplehash_hash *tb, const MinimalTuple tuple) { TupleHashTable hashtable = (TupleHashTable) tb->private_data; @@ -402,9 +418,6 @@ TupleHashTableHash(struct tuplehash_hash *tb, const MinimalTuple tuple) { /* * Process a tuple already stored in the table. - * - * (this case never actually occurs due to the way simplehash.h is - * used, as the hash-value is stored in the entries) */ slot = hashtable->tableslot; ExecStoreMinimalTuple(tuple, slot, false); diff --git a/src/backend/executor/nodeAgg.c b/src/backend/executor/nodeAgg.c index 6b8ef40599..6cb3e32767 100644 --- a/src/backend/executor/nodeAgg.c +++ b/src/backend/executor/nodeAgg.c @@ -229,14 +229,40 @@ #include "optimizer/optimizer.h" #include "parser/parse_agg.h" #include "parser/parse_coerce.h" +#include "storage/buffile.h" #include "utils/acl.h" #include "utils/builtins.h" +#include "utils/dynahash.h" #include "utils/lsyscache.h" #include "utils/memutils.h" #include "utils/syscache.h" #include "utils/tuplesort.h" #include "utils/datum.h" +/* + * Represents partitioned spill data for a single hashtable. + */ +typedef struct HashAggSpill +{ + int n_partitions; /* number of output partitions */ + int partition_bits; /* number of bits for partition mask + log2(n_partitions) parent partition bits */ + BufFile **partitions; /* output partition files */ + int64 *ntuples; /* number of tuples in each partition */ +} HashAggSpill; + +/* + * Represents work to be done for one pass of hash aggregation. Initially, + * only the input fields are set. If spilled to disk, also set the spill data. 
+ */ +typedef struct HashAggBatch +{ + BufFile *input_file; /* input partition */ + int input_bits; /* number of bits for input partition mask */ + int64 input_groups; /* estimated number of input groups */ + int setno; /* grouping set */ + HashAggSpill spill; /* spill output */ +} HashAggBatch; static void select_current_set(AggState *aggstate, int setno, bool is_hash); static void initialize_phase(AggState *aggstate, int newphase); @@ -272,11 +298,25 @@ static TupleTableSlot *project_aggregates(AggState *aggstate); static Bitmapset *find_unaggregated_cols(AggState *aggstate); static bool find_unaggregated_cols_walker(Node *node, Bitmapset **colnos); static void build_hash_table(AggState *aggstate); -static TupleHashEntryData *lookup_hash_entry(AggState *aggstate); +static void prepare_hash_slot(AggState *aggstate); +static uint32 calculate_hash(AggState *aggstate); +static AggStatePerGroup lookup_hash_entry(AggState *aggstate, uint32 hash); static void lookup_hash_entries(AggState *aggstate); static TupleTableSlot *agg_retrieve_direct(AggState *aggstate); static void agg_fill_hash_table(AggState *aggstate); +static bool agg_refill_hash_table(AggState *aggstate); static TupleTableSlot *agg_retrieve_hash_table(AggState *aggstate); +static TupleTableSlot *agg_retrieve_hash_table_in_memory(AggState *aggstate); +static Size hash_spill_tuple(HashAggSpill *spill, int input_bits, + TupleTableSlot *slot, uint32 hash); +static MinimalTuple hash_read_spilled(BufFile *file, uint32 *hashp); +static HashAggBatch *hash_batch_new(BufFile *input_file, int setno, + int64 input_groups, int input_bits); +static void hash_finish_initial_spills(AggState *aggstate); +static void hash_spill_finish(AggState *aggstate, HashAggSpill *spill, + int setno, int input_bits); +static void hash_reset_spill(HashAggSpill *spill); +static void hash_reset_spills(AggState *aggstate); static Datum GetAggInitVal(Datum textInitVal, Oid transtype); static void build_pertrans_for_aggref(AggStatePerTrans pertrans, AggState *aggstate, EState *estate, @@ -1269,6 +1309,10 @@ build_hash_table(AggState *aggstate) Assert(aggstate->aggstrategy == AGG_HASHED || aggstate->aggstrategy == AGG_MIXED); + /* TODO: work harder to find a good nGroups for each hash table. We don't + * want the hash table itself to fill up work_mem with no room for + * out-of-line transition values. Also, we need to consider that there are + * multiple hash tables for grouping sets. */ additionalsize = aggstate->numtrans * sizeof(AggStatePerGroupData); for (i = 0; i < aggstate->num_hashes; ++i) @@ -1294,6 +1338,15 @@ build_hash_table(AggState *aggstate) tmpmem, DO_AGGSPLIT_SKIPFINAL(aggstate->aggsplit)); } + + /* + * Set initial size to be that of an empty hash table. This ensures that + * at least one entry can be added before it exceeds work_mem; otherwise + * the algorithm might not make progress. + */ + aggstate->hash_mem_init = MemoryContextMemAllocated( + aggstate->hashcontext->ecxt_per_tuple_memory, true); + aggstate->hash_mem_current = aggstate->hash_mem_init; } /* @@ -1454,23 +1507,13 @@ hash_agg_entry_size(int numAggs) return entrysize; } -/* - * Find or create a hashtable entry for the tuple group containing the current - * tuple (already set in tmpcontext's outertuple slot), in the current grouping - * set (which the caller must have selected - note that initialize_aggregate - * depends on this). - * - * When called, CurrentMemoryContext should be the per-query context. 
- */ -static TupleHashEntryData * -lookup_hash_entry(AggState *aggstate) +void +prepare_hash_slot(AggState *aggstate) { - TupleTableSlot *inputslot = aggstate->tmpcontext->ecxt_outertuple; - AggStatePerHash perhash = &aggstate->perhash[aggstate->current_set]; - TupleTableSlot *hashslot = perhash->hashslot; - TupleHashEntryData *entry; - bool isnew; - int i; + TupleTableSlot *inputslot = aggstate->tmpcontext->ecxt_outertuple; + AggStatePerHash perhash = &aggstate->perhash[aggstate->current_set]; + TupleTableSlot *hashslot = perhash->hashslot; + int i; /* transfer just the needed columns into hashslot */ slot_getsomeattrs(inputslot, perhash->largestGrpColIdx); @@ -1484,14 +1527,70 @@ lookup_hash_entry(AggState *aggstate) hashslot->tts_isnull[i] = inputslot->tts_isnull[varNumber]; } ExecStoreVirtualTuple(hashslot); +} + +uint32 +calculate_hash(AggState *aggstate) +{ + AggStatePerHash perhash = &aggstate->perhash[aggstate->current_set]; + TupleHashTable hashtable = perhash->hashtable; + MemoryContext oldContext; + uint32 hash; + + /* set up data needed by hash and match functions */ + hashtable->inputslot = perhash->hashslot; + hashtable->in_hash_funcs = hashtable->tab_hash_funcs; + hashtable->cur_eq_func = hashtable->tab_eq_func; + + /* Need to run the hash functions in short-lived context */ + oldContext = MemoryContextSwitchTo(hashtable->tempcxt); + + hash = TupleHashTableHash(hashtable->hashtab, NULL); + + MemoryContextSwitchTo(oldContext); + + return hash; +} + +/* + * Find or create a hashtable entry for the tuple group containing the current + * tuple (already set in tmpcontext's outertuple slot), in the current grouping + * set (which the caller must have selected - note that initialize_aggregate + * depends on this). + * + * When called, CurrentMemoryContext should be the per-query context. + */ +static AggStatePerGroup +lookup_hash_entry(AggState *aggstate, uint32 hash) +{ + AggStatePerHash perhash = &aggstate->perhash[aggstate->current_set]; + TupleTableSlot *hashslot = perhash->hashslot; + TupleHashEntryData *entry; + bool isnew = false; /* find or create the hashtable entry using the filtered tuple */ - entry = LookupTupleHashEntry(perhash->hashtable, hashslot, &isnew); + if (!hashagg_mem_overflow && + aggstate->hash_mem_current > work_mem * 1024L && + aggstate->hash_mem_current > aggstate->hash_mem_init) + entry = LookupTupleHashEntryHash(perhash->hashtable, hashslot, + NULL, hash); + else + entry = LookupTupleHashEntryHash(perhash->hashtable, hashslot, + &isnew, hash); + + if (entry == NULL) + return NULL; if (isnew) { - AggStatePerGroup pergroup; - int transno; + AggStatePerGroup pergroup; + int transno; + + aggstate->hash_mem_current = MemoryContextMemAllocated( + aggstate->hashcontext->ecxt_per_tuple_memory, true); + + if (aggstate->hash_mem_current > aggstate->hash_mem_peak) + aggstate->hash_mem_peak = aggstate->hash_mem_current; pergroup = (AggStatePerGroup) MemoryContextAlloc(perhash->hashtable->tablecxt, @@ -1511,7 +1610,7 @@ lookup_hash_entry(AggState *aggstate) } } - return entry; + return entry->additional; } /* @@ -1519,18 +1618,38 @@ lookup_hash_entry(AggState *aggstate) * returning an array of pergroup pointers suitable for advance_aggregates. * * Be aware that lookup_hash_entry can reset the tmpcontext. + * + * Return false if hash table has exceeded its memory limit. 
*/ static void lookup_hash_entries(AggState *aggstate) { - int numHashes = aggstate->num_hashes; AggStatePerGroup *pergroup = aggstate->hash_pergroup; int setno; - for (setno = 0; setno < numHashes; setno++) + for (setno = 0; setno < aggstate->num_hashes; setno++) { + uint32 hash; + select_current_set(aggstate, setno, true); - pergroup[setno] = lookup_hash_entry(aggstate)->additional; + prepare_hash_slot(aggstate); + hash = calculate_hash(aggstate); + pergroup[setno] = lookup_hash_entry(aggstate, hash); + + if (pergroup[setno] == NULL) + { + HashAggSpill *spill; + TupleTableSlot *slot = aggstate->tmpcontext->ecxt_outertuple; + + if (aggstate->hash_spills == NULL) + aggstate->hash_spills = palloc0( + sizeof(HashAggSpill) * aggstate->num_hashes); + aggstate->hash_spilled = true; + + spill = &aggstate->hash_spills[setno]; + + aggstate->hash_disk_used += hash_spill_tuple(spill, 0, slot, hash); + } } } @@ -1852,6 +1971,10 @@ agg_retrieve_direct(AggState *aggstate) outerslot = fetch_input_tuple(aggstate); if (TupIsNull(outerslot)) { + if (aggstate->aggstrategy == AGG_MIXED && + aggstate->current_phase == 1) + hash_finish_initial_spills(aggstate); + /* no more outer-plan tuples available */ if (hasGroupingSets) { @@ -1955,6 +2078,8 @@ agg_fill_hash_table(AggState *aggstate) ResetExprContext(aggstate->tmpcontext); } + hash_finish_initial_spills(aggstate); + aggstate->table_filled = true; /* Initialize to walk the first hash table */ select_current_set(aggstate, 0, true); @@ -1962,11 +2087,136 @@ agg_fill_hash_table(AggState *aggstate) &aggstate->perhash[0].hashiter); } +/* + * If any data was spilled during hash aggregation, reset the hash table and + * reprocess one batch of spilled data. After reprocessing a batch, the hash + * table will again contain data, ready to be consumed by + * agg_retrieve_hash_table_in_memory(). + * + * Should only be called after all in memory hash table entries have been + * consumed. + * + * Return false when input is exhausted and there's no more work to be done; + * otherwise return true. + */ +static bool +agg_refill_hash_table(AggState *aggstate) +{ + HashAggBatch *batch; + AggStatePerGroup *pergroup; + + if (aggstate->hash_batches == NIL) + return false; + + pergroup = aggstate->all_pergroups; + while(pergroup != aggstate->hash_pergroup) { + *pergroup = NULL; + pergroup++; + } + + /* free memory */ + ReScanExprContext(aggstate->hashcontext); + /* Rebuild an empty hash table */ + build_hash_table(aggstate); + + batch = linitial(aggstate->hash_batches); + aggstate->hash_batches = list_delete_first(aggstate->hash_batches); + + Assert(aggstate->current_phase == 0); + + /* + * TODO: what should be done here to set up for advance_aggregates? + */ + if (aggstate->phase->aggstrategy == AGG_MIXED) + { + aggstate->current_phase = 1; + aggstate->phase = &aggstate->phases[aggstate->current_phase]; + } + + for (;;) { + TupleTableSlot *slot = aggstate->hash_spill_slot; + MinimalTuple tuple; + uint32 hash; + + CHECK_FOR_INTERRUPTS(); + + tuple = hash_read_spilled(batch->input_file, &hash); + if (tuple == NULL) + break; + + /* + * TODO: Should we re-compile the expressions to use a minimal tuple + * slot so that we don't have to create the virtual tuple here? If we + * project the tuple before writing, then perhaps this is not + * important. 
+ */ + ExecForceStoreMinimalTuple(tuple, slot, true); + aggstate->tmpcontext->ecxt_outertuple = slot; + + /* Find or build hashtable entries */ + memset(aggstate->hash_pergroup, 0, + sizeof(AggStatePerGroup) * aggstate->num_hashes); + select_current_set(aggstate, batch->setno, true); + prepare_hash_slot(aggstate); + aggstate->hash_pergroup[batch->setno] = lookup_hash_entry(aggstate, hash); + if (aggstate->hash_pergroup[batch->setno] == NULL) + aggstate->hash_disk_used += hash_spill_tuple( + &batch->spill, batch->input_bits, slot, hash); + + /* Advance the aggregates (or combine functions) */ + advance_aggregates(aggstate); + + /* + * Reset per-input-tuple context after each tuple, but note that the + * hash lookups do this too + */ + ResetExprContext(aggstate->tmpcontext); + } + + BufFileClose(batch->input_file); + + aggstate->current_phase = 0; + aggstate->phase = &aggstate->phases[aggstate->current_phase]; + + hash_spill_finish(aggstate, &batch->spill, batch->setno, + batch->input_bits); + + pfree(batch); + + /* Initialize to walk the first hash table */ + select_current_set(aggstate, 0, true); + ResetTupleHashIterator(aggstate->perhash[0].hashtable, + &aggstate->perhash[0].hashiter); + + return true; +} + /* * ExecAgg for hashed case: retrieving groups from hash table */ static TupleTableSlot * agg_retrieve_hash_table(AggState *aggstate) +{ + TupleTableSlot *result = NULL; + + while (result == NULL) + { + result = agg_retrieve_hash_table_in_memory(aggstate); + if (result == NULL) + { + if (!agg_refill_hash_table(aggstate)) + { + aggstate->agg_done = true; + break; + } + } + } + + return result; +} + +static TupleTableSlot * +agg_retrieve_hash_table_in_memory(AggState *aggstate) { ExprContext *econtext; AggStatePerAgg peragg; @@ -1995,7 +2245,7 @@ agg_retrieve_hash_table(AggState *aggstate) * We loop retrieving groups until we find one satisfying * aggstate->ss.ps.qual */ - while (!aggstate->agg_done) + for (;;) { TupleTableSlot *hashslot = perhash->hashslot; int i; @@ -2026,8 +2276,6 @@ agg_retrieve_hash_table(AggState *aggstate) } else { - /* No more hashtables, so done */ - aggstate->agg_done = true; return NULL; } } @@ -2084,6 +2332,276 @@ agg_retrieve_hash_table(AggState *aggstate) return NULL; } +/* + * hash_spill_tuple + * + * Not enough memory to add tuple as new entry in hash table. Save for later + * in the appropriate partition. + */ +static Size +hash_spill_tuple(HashAggSpill *spill, int input_bits, TupleTableSlot *slot, + uint32 hash) +{ + int partition; + MinimalTuple tuple; + BufFile *file; + int written; + int total_written = 0; + bool shouldFree; + + /* initialize output partitions */ + if (spill->partitions == NULL) + { + int npartitions; + int partition_bits; + + /*TODO: be smarter */ + npartitions = 32; + + partition_bits = my_log2(npartitions); + + /* make sure that we don't exhaust the hash bits */ + if (partition_bits + input_bits >= 32) + partition_bits = 32 - input_bits; + + /* number of partitions will be a power of two */ + npartitions = 1L << partition_bits; + + spill->partition_bits = partition_bits; + spill->n_partitions = npartitions; + spill->partitions = palloc0(sizeof(BufFile *) * npartitions); + spill->ntuples = palloc0(sizeof(int64) * npartitions); + } + + /* + * TODO: should we project only needed attributes from the tuple before + * writing it? 
+ */ + tuple = ExecFetchSlotMinimalTuple(slot, &shouldFree); + + if (spill->partition_bits == 0) + partition = 0; + else + partition = (hash << input_bits) >> + (32 - spill->partition_bits); + + spill->ntuples[partition]++; + + /* + * TODO: use logtape.c instead? + */ + if (spill->partitions[partition] == NULL) + spill->partitions[partition] = BufFileCreateTemp(false); + file = spill->partitions[partition]; + + + written = BufFileWrite(file, (void *) &hash, sizeof(uint32)); + if (written != sizeof(uint32)) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not write to HashAgg temporary file: %m"))); + total_written += written; + + written = BufFileWrite(file, (void *) tuple, tuple->t_len); + if (written != tuple->t_len) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not write to HashAgg temporary file: %m"))); + total_written += written; + + if (shouldFree) + pfree(tuple); + + return total_written; +} + +/* + * read_spilled_tuple + * read the next tuple from a batch file. Return NULL if no more. + */ +static MinimalTuple +hash_read_spilled(BufFile *file, uint32 *hashp) +{ + MinimalTuple tuple; + uint32 t_len; + size_t nread; + uint32 hash; + + nread = BufFileRead(file, &hash, sizeof(uint32)); + if (nread == 0) + return NULL; + if (nread != sizeof(uint32)) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not read from HashAgg temporary file: %m"))); + if (hashp != NULL) + *hashp = hash; + + nread = BufFileRead(file, &t_len, sizeof(t_len)); + if (nread != sizeof(uint32)) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not read from HashAgg temporary file: %m"))); + + tuple = (MinimalTuple) palloc(t_len); + tuple->t_len = t_len; + + nread = BufFileRead(file, (void *)((char *)tuple + sizeof(uint32)), + t_len - sizeof(uint32)); + if (nread != t_len - sizeof(uint32)) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not read from HashAgg temporary file: %m"))); + + return tuple; +} + +/* + * new_hashagg_batch + * + * Construct a HashAggBatch item, which represents one iteration of HashAgg to + * be done. Should be called in the aggregate's memory context. + */ +static HashAggBatch * +hash_batch_new(BufFile *input_file, int setno, int64 input_groups, + int input_bits) +{ + HashAggBatch *batch = palloc0(sizeof(HashAggBatch)); + + batch->input_file = input_file; + batch->input_bits = input_bits; + batch->input_groups = input_groups; + batch->setno = setno; + + /* batch->spill will be set only after spilling this batch */ + + return batch; +} + +/* + * hash_finish_initial_spills + * + * After a HashAggBatch has been processed, it may have spilled tuples to + * disk. If so, turn the spilled partitions into new batches that must later + * be executed. 
+ */ +static void +hash_finish_initial_spills(AggState *aggstate) +{ + int setno; + + if (aggstate->hash_spills == NULL) + return; + + for (setno = 0; setno < aggstate->num_hashes; setno++) + hash_spill_finish(aggstate, &aggstate->hash_spills[setno], setno, 0); + + pfree(aggstate->hash_spills); + aggstate->hash_spills = NULL; +} + +/* + * hash_spill_finish + * + * + */ +static void +hash_spill_finish(AggState *aggstate, HashAggSpill *spill, int setno, int input_bits) +{ + int i; + + if (spill->n_partitions == 0) + return; /* didn't spill */ + + for (i = 0; i < spill->n_partitions; i++) + { + BufFile *file = spill->partitions[i]; + MemoryContext oldContext; + HashAggBatch *new_batch; + int64 input_ngroups; + + /* partition is empty */ + if (file == NULL) + continue; + + /* rewind file for reading */ + if (BufFileSeek(file, 0, 0L, SEEK_SET)) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not rewind HashAgg temporary file: %m"))); + + /* + * Estimate the number of input groups for this new work item as the + * total number of tuples in its input file. Although that's a worst + * case, it's not bad here for two reasons: (1) overestimating is + * better than underestimating; and (2) we've already scanned the + * relation once, so it's likely that we've already finalized many of + * the common values. + */ + input_ngroups = spill->ntuples[i]; + + oldContext = MemoryContextSwitchTo(aggstate->ss.ps.state->es_query_cxt); + new_batch = hash_batch_new(file, setno, input_ngroups, + spill->partition_bits + input_bits); + aggstate->hash_batches = lappend(aggstate->hash_batches, new_batch); + aggstate->hash_batches_used++; + MemoryContextSwitchTo(oldContext); + } + + pfree(spill->ntuples); + pfree(spill->partitions); +} + +/* + * Clear a HashAggSpill, free its memory, and close its files. + */ +static void +hash_reset_spill(HashAggSpill *spill) +{ + int i; + for (i = 0; i < spill->n_partitions; i++) + { + BufFile *file = spill->partitions[i]; + + if (file != NULL) + BufFileClose(file); + } + if (spill->ntuples != NULL) + pfree(spill->ntuples); + if (spill->partitions != NULL) + pfree(spill->partitions); +} + +/* + * Find and reset all active HashAggSpills. + */ +static void +hash_reset_spills(AggState *aggstate) +{ + ListCell *lc; + + if (aggstate->hash_spills != NULL) + { + int setno; + + for (setno = 0; setno < aggstate->num_hashes; setno++) + hash_reset_spill(&aggstate->hash_spills[setno]); + + pfree(aggstate->hash_spills); + aggstate->hash_spills = NULL; + } + + foreach(lc, aggstate->hash_batches) + { + HashAggBatch *batch = (HashAggBatch*) lfirst(lc); + hash_reset_spill(&batch->spill); + pfree(batch); + } + list_free(aggstate->hash_batches); + aggstate->hash_batches = NIL; +} + + /* ----------------- * ExecInitAgg * @@ -2238,6 +2756,10 @@ ExecInitAgg(Agg *node, EState *estate, int eflags) aggstate->sort_slot = ExecInitExtraTupleSlot(estate, scanDesc, &TTSOpsMinimalTuple); + if (use_hashing) + aggstate->hash_spill_slot = ExecInitExtraTupleSlot(estate, scanDesc, + &TTSOpsVirtual); + /* * Initialize result type, slot and projection. 
*/ @@ -3368,6 +3890,8 @@ ExecEndAgg(AggState *node) if (node->sort_out) tuplesort_end(node->sort_out); + hash_reset_spills(node); + for (transno = 0; transno < node->numtrans; transno++) { AggStatePerTrans pertrans = &node->pertrans[transno]; @@ -3423,12 +3947,13 @@ ExecReScanAgg(AggState *node) return; /* - * If we do have the hash table, and the subplan does not have any - * parameter changes, and none of our own parameter changes affect - * input expressions of the aggregated functions, then we can just - * rescan the existing hash table; no need to build it again. + * If we do have the hash table, and it never spilled, and the subplan + * does not have any parameter changes, and none of our own parameter + * changes affect input expressions of the aggregated functions, then + * we can just rescan the existing hash table; no need to build it + * again. */ - if (outerPlan->chgParam == NULL && + if (outerPlan->chgParam == NULL && !node->hash_spilled && !bms_overlap(node->ss.ps.chgParam, aggnode->aggParams)) { ResetTupleHashIterator(node->perhash[0].hashtable, @@ -3485,6 +4010,16 @@ ExecReScanAgg(AggState *node) */ if (node->aggstrategy == AGG_HASHED || node->aggstrategy == AGG_MIXED) { + hash_reset_spills(node); + + /* reset stats */ + node->hash_spilled = false; + node->hash_mem_init = 0; + node->hash_mem_peak = 0; + node->hash_mem_current = 0; + node->hash_disk_used = 0; + node->hash_batches_used = 0; + ReScanExprContext(node->hashcontext); /* Rebuild an empty hash table */ build_hash_table(node); diff --git a/src/backend/jit/llvm/llvmjit_expr.c b/src/backend/jit/llvm/llvmjit_expr.c index 30133634c7..14764e9c1d 100644 --- a/src/backend/jit/llvm/llvmjit_expr.c +++ b/src/backend/jit/llvm/llvmjit_expr.c @@ -2094,12 +2094,14 @@ llvm_compile_expr(ExprState *state) LLVMValueRef v_allpergroupsp; LLVMValueRef v_pergroupp; + LLVMValueRef v_pergroup_allaggs; LLVMValueRef v_setoff, v_transno; LLVMValueRef v_notransvalue; + LLVMBasicBlockRef b_check_notransvalue; LLVMBasicBlockRef b_init; aggstate = op->d.agg_init_trans.aggstate; @@ -2121,11 +2123,22 @@ llvm_compile_expr(ExprState *state) "aggstate.all_pergroups"); v_setoff = l_int32_const(op->d.agg_init_trans.setoff); v_transno = l_int32_const(op->d.agg_init_trans.transno); - v_pergroupp = - LLVMBuildGEP(b, - l_load_gep1(b, v_allpergroupsp, v_setoff, ""), - &v_transno, 1, ""); + v_pergroup_allaggs = l_load_gep1(b, v_allpergroupsp, v_setoff, ""); + b_check_notransvalue = l_bb_before_v( + opblocks[i + 1], "op.%d.check_notransvalue", i); + + LLVMBuildCondBr(b, + LLVMBuildICmp(b, LLVMIntEQ, + LLVMBuildPtrToInt(b, v_pergroup_allaggs, + TypeSizeT, ""), + l_sizet_const(0), ""), + opblocks[i + 1], + b_check_notransvalue); + + LLVMPositionBuilderAtEnd(b, b_check_notransvalue); + + v_pergroupp = LLVMBuildGEP(b, v_pergroup_allaggs, &v_transno, 1, ""); v_notransvalue = l_load_struct_gep(b, v_pergroupp, FIELDNO_AGGSTATEPERGROUPDATA_NOTRANSVALUE, @@ -2192,6 +2205,9 @@ llvm_compile_expr(ExprState *state) LLVMValueRef v_transnull; LLVMValueRef v_pergroupp; + LLVMValueRef v_pergroup_allaggs; + + LLVMBasicBlockRef b_check_transnull; int jumpnull = op->d.agg_strict_trans_check.jumpnull; @@ -2211,11 +2227,22 @@ llvm_compile_expr(ExprState *state) l_int32_const(op->d.agg_strict_trans_check.setoff); v_transno = l_int32_const(op->d.agg_strict_trans_check.transno); - v_pergroupp = - LLVMBuildGEP(b, - l_load_gep1(b, v_allpergroupsp, v_setoff, ""), - &v_transno, 1, ""); + v_pergroup_allaggs = l_load_gep1(b, v_allpergroupsp, v_setoff, ""); + + b_check_transnull = 
l_bb_before_v(opblocks[i + 1], + "op.%d.check_transnull", i); + LLVMBuildCondBr(b, + LLVMBuildICmp(b, LLVMIntEQ, + LLVMBuildPtrToInt(b, v_pergroup_allaggs, + TypeSizeT, ""), + l_sizet_const(0), ""), + opblocks[jumpnull], + b_check_transnull); + + LLVMPositionBuilderAtEnd(b, b_check_transnull); + + v_pergroupp = LLVMBuildGEP(b, v_pergroup_allaggs, &v_transno, 1, ""); v_transnull = l_load_struct_gep(b, v_pergroupp, FIELDNO_AGGSTATEPERGROUPDATA_TRANSVALUEISNULL, @@ -2257,12 +2284,15 @@ llvm_compile_expr(ExprState *state) LLVMValueRef v_pertransp; LLVMValueRef v_pergroupp; + LLVMValueRef v_pergroup_allaggs; LLVMValueRef v_retval; LLVMValueRef v_tmpcontext; LLVMValueRef v_oldcontext; + LLVMBasicBlockRef b_advance_transval; + aggstate = op->d.agg_trans.aggstate; pertrans = op->d.agg_trans.pertrans; @@ -2284,10 +2314,22 @@ llvm_compile_expr(ExprState *state) "aggstate.all_pergroups"); v_setoff = l_int32_const(op->d.agg_trans.setoff); v_transno = l_int32_const(op->d.agg_trans.transno); - v_pergroupp = - LLVMBuildGEP(b, - l_load_gep1(b, v_allpergroupsp, v_setoff, ""), - &v_transno, 1, ""); + v_pergroup_allaggs = l_load_gep1(b, v_allpergroupsp, v_setoff, ""); + + b_advance_transval = l_bb_before_v(opblocks[i + 1], + "op.%d.advance_transval", i); + + LLVMBuildCondBr(b, + LLVMBuildICmp(b, LLVMIntEQ, + LLVMBuildPtrToInt(b, v_pergroup_allaggs, + TypeSizeT, ""), + l_sizet_const(0), ""), + opblocks[i + 1], + b_advance_transval); + + LLVMPositionBuilderAtEnd(b, b_advance_transval); + + v_pergroupp = LLVMBuildGEP(b, v_pergroup_allaggs, &v_transno, 1, ""); v_fcinfo = l_ptr_const(fcinfo, l_ptr(StructFunctionCallInfoData)); diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c index a2a9b1f7be..3cfc299947 100644 --- a/src/backend/optimizer/path/costsize.c +++ b/src/backend/optimizer/path/costsize.c @@ -128,6 +128,7 @@ bool enable_bitmapscan = true; bool enable_tidscan = true; bool enable_sort = true; bool enable_hashagg = true; +bool enable_hashagg_spill = true; bool enable_nestloop = true; bool enable_material = true; bool enable_mergejoin = true; diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c index 401299e542..b3c1043c78 100644 --- a/src/backend/optimizer/plan/planner.c +++ b/src/backend/optimizer/plan/planner.c @@ -4255,8 +4255,8 @@ consider_groupingsets_paths(PlannerInfo *root, * with. Override work_mem in that case; otherwise, we'll rely on the * sorted-input case to generate usable mixed paths. */ - if (hashsize > work_mem * 1024L && gd->rollups) - return; /* nope, won't fit */ + if (!enable_hashagg_spill && hashsize > work_mem * 1024L && gd->rollups) + return; /* nope, won't fit */ /* * We need to burst the existing rollups list into individual grouping @@ -6527,7 +6527,8 @@ add_paths_to_grouping_rel(PlannerInfo *root, RelOptInfo *input_rel, * were unable to sort above, then we'd better generate a Path, so * that we at least have one. 
*/ - if (hashaggtablesize < work_mem * 1024L || + if (enable_hashagg_spill || + hashaggtablesize < work_mem * 1024L || grouped_rel->pathlist == NIL) { /* @@ -6560,7 +6561,8 @@ add_paths_to_grouping_rel(PlannerInfo *root, RelOptInfo *input_rel, agg_final_costs, dNumGroups); - if (hashaggtablesize < work_mem * 1024L) + if (enable_hashagg_spill || + hashaggtablesize < work_mem * 1024L) add_path(grouped_rel, (Path *) create_agg_path(root, grouped_rel, @@ -6829,7 +6831,7 @@ create_partial_grouping_paths(PlannerInfo *root, * Tentatively produce a partial HashAgg Path, depending on if it * looks as if the hash table will fit in work_mem. */ - if (hashaggtablesize < work_mem * 1024L && + if ((enable_hashagg_spill || hashaggtablesize < work_mem * 1024L) && cheapest_total_path != NULL) { add_path(partially_grouped_rel, (Path *) @@ -6856,7 +6858,7 @@ create_partial_grouping_paths(PlannerInfo *root, dNumPartialPartialGroups); /* Do the same for partial paths. */ - if (hashaggtablesize < work_mem * 1024L && + if ((enable_hashagg_spill || hashaggtablesize < work_mem * 1024L) && cheapest_partial_path != NULL) { add_partial_path(partially_grouped_rel, (Path *) diff --git a/src/backend/utils/init/globals.c b/src/backend/utils/init/globals.c index 3bf96de256..b0cb1d7e6b 100644 --- a/src/backend/utils/init/globals.c +++ b/src/backend/utils/init/globals.c @@ -120,6 +120,7 @@ bool enableFsync = true; bool allowSystemTableMods = false; int work_mem = 1024; int maintenance_work_mem = 16384; +bool hashagg_mem_overflow = false; int max_parallel_maintenance_workers = 2; /* diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index fc463601ff..c8b44569df 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -951,6 +951,26 @@ static struct config_bool ConfigureNamesBool[] = true, NULL, NULL, NULL }, + { + {"enable_hashagg_spill", PGC_USERSET, QUERY_TUNING_METHOD, + gettext_noop("Enables the planner's use of hashed aggregation plans that are expected to exceed work_mem."), + NULL, + GUC_EXPLAIN + }, + &enable_hashagg_spill, + true, + NULL, NULL, NULL + }, + { + {"hashagg_mem_overflow", PGC_USERSET, QUERY_TUNING_METHOD, + gettext_noop("Enables hashed aggregation to overflow work_mem at execution time."), + NULL, + GUC_EXPLAIN + }, + &hashagg_mem_overflow, + false, + NULL, NULL, NULL + }, { {"enable_material", PGC_USERSET, QUERY_TUNING_METHOD, gettext_noop("Enables the planner's use of materialization."), diff --git a/src/backend/utils/mmgr/aset.c b/src/backend/utils/mmgr/aset.c index 6e4a343439..46e12c359e 100644 --- a/src/backend/utils/mmgr/aset.c +++ b/src/backend/utils/mmgr/aset.c @@ -458,6 +458,8 @@ AllocSetContextCreateInternal(MemoryContext parent, parent, name); + ((MemoryContext) set)->mem_allocated = set->keeper->endptr - ((char *)set->keeper) + MAXALIGN(sizeof(AllocSetContext)); + return (MemoryContext) set; } } @@ -546,6 +548,8 @@ AllocSetContextCreateInternal(MemoryContext parent, parent, name); + ((MemoryContext) set)->mem_allocated = set->keeper->endptr - ((char *)set->keeper) + MAXALIGN(sizeof(AllocSetContext)); + return (MemoryContext) set; } @@ -604,6 +608,8 @@ AllocSetReset(MemoryContext context) else { /* Normal case, release the block */ + context->mem_allocated -= block->endptr - ((char*) block); + #ifdef CLOBBER_FREED_MEMORY wipe_mem(block, block->freeptr - ((char *) block)); #endif @@ -688,11 +694,16 @@ AllocSetDelete(MemoryContext context) #endif if (block != set->keeper) + { + context->mem_allocated -= block->endptr - ((char *) block); free(block); 
+ } block = next; } + Assert(context->mem_allocated == 0); + /* Finally, free the context header, including the keeper block */ free(set); } @@ -733,6 +744,9 @@ AllocSetAlloc(MemoryContext context, Size size) block = (AllocBlock) malloc(blksize); if (block == NULL) return NULL; + + context->mem_allocated += blksize; + block->aset = set; block->freeptr = block->endptr = ((char *) block) + blksize; @@ -928,6 +942,8 @@ AllocSetAlloc(MemoryContext context, Size size) if (block == NULL) return NULL; + context->mem_allocated += blksize; + block->aset = set; block->freeptr = ((char *) block) + ALLOC_BLOCKHDRSZ; block->endptr = ((char *) block) + blksize; @@ -1028,6 +1044,9 @@ AllocSetFree(MemoryContext context, void *pointer) set->blocks = block->next; if (block->next) block->next->prev = block->prev; + + context->mem_allocated -= block->endptr - ((char*) block); + #ifdef CLOBBER_FREED_MEMORY wipe_mem(block, block->freeptr - ((char *) block)); #endif @@ -1144,6 +1163,7 @@ AllocSetRealloc(MemoryContext context, void *pointer, Size size) AllocBlock block = (AllocBlock) (((char *) chunk) - ALLOC_BLOCKHDRSZ); Size chksize; Size blksize; + Size oldblksize; /* * Try to verify that we have a sane block pointer: it should @@ -1159,6 +1179,8 @@ AllocSetRealloc(MemoryContext context, void *pointer, Size size) /* Do the realloc */ chksize = MAXALIGN(size); blksize = chksize + ALLOC_BLOCKHDRSZ + ALLOC_CHUNKHDRSZ; + oldblksize = block->endptr - ((char *)block); + block = (AllocBlock) realloc(block, blksize); if (block == NULL) { @@ -1166,6 +1188,9 @@ AllocSetRealloc(MemoryContext context, void *pointer, Size size) VALGRIND_MAKE_MEM_NOACCESS(chunk, ALLOCCHUNK_PRIVATE_LEN); return NULL; } + + context->mem_allocated += blksize - oldblksize; + block->freeptr = block->endptr = ((char *) block) + blksize; /* Update pointers since block has likely been moved */ @@ -1383,6 +1408,7 @@ AllocSetCheck(MemoryContext context) const char *name = set->header.name; AllocBlock prevblock; AllocBlock block; + int64 total_allocated = 0; for (prevblock = NULL, block = set->blocks; block != NULL; @@ -1393,6 +1419,10 @@ AllocSetCheck(MemoryContext context) long blk_data = 0; long nchunks = 0; + total_allocated += block->endptr - ((char *)block); + if (set->keeper == block) + total_allocated += MAXALIGN(sizeof(AllocSetContext)); + /* * Empty block - empty can be keeper-block only */ @@ -1479,6 +1509,8 @@ AllocSetCheck(MemoryContext context) elog(WARNING, "problem in alloc set %s: found inconsistent memory block %p", name, block); } + + Assert(total_allocated == context->mem_allocated); } #endif /* MEMORY_CONTEXT_CHECKING */ diff --git a/src/backend/utils/mmgr/mcxt.c b/src/backend/utils/mmgr/mcxt.c index b07be12236..27417af548 100644 --- a/src/backend/utils/mmgr/mcxt.c +++ b/src/backend/utils/mmgr/mcxt.c @@ -462,6 +462,29 @@ MemoryContextIsEmpty(MemoryContext context) return context->methods->is_empty(context); } +/* + * Find the memory allocated to blocks for this memory context. If recurse is + * true, also include children. + */ +int64 +MemoryContextMemAllocated(MemoryContext context, bool recurse) +{ + int64 total = context->mem_allocated; + + AssertArg(MemoryContextIsValid(context)); + + if (recurse) + { + MemoryContext child = context->firstchild; + for (child = context->firstchild; + child != NULL; + child = child->nextchild) + total += MemoryContextMemAllocated(child, true); + } + + return total; +} + /* * MemoryContextStats * Print statistics about the named context and all its descendants. 
@@ -736,6 +759,7 @@ MemoryContextCreate(MemoryContext node, node->methods = methods; node->parent = parent; node->firstchild = NULL; + node->mem_allocated = 0; node->prevchild = NULL; node->name = name; node->ident = NULL; diff --git a/src/include/executor/executor.h b/src/include/executor/executor.h index 1fb28b4596..6f1c2f9c73 100644 --- a/src/include/executor/executor.h +++ b/src/include/executor/executor.h @@ -139,10 +139,15 @@ extern TupleHashTable BuildTupleHashTableExt(PlanState *parent, extern TupleHashEntry LookupTupleHashEntry(TupleHashTable hashtable, TupleTableSlot *slot, bool *isnew); +extern TupleHashEntry LookupTupleHashEntryHash(TupleHashTable hashtable, + TupleTableSlot *slot, + bool *isnew, uint32 hash); extern TupleHashEntry FindTupleHashEntry(TupleHashTable hashtable, TupleTableSlot *slot, ExprState *eqcomp, FmgrInfo *hashfunctions); +extern uint32 TupleHashTableHash(struct tuplehash_hash *tb, + const MinimalTuple tuple); extern void ResetTupleHashTable(TupleHashTable hashtable); /* diff --git a/src/include/lib/simplehash.h b/src/include/lib/simplehash.h index 5c6bd93bc7..d51c1ea022 100644 --- a/src/include/lib/simplehash.h +++ b/src/include/lib/simplehash.h @@ -74,8 +74,10 @@ #define SH_DESTROY SH_MAKE_NAME(destroy) #define SH_RESET SH_MAKE_NAME(reset) #define SH_INSERT SH_MAKE_NAME(insert) +#define SH_INSERT_HASH SH_MAKE_NAME(insert_hash) #define SH_DELETE SH_MAKE_NAME(delete) #define SH_LOOKUP SH_MAKE_NAME(lookup) +#define SH_LOOKUP_HASH SH_MAKE_NAME(lookup_hash) #define SH_GROW SH_MAKE_NAME(grow) #define SH_START_ITERATE SH_MAKE_NAME(start_iterate) #define SH_START_ITERATE_AT SH_MAKE_NAME(start_iterate_at) @@ -144,7 +146,11 @@ SH_SCOPE void SH_DESTROY(SH_TYPE * tb); SH_SCOPE void SH_RESET(SH_TYPE * tb); SH_SCOPE void SH_GROW(SH_TYPE * tb, uint32 newsize); SH_SCOPE SH_ELEMENT_TYPE *SH_INSERT(SH_TYPE * tb, SH_KEY_TYPE key, bool *found); +SH_SCOPE SH_ELEMENT_TYPE *SH_INSERT_HASH(SH_TYPE * tb, SH_KEY_TYPE key, + uint32 hash, bool *found); SH_SCOPE SH_ELEMENT_TYPE *SH_LOOKUP(SH_TYPE * tb, SH_KEY_TYPE key); +SH_SCOPE SH_ELEMENT_TYPE *SH_LOOKUP_HASH(SH_TYPE * tb, SH_KEY_TYPE key, + uint32 hash); SH_SCOPE bool SH_DELETE(SH_TYPE * tb, SH_KEY_TYPE key); SH_SCOPE void SH_START_ITERATE(SH_TYPE * tb, SH_ITERATOR * iter); SH_SCOPE void SH_START_ITERATE_AT(SH_TYPE * tb, SH_ITERATOR * iter, uint32 at); @@ -499,7 +505,14 @@ SH_GROW(SH_TYPE * tb, uint32 newsize) SH_SCOPE SH_ELEMENT_TYPE * SH_INSERT(SH_TYPE * tb, SH_KEY_TYPE key, bool *found) { - uint32 hash = SH_HASH_KEY(tb, key); + uint32 hash = SH_HASH_KEY(tb, key); + + return SH_INSERT_HASH(tb, key, hash, found); +} + +SH_SCOPE SH_ELEMENT_TYPE * +SH_INSERT_HASH(SH_TYPE * tb, SH_KEY_TYPE key, uint32 hash, bool *found) +{ uint32 startelem; uint32 curelem; SH_ELEMENT_TYPE *data; @@ -669,7 +682,14 @@ restart: SH_SCOPE SH_ELEMENT_TYPE * SH_LOOKUP(SH_TYPE * tb, SH_KEY_TYPE key) { - uint32 hash = SH_HASH_KEY(tb, key); + uint32 hash = SH_HASH_KEY(tb, key); + + return SH_LOOKUP_HASH(tb, key, hash); +} + +SH_SCOPE SH_ELEMENT_TYPE * +SH_LOOKUP_HASH(SH_TYPE * tb, SH_KEY_TYPE key, uint32 hash) +{ const uint32 startelem = SH_INITIAL_BUCKET(tb, hash); uint32 curelem = startelem; @@ -971,8 +991,10 @@ SH_STAT(SH_TYPE * tb) #undef SH_DESTROY #undef SH_RESET #undef SH_INSERT +#undef SH_INSERT_HASH #undef SH_DELETE #undef SH_LOOKUP +#undef SH_LOOKUP_HASH #undef SH_GROW #undef SH_START_ITERATE #undef SH_START_ITERATE_AT diff --git a/src/include/miscadmin.h b/src/include/miscadmin.h index 61a24c2e3c..be9ae1028d 100644 --- a/src/include/miscadmin.h +++ 
b/src/include/miscadmin.h @@ -244,6 +244,7 @@ extern bool enableFsync; extern PGDLLIMPORT bool allowSystemTableMods; extern PGDLLIMPORT int work_mem; extern PGDLLIMPORT int maintenance_work_mem; +extern PGDLLIMPORT bool hashagg_mem_overflow; extern PGDLLIMPORT int max_parallel_maintenance_workers; extern int VacuumCostPageHit; diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h index 98bdcbcef5..419de41170 100644 --- a/src/include/nodes/execnodes.h +++ b/src/include/nodes/execnodes.h @@ -2022,13 +2022,24 @@ typedef struct AggState HeapTuple grp_firstTuple; /* copy of first tuple of current group */ /* these fields are used in AGG_HASHED and AGG_MIXED modes: */ bool table_filled; /* hash table filled yet? */ - int num_hashes; + int num_hashes; /* number of hash tables active at once */ + bool hash_spilled; /* any hash table ever spilled? */ + struct HashAggSpill *hash_spills; /* HashAggSpill for each hash table, + exists only during first pass if spilled */ + TupleTableSlot *hash_spill_slot; /* slot for reading from spill files */ + uint64 hash_mem_init; /* initial hash table memory usage */ + uint64 hash_mem_peak; /* peak hash table memory usage */ + uint64 hash_mem_current; /* current hash table memory usage */ + uint64 hash_disk_used; /* bytes of disk space used */ + int hash_batches_used; /* batches used during entire execution */ + List *hash_batches; /* hash batches remaining to be processed */ + AggStatePerHash perhash; /* array of per-hashtable data */ AggStatePerGroup *hash_pergroup; /* grouping set indexed array of * per-group pointers */ /* support for evaluation of agg input expressions: */ -#define FIELDNO_AGGSTATE_ALL_PERGROUPS 34 +#define FIELDNO_AGGSTATE_ALL_PERGROUPS 43 AggStatePerGroup *all_pergroups; /* array of first ->pergroups, than * ->hash_pergroup */ ProjectionInfo *combinedproj; /* projection machinery */ @@ -2200,7 +2211,7 @@ typedef struct HashInstrumentation int nbuckets_original; /* planned number of buckets */ int nbatch; /* number of batches at end of execution */ int nbatch_original; /* planned number of batches */ - size_t space_peak; /* speak memory usage in bytes */ + size_t space_peak; /* peak memory usage in bytes */ } HashInstrumentation; /* ---------------- diff --git a/src/include/nodes/memnodes.h b/src/include/nodes/memnodes.h index dbae98d3d9..df0ae3625c 100644 --- a/src/include/nodes/memnodes.h +++ b/src/include/nodes/memnodes.h @@ -79,6 +79,7 @@ typedef struct MemoryContextData /* these two fields are placed here to minimize alignment wastage: */ bool isReset; /* T = no space alloced since last reset */ bool allowInCritSection; /* allow palloc in critical section */ + int64 mem_allocated; /* track memory allocated for this context */ const MemoryContextMethods *methods; /* virtual function table */ MemoryContext parent; /* NULL if no parent (toplevel context) */ MemoryContext firstchild; /* head of linked list of children */ diff --git a/src/include/optimizer/cost.h b/src/include/optimizer/cost.h index b3d0b4f6fb..b72e2d0829 100644 --- a/src/include/optimizer/cost.h +++ b/src/include/optimizer/cost.h @@ -54,6 +54,7 @@ extern PGDLLIMPORT bool enable_bitmapscan; extern PGDLLIMPORT bool enable_tidscan; extern PGDLLIMPORT bool enable_sort; extern PGDLLIMPORT bool enable_hashagg; +extern PGDLLIMPORT bool enable_hashagg_spill; extern PGDLLIMPORT bool enable_nestloop; extern PGDLLIMPORT bool enable_material; extern PGDLLIMPORT bool enable_mergejoin; diff --git a/src/include/utils/memutils.h b/src/include/utils/memutils.h index 
ffe6de536e..6a837bc990 100644 --- a/src/include/utils/memutils.h +++ b/src/include/utils/memutils.h @@ -82,6 +82,7 @@ extern void MemoryContextSetParent(MemoryContext context, extern Size GetMemoryChunkSpace(void *pointer); extern MemoryContext MemoryContextGetParent(MemoryContext context); extern bool MemoryContextIsEmpty(MemoryContext context); +extern int64 MemoryContextMemAllocated(MemoryContext context, bool recurse); extern void MemoryContextStats(MemoryContext context); extern void MemoryContextStatsDetail(MemoryContext context, int max_children); extern void MemoryContextAllowInCriticalSection(MemoryContext context, diff --git a/src/test/regress/expected/aggregates.out b/src/test/regress/expected/aggregates.out index ef8eec3fbf..8fa4c7466b 100644 --- a/src/test/regress/expected/aggregates.out +++ b/src/test/regress/expected/aggregates.out @@ -2331,3 +2331,95 @@ explain (costs off) -> Seq Scan on onek (8 rows) +-- +-- Compare results between plans using sorting and plans using hash +-- aggregation. Force spilling in both cases by setting work_mem low. +-- +set work_mem='64kB'; +-- Produce results with sorting. +set enable_hashagg = false; +set jit_above_cost = 0; +explain (costs off) +select g%100000 as c1, sum(g::numeric) as c2, count(*) as c3 + from generate_series(0, 199999) g + group by g%100000; + QUERY PLAN +------------------------------------------------ + GroupAggregate + Group Key: ((g % 100000)) + -> Sort + Sort Key: ((g % 100000)) + -> Function Scan on generate_series g +(5 rows) + +create table agg_group_1 as +select g%100000 as c1, sum(g::numeric) as c2, count(*) as c3 + from generate_series(0, 199999) g + group by g%100000; +set jit_above_cost to default; +create table agg_group_2 as +select (g/2)::numeric as c1, sum(7::int4) as c2, count(*) as c3 + from generate_series(0, 1999) g + group by g/2; +create table agg_group_3 as +select (g/2)::numeric as c1, array_agg(g::numeric) as c2, count(*) as c3 + from generate_series(0, 1999) g + group by g/2; +-- Produce results with hash aggregation +set enable_hashagg = true; +set enable_sort = false; +set jit_above_cost = 0; +explain (costs off) +select g%100000 as c1, sum(g::numeric) as c2, count(*) as c3 + from generate_series(0, 199999) g + group by g%100000; + QUERY PLAN +------------------------------------------ + HashAggregate + Group Key: (g % 100000) + -> Function Scan on generate_series g +(3 rows) + +create table agg_hash_1 as +select g%100000 as c1, sum(g::numeric) as c2, count(*) as c3 + from generate_series(0, 199999) g + group by g%100000; +set jit_above_cost to default; +create table agg_hash_2 as +select (g/2)::numeric as c1, sum(7::int4) as c2, count(*) as c3 + from generate_series(0, 1999) g + group by g/2; +create table agg_hash_3 as +select (g/2)::numeric as c1, array_agg(g::numeric) as c2, count(*) as c3 + from generate_series(0, 1999) g + group by g/2; +set enable_sort = true; +set work_mem to default; +-- Compare group aggregation results to hash aggregation results +(select * from agg_hash_1 except select * from agg_group_1) + union all +(select * from agg_group_1 except select * from agg_hash_1); + c1 | c2 | c3 +----+----+---- +(0 rows) + +(select * from agg_hash_2 except select * from agg_group_2) + union all +(select * from agg_group_2 except select * from agg_hash_2); + c1 | c2 | c3 +----+----+---- +(0 rows) + +(select * from agg_hash_3 except select * from agg_group_3) + union all +(select * from agg_group_3 except select * from agg_hash_3); + c1 | c2 | c3 +----+----+---- +(0 rows) + +drop table 
agg_group_1; +drop table agg_group_2; +drop table agg_group_3; +drop table agg_hash_1; +drop table agg_hash_2; +drop table agg_hash_3; diff --git a/src/test/regress/expected/groupingsets.out b/src/test/regress/expected/groupingsets.out index 5d92b08d20..7d7fc929c7 100644 --- a/src/test/regress/expected/groupingsets.out +++ b/src/test/regress/expected/groupingsets.out @@ -1494,22 +1494,18 @@ explain (costs off) count(hundred), count(thousand), count(twothousand), count(*) from tenk1 group by grouping sets (unique1,twothousand,thousand,hundred,ten,four,two); - QUERY PLAN -------------------------------- - MixedAggregate - Hash Key: two - Hash Key: four - Hash Key: ten + QUERY PLAN +------------------------- + HashAggregate + Hash Key: unique1 + Hash Key: twothousand + Hash Key: thousand Hash Key: hundred - Group Key: unique1 - Sort Key: twothousand - Group Key: twothousand - Sort Key: thousand - Group Key: thousand - -> Sort - Sort Key: unique1 - -> Seq Scan on tenk1 -(13 rows) + Hash Key: ten + Hash Key: four + Hash Key: two + -> Seq Scan on tenk1 +(9 rows) explain (costs off) select unique1, @@ -1517,18 +1513,16 @@ explain (costs off) count(hundred), count(thousand), count(twothousand), count(*) from tenk1 group by grouping sets (unique1,hundred,ten,four,two); - QUERY PLAN -------------------------------- - MixedAggregate - Hash Key: two - Hash Key: four - Hash Key: ten + QUERY PLAN +------------------------- + HashAggregate + Hash Key: unique1 Hash Key: hundred - Group Key: unique1 - -> Sort - Sort Key: unique1 - -> Seq Scan on tenk1 -(9 rows) + Hash Key: ten + Hash Key: four + Hash Key: two + -> Seq Scan on tenk1 +(7 rows) set work_mem = '384kB'; explain (costs off) @@ -1537,21 +1531,18 @@ explain (costs off) count(hundred), count(thousand), count(twothousand), count(*) from tenk1 group by grouping sets (unique1,twothousand,thousand,hundred,ten,four,two); - QUERY PLAN -------------------------------- - MixedAggregate - Hash Key: two - Hash Key: four - Hash Key: ten - Hash Key: hundred + QUERY PLAN +------------------------- + HashAggregate + Hash Key: unique1 + Hash Key: twothousand Hash Key: thousand - Group Key: unique1 - Sort Key: twothousand - Group Key: twothousand - -> Sort - Sort Key: unique1 - -> Seq Scan on tenk1 -(12 rows) + Hash Key: hundred + Hash Key: ten + Hash Key: four + Hash Key: two + -> Seq Scan on tenk1 +(9 rows) -- check collation-sensitive matching between grouping expressions -- (similar to a check for aggregates, but there are additional code @@ -1578,4 +1569,123 @@ select v||'a', case when grouping(v||'a') = 1 then 1 else 0 end, count(*) | 1 | 2 (4 rows) +-- +-- Compare results between plans using sorting and plans using hash +-- aggregation. Force spilling in both cases by setting work_mem low. +-- +SET work_mem='64kB'; +-- Produce results with sorting. 
+set enable_hashagg = false; +set jit_above_cost = 0; +explain (costs off) +select g1000, g100, g10, sum(g::numeric), count(*), max(g::text) from + (select g%1000 as g1000, g%100 as g100, g%10 as g10, g + from generate_series(0,199999) g) s +group by cube (g1000,g100,g10); + QUERY PLAN +--------------------------------------------------------------- + GroupAggregate + Group Key: ((g.g % 1000)), ((g.g % 100)), ((g.g % 10)) + Group Key: ((g.g % 1000)), ((g.g % 100)) + Group Key: ((g.g % 1000)) + Group Key: () + Sort Key: ((g.g % 100)), ((g.g % 10)) + Group Key: ((g.g % 100)), ((g.g % 10)) + Group Key: ((g.g % 100)) + Sort Key: ((g.g % 10)), ((g.g % 1000)) + Group Key: ((g.g % 10)), ((g.g % 1000)) + Group Key: ((g.g % 10)) + -> Sort + Sort Key: ((g.g % 1000)), ((g.g % 100)), ((g.g % 10)) + -> Function Scan on generate_series g +(14 rows) + +create table gs_group_1 as +select g1000, g100, g10, sum(g::numeric), count(*), max(g::text) from + (select g%1000 as g1000, g%100 as g100, g%10 as g10, g + from generate_series(0,199999) g) s +group by cube (g1000,g100,g10); +set jit_above_cost to default; +create table gs_group_2 as +select g1000, g100, g10, sum(g::numeric), count(*), max(g::text) from + (select g/20 as g1000, g/200 as g100, g/2000 as g10, g + from generate_series(0,19999) g) s +group by cube (g1000,g100,g10); +create table gs_group_3 as +select g100, g10, array_agg(g) as a, count(*) as c, max(g::text) as m from + (select g/200 as g100, g/2000 as g10, g + from generate_series(0,19999) g) s +group by grouping sets (g100,g10); +-- Produce results with hash aggregation. +set enable_hashagg = true; +set enable_sort = false; +set work_mem='64kB'; +set jit_above_cost = 0; +explain (costs off) +select g1000, g100, g10, sum(g::numeric), count(*), max(g::text) from + (select g%1000 as g1000, g%100 as g100, g%10 as g10, g + from generate_series(0,199999) g) s +group by cube (g1000,g100,g10); + QUERY PLAN +--------------------------------------------------- + MixedAggregate + Hash Key: (g.g % 1000), (g.g % 100), (g.g % 10) + Hash Key: (g.g % 1000), (g.g % 100) + Hash Key: (g.g % 1000) + Hash Key: (g.g % 100), (g.g % 10) + Hash Key: (g.g % 100) + Hash Key: (g.g % 10), (g.g % 1000) + Hash Key: (g.g % 10) + Group Key: () + -> Function Scan on generate_series g +(10 rows) + +create table gs_hash_1 as +select g1000, g100, g10, sum(g::numeric), count(*), max(g::text) from + (select g%1000 as g1000, g%100 as g100, g%10 as g10, g + from generate_series(0,199999) g) s +group by cube (g1000,g100,g10); +set jit_above_cost to default; +create table gs_hash_2 as +select g1000, g100, g10, sum(g::numeric), count(*), max(g::text) from + (select g/20 as g1000, g/200 as g100, g/2000 as g10, g + from generate_series(0,19999) g) s +group by cube (g1000,g100,g10); +create table gs_hash_3 as +select g100, g10, array_agg(g) as a, count(*) as c, max(g::text) as m from + (select g/200 as g100, g/2000 as g10, g + from generate_series(0,19999) g) s +group by grouping sets (g100,g10); +set enable_sort = true; +set work_mem to default; +-- Compare results +(select * from gs_hash_1 except select * from gs_group_1) + union all +(select * from gs_group_1 except select * from gs_hash_1); + g1000 | g100 | g10 | sum | count | max +-------+------+-----+-----+-------+----- +(0 rows) + +(select * from gs_hash_2 except select * from gs_group_2) + union all +(select * from gs_group_2 except select * from gs_hash_2); + g1000 | g100 | g10 | sum | count | max +-------+------+-----+-----+-------+----- +(0 rows) + +(select 
g100,g10,unnest(a),c,m from gs_hash_3 except
+ select g100,g10,unnest(a),c,m from gs_group_3)
+ union all
+(select g100,g10,unnest(a),c,m from gs_group_3 except
+ select g100,g10,unnest(a),c,m from gs_hash_3);
+ g100 | g10 | unnest | c | m 
+------+-----+--------+---+---
+(0 rows)
+
+drop table gs_group_1;
+drop table gs_group_2;
+drop table gs_group_3;
+drop table gs_hash_1;
+drop table gs_hash_2;
+drop table gs_hash_3;
 -- end
diff --git a/src/test/regress/expected/select_distinct.out b/src/test/regress/expected/select_distinct.out
index f3696c6d1d..11c6f50fbf 100644
--- a/src/test/regress/expected/select_distinct.out
+++ b/src/test/regress/expected/select_distinct.out
@@ -148,6 +148,68 @@ SELECT count(*) FROM
     4
 (1 row)
 
+--
+-- Compare results between plans using sorting and plans using hash
+-- aggregation. Force spilling in both cases by setting work_mem low.
+--
+SET work_mem='64kB';
+-- Produce results with sorting.
+SET enable_hashagg=FALSE;
+SET jit_above_cost=0;
+EXPLAIN (costs off)
+SELECT DISTINCT g%1000 FROM generate_series(0,9999) g;
+                   QUERY PLAN                   
+------------------------------------------------
+ Unique
+   ->  Sort
+         Sort Key: ((g % 1000))
+         ->  Function Scan on generate_series g
+(4 rows)
+
+CREATE TABLE distinct_group_1 AS
+SELECT DISTINCT g%1000 FROM generate_series(0,9999) g;
+SET jit_above_cost TO DEFAULT;
+CREATE TABLE distinct_group_2 AS
+SELECT DISTINCT (g%1000)::text FROM generate_series(0,9999) g;
+SET enable_hashagg=TRUE;
+-- Produce results with hash aggregation.
+SET enable_sort=FALSE;
+SET jit_above_cost=0;
+EXPLAIN (costs off)
+SELECT DISTINCT g%1000 FROM generate_series(0,9999) g;
+                QUERY PLAN                
+------------------------------------------
+ HashAggregate
+   Group Key: (g % 1000)
+   ->  Function Scan on generate_series g
+(3 rows)
+
+CREATE TABLE distinct_hash_1 AS
+SELECT DISTINCT g%1000 FROM generate_series(0,9999) g;
+SET jit_above_cost TO DEFAULT;
+CREATE TABLE distinct_hash_2 AS
+SELECT DISTINCT (g%1000)::text FROM generate_series(0,9999) g;
+SET enable_sort=TRUE;
+SET work_mem TO DEFAULT;
+-- Compare results
+(SELECT * FROM distinct_hash_1 EXCEPT SELECT * FROM distinct_group_1)
+  UNION ALL
+(SELECT * FROM distinct_group_1 EXCEPT SELECT * FROM distinct_hash_1);
+ ?column? 
+----------
+(0 rows)
+
+(SELECT * FROM distinct_hash_2 EXCEPT SELECT * FROM distinct_group_2)
+  UNION ALL
+(SELECT * FROM distinct_group_2 EXCEPT SELECT * FROM distinct_hash_2);
+ text 
+------
+(0 rows)
+
+DROP TABLE distinct_hash_1;
+DROP TABLE distinct_hash_2;
+DROP TABLE distinct_group_1;
+DROP TABLE distinct_group_2;
 --
 -- Also, some tests of IS DISTINCT FROM, which doesn't quite deserve its
 -- very own regression file.
diff --git a/src/test/regress/expected/sysviews.out b/src/test/regress/expected/sysviews.out
index a1c90eb905..c40bf6c16e 100644
--- a/src/test/regress/expected/sysviews.out
+++ b/src/test/regress/expected/sysviews.out
@@ -75,6 +75,7 @@ select name, setting from pg_settings where name like 'enable%';
  enable_bitmapscan              | on
  enable_gathermerge             | on
  enable_hashagg                 | on
+ enable_hashagg_spill           | on
  enable_hashjoin                | on
  enable_indexonlyscan           | on
  enable_indexscan               | on
@@ -89,7 +90,7 @@ select name, setting from pg_settings where name like 'enable%';
  enable_seqscan                 | on
  enable_sort                    | on
  enable_tidscan                 | on
-(17 rows)
+(18 rows)
 
 -- Test that the pg_timezone_names and pg_timezone_abbrevs views are
 -- more-or-less working.
We can't test their contents in any great detail diff --git a/src/test/regress/sql/aggregates.sql b/src/test/regress/sql/aggregates.sql index 17fb256aec..bcd336c581 100644 --- a/src/test/regress/sql/aggregates.sql +++ b/src/test/regress/sql/aggregates.sql @@ -1017,3 +1017,91 @@ select v||'a', case when v||'a' = 'aa' then 1 else 0 end, count(*) explain (costs off) select 1 from tenk1 where (hundred, thousand) in (select twothousand, twothousand from onek); + +-- +-- Compare results between plans using sorting and plans using hash +-- aggregation. Force spilling in both cases by setting work_mem low. +-- + +set work_mem='64kB'; + +-- Produce results with sorting. + +set enable_hashagg = false; + +set jit_above_cost = 0; + +explain (costs off) +select g%100000 as c1, sum(g::numeric) as c2, count(*) as c3 + from generate_series(0, 199999) g + group by g%100000; + +create table agg_group_1 as +select g%100000 as c1, sum(g::numeric) as c2, count(*) as c3 + from generate_series(0, 199999) g + group by g%100000; + +set jit_above_cost to default; + +create table agg_group_2 as +select (g/2)::numeric as c1, sum(7::int4) as c2, count(*) as c3 + from generate_series(0, 1999) g + group by g/2; + +create table agg_group_3 as +select (g/2)::numeric as c1, array_agg(g::numeric) as c2, count(*) as c3 + from generate_series(0, 1999) g + group by g/2; + +-- Produce results with hash aggregation + +set enable_hashagg = true; +set enable_sort = false; + +set jit_above_cost = 0; + +explain (costs off) +select g%100000 as c1, sum(g::numeric) as c2, count(*) as c3 + from generate_series(0, 199999) g + group by g%100000; + +create table agg_hash_1 as +select g%100000 as c1, sum(g::numeric) as c2, count(*) as c3 + from generate_series(0, 199999) g + group by g%100000; + +set jit_above_cost to default; + +create table agg_hash_2 as +select (g/2)::numeric as c1, sum(7::int4) as c2, count(*) as c3 + from generate_series(0, 1999) g + group by g/2; + +create table agg_hash_3 as +select (g/2)::numeric as c1, array_agg(g::numeric) as c2, count(*) as c3 + from generate_series(0, 1999) g + group by g/2; + +set enable_sort = true; +set work_mem to default; + +-- Compare group aggregation results to hash aggregation results + +(select * from agg_hash_1 except select * from agg_group_1) + union all +(select * from agg_group_1 except select * from agg_hash_1); + +(select * from agg_hash_2 except select * from agg_group_2) + union all +(select * from agg_group_2 except select * from agg_hash_2); + +(select * from agg_hash_3 except select * from agg_group_3) + union all +(select * from agg_group_3 except select * from agg_hash_3); + +drop table agg_group_1; +drop table agg_group_2; +drop table agg_group_3; +drop table agg_hash_1; +drop table agg_hash_2; +drop table agg_hash_3; diff --git a/src/test/regress/sql/groupingsets.sql b/src/test/regress/sql/groupingsets.sql index d8f78fcc00..264c3ab5c2 100644 --- a/src/test/regress/sql/groupingsets.sql +++ b/src/test/regress/sql/groupingsets.sql @@ -429,4 +429,103 @@ select v||'a', case when grouping(v||'a') = 1 then 1 else 0 end, count(*) from unnest(array[1,1], array['a','b']) u(i,v) group by rollup(i, v||'a') order by 1,3; +-- +-- Compare results between plans using sorting and plans using hash +-- aggregation. Force spilling in both cases by setting work_mem low. +-- + +SET work_mem='64kB'; + +-- Produce results with sorting. 
+ +set enable_hashagg = false; + +set jit_above_cost = 0; + +explain (costs off) +select g1000, g100, g10, sum(g::numeric), count(*), max(g::text) from + (select g%1000 as g1000, g%100 as g100, g%10 as g10, g + from generate_series(0,199999) g) s +group by cube (g1000,g100,g10); + +create table gs_group_1 as +select g1000, g100, g10, sum(g::numeric), count(*), max(g::text) from + (select g%1000 as g1000, g%100 as g100, g%10 as g10, g + from generate_series(0,199999) g) s +group by cube (g1000,g100,g10); + +set jit_above_cost to default; + +create table gs_group_2 as +select g1000, g100, g10, sum(g::numeric), count(*), max(g::text) from + (select g/20 as g1000, g/200 as g100, g/2000 as g10, g + from generate_series(0,19999) g) s +group by cube (g1000,g100,g10); + +create table gs_group_3 as +select g100, g10, array_agg(g) as a, count(*) as c, max(g::text) as m from + (select g/200 as g100, g/2000 as g10, g + from generate_series(0,19999) g) s +group by grouping sets (g100,g10); + +-- Produce results with hash aggregation. + +set enable_hashagg = true; +set enable_sort = false; +set work_mem='64kB'; + +set jit_above_cost = 0; + +explain (costs off) +select g1000, g100, g10, sum(g::numeric), count(*), max(g::text) from + (select g%1000 as g1000, g%100 as g100, g%10 as g10, g + from generate_series(0,199999) g) s +group by cube (g1000,g100,g10); + +create table gs_hash_1 as +select g1000, g100, g10, sum(g::numeric), count(*), max(g::text) from + (select g%1000 as g1000, g%100 as g100, g%10 as g10, g + from generate_series(0,199999) g) s +group by cube (g1000,g100,g10); + +set jit_above_cost to default; + +create table gs_hash_2 as +select g1000, g100, g10, sum(g::numeric), count(*), max(g::text) from + (select g/20 as g1000, g/200 as g100, g/2000 as g10, g + from generate_series(0,19999) g) s +group by cube (g1000,g100,g10); + +create table gs_hash_3 as +select g100, g10, array_agg(g) as a, count(*) as c, max(g::text) as m from + (select g/200 as g100, g/2000 as g10, g + from generate_series(0,19999) g) s +group by grouping sets (g100,g10); + +set enable_sort = true; +set work_mem to default; + +-- Compare results + +(select * from gs_hash_1 except select * from gs_group_1) + union all +(select * from gs_group_1 except select * from gs_hash_1); + +(select * from gs_hash_2 except select * from gs_group_2) + union all +(select * from gs_group_2 except select * from gs_hash_2); + +(select g100,g10,unnest(a),c,m from gs_hash_3 except + select g100,g10,unnest(a),c,m from gs_group_3) + union all +(select g100,g10,unnest(a),c,m from gs_group_3 except + select g100,g10,unnest(a),c,m from gs_hash_3); + +drop table gs_group_1; +drop table gs_group_2; +drop table gs_group_3; +drop table gs_hash_1; +drop table gs_hash_2; +drop table gs_hash_3; + -- end diff --git a/src/test/regress/sql/select_distinct.sql b/src/test/regress/sql/select_distinct.sql index a605e86449..33102744eb 100644 --- a/src/test/regress/sql/select_distinct.sql +++ b/src/test/regress/sql/select_distinct.sql @@ -45,6 +45,68 @@ SELECT count(*) FROM SELECT count(*) FROM (SELECT DISTINCT two, four, two FROM tenk1) ss; +-- +-- Compare results between plans using sorting and plans using hash +-- aggregation. Force spilling in both cases by setting work_mem low. +-- + +SET work_mem='64kB'; + +-- Produce results with sorting. 
+
+SET enable_hashagg=FALSE;
+
+SET jit_above_cost=0;
+
+EXPLAIN (costs off)
+SELECT DISTINCT g%1000 FROM generate_series(0,9999) g;
+
+CREATE TABLE distinct_group_1 AS
+SELECT DISTINCT g%1000 FROM generate_series(0,9999) g;
+
+SET jit_above_cost TO DEFAULT;
+
+CREATE TABLE distinct_group_2 AS
+SELECT DISTINCT (g%1000)::text FROM generate_series(0,9999) g;
+
+SET enable_hashagg=TRUE;
+
+-- Produce results with hash aggregation.
+
+SET enable_sort=FALSE;
+
+SET jit_above_cost=0;
+
+EXPLAIN (costs off)
+SELECT DISTINCT g%1000 FROM generate_series(0,9999) g;
+
+CREATE TABLE distinct_hash_1 AS
+SELECT DISTINCT g%1000 FROM generate_series(0,9999) g;
+
+SET jit_above_cost TO DEFAULT;
+
+CREATE TABLE distinct_hash_2 AS
+SELECT DISTINCT (g%1000)::text FROM generate_series(0,9999) g;
+
+SET enable_sort=TRUE;
+
+SET work_mem TO DEFAULT;
+
+-- Compare results
+
+(SELECT * FROM distinct_hash_1 EXCEPT SELECT * FROM distinct_group_1)
+  UNION ALL
+(SELECT * FROM distinct_group_1 EXCEPT SELECT * FROM distinct_hash_1);
+
+(SELECT * FROM distinct_hash_2 EXCEPT SELECT * FROM distinct_group_2)
+  UNION ALL
+(SELECT * FROM distinct_group_2 EXCEPT SELECT * FROM distinct_hash_2);
+
+DROP TABLE distinct_hash_1;
+DROP TABLE distinct_hash_2;
+DROP TABLE distinct_group_1;
+DROP TABLE distinct_group_2;
+
 --
 -- Also, some tests of IS DISTINCT FROM, which doesn't quite deserve its
 -- very own regression file.
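A note for anyone reviewing the patch: the memutils.h hunk above only adds the
declaration of MemoryContextMemAllocated(); the implementation lives with the
rest of the context machinery in src/backend/utils/mmgr/mcxt.c. The idea is
roughly the following sketch, assuming each context maintains a running
mem_allocated count of the bytes obtained from malloc() for its blocks (that
counter is the memory-accounting half of this patch):

/*
 * Sketch (simplified): report the number of bytes this context has
 * obtained from the OS, optionally including all of its descendants.
 * Assumes MemoryContextData carries a mem_allocated field that the
 * allocators keep up to date as blocks are acquired and released.
 */
int64
MemoryContextMemAllocated(MemoryContext context, bool recurse)
{
	int64		total = context->mem_allocated;

	AssertArg(MemoryContextIsValid(context));

	if (recurse)
	{
		MemoryContext child;

		for (child = context->firstchild;
			 child != NULL;
			 child = child->nextchild)
			total += MemoryContextMemAllocated(child, true);
	}

	return total;
}

The recurse flag is the interesting part for hash aggregation: the hash table
and the per-group transition values live in contexts underneath the Agg node's
context, so the work_mem check wants the total for the whole subtree rather
than for any single context.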