>>>>> "Andrew" == Andrew Gierth <and...@tao11.riddles.org.uk> writes:
 Andrew> My inclination is to fix this in the planner rather than the
 Andrew> executor; there seems no good reason to actually hash a
 Andrew> duplicate column more than once.

I take this back; I don't believe it's possible to eliminate duplicates
in all cases. Consider (a,b) IN (select c,c from ...), where a, b, and c
are all different types; I don't think we can assume that the (a=c) and
(b=c) cross-type comparisons will necessarily induce the same hash
function on c, so we might legitimately need to keep it duplicated.

So I'm going with a simpler method: ensure the array is adequately sized
at execution time, and don't touch the planner at all.

Draft patch is attached; will commit it later.

-- 
Andrew (irc:RhodiumToad)
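For concreteness, the problematic shape looks like this (a minimal
sketch: the table names t_outer/t_inner and the particular integer
types are invented for illustration; whether any given trio of types
actually hashes compatibly depends on the operator families involved):

    create table t_outer (a int2, b int8);
    create table t_inner (c int4);

    -- The subquery is uniqified on (c, c).  The two copies of c feed
    -- two different cross-type comparisons, (a = c) and (b = c), and
    -- in general nothing guarantees those induce the same hash
    -- function on c, so the duplicated grouping column has to stay.
    select 1 from t_outer
     where (a, b) in (select c, c from t_inner);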
diff --git a/src/backend/executor/nodeAgg.c b/src/backend/executor/nodeAgg.c
index 5b4a602952..6b8ef40599 100644
--- a/src/backend/executor/nodeAgg.c
+++ b/src/backend/executor/nodeAgg.c
@@ -1312,9 +1312,14 @@ build_hash_table(AggState *aggstate)
  * by themselves, and secondly ctids for row-marks.
  *
  * To eliminate duplicates, we build a bitmapset of the needed columns, and
- * then build an array of the columns included in the hashtable. Note that
- * the array is preserved over ExecReScanAgg, so we allocate it in the
- * per-query context (unlike the hash table itself).
+ * then build an array of the columns included in the hashtable. We might
+ * still have duplicates if the passed-in grpColIdx has them, which can happen
+ * in edge cases from semijoins/distinct; these can't always be removed,
+ * because it's not certain that the duplicate cols will be using the same
+ * hash function.
+ *
+ * Note that the array is preserved over ExecReScanAgg, so we allocate it in
+ * the per-query context (unlike the hash table itself).
  */
 static void
 find_hash_columns(AggState *aggstate)
@@ -1335,6 +1340,7 @@ find_hash_columns(AggState *aggstate)
 		AttrNumber *grpColIdx = perhash->aggnode->grpColIdx;
 		List	   *hashTlist = NIL;
 		TupleDesc	hashDesc;
+		int			maxCols;
 		int			i;
 
 		perhash->largestGrpColIdx = 0;
@@ -1359,15 +1365,24 @@ find_hash_columns(AggState *aggstate)
 					colnos = bms_del_member(colnos, attnum);
 			}
 		}
-		/* Add in all the grouping columns */
-		for (i = 0; i < perhash->numCols; i++)
-			colnos = bms_add_member(colnos, grpColIdx[i]);
+
+		/*
+		 * Compute maximum number of input columns accounting for possible
+		 * duplications in the grpColIdx array, which can happen in some edge
+		 * cases where HashAggregate was generated as part of a semijoin or a
+		 * DISTINCT.
+		 */
+		maxCols = bms_num_members(colnos) + perhash->numCols;
 
 		perhash->hashGrpColIdxInput =
-			palloc(bms_num_members(colnos) * sizeof(AttrNumber));
+			palloc(maxCols * sizeof(AttrNumber));
 		perhash->hashGrpColIdxHash =
 			palloc(perhash->numCols * sizeof(AttrNumber));
 
+		/* Add all the grouping columns to colnos */
+		for (i = 0; i < perhash->numCols; i++)
+			colnos = bms_add_member(colnos, grpColIdx[i]);
+
 		/*
 		 * First build mapping for columns directly hashed. These are the
 		 * first, because they'll be accessed when computing hash values and
diff --git a/src/test/regress/expected/aggregates.out b/src/test/regress/expected/aggregates.out
index fed69dc9e1..2e5ce8cc32 100644
--- a/src/test/regress/expected/aggregates.out
+++ b/src/test/regress/expected/aggregates.out
@@ -2270,3 +2270,21 @@ select v||'a', case when v||'a' = 'aa' then 1 else 0 end, count(*)
  ba | 0 | 1
 (2 rows)
 
+-- Make sure that generation of HashAggregate for uniqification purposes
+-- does not lead to array overflow due to unexpected duplicate hash keys
+-- see cafeejokku0u+a_a9r9316djw-yw3-+gtgvy3ju655qrhr3j...@mail.gmail.com
+explain (costs off)
+  select 1 from tenk1
+   where (hundred, thousand) in (select twothousand, twothousand from onek);
+                         QUERY PLAN
+-------------------------------------------------------------
+ Hash Join
+   Hash Cond: (tenk1.hundred = onek.twothousand)
+   ->  Seq Scan on tenk1
+         Filter: (hundred = thousand)
+   ->  Hash
+         ->  HashAggregate
+               Group Key: onek.twothousand, onek.twothousand
+               ->  Seq Scan on onek
+(8 rows)
+
diff --git a/src/test/regress/sql/aggregates.sql b/src/test/regress/sql/aggregates.sql
index 230f44666a..ca0d5e66b2 100644
--- a/src/test/regress/sql/aggregates.sql
+++ b/src/test/regress/sql/aggregates.sql
@@ -988,3 +988,10 @@ select v||'a', case v||'a' when 'aa' then 1 else 0 end, count(*)
 select v||'a', case when v||'a' = 'aa' then 1 else 0 end, count(*)
   from unnest(array['a','b']) u(v)
  group by v||'a' order by 1;
+
+-- Make sure that generation of HashAggregate for uniqification purposes
+-- does not lead to array overflow due to unexpected duplicate hash keys
+-- see cafeejokku0u+a_a9r9316djw-yw3-+gtgvy3ju655qrhr3j...@mail.gmail.com
+explain (costs off)
+  select 1 from tenk1
+   where (hundred, thousand) in (select twothousand, twothousand from onek);
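A quick worked example of why the new bound is safe (my gloss, inferred
from the patch context rather than spelled out in it): take grpColIdx =
{k, k} for one duplicated key k, so perhash->numCols = 2, while the
bitmapset colnos can hold k only once. Sizing hashGrpColIdxInput from
bms_num_members(colnos) after the grouping columns were folded in
therefore yields fewer slots than the fill loop later writes (one per
grpColIdx entry, plus one per leftover colnos member), which is the
overflow the regression test above exercises. Computing maxCols =
bms_num_members(colnos) + perhash->numCols before the fold guarantees
writes <= maxCols: it may overcount by the overlap between the two
sets, but it can never undercount.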