On Sat, Dec 14, 2019 at 06:32:25PM +0100, Tomas Vondra wrote: > I've done a bit more testing on this, after resolving a couple of minor > conflicts due to recent commits (rebased version attached). > > In particular, I've made a comparison with different dataset sizes, > group sizes, GUC settings etc. The script and results from two different > machines are available here: > > The script essentially runs a simple grouping query with different > number of rows, groups, work_mem and parallelism settings. There's > nothing particularly magical about it.
Nice! > I did run it both on master and patched code, allowing us to compare > results and assess impact of the patch. Overall, the changes are > expected and either neutral or beneficial, i.e. the timings are the same > or faster. > > The number of cases that regressed is fairly small, but sometimes the > regressions are annoyingly large - up to 2x in some cases. Consider for > example this trivial example with 100M rows: I suppose this is because the patch has no costing changes yet. I hacked a little to give hash agg a spilling penalty, just some value based on (groups_in_hashtable * num_of_input_tuples)/num_groups_from_planner, so it would not choose hash aggregate in this case. However, that penalty is wrong, because compared to the external sort algorithm, hash aggregate has re-spilling, which involves even more I/O, especially with a very large number of groups but a very small number of tuples in a single group, like the test you did. It would be a challenge. BTW, Jeff, Greenplum has a test for hash agg spill; I modified it a little to check how many batches a query uses. It's attached — not sure if it would help. -- Adam Lee
-- Regression test for hash aggregate spilling (adapted from Greenplum).
-- Runs grouping queries under progressively smaller work_mem settings and
-- asserts, via EXPLAIN ANALYZE output, that the hash aggregate actually
-- spilled to disk in the expected number of batches.
create schema hashagg_spill;
set search_path to hashagg_spill;

-- start_ignore
CREATE EXTENSION plpythonu;
-- end_ignore

-- Create a function which checks how many batches there are.
-- It scans the EXPLAIN ANALYZE output for lines such as:
--   "Memory Usage: 4096kB  Batches: 68  Disk Usage:8625kB"
-- and returns the "Batches" count from each matching line.
create or replace function hashagg_spill.num_batches(explain_query text)
returns setof int as
$$
import re

rv = plpy.execute(explain_query)
search_text = 'batches'
result = []
for i in range(len(rv)):
    cur_line = rv[i]['QUERY PLAN']
    if search_text.lower() in cur_line.lower():
        # Capture group 2 is the batch count; group 1 (memory) is unused.
        p = re.compile('.*Memory Usage: (\d+).* Batches: (\d+) Disk.*')
        m = p.match(cur_line)
        batches = int(m.group(2))
        result.append(batches)
return result
$$ language plpythonu;

create table testhagg (i1 int, i2 int, i3 int, i4 int);
insert into testhagg select i,i,i,i from generate_series(1, 30000)i;

-- 1800kB of work_mem is small enough to force the hash table to spill.
-- (Single-quoted literal for consistency with the other SET statements below.)
set work_mem='1800';
select * from (select max(i1) from testhagg group by i2) foo order by 1 limit 10;
select num_batches > 0 from hashagg_spill.num_batches('explain (analyze, verbose) select max(i1) from testhagg group by i2;') num_batches;
select num_batches > 3 from hashagg_spill.num_batches('explain (analyze, verbose) select max(i1) from testhagg group by i2 limit 90000;') num_batches;

reset all;
set search_path to hashagg_spill;

-- Test agg spilling scenarios with increasing input sizes
-- (duplicated ranges on purpose, so some groups have multiple tuples).
create table aggspill (i int, j int, t text);
insert into aggspill select i, i*2, i::text from generate_series(1, 10000) i;
insert into aggspill select i, i*2, i::text from generate_series(1, 100000) i;
insert into aggspill select i, i*2, i::text from generate_series(1, 1000000) i;

-- No spill with large statement memory
set work_mem = '125MB';
select * from hashagg_spill.num_batches('explain (analyze, verbose) select count(*) from (select i, count(*) from aggspill group by i,j having count(*) = 1) g;');

-- Reduce the statement memory to induce spilling
set work_mem = '10MB';
select num_batches > 32 from hashagg_spill.num_batches('explain (analyze, verbose) select count(*) from (select i, count(*) from aggspill group by i,j having count(*) = 2) g;') num_batches;

-- Reduce the statement memory further, expect more batches
set work_mem = '5MB';
select num_batches > 64 from hashagg_spill.num_batches('explain (analyze, verbose) select count(*) from (select i, count(*) from aggspill group by i,j having count(*) = 3) g;') num_batches;

-- Check spilling to a temp tablespace
SET work_mem='1000kB';
CREATE TABLE hashagg_spill(col1 numeric, col2 int);
INSERT INTO hashagg_spill SELECT id, 1 FROM generate_series(1,20000) id;
ANALYZE hashagg_spill;
CREATE TABLE spill_temptblspace (a numeric);
SET temp_tablespaces=pg_default;
-- HAVING sum(col1) < 0 is never true, so no rows are inserted; the point is
-- only to force the aggregate (and its spill files) through the tablespace.
INSERT INTO spill_temptblspace SELECT avg(col2) col2 FROM hashagg_spill GROUP BY col1 HAVING(sum(col1)) < 0;
RESET temp_tablespaces;
RESET work_mem;

drop schema hashagg_spill cascade;