Hi David, >> I pushed the changes to WindowAgg so as not to call tuplestore_end() >> on every partition. Can you rebase this patch over that change? >> >> It would be good to do this in a way that does not add any new state >> to WindowAggState, you can see that I had to shuffle fields around in >> that struct because the next_parition field would have caused the >> struct to become larger. I've not looked closely, but I expect this >> can be done by adding more code to tuplestore_updatemax() to also >> track the disk space used if the current storage has gone to disk. I >> expect the maxSpace field can be used for both, but we'd need another >> bool field to track if the max used was by disk or memory.
I have created a patch in the direction you suggested. See attached patch (v1-0001-Enhance-tuplestore.txt). To not confuse CFbot, the extension is "txt", not "patch". >> I think the performance of this would also need to be tested as it >> means doing an lseek() on every tuplestore_clear() when we've gone to >> disk. Probably that will be dominated by all the other overheads of a >> tuplestore going to disk (i.e. dumptuples() etc), but it would be good >> to check this. I suggest setting work_mem = 64 and making a test case >> that only just spills to disk. Maybe do a few thousand partitions >> worth of that and see if you can measure any slowdown. I copied your shell script and slightly modified it then ran pgbench with (1 10 100 1000 5000 10000) window partitions (see attached shell script). In the script I set work_mem to 64kB. It seems for 10000, 1000 and 100 partitions, the performance difference seems noises. However, for 10, 2, 1 partitions. I see large performance degradation with the patched version: patched is slower than stock master in 1.5% (10 partitions), 41% (2 partitions) and 55.7% (1 partition). See the attached graph.
>From 4749d2018f33e883c292eb904f3253d393a47c99 Mon Sep 17 00:00:00 2001 From: Tatsuo Ishii <is...@postgresql.org> Date: Thu, 5 Sep 2024 20:59:01 +0900 Subject: [PATCH v1] Enhance tuplestore. Let tuplestore_updatemax() handle both memory and disk case. --- src/backend/utils/sort/tuplestore.c | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/src/backend/utils/sort/tuplestore.c b/src/backend/utils/sort/tuplestore.c index 444c8e25c2..854121fc11 100644 --- a/src/backend/utils/sort/tuplestore.c +++ b/src/backend/utils/sort/tuplestore.c @@ -107,9 +107,10 @@ struct Tuplestorestate bool backward; /* store extra length words in file? */ bool interXact; /* keep open through transactions? */ bool truncated; /* tuplestore_trim has removed tuples? */ + bool inMem; /* true if maxSpace is for memory */ int64 availMem; /* remaining memory available, in bytes */ int64 allowedMem; /* total memory allowed, in bytes */ - int64 maxSpace; /* maximum space used in memory */ + int64 maxSpace; /* maximum space used in memory or disk */ int64 tuples; /* number of tuples added */ BufFile *myfile; /* underlying file, or NULL if none */ MemoryContext context; /* memory context for holding tuples */ @@ -262,6 +263,7 @@ tuplestore_begin_common(int eflags, bool interXact, int maxKBytes) state->eflags = eflags; state->interXact = interXact; state->truncated = false; + state->inMem = true; state->allowedMem = maxKBytes * 1024L; state->availMem = state->allowedMem; state->maxSpace = 0; @@ -1497,8 +1499,17 @@ static void tuplestore_updatemax(Tuplestorestate *state) { if (state->status == TSS_INMEM) + { state->maxSpace = Max(state->maxSpace, state->allowedMem - state->availMem); + state->inMem = true; + } + else + { + state->maxSpace = Max(state->maxSpace, + BufFileSize(state->myfile)); + state->inMem = false; + } } /* @@ -1509,7 +1520,7 @@ tuplestore_updatemax(Tuplestorestate *state) const char * tuplestore_storage_type_name(Tuplestorestate *state) { - if (state->status == TSS_INMEM) + if (state->inMem) return "Memory"; else return "Disk"; @@ -1517,8 +1528,7 @@ tuplestore_storage_type_name(Tuplestorestate *state) /* * tuplestore_space_used - * Return the maximum space used in memory unless the tuplestore has spilled - * to disk, in which case, return the disk space used. + * Return the maximum space used in memory or disk. */ int64 tuplestore_space_used(Tuplestorestate *state) @@ -1526,10 +1536,7 @@ tuplestore_space_used(Tuplestorestate *state) /* First, update the maxSpace field */ tuplestore_updatemax(state); - if (state->status == TSS_INMEM) - return state->maxSpace; - else - return BufFileSize(state->myfile); + return state->maxSpace; } /* @@ -1601,7 +1608,9 @@ writetup_heap(Tuplestorestate *state, void *tup) if (state->backward) /* need trailing length word? */ BufFileWrite(state->myfile, &tuplen, sizeof(tuplen)); - /* no need to call tuplestore_updatemax() when not in TSS_INMEM */ + /* update maxSpace */ + tuplestore_updatemax(state); + FREEMEM(state, GetMemoryChunkSpace(tuple)); heap_free_minimal_tuple(tuple); } -- 2.25.1
dbname=test rows=10000 secs=10 ntests=3 psql -c "drop table if exists part_test;" $dbname psql -c "create table part_test (a int not null);" $dbname psql -c "insert into part_test select g.s from generate_series(1,$rows) g(s);" $dbname psql -c "vacuum freeze analyze part_test;" $dbname psql -c "alter system set work_mem = '64kB';" $dbname psql -c "alter system set jit = 0;" $dbname psql -c "select pg_reload_conf();" $dbname for c in 1 10 100 1000 5000 10000 do echo "SELECT a,count(*) OVER (PARTITION BY a / $c) FROM part_test OFFSET $rows" > bench.sql echo "Testing with $(($rows / $c)) partitions" for i in $(seq 1 $ntests) do pgbench -n -f bench.sql -M prepared -T $secs $dbname | grep latency done done