Hi, Since doing some work for PG15 for speeding up sorts, I've been a little irritated by the fact that dumptuples() calls WRITETUP, (which is now always calling writetuple()) and calls pfree() on the tuple only for dumptuples() to do MemoryContextReset(state->base.tuplecontext) directly afterwards. These pfrees are just a waste of effort and we might as well leave it to the context reset to do the cleanup. (Probably especially so when using AllocSet for storing tuples)
There are only 2 calls to WRITETUP and the other one is always called when state->slabAllocatorUsed is true. writetuple() checks for that before freeing the tuple, which is a bit of a wasted branch since it'll always prove to be false for the use case in mergeonerun(). (It's possible the compiler might inline that now anyway since the WRITETUP macro always calls writetuple() directly now) I've attached 3 patches aimed to do a small amount of cleanup work in tuplesort.c 0001: Just fixes a broken looking comment in writetuple() 0002: Gets rid of the WRITETUP marco. That does not do anything useful since 097366c45 0003: Changes writetuple to tell it what it should do in regards to freeing and adjusting the memory accounting. Probably 0003 could be done differently. I'm certainly not set on the bool args. I understand that I'm never calling it with "freetup" == true. So other options include 1) rip out the pfree code and that parameter; or 2) just do the inlining manually at both call sites. I'll throw this in the September CF to see if anyone wants to look. There's probably lots more cleaning jobs that could be done in tuplesort.c. The performance improvement from 0003 is not that impressive, but it looks like it makes things very slightly faster, so probably worth it if the patch makes the code cleaner. See attached gif and script for the benchmark I ran to test it. I think the gains might go up slightly with [1] applied as that patch seems to do more to improve the speed of palloc() than it does to improve the speed of pfree(). David [1] https://commitfest.postgresql.org/39/3810/
From 3a0dc2893c757be7cbde5d1cd1f7801de073f943 Mon Sep 17 00:00:00 2001 From: David Rowley <dgrow...@gmail.com> Date: Fri, 26 Aug 2022 14:42:23 +1200 Subject: [PATCH v1 3/3] Be smarter about freeing tuples during tuplesorts During dumptuples() the call to writetuple() would pfree any non-null tuple. This was quite wasteful as this happens just before we perform a reset of the context which stores all of those tuples. Here we adjust the writetuple signature so we can tell it exactly what we need done in regards to freeing the tuple and/or adjusting the memory accounting. In dumptuples() we must still account for the memory that's been released. Also, since writetuple() is only called twice, let's make it static inline so that the compiler inlines the call and removes the branching to check the bool arguments. --- src/backend/utils/sort/tuplesort.c | 44 ++++++++++++++++++++---------- 1 file changed, 29 insertions(+), 15 deletions(-) diff --git a/src/backend/utils/sort/tuplesort.c b/src/backend/utils/sort/tuplesort.c index d1c2f5f58f..519c7424f7 100644 --- a/src/backend/utils/sort/tuplesort.c +++ b/src/backend/utils/sort/tuplesort.c @@ -452,8 +452,9 @@ struct Sharedsort static void tuplesort_begin_batch(Tuplesortstate *state); -static void writetuple(Tuplesortstate *state, LogicalTape *tape, - SortTuple *stup); +static inline void writetuple(Tuplesortstate *state, LogicalTape *tape, + SortTuple *stup, bool freetup, + bool adjust_accounting); static bool consider_abort_common(Tuplesortstate *state); static void inittapes(Tuplesortstate *state, bool mergeruns); static void inittapestate(Tuplesortstate *state, int maxTapes); @@ -1339,18 +1340,22 @@ tuplesort_puttuple_common(Tuplesortstate *state, SortTuple *tuple, bool useAbbre } /* - * Write 'stup' out onto 'tape' and, unless using the slab allocator, pfree - * stup's tuple and adjust the memory accounting accordingly. + * Write 'stup' out onto 'tape' and optionally free stup->tuple and optionally + * adjust the memory accounting. */ -static void -writetuple(Tuplesortstate *state, LogicalTape *tape, SortTuple *stup) +static inline void +writetuple(Tuplesortstate *state, LogicalTape *tape, SortTuple *stup, + bool freetup, bool adjust_accounting) { state->base.writetup(state, tape, stup); - if (!state->slabAllocatorUsed && stup->tuple) + if (stup->tuple != NULL) { - FREEMEM(state, GetMemoryChunkSpace(stup->tuple)); - pfree(stup->tuple); + if (adjust_accounting) + FREEMEM(state, GetMemoryChunkSpace(stup->tuple)); + + if (freetup) + pfree(stup->tuple); } } @@ -2251,6 +2256,8 @@ mergeonerun(Tuplesortstate *state) int srcTapeIndex; LogicalTape *srcTape; + Assert(state->slabAllocatorUsed); + /* * Start the merge by loading one tuple from each active source tape into * the heap. @@ -2269,7 +2276,7 @@ mergeonerun(Tuplesortstate *state) /* write the tuple to destTape */ srcTapeIndex = state->memtuples[0].srctape; srcTape = state->inputTapes[srcTapeIndex]; - writetuple(state, state->destTape, &state->memtuples[0]); + writetuple(state, state->destTape, &state->memtuples[0], false, false); /* recycle the slot of the tuple we just wrote out, for the next read */ if (state->memtuples[0].tuple) @@ -2415,16 +2422,23 @@ dumptuples(Tuplesortstate *state, bool alltuples) memtupwrite = state->memtupcount; for (i = 0; i < memtupwrite; i++) { - writetuple(state, state->destTape, &state->memtuples[i]); + /* + * No need to free the tuple as we're resetting the tuplecontext + * below. We do need to adjust the memory accounting for the tuple, + * however. + */ + writetuple(state, state->destTape, &state->memtuples[i], false, true); state->memtupcount--; } /* - * Reset tuple memory. We've freed all of the tuples that we previously - * allocated. It's important to avoid fragmentation when there is a stark - * change in the sizes of incoming tuples. Fragmentation due to + * Reset tuple memory. This saves us from having to pfree() each + * individual tuple and helps to avoid memory fragmentation when there is + * a stark change in the sizes of incoming tuples. Fragmentation due to * AllocSetFree's bucketing by size class might be particularly bad if - * this step wasn't taken. + * this step wasn't taken. We may be using the Generation allocator, + * which does not suffer from this issue, however it does not seem worth + * special handling for that. */ MemoryContextReset(state->base.tuplecontext); -- 2.34.1
From d4edd7a0011e8c9ded18bc85a326cb9540fe7d87 Mon Sep 17 00:00:00 2001 From: David Rowley <dgrow...@gmail.com> Date: Fri, 26 Aug 2022 14:06:15 +1200 Subject: [PATCH v1 2/3] Get rid of useless WRITETUP macro This always calls writetuple(), so there's no point in having it anymore. --- src/backend/utils/sort/tuplesort.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/backend/utils/sort/tuplesort.c b/src/backend/utils/sort/tuplesort.c index 4fe64f0793..d1c2f5f58f 100644 --- a/src/backend/utils/sort/tuplesort.c +++ b/src/backend/utils/sort/tuplesort.c @@ -395,7 +395,6 @@ struct Sharedsort #define REMOVEABBREV(state,stup,count) ((*(state)->base.removeabbrev) (state, stup, count)) #define COMPARETUP(state,a,b) ((*(state)->base.comparetup) (a, b, state)) -#define WRITETUP(state,tape,stup) (writetuple(state, tape, stup)) #define READTUP(state,stup,tape,len) ((*(state)->base.readtup) (state, stup, tape, len)) #define FREESTATE(state) ((state)->base.freestate ? (*(state)->base.freestate) (state) : (void) 0) #define LACKMEM(state) ((state)->availMem < 0 && !(state)->slabAllocatorUsed) @@ -2270,7 +2269,7 @@ mergeonerun(Tuplesortstate *state) /* write the tuple to destTape */ srcTapeIndex = state->memtuples[0].srctape; srcTape = state->inputTapes[srcTapeIndex]; - WRITETUP(state, state->destTape, &state->memtuples[0]); + writetuple(state, state->destTape, &state->memtuples[0]); /* recycle the slot of the tuple we just wrote out, for the next read */ if (state->memtuples[0].tuple) @@ -2416,7 +2415,7 @@ dumptuples(Tuplesortstate *state, bool alltuples) memtupwrite = state->memtupcount; for (i = 0; i < memtupwrite; i++) { - WRITETUP(state, state->destTape, &state->memtuples[i]); + writetuple(state, state->destTape, &state->memtuples[i]); state->memtupcount--; } -- 2.34.1
From 62fd0349f4550d3331843e8a7f7fe963fb90b8b6 Mon Sep 17 00:00:00 2001 From: David Rowley <dgrow...@gmail.com> Date: Fri, 26 Aug 2022 13:53:18 +1200 Subject: [PATCH v1 1/3] Improve wording in comment for writetuple function --- src/backend/utils/sort/tuplesort.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/backend/utils/sort/tuplesort.c b/src/backend/utils/sort/tuplesort.c index d90a1aebdf..4fe64f0793 100644 --- a/src/backend/utils/sort/tuplesort.c +++ b/src/backend/utils/sort/tuplesort.c @@ -1340,10 +1340,8 @@ tuplesort_puttuple_common(Tuplesortstate *state, SortTuple *tuple, bool useAbbre } /* - * Write a stored tuple onto tape.tuple. Unless the slab allocator is - * used, after writing the tuple, pfree() the out-of-line data (not the - * SortTuple struct!), and increase state->availMem by the amount of - * memory space thereby released. + * Write 'stup' out onto 'tape' and, unless using the slab allocator, pfree + * stup's tuple and adjust the memory accounting accordingly. */ static void writetuple(Tuplesortstate *state, LogicalTape *tape, SortTuple *stup) -- 2.34.1
#!/bin/bash dbname=postgres times=5 gigabytes=10 psql -c "alter system set max_parallel_workers_per_gather TO 0;" $dbname psql -c "alter system set jit to 0;" $dbname psql -c "select pg_reload_conf();" $dbname #for cols in 6 7 8 9 10 11 12 13 14 for cols in 6 do sql="(" psql -c "drop table if exists t" $dbname psql -c "drop table if exists tsize" $dbname i=1 while [ $i -lt $cols ] do sql+="c$i BIGINT NOT NULL DEFAULT 0," i=$[$i+1] done sql+="c$cols BIGINT NOT NULL DEFAULT 0);" echo "NOTICE: Creating $cols column $gigabytes gigabyte table" # use the tsize table just to figure out how many of these rows we can fit on a page psql -c "create table tsize$sql" $dbname # 300 rows should always fill at least 1 page psql -c "insert into tsize (c1) select x from generate_series(1,300) x" $dbname psql -c "create table t$sql" $dbname # insert enough rows for $gigabytes gigabytes data psql -c "insert into t (c1) select x from generate_Series(1, (select count(*) * ($gigabytes::bigint*1024*1024*1024/8192) from tsize where ctid < '(1,0)')) x;" $dbname psql -c "vacuum freeze t" $dbname for mem in "4MB" "8MB" "16MB" "32MB" "64MB" "128MB" "256MB" "512MB" "1024MB" "2048MB" "4096MB" "8192MB" "16384MB" #for mem in "64MB" "65MB" do psql -c "alter system set work_mem TO '$mem';" $dbname psql -c "select pg_reload_conf();" $dbname echo "NOTICE: $cols column $gigabytes gigabyte table with work_mem of $mem" for sql in "select * from t order by c1 offset 1000000000000" do echo "$sql" > bench.sql pgbench -n -f bench.sql -t $times -M prepared $dbname done done done