Hi,

Since doing some work for PG15 for speeding up sorts, I've been a
little irritated by the fact that dumptuples() calls WRITETUP, (which
is now always calling writetuple()) and calls pfree() on the tuple
only for dumptuples() to do
MemoryContextReset(state->base.tuplecontext) directly afterwards.
These pfrees are just a waste of effort and we might as well leave it
to the context reset to do the cleanup. (Probably especially so when
using AllocSet for storing tuples)

There are only 2 calls to WRITETUP and the other one is always called
when state->slabAllocatorUsed is true.  writetuple() checks for that
before freeing the tuple, which is a bit of a wasted branch since
it'll always prove to be false for the use case in mergeonerun().
(It's possible the compiler might inline that now anyway since the
WRITETUP macro always calls writetuple() directly now)

I've attached 3 patches aimed to do a small amount of cleanup work in
tuplesort.c

0001: Just fixes a broken looking comment in writetuple()
0002: Gets rid of the WRITETUP marco. That does not do anything useful
since 097366c45
0003: Changes writetuple to tell it what it should do in regards to
freeing and adjusting the memory accounting.

Probably 0003 could be done differently. I'm certainly not set on the
bool args. I understand that I'm never calling it with "freetup" ==
true. So other options include 1) rip out the pfree code and that
parameter; or 2) just do the inlining manually at both call sites.

I'll throw this in the September CF to see if anyone wants to look.
There's probably lots more cleaning jobs that could be done in
tuplesort.c.

The performance improvement from 0003 is not that impressive, but it
looks like it makes things very slightly faster, so probably worth it
if the patch makes the code cleaner. See attached gif and script for
the benchmark I ran to test it.  I think the gains might go up
slightly with [1] applied as that patch seems to do more to improve
the speed of palloc() than it does to improve the speed of pfree().

David

[1] https://commitfest.postgresql.org/39/3810/
From 3a0dc2893c757be7cbde5d1cd1f7801de073f943 Mon Sep 17 00:00:00 2001
From: David Rowley <dgrow...@gmail.com>
Date: Fri, 26 Aug 2022 14:42:23 +1200
Subject: [PATCH v1 3/3] Be smarter about freeing tuples during tuplesorts

During dumptuples() the call to writetuple() would pfree any non-null
tuple.  This was quite wasteful as this happens just before we perform a
reset of the context which stores all of those tuples.

Here we adjust the writetuple signature so we can tell it exactly what we
need done in regards to freeing the tuple and/or adjusting the memory
accounting.  In dumptuples() we must still account for the memory that's
been released.  Also, since writetuple() is only called twice, let's make
it static inline so that the compiler inlines the call and removes the
branching to check the bool arguments.
---
 src/backend/utils/sort/tuplesort.c | 44 ++++++++++++++++++++----------
 1 file changed, 29 insertions(+), 15 deletions(-)

diff --git a/src/backend/utils/sort/tuplesort.c 
b/src/backend/utils/sort/tuplesort.c
index d1c2f5f58f..519c7424f7 100644
--- a/src/backend/utils/sort/tuplesort.c
+++ b/src/backend/utils/sort/tuplesort.c
@@ -452,8 +452,9 @@ struct Sharedsort
 
 
 static void tuplesort_begin_batch(Tuplesortstate *state);
-static void writetuple(Tuplesortstate *state, LogicalTape *tape,
-                                          SortTuple *stup);
+static inline void writetuple(Tuplesortstate *state, LogicalTape *tape,
+                                                         SortTuple *stup, bool 
freetup,
+                                                         bool 
adjust_accounting);
 static bool consider_abort_common(Tuplesortstate *state);
 static void inittapes(Tuplesortstate *state, bool mergeruns);
 static void inittapestate(Tuplesortstate *state, int maxTapes);
@@ -1339,18 +1340,22 @@ tuplesort_puttuple_common(Tuplesortstate *state, 
SortTuple *tuple, bool useAbbre
 }
 
 /*
- * Write 'stup' out onto 'tape' and, unless using the slab allocator, pfree
- * stup's tuple and adjust the memory accounting accordingly.
+ * Write 'stup' out onto 'tape' and optionally free stup->tuple and optionally
+ * adjust the memory accounting.
  */
-static void
-writetuple(Tuplesortstate *state, LogicalTape *tape, SortTuple *stup)
+static inline void
+writetuple(Tuplesortstate *state, LogicalTape *tape, SortTuple *stup,
+                  bool freetup, bool adjust_accounting)
 {
        state->base.writetup(state, tape, stup);
 
-       if (!state->slabAllocatorUsed && stup->tuple)
+       if (stup->tuple != NULL)
        {
-               FREEMEM(state, GetMemoryChunkSpace(stup->tuple));
-               pfree(stup->tuple);
+               if (adjust_accounting)
+                       FREEMEM(state, GetMemoryChunkSpace(stup->tuple));
+
+               if (freetup)
+                       pfree(stup->tuple);
        }
 }
 
@@ -2251,6 +2256,8 @@ mergeonerun(Tuplesortstate *state)
        int                     srcTapeIndex;
        LogicalTape *srcTape;
 
+       Assert(state->slabAllocatorUsed);
+
        /*
         * Start the merge by loading one tuple from each active source tape 
into
         * the heap.
@@ -2269,7 +2276,7 @@ mergeonerun(Tuplesortstate *state)
                /* write the tuple to destTape */
                srcTapeIndex = state->memtuples[0].srctape;
                srcTape = state->inputTapes[srcTapeIndex];
-               writetuple(state, state->destTape, &state->memtuples[0]);
+               writetuple(state, state->destTape, &state->memtuples[0], false, 
false);
 
                /* recycle the slot of the tuple we just wrote out, for the 
next read */
                if (state->memtuples[0].tuple)
@@ -2415,16 +2422,23 @@ dumptuples(Tuplesortstate *state, bool alltuples)
        memtupwrite = state->memtupcount;
        for (i = 0; i < memtupwrite; i++)
        {
-               writetuple(state, state->destTape, &state->memtuples[i]);
+               /*
+                * No need to free the tuple as we're resetting the tuplecontext
+                * below.  We do need to adjust the memory accounting for the 
tuple,
+                * however.
+                */
+               writetuple(state, state->destTape, &state->memtuples[i], false, 
true);
                state->memtupcount--;
        }
 
        /*
-        * Reset tuple memory.  We've freed all of the tuples that we previously
-        * allocated.  It's important to avoid fragmentation when there is a 
stark
-        * change in the sizes of incoming tuples.  Fragmentation due to
+        * Reset tuple memory.  This saves us from having to pfree() each
+        * individual tuple and helps to avoid memory fragmentation when there 
is
+        * a stark change in the sizes of incoming tuples.  Fragmentation due to
         * AllocSetFree's bucketing by size class might be particularly bad if
-        * this step wasn't taken.
+        * this step wasn't taken.  We may be using the Generation allocator,
+        * which does not suffer from this issue, however it does not seem worth
+        * special handling for that.
         */
        MemoryContextReset(state->base.tuplecontext);
 
-- 
2.34.1

From d4edd7a0011e8c9ded18bc85a326cb9540fe7d87 Mon Sep 17 00:00:00 2001
From: David Rowley <dgrow...@gmail.com>
Date: Fri, 26 Aug 2022 14:06:15 +1200
Subject: [PATCH v1 2/3] Get rid of useless WRITETUP macro

This always calls writetuple(), so there's no point in having it anymore.
---
 src/backend/utils/sort/tuplesort.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/src/backend/utils/sort/tuplesort.c 
b/src/backend/utils/sort/tuplesort.c
index 4fe64f0793..d1c2f5f58f 100644
--- a/src/backend/utils/sort/tuplesort.c
+++ b/src/backend/utils/sort/tuplesort.c
@@ -395,7 +395,6 @@ struct Sharedsort
 
 #define REMOVEABBREV(state,stup,count) ((*(state)->base.removeabbrev) (state, 
stup, count))
 #define COMPARETUP(state,a,b)  ((*(state)->base.comparetup) (a, b, state))
-#define WRITETUP(state,tape,stup)      (writetuple(state, tape, stup))
 #define READTUP(state,stup,tape,len) ((*(state)->base.readtup) (state, stup, 
tape, len))
 #define FREESTATE(state)       ((state)->base.freestate ? 
(*(state)->base.freestate) (state) : (void) 0)
 #define LACKMEM(state)         ((state)->availMem < 0 && 
!(state)->slabAllocatorUsed)
@@ -2270,7 +2269,7 @@ mergeonerun(Tuplesortstate *state)
                /* write the tuple to destTape */
                srcTapeIndex = state->memtuples[0].srctape;
                srcTape = state->inputTapes[srcTapeIndex];
-               WRITETUP(state, state->destTape, &state->memtuples[0]);
+               writetuple(state, state->destTape, &state->memtuples[0]);
 
                /* recycle the slot of the tuple we just wrote out, for the 
next read */
                if (state->memtuples[0].tuple)
@@ -2416,7 +2415,7 @@ dumptuples(Tuplesortstate *state, bool alltuples)
        memtupwrite = state->memtupcount;
        for (i = 0; i < memtupwrite; i++)
        {
-               WRITETUP(state, state->destTape, &state->memtuples[i]);
+               writetuple(state, state->destTape, &state->memtuples[i]);
                state->memtupcount--;
        }
 
-- 
2.34.1

From 62fd0349f4550d3331843e8a7f7fe963fb90b8b6 Mon Sep 17 00:00:00 2001
From: David Rowley <dgrow...@gmail.com>
Date: Fri, 26 Aug 2022 13:53:18 +1200
Subject: [PATCH v1 1/3] Improve wording in comment for writetuple function

---
 src/backend/utils/sort/tuplesort.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/src/backend/utils/sort/tuplesort.c 
b/src/backend/utils/sort/tuplesort.c
index d90a1aebdf..4fe64f0793 100644
--- a/src/backend/utils/sort/tuplesort.c
+++ b/src/backend/utils/sort/tuplesort.c
@@ -1340,10 +1340,8 @@ tuplesort_puttuple_common(Tuplesortstate *state, 
SortTuple *tuple, bool useAbbre
 }
 
 /*
- * Write a stored tuple onto tape.tuple.  Unless the slab allocator is
- * used, after writing the tuple, pfree() the out-of-line data (not the
- * SortTuple struct!), and increase state->availMem by the amount of
- * memory space thereby released.
+ * Write 'stup' out onto 'tape' and, unless using the slab allocator, pfree
+ * stup's tuple and adjust the memory accounting accordingly.
  */
 static void
 writetuple(Tuplesortstate *state, LogicalTape *tape, SortTuple *stup)
-- 
2.34.1

#!/bin/bash

dbname=postgres
times=5
gigabytes=10

psql -c "alter system set max_parallel_workers_per_gather TO 0;" $dbname
psql -c "alter system set jit to 0;" $dbname
psql -c "select pg_reload_conf();" $dbname

#for cols in 6 7 8 9 10 11 12 13 14
for cols in 6
do

        sql="("
        psql -c "drop table if exists t" $dbname
        psql -c "drop table if exists tsize" $dbname
        i=1
        while [ $i -lt $cols ]
        do
                sql+="c$i BIGINT NOT NULL DEFAULT 0,"
                i=$[$i+1]
        done
        sql+="c$cols BIGINT NOT NULL DEFAULT 0);"
        echo "NOTICE: Creating $cols column $gigabytes gigabyte table"

        # use the tsize table just to figure out how many of these rows we can 
fit on a page
        psql -c "create table tsize$sql" $dbname
        # 300 rows should always fill at least 1 page
        psql -c "insert into tsize (c1) select x from generate_series(1,300) x" 
$dbname
        psql -c "create table t$sql" $dbname
        # insert enough rows for $gigabytes gigabytes data
        psql -c "insert into t (c1) select x from generate_Series(1, (select 
count(*) * ($gigabytes::bigint*1024*1024*1024/8192) from tsize where ctid < 
'(1,0)')) x;" $dbname
        psql -c "vacuum freeze t" $dbname

        for mem in "4MB" "8MB" "16MB" "32MB" "64MB" "128MB" "256MB" "512MB" 
"1024MB" "2048MB" "4096MB" "8192MB" "16384MB"
        #for mem in "64MB" "65MB"
        do
                psql -c "alter system set work_mem TO '$mem';" $dbname
                psql -c "select pg_reload_conf();" $dbname

                echo "NOTICE: $cols column $gigabytes gigabyte table with 
work_mem of $mem"
                for sql in "select * from t order by c1 offset 1000000000000"
                do
                        echo "$sql" > bench.sql
                        pgbench -n -f bench.sql -t $times -M prepared $dbname
                done
        done    
done

Reply via email to