Hi Andres,
I took a stab at a naive version of this, but I have been stuck for some time now.
I added logic to sort on the first column in a first pass, then realloc
all tuples and do the full sort in a second pass, but I am not seeing
any benefit at all (it is actually regressing).
I tried doing the above both in bulk and in chunks of data.
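In rough pseudo-code, the bulk variant of the attached POC does this inside
tuplesort_sort_memtuples() (a simplified summary of the patch below, not the
exact code):

    /* pass 1: sort on the leading column only, skipping the tiebreak */
    oldonlyKey = state->base.onlyKey;
    state->base.onlyKey = state->base.sortKeys;
    qsort_tuple_int32(state->memtuples, state->memtupcount, state);

    /* re-materialize the tuples in their (mostly) sorted order */
    for (i = 0; i < state->memtupcount; i++)
        state->memtuples[i].tuple =
            heap_copy_minimal_tuple((MinimalTuple) state->memtuples[i].tuple);

    /* pass 2: full sort on all keys, with the tiebreak re-enabled */
    state->base.onlyKey = oldonlyKey;
    qsort_tuple_int32(state->memtuples, state->memtupcount, state);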
> You effectively end up with a bounded number of pre-sorted blocks, so the most
> obvious thing to try is to build a heap of those blocks and effectively do a
> heapsort between the presorted blocks.
I am not very clear about the implementation for this. How can we do a
heapsort between the presorted blocks? Do we keep changing
state->bound = i, i+n, i+2n, and so on, and keep calling
make_bounded_heap/sort_bounded_heap?
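To make sure I understand the suggestion: is it essentially an in-memory
k-way merge of the presorted chunks, driven by a small binary heap of the
chunks' current head tuples, rather than reusing the bounded-heap machinery?
Below is a toy, standalone sketch of what I have in mind (plain ints stand
in for SortTuples; all names are made up, this is not tuplesort code):

    #include <stdio.h>

    typedef struct Run
    {
        const int  *vals;       /* one presorted block */
        int         pos;        /* next unconsumed element */
        int         len;
    } Run;

    static int
    run_head(const Run *r)
    {
        return r->vals[r->pos];
    }

    /* restore the min-heap property starting at slot i */
    static void
    sift_down(Run *heap[], int nheap, int i)
    {
        for (;;)
        {
            int     l = 2 * i + 1;
            int     r = 2 * i + 2;
            int     s = i;
            Run    *tmp;

            if (l < nheap && run_head(heap[l]) < run_head(heap[s]))
                s = l;
            if (r < nheap && run_head(heap[r]) < run_head(heap[s]))
                s = r;
            if (s == i)
                break;
            tmp = heap[i];
            heap[i] = heap[s];
            heap[s] = tmp;
            i = s;
        }
    }

    static void
    merge_runs(Run *heap[], int nheap)
    {
        int     i;

        /* build a min-heap keyed on each run's current head value */
        for (i = nheap / 2 - 1; i >= 0; i--)
            sift_down(heap, nheap, i);

        while (nheap > 0)
        {
            Run    *top = heap[0];

            printf("%d ", run_head(top));   /* emit the next output value */
            if (++top->pos == top->len)
                heap[0] = heap[--nheap];    /* this run is exhausted */
            sift_down(heap, nheap, 0);      /* head changed, re-heapify */
        }
        printf("\n");
    }

    int
    main(void)
    {
        int     a[] = {1, 4, 7};
        int     b[] = {2, 5, 8};
        int     c[] = {3, 6, 9};
        Run     runs[3] = {{a, 0, 3}, {b, 0, 3}, {c, 0, 3}};
        Run    *heap[3] = {&runs[0], &runs[1], &runs[2]};

        merge_runs(heap, 3);    /* prints: 1 2 3 4 5 6 7 8 9 */
        return 0;
    }

If that is the right shape, I can try wiring it up over the presorted
memtuples chunks produced by pass 1.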
> A related, but separate, improvement is to reduce / remove the memory
> allocation overhead.
This is still pending from my side.
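Following the same thought as the comment in the bulk POC (free things in
one go instead of per tuple), the direction I would try first is to batch
the pass-1 copies into one big buffer rather than doing one
heap_copy_minimal_tuple() per tuple. Only an untested fragment; total_len
would have to be accumulated beforehand, and freeing the original
allocations is hand-waved here:

    char       *buf = palloc(total_len);    /* total_len = sum of MAXALIGN(t_len) */
    char       *p = buf;
    int         i;

    for (i = 0; i < state->memtupcount; i++)
    {
        MinimalTuple tup = (MinimalTuple) state->memtuples[i].tuple;

        memcpy(p, tup, tup->t_len);
        state->memtuples[i].tuple = p;
        p += MAXALIGN(tup->t_len);
    }
    /* the old per-tuple allocations are deliberately not freed in this sketch */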
I have attached some benchmarking results, the script, and the POC
patches (which include a GUC switch to enable the optimization, for ease
of testing).
Tested with work_mem = 3GB on 1 million and 10 million row datasets.
Please let me know what I should fix so I can re-attempt this.
Thanks,
Ankit
#!/bin/bash
psql -c "create table if not exists abcd (a int, b int, c int, d int );"
postgres
for rows in 1000000 10000000
do
for selectrows in "select random()*x,random()*x,random()*x,random()*x
from generate_Series(1,$rows) x;" "select 1,random()*x,random()*x,random()*x
from generate_Series(1,$rows) x;" "select 1,1,random()*x,random()*x from
generate_Series(1,$rows) x;"
do
psql -c "truncate table abcd;" postgres > /dev/null
psql -c "insert into abcd $selectrows" postgres >
/dev/null
psql -c "vacuum freeze abcd;" postgres > /dev/null
psql -c "select pg_prewarm('abcd');" postgres >
/dev/null
echo "select a,b,c,d from abcd order by a,b,c,d" >
bench.sql
psql -c "alter system set enable_sort_optimization =
off;" postgres > /dev/null
psql -c "select pg_reload_conf();" postgres > /dev/null
echo -n "$rows Master: "
pgbench -n -f bench.sql -T 10 -M prepared postgres |
grep latency
psql -c "alter system set enable_sort_optimization =
on;" postgres > /dev/null
psql -c "select pg_reload_conf();" postgres > /dev/null
echo -n "$rows Patched: "
pgbench -n -f bench.sql -T 10 -M prepared postgres |
grep latency
done
done
diff --git a/src/backend/utils/misc/guc_tables.c b/src/backend/utils/misc/guc_tables.c
index 1c0583fe26..6585164ca4 100644
--- a/src/backend/utils/misc/guc_tables.c
+++ b/src/backend/utils/misc/guc_tables.c
@@ -846,6 +846,16 @@ struct config_bool ConfigureNamesBool[] =
true,
NULL, NULL, NULL
},
+ {
+ {"enable_sort_optimization", PGC_USERSET, QUERY_TUNING_METHOD,
+ gettext_noop("Enables the sort optimizations."),
+ NULL,
+ GUC_EXPLAIN
+ },
+ &enable_sort_optimization,
+ true,
+ NULL, NULL, NULL
+ },
{
{"enable_incremental_sort", PGC_USERSET, QUERY_TUNING_METHOD,
gettext_noop("Enables the planner's use of incremental sort steps."),
diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample
index d06074b86f..1b49f48457 100644
--- a/src/backend/utils/misc/postgresql.conf.sample
+++ b/src/backend/utils/misc/postgresql.conf.sample
@@ -389,6 +389,7 @@
#enable_presorted_aggregate = on
#enable_seqscan = on
#enable_sort = on
+#enable_sort_optimization = on
#enable_tidscan = on
# - Planner Cost Constants -
diff --git a/src/backend/utils/sort/tuplesort.c b/src/backend/utils/sort/tuplesort.c
index 9ca9835aab..cbaf54437c 100644
--- a/src/backend/utils/sort/tuplesort.c
+++ b/src/backend/utils/sort/tuplesort.c
@@ -127,6 +127,8 @@
bool trace_sort = false;
#endif
+bool enable_sort_optimization = true;
+
#ifdef DEBUG_BOUNDED_SORT
bool optimize_bounded_sort = true;
#endif
@@ -404,6 +406,7 @@ struct Sharedsort
#define SERIAL(state) ((state)->shared == NULL)
#define WORKER(state) ((state)->shared && (state)->worker != -1)
#define LEADER(state) ((state)->shared && (state)->worker == -1)
+#define CACHESIZE (2 * 1024 * 1024L)
/*
* NOTES about on-tape representation of tuples:
@@ -2725,6 +2728,8 @@ tuplesort_sort_memtuples(Tuplesortstate *state)
{
if (state->base.sortKeys[0].comparator == ssup_datum_unsigned_cmp)
{
+ SortSupport oldonlyKey = state->base.onlyKey;
+ state->base.onlyKey = state->base.sortKeys;
qsort_tuple_unsigned(state->memtuples,
state->memtupcount,
state);
@@ -2740,11 +2745,63 @@ tuplesort_sort_memtuples(Tuplesortstate *state)
}
#endif
else if (state->base.sortKeys[0].comparator == ssup_datum_int32_cmp)
- {
- qsort_tuple_int32(state->memtuples,
+ {
+ if (enable_sort_optimization)
+ {
+ /* Back up onlyKey and make it non-NULL so that the tiebreak comparator
+ * is not called; the aim in this pass is to sort on the first column only.
+ */
+ SortSupport oldonlyKey = state->base.onlyKey;
+ state->base.onlyKey = state->base.sortKeys;
+ qsort_tuple_int32(state->memtuples,
+ state->memtupcount,
+ state);
+ /* rough number of tuples to sort per chunk */
+ int BATCHSIZE = CACHESIZE / sizeof(state->memtuples[0].tuple);
+ if (state->memtupcount<BATCHSIZE)
+ {
+ BATCHSIZE = state->memtupcount;
+ }
+ int i=0;
+ int j=0;
+ while(i<state->memtupcount)
+ {
+
+ if ( (i + BATCHSIZE) > state->memtupcount)
+ {
+ BATCHSIZE = state->memtupcount - i;
+ }
+ SortTuple *memtuples = &state->memtuples[i];
+ qsort_tuple_int32(memtuples,
+ BATCHSIZE,
+ state);
+ /* re-materialize this chunk's tuples in their chunk-sorted order */
+ j=i;
+ for (; j < i+BATCHSIZE; j++)
+ {
+ void* temp = state->memtuples[j].tuple;
+ state->memtuples[j].tuple = heap_copy_minimal_tuple((MinimalTuple) state->memtuples[j].tuple);
+ pfree(temp);
+ }
+
+
+ i += BATCHSIZE;
+ }
+ /* revert onlyKey for pass 2 */
+ state->base.onlyKey = oldonlyKey;
+
+ qsort_tuple_int32(state->memtuples,
+ state->memtupcount,
+ state);
+ return;
+ }
+ else
+ {
+ qsort_tuple_int32(state->memtuples,
state->memtupcount,
state);
return;
+
+ }
}
}
diff --git a/src/include/utils/guc.h b/src/include/utils/guc.h
index ba89d013e6..2ae1c90420 100644
--- a/src/include/utils/guc.h
+++ b/src/include/utils/guc.h
@@ -281,6 +281,7 @@ extern PGDLLIMPORT int tcp_keepalives_idle;
extern PGDLLIMPORT int tcp_keepalives_interval;
extern PGDLLIMPORT int tcp_keepalives_count;
extern PGDLLIMPORT int tcp_user_timeout;
+extern PGDLLIMPORT bool enable_sort_optimization;
#ifdef TRACE_SORT
extern PGDLLIMPORT bool trace_sort;
diff --git a/src/backend/utils/misc/guc_tables.c b/src/backend/utils/misc/guc_tables.c
index 1c0583fe26..6585164ca4 100644
--- a/src/backend/utils/misc/guc_tables.c
+++ b/src/backend/utils/misc/guc_tables.c
@@ -846,6 +846,16 @@ struct config_bool ConfigureNamesBool[] =
true,
NULL, NULL, NULL
},
+ {
+ {"enable_sort_optimization", PGC_USERSET, QUERY_TUNING_METHOD,
+ gettext_noop("Enables the sort optimizations."),
+ NULL,
+ GUC_EXPLAIN
+ },
+ &enable_sort_optimization,
+ true,
+ NULL, NULL, NULL
+ },
{
{"enable_incremental_sort", PGC_USERSET, QUERY_TUNING_METHOD,
gettext_noop("Enables the planner's use of incremental sort steps."),
diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample
index d06074b86f..1b49f48457 100644
--- a/src/backend/utils/misc/postgresql.conf.sample
+++ b/src/backend/utils/misc/postgresql.conf.sample
@@ -389,6 +389,7 @@
#enable_presorted_aggregate = on
#enable_seqscan = on
#enable_sort = on
+#enable_sort_optimization = on
#enable_tidscan = on
# - Planner Cost Constants -
diff --git a/src/backend/utils/sort/tuplesort.c b/src/backend/utils/sort/tuplesort.c
index 9ca9835aab..33b8f804dd 100644
--- a/src/backend/utils/sort/tuplesort.c
+++ b/src/backend/utils/sort/tuplesort.c
@@ -127,6 +127,8 @@
bool trace_sort = false;
#endif
+bool enable_sort_optimization = true;
+
#ifdef DEBUG_BOUNDED_SORT
bool optimize_bounded_sort = true;
#endif
@@ -404,6 +406,7 @@ struct Sharedsort
#define SERIAL(state) ((state)->shared == NULL)
#define WORKER(state) ((state)->shared && (state)->worker != -1)
#define LEADER(state) ((state)->shared && (state)->worker == -1)
+#define CACHESIZE (2 * 1024 * 1024L)
/*
* NOTES about on-tape representation of tuples:
@@ -2725,6 +2728,8 @@ tuplesort_sort_memtuples(Tuplesortstate *state)
{
if (state->base.sortKeys[0].comparator == ssup_datum_unsigned_cmp)
{
+ SortSupport oldonlyKey = state->base.onlyKey;
+ state->base.onlyKey = state->base.sortKeys;
qsort_tuple_unsigned(state->memtuples,
state->memtupcount,
state);
@@ -2740,11 +2745,51 @@ tuplesort_sort_memtuples(Tuplesortstate *state)
}
#endif
else if (state->base.sortKeys[0].comparator == ssup_datum_int32_cmp)
- {
- qsort_tuple_int32(state->memtuples,
+ {
+ if (enable_sort_optimization)
+ {
+ /* Back up onlyKey and make it non-NULL so that the tiebreak comparator
+ * is not called; the first pass sorts on the first column only.
+ */
+ SortSupport oldonlyKey = state->base.onlyKey;
+ state->base.onlyKey = state->base.sortKeys;
+ qsort_tuple_int32(state->memtuples,
+ state->memtupcount,
+ state);
+
+ /* revert onlyKey for pass 2 */
+ state->base.onlyKey = oldonlyKey;
+ /* re-allocate all tuples in their pass-1 (first-column) order */
+ SortTuple *tempmemtuples = (SortTuple *) palloc(state->memtupsize * sizeof(SortTuple));
+ int i;
+ for (i = 0; i < state->memtupcount; i++)
+ {
+ tempmemtuples[i].datum1 = state->memtuples[i].datum1;
+ /*
+ * We could have copied each re-allocated tuple back into memtuples[i].tuple
+ * instead of building a whole new memtuples array, but then every old tuple
+ * would need its own pfree; this way the old array is freed in one go below.
+ */
+ tempmemtuples[i].tuple = heap_copy_minimal_tuple((MinimalTuple) state->memtuples[i].tuple);
+ tempmemtuples[i].isnull1 = state->memtuples[i].isnull1;
+ tempmemtuples[i].srctape = state->memtuples[i].srctape;
+ }
+ pfree(state->memtuples);
+ state->memtuples = tempmemtuples;
+
+ qsort_tuple_int32(state->memtuples,
+ state->memtupcount,
+ state);
+ return;
+ }
+ else
+ {
+ qsort_tuple_int32(state->memtuples,
state->memtupcount,
state);
return;
+
+ }
}
}
diff --git a/src/include/utils/guc.h b/src/include/utils/guc.h
index ba89d013e6..2ae1c90420 100644
--- a/src/include/utils/guc.h
+++ b/src/include/utils/guc.h
@@ -281,6 +281,7 @@ extern PGDLLIMPORT int tcp_keepalives_idle;
extern PGDLLIMPORT int tcp_keepalives_interval;
extern PGDLLIMPORT int tcp_keepalives_count;
extern PGDLLIMPORT int tcp_user_timeout;
+extern PGDLLIMPORT bool enable_sort_optimization;
#ifdef TRACE_SORT
extern PGDLLIMPORT bool trace_sort;