Hi Andres,

I took a stab at naive version of this but been stuck for sometime now.

I have added logic to sort on first column at first pass,

realloc all tuples and do full sort at second pass, but I am not seeing

any benefit (it is actually regressing) at all.

Tried doing above both at bulk and at chunks of data.

> You effectively end up with a bounded number of pre-sorted blocks, so the most > obvious thing to try is to build a heap of those blocks and effectively do a
> heapsort between the presorted blocks.

I am not very clear about implementation for this. How can we do heapsort between

 the presorted blocks? Do we keep changing state->bound=i, i+n, i+2n something like

this and keep calling make_bounded_heap/sort_bounded_heap?

> A related, but separate, improvement is to reduce / remove the memory
> allocation overhead.

This is still pending from my side.


I have attached some benchmarking results with script and POC

patches (which includes GUC switch to enable optimization for ease of testing) for the same.

Tested on WORK_MEM=3 GB for 1 and 10 Million rows data.

Please let me know things which I can fix and re-attempt.


Thanks,

Ankit


#!/bin/bash

psql -c "create table if not exists abcd (a int, b int, c int, d int );" 
postgres

for rows in 1000000 10000000
    do
        for selectrows in "select random()*x,random()*x,random()*x,random()*x 
from generate_Series(1,$rows) x;" "select 1,random()*x,random()*x,random()*x 
from generate_Series(1,$rows) x;" "select 1,1,random()*x,random()*x from 
generate_Series(1,$rows) x;"
        do
                        psql -c "truncate table abcd;" postgres > /dev/null
                        psql -c "insert into abcd $selectrows" postgres > 
/dev/null
                        psql -c "vacuum freeze abcd;" postgres > /dev/null
                        psql -c "select pg_prewarm('abcd');" postgres > 
/dev/null
                        echo "select a,b,c,d from abcd order by a,b,c,d" > 
bench.sql
                        
                        psql -c "alter system set enable_sort_optimization = 
off;" postgres > /dev/null
                        psql -c "select pg_reload_conf();" postgres > /dev/null
                        echo -n "$rows Master: "
                        pgbench -n -f bench.sql -T 10 -M prepared postgres | 
grep latency

                        psql -c "alter system set enable_sort_optimization = 
on;" postgres > /dev/null
                        psql -c "select pg_reload_conf();" postgres > /dev/null
                        echo -n "$rows Patched: "
                        pgbench -n -f bench.sql -T 10 -M prepared postgres | 
grep latency
                done
    done
diff --git a/src/backend/utils/misc/guc_tables.c b/src/backend/utils/misc/guc_tables.c
index 1c0583fe26..6585164ca4 100644
--- a/src/backend/utils/misc/guc_tables.c
+++ b/src/backend/utils/misc/guc_tables.c
@@ -846,6 +846,16 @@ struct config_bool ConfigureNamesBool[] =
 		true,
 		NULL, NULL, NULL
 	},
+	{
+		{"enable_sort_optimization", PGC_USERSET, QUERY_TUNING_METHOD,
+			gettext_noop("Enables the sort optimizations."),
+			NULL,
+			GUC_EXPLAIN
+		},
+		&enable_sort_optimization,
+		true,
+		NULL, NULL, NULL
+	},
 	{
 		{"enable_incremental_sort", PGC_USERSET, QUERY_TUNING_METHOD,
 			gettext_noop("Enables the planner's use of incremental sort steps."),
diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample
index d06074b86f..1b49f48457 100644
--- a/src/backend/utils/misc/postgresql.conf.sample
+++ b/src/backend/utils/misc/postgresql.conf.sample
@@ -389,6 +389,7 @@
 #enable_presorted_aggregate = on
 #enable_seqscan = on
 #enable_sort = on
+#enable_sort_optimization = on
 #enable_tidscan = on
 
 # - Planner Cost Constants -
diff --git a/src/backend/utils/sort/tuplesort.c b/src/backend/utils/sort/tuplesort.c
index 9ca9835aab..cbaf54437c 100644
--- a/src/backend/utils/sort/tuplesort.c
+++ b/src/backend/utils/sort/tuplesort.c
@@ -127,6 +127,8 @@
 bool		trace_sort = false;
 #endif
 
+bool		enable_sort_optimization = true;
+
 #ifdef DEBUG_BOUNDED_SORT
 bool		optimize_bounded_sort = true;
 #endif
@@ -404,6 +406,7 @@ struct Sharedsort
 #define SERIAL(state)		((state)->shared == NULL)
 #define WORKER(state)		((state)->shared && (state)->worker != -1)
 #define LEADER(state)		((state)->shared && (state)->worker == -1)
+#define CACHESIZE 2*1024*1024L
 
 /*
  * NOTES about on-tape representation of tuples:
@@ -2725,6 +2728,8 @@ tuplesort_sort_memtuples(Tuplesortstate *state)
 		{
 			if (state->base.sortKeys[0].comparator == ssup_datum_unsigned_cmp)
 			{
+				SortSupport oldonlyKey = state->base.onlyKey;
+				state->base.onlyKey = state->base.sortKeys;
 				qsort_tuple_unsigned(state->memtuples,
 									 state->memtupcount,
 									 state);
@@ -2740,11 +2745,63 @@ tuplesort_sort_memtuples(Tuplesortstate *state)
 			}
 #endif
 			else if (state->base.sortKeys[0].comparator == ssup_datum_int32_cmp)
-			{
-				qsort_tuple_int32(state->memtuples,
+			{	
+				if (enable_sort_optimization)
+				{
+					/* backup onlyKey and set it as NON NULL so that tie breaker won't be called
+					 * aim is to sort on first col only now 
+					 */
+					SortSupport oldonlyKey = state->base.onlyKey;
+					state->base.onlyKey = state->base.sortKeys;
+					qsort_tuple_int32(state->memtuples,
+									state->memtupcount,
+									state);
+					int BATCHSIZE = ceil(CACHESIZE/sizeof(state->memtuples[0].tuple));
+					if (state->memtupcount<BATCHSIZE)
+					{
+						BATCHSIZE = state->memtupcount;
+					}
+					int i=0;
+					int j=0;
+					while(i<state->memtupcount)
+					{
+
+						if ( (i + BATCHSIZE) > state->memtupcount)
+						{
+							BATCHSIZE = state->memtupcount - i;
+						}
+						SortTuple  *memtuples = &state->memtuples[i];
+						qsort_tuple_int32(memtuples,
+									BATCHSIZE,
+									state);
+						SortTuple  *tempmemtuples = (SortTuple *) palloc(BATCHSIZE * sizeof(SortTuple));
+						j=i;
+						for (; j < i+BATCHSIZE; j++)
+						{
+							void* temp = state->memtuples[j].tuple;
+							state->memtuples[j].tuple = heap_copy_minimal_tuple((MinimalTuple) state->memtuples[j].tuple);
+							pfree(temp);
+						}
+
+
+							i += BATCHSIZE;
+					}
+					/* revert onlyKey for pass 2 */
+					state->base.onlyKey = oldonlyKey;
+
+					qsort_tuple_int32(state->memtuples,
+									state->memtupcount,
+									state);
+					return;
+				}
+				else
+				{
+					qsort_tuple_int32(state->memtuples,
 								  state->memtupcount,
 								  state);
 				return;
+
+				}
 			}
 		}
 
diff --git a/src/include/utils/guc.h b/src/include/utils/guc.h
index ba89d013e6..2ae1c90420 100644
--- a/src/include/utils/guc.h
+++ b/src/include/utils/guc.h
@@ -281,6 +281,7 @@ extern PGDLLIMPORT int tcp_keepalives_idle;
 extern PGDLLIMPORT int tcp_keepalives_interval;
 extern PGDLLIMPORT int tcp_keepalives_count;
 extern PGDLLIMPORT int tcp_user_timeout;
+extern PGDLLIMPORT bool enable_sort_optimization;
 
 #ifdef TRACE_SORT
 extern PGDLLIMPORT bool trace_sort;
diff --git a/src/backend/utils/misc/guc_tables.c b/src/backend/utils/misc/guc_tables.c
index 1c0583fe26..6585164ca4 100644
--- a/src/backend/utils/misc/guc_tables.c
+++ b/src/backend/utils/misc/guc_tables.c
@@ -846,6 +846,16 @@ struct config_bool ConfigureNamesBool[] =
 		true,
 		NULL, NULL, NULL
 	},
+	{
+		{"enable_sort_optimization", PGC_USERSET, QUERY_TUNING_METHOD,
+			gettext_noop("Enables the sort optimizations."),
+			NULL,
+			GUC_EXPLAIN
+		},
+		&enable_sort_optimization,
+		true,
+		NULL, NULL, NULL
+	},
 	{
 		{"enable_incremental_sort", PGC_USERSET, QUERY_TUNING_METHOD,
 			gettext_noop("Enables the planner's use of incremental sort steps."),
diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample
index d06074b86f..1b49f48457 100644
--- a/src/backend/utils/misc/postgresql.conf.sample
+++ b/src/backend/utils/misc/postgresql.conf.sample
@@ -389,6 +389,7 @@
 #enable_presorted_aggregate = on
 #enable_seqscan = on
 #enable_sort = on
+#enable_sort_optimization = on
 #enable_tidscan = on
 
 # - Planner Cost Constants -
diff --git a/src/backend/utils/sort/tuplesort.c b/src/backend/utils/sort/tuplesort.c
index 9ca9835aab..33b8f804dd 100644
--- a/src/backend/utils/sort/tuplesort.c
+++ b/src/backend/utils/sort/tuplesort.c
@@ -127,6 +127,8 @@
 bool		trace_sort = false;
 #endif
 
+bool		enable_sort_optimization = true;
+
 #ifdef DEBUG_BOUNDED_SORT
 bool		optimize_bounded_sort = true;
 #endif
@@ -404,6 +406,7 @@ struct Sharedsort
 #define SERIAL(state)		((state)->shared == NULL)
 #define WORKER(state)		((state)->shared && (state)->worker != -1)
 #define LEADER(state)		((state)->shared && (state)->worker == -1)
+#define CACHESIZE 2*1024*1024L
 
 /*
  * NOTES about on-tape representation of tuples:
@@ -2725,6 +2728,8 @@ tuplesort_sort_memtuples(Tuplesortstate *state)
 		{
 			if (state->base.sortKeys[0].comparator == ssup_datum_unsigned_cmp)
 			{
+				SortSupport oldonlyKey = state->base.onlyKey;
+				state->base.onlyKey = state->base.sortKeys;
 				qsort_tuple_unsigned(state->memtuples,
 									 state->memtupcount,
 									 state);
@@ -2740,11 +2745,51 @@ tuplesort_sort_memtuples(Tuplesortstate *state)
 			}
 #endif
 			else if (state->base.sortKeys[0].comparator == ssup_datum_int32_cmp)
-			{
-				qsort_tuple_int32(state->memtuples,
+			{	
+				if (enable_sort_optimization)
+				{
+					/* backup onlyKey and set it as NON NULL so that tie breaker won't be called
+					 * aim is to sort on first col only at first pass
+					 */
+					SortSupport oldonlyKey = state->base.onlyKey;
+					state->base.onlyKey = state->base.sortKeys;
+					qsort_tuple_int32(state->memtuples,
+									state->memtupcount,
+									state);
+
+					/* revert onlyKey for pass 2 */
+					state->base.onlyKey = oldonlyKey;
+					/* realloc tuples */
+					SortTuple  *tempmemtuples = (SortTuple *) palloc(state->memtupsize * sizeof(SortTuple));
+					int i;
+					for (i = 0; i < state->memtupcount; i++)
+					{	
+						tempmemtuples[i].datum1 = state->memtuples[i].datum1;
+						/*
+						 * could have just copied realloc temp tuple to memtuples.tuple
+						 * instead of allocating full memtuples but then have to call
+						 * pfree for each tuple, in current approach whole block can be freed in one go
+						 */					
+						tempmemtuples[i].tuple = heap_copy_minimal_tuple((MinimalTuple) state->memtuples[i].tuple);
+						tempmemtuples[i].isnull1 = state->memtuples[i].isnull1;
+						tempmemtuples[i].srctape = state->memtuples[i].srctape;
+					}
+					pfree(state->memtuples);
+					state->memtuples = tempmemtuples;
+
+					qsort_tuple_int32(state->memtuples,
+									state->memtupcount,
+									state);
+					return;
+				}
+				else
+				{
+					qsort_tuple_int32(state->memtuples,
 								  state->memtupcount,
 								  state);
 				return;
+
+				}
 			}
 		}
 
diff --git a/src/include/utils/guc.h b/src/include/utils/guc.h
index ba89d013e6..2ae1c90420 100644
--- a/src/include/utils/guc.h
+++ b/src/include/utils/guc.h
@@ -281,6 +281,7 @@ extern PGDLLIMPORT int tcp_keepalives_idle;
 extern PGDLLIMPORT int tcp_keepalives_interval;
 extern PGDLLIMPORT int tcp_keepalives_count;
 extern PGDLLIMPORT int tcp_user_timeout;
+extern PGDLLIMPORT bool enable_sort_optimization;
 
 #ifdef TRACE_SORT
 extern PGDLLIMPORT bool trace_sort;

Reply via email to