> I've now pushed that bug fix so it's fine to remove the change to
> tuplesort.c now.

Thanks, I've rebased the patch, please find attached the v6.

> 
> I also did a round of benchmarking on this patch using the attached
> script. Anyone wanting to run it will need to run make installcheck
> first to create the required tables.

I've run your benchmark, keeping the best of three runs each time.
This is an intel laptop, so as many things are running on it there is a lot of 
noise... 

Both standard and patched run come from a compilation with gcc -O2. No changes 
have been done to the default settings.

Query # Master  Patched Variation
1       884     1627    184.05%
2       364     375     103.02%
3       568     783     137.85%
4       296     297     100.34%
5       421     484     114.96%
6       359     408     113.65%
7       237     251     105.91%
8       806     1271    157.69%

Since I didn't reproduce your slowdown at all on the first run, I tried to 
rerun the benchmark several times and for the "dubious cases" (2, 4 and 7), 
the results are too jittery to conclude one way or another in my case.  I 
don't have access to proper hardware, so not sure if that would be useful in 
any way to just run the bench for thousands of xacts instead. I would be 
surprised the check adds that much to the whole execution though.

I attach a graph similar to yours for reference.


-- 
Ronan Dunklau
commit 01885709d1718710db1665bf2b3f39d83c720147
Author: Ronan Dunklau <ronan.dunk...@aiven.io>
Date:   Tue Jul 6 16:34:31 2021 +0200

    Use optimized single-datum tuplesort in ExecSort
    
    Previously, with optimization was only done in ExecAgg.
    Doing this optimization on regular sort nodes provides for a nice
    performance improvement, when the input / output tuple has only one
    attribute.

diff --git a/src/backend/executor/nodeSort.c b/src/backend/executor/nodeSort.c
index b99027e0d7..df4e79c6ba 100644
--- a/src/backend/executor/nodeSort.c
+++ b/src/backend/executor/nodeSort.c
@@ -29,6 +29,10 @@
  *		which saves the results in a temporary file or memory. After the
  *		initial call, returns a tuple from the file with each call.
  *
+ *		The tuplesort can either occur on the whole tuple (this is the nominal
+ *		case) or, when the input / output tuple consists of only one attribute,
+ *		we switch to the tuplesort_*_datum API, optimized for that specific case.
+ *
  *		Conditions:
  *		  -- none.
  *
@@ -86,31 +90,64 @@ ExecSort(PlanState *pstate)
 		outerNode = outerPlanState(node);
 		tupDesc = ExecGetResultType(outerNode);
 
-		tuplesortstate = tuplesort_begin_heap(tupDesc,
-											  plannode->numCols,
-											  plannode->sortColIdx,
-											  plannode->sortOperators,
-											  plannode->collations,
-											  plannode->nullsFirst,
-											  work_mem,
-											  NULL,
-											  node->randomAccess);
+		/*
+		 * Switch to the tuplesort_*_datum interface when we have only one
+		 * key, as it is much more efficient especially when the type is
+		 * pass-by-value.
+		 */
+		if (tupDesc->natts == 1)
+		{
+			node->datumSort = true;
+			tuplesortstate = tuplesort_begin_datum(TupleDescAttr(tupDesc, 0)->atttypid,
+												   plannode->sortOperators[0],
+												   plannode->collations[0],
+												   plannode->nullsFirst[0],
+												   work_mem,
+												   NULL,
+												   node->randomAccess);
+		}
+		else
+			tuplesortstate = tuplesort_begin_heap(tupDesc,
+												  plannode->numCols,
+												  plannode->sortColIdx,
+												  plannode->sortOperators,
+												  plannode->collations,
+												  plannode->nullsFirst,
+												  work_mem,
+												  NULL,
+												  node->randomAccess);
 		if (node->bounded)
 			tuplesort_set_bound(tuplesortstate, node->bound);
 		node->tuplesortstate = (void *) tuplesortstate;
 
 		/*
-		 * Scan the subplan and feed all the tuples to tuplesort.
+		 * Scan the subplan and feed all the tuples to tuplesort, using either
+		 * the _putdatum or _puttupleslot API as appropriate.
 		 */
-
-		for (;;)
+		if (node->datumSort)
 		{
-			slot = ExecProcNode(outerNode);
-
-			if (TupIsNull(slot))
-				break;
-
-			tuplesort_puttupleslot(tuplesortstate, slot);
+			for (;;)
+			{
+				slot = ExecProcNode(outerNode);
+
+				if (TupIsNull(slot))
+					break;
+				slot_getsomeattrs(slot, 1);
+				tuplesort_putdatum(tuplesortstate,
+								   slot->tts_values[0],
+								   slot->tts_isnull[0]);
+			}
+		}
+		else
+		{
+			for (;;)
+			{
+				slot = ExecProcNode(outerNode);
+
+				if (TupIsNull(slot))
+					break;
+				tuplesort_puttupleslot(tuplesortstate, slot);
+			}
 		}
 
 		/*
@@ -150,9 +187,18 @@ ExecSort(PlanState *pstate)
 	 * next fetch from the tuplesort.
 	 */
 	slot = node->ss.ps.ps_ResultTupleSlot;
-	(void) tuplesort_gettupleslot(tuplesortstate,
-								  ScanDirectionIsForward(dir),
-								  false, slot, NULL);
+	if (node->datumSort)
+	{
+		ExecClearTuple(slot);
+		if (tuplesort_getdatum(tuplesortstate, ScanDirectionIsForward(dir),
+							   &(slot->tts_values[0]), &(slot->tts_isnull[0]), NULL))
+			ExecStoreVirtualTuple(slot);
+	}
+	else
+		(void) tuplesort_gettupleslot(tuplesortstate,
+									  ScanDirectionIsForward(dir),
+									  false, slot, NULL);
+
 	return slot;
 }
 
@@ -191,6 +237,7 @@ ExecInitSort(Sort *node, EState *estate, int eflags)
 	sortstate->bounded = false;
 	sortstate->sort_Done = false;
 	sortstate->tuplesortstate = NULL;
+	sortstate->datumSort = false;
 
 	/*
 	 * Miscellaneous initialization
diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h
index 0ec5509e7e..89f944a9c4 100644
--- a/src/include/nodes/execnodes.h
+++ b/src/include/nodes/execnodes.h
@@ -2151,6 +2151,7 @@ typedef struct SortState
 	int64		bound_Done;		/* value of bound we did the sort with */
 	void	   *tuplesortstate; /* private state of tuplesort.c */
 	bool		am_worker;		/* are we a worker? */
+	bool		datumSort;  /* are we using the single value optimization ? */
 	SharedSortInfo *shared_info;	/* one entry per worker */
 } SortState;
 

Reply via email to