I wrote:

> I've set the threshold to 400 for now, but I'm not claiming that's the
> end of the story. In addition to the underestimation mentioned above,
> unrolling the counting step is a factor. Unrolling makes smaller
> inputs worse (which we can reach by recursing from larger inputs), but
> unrolling seems important for large inputs with low cardinality (a few
> percent, but I haven't shared numbers yet). We've found that a large
> input with only 4-5 distinct values just barely wins with radix sort.
> I'll be curious to see if unrolling is actually needed to prevent
> regressions there.

Looking more closely at my development history, it turns out I added
loop unrolling before adding common prefix detection. Most real-world
non-negative integers have the same upper bytes, especially since the
datum is 8 bytes regardless of the underlying type. For those bytes,
the radix sort finds only one distinct byte value and moves on to the
next byte. By detecting the common prefix as we condition the datums,
how fast we can count matters less, since we simply skip that useless
work. (This is less relevant when we have an abbreviated datum.)
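
To illustrate, here's a minimal standalone sketch of the idea (not the
patch code itself, which does the equivalent with Datum and
pg_leftmost_one_pos64(); this sketch assumes a GCC/Clang-style
__builtin_clzll): accumulate the XOR of every conditioned value against
the first one, and the leftmost set bit of the result marks the first
byte at which any two values differ:

#include <stddef.h>
#include <stdint.h>

static int
common_prefix_bytes(const uint64_t *vals, size_t n)
{
	uint64_t	diff_bits = 0;

	/* any bit set here differs between at least two values */
	for (size_t i = 1; i < n; i++)
		diff_bits |= vals[0] ^ vals[i];

	if (diff_bits == 0)
		return sizeof(uint64_t);	/* all values are identical */

	/* number of whole leading bytes in which no value differs */
	return __builtin_clzll(diff_bits) / 8;
}

(The patch keeps the maximum of the per-value XORs rather than OR-ing
them together; both give the same leftmost differing bit, and so the
same starting byte for the radix sort.)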

Repeating part of the microbenchmark from last time with no unrolling,
a threshold of 200 works for all but the lowest cardinality inputs:

cardinality: 256
number of elements:  100   qsort: 34.8 radix: 38.3
number of elements:  200   qsort: 34.9 radix: 29.7
number of elements:  400   qsort: 40.8 radix: 29.2

cardinality: 16
number of elements:  100   qsort: 19.3 radix: 26.2
number of elements:  200   qsort: 18.9 radix: 18.2
number of elements:  400   qsort: 18.5 radix: 14.5

cardinality: 2
number of elements:  100   qsort: 09.3 radix: 21.6
number of elements:  200   qsort: 08.9 radix: 15.4
number of elements:  400   qsort: 10.3 radix: 14.0

To test further, I dug up a test from [1] that stresses low
cardinality on multiple sort keys (attached in a form to allow turning
radix sort on and off via a command line argument), and found no
regression with or without loop unrolling:

V2:
off:
4 ^ 8: latency average = 101.070 ms
5 ^ 8: latency average = 704.862 ms
6 ^ 8: latency average = 3651.015 ms
7 ^ 8: latency average = 15141.412 ms

on:
4 ^ 8: latency average = 99.939 ms
5 ^ 8: latency average = 683.018 ms
6 ^ 8: latency average = 3545.626 ms
7 ^ 8: latency average = 14095.677 ms

V3:
off:
4 ^ 8: latency average = 99.486 ms
5 ^ 8: latency average = 693.434 ms
6 ^ 8: latency average = 3607.940 ms
7 ^ 8: latency average = 14602.325 ms

on:
4 ^ 8: latency average = 97.664 ms
5 ^ 8: latency average = 678.752 ms
6 ^ 8: latency average = 3361.765 ms
7 ^ 8: latency average = 14121.190 ms

So v3 gets rid of loop unrolling and reduces the threshold to 200.

[1] 
https://www.postgresql.org/message-id/flat/CAApHDvqkHZsT2gaAWFM7D%3D7qyQ%3DeKXQvvn%2BBkwCn4Rvj1w4EKQ%40mail.gmail.com#1c67321e09be15e031d3995d45a1b8fd

TODOs:
- adding a "sorted pre-check" to keep up with our qsort for ascending
  inputs (see the sketch after this list)
- further performance validation
- possibly doing NULL partitioning differently
- possibly specializing qsort on the NULL partition
- code cleanup
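
For the first TODO above, the pre-check could be as simple as the
following (hypothetical sketch; the function name and call site are
illustrative, but it reuses the patch's qsort_tuple_conditioned_compare
so that ties on cond_datum1 still consult the tiebreak comparator):

static bool
presorted_conditioned(SortTuple *begin, size_t n, Tuplesortstate *state)
{
	for (size_t i = 1; i < n; i++)
	{
		/* full comparison, so equal cond_datum1 values are also checked */
		if (qsort_tuple_conditioned_compare(&begin[i - 1], &begin[i],
											state) > 0)
			return false;
	}
	return true;
}

sort_tuple_conditioned() would call this on the not-NULL partition
before dispatching, returning early when it reports true.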

--
John Naylor
Amazon Web Services

Attachment: bench_cartesiansort.sh
Description: Bourne shell script

From 8247793423da15e6c7558808897203f3234b2666 Mon Sep 17 00:00:00 2001
From: John Naylor <[email protected]>
Date: Fri, 17 Oct 2025 09:57:43 +0700
Subject: [PATCH v3] Use radix sort when datum1 is an integer type

XXX regression tests don't pass for underspecified queries; this
is expected
---
 src/backend/utils/misc/guc_parameters.dat |   7 +
 src/backend/utils/sort/tuplesort.c        | 488 +++++++++++++++++++++-
 src/include/utils/guc.h                   |   1 +
 src/include/utils/tuplesort.h             |  12 +-
 4 files changed, 487 insertions(+), 21 deletions(-)

diff --git a/src/backend/utils/misc/guc_parameters.dat b/src/backend/utils/misc/guc_parameters.dat
index 25da769eb35..bbfc52adcaf 100644
--- a/src/backend/utils/misc/guc_parameters.dat
+++ b/src/backend/utils/misc/guc_parameters.dat
@@ -3469,6 +3469,13 @@
   max => 'INT_MAX',
 },
 
+{ name => 'wip_radix_sort', type => 'bool', context => 'PGC_USERSET', group => 'DEVELOPER_OPTIONS',
+  short_desc => 'Test radix sort for debugging.',
+  flags => 'GUC_NOT_IN_SAMPLE',
+  variable => 'wip_radix_sort',
+  boot_val => 'true',
+},
+
 { name => 'work_mem', type => 'int', context => 'PGC_USERSET', group => 'RESOURCES_MEM',
   short_desc => 'Sets the maximum memory to be used for query workspaces.',
   long_desc => 'This much memory can be used by each internal sort operation and hash table before switching to temporary disk files.',
diff --git a/src/backend/utils/sort/tuplesort.c b/src/backend/utils/sort/tuplesort.c
index 5d4411dc33f..84abd1995f7 100644
--- a/src/backend/utils/sort/tuplesort.c
+++ b/src/backend/utils/sort/tuplesort.c
@@ -104,6 +104,7 @@
 #include "commands/tablespace.h"
 #include "miscadmin.h"
 #include "pg_trace.h"
+#include "port/pg_bitutils.h"
 #include "storage/shmem.h"
 #include "utils/guc.h"
 #include "utils/memutils.h"
@@ -122,6 +123,7 @@
 
 /* GUC variables */
 bool		trace_sort = false;
+bool		wip_radix_sort = true;
 
 #ifdef DEBUG_BOUNDED_SORT
 bool		optimize_bounded_sort = true;
@@ -490,6 +492,25 @@ static void tuplesort_updatemax(Tuplesortstate *state);
  * abbreviations of text or multi-key sorts.  There could be!  Is it worth it?
  */
 
+/* Used for conditioned datums, so we can ignore NULLs and sort direction. */
+static pg_attribute_always_inline int
+qsort_tuple_conditioned_compare(SortTuple *a, SortTuple *b, Tuplesortstate *state)
+{
+	if (a->cond_datum1 < b->cond_datum1)
+		return -1;
+	if (a->cond_datum1 > b->cond_datum1)
+		return 1;
+
+	/*
+	 * No need to waste effort calling the tiebreak function when there are no
+	 * other keys to sort on.
+	 */
+	if (state->base.onlyKey != NULL)
+		return 0;
+
+	return state->base.comparetup_tiebreak(a, b, state);
+}
+
 /* Used if first key's comparator is ssup_datum_unsigned_cmp */
 static pg_attribute_always_inline int
 qsort_tuple_unsigned_compare(SortTuple *a, SortTuple *b, Tuplesortstate *state)
@@ -567,6 +588,15 @@ qsort_tuple_int32_compare(SortTuple *a, SortTuple *b, Tuplesortstate *state)
  * common comparison functions on pass-by-value leading datums.
  */
 
+#define ST_SORT qsort_tuple_conditioned
+#define ST_ELEMENT_TYPE SortTuple
+#define ST_COMPARE(a, b, state) qsort_tuple_conditioned_compare(a, b, state)
+#define ST_COMPARE_ARG_TYPE Tuplesortstate
+#define ST_CHECK_FOR_INTERRUPTS
+#define ST_SCOPE static
+#define ST_DEFINE
+#include "lib/sort_template.h"
+
 #define ST_SORT qsort_tuple_unsigned
 #define ST_ELEMENT_TYPE SortTuple
 #define ST_COMPARE(a, b, state) qsort_tuple_unsigned_compare(a, b, state)
@@ -615,6 +645,212 @@ qsort_tuple_int32_compare(SortTuple *a, SortTuple *b, Tuplesortstate *state)
 #define ST_DEFINE
 #include "lib/sort_template.h"
 
+
+/*
+ * WIP: For now prefer test coverage of radix sort in Assert builds.
+ */
+#ifdef USE_ASSERT_CHECKING
+#define QSORT_THRESHOLD 0
+#else
+#define QSORT_THRESHOLD 200
+#endif
+
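+/*
+ * Per-byte-value bookkeeping for one radix pass: "count" is used while
+ * counting key bytes, then the same storage is reused as the next write
+ * "offset" during permutation; next_offset marks the end of the partition.
+ */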
+typedef struct PartitionInfo
+{
+	union
+	{
+		size_t		count;
+		size_t		offset;
+	};
+	size_t		next_offset;
+}			PartitionInfo;
+
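+/*
+ * Return the byte of the conditioned datum at the given radix level;
+ * level 0 is the most significant byte of the datum.
+ */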
+static inline uint8_t
+extract_key(Datum key, int level)
+{
+	return (key >> (((SIZEOF_DATUM - 1) - level) * 8)) & 0xFF;
+}
+
+static inline void
+swap(SortTuple *a, SortTuple *b)
+{
+	SortTuple	tmp = *a;
+
+	*a = *b;
+	*b = tmp;
+}
+
+/*
+ * Condition datum to work with pure unsigned comparison,
+ * taking ASC/DESC into account as well.
+ */
+static inline Datum
+condition_datum(Datum orig, SortSupport ssup)
+{
+	Datum		cond_datum1;
+
+	if (ssup->comparator == ssup_datum_signed_cmp)
+	{
+		/*
+		 * Flipping the sign bit maps signed order onto unsigned order; the
+		 * value was already cast to unsigned when stored.
+		 */
+		cond_datum1 = orig ^ (UINT64CONST(1) << 63);
+	}
+	else if (ssup->comparator == ssup_datum_int32_cmp)
+	{
+		/*
+		 * First normalize to uint32. Technically, we don't need to do this,
+		 * but it forces the upper bytes to remain the same regardless of
+		 * sign.
+		 */
+		uint32		u32 = DatumGetUInt32(orig) ^ ((uint32) 1 << 31);
+
+		cond_datum1 = UInt32GetDatum(u32);
+	}
+	else
+	{
+		Assert(ssup->comparator == ssup_datum_unsigned_cmp);
+		cond_datum1 = orig;
+	}
+
+	if (ssup->ssup_reverse)
+		cond_datum1 = ~cond_datum1;
+
+	return cond_datum1;
+}
+
+/*
+ * Based on implementation in https://github.com/skarupke/ska_sort (Boost license),
+ * with the following changes:
+ *  - count sorted partitions in every pass, rather than maintaining a
+ *    list of unsorted partitions
+ * TODO: match qsort API with number of elements rather than end pointer
+ */
+static void
+ska_byte_sort(SortTuple *begin,
+			  SortTuple *end, int level, Tuplesortstate *state)
+{
+	PartitionInfo partitions[256] = {0};
+	uint8_t		remaining_partitions[256] = {0};
+	size_t		total = 0;
+	int			num_partitions = 0;
+	int			num_remaining;
+
+	/* count key chunks */
+	for (SortTuple *tup = begin; tup < end; tup++)
+	{
+		uint8		key_chunk;
+
+		key_chunk = extract_key(tup->cond_datum1, level);
+		partitions[key_chunk].count++;
+	}
+
+	/* compute partition offsets */
+	for (int i = 0; i < 256; ++i)
+	{
+		size_t		count = partitions[i].count;
+
+		if (count)
+		{
+			partitions[i].offset = total;
+			total += count;
+			remaining_partitions[num_partitions] = i;
+			++num_partitions;
+		}
+		partitions[i].next_offset = total;
+	}
+
+	num_remaining = num_partitions;
+
+	/*
+	 * Permute tuples to correct partition. If we started with one partition,
+	 * there is nothing to do. If a permutation from a previous iteration
+	 * results in a single partition that hasn't been marked as sorted, we
+	 * know it's actually sorted.
+	 */
+	while (num_remaining > 1)
+	{
+		/*
+		 * We can only exit the loop when all partitions are sorted, so must
+		 * reset every iteration
+		 */
+		num_remaining = num_partitions;
+
+		for (int i = 0; i < num_partitions; i++)
+		{
+			uint8		idx = remaining_partitions[i];
+
+			PartitionInfo part = partitions[idx];
+
+			for (SortTuple *st = begin + part.offset;
+				 st < begin + part.next_offset;
+				 st++)
+			{
+				uint8		this_partition = extract_key(st->cond_datum1, level);
+				size_t		offset = partitions[this_partition].offset++;
+
+				Assert(begin + offset < end);
+				swap(st, begin + offset);
+			}
+
+			if (part.offset == part.next_offset)
+			{
+				/* partition is sorted */
+				num_remaining--;
+			}
+		}
+	}
+
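+	/*
+	 * Each partition now holds tuples whose key bytes are equal at this
+	 * level; recurse into the next byte (small partitions fall back to
+	 * qsort), and once all datum bytes are consumed, finish with the
+	 * tiebreak comparator if there are more sort keys.
+	 */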
+	{
+		size_t		start_offset = 0;
+		SortTuple  *partition_begin = begin;
+
+		for (uint8_t *it = remaining_partitions,
+			 *rp_end = remaining_partitions + num_partitions;
+			 it != rp_end;
+			 ++it)
+		{
+			size_t		end_offset = partitions[*it].next_offset;
+			SortTuple  *partition_end = begin + end_offset;
+			ptrdiff_t	num_elements = end_offset - start_offset;
+
+			if (num_elements > 1)
+			{
+				if (level < SIZEOF_DATUM - 1)
+				{
+					if (num_elements < QSORT_THRESHOLD)
+					{
+						qsort_tuple_conditioned(partition_begin,
+												num_elements,
+												state);
+					}
+					else
+					{
+						ska_byte_sort(partition_begin,
+									  partition_end,
+									  level + 1,
+									  state);
+					}
+				}
+				else if (state->base.onlyKey == NULL)
+				{
+					/*
+					 * Finished radix sort on all bytes of cond_datum1
+					 * (possibily abbreviated), now qsort with tiebreak
+					 * comparator. XXX comparetup_tiebreak cannot inspect
+					 * isnull1. In core, the only tiebreak that does so is
+					 * comparetup_datum_tiebreak, but we wouldn't have gotten
+					 * here if that's the case.
+					 */
+					qsort_tuple(partition_begin,
+								num_elements,
+								state->base.comparetup_tiebreak,
+								state);
+				}
+			}
+			start_offset = end_offset;
+			partition_begin = partition_end;
+		}
+	}
+}
+
 /*
  *		tuplesort_begin_xxx
  *
@@ -2663,10 +2899,208 @@ sort_bounded_heap(Tuplesortstate *state)
 	state->boundUsed = true;
 }
 
+/* WIP: allow turning common prefix skipping off for testing */
+#define COMMON_PREFIX
+
+/*
+ * Compute conditioned datums for SortTuples so that a single
+ * unsigned comparison resolves the sort order of datum1.
+ * Then dispatch to either radix sort or a specialized qsort.
+ */
+static void
+sort_tuple_conditioned(Tuplesortstate *state)
+{
+	SortSupportData ssup = state->base.sortKeys[0];
+
+	bool		nulls_first = ssup.ssup_nulls_first;
+	SortTuple  *first = state->memtuples;
+	SortTuple  *last = state->memtuples + state->memtupcount;
+	SortTuple  *not_null_start;
+	size_t		d1,
+				d2,
+				not_null_count;
+#ifdef COMMON_PREFIX
+	Datum		first_datum = 0;
+	Datum		common_upper_bits = 0;
+#endif
+	int			common_prefix;
+
+	/*
+	 * Partition by isnull1, since we can only radix sort on non-NULL
+	 * elements. This also allows the qsort fallback to ignore NULLs.
+	 */
+
+	/*
+	 * Find the leftmost NOT NULL tuple if NULLS FIRST, or leftmost NULL
+	 * element if NULLS LAST.
+	 */
+	while (first < last && first->isnull1 == nulls_first)
+		first++;
+
+	/*
+	 * XXX We must start "last" after the final tuple to maintain the
+	 * invariant that it ends up one after the first partition, and the first
+	 * partition may correspond to the entire array. If "first" hasn't
+	 * advanced this far, we need to pre-decrement "last" before beginning
+	 * its loop.
+	 */
+	if (first < last)
+		last--;
+
+	/*
+	 * Find the rightmost NULL tuple if NULLS FIRST, or rightmost NOT NULL
+	 * tuple if NULLS LAST.
+	 */
+	while (first < last && last->isnull1 != nulls_first)
+		last--;
+
+	/* swap pairs of tuples that are in the wrong order */
+	while (first < last)
+	{
+		swap(first, last);
+		while (first < last && first->isnull1 == nulls_first)
+			first++;
+		while (first < last && last->isnull1 != nulls_first)
+			last--;
+	}
+
+	d1 = last - state->memtuples;
+	d2 = state->memtupcount - d1;
+
+	Assert(last == first);
+	Assert(last + d2 == state->memtuples + state->memtupcount);
+	for (SortTuple *pm = state->memtuples;
+		 pm < state->memtuples + d1;
+		 pm++)
+		Assert(pm->isnull1 == nulls_first);
+	for (SortTuple *pm = last;
+		 pm < last + d2;
+		 pm++)
+		Assert(pm->isnull1 != nulls_first);
+
+	/*
+	 * Sort NULL partition using tiebreak comparator. XXX this will repeat the
+	 * NULL check for abbreviated keys.
+	 */
+	if (nulls_first)
+	{
+		qsort_tuple(state->memtuples,
+					d1,
+					state->base.comparetup_tiebreak,
+					state);
+		not_null_start = last;
+		not_null_count = d2;
+	}
+	else
+	{
+		qsort_tuple(last,
+					d2,
+					state->base.comparetup_tiebreak,
+					state);
+		not_null_start = state->memtuples;
+		not_null_count = d1;
+	}
+
+	/*
+	 * Condition datum so that unsigned comparison is order-preserving, and
+	 * compute the common prefix to skip unproductive recursion steps during
+	 * radix sort.
+	 */
+	for (SortTuple *tup = not_null_start;
+		 tup < not_null_start + not_null_count;
+		 tup++)
+	{
+		Datum		cond_datum1 = condition_datum(tup->datum1, &ssup);
+#ifdef COMMON_PREFIX
+		if (tup == not_null_start)
+		{
+			/* Need to start with some value, may as well be the first one. */
+			first_datum = cond_datum1;
+		}
+		else
+		{
+			Datum		this_common_bits;
+
+			/* The bits in common will be zero */
+			this_common_bits = first_datum ^ cond_datum1;
+
+			/*
+			 * We're really only interested in the case where the leftmost one
+			 * bit is further left, but this branch should be predictable
+			 * enough not to waste cycles trying harder.
+			 */
+			if (this_common_bits > common_upper_bits)
+				common_upper_bits = this_common_bits;
+		}
+#endif
+		tup->cond_datum1 = cond_datum1;
+	}
+
+	/*
+	 * Sort the not-NULL partition, using radix sort if large enough,
+	 * otherwise fall back to quicksort.
+	 */
+	if (not_null_count < QSORT_THRESHOLD)
+		qsort_tuple_conditioned(not_null_start,
+								not_null_count,
+								state);
+	else
+	{
+
+		/*
+		 * The upper bits of common_upper_bits are zero where all values have
+		 * the same bits. The byte position of the leftmost one bit is the
+		 * byte where radix sort should start bucketing. OR-ing in the lowest
+		 * bit guards against undefined behavior without changing the result.
+		 */
+#ifdef COMMON_PREFIX
+		common_prefix = sizeof(Datum) - 1 -
+			(pg_leftmost_one_pos64(common_upper_bits | 1) / BITS_PER_BYTE);
+#else
+		common_prefix = 0;
+#endif
+		ska_byte_sort(not_null_start,
+					  not_null_start + not_null_count,
+					  common_prefix,
+					  state);
+	}
+
+	/*
+	 * Restore fields that were overwritten with temporary conditioned datum1
+	 */
+	for (SortTuple *tup = not_null_start;
+		 tup < not_null_start + not_null_count;
+		 tup++)
+	{
+		/* need to restore NOT NULL */
+		tup->isnull1 = false;
+		/* be tidy */
+		tup->srctape = 0;
+	}
+}
+
+/* Verify sort using standard comparator. */
+static void
+check_sorted(Tuplesortstate *state)
+{
+#ifdef USE_ASSERT_CHECKING
+	for (SortTuple *pm = state->memtuples + 1;
+		 pm < state->memtuples + state->memtupcount;
+		 pm++)
+	{
+#if 0
+		Assert(COMPARETUP(state, pm - 1, pm) <= 0);
+#else
+		if (COMPARETUP(state, pm - 1, pm) > 0)
+			elog(ERROR, "SORT FAILED");
+#endif
+	}
+#endif
+}
+
 /*
- * Sort all memtuples using specialized qsort() routines.
+ * Sort all memtuples using specialized routines.
  *
- * Quicksort is used for small in-memory sorts, and external sort runs.
+ * Quicksort or radix sort is used for in-memory sorts and external sort runs.
  */
 static void
 tuplesort_sort_memtuples(Tuplesortstate *state)
@@ -2681,26 +3115,42 @@ tuplesort_sort_memtuples(Tuplesortstate *state)
 		 */
 		if (state->base.haveDatum1 && state->base.sortKeys)
 		{
-			if (state->base.sortKeys[0].comparator == ssup_datum_unsigned_cmp)
-			{
-				qsort_tuple_unsigned(state->memtuples,
-									 state->memtupcount,
-									 state);
-				return;
-			}
-			else if (state->base.sortKeys[0].comparator == ssup_datum_signed_cmp)
+			SortSupportData ssup = state->base.sortKeys[0];
+
+			if (wip_radix_sort)
 			{
-				qsort_tuple_signed(state->memtuples,
-								   state->memtupcount,
-								   state);
-				return;
+				if ((ssup.comparator == ssup_datum_unsigned_cmp ||
+					 ssup.comparator == ssup_datum_signed_cmp ||
+					 ssup.comparator == ssup_datum_int32_cmp))
+				{
+					sort_tuple_conditioned(state);
+					check_sorted(state);
+					return;
+				}
 			}
-			else if (state->base.sortKeys[0].comparator == ssup_datum_int32_cmp)
+			else
 			{
-				qsort_tuple_int32(state->memtuples,
-								  state->memtupcount,
-								  state);
-				return;
+				if (state->base.sortKeys[0].comparator == ssup_datum_unsigned_cmp)
+				{
+					qsort_tuple_unsigned(state->memtuples,
+										 state->memtupcount,
+										 state);
+					return;
+				}
+				else if (state->base.sortKeys[0].comparator == ssup_datum_signed_cmp)
+				{
+					qsort_tuple_signed(state->memtuples,
+									   state->memtupcount,
+									   state);
+					return;
+				}
+				else if (state->base.sortKeys[0].comparator == ssup_datum_int32_cmp)
+				{
+					qsort_tuple_int32(state->memtuples,
+									  state->memtupcount,
+									  state);
+					return;
+				}
 			}
 		}
 
diff --git a/src/include/utils/guc.h b/src/include/utils/guc.h
index f21ec37da89..bc6f7fa60f3 100644
--- a/src/include/utils/guc.h
+++ b/src/include/utils/guc.h
@@ -324,6 +324,7 @@ extern PGDLLIMPORT int tcp_user_timeout;
 extern PGDLLIMPORT char *role_string;
 extern PGDLLIMPORT bool in_hot_standby_guc;
 extern PGDLLIMPORT bool trace_sort;
+extern PGDLLIMPORT bool wip_radix_sort;
 
 #ifdef DEBUG_BOUNDED_SORT
 extern PGDLLIMPORT bool optimize_bounded_sort;
diff --git a/src/include/utils/tuplesort.h b/src/include/utils/tuplesort.h
index 0bf55902aa1..27cd12985fa 100644
--- a/src/include/utils/tuplesort.h
+++ b/src/include/utils/tuplesort.h
@@ -149,8 +149,16 @@ typedef struct
 {
 	void	   *tuple;			/* the tuple itself */
 	Datum		datum1;			/* value of first key column */
-	bool		isnull1;		/* is first key column NULL? */
-	int			srctape;		/* source tape number */
+
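+	/*
+	 * cond_datum1 overlays isnull1/srctape: the radix sort path runs only
+	 * on the not-NULL partition and restores both fields when it finishes.
+	 */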
+	union
+	{
+		struct
+		{
+			bool		isnull1;		/* is first key column NULL? */
+			int			srctape;		/* source tape number */
+		};
+		Datum		cond_datum1;		/* sort key for radix sort */
+	};
 } SortTuple;
 
 typedef int (*SortTupleComparator) (const SortTuple *a, const SortTuple *b,
-- 
2.51.1
