Re: [PATCH] ANALYZE: hash-accelerate MCV tracking for equality-only types

Ilia Evdokimov Sun, 03 May 2026 07:16:50 -0700

On 4/14/26 04:34, Chengpeng Yan wrote:

I split v5 accordingly. The first patch changes the singleton handling
from shifting to a cursor-based eviction scheme, and the second patch
adds the hash lookup.

I reviewed v5 of the patches. Instead of going through each issue one byone, I made a pass to clean up and clarify the code and summarize themain changes belows:

- Fixed a few typos in comments and added comments in places where thelogic was not immediately clear;

- Rewrote the bubble-up loop using `for` loop, which I find morereadable. Also removed some confusing uses of the `j` variable that masethe flow harder to follow;


- Simplified parts of the code to improve overall readability.

--
Best regards,
Ilia Evdokimov,
Tantor Labs LLC,
https://tantorlabs.com/

From 6f081a9525aff0792fd42b2d629133ed5114a6de Mon Sep 17 00:00:00 2001
From: Evdokimov Ilia <[email protected]>
Date: Sun, 3 May 2026 17:04:34 +0300
Subject: [PATCH v6 2/2] ANALYZE: hash-accelerate MCV tracking for
 equality-only types

compute_distinct_stats() still performs a linear search of track[]
for each sampled value. At higher statistics targets, that lookup cost
can dominate ANALYZE time for equality-only datatypes.

When the type cache's default hash support matches the equality
operator, and the statistics target is at least
ANALYZE_HASH_THRESHOLD, maintain a simplehash table that maps tracked
values to their current track[] slots. That reduces match lookup from
linear to O(1) on average.

Entries that move or are replaced update the hash table so track[] and
the lookup structure stay in sync, while the existing linear path
remains available as a fallback.
---
 src/backend/commands/analyze.c | 171 ++++++++++++++++++++++++++++++---
 1 file changed, 160 insertions(+), 11 deletions(-)

diff --git a/src/backend/commands/analyze.c b/src/backend/commands/analyze.c
index 262adf41b24..2e6fbe5cc12 100644
--- a/src/backend/commands/analyze.c
+++ b/src/backend/commands/analyze.c
@@ -55,6 +55,7 @@
 #include "utils/sortsupport.h"
 #include "utils/syscache.h"
 #include "utils/timestamp.h"
+#include "utils/typcache.h"
 
 /* Per-index data for ANALYZE */
 typedef struct AnlIndexData
@@ -1900,6 +1901,12 @@ ind_fetch_func(VacAttrStatsP stats, int rownum, bool *isNull)
  */
 #define WIDTH_THRESHOLD  1024
 
+/*
+ * Build a hash table for distinct/MCV tracking only when the statistics
+ * target is large enough to justify the overhead of maintaining it.
+ */
+#define ANALYZE_HASH_THRESHOLD  100
+
 #define swapInt(a,b)	do {int _tmp; _tmp=a; a=b; b=_tmp;} while(0)
 #define swapDatum(a,b)	do {Datum _tmp; _tmp=a; a=b; b=_tmp;} while(0)
 
@@ -1918,7 +1925,29 @@ typedef struct
 	int		   *tupnoLink;
 } CompareScalarsContext;
 
+/* Entries in the simplehash hash table used by compute_distinct_stats */
+typedef struct DistinctHashEntry
+{
+	Datum		value;			/* the value represented by this entry */
+	int			index;			/* its index in the relevant track */
+	uint32		hash;			/* hash code for the Datum */
+	char		status;			/* status code used by simplehash.h */
+}			DistinctHashEntry;
+
+/* private_data for the simplehash hash table */
+typedef struct DistinctHashContext
+{
+	FmgrInfo   *eqfunc;			/* the equality operator */
+	FmgrInfo   *hashfunc;		/* the hash function to use */
+	Oid			collation;		/* collation to use equality and hash calls */
+}			DistinctHashContext;
+
 
+/* forward reference */
+typedef struct DistinctHash_hash DistinctHash_hash;
+
+static uint32 distinct_hash_hash(DistinctHash_hash * tab, Datum key);
+static bool distinct_hash_equal(DistinctHash_hash * tab, Datum key0, Datum key1);
 static void compute_trivial_stats(VacAttrStatsP stats,
 								  AnalyzeAttrFetchFunc fetchfunc,
 								  int samplerows,
@@ -1940,6 +1969,53 @@ static int	analyze_mcv_list(int *mcv_counts,
 							 int samplerows,
 							 double totalrows);
 
+/* Define support routines for compute distinct values hash tables */
+#define SH_PREFIX DistinctHash
+#define SH_ELEMENT_TYPE DistinctHashEntry
+#define SH_KEY_TYPE Datum
+#define SH_KEY value
+#define SH_HASH_KEY(tab, key) distinct_hash_hash(tab, key)
+#define SH_EQUAL(tab, key0, key1) distinct_hash_equal(tab, key0, key1)
+#define SH_SCOPE static inline
+#define SH_STORE_HASH
+#define SH_GET_HASH(tab, ent) ((ent)->hash)
+#define SH_DEFINE
+#define SH_DECLARE
+#include "lib/simplehash.h"
+
+
+/*
+ * Support functions for the hash tables used by compute_distinct_stats
+ */
+static uint32
+distinct_hash_hash(DistinctHash_hash * tab, Datum key)
+{
+	DistinctHashContext *context = (DistinctHashContext *) tab->private_data;
+	Datum		result;
+
+	result = FunctionCall1Coll(context->hashfunc, context->collation, key);
+	return DatumGetUInt32(result);
+}
+
+static bool
+distinct_hash_equal(DistinctHash_hash * tab, Datum key0, Datum key1)
+{
+	DistinctHashContext *context = (DistinctHashContext *) tab->private_data;
+	Datum		result;
+
+	result = FunctionCall2Coll(context->eqfunc, context->collation, key0, key1);
+	return DatumGetBool(result);
+}
+
+static inline void
+distinct_hash_set_index(DistinctHash_hash * hash, Datum value,
+						uint32 value_hash, int index)
+{
+	DistinctHashEntry *entry = DistinctHash_lookup_hash(hash, value, value_hash);
+
+	Assert(entry != NULL);
+	entry->index = index;
+}
 
 /*
  * std_typanalyze -- the default type-specific typanalyze function
@@ -2129,15 +2205,20 @@ compute_distinct_stats(VacAttrStatsP stats,
 	bool		is_varwidth = (!stats->attrtype->typbyval &&
 							   stats->attrtype->typlen < 0);
 	FmgrInfo	f_cmpeq;
+	TypeCacheEntry *typentry;
 	typedef struct
 	{
 		Datum		value;
 		int			count;
+		uint32		hash;
 	} TrackItem;
 	TrackItem  *track;
 	int			track_cnt,
 				track_max;
 	int			num_mcv = stats->attstattarget;
+	bool		use_hash;
+	DistinctHashContext hash_context;
+	DistinctHash_hash *track_hash = NULL;
 	int			firstcount1 = 0,	/* index of first singleton in track[] */
 				c1_cursor = 0;	/* next singleton to evict (FIFO) */
 	StdAnalyzeData *mystats = (StdAnalyzeData *) stats->extra_data;
@@ -2152,6 +2233,26 @@ compute_distinct_stats(VacAttrStatsP stats,
 	track_cnt = 0;
 
 	fmgr_info(mystats->eqfunc, &f_cmpeq);
+	typentry = lookup_type_cache(stats->attrtypid,
+								 TYPECACHE_HASH_PROC_FINFO | TYPECACHE_EQ_OPR);
+
+	/*
+	 * For sufficiently large statistics targets, use a hash table to avoid
+	 * repeated linear searches of the track[] array, but only when we can use
+	 * the type's default hash support that matches the equality operator.
+	 */
+	use_hash = (num_mcv >= ANALYZE_HASH_THRESHOLD &&
+				mystats->eqopr == typentry->eq_opr &&
+				OidIsValid(typentry->hash_proc));
+
+	if (use_hash)
+	{
+		hash_context.eqfunc = &f_cmpeq;
+		hash_context.hashfunc = &typentry->hash_proc_finfo;
+		hash_context.collation = stats->attrcollid;
+		track_hash = DistinctHash_create(CurrentMemoryContext, track_max,
+										 &hash_context);
+	}
 
 	for (i = 0; i < samplerows; i++)
 	{
@@ -2160,6 +2261,7 @@ compute_distinct_stats(VacAttrStatsP stats,
 		bool		match;
 		int			match_index,
 					j;
+		uint32		value_hash = 0;
 
 		vacuum_delay_point(true);
 
@@ -2206,20 +2308,33 @@ compute_distinct_stats(VacAttrStatsP stats,
 		/*
 		 * See if the value matches anything we're already tracking.
 		 */
-		match = false;
-		firstcount1 = track_cnt;
-		for (j = 0; j < track_cnt; j++)
+		if (use_hash)
 		{
-			if (DatumGetBool(FunctionCall2Coll(&f_cmpeq,
-											   stats->attrcollid,
-											   value, track[j].value)))
+			DistinctHashEntry *entry;
+
+			value_hash = distinct_hash_hash(track_hash, value);
+			entry = DistinctHash_lookup_hash(track_hash, value, value_hash);
+			match = (entry != NULL);
+			if (match)
+				match_index = entry->index;
+		}
+		else
+		{
+			match = false;
+			firstcount1 = track_cnt;
+			for (j = 0; j < track_cnt; j++)
 			{
-				match = true;
-				match_index = j;
-				break;
+				if (DatumGetBool(FunctionCall2Coll(&f_cmpeq,
+												   stats->attrcollid,
+												   value, track[j].value)))
+				{
+					match = true;
+					match_index = j;
+					break;
+				}
+				if (j < firstcount1 && track[j].count == 1)
+					firstcount1 = j;
 			}
-			if (j < firstcount1 && track[j].count == 1)
-				firstcount1 = j;
 		}
 
 		if (match)
@@ -2234,6 +2349,18 @@ compute_distinct_stats(VacAttrStatsP stats,
 			{
 				swapDatum(track[j].value, track[j - 1].value);
 				swapInt(track[j].count, track[j - 1].count);
+				if (use_hash)
+				{
+					uint32		tmp;
+
+					tmp = track[j].hash;
+					track[j].hash = track[j - 1].hash;
+					track[j - 1].hash = tmp;
+					distinct_hash_set_index(track_hash, track[j].value,
+											track[j].hash, j);
+					distinct_hash_set_index(track_hash, track[j - 1].value,
+											track[j - 1].hash, j - 1);
+				}
 			}
 
 			/*
@@ -2281,12 +2408,34 @@ compute_distinct_stats(VacAttrStatsP stats,
 				insert_index = c1_cursor++;
 				if (c1_cursor >= track_cnt)
 					c1_cursor = firstcount1;
+
+				if (use_hash)
+				{
+					DistinctHashEntry *delentry;
+
+					delentry = DistinctHash_lookup_hash(track_hash,
+														track[insert_index].value,
+														track[insert_index].hash);
+					Assert(delentry != NULL);
+					DistinctHash_delete_item(track_hash, delentry);
+				}
 			}
 			else
 				continue;
 
 			track[insert_index].value = value;
 			track[insert_index].count = 1;
+			if (use_hash)
+			{
+				bool		found_hash;
+				DistinctHashEntry *entry;
+
+				track[insert_index].hash = value_hash;
+				entry = DistinctHash_insert_hash(track_hash, value, value_hash,
+												 &found_hash);
+				Assert(!found_hash);
+				entry->index = insert_index;
+			}
 		}
 	}
 
-- 
2.34.1

From df78d0bad95eb1572826a61bde6f4d6a7e1b639d Mon Sep 17 00:00:00 2001
From: Evdokimov Ilia <[email protected]>
Date: Sun, 3 May 2026 16:49:11 +0300
Subject: [PATCH v6 1/2] ANALYZE: use cursor eviction for distinct-value
 tracking

compute_distinct_stats() currently maintains the count=1 region by
shifting entries in track[] whenever a new singleton is inserted.
That makes each replacement O(n) in the size of the singleton region.

Replace that with a round-robin cursor over the count=1 region. This
preserves FIFO eviction order for singleton candidates while making
singleton replacement O(1) and avoiding the repeated array shifts
needed by the old scheme.
---
 src/backend/commands/analyze.c | 79 ++++++++++++++++++++++++++--------
 1 file changed, 60 insertions(+), 19 deletions(-)

diff --git a/src/backend/commands/analyze.c b/src/backend/commands/analyze.c
index 020a5919b84..262adf41b24 100644
--- a/src/backend/commands/analyze.c
+++ b/src/backend/commands/analyze.c
@@ -56,7 +56,6 @@
 #include "utils/syscache.h"
 #include "utils/timestamp.h"
 
-
 /* Per-index data for ANALYZE */
 typedef struct AnlIndexData
 {
@@ -2108,10 +2107,11 @@ compute_trivial_stats(VacAttrStatsP stats,
  *
  *	The most common values are determined by brute force: we keep a list
  *	of previously seen values, ordered by number of times seen, as we scan
- *	the samples.  A newly seen value is inserted just after the last
- *	multiply-seen value, causing the bottommost (oldest) singly-seen value
- *	to drop off the list.  The accuracy of this method, and also its cost,
- *	depend mainly on the length of the list we are willing to keep.
+ *	the samples.  Newly seen values are appended to the list, and when it's
+ *	full we replace the oldest singly-seen value (FIFO) using a round-robin
+ *	cursor (clock hand) over the count=1 region.  This avoids repeatedly
+ *	shifting the count=1 region, and when hashing is enabled, avoids having
+ *	to update a large number of hash->index mappings.
  */
 static void
 compute_distinct_stats(VacAttrStatsP stats,
@@ -2138,6 +2138,8 @@ compute_distinct_stats(VacAttrStatsP stats,
 	int			track_cnt,
 				track_max;
 	int			num_mcv = stats->attstattarget;
+	int			firstcount1 = 0,	/* index of first singleton in track[] */
+				c1_cursor = 0;	/* next singleton to evict (FIFO) */
 	StdAnalyzeData *mystats = (StdAnalyzeData *) stats->extra_data;
 
 	/*
@@ -2156,7 +2158,7 @@ compute_distinct_stats(VacAttrStatsP stats,
 		Datum		value;
 		bool		isnull;
 		bool		match;
-		int			firstcount1,
+		int			match_index,
 					j;
 
 		vacuum_delay_point(true);
@@ -2213,6 +2215,7 @@ compute_distinct_stats(VacAttrStatsP stats,
 											   value, track[j].value)))
 			{
 				match = true;
+				match_index = j;
 				break;
 			}
 			if (j < firstcount1 && track[j].count == 1)
@@ -2221,31 +2224,69 @@ compute_distinct_stats(VacAttrStatsP stats,
 
 		if (match)
 		{
+			bool		was_count1;
+
 			/* Found a match */
-			track[j].count++;
+			was_count1 = (track[match_index].count == 1);
+			track[match_index].count++;
 			/* This value may now need to "bubble up" in the track list */
-			while (j > 0 && track[j].count > track[j - 1].count)
+			for (j = match_index; j > 0 && track[j].count > track[j - 1].count; j--)
 			{
 				swapDatum(track[j].value, track[j - 1].value);
 				swapInt(track[j].count, track[j - 1].count);
-				j--;
 			}
+
+			/*
+			 * If a singleton was promoted, the singleton region shrinks by
+			 * one element, so advance its boundary.
+			 */
+			if (was_count1)
+			{
+				firstcount1++;
+
+				/*
+				 * If the cursor was pointing into the singleton region,
+				 * advance it so it still refers to a valid singleton.
+				 */
+				if (c1_cursor >= firstcount1 - 1)
+				{
+					c1_cursor++;
+					if (c1_cursor >= track_cnt)
+						c1_cursor = firstcount1;
+				}
+			}
+
+			/* Final safety: keep cursor inside singleton region */
+			if (c1_cursor < firstcount1 || c1_cursor >= track_cnt)
+				c1_cursor = firstcount1;
 		}
 		else
 		{
-			/* No match.  Insert at head of count-1 list */
+			int			insert_index;
+
+			/*
+			 * No match.  Track the value if we still have room; otherwise
+			 * evict the oldest singleton from the count=1 region.
+			 */
 			if (track_cnt < track_max)
-				track_cnt++;
-			for (j = track_cnt - 1; j > firstcount1; j--)
-			{
-				track[j].value = track[j - 1].value;
-				track[j].count = track[j - 1].count;
-			}
-			if (firstcount1 < track_cnt)
+				insert_index = track_cnt++;
+			else if (firstcount1 < track_cnt)
 			{
-				track[firstcount1].value = value;
-				track[firstcount1].count = 1;
+				/*
+				 * Use c1_cursor as a round-robin cursor over the count=1
+				 * region.  Keep it on a current singleton before evicting.
+				 */
+				if (c1_cursor < firstcount1 || c1_cursor >= track_cnt)
+					c1_cursor = firstcount1;
+				insert_index = c1_cursor++;
+				if (c1_cursor >= track_cnt)
+					c1_cursor = firstcount1;
 			}
+			else
+				continue;
+
+			track[insert_index].value = value;
+			track[insert_index].count = 1;
 		}
 	}
 
-- 
2.34.1

Re: [PATCH] ANALYZE: hash-accelerate MCV tracking for equality-only types

Reply via email to