Hi hackers,
The repeated use of the number 300 in the ANALYZE-related code creates
redundancy and relies on scattered and sometimes unclear comments to
explain its purpose. This can make the code harder to understand,
especially for new contributors who might not immediately grasp the
number's significance. To address this, I propose introducing a macro
STATS_MIN_ROWS to represent this value and consolidating its explanation
in a single place, making the code more consistent and readable.
--
Best regards,
Ilia Evdokimov,
Tantor Labs LLC.
From d3cbea3875fdc9866c86c64c0d0e0d838887040c Mon Sep 17 00:00:00 2001
From: Ilia Evdokimov <ilya.evdoki...@tantorlabs.com>
Date: Mon, 9 Dec 2024 15:50:40 +0300
Subject: [PATCH v1] Define STATS_MIN_ROWS for minimum rows of stats in ANALYZE
This introduces a macro STATS_MIN_ROWS to represent
the default minimum number of rows (300) sampled in ANALYZE.
---
src/backend/commands/analyze.c | 26 ++-----------------
src/backend/statistics/extended_stats.c | 2 +-
src/backend/tsearch/ts_typanalyze.c | 4 +--
src/backend/utils/adt/rangetypes_typanalyze.c | 7 +++--
src/include/statistics/statistics.h | 23 ++++++++++++++++
5 files changed, 31 insertions(+), 31 deletions(-)
diff --git a/src/backend/commands/analyze.c b/src/backend/commands/analyze.c
index 9a56de2282..ff7b6ffe8f 100644
--- a/src/backend/commands/analyze.c
+++ b/src/backend/commands/analyze.c
@@ -1897,42 +1897,20 @@ std_typanalyze(VacAttrStats *stats)
{
/* Seems to be a scalar datatype */
stats->compute_stats = compute_scalar_stats;
- /*--------------------
- * The following choice of minrows is based on the paper
- * "Random sampling for histogram construction: how much is enough?"
- * by Surajit Chaudhuri, Rajeev Motwani and Vivek Narasayya, in
- * Proceedings of ACM SIGMOD International Conference on Management
- * of Data, 1998, Pages 436-447. Their Corollary 1 to Theorem 5
- * says that for table size n, histogram size k, maximum relative
- * error in bin size f, and error probability gamma, the minimum
- * random sample size is
- * r = 4 * k * ln(2*n/gamma) / f^2
- * Taking f = 0.5, gamma = 0.01, n = 10^6 rows, we obtain
- * r = 305.82 * k
- * Note that because of the log function, the dependence on n is
- * quite weak; even at n = 10^12, a 300*k sample gives <= 0.66
- * bin size error with probability 0.99. So there's no real need to
- * scale for n, which is a good thing because we don't necessarily
- * know it at this point.
- *--------------------
- */
- stats->minrows = 300 * stats->attstattarget;
}
else if (OidIsValid(eqopr))
{
/* We can still recognize distinct values */
stats->compute_stats = compute_distinct_stats;
- /* Might as well use the same minrows as above */
- stats->minrows = 300 * stats->attstattarget;
}
else
{
/* Can't do much but the trivial stuff */
stats->compute_stats = compute_trivial_stats;
- /* Might as well use the same minrows as above */
- stats->minrows = 300 * stats->attstattarget;
}
+ stats->minrows = (STATS_MIN_ROWS * stats->attstattarget);
+
return true;
}
diff --git a/src/backend/statistics/extended_stats.c b/src/backend/statistics/extended_stats.c
index 99fdf208db..451e2d1e9c 100644
--- a/src/backend/statistics/extended_stats.c
+++ b/src/backend/statistics/extended_stats.c
@@ -320,7 +320,7 @@ ComputeExtStatisticsRows(Relation onerel,
MemoryContextDelete(cxt);
/* compute sample size based on the statistics target */
- return (300 * result);
+ return (STATS_MIN_ROWS * result);
}
/*
diff --git a/src/backend/tsearch/ts_typanalyze.c b/src/backend/tsearch/ts_typanalyze.c
index ccafe42729..befd90c9d5 100644
--- a/src/backend/tsearch/ts_typanalyze.c
+++ b/src/backend/tsearch/ts_typanalyze.c
@@ -17,6 +17,7 @@
#include "catalog/pg_operator.h"
#include "commands/vacuum.h"
#include "common/hashfn.h"
+#include "statistics/statistics.h"
#include "tsearch/ts_type.h"
#include "utils/builtins.h"
#include "varatt.h"
@@ -64,8 +65,7 @@ ts_typanalyze(PG_FUNCTION_ARGS)
stats->attstattarget = default_statistics_target;
stats->compute_stats = compute_tsvector_stats;
- /* see comment about the choice of minrows in commands/analyze.c */
- stats->minrows = 300 * stats->attstattarget;
+ stats->minrows = (STATS_MIN_ROWS * stats->attstattarget);
PG_RETURN_BOOL(true);
}
diff --git a/src/backend/utils/adt/rangetypes_typanalyze.c b/src/backend/utils/adt/rangetypes_typanalyze.c
index 3773f98115..7fcac5621c 100644
--- a/src/backend/utils/adt/rangetypes_typanalyze.c
+++ b/src/backend/utils/adt/rangetypes_typanalyze.c
@@ -26,6 +26,7 @@
#include "catalog/pg_operator.h"
#include "commands/vacuum.h"
+#include "statistics/statistics.h"
#include "utils/float.h"
#include "utils/fmgrprotos.h"
#include "utils/lsyscache.h"
@@ -56,8 +57,7 @@ range_typanalyze(PG_FUNCTION_ARGS)
stats->compute_stats = compute_range_stats;
stats->extra_data = typcache;
- /* same as in std_typanalyze */
- stats->minrows = 300 * stats->attstattarget;
+ stats->minrows = (STATS_MIN_ROWS * stats->attstattarget);
PG_RETURN_BOOL(true);
}
@@ -82,8 +82,7 @@ multirange_typanalyze(PG_FUNCTION_ARGS)
stats->compute_stats = compute_range_stats;
stats->extra_data = typcache;
- /* same as in std_typanalyze */
- stats->minrows = 300 * stats->attstattarget;
+ stats->minrows = (STATS_MIN_ROWS * stats->attstattarget);
PG_RETURN_BOOL(true);
}
diff --git a/src/include/statistics/statistics.h b/src/include/statistics/statistics.h
index 7f2bf18716..f578f7fd1d 100644
--- a/src/include/statistics/statistics.h
+++ b/src/include/statistics/statistics.h
@@ -22,6 +22,29 @@
#define STATS_NDISTINCT_MAGIC 0xA352BFA4 /* struct identifier */
#define STATS_NDISTINCT_TYPE_BASIC 1 /* struct version */
+/*--------------------
+ * Minimum number of rows wanted for stats
+ *
+ * The following choice of minrows is based on the paper
+ * "Random sampling for histogram construction: how much is enough?"
+ * by Surajit Chaudhuri, Rajeev Motwani and Vivek Narasayya, in
+ * Proceedings of ACM SIGMOD International Conference on Management
+ * of Data, 1998, Pages 436-447. Their Corollary 1 to Theorem 5
+ * says that for table size n, histogram size k, maximum relative
+ * error in bin size f, and error probability gamma, the minimum
+ * random sample size is
+ * r = 4 * k * ln(2*n/gamma) / f^2
+ * Taking f = 0.5, gamma = 0.01, n = 10^6 rows, we obtain
+ * r = 305.82 * k
+ * Note that because of the log function, the dependence on n is
+ * quite weak; even at n = 10^12, a 300*k sample gives <= 0.66
+ * bin size error with probability 0.99. So there's no real need to
+ * scale for n, which is a good thing because we don't necessarily
+ * know it at this point.
+ *--------------------
+ */
+#define STATS_MIN_ROWS 300
+
/* MVNDistinctItem represents a single combination of columns */
typedef struct MVNDistinctItem
{
--
2.34.1