Add the GUC parameter.
On 2024/1/30 21:25, Quan Zongliang wrote:
On 2023/11/3 15:27, Quan Zongliang wrote:
Hi
We have one such problem. A table field has skewed data. Statistics:
n_distinct | -0.4481973
most_common_vals | {5f006ca25b52ed78e457b150ee95a30c}
most_common_freqs | {0.5518474}
Data generation:
CREATE TABLE s_user (
user_id varchar(32) NOT NULL,
corp_id varchar(32),
status int NOT NULL
);
insert into s_user
select md5('user_id ' || a), md5('corp_id ' || a),
case random()<0.877675 when true then 1 else -1 end
FROM generate_series(1,10031) a;
insert into s_user
select md5('user_id ' || a), md5('corp_id 10032'),
case random()<0.877675 when true then 1 else -1 end
FROM generate_series(10031,22383) a;
CREATE INDEX s_user_corp_id_idx ON s_user USING btree (corp_id);
analyze s_user;
1. First, define a PREPARE statement
prepare stmt as select count(*) from s_user where status=1 and corp_id
= $1;
2. Run it five times. Choose the custom plan.
explain (analyze,buffers) execute
stmt('5f006ca25b52ed78e457b150ee95a30c');
Here's the plan:
Aggregate (cost=639.84..639.85 rows=1 width=8) (actual
time=4.653..4.654 rows=1 loops=1)
Buffers: shared hit=277
-> Seq Scan on s_user (cost=0.00..612.76 rows=10830 width=0)
(actual time=1.402..3.747 rows=10836 loops=1)
Filter: ((status = 1) AND ((corp_id)::text =
'5f006ca25b52ed78e457b150ee95a30c'::text))
Rows Removed by Filter: 11548
Buffers: shared hit=277
Planning Time: 0.100 ms
Execution Time: 4.674 ms
(8 rows)
3.From the sixth time. Choose generic plan.
We can see that there is a huge deviation between the estimate and the
actual value:
Aggregate (cost=11.83..11.84 rows=1 width=8) (actual
time=4.424..4.425 rows=1 loops=1)
Buffers: shared hit=154 read=13
-> Bitmap Heap Scan on s_user (cost=4.30..11.82 rows=2 width=0)
(actual time=0.664..3.371 rows=10836 loops=1)
Recheck Cond: ((corp_id)::text = $1)
Filter: (status = 1)
Rows Removed by Filter: 1517
Heap Blocks: exact=154
Buffers: shared hit=154 read=13
-> Bitmap Index Scan on s_user_corp_id_idx
(cost=0.00..4.30 rows=2 width=0) (actual time=0.635..0.635 rows=12353
loops=1)
Index Cond: ((corp_id)::text = $1)
Buffers: shared read=13
Planning Time: 0.246 ms
Execution Time: 4.490 ms
(13 rows)
This is because in the choose_custom_plan function, the generic plan
is attempted after executing the custom plan five times.
if (plansource->num_custom_plans < 5)
return true;
The generic plan uses var_eq_non_const to estimate the average
selectivity.
These are facts that many people already know. So a brief introduction.
Our users actually use such parameter conditions in very complex
PREPARE statements. Once they use the generic plan for the sixth time.
The execution time will change from 5 milliseconds to 5 minutes.
To improve this problem. The following approaches can be considered:
1. Determine whether data skew exists in the PREPARE statement
parameter conditions based on the statistics.
However, there is no way to know if the user will use the skewed
parameter.
2.When comparing the cost of the generic plan with the average cost of
the custom plan(function choose_custom_plan). Consider whether the
maximum cost of a custom plan executed is an order of magnitude
different from the cost of a generic plan.
If the first five use a small selectivity condition. And after the
sixth use a high selectivity condition. Problems will still arise.
3.Trace the execution time of the PREPARE statement. When an execution
time is found to be much longer than the average execution time, the
custom plan is forced to run.
Is there any better idea?
I tried to do a demo. Add a member paramid to Const. When Const is
generated by Param, the Const is identified as coming from Param. Then
check in var_eq_const to see if the field in the condition using this
parameter is skewed. If so, choose_custom_plan returns true every time,
forcing custom_plan to be used.
Only conditional expressions such as var eq param or param eq var can be
supported.
If it makes sense. Continue to improve this patch.
--
Quan Zongliang
diff --git a/src/backend/optimizer/path/costsize.c
b/src/backend/optimizer/path/costsize.c
index 8b76e98529..14b8bec6ff 100644
--- a/src/backend/optimizer/path/costsize.c
+++ b/src/backend/optimizer/path/costsize.c
@@ -154,6 +154,8 @@ bool enable_partition_pruning = true;
bool enable_presorted_aggregate = true;
bool enable_async_append = true;
+double skewed_param_factor = DEFAULT_SKEWED_PARAM_FACTOR;
+
typedef struct
{
PlannerInfo *root;
diff --git a/src/backend/optimizer/util/clauses.c
b/src/backend/optimizer/util/clauses.c
index edc25d712e..8b922c0c95 100644
--- a/src/backend/optimizer/util/clauses.c
+++ b/src/backend/optimizer/util/clauses.c
@@ -2493,6 +2493,8 @@ eval_const_expressions_mutator(Node *node,
pval,
prm->isnull,
typByVal);
+ if (paramLI->paramFetch
== NULL)
+ con->paramid =
param->paramid;
con->location =
param->location;
return (Node *) con;
}
diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c
index cea777e9d4..9fa6548d23 100644
--- a/src/backend/utils/adt/selfuncs.c
+++ b/src/backend/utils/adt/selfuncs.c
@@ -273,10 +273,32 @@ eqsel_internal(PG_FUNCTION_ARGS, bool negate)
* in the query.)
*/
if (IsA(other, Const))
- selec = var_eq_const(&vardata, operator, collation,
- ((Const *)
other)->constvalue,
- ((Const *)
other)->constisnull,
- varonleft, negate);
+ {
+ Const * con = (Const *) other;
+
+ if (con->paramid > 0)
+ {
+ bool skewed_stat = false;
+
+ selec = var_eq_const_ext(&vardata, operator, collation,
+ ((Const *)
other)->constvalue,
+ ((Const *)
other)->constisnull,
+ varonleft,
negate, &skewed_stat);
+
+ if (skewed_stat)
+ {
+ ParamExternData *prm;
+
+ prm =
&root->glob->boundParams->params[con->paramid - 1];
+ prm->pflags |= PARAM_FLAG_SKEWEDSTAT;
+ }
+ }
+ else
+ selec = var_eq_const(&vardata, operator, collation,
+ ((Const *)
other)->constvalue,
+ ((Const *)
other)->constisnull,
+ varonleft,
negate);
+ }
else
selec = var_eq_non_const(&vardata, operator, collation, other,
varonleft,
negate);
@@ -295,6 +317,16 @@ double
var_eq_const(VariableStatData *vardata, Oid oproid, Oid collation,
Datum constval, bool constisnull,
bool varonleft, bool negate)
+{
+ return var_eq_const_ext(vardata, oproid, collation,
+ constval, constisnull,
+ varonleft, negate, NULL);
+}
+
+double
+var_eq_const_ext(VariableStatData *vardata, Oid oproid, Oid collation,
+ Datum constval, bool constisnull,
+ bool varonleft, bool negate, bool *skewed_stat)
{
double selec;
double nullfrac = 0.0;
@@ -387,6 +419,39 @@ var_eq_const(VariableStatData *vardata, Oid oproid, Oid
collation,
break;
}
}
+
+ Assert(sslot.nvalues > 0);
+ if (skewed_stat)
+ {
+ /* Check whether the values of the field is
skewed according to the MCVs. */
+ if (sslot.nvalues > 1 &&
+ sslot.numbers[0] >
sslot.numbers[sslot.nvalues - 1] * skewed_param_factor)
+ /* Compare the maximum and minimum
quantities in the MCVs. */
+ *skewed_stat = true;
+ else
+ {
+ /*
+ * Compare the maximum number in the
MCVs with the average number of
+ * other values(not in the MCVs).
+ */
+ int j;
+ double sumcommon = 0.0;
+ double otherdistinct, avgselec;
+
+ for (j = 0; j < sslot.nnumbers; j++)
+ sumcommon += sslot.numbers[j];
+
+ otherdistinct =
get_variable_numdistinct(vardata, &isdefault) -
+ sslot.nnumbers;
+
+ if (otherdistinct > 0)
+ {
+ avgselec = (1.0 - sumcommon -
nullfrac) / otherdistinct;
+ if (sslot.numbers[0] >
(avgselec * skewed_param_factor))
+ *skewed_stat = true;
+ }
+ }
+ }
}
else
{
diff --git a/src/backend/utils/cache/plancache.c
b/src/backend/utils/cache/plancache.c
index 5194cbf2cc..d1ee00139e 100644
--- a/src/backend/utils/cache/plancache.c
+++ b/src/backend/utils/cache/plancache.c
@@ -991,6 +991,20 @@ BuildCachedPlan(CachedPlanSource *plansource, List *qlist,
else
plan_context = CurrentMemoryContext;
+ if (boundParams)
+ {
+ int i;
+
+ for (i =0; i < boundParams->numParams; i++)
+ {
+ if (boundParams->params[i].pflags &
PARAM_FLAG_SKEWEDSTAT)
+ {
+ plansource->hasSkewedparam = true;
+ break;
+ }
+ }
+ }
+
/*
* Create and fill the CachedPlan struct within the new context.
*/
@@ -1066,6 +1080,12 @@ choose_custom_plan(CachedPlanSource *plansource,
ParamListInfo boundParams)
if (plan_cache_mode == PLAN_CACHE_MODE_FORCE_CUSTOM_PLAN)
return true;
+ if (plan_cache_mode == PLAN_CACHE_MODE_STAT_ADAPTIVE_PLAN)
+ {
+ if (plansource->hasSkewedparam)
+ return true;
+ }
+
/* See if caller wants to force the decision */
if (plansource->cursor_options & CURSOR_OPT_GENERIC_PLAN)
return false;
diff --git a/src/backend/utils/misc/guc_tables.c
b/src/backend/utils/misc/guc_tables.c
index 70652f0a3f..93f85baedb 100644
--- a/src/backend/utils/misc/guc_tables.c
+++ b/src/backend/utils/misc/guc_tables.c
@@ -403,6 +403,7 @@ static const struct config_enum_entry
debug_parallel_query_options[] = {
static const struct config_enum_entry plan_cache_mode_options[] = {
{"auto", PLAN_CACHE_MODE_AUTO, false},
+ {"stat_adaptive_plan", PLAN_CACHE_MODE_STAT_ADAPTIVE_PLAN, false},
{"force_generic_plan", PLAN_CACHE_MODE_FORCE_GENERIC_PLAN, false},
{"force_custom_plan", PLAN_CACHE_MODE_FORCE_CUSTOM_PLAN, false},
{NULL, 0, false}
@@ -3866,6 +3867,17 @@ struct config_real ConfigureNamesReal[] =
NULL, NULL, NULL
},
+ {
+ {"skewed_param_factor", PGC_SUSET, LOGGING_WHEN,
+ gettext_noop("Sets the skewed factor for parameters of
the prepare statement."),
+ gettext_noop("Use a thredhold to determine whether a
skewed parameter exists."
+ "If true, use the custom
plan.")
+ },
+ &skewed_param_factor,
+ DEFAULT_SKEWED_PARAM_FACTOR, 2.0, 10000.0,
+ NULL, NULL, NULL
+ },
+
/* End-of-list marker */
{
{NULL, 0, 0, NULL, NULL}, NULL, 0.0, 0.0, 0.0, NULL, NULL, NULL
diff --git a/src/include/nodes/params.h b/src/include/nodes/params.h
index 10c9fc5413..051a12bb1d 100644
--- a/src/include/nodes/params.h
+++ b/src/include/nodes/params.h
@@ -85,7 +85,13 @@ struct ParseState;
* and paramCompileArg is rather arbitrary.
*/
-#define PARAM_FLAG_CONST 0x0001 /* parameter is constant */
+/* parameter is constant */
+#define PARAM_FLAG_CONST 0x0001
+/*
+ * The var side of 'var eq param' or 'param eq var' expression
+ * corresponding to the parameter contains skewed data.
+ */
+#define PARAM_FLAG_SKEWEDSTAT 0x0002
typedef struct ParamExternData
{
diff --git a/src/include/nodes/primnodes.h b/src/include/nodes/primnodes.h
index 4a154606d2..7e6762de47 100644
--- a/src/include/nodes/primnodes.h
+++ b/src/include/nodes/primnodes.h
@@ -314,6 +314,12 @@ typedef struct Const
*/
bool constbyval pg_node_attr(query_jumble_ignore);
+ /*
+ * >0 indicates that the constant is extracted from Param.
+ * Its value is paramid of the Param struct.
+ */
+ int paramid pg_node_attr(query_jumble_ignore);
+
/*
* token location, or -1 if unknown. All constants are tracked as
* locations in query jumbling, to be marked as parameters.
diff --git a/src/include/optimizer/cost.h b/src/include/optimizer/cost.h
index b1c51a4e70..3606e47681 100644
--- a/src/include/optimizer/cost.h
+++ b/src/include/optimizer/cost.h
@@ -28,6 +28,7 @@
#define DEFAULT_CPU_OPERATOR_COST 0.0025
#define DEFAULT_PARALLEL_TUPLE_COST 0.1
#define DEFAULT_PARALLEL_SETUP_COST 1000.0
+#define DEFAULT_SKEWED_PARAM_FACTOR 10.0
/* defaults for non-Cost parameters */
#define DEFAULT_RECURSIVE_WORKTABLE_FACTOR 10.0
@@ -71,6 +72,7 @@ extern PGDLLIMPORT bool enable_partition_pruning;
extern PGDLLIMPORT bool enable_presorted_aggregate;
extern PGDLLIMPORT bool enable_async_append;
extern PGDLLIMPORT int constraint_exclusion;
+extern PGDLLIMPORT double skewed_param_factor;
extern double index_pages_fetched(double tuples_fetched, BlockNumber pages,
double
index_pages, PlannerInfo *root);
diff --git a/src/include/utils/plancache.h b/src/include/utils/plancache.h
index a90dfdf906..c742c6449c 100644
--- a/src/include/utils/plancache.h
+++ b/src/include/utils/plancache.h
@@ -30,6 +30,7 @@ struct RawStmt;
typedef enum
{
PLAN_CACHE_MODE_AUTO,
+ PLAN_CACHE_MODE_STAT_ADAPTIVE_PLAN,
PLAN_CACHE_MODE_FORCE_GENERIC_PLAN,
PLAN_CACHE_MODE_FORCE_CUSTOM_PLAN,
} PlanCacheMode;
@@ -124,6 +125,7 @@ typedef struct CachedPlanSource
bool is_complete; /* has CompleteCachedPlan been done? */
bool is_saved; /* has CachedPlanSource been
"saved"? */
bool is_valid; /* is the query_list currently
valid? */
+ bool hasSkewedparam; /* see the definition of PARAM FLAG
SKEWEDSTAT */
int generation; /* increments each time
we create a plan */
/* If CachedPlanSource has been saved, it is a member of a global list
*/
dlist_node node; /* list link, if is_saved */
diff --git a/src/include/utils/selfuncs.h b/src/include/utils/selfuncs.h
index 2fa4c4fc1b..a7532b8d7b 100644
--- a/src/include/utils/selfuncs.h
+++ b/src/include/utils/selfuncs.h
@@ -184,6 +184,10 @@ extern double var_eq_const(VariableStatData *vardata,
Oid oproid, Oid collation,
Datum constval, bool
constisnull,
bool varonleft, bool negate);
+extern double var_eq_const_ext(VariableStatData *vardata,
+ Oid oproid, Oid collation,
+ Datum constval, bool
constisnull,
+ bool varonleft, bool negate,
bool *skewed_stat);
extern double var_eq_non_const(VariableStatData *vardata,
Oid oproid, Oid
collation,
Node *other,