On 6/24/23 13:23, Tomas Vondra wrote:
I really hope what I just wrote makes at least a little bit of sense.
Here is a continuation of your work: 1. non-matched estimation sophisticated to provide meaningful numbers.2. unmatched_frac is stored in sjinfo that let us to summarise these generated nulls along the tree of outer joins. 3. unmatched_frac is used not only for nulltest estimation but also for correction of stanullfrac and stadistinct values (improves GROUP-BY). 4. EXPLAIN prints number of estimated and actually generated NULLs in outer-join nodes.
This patch no less ugly than your, may be even more. Tambien, not sure it became more meaningful. But it helps being applied to some reported cases I have ;).
-- regards, Andrei Lepikhov
From 938c311d76404a22d96f3fcff406fe6c7e2f59f7 Mon Sep 17 00:00:00 2001 From: "Andrei V. Lepikhov" <lepi...@gmail.com> Date: Thu, 17 Apr 2025 14:33:02 +0200 Subject: [PATCH] Consider number of nulls generated by inner side of an outer-join. Nullable side of an outer join may provide significant fraction of null values replacing real numbers. Having no information about that in statistics, planner may increase estimation error, because base parameters like stanullfrac and stadistinct are out of real values. In this patch we employ varnullingrels field in attempt to remember generated nulls during query planning. --- contrib/intarray/_int_selfuncs.c | 5 +- src/backend/commands/explain.c | 24 ++ src/backend/executor/nodeHashjoin.c | 3 + src/backend/executor/nodeMergejoin.c | 2 + src/backend/executor/nodeNestloop.c | 1 + src/backend/optimizer/path/costsize.c | 1 + src/backend/optimizer/plan/createplan.c | 2 + src/backend/optimizer/plan/initsplan.c | 1 + src/backend/optimizer/util/pathnode.c | 9 + src/backend/utils/adt/like_support.c | 5 +- .../utils/adt/multirangetypes_selfuncs.c | 4 +- src/backend/utils/adt/network_selfuncs.c | 18 +- src/backend/utils/adt/rangetypes_selfuncs.c | 4 +- src/backend/utils/adt/selfuncs.c | 325 +++++++++++++++--- src/backend/utils/misc/guc_tables.c | 10 + src/include/nodes/execnodes.h | 1 + src/include/nodes/pathnodes.h | 9 + src/include/nodes/plannodes.h | 2 + src/include/optimizer/cost.h | 1 + src/include/utils/selfuncs.h | 2 + src/test/regress/expected/stats.out | 24 ++ src/test/regress/sql/stats.sql | 18 + 22 files changed, 402 insertions(+), 69 deletions(-) diff --git a/contrib/intarray/_int_selfuncs.c b/contrib/intarray/_int_selfuncs.c index 6c3b7ace146..c8d8f4e42f4 100644 --- a/contrib/intarray/_int_selfuncs.c +++ b/contrib/intarray/_int_selfuncs.c @@ -188,10 +188,7 @@ _int_matchsel(PG_FUNCTION_ARGS) */ if (HeapTupleIsValid(vardata.statsTuple)) { - Form_pg_statistic stats; - - stats = (Form_pg_statistic) GETSTRUCT(vardata.statsTuple); - nullfrac = stats->stanullfrac; + nullfrac = compute_stanullfrac(&vardata, NULL); /* * For an int4 array, the default array type analyze function will diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c index 786ee865f14..d4ddaa2db32 100644 --- a/src/backend/commands/explain.c +++ b/src/backend/commands/explain.c @@ -2273,6 +2273,30 @@ ExplainNode(PlanState *planstate, List *ancestors, break; } + if (nodeTag(plan) == T_NestLoop || nodeTag(plan) == T_MergeJoin || + nodeTag(plan) == T_HashJoin) + { + JoinState *js = (JoinState *) planstate; + Join *join = (Join *) plan; + + if (join->unmatched_frac >= 0.0) + { + /* + * There are a probability of null tuples existed. Show how + * precisely the planner have done its job. + * XXX: The type of output value is debatable. Would it + * be better to show relative values? For now, current format just + * simpler. + */ + ExplainPropertyFloat("Planned unmatched rows", NULL, + join->unmatched_frac * join->plan.plan_rows, 0, es); + + if (es->analyze) + ExplainPropertyFloat("Actual unmatched rows", NULL, + js->unmatched_tuples, 0, es); + } + } + /* * Prepare per-worker JIT instrumentation. As with the overall JIT * summary, this is printed only if printing costs is enabled. diff --git a/src/backend/executor/nodeHashjoin.c b/src/backend/executor/nodeHashjoin.c index 5661ad76830..a7be7a9f2b8 100644 --- a/src/backend/executor/nodeHashjoin.c +++ b/src/backend/executor/nodeHashjoin.c @@ -615,7 +615,10 @@ ExecHashJoinImpl(PlanState *pstate, bool parallel) econtext->ecxt_innertuple = node->hj_NullInnerTupleSlot; if (otherqual == NULL || ExecQual(otherqual, econtext)) + { + node->js.unmatched_tuples++; return ExecProject(node->js.ps.ps_ProjInfo); + } else InstrCountFiltered2(node, 1); } diff --git a/src/backend/executor/nodeMergejoin.c b/src/backend/executor/nodeMergejoin.c index a233313128a..69c0a4b2396 100644 --- a/src/backend/executor/nodeMergejoin.c +++ b/src/backend/executor/nodeMergejoin.c @@ -668,6 +668,7 @@ ExecMergeJoin(PlanState *pstate) */ TupleTableSlot *result; + node->js.unmatched_tuples++; result = MJFillOuter(node); if (result) return result; @@ -723,6 +724,7 @@ ExecMergeJoin(PlanState *pstate) */ TupleTableSlot *result; + node->js.unmatched_tuples++; result = MJFillInner(node); if (result) return result; diff --git a/src/backend/executor/nodeNestloop.c b/src/backend/executor/nodeNestloop.c index 5cd1a251625..26e9537d187 100644 --- a/src/backend/executor/nodeNestloop.c +++ b/src/backend/executor/nodeNestloop.c @@ -188,6 +188,7 @@ ExecNestLoop(PlanState *pstate) */ ENL1_printf("qualification succeeded, projecting tuple"); + node->js.unmatched_tuples++; return ExecProject(node->js.ps.ps_ProjInfo); } else diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c index 60b0fcfb6be..b877de37134 100644 --- a/src/backend/optimizer/path/costsize.c +++ b/src/backend/optimizer/path/costsize.c @@ -163,6 +163,7 @@ bool enable_parallel_hash = true; bool enable_partition_pruning = true; bool enable_presorted_aggregate = true; bool enable_async_append = true; +bool compute_innerside_nulls = true; typedef struct { diff --git a/src/backend/optimizer/plan/createplan.c b/src/backend/optimizer/plan/createplan.c index a8f22a8c154..a6ca458e22b 100644 --- a/src/backend/optimizer/plan/createplan.c +++ b/src/backend/optimizer/plan/createplan.c @@ -1104,6 +1104,8 @@ create_join_plan(PlannerInfo *root, JoinPath *best_path) break; } + ((Join *) plan)->unmatched_frac = best_path->unmatch_frac; + /* * If there are any pseudoconstant clauses attached to this node, insert a * gating Result node that evaluates the pseudoconstants as one-time diff --git a/src/backend/optimizer/plan/initsplan.c b/src/backend/optimizer/plan/initsplan.c index 01804b085b3..25e30786605 100644 --- a/src/backend/optimizer/plan/initsplan.c +++ b/src/backend/optimizer/plan/initsplan.c @@ -1765,6 +1765,7 @@ make_outerjoininfo(PlannerInfo *root, sjinfo->commute_above_r = NULL; sjinfo->commute_below_l = NULL; sjinfo->commute_below_r = NULL; + sjinfo->unmatched_frac = -1; /* Undefined */ compute_semijoin_info(root, sjinfo, clause); diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c index 93e73cb44db..0d6ec5586be 100644 --- a/src/backend/optimizer/util/pathnode.c +++ b/src/backend/optimizer/util/pathnode.c @@ -2605,6 +2605,9 @@ create_nestloop_path(PlannerInfo *root, final_cost_nestloop(root, pathnode, workspace, extra); + pathnode->jpath.unmatch_frac = extra->sjinfo->ojrelid > 0 ? + extra->sjinfo->unmatched_frac : -1.0; + return pathnode; } @@ -2674,6 +2677,9 @@ create_mergejoin_path(PlannerInfo *root, final_cost_mergejoin(root, pathnode, workspace, extra); + pathnode->jpath.unmatch_frac = extra->sjinfo->ojrelid > 0 ? + extra->sjinfo->unmatched_frac : -1.0; + return pathnode; } @@ -2748,6 +2754,9 @@ create_hashjoin_path(PlannerInfo *root, final_cost_hashjoin(root, pathnode, workspace, extra); + pathnode->jpath.unmatch_frac = extra->sjinfo->ojrelid > 0 ? + extra->sjinfo->unmatched_frac : -1.0; + return pathnode; } diff --git a/src/backend/utils/adt/like_support.c b/src/backend/utils/adt/like_support.c index 8fdc677371f..e91874eff31 100644 --- a/src/backend/utils/adt/like_support.c +++ b/src/backend/utils/adt/like_support.c @@ -603,10 +603,7 @@ patternsel_common(PlannerInfo *root, */ if (HeapTupleIsValid(vardata.statsTuple)) { - Form_pg_statistic stats; - - stats = (Form_pg_statistic) GETSTRUCT(vardata.statsTuple); - nullfrac = stats->stanullfrac; + nullfrac = compute_stanullfrac(&vardata, NULL); } /* diff --git a/src/backend/utils/adt/multirangetypes_selfuncs.c b/src/backend/utils/adt/multirangetypes_selfuncs.c index b87bcf3ea30..24a1fa3800a 100644 --- a/src/backend/utils/adt/multirangetypes_selfuncs.c +++ b/src/backend/utils/adt/multirangetypes_selfuncs.c @@ -302,11 +302,9 @@ calc_multirangesel(TypeCacheEntry *typcache, VariableStatData *vardata, */ if (HeapTupleIsValid(vardata->statsTuple)) { - Form_pg_statistic stats; AttStatsSlot sslot; - stats = (Form_pg_statistic) GETSTRUCT(vardata->statsTuple); - null_frac = stats->stanullfrac; + null_frac = compute_stanullfrac(vardata, NULL); /* Try to get fraction of empty multiranges */ if (get_attstatsslot(&sslot, vardata->statsTuple, diff --git a/src/backend/utils/adt/network_selfuncs.c b/src/backend/utils/adt/network_selfuncs.c index 940cdafa546..05332d378aa 100644 --- a/src/backend/utils/adt/network_selfuncs.c +++ b/src/backend/utils/adt/network_selfuncs.c @@ -89,7 +89,6 @@ networksel(PG_FUNCTION_ARGS) mcv_selec, non_mcv_selec; Datum constvalue; - Form_pg_statistic stats; AttStatsSlot hslot; double sumcommon, nullfrac; @@ -127,8 +126,7 @@ networksel(PG_FUNCTION_ARGS) PG_RETURN_FLOAT8(DEFAULT_SEL(operator)); } - stats = (Form_pg_statistic) GETSTRUCT(vardata.statsTuple); - nullfrac = stats->stanullfrac; + nullfrac = compute_stanullfrac(&vardata, NULL); /* * If we have most-common-values info, add up the fractions of the MCV @@ -263,7 +261,6 @@ static Selectivity networkjoinsel_inner(Oid operator, VariableStatData *vardata1, VariableStatData *vardata2) { - Form_pg_statistic stats; double nullfrac1 = 0.0, nullfrac2 = 0.0; Selectivity selec = 0.0, @@ -283,8 +280,7 @@ networkjoinsel_inner(Oid operator, if (HeapTupleIsValid(vardata1->statsTuple)) { - stats = (Form_pg_statistic) GETSTRUCT(vardata1->statsTuple); - nullfrac1 = stats->stanullfrac; + nullfrac1 = compute_stanullfrac(vardata1, NULL); mcv1_exists = get_attstatsslot(&mcv1_slot, vardata1->statsTuple, STATISTIC_KIND_MCV, InvalidOid, @@ -305,8 +301,7 @@ networkjoinsel_inner(Oid operator, if (HeapTupleIsValid(vardata2->statsTuple)) { - stats = (Form_pg_statistic) GETSTRUCT(vardata2->statsTuple); - nullfrac2 = stats->stanullfrac; + nullfrac2 = compute_stanullfrac(vardata2, NULL); mcv2_exists = get_attstatsslot(&mcv2_slot, vardata2->statsTuple, STATISTIC_KIND_MCV, InvalidOid, @@ -390,7 +385,6 @@ static Selectivity networkjoinsel_semi(Oid operator, VariableStatData *vardata1, VariableStatData *vardata2) { - Form_pg_statistic stats; Selectivity selec = 0.0, sumcommon1 = 0.0, sumcommon2 = 0.0; @@ -413,8 +407,7 @@ networkjoinsel_semi(Oid operator, if (HeapTupleIsValid(vardata1->statsTuple)) { - stats = (Form_pg_statistic) GETSTRUCT(vardata1->statsTuple); - nullfrac1 = stats->stanullfrac; + nullfrac1 = compute_stanullfrac(vardata1, NULL); mcv1_exists = get_attstatsslot(&mcv1_slot, vardata1->statsTuple, STATISTIC_KIND_MCV, InvalidOid, @@ -435,8 +428,7 @@ networkjoinsel_semi(Oid operator, if (HeapTupleIsValid(vardata2->statsTuple)) { - stats = (Form_pg_statistic) GETSTRUCT(vardata2->statsTuple); - nullfrac2 = stats->stanullfrac; + nullfrac2 = compute_stanullfrac(vardata2, NULL); mcv2_exists = get_attstatsslot(&mcv2_slot, vardata2->statsTuple, STATISTIC_KIND_MCV, InvalidOid, diff --git a/src/backend/utils/adt/rangetypes_selfuncs.c b/src/backend/utils/adt/rangetypes_selfuncs.c index d126abc5a82..0bfc68baac6 100644 --- a/src/backend/utils/adt/rangetypes_selfuncs.c +++ b/src/backend/utils/adt/rangetypes_selfuncs.c @@ -241,11 +241,9 @@ calc_rangesel(TypeCacheEntry *typcache, VariableStatData *vardata, */ if (HeapTupleIsValid(vardata->statsTuple)) { - Form_pg_statistic stats; AttStatsSlot sslot; - stats = (Form_pg_statistic) GETSTRUCT(vardata->statsTuple); - null_frac = stats->stanullfrac; + null_frac = compute_stanullfrac(vardata, NULL); /* Try to get fraction of empty ranges */ if (get_attstatsslot(&sslot, vardata->statsTuple, diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c index 987f2154459..5f3e26e5235 100644 --- a/src/backend/utils/adt/selfuncs.c +++ b/src/backend/utils/adt/selfuncs.c @@ -156,6 +156,13 @@ static double eqjoinsel_inner(Oid opfuncoid, Oid collation, AttStatsSlot *sslot1, AttStatsSlot *sslot2, Form_pg_statistic stats1, Form_pg_statistic stats2, bool have_mcvs1, bool have_mcvs2); +static double eqjoinsel_unmatch_left(Oid opfuncoid, Oid collation, + VariableStatData *vardata1, VariableStatData *vardata2, + double nd1, double nd2, + bool isdefault1, bool isdefault2, + AttStatsSlot *sslot1, AttStatsSlot *sslot2, + Form_pg_statistic stats1, Form_pg_statistic stats2, + bool have_mcvs1, bool have_mcvs2); static double eqjoinsel_semi(Oid opfuncoid, Oid collation, VariableStatData *vardata1, VariableStatData *vardata2, double nd1, double nd2, @@ -319,10 +326,7 @@ var_eq_const(VariableStatData *vardata, Oid oproid, Oid collation, */ if (HeapTupleIsValid(vardata->statsTuple)) { - Form_pg_statistic stats; - - stats = (Form_pg_statistic) GETSTRUCT(vardata->statsTuple); - nullfrac = stats->stanullfrac; + nullfrac = compute_stanullfrac(vardata, NULL); } /* @@ -481,10 +485,7 @@ var_eq_non_const(VariableStatData *vardata, Oid oproid, Oid collation, */ if (HeapTupleIsValid(vardata->statsTuple)) { - Form_pg_statistic stats; - - stats = (Form_pg_statistic) GETSTRUCT(vardata->statsTuple); - nullfrac = stats->stanullfrac; + nullfrac = compute_stanullfrac(vardata, NULL); } /* @@ -997,10 +998,7 @@ generic_restriction_selectivity(PlannerInfo *root, Oid oproid, Oid collation, selec = 0.9999; /* Don't forget to account for nulls. */ - if (HeapTupleIsValid(vardata.statsTuple)) - nullfrac = ((Form_pg_statistic) GETSTRUCT(vardata.statsTuple))->stanullfrac; - else - nullfrac = 0.0; + nullfrac = compute_stanullfrac(&vardata, NULL); /* * Now merge the results from the MCV and histogram calculations, @@ -1552,12 +1550,10 @@ booltestsel(PlannerInfo *root, BoolTestType booltesttype, Node *arg, if (HeapTupleIsValid(vardata.statsTuple)) { - Form_pg_statistic stats; double freq_null; AttStatsSlot sslot; - stats = (Form_pg_statistic) GETSTRUCT(vardata.statsTuple); - freq_null = stats->stanullfrac; + freq_null = compute_stanullfrac(&vardata, NULL); if (get_attstatsslot(&sslot, vardata.statsTuple, STATISTIC_KIND_MCV, InvalidOid, @@ -1710,11 +1706,12 @@ nulltestsel(PlannerInfo *root, NullTestType nulltesttype, Node *arg, if (HeapTupleIsValid(vardata.statsTuple)) { - Form_pg_statistic stats; double freq_null; - stats = (Form_pg_statistic) GETSTRUCT(vardata.statsTuple); - freq_null = stats->stanullfrac; + freq_null = compute_stanullfrac(&vardata, NULL); + + if (sjinfo) + freq_null = freq_null + sjinfo->unmatched_frac - freq_null * sjinfo->unmatched_frac; switch (nulltesttype) { @@ -2287,6 +2284,7 @@ eqjoinsel(PG_FUNCTION_ARGS) Oid collation = PG_GET_COLLATION(); double selec; double selec_inner; + double unmatched_frac; VariableStatData vardata1; VariableStatData vardata2; double nd1; @@ -2359,6 +2357,26 @@ eqjoinsel(PG_FUNCTION_ARGS) stats1, stats2, have_mcvs1, have_mcvs2); + /* + * calculate fraction of right without of matching row on left + * + * FIXME Should be restricted to JOIN_LEFT, we should have similar logic + * for JOIN_FULL. + * + * XXX Probably should calculate unmatched as fraction of the join result, + * not of the relation on the right (because the matched part can have more + * matches per row and thus grow). Not sure. Depends on how it's used later. + */ + unmatched_frac = eqjoinsel_unmatch_left(opfuncoid, collation, + &vardata1, &vardata2, + nd1, nd2, + isdefault1, isdefault2, + &sslot1, &sslot2, + stats1, stats2, + have_mcvs1, have_mcvs2); + + sjinfo->unmatched_frac = unmatched_frac; + switch (sjinfo->jointype) { case JOIN_INNER: @@ -2467,8 +2485,8 @@ eqjoinsel_inner(Oid opfuncoid, Oid collation, FmgrInfo eqproc; bool *hasmatch1; bool *hasmatch2; - double nullfrac1 = stats1->stanullfrac; - double nullfrac2 = stats2->stanullfrac; + double nullfrac1 = compute_stanullfrac(vardata1, NULL); + double nullfrac2 = compute_stanullfrac(vardata2, NULL); double matchprodfreq, matchfreq1, matchfreq2, @@ -2628,6 +2646,135 @@ eqjoinsel_inner(Oid opfuncoid, Oid collation, return selec; } +/* + * eqjoinsel_unmatch_left + * + * XXX Mostly copy paste of eqjoinsel_inner, but calculates unmatched fraction + * of the relation on the left. See eqjoinsel_inner for more comments. + * + * XXX Maybe we could/should integrate this into eqjoinsel_inner, so that we + * don't have to walk the MCV lists twice? + */ +static double +eqjoinsel_unmatch_left(Oid opfuncoid, Oid collation, + VariableStatData *vardata1, VariableStatData *vardata2, + double nd1, double nd2, + bool isdefault1, bool isdefault2, + AttStatsSlot *sslot1, AttStatsSlot *sslot2, + Form_pg_statistic stats1, Form_pg_statistic stats2, + bool have_mcvs1, bool have_mcvs2) +{ + double unmatchfreq; + double nullfrac1 = compute_stanullfrac(vardata1, NULL); + + if (!compute_innerside_nulls) + return 0.0; + + if (have_mcvs1 && have_mcvs2) + { + LOCAL_FCINFO(fcinfo, 2); + FmgrInfo eqproc; + bool *hasmatch1; + bool *hasmatch2; + double matchfreq1, + unmatchfreq1; + int i, + nmatches; + + fmgr_info(opfuncoid, &eqproc); + + /* + * Save a few cycles by setting up the fcinfo struct just once. Using + * FunctionCallInvoke directly also avoids failure if the eqproc + * returns NULL, though really equality functions should never do + * that. + */ + InitFunctionCallInfoData(*fcinfo, &eqproc, 2, collation, + NULL, NULL); + fcinfo->args[0].isnull = false; + fcinfo->args[1].isnull = false; + + hasmatch1 = (bool *) palloc0(sslot1->nvalues * sizeof(bool)); + hasmatch2 = (bool *) palloc0(sslot2->nvalues * sizeof(bool)); + + /* + * Note we assume that each MCV will match at most one member of the + * other MCV list. If the operator isn't really equality, there could + * be multiple matches --- but we don't look for them, both for speed + * and because the math wouldn't add up... + */ + nmatches = 0; + for (i = 0; i < sslot1->nvalues; i++) + { + int j; + + fcinfo->args[0].value = sslot1->values[i]; + + for (j = 0; j < sslot2->nvalues; j++) + { + Datum fresult; + + if (hasmatch2[j]) + /* Already matched. Impossible to find more candidates */ + continue; + + fcinfo->args[1].value = sslot2->values[j]; + fcinfo->isnull = false; + fresult = FunctionCallInvoke(fcinfo); + if (!fcinfo->isnull && DatumGetBool(fresult)) + { + hasmatch1[i] = hasmatch2[j] = true; + nmatches++; + break; + } + } + } + + /* Sum up frequencies of matched and unmatched MCVs */ + matchfreq1 = unmatchfreq1 = 0.0; + for (i = 0; i < sslot1->nvalues; i++) + { + if (hasmatch1[i]) + matchfreq1 += sslot1->numbers[i]; + else + unmatchfreq1 += sslot1->numbers[i]; + } + CLAMP_PROBABILITY(matchfreq1); + CLAMP_PROBABILITY(unmatchfreq1); + pfree(hasmatch1); + pfree(hasmatch2); + + unmatchfreq = 1.0 - nullfrac1 - matchfreq1; + CLAMP_PROBABILITY(unmatchfreq); + + nd1 -= nmatches; + nd2 -= nmatches; + + /* XXX Keep an eye on the cases where ndistinct less than zero */ + nd1 = clamp_row_est(nd1); + nd2 = clamp_row_est(nd2); + + if (nd1 > nd2) + unmatchfreq *= (nd1 - nd2) / nd1; + else + unmatchfreq *= (nd2 - nd1) / nd2; + } + else + { + /* + * XXX Should this look at nullfrac on either side? Probably depends on + * if we're calculating fraction of NULLs or fraction of unmatched rows. + */ + unmatchfreq = (1.0 - nullfrac1); + if (nd1 > nd2) + unmatchfreq *= (nd1 - nd2) / nd1; + else + unmatchfreq *= (nd2 - nd1) / nd2; + } + + return unmatchfreq; +} + /* * eqjoinsel_semi --- eqjoinsel for semi join * @@ -2694,7 +2841,7 @@ eqjoinsel_semi(Oid opfuncoid, Oid collation, FmgrInfo eqproc; bool *hasmatch1; bool *hasmatch2; - double nullfrac1 = stats1->stanullfrac; + double nullfrac1 = compute_stanullfrac(vardata1, NULL); double matchfreq1, uncertainfrac, uncertain; @@ -2854,15 +3001,11 @@ neqjoinsel(PG_FUNCTION_ARGS) VariableStatData leftvar; VariableStatData rightvar; bool reversed; - HeapTuple statsTuple; double nullfrac; get_join_variables(root, args, sjinfo, &leftvar, &rightvar, &reversed); - statsTuple = reversed ? rightvar.statsTuple : leftvar.statsTuple; - if (HeapTupleIsValid(statsTuple)) - nullfrac = ((Form_pg_statistic) GETSTRUCT(statsTuple))->stanullfrac; - else - nullfrac = 0.0; + nullfrac = compute_stanullfrac(reversed ? &rightvar : &leftvar, NULL); + ReleaseVariableStats(leftvar); ReleaseVariableStats(rightvar); @@ -3226,22 +3369,18 @@ mergejoinscansel(PlannerInfo *root, Node *clause, */ if (nulls_first) { - Form_pg_statistic stats; - if (HeapTupleIsValid(leftvar.statsTuple)) { - stats = (Form_pg_statistic) GETSTRUCT(leftvar.statsTuple); - *leftstart += stats->stanullfrac; + *leftstart += compute_stanullfrac(&leftvar, NULL); CLAMP_PROBABILITY(*leftstart); - *leftend += stats->stanullfrac; + *leftend += compute_stanullfrac(&leftvar, NULL); CLAMP_PROBABILITY(*leftend); } if (HeapTupleIsValid(rightvar.statsTuple)) { - stats = (Form_pg_statistic) GETSTRUCT(rightvar.statsTuple); - *rightstart += stats->stanullfrac; + *rightstart += compute_stanullfrac(&rightvar, NULL); CLAMP_PROBABILITY(*rightstart); - *rightend += stats->stanullfrac; + *rightend += compute_stanullfrac(&rightvar, NULL); CLAMP_PROBABILITY(*rightend); } } @@ -4049,10 +4188,7 @@ estimate_hash_bucket_stats(PlannerInfo *root, Node *hashkey, double nbuckets, /* Get fraction that are null */ if (HeapTupleIsValid(vardata.statsTuple)) { - Form_pg_statistic stats; - - stats = (Form_pg_statistic) GETSTRUCT(vardata.statsTuple); - stanullfrac = stats->stanullfrac; + stanullfrac = compute_stanullfrac(&vardata, NULL); } else stanullfrac = 0.0; @@ -5196,6 +5332,109 @@ ReleaseDummy(HeapTuple tuple) pfree(tuple); } +static int +sjinfo_cmp(const ListCell *lc1, const ListCell *lc2) +{ + SpecialJoinInfo *sjinfo1 = lfirst_node(SpecialJoinInfo, lc1); + SpecialJoinInfo *sjinfo2 = lfirst_node(SpecialJoinInfo, lc2); + int num1 = bms_num_members( + bms_union(sjinfo1->min_lefthand, sjinfo1->min_righthand)); + int num2 = bms_num_members( + bms_union(sjinfo2->min_lefthand, sjinfo2->min_righthand)); + + if (num1 > num2) + return 1; + else if (num1 == num2) + return 0; + else + return -1; +} + +double +compute_stanullfrac(VariableStatData *vardata, double *gen_frac) +{ + Form_pg_statistic stats; + Var *var; + PlannerInfo *root = vardata->root; + List *sjinfos = NIL; + double nullfrac; + + if (gen_frac != NULL) + *gen_frac = 0.0; + + if (!HeapTupleIsValid(vardata->statsTuple) || vardata->var == NULL) + return 0.0; + + if (IsA(vardata->var, RelabelType)) + var = (Var *) (((RelabelType *) vardata->var)->arg); + else + var = (Var *) vardata->var; + + stats = (Form_pg_statistic) GETSTRUCT(vardata->statsTuple); + nullfrac = stats->stanullfrac; + + if (!compute_innerside_nulls) + return nullfrac; + + if (root == NULL || !IsA(var, Var) || + bms_is_empty(var->varnullingrels)) + return nullfrac; + + /* Find all outer joins that can null this column */ + foreach_node(SpecialJoinInfo, sjinfo, vardata->root->join_info_list) + { + RangeTblEntry *rte; + + if (!OidIsValid(sjinfo->ojrelid) || + !bms_is_member(sjinfo->ojrelid, var->varnullingrels)) + continue; + + Assert(OidIsValid(sjinfo->ojrelid)); + + /* + * varnullingrels may refer not only outer joins - we need to filter + * other objects in advance. + */ + + rte = root->simple_rte_array[sjinfo->ojrelid]; + + if (rte->rtekind != RTE_JOIN) + continue; + + Assert(sjinfo->ojrelid < root->simple_rel_array_size && + root->simple_rel_array[sjinfo->ojrelid] == NULL); + Assert(bms_is_member(sjinfo->ojrelid, root->outer_join_rels)); + + sjinfos = lappend(sjinfos, sjinfo); + break; + } + list_sort(sjinfos, sjinfo_cmp); + + foreach_node(SpecialJoinInfo, sjinfo, sjinfos) + { + double unmatched_frac; + + if (sjinfo->unmatched_frac <= 0.0) + continue; + + unmatched_frac = sjinfo->unmatched_frac - (sjinfo->unmatched_frac * nullfrac); + nullfrac = sjinfo->unmatched_frac + nullfrac - + (sjinfo->unmatched_frac * nullfrac); + Assert(nullfrac >= 0. && nullfrac <= 1.); + + + if (gen_frac != NULL) + { + *gen_frac = *gen_frac + unmatched_frac - + (unmatched_frac * *gen_frac); + + Assert(*gen_frac >= 0.0 && *gen_frac <= 1.0); + } + } + + return nullfrac; +} + /* * examine_variable * Try to look up statistical data about an expression. @@ -5261,6 +5500,7 @@ examine_variable(PlannerInfo *root, Node *node, int varRelid, Var *var = (Var *) basenode; /* Set up result fields other than the stats tuple */ + vardata->root = root; vardata->var = basenode; /* return Var without relabeling */ vardata->rel = find_base_rel(root, var->varno); vardata->atttype = var->vartype; @@ -6096,6 +6336,7 @@ get_variable_numdistinct(VariableStatData *vardata, bool *isdefault) { double stadistinct; double stanullfrac = 0.0; + double generatednullsfrac = 0.0; double ntuples; *isdefault = false; @@ -6113,7 +6354,7 @@ get_variable_numdistinct(VariableStatData *vardata, bool *isdefault) stats = (Form_pg_statistic) GETSTRUCT(vardata->statsTuple); stadistinct = stats->stadistinct; - stanullfrac = stats->stanullfrac; + stanullfrac = compute_stanullfrac(vardata, &generatednullsfrac); } else if (vardata->vartype == BOOLOID) { @@ -6200,7 +6441,7 @@ get_variable_numdistinct(VariableStatData *vardata, bool *isdefault) * If we had a relative estimate, use that. */ if (stadistinct < 0.0) - return clamp_row_est(-stadistinct * ntuples); + return clamp_row_est(-stadistinct * ntuples * (1.0 - generatednullsfrac)); /* * With no data, estimate ndistinct = ntuples if the table is small, else @@ -6328,7 +6569,7 @@ get_variable_range(PlannerInfo *root, VariableStatData *vardata, for (i = 0; i < sslot.nnumbers; i++) sumcommon += sslot.numbers[i]; - nullfrac = ((Form_pg_statistic) GETSTRUCT(vardata->statsTuple))->stanullfrac; + nullfrac = compute_stanullfrac(vardata, NULL); if (sumcommon + nullfrac > 0.99999) use_mcvs = true; } diff --git a/src/backend/utils/misc/guc_tables.c b/src/backend/utils/misc/guc_tables.c index 60b12446a1c..45e82894b39 100644 --- a/src/backend/utils/misc/guc_tables.c +++ b/src/backend/utils/misc/guc_tables.c @@ -1016,6 +1016,16 @@ struct config_bool ConfigureNamesBool[] = true, NULL, NULL, NULL }, + { + {"compute_innerside_nulls", PGC_USERSET, QUERY_TUNING_METHOD, + gettext_noop("Enables considering NULL values have appeared because of unmatched side of OJ"), + NULL, + GUC_EXPLAIN + }, + &compute_innerside_nulls, + true, + NULL, NULL, NULL + }, { {"enable_group_by_reordering", PGC_USERSET, QUERY_TUNING_METHOD, gettext_noop("Enables reordering of GROUP BY keys."), diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h index 5b6cadb5a6c..aad097dd57b 100644 --- a/src/include/nodes/execnodes.h +++ b/src/include/nodes/execnodes.h @@ -2155,6 +2155,7 @@ typedef struct JoinState bool single_match; /* True if we should skip to next outer tuple * after finding one inner match */ ExprState *joinqual; /* JOIN quals (in addition to ps.qual) */ + double unmatched_tuples; } JoinState; /* ---------------- diff --git a/src/include/nodes/pathnodes.h b/src/include/nodes/pathnodes.h index bb678bdcdcd..a49661af91f 100644 --- a/src/include/nodes/pathnodes.h +++ b/src/include/nodes/pathnodes.h @@ -2218,6 +2218,12 @@ typedef struct JoinPath * joinrestrictinfo is needed in JoinPath, and can't be merged into the * parent RelOptInfo. */ + + /* + * Must be -1.0 'not initialized' for non-outer joins. In case of outer join + * it contains fraction of not matched tuples. + */ + Selectivity unmatch_frac; } JoinPath; /* @@ -3043,6 +3049,9 @@ struct SpecialJoinInfo bool semi_can_hash; /* true if semi_operators are all hash */ List *semi_operators; /* OIDs of equality join operators */ List *semi_rhs_exprs; /* righthand-side expressions of these ops */ + + /* For outer join, fraction of rows without a match. */ + Selectivity unmatched_frac; }; /* diff --git a/src/include/nodes/plannodes.h b/src/include/nodes/plannodes.h index 658d76225e4..884e5bf8b27 100644 --- a/src/include/nodes/plannodes.h +++ b/src/include/nodes/plannodes.h @@ -929,6 +929,8 @@ typedef struct Join bool inner_unique; /* JOIN quals (in addition to plan.qual) */ List *joinqual; + + Selectivity unmatched_frac; } Join; /* ---------------- diff --git a/src/include/optimizer/cost.h b/src/include/optimizer/cost.h index c5987440817..6b4c3c1eb6a 100644 --- a/src/include/optimizer/cost.h +++ b/src/include/optimizer/cost.h @@ -71,6 +71,7 @@ extern PGDLLIMPORT bool enable_partition_pruning; extern PGDLLIMPORT bool enable_presorted_aggregate; extern PGDLLIMPORT bool enable_async_append; extern PGDLLIMPORT int constraint_exclusion; +extern PGDLLIMPORT bool compute_innerside_nulls; extern double index_pages_fetched(double tuples_fetched, BlockNumber pages, double index_pages, PlannerInfo *root); diff --git a/src/include/utils/selfuncs.h b/src/include/utils/selfuncs.h index 013049b3098..0e865773c5e 100644 --- a/src/include/utils/selfuncs.h +++ b/src/include/utils/selfuncs.h @@ -97,6 +97,7 @@ typedef struct VariableStatData bool isunique; /* matches unique index, DISTINCT or GROUP-BY * clause */ bool acl_ok; /* result of ACL check on table or column */ + PlannerInfo *root; } VariableStatData; #define ReleaseVariableStats(vardata) \ @@ -151,6 +152,7 @@ extern PGDLLIMPORT get_index_stats_hook_type get_index_stats_hook; /* Functions in selfuncs.c */ +extern double compute_stanullfrac(VariableStatData *vardata, double *gen_frac); extern void examine_variable(PlannerInfo *root, Node *node, int varRelid, VariableStatData *vardata); extern bool statistic_proc_security_check(VariableStatData *vardata, Oid func_oid); diff --git a/src/test/regress/expected/stats.out b/src/test/regress/expected/stats.out index 776f1ad0e53..14083893190 100644 --- a/src/test/regress/expected/stats.out +++ b/src/test/regress/expected/stats.out @@ -1868,4 +1868,28 @@ SELECT * FROM check_estimated_rows('SELECT * FROM table_fillfactor'); (1 row) DROP TABLE table_fillfactor; +CREATE TABLE test (x integer); +INSERT INTO test (SELECT gs FROM generate_series(1,10000) AS gs); +CREATE INDEX ON test (x); +VACUUM ANALYZE test; +-- SET enable_hashagg = off; -- avoid mem size unstable report +SET compute_innerside_nulls = off; +SELECT * FROM check_estimated_rows(' + SELECT t2.x FROM test t1 LEFT JOIN test t2 ON (t1.x=-t2.x) GROUP BY t2.x; +'); + estimated | actual +-----------+-------- + 10000 | 1 +(1 row) + +SET compute_innerside_nulls = on; +SELECT * FROM check_estimated_rows(' + SELECT t2.x FROM test t1 LEFT JOIN test t2 ON (t1.x=-t2.x) GROUP BY t2.x; +'); + estimated | actual +-----------+-------- + 200 | 1 +(1 row) + +DROP TABLE IF EXISTS test; -- End of Stats Test diff --git a/src/test/regress/sql/stats.sql b/src/test/regress/sql/stats.sql index 232ab8db8fa..408b65c20a8 100644 --- a/src/test/regress/sql/stats.sql +++ b/src/test/regress/sql/stats.sql @@ -925,4 +925,22 @@ SELECT * FROM check_estimated_rows('SELECT * FROM table_fillfactor'); DROP TABLE table_fillfactor; +CREATE TABLE test (x integer); +INSERT INTO test (SELECT gs FROM generate_series(1,10000) AS gs); +CREATE INDEX ON test (x); +VACUUM ANALYZE test; + +-- SET enable_hashagg = off; -- avoid mem size unstable report + +SET compute_innerside_nulls = off; +SELECT * FROM check_estimated_rows(' + SELECT t2.x FROM test t1 LEFT JOIN test t2 ON (t1.x=-t2.x) GROUP BY t2.x; +'); + +SET compute_innerside_nulls = on; +SELECT * FROM check_estimated_rows(' + SELECT t2.x FROM test t1 LEFT JOIN test t2 ON (t1.x=-t2.x) GROUP BY t2.x; +'); +DROP TABLE IF EXISTS test; + -- End of Stats Test -- 2.39.5