On Wed, Mar 02, 2022 at 11:38:21AM -0600, Justin Pryzby wrote: > Rebased over 269b532ae and muted compiler warnings.
And attached.
>From 587a5e9fe87c26cdcd9602fc349f092da95cc580 Mon Sep 17 00:00:00 2001 From: Tomas Vondra <tomas.von...@postgresql.org> Date: Mon, 13 Dec 2021 14:05:17 +0100 Subject: [PATCH] Estimate joins using extended statistics Use extended statistics (MCV) to improve join estimates. In general this is similar to how we use regular statistics - we search for extended statistics (with MCV) covering all join clauses, and if we find such MCV on both sides of the join, we combine those two MCVs. Extended statistics allow a couple additional improvements - e.g. if there are baserel conditions, we can use them to restrict the part of the MCVs combined. This means we're building conditional probability distribution and calculating conditional probability P(join clauses | baserel conditions) instead of just P(join clauses). The patch also allows combining regular and extended MCV - we don't need extended MCVs on both sides. This helps when one of the tables does not have extended statistics (e.g. because there are no correlations). --- src/backend/optimizer/path/clausesel.c | 63 +- src/backend/statistics/extended_stats.c | 805 ++++++++++++++++++ src/backend/statistics/mcv.c | 757 ++++++++++++++++ .../statistics/extended_stats_internal.h | 20 + src/include/statistics/statistics.h | 12 + src/test/regress/expected/stats_ext.out | 167 ++++ src/test/regress/sql/stats_ext.sql | 66 ++ 7 files changed, 1889 insertions(+), 1 deletion(-) diff --git a/src/backend/optimizer/path/clausesel.c b/src/backend/optimizer/path/clausesel.c index 06f836308d0..1b2227321a2 100644 --- a/src/backend/optimizer/path/clausesel.c +++ b/src/backend/optimizer/path/clausesel.c @@ -50,6 +50,9 @@ static Selectivity clauselist_selectivity_or(PlannerInfo *root, JoinType jointype, SpecialJoinInfo *sjinfo, bool use_extended_stats); +static inline bool treat_as_join_clause(PlannerInfo *root, + Node *clause, RestrictInfo *rinfo, + int varRelid, SpecialJoinInfo *sjinfo); /**************************************************************************** * ROUTINES TO COMPUTE SELECTIVITIES @@ -129,12 +132,53 @@ clauselist_selectivity_ext(PlannerInfo *root, RangeQueryClause *rqlist = NULL; ListCell *l; int listidx; + bool single_clause_optimization = true; + + /* + * The optimization of skipping to clause_selectivity_ext for single + * clauses means we can't improve join estimates with a single join + * clause but additional baserel restrictions. So we disable it when + * estimating joins. + * + * XXX Not sure if this is the right way to do it, but more elaborate + * checks would mostly negate the whole point of the optimization. + * The (Var op Var) patch has the same issue. + * + * XXX An alternative might be making clause_selectivity_ext smarter + * and make it use the join extended stats there. But that seems kinda + * against the whole point of the optimization (skipping expensive + * stuff) and it's making other parts more complex. + * + * XXX Maybe this should check if there are at least some restrictions + * on some base relations, which seems important. But then again, that + * seems to go against the idea of this check to be cheap. Moreover, it + * won't work for OR clauses, which may have multiple parts but we still + * see them as a single BoolExpr clause (it doesn't work later, though). + */ + if (list_length(clauses) == 1) + { + Node *clause = linitial(clauses); + RestrictInfo *rinfo = NULL; + + if (IsA(clause, RestrictInfo)) + { + rinfo = (RestrictInfo *) clause; + clause = (Node *) rinfo->clause; + } + + single_clause_optimization + = !treat_as_join_clause(root, clause, rinfo, varRelid, sjinfo); + } /* * If there's exactly one clause, just go directly to * clause_selectivity_ext(). None of what we might do below is relevant. + * + * XXX This means we won't try using extended stats on OR-clauses (which + * are a single BoolExpr clause at this point), although we'll do that + * later (once we look at the arguments). */ - if (list_length(clauses) == 1) + if ((list_length(clauses) == 1) && single_clause_optimization) return clause_selectivity_ext(root, (Node *) linitial(clauses), varRelid, jointype, sjinfo, use_extended_stats); @@ -157,6 +201,23 @@ clauselist_selectivity_ext(PlannerInfo *root, &estimatedclauses, false); } + /* + * Try applying extended statistics to joins. There's not much we can + * do to detect when this makes sense, but we can check that there are + * join clauses, and that at least some of the rels have stats. + * + * XXX Isn't this mutually exclusive with the preceding block which + * calculates estimates for a single relation? + */ + if (use_extended_stats && + statext_try_join_estimates(root, clauses, varRelid, jointype, sjinfo, + estimatedclauses)) + { + s1 *= statext_clauselist_join_selectivity(root, clauses, varRelid, + jointype, sjinfo, + &estimatedclauses); + } + /* * Apply normal selectivity estimates for remaining clauses. We'll be * careful to skip any clauses which were already estimated above. diff --git a/src/backend/statistics/extended_stats.c b/src/backend/statistics/extended_stats.c index ca48395d5c5..427ba015b73 100644 --- a/src/backend/statistics/extended_stats.c +++ b/src/backend/statistics/extended_stats.c @@ -31,6 +31,7 @@ #include "nodes/nodeFuncs.h" #include "optimizer/clauses.h" #include "optimizer/optimizer.h" +#include "optimizer/pathnode.h" #include "parser/parsetree.h" #include "pgstat.h" #include "postmaster/autovacuum.h" @@ -103,6 +104,8 @@ static StatsBuildData *make_build_data(Relation onerel, StatExtEntry *stat, int numrows, HeapTuple *rows, VacAttrStats **stats, int stattarget); +static bool stat_covers_expressions(StatisticExtInfo *stat, List *exprs, + Bitmapset **expr_idxs); /* * Compute requested extended stats, using the rows sampled for the plain @@ -2611,3 +2614,805 @@ make_build_data(Relation rel, StatExtEntry *stat, int numrows, HeapTuple *rows, return result; } + +/* + * statext_find_matching_mcv + * Search for a MCV covering all the attributes and expressions. + * + * We pick the statistics to use for join estimation. The statistics object has + * to have MCV, and we require it to match all the join conditions, because it + * makes the estimation simpler. + * + * If there are multiple candidate statistics objects (matching all join clauses), + * we pick the smallest one, and we also consider additional conditions on + * the base relations to restrict the MCV items used for estimation (using + * conditional probability). + * + * XXX The requirement that all the attributes need to be covered might be + * too strong. We could relax this and and require fewer matches (at least two, + * if counting the additional conditions), and we might even apply multiple + * statistics etc. But that would require matching statistics on both sides of + * the join, while now we simply know the statistics match. We don't really + * expect many candidate MCVs, so this simple approach seems sufficient. And + * the joins usually use only one or two columns, so there's not much room + * for applying multiple statistics anyway. + */ +StatisticExtInfo * +statext_find_matching_mcv(PlannerInfo *root, RelOptInfo *rel, + Bitmapset *attnums, List *exprs) +{ + ListCell *l; + StatisticExtInfo *mcv = NULL; + List *stats = rel->statlist; + + foreach(l, stats) + { + StatisticExtInfo *stat = (StatisticExtInfo *) lfirst(l); + List *conditions1 = NIL, + *conditions2 = NIL; + + /* We only care about MCV statistics here. */ + if (stat->kind != STATS_EXT_MCV) + continue; + + /* + * Ignore MCVs not covering all the attributes/expressions. + * + * XXX Maybe we shouldn't be so strict and consider only partial + * matches for join clauses too? + */ + if (!bms_is_subset(attnums, stat->keys) || + !stat_covers_expressions(stat, exprs, NULL)) + continue; + + /* If there's no matching MCV yet, keep this one. */ + if (!mcv) + { + mcv = stat; + continue; + } + + /* + * OK, we have two candidate statistics objects and we need to decide + * which one to keep. We'll use two simple heuristics: + * + * (a) We prefer smaller statistics (fewer columns), on the assumption + * that it represents a larger fraction of the data (due to having fewer + * combinations with higher counts). + * + * (b) If the statistics object covers some additional conditions for the rels, + * that may help with considering additional dependencies between the + * tables. + * + * Of course, those two heuristict are somewhat contradictory - smaller + * stats are less likely to cover as many conditions as a larger one. We + * consider the additional conditions first - if someone created such + * statistics, there probably is a dependency worth considering. + * + * When inspecting the restrictions, we need to be careful - we don't + * know which of them are compatible with extended stats, so we have to + * run them through statext_is_compatible_clause first and then match + * them to the statistics. + * + * XXX Maybe we shouldn't pick statistics that covers just a single join + * clause, without any additional conditions. In such case we could just + * as well pick regular statistics for the column/expression, but it's + * not clear if that actually exists (so we might reject the stats here + * and then fail to find something simpler/better). + */ + conditions1 = statext_determine_join_restrictions(root, rel, stat); + conditions2 = statext_determine_join_restrictions(root, rel, mcv); + + /* if the new statistics object covers more conditions, use it */ + if (list_length(conditions1) > list_length(conditions2)) + { + mcv = stat; + continue; + } + + /* The statistics seem about equal, so just use the smaller one. */ + if (bms_num_members(mcv->keys) + list_length(mcv->exprs) > + bms_num_members(stat->keys) + list_length(stat->exprs)) + { + mcv = stat; + } + } + + return mcv; +} + +/* + * statext_determine_join_restrictions + * Get restrictions on base relation, covered by the statistics object. + * + * Returns a list of baserel restrictinfos, compatible with extended statistics + * and covered by the extended statistics object. + * + * When using extended statistics to estimate joins, we can use conditions + * from base relations to calculate conditional probability + * + * P(join clauses | baserel restrictions) + * + * which should be a better estimate than just P(join clauses). We want to pick + * the statistics object covering the most such conditions. + */ +List * +statext_determine_join_restrictions(PlannerInfo *root, RelOptInfo *rel, + StatisticExtInfo *info) +{ + ListCell *lc; + List *conditions = NIL; + + /* extract conditions that may be applied to the MCV list */ + foreach (lc, rel->baserestrictinfo) + { + RestrictInfo *rinfo = (RestrictInfo *) lfirst(lc); + Bitmapset *indexes = NULL; + Bitmapset *attnums = NULL; + List *exprs = NIL; + + /* clause has to be supported by MCV in general */ + if (!statext_is_compatible_clause(root, (Node *) rinfo, rel->relid, + &attnums, &exprs)) + continue; + + /* + * clause is compatible in general, but is it actually covered + * by this particular statistics object? + */ + if (!bms_is_subset(attnums, info->keys) || + !stat_covers_expressions(info, exprs, &indexes)) + continue; + + conditions = lappend(conditions, rinfo->clause); + } + + return conditions; +} + +/* + * statext_is_supported_join_clause + * Check if a join clause may be estimated using extended stats. + * + * Determines if this is a join clause of the form (Expr op Expr) which may be + * estimated using extended statistics. Each side must reference just a single + * relation for now. + * + * Similar to treat_as_join_clause, but we place additional restrictions + * on the conditions. + */ +static bool +statext_is_supported_join_clause(PlannerInfo *root, Node *clause, + int varRelid, SpecialJoinInfo *sjinfo) +{ + Oid oprsel; + RestrictInfo *rinfo; + OpExpr *opclause; + ListCell *lc; + + /* + * evaluation as a restriction clause, either at scan node or forced + * + * XXX See treat_as_join_clause. + */ + if ((varRelid != 0) || (sjinfo == NULL)) + return false; + + /* XXX Can we rely on always getting RestrictInfo here? */ + if (!IsA(clause, RestrictInfo)) + return false; + + /* strip the RestrictInfo */ + rinfo = (RestrictInfo *) clause; + clause = (Node *) rinfo->clause; + + /* is it referencing multiple relations? */ + if (bms_membership(rinfo->clause_relids) != BMS_MULTIPLE) + return false; + + /* we only support simple operator clauses for now */ + if (!is_opclause(clause)) + return false; + + opclause = (OpExpr *) clause; + + /* for now we only support estimating equijoins */ + oprsel = get_oprjoin(opclause->opno); + + /* has to be an equality condition */ + if (oprsel != F_EQJOINSEL) + return false; + + /* + * Make sure we're not mixing vars from multiple relations on the same + * side, like + * + * (t1.a + t2.a) = (t1.b + t2.b) + * + * which is still technically an opclause, but we can't match it to + * extended statistics in a simple way. + * + * XXX This also means we require rinfo->clause_relids to have 2 rels. + * + * XXX Also check it's not expression on system attributes, which we + * don't allow in extended statistics. + * + * XXX Although maybe we could allow cases that combine expressions + * from both relations on either side? Like (t1.a + t2.b = t1.c - t2.d) + * or something like that. We could do "cartesian product" of the MCV + * stats and restrict it using this condition. + */ + foreach (lc, opclause->args) + { + Bitmapset *varnos = NULL; + Node *expr = (Node *) lfirst(lc); + + varnos = pull_varnos(root, expr); + + /* + * No argument should reference more than just one relation. + * + * This effectively means each side references just two relations. + * If there's no relation on one side, it's a Const, and the other + * side has to be either Const or Expr with a single rel. In which + * case it can't be a join clause. + */ + if (bms_num_members(varnos) > 1) + return false; + + /* + * XXX Maybe check that both relations have extended statistics + * (no point in considering the clause as useful without it). But + * we'll do that check later anyway, so keep this cheap. + */ + } + + return true; +} + +/* + * statext_try_join_estimates + * Checks if it's worth considering extended stats on join estimates. + * + * This is supposed to be a quick/cheap check to decide whether to expend + * more effort on applying extended statistics to join clauses. + */ +bool +statext_try_join_estimates(PlannerInfo *root, List *clauses, int varRelid, + JoinType jointype, SpecialJoinInfo *sjinfo, + Bitmapset *estimatedclauses) +{ + int listidx; + int k; + ListCell *lc; + Bitmapset *relids = NULL; + + /* + * XXX Not having these values means treat_as_join_clause returns false, + * so we're not supposed to handle join clauses here. So just bail out. + */ + if ((varRelid != 0) || (sjinfo == NULL)) + return false; + + /* + * Check if there are any unestimated join clauses, collect relids. + * + * XXX Currently this only allows simple OpExpr equality clauses with each + * argument referring to single relation, AND-ed together. Maybe we could + * relax this in the future, e.g. to allow more complex (deeper) expressions + * and to allow OR-ed join clauses too. And maybe supporting inequalities. + * + * Handling more complex expressions seems simple - we already do that for + * baserel estimates by building the match bitmap recursively, and we could + * do something similar for combinations of MCV items (a bit like building + * a single bit in the match bitmap). The challenge is what to do about the + * part not represented by MCV, which is now based on ndistinct estimates. + */ + listidx = -1; + foreach (lc, clauses) + { + Node *clause = (Node *) lfirst(lc); + RestrictInfo *rinfo; + + /* needs to happen before skipping any clauses */ + listidx++; + + /* Skip clauses that were already estimated. */ + if (bms_is_member(listidx, estimatedclauses)) + continue; + + /* + * Skip clauses that are not join clauses or that we don't know + * how to handle estimate using extended statistics. + */ + if (!statext_is_supported_join_clause(root, clause, varRelid, sjinfo)) + continue; + + /* + * XXX We're guaranteed to have RestrictInfo thanks to the checks + * in statext_is_supported_join_clause. + */ + rinfo = (RestrictInfo *) clause; + + /* Collect relids from all usable clauses. */ + relids = bms_union(relids, rinfo->clause_relids); + } + + /* no join clauses found, don't try applying extended stats */ + if (bms_num_members(relids) == 0) + return false; + + /* + * We expect either 0 or >= 2 relids, a case with 1 relid in join clauses + * should be impossible. And we just ruled out 0, so there are at least 2. + */ + Assert(bms_num_members(relids) >= 2); + + /* + * Check that at least some of the rels referenced by the clauses have + * extended stats. + * + * XXX Maybe we should check how many rels have stats, and cross-check how + * compatible they are (e.g. that both have MCVs, etc.). We might also + * cross-check the exact joined pairs of rels, but it's supposed to be a + * cheap check, so maybe better leave that for later. + * + * XXX We could also check the number of parameters in each rel to consider + * extended stats. If there's just a single attribute, it's pointless to use + * extended statistics. OTOH we can also consider restriction clauses from + * baserestrictinfo and use them to calculate conditional probabilities. + */ + k = -1; + while ((k = bms_next_member(relids, k)) >= 0) + { + RelOptInfo *rel = find_base_rel(root, k); + if (rel->statlist) + return true; + } + + return false; +} + +/* + * Information about two joined relations, along with the join clauses between. + */ +typedef struct JoinPairInfo +{ + Bitmapset *rels; + List *clauses; +} JoinPairInfo; + +/* + * statext_build_join_pairs + * Extract pairs of joined rels with join clauses for each pair. + * + * Walks the remaining (not yet estimated) clauses, and splits them into + * lists for each pair of joined relations. Returns NULL if there are no + * suitable join pairs that might be estimated using extended stats. + * + * XXX It's possible there are join clauses, but the clauses are not + * supported by the extended stats machinery (we only support opclauses + * with F_EQJOINSEL selectivity function at the moment). + */ +static JoinPairInfo * +statext_build_join_pairs(PlannerInfo *root, List *clauses, int varRelid, + JoinType jointype, SpecialJoinInfo *sjinfo, + Bitmapset *estimatedclauses, int *npairs) +{ + int cnt; + int listidx; + JoinPairInfo *info; + ListCell *lc; + + /* + * Assume each clause is for a different pair of relations (some of them + * might be already estimated, but meh - there shouldn't be too many of + * them and it's cheaper than repalloc). + */ + info = (JoinPairInfo *) palloc0(sizeof(JoinPairInfo) * list_length(clauses)); + cnt = 0; + + listidx = -1; + foreach(lc, clauses) + { + int i; + bool found; + Node *clause = (Node *) lfirst(lc); + RestrictInfo *rinfo; + + listidx++; + + /* skip already estimated clauses */ + if (bms_is_member(listidx, estimatedclauses)) + continue; + + /* + * Make sure the clause is a join clause of a supported shape (at + * the moment we support just (Expr op Expr) clauses with each + * side referencing just a single relation). + */ + if (!statext_is_supported_join_clause(root, clause, varRelid, sjinfo)) + continue; + + /* statext_is_supported_join_clause guarantees RestrictInfo */ + rinfo = (RestrictInfo *) clause; + clause = (Node *) rinfo->clause; + + /* search for a matching join pair */ + found = false; + for (i = 0; i < cnt; i++) + { + if (bms_is_subset(rinfo->clause_relids, info[i].rels)) + { + info[i].clauses = lappend(info[i].clauses, clause); + found = true; + break; + } + } + + if (!found) + { + info[cnt].rels = rinfo->clause_relids; + info[cnt].clauses = lappend(info[cnt].clauses, clause); + cnt++; + } + } + + if (cnt == 0) + return NULL; + + *npairs = cnt; + return info; +} + +/* + * extract_relation_info + * Extract information about a relation in a join pair. + * + * The relation is identified by index (generally 0 or 1), and picks extended + * statistics object covering the join clauses and baserel restrictions. + * + * XXX Can we have cases with indexes above 1? Probably for clauses mixing + * vars from 3 relations, but statext_is_supported_join_clause rejects those. + */ +static RelOptInfo * +extract_relation_info(PlannerInfo *root, JoinPairInfo *info, int index, + StatisticExtInfo **stat) +{ + int k; + int relid; + RelOptInfo *rel; + ListCell *lc; + List *exprs = NIL; + + Bitmapset *attnums = NULL; + + Assert((index >= 0) && (index <= 1)); + + k = -1; + while (index >= 0) + { + k = bms_next_member(info->rels, k); + if (k < 0) + elog(ERROR, "failed to extract relid"); + + relid = k; + index--; + } + + rel = find_base_rel(root, relid); + + /* + * Walk the clauses for this join pair, and extract expressions about + * the relation identified by index / relid. For simple Vars we extract + * the attnum. Otherwise we keep the whole expression. + */ + foreach (lc, info->clauses) + { + ListCell *lc2; + Node *clause = (Node *) lfirst(lc); + OpExpr *opclause = (OpExpr *) clause; + + /* only opclauses supported for now */ + Assert(is_opclause(clause)); + + foreach (lc2, opclause->args) + { + Node *arg = (Node *) lfirst(lc2); + Bitmapset *varnos = NULL; + + /* plain Var references (boolean Vars or recursive checks) */ + if (IsA(arg, Var)) + { + Var *var = (Var *) arg; + + /* Ignore vars from other relations. */ + if (var->varno != relid) + continue; + + /* we also better ensure the Var is from the current level */ + if (var->varlevelsup > 0) + continue; + + /* Also skip system attributes (we don't allow stats on those). */ + if (!AttrNumberIsForUserDefinedAttr(var->varattno)) + elog(ERROR, "unexpected system attribute"); + + attnums = bms_add_member(attnums, var->varattno); + + /* Done, process the next argument. */ + continue; + } + + /* + * OK, it's a more complex expression, so check if it matches + * the relid and maybe keep it as a whole. It should be + * compatible because we already checked it when building the + * join pairs. + */ + varnos = pull_varnos(root, arg); + + if (relid == bms_singleton_member(varnos)) + exprs = lappend(exprs, arg); + } + } + + *stat = statext_find_matching_mcv(root, rel, attnums, exprs); + + return rel; +} + +/* + * get_expression_for_rel + * Extract expression for a given relation from the join clause. + * + * Given a join clause supported by the extended statistics object (currently + * that means just OpExpr clauses with each argument referencing single rel), + * return either the left or right argument expression for the rel. + * + * XXX This should probably return a flag identifying whether it's the + * left or right argument. + */ +static Node * +get_expression_for_rel(PlannerInfo *root, RelOptInfo *rel, Node *clause) +{ + OpExpr *opexpr; + Node *expr; + + /* + * Strip the RestrictInfo node, get the actual clause. + * + * XXX Not sure if we need to care about removing other node types + * too (e.g. RelabelType etc.). statext_is_supported_join_clause + * matches this, but maybe we need to relax it? + */ + if (IsA(clause, RestrictInfo)) + clause = (Node *) ((RestrictInfo *) clause)->clause; + + opexpr = (OpExpr *) clause; + + /* Make sure we have the expected node type. */ + Assert(is_opclause(clause)); + Assert(list_length(opexpr->args) == 2); + + /* FIXME strip relabel etc. the way examine_opclause_args does */ + expr = linitial(opexpr->args); + if (bms_singleton_member(pull_varnos(root, expr)) == rel->relid) + return expr; + + expr = lsecond(opexpr->args); + if (bms_singleton_member(pull_varnos(root, expr)) == rel->relid) + return expr; + + return NULL; +} + +/* + * statext_clauselist_join_selectivity + * Use extended stats to estimate join clauses. + * + * XXX In principle, we should not restrict this to cases with multiple + * join clauses - we should consider dependencies with conditions at the + * base relations, i.e. calculate P(join clause | base restrictions). + * But currently that does not happen, because clauselist_selectivity_ext + * treats a single clause as a special case (and we don't apply extended + * statistics in that case yet). + */ +Selectivity +statext_clauselist_join_selectivity(PlannerInfo *root, List *clauses, int varRelid, + JoinType jointype, SpecialJoinInfo *sjinfo, + Bitmapset **estimatedclauses) +{ + int i; + int listidx; + Selectivity s = 1.0; + + JoinPairInfo *info; + int ninfo; + + if (!clauses) + return 1.0; + + /* extract pairs of joined relations from the list of clauses */ + info = statext_build_join_pairs(root, clauses, varRelid, jointype, sjinfo, + *estimatedclauses, &ninfo); + + /* no useful join pairs */ + if (!info) + return 1.0; + + /* + * Process the join pairs, try to find a matching MCV on each side. + * + * XXX The basic principle is quite similar to eqjoinsel_inner, i.e. + * we try to find a MCV on both sides of the join, and use it to get + * a better join estimate. It's a bit more complicated, because there + * might be multiple MCV lists, we also need ndistinct estimate, and + * there may be interesting baserestrictions too. + * + * XXX At the moment we only handle the case with matching MCVs on + * both sides, but it'd be good to also handle case with just ndistinct + * statistics improving ndistinct estimates. + * + * XXX We might also handle cases with a regular MCV on one side and + * an extended MCV on the other side. + * + * XXX Perhaps it'd be good to also handle case with one side only + * having "regular" statistics (e.g. MCV), especially in cases with + * no conditions on that side of the join (where we can't use the + * extended MCV to calculate conditional probability). + */ + for (i = 0; i < ninfo; i++) + { + ListCell *lc; + + RelOptInfo *rel1; + RelOptInfo *rel2; + + StatisticExtInfo *stat1; + StatisticExtInfo *stat2; + + /* extract info about the first relation */ + rel1 = extract_relation_info(root, &info[i], 0, &stat1); + + /* extract info about the second relation */ + rel2 = extract_relation_info(root, &info[i], 1, &stat2); + + /* + * We can handle three basic cases: + * + * a) Extended stats (with MCV) on both sides is an ideal case, and we + * can simply combine the two MCVs, possibly with additional conditions + * from the relations. + * + * b) Extended stats on one side, regular MCV on the other side (this + * means there's just one join clause / expression). It also means the + * extended stats likely covers at least one extra condition, otherwise + * we could just use regular statistics. We can combine the stats just + * similarly to (a). + * + * c) No extended stats with MCV. If there are multiple join clauses, + * we can try using ndistinct coefficients and do what eqjoinsel does. + * + * If none of these applies, we fallback to the regular selectivity + * estimation in eqjoinsel. + */ + if (stat1 && stat2) + { + s *= mcv_combine_extended(root, rel1, rel2, stat1, stat2, info[i].clauses); + } + else if (stat1 && (list_length(info[i].clauses) == 1)) + { + /* try finding MCV on the other relation */ + VariableStatData vardata; + AttStatsSlot sslot; + Form_pg_statistic stats = NULL; + bool have_mcvs = false; + Node *clause = linitial(info[i].clauses); + Node *expr = get_expression_for_rel(root, rel2, clause); + double nd; + bool isdefault; + + examine_variable(root, expr, 0, &vardata); + + nd = get_variable_numdistinct(&vardata, &isdefault); + + memset(&sslot, 0, sizeof(sslot)); + + if (HeapTupleIsValid(vardata.statsTuple)) + { + /* note we allow use of nullfrac regardless of security check */ + stats = (Form_pg_statistic) GETSTRUCT(vardata.statsTuple); + /* FIXME should this call statistic_proc_security_check like eqjoinsel? */ + have_mcvs = get_attstatsslot(&sslot, vardata.statsTuple, + STATISTIC_KIND_MCV, InvalidOid, + ATTSTATSSLOT_VALUES | ATTSTATSSLOT_NUMBERS); + } + + if (have_mcvs) + s *= mcv_combine_simple(root, rel1, stat1, &sslot, + stats->stanullfrac, nd, isdefault, clause); + + free_attstatsslot(&sslot); + + ReleaseVariableStats(vardata); + + /* no stats, don't mark the clauses as estimated */ + if (!have_mcvs) + continue; + } + else if (stat2 && (list_length(info[i].clauses) == 1)) + { + /* try finding MCV on the other relation */ + VariableStatData vardata; + AttStatsSlot sslot; + Form_pg_statistic stats = NULL; + bool have_mcvs = false; + Node *clause = (Node *) linitial(info[i].clauses); + Node *expr = get_expression_for_rel(root, rel1, clause); + double nd; + bool isdefault; + + examine_variable(root, expr, 0, &vardata); + + nd = get_variable_numdistinct(&vardata, &isdefault); + + memset(&sslot, 0, sizeof(sslot)); + + if (HeapTupleIsValid(vardata.statsTuple)) + { + /* note we allow use of nullfrac regardless of security check */ + stats = (Form_pg_statistic) GETSTRUCT(vardata.statsTuple); + /* FIXME should this call statistic_proc_security_check like eqjoinsel? */ + have_mcvs = get_attstatsslot(&sslot, vardata.statsTuple, + STATISTIC_KIND_MCV, InvalidOid, + ATTSTATSSLOT_VALUES | ATTSTATSSLOT_NUMBERS); + } + + if (have_mcvs) + s *= mcv_combine_simple(root, rel2, stat2, &sslot, + stats->stanullfrac, nd, isdefault, clause); + + free_attstatsslot(&sslot); + + ReleaseVariableStats(vardata); + + /* no stats, don't mark the clauses as estimated */ + if (!have_mcvs) + continue; + } + else + continue; + + /* + * Now mark all the clauses for this join pair as estimated. + * + * XXX Maybe track the indexes in JoinPairInfo, so that we can + * simply union the two bitmaps, without the extra matching. + */ + foreach (lc, info->clauses) + { + Node *clause = (Node *) lfirst(lc); + ListCell *lc2; + + listidx = -1; + foreach (lc2, clauses) + { + Node *clause2 = (Node *) lfirst(lc2); + listidx++; + + Assert(IsA(clause2, RestrictInfo)); + + clause2 = (Node *) ((RestrictInfo *) clause2)->clause; + + if (equal(clause, clause2)) + { + *estimatedclauses = bms_add_member(*estimatedclauses, listidx); + break; + } + } + } + } + + return s; +} diff --git a/src/backend/statistics/mcv.c b/src/backend/statistics/mcv.c index 1ef30344285..fbfebf88ac0 100644 --- a/src/backend/statistics/mcv.c +++ b/src/backend/statistics/mcv.c @@ -24,6 +24,7 @@ #include "funcapi.h" #include "nodes/nodeFuncs.h" #include "optimizer/clauses.h" +#include "optimizer/optimizer.h" #include "statistics/extended_stats_internal.h" #include "statistics/statistics.h" #include "utils/array.h" @@ -2158,3 +2159,759 @@ mcv_clause_selectivity_or(PlannerInfo *root, StatisticExtInfo *stat, return s; } + +/* + * statext_compare_mcvs + * Calculate join selectivity using extended statistics, similar to + * eqjoinsel_inner. + * + * Considers restrictions on base relations too, essentially computing a + * conditional probability + * + * P(join clauses | baserestrictinfos on either side) + * + * Compared to eqjoinsel_inner there's a couple problems. With per-column MCV + * lists it's obvious that the number of distinct values not covered by the MCV + * is (ndistinct - size(MCV)). With multi-column MCVs it's not that simple, + * particularly when the conditions are on a subset of the MCV attributes and/or + * NULLs are involved. E.g. with MCV (a,b,c) and conditions on (a,b), it's not + * clear if the number of (a,b) combinations not covered by the MCV is + * + * (ndistinct(a,b) - ndistinct_mcv(a,b)) + * + * where ndistinct_mcv(a,b) is the number of distinct (a,b) combinations + * included in the MCV list. These combinations may be present in the rest + * of the data (outside MCV), just with some extra values in "c". So in + * principle there may be between + * + * (ndistinct(a,b) - ndistinct_mcv(a,b)) and ndistinct(a,b) + * + * distinct values in the part of the data not covered by the MCV. So we need + * to pick something in between, there's no way to calculate this accurately. + */ +Selectivity +mcv_combine_extended(PlannerInfo *root, RelOptInfo *rel1, RelOptInfo *rel2, + StatisticExtInfo *stat1, StatisticExtInfo *stat2, + List *clauses) +{ + ListCell *lc; + + MCVList *mcv1, + *mcv2; + int idx, + i, + j; + Selectivity s = 0; + + /* match bitmaps and selectivity for baserel conditions (if any) */ + List *exprs1 = NIL, + *exprs2 = NIL; + List *conditions1 = NIL, + *conditions2 = NIL; + bool *cmatches1 = NULL, + *cmatches2 = NULL; + + double csel1 = 1.0, + csel2 = 1.0; + + bool *matches1 = NULL, + *matches2 = NULL; + + /* estimates for the two relations */ + double matchfreq1, + unmatchfreq1, + otherfreq1, + mcvfreq1, + nd1, + totalsel1; + + double matchfreq2, + unmatchfreq2, + otherfreq2, + mcvfreq2, + nd2, + totalsel2; + + /* info about clauses and how they match to MCV stats */ + FmgrInfo *opprocs; + int *indexes1, + *indexes2; + bool *reverse; + RangeTblEntry *rte1 = root->simple_rte_array[rel1->relid]; + RangeTblEntry *rte2 = root->simple_rte_array[rel2->relid]; + + /* we picked the stats so that they have MCV enabled */ + Assert((stat1->kind = STATS_EXT_MCV) && (stat2->kind = STATS_EXT_MCV)); + + mcv1 = statext_mcv_load(stat1->statOid, rte1->inh); + mcv2 = statext_mcv_load(stat2->statOid, rte2->inh); + + /* should only get here with MCV on both sides */ + Assert(mcv1 && mcv2); + + /* Determine which baserel clauses to use for conditional probability. */ + conditions1 = statext_determine_join_restrictions(root, rel1, stat1); + conditions2 = statext_determine_join_restrictions(root, rel2, stat2); + + /* + * Calculate match bitmaps for restrictions on either side of the join + * (there may be none, in which case this will be NULL). + */ + if (conditions1) + { + cmatches1 = mcv_get_match_bitmap(root, conditions1, + stat1->keys, stat1->exprs, + mcv1, false); + csel1 = clauselist_selectivity(root, conditions1, rel1->relid, 0, NULL); + } + + if (conditions2) + { + cmatches2 = mcv_get_match_bitmap(root, conditions2, + stat2->keys, stat2->exprs, + mcv2, false); + csel2 = clauselist_selectivity(root, conditions2, rel2->relid, 0, NULL); + } + + /* + * Match bitmaps for matches between MCV elements. By default there + * are no matches, so we set all items to 0. + */ + matches1 = (bool *) palloc0(sizeof(bool) * mcv1->nitems); + matches2 = (bool *) palloc0(sizeof(bool) * mcv2->nitems); + + /* + * Initialize information about clauses and how they match to the MCV + * stats we picked. We do this only once before processing the lists, + * so that we don't have to do that for each MCV item or so. + */ + opprocs = (FmgrInfo *) palloc(sizeof(FmgrInfo) * list_length(clauses)); + indexes1 = (int *) palloc(sizeof(int) * list_length(clauses)); + indexes2 = (int *) palloc(sizeof(int) * list_length(clauses)); + reverse = (bool *) palloc(sizeof(bool) * list_length(clauses)); + + idx = 0; + foreach (lc, clauses) + { + Node *clause = (Node *) lfirst(lc); + OpExpr *opexpr; + Node *expr1, + *expr2; + Bitmapset *relids1, + *relids2; + + /* + * Strip the RestrictInfo node, get the actual clause. + * + * XXX Not sure if we need to care about removing other node types + * too (e.g. RelabelType etc.). statext_is_supported_join_clause + * matches this, but maybe we need to relax it? + */ + if (IsA(clause, RestrictInfo)) + clause = (Node *) ((RestrictInfo *) clause)->clause; + + opexpr = (OpExpr *) clause; + + /* Make sure we have the expected node type. */ + Assert(is_opclause(clause)); + Assert(list_length(opexpr->args) == 2); + + fmgr_info(get_opcode(opexpr->opno), &opprocs[idx]); + + /* FIXME strip relabel etc. the way examine_opclause_args does */ + expr1 = linitial(opexpr->args); + expr2 = lsecond(opexpr->args); + + /* determine order of clauses (rel1 op rel2) or (rel2 op rel1) */ + relids1 = pull_varnos(root, expr1); + relids2 = pull_varnos(root, expr2); + + if ((bms_singleton_member(relids1) == rel1->relid) && + (bms_singleton_member(relids2) == rel2->relid)) + { + Oid collid; + + indexes1[idx] = mcv_match_expression(expr1, + stat1->keys, stat1->exprs, + &collid); + indexes2[idx] = mcv_match_expression(expr2, + stat2->keys, stat2->exprs, + &collid); + reverse[idx] = false; + + exprs1 = lappend(exprs1, expr1); + exprs2 = lappend(exprs2, expr2); + } + else if ((bms_singleton_member(relids2) == rel1->relid) && + (bms_singleton_member(relids1) == rel2->relid)) + { + Oid collid; + + indexes1[idx] = mcv_match_expression(expr2, + stat2->keys, stat2->exprs, + &collid); + indexes2[idx] = mcv_match_expression(expr1, + stat1->keys, stat1->exprs, + &collid); + reverse[idx] = true; + + exprs1 = lappend(exprs1, expr2); + exprs2 = lappend(exprs2, expr1); + } + else + /* should never happen */ + Assert(false); + + Assert((indexes1[idx] >= 0) && + (indexes1[idx] < bms_num_members(stat1->keys) + list_length(stat1->exprs))); + + Assert((indexes2[idx] >= 0) && + (indexes2[idx] < bms_num_members(stat2->keys) + list_length(stat2->exprs))); + + idx++; + } + + /* + * Match items between the two MCV lists. + * + * We don't know if the join conditions match all attributes in the MCV, the + * overlap may be just on a subset of attributes, e.g. (a,b,c) vs. (b,c,d). + * So there may be multiple matches on either side. So we can't optimize by + * aborting the inner loop after the first match, etc. + * + * XXX We can skip the items eliminated by the base restrictions, of course. + * + * XXX We might optimize this in two ways. We might sort the MCV items on + * both sides using the "join" attributes, and then perform something like + * merge join. Or we might calculate a hash from the join columns, and then + * compare this (to eliminate the most expensive equality functions). + */ + for (i = 0; i < mcv1->nitems; i++) + { + bool has_nulls; + + /* skip items eliminated by restrictions on rel1 */ + if (cmatches1 && !cmatches1[i]) + continue; + + /* + * Check if any value in the first MCV item is NULL, because it'll be + * mismatch anyway. + * + * XXX This might not work for some join clauses, e.g. IS NOT DISTINCT + * FROM, but those are currently not considered compatible (we only + * allow OpExpr at the moment). + */ + has_nulls = false; + for (j = 0; j < list_length(clauses); j++) + has_nulls |= mcv1->items[i].isnull[indexes1[j]]; + + if (has_nulls) + continue; + + /* find matches in the second MCV list */ + for (j = 0; j < mcv2->nitems; j++) + { + int idx; + bool items_match = true; + + /* skip items eliminated by restrictions on rel2 */ + if (cmatches2 && !cmatches2[j]) + continue; + + /* + * XXX We can't skip based on existing matches2 value, because there + * may be duplicates in the first MCV. + */ + + /* + * Evaluate if all the join clauses match between the two MCV items. + * + * XXX We might optimize the order of evaluation, using the costs of + * operator functions for individual columns. It does depend on the + * number of distinct values, etc. + */ + idx = 0; + foreach (lc, clauses) + { + bool match; + int index1 = indexes1[idx], + index2 = indexes2[idx]; + Datum value1, + value2; + bool reverse_args = reverse[idx]; + + /* If either value is null, it's a mismatch */ + if (mcv2->items[j].isnull[index2]) + match = false; + else + { + value1 = mcv1->items[i].values[index1]; + value2 = mcv2->items[j].values[index2]; + + /* + * Careful about order of parameters. For same-type equality + * that should not matter, but easy enough. + * + * FIXME Use appropriate collation. + */ + if (reverse_args) + match = DatumGetBool(FunctionCall2Coll(&opprocs[idx], + InvalidOid, + value2, value1)); + else + match = DatumGetBool(FunctionCall2Coll(&opprocs[idx], + InvalidOid, + value1, value2)); + } + + items_match &= match; + + if (!items_match) + break; + + idx++; + } + + if (items_match) + { + /* XXX Do we need to do something about base frequency? */ + matches1[i] = matches2[j] = true; + s += mcv1->items[i].frequency * mcv2->items[j].frequency; + } + } + } + + matchfreq1 = unmatchfreq1 = mcvfreq1 = 0.0; + for (i = 0; i < mcv1->nitems; i++) + { + mcvfreq1 += mcv1->items[i].frequency; + + /* ignore MCV items eliminated by baserel conditions */ + if (cmatches1 && !cmatches1[i]) + continue; + + if (matches1[i]) + matchfreq1 += mcv1->items[i].frequency; + else + unmatchfreq1 += mcv1->items[i].frequency; + } + + /* not represented by the MCV */ + otherfreq1 = 1.0 - mcvfreq1; + + matchfreq2 = unmatchfreq2 = mcvfreq2 = 0.0; + for (i = 0; i < mcv2->nitems; i++) + { + mcvfreq2 += mcv2->items[i].frequency; + + /* ignore MCV items eliminated by baserel conditions */ + if (cmatches2 && !cmatches2[i]) + continue; + + if (matches2[i]) + matchfreq2 += mcv2->items[i].frequency; + else + unmatchfreq2 += mcv2->items[i].frequency; + } + + /* not represented by the MCV */ + otherfreq2 = 1.0 - mcvfreq2; + + /* + * Correction for MCV parts eliminated by the conditions. + * + * We need to be careful about cases where conditions eliminated all + * the MCV items. We must not divide by 0.0, because that would either + * produce bogus value or trigger division by zero. Instead we simply + * set the selectivity to 0.0, because there can't be any matches. + */ + if ((matchfreq1 + unmatchfreq1) > 0) + s = s * mcvfreq1 / (matchfreq1 + unmatchfreq1); + else + s = 0.0; + + if ((matchfreq2 + unmatchfreq2) > 0) + s = s * mcvfreq2 / (matchfreq2 + unmatchfreq2); + else + s = 0.0; + + /* calculate ndistinct for the expression in join clauses for each rel */ + nd1 = estimate_num_groups(root, exprs1, rel1->rows, NULL, NULL); + nd2 = estimate_num_groups(root, exprs2, rel2->rows, NULL, NULL); + + /* + * Consider the part of the data not represented by the MCV lists. + * + * XXX this is a bit bogus, because we don't know what fraction of + * distinct combinations is covered by the MCV list (we're only + * dealing with some of the columns), so we can't use the same + * formular as eqjoinsel_inner exactly. We just use the estimates + * for the whole table - this is likely an overestimate, because + * (a) items may repeat in the MCV list, if it has more columns, + * and (b) some of the combinations may be present in non-MCV data. + * + * Moreover, we need to look at the conditions. For now we simply + * assume the conditions affect the distinct groups, and use that. + * + * XXX We might calculate the number of distinct groups in the MCV, + * and then use something between (nd1 - distinct(MCV)) and (nd1), + * which are the possible extreme values, assuming the estimates + * are accurate. Maybe mean or geometric mean would work? + * + * XXX Not sure multiplying ndistinct with probabilities is good. + * Maybe we should do something more like estimate_num_groups? + */ + nd1 *= csel1; + nd2 *= csel2; + + totalsel1 = s; + totalsel1 += unmatchfreq1 * otherfreq2 / nd2; + totalsel1 += otherfreq1 * (otherfreq2 + unmatchfreq2) / nd2; + +// if (nd2 > mcvb->nitems) +// totalsel1 += unmatchfreq1 * otherfreq2 / (nd2 - mcvb->nitems); +// if (nd2 > nmatches) +// totalsel1 += otherfreq1 * (otherfreq2 + unmatchfreq2) / +// (nd2 - nmatches); + + totalsel2 = s; + totalsel2 += unmatchfreq2 * otherfreq1 / nd1; + totalsel2 += otherfreq2 * (otherfreq1 + unmatchfreq1) / nd1; + +// if (nd1 > mcva->nitems) +// totalsel2 += unmatchfreq2 * otherfreq1 / (nd1 - mcva->nitems); +// if (nd1 > nmatches) +// totalsel2 += otherfreq2 * (otherfreq1 + unmatchfreq1) / +// (nd1 - nmatches); + + s = Min(totalsel1, totalsel2); + + return s; +} + + +/* + * statext_compare_simple + * Calculate join selectivity using a combination of extended + * statistics MCV on one side, and simple per-column MCV on the other. + * + * Most of the mcv_combine_extended comment applies here too, but we can make + * some simplifications because we know the second (per-column) MCV is simpler, + * contains no NULL or duplicate values, etc. + */ +Selectivity +mcv_combine_simple(PlannerInfo *root, RelOptInfo *rel, StatisticExtInfo *stat, + AttStatsSlot *sslot, double stanullfrac, double nd, + bool isdefault, Node *clause) +{ + MCVList *mcv; + int i, + j; + Selectivity s = 0; + + /* match bitmaps and selectivity for baserel conditions (if any) */ + List *conditions = NIL; + bool *cmatches = NULL; + + double csel = 1.0; + + bool *matches1 = NULL, + *matches2 = NULL; + + /* estimates for the two sides */ + double matchfreq1, + unmatchfreq1, + otherfreq1, + mcvfreq1, + nd1, + totalsel1; + + double matchfreq2, + unmatchfreq2, + otherfreq2, + mcvfreq2, + nd2, + totalsel2; + + List *exprs1 = NIL, + *exprs2 = NIL; + + /* info about clauses and how they match to MCV stats */ + FmgrInfo opproc; + int index = 0; + bool reverse = false; + RangeTblEntry *rte = root->simple_rte_array[rel->relid]; + + /* we picked the stats so that they have MCV enabled */ + Assert(stat->kind = STATS_EXT_MCV); + + mcv = statext_mcv_load(stat->statOid, rte->inh); + + /* should only get here with MCV on both sides */ + Assert(mcv); + + /* Determine which baserel clauses to use for conditional probability. */ + conditions = statext_determine_join_restrictions(root, rel, stat); + + /* + * Calculate match bitmaps for restrictions on either side of the join + * (there may be none, in which case this will be NULL). + */ + if (conditions) + { + cmatches = mcv_get_match_bitmap(root, conditions, + stat->keys, stat->exprs, + mcv, false); + csel = clauselist_selectivity(root, conditions, rel->relid, 0, NULL); + } + + /* + * Match bitmaps for matches between MCV elements. By default there + * are no matches, so we set all items to 0. + */ + matches1 = (bool *) palloc0(sizeof(bool) * mcv->nitems); + + /* Matches for the side with just regular single-column MCV. */ + matches2 = (bool *) palloc0(sizeof(bool) * sslot->nvalues); + + /* + * Initialize information about the clause and how it matches to the + * extended stats we picked. We do this only once before processing + * the lists, so that we don't have to do that for each item or so. + */ + { + OpExpr *opexpr; + Node *expr1, + *expr2; + Bitmapset *relids1, + *relids2; + + /* + * Strip the RestrictInfo node, get the actual clause. + * + * XXX Not sure if we need to care about removing other node types + * too (e.g. RelabelType etc.). statext_is_supported_join_clause + * matches this, but maybe we need to relax it? + */ + if (IsA(clause, RestrictInfo)) + clause = (Node *) ((RestrictInfo *) clause)->clause; + + opexpr = (OpExpr *) clause; + + /* Make sure we have the expected node type. */ + Assert(is_opclause(clause)); + Assert(list_length(opexpr->args) == 2); + + fmgr_info(get_opcode(opexpr->opno), &opproc); + + /* FIXME strip relabel etc. the way examine_opclause_args does */ + expr1 = linitial(opexpr->args); + expr2 = lsecond(opexpr->args); + + /* determine order of clauses (rel1 op rel2) or (rel2 op rel1) */ + relids1 = pull_varnos(root, expr1); + relids2 = pull_varnos(root, expr2); + + if (bms_singleton_member(relids1) == rel->relid) + { + Oid collid; + + index = mcv_match_expression(expr1, stat->keys, stat->exprs, + &collid); + reverse = false; + + exprs1 = lappend(exprs1, expr1); + exprs2 = lappend(exprs2, expr2); + } + else if (bms_singleton_member(relids2) == rel->relid) + { + Oid collid; + + index = mcv_match_expression(expr2, stat->keys, stat->exprs, + &collid); + reverse = true; + + exprs1 = lappend(exprs1, expr2); + exprs2 = lappend(exprs2, expr1); + } + else + /* should never happen */ + Assert(false); + + Assert((index >= 0) && + (index < bms_num_members(stat->keys) + list_length(stat->exprs))); + } + + /* + * Match items between the two MCV lists. + * + * We don't know if the join conditions match all attributes in the MCV, the + * overlap may be just on a subset of attributes, e.g. (a,b,c) vs. (b,c,d). + * So there may be multiple matches on either side. So we can't optimize by + * aborting the inner loop after the first match, etc. + * + * XXX We can skip the items eliminated by the base restrictions, of course. + * + * XXX We might optimize this in two ways. We might sort the MCV items on + * both sides using the "join" attributes, and then perform something like + * merge join. Or we might calculate a hash from the join columns, and then + * compare this (to eliminate the most expensive equality functions). + */ + for (i = 0; i < mcv->nitems; i++) + { + /* skip items eliminated by restrictions on rel1 */ + if (cmatches && !cmatches[i]) + continue; + + /* + * We can check mcv1->items[i].isnull[index1] here, because it'll be a + * mismatch anyway (the simple MCV does not contain NULLs). + */ + if (mcv->items[i].isnull[index]) + continue; + + /* find matches in the second MCV list */ + for (j = 0; j < sslot->nvalues; j++) + { + bool match; + Datum value1 = mcv->items[i].values[index]; + Datum value2 = sslot->values[j]; + + /* + * Evaluate the join clause between the two MCV lists. We don't + * need to deal with NULL values here - we've already checked for + * NULL in the extended statistics earlier, and the simple MCV + * does not contain NULL values. + * + * Careful about order of parameters. For same-type equality + * that should not matter, but easy enough. + * + * FIXME Use appropriate collation. + */ + if (reverse) + match = DatumGetBool(FunctionCall2Coll(&opproc, + InvalidOid, + value2, value1)); + else + match = DatumGetBool(FunctionCall2Coll(&opproc, + InvalidOid, + value1, value2)); + + if (match) + { + /* XXX Do we need to do something about base frequency? */ + matches1[i] = matches2[j] = true; + s += mcv->items[i].frequency * sslot->numbers[j]; + + /* + * We know there can be just a single match in the regular + * MCV list, so we can abort the inner loop. + */ + break; + } + } + } + + matchfreq1 = unmatchfreq1 = mcvfreq1 = 0.0; + for (i = 0; i < mcv->nitems; i++) + { + mcvfreq1 += mcv->items[i].frequency; + + /* ignore MCV items eliminated by baserel conditions */ + if (cmatches && !cmatches[i]) + continue; + + if (matches1[i]) + matchfreq1 += mcv->items[i].frequency; + else + unmatchfreq1 += mcv->items[i].frequency; + } + + /* not represented by the MCV */ + otherfreq1 = 1.0 - mcvfreq1; + + matchfreq2 = unmatchfreq2 = mcvfreq2 = 0.0; + for (i = 0; i < sslot->nvalues; i++) + { + mcvfreq2 += sslot->numbers[i]; + + if (matches2[i]) + matchfreq2 += sslot->numbers[i]; + else + unmatchfreq2 += sslot->numbers[i]; + } + + /* not represented by the MCV */ + otherfreq2 = 1.0 - mcvfreq2; + + /* + * Correction for MCV parts eliminated by the conditions. + * + * We need to be careful about cases where conditions eliminated all + * the MCV items. We must not divide by 0.0, because that would either + * produce bogus value or trigger division by zero. Instead we simply + * set the selectivity to 0.0, because there can't be any matches. + */ + if ((matchfreq1 + unmatchfreq1) > 0) + s = s * mcvfreq1 / (matchfreq1 + unmatchfreq1); + else + s = 0.0; + + if ((matchfreq2 + unmatchfreq2) > 0) + s = s * mcvfreq2 / (matchfreq2 + unmatchfreq2); + else + s = 0.0; + + /* calculate ndistinct for the expression in join clauses for each rel */ + nd1 = estimate_num_groups(root, exprs1, rel->rows, NULL, NULL); + nd2 = nd; + + /* + * Consider the part of the data not represented by the MCV lists. + * + * XXX this is a bit bogus, because we don't know what fraction of + * distinct combinations is covered by the MCV list (we're only + * dealing with some of the columns), so we can't use the same + * formular as eqjoinsel_inner exactly. We just use the estimates + * for the whole table - this is likely an overestimate, because + * (a) items may repeat in the MCV list, if it has more columns, + * and (b) some of the combinations may be present in non-MCV data. + * + * Moreover, we need to look at the conditions. For now we simply + * assume the conditions affect the distinct groups, and use that. + * + * XXX We might calculate the number of distinct groups in the MCV, + * and then use something between (nd1 - distinct(MCV)) and (nd1), + * which are the possible extreme values, assuming the estimates + * are accurate. Maybe mean or geometric mean would work? + * + * XXX Not sure multiplying ndistinct with probabilities is good. + * Maybe we should do something more like estimate_num_groups? + */ + nd1 *= csel; + + totalsel1 = s; + totalsel1 += unmatchfreq1 * otherfreq2 / nd2; + totalsel1 += otherfreq1 * (otherfreq2 + unmatchfreq2) / nd2; + +// if (nd2 > mcvb->nitems) +// totalsel1 += unmatchfreq1 * otherfreq2 / (nd2 - mcvb->nitems); +// if (nd2 > nmatches) +// totalsel1 += otherfreq1 * (otherfreq2 + unmatchfreq2) / +// (nd2 - nmatches); + + totalsel2 = s; + totalsel2 += unmatchfreq2 * otherfreq1 / nd1; + totalsel2 += otherfreq2 * (otherfreq1 + unmatchfreq1) / nd1; + +// if (nd1 > mcva->nitems) +// totalsel2 += unmatchfreq2 * otherfreq1 / (nd1 - mcva->nitems); +// if (nd1 > nmatches) +// totalsel2 += otherfreq2 * (otherfreq1 + unmatchfreq1) / +// (nd1 - nmatches); + + s = Min(totalsel1, totalsel2); + + return s; +} diff --git a/src/include/statistics/extended_stats_internal.h b/src/include/statistics/extended_stats_internal.h index 71f852c157b..d115c6c791c 100644 --- a/src/include/statistics/extended_stats_internal.h +++ b/src/include/statistics/extended_stats_internal.h @@ -15,6 +15,7 @@ #define EXTENDED_STATS_INTERNAL_H #include "statistics/statistics.h" +#include "utils/lsyscache.h" #include "utils/sortsupport.h" typedef struct @@ -127,4 +128,23 @@ extern Selectivity mcv_clause_selectivity_or(PlannerInfo *root, Selectivity *overlap_basesel, Selectivity *totalsel); +extern Selectivity mcv_combine_simple(PlannerInfo *root, + RelOptInfo *rel, + StatisticExtInfo *stat, + AttStatsSlot *sslot, + double stanullfrac, + double nd, bool isdefault, + Node *clause); + +extern Selectivity mcv_combine_extended(PlannerInfo *root, + RelOptInfo *rel1, + RelOptInfo *rel2, + StatisticExtInfo *stat1, + StatisticExtInfo *stat2, + List *clauses); + +extern List *statext_determine_join_restrictions(PlannerInfo *root, + RelOptInfo *rel, + StatisticExtInfo *info); + #endif /* EXTENDED_STATS_INTERNAL_H */ diff --git a/src/include/statistics/statistics.h b/src/include/statistics/statistics.h index bb7ef1240e0..c69cadff3a8 100644 --- a/src/include/statistics/statistics.h +++ b/src/include/statistics/statistics.h @@ -127,4 +127,16 @@ extern StatisticExtInfo *choose_best_statistics(List *stats, char requiredkind, int nclauses); extern HeapTuple statext_expressions_load(Oid stxoid, bool inh, int idx); +extern StatisticExtInfo *statext_find_matching_mcv(PlannerInfo *root, RelOptInfo *rel, + Bitmapset *attnums, List *exprs); + +extern bool statext_try_join_estimates(PlannerInfo *root, List *clauses, int varRelid, + JoinType jointype, SpecialJoinInfo *sjinfo, + Bitmapset *estimatedclauses); + +extern Selectivity statext_clauselist_join_selectivity(PlannerInfo *root, List *clauses, + int varRelid, + JoinType jointype, SpecialJoinInfo *sjinfo, + Bitmapset **estimatedclauses); + #endif /* STATISTICS_H */ diff --git a/src/test/regress/expected/stats_ext.out b/src/test/regress/expected/stats_ext.out index 042316aeed8..e5577e680f4 100644 --- a/src/test/regress/expected/stats_ext.out +++ b/src/test/regress/expected/stats_ext.out @@ -3045,6 +3045,173 @@ SELECT c0 FROM ONLY expr_stats_incompatible_test WHERE (0 rows) DROP TABLE expr_stats_incompatible_test; +-- Test join estimates. +CREATE TABLE join_test_1 (a int, b int, c int); +CREATE TABLE join_test_2 (a int, b int, c int); +INSERT INTO join_test_1 SELECT mod(i,10), mod(i,10), mod(i,10) FROM generate_series(1,1000) s(i); +INSERT INTO join_test_2 SELECT mod(i,10), mod(i,10), mod(i,10) FROM generate_series(1,1000) s(i); +ANALYZE join_test_1; +ANALYZE join_test_2; +SELECT * FROM check_estimated_rows('select * from join_test_1 j1 join join_test_2 j2 on ((j1.a + 1 = j2.a + 1) and (j1.b = j2.b))'); + estimated | actual +-----------+-------- + 500 | 100000 +(1 row) + +SELECT * FROM check_estimated_rows('select * from join_test_1 j1 join join_test_2 j2 on ((j1.a + 1 = j2.a + 1) and (j1.b = j2.b)) where j1.c < 5'); + estimated | actual +-----------+-------- + 250 | 50000 +(1 row) + +SELECT * FROM check_estimated_rows('select * from join_test_1 j1 join join_test_2 j2 on ((j1.a + 1 = j2.a + 1) and (j1.b = j2.b)) where j1.c < 0'); + estimated | actual +-----------+-------- + 1 | 0 +(1 row) + +SELECT * FROM check_estimated_rows('select * from join_test_1 j1 join join_test_2 j2 on ((j1.a + 1 = j2.a + 1) and (j1.b = j2.b)) where j1.c < 5 and j2.c < 3'); + estimated | actual +-----------+-------- + 75 | 30000 +(1 row) + +SELECT * FROM check_estimated_rows('select * from join_test_1 j1 join join_test_2 j2 on ((j1.a + 1 = j2.a + 1) and (j1.b = j2.b)) where j1.c < 5 and j2.c > 5'); + estimated | actual +-----------+-------- + 100 | 0 +(1 row) + +-- can't be improved due to the optimization in clauselist_selectivity_ext, +-- which skips cases with a single (join) clause +SELECT * FROM check_estimated_rows('select * from join_test_1 j1 join join_test_2 j2 on ((j1.a + 1 = j2.a + 1)) where j1.c < 5'); + estimated | actual +-----------+-------- + 2500 | 50000 +(1 row) + +SELECT * FROM check_estimated_rows('select * from join_test_1 j1 join join_test_2 j2 on ((j1.a + 1 = j2.a + 1)) where j1.c < 5 and j2.c < 5'); + estimated | actual +-----------+-------- + 1250 | 50000 +(1 row) + +SELECT * FROM check_estimated_rows('select * from join_test_1 j1 join join_test_2 j2 on ((j1.a + 1 = j2.a + 1)) where j1.c < 5 and j2.c > 5'); + estimated | actual +-----------+-------- + 1000 | 0 +(1 row) + +SELECT * FROM check_estimated_rows('select * from join_test_1 j1 join join_test_2 j2 on (j1.a + 1 = j2.a) where j1.c < 5'); + estimated | actual +-----------+-------- + 2500 | 50000 +(1 row) + +SELECT * FROM check_estimated_rows('select * from join_test_1 j1 join join_test_2 j2 on (j1.a + 1 = j2.a + 1) where j1.c < 5'); + estimated | actual +-----------+-------- + 2500 | 50000 +(1 row) + +-- create extended statistics on the join/where columns +CREATE STATISTICS join_stats_1 ON a, b, c, (a+1), (b+1) FROM join_test_1; +CREATE STATISTICS join_stats_2 ON a, b, c, (a+1), (b+1) FROM join_test_2; +ANALYZE join_test_1; +ANALYZE join_test_2; +SELECT * FROM check_estimated_rows('select * from join_test_1 j1 join join_test_2 j2 on ((j1.a + 1 = j2.a + 1) and (j1.b = j2.b))'); + estimated | actual +-----------+-------- + 100000 | 100000 +(1 row) + +SELECT * FROM check_estimated_rows('select * from join_test_1 j1 join join_test_2 j2 on ((j1.a + 1 = j2.a + 1) and (j1.b = j2.b)) where j1.c < 5'); + estimated | actual +-----------+-------- + 50000 | 50000 +(1 row) + +SELECT * FROM check_estimated_rows('select * from join_test_1 j1 join join_test_2 j2 on ((j1.a + 1 = j2.a + 1) and (j1.b = j2.b)) where j1.c < 0'); + estimated | actual +-----------+-------- + 1 | 0 +(1 row) + +SELECT * FROM check_estimated_rows('select * from join_test_1 j1 join join_test_2 j2 on ((j1.a + 1 = j2.a + 1) and (j1.b = j2.b)) where j1.c < 5 and j2.c < 3'); + estimated | actual +-----------+-------- + 30000 | 30000 +(1 row) + +SELECT * FROM check_estimated_rows('select * from join_test_1 j1 join join_test_2 j2 on ((j1.a + 1 = j2.a + 1) and (j1.b = j2.b)) where j1.c < 5 and j2.c > 5'); + estimated | actual +-----------+-------- + 1 | 0 +(1 row) + +-- can't be improved due to the optimization in clauselist_selectivity_ext, +-- which skips cases with a single (join) clause +SELECT * FROM check_estimated_rows('select * from join_test_1 j1 join join_test_2 j2 on ((j1.a + 1 = j2.a + 1)) where j1.c < 5'); + estimated | actual +-----------+-------- + 50000 | 50000 +(1 row) + +SELECT * FROM check_estimated_rows('select * from join_test_1 j1 join join_test_2 j2 on ((j1.a + 1 = j2.a + 1)) where j1.c < 5 and j2.c < 5'); + estimated | actual +-----------+-------- + 50000 | 50000 +(1 row) + +SELECT * FROM check_estimated_rows('select * from join_test_1 j1 join join_test_2 j2 on ((j1.a + 1 = j2.a + 1)) where j1.c < 5 and j2.c > 5'); + estimated | actual +-----------+-------- + 1 | 0 +(1 row) + +SELECT * FROM check_estimated_rows('select * from join_test_1 j1 join join_test_2 j2 on (j1.a + 1 = j2.a) where j1.c < 5'); + estimated | actual +-----------+-------- + 50000 | 50000 +(1 row) + +SELECT * FROM check_estimated_rows('select * from join_test_1 j1 join join_test_2 j2 on (j1.a + 1 = j2.a + 1) where j1.c < 5'); + estimated | actual +-----------+-------- + 50000 | 50000 +(1 row) + +-- try combining with single-column (and single-expression) statistics +DROP STATISTICS join_stats_2; +SELECT * FROM check_estimated_rows('select * from join_test_1 j1 join join_test_2 j2 on (j1.a + 1 = j2.a) where j1.c < 5'); + estimated | actual +-----------+-------- + 50000 | 50000 +(1 row) + +-- no MCV on join_test_2 (on the (a+1) expression) +SELECT * FROM check_estimated_rows('select * from join_test_1 j1 join join_test_2 j2 on (j1.a + 1 = j2.a + 1) where j1.c < 5'); + estimated | actual +-----------+-------- + 2500 | 50000 +(1 row) + +CREATE STATISTICS join_stats_2 ON (a+1) FROM join_test_2; +ANALYZE join_test_2; +SELECT * FROM check_estimated_rows('select * from join_test_1 j1 join join_test_2 j2 on (j1.a + 1 = j2.a) where j1.c < 5'); + estimated | actual +-----------+-------- + 50000 | 50000 +(1 row) + +SELECT * FROM check_estimated_rows('select * from join_test_1 j1 join join_test_2 j2 on (j1.a + 1 = j2.a + 1) where j1.c < 5'); + estimated | actual +-----------+-------- + 50000 | 50000 +(1 row) + +DROP TABLE join_test_1; +DROP TABLE join_test_2; -- Permission tests. Users should not be able to see specific data values in -- the extended statistics, if they lack permission to see those values in -- the underlying table. diff --git a/src/test/regress/sql/stats_ext.sql b/src/test/regress/sql/stats_ext.sql index 6b954c9e500..4fb2c518d2c 100644 --- a/src/test/regress/sql/stats_ext.sql +++ b/src/test/regress/sql/stats_ext.sql @@ -1533,6 +1533,72 @@ SELECT c0 FROM ONLY expr_stats_incompatible_test WHERE DROP TABLE expr_stats_incompatible_test; + +-- Test join estimates. +CREATE TABLE join_test_1 (a int, b int, c int); +CREATE TABLE join_test_2 (a int, b int, c int); + +INSERT INTO join_test_1 SELECT mod(i,10), mod(i,10), mod(i,10) FROM generate_series(1,1000) s(i); +INSERT INTO join_test_2 SELECT mod(i,10), mod(i,10), mod(i,10) FROM generate_series(1,1000) s(i); + +ANALYZE join_test_1; +ANALYZE join_test_2; + +SELECT * FROM check_estimated_rows('select * from join_test_1 j1 join join_test_2 j2 on ((j1.a + 1 = j2.a + 1) and (j1.b = j2.b))'); +SELECT * FROM check_estimated_rows('select * from join_test_1 j1 join join_test_2 j2 on ((j1.a + 1 = j2.a + 1) and (j1.b = j2.b)) where j1.c < 5'); +SELECT * FROM check_estimated_rows('select * from join_test_1 j1 join join_test_2 j2 on ((j1.a + 1 = j2.a + 1) and (j1.b = j2.b)) where j1.c < 0'); +SELECT * FROM check_estimated_rows('select * from join_test_1 j1 join join_test_2 j2 on ((j1.a + 1 = j2.a + 1) and (j1.b = j2.b)) where j1.c < 5 and j2.c < 3'); +SELECT * FROM check_estimated_rows('select * from join_test_1 j1 join join_test_2 j2 on ((j1.a + 1 = j2.a + 1) and (j1.b = j2.b)) where j1.c < 5 and j2.c > 5'); + +-- can't be improved due to the optimization in clauselist_selectivity_ext, +-- which skips cases with a single (join) clause +SELECT * FROM check_estimated_rows('select * from join_test_1 j1 join join_test_2 j2 on ((j1.a + 1 = j2.a + 1)) where j1.c < 5'); +SELECT * FROM check_estimated_rows('select * from join_test_1 j1 join join_test_2 j2 on ((j1.a + 1 = j2.a + 1)) where j1.c < 5 and j2.c < 5'); +SELECT * FROM check_estimated_rows('select * from join_test_1 j1 join join_test_2 j2 on ((j1.a + 1 = j2.a + 1)) where j1.c < 5 and j2.c > 5'); + +SELECT * FROM check_estimated_rows('select * from join_test_1 j1 join join_test_2 j2 on (j1.a + 1 = j2.a) where j1.c < 5'); +SELECT * FROM check_estimated_rows('select * from join_test_1 j1 join join_test_2 j2 on (j1.a + 1 = j2.a + 1) where j1.c < 5'); + +-- create extended statistics on the join/where columns +CREATE STATISTICS join_stats_1 ON a, b, c, (a+1), (b+1) FROM join_test_1; +CREATE STATISTICS join_stats_2 ON a, b, c, (a+1), (b+1) FROM join_test_2; + +ANALYZE join_test_1; +ANALYZE join_test_2; + +SELECT * FROM check_estimated_rows('select * from join_test_1 j1 join join_test_2 j2 on ((j1.a + 1 = j2.a + 1) and (j1.b = j2.b))'); +SELECT * FROM check_estimated_rows('select * from join_test_1 j1 join join_test_2 j2 on ((j1.a + 1 = j2.a + 1) and (j1.b = j2.b)) where j1.c < 5'); +SELECT * FROM check_estimated_rows('select * from join_test_1 j1 join join_test_2 j2 on ((j1.a + 1 = j2.a + 1) and (j1.b = j2.b)) where j1.c < 0'); +SELECT * FROM check_estimated_rows('select * from join_test_1 j1 join join_test_2 j2 on ((j1.a + 1 = j2.a + 1) and (j1.b = j2.b)) where j1.c < 5 and j2.c < 3'); +SELECT * FROM check_estimated_rows('select * from join_test_1 j1 join join_test_2 j2 on ((j1.a + 1 = j2.a + 1) and (j1.b = j2.b)) where j1.c < 5 and j2.c > 5'); + +-- can't be improved due to the optimization in clauselist_selectivity_ext, +-- which skips cases with a single (join) clause +SELECT * FROM check_estimated_rows('select * from join_test_1 j1 join join_test_2 j2 on ((j1.a + 1 = j2.a + 1)) where j1.c < 5'); +SELECT * FROM check_estimated_rows('select * from join_test_1 j1 join join_test_2 j2 on ((j1.a + 1 = j2.a + 1)) where j1.c < 5 and j2.c < 5'); +SELECT * FROM check_estimated_rows('select * from join_test_1 j1 join join_test_2 j2 on ((j1.a + 1 = j2.a + 1)) where j1.c < 5 and j2.c > 5'); + +SELECT * FROM check_estimated_rows('select * from join_test_1 j1 join join_test_2 j2 on (j1.a + 1 = j2.a) where j1.c < 5'); +SELECT * FROM check_estimated_rows('select * from join_test_1 j1 join join_test_2 j2 on (j1.a + 1 = j2.a + 1) where j1.c < 5'); + +-- try combining with single-column (and single-expression) statistics +DROP STATISTICS join_stats_2; + +SELECT * FROM check_estimated_rows('select * from join_test_1 j1 join join_test_2 j2 on (j1.a + 1 = j2.a) where j1.c < 5'); + +-- no MCV on join_test_2 (on the (a+1) expression) +SELECT * FROM check_estimated_rows('select * from join_test_1 j1 join join_test_2 j2 on (j1.a + 1 = j2.a + 1) where j1.c < 5'); + +CREATE STATISTICS join_stats_2 ON (a+1) FROM join_test_2; +ANALYZE join_test_2; + +SELECT * FROM check_estimated_rows('select * from join_test_1 j1 join join_test_2 j2 on (j1.a + 1 = j2.a) where j1.c < 5'); +SELECT * FROM check_estimated_rows('select * from join_test_1 j1 join join_test_2 j2 on (j1.a + 1 = j2.a + 1) where j1.c < 5'); + + +DROP TABLE join_test_1; +DROP TABLE join_test_2; + -- Permission tests. Users should not be able to see specific data values in -- the extended statistics, if they lack permission to see those values in -- the underlying table. -- 2.17.1