On Wed, 18 Mar 2020 at 00:29, Tomas Vondra <tomas.von...@2ndquadrant.com> wrote: > > OK, I took a look. I think from the correctness POV the patch is OK, but > I think the dependencies_clauselist_selectivity() function now got a bit > too complex. I've been able to parse it now, but I'm sure I'll have > trouble in the future :-( > > Can we refactor / split it somehow and move bits of the logic to smaller > functions, or something like that? >
Yeah, it has gotten a bit long. It's somewhat tricky splitting it up, because of the number of shared variables used throughout the function, but here's an updated patch splitting it into what seemed like the 2 most logical pieces. The first piece (still in dependencies_clauselist_selectivity()) works out what dependencies can/should be applied, and the second piece in a new function does the actual work of applying the list of functional dependencies to the clause list. I think that has made it easier to follow, and it has also reduced the complexity of the final "no applicable stats" branch. > Another thing I'd like to suggest is keeping the "old" formula, and > instead of just replacing it with > > P(a,b) = f * Min(P(a), P(b)) + (1-f) * P(a) * P(b) > > but explaining how the old formula may produce nonsensical selectivity, > and how the new formula addresses that issue. > I think this is purely a comment issue? I've added some more extensive comments attempting to justify the formulae. Regards, Dean
diff --git a/src/backend/statistics/dependencies.c b/src/backend/statistics/dependencies.c new file mode 100644 index 5f9b43b..18cce60 --- a/src/backend/statistics/dependencies.c +++ b/src/backend/statistics/dependencies.c @@ -30,6 +30,7 @@ #include "utils/fmgroids.h" #include "utils/fmgrprotos.h" #include "utils/lsyscache.h" +#include "utils/selfuncs.h" #include "utils/syscache.h" #include "utils/typcache.h" @@ -73,13 +74,18 @@ static double dependency_degree(int numr AttrNumber *dependency, VacAttrStats **stats, Bitmapset *attrs); static bool dependency_is_fully_matched(MVDependency *dependency, Bitmapset *attnums); -static bool dependency_implies_attribute(MVDependency *dependency, - AttrNumber attnum); static bool dependency_is_compatible_clause(Node *clause, Index relid, AttrNumber *attnum); static MVDependency *find_strongest_dependency(MVDependencies **dependencies, int ndependencies, Bitmapset *attnums); +static Selectivity deps_clauselist_selectivity(PlannerInfo *root, List *clauses, + int varRelid, JoinType jointype, + SpecialJoinInfo *sjinfo, + MVDependency **dependencies, + int ndependencies, + AttrNumber *list_attnums, + Bitmapset **estimatedclauses); static void generate_dependencies_recurse(DependencyGenerator state, int index, @@ -614,19 +620,6 @@ dependency_is_fully_matched(MVDependency } /* - * dependency_implies_attribute - * check that the attnum matches is implied by the functional dependency - */ -static bool -dependency_implies_attribute(MVDependency *dependency, AttrNumber attnum) -{ - if (attnum == dependency->attributes[dependency->nattributes - 1]) - return true; - - return false; -} - -/* * statext_dependencies_load * Load the functional dependencies for the indicated pg_statistic_ext tuple */ @@ -986,6 +979,182 @@ find_strongest_dependency(MVDependencies } /* + * deps_clauselist_selectivity + * Return the estimated selectivity of (a subset of) the given clauses + * using the given functional dependencies. This is the guts of + * dependencies_clauselist_selectivity(). + * + * This will estimate all not-already-estimated clauses that are compatible + * with functional dependencies, and which have an attribute mentioned by any + * of the given dependencies (either as an implying or implied attribute). + * + * Given (lists of) clauses on attributes (a,b) and a functional dependency + * (a=>b), the per-column selectivities P(a) and P(b) are notionally combined + * using the formula + * + * P(a,b) = f * P(a) + (1-f) * P(a) * P(b) + * + * where 'f' is the degree of dependency. This reflects the fact that we + * expect a fraction f of all rows to be consistent with the dependency + * (a=>b), and so have a selectivity of P(a), while the remaining rows are + * treated as independent. + * + * In practice, we use a slightly modified version of this formula, which uses + * a selectivity of Min(P(a), P(b)) for the dependent rows, since the result + * should obviously not exceed either column's individual selectivity. I.e., + * we actually combine selectivities using the formula + * + * P(a,b) = f * Min(P(a), P(b)) + (1-f) * P(a) * P(b) + * + * This can make quite a difference if the specific values matching the + * clauses are not consistent with the functional dependency. + */ +static Selectivity +deps_clauselist_selectivity(PlannerInfo *root, List *clauses, + int varRelid, JoinType jointype, + SpecialJoinInfo *sjinfo, + MVDependency **dependencies, int ndependencies, + AttrNumber *list_attnums, + Bitmapset **estimatedclauses) +{ + Bitmapset *attnums; + int i; + int j; + int nattrs; + Selectivity *attr_sel; + int attidx; + int listidx; + ListCell *l; + Selectivity s1; + + /* + * Extract the attnums of all implying and implied attributes from all the + * given dependencies. Each of these attributes is expected to have at + * least 1 not-already-estimated compatible clause that we will estimate + * here. + */ + attnums = NULL; + for (i = 0; i < ndependencies; i++) + { + for (j = 0; j < dependencies[i]->nattributes; j++) + { + AttrNumber attnum = dependencies[i]->attributes[j]; + attnums = bms_add_member(attnums, attnum); + } + } + + /* + * Compute per-column selectivity estimates for each of these attributes, + * and mark all the corresponding clauses as estimated. + */ + nattrs = bms_num_members(attnums); + attr_sel = (Selectivity *) palloc(sizeof(Selectivity) * nattrs); + + attidx = 0; + i = -1; + while ((i = bms_next_member(attnums, i)) >= 0) + { + List *attr_clauses = NIL; + Selectivity simple_sel; + + listidx = -1; + foreach(l, clauses) + { + Node *clause = (Node *) lfirst(l); + + listidx++; + if (list_attnums[listidx] == i) + { + attr_clauses = lappend(attr_clauses, clause); + *estimatedclauses = bms_add_member(*estimatedclauses, listidx); + } + } + + simple_sel = clauselist_selectivity_simple(root, attr_clauses, varRelid, + jointype, sjinfo, NULL); + attr_sel[attidx++] = simple_sel; + } + + /* + * Now combine these selectivities using the dependency information. For + * chains of dependencies such as a -> b -> c, the b -> c dependency will + * come before the a -> b dependency in the array, so we traverse the + * array backwards to ensure such chains are computed in the right order. + * + * As explained above, pairs of selectivities are combined using the + * formula + * + * P(a,b) = f * Min(P(a), P(b)) + (1-f) * P(a) * P(b) + * + * to ensure that the combined selectivity is never greater than either + * individual selectivity. + * + * Where multiple dependencies apply (e.g., a -> b -> c), we use + * conditional probabilities to compute the overall result as follows: + * + * P(a,b,c) = P(c|a,b) * P(a,b) = P(c|a,b) * P(b|a) * P(a) + * + * so we replace the selectivities of all implied attributes with + * conditional probabilities, that are conditional on all their implying + * attributes. The selectivities of all other non-implied attributes are + * left as they are. + */ + for (i = ndependencies - 1; i >= 0; i--) + { + MVDependency *dependency = dependencies[i]; + AttrNumber attnum; + Selectivity s2; + double f; + + /* Selectivity of all the implying attributes */ + s1 = 1.0; + for (j = 0; j < dependency->nattributes - 1; j++) + { + attnum = dependency->attributes[j]; + attidx = bms_member_index(attnums, attnum); + s1 *= attr_sel[attidx]; + } + + /* Original selectivity of the implied attribute */ + attnum = dependency->attributes[j]; + attidx = bms_member_index(attnums, attnum); + s2 = attr_sel[attidx]; + + /* + * Replace s2 with the conditional probability s2 given s1, computed + * using the formula P(b|a) = P(a,b) / P(a), which simplifies to + * + * P(b|a) = f * Min(P(a), P(b)) / P(a) + (1-f) * P(b) + * + * where P(a) = s1, the selectivity of the implying attributes, and + * P(b) = s2, the selectivity of the implied attribute. + */ + f = dependency->degree; + + if (s1 <= s2) + attr_sel[attidx] = f + (1 - f) * s2; + else + attr_sel[attidx] = f * s2 / s1 + (1 -f) * s2; + } + + /* + * The overall selectivity of all the clauses on all these attributes is + * then the product of all the original (non-implied) probabilities and + * the new conditional (implied) probabilities. + */ + s1 = 1.0; + for (i = 0; i < nattrs; i++) + s1 *= attr_sel[i]; + + CLAMP_PROBABILITY(s1); + + pfree(attr_sel); + bms_free(attnums); + + return s1; +} + +/* * dependencies_clauselist_selectivity * Return the estimated selectivity of (a subset of) the given clauses * using functional dependency statistics, or 1.0 if no useful functional @@ -999,15 +1168,16 @@ find_strongest_dependency(MVDependencies * between them, i.e. either (a=>b) or (b=>a). Assuming (a=>b) is the selected * dependency, we then combine the per-clause selectivities using the formula * - * P(a,b) = P(a) * [f + (1-f)*P(b)] + * P(a,b) = f * P(a) + (1-f) * P(a) * P(b) * - * where 'f' is the degree of the dependency. + * where 'f' is the degree of the dependency. (Actually we use a slightly + * modified version of this formula -- see deps_clauselist_selectivity()). * * With clauses on more than two attributes, the dependencies are applied * recursively, starting with the widest/strongest dependencies. For example * P(a,b,c) is first split like this: * - * P(a,b,c) = P(a,b) * [f + (1-f)*P(c)] + * P(a,b,c) = f * P(a,b) + (1-f) * P(a,b) * P(c) * * assuming (a,b=>c) is the strongest dependency. */ @@ -1023,17 +1193,20 @@ dependencies_clauselist_selectivity(Plan Selectivity s1 = 1.0; ListCell *l; Bitmapset *clauses_attnums = NULL; - Bitmapset **list_attnums; + AttrNumber *list_attnums; int listidx; - MVDependencies **dependencies = NULL; - int ndependencies = 0; + MVDependencies **func_dependencies; + int nfunc_dependencies; + int total_ndeps; + MVDependency **dependencies; + int ndependencies; int i; /* check if there's any stats that might be useful for us. */ if (!has_stats_of_kind(rel->statlist, STATS_EXT_DEPENDENCIES)) return 1.0; - list_attnums = (Bitmapset **) palloc(sizeof(Bitmapset *) * + list_attnums = (AttrNumber *) palloc(sizeof(AttrNumber) * list_length(clauses)); /* @@ -1056,11 +1229,11 @@ dependencies_clauselist_selectivity(Plan if (!bms_is_member(listidx, *estimatedclauses) && dependency_is_compatible_clause(clause, rel->relid, &attnum)) { - list_attnums[listidx] = bms_make_singleton(attnum); + list_attnums[listidx] = attnum; clauses_attnums = bms_add_member(clauses_attnums, attnum); } else - list_attnums[listidx] = NULL; + list_attnums[listidx] = InvalidAttrNumber; listidx++; } @@ -1072,6 +1245,7 @@ dependencies_clauselist_selectivity(Plan */ if (bms_num_members(clauses_attnums) < 2) { + bms_free(clauses_attnums); pfree(list_attnums); return 1.0; } @@ -1087,9 +1261,10 @@ dependencies_clauselist_selectivity(Plan * to make it just the right size, but it's likely wasteful anyway thanks * to moving the freed chunks to freelists etc. */ - ndependencies = 0; - dependencies = (MVDependencies **) palloc(sizeof(MVDependencies *) * - list_length(rel->statlist)); + func_dependencies = (MVDependencies **) palloc(sizeof(MVDependencies *) * + list_length(rel->statlist)); + nfunc_dependencies = 0; + total_ndeps = 0; foreach(l,rel->statlist) { @@ -1109,104 +1284,65 @@ dependencies_clauselist_selectivity(Plan if (num_matched < 2) continue; - dependencies[ndependencies++] + func_dependencies[nfunc_dependencies] = statext_dependencies_load(stat->statOid); + + total_ndeps += func_dependencies[nfunc_dependencies]->ndeps; + nfunc_dependencies++; } /* if no matching stats could be found then we've nothing to do */ - if (!ndependencies) + if (nfunc_dependencies == 0) { + pfree(func_dependencies); + bms_free(clauses_attnums); pfree(list_attnums); return 1.0; } /* - * Apply the dependencies recursively, starting with the widest/strongest - * ones, and proceeding to the smaller/weaker ones. At the end of each - * round we factor in the selectivity of clauses on the implied attribute, - * and remove the clauses from the list. + * Work out which dependencies we can apply, starting with the + * widest/stongest ones, and proceeding to smaller/weaker ones. */ + dependencies = (MVDependency **) palloc(sizeof(MVDependency *) * + total_ndeps); + ndependencies = 0; + while (true) { - Selectivity s2 = 1.0; MVDependency *dependency; + AttrNumber attnum; /* the widest/strongest dependency, fully matched by clauses */ - dependency = find_strongest_dependency(dependencies, ndependencies, + dependency = find_strongest_dependency(func_dependencies, + nfunc_dependencies, clauses_attnums); - - /* if no suitable dependency was found, we're done */ if (!dependency) break; - /* - * We found an applicable dependency, so find all the clauses on the - * implied attribute - with dependency (a,b => c) we look for clauses - * on 'c'. - */ - listidx = -1; - foreach(l, clauses) - { - Node *clause; - AttrNumber attnum; - - listidx++; - - /* - * Skip incompatible clauses, and ones we've already estimated on. - */ - if (!list_attnums[listidx]) - continue; - - /* - * We expect the bitmaps ton contain a single attribute number. - */ - attnum = bms_singleton_member(list_attnums[listidx]); - - /* - * Technically we could find more than one clause for a given - * attnum. Since these clauses must be equality clauses, we choose - * to only take the selectivity estimate from the final clause in - * the list for this attnum. If the attnum happens to be compared - * to a different Const in another clause then no rows will match - * anyway. If it happens to be compared to the same Const, then - * ignoring the additional clause is just the thing to do. - */ - if (dependency_implies_attribute(dependency, attnum)) - { - clause = (Node *) lfirst(l); - - s2 = clause_selectivity(root, clause, varRelid, jointype, - sjinfo); - - /* mark this one as done, so we don't touch it again. */ - *estimatedclauses = bms_add_member(*estimatedclauses, listidx); - - /* - * Mark that we've got and used the dependency on this clause. - * We'll want to ignore this when looking for the next - * strongest dependency above. - */ - clauses_attnums = bms_del_member(clauses_attnums, attnum); - } - } + dependencies[ndependencies++] = dependency; - /* - * Now factor in the selectivity for all the "implied" clauses into - * the final one, using this formula: - * - * P(a,b) = P(a) * (f + (1-f) * P(b)) - * - * where 'f' is the degree of validity of the dependency. - */ - s1 *= (dependency->degree + (1 - dependency->degree) * s2); + /* Ignore dependencies using this implied attribute in later loops */ + attnum = dependency->attributes[dependency->nattributes - 1]; + clauses_attnums = bms_del_member(clauses_attnums, attnum); } + /* + * If we found applicable dependencies, use them to estimate all + * compatible clauses on attributes that they refer to. + */ + if (ndependencies != 0) + s1 = deps_clauselist_selectivity(root, clauses, varRelid, jointype, + sjinfo, dependencies, ndependencies, + list_attnums, estimatedclauses); + /* free deserialized functional dependencies (and then the array) */ - for (i = 0; i < ndependencies; i++) - pfree(dependencies[i]); + for (i = 0; i < nfunc_dependencies; i++) + pfree(func_dependencies[i]); pfree(dependencies); + pfree(func_dependencies); + bms_free(clauses_attnums); pfree(list_attnums); return s1; diff --git a/src/test/regress/expected/stats_ext.out b/src/test/regress/expected/stats_ext.out new file mode 100644 index bd6d8be..f2f4008 --- a/src/test/regress/expected/stats_ext.out +++ b/src/test/regress/expected/stats_ext.out @@ -440,6 +440,12 @@ SELECT * FROM check_estimated_rows('SELE 8 | 200 (1 row) +SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a IN (1, 2, 51, 52) AND b = ''1'''); + estimated | actual +-----------+-------- + 4 | 100 +(1 row) + SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a IN (1, 26, 51, 76) AND b IN (''1'', ''26'') AND c = 1'); estimated | actual -----------+-------- @@ -600,6 +606,12 @@ SELECT * FROM check_estimated_rows('SELE 200 | 200 (1 row) +SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a IN (1, 2, 51, 52) AND b = ''1'''); + estimated | actual +-----------+-------- + 100 | 100 +(1 row) + SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a IN (1, 26, 51, 76) AND b IN (''1'', ''26'') AND c = 1'); estimated | actual -----------+-------- @@ -719,12 +731,14 @@ SELECT * FROM check_estimated_rows('SELE 1 | 0 (1 row) --- check change of column type doesn't break it +-- changing the type of column c causes its single-column stats to be dropped, +-- giving a default estimate of 0.005 * 5000 = 25 for (c = 1); check multiple +-- clauses estimated with functional dependencies does not exceed this ALTER TABLE functional_dependencies ALTER COLUMN c TYPE numeric; SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a = 1 AND b = ''1'' AND c = 1'); estimated | actual -----------+-------- - 50 | 50 + 25 | 50 (1 row) ANALYZE functional_dependencies; diff --git a/src/test/regress/sql/stats_ext.sql b/src/test/regress/sql/stats_ext.sql new file mode 100644 index 897ed30..f3b652d --- a/src/test/regress/sql/stats_ext.sql +++ b/src/test/regress/sql/stats_ext.sql @@ -280,6 +280,8 @@ SELECT * FROM check_estimated_rows('SELE SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a IN (1, 2, 51, 52) AND b IN (''1'', ''2'')'); +SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a IN (1, 2, 51, 52) AND b = ''1'''); + SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a IN (1, 26, 51, 76) AND b IN (''1'', ''26'') AND c = 1'); SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a IN (1, 26, 51, 76) AND b IN (''1'', ''26'') AND c IN (1)'); @@ -342,6 +344,8 @@ SELECT * FROM check_estimated_rows('SELE SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a IN (1, 2, 51, 52) AND b IN (''1'', ''2'')'); +SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a IN (1, 2, 51, 52) AND b = ''1'''); + SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a IN (1, 26, 51, 76) AND b IN (''1'', ''26'') AND c = 1'); SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a IN (1, 26, 51, 76) AND b IN (''1'', ''26'') AND c IN (1)'); @@ -385,7 +389,9 @@ SELECT * FROM check_estimated_rows('SELE SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a IN (1, 2, 51, 52) AND b = ALL (ARRAY[''1'', ''2''])'); --- check change of column type doesn't break it +-- changing the type of column c causes its single-column stats to be dropped, +-- giving a default estimate of 0.005 * 5000 = 25 for (c = 1); check multiple +-- clauses estimated with functional dependencies does not exceed this ALTER TABLE functional_dependencies ALTER COLUMN c TYPE numeric; SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a = 1 AND b = ''1'' AND c = 1');