> On Thu, Feb 06, 2020 at 09:22:20PM +0900, Kyotaro Horiguchi wrote: > At Thu, 6 Feb 2020 11:57:07 +0100, Dmitry Dolgov <9erthali...@gmail.com> > wrote in > > > On Thu, Feb 06, 2020 at 10:24:50AM +0900, Kyotaro Horiguchi wrote: > > > At Wed, 5 Feb 2020 17:37:30 +0100, Dmitry Dolgov <9erthali...@gmail.com> > > > wrote in > > > We could add an additional parameter "in_cursor" to > > > ExecSupportBackwardScan and let skip scan return false if in_cursor is > > > true, but I'm not sure it's acceptable. > > > > I also was thinking about whether it's possible to use > > ExecSupportBackwardScan here, but skip scan is just a mode of an > > index/indexonly scan. Which means that ExecSupportBackwardScan also need > > to know somehow if this mode is being used, and then, since this > > function is called after it's already decided to use skip scan in the > > resulting plan, somehow correct the plan (exclude skipping and try to > > find next best path?) - do I understand your suggestion correct? > > I didn't thought so hardly, but a bit of confirmation told me that > IndexSupportsBackwardScan returns fixed flag for AM. It seems that > things are not that simple.
Yes, I've mentioned that already in one of the previous emails :) The simplest way I see to achieve what we want is to do something like in attached modified version with a new hasDeclaredCursor field. It's not a final version though, but posted just for discussion, so feel free to suggest any improvements or alternatives.
>From 22e6b4ccd5f79ca069bd5cd90ba3696dd97f76ea Mon Sep 17 00:00:00 2001 From: jesperpedersen <jesper.peder...@redhat.com> Date: Tue, 9 Jul 2019 06:44:57 -0400 Subject: [PATCH v33 1/2] Unique key Design by David Rowley. Author: Jesper Pedersen --- src/backend/nodes/outfuncs.c | 14 +++ src/backend/nodes/print.c | 39 +++++++ src/backend/optimizer/path/Makefile | 3 +- src/backend/optimizer/path/allpaths.c | 8 ++ src/backend/optimizer/path/indxpath.c | 41 +++++++ src/backend/optimizer/path/pathkeys.c | 71 ++++++++++-- src/backend/optimizer/path/uniquekey.c | 147 +++++++++++++++++++++++++ src/backend/optimizer/plan/planagg.c | 1 + src/backend/optimizer/plan/planmain.c | 1 + src/backend/optimizer/plan/planner.c | 17 ++- src/backend/optimizer/util/pathnode.c | 12 ++ src/include/nodes/nodes.h | 1 + src/include/nodes/pathnodes.h | 18 +++ src/include/nodes/print.h | 1 + src/include/optimizer/pathnode.h | 1 + src/include/optimizer/paths.h | 11 ++ 16 files changed, 373 insertions(+), 13 deletions(-) create mode 100644 src/backend/optimizer/path/uniquekey.c diff --git a/src/backend/nodes/outfuncs.c b/src/backend/nodes/outfuncs.c index d76fae44b8..16083e7a7e 100644 --- a/src/backend/nodes/outfuncs.c +++ b/src/backend/nodes/outfuncs.c @@ -1723,6 +1723,7 @@ _outPathInfo(StringInfo str, const Path *node) WRITE_FLOAT_FIELD(startup_cost, "%.2f"); WRITE_FLOAT_FIELD(total_cost, "%.2f"); WRITE_NODE_FIELD(pathkeys); + WRITE_NODE_FIELD(uniquekeys); } /* @@ -2205,6 +2206,7 @@ _outPlannerInfo(StringInfo str, const PlannerInfo *node) WRITE_NODE_FIELD(eq_classes); WRITE_BOOL_FIELD(ec_merging_done); WRITE_NODE_FIELD(canon_pathkeys); + WRITE_NODE_FIELD(canon_uniquekeys); WRITE_NODE_FIELD(left_join_clauses); WRITE_NODE_FIELD(right_join_clauses); WRITE_NODE_FIELD(full_join_clauses); @@ -2214,6 +2216,7 @@ _outPlannerInfo(StringInfo str, const PlannerInfo *node) WRITE_NODE_FIELD(placeholder_list); WRITE_NODE_FIELD(fkey_list); WRITE_NODE_FIELD(query_pathkeys); + WRITE_NODE_FIELD(query_uniquekeys); WRITE_NODE_FIELD(group_pathkeys); WRITE_NODE_FIELD(window_pathkeys); WRITE_NODE_FIELD(distinct_pathkeys); @@ -2401,6 +2404,14 @@ _outPathKey(StringInfo str, const PathKey *node) WRITE_BOOL_FIELD(pk_nulls_first); } +static void +_outUniqueKey(StringInfo str, const UniqueKey *node) +{ + WRITE_NODE_TYPE("UNIQUEKEY"); + + WRITE_NODE_FIELD(eq_clause); +} + static void _outPathTarget(StringInfo str, const PathTarget *node) { @@ -4092,6 +4103,9 @@ outNode(StringInfo str, const void *obj) case T_PathKey: _outPathKey(str, obj); break; + case T_UniqueKey: + _outUniqueKey(str, obj); + break; case T_PathTarget: _outPathTarget(str, obj); break; diff --git a/src/backend/nodes/print.c b/src/backend/nodes/print.c index 42476724d8..d286b34544 100644 --- a/src/backend/nodes/print.c +++ b/src/backend/nodes/print.c @@ -459,6 +459,45 @@ print_pathkeys(const List *pathkeys, const List *rtable) printf(")\n"); } +/* + * print_uniquekeys - + * uniquekeys list of UniqueKeys + */ +void +print_uniquekeys(const List *uniquekeys, const List *rtable) +{ + ListCell *l; + + printf("("); + foreach(l, uniquekeys) + { + UniqueKey *unique_key = (UniqueKey *) lfirst(l); + EquivalenceClass *eclass = (EquivalenceClass *) unique_key->eq_clause; + ListCell *k; + bool first = true; + + /* chase up */ + while (eclass->ec_merged) + eclass = eclass->ec_merged; + + printf("("); + foreach(k, eclass->ec_members) + { + EquivalenceMember *mem = (EquivalenceMember *) lfirst(k); + + if (first) + first = false; + else + printf(", "); + print_expr((Node *) mem->em_expr, rtable); + } + printf(")"); + if (lnext(uniquekeys, l)) + printf(", "); + } + printf(")\n"); +} + /* * print_tl * print targetlist in a more legible way. diff --git a/src/backend/optimizer/path/Makefile b/src/backend/optimizer/path/Makefile index 1e199ff66f..63cc1505d9 100644 --- a/src/backend/optimizer/path/Makefile +++ b/src/backend/optimizer/path/Makefile @@ -21,6 +21,7 @@ OBJS = \ joinpath.o \ joinrels.o \ pathkeys.o \ - tidpath.o + tidpath.o \ + uniquekey.o include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/optimizer/path/allpaths.c b/src/backend/optimizer/path/allpaths.c index 8286d9cf34..bbc13e6141 100644 --- a/src/backend/optimizer/path/allpaths.c +++ b/src/backend/optimizer/path/allpaths.c @@ -3954,6 +3954,14 @@ print_path(PlannerInfo *root, Path *path, int indent) print_pathkeys(path->pathkeys, root->parse->rtable); } + if (path->uniquekeys) + { + for (i = 0; i < indent; i++) + printf("\t"); + printf(" uniquekeys: "); + print_uniquekeys(path->uniquekeys, root->parse->rtable); + } + if (join) { JoinPath *jp = (JoinPath *) path; diff --git a/src/backend/optimizer/path/indxpath.c b/src/backend/optimizer/path/indxpath.c index 2a50272da6..bd1ea53e5c 100644 --- a/src/backend/optimizer/path/indxpath.c +++ b/src/backend/optimizer/path/indxpath.c @@ -189,6 +189,7 @@ static Expr *match_clause_to_ordering_op(IndexOptInfo *index, static bool ec_member_matches_indexcol(PlannerInfo *root, RelOptInfo *rel, EquivalenceClass *ec, EquivalenceMember *em, void *arg); +static List *get_uniquekeys_for_index(PlannerInfo *root, List *pathkeys); /* @@ -874,6 +875,7 @@ build_index_paths(PlannerInfo *root, RelOptInfo *rel, List *orderbyclausecols; List *index_pathkeys; List *useful_pathkeys; + List *useful_uniquekeys = NIL; bool found_lower_saop_clause; bool pathkeys_possibly_useful; bool index_is_ordered; @@ -1036,11 +1038,15 @@ build_index_paths(PlannerInfo *root, RelOptInfo *rel, if (index_clauses != NIL || useful_pathkeys != NIL || useful_predicate || index_only_scan) { + if (has_useful_uniquekeys(root)) + useful_uniquekeys = get_uniquekeys_for_index(root, useful_pathkeys); + ipath = create_index_path(root, index, index_clauses, orderbyclauses, orderbyclausecols, useful_pathkeys, + useful_uniquekeys, index_is_ordered ? ForwardScanDirection : NoMovementScanDirection, @@ -1063,6 +1069,7 @@ build_index_paths(PlannerInfo *root, RelOptInfo *rel, orderbyclauses, orderbyclausecols, useful_pathkeys, + useful_uniquekeys, index_is_ordered ? ForwardScanDirection : NoMovementScanDirection, @@ -1093,11 +1100,15 @@ build_index_paths(PlannerInfo *root, RelOptInfo *rel, index_pathkeys); if (useful_pathkeys != NIL) { + if (has_useful_uniquekeys(root)) + useful_uniquekeys = get_uniquekeys_for_index(root, useful_pathkeys); + ipath = create_index_path(root, index, index_clauses, NIL, NIL, useful_pathkeys, + useful_uniquekeys, BackwardScanDirection, index_only_scan, outer_relids, @@ -1115,6 +1126,7 @@ build_index_paths(PlannerInfo *root, RelOptInfo *rel, NIL, NIL, useful_pathkeys, + useful_uniquekeys, BackwardScanDirection, index_only_scan, outer_relids, @@ -3365,6 +3377,35 @@ match_clause_to_ordering_op(IndexOptInfo *index, return clause; } +/* + * get_uniquekeys_for_index + */ +static List * +get_uniquekeys_for_index(PlannerInfo *root, List *pathkeys) +{ + ListCell *lc; + + if (pathkeys) + { + List *uniquekeys = NIL; + foreach(lc, pathkeys) + { + UniqueKey *unique_key; + PathKey *pk = (PathKey *) lfirst(lc); + EquivalenceClass *ec = (EquivalenceClass *) pk->pk_eclass; + + unique_key = makeNode(UniqueKey); + unique_key->eq_clause = ec; + + lappend(uniquekeys, unique_key); + } + + if (uniquekeys_contained_in(root->canon_uniquekeys, uniquekeys)) + return uniquekeys; + } + + return NIL; +} /**************************************************************************** * ---- ROUTINES TO DO PARTIAL INDEX PREDICATE TESTS ---- diff --git a/src/backend/optimizer/path/pathkeys.c b/src/backend/optimizer/path/pathkeys.c index 71b9d42c99..054df9a617 100644 --- a/src/backend/optimizer/path/pathkeys.c +++ b/src/backend/optimizer/path/pathkeys.c @@ -29,6 +29,7 @@ #include "utils/lsyscache.h" +static bool pathkey_is_unique(PathKey *new_pathkey, List *pathkeys); static bool pathkey_is_redundant(PathKey *new_pathkey, List *pathkeys); static bool matches_boolean_partition_clause(RestrictInfo *rinfo, RelOptInfo *partrel, @@ -96,6 +97,29 @@ make_canonical_pathkey(PlannerInfo *root, return pk; } +/* + * pathkey_is_unique + * Checks if the new pathkey's equivalence class is the same as that of + * any existing member of the pathkey list. + */ +static bool +pathkey_is_unique(PathKey *new_pathkey, List *pathkeys) +{ + EquivalenceClass *new_ec = new_pathkey->pk_eclass; + ListCell *lc; + + /* If same EC already is already in the list, then not unique */ + foreach(lc, pathkeys) + { + PathKey *old_pathkey = (PathKey *) lfirst(lc); + + if (new_ec == old_pathkey->pk_eclass) + return false; + } + + return true; +} + /* * pathkey_is_redundant * Is a pathkey redundant with one already in the given list? @@ -135,22 +159,12 @@ static bool pathkey_is_redundant(PathKey *new_pathkey, List *pathkeys) { EquivalenceClass *new_ec = new_pathkey->pk_eclass; - ListCell *lc; /* Check for EC containing a constant --- unconditionally redundant */ if (EC_MUST_BE_REDUNDANT(new_ec)) return true; - /* If same EC already used in list, then redundant */ - foreach(lc, pathkeys) - { - PathKey *old_pathkey = (PathKey *) lfirst(lc); - - if (new_ec == old_pathkey->pk_eclass) - return true; - } - - return false; + return !pathkey_is_unique(new_pathkey, pathkeys); } /* @@ -1098,6 +1112,41 @@ make_pathkeys_for_sortclauses(PlannerInfo *root, return pathkeys; } +/* + * make_pathkeys_for_uniquekeyclauses + * Generate a pathkeys list to be used for uniquekey clauses + */ +List * +make_pathkeys_for_uniquekeys(PlannerInfo *root, + List *sortclauses, + List *tlist) +{ + List *pathkeys = NIL; + ListCell *l; + + foreach(l, sortclauses) + { + SortGroupClause *sortcl = (SortGroupClause *) lfirst(l); + Expr *sortkey; + PathKey *pathkey; + + sortkey = (Expr *) get_sortgroupclause_expr(sortcl, tlist); + Assert(OidIsValid(sortcl->sortop)); + pathkey = make_pathkey_from_sortop(root, + sortkey, + root->nullable_baserels, + sortcl->sortop, + sortcl->nulls_first, + sortcl->tleSortGroupRef, + true); + + if (pathkey_is_unique(pathkey, pathkeys)) + pathkeys = lappend(pathkeys, pathkey); + } + + return pathkeys; +} + /**************************************************************************** * PATHKEYS AND MERGECLAUSES ****************************************************************************/ diff --git a/src/backend/optimizer/path/uniquekey.c b/src/backend/optimizer/path/uniquekey.c new file mode 100644 index 0000000000..13d4ebb98c --- /dev/null +++ b/src/backend/optimizer/path/uniquekey.c @@ -0,0 +1,147 @@ +/*------------------------------------------------------------------------- + * + * uniquekey.c + * Utilities for matching and building unique keys + * + * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/optimizer/path/uniquekey.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "optimizer/pathnode.h" +#include "optimizer/paths.h" +#include "nodes/pg_list.h" + +static UniqueKey *make_canonical_uniquekey(PlannerInfo *root, EquivalenceClass *eclass); + +/* + * Build a list of unique keys + */ +List* +build_uniquekeys(PlannerInfo *root, List *sortclauses) +{ + List *result = NIL; + List *sortkeys; + ListCell *l; + + sortkeys = make_pathkeys_for_uniquekeys(root, + sortclauses, + root->processed_tlist); + + /* Create a uniquekey and add it to the list */ + foreach(l, sortkeys) + { + PathKey *pathkey = (PathKey *) lfirst(l); + EquivalenceClass *ec = pathkey->pk_eclass; + UniqueKey *unique_key = make_canonical_uniquekey(root, ec); + + result = lappend(result, unique_key); + } + + return result; +} + +/* + * uniquekeys_contained_in + * Are the keys2 included in the keys1 superset + */ +bool +uniquekeys_contained_in(List *keys1, List *keys2) +{ + ListCell *key1, + *key2; + + /* + * Fall out quickly if we are passed two identical lists. This mostly + * catches the case where both are NIL, but that's common enough to + * warrant the test. + */ + if (keys1 == keys2) + return true; + + foreach(key2, keys2) + { + bool found = false; + UniqueKey *uniquekey2 = (UniqueKey *) lfirst(key2); + + foreach(key1, keys1) + { + UniqueKey *uniquekey1 = (UniqueKey *) lfirst(key1); + + if (uniquekey1->eq_clause == uniquekey2->eq_clause) + { + found = true; + break; + } + } + + if (!found) + return false; + } + + return true; +} + +/* + * has_useful_uniquekeys + * Detect whether the planner could have any uniquekeys that are + * useful. + */ +bool +has_useful_uniquekeys(PlannerInfo *root) +{ + if (root->query_uniquekeys != NIL) + return true; /* there are some */ + return false; /* definitely useless */ +} + +/* + * make_canonical_uniquekey + * Given the parameters for a UniqueKey, find any pre-existing matching + * uniquekey in the query's list of "canonical" uniquekeys. Make a new + * entry if there's not one already. + * + * Note that this function must not be used until after we have completed + * merging EquivalenceClasses. (We don't try to enforce that here; instead, + * equivclass.c will complain if a merge occurs after root->canon_uniquekeys + * has become nonempty.) + */ +static UniqueKey * +make_canonical_uniquekey(PlannerInfo *root, + EquivalenceClass *eclass) +{ + UniqueKey *uk; + ListCell *lc; + MemoryContext oldcontext; + + /* The passed eclass might be non-canonical, so chase up to the top */ + while (eclass->ec_merged) + eclass = eclass->ec_merged; + + foreach(lc, root->canon_uniquekeys) + { + uk = (UniqueKey *) lfirst(lc); + if (eclass == uk->eq_clause) + return uk; + } + + /* + * Be sure canonical uniquekeys are allocated in the main planning context. + * Not an issue in normal planning, but it is for GEQO. + */ + oldcontext = MemoryContextSwitchTo(root->planner_cxt); + + uk = makeNode(UniqueKey); + uk->eq_clause = eclass; + + root->canon_uniquekeys = lappend(root->canon_uniquekeys, uk); + + MemoryContextSwitchTo(oldcontext); + + return uk; +} diff --git a/src/backend/optimizer/plan/planagg.c b/src/backend/optimizer/plan/planagg.c index 8634940efc..dd64775d8f 100644 --- a/src/backend/optimizer/plan/planagg.c +++ b/src/backend/optimizer/plan/planagg.c @@ -511,6 +511,7 @@ minmax_qp_callback(PlannerInfo *root, void *extra) root->parse->targetList); root->query_pathkeys = root->sort_pathkeys; + root->query_uniquekeys = NIL; } /* diff --git a/src/backend/optimizer/plan/planmain.c b/src/backend/optimizer/plan/planmain.c index 62dfc6d44a..3a372af91b 100644 --- a/src/backend/optimizer/plan/planmain.c +++ b/src/backend/optimizer/plan/planmain.c @@ -70,6 +70,7 @@ query_planner(PlannerInfo *root, root->join_rel_level = NULL; root->join_cur_level = 0; root->canon_pathkeys = NIL; + root->canon_uniquekeys = NIL; root->left_join_clauses = NIL; root->right_join_clauses = NIL; root->full_join_clauses = NIL; diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c index d6f2153593..984fca0696 100644 --- a/src/backend/optimizer/plan/planner.c +++ b/src/backend/optimizer/plan/planner.c @@ -3657,15 +3657,30 @@ standard_qp_callback(PlannerInfo *root, void *extra) * much easier, since we know that the parser ensured that one is a * superset of the other. */ + root->query_uniquekeys = NIL; + if (root->group_pathkeys) + { root->query_pathkeys = root->group_pathkeys; + + if (!root->parse->hasAggs) + root->query_uniquekeys = build_uniquekeys(root, qp_extra->groupClause); + } else if (root->window_pathkeys) root->query_pathkeys = root->window_pathkeys; else if (list_length(root->distinct_pathkeys) > list_length(root->sort_pathkeys)) + { root->query_pathkeys = root->distinct_pathkeys; + root->query_uniquekeys = build_uniquekeys(root, parse->distinctClause); + } else if (root->sort_pathkeys) + { root->query_pathkeys = root->sort_pathkeys; + + if (root->distinct_pathkeys) + root->query_uniquekeys = build_uniquekeys(root, parse->distinctClause); + } else root->query_pathkeys = NIL; } @@ -6222,7 +6237,7 @@ plan_cluster_use_sort(Oid tableOid, Oid indexOid) /* Estimate the cost of index scan */ indexScanPath = create_index_path(root, indexInfo, - NIL, NIL, NIL, NIL, + NIL, NIL, NIL, NIL, NIL, ForwardScanDirection, false, NULL, 1.0, false); diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c index e6d08aede5..a006dbbe9c 100644 --- a/src/backend/optimizer/util/pathnode.c +++ b/src/backend/optimizer/util/pathnode.c @@ -940,6 +940,7 @@ create_seqscan_path(PlannerInfo *root, RelOptInfo *rel, pathnode->parallel_safe = rel->consider_parallel; pathnode->parallel_workers = parallel_workers; pathnode->pathkeys = NIL; /* seqscan has unordered result */ + pathnode->uniquekeys = NIL; cost_seqscan(pathnode, root, rel, pathnode->param_info); @@ -964,6 +965,7 @@ create_samplescan_path(PlannerInfo *root, RelOptInfo *rel, Relids required_outer pathnode->parallel_safe = rel->consider_parallel; pathnode->parallel_workers = 0; pathnode->pathkeys = NIL; /* samplescan has unordered result */ + pathnode->uniquekeys = NIL; cost_samplescan(pathnode, root, rel, pathnode->param_info); @@ -1000,6 +1002,7 @@ create_index_path(PlannerInfo *root, List *indexorderbys, List *indexorderbycols, List *pathkeys, + List *uniquekeys, ScanDirection indexscandir, bool indexonly, Relids required_outer, @@ -1018,6 +1021,7 @@ create_index_path(PlannerInfo *root, pathnode->path.parallel_safe = rel->consider_parallel; pathnode->path.parallel_workers = 0; pathnode->path.pathkeys = pathkeys; + pathnode->path.uniquekeys = uniquekeys; pathnode->indexinfo = index; pathnode->indexclauses = indexclauses; @@ -1061,6 +1065,7 @@ create_bitmap_heap_path(PlannerInfo *root, pathnode->path.parallel_safe = rel->consider_parallel; pathnode->path.parallel_workers = parallel_degree; pathnode->path.pathkeys = NIL; /* always unordered */ + pathnode->path.uniquekeys = NIL; pathnode->bitmapqual = bitmapqual; @@ -1922,6 +1927,7 @@ create_functionscan_path(PlannerInfo *root, RelOptInfo *rel, pathnode->parallel_safe = rel->consider_parallel; pathnode->parallel_workers = 0; pathnode->pathkeys = pathkeys; + pathnode->uniquekeys = NIL; cost_functionscan(pathnode, root, rel, pathnode->param_info); @@ -1948,6 +1954,7 @@ create_tablefuncscan_path(PlannerInfo *root, RelOptInfo *rel, pathnode->parallel_safe = rel->consider_parallel; pathnode->parallel_workers = 0; pathnode->pathkeys = NIL; /* result is always unordered */ + pathnode->uniquekeys = NIL; cost_tablefuncscan(pathnode, root, rel, pathnode->param_info); @@ -1974,6 +1981,7 @@ create_valuesscan_path(PlannerInfo *root, RelOptInfo *rel, pathnode->parallel_safe = rel->consider_parallel; pathnode->parallel_workers = 0; pathnode->pathkeys = NIL; /* result is always unordered */ + pathnode->uniquekeys = NIL; cost_valuesscan(pathnode, root, rel, pathnode->param_info); @@ -1999,6 +2007,7 @@ create_ctescan_path(PlannerInfo *root, RelOptInfo *rel, Relids required_outer) pathnode->parallel_safe = rel->consider_parallel; pathnode->parallel_workers = 0; pathnode->pathkeys = NIL; /* XXX for now, result is always unordered */ + pathnode->uniquekeys = NIL; cost_ctescan(pathnode, root, rel, pathnode->param_info); @@ -2025,6 +2034,7 @@ create_namedtuplestorescan_path(PlannerInfo *root, RelOptInfo *rel, pathnode->parallel_safe = rel->consider_parallel; pathnode->parallel_workers = 0; pathnode->pathkeys = NIL; /* result is always unordered */ + pathnode->uniquekeys = NIL; cost_namedtuplestorescan(pathnode, root, rel, pathnode->param_info); @@ -2051,6 +2061,7 @@ create_resultscan_path(PlannerInfo *root, RelOptInfo *rel, pathnode->parallel_safe = rel->consider_parallel; pathnode->parallel_workers = 0; pathnode->pathkeys = NIL; /* result is always unordered */ + pathnode->uniquekeys = NIL; cost_resultscan(pathnode, root, rel, pathnode->param_info); @@ -2077,6 +2088,7 @@ create_worktablescan_path(PlannerInfo *root, RelOptInfo *rel, pathnode->parallel_safe = rel->consider_parallel; pathnode->parallel_workers = 0; pathnode->pathkeys = NIL; /* result is always unordered */ + pathnode->uniquekeys = NIL; /* Cost is the same as for a regular CTE scan */ cost_ctescan(pathnode, root, rel, pathnode->param_info); diff --git a/src/include/nodes/nodes.h b/src/include/nodes/nodes.h index baced7eec0..a1511b46ea 100644 --- a/src/include/nodes/nodes.h +++ b/src/include/nodes/nodes.h @@ -261,6 +261,7 @@ typedef enum NodeTag T_EquivalenceMember, T_PathKey, T_PathTarget, + T_UniqueKey, T_RestrictInfo, T_IndexClause, T_PlaceHolderVar, diff --git a/src/include/nodes/pathnodes.h b/src/include/nodes/pathnodes.h index 3d3be197e0..4e329f0fb5 100644 --- a/src/include/nodes/pathnodes.h +++ b/src/include/nodes/pathnodes.h @@ -269,6 +269,8 @@ struct PlannerInfo List *canon_pathkeys; /* list of "canonical" PathKeys */ + List *canon_uniquekeys; /* list of "canonical" UniqueKeys */ + List *left_join_clauses; /* list of RestrictInfos for mergejoinable * outer join clauses w/nonnullable var on * left */ @@ -297,6 +299,8 @@ struct PlannerInfo List *query_pathkeys; /* desired pathkeys for query_planner() */ + List *query_uniquekeys; /* unique keys used for the query */ + List *group_pathkeys; /* groupClause pathkeys, if any */ List *window_pathkeys; /* pathkeys of bottom window, if any */ List *distinct_pathkeys; /* distinctClause pathkeys, if any */ @@ -1077,6 +1081,15 @@ typedef struct ParamPathInfo List *ppi_clauses; /* join clauses available from outer rels */ } ParamPathInfo; +/* + * UniqueKey + */ +typedef struct UniqueKey +{ + NodeTag type; + + EquivalenceClass *eq_clause; /* equivalence class */ +} UniqueKey; /* * Type "Path" is used as-is for sequential-scan paths, as well as some other @@ -1106,6 +1119,9 @@ typedef struct ParamPathInfo * * "pathkeys" is a List of PathKey nodes (see above), describing the sort * ordering of the path's output rows. + * + * "uniquekeys", if not NIL, is a list of UniqueKey nodes (see above), + * describing the XXX. */ typedef struct Path { @@ -1129,6 +1145,8 @@ typedef struct Path List *pathkeys; /* sort ordering of path's output */ /* pathkeys is a List of PathKey nodes; see above */ + + List *uniquekeys; /* the unique keys, or NIL if none */ } Path; /* Macro for extracting a path's parameterization relids; beware double eval */ diff --git a/src/include/nodes/print.h b/src/include/nodes/print.h index 6126b491bf..006248bfb5 100644 --- a/src/include/nodes/print.h +++ b/src/include/nodes/print.h @@ -28,6 +28,7 @@ extern char *pretty_format_node_dump(const char *dump); extern void print_rt(const List *rtable); extern void print_expr(const Node *expr, const List *rtable); extern void print_pathkeys(const List *pathkeys, const List *rtable); +extern void print_uniquekeys(const List *uniquekeys, const List *rtable); extern void print_tl(const List *tlist, const List *rtable); extern void print_slot(TupleTableSlot *slot); diff --git a/src/include/optimizer/pathnode.h b/src/include/optimizer/pathnode.h index e450fe112a..f75ff6f323 100644 --- a/src/include/optimizer/pathnode.h +++ b/src/include/optimizer/pathnode.h @@ -44,6 +44,7 @@ extern IndexPath *create_index_path(PlannerInfo *root, List *indexorderbys, List *indexorderbycols, List *pathkeys, + List *uniquekeys, ScanDirection indexscandir, bool indexonly, Relids required_outer, diff --git a/src/include/optimizer/paths.h b/src/include/optimizer/paths.h index 9ab73bd20c..5b6be383b3 100644 --- a/src/include/optimizer/paths.h +++ b/src/include/optimizer/paths.h @@ -214,6 +214,9 @@ extern List *build_join_pathkeys(PlannerInfo *root, extern List *make_pathkeys_for_sortclauses(PlannerInfo *root, List *sortclauses, List *tlist); +extern List *make_pathkeys_for_uniquekeys(PlannerInfo *root, + List *sortclauses, + List *tlist); extern void initialize_mergeclause_eclasses(PlannerInfo *root, RestrictInfo *restrictinfo); extern void update_mergeclause_eclasses(PlannerInfo *root, @@ -240,4 +243,12 @@ extern PathKey *make_canonical_pathkey(PlannerInfo *root, extern void add_paths_to_append_rel(PlannerInfo *root, RelOptInfo *rel, List *live_childrels); +/* + * uniquekey.c + * Utilities for matching and building unique keys + */ +extern List *build_uniquekeys(PlannerInfo *root, List *sortclauses); +extern bool uniquekeys_contained_in(List *keys1, List *keys2); +extern bool has_useful_uniquekeys(PlannerInfo *root); + #endif /* PATHS_H */ -- 2.21.0
>From c5b589f33e5ab2f8801bbf3e2cb9e7a25777bc82 Mon Sep 17 00:00:00 2001 From: jesperpedersen <jesper.peder...@redhat.com> Date: Fri, 15 Nov 2019 09:46:53 -0500 Subject: [PATCH v33 2/2] Index skip scan Implementation of Index Skip Scan (see Loose Index Scan in the wiki [1]) on top of IndexOnlyScan and IndexScan. To make it suitable for both situations when there are small number of distinct values and significant amount of distinct values the following approach is taken - instead of searching from the root for every value we're searching for then first on the current page, and then if not found continue searching from the root. Original patch and design were proposed by Thomas Munro [2], revived and improved by Dmitry Dolgov and Jesper Pedersen. [1] https://wiki.postgresql.org/wiki/Loose_indexscan [2] https://www.postgresql.org/message-id/flat/CADLWmXXbTSBxP-MzJuPAYSsL_2f0iPm5VWPbCvDbVvfX93FKkw%40mail.gmail.com Author: Jesper Pedersen, Dmitry Dolgov Reviewed-by: Thomas Munro, David Rowley, Floris Van Nee, Kyotaro Horiguchi, Tomas Vondra, Peter Geoghegan --- contrib/bloom/blutils.c | 1 + doc/src/sgml/config.sgml | 15 + doc/src/sgml/indexam.sgml | 63 ++ doc/src/sgml/indices.sgml | 23 + src/backend/access/brin/brin.c | 1 + src/backend/access/gin/ginutil.c | 1 + src/backend/access/gist/gist.c | 1 + src/backend/access/hash/hash.c | 1 + src/backend/access/index/indexam.c | 18 + src/backend/access/nbtree/nbtree.c | 13 + src/backend/access/nbtree/nbtsearch.c | 458 +++++++++++++ src/backend/access/spgist/spgutils.c | 1 + src/backend/commands/explain.c | 25 + src/backend/commands/portalcmds.c | 2 + src/backend/executor/nodeIndexonlyscan.c | 51 +- src/backend/executor/nodeIndexscan.c | 56 +- src/backend/nodes/copyfuncs.c | 2 + src/backend/nodes/outfuncs.c | 2 + src/backend/nodes/readfuncs.c | 2 + src/backend/optimizer/path/costsize.c | 1 + src/backend/optimizer/plan/createplan.c | 20 +- src/backend/optimizer/plan/planner.c | 77 +++ src/backend/optimizer/util/pathnode.c | 40 ++ src/backend/optimizer/util/plancat.c | 1 + src/backend/utils/misc/guc.c | 9 + src/backend/utils/misc/postgresql.conf.sample | 1 + src/include/access/amapi.h | 8 + src/include/access/genam.h | 2 + src/include/access/nbtree.h | 7 + src/include/access/sdir.h | 7 + src/include/nodes/execnodes.h | 6 + src/include/nodes/parsenodes.h | 1 + src/include/nodes/pathnodes.h | 5 + src/include/nodes/plannodes.h | 4 + src/include/optimizer/cost.h | 1 + src/include/optimizer/pathnode.h | 5 + src/test/regress/expected/select_distinct.out | 601 ++++++++++++++++++ src/test/regress/expected/sysviews.out | 3 +- src/test/regress/sql/select_distinct.sql | 248 ++++++++ 39 files changed, 1772 insertions(+), 11 deletions(-) diff --git a/contrib/bloom/blutils.c b/contrib/bloom/blutils.c index 0104d02f67..a018b7f3d0 100644 --- a/contrib/bloom/blutils.c +++ b/contrib/bloom/blutils.c @@ -133,6 +133,7 @@ blhandler(PG_FUNCTION_ARGS) amroutine->ambulkdelete = blbulkdelete; amroutine->amvacuumcleanup = blvacuumcleanup; amroutine->amcanreturn = NULL; + amroutine->amskip = NULL; amroutine->amcostestimate = blcostestimate; amroutine->amoptions = bloptions; amroutine->amproperty = NULL; diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml index e07dc01e80..36ba75b077 100644 --- a/doc/src/sgml/config.sgml +++ b/doc/src/sgml/config.sgml @@ -4517,6 +4517,21 @@ ANY <replaceable class="parameter">num_sync</replaceable> ( <replaceable class=" </listitem> </varlistentry> + <varlistentry id="guc-enable-indexskipscan" xreflabel="enable_indexskipscan"> + <term><varname>enable_indexskipscan</varname> (<type>boolean</type>) + <indexterm> + <primary><varname>enable_indexskipscan</varname> configuration parameter</primary> + </indexterm> + </term> + <listitem> + <para> + Enables or disables the query planner's use of index-skip-scan plan + types (see <xref linkend="indexes-index-skip-scans"/>). The default is + <literal>on</literal>. + </para> + </listitem> + </varlistentry> + <varlistentry id="guc-enable-material" xreflabel="enable_material"> <term><varname>enable_material</varname> (<type>boolean</type>) <indexterm> diff --git a/doc/src/sgml/indexam.sgml b/doc/src/sgml/indexam.sgml index 37f8d8760a..a726d80878 100644 --- a/doc/src/sgml/indexam.sgml +++ b/doc/src/sgml/indexam.sgml @@ -148,6 +148,7 @@ typedef struct IndexAmRoutine amendscan_function amendscan; ammarkpos_function ammarkpos; /* can be NULL */ amrestrpos_function amrestrpos; /* can be NULL */ + amskip_function amskip; /* can be NULL */ /* interface functions to support parallel index scans */ amestimateparallelscan_function amestimateparallelscan; /* can be NULL */ @@ -691,6 +692,68 @@ amrestrpos (IndexScanDesc scan); <para> <programlisting> +bool +amskip (IndexScanDesc scan, + ScanDirection direction, + ScanDirection indexdir, + bool scanstart, + int prefix); +</programlisting> + Skip past all tuples where the first 'prefix' columns have the same value as + the last tuple returned in the current scan. The arguments are: + + <variablelist> + <varlistentry> + <term><parameter>scan</parameter></term> + <listitem> + <para> + Index scan information + </para> + </listitem> + </varlistentry> + + <varlistentry> + <term><parameter>direction</parameter></term> + <listitem> + <para> + The direction in which data is advancing. + </para> + </listitem> + </varlistentry> + + <varlistentry> + <term><parameter>indexdir</parameter></term> + <listitem> + <para> + The index direction, in which data must be read. + </para> + </listitem> + </varlistentry> + + <varlistentry> + <term><parameter>scanstart</parameter></term> + <listitem> + <para> + Whether or not it is a start of the scan. + </para> + </listitem> + </varlistentry> + + <varlistentry> + <term><parameter>prefix</parameter></term> + <listitem> + <para> + Distinct prefix size. + </para> + </listitem> + </varlistentry> + + </variablelist> + + </para> + + <para> +<programlisting> Size amestimateparallelscan (void); </programlisting> diff --git a/doc/src/sgml/indices.sgml b/doc/src/sgml/indices.sgml index c54bf0dbbd..c429d98fc7 100644 --- a/doc/src/sgml/indices.sgml +++ b/doc/src/sgml/indices.sgml @@ -1254,6 +1254,29 @@ SELECT target FROM tests WHERE subject = 'some-subject' AND success; and later will recognize such cases and allow index-only scans to be generated, but older versions will not. </para> + + <sect2 id="indexes-index-skip-scans"> + <title>Index Skip Scans</title> + + <indexterm zone="indexes-index-skip-scans"> + <primary>index</primary> + <secondary>index-skip scans</secondary> + </indexterm> + <indexterm zone="indexes-index-skip-scans"> + <primary>index-skip scan</primary> + </indexterm> + + <para> + When the rows retrieved from an index scan are then deduplicated by + eliminating rows matching on a prefix of index keys (e.g. when using + <literal>SELECT DISTINCT</literal>), the planner will consider + skipping groups of rows with a matching key prefix. When a row with + a particular prefix is found, remaining rows with the same key prefix + are skipped. The larger the number of rows with the same key prefix + rows (i.e. the lower the number of distinct key prefixes in the index), + the more efficient this is. + </para> + </sect2> </sect1> diff --git a/src/backend/access/brin/brin.c b/src/backend/access/brin/brin.c index 2e8f67ef10..4db31bb211 100644 --- a/src/backend/access/brin/brin.c +++ b/src/backend/access/brin/brin.c @@ -113,6 +113,7 @@ brinhandler(PG_FUNCTION_ARGS) amroutine->ambulkdelete = brinbulkdelete; amroutine->amvacuumcleanup = brinvacuumcleanup; amroutine->amcanreturn = NULL; + amroutine->amskip = NULL; amroutine->amcostestimate = brincostestimate; amroutine->amoptions = brinoptions; amroutine->amproperty = NULL; diff --git a/src/backend/access/gin/ginutil.c b/src/backend/access/gin/ginutil.c index a7e55caf28..8dd1d30d2a 100644 --- a/src/backend/access/gin/ginutil.c +++ b/src/backend/access/gin/ginutil.c @@ -65,6 +65,7 @@ ginhandler(PG_FUNCTION_ARGS) amroutine->ambulkdelete = ginbulkdelete; amroutine->amvacuumcleanup = ginvacuumcleanup; amroutine->amcanreturn = NULL; + amroutine->amskip = NULL; amroutine->amcostestimate = gincostestimate; amroutine->amoptions = ginoptions; amroutine->amproperty = NULL; diff --git a/src/backend/access/gist/gist.c b/src/backend/access/gist/gist.c index aefc302ed2..8c692f7fb4 100644 --- a/src/backend/access/gist/gist.c +++ b/src/backend/access/gist/gist.c @@ -86,6 +86,7 @@ gisthandler(PG_FUNCTION_ARGS) amroutine->ambulkdelete = gistbulkdelete; amroutine->amvacuumcleanup = gistvacuumcleanup; amroutine->amcanreturn = gistcanreturn; + amroutine->amskip = NULL; amroutine->amcostestimate = gistcostestimate; amroutine->amoptions = gistoptions; amroutine->amproperty = gistproperty; diff --git a/src/backend/access/hash/hash.c b/src/backend/access/hash/hash.c index 4871b7ff4d..e5fa4c7864 100644 --- a/src/backend/access/hash/hash.c +++ b/src/backend/access/hash/hash.c @@ -83,6 +83,7 @@ hashhandler(PG_FUNCTION_ARGS) amroutine->ambulkdelete = hashbulkdelete; amroutine->amvacuumcleanup = hashvacuumcleanup; amroutine->amcanreturn = NULL; + amroutine->amskip = NULL; amroutine->amcostestimate = hashcostestimate; amroutine->amoptions = hashoptions; amroutine->amproperty = NULL; diff --git a/src/backend/access/index/indexam.c b/src/backend/access/index/indexam.c index 01539b6bd6..1047a35ade 100644 --- a/src/backend/access/index/indexam.c +++ b/src/backend/access/index/indexam.c @@ -33,6 +33,7 @@ * index_can_return - does index support index-only scans? * index_getprocid - get a support procedure OID * index_getprocinfo - get a support procedure's lookup info + * index_skip - advance past duplicate key values in a scan * * NOTES * This file contains the index_ routines which used @@ -730,6 +731,23 @@ index_can_return(Relation indexRelation, int attno) return indexRelation->rd_indam->amcanreturn(indexRelation, attno); } +/* ---------------- + * index_skip + * + * Skip past all tuples where the first 'prefix' columns have the + * same value as the last tuple returned in the current scan. + * ---------------- + */ +bool +index_skip(IndexScanDesc scan, ScanDirection direction, + ScanDirection indexdir, bool scanstart, int prefix) +{ + SCAN_CHECKS; + + return scan->indexRelation->rd_indam->amskip(scan, direction, + indexdir, scanstart, prefix); +} + /* ---------------- * index_getprocid * diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c index 5254bc7ef5..8fde56fe60 100644 --- a/src/backend/access/nbtree/nbtree.c +++ b/src/backend/access/nbtree/nbtree.c @@ -132,6 +132,7 @@ bthandler(PG_FUNCTION_ARGS) amroutine->ambulkdelete = btbulkdelete; amroutine->amvacuumcleanup = btvacuumcleanup; amroutine->amcanreturn = btcanreturn; + amroutine->amskip = btskip; amroutine->amcostestimate = btcostestimate; amroutine->amoptions = btoptions; amroutine->amproperty = btproperty; @@ -381,6 +382,8 @@ btbeginscan(Relation rel, int nkeys, int norderbys) */ so->currTuples = so->markTuples = NULL; + so->skipScanKey = NULL; + scan->xs_itupdesc = RelationGetDescr(rel); scan->opaque = so; @@ -448,6 +451,16 @@ btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys, _bt_preprocess_array_keys(scan); } +/* + * btskip() -- skip to the beginning of the next key prefix + */ +bool +btskip(IndexScanDesc scan, ScanDirection direction, + ScanDirection indexdir, bool start, int prefix) +{ + return _bt_skip(scan, direction, indexdir, start, prefix); +} + /* * btendscan() -- close down a scan */ diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c index c573814f01..8b406416fd 100644 --- a/src/backend/access/nbtree/nbtsearch.c +++ b/src/backend/access/nbtree/nbtsearch.c @@ -37,6 +37,10 @@ static bool _bt_parallel_readpage(IndexScanDesc scan, BlockNumber blkno, static Buffer _bt_walk_left(Relation rel, Buffer buf, Snapshot snapshot); static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir); static inline void _bt_initialize_more_data(BTScanOpaque so, ScanDirection dir); +static inline void _bt_update_skip_scankeys(IndexScanDesc scan, + Relation indexRel); +static inline bool _bt_scankey_within_page(IndexScanDesc scan, BTScanInsert key, + Buffer buf, ScanDirection dir); /* @@ -1375,6 +1379,409 @@ _bt_next(IndexScanDesc scan, ScanDirection dir) return true; } +/* + * _bt_skip() -- Skip items that have the same prefix as the most recently + * fetched index tuple. + * + * The current position is set so that a subsequent call to _bt_next will + * fetch the first tuple that differs in the leading 'prefix' keys. + * + * There are four different kinds of skipping (depending on dir and + * indexdir, that are important to distinguish, especially in the presense + * of an index condition: + * + * * Advancing forward and reading forward + * simple scan + * + * * Advancing forward and reading backward + * scan inside a cursor fetching backward, when skipping is necessary + * right from the start + * + * * Advancing backward and reading forward + * scan with order by desc inside a cursor fetching forward, when + * skipping is necessary right from the start + * + * * Advancing backward and reading backward + * simple scan with order by desc + * + * The current page is searched for the next unique value. If none is found + * we will do a scan from the root in order to find the next page with + * a unique value. + */ +bool +_bt_skip(IndexScanDesc scan, ScanDirection dir, + ScanDirection indexdir, bool scanstart, int prefix) +{ + BTScanOpaque so = (BTScanOpaque) scan->opaque; + BTStack stack; + Buffer buf; + OffsetNumber offnum; + BTScanPosItem *currItem; + Relation indexRel = scan->indexRelation; + + /* We want to return tuples, and we need a starting point */ + Assert(scan->xs_want_itup); + Assert(scan->xs_itup); + + if (so->numKilled > 0) + _bt_killitems(scan); + + /* If skipScanKey is NULL then we initialize it with _bt_mkscankey */ + if (so->skipScanKey == NULL) + { + so->skipScanKey = _bt_mkscankey(indexRel, scan->xs_itup); + so->skipScanKey->keysz = prefix; + so->skipScanKey->scantid = NULL; + } + so->skipScanKey->nextkey = ScanDirectionIsForward(dir); + _bt_update_skip_scankeys(scan, indexRel); + + /* Check if the next unique key can be found within the current page. + * Since we do not lock the current page between jumps, it's possible + * that it was splitted since the last time we saw it. This is fine in + * case of scanning forward, since page split to the right and we are + * still on the left most page. In case of scanning backwards it's + * possible to loose some pages and we need to remember the previous + * page, and then follow the right link from the current page until we + * find the original one. + * + * Since the whole idea of checking the current page is to protect + * ourselves and make more performant statistic mismatch case when + * there are too many distinct values for jumping, it's not clear if + * the complexity of this solution in case of backward scan is + * justified, so for now just avoid it. + */ + if (BufferIsValid(so->currPos.buf) && ScanDirectionIsForward(dir)) + { + LockBuffer(so->currPos.buf, BT_READ); + + if (_bt_scankey_within_page(scan, so->skipScanKey, so->currPos.buf, dir)) + { + bool keyFound = false; + + offnum = _bt_binsrch(scan->indexRelation, so->skipScanKey, so->currPos.buf); + + /* Lock the page for SERIALIZABLE transactions */ + PredicateLockPage(scan->indexRelation, BufferGetBlockNumber(so->currPos.buf), + scan->xs_snapshot); + + /* We know in which direction to look */ + _bt_initialize_more_data(so, dir); + + /* Now read the data */ + keyFound = _bt_readpage(scan, dir, offnum); + + LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK); + ReleaseBuffer(so->currPos.buf); + so->currPos.buf = InvalidBuffer; + + if (keyFound) + { + /* set IndexTuple */ + currItem = &so->currPos.items[so->currPos.itemIndex]; + scan->xs_heaptid = currItem->heapTid; + scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset); + return true; + } + } + else + { + if (so->numKilled > 0) + _bt_killitems(scan); + + LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK); + } + } + + if (BufferIsValid(so->currPos.buf)) + { + ReleaseBuffer(so->currPos.buf); + so->currPos.buf = InvalidBuffer; + } + + /* + * We haven't found scan key within the current page, so let's scan from + * the root. Use _bt_search and _bt_binsrch to get the buffer and offset + * number + */ + so->skipScanKey->nextkey = ScanDirectionIsForward(dir); + stack = _bt_search(scan->indexRelation, so->skipScanKey, + &buf, BT_READ, scan->xs_snapshot); + _bt_freestack(stack); + so->currPos.buf = buf; + offnum = _bt_binsrch(scan->indexRelation, so->skipScanKey, buf); + + /* Lock the page for SERIALIZABLE transactions */ + PredicateLockPage(scan->indexRelation, BufferGetBlockNumber(buf), + scan->xs_snapshot); + + /* We know in which direction to look */ + _bt_initialize_more_data(so, dir); + + /* + * Simplest case is when both directions are forward, when we are already + * at the next distinct key at the beginning of the series (so everything + * else would be done in _bt_readpage) + * + * The case when both directions are backwards is also simple, but we need + * to go one step back, since we need a last element from the previous + * series. + */ + if (ScanDirectionIsBackward(dir) && ScanDirectionIsBackward(indexdir)) + offnum = OffsetNumberPrev(offnum); + + /* + * Andvance backward but read forward. At this moment we are at the next + * distinct key at the beginning of the series. In case if scan just + * started, we can read forward without doing anything else. Otherwise + * find previous distinct key and the beginning of it's series and read + * forward from there. To do so, go back one step, perform binary search + * to find the first item in the series and let _bt_readpage do everything + * else. + */ + else if (ScanDirectionIsBackward(dir) && ScanDirectionIsForward(indexdir)) + { + if (!scanstart) + { + offnum = _bt_binsrch(scan->indexRelation, so->skipScanKey, buf); + + /* One step back to find a previous value */ + _bt_readpage(scan, dir, offnum); + + LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK); + if (_bt_next(scan, dir)) + { + LockBuffer(so->currPos.buf, BT_READ); + _bt_update_skip_scankeys(scan, indexRel); + + /* + * And now find the last item from the sequence for the + * current, value with the intention do OffsetNumberNext. As a + * result we end up on a first element from the sequence. + */ + if (_bt_scankey_within_page(scan, so->skipScanKey, so->currPos.buf, dir)) + offnum = _bt_binsrch(scan->indexRelation, so->skipScanKey, buf); + else + { + if (BufferIsValid(so->currPos.buf)) + { + if (so->numKilled > 0) + _bt_killitems(scan); + + LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK); + ReleaseBuffer(so->currPos.buf); + so->currPos.buf = InvalidBuffer; + } + + stack = _bt_search(scan->indexRelation, so->skipScanKey, + &buf, BT_READ, scan->xs_snapshot); + _bt_freestack(stack); + so->currPos.buf = buf; + offnum = _bt_binsrch(scan->indexRelation, so->skipScanKey, buf); + } + } + else + { + pfree(so->skipScanKey); + so->skipScanKey = NULL; + return false; + } + } + } + + /* + * Advance forward but read backward. At this moment we are at the next + * distinct key at the beginning of the series. In case if scan just + * started, we can go one step back and read forward without doing + * anything else. Otherwise find the next distinct key and the beginning + * of it's series, go one step back and read backward from there. + * + * An interesting situation can happen if one of distinct keys do not pass + * a corresponding index condition at all. In this case reading backward + * can lead to a previous distinct key being found, creating a loop. To + * avoid that check the value to be returned, and jump one more time if + * it's the same as at the beginning. + */ + else if (ScanDirectionIsForward(dir) && ScanDirectionIsBackward(indexdir)) + { + if (scanstart) + offnum = OffsetNumberPrev(offnum); + else + { + OffsetNumber nextOffset, + startOffset, + jumpOffset; + + IndexTuple startItup = CopyIndexTuple(scan->xs_itup); + Page page = BufferGetPage(so->currPos.buf); + + /* We are at the end and need to return */ + if ((offnum > PageGetMaxOffsetNumber(page)) & + (so->currPos.nextPage == P_NONE)) + { + LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK); + + BTScanPosUnpinIfPinned(so->currPos); + BTScanPosInvalidate(so->currPos) + + pfree(so->skipScanKey); + so->skipScanKey = NULL; + return false; + } + + nextOffset = startOffset = ItemPointerGetOffsetNumber(&scan->xs_itup->t_tid); + + while (nextOffset == startOffset) + { + IndexTuple itup; + CHECK_FOR_INTERRUPTS(); + + /* + * Find a next index tuple to update scan key. It could be at + * the end, so check for max offset + */ + if (!_bt_readpage(scan, ForwardScanDirection, offnum)) + { + /* + * There's no actually-matching data on this page. Try to + * advance to the next page. Return false if there's no + * matching data at all. + */ + LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK); + if (!_bt_steppage(scan, dir)) + { + pfree(so->skipScanKey); + so->skipScanKey = NULL; + return false; + } + LockBuffer(so->currPos.buf, BT_READ); + } + + currItem = &so->currPos.items[so->currPos.firstItem]; + itup = (IndexTuple) (so->currTuples + currItem->tupleOffset); + + scan->xs_itup = itup; + so->skipScanKey->nextkey = ScanDirectionIsForward(dir); + + _bt_update_skip_scankeys(scan, indexRel); + if (BufferIsValid(so->currPos.buf)) + { + if (so->numKilled > 0) + _bt_killitems(scan); + + LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK); + ReleaseBuffer(so->currPos.buf); + so->currPos.buf = InvalidBuffer; + } + + stack = _bt_search(scan->indexRelation, so->skipScanKey, + &buf, BT_READ, scan->xs_snapshot); + _bt_freestack(stack); + so->currPos.buf = buf; + jumpOffset = offnum = _bt_binsrch(scan->indexRelation, so->skipScanKey, buf); + offnum = OffsetNumberPrev(offnum); + + so->currPos.moreLeft = true; + if (!_bt_readpage(scan, indexdir, offnum)) + { + /* + * There's no actually-matching data on this page. Try to + * advance to the next page. Return false if there's no + * matching data at all. + */ + LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK); + if (!_bt_steppage(scan, indexdir)) + { + pfree(so->skipScanKey); + so->skipScanKey = NULL; + return false; + } + LockBuffer(so->currPos.buf, BT_READ); + } + + currItem = &so->currPos.items[so->currPos.lastItem]; + itup = (IndexTuple) (so->currTuples + currItem->tupleOffset); + nextOffset = ItemPointerGetOffsetNumber(&itup->t_tid); + + /* + * To check if we returned the same tuple, try to find a + * startItup on the current page. For that we need to update + * scankey to match the whole tuple and set nextkey to return + * an exact tuple, not the next one. If the nextOffset is the + * same as before, it means we are in the loop, return offnum + * to the original position and jump further + */ + scan->xs_itup = startItup; + _bt_update_skip_scankeys(scan, indexRel); + + so->skipScanKey->keysz = IndexRelationGetNumberOfKeyAttributes(indexRel); + so->skipScanKey->nextkey = false; + + if (_bt_scankey_within_page(scan, so->skipScanKey, + so->currPos.buf, dir)) + { + OffsetNumber maxoff; + startOffset = _bt_binsrch(scan->indexRelation, + so->skipScanKey, + so->currPos.buf); + + page = BufferGetPage(so->currPos.buf); + maxoff = PageGetMaxOffsetNumber(page); + + if (nextOffset <= startOffset) + { + offnum = jumpOffset; + nextOffset = startOffset; + } + + if ((offnum > maxoff) & (so->currPos.nextPage == P_NONE)) + { + LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK); + + BTScanPosUnpinIfPinned(so->currPos); + BTScanPosInvalidate(so->currPos) + + pfree(so->skipScanKey); + so->skipScanKey = NULL; + return false; + } + } + + /* Return original scankey options */ + so->skipScanKey->keysz = prefix; + so->skipScanKey->nextkey = ScanDirectionIsForward(dir); + } + } + } + + /* Now read the data */ + if (!_bt_readpage(scan, indexdir, offnum)) + { + /* + * There's no actually-matching data on this page. Try to advance to + * the next page. Return false if there's no matching data at all. + */ + LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK); + if (!_bt_steppage(scan, dir)) + { + pfree(so->skipScanKey); + so->skipScanKey = NULL; + return false; + } + } + else + { + /* Drop the lock, and maybe the pin, on the current page */ + LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK); + } + + /* And set IndexTuple */ + currItem = &so->currPos.items[so->currPos.itemIndex]; + scan->xs_heaptid = currItem->heapTid; + scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset); + return true; +} + /* * _bt_readpage() -- Load data from current index page into so->currPos * @@ -2246,3 +2653,54 @@ _bt_initialize_more_data(BTScanOpaque so, ScanDirection dir) so->numKilled = 0; /* just paranoia */ so->markItemIndex = -1; /* ditto */ } + +/* + * _bt_update_skip_scankeys() -- set up a new values for the existing scankeys + * based on the current index tuple + */ +static inline void +_bt_update_skip_scankeys(IndexScanDesc scan, Relation indexRel) +{ + TupleDesc itupdesc; + int indnkeyatts, + i; + BTScanOpaque so = (BTScanOpaque) scan->opaque; + ScanKey scankeys = so->skipScanKey->scankeys; + + itupdesc = RelationGetDescr(indexRel); + indnkeyatts = IndexRelationGetNumberOfKeyAttributes(indexRel); + for (i = 0; i < indnkeyatts; i++) + { + Datum datum; + bool null; + int flags; + + datum = index_getattr(scan->xs_itup, i + 1, itupdesc, &null); + flags = (null ? SK_ISNULL : 0) | + (indexRel->rd_indoption[i] << SK_BT_INDOPTION_SHIFT); + scankeys[i].sk_flags = flags; + scankeys[i].sk_argument = datum; + } +} + +/* + * _bt_scankey_within_page() -- check if the provided scankey could be found + * within a page, specified by the buffer. + */ +static inline bool +_bt_scankey_within_page(IndexScanDesc scan, BTScanInsert key, + Buffer buf, ScanDirection dir) +{ + OffsetNumber low, high; + Page page = BufferGetPage(buf); + BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page); + + low = P_FIRSTDATAKEY(opaque); + high = PageGetMaxOffsetNumber(page); + + if (unlikely(high < low)) + return false; + + return (_bt_compare(scan->indexRelation, key, page, low) > 0 && + _bt_compare(scan->indexRelation, key, page, high) < 1); +} diff --git a/src/backend/access/spgist/spgutils.c b/src/backend/access/spgist/spgutils.c index 4924ae1c59..fa09a4685e 100644 --- a/src/backend/access/spgist/spgutils.c +++ b/src/backend/access/spgist/spgutils.c @@ -68,6 +68,7 @@ spghandler(PG_FUNCTION_ARGS) amroutine->ambulkdelete = spgbulkdelete; amroutine->amvacuumcleanup = spgvacuumcleanup; amroutine->amcanreturn = spgcanreturn; + amroutine->amskip = NULL; amroutine->amcostestimate = spgcostestimate; amroutine->amoptions = spgoptions; amroutine->amproperty = spgproperty; diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c index c367c750b1..a7dd874531 100644 --- a/src/backend/commands/explain.c +++ b/src/backend/commands/explain.c @@ -141,6 +141,7 @@ static void ExplainXMLTag(const char *tagname, int flags, ExplainState *es); static void ExplainIndentText(ExplainState *es); static void ExplainJSONLineEnding(ExplainState *es); static void ExplainYAMLLineStarting(ExplainState *es); +static void ExplainIndexSkipScanKeys(int skipPrefixSize, ExplainState *es); static void escape_yaml(StringInfo buf, const char *str); @@ -1052,6 +1053,22 @@ ExplainPreScanNode(PlanState *planstate, Bitmapset **rels_used) return planstate_tree_walker(planstate, ExplainPreScanNode, rels_used); } +/* + * ExplainIndexSkipScanKeys - + * Append information about index skip scan to es->str. + * + * Can be used to print the skip prefix size. + */ +static void +ExplainIndexSkipScanKeys(int skipPrefixSize, ExplainState *es) +{ + if (skipPrefixSize > 0) + { + if (es->format != EXPLAIN_FORMAT_TEXT) + ExplainPropertyInteger("Distinct Prefix", NULL, skipPrefixSize, es); + } +} + /* * ExplainNode - * Appends a description of a plan tree to es->str @@ -1386,6 +1403,8 @@ ExplainNode(PlanState *planstate, List *ancestors, { IndexScan *indexscan = (IndexScan *) plan; + ExplainIndexSkipScanKeys(indexscan->indexskipprefixsize, es); + ExplainIndexScanDetails(indexscan->indexid, indexscan->indexorderdir, es); @@ -1396,6 +1415,8 @@ ExplainNode(PlanState *planstate, List *ancestors, { IndexOnlyScan *indexonlyscan = (IndexOnlyScan *) plan; + ExplainIndexSkipScanKeys(indexonlyscan->indexskipprefixsize, es); + ExplainIndexScanDetails(indexonlyscan->indexid, indexonlyscan->indexorderdir, es); @@ -1655,6 +1676,8 @@ ExplainNode(PlanState *planstate, List *ancestors, switch (nodeTag(plan)) { case T_IndexScan: + if (((IndexScan *) plan)->indexskipprefixsize > 0) + ExplainPropertyBool("Skip scan", true, es); show_scan_qual(((IndexScan *) plan)->indexqualorig, "Index Cond", planstate, ancestors, es); if (((IndexScan *) plan)->indexqualorig) @@ -1668,6 +1691,8 @@ ExplainNode(PlanState *planstate, List *ancestors, planstate, es); break; case T_IndexOnlyScan: + if (((IndexOnlyScan *) plan)->indexskipprefixsize > 0) + ExplainPropertyBool("Skip scan", true, es); show_scan_qual(((IndexOnlyScan *) plan)->indexqual, "Index Cond", planstate, ancestors, es); if (((IndexOnlyScan *) plan)->indexqual) diff --git a/src/backend/commands/portalcmds.c b/src/backend/commands/portalcmds.c index 7e5c805a1e..b9999ca3a9 100644 --- a/src/backend/commands/portalcmds.c +++ b/src/backend/commands/portalcmds.c @@ -89,6 +89,8 @@ PerformCursorOpen(ParseState *pstate, DeclareCursorStmt *cstmt, ParamListInfo pa if (query->commandType != CMD_SELECT) elog(ERROR, "non-SELECT statement in DECLARE CURSOR"); + query->hasDeclareCursor = true; + /* Plan the query, applying the specified options */ plan = pg_plan_query(query, cstmt->options, params); diff --git a/src/backend/executor/nodeIndexonlyscan.c b/src/backend/executor/nodeIndexonlyscan.c index 5617ac29e7..76330f7906 100644 --- a/src/backend/executor/nodeIndexonlyscan.c +++ b/src/backend/executor/nodeIndexonlyscan.c @@ -65,6 +65,13 @@ IndexOnlyNext(IndexOnlyScanState *node) IndexScanDesc scandesc; TupleTableSlot *slot; ItemPointer tid; + IndexOnlyScan *indexonlyscan = (IndexOnlyScan *) node->ss.ps.plan; + + /* + * tells if the current position was reached via skipping. In this case + * there is no nead for the index_getnext_tid + */ + bool skipped = false; /* * extract necessary information from index scan node @@ -72,7 +79,7 @@ IndexOnlyNext(IndexOnlyScanState *node) estate = node->ss.ps.state; direction = estate->es_direction; /* flip direction if this is an overall backward scan */ - if (ScanDirectionIsBackward(((IndexOnlyScan *) node->ss.ps.plan)->indexorderdir)) + if (ScanDirectionIsBackward(indexonlyscan->indexorderdir)) { if (ScanDirectionIsForward(direction)) direction = BackwardScanDirection; @@ -115,14 +122,50 @@ IndexOnlyNext(IndexOnlyScanState *node) node->ioss_NumOrderByKeys); } + /* + * Check if we need to skip to the next key prefix, because we've been + * asked to implement DISTINCT. + * + * When fetching a cursor in the direction opposite to a general scan + * direction, the result must be what normal fetching should have + * returned, but in reversed order. In other words, return the last or + * first scanned tuple in a DISTINCT set, depending on a cursor direction. + * Due to that we skip also when the first tuple wasn't emitted yet, but + * the directions are opposite. + */ + if (node->ioss_SkipPrefixSize > 0 && + (node->ioss_FirstTupleEmitted || + ScanDirectionsAreOpposite(direction, indexonlyscan->indexorderdir))) + { + if (!index_skip(scandesc, direction, indexonlyscan->indexorderdir, + !node->ioss_FirstTupleEmitted, node->ioss_SkipPrefixSize)) + { + /* + * Reached end of index. At this point currPos is invalidated, and + * we need to reset ioss_FirstTupleEmitted, since otherwise after + * going backwards, reaching the end of index, and going forward + * again we apply skip again. It would be incorrect and lead to an + * extra skipped item. + */ + node->ioss_FirstTupleEmitted = false; + return ExecClearTuple(slot); + } + else + { + skipped = true; + tid = &scandesc->xs_heaptid; + } + } + /* * OK, now that we have what we need, fetch the next tuple. */ - while ((tid = index_getnext_tid(scandesc, direction)) != NULL) + while (skipped || (tid = index_getnext_tid(scandesc, direction)) != NULL) { bool tuple_from_heap = false; CHECK_FOR_INTERRUPTS(); + skipped = false; /* * We can skip the heap fetch if the TID references a heap page on @@ -250,6 +293,8 @@ IndexOnlyNext(IndexOnlyScanState *node) ItemPointerGetBlockNumber(tid), estate->es_snapshot); + node->ioss_FirstTupleEmitted = true; + return slot; } @@ -504,6 +549,8 @@ ExecInitIndexOnlyScan(IndexOnlyScan *node, EState *estate, int eflags) indexstate->ss.ps.plan = (Plan *) node; indexstate->ss.ps.state = estate; indexstate->ss.ps.ExecProcNode = ExecIndexOnlyScan; + indexstate->ioss_SkipPrefixSize = node->indexskipprefixsize; + indexstate->ioss_FirstTupleEmitted = false; /* * Miscellaneous initialization diff --git a/src/backend/executor/nodeIndexscan.c b/src/backend/executor/nodeIndexscan.c index d0a96a38e0..449aaec3ac 100644 --- a/src/backend/executor/nodeIndexscan.c +++ b/src/backend/executor/nodeIndexscan.c @@ -85,6 +85,13 @@ IndexNext(IndexScanState *node) ScanDirection direction; IndexScanDesc scandesc; TupleTableSlot *slot; + IndexScan *indexscan = (IndexScan *) node->ss.ps.plan; + + /* + * tells if the current position was reached via skipping. In this case + * there is no nead for the index_getnext_tid + */ + bool skipped = false; /* * extract necessary information from index scan node @@ -92,7 +99,7 @@ IndexNext(IndexScanState *node) estate = node->ss.ps.state; direction = estate->es_direction; /* flip direction if this is an overall backward scan */ - if (ScanDirectionIsBackward(((IndexScan *) node->ss.ps.plan)->indexorderdir)) + if (ScanDirectionIsBackward(indexscan->indexorderdir)) { if (ScanDirectionIsForward(direction)) direction = BackwardScanDirection; @@ -117,6 +124,12 @@ IndexNext(IndexScanState *node) node->iss_ScanDesc = scandesc; + /* Index skip scan assumes xs_want_itup, so set it to true */ + if (indexscan->indexskipprefixsize > 0) + node->iss_ScanDesc->xs_want_itup = true; + else + node->iss_ScanDesc->xs_want_itup = false; + /* * If no run-time keys to calculate or they are ready, go ahead and * pass the scankeys to the index AM. @@ -127,12 +140,48 @@ IndexNext(IndexScanState *node) node->iss_OrderByKeys, node->iss_NumOrderByKeys); } + /* + * Check if we need to skip to the next key prefix, because we've been + * asked to implement DISTINCT. + * + * When fetching a cursor in the direction opposite to a general scan + * direction, the result must be what normal fetching should have + * returned, but in reversed order. In other words, return the last or + * first scanned tuple in a DISTINCT set, depending on a cursor direction. + * Due to that we skip also when the first tuple wasn't emitted yet, but + * the directions are opposite. + */ + if (node->iss_SkipPrefixSize > 0 && + (node->iss_FirstTupleEmitted || + ScanDirectionsAreOpposite(direction, indexscan->indexorderdir))) + { + if (!index_skip(scandesc, direction, indexscan->indexorderdir, + !node->iss_FirstTupleEmitted, node->iss_SkipPrefixSize)) + { + /* + * Reached end of index. At this point currPos is invalidated, and + * we need to reset iss_FirstTupleEmitted, since otherwise after + * going backwards, reaching the end of index, and going forward + * again we apply skip again. It would be incorrect and lead to an + * extra skipped item. + */ + node->iss_FirstTupleEmitted = false; + return ExecClearTuple(slot); + } + else + { + skipped = true; + index_fetch_heap(scandesc, slot); + } + } + /* * ok, now that we have what we need, fetch the next tuple. */ - while (index_getnext_slot(scandesc, direction, slot)) + while (skipped || index_getnext_slot(scandesc, direction, slot)) { CHECK_FOR_INTERRUPTS(); + skipped = false; /* * If the index was lossy, we have to recheck the index quals using @@ -149,6 +198,7 @@ IndexNext(IndexScanState *node) } } + node->iss_FirstTupleEmitted = true; return slot; } @@ -910,6 +960,8 @@ ExecInitIndexScan(IndexScan *node, EState *estate, int eflags) indexstate->ss.ps.plan = (Plan *) node; indexstate->ss.ps.state = estate; indexstate->ss.ps.ExecProcNode = ExecIndexScan; + indexstate->iss_SkipPrefixSize = node->indexskipprefixsize; + indexstate->iss_FirstTupleEmitted = false; /* * Miscellaneous initialization diff --git a/src/backend/nodes/copyfuncs.c b/src/backend/nodes/copyfuncs.c index 54ad62bb7f..e0cfd710c4 100644 --- a/src/backend/nodes/copyfuncs.c +++ b/src/backend/nodes/copyfuncs.c @@ -493,6 +493,7 @@ _copyIndexScan(const IndexScan *from) COPY_NODE_FIELD(indexorderbyorig); COPY_NODE_FIELD(indexorderbyops); COPY_SCALAR_FIELD(indexorderdir); + COPY_SCALAR_FIELD(indexskipprefixsize); return newnode; } @@ -518,6 +519,7 @@ _copyIndexOnlyScan(const IndexOnlyScan *from) COPY_NODE_FIELD(indexorderby); COPY_NODE_FIELD(indextlist); COPY_SCALAR_FIELD(indexorderdir); + COPY_SCALAR_FIELD(indexskipprefixsize); return newnode; } diff --git a/src/backend/nodes/outfuncs.c b/src/backend/nodes/outfuncs.c index 16083e7a7e..5f723cda4b 100644 --- a/src/backend/nodes/outfuncs.c +++ b/src/backend/nodes/outfuncs.c @@ -562,6 +562,7 @@ _outIndexScan(StringInfo str, const IndexScan *node) WRITE_NODE_FIELD(indexorderbyorig); WRITE_NODE_FIELD(indexorderbyops); WRITE_ENUM_FIELD(indexorderdir, ScanDirection); + WRITE_INT_FIELD(indexskipprefixsize); } static void @@ -576,6 +577,7 @@ _outIndexOnlyScan(StringInfo str, const IndexOnlyScan *node) WRITE_NODE_FIELD(indexorderby); WRITE_NODE_FIELD(indextlist); WRITE_ENUM_FIELD(indexorderdir, ScanDirection); + WRITE_INT_FIELD(indexskipprefixsize); } static void diff --git a/src/backend/nodes/readfuncs.c b/src/backend/nodes/readfuncs.c index 551ce6c41c..028d03a56d 100644 --- a/src/backend/nodes/readfuncs.c +++ b/src/backend/nodes/readfuncs.c @@ -1820,6 +1820,7 @@ _readIndexScan(void) READ_NODE_FIELD(indexorderbyorig); READ_NODE_FIELD(indexorderbyops); READ_ENUM_FIELD(indexorderdir, ScanDirection); + READ_INT_FIELD(indexskipprefixsize); READ_DONE(); } @@ -1839,6 +1840,7 @@ _readIndexOnlyScan(void) READ_NODE_FIELD(indexorderby); READ_NODE_FIELD(indextlist); READ_ENUM_FIELD(indexorderdir, ScanDirection); + READ_INT_FIELD(indexskipprefixsize); READ_DONE(); } diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c index b5a0033721..710edf160a 100644 --- a/src/backend/optimizer/path/costsize.c +++ b/src/backend/optimizer/path/costsize.c @@ -124,6 +124,7 @@ int max_parallel_workers_per_gather = 2; bool enable_seqscan = true; bool enable_indexscan = true; bool enable_indexonlyscan = true; +bool enable_indexskipscan = true; bool enable_bitmapscan = true; bool enable_tidscan = true; bool enable_sort = true; diff --git a/src/backend/optimizer/plan/createplan.c b/src/backend/optimizer/plan/createplan.c index dff826a828..7b32f2cc7e 100644 --- a/src/backend/optimizer/plan/createplan.c +++ b/src/backend/optimizer/plan/createplan.c @@ -175,12 +175,14 @@ static IndexScan *make_indexscan(List *qptlist, List *qpqual, Index scanrelid, Oid indexid, List *indexqual, List *indexqualorig, List *indexorderby, List *indexorderbyorig, List *indexorderbyops, - ScanDirection indexscandir); + ScanDirection indexscandir, + int skipprefix); static IndexOnlyScan *make_indexonlyscan(List *qptlist, List *qpqual, Index scanrelid, Oid indexid, List *indexqual, List *indexorderby, List *indextlist, - ScanDirection indexscandir); + ScanDirection indexscandir, + int skipprefix); static BitmapIndexScan *make_bitmap_indexscan(Index scanrelid, Oid indexid, List *indexqual, List *indexqualorig); @@ -2910,7 +2912,8 @@ create_indexscan_plan(PlannerInfo *root, fixed_indexquals, fixed_indexorderbys, best_path->indexinfo->indextlist, - best_path->indexscandir); + best_path->indexscandir, + best_path->indexskipprefix); else scan_plan = (Scan *) make_indexscan(tlist, qpqual, @@ -2921,7 +2924,8 @@ create_indexscan_plan(PlannerInfo *root, fixed_indexorderbys, indexorderbys, indexorderbyops, - best_path->indexscandir); + best_path->indexscandir, + best_path->indexskipprefix); copy_generic_path_info(&scan_plan->plan, &best_path->path); @@ -5184,7 +5188,8 @@ make_indexscan(List *qptlist, List *indexorderby, List *indexorderbyorig, List *indexorderbyops, - ScanDirection indexscandir) + ScanDirection indexscandir, + int skipPrefixSize) { IndexScan *node = makeNode(IndexScan); Plan *plan = &node->scan.plan; @@ -5201,6 +5206,7 @@ make_indexscan(List *qptlist, node->indexorderbyorig = indexorderbyorig; node->indexorderbyops = indexorderbyops; node->indexorderdir = indexscandir; + node->indexskipprefixsize = skipPrefixSize; return node; } @@ -5213,7 +5219,8 @@ make_indexonlyscan(List *qptlist, List *indexqual, List *indexorderby, List *indextlist, - ScanDirection indexscandir) + ScanDirection indexscandir, + int skipPrefixSize) { IndexOnlyScan *node = makeNode(IndexOnlyScan); Plan *plan = &node->scan.plan; @@ -5228,6 +5235,7 @@ make_indexonlyscan(List *qptlist, node->indexorderby = indexorderby; node->indextlist = indextlist; node->indexorderdir = indexscandir; + node->indexskipprefixsize = skipPrefixSize; return node; } diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c index 984fca0696..777ae3ee0f 100644 --- a/src/backend/optimizer/plan/planner.c +++ b/src/backend/optimizer/plan/planner.c @@ -4834,6 +4834,83 @@ create_distinct_paths(PlannerInfo *root, path, list_length(root->distinct_pathkeys), numDistinctRows)); + + /* Consider index skip scan as well */ + if (enable_indexskipscan && + IsA(path, IndexPath) && + root->parse->hasDeclareCursor == false && + ((IndexPath *) path)->indexinfo->amcanskip && + root->distinct_pathkeys != NIL) + { + ListCell *lc; + IndexOptInfo *index = NULL; + bool different_columns_order = false, + not_empty_qual = false; + int i = 0; + int distinctPrefixKeys; + + Assert(path->pathtype == T_IndexOnlyScan || + path->pathtype == T_IndexScan); + + index = ((IndexPath *) path)->indexinfo; + distinctPrefixKeys = list_length(root->query_uniquekeys); + + /* + * Normally we can think about distinctPrefixKeys as just + * a number of distinct keys. But if lets say we have a + * distinct key a, and the index contains b, a in exactly + * this order. In such situation we need to use position + * of a in the index as distinctPrefixKeys, otherwise skip + * will happen only by the first column. + */ + foreach(lc, root->query_uniquekeys) + { + UniqueKey *uniquekey = (UniqueKey *) lfirst(lc); + EquivalenceMember *em = + lfirst_node(EquivalenceMember, + list_head(uniquekey->eq_clause->ec_members)); + Var *var = (Var *) em->em_expr; + + Assert(i < index->ncolumns); + + for (i = 0; i < index->ncolumns; i++) + { + if (index->indexkeys[i] == var->varattno) + { + distinctPrefixKeys = Max(i + 1, distinctPrefixKeys); + break; + } + } + } + + /* + * XXX: In case of index scan quals evaluation happens + * after ExecScanFetch, which means skip results could be + * fitered out. Consider the following query: + * + * select distinct (a, b) a, b, c from t where c < 100; + * + * Skip scan returns one tuple for one distinct set of (a, + * b) with arbitrary one of c, so if the choosed c does + * not match the qual and there is any c that matches the + * qual, we miss that tuple. + */ + if (path->pathtype == T_IndexScan && + parse->jointree != NULL && + parse->jointree->quals != NULL && + list_length((List *) parse->jointree->quals) != 0) + not_empty_qual = true; + + if (!different_columns_order && !not_empty_qual) + { + add_path(distinct_rel, (Path *) + create_skipscan_unique_path(root, + distinct_rel, + path, + distinctPrefixKeys, + numDistinctRows)); + } + } } } diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c index a006dbbe9c..2fb18fb372 100644 --- a/src/backend/optimizer/util/pathnode.c +++ b/src/backend/optimizer/util/pathnode.c @@ -2915,6 +2915,46 @@ create_upper_unique_path(PlannerInfo *root, return pathnode; } +/* + * create_skipscan_unique_path + * Creates a pathnode the same as an existing IndexPath except based on + * skipping duplicate values. This may or may not be cheaper than using + * create_upper_unique_path. + * + * The input path must be an IndexPath for an index that supports amskip. + */ +IndexPath * +create_skipscan_unique_path(PlannerInfo *root, + RelOptInfo *rel, + Path *basepath, + int distinctPrefixKeys, + double numGroups) +{ + IndexPath *pathnode = makeNode(IndexPath); + + Assert(IsA(basepath, IndexPath)); + + /* We don't want to modify basepath, so make a copy. */ + memcpy(pathnode, basepath, sizeof(IndexPath)); + + /* The size of the prefix we'll use for skipping. */ + Assert(pathnode->indexinfo->amcanskip); + Assert(distinctPrefixKeys > 0); + /*Assert(distinctPrefixKeys <= list_length(pathnode->path.pathkeys));*/ + pathnode->indexskipprefix = distinctPrefixKeys; + + /* + * The cost to skip to each distinct value should be roughly the same as + * the cost of finding the first key times the number of distinct values + * we expect to find. + */ + pathnode->path.startup_cost = basepath->startup_cost; + pathnode->path.total_cost = basepath->startup_cost * numGroups; + pathnode->path.rows = numGroups; + + return pathnode; +} + /* * create_agg_path * Creates a pathnode that represents performing aggregation/grouping diff --git a/src/backend/optimizer/util/plancat.c b/src/backend/optimizer/util/plancat.c index d82fc5ab8b..f65b299f37 100644 --- a/src/backend/optimizer/util/plancat.c +++ b/src/backend/optimizer/util/plancat.c @@ -271,6 +271,7 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent, info->amoptionalkey = amroutine->amoptionalkey; info->amsearcharray = amroutine->amsearcharray; info->amsearchnulls = amroutine->amsearchnulls; + info->amcanskip = (amroutine->amskip != NULL); info->amcanparallel = amroutine->amcanparallel; info->amhasgettuple = (amroutine->amgettuple != NULL); info->amhasgetbitmap = amroutine->amgetbitmap != NULL && diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index cacbe904db..7c71ee4499 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -923,6 +923,15 @@ static struct config_bool ConfigureNamesBool[] = true, NULL, NULL, NULL }, + { + {"enable_indexskipscan", PGC_USERSET, QUERY_TUNING_METHOD, + gettext_noop("Enables the planner's use of index-skip-scan plans."), + NULL + }, + &enable_indexskipscan, + true, + NULL, NULL, NULL + }, { {"enable_bitmapscan", PGC_USERSET, QUERY_TUNING_METHOD, gettext_noop("Enables the planner's use of bitmap-scan plans."), diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample index e1048c0047..a002ee2143 100644 --- a/src/backend/utils/misc/postgresql.conf.sample +++ b/src/backend/utils/misc/postgresql.conf.sample @@ -353,6 +353,7 @@ #enable_hashjoin = on #enable_indexscan = on #enable_indexonlyscan = on +#enable_indexskipscan = on #enable_material = on #enable_mergejoin = on #enable_nestloop = on diff --git a/src/include/access/amapi.h b/src/include/access/amapi.h index 3b3e22f73d..3d39cd9d07 100644 --- a/src/include/access/amapi.h +++ b/src/include/access/amapi.h @@ -130,6 +130,13 @@ typedef void (*amrescan_function) (IndexScanDesc scan, typedef bool (*amgettuple_function) (IndexScanDesc scan, ScanDirection direction); +/* skip past duplicates in a given prefix */ +typedef bool (*amskip_function) (IndexScanDesc scan, + ScanDirection dir, + ScanDirection indexdir, + bool start, + int prefix); + /* fetch all valid tuples */ typedef int64 (*amgetbitmap_function) (IndexScanDesc scan, TIDBitmap *tbm); @@ -229,6 +236,7 @@ typedef struct IndexAmRoutine amendscan_function amendscan; ammarkpos_function ammarkpos; /* can be NULL */ amrestrpos_function amrestrpos; /* can be NULL */ + amskip_function amskip; /* can be NULL */ /* interface functions to support parallel index scans */ amestimateparallelscan_function amestimateparallelscan; /* can be NULL */ diff --git a/src/include/access/genam.h b/src/include/access/genam.h index 7e9364a50c..815de4e4dd 100644 --- a/src/include/access/genam.h +++ b/src/include/access/genam.h @@ -180,6 +180,8 @@ extern IndexBulkDeleteResult *index_bulk_delete(IndexVacuumInfo *info, extern IndexBulkDeleteResult *index_vacuum_cleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats); extern bool index_can_return(Relation indexRelation, int attno); +extern bool index_skip(IndexScanDesc scan, ScanDirection direction, + ScanDirection indexdir, bool start, int prefix); extern RegProcedure index_getprocid(Relation irel, AttrNumber attnum, uint16 procnum); extern FmgrInfo *index_getprocinfo(Relation irel, AttrNumber attnum, diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h index 20ace69dab..e098c6a1ab 100644 --- a/src/include/access/nbtree.h +++ b/src/include/access/nbtree.h @@ -662,6 +662,9 @@ typedef struct BTScanOpaqueData */ int markItemIndex; /* itemIndex, or -1 if not valid */ + /* Work space for _bt_skip */ + BTScanInsert skipScanKey; /* used to control skipping */ + /* keep these last in struct for efficiency */ BTScanPosData currPos; /* current position data */ BTScanPosData markPos; /* marked position, if any */ @@ -793,6 +796,8 @@ extern OffsetNumber _bt_binsrch_insert(Relation rel, BTInsertState insertstate); extern int32 _bt_compare(Relation rel, BTScanInsert key, Page page, OffsetNumber offnum); extern bool _bt_first(IndexScanDesc scan, ScanDirection dir); extern bool _bt_next(IndexScanDesc scan, ScanDirection dir); +extern bool _bt_skip(IndexScanDesc scan, ScanDirection dir, + ScanDirection indexdir, bool start, int prefix); extern Buffer _bt_get_endpoint(Relation rel, uint32 level, bool rightmost, Snapshot snapshot); @@ -817,6 +822,8 @@ extern void _bt_end_vacuum_callback(int code, Datum arg); extern Size BTreeShmemSize(void); extern void BTreeShmemInit(void); extern bytea *btoptions(Datum reloptions, bool validate); +extern bool btskip(IndexScanDesc scan, ScanDirection dir, + ScanDirection indexdir, bool start, int prefix); extern bool btproperty(Oid index_oid, int attno, IndexAMProperty prop, const char *propname, bool *res, bool *isnull); diff --git a/src/include/access/sdir.h b/src/include/access/sdir.h index 23feb90986..094a127464 100644 --- a/src/include/access/sdir.h +++ b/src/include/access/sdir.h @@ -55,4 +55,11 @@ typedef enum ScanDirection #define ScanDirectionIsForward(direction) \ ((bool) ((direction) == ForwardScanDirection)) +/* + * ScanDirectionsAreOpposite + * True iff scan directions are backward/forward or forward/backward. + */ +#define ScanDirectionsAreOpposite(dirA, dirB) \ + ((bool) (dirA != NoMovementScanDirection && dirA == -dirB)) + #endif /* SDIR_H */ diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h index 1f6f5bbc20..2c6acc160a 100644 --- a/src/include/nodes/execnodes.h +++ b/src/include/nodes/execnodes.h @@ -1423,6 +1423,8 @@ typedef struct IndexScanState ExprContext *iss_RuntimeContext; Relation iss_RelationDesc; struct IndexScanDescData *iss_ScanDesc; + int iss_SkipPrefixSize; + bool iss_FirstTupleEmitted; /* These are needed for re-checking ORDER BY expr ordering */ pairingheap *iss_ReorderQueue; @@ -1452,6 +1454,8 @@ typedef struct IndexScanState * TableSlot slot for holding tuples fetched from the table * VMBuffer buffer in use for visibility map testing, if any * PscanLen size of parallel index-only scan descriptor + * SkipPrefixSize number of keys for skip-based DISTINCT + * FirstTupleEmitted has the first tuple been emitted * ---------------- */ typedef struct IndexOnlyScanState @@ -1470,6 +1474,8 @@ typedef struct IndexOnlyScanState struct IndexScanDescData *ioss_ScanDesc; TupleTableSlot *ioss_TableSlot; Buffer ioss_VMBuffer; + int ioss_SkipPrefixSize; + bool ioss_FirstTupleEmitted; Size ioss_PscanLen; } IndexOnlyScanState; diff --git a/src/include/nodes/parsenodes.h b/src/include/nodes/parsenodes.h index da0706add5..f69468d56a 100644 --- a/src/include/nodes/parsenodes.h +++ b/src/include/nodes/parsenodes.h @@ -131,6 +131,7 @@ typedef struct Query bool hasModifyingCTE; /* has INSERT/UPDATE/DELETE in WITH */ bool hasForUpdate; /* FOR [KEY] UPDATE/SHARE was specified */ bool hasRowSecurity; /* rewriter has applied some RLS policy */ + bool hasDeclareCursor; /* has declaring cursor */ List *cteList; /* WITH list (of CommonTableExpr's) */ diff --git a/src/include/nodes/pathnodes.h b/src/include/nodes/pathnodes.h index 4e329f0fb5..b0ff9ca3a8 100644 --- a/src/include/nodes/pathnodes.h +++ b/src/include/nodes/pathnodes.h @@ -839,6 +839,7 @@ struct IndexOptInfo bool amsearchnulls; /* can AM search for NULL/NOT NULL entries? */ bool amhasgettuple; /* does AM have amgettuple interface? */ bool amhasgetbitmap; /* does AM have amgetbitmap interface? */ + bool amcanskip; /* can AM skip duplicate values? */ bool amcanparallel; /* does AM support parallel scan? */ /* Rather than include amapi.h here, we declare amcostestimate like this */ void (*amcostestimate) (); /* AM's cost estimator */ @@ -1189,6 +1190,9 @@ typedef struct Path * we need not recompute them when considering using the same index in a * bitmap index/heap scan (see BitmapHeapPath). The costs of the IndexPath * itself represent the costs of an IndexScan or IndexOnlyScan plan type. + * + * 'indexskipprefix' represents the number of columns to consider for skip + * scans. *---------- */ typedef struct IndexPath @@ -1201,6 +1205,7 @@ typedef struct IndexPath ScanDirection indexscandir; Cost indextotalcost; Selectivity indexselectivity; + int indexskipprefix; } IndexPath; /* diff --git a/src/include/nodes/plannodes.h b/src/include/nodes/plannodes.h index 32c0d87f80..03a00e8e1d 100644 --- a/src/include/nodes/plannodes.h +++ b/src/include/nodes/plannodes.h @@ -409,6 +409,8 @@ typedef struct IndexScan List *indexorderbyorig; /* the same in original form */ List *indexorderbyops; /* OIDs of sort ops for ORDER BY exprs */ ScanDirection indexorderdir; /* forward or backward or don't care */ + int indexskipprefixsize; /* the size of the prefix for distinct + * scans */ } IndexScan; /* ---------------- @@ -436,6 +438,8 @@ typedef struct IndexOnlyScan List *indexorderby; /* list of index ORDER BY exprs */ List *indextlist; /* TargetEntry list describing index's cols */ ScanDirection indexorderdir; /* forward or backward or don't care */ + int indexskipprefixsize; /* the size of the prefix for distinct + * scans */ } IndexOnlyScan; /* ---------------- diff --git a/src/include/optimizer/cost.h b/src/include/optimizer/cost.h index cb012ba198..847f34f02b 100644 --- a/src/include/optimizer/cost.h +++ b/src/include/optimizer/cost.h @@ -50,6 +50,7 @@ extern PGDLLIMPORT int max_parallel_workers_per_gather; extern PGDLLIMPORT bool enable_seqscan; extern PGDLLIMPORT bool enable_indexscan; extern PGDLLIMPORT bool enable_indexonlyscan; +extern PGDLLIMPORT bool enable_indexskipscan; extern PGDLLIMPORT bool enable_bitmapscan; extern PGDLLIMPORT bool enable_tidscan; extern PGDLLIMPORT bool enable_sort; diff --git a/src/include/optimizer/pathnode.h b/src/include/optimizer/pathnode.h index f75ff6f323..6c8c9dadbb 100644 --- a/src/include/optimizer/pathnode.h +++ b/src/include/optimizer/pathnode.h @@ -201,6 +201,11 @@ extern UpperUniquePath *create_upper_unique_path(PlannerInfo *root, Path *subpath, int numCols, double numGroups); +extern IndexPath *create_skipscan_unique_path(PlannerInfo *root, + RelOptInfo *rel, + Path *subpath, + int numCols, + double numGroups); extern AggPath *create_agg_path(PlannerInfo *root, RelOptInfo *rel, Path *subpath, diff --git a/src/test/regress/expected/select_distinct.out b/src/test/regress/expected/select_distinct.out index f3696c6d1d..259db10c81 100644 --- a/src/test/regress/expected/select_distinct.out +++ b/src/test/regress/expected/select_distinct.out @@ -244,3 +244,604 @@ SELECT null IS NOT DISTINCT FROM null as "yes"; t (1 row) +-- index only skip scan +CREATE TABLE distinct_a (a int, b int, c int); +INSERT INTO distinct_a ( + SELECT five, tenthous, 10 FROM + generate_series(1, 5) five, + generate_series(1, 10000) tenthous +); +CREATE INDEX ON distinct_a (a, b); +ANALYZE distinct_a; +SELECT DISTINCT a FROM distinct_a; + a +--- + 1 + 2 + 3 + 4 + 5 +(5 rows) + +SELECT DISTINCT a FROM distinct_a WHERE a = 1; + a +--- + 1 +(1 row) + +SELECT DISTINCT a FROM distinct_a ORDER BY a DESC; + a +--- + 5 + 4 + 3 + 2 + 1 +(5 rows) + +EXPLAIN (COSTS OFF) +SELECT DISTINCT a FROM distinct_a; + QUERY PLAN +-------------------------------------------------------- + Index Only Scan using distinct_a_a_b_idx on distinct_a + Skip scan: true +(2 rows) + +-- test index skip scan with a condition on a non unique field +SELECT DISTINCT ON (a) a, b FROM distinct_a WHERE b = 2; + a | b +---+--- + 1 | 2 + 2 | 2 + 3 | 2 + 4 | 2 + 5 | 2 +(5 rows) + +-- test index skip scan backwards +SELECT DISTINCT ON (a) a, b FROM distinct_a ORDER BY a DESC, b DESC; + a | b +---+------- + 5 | 10000 + 4 | 10000 + 3 | 10000 + 2 | 10000 + 1 | 10000 +(5 rows) + +-- check colums order +CREATE INDEX distinct_a_b_a on distinct_a (b, a); +SELECT DISTINCT a FROM distinct_a WHERE b = 2; + a +--- + 1 + 2 + 3 + 4 + 5 +(5 rows) + +SELECT DISTINCT on (a, b) a, b FROM distinct_a WHERE b = 2; + a | b +---+--- + 1 | 2 + 2 | 2 + 3 | 2 + 4 | 2 + 5 | 2 +(5 rows) + +EXPLAIN (COSTS OFF) +SELECT DISTINCT a FROM distinct_a WHERE b = 2; + QUERY PLAN +---------------------------------------------------- + Index Only Scan using distinct_a_b_a on distinct_a + Skip scan: true + Index Cond: (b = 2) +(3 rows) + +EXPLAIN (COSTS OFF) +SELECT DISTINCT on (a, b) a, b FROM distinct_a WHERE b = 2; + QUERY PLAN +---------------------------------------------------- + Index Only Scan using distinct_a_b_a on distinct_a + Skip scan: true + Index Cond: (b = 2) +(3 rows) + +DROP INDEX distinct_a_b_a; +-- test opposite scan/index directions inside a cursor +-- forward/backward +BEGIN; +DECLARE c SCROLL CURSOR FOR +SELECT DISTINCT ON (a) a,b FROM distinct_a ORDER BY a, b; +FETCH FROM c; + a | b +---+--- + 1 | 1 +(1 row) + +FETCH BACKWARD FROM c; + a | b +---+--- +(0 rows) + +FETCH 6 FROM c; + a | b +---+--- + 1 | 1 + 2 | 1 + 3 | 1 + 4 | 1 + 5 | 1 +(5 rows) + +FETCH BACKWARD 6 FROM c; + a | b +---+--- + 5 | 1 + 4 | 1 + 3 | 1 + 2 | 1 + 1 | 1 +(5 rows) + +FETCH 6 FROM c; + a | b +---+--- + 1 | 1 + 2 | 1 + 3 | 1 + 4 | 1 + 5 | 1 +(5 rows) + +FETCH BACKWARD 6 FROM c; + a | b +---+--- + 5 | 1 + 4 | 1 + 3 | 1 + 2 | 1 + 1 | 1 +(5 rows) + +END; +-- backward/forward +BEGIN; +DECLARE c SCROLL CURSOR FOR +SELECT DISTINCT ON (a) a,b FROM distinct_a ORDER BY a DESC, b DESC; +FETCH FROM c; + a | b +---+------- + 5 | 10000 +(1 row) + +FETCH BACKWARD FROM c; + a | b +---+--- +(0 rows) + +FETCH 6 FROM c; + a | b +---+------- + 5 | 10000 + 4 | 10000 + 3 | 10000 + 2 | 10000 + 1 | 10000 +(5 rows) + +FETCH BACKWARD 6 FROM c; + a | b +---+------- + 1 | 10000 + 2 | 10000 + 3 | 10000 + 4 | 10000 + 5 | 10000 +(5 rows) + +FETCH 6 FROM c; + a | b +---+------- + 5 | 10000 + 4 | 10000 + 3 | 10000 + 2 | 10000 + 1 | 10000 +(5 rows) + +FETCH BACKWARD 6 FROM c; + a | b +---+------- + 1 | 10000 + 2 | 10000 + 3 | 10000 + 4 | 10000 + 5 | 10000 +(5 rows) + +END; +-- test missing values and skipping from the end +CREATE TABLE distinct_abc(a int, b int, c int); +CREATE INDEX ON distinct_abc(a, b, c); +INSERT INTO distinct_abc + VALUES (1, 1, 1), + (1, 1, 2), + (1, 2, 2), + (1, 2, 3), + (2, 2, 1), + (2, 2, 3), + (3, 1, 1), + (3, 1, 2), + (3, 2, 2), + (3, 2, 3); +EXPLAIN (COSTS OFF) +SELECT DISTINCT ON (a) a,b,c FROM distinct_abc WHERE c = 2; + QUERY PLAN +-------------------------------------------------------------- + Index Only Scan using distinct_abc_a_b_c_idx on distinct_abc + Skip scan: true + Index Cond: (c = 2) +(3 rows) + +BEGIN; +DECLARE c SCROLL CURSOR FOR +SELECT DISTINCT ON (a) a,b,c FROM distinct_abc WHERE c = 2; +FETCH ALL FROM c; + a | b | c +---+---+--- + 1 | 1 | 2 + 3 | 1 | 2 +(2 rows) + +FETCH BACKWARD ALL FROM c; + a | b | c +---+---+--- + 3 | 1 | 2 + 1 | 1 | 2 +(2 rows) + +END; +EXPLAIN (COSTS OFF) +SELECT DISTINCT ON (a) a,b,c FROM distinct_abc WHERE c = 2 +ORDER BY a DESC, b DESC; + QUERY PLAN +----------------------------------------------------------------------- + Index Only Scan Backward using distinct_abc_a_b_c_idx on distinct_abc + Skip scan: true + Index Cond: (c = 2) +(3 rows) + +BEGIN; +DECLARE c SCROLL CURSOR FOR +SELECT DISTINCT ON (a) a,b,c FROM distinct_abc WHERE c = 2 +ORDER BY a DESC, b DESC; +FETCH ALL FROM c; + a | b | c +---+---+--- + 3 | 2 | 2 + 1 | 2 | 2 +(2 rows) + +FETCH BACKWARD ALL FROM c; + a | b | c +---+---+--- + 1 | 2 | 2 + 3 | 2 | 2 +(2 rows) + +END; +DROP TABLE distinct_abc; +-- index skip scan +SELECT DISTINCT ON (a) a, b, c +FROM distinct_a ORDER BY a; + a | b | c +---+---+---- + 1 | 1 | 10 + 2 | 1 | 10 + 3 | 1 | 10 + 4 | 1 | 10 + 5 | 1 | 10 +(5 rows) + +SELECT DISTINCT ON (a) a, b, c +FROM distinct_a WHERE a = 1 ORDER BY a; + a | b | c +---+---+---- + 1 | 1 | 10 +(1 row) + +EXPLAIN (COSTS OFF) +SELECT DISTINCT ON (a) a, b, c +FROM distinct_a ORDER BY a; + QUERY PLAN +--------------------------------------------------- + Index Scan using distinct_a_a_b_idx on distinct_a + Skip scan: true +(2 rows) + +EXPLAIN (COSTS OFF) +SELECT DISTINCT ON (a) a, b, c +FROM distinct_a WHERE a = 1 ORDER BY a; + QUERY PLAN +----------------------------------------------------- + Unique + -> Bitmap Heap Scan on distinct_a + Recheck Cond: (a = 1) + -> Bitmap Index Scan on distinct_a_a_b_idx + Index Cond: (a = 1) +(5 rows) + +-- check colums order +SELECT DISTINCT a FROM distinct_a WHERE b = 2 AND c = 10; + a +--- + 1 + 2 + 3 + 4 + 5 +(5 rows) + +EXPLAIN (COSTS OFF) +SELECT DISTINCT a FROM distinct_a WHERE b = 2 AND c = 10; + QUERY PLAN +--------------------------------------------------------- + Unique + -> Index Scan using distinct_a_a_b_idx on distinct_a + Index Cond: (b = 2) + Filter: (c = 10) +(4 rows) + +-- check projection case +SELECT DISTINCT a, a FROM distinct_a WHERE b = 2; + a | a +---+--- + 1 | 1 + 2 | 2 + 3 | 3 + 4 | 4 + 5 | 5 +(5 rows) + +SELECT DISTINCT a, 1 FROM distinct_a WHERE b = 2; + a | ?column? +---+---------- + 1 | 1 + 2 | 1 + 3 | 1 + 4 | 1 + 5 | 1 +(5 rows) + +-- test cursor forward/backward movements +BEGIN; +DECLARE c SCROLL CURSOR FOR SELECT DISTINCT a FROM distinct_a; +FETCH FROM c; + a +--- + 1 +(1 row) + +FETCH BACKWARD FROM c; + a +--- +(0 rows) + +FETCH 6 FROM c; + a +--- + 1 + 2 + 3 + 4 + 5 +(5 rows) + +FETCH BACKWARD 6 FROM c; + a +--- + 5 + 4 + 3 + 2 + 1 +(5 rows) + +FETCH 6 FROM c; + a +--- + 1 + 2 + 3 + 4 + 5 +(5 rows) + +FETCH BACKWARD 6 FROM c; + a +--- + 5 + 4 + 3 + 2 + 1 +(5 rows) + +END; +DROP TABLE distinct_a; +-- test tuples visibility +CREATE TABLE distinct_visibility (a int, b int); +INSERT INTO distinct_visibility (select a, b from generate_series(1,5) a, generate_series(1, 10000) b); +CREATE INDEX ON distinct_visibility (a, b); +ANALYZE distinct_visibility; +SELECT DISTINCT ON (a) a, b FROM distinct_visibility ORDER BY a, b; + a | b +---+--- + 1 | 1 + 2 | 1 + 3 | 1 + 4 | 1 + 5 | 1 +(5 rows) + +DELETE FROM distinct_visibility WHERE a = 2 and b = 1; +SELECT DISTINCT ON (a) a, b FROM distinct_visibility ORDER BY a, b; + a | b +---+--- + 1 | 1 + 2 | 2 + 3 | 1 + 4 | 1 + 5 | 1 +(5 rows) + +SELECT DISTINCT ON (a) a, b FROM distinct_visibility ORDER BY a DESC, b DESC; + a | b +---+------- + 5 | 10000 + 4 | 10000 + 3 | 10000 + 2 | 10000 + 1 | 10000 +(5 rows) + +DELETE FROM distinct_visibility WHERE a = 2 and b = 10000; +SELECT DISTINCT ON (a) a, b FROM distinct_visibility ORDER BY a DESC, b DESC; + a | b +---+------- + 5 | 10000 + 4 | 10000 + 3 | 10000 + 2 | 9999 + 1 | 10000 +(5 rows) + +DROP TABLE distinct_visibility; +-- test page boundaries +CREATE TABLE distinct_boundaries AS + SELECT a, b::int2 b, (b % 2)::int2 c FROM + generate_series(1, 5) a, + generate_series(1,366) b; +CREATE INDEX ON distinct_boundaries (a, b, c); +ANALYZE distinct_boundaries; +EXPLAIN (COSTS OFF) +SELECT DISTINCT ON (a) a, b, c from distinct_boundaries +WHERE b >= 1 and c = 0 ORDER BY a, b; + QUERY PLAN +---------------------------------------------------------------------------- + Index Only Scan using distinct_boundaries_a_b_c_idx on distinct_boundaries + Skip scan: true + Index Cond: ((b >= 1) AND (c = 0)) +(3 rows) + +SELECT DISTINCT ON (a) a, b, c from distinct_boundaries +WHERE b >= 1 and c = 0 ORDER BY a, b; + a | b | c +---+---+--- + 1 | 2 | 0 + 2 | 2 | 0 + 3 | 2 | 0 + 4 | 2 | 0 + 5 | 2 | 0 +(5 rows) + +DROP TABLE distinct_boundaries; +-- test tuple killing +-- DESC ordering +CREATE TABLE distinct_killed AS + SELECT a, b, b % 2 AS c, 10 AS d + FROM generate_series(1, 5) a, + generate_series(1,1000) b; +CREATE INDEX ON distinct_killed (a, b, c, d); +DELETE FROM distinct_killed where a = 3; +BEGIN; + DECLARE c SCROLL CURSOR FOR + SELECT DISTINCT ON (a) a,b,c,d + FROM distinct_killed ORDER BY a DESC, b DESC; + FETCH FORWARD ALL FROM c; + a | b | c | d +---+------+---+---- + 5 | 1000 | 0 | 10 + 4 | 1000 | 0 | 10 + 2 | 1000 | 0 | 10 + 1 | 1000 | 0 | 10 +(4 rows) + + FETCH BACKWARD ALL FROM c; + a | b | c | d +---+------+---+---- + 1 | 1000 | 0 | 10 + 2 | 1000 | 0 | 10 + 4 | 1000 | 0 | 10 + 5 | 1000 | 0 | 10 +(4 rows) + +COMMIT; +DROP TABLE distinct_killed; +-- regular ordering +CREATE TABLE distinct_killed AS + SELECT a, b, b % 2 AS c, 10 AS d + FROM generate_series(1, 5) a, + generate_series(1,1000) b; +CREATE INDEX ON distinct_killed (a, b, c, d); +DELETE FROM distinct_killed where a = 3; +BEGIN; + DECLARE c SCROLL CURSOR FOR + SELECT DISTINCT ON (a) a,b,c,d + FROM distinct_killed ORDER BY a, b; + FETCH FORWARD ALL FROM c; + a | b | c | d +---+---+---+---- + 1 | 1 | 1 | 10 + 2 | 1 | 1 | 10 + 4 | 1 | 1 | 10 + 5 | 1 | 1 | 10 +(4 rows) + + FETCH BACKWARD ALL FROM c; + a | b | c | d +---+---+---+---- + 5 | 1 | 1 | 10 + 4 | 1 | 1 | 10 + 2 | 1 | 1 | 10 + 1 | 1 | 1 | 10 +(4 rows) + +COMMIT; +DROP TABLE distinct_killed; +-- partial delete +CREATE TABLE distinct_killed AS + SELECT a, b, b % 2 AS c, 10 AS d + FROM generate_series(1, 5) a, + generate_series(1,1000) b; +CREATE INDEX ON distinct_killed (a, b, c, d); +DELETE FROM distinct_killed WHERE a = 3 AND b <= 999; +BEGIN; + DECLARE c SCROLL CURSOR FOR + SELECT DISTINCT ON (a) a,b,c,d + FROM distinct_killed ORDER BY a DESC, b DESC; + FETCH FORWARD ALL FROM c; + a | b | c | d +---+------+---+---- + 5 | 1000 | 0 | 10 + 4 | 1000 | 0 | 10 + 3 | 1000 | 0 | 10 + 2 | 1000 | 0 | 10 + 1 | 1000 | 0 | 10 +(5 rows) + + FETCH BACKWARD ALL FROM c; + a | b | c | d +---+------+---+---- + 1 | 1000 | 0 | 10 + 2 | 1000 | 0 | 10 + 3 | 1000 | 0 | 10 + 4 | 1000 | 0 | 10 + 5 | 1000 | 0 | 10 +(5 rows) + +COMMIT; +DROP TABLE distinct_killed; diff --git a/src/test/regress/expected/sysviews.out b/src/test/regress/expected/sysviews.out index a1c90eb905..bd3b373515 100644 --- a/src/test/regress/expected/sysviews.out +++ b/src/test/regress/expected/sysviews.out @@ -78,6 +78,7 @@ select name, setting from pg_settings where name like 'enable%'; enable_hashjoin | on enable_indexonlyscan | on enable_indexscan | on + enable_indexskipscan | on enable_material | on enable_mergejoin | on enable_nestloop | on @@ -89,7 +90,7 @@ select name, setting from pg_settings where name like 'enable%'; enable_seqscan | on enable_sort | on enable_tidscan | on -(17 rows) +(18 rows) -- Test that the pg_timezone_names and pg_timezone_abbrevs views are -- more-or-less working. We can't test their contents in any great detail diff --git a/src/test/regress/sql/select_distinct.sql b/src/test/regress/sql/select_distinct.sql index a605e86449..843efeb28f 100644 --- a/src/test/regress/sql/select_distinct.sql +++ b/src/test/regress/sql/select_distinct.sql @@ -73,3 +73,251 @@ SELECT 1 IS NOT DISTINCT FROM 2 as "no"; SELECT 2 IS NOT DISTINCT FROM 2 as "yes"; SELECT 2 IS NOT DISTINCT FROM null as "no"; SELECT null IS NOT DISTINCT FROM null as "yes"; + +-- index only skip scan +CREATE TABLE distinct_a (a int, b int, c int); +INSERT INTO distinct_a ( + SELECT five, tenthous, 10 FROM + generate_series(1, 5) five, + generate_series(1, 10000) tenthous +); +CREATE INDEX ON distinct_a (a, b); +ANALYZE distinct_a; + +SELECT DISTINCT a FROM distinct_a; +SELECT DISTINCT a FROM distinct_a WHERE a = 1; +SELECT DISTINCT a FROM distinct_a ORDER BY a DESC; + +EXPLAIN (COSTS OFF) +SELECT DISTINCT a FROM distinct_a; + +-- test index skip scan with a condition on a non unique field +SELECT DISTINCT ON (a) a, b FROM distinct_a WHERE b = 2; + +-- test index skip scan backwards +SELECT DISTINCT ON (a) a, b FROM distinct_a ORDER BY a DESC, b DESC; + +-- check colums order +CREATE INDEX distinct_a_b_a on distinct_a (b, a); + +SELECT DISTINCT a FROM distinct_a WHERE b = 2; +SELECT DISTINCT on (a, b) a, b FROM distinct_a WHERE b = 2; + +EXPLAIN (COSTS OFF) +SELECT DISTINCT a FROM distinct_a WHERE b = 2; + +EXPLAIN (COSTS OFF) +SELECT DISTINCT on (a, b) a, b FROM distinct_a WHERE b = 2; + +DROP INDEX distinct_a_b_a; + +-- test opposite scan/index directions inside a cursor +-- forward/backward +BEGIN; +DECLARE c SCROLL CURSOR FOR +SELECT DISTINCT ON (a) a,b FROM distinct_a ORDER BY a, b; + +FETCH FROM c; +FETCH BACKWARD FROM c; + +FETCH 6 FROM c; +FETCH BACKWARD 6 FROM c; + +FETCH 6 FROM c; +FETCH BACKWARD 6 FROM c; + +END; + +-- backward/forward +BEGIN; +DECLARE c SCROLL CURSOR FOR +SELECT DISTINCT ON (a) a,b FROM distinct_a ORDER BY a DESC, b DESC; + +FETCH FROM c; +FETCH BACKWARD FROM c; + +FETCH 6 FROM c; +FETCH BACKWARD 6 FROM c; + +FETCH 6 FROM c; +FETCH BACKWARD 6 FROM c; + +END; + +-- test missing values and skipping from the end +CREATE TABLE distinct_abc(a int, b int, c int); +CREATE INDEX ON distinct_abc(a, b, c); +INSERT INTO distinct_abc + VALUES (1, 1, 1), + (1, 1, 2), + (1, 2, 2), + (1, 2, 3), + (2, 2, 1), + (2, 2, 3), + (3, 1, 1), + (3, 1, 2), + (3, 2, 2), + (3, 2, 3); + +EXPLAIN (COSTS OFF) +SELECT DISTINCT ON (a) a,b,c FROM distinct_abc WHERE c = 2; + +BEGIN; +DECLARE c SCROLL CURSOR FOR +SELECT DISTINCT ON (a) a,b,c FROM distinct_abc WHERE c = 2; + +FETCH ALL FROM c; +FETCH BACKWARD ALL FROM c; + +END; + +EXPLAIN (COSTS OFF) +SELECT DISTINCT ON (a) a,b,c FROM distinct_abc WHERE c = 2 +ORDER BY a DESC, b DESC; + +BEGIN; +DECLARE c SCROLL CURSOR FOR +SELECT DISTINCT ON (a) a,b,c FROM distinct_abc WHERE c = 2 +ORDER BY a DESC, b DESC; + +FETCH ALL FROM c; +FETCH BACKWARD ALL FROM c; + +END; + +DROP TABLE distinct_abc; + +-- index skip scan +SELECT DISTINCT ON (a) a, b, c +FROM distinct_a ORDER BY a; +SELECT DISTINCT ON (a) a, b, c +FROM distinct_a WHERE a = 1 ORDER BY a; + +EXPLAIN (COSTS OFF) +SELECT DISTINCT ON (a) a, b, c +FROM distinct_a ORDER BY a; +EXPLAIN (COSTS OFF) +SELECT DISTINCT ON (a) a, b, c +FROM distinct_a WHERE a = 1 ORDER BY a; + +-- check colums order +SELECT DISTINCT a FROM distinct_a WHERE b = 2 AND c = 10; + +EXPLAIN (COSTS OFF) +SELECT DISTINCT a FROM distinct_a WHERE b = 2 AND c = 10; + +-- check projection case +SELECT DISTINCT a, a FROM distinct_a WHERE b = 2; +SELECT DISTINCT a, 1 FROM distinct_a WHERE b = 2; + +-- test cursor forward/backward movements +BEGIN; +DECLARE c SCROLL CURSOR FOR SELECT DISTINCT a FROM distinct_a; + +FETCH FROM c; +FETCH BACKWARD FROM c; + +FETCH 6 FROM c; +FETCH BACKWARD 6 FROM c; + +FETCH 6 FROM c; +FETCH BACKWARD 6 FROM c; + +END; + +DROP TABLE distinct_a; + +-- test tuples visibility +CREATE TABLE distinct_visibility (a int, b int); +INSERT INTO distinct_visibility (select a, b from generate_series(1,5) a, generate_series(1, 10000) b); +CREATE INDEX ON distinct_visibility (a, b); +ANALYZE distinct_visibility; + +SELECT DISTINCT ON (a) a, b FROM distinct_visibility ORDER BY a, b; +DELETE FROM distinct_visibility WHERE a = 2 and b = 1; +SELECT DISTINCT ON (a) a, b FROM distinct_visibility ORDER BY a, b; + +SELECT DISTINCT ON (a) a, b FROM distinct_visibility ORDER BY a DESC, b DESC; +DELETE FROM distinct_visibility WHERE a = 2 and b = 10000; +SELECT DISTINCT ON (a) a, b FROM distinct_visibility ORDER BY a DESC, b DESC; +DROP TABLE distinct_visibility; + +-- test page boundaries +CREATE TABLE distinct_boundaries AS + SELECT a, b::int2 b, (b % 2)::int2 c FROM + generate_series(1, 5) a, + generate_series(1,366) b; + +CREATE INDEX ON distinct_boundaries (a, b, c); +ANALYZE distinct_boundaries; + +EXPLAIN (COSTS OFF) +SELECT DISTINCT ON (a) a, b, c from distinct_boundaries +WHERE b >= 1 and c = 0 ORDER BY a, b; + +SELECT DISTINCT ON (a) a, b, c from distinct_boundaries +WHERE b >= 1 and c = 0 ORDER BY a, b; + +DROP TABLE distinct_boundaries; + +-- test tuple killing + +-- DESC ordering +CREATE TABLE distinct_killed AS + SELECT a, b, b % 2 AS c, 10 AS d + FROM generate_series(1, 5) a, + generate_series(1,1000) b; + +CREATE INDEX ON distinct_killed (a, b, c, d); + +DELETE FROM distinct_killed where a = 3; + +BEGIN; + DECLARE c SCROLL CURSOR FOR + SELECT DISTINCT ON (a) a,b,c,d + FROM distinct_killed ORDER BY a DESC, b DESC; + FETCH FORWARD ALL FROM c; + FETCH BACKWARD ALL FROM c; +COMMIT; + +DROP TABLE distinct_killed; + +-- regular ordering +CREATE TABLE distinct_killed AS + SELECT a, b, b % 2 AS c, 10 AS d + FROM generate_series(1, 5) a, + generate_series(1,1000) b; + +CREATE INDEX ON distinct_killed (a, b, c, d); + +DELETE FROM distinct_killed where a = 3; + +BEGIN; + DECLARE c SCROLL CURSOR FOR + SELECT DISTINCT ON (a) a,b,c,d + FROM distinct_killed ORDER BY a, b; + FETCH FORWARD ALL FROM c; + FETCH BACKWARD ALL FROM c; +COMMIT; + +DROP TABLE distinct_killed; + +-- partial delete +CREATE TABLE distinct_killed AS + SELECT a, b, b % 2 AS c, 10 AS d + FROM generate_series(1, 5) a, + generate_series(1,1000) b; + +CREATE INDEX ON distinct_killed (a, b, c, d); + +DELETE FROM distinct_killed WHERE a = 3 AND b <= 999; + +BEGIN; + DECLARE c SCROLL CURSOR FOR + SELECT DISTINCT ON (a) a,b,c,d + FROM distinct_killed ORDER BY a DESC, b DESC; + FETCH FORWARD ALL FROM c; + FETCH BACKWARD ALL FROM c; +COMMIT; + +DROP TABLE distinct_killed; -- 2.21.0