Hi Dmitry,
On 9/10/18 5:47 PM, Dmitry Dolgov wrote:
On Mon, 18 Jun 2018 at 17:26, Jesper Pedersen <jesper.peder...@redhat.com>
wrote:
I've looked through the patch more closely, and have a few questions:
Thanks for your review !
* Is there any reason why only copy function for the IndexOnlyScan node
includes an implementation for distinctPrefix?
Oversight -- thanks for catching that.
* What is the purpose of HeapFetches? I don't see any usage of this variable
except assigning 0 to it once.
That can be removed.
New version WIP v2 attached.
Best regards,
Jesper
diff --git a/contrib/bloom/blutils.c b/contrib/bloom/blutils.c
index 6b2b9e3742..74ed15bfeb 100644
--- a/contrib/bloom/blutils.c
+++ b/contrib/bloom/blutils.c
@@ -129,6 +129,7 @@ blhandler(PG_FUNCTION_ARGS)
amroutine->ambulkdelete = blbulkdelete;
amroutine->amvacuumcleanup = blvacuumcleanup;
amroutine->amcanreturn = NULL;
+ amroutine->amskip = NULL;
amroutine->amcostestimate = blcostestimate;
amroutine->amoptions = bloptions;
amroutine->amproperty = NULL;
diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml
index bee4afbe4e..fd06549491 100644
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -3725,6 +3725,22 @@ ANY <replaceable class="parameter">num_sync</replaceable> ( <replaceable class="
</listitem>
</varlistentry>
+ <varlistentry id="guc-enable-indexskipscan" xreflabel="enable_indexskipscan">
+ <term><varname>enable_indexskipscan</varname> (<type>boolean</type>)
+ <indexterm>
+ <primary><varname>enable_indexskipscan</varname> configuration parameter</primary>
+ </indexterm>
+ </term>
+ <listitem>
+ <para>
+ Enables or disables the query planner's use of index-skip-scan plan
+ types (see <xref linkend="indexes-index-skip-scans"/>). This parameter requires
+ that <varname>enable_indexonlyscan</varname> is <literal>on</literal>.
+ The default is <literal>on</literal>.
+ </para>
+ </listitem>
+ </varlistentry>
+
<varlistentry id="guc-enable-material" xreflabel="enable_material">
<term><varname>enable_material</varname> (<type>boolean</type>)
<indexterm>
diff --git a/doc/src/sgml/indexam.sgml b/doc/src/sgml/indexam.sgml
index beb99d1831..ccbb44288d 100644
--- a/doc/src/sgml/indexam.sgml
+++ b/doc/src/sgml/indexam.sgml
@@ -135,6 +135,7 @@ typedef struct IndexAmRoutine
amendscan_function amendscan;
ammarkpos_function ammarkpos; /* can be NULL */
amrestrpos_function amrestrpos; /* can be NULL */
+ amskip_function amskip; /* can be NULL */
/* interface functions to support parallel index scans */
amestimateparallelscan_function amestimateparallelscan; /* can be NULL */
@@ -665,6 +666,14 @@ amrestrpos (IndexScanDesc scan);
<para>
<programlisting>
+bool
+amskip (IndexScanDesc scan, ScanDirection direction, int prefix);
+</programlisting>
+ TODO
+ </para>
+
+ <para>
+<programlisting>
Size
amestimateparallelscan (void);
</programlisting>
diff --git a/doc/src/sgml/indices.sgml b/doc/src/sgml/indices.sgml
index a57c5e2e1f..842db029fa 100644
--- a/doc/src/sgml/indices.sgml
+++ b/doc/src/sgml/indices.sgml
@@ -1312,6 +1312,22 @@ SELECT target FROM tests WHERE subject = 'some-subject' AND success;
such cases and allow index-only scans to be generated, but older versions
will not.
</para>
+
+ <sect2 id="indexes-index-skip-scans">
+ <title>Index Skip Scans</title>
+
+ <indexterm zone="indexes-index-skip-scans">
+ <primary>index</primary>
+ <secondary>index-skip scans</secondary>
+ </indexterm>
+ <indexterm zone="indexes-index-skip-scans">
+ <primary>index-skip scan</primary>
+ </indexterm>
+
+ <para>
+ TODO
+ </para>
+ </sect2>
</sect1>
diff --git a/src/backend/access/brin/brin.c b/src/backend/access/brin/brin.c
index e95fbbcea7..85d6571c6d 100644
--- a/src/backend/access/brin/brin.c
+++ b/src/backend/access/brin/brin.c
@@ -106,6 +106,7 @@ brinhandler(PG_FUNCTION_ARGS)
amroutine->ambulkdelete = brinbulkdelete;
amroutine->amvacuumcleanup = brinvacuumcleanup;
amroutine->amcanreturn = NULL;
+ amroutine->amskip = NULL;
amroutine->amcostestimate = brincostestimate;
amroutine->amoptions = brinoptions;
amroutine->amproperty = NULL;
diff --git a/src/backend/access/gin/ginutil.c b/src/backend/access/gin/ginutil.c
index 0a32182dd7..162639090d 100644
--- a/src/backend/access/gin/ginutil.c
+++ b/src/backend/access/gin/ginutil.c
@@ -61,6 +61,7 @@ ginhandler(PG_FUNCTION_ARGS)
amroutine->ambulkdelete = ginbulkdelete;
amroutine->amvacuumcleanup = ginvacuumcleanup;
amroutine->amcanreturn = NULL;
+ amroutine->amskip = NULL;
amroutine->amcostestimate = gincostestimate;
amroutine->amoptions = ginoptions;
amroutine->amproperty = NULL;
diff --git a/src/backend/access/gist/gist.c b/src/backend/access/gist/gist.c
index 8a42effdf7..ecd4af49d8 100644
--- a/src/backend/access/gist/gist.c
+++ b/src/backend/access/gist/gist.c
@@ -83,6 +83,7 @@ gisthandler(PG_FUNCTION_ARGS)
amroutine->ambulkdelete = gistbulkdelete;
amroutine->amvacuumcleanup = gistvacuumcleanup;
amroutine->amcanreturn = gistcanreturn;
+ amroutine->amskip = NULL;
amroutine->amcostestimate = gistcostestimate;
amroutine->amoptions = gistoptions;
amroutine->amproperty = gistproperty;
diff --git a/src/backend/access/hash/hash.c b/src/backend/access/hash/hash.c
index 0002df30c0..7120950868 100644
--- a/src/backend/access/hash/hash.c
+++ b/src/backend/access/hash/hash.c
@@ -79,6 +79,7 @@ hashhandler(PG_FUNCTION_ARGS)
amroutine->ambulkdelete = hashbulkdelete;
amroutine->amvacuumcleanup = hashvacuumcleanup;
amroutine->amcanreturn = NULL;
+ amroutine->amskip = NULL;
amroutine->amcostestimate = hashcostestimate;
amroutine->amoptions = hashoptions;
amroutine->amproperty = NULL;
diff --git a/src/backend/access/index/indexam.c b/src/backend/access/index/indexam.c
index 22b5cc921f..f9451768c8 100644
--- a/src/backend/access/index/indexam.c
+++ b/src/backend/access/index/indexam.c
@@ -33,6 +33,7 @@
* index_can_return - does index support index-only scans?
* index_getprocid - get a support procedure OID
* index_getprocinfo - get a support procedure's lookup info
+ * index_skip - advance past duplicate key values in a scan
*
* NOTES
* This file contains the index_ routines which used
@@ -791,6 +792,21 @@ index_can_return(Relation indexRelation, int attno)
return indexRelation->rd_amroutine->amcanreturn(indexRelation, attno);
}
+/* ----------------
+ * index_skip
+ *
+ * Skip past all tuples where the first 'prefix' columns have the
+ * same value as the last tuple returned in the current scan.
+ * ----------------
+ */
+bool
+index_skip(IndexScanDesc scan, ScanDirection direction, int prefix)
+{
+ SCAN_CHECKS;
+
+ return scan->indexRelation->rd_amroutine->amskip(scan, direction, prefix);
+}
+
/* ----------------
* index_getprocid
*
diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c
index e8725fbbe1..3d02a96dad 100644
--- a/src/backend/access/nbtree/nbtree.c
+++ b/src/backend/access/nbtree/nbtree.c
@@ -130,6 +130,7 @@ bthandler(PG_FUNCTION_ARGS)
amroutine->ambulkdelete = btbulkdelete;
amroutine->amvacuumcleanup = btvacuumcleanup;
amroutine->amcanreturn = btcanreturn;
+ amroutine->amskip = btskip;
amroutine->amcostestimate = btcostestimate;
amroutine->amoptions = btoptions;
amroutine->amproperty = btproperty;
@@ -378,6 +379,8 @@ btbeginscan(Relation rel, int nkeys, int norderbys)
*/
so->currTuples = so->markTuples = NULL;
+ so->skipScanKey = NULL;
+
scan->xs_itupdesc = RelationGetDescr(rel);
scan->opaque = so;
@@ -445,6 +448,15 @@ btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
_bt_preprocess_array_keys(scan);
}
+/*
+ * btskip() -- skip to the beginning of the next key prefix
+ */
+bool
+btskip(IndexScanDesc scan, ScanDirection direction, int prefix)
+{
+ return _bt_skip(scan, direction, prefix);
+}
+
/*
* btendscan() -- close down a scan
*/
diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c
index d3700bd082..52c24c7541 100644
--- a/src/backend/access/nbtree/nbtsearch.c
+++ b/src/backend/access/nbtree/nbtsearch.c
@@ -1193,6 +1193,121 @@ _bt_next(IndexScanDesc scan, ScanDirection dir)
return true;
}
+/*
+ * _bt_skip() -- Skip items that have the same prefix as the most recently
+ * fetched index tuple. The current position is set so that a subsequent call
+ * to _bt_next will fetch the first tuple that differs in the leading 'prefix'
+ * keys.
+ */
+bool
+_bt_skip(IndexScanDesc scan, ScanDirection dir, int prefix)
+{
+ BTScanOpaque so = (BTScanOpaque) scan->opaque;
+ BTStack stack;
+ Buffer buf;
+ OffsetNumber offnum;
+ BTScanPosItem *currItem;
+
+ /* We want to return tuples, and we need a starting point */
+ Assert(scan->xs_want_itup);
+ Assert(scan->xs_itup);
+
+ /* Release the current associated buffer */
+ if (BTScanPosIsValid(so->currPos))
+ {
+ ReleaseBuffer(so->currPos.buf);
+ so->currPos.buf = InvalidBuffer;
+ }
+
+ /*
+ * If skipScanKey is NULL then we initialize it with _bt_mkscankey,
+ * otherwise we will just update the sk_flags / sk_argument elements
+ * in order to eliminate repeated free/realloc.
+ */
+ if (so->skipScanKey == NULL)
+ {
+ so->skipScanKey = _bt_mkscankey(scan->indexRelation, scan->xs_itup);
+ }
+ else
+ {
+ Relation rel;
+ TupleDesc itupdesc;
+ int indnkeyatts;
+ int i;
+
+ rel = scan->indexRelation;
+ itupdesc = RelationGetDescr(rel);
+ indnkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
+ for (i = 0; i < indnkeyatts; i++)
+ {
+ Datum datum;
+ bool null;
+ int flags;
+
+ datum = index_getattr(scan->xs_itup, i + 1, itupdesc, &null);
+ flags = (null ? SK_ISNULL : 0) | (rel->rd_indoption[i] << SK_BT_INDOPTION_SHIFT);
+ so->skipScanKey[i].sk_flags = flags;
+ so->skipScanKey[i].sk_argument = datum;
+ }
+ }
+
+ /* Use _bt_search and _bt_binsrch to get the buffer and offset number */
+ stack =_bt_search(scan->indexRelation, prefix, so->skipScanKey,
+ ScanDirectionIsForward(dir), &buf, BT_READ,
+ scan->xs_snapshot);
+ _bt_freestack(stack);
+ so->currPos.buf = buf;
+ offnum = _bt_binsrch(scan->indexRelation, buf, prefix, so->skipScanKey,
+ ScanDirectionIsForward(dir));
+
+ /* Lock the page for SERIALIZABLE transactions */
+ PredicateLockPage(scan->indexRelation, BufferGetBlockNumber(buf),
+ scan->xs_snapshot);
+
+ /* We know in which direction to look */
+ if (ScanDirectionIsForward(dir))
+ {
+ so->currPos.moreLeft = false;
+ so->currPos.moreRight = true;
+
+ /* Move back for _bt_next */
+ offnum = OffsetNumberPrev(offnum);
+ }
+ else
+ {
+ so->currPos.moreLeft = true;
+ so->currPos.moreRight = false;
+ }
+
+ /* Now read the data */
+ if (!_bt_readpage(scan, dir, offnum))
+ {
+ /*
+ * There's no actually-matching data on this page. Try to advance to
+ * the next page. Return false if there's no matching data at all.
+ */
+ LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK);
+ if (!_bt_steppage(scan, dir))
+ {
+ _bt_freeskey(so->skipScanKey);
+ so->skipScanKey = NULL;
+ return false;
+ }
+ }
+ else
+ {
+ /* Drop the lock, and maybe the pin, on the current page */
+ _bt_drop_lock_and_maybe_pin(scan, &so->currPos);
+ }
+
+ /* And set IndexTuple */
+ currItem = &so->currPos.items[so->currPos.itemIndex];
+ scan->xs_ctup.t_self = currItem->heapTid;
+ scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
+
+ return true;
+}
+
/*
* _bt_readpage() -- Load data from current index page into so->currPos
*
diff --git a/src/backend/access/spgist/spgutils.c b/src/backend/access/spgist/spgutils.c
index 6d59b316ae..157b008284 100644
--- a/src/backend/access/spgist/spgutils.c
+++ b/src/backend/access/spgist/spgutils.c
@@ -59,6 +59,7 @@ spghandler(PG_FUNCTION_ARGS)
amroutine->ambulkdelete = spgbulkdelete;
amroutine->amvacuumcleanup = spgvacuumcleanup;
amroutine->amcanreturn = spgcanreturn;
+ amroutine->amskip = NULL;
amroutine->amcostestimate = spgcostestimate;
amroutine->amoptions = spgoptions;
amroutine->amproperty = NULL;
diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c
index 16a80a0ea1..7e038cd9a3 100644
--- a/src/backend/commands/explain.c
+++ b/src/backend/commands/explain.c
@@ -993,7 +993,13 @@ ExplainNode(PlanState *planstate, List *ancestors,
pname = sname = "Index Scan";
break;
case T_IndexOnlyScan:
- pname = sname = "Index Only Scan";
+ {
+ IndexOnlyScan *indexonlyscan = (IndexOnlyScan *) plan;
+ if (indexonlyscan->distinctPrefix > 0)
+ pname = sname = "Index Skip Scan";
+ else
+ pname = sname = "Index Only Scan";
+ }
break;
case T_BitmapIndexScan:
pname = sname = "Bitmap Index Scan";
@@ -1222,6 +1228,14 @@ ExplainNode(PlanState *planstate, List *ancestors,
{
IndexOnlyScan *indexonlyscan = (IndexOnlyScan *) plan;
+ if (indexonlyscan->distinctPrefix > 0)
+ {
+ if (es->format != EXPLAIN_FORMAT_TEXT)
+ ExplainPropertyInteger("Distinct Prefix", NULL,
+ indexonlyscan->distinctPrefix,
+ es);
+ }
+
ExplainIndexScanDetails(indexonlyscan->indexid,
indexonlyscan->indexorderdir,
es);
diff --git a/src/backend/executor/nodeIndexonlyscan.c b/src/backend/executor/nodeIndexonlyscan.c
index 8c32a74d39..e69157741f 100644
--- a/src/backend/executor/nodeIndexonlyscan.c
+++ b/src/backend/executor/nodeIndexonlyscan.c
@@ -112,6 +112,19 @@ IndexOnlyNext(IndexOnlyScanState *node)
node->ioss_NumOrderByKeys);
}
+ /*
+ * Check if we need to skip to the next key prefix, because we've been
+ * asked to implement DISTINCT.
+ */
+ if (node->ioss_NumDistinctKeys > 0 && node->ioss_FirstTupleEmitted)
+ {
+ if (!index_skip(scandesc, direction, node->ioss_NumDistinctKeys))
+ {
+ /* Reached end of index. */
+ return ExecClearTuple(slot);
+ }
+ }
+
/*
* OK, now that we have what we need, fetch the next tuple.
*/
@@ -247,6 +260,8 @@ IndexOnlyNext(IndexOnlyScanState *node)
ItemPointerGetBlockNumber(tid),
estate->es_snapshot);
+ node->ioss_FirstTupleEmitted = true;
+
return slot;
}
@@ -509,6 +524,8 @@ ExecInitIndexOnlyScan(IndexOnlyScan *node, EState *estate, int eflags)
indexstate->ss.ps.plan = (Plan *) node;
indexstate->ss.ps.state = estate;
indexstate->ss.ps.ExecProcNode = ExecIndexOnlyScan;
+ indexstate->ioss_NumDistinctKeys = node->distinctPrefix;
+ indexstate->ioss_FirstTupleEmitted = false;
/*
* Miscellaneous initialization
diff --git a/src/backend/nodes/copyfuncs.c b/src/backend/nodes/copyfuncs.c
index 7c8220cf65..012f61d1ad 100644
--- a/src/backend/nodes/copyfuncs.c
+++ b/src/backend/nodes/copyfuncs.c
@@ -517,6 +517,7 @@ _copyIndexOnlyScan(const IndexOnlyScan *from)
COPY_NODE_FIELD(indexorderby);
COPY_NODE_FIELD(indextlist);
COPY_SCALAR_FIELD(indexorderdir);
+ COPY_SCALAR_FIELD(distinctPrefix);
return newnode;
}
diff --git a/src/backend/nodes/outfuncs.c b/src/backend/nodes/outfuncs.c
index b5af904c18..6892d94dcf 100644
--- a/src/backend/nodes/outfuncs.c
+++ b/src/backend/nodes/outfuncs.c
@@ -582,6 +582,7 @@ _outIndexOnlyScan(StringInfo str, const IndexOnlyScan *node)
WRITE_NODE_FIELD(indexorderby);
WRITE_NODE_FIELD(indextlist);
WRITE_ENUM_FIELD(indexorderdir, ScanDirection);
+ WRITE_INT_FIELD(distinctPrefix);
}
static void
diff --git a/src/backend/nodes/readfuncs.c b/src/backend/nodes/readfuncs.c
index 3254524223..ab118c4b57 100644
--- a/src/backend/nodes/readfuncs.c
+++ b/src/backend/nodes/readfuncs.c
@@ -1780,6 +1780,7 @@ _readIndexOnlyScan(void)
READ_NODE_FIELD(indexorderby);
READ_NODE_FIELD(indextlist);
READ_ENUM_FIELD(indexorderdir, ScanDirection);
+ READ_INT_FIELD(distinctPrefix);
READ_DONE();
}
diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c
index 7bf67a0529..b4c4edd276 100644
--- a/src/backend/optimizer/path/costsize.c
+++ b/src/backend/optimizer/path/costsize.c
@@ -122,6 +122,7 @@ int max_parallel_workers_per_gather = 2;
bool enable_seqscan = true;
bool enable_indexscan = true;
bool enable_indexonlyscan = true;
+bool enable_indexskipscan = true;
bool enable_bitmapscan = true;
bool enable_tidscan = true;
bool enable_sort = true;
diff --git a/src/backend/optimizer/plan/createplan.c b/src/backend/optimizer/plan/createplan.c
index ae41c9efa0..9569f45745 100644
--- a/src/backend/optimizer/plan/createplan.c
+++ b/src/backend/optimizer/plan/createplan.c
@@ -175,7 +175,8 @@ static IndexOnlyScan *make_indexonlyscan(List *qptlist, List *qpqual,
Index scanrelid, Oid indexid,
List *indexqual, List *indexorderby,
List *indextlist,
- ScanDirection indexscandir);
+ ScanDirection indexscandir,
+ int skipprefix);
static BitmapIndexScan *make_bitmap_indexscan(Index scanrelid, Oid indexid,
List *indexqual,
List *indexqualorig);
@@ -2706,7 +2707,8 @@ create_indexscan_plan(PlannerInfo *root,
fixed_indexquals,
fixed_indexorderbys,
best_path->indexinfo->indextlist,
- best_path->indexscandir);
+ best_path->indexscandir,
+ best_path->indexskipprefix);
else
scan_plan = (Scan *) make_indexscan(tlist,
qpqual,
@@ -5115,7 +5117,8 @@ make_indexonlyscan(List *qptlist,
List *indexqual,
List *indexorderby,
List *indextlist,
- ScanDirection indexscandir)
+ ScanDirection indexscandir,
+ int skipprefix)
{
IndexOnlyScan *node = makeNode(IndexOnlyScan);
Plan *plan = &node->scan.plan;
@@ -5130,6 +5133,7 @@ make_indexonlyscan(List *qptlist,
node->indexorderby = indexorderby;
node->indextlist = indextlist;
node->indexorderdir = indexscandir;
+ node->distinctPrefix = skipprefix;
return node;
}
diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c
index 96bf0601a8..0c3ace4902 100644
--- a/src/backend/optimizer/plan/planner.c
+++ b/src/backend/optimizer/plan/planner.c
@@ -4720,6 +4720,17 @@ create_distinct_paths(PlannerInfo *root,
path,
list_length(root->distinct_pathkeys),
numDistinctRows));
+
+ /* Also consider a skip scan, if possible. */
+ if (IsA(path, IndexPath) &&
+ path->pathtype == T_IndexOnlyScan &&
+ enable_indexskipscan &&
+ ((IndexPath *) path)->indexinfo->amcanskip)
+ add_path(distinct_rel, (Path *)
+ create_skipscan_unique_path(root, distinct_rel,
+ path,
+ list_length(root->distinct_pathkeys),
+ numDistinctRows));
}
}
diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c
index c5aaaf5c22..b9e7baa5d4 100644
--- a/src/backend/optimizer/util/pathnode.c
+++ b/src/backend/optimizer/util/pathnode.c
@@ -2768,6 +2768,45 @@ create_upper_unique_path(PlannerInfo *root,
return pathnode;
}
+/*
+ * create_skipscan_unique_path
+ * Creates a pathnode the same as an existing IndexPath except based on
+ * skipping duplicate values. This may or may not be cheaper than using
+ * create_upper_unique_path.
+ *
+ * The input path must be an IndexPath for an index that supports amskip.
+ */
+IndexPath *
+create_skipscan_unique_path(PlannerInfo *root,
+ RelOptInfo *rel,
+ Path *subpath,
+ int numCols,
+ double numGroups)
+{
+ IndexPath *pathnode = makeNode(IndexPath);
+
+ Assert(IsA(subpath, IndexPath));
+
+ /* We don't want to modify subpath, so make a copy. */
+ memcpy(pathnode, subpath, sizeof(IndexPath));
+
+ /* The size of the prefix we'll use for skipping. */
+ Assert(pathnode->indexinfo->amcanskip);
+ Assert(numCols > 0);
+ pathnode->indexskipprefix = numCols;
+
+ /*
+ * The cost to skip to each distinct value should be roughly the same as
+ * the cost of finding the first key times the number of distinct values
+ * we expect to find.
+ */
+ pathnode->path.startup_cost = subpath->startup_cost;
+ pathnode->path.total_cost = subpath->startup_cost * numGroups;
+ pathnode->path.rows = numGroups;
+
+ return pathnode;
+}
+
/*
* create_agg_path
* Creates a pathnode that represents performing aggregation/grouping
diff --git a/src/backend/optimizer/util/plancat.c b/src/backend/optimizer/util/plancat.c
index 8369e3ad62..f07aba1c9c 100644
--- a/src/backend/optimizer/util/plancat.c
+++ b/src/backend/optimizer/util/plancat.c
@@ -270,6 +270,7 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent,
info->amoptionalkey = amroutine->amoptionalkey;
info->amsearcharray = amroutine->amsearcharray;
info->amsearchnulls = amroutine->amsearchnulls;
+ info->amcanskip = (amroutine->amskip != NULL);
info->amcanparallel = amroutine->amcanparallel;
info->amhasgettuple = (amroutine->amgettuple != NULL);
info->amhasgetbitmap = (amroutine->amgetbitmap != NULL);
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index 0625eff219..04eeb984cd 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -852,6 +852,15 @@ static struct config_bool ConfigureNamesBool[] =
true,
NULL, NULL, NULL
},
+ {
+ {"enable_indexskipscan", PGC_USERSET, QUERY_TUNING_METHOD,
+ gettext_noop("Enables the planner's use of index-skip-scan plans."),
+ NULL
+ },
+ &enable_indexskipscan,
+ true,
+ NULL, NULL, NULL
+ },
{
{"enable_bitmapscan", PGC_USERSET, QUERY_TUNING_METHOD,
gettext_noop("Enables the planner's use of bitmap-scan plans."),
diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample
index 7486d20a34..3637133d18 100644
--- a/src/backend/utils/misc/postgresql.conf.sample
+++ b/src/backend/utils/misc/postgresql.conf.sample
@@ -297,6 +297,7 @@
#enable_hashjoin = on
#enable_indexscan = on
#enable_indexonlyscan = on
+#enable_indexskipscan = on
#enable_material = on
#enable_mergejoin = on
#enable_nestloop = on
diff --git a/src/include/access/amapi.h b/src/include/access/amapi.h
index 14526a6bb2..81e1ea5d5f 100644
--- a/src/include/access/amapi.h
+++ b/src/include/access/amapi.h
@@ -127,6 +127,10 @@ typedef void (*amrescan_function) (IndexScanDesc scan,
typedef bool (*amgettuple_function) (IndexScanDesc scan,
ScanDirection direction);
+/* skip past duplicates in a given prefix */
+typedef bool (*amskip_function) (IndexScanDesc scan,
+ ScanDirection dir, int prefix);
+
/* fetch all valid tuples */
typedef int64 (*amgetbitmap_function) (IndexScanDesc scan,
TIDBitmap *tbm);
@@ -221,6 +225,7 @@ typedef struct IndexAmRoutine
amendscan_function amendscan;
ammarkpos_function ammarkpos; /* can be NULL */
amrestrpos_function amrestrpos; /* can be NULL */
+ amskip_function amskip; /* can be NULL */
/* interface functions to support parallel index scans */
amestimateparallelscan_function amestimateparallelscan; /* can be NULL */
diff --git a/src/include/access/genam.h b/src/include/access/genam.h
index 24c720bf42..4c260115b4 100644
--- a/src/include/access/genam.h
+++ b/src/include/access/genam.h
@@ -170,6 +170,7 @@ extern IndexBulkDeleteResult *index_bulk_delete(IndexVacuumInfo *info,
extern IndexBulkDeleteResult *index_vacuum_cleanup(IndexVacuumInfo *info,
IndexBulkDeleteResult *stats);
extern bool index_can_return(Relation indexRelation, int attno);
+extern bool index_skip(IndexScanDesc scan, ScanDirection direction, int prefix);
extern RegProcedure index_getprocid(Relation irel, AttrNumber attnum,
uint16 procnum);
extern FmgrInfo *index_getprocinfo(Relation irel, AttrNumber attnum,
diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h
index 04ecb4cbc0..6009edb22d 100644
--- a/src/include/access/nbtree.h
+++ b/src/include/access/nbtree.h
@@ -471,6 +471,9 @@ typedef struct BTScanOpaqueData
*/
int markItemIndex; /* itemIndex, or -1 if not valid */
+ /* Work space for _bt_skip */
+ ScanKey skipScanKey; /* used to control skipping */
+
/* keep these last in struct for efficiency */
BTScanPosData currPos; /* current position data */
BTScanPosData markPos; /* marked position, if any */
@@ -571,6 +574,7 @@ extern int32 _bt_compare(Relation rel, int keysz, ScanKey scankey,
Page page, OffsetNumber offnum);
extern bool _bt_first(IndexScanDesc scan, ScanDirection dir);
extern bool _bt_next(IndexScanDesc scan, ScanDirection dir);
+extern bool _bt_skip(IndexScanDesc scan, ScanDirection dir, int prefix);
extern Buffer _bt_get_endpoint(Relation rel, uint32 level, bool rightmost,
Snapshot snapshot);
@@ -598,6 +602,7 @@ extern void _bt_end_vacuum_callback(int code, Datum arg);
extern Size BTreeShmemSize(void);
extern void BTreeShmemInit(void);
extern bytea *btoptions(Datum reloptions, bool validate);
+extern bool btskip(IndexScanDesc scan, ScanDirection dir, int prefix);
extern bool btproperty(Oid index_oid, int attno,
IndexAMProperty prop, const char *propname,
bool *res, bool *isnull);
diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h
index c830f141b1..9e9bee0beb 100644
--- a/src/include/nodes/execnodes.h
+++ b/src/include/nodes/execnodes.h
@@ -1323,6 +1323,8 @@ typedef struct IndexScanState
* RelationDesc index relation descriptor
* ScanDesc index scan descriptor
* VMBuffer buffer in use for visibility map testing, if any
+ * NumDistinctKeys number of keys for skip-based DISTINCT
+ * FirstTupleEmitted has the first tuple been emitted
* ioss_PscanLen Size of parallel index-only scan descriptor
* ----------------
*/
@@ -1341,6 +1343,8 @@ typedef struct IndexOnlyScanState
Relation ioss_RelationDesc;
IndexScanDesc ioss_ScanDesc;
Buffer ioss_VMBuffer;
+ int ioss_NumDistinctKeys;
+ bool ioss_FirstTupleEmitted;
Size ioss_PscanLen;
} IndexOnlyScanState;
diff --git a/src/include/nodes/plannodes.h b/src/include/nodes/plannodes.h
index 7c2abbd03a..1e572853d8 100644
--- a/src/include/nodes/plannodes.h
+++ b/src/include/nodes/plannodes.h
@@ -438,6 +438,7 @@ typedef struct IndexOnlyScan
List *indexorderby; /* list of index ORDER BY exprs */
List *indextlist; /* TargetEntry list describing index's cols */
ScanDirection indexorderdir; /* forward or backward or don't care */
+ int distinctPrefix; /* the size of the prefix for distinct scans */
} IndexOnlyScan;
/* ----------------
diff --git a/src/include/nodes/relation.h b/src/include/nodes/relation.h
index adb4265047..27adafd6a6 100644
--- a/src/include/nodes/relation.h
+++ b/src/include/nodes/relation.h
@@ -811,6 +811,7 @@ typedef struct IndexOptInfo
bool amsearchnulls; /* can AM search for NULL/NOT NULL entries? */
bool amhasgettuple; /* does AM have amgettuple interface? */
bool amhasgetbitmap; /* does AM have amgetbitmap interface? */
+ bool amcanskip; /* can AM skip duplicate values? */
bool amcanparallel; /* does AM support parallel scan? */
/* Rather than include amapi.h here, we declare amcostestimate like this */
void (*amcostestimate) (); /* AM's cost estimator */
@@ -1161,6 +1162,9 @@ typedef struct Path
* we need not recompute them when considering using the same index in a
* bitmap index/heap scan (see BitmapHeapPath). The costs of the IndexPath
* itself represent the costs of an IndexScan or IndexOnlyScan plan type.
+ *
+ * 'indexskipprefix' represents the number of columns to consider for skip
+ * scans.
*----------
*/
typedef struct IndexPath
@@ -1175,6 +1179,7 @@ typedef struct IndexPath
ScanDirection indexscandir;
Cost indextotalcost;
Selectivity indexselectivity;
+ int indexskipprefix;
} IndexPath;
/*
diff --git a/src/include/optimizer/cost.h b/src/include/optimizer/cost.h
index 77ca7ff837..1fb3de6fa6 100644
--- a/src/include/optimizer/cost.h
+++ b/src/include/optimizer/cost.h
@@ -58,6 +58,7 @@ extern PGDLLIMPORT int max_parallel_workers_per_gather;
extern PGDLLIMPORT bool enable_seqscan;
extern PGDLLIMPORT bool enable_indexscan;
extern PGDLLIMPORT bool enable_indexonlyscan;
+extern PGDLLIMPORT bool enable_indexskipscan;
extern PGDLLIMPORT bool enable_bitmapscan;
extern PGDLLIMPORT bool enable_tidscan;
extern PGDLLIMPORT bool enable_sort;
diff --git a/src/include/optimizer/pathnode.h b/src/include/optimizer/pathnode.h
index 7c5ff22650..349b062ee2 100644
--- a/src/include/optimizer/pathnode.h
+++ b/src/include/optimizer/pathnode.h
@@ -186,6 +186,11 @@ extern UpperUniquePath *create_upper_unique_path(PlannerInfo *root,
Path *subpath,
int numCols,
double numGroups);
+extern IndexPath *create_skipscan_unique_path(PlannerInfo *root,
+ RelOptInfo *rel,
+ Path *subpath,
+ int numCols,
+ double numGroups);
extern AggPath *create_agg_path(PlannerInfo *root,
RelOptInfo *rel,
Path *subpath,
diff --git a/src/test/regress/expected/create_index.out b/src/test/regress/expected/create_index.out
index be25101db2..01bae00fdb 100644
--- a/src/test/regress/expected/create_index.out
+++ b/src/test/regress/expected/create_index.out
@@ -19,6 +19,7 @@ CREATE INDEX tenk1_unique1 ON tenk1 USING btree(unique1 int4_ops);
CREATE INDEX tenk1_unique2 ON tenk1 USING btree(unique2 int4_ops);
CREATE INDEX tenk1_hundred ON tenk1 USING btree(hundred int4_ops);
CREATE INDEX tenk1_thous_tenthous ON tenk1 (thousand, tenthous);
+CREATE INDEX tenk1_four ON tenk1 (four);
CREATE INDEX tenk2_unique1 ON tenk2 USING btree(unique1 int4_ops);
CREATE INDEX tenk2_unique2 ON tenk2 USING btree(unique2 int4_ops);
CREATE INDEX tenk2_hundred ON tenk2 USING btree(hundred int4_ops);
diff --git a/src/test/regress/expected/select_distinct.out b/src/test/regress/expected/select_distinct.out
index f3696c6d1d..f443ef4623 100644
--- a/src/test/regress/expected/select_distinct.out
+++ b/src/test/regress/expected/select_distinct.out
@@ -244,3 +244,21 @@ SELECT null IS NOT DISTINCT FROM null as "yes";
t
(1 row)
+-- index skip scan
+SELECT DISTINCT four FROM tenk1;
+ four
+------
+ 0
+ 1
+ 2
+ 3
+(4 rows)
+
+EXPLAIN (VERBOSE, COSTS OFF)
+SELECT DISTINCT four FROM tenk1;
+ QUERY PLAN
+--------------------------------------------------
+ Index Skip Scan using tenk1_four on public.tenk1
+ Output: four
+(2 rows)
+
diff --git a/src/test/regress/expected/sysviews.out b/src/test/regress/expected/sysviews.out
index a1c90eb905..bd3b373515 100644
--- a/src/test/regress/expected/sysviews.out
+++ b/src/test/regress/expected/sysviews.out
@@ -78,6 +78,7 @@ select name, setting from pg_settings where name like 'enable%';
enable_hashjoin | on
enable_indexonlyscan | on
enable_indexscan | on
+ enable_indexskipscan | on
enable_material | on
enable_mergejoin | on
enable_nestloop | on
@@ -89,7 +90,7 @@ select name, setting from pg_settings where name like 'enable%';
enable_seqscan | on
enable_sort | on
enable_tidscan | on
-(17 rows)
+(18 rows)
-- Test that the pg_timezone_names and pg_timezone_abbrevs views are
-- more-or-less working. We can't test their contents in any great detail
diff --git a/src/test/regress/sql/create_index.sql b/src/test/regress/sql/create_index.sql
index f9e7118f0d..3a7d59243c 100644
--- a/src/test/regress/sql/create_index.sql
+++ b/src/test/regress/sql/create_index.sql
@@ -26,6 +26,8 @@ CREATE INDEX tenk1_hundred ON tenk1 USING btree(hundred int4_ops);
CREATE INDEX tenk1_thous_tenthous ON tenk1 (thousand, tenthous);
+CREATE INDEX tenk1_four ON tenk1 (four);
+
CREATE INDEX tenk2_unique1 ON tenk2 USING btree(unique1 int4_ops);
CREATE INDEX tenk2_unique2 ON tenk2 USING btree(unique2 int4_ops);
diff --git a/src/test/regress/sql/select_distinct.sql b/src/test/regress/sql/select_distinct.sql
index a605e86449..db222fa510 100644
--- a/src/test/regress/sql/select_distinct.sql
+++ b/src/test/regress/sql/select_distinct.sql
@@ -73,3 +73,9 @@ SELECT 1 IS NOT DISTINCT FROM 2 as "no";
SELECT 2 IS NOT DISTINCT FROM 2 as "yes";
SELECT 2 IS NOT DISTINCT FROM null as "no";
SELECT null IS NOT DISTINCT FROM null as "yes";
+
+-- index skip scan
+SELECT DISTINCT four FROM tenk1;
+
+EXPLAIN (VERBOSE, COSTS OFF)
+SELECT DISTINCT four FROM tenk1;
--
2.17.1