On Sat, Jun 12, 2021 at 2:35 PM Thomas Munro <thomas.mu...@gmail.com> wrote: > ... and here is the corresponding code change, with a test to > demonstrate the change. > > I'm working on a proof-of-concept to skip the btree page lock > sometimes too, which seems promising, but it requires some planner > work which has temporarily pretzeled my brain.
Here's a highly experimental patch I came up with that seems to produce the right results in simple cases, without (yet) involving the planner. The regression tests show single table queries, but it works also for nest loop joins, which is where this optimisation should be most interesting, I think. There are a few weird things about this patch though, and there could well be much better ways to do it, as noted in the commit message and comments. It's a start on the problem...
From 2325ae403196f73865f7c6b3224db925ca406969 Mon Sep 17 00:00:00 2001 From: Thomas Munro <thomas.mu...@gmail.com> Date: Thu, 10 Jun 2021 23:35:16 +1200 Subject: [PATCH v2 1/2] Use tuple-level SIREAD locks for index-only scans. When index-only scans manage to avoid fetching heap tuples, previously we'd predicate-lock the whole heap page (since commit cdf91edb). Commits c01262a8 and 6f38d4da made it possible to lock the tuple instead with only a TID, to match the behavior of regular index scans and avoid some false conflicts. Discussion: https://postgr.es/m/CAEepm%3D2GK3FVdnt5V3d%2Bh9njWipCv_fNL%3DwjxyUhzsF%3D0PcbNg%40mail.gmail.com --- src/backend/executor/nodeIndexonlyscan.c | 12 ++++-- .../isolation/expected/index-only-scan-2.out | 15 +++++++ .../isolation/expected/index-only-scan-3.out | 16 ++++++++ src/test/isolation/isolation_schedule | 2 + .../isolation/specs/index-only-scan-2.spec | 39 +++++++++++++++++++ .../isolation/specs/index-only-scan-3.spec | 33 ++++++++++++++++ 6 files changed, 113 insertions(+), 4 deletions(-) create mode 100644 src/test/isolation/expected/index-only-scan-2.out create mode 100644 src/test/isolation/expected/index-only-scan-3.out create mode 100644 src/test/isolation/specs/index-only-scan-2.spec create mode 100644 src/test/isolation/specs/index-only-scan-3.spec diff --git a/src/backend/executor/nodeIndexonlyscan.c b/src/backend/executor/nodeIndexonlyscan.c index 0754e28a9a..f7185b4519 100644 --- a/src/backend/executor/nodeIndexonlyscan.c +++ b/src/backend/executor/nodeIndexonlyscan.c @@ -243,12 +243,16 @@ IndexOnlyNext(IndexOnlyScanState *node) /* * If we didn't access the heap, then we'll need to take a predicate - * lock explicitly, as if we had. For now we do that at page level. + * lock explicitly, as if we had. We don't have the inserter's xid, + * but that's only used by PredicateLockTID to check if it matches the + * caller's top level xid, and it can't possibly have been inserted by + * us or the page wouldn't be all visible. 
*/ if (!tuple_from_heap) - PredicateLockPage(scandesc->heapRelation, - ItemPointerGetBlockNumber(tid), - estate->es_snapshot); + PredicateLockTID(scandesc->heapRelation, + tid, + estate->es_snapshot, + InvalidTransactionId); return slot; } diff --git a/src/test/isolation/expected/index-only-scan-2.out b/src/test/isolation/expected/index-only-scan-2.out new file mode 100644 index 0000000000..fef5b8d398 --- /dev/null +++ b/src/test/isolation/expected/index-only-scan-2.out @@ -0,0 +1,15 @@ +Parsed test spec with 2 sessions + +starting permutation: r1 r2 w1 w2 c1 c2 +step r1: SELECT id1 FROM account WHERE id1 = 1; +id1 + +1 +step r2: SELECT id2 FROM account WHERE id2 = 2; +id2 + +2 +step w1: UPDATE account SET balance = 200 WHERE id1 = 1; +step w2: UPDATE account SET balance = 200 WHERE id2 = 2; +step c1: COMMIT; +step c2: COMMIT; diff --git a/src/test/isolation/expected/index-only-scan-3.out b/src/test/isolation/expected/index-only-scan-3.out new file mode 100644 index 0000000000..efef162779 --- /dev/null +++ b/src/test/isolation/expected/index-only-scan-3.out @@ -0,0 +1,16 @@ +Parsed test spec with 2 sessions + +starting permutation: r1 r2 w1 w2 c1 c2 +step r1: SELECT id1 FROM account WHERE id1 = 2; +id1 + +2 +step r2: SELECT id2 FROM account WHERE id2 = 1; +id2 + +1 +step w1: UPDATE account SET balance = 200 WHERE id1 = 1; +step w2: UPDATE account SET balance = 200 WHERE id2 = 2; +step c1: COMMIT; +step c2: COMMIT; +ERROR: could not serialize access due to read/write dependencies among transactions diff --git a/src/test/isolation/isolation_schedule b/src/test/isolation/isolation_schedule index f4c01006fc..570aedb900 100644 --- a/src/test/isolation/isolation_schedule +++ b/src/test/isolation/isolation_schedule @@ -17,6 +17,8 @@ test: partial-index test: two-ids test: multiple-row-versions test: index-only-scan +test: index-only-scan-2 +test: index-only-scan-3 test: predicate-lock-hot-tuple test: update-conflict-out test: deadlock-simple diff --git 
a/src/test/isolation/specs/index-only-scan-2.spec b/src/test/isolation/specs/index-only-scan-2.spec new file mode 100644 index 0000000000..cd3c599af8 --- /dev/null +++ b/src/test/isolation/specs/index-only-scan-2.spec @@ -0,0 +1,39 @@ +# Test the granularity of predicate locks on heap tuples, for index-only scans. + +# Access the accounts through two different indexes, so that index predicate +# locks don't confuse matters; here we only want to test heap locking. Since +# s1 and s2 access different heap tuples there is no cycle, so both +# transactions should be able to commit. Before PostgreSQL 15, a serialization +# failure was reported because of a page-level SIREAD lock on the heap that +# produced a false conflict. + +setup +{ + CREATE TABLE account (id1 int, id2 int, balance int); + CREATE UNIQUE INDEX ON account(id1); + CREATE UNIQUE INDEX ON account(id2); + INSERT INTO account VALUES (1, 1, 100), (2, 2, 100); +} +setup +{ + VACUUM account; +} + +teardown +{ + DROP TABLE account; +} + +session "s1" +setup { BEGIN ISOLATION LEVEL SERIALIZABLE; SET enable_seqscan = off; } +step "r1" { SELECT id1 FROM account WHERE id1 = 1; } +step "w1" { UPDATE account SET balance = 200 WHERE id1 = 1; } +step "c1" { COMMIT; } + +session "s2" +setup { BEGIN ISOLATION LEVEL SERIALIZABLE; SET enable_seqscan = off; } +step "r2" { SELECT id2 FROM account WHERE id2 = 2; } +step "w2" { UPDATE account SET balance = 200 WHERE id2 = 2; } +step "c2" { COMMIT; } + +permutation "r1" "r2" "w1" "w2" "c1" "c2" diff --git a/src/test/isolation/specs/index-only-scan-3.spec b/src/test/isolation/specs/index-only-scan-3.spec new file mode 100644 index 0000000000..5f04923002 --- /dev/null +++ b/src/test/isolation/specs/index-only-scan-3.spec @@ -0,0 +1,33 @@ +# A variant of index-only-scan-2.spec that is a true conflict, detected via +# heap tuple locks only. 
+ +setup +{ + CREATE TABLE account (id1 int, id2 int, balance int); + CREATE UNIQUE INDEX ON account(id1); + CREATE UNIQUE INDEX ON account(id2); + INSERT INTO account VALUES (1, 1, 100), (2, 2, 100); +} +setup +{ + VACUUM account; +} + +teardown +{ + DROP TABLE account; +} + +session "s1" +setup { BEGIN ISOLATION LEVEL SERIALIZABLE; SET enable_seqscan = off; } +step "r1" { SELECT id1 FROM account WHERE id1 = 2; } +step "w1" { UPDATE account SET balance = 200 WHERE id1 = 1; } +step "c1" { COMMIT; } + +session "s2" +setup { BEGIN ISOLATION LEVEL SERIALIZABLE; SET enable_seqscan = off; } +step "r2" { SELECT id2 FROM account WHERE id2 = 1; } +step "w2" { UPDATE account SET balance = 200 WHERE id2 = 2; } +step "c2" { COMMIT; } + +permutation "r1" "r2" "w1" "w2" "c1" "c2" -- 2.30.2
From 10fc6dd34c806a42f9d842e190bf8636d223a29a Mon Sep 17 00:00:00 2001 From: Thomas Munro <thomas.mu...@gmail.com> Date: Fri, 11 Jun 2021 15:33:59 +1200 Subject: [PATCH v2 2/2] WIP: Skip SIREAD locks on btree pages when possible. Previously, we'd always predicate-lock the index pages we searched, and any heap tuples we found. As described by Kevin and Dan in README-SSI, it's not strictly necessary to lock both: if we found a tuple and can prove that there can't be another visible match, it's enough to lock just the tuple. XXX The work of detecting suitable quals is done at execution time by ExecIndexBuildScanKeys(), but it could probably be done by the planner. XXX The logic for "scheduling" a predicate lock and then clearing or flushing is a bit weird; it seems like it's necessary though, because you don't yet have enough information to judge the visibility while you're examining the index page. XXX The mechanism for detecting page splits is a bit weird. Discussion: https://postgr.es/m/CAEepm%3D2GK3FVdnt5V3d%2Bh9njWipCv_fNL%3DwjxyUhzsF%3D0PcbNg%40mail.gmail.com --- src/backend/access/index/genam.c | 2 + src/backend/access/nbtree/nbtsearch.c | 24 ++- src/backend/executor/nodeBitmapIndexscan.c | 3 +- src/backend/executor/nodeIndexonlyscan.c | 25 ++- src/backend/executor/nodeIndexscan.c | 94 +++++++++- src/backend/storage/lmgr/README-SSI | 13 -- src/backend/storage/lmgr/predicate.c | 84 +++++++++ src/include/access/relscan.h | 10 ++ src/include/executor/nodeIndexscan.h | 3 +- src/include/nodes/execnodes.h | 2 + src/include/storage/predicate.h | 23 +++ src/include/storage/predicate_internals.h | 1 + src/test/regress/expected/btree_index.out | 193 +++++++++++++++++++++ src/test/regress/sql/btree_index.sql | 79 +++++++++ 14 files changed, 532 insertions(+), 24 deletions(-) diff --git a/src/backend/access/index/genam.c b/src/backend/access/index/genam.c index b93288a6fe..40ae726081 100644 --- a/src/backend/access/index/genam.c +++ b/src/backend/access/index/genam.c @@ 
-126,6 +126,8 @@ RelationGetIndexScan(Relation indexRelation, int nkeys, int norderbys) scan->xs_hitup = NULL; scan->xs_hitupdesc = NULL; + scan->xs_defer_predlocks = false; + return scan; } diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c index d1177d8772..914aff9fc4 100644 --- a/src/backend/access/nbtree/nbtsearch.c +++ b/src/backend/access/nbtree/nbtsearch.c @@ -824,6 +824,21 @@ _bt_compare(Relation rel, return 0; } +/* + * _bt_predlock_page() -- Predicate-lock a page, immediate or deferred. + */ +static inline void +_bt_predlock_page(IndexScanDesc scan, Relation rel, BlockNumber blocknum) +{ + if (scan->xs_defer_predlocks) + SchedulePredicateLockPage(&scan->xs_deferred_predlocks, + rel, + blocknum, + scan->xs_snapshot); + else + PredicateLockPage(rel, blocknum, scan->xs_snapshot); +} + /* * _bt_first() -- Find the first item in a scan. * @@ -1374,8 +1389,7 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) return false; } else - PredicateLockPage(rel, BufferGetBlockNumber(buf), - scan->xs_snapshot); + _bt_predlock_page(scan, rel, BufferGetBlockNumber(buf)); _bt_initialize_more_data(so, dir); @@ -1999,7 +2013,7 @@ _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir) /* check for deleted page */ if (!P_IGNORE(opaque)) { - PredicateLockPage(rel, blkno, scan->xs_snapshot); + _bt_predlock_page(scan, rel, blkno); /* see if there are any matches on this page */ /* note that this will clear moreRight if we can stop */ if (_bt_readpage(scan, dir, P_FIRSTDATAKEY(opaque))) @@ -2101,7 +2115,7 @@ _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir) opaque = (BTPageOpaque) PageGetSpecialPointer(page); if (!P_IGNORE(opaque)) { - PredicateLockPage(rel, BufferGetBlockNumber(so->currPos.buf), scan->xs_snapshot); + _bt_predlock_page(scan, rel, BufferGetBlockNumber(so->currPos.buf)); /* see if there are any matches on this page */ /* note that this will clear moreLeft if we can stop */ if 
(_bt_readpage(scan, dir, PageGetMaxOffsetNumber(page))) @@ -2404,7 +2418,7 @@ _bt_endpoint(IndexScanDesc scan, ScanDirection dir) return false; } - PredicateLockPage(rel, BufferGetBlockNumber(buf), scan->xs_snapshot); + _bt_predlock_page(scan, rel, BufferGetBlockNumber(buf)); page = BufferGetPage(buf); opaque = (BTPageOpaque) PageGetSpecialPointer(page); Assert(P_ISLEAF(opaque)); diff --git a/src/backend/executor/nodeBitmapIndexscan.c b/src/backend/executor/nodeBitmapIndexscan.c index 48c2036297..ea9298fed4 100644 --- a/src/backend/executor/nodeBitmapIndexscan.c +++ b/src/backend/executor/nodeBitmapIndexscan.c @@ -283,7 +283,8 @@ ExecInitBitmapIndexScan(BitmapIndexScan *node, EState *estate, int eflags) &indexstate->biss_RuntimeKeys, &indexstate->biss_NumRuntimeKeys, &indexstate->biss_ArrayKeys, - &indexstate->biss_NumArrayKeys); + &indexstate->biss_NumArrayKeys, + NULL); /* * If we have runtime keys or array keys, we need an ExprContext to diff --git a/src/backend/executor/nodeIndexonlyscan.c b/src/backend/executor/nodeIndexonlyscan.c index f7185b4519..4583375b17 100644 --- a/src/backend/executor/nodeIndexonlyscan.c +++ b/src/backend/executor/nodeIndexonlyscan.c @@ -103,6 +103,13 @@ IndexOnlyNext(IndexOnlyScanState *node) node->ioss_ScanDesc->xs_want_itup = true; node->ioss_VMBuffer = InvalidBuffer; + /* Set up the single row SSI optimization, if possible. */ + if (node->ioss_SingleRow) + { + InitPredicateLockBuffer(&scandesc->xs_deferred_predlocks); + scandesc->xs_defer_predlocks = true; + } + /* * If no run-time keys to calculate or they are ready, go ahead and * pass the scankeys to the index AM. @@ -254,9 +261,23 @@ IndexOnlyNext(IndexOnlyScanState *node) estate->es_snapshot, InvalidTransactionId); + /* + * Single row SSI optimization: found a match, so we can throw away + * predicate locks on the index. 
+ */ + if (node->ioss_SingleRow) + ClearPredicateLockBuffer(&scandesc->xs_deferred_predlocks); + return slot; } + /* + * Single row SSI optimization: no match found, so we need the index locks + * to "lock the gap". + */ + if (node->ioss_SingleRow) + FlushPredicateLockBuffer(&scandesc->xs_deferred_predlocks); + /* * if we get here it means the index scan failed so we are at the end of * the scan.. @@ -593,7 +614,8 @@ ExecInitIndexOnlyScan(IndexOnlyScan *node, EState *estate, int eflags) &indexstate->ioss_RuntimeKeys, &indexstate->ioss_NumRuntimeKeys, NULL, /* no ArrayKeys */ - NULL); + NULL, + &indexstate->ioss_SingleRow); /* * any ORDER BY exprs have to be turned into scankeys in the same way @@ -607,6 +629,7 @@ ExecInitIndexOnlyScan(IndexOnlyScan *node, EState *estate, int eflags) &indexstate->ioss_RuntimeKeys, &indexstate->ioss_NumRuntimeKeys, NULL, /* no ArrayKeys */ + NULL, NULL); /* diff --git a/src/backend/executor/nodeIndexscan.c b/src/backend/executor/nodeIndexscan.c index 2fffb1b437..959e1ef3c3 100644 --- a/src/backend/executor/nodeIndexscan.c +++ b/src/backend/executor/nodeIndexscan.c @@ -36,8 +36,10 @@ #include "executor/execdebug.h" #include "executor/nodeIndexscan.h" #include "lib/pairingheap.h" +#include "lib/qunique.h" #include "miscadmin.h" #include "nodes/nodeFuncs.h" +#include "storage/predicate.h" #include "utils/array.h" #include "utils/datum.h" #include "utils/lsyscache.h" @@ -117,6 +119,18 @@ IndexNext(IndexScanState *node) node->iss_ScanDesc = scandesc; + /* + * Single row SSI optimization: if this scan can produce only one row, tell + * the index to defer acquisition of predicate locks on the index + * itself (but not the heap); we'll decide whether to acquire or forget + * them later. + */ + if (node->iss_SingleRow) + { + InitPredicateLockBuffer(&scandesc->xs_deferred_predlocks); + scandesc->xs_defer_predlocks = true; + } + /* * If no run-time keys to calculate or they are ready, go ahead and * pass the scankeys to the index AM. 
@@ -149,9 +163,24 @@ IndexNext(IndexScanState *node) } } + /* + * Single row SSI optimization: If we found a visible match, then there + * is no need for the index to acquire predicate locks. The heap tuple + * lock is enough, because there can be only one match. + */ + if (node->iss_SingleRow) + ClearPredicateLockBuffer(&scandesc->xs_deferred_predlocks); + return slot; } + /* + * Single row SSI optimization: If we didn't find a row, then the index + * needs to lock the gap. + */ + if (node->iss_SingleRow) + FlushPredicateLockBuffer(&scandesc->xs_deferred_predlocks); + /* * if we get here it means the index scan failed so we are at the end of * the scan.. @@ -987,7 +1016,8 @@ ExecInitIndexScan(IndexScan *node, EState *estate, int eflags) &indexstate->iss_RuntimeKeys, &indexstate->iss_NumRuntimeKeys, NULL, /* no ArrayKeys */ - NULL); + NULL, + &indexstate->iss_SingleRow); /* * any ORDER BY exprs have to be turned into scankeys in the same way @@ -1001,6 +1031,7 @@ ExecInitIndexScan(IndexScan *node, EState *estate, int eflags) &indexstate->iss_RuntimeKeys, &indexstate->iss_NumRuntimeKeys, NULL, /* no ArrayKeys */ + NULL, NULL); /* Initialize sort support, if we need to re-check ORDER BY exprs */ @@ -1086,6 +1117,16 @@ ExecInitIndexScan(IndexScan *node, EState *estate, int eflags) } +static int +compare_int16(const void *a, const void *b) +{ + int av = *(const int16 *) a; + int bv = *(const int16 *) b; + + return (av - bv); +} + + /* * ExecIndexBuildScanKeys * Build the index scan keys from the index qualification expressions @@ -1141,16 +1182,18 @@ ExecInitIndexScan(IndexScan *node, EState *estate, int eflags) * *numRuntimeKeys: receives number of runtime keys * *arrayKeys: receives ptr to array of IndexArrayKeyInfos, or NULL if none * *numArrayKeys: receives number of array keys + * *singleRow: receives flag meaning at most one visible row can be produced * * Caller may pass NULL for arrayKeys and numArrayKeys to indicate that - * IndexArrayKeyInfos are not 
supported. + * IndexArrayKeyInfos are not supported. Likewise for singleRow. */ void ExecIndexBuildScanKeys(PlanState *planstate, Relation index, List *quals, bool isorderby, ScanKey *scanKeys, int *numScanKeys, IndexRuntimeKeyInfo **runtimeKeys, int *numRuntimeKeys, - IndexArrayKeyInfo **arrayKeys, int *numArrayKeys) + IndexArrayKeyInfo **arrayKeys, int *numArrayKeys, + bool *singleRow) { ListCell *qual_cell; ScanKey scan_keys; @@ -1161,6 +1204,7 @@ ExecIndexBuildScanKeys(PlanState *planstate, Relation index, int max_runtime_keys; int n_array_keys; int j; + bool single_row; /* Allocate array for ScanKey structs: one per qual */ n_scan_keys = list_length(quals); @@ -1623,6 +1667,36 @@ ExecIndexBuildScanKeys(PlanState *planstate, Relation index, array_keys = NULL; } + /* + * Can at most one visible row can be produced by the scan? + * + * XXX btree only for now due to need to recognise equality + * XXX this is probably not the right place for this work! + * XXX do it in the planner? 🤯 + */ + single_row = false; + if (singleRow && + index->rd_rel->relam == BTREE_AM_OID && + index->rd_index->indisunique) + { + AttrNumber attnos[INDEX_MAX_KEYS]; + int nattnos = 0; + + /* Find all equality quals. */ + for (int i = 0; i < n_scan_keys; ++i) + { + if (scan_keys[i].sk_strategy == BTEqualStrategyNumber) + attnos[nattnos++] = scan_keys[i].sk_attno; + } + + /* Are all attributes covered? */ + /* XXX is this check enough or do we need to work harder? */ + qsort(attnos, nattnos, sizeof(AttrNumber), compare_int16); + nattnos = qunique(attnos, nattnos, sizeof(AttrNumber), compare_int16); + if (nattnos == index->rd_index->indnkeyatts) + single_row = true; + } + /* * Return info to our caller. 
*/ @@ -1637,6 +1711,8 @@ ExecIndexBuildScanKeys(PlanState *planstate, Relation index, } else if (n_array_keys != 0) elog(ERROR, "ScalarArrayOpExpr index qual found where not allowed"); + if (singleRow) + *singleRow = single_row; } /* ---------------------------------------------------------------- @@ -1689,6 +1765,12 @@ ExecIndexScanInitializeDSM(IndexScanState *node, node->iss_NumOrderByKeys, piscan); + if (node->iss_SingleRow) + { + InitPredicateLockBuffer(&node->iss_ScanDesc->xs_deferred_predlocks); + node->iss_ScanDesc->xs_defer_predlocks = true; + } + /* * If no run-time keys to calculate or they are ready, go ahead and pass * the scankeys to the index AM. @@ -1732,6 +1814,12 @@ ExecIndexScanInitializeWorker(IndexScanState *node, node->iss_NumOrderByKeys, piscan); + if (node->iss_SingleRow) + { + InitPredicateLockBuffer(&node->iss_ScanDesc->xs_deferred_predlocks); + node->iss_ScanDesc->xs_defer_predlocks = true; + } + /* * If no run-time keys to calculate or they are ready, go ahead and pass * the scankeys to the index AM. diff --git a/src/backend/storage/lmgr/README-SSI b/src/backend/storage/lmgr/README-SSI index 50d2ecca9d..afbd797754 100644 --- a/src/backend/storage/lmgr/README-SSI +++ b/src/backend/storage/lmgr/README-SSI @@ -349,12 +349,6 @@ Several optimizations are possible, though not all are implemented yet: * An index scan which is just finding the right position for an index insertion or deletion need not acquire a predicate lock. - * An index scan which is comparing for equality on the entire key -for a unique index need not acquire a predicate lock as long as a key -is found corresponding to a visible tuple which has not been modified -by another transaction -- there are no "between or around" gaps to -cover. - * As long as built-in foreign key enforcement continues to use its current "special tricks" to deal with MVCC issues, predicate locks should not be needed for scans done by enforcement code. @@ -610,13 +604,6 @@ address it? 
replication solutions, like Postgres-R, Slony, pgpool, HS/SR, etc. This is related to the "WAL file replay" issue. - * UNIQUE btree search for equality on all columns. Since a search -of a UNIQUE index using equality tests on all columns will lock the -heap tuple if an entry is found, it appears that there is no need to -get a predicate lock on the index in that case. A predicate lock is -still needed for such a search if a matching index entry which points -to a visible tuple is not found. - * Minimize touching of shared memory. Should lists in shared memory push entries which have just been returned to the front of the available list, so they will be popped back off soon and some memory diff --git a/src/backend/storage/lmgr/predicate.c b/src/backend/storage/lmgr/predicate.c index d493aeef0f..db997d2bc8 100644 --- a/src/backend/storage/lmgr/predicate.c +++ b/src/backend/storage/lmgr/predicate.c @@ -1277,6 +1277,8 @@ InitPredicateLocks(void) PredXact->OldCommittedSxact->xmin = InvalidTransactionId; PredXact->OldCommittedSxact->flags = SXACT_FLAG_COMMITTED; PredXact->OldCommittedSxact->pid = 0; + for (int i = 0; i < lengthof(PredXact->split_counters); ++i) + pg_atomic_init_u64(&PredXact->split_counters[i], 0); } /* This never changes, so let's keep a local copy. */ OldCommittedSxact = PredXact->OldCommittedSxact; @@ -3170,6 +3172,11 @@ PredicateLockPageSplit(Relation relation, BlockNumber oldblkno, PREDICATELOCKTARGETTAG oldtargettag; PREDICATELOCKTARGETTAG newtargettag; bool success; + uint32 split_counter; + + /* Make sure that deferred predicate locks are promoted to relation. */ + split_counter = oldblkno % lengthof(PredXact->split_counters); + pg_atomic_fetch_add_u64(&PredXact->split_counters[split_counter], 1); /* * Bail out quickly if there are no serializable transactions running. 
@@ -5201,3 +5208,80 @@ AttachSerializableXact(SerializableXactHandle handle) if (MySerializableXact != InvalidSerializableXact) CreateLocalPredicateLockHash(); } + +/* + * Initialize a buffer of predicate lock requests. + */ +void +InitPredicateLockBuffer(PredicateLockBuffer *buffer) +{ + buffer->size = 0; +} + +/* + * Acquire all buffered predicate locks. + */ +void +FlushPredicateLockBuffer(PredicateLockBuffer *buffer) +{ + for (int i = 0; i < buffer->size; ++i) + { + PredicateLockRequest *request = &buffer->requests[i]; + uint32 split_counter; + + PredicateLockPage(request->relation, + request->blocknum, + request->snapshot); + + /* + * Check if there's been a split. If so, we don't have enough + * information to follow the split, so we'll promote the SIREAD lock to + * whole-relation level. This should be unlikely. + */ + split_counter = request->blocknum % lengthof(PredXact->split_counters); + if (request->page_split_count != + pg_atomic_read_u64(&PredXact->split_counters[split_counter])) + PredicateLockRelation(request->relation, + request->snapshot); + } + buffer->size = 0; +} + +/* + * Forget buffered predicate locks, we don't need them after all. + */ +void +ClearPredicateLockBuffer(PredicateLockBuffer *buffer) +{ + buffer->size = 0; +} + +/* + * Remember to acquire a page-level predicate lock. + */ +void +SchedulePredicateLockPage(PredicateLockBuffer *buffer, + Relation relation, + BlockNumber blkno, + Snapshot snapshot) +{ + PredicateLockRequest *request; + uint32 split_counter; + + if (buffer->size == lengthof(buffer->requests)) + FlushPredicateLockBuffer(buffer); + + request = &buffer->requests[buffer->size++]; + request->relation = relation; + request->blocknum = blkno; + request->snapshot = snapshot; + + /* + * Record the page split counter while the caller holds the page content + * lock. This way we can detect page splits that occur between now and the + * time that we acquire the SIREAD lock, if we eventually flush. 
+ */ + split_counter = blkno % lengthof(PredXact->split_counters); + request->page_split_count = + pg_atomic_read_u64(&PredXact->split_counters[split_counter]); +} diff --git a/src/include/access/relscan.h b/src/include/access/relscan.h index 74a07ef152..b0747c5e1e 100644 --- a/src/include/access/relscan.h +++ b/src/include/access/relscan.h @@ -18,6 +18,7 @@ #include "access/itup.h" #include "port/atomics.h" #include "storage/buf.h" +#include "storage/predicate.h" #include "storage/spin.h" #include "utils/relcache.h" @@ -162,6 +163,15 @@ typedef struct IndexScanDescData bool *xs_orderbynulls; bool xs_recheckorderby; + /* + * In certain conditions, we can avoid acquiring predicate locks. We don't + * know if that's possible until later, so we provide a way for index AMs + * to defer predicate lock acquisition so that the caller can decide + * whether they're needed. + */ + bool xs_defer_predlocks; + PredicateLockBuffer xs_deferred_predlocks; + /* parallel index scan information, in shared memory */ struct ParallelIndexScanDescData *parallel_scan; } IndexScanDescData; diff --git a/src/include/executor/nodeIndexscan.h b/src/include/executor/nodeIndexscan.h index 70a2cff199..c0ccc5bd53 100644 --- a/src/include/executor/nodeIndexscan.h +++ b/src/include/executor/nodeIndexscan.h @@ -37,7 +37,8 @@ extern void ExecIndexBuildScanKeys(PlanState *planstate, Relation index, List *quals, bool isorderby, ScanKey *scanKeys, int *numScanKeys, IndexRuntimeKeyInfo **runtimeKeys, int *numRuntimeKeys, - IndexArrayKeyInfo **arrayKeys, int *numArrayKeys); + IndexArrayKeyInfo **arrayKeys, int *numArrayKeys, + bool *single_row); extern void ExecIndexEvalRuntimeKeys(ExprContext *econtext, IndexRuntimeKeyInfo *runtimeKeys, int numRuntimeKeys); extern bool ExecIndexEvalArrayKeys(ExprContext *econtext, diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h index 9a5ca7b3db..d5ec8ef37c 100644 --- a/src/include/nodes/execnodes.h +++ b/src/include/nodes/execnodes.h @@ -1472,6 
+1472,7 @@ typedef struct IndexScanState int iss_NumRuntimeKeys; bool iss_RuntimeKeysReady; ExprContext *iss_RuntimeContext; + bool iss_SingleRow; Relation iss_RelationDesc; struct IndexScanDescData *iss_ScanDesc; @@ -1517,6 +1518,7 @@ typedef struct IndexOnlyScanState int ioss_NumRuntimeKeys; bool ioss_RuntimeKeysReady; ExprContext *ioss_RuntimeContext; + bool ioss_SingleRow; Relation ioss_RelationDesc; struct IndexScanDescData *ioss_ScanDesc; TupleTableSlot *ioss_TableSlot; diff --git a/src/include/storage/predicate.h b/src/include/storage/predicate.h index 152b698611..8ed0063f2e 100644 --- a/src/include/storage/predicate.h +++ b/src/include/storage/predicate.h @@ -36,6 +36,20 @@ extern int max_predicate_locks_per_page; */ typedef void *SerializableXactHandle; +typedef struct PredicateLockRequest +{ + Relation relation; + BlockNumber blocknum; + Snapshot snapshot; + uint64 page_split_count; +} PredicateLockRequest; + +/* A small buffer for predicate lock requests that might later be elided. 
*/ +typedef struct PredicateLockBuffer { + int size; + PredicateLockRequest requests[4]; +} PredicateLockBuffer; + /* * function prototypes */ @@ -64,6 +78,15 @@ extern void PredicateLockPageCombine(Relation relation, BlockNumber oldblkno, Bl extern void TransferPredicateLocksToHeapRelation(Relation relation); extern void ReleasePredicateLocks(bool isCommit, bool isReadOnlySafe); +/* deferred predicate locks */ +extern void InitPredicateLockBuffer(PredicateLockBuffer *buffer); +extern void FlushPredicateLockBuffer(PredicateLockBuffer *buffer); +extern void ClearPredicateLockBuffer(PredicateLockBuffer *buffer); +extern void SchedulePredicateLockPage(PredicateLockBuffer *buffer, + Relation relation, + BlockNumber blkno, + Snapshot snapshot); + /* conflict detection (may also trigger rollback) */ extern bool CheckForSerializableConflictOutNeeded(Relation relation, Snapshot snapshot); extern void CheckForSerializableConflictOut(Relation relation, TransactionId xid, Snapshot snapshot); diff --git a/src/include/storage/predicate_internals.h b/src/include/storage/predicate_internals.h index 104f560d38..6a3dce6659 100644 --- a/src/include/storage/predicate_internals.h +++ b/src/include/storage/predicate_internals.h @@ -187,6 +187,7 @@ typedef struct PredXactListData SERIALIZABLEXACT *OldCommittedSxact; /* shared copy of dummy sxact */ PredXactListElement element; + pg_atomic_uint64 split_counters[64]; /* page split counters */ } PredXactListData; typedef struct PredXactListData *PredXactList; diff --git a/src/test/regress/expected/btree_index.out b/src/test/regress/expected/btree_index.out index bc113a70b4..545fff8b37 100644 --- a/src/test/regress/expected/btree_index.out +++ b/src/test/regress/expected/btree_index.out @@ -329,3 +329,196 @@ INSERT INTO delete_test_table SELECT i, 1, 2, 3 FROM generate_series(1,1000) i; -- Test unsupported btree opclass parameters create index on btree_tall_tbl (id int4_ops(foo=1)); ERROR: operator class int4_ops has no options +-- +-- Test 
SIREAD locking +-- +CREATE TABLE test_table (i int, j int, PRIMARY KEY (i, j)); +INSERT INTO test_table VALUES (1, 1), (2, 2); +VACUUM test_table; +SET enable_seqscan = off; +SET enable_indexonlyscan = off; +-- Unique quals, found -> heap tuple lock only +BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE; +EXPLAIN (COSTS OFF) +SELECT * FROM test_table WHERE i = 1 AND j = 1; + QUERY PLAN +------------------------------------------------ + Index Scan using test_table_pkey on test_table + Index Cond: ((i = 1) AND (j = 1)) +(2 rows) + +SELECT * FROM test_table WHERE i = 1 AND j = 1; + i | j +---+--- + 1 | 1 +(1 row) + +SELECT locktype, relation::regclass, page, tuple FROM pg_locks WHERE mode = 'SIReadLock' AND pid = pg_backend_pid() ORDER BY 1, 2, 3, 4; + locktype | relation | page | tuple +----------+------------+------+------- + tuple | test_table | 0 | 1 +(1 row) + +ROLLBACK; +-- Unique quals, not found -> btree page lock +BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE; +EXPLAIN (COSTS OFF) +SELECT * FROM test_table WHERE i = 1 AND j = 2; + QUERY PLAN +------------------------------------------------ + Index Scan using test_table_pkey on test_table + Index Cond: ((i = 1) AND (j = 2)) +(2 rows) + +SELECT * FROM test_table WHERE i = 1 AND j = 2; + i | j +---+--- +(0 rows) + +SELECT locktype, relation::regclass, page, tuple FROM pg_locks WHERE mode = 'SIReadLock' AND pid = pg_backend_pid() ORDER BY 1, 2, 3, 4; + locktype | relation | page | tuple +----------+-----------------+------+------- + page | test_table_pkey | 1 | +(1 row) + +ROLLBACK; +-- Non-unique quals, found -> heap tuple and btree page locks +BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE; +EXPLAIN (COSTS OFF) +SELECT * FROM test_table WHERE i = 1; + QUERY PLAN +------------------------------------------------ + Index Scan using test_table_pkey on test_table + Index Cond: (i = 1) +(2 rows) + +SELECT * FROM test_table WHERE i = 1; + i | j +---+--- + 1 | 1 +(1 row) + +SELECT locktype, relation::regclass, page, 
tuple FROM pg_locks WHERE mode = 'SIReadLock' AND pid = pg_backend_pid() ORDER BY 1, 2, 3, 4; + locktype | relation | page | tuple +----------+-----------------+------+------- + page | test_table_pkey | 1 | + tuple | test_table | 0 | 1 +(2 rows) + +ROLLBACK; +-- Non-unique quals, not found -> btree page lock +BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE; +EXPLAIN (COSTS OFF) +SELECT * FROM test_table WHERE i = 3; + QUERY PLAN +------------------------------------------------ + Index Scan using test_table_pkey on test_table + Index Cond: (i = 3) +(2 rows) + +SELECT * FROM test_table WHERE i = 3; + i | j +---+--- +(0 rows) + +SELECT locktype, relation::regclass, page, tuple FROM pg_locks WHERE mode = 'SIReadLock' AND pid = pg_backend_pid() ORDER BY 1, 2, 3, 4; + locktype | relation | page | tuple +----------+-----------------+------+------- + page | test_table_pkey | 1 | +(1 row) + +ROLLBACK; +-- The same 4 cases, this time with an index-only scan +RESET enable_indexonlyscan; +-- Unique quals, found -> heap tuple lock only +BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE; +EXPLAIN (COSTS OFF) +SELECT * FROM test_table WHERE i = 1 AND j = 1; + QUERY PLAN +----------------------------------------------------- + Index Only Scan using test_table_pkey on test_table + Index Cond: ((i = 1) AND (j = 1)) +(2 rows) + +SELECT * FROM test_table WHERE i = 1 AND j = 1; + i | j +---+--- + 1 | 1 +(1 row) + +SELECT locktype, relation::regclass, page, tuple FROM pg_locks WHERE mode = 'SIReadLock' AND pid = pg_backend_pid() ORDER BY 1, 2, 3, 4; + locktype | relation | page | tuple +----------+------------+------+------- + tuple | test_table | 0 | 1 +(1 row) + +ROLLBACK; +-- Unique quals, not found -> btree page lock +BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE; +EXPLAIN (COSTS OFF) +SELECT * FROM test_table WHERE i = 1 AND j = 2; + QUERY PLAN +----------------------------------------------------- + Index Only Scan using test_table_pkey on test_table + Index Cond: ((i = 1) AND (j = 
2)) +(2 rows) + +SELECT * FROM test_table WHERE i = 1 AND j = 2; + i | j +---+--- +(0 rows) + +SELECT locktype, relation::regclass, page, tuple FROM pg_locks WHERE mode = 'SIReadLock' AND pid = pg_backend_pid() ORDER BY 1, 2, 3, 4; + locktype | relation | page | tuple +----------+-----------------+------+------- + page | test_table_pkey | 1 | +(1 row) + +ROLLBACK; +-- Non-unique quals, found -> heap tuple and btree page locks +BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE; +EXPLAIN (COSTS OFF) +SELECT * FROM test_table WHERE i = 1; + QUERY PLAN +----------------------------------------------------- + Index Only Scan using test_table_pkey on test_table + Index Cond: (i = 1) +(2 rows) + +SELECT * FROM test_table WHERE i = 1; + i | j +---+--- + 1 | 1 +(1 row) + +SELECT locktype, relation::regclass, page, tuple FROM pg_locks WHERE mode = 'SIReadLock' AND pid = pg_backend_pid() ORDER BY 1, 2, 3, 4; + locktype | relation | page | tuple +----------+-----------------+------+------- + page | test_table_pkey | 1 | + tuple | test_table | 0 | 1 +(2 rows) + +ROLLBACK; +-- Non-unique quals, not found -> btree page lock +BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE; +EXPLAIN (COSTS OFF) +SELECT * FROM test_table WHERE i = 3; + QUERY PLAN +----------------------------------------------------- + Index Only Scan using test_table_pkey on test_table + Index Cond: (i = 3) +(2 rows) + +SELECT * FROM test_table WHERE i = 3; + i | j +---+--- +(0 rows) + +SELECT locktype, relation::regclass, page, tuple FROM pg_locks WHERE mode = 'SIReadLock' AND pid = pg_backend_pid() ORDER BY 1, 2, 3, 4; + locktype | relation | page | tuple +----------+-----------------+------+------- + page | test_table_pkey | 1 | +(1 row) + +ROLLBACK; +DROP TABLE test_table; diff --git a/src/test/regress/sql/btree_index.sql b/src/test/regress/sql/btree_index.sql index c60312db2d..739b243f5d 100644 --- a/src/test/regress/sql/btree_index.sql +++ b/src/test/regress/sql/btree_index.sql @@ -172,3 +172,82 @@ INSERT INTO 
delete_test_table SELECT i, 1, 2, 3 FROM generate_series(1,1000) i; -- Test unsupported btree opclass parameters create index on btree_tall_tbl (id int4_ops(foo=1)); + +-- +-- Test SIREAD locking +-- + +CREATE TABLE test_table (i int, j int, PRIMARY KEY (i, j)); +INSERT INTO test_table VALUES (1, 1), (2, 2); +VACUUM test_table; +SET enable_seqscan = off; +SET enable_indexonlyscan = off; + +-- Unique quals, found -> heap tuple lock only +BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE; +EXPLAIN (COSTS OFF) +SELECT * FROM test_table WHERE i = 1 AND j = 1; +SELECT * FROM test_table WHERE i = 1 AND j = 1; +SELECT locktype, relation::regclass, page, tuple FROM pg_locks WHERE mode = 'SIReadLock' AND pid = pg_backend_pid() ORDER BY 1, 2, 3, 4; +ROLLBACK; + +-- Unique quals, not found -> btree page lock +BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE; +EXPLAIN (COSTS OFF) +SELECT * FROM test_table WHERE i = 1 AND j = 2; +SELECT * FROM test_table WHERE i = 1 AND j = 2; +SELECT locktype, relation::regclass, page, tuple FROM pg_locks WHERE mode = 'SIReadLock' AND pid = pg_backend_pid() ORDER BY 1, 2, 3, 4; +ROLLBACK; + +-- Non-unique quals, found -> heap tuple and btree page locks +BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE; +EXPLAIN (COSTS OFF) +SELECT * FROM test_table WHERE i = 1; +SELECT * FROM test_table WHERE i = 1; +SELECT locktype, relation::regclass, page, tuple FROM pg_locks WHERE mode = 'SIReadLock' AND pid = pg_backend_pid() ORDER BY 1, 2, 3, 4; +ROLLBACK; + +-- Non-unique quals, not found -> btree page lock +BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE; +EXPLAIN (COSTS OFF) +SELECT * FROM test_table WHERE i = 3; +SELECT * FROM test_table WHERE i = 3; +SELECT locktype, relation::regclass, page, tuple FROM pg_locks WHERE mode = 'SIReadLock' AND pid = pg_backend_pid() ORDER BY 1, 2, 3, 4; +ROLLBACK; + +-- The same 4 cases, this time with an index-only scan +RESET enable_indexonlyscan; + +-- Unique quals, found -> heap tuple lock only +BEGIN TRANSACTION 
ISOLATION LEVEL SERIALIZABLE; +EXPLAIN (COSTS OFF) +SELECT * FROM test_table WHERE i = 1 AND j = 1; +SELECT * FROM test_table WHERE i = 1 AND j = 1; +SELECT locktype, relation::regclass, page, tuple FROM pg_locks WHERE mode = 'SIReadLock' AND pid = pg_backend_pid() ORDER BY 1, 2, 3, 4; +ROLLBACK; + +-- Unique quals, not found -> btree page lock +BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE; +EXPLAIN (COSTS OFF) +SELECT * FROM test_table WHERE i = 1 AND j = 2; +SELECT * FROM test_table WHERE i = 1 AND j = 2; +SELECT locktype, relation::regclass, page, tuple FROM pg_locks WHERE mode = 'SIReadLock' AND pid = pg_backend_pid() ORDER BY 1, 2, 3, 4; +ROLLBACK; + +-- Non-unique quals, found -> heap tuple and btree page locks +BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE; +EXPLAIN (COSTS OFF) +SELECT * FROM test_table WHERE i = 1; +SELECT * FROM test_table WHERE i = 1; +SELECT locktype, relation::regclass, page, tuple FROM pg_locks WHERE mode = 'SIReadLock' AND pid = pg_backend_pid() ORDER BY 1, 2, 3, 4; +ROLLBACK; + +-- Non-unique quals, not found -> btree page lock +BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE; +EXPLAIN (COSTS OFF) +SELECT * FROM test_table WHERE i = 3; +SELECT * FROM test_table WHERE i = 3; +SELECT locktype, relation::regclass, page, tuple FROM pg_locks WHERE mode = 'SIReadLock' AND pid = pg_backend_pid() ORDER BY 1, 2, 3, 4; +ROLLBACK; + +DROP TABLE test_table; -- 2.30.2