Hello, Matthias! Updated patches attached.
Changes: * cleanup test logic a little bit * resolve issue with rescan in GIST (item->blkno == InvalidBlockNumber) * move test to the main isolation suite * add test for SpGist * update comment I mentioned before * allow GIST to set LP_DEAD in cases where it is safe even if the LSN is updated Also, it seems the SP-GiST version is broken; it fails like this: TRAP: failed Assert("BufferIsValid(so->pagePin)"), File: "../src/backend/access/spgist/spgscan.c", Line: 1139, PID: 612214 FETCH(ExceptionalCondition+0xbe)[0x644a0b9dfdbc] FETCH(spggettuple+0x289)[0x644a0b3743c6] FETCH(index_getnext_tid+0x166)[0x644a0b3382f7] FETCH(+0x3b392b)[0x644a0b56d92b] FETCH(+0x3887df)[0x644a0b5427df] FETCH(ExecScan+0x77)[0x644a0b542858] FETCH(+0x3b3b9b)[0x644a0b56db9b] FETCH(+0x376d8b)[0x644a0b530d8b] FETCH(+0x379bd9)[0x644a0b533bd9] FETCH(standard_ExecutorRun+0x19f)[0x644a0b531457] FETCH(ExecutorRun+0x5a)[0x644a0b5312b5] FETCH(+0x6276dc)[0x644a0b7e16dc] FETCH(+0x628936)[0x644a0b7e2936] FETCH(PortalRunFetch+0x1a0)[0x644a0b7e229c] FETCH(PerformPortalFetch+0x13b)[0x644a0b49d7e5] FETCH(standard_ProcessUtility+0x5f0)[0x644a0b7e3aab] FETCH(ProcessUtility+0x140)[0x644a0b7e34b4] FETCH(+0x627ceb)[0x644a0b7e1ceb] FETCH(+0x627a28)[0x644a0b7e1a28] FETCH(PortalRun+0x273)[0x644a0b7e12bb] FETCH(+0x61fae1)[0x644a0b7d9ae1] FETCH(PostgresMain+0x9eb)[0x644a0b7df170] FETCH(+0x61b3e2)[0x644a0b7d53e2] FETCH(postmaster_child_launch+0x137)[0x644a0b6e6e2d] FETCH(+0x53384b)[0x644a0b6ed84b] FETCH(+0x530f31)[0x644a0b6eaf31] FETCH(PostmasterMain+0x161f)[0x644a0b6ea812] FETCH(main+0x3a1)[0x644a0b5c29cf] Best regards, Mikhail. >
From 59b7746c96cca144c1d1d0362a96d8aa019e2a23 Mon Sep 17 00:00:00 2001 From: nkey <n...@toloka.ai> Date: Fri, 10 Jan 2025 17:55:30 +0100 Subject: [PATCH v5 2/3] RFC: Extend buffer pinning for GIST IOS This should fix issues with incorrect results when a GIST IOS encounters tuples removed from pages by a concurrent vacuum operation. Also, add ability to set LP_DEAD bits in more cases of IOS scans overs GIST. --- src/backend/access/gist/README | 16 ++++ src/backend/access/gist/gistget.c | 46 +++++++++-- src/backend/access/gist/gistscan.c | 115 ++++++++++++++++++++++++--- src/backend/access/gist/gistvacuum.c | 6 +- src/include/access/gist_private.h | 2 + 5 files changed, 166 insertions(+), 19 deletions(-) diff --git a/src/backend/access/gist/README b/src/backend/access/gist/README index 8015ff19f05..c7c2afad088 100644 --- a/src/backend/access/gist/README +++ b/src/backend/access/gist/README @@ -287,6 +287,22 @@ would complicate the insertion algorithm. So when an insertion sees a page with F_FOLLOW_RIGHT set, it immediately tries to bring the split that crashed in the middle to completion by adding the downlink in the parent. +Index-only scans and VACUUM +--------------------------- + +Index-only scans require that any tuple returned by the index scan has not +been removed from the index by a call to ambulkdelete through VACUUM. +To ensure this invariant, bulkdelete now requires a buffer cleanup lock, and +every Index-only scan (IOS) will keep a pin on each page that it is returning +tuples from. For ordered scans, we keep one pin for each matching leaf tuple, +for unordered scans we just keep an additional pin while we're still working +on the page's tuples. This ensures that pages seen by the scan won't be +cleaned up until after the tuples have been returned. 
+ +These longer pin lifetimes can cause buffer exhaustion with messages like "no +unpinned buffers available" when the index has many pages that have similar +ordering; but future work can figure out how to best work that out. + Buffering build algorithm ------------------------- diff --git a/src/backend/access/gist/gistget.c b/src/backend/access/gist/gistget.c index cc40e928e0a..adf86fed67b 100644 --- a/src/backend/access/gist/gistget.c +++ b/src/backend/access/gist/gistget.c @@ -35,7 +35,7 @@ * away and the TID was re-used by a completely different heap tuple. */ static void -gistkillitems(IndexScanDesc scan) +gistkillitems(IndexScanDesc scan, bool pagePinned) { GISTScanOpaque so = (GISTScanOpaque) scan->opaque; Buffer buffer; @@ -60,9 +60,10 @@ gistkillitems(IndexScanDesc scan) /* * If page LSN differs it means that the page was modified since the last * read. killedItems could be not valid so LP_DEAD hints applying is not - * safe. + * safe. But in case then page was pinned - it is safe, because VACUUM is + * unable to remove tuples due to locking protocol. */ - if (BufferGetLSNAtomic(buffer) != so->curPageLSN) + if (!pagePinned && BufferGetLSNAtomic(buffer) != so->curPageLSN) { UnlockReleaseBuffer(buffer); so->numKilled = 0; /* reset counter */ @@ -395,6 +396,7 @@ gistScanPage(IndexScanDesc scan, GISTSearchItem *pageItem, } so->nPageData = so->curPageData = 0; + Assert(so->pagePin == InvalidBuffer); scan->xs_hitup = NULL; /* might point into pageDataCxt */ if (so->pageDataCxt) MemoryContextReset(so->pageDataCxt); @@ -402,7 +404,8 @@ gistScanPage(IndexScanDesc scan, GISTSearchItem *pageItem, /* * We save the LSN of the page as we read it, so that we know whether it * safe to apply LP_DEAD hints to the page later. This allows us to drop - * the pin for MVCC scans, which allows vacuum to avoid blocking. + * the pin for MVCC scans (except index-only scans), which allows vacuum + * to avoid blocking. 
*/ so->curPageLSN = BufferGetLSNAtomic(buffer); @@ -460,6 +463,7 @@ gistScanPage(IndexScanDesc scan, GISTSearchItem *pageItem, so->pageData[so->nPageData].heapPtr = it->t_tid; so->pageData[so->nPageData].recheck = recheck; so->pageData[so->nPageData].offnum = i; + so->pageData[so->nPageData].buffer = InvalidBuffer; /* * In an index-only scan, also fetch the data from the tuple. The @@ -471,7 +475,18 @@ gistScanPage(IndexScanDesc scan, GISTSearchItem *pageItem, so->pageData[so->nPageData].recontup = gistFetchTuple(giststate, r, it); MemoryContextSwitchTo(oldcxt); + + /* + * Only maintain a single additional buffer pin for unordered + * IOS scans; as we have all data already in one place. + */ + if (so->nPageData == 0) + { + so->pagePin = buffer; + IncrBufferRefCount(buffer); + } } + so->nPageData++; } else @@ -501,7 +516,11 @@ gistScanPage(IndexScanDesc scan, GISTSearchItem *pageItem, * In an index-only scan, also fetch the data from the tuple. */ if (scan->xs_want_itup) + { item->data.heap.recontup = gistFetchTuple(giststate, r, it); + item->data.heap.buffer = buffer; + IncrBufferRefCount(buffer); + } } else { @@ -567,6 +586,10 @@ getNextNearest(IndexScanDesc scan) /* free previously returned tuple */ pfree(scan->xs_hitup); scan->xs_hitup = NULL; + + Assert(BufferIsValid(so->pagePin)); + ReleaseBuffer(so->pagePin); + so->pagePin = InvalidBuffer; } do @@ -588,7 +611,11 @@ getNextNearest(IndexScanDesc scan) /* in an index-only scan, also return the reconstructed tuple. 
*/ if (scan->xs_want_itup) + { + Assert(BufferIsValid(item->data.heap.buffer)); scan->xs_hitup = item->data.heap.recontup; + so->pagePin = item->data.heap.buffer; + } res = true; } else @@ -688,7 +715,6 @@ gistgettuple(IndexScanDesc scan, ScanDirection dir) && so->curPageData > 0 && so->curPageData == so->nPageData) { - if (so->killedItems == NULL) { MemoryContext oldCxt = @@ -704,13 +730,21 @@ gistgettuple(IndexScanDesc scan, ScanDirection dir) so->killedItems[so->numKilled++] = so->pageData[so->curPageData - 1].offnum; } + + if (scan->xs_want_itup && so->nPageData > 0) + { + Assert(BufferIsValid(so->pagePin)); + ReleaseBuffer(so->pagePin); + so->pagePin = InvalidBuffer; + } + /* find and process the next index page */ do { GISTSearchItem *item; if ((so->curBlkno != InvalidBlockNumber) && (so->numKilled > 0)) - gistkillitems(scan); + gistkillitems(scan, BufferIsValid(so->pagePin)); item = getNextGISTSearchItem(so); diff --git a/src/backend/access/gist/gistscan.c b/src/backend/access/gist/gistscan.c index 700fa959d03..932c2271510 100644 --- a/src/backend/access/gist/gistscan.c +++ b/src/backend/access/gist/gistscan.c @@ -110,6 +110,7 @@ gistbeginscan(Relation r, int nkeys, int norderbys) so->numKilled = 0; so->curBlkno = InvalidBlockNumber; so->curPageLSN = InvalidXLogRecPtr; + so->pagePin = InvalidBuffer; scan->opaque = so; @@ -151,18 +152,73 @@ gistrescan(IndexScanDesc scan, ScanKey key, int nkeys, Assert(so->queueCxt == so->giststate->scanCxt); first_time = true; } - else if (so->queueCxt == so->giststate->scanCxt) - { - /* second time through */ - so->queueCxt = AllocSetContextCreate(so->giststate->scanCxt, - "GiST queue context", - ALLOCSET_DEFAULT_SIZES); - first_time = false; - } else { - /* third or later time through */ - MemoryContextReset(so->queueCxt); + /* + * In the first scan of a query we allocate IOS items in the scan + * context, which is never reset. To not leak this memory, we + * manually free the queue entries. 
+ */ + const bool freequeue = so->queueCxt == so->giststate->scanCxt; + /* + * Index-only scans require that vacuum can't clean up entries that + * we're still planning to return, so we hold a pin on the buffer until + * we're past the returned item (1 pin count for every index tuple). + * When rescan is called, however, we need to clean up the pins that + * we still hold, lest we leak them and lose a buffer entry to that + * page. + */ + const bool unpinqueue = scan->xs_want_itup; + + if (freequeue || unpinqueue) + { + while (!pairingheap_is_empty(so->queue)) + { + pairingheap_node *node; + GISTSearchItem *item; + + node = pairingheap_remove_first(so->queue); + item = pairingheap_container(GISTSearchItem, phNode, node); + + /* + * If we need to unpin a buffer for IOS' heap items, do so + * now. + */ + if (unpinqueue && item->blkno == InvalidBlockNumber) + { + Assert(BufferIsValid(item->data.heap.buffer)); + ReleaseBuffer(item->data.heap.buffer); + } + + /* + * item->data.heap.recontup is stored in the separate memory + * context so->pageDataCxt, which is always reset; so we don't + * need to free that. + * "item" itself is allocated into the queue context, which is + * generally reset in rescan. + * However, only in the first scan, we allocate these items + * into the main scan context, which isn't reset; so we must + * free these items, or else we'd leak the memory for the + * duration of the query. 
+ */ + if (freequeue) + pfree(item); + } + } + + if (so->queueCxt == so->giststate->scanCxt) + { + /* second time through */ + so->queueCxt = AllocSetContextCreate(so->giststate->scanCxt, + "GiST queue context", + ALLOCSET_DEFAULT_SIZES); + } + else + { + /* third or later time through */ + MemoryContextReset(so->queueCxt); + } + first_time = false; } @@ -341,6 +397,15 @@ gistrescan(IndexScanDesc scan, ScanKey key, int nkeys, /* any previous xs_hitup will have been pfree'd in context resets above */ scan->xs_hitup = NULL; + + if (scan->xs_want_itup) + { + if (BufferIsValid(so->pagePin)) + { + ReleaseBuffer(so->pagePin); + so->pagePin = InvalidBuffer; + } + } } void @@ -348,6 +413,36 @@ gistendscan(IndexScanDesc scan) { GISTScanOpaque so = (GISTScanOpaque) scan->opaque; + if (scan->xs_want_itup) + { + if (BufferIsValid(so->pagePin)) + { + ReleaseBuffer(so->pagePin); + so->pagePin = InvalidBuffer; + } + + /* unpin any leftover buffers */ + while (!pairingheap_is_empty(so->queue)) + { + pairingheap_node *node; + GISTSearchItem *item; + + /* + * Note: unlike gistrescan, there is no need to actually free the + * items here, as that's handled by memory context reset in the + * call to freeGISTstate() below. + */ + node = pairingheap_remove_first(so->queue); + item = pairingheap_container(GISTSearchItem, phNode, node); + + if (item->blkno == InvalidBlockNumber) + { + Assert(BufferIsValid(item->data.heap.buffer)); + ReleaseBuffer(item->data.heap.buffer); + } + } + } + /* * freeGISTstate is enough to clean up everything made by gistbeginscan, * as well as the queueCxt if there is a separate context for it. diff --git a/src/backend/access/gist/gistvacuum.c b/src/backend/access/gist/gistvacuum.c index fe0bfb781ca..840b3d586ed 100644 --- a/src/backend/access/gist/gistvacuum.c +++ b/src/backend/access/gist/gistvacuum.c @@ -289,10 +289,10 @@ restart: info->strategy); /* - * We are not going to stay here for a long time, aggressively grab an - * exclusive lock. 
+ * We are not going to stay here for a long time, aggressively grab a + * cleanup lock. */ - LockBuffer(buffer, GIST_EXCLUSIVE); + LockBufferForCleanup(buffer); page = (Page) BufferGetPage(buffer); if (gistPageRecyclable(page)) diff --git a/src/include/access/gist_private.h b/src/include/access/gist_private.h index 39404ec7cdb..e559117e7d7 100644 --- a/src/include/access/gist_private.h +++ b/src/include/access/gist_private.h @@ -124,6 +124,7 @@ typedef struct GISTSearchHeapItem * index-only scans */ OffsetNumber offnum; /* track offset in page to mark tuple as * LP_DEAD */ + Buffer buffer; /* buffer to unpin, when IOS */ } GISTSearchHeapItem; /* Unvisited item, either index page or heap tuple */ @@ -176,6 +177,7 @@ typedef struct GISTScanOpaqueData OffsetNumber curPageData; /* next item to return */ MemoryContext pageDataCxt; /* context holding the fetched tuples, for * index-only scans */ + Buffer pagePin; /* buffer of page, if pinned */ } GISTScanOpaqueData; typedef GISTScanOpaqueData *GISTScanOpaque; -- 2.43.0
From 5423affdba594ca0c1575f4f35c6ec479f82b216 Mon Sep 17 00:00:00 2001 From: nkey <n...@toloka.ai> Date: Fri, 10 Jan 2025 16:22:29 +0100 Subject: [PATCH v5 1/3] isolation tester showing broken index-only scans with GiST and SP-GiST Co-authored-by: Matthias van de Meent <boekewurm+postg...@gmail.com>, Michail Nikolaev <michail.nikol...@gmail.com> --- .../expected/index-only-scan-gist-vacuum.out | 67 +++++++++++ .../index-only-scan-spgist-vacuum.out | 67 +++++++++++ src/test/isolation/isolation_schedule | 2 + .../specs/index-only-scan-gist-vacuum.spec | 113 ++++++++++++++++++ .../specs/index-only-scan-spgist-vacuum.spec | 113 ++++++++++++++++++ 5 files changed, 362 insertions(+) create mode 100644 src/test/isolation/expected/index-only-scan-gist-vacuum.out create mode 100644 src/test/isolation/expected/index-only-scan-spgist-vacuum.out create mode 100644 src/test/isolation/specs/index-only-scan-gist-vacuum.spec create mode 100644 src/test/isolation/specs/index-only-scan-spgist-vacuum.spec diff --git a/src/test/isolation/expected/index-only-scan-gist-vacuum.out b/src/test/isolation/expected/index-only-scan-gist-vacuum.out new file mode 100644 index 00000000000..19117402f52 --- /dev/null +++ b/src/test/isolation/expected/index-only-scan-gist-vacuum.out @@ -0,0 +1,67 @@ +Parsed test spec with 2 sessions + +starting permutation: s2_mod s1_begin s1_prepare_sorted s1_fetch_1 s2_vacuum s1_fetch_all s1_commit +step s2_mod: + DELETE FROM ios_needs_cleanup_lock WHERE a != point '(1,1)'; + +step s1_begin: BEGIN; +step s1_prepare_sorted: + DECLARE foo NO SCROLL CURSOR FOR SELECT a <-> point '(0,0)' as x FROM ios_needs_cleanup_lock ORDER BY a <-> point '(0,0)'; + +step s1_fetch_1: + FETCH FROM foo; + + x +------------------ +1.4142135623730951 +(1 row) + +step s2_vacuum: VACUUM (TRUNCATE false) ios_needs_cleanup_lock; <waiting ...> +step s1_fetch_all: + SELECT pg_sleep_for(INTERVAL '50ms'); + FETCH ALL FROM foo; + +pg_sleep_for +------------ + +(1 row) + +x +- +(0 rows) + +step 
s2_vacuum: <... completed> +step s1_commit: COMMIT; + +starting permutation: s2_mod s1_begin s1_prepare_unsorted s1_fetch_1 s2_vacuum s1_fetch_all s1_commit +step s2_mod: + DELETE FROM ios_needs_cleanup_lock WHERE a != point '(1,1)'; + +step s1_begin: BEGIN; +step s1_prepare_unsorted: + DECLARE foo NO SCROLL CURSOR FOR SELECT a FROM ios_needs_cleanup_lock WHERE box '((-100,-100),(100,100))' @> a; + +step s1_fetch_1: + FETCH FROM foo; + +a +----- +(1,1) +(1 row) + +step s2_vacuum: VACUUM (TRUNCATE false) ios_needs_cleanup_lock; <waiting ...> +step s1_fetch_all: + SELECT pg_sleep_for(INTERVAL '50ms'); + FETCH ALL FROM foo; + +pg_sleep_for +------------ + +(1 row) + +a +- +(0 rows) + +step s2_vacuum: <... completed> +step s1_commit: COMMIT; diff --git a/src/test/isolation/expected/index-only-scan-spgist-vacuum.out b/src/test/isolation/expected/index-only-scan-spgist-vacuum.out new file mode 100644 index 00000000000..19117402f52 --- /dev/null +++ b/src/test/isolation/expected/index-only-scan-spgist-vacuum.out @@ -0,0 +1,67 @@ +Parsed test spec with 2 sessions + +starting permutation: s2_mod s1_begin s1_prepare_sorted s1_fetch_1 s2_vacuum s1_fetch_all s1_commit +step s2_mod: + DELETE FROM ios_needs_cleanup_lock WHERE a != point '(1,1)'; + +step s1_begin: BEGIN; +step s1_prepare_sorted: + DECLARE foo NO SCROLL CURSOR FOR SELECT a <-> point '(0,0)' as x FROM ios_needs_cleanup_lock ORDER BY a <-> point '(0,0)'; + +step s1_fetch_1: + FETCH FROM foo; + + x +------------------ +1.4142135623730951 +(1 row) + +step s2_vacuum: VACUUM (TRUNCATE false) ios_needs_cleanup_lock; <waiting ...> +step s1_fetch_all: + SELECT pg_sleep_for(INTERVAL '50ms'); + FETCH ALL FROM foo; + +pg_sleep_for +------------ + +(1 row) + +x +- +(0 rows) + +step s2_vacuum: <... 
completed> +step s1_commit: COMMIT; + +starting permutation: s2_mod s1_begin s1_prepare_unsorted s1_fetch_1 s2_vacuum s1_fetch_all s1_commit +step s2_mod: + DELETE FROM ios_needs_cleanup_lock WHERE a != point '(1,1)'; + +step s1_begin: BEGIN; +step s1_prepare_unsorted: + DECLARE foo NO SCROLL CURSOR FOR SELECT a FROM ios_needs_cleanup_lock WHERE box '((-100,-100),(100,100))' @> a; + +step s1_fetch_1: + FETCH FROM foo; + +a +----- +(1,1) +(1 row) + +step s2_vacuum: VACUUM (TRUNCATE false) ios_needs_cleanup_lock; <waiting ...> +step s1_fetch_all: + SELECT pg_sleep_for(INTERVAL '50ms'); + FETCH ALL FROM foo; + +pg_sleep_for +------------ + +(1 row) + +a +- +(0 rows) + +step s2_vacuum: <... completed> +step s1_commit: COMMIT; diff --git a/src/test/isolation/isolation_schedule b/src/test/isolation/isolation_schedule index 143109aa4da..9720c9a2dc8 100644 --- a/src/test/isolation/isolation_schedule +++ b/src/test/isolation/isolation_schedule @@ -17,6 +17,8 @@ test: partial-index test: two-ids test: multiple-row-versions test: index-only-scan +test: index-only-scan-gist-vacuum +test: index-only-scan-spgist-vacuum test: predicate-lock-hot-tuple test: update-conflict-out test: deadlock-simple diff --git a/src/test/isolation/specs/index-only-scan-gist-vacuum.spec b/src/test/isolation/specs/index-only-scan-gist-vacuum.spec new file mode 100644 index 00000000000..b1688d44fa7 --- /dev/null +++ b/src/test/isolation/specs/index-only-scan-gist-vacuum.spec @@ -0,0 +1,113 @@ +# index-only-scan test showing wrong results with GiST +# +setup +{ + -- by using a low fillfactor and a wide tuple we can get multiple blocks + -- with just few rows + CREATE TABLE ios_needs_cleanup_lock (a point NOT NULL, b int not null, pad char(1024) default '') + WITH (AUTOVACUUM_ENABLED = false, FILLFACTOR = 10); + + INSERT INTO ios_needs_cleanup_lock SELECT point(g.i, g.i), g.i FROM generate_series(1, 10) g(i); + + CREATE INDEX ios_spgist_a ON ios_needs_cleanup_lock USING gist(a); +} +setup +{ + VACUUM 
ios_needs_cleanup_lock; +} + +teardown +{ + DROP TABLE ios_needs_cleanup_lock; +} + + +session s1 + +# Force an index-only scan, where possible: +setup { + SET enable_bitmapscan = false; + SET enable_indexonlyscan = true; + SET enable_indexscan = true; +} + +step s1_begin { BEGIN; } +step s1_commit { COMMIT; } + +step s1_prepare_sorted { + DECLARE foo NO SCROLL CURSOR FOR SELECT a <-> point '(0,0)' as x FROM ios_needs_cleanup_lock ORDER BY a <-> point '(0,0)'; +} + +step s1_prepare_unsorted { + DECLARE foo NO SCROLL CURSOR FOR SELECT a FROM ios_needs_cleanup_lock WHERE box '((-100,-100),(100,100))' @> a; +} + +step s1_fetch_1 { + FETCH FROM foo; +} + +step s1_fetch_all { + SELECT pg_sleep_for(INTERVAL '50ms'); + FETCH ALL FROM foo; +} + + +session s2 + +# Don't delete row 1 so we have a row for the cursor to "rest" on. +step s2_mod +{ + DELETE FROM ios_needs_cleanup_lock WHERE a != point '(1,1)'; +} + +# Disable truncation, as otherwise we'll just wait for a timeout while trying +# to acquire the lock +step s2_vacuum { VACUUM (TRUNCATE false) ios_needs_cleanup_lock; } + +permutation + # delete nearly all rows, to make issue visible + s2_mod + # create a cursor + s1_begin + s1_prepare_sorted + + # fetch one row from the cursor, that ensures the index scan portion is done + # before the vacuum in the next step + s1_fetch_1 + + # with the bug this vacuum will mark pages as all-visible that the scan in + # the next step then considers all-visible, despite all rows from those + # pages having been removed. + # Because this should block on buffer-level locks, this won't ever be + # considered "blocked" by isolation tester, and so we only have a single + # step we can work with concurrently. 
+ s2_vacuum (*) + + # if this returns any rows, we're busted + s1_fetch_all + + s1_commit + +permutation + # delete nearly all rows, to make issue visible + s2_mod + # create a cursor + s1_begin + s1_prepare_unsorted + + # fetch one row from the cursor, that ensures the index scan portion is done + # before the vacuum in the next step + s1_fetch_1 + + # with the bug this vacuum will mark pages as all-visible that the scan in + # the next step then considers all-visible, despite all rows from those + # pages having been removed. + # Because this should block on buffer-level locks, this won't ever be + # considered "blocked" by isolation tester, and so we only have a single + # step we can work with concurrently. + s2_vacuum (*) + + # if this returns any rows, we're busted + s1_fetch_all + + s1_commit diff --git a/src/test/isolation/specs/index-only-scan-spgist-vacuum.spec b/src/test/isolation/specs/index-only-scan-spgist-vacuum.spec new file mode 100644 index 00000000000..b414c5d1695 --- /dev/null +++ b/src/test/isolation/specs/index-only-scan-spgist-vacuum.spec @@ -0,0 +1,113 @@ +# index-only-scan test showing wrong results with SPGiST +# +setup +{ + -- by using a low fillfactor and a wide tuple we can get multiple blocks + -- with just few rows + CREATE TABLE ios_needs_cleanup_lock (a point NOT NULL, b int not null, pad char(1024) default '') + WITH (AUTOVACUUM_ENABLED = false, FILLFACTOR = 10); + + INSERT INTO ios_needs_cleanup_lock SELECT point(g.i, g.i), g.i FROM generate_series(1, 10) g(i); + + CREATE INDEX ios_spgist_a ON ios_needs_cleanup_lock USING spgist(a); +} +setup +{ + VACUUM ios_needs_cleanup_lock; +} + +teardown +{ + DROP TABLE ios_needs_cleanup_lock; +} + + +session s1 + +# Force an index-only scan, where possible: +setup { + SET enable_bitmapscan = false; + SET enable_indexonlyscan = true; + SET enable_indexscan = true; +} + +step s1_begin { BEGIN; } +step s1_commit { COMMIT; } + +step s1_prepare_sorted { + DECLARE foo NO SCROLL CURSOR FOR SELECT a 
<-> point '(0,0)' as x FROM ios_needs_cleanup_lock ORDER BY a <-> point '(0,0)'; +} + +step s1_prepare_unsorted { + DECLARE foo NO SCROLL CURSOR FOR SELECT a FROM ios_needs_cleanup_lock WHERE box '((-100,-100),(100,100))' @> a; +} + +step s1_fetch_1 { + FETCH FROM foo; +} + +step s1_fetch_all { + SELECT pg_sleep_for(INTERVAL '50ms'); + FETCH ALL FROM foo; +} + + +session s2 + +# Don't delete row 1 so we have a row for the cursor to "rest" on. +step s2_mod +{ + DELETE FROM ios_needs_cleanup_lock WHERE a != point '(1,1)'; +} + +# Disable truncation, as otherwise we'll just wait for a timeout while trying +# to acquire the lock +step s2_vacuum { VACUUM (TRUNCATE false) ios_needs_cleanup_lock; } + +permutation + # delete nearly all rows, to make issue visible + s2_mod + # create a cursor + s1_begin + s1_prepare_sorted + + # fetch one row from the cursor, that ensures the index scan portion is done + # before the vacuum in the next step + s1_fetch_1 + + # with the bug this vacuum will mark pages as all-visible that the scan in + # the next step then considers all-visible, despite all rows from those + # pages having been removed. + # Because this should block on buffer-level locks, this won't ever be + # considered "blocked" by isolation tester, and so we only have a single + # step we can work with concurrently. + s2_vacuum (*) + + # if this returns any rows, we're busted + s1_fetch_all + + s1_commit + +permutation + # delete nearly all rows, to make issue visible + s2_mod + # create a cursor + s1_begin + s1_prepare_unsorted + + # fetch one row from the cursor, that ensures the index scan portion is done + # before the vacuum in the next step + s1_fetch_1 + + # with the bug this vacuum will mark pages as all-visible that the scan in + # the next step then considers all-visible, despite all rows from those + # pages having been removed. 
+ # Because this should block on buffer-level locks, this won't ever be + # considered "blocked" by isolation tester, and so we only have a single + # step we can work with concurrently. + s2_vacuum (*) + + # if this returns any rows, we're busted + s1_fetch_all + + s1_commit -- 2.43.0
From bf5e033d291a9dcfdd95ca95d576f5b40a8d34c2 Mon Sep 17 00:00:00 2001 From: nkey <n...@toloka.ai> Date: Fri, 10 Jan 2025 18:00:49 +0100 Subject: [PATCH v5 3/3] RFC: Extend buffer pinning for SP-GIST IOS This should fix issues with incorrect results when a SP-GIST IOS encounters tuples removed from pages by a concurrent vacuum operation. --- src/backend/access/spgist/spgscan.c | 112 ++++++++++++++++++++++---- src/backend/access/spgist/spgvacuum.c | 2 +- src/include/access/spgist_private.h | 3 + 3 files changed, 100 insertions(+), 17 deletions(-) diff --git a/src/backend/access/spgist/spgscan.c b/src/backend/access/spgist/spgscan.c index 986362a777f..32e6a0a8a03 100644 --- a/src/backend/access/spgist/spgscan.c +++ b/src/backend/access/spgist/spgscan.c @@ -30,7 +30,8 @@ typedef void (*storeRes_func) (SpGistScanOpaque so, ItemPointer heapPtr, Datum leafValue, bool isNull, SpGistLeafTuple leafTuple, bool recheck, - bool recheckDistances, double *distances); + bool recheckDistances, double *distances, + Buffer pin); /* * Pairing heap comparison function for the SpGistSearchItem queue. @@ -300,6 +301,38 @@ spgPrepareScanKeys(IndexScanDesc scan) } } +/* + * Note: This removes all items from the pairingheap. 
+ */ +static void +spgScanEndDropAllPagePins(IndexScanDesc scan, SpGistScanOpaque so) +{ + /* Guaranteed no pinned pages */ + if (so->scanQueue == NULL || !scan->xs_want_itup) + return; + + if (so->nPtrs > 0) + { + Assert(BufferIsValid(so->pagePin)); + ReleaseBuffer(so->pagePin); + so->pagePin = InvalidBuffer; + } + + while (!pairingheap_is_empty(so->scanQueue)) + { + pairingheap_node *node; + SpGistSearchItem *item; + + node = pairingheap_remove_first(so->scanQueue); + item = pairingheap_container(SpGistSearchItem, phNode, node); + if (!item->isLeaf) + continue; + + Assert(BufferIsValid(item->buffer)); + ReleaseBuffer(item->buffer); + } +} + IndexScanDesc spgbeginscan(Relation rel, int keysz, int orderbysz) { @@ -416,6 +449,9 @@ spgrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys, /* preprocess scankeys, set up the representation in *so */ spgPrepareScanKeys(scan); + /* release any pinned buffers from earlier rescans */ + spgScanEndDropAllPagePins(scan, so); + /* set up starting queue entries */ resetSpGistScanOpaque(so); @@ -428,6 +464,12 @@ spgendscan(IndexScanDesc scan) { SpGistScanOpaque so = (SpGistScanOpaque) scan->opaque; + /* + * release any pinned buffers from earlier rescans, before we drop their + * data by dropping the memory contexts. 
+ */ + spgScanEndDropAllPagePins(scan, so); + MemoryContextDelete(so->tempCxt); MemoryContextDelete(so->traversalCxt); @@ -460,7 +502,7 @@ spgendscan(IndexScanDesc scan) static SpGistSearchItem * spgNewHeapItem(SpGistScanOpaque so, int level, SpGistLeafTuple leafTuple, Datum leafValue, bool recheck, bool recheckDistances, - bool isnull, double *distances) + bool isnull, double *distances, Buffer addPin) { SpGistSearchItem *item = spgAllocSearchItem(so, isnull, distances); @@ -479,6 +521,10 @@ spgNewHeapItem(SpGistScanOpaque so, int level, SpGistLeafTuple leafTuple, datumCopy(leafValue, so->state.attType.attbyval, so->state.attType.attlen); + Assert(BufferIsValid(addPin)); + IncrBufferRefCount(addPin); + item->buffer = addPin; + /* * If we're going to need to reconstruct INCLUDE attributes, store the * whole leaf tuple so we can get the INCLUDE attributes out of it. @@ -495,6 +541,7 @@ spgNewHeapItem(SpGistScanOpaque so, int level, SpGistLeafTuple leafTuple, { item->value = (Datum) 0; item->leafTuple = NULL; + item->buffer = InvalidBuffer; } item->traversalValue = NULL; item->isLeaf = true; @@ -513,7 +560,7 @@ spgNewHeapItem(SpGistScanOpaque so, int level, SpGistLeafTuple leafTuple, static bool spgLeafTest(SpGistScanOpaque so, SpGistSearchItem *item, SpGistLeafTuple leafTuple, bool isnull, - bool *reportedSome, storeRes_func storeRes) + bool *reportedSome, storeRes_func storeRes, Buffer buffer) { Datum leafValue; double *distances; @@ -580,7 +627,8 @@ spgLeafTest(SpGistScanOpaque so, SpGistSearchItem *item, recheck, recheckDistances, isnull, - distances); + distances, + buffer); spgAddSearchItemToQueue(so, heapItem); @@ -591,7 +639,7 @@ spgLeafTest(SpGistScanOpaque so, SpGistSearchItem *item, /* non-ordered scan, so report the item right away */ Assert(!recheckDistances); storeRes(so, &leafTuple->heapPtr, leafValue, isnull, - leafTuple, recheck, false, NULL); + leafTuple, recheck, false, NULL, InvalidBuffer); *reportedSome = true; } } @@ -760,7 +808,7 @@ enum 
SpGistSpecialOffsetNumbers static OffsetNumber spgTestLeafTuple(SpGistScanOpaque so, SpGistSearchItem *item, - Page page, OffsetNumber offset, + Page page, OffsetNumber offset, Buffer buffer, bool isnull, bool isroot, bool *reportedSome, storeRes_func storeRes) @@ -799,7 +847,8 @@ spgTestLeafTuple(SpGistScanOpaque so, Assert(ItemPointerIsValid(&leafTuple->heapPtr)); - spgLeafTest(so, item, leafTuple, isnull, reportedSome, storeRes); + spgLeafTest(so, item, leafTuple, isnull, reportedSome, storeRes, + buffer); return SGLT_GET_NEXTOFFSET(leafTuple); } @@ -835,7 +884,8 @@ redirect: Assert(so->numberOfNonNullOrderBys > 0); storeRes(so, &item->heapPtr, item->value, item->isNull, item->leafTuple, item->recheck, - item->recheckDistances, item->distances); + item->recheckDistances, item->distances, + item->buffer); reportedSome = true; } else @@ -873,7 +923,7 @@ redirect: /* When root is a leaf, examine all its tuples */ for (offset = FirstOffsetNumber; offset <= max; offset++) (void) spgTestLeafTuple(so, item, page, offset, - isnull, true, + buffer, isnull, true, &reportedSome, storeRes); } else @@ -883,10 +933,24 @@ redirect: { Assert(offset >= FirstOffsetNumber && offset <= max); offset = spgTestLeafTuple(so, item, page, offset, - isnull, false, + buffer, isnull, false, &reportedSome, storeRes); if (offset == SpGistRedirectOffsetNumber) + { + Assert(so->nPtrs == 0); goto redirect; + } + } + + /* + * IOS: Make sure we have one additional pin on the buffer, + * so that vacuum won't remove any deleted TIDs and mark + * their pages ALL_VISIBLE while we still have a copy. 
+ */ + if (so->want_itup && reportedSome) + { + IncrBufferRefCount(buffer); + so->pagePin = buffer; } } } @@ -929,9 +993,10 @@ static void storeBitmap(SpGistScanOpaque so, ItemPointer heapPtr, Datum leafValue, bool isnull, SpGistLeafTuple leafTuple, bool recheck, - bool recheckDistances, double *distances) + bool recheckDistances, double *distances, + Buffer pin) { - Assert(!recheckDistances && !distances); + Assert(!recheckDistances && !distances && !BufferIsValid(pin)); tbm_add_tuples(so->tbm, heapPtr, 1, recheck); so->ntids++; } @@ -954,10 +1019,9 @@ spggetbitmap(IndexScanDesc scan, TIDBitmap *tbm) /* storeRes subroutine for gettuple case */ static void -storeGettuple(SpGistScanOpaque so, ItemPointer heapPtr, - Datum leafValue, bool isnull, - SpGistLeafTuple leafTuple, bool recheck, - bool recheckDistances, double *nonNullDistances) +storeGettuple(SpGistScanOpaque so, ItemPointer heapPtr, Datum leafValue, + bool isnull, SpGistLeafTuple leafTuple, bool recheck, + bool recheckDistances, double *nonNullDistances, Buffer pin) { Assert(so->nPtrs < MaxIndexTuplesPerPage); so->heapPtrs[so->nPtrs] = *heapPtr; @@ -1016,6 +1080,10 @@ storeGettuple(SpGistScanOpaque so, ItemPointer heapPtr, so->reconTups[so->nPtrs] = heap_form_tuple(so->reconTupDesc, leafDatums, leafIsnulls); + + /* move the buffer pin, if required */ + if (BufferIsValid(pin)) + so->pagePin = pin; } so->nPtrs++; } @@ -1065,7 +1133,19 @@ spggettuple(IndexScanDesc scan, ScanDirection dir) for (i = 0; i < so->nPtrs; i++) pfree(so->reconTups[i]); + + if (so->nPtrs > 0) + { + Assert(BufferIsValid(so->pagePin)); + ReleaseBuffer(so->pagePin); + so->pagePin = InvalidBuffer; + } } + else + { + Assert(!BufferIsValid(so->pagePin)); + } + so->iPtr = so->nPtrs = 0; spgWalk(scan->indexRelation, so, false, storeGettuple); diff --git a/src/backend/access/spgist/spgvacuum.c b/src/backend/access/spgist/spgvacuum.c index 894aefa19e1..f02c270c5cc 100644 --- a/src/backend/access/spgist/spgvacuum.c +++ 
b/src/backend/access/spgist/spgvacuum.c @@ -629,7 +629,7 @@ spgvacuumpage(spgBulkDeleteState *bds, BlockNumber blkno) buffer = ReadBufferExtended(index, MAIN_FORKNUM, blkno, RBM_NORMAL, bds->info->strategy); - LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); + LockBufferForCleanup(buffer); page = (Page) BufferGetPage(buffer); if (PageIsNew(page)) diff --git a/src/include/access/spgist_private.h b/src/include/access/spgist_private.h index cb43a278f46..1948e53e2ff 100644 --- a/src/include/access/spgist_private.h +++ b/src/include/access/spgist_private.h @@ -175,6 +175,8 @@ typedef struct SpGistSearchItem bool isLeaf; /* SearchItem is heap item */ bool recheck; /* qual recheck is needed */ bool recheckDistances; /* distance recheck is needed */ + Buffer buffer; /* buffer pinned for this leaf tuple + * (IOS-only) */ /* array with numberOfOrderBys entries */ double distances[FLEXIBLE_ARRAY_MEMBER]; @@ -226,6 +228,7 @@ typedef struct SpGistScanOpaqueData TupleDesc reconTupDesc; /* if so, descriptor for reconstructed tuples */ int nPtrs; /* number of TIDs found on current page */ int iPtr; /* index for scanning through same */ + Buffer pagePin; /* output tuple's pinned buffer, if IOS */ ItemPointerData heapPtrs[MaxIndexTuplesPerPage]; /* TIDs from cur page */ bool recheck[MaxIndexTuplesPerPage]; /* their recheck flags */ bool recheckDistances[MaxIndexTuplesPerPage]; /* distance recheck -- 2.43.0